Added Manuel Ferrara site to Jules Jordan. Refactored Jules Jordan photo scraper for better compatibility and quality.

This commit is contained in:
2020-01-08 05:12:14 +01:00
parent a5046b3b7b
commit a773e0bf54
64 changed files with 167 additions and 154 deletions

View File

@@ -7,6 +7,10 @@ const slugify = require('../utils/slugify');
const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
/**
 * Build the URL of a single screenshot image on i.bang.com.
 *
 * @param {object} item - Object whose `screenId` names the screenshot file.
 * @param {object} scene - Scene object providing `dvd.id` and `order` for the path.
 * @returns {string} URL of the screenshot JPEG.
 */
function getScreenUrl(item, scene) {
    const { dvd, order } = scene;
    const { screenId } = item;

    return `https://i.bang.com/screenshots/${dvd.id}/movie/${order}/${screenId}.jpg`;
}
function encodeId(id) {
return Buffer
.from(id, 'hex')
@@ -53,8 +57,8 @@ function scrapeScene(scene, site) {
const photos = defaultPoster ? photoset : photoset.slice(1);
const poster = defaultPoster || photoset[0];
release.poster = `https://i.bang.com/screenshots/${scene.dvd.id}/movie/${scene.order}/${poster.screenId}.jpg`;
release.photos = photos.map(photo => `https://i.bang.com/screenshots/${scene.dvd.id}/movie/${scene.order}/${photo.screenId}.jpg`);
release.poster = getScreenUrl(poster, scene);
release.photos = photos.map(photo => getScreenUrl(photo, scene));
release.trailer = {
src: `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`,

View File

@@ -1,6 +1,6 @@
'use strict';
const Promise = require('bluebird');
// const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
@@ -32,6 +32,8 @@ function scrapePhotos(html, type) {
return [
src.replace('thumbs/', 'photos/'),
src.replace('thumbs/', '1600watermarked/'),
src.replace('thumbs/', '1280watermarked/'),
src.replace('thumbs/', '1024watermarked/'),
src,
];
@@ -40,8 +42,10 @@ function scrapePhotos(html, type) {
return photos;
}
async function getPhotos(entryId, site, page = 1, type = 'highres') {
const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;
async function getPhotosLegacy(entryId, site, type = 'highres', page = 1) {
const albumUrl = `${site.url}/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;
console.warn(`Jules Jordan is using legacy photo scraper for ${albumUrl} (page ${page})`);
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -65,12 +69,48 @@ async function getPhotos(entryId, site, page = 1, type = 'highres') {
if (allPhotos.length === 0 && type === 'highres') {
// photos not available, try for screencaps instead
return getPhotos(entryId, site, 1, 'caps');
return getPhotosLegacy(entryId, site, 'caps', 1);
}
return allPhotos;
}
/**
 * Fetch the photo URLs for an entry from the site's gallery page.
 *
 * The gallery page is requested from `${site.url}/trial/gallery.php` and every
 * line containing a `ptx["<key>"]` reference is scanned for an image path that
 * starts at `/trial` and ends in `.jpg`. Paths are grouped by their key and
 * prefixed with `site.url`.
 *
 * For `type === 'highres'` the first non-empty group out of `1600`, `1280`,
 * `1024` and `Thumbs` is returned; when none is found the gallery is requested
 * again with type `caps`. For any other type the `jpg` group and then the
 * `Video Cap Thumbs` group are tried, after which the legacy scraper is used.
 *
 * @param {string|number} entryId - Entry identifier placed in the `id` query parameter.
 * @param {object} site - Site object; only `site.url` is read here.
 * @param {string} [type='highres'] - Gallery type placed in the `type` query parameter.
 * @param {number} [page=1] - Page number placed in the `page` query parameter.
 * @returns {Promise<string[]>} Photo URLs, or whatever `getPhotosLegacy` resolves to
 *     when the fallback is reached.
 */
async function getPhotos(entryId, site, type = 'highres', page = 1) {
    const albumUrl = `${site.url}/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;

    const res = await bhttp.get(albumUrl);
    const html = res.body.toString();

    // A Map keeps scraped keys from colliding with Object.prototype members.
    const sources = new Map();

    html.split(/\n/).forEach((sourceLine) => {
        // Keys such as "Video Cap Thumbs" contain spaces, so capture everything
        // up to the closing quote instead of restricting the key to \w+.
        const keyMatch = sourceLine.match(/ptx\["([^"]+)"\]/);
        if (!keyMatch) return;

        // Only accept a path that really starts at '/trial' and ends at a
        // '.jpg' found after it; a missing marker would otherwise make slice()
        // return an arbitrary fragment of the line.
        const start = sourceLine.indexOf('/trial');
        if (start === -1) return;

        const end = sourceLine.indexOf('.jpg', start);
        if (end === -1) return;

        const quality = keyMatch[1];
        const source = sourceLine.slice(start, end + 4);

        if (!sources.has(quality)) sources.set(quality, []);
        sources.get(quality).push(`${site.url}${source}`);
    });

    // Groups are only created when a URL is pushed, so presence implies non-empty.
    const preferredQualities = type === 'highres'
        ? ['1600', '1280', '1024', 'Thumbs']
        : ['jpg', 'Video Cap Thumbs'];

    const bestQuality = preferredQualities.find(quality => sources.has(quality));
    if (bestQuality !== undefined) return sources.get(bestQuality);

    if (type === 'highres') {
        // no photos available, try for screencaps instead
        return getPhotos(entryId, site, 'caps', 1);
    }

    // no screencaps available either, try legacy scraper just in case
    return getPhotosLegacy(entryId, site, 'highres', 1);
}
function scrapeLatest(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const scenesElements = $('.update_details').toArray();
@@ -171,9 +211,24 @@ async function scrapeScene(html, url, site) {
release.title = $('.title_bar_hilite').text().trim();
[release.entryId] = $('.suggest_tags a').attr('href').match(/\d+/);
release.date = moment
.utc($('.update_date').text(), 'MM/DD/YYYY')
.toDate();
const dateElement = $('.update_date').text().trim();
const dateComment = $('*')
.contents()
.toArray()
.find(({ type, data }) => type === 'comment' && data.match('Date OFF'));
if (dateElement) {
release.date = moment
.utc($('.update_date').text(), 'MM/DD/YYYY')
.toDate();
}
if (dateComment) {
release.date = moment
.utc(dateComment.nodeValue.match(/\d{2}\/\d{2}\/\d{4}/), 'MM/DD/YYYY')
.toDate();
}
release.description = $('.update_description').text().trim();
@@ -190,10 +245,12 @@ async function scrapeScene(html, url, site) {
const trailerLine = infoLines.find(line => line.match('movie["Trailer_720"]'));
release.trailer = {
src: trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie')),
quality: 720,
};
if (site.slug !== 'manuelferrara') {
release.trailer = {
src: trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie')),
quality: 720,
};
}
release.photos = await getPhotos(release.entryId, site);