forked from DebaucheryLibrarian/traxxx
Updated Jules Jordan scraper.
This commit is contained in:
@@ -69,7 +69,10 @@ async function fetchUnprintScene(scraper, url, entity, baseRelease, options, typ
|
||||
entity,
|
||||
baseRelease,
|
||||
headers: res.headers,
|
||||
}, options);
|
||||
include,
|
||||
beforeFetchScenes: options.beforeFetchScenes,
|
||||
parameters: options.parameters,
|
||||
}, options); // options parameter should probably be retired
|
||||
}
|
||||
|
||||
return res.status;
|
||||
|
||||
@@ -929,7 +929,7 @@ async function associateReleaseMedia(releases, type = 'release') {
|
||||
logger.error(util.inspect(error.entries, null, null, { color: true }));
|
||||
}
|
||||
|
||||
logger.error(`Failed to store ${type} ${role}: ${error.message}`);
|
||||
logger.error(`Failed to store ${type} ${role}: ${error.message} (${error.detail || 'no detail'}`);
|
||||
}
|
||||
}, Promise.resolve());
|
||||
}
|
||||
|
||||
@@ -5,118 +5,13 @@ const Promise = require('bluebird');
|
||||
const cheerio = require('cheerio');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const moment = require('moment');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
const http = require('../utils/http');
|
||||
const { heightToCm } = require('../utils/convert');
|
||||
const slugify = require('../utils/slugify');
|
||||
|
||||
/**
 * Request a gallery page and hand back its body as text.
 *
 * @param {string} url - address of the page to request
 * @returns {Promise<string>} the response body, converted with toString()
 */
async function fetchPhotos(url) {
	const { body } = await http.get(url);

	return body.toString();
}
|
||||
|
||||
/**
 * Build candidate photo URLs from the thumbnails of a gallery page.
 *
 * Every thumbnail yields one array of candidate URLs, ordered from the most
 * desirable to the original thumbnail. Higher resolutions are often available
 * in alternative directories, but not always, so the original is kept as the
 * final fallback.
 *
 * @param {string} html - markup of the gallery page
 * @param {string} type - 'caps' for screencaps, anything else for photos
 * @returns {string[][]} one list of candidate URLs per thumbnail
 */
function scrapePhotos(html, type) {
	const $ = cheerio.load(html, { normalizeWhitespace: true });

	return $('.photo_gallery_thumbnail_wrapper .thumbs')
		.toArray()
		.map((photoElement) => $(photoElement).attr('src'))
		// a thumbnail without src has nothing to derive from; skip it instead of throwing on .replace()
		.filter(Boolean)
		.map((src) => {
			if (type === 'caps') {
				return [
					src.replace('capthumbs/', 'caps/'),
					src,
				];
			}

			return [
				src.replace('thumbs/', 'photos/'),
				src.replace('thumbs/', '1600watermarked/'),
				src.replace('thumbs/', '1280watermarked/'),
				src.replace('thumbs/', '1024watermarked/'),
				src,
			];
		});
}
|
||||
|
||||
/**
 * Collect photos from the legacy paginated trial gallery.
 *
 * Fetches the requested album page, scrapes it, then follows every distinct
 * pagination link found on that page. When a 'highres' album turns out empty,
 * the 'caps' album is tried instead.
 *
 * @param {string|number} entryId - gallery ID used in the album URL
 * @param {object} site - entity providing `url`
 * @param {string} [type='highres'] - album type, 'highres' or 'caps'
 * @param {number} [page=1] - page number of the first request
 * @returns {Promise<string[][]>} candidate URL lists as produced by scrapePhotos
 */
async function getPhotosLegacy(entryId, site, type = 'highres', page = 1) {
	const albumUrl = `${site.url}/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;

	const html = await fetchPhotos(albumUrl);
	const $ = cheerio.load(html, { normalizeWhitespace: true });

	const photos = scrapePhotos(html, type);

	// the Set drops repeated pagination links, so each page is requested once
	const pages = Array.from(new Set($('.page_numbers a').toArray().map((el) => $(el).attr('href'))));

	// pages is always an array (possibly empty), so it is mapped unconditionally
	const otherPhotos = await Promise.map(pages, async (pageX) => {
		// NOTE(review): pagination is resolved against julesjordan.com while the album itself uses site.url — confirm this is intended
		const pageUrl = `https://www.julesjordan.com/trial/${pageX}`;
		const pageHtml = await fetchPhotos(pageUrl);

		return scrapePhotos(pageHtml, type);
	}, {
		concurrency: 2,
	});

	const allPhotos = photos.concat(otherPhotos.flat());

	if (allPhotos.length === 0 && type === 'highres') {
		// photos not available, try for screencaps instead
		return getPhotosLegacy(entryId, site, 'caps', 1);
	}

	return allPhotos;
}
|
||||
|
||||
/**
 * Read photo URLs from the `ptx["<quality>"]` script lines of a gallery page.
 *
 * For 'highres' the best available quality is returned (1600, 1280, 1024,
 * Thumbs); if none is present the 'caps' album is requested. For any other
 * type the `jpg` and `Video Cap Thumbs` lists are tried, and finally the
 * legacy gallery scraper.
 *
 * @param {string|number} entryId - gallery ID used in the album URL
 * @param {object} site - entity providing `url` and optionally `parameters.photos`
 * @param {string} [type='highres'] - album type, 'highres' or 'caps'
 * @param {number} [page=1] - page number to request
 * @returns {Promise<Array>} photo URLs, or whatever the legacy scraper returns
 */
async function getPhotos(entryId, site, type = 'highres', page = 1) {
	const albumUrl = `${site.parameters?.photos || `${site.url}/gallery.php`}?id=${entryId}&type=${type}&page=${page}`;

	const res = await http.get(albumUrl);
	const html = res.body.toString();

	const sources = html.split(/\n/).reduce((acc, sourceLine) => {
		// keys may contain spaces ("Video Cap Thumbs"), which \w+ alone would never match
		const quality = sourceLine.match(/ptx\["([\w ]+)"\]/)?.[1];

		if (!quality) return acc;

		const sourceStart = sourceLine.match(/\/trial|\/tour|\/content/);

		if (!sourceStart) return acc;

		const extensionIndex = sourceLine.indexOf('.jpg', sourceStart.index);

		// no .jpg after the path start means there is no usable source on this line
		if (extensionIndex === -1) return acc;

		const source = sourceLine.slice(sourceStart.index, extensionIndex + 4);

		if (!acc[quality]) acc[quality] = [];

		acc[quality].push(`${site.url}${source}`);

		return acc;
	}, {});

	if (type === 'highres') {
		if (sources['1600'] && sources['1600'].length > 0) return sources['1600'];
		if (sources['1280'] && sources['1280'].length > 0) return sources['1280'];
		if (sources['1024'] && sources['1024'].length > 0) return sources['1024'];
		if (sources.Thumbs && sources.Thumbs.length > 0) return sources.Thumbs;

		// no photos available, try for screencaps instead
		return getPhotos(entryId, site, 'caps', 1);
	}

	if (sources.jpg && sources.jpg.length > 0) return sources.jpg;
	if (sources['Video Cap Thumbs'] && sources['Video Cap Thumbs'].length > 0) return sources['Video Cap Thumbs'];

	// no screencaps available either, try legacy scraper just in case
	return getPhotosLegacy(entryId, site, 'highres', 1);
}
|
||||
|
||||
function getEntryId(html) {
|
||||
const entryId = html.match(/showtagform\((\d+)\)/);
|
||||
|
||||
@@ -134,28 +29,28 @@ function getEntryId(html) {
|
||||
}
|
||||
|
||||
function scrapeAll(scenes, site, entryIdFromTitle) {
|
||||
return scenes.map(({ el, query }) => {
|
||||
return scenes.map(({ element, query }) => {
|
||||
const release = {};
|
||||
const title = query.cnt('.content_img div, .dvd_info > a, a.update_title, a[title] + a[title]') || query.cnt('a[title*=" "]');
|
||||
const title = query.content('.content_img div, .dvd_info > a, a.update_title, a[title] + a[title], .overlay-text') || query.content('a[title*=" "]');
|
||||
|
||||
release.title = title?.slice(0, title.match(/starring:/i)?.index || Infinity).trim();
|
||||
release.url = query.url('.content_img a, .dvd_info > a, a.update_title, a[title*=" "]');
|
||||
release.url = query.url('.content_img a, .dvd_info > a, a.update_title, a[title]');
|
||||
release.date = query.date('.update_date', 'MM/DD/YYYY');
|
||||
|
||||
release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || query.q('.rating_box')?.dataset.id;
|
||||
release.entryId = (entryIdFromTitle && slugify(release.title)) || element.dataset.setid || query.element('.rating_box')?.dataset.id || query.attribute('a img', 'id')?.match(/set-target-(\d+)/)?.[1];
|
||||
|
||||
release.actors = query.all('.content_img .update_models a, .update_models a').map((actorEl) => ({
|
||||
name: query.cnt(actorEl),
|
||||
url: query.url(actorEl, null),
|
||||
name: unprint.query.content(actorEl),
|
||||
url: unprint.query.url(actorEl, null),
|
||||
}));
|
||||
|
||||
const dvdPhotos = query.imgs('.dvd_preview_thumb');
|
||||
const photoCount = Number(query.q('a img.thumbs', 'cnt')) || 1;
|
||||
const photoCount = Number(query.attribute('a img.thumbs', 'cnt')) || 1;
|
||||
|
||||
[release.poster, ...release.photos] = dvdPhotos.length
|
||||
? dvdPhotos
|
||||
: Array.from({ length: photoCount }).map((value, index) => {
|
||||
const src = query.img('a img.thumbs', `src${index}_1x`) || query.img('a img.thumbs', `src${index}`) || query.img('a img.thumbs');
|
||||
const src = query.img('a img.thumbs', { attribute: `src${index}_1x` }) || query.img('a img.thumbs', { attribute: `src${index}` }) || query.img('a img.thumbs');
|
||||
const prefixedSrc = qu.prefixUrl(src, site.url);
|
||||
|
||||
if (src) {
|
||||
@@ -239,56 +134,105 @@ function scrapeUpcoming(html, site) {
|
||||
});
|
||||
}
|
||||
|
||||
async function scrapeScene({ html, query }, url, site, options) {
|
||||
/**
 * Pull trailer sources out of inline `movie["trailer…"][` script lines.
 *
 * @param {string} html - raw page markup
 * @param {object} context - scrape context; `context.entity.url` prefixes relative paths
 * @returns {Array<{src: string, quality: (number|undefined)}>|null} one entry per
 *   trailer line that has a path, or null when the page has no trailer lines
 */
function extractLegacyTrailer(html, context) {
	const candidates = html
		.split('\n')
		.filter((line) => /movie\["trailer\w*"\]\[/i.test(line));

	if (candidates.length === 0) {
		return null;
	}

	const trailers = [];

	for (const line of candidates) {
		const path = line.match(/path:"(.+)"/)?.[1];

		if (!path) {
			continue;
		}

		const height = line.match(/movie_height:'(\d+)/)?.[1];

		trailers.push({
			src: /^http/.test(path) ? path : `${context.entity.url}${path}`,
			// a reported height of 558 is recorded as 540
			quality: height && Number(height.replace('558', '540')),
		});
	}

	return trailers;
}
|
||||
|
||||
// photo directories tried for every photo, in order of preference
const qualities = [
	'photos',
	'1600watermarked',
	'1280watermarked',
	'1024watermarked',
	'thumbs',
];

/**
 * Derive candidate photo URLs for a scene from its slug and actor names.
 *
 * Example targets:
 * https://thumbs.julesjordan.com/members/content//upload/dl03/julesjordan/whitney_wright_dredd/1024watermarked/whitney_wright_julesjordan.com-20.jpg
 * https://thumbs.julesjordan.com/members/content//upload/dl03/julesjordan/bambi_barton_manuel_ferrara/1024watermarked/bambi_barton_julesjordan_com-13.jpg
 *
 * @param {object} query - page query helper; `number()` and `dataset()` are used
 * @param {object} release - release scraped so far; `actors` holds objects with `name`, or plain names
 * @param {object} context - scrape context; `context.entity.slug` is used in the URLs
 * @returns {string[][]|null} per photo, every quality in both file name variants;
 *   null when there are no actors or no photo count
 */
function getPhotos(query, release, context) {
	if (!release.actors?.length) {
		return null;
	}

	const photoCount = query.number('//div[contains(@class, "title-heading-content")][contains(text(), "Photos")]');

	if (!photoCount) {
		return null;
	}

	const toSlug = (actor) => slugify(actor.name || actor, '_');

	// slug actor order is not always the same as actor list order, prefer trailer slug if available
	const path = query.dataset('.movieformat_button', 'src')?.match(/:(.*)_trailer/)?.[1] || release.actors.map(toSlug).join('_');

	// join explicitly: interpolating the array would comma-separate the slugs and never match the path
	const coActorSlug = release.actors.slice(1).map(toSlug).join('_');

	// with a solo actor there is nothing to strip; a bare '_' would otherwise eat the first underscore of the name
	const derivedActorSlug = coActorSlug
		? path.replace(`_${coActorSlug}`, '')
		: path;

	const actorSlug = derivedActorSlug === path // no replacement took place, so the slug is likely invalid
		? toSlug(release.actors[0])
		: derivedActorSlug;

	const { slug } = context.entity;

	return Array.from({ length: photoCount }, (value, index) => qualities
		.flatMap((quality) => [
			`https://thumbs.${slug}.com/trial/content//upload/dl03/${slug}/${path}/${quality}/${actorSlug}_${slug}_com-${index + 1}.jpg`,
			`https://thumbs.${slug}.com/trial/content//upload/dl03/${slug}/${path}/${quality}/${actorSlug}_${slug}.com-${index + 1}.jpg`, // .com instead of _com
		]));
}
|
||||
|
||||
async function scrapeScene({ html, query }, context) {
|
||||
const release = {};
|
||||
|
||||
release.entryId = getEntryId(html);
|
||||
release.title = query.cnt('.title_bar_hilite');
|
||||
release.description = query.cnt('.update_description');
|
||||
release.title = query.content('.title_bar_hilite, .movie_title');
|
||||
release.description = query.content('.update_description') || query.text('//div[./span[contains(text(), "Description")]]');
|
||||
release.entryId = context.entity.parameters?.entryIdFromTitle ? slugify(release.title) : getEntryId(html);
|
||||
|
||||
release.date = query.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
|
||||
release.date = query.date(['.update_date', '//div[./span[contains(text(), "Date")]]'], 'MM/DD/YYYY');
|
||||
|
||||
release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a').map((actorEl) => ({
|
||||
name: query.cnt(actorEl),
|
||||
url: query.url(actorEl, null),
|
||||
release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a, .player-scene-description .update_models a').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
url: unprint.query.url(actorEl, null),
|
||||
}));
|
||||
|
||||
release.tags = query.cnts('.update_tags a');
|
||||
release.tags = query.contents('.update_tags a, .player-scene-description a[href*="/categories"]');
|
||||
release.director = release.tags?.find((tag) => ['mike john', 'van styles'].includes(tag?.trim().toLowerCase()));
|
||||
|
||||
const posterPath = html.match(/useimage = "(.*)"/)?.[1];
|
||||
const posterPath = query.poster('#video-player') || html.match(/useimage = "(.*)"/)?.[1];
|
||||
|
||||
if (posterPath) {
|
||||
const poster = /^http/.test(posterPath) ? posterPath : `${site.url}${posterPath}`;
|
||||
const poster = /^http/.test(posterPath) ? posterPath : `${context.entity.url}${posterPath}`;
|
||||
|
||||
if (poster) {
|
||||
release.poster = {
|
||||
src: poster,
|
||||
referer: site.url,
|
||||
referer: context.entity.url,
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
if (options.includeTrailers && site.slug !== 'manuelferrara') {
|
||||
const trailerLines = html.split('\n').filter((line) => /movie\["trailer\w*"\]\[/i.test(line));
|
||||
|
||||
if (trailerLines.length) {
|
||||
release.trailer = trailerLines.map((trailerLine) => {
|
||||
// const src = trailerLine.match(/path:"([\w-:/.&=?%]+)"/)?.[1];
|
||||
const src = trailerLine.match(/path:"(.+)"/)?.[1];
|
||||
const quality = trailerLine.match(/movie_height:'(\d+)/)?.[1];
|
||||
|
||||
return src && {
|
||||
src: /^http/.test(src) ? src : `${site.url}${src}`,
|
||||
quality: quality && Number(quality.replace('558', '540')),
|
||||
};
|
||||
}).filter(Boolean);
|
||||
}
|
||||
if (query.exists('source[data-bitrate="trailer"]')) {
|
||||
release.trailer = [
|
||||
query.video('source[data-bitrate="trailer_1080" i]'),
|
||||
query.video('source[data-bitrate="trailer_720" i]'),
|
||||
query.video('source[data-bitrate="trailer" i]'), // also seems to be 720p
|
||||
query.video('source[data-bitrate="trailer_mobile" i]'), // also seems to be 720p
|
||||
];
|
||||
} else if (context.include.trailers && context.entity.slug !== 'manuelferrara') {
|
||||
release.trailer = extractLegacyTrailer(html, context);
|
||||
}
|
||||
|
||||
if (options.includePhotos) {
|
||||
release.photos = await getPhotos(release.entryId, site);
|
||||
}
|
||||
// release.photos = async () => await getPhotos(release.entryId, context.entity); // probably no longer works on any site
|
||||
// release.photos = query.imgs('#images img');
|
||||
release.photos = getPhotos(query, release, context);
|
||||
|
||||
if (query.exists('.update_dvds a')) {
|
||||
release.movie = {
|
||||
@@ -381,9 +325,13 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
|
||||
: `${site.url}/trial/categories/movies_${page}_d.html`;
|
||||
|
||||
// const res = await http.get(url);
|
||||
const res = await qu.getAll(url, '.update_details');
|
||||
const res = await unprint.get(url, { selectAll: '.update_details, .grid-item' });
|
||||
|
||||
return res.ok ? scrapeAll(res.items, site, entryIdFromTitle) : res.status;
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.context, site, typeof site.parameters?.entryIdFromTitle === 'boolean' ? site.parameters.entryIdFromTitle : entryIdFromTitle);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchUpcoming(site) {
|
||||
@@ -399,12 +347,6 @@ async function fetchUpcoming(site) {
|
||||
return res.statusCode;
|
||||
}
|
||||
|
||||
/**
 * Fetch a scene page and scrape it.
 *
 * @param {string} url - scene page address
 * @param {object} site - entity the scene belongs to
 * @param {object} baseRelease - not used by this function
 * @param {object} include - passed through to scrapeScene as its options
 * @returns {Promise<object|number>} the scraped release, or the response status when the request was not ok
 */
async function fetchScene(url, site, baseRelease, include) {
	const res = await qu.get(url);

	if (!res.ok) {
		return res.status;
	}

	return scrapeScene(res.item, url, site, include);
}
|
||||
|
||||
async function fetchMovie(url, site) {
|
||||
const res = await qu.get(url);
|
||||
|
||||
@@ -447,5 +389,8 @@ module.exports = {
|
||||
fetchMovie,
|
||||
fetchProfile,
|
||||
fetchUpcoming,
|
||||
fetchScene,
|
||||
scrapeScene: {
|
||||
scraper: scrapeScene,
|
||||
unprint: true,
|
||||
},
|
||||
};
|
||||
|
||||
11
src/tools/knex-error.js
Normal file
11
src/tools/knex-error.js
Normal file
@@ -0,0 +1,11 @@
|
||||
const knex = require('../knex');
|
||||
|
||||
/**
 * Run a single raw insert and print whatever error the database raises.
 *
 * The error is logged rather than rethrown. The connection pool is destroyed
 * afterwards, because an open pool would keep this one-shot script from exiting.
 *
 * @returns {Promise<void>}
 */
async function init() {
	try {
		await knex.raw('INSERT INTO actors_profiles (actor_id) VALUES (1000000000) ON CONFLICT DO NOTHING RETURNING *;');
	} catch (error) {
		console.log(error);
	} finally {
		// release the pool whether or not the query failed
		await knex.destroy();
	}
}
|
||||
|
||||
init();
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
const knex = require('../knex');
|
||||
const chunk = require('./chunk');
|
||||
const logger = require('../logger')(__filename);
|
||||
|
||||
async function bulkUpsert(table, items, conflict, update = true, chunkSize) {
|
||||
if (items.length === 0) {
|
||||
@@ -26,9 +27,14 @@ async function bulkUpsert(table, items, conflict, update = true, chunkSize) {
|
||||
query: knex(table).insert(chunkItems),
|
||||
}).transacting(transaction));
|
||||
|
||||
const responses = await Promise.all(queries);
|
||||
try {
|
||||
const responses = await Promise.all(queries);
|
||||
|
||||
return responses.flat().map((response) => response.rows).flat();
|
||||
return responses.flat().map((response) => response.rows).flat();
|
||||
} catch (error) {
|
||||
logger.error(`Failed bulk insert: ${error.message} (${error.detail})`);
|
||||
throw error;
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user