Updated the Jules Jordan scraper for the new site; entryId is now always derived from the scene URL (upcoming-scenes scraper disabled, WIP).
This commit is contained in:
@@ -1,44 +1,38 @@
|
||||
'use strict';
|
||||
|
||||
const util = require('util');
|
||||
const Promise = require('bluebird');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const argv = require('../argv');
|
||||
const { heightToCm } = require('../utils/convert');
|
||||
const slugify = require('../utils/slugify');
|
||||
const tryUrls = require('../utils/try-urls');
|
||||
|
||||
/**
 * Derive a release's entry ID from its scene URL.
 *
 * The ID is the slugified file name of the last path segment with the
 * `_vids.html` suffix removed, e.g. `.../scenes/Some-Scene_vids.html`.
 *
 * @param {string|null|undefined} url - Absolute scene URL.
 * @returns {string|null} The slugified ID, or null when no URL was given or
 *   the last path segment does not contain `_vids.html`.
 * @throws {TypeError} From `new URL()` when `url` is not a valid absolute URL.
 */
function getEntryId(url) {
	if (!url) {
		return null;
	}

	const fileName = new URL(url).pathname.split('/').at(-1);

	// A regex literal with an escaped dot: passing the pattern as a string made
	// the `.` in "_vids.html" match any character.
	const entryId = fileName.match(/(.*?)_vids\.html/);

	if (entryId) {
		return slugify(entryId[1]);
	}

	return null;
}
|
||||
|
||||
/**
 * Build an entry ID from a release's title and the names of its actors.
 *
 * The release date is deliberately not part of the ID because it is not shown
 * on the updates page.
 *
 * @param {{ title: string, actors?: Array<string|{ name?: string }> }} release
 *   Release with a title and, optionally, actors given as names or as objects
 *   carrying a `name`.
 * @returns {*} Whatever `slugify` returns for `[title, ...sortedActorNames]`.
 */
function getEntryIdFromTitle(release) {
	// Sorting makes the ID independent of the order in which actors are listed.
	// map() already produced a fresh array, so sorting it in place is safe.
	const actorNames = release.actors?.map((actor) => actor.name || actor).sort() || [];

	return slugify([release.title, ...actorNames]);
}
|
||||
|
||||
function scrapeAll(scenes, site, entryIdFromTitle) {
|
||||
return scenes.map(({ element, query }) => {
|
||||
function scrapeAll(scenes, site) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
const title = query.content('.content_img div, .dvd_info > a, a.update_title, .update_title a, a[title] + a[title], .overlay-text')
|
||||
const title = query.content('.jj-card-title, .content_img div, .dvd_info > a, a.update_title, .update_title a, a[title] + a[title], .overlay-text')
|
||||
|| query.content('a[title*=" "]');
|
||||
|
||||
release.title = title?.slice(0, title.match(/starring:/i)?.index || Infinity).trim();
|
||||
release.url = query.url('.content_img a, .dvd_info > a, a.update_title, .update_title a, a[title]');
|
||||
release.date = query.date('.update_date', ['MM/DD/YYYY', 'YYYY-MM-DD']);
|
||||
release.url = query.url('.jj-card-thumb, .content_img a, .dvd_info > a, a.update_title, .update_title a, a[title]');
|
||||
release.date = query.date('.jj-card-date, .update_date', ['MM/DD/YYYY', 'YYYY-MM-DD', 'MMMM D, YYYY']);
|
||||
|
||||
release.entryId = getEntryId(release.url);
|
||||
|
||||
release.actors = query.all('.content_img .update_models a, .update_models a').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
@@ -70,17 +64,16 @@ function scrapeAll(scenes, site, entryIdFromTitle) {
|
||||
return null;
|
||||
}).filter(Boolean);
|
||||
|
||||
const teaserScript = query.html('script');
|
||||
release.teaser = query.video('.jj-card-video', { attribute: 'data-src' });
|
||||
|
||||
if (teaserScript) {
|
||||
release.teaser = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
|
||||
if (!release.teaser) {
|
||||
const teaserScript = query.html('script');
|
||||
|
||||
if (teaserScript) {
|
||||
release.teaser = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
|
||||
}
|
||||
}
|
||||
|
||||
release.entryId = (entryIdFromTitle && getEntryIdFromTitle(release))
|
||||
|| element.dataset.setid
|
||||
|| query.element('.rating_box')?.dataset.id
|
||||
|| query.attribute('a img', 'id')?.match(/set-target-(\d+)/)?.[1];
|
||||
|
||||
return release;
|
||||
});
|
||||
}
|
||||
@@ -91,7 +84,7 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
|
||||
: `${site.url}/trial/categories/movies_${page}_d.html`;
|
||||
|
||||
// const res = await http.get(url);
|
||||
const res = await unprint.get(url, { selectAll: '.update_details, .grid-item' });
|
||||
const res = await unprint.get(url, { selectAll: '.scenes-listing-grid .jj-content-card' });
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.context, site, typeof site.parameters?.entryIdFromTitle === 'boolean' ? site.parameters.entryIdFromTitle : entryIdFromTitle);
|
||||
@@ -100,7 +93,8 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
|
||||
return res.status;
|
||||
}
|
||||
|
||||
function scrapeUpcoming(scenes, channel) {
|
||||
/* disable until we have entryId solution
|
||||
function scrapeUpcoming(scenes, _channel) {
|
||||
return scenes.map(({ query, html }) => {
|
||||
const release = {};
|
||||
|
||||
@@ -135,6 +129,7 @@ async function fetchUpcoming(site) {
|
||||
|
||||
return res.status;
|
||||
}
|
||||
*/
|
||||
|
||||
function extractLegacyTrailer(html, context) {
|
||||
const trailerLines = html.split('\n').filter((line) => /movie\["trailer\w*"\]\[/i.test(line));
|
||||
@@ -194,17 +189,19 @@ function getPhotos(query, release, context) {
|
||||
async function scrapeScene({ html, query }, context) {
|
||||
const release = {};
|
||||
|
||||
release.title = query.content('.title_bar_hilite, .movie_title');
|
||||
release.description = query.content('.update_description') || query.text('//div[./span[contains(text(), "Description")]]');
|
||||
release.entryId = getEntryId(context.url);
|
||||
|
||||
release.date = query.date(['.update_date', '//div[./span[contains(text(), "Date")]]'], ['MM/DD/YYYY', 'YYYY-MM-DD']);
|
||||
release.title = query.content('.scene-title, .title_bar_hilite, .movie_title');
|
||||
release.description = query.content('.scene-desc, .update_description') || query.text('//div[./span[contains(text(), "Description")]]');
|
||||
|
||||
release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a, .player-scene-description .update_models a').map((actorEl) => ({
|
||||
release.date = query.date(['.meta-item:nth-child(2) .val, .update_date', '//div[./span[contains(text(), "Date")]]'], ['MM/DD/YYYY', 'YYYY-MM-DD', 'MMMM D, YYYY']);
|
||||
|
||||
release.actors = query.all('.meta-item .update_models a, .backgroundcolor_info > .update_models a, .item .update_models a, .player-scene-description .update_models a').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
url: unprint.query.url(actorEl, null),
|
||||
}));
|
||||
|
||||
release.tags = query.contents('.update_tags a, .player-scene-description a[href*="/categories"]');
|
||||
release.tags = query.contents('.scene-cats a, .update_tags a, .player-scene-description a[href*="/categories"]');
|
||||
release.director = release.tags?.find((tag) => ['mike john', 'van styles'].includes(tag?.trim().toLowerCase()));
|
||||
|
||||
const posterPath = query.poster('#video-player', { forceGetAttribute: true }) // without getAttribute, missing poster is returned as page URL
|
||||
@@ -245,7 +242,7 @@ async function scrapeScene({ html, query }, context) {
|
||||
// #images img selects a list of images that is present on every page; the JJ website removes the ones that failed to load with JS (lol)
|
||||
release.photos = [
|
||||
...context.baseRelease?.photos?.map((sources) => sources.at(-1).src) || [],
|
||||
...query.imgs('#images img'),
|
||||
...query.imgs('.tp-photos-strip img, #images img'),
|
||||
...query.imgs('img.update_thumb', { attribute: 'src0_1x' }),
|
||||
].filter(Boolean).map((source) => Array.from(new Set([
|
||||
source.replace(/.jpg$/, '-full.jpg'),
|
||||
@@ -270,9 +267,7 @@ async function scrapeScene({ html, query }, context) {
|
||||
|
||||
release.stars = query.number('.avg_rating');
|
||||
|
||||
release.entryId = context.entity.parameters?.entryIdFromTitle
|
||||
? getEntryIdFromTitle(release)
|
||||
: getEntryId(html);
|
||||
release.qualities = query.contents('.res-item .res-lbl').map((resolution) => Number(resolution.match(/\d+$/)?.[0])).filter(Boolean);
|
||||
|
||||
return release;
|
||||
}
|
||||
@@ -296,7 +291,7 @@ function scrapeMovie({ query }, { url }) {
|
||||
scene.date = unprint.query.date(sceneEl, '//span[contains(@class, "dvd-scene-description") and span[contains(text(), "Date")]]', 'MM/DD/YYYY');
|
||||
scene.actors = unprint.query.contents(sceneEl, '.update_models a');
|
||||
|
||||
scene.entryId = getEntryIdFromTitle(scene);
|
||||
scene.entryId = getEntryId(scene.url);
|
||||
|
||||
return scene;
|
||||
});
|
||||
@@ -339,48 +334,34 @@ function scrapeProfile({ query }, url, entity) {
|
||||
verifyType: 'image',
|
||||
}));
|
||||
|
||||
profile.scenes = scrapeAll(unprint.initAll(query.all('.grid-item')), entity, true);
|
||||
profile.scenes = scrapeAll(unprint.initAll(query.all('.mbp-scenes-grid .jj-content-card, .grid-item')), entity, true);
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
/**
 * Fetch and scrape an actor profile, trying the actor's known URL first and
 * then two URLs guessed from the actor's name.
 *
 * @param {{ name: string, url?: string }} actor - Actor name and, if known,
 *   profile URL.
 * @param {object} entity - Channel/network entity; `entity.parameters.profile`
 *   overrides the default `${entity.url}/trial/models` profile base URL.
 * @returns {Promise<*>} The scraped profile when the response is OK, otherwise
 *   the response status.
 */
async function fetchProfile({ name: actorName, url: actorUrl }, entity) {
	const actorSlugA = slugify(actorName, '');
	const actorSlugB = slugify(actorName, '-');

	const profileBase = entity.parameters?.profile || `${entity.url}/trial/models`;

	const urls = [
		actorUrl, // may be undefined; NOTE(review): assumes tryUrls skips empty entries — confirm
		`${profileBase}/${actorSlugA}.html`,
		`${profileBase}/${actorSlugB}.html`,
	];

	// NOTE(review): presumably tryUrls requests the candidates in order and
	// resolves with the first usable response plus the URL that produced it —
	// confirm against ../utils/try-urls.
	const { res, url } = await tryUrls(urls);

	if (res.ok) {
		return scrapeProfile(res.context, url, entity);
	}

	return res.status;
}
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile,
|
||||
fetchUpcoming,
|
||||
// fetchUpcoming,
|
||||
scrapeScene,
|
||||
scrapeMovie,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user