Fixed and refactored Dorcel scraper.

This commit is contained in:
DebaucheryLibrarian 2025-03-05 02:48:43 +01:00
parent bce340e3c2
commit 42b5c0c150
5 changed files with 191 additions and 140 deletions

8
package-lock.json generated
View File

@ -89,7 +89,7 @@
"tunnel": "0.0.6",
"ua-parser-js": "^1.0.37",
"undici": "^5.28.1",
"unprint": "^0.15.0",
"unprint": "^0.15.5",
"url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3",
"video.js": "^8.6.1",
@ -18312,9 +18312,9 @@
}
},
"node_modules/unprint": {
"version": "0.15.0",
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.0.tgz",
"integrity": "sha512-F/nfsSAPoQFfZCYGsxOxaNX05jfzQTP/lLo3BUeOPotp9RaRfcI6ylf6ts6GqFoMAD1Y6I7M31MiriDc+SgNDQ==",
"version": "0.15.5",
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.5.tgz",
"integrity": "sha512-Zc3aZeQ26zvrOdvJ4RjuHdVHD8JsDfqMR626JtQWpsymljq6mWMgSQh6rdMBXLYfv3eGPzQdbo0NPnu5KAerRA==",
"dependencies": {
"axios": "^0.27.2",
"bottleneck": "^2.19.5",

View File

@ -148,7 +148,7 @@
"tunnel": "0.0.6",
"ua-parser-js": "^1.0.37",
"undici": "^5.28.1",
"unprint": "^0.15.0",
"unprint": "^0.15.5",
"url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3",
"video.js": "^8.6.1",

View File

@ -11548,7 +11548,7 @@ const sites = [
},
{
slug: 'creamher',
name: 'Goth Girlfriends',
name: 'Cream Her',
url: 'https://www.creamher.com',
parent: 'spizoo',
},

View File

@ -1,232 +1,278 @@
'use strict';
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const unprint = require('unprint');
const cookie = require('cookie');
/**
 * Flatten a collection of image source URLs and order them from the largest
 * detected dimension to the smallest.
 *
 * The dimension is read from the URL itself with the pattern
 * `(\d{3,4})?_(\d{3,4})`; the second capture is the sort key. URLs without a
 * recognisable dimension are kept, but sorted last.
 *
 * @param {Array<string|string[]>|null|undefined} sources - source URLs, optionally nested per source element
 * @returns {string[]|null} URLs ordered best-first, or null when no usable URL is present
 *   (null rather than an empty array, so callers can fall back with `||`)
 */
function extractSources(sources) {
  // nested arrays, nulls and empty strings may be present; only keep usable URL strings
  const urls = (Array.isArray(sources) ? sources : [])
    .flat(Infinity)
    .filter((src) => typeof src === 'string' && src.length > 0);

  if (urls.length === 0) {
    return null;
  }

  return urls
    .map((src) => {
      const [width, height] = src.match(/(\d{3,4})?_(\d{3,4})/)?.slice(1) || [];

      return {
        src,
        // missing captures become 0 so the comparator never yields NaN
        width: Number(width) || 0,
        height: Number(height) || 0,
      };
    })
    // the array is a fresh copy produced by map(), so sorting in place is safe
    .sort((sourceA, sourceB) => sourceB.height - sourceA.height)
    .map(({ src }) => src);
}
/**
 * Build a release object for every scene tile.
 *
 * @param {Array<{query: object}>} scenes - query contexts, one per scene tile
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object[]} one release per tile with url, entryId, title, actors, poster and teaser
 */
function scrapeAll(scenes, channel) {
  return scenes.map(({ query }) => {
    const release = {};

    release.url = query.url('.title', { origin: channel.url });

    // a tile without a link has no ID to read; guard so one bad tile cannot throw for the whole page
    release.entryId = release.url
      ? new URL(release.url).pathname.match(/\/scene\/(\d+)/)?.[1]
      : undefined;

    release.title = query.content('.title');

    release.actors = query.all('.actors a').map((actorEl) => ({
      name: unprint.query.content(actorEl),
      url: unprint.query.url(actorEl, null, { origin: channel.url }),
    }));

    // prefer the <source> candidates, fall back to the plain thumbnail image
    release.poster = extractSources(query.sourceSets('.thumb source', 'data-srcset')) || query.img('.thumb img');

    release.teaser = [
      query.video('.thumb-ratio', { attribute: 'data-hq-preview' }),
      query.video('.thumb-ratio', { attribute: 'data-preview' }),
    ];

    return release;
  });
}
/**
 * Request the English news page to obtain a session cookie.
 *
 * @param {{url: string}} channel - channel whose URL is requested
 * @returns {Promise<string|null>} `dorcelclub=<value>` for use as a Cookie header,
 *   or null when the response did not set a `dorcelclub` cookie
 */
async function beforeFetchLatest(channel) {
  // scene page only seems to accept language preferences from session
  const { res } = await unprint.get(`${channel.url}/en/news-videos-x-marc-dorcel`, {
    headers: {
      'X-Requested-With': 'XMLHttpRequest',
      'Accept-Language': 'en-US,en', // fetch English rather than French titles
    },
  });

  // the header may be absent, a single string or a list; the session cookie is not guaranteed to come first
  const setCookieHeaders = [res?.headers?.['set-cookie'] ?? []].flat();

  for (const header of setCookieHeaders) {
    const sessionCookie = cookie.parse(header)?.dorcelclub;

    if (sessionCookie) {
      return `dorcelclub=${sessionCookie}`;
    }
  }

  return null;
}
/**
 * Fetch one page of the newest scenes and scrape the returned tiles.
 *
 * @param {{url: string}} channel - channel to fetch from
 * @param {number} [page=1] - page number placed in the query string
 * @param {object} [_options] - unused
 * @param {{beforeFetchLatest?: string}} [preData] - pre-fetch data; `beforeFetchLatest` is sent as the Cookie header when present
 * @returns {Promise<object[]|number>} scraped releases, or the response status when the request was not ok
 */
async function fetchLatest(channel, page = 1, _options, { beforeFetchLatest: sessionCookie } = {}) {
  const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;

  const headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Accept-Language': 'en-US,en', // fetch English rather than French titles
  };

  // only send a Cookie header when a session value is actually available
  if (sessionCookie) {
    headers.Cookie = sessionCookie;
  }

  const res = await unprint.post(url, null, {
    selectAll: '.scene',
    headers,
  });

  if (res.ok) {
    return scrapeAll(res.context, channel);
  }

  return res.status;
}
/**
 * Build a release object from a scene page.
 *
 * @param {{query: object}} context - query context of the scene page
 * @param {string} url - absolute scene URL; the numeric ID after `/scene/` becomes the entry ID
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object} release with entryId, title, description, date, duration, actors, director,
 *   poster and, when the page links a movie, a `movie` stub
 */
function scrapeScene({ query }, url, channel) {
  const release = {};

  release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)?.[1];

  release.title = query.content('h1.title');
  release.description = query.content('.content-description .full p');

  const publishDate = query.date('.publish_date', 'MMM DD, YYYY');

  release.date = publishDate || query.date('.out_date', 'YYYY', { match: /\d{4}/ });

  // without a full publish date, the only source left is the year-only out date
  if (!publishDate) {
    release.datePrecision = 'year';
  }

  release.duration = query.duration('.duration');

  release.actors = query.all('.actress a').map((actorEl) => ({
    name: unprint.query.content(actorEl),
    url: unprint.query.url(actorEl, null, { origin: channel.url }),
  }));

  // text is split on the colon and the part after it is kept
  release.director = query.content('.director')?.split(/\s*:\s*/)[1];

  // prefer the <source> candidates, fall back to the plain player image
  release.poster = extractSources(query.sourceSets('.player source', 'data-srcset')) || query.img('.player img');

  const movieUrl = query.url('.movie a', { origin: channel.url });

  if (movieUrl) {
    release.movie = {
      entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
      title: query.content('.movie a'),
      url: movieUrl,
    };
  }

  return release;
}
/**
 * Request a scene page and hand its context to scrapeScene.
 *
 * @param {string} url - scene URL to request
 * @param {{url: string}} channel - channel; its URL is used to build the Referer header
 * @returns {Promise<object|number>} the scraped release, or the response status when the request was not ok
 */
async function fetchScene(url, channel) {
  const requestOptions = {
    headers: {
      'Accept-Language': 'en-US,en', // fetch English rather than French titles
      Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
    },
  };

  const response = await unprint.get(url, requestOptions);

  if (!response.ok) {
    return response.status;
  }

  return scrapeScene(response.context, url, channel);
}
/**
 * Build a release object for every movie tile.
 *
 * @param {Array<{query: object}>} movies - query contexts, one per movie tile
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object[]} one release per tile with url, entryId, title and covers
 */
function scrapeMovies(movies, channel) {
  return movies.map(({ query }) => {
    const release = {};

    release.url = query.url(null, { origin: channel.url })?.replace('/film-x', '/en/porn-movie'); // French -> English fallback in case language headers didn't work

    // the URL is optional above, so it must be guarded here too or new URL() throws for the whole page
    release.entryId = release.url
      ? new URL(release.url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1]
      : undefined;

    release.title = query.content('h2');

    // prefer the <source> candidates, fall back to the plain image; never emit a [null] cover list
    release.covers = [
      extractSources(query.sourceSets('.thumb-ratio source', 'data-srcset')) || query.img('.thumb-ratio img'),
    ].filter(Boolean);

    return release;
  });
}
function scrapeMovie({ query, el }, url, channel) {
/**
 * Fetch one page of the newest movies and scrape the returned tiles.
 *
 * @param {{url: string}} channel - channel to fetch from
 * @param {number} [page=1] - page number placed in the query string
 * @param {{beforeFetchLatest?: string}} [context] - scrape context; `beforeFetchLatest` is sent as the Cookie header when present
 * @returns {Promise<object[]|number>} scraped movies, or the response status when the request failed or selected nothing
 */
async function fetchMovies(channel, page = 1, { beforeFetchLatest: sessionCookie } = {}) {
  const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`;

  // NOTE(review): the Referer is hard-coded to dorcelclub.com instead of being derived from channel.url — confirm this is intended
  const headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Accept-Language': 'en-US,en', // fetch English rather than French titles
    Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting
  };

  // seems necessary for English results; only send it when a session value is actually available
  if (sessionCookie) {
    headers.Cookie = sessionCookie;
  }

  const res = await unprint.post(url, null, {
    selectAll: '.items .movie',
    headers,
  });

  if (res.ok && res.context) {
    return scrapeMovies(res.context, channel);
  }

  return res.status;
}
/**
 * Build a release object from a movie page, including the scenes listed on it.
 *
 * @param {{query: object}} context - query context of the movie page
 * @param {string} url - absolute movie URL; the slug after `/porn-movie/` becomes the entry ID
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object} release with title, description, entryId, year-precision date, duration,
 *   actors, poster, covers and scenes
 */
function scrapeMovie({ query }, url, channel) {
  const release = {};

  release.title = query.content('.header h1');
  release.description = query.content('.content-text p');

  release.entryId = new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1];

  // only a year is read from the page
  release.date = query.date('.out_date', 'YYYY', { match: /\d{4}/ });
  release.datePrecision = 'year';

  release.duration = query.duration('.duration');

  release.actors = query.all('.actors .actor').map((actorEl) => ({
    name: unprint.query.content(actorEl, '.name'),
    url: unprint.query.url(actorEl, 'a', { origin: channel.url }),
    avatar: extractSources(unprint.query.sourceSets(actorEl, '.thumbnail source', 'data-srcset')) || unprint.query.img(actorEl, '.thumbnail img'),
  }));

  // sources are siblings of the classed <img> inside its <picture>, hence the XPath selectors
  release.poster = extractSources(query.sourceSets('//picture[img[contains(@class, \'banner\')]]//source', 'data-srcset')) || query.img('img.banner');

  // never emit a [null] cover list when the page has no cover
  release.covers = [
    extractSources(query.sourceSets('//picture[img[contains(@class, \'cover\')]]//source', 'data-srcset')) || query.img('img.cover'),
  ].filter(Boolean);

  release.scenes = scrapeAll(unprint.initAll(query.all('.scene')), channel);

  return release;
}
async function scrapeProfile({ query, el }, entity, avatar) {
/**
 * Request a movie page, select its `.content` element and hand the context to scrapeMovie.
 *
 * @param {string} url - movie URL to request
 * @param {{url: string}} channel - channel; its URL is used to build the Referer header
 * @returns {Promise<object|number>} the scraped movie, or the response status when the
 *   request was not ok or produced no context
 */
async function fetchMovie(url, channel) {
  const requestOptions = {
    select: '.content',
    headers: {
      'Accept-Language': 'en-US,en', // fetch English rather than French titles
      Referer: `${channel.url}/en/porn-movie`,
    },
  };

  const response = await unprint.get(url, requestOptions);

  if (!response.ok || !response.context) {
    return response.status;
  }

  return scrapeMovie(response.context, url, channel);
}
/**
 * Build an actor profile from an actor page, including the scenes listed on it.
 *
 * @param {{query: object}} context - query context of the actor page
 * @param {{url: string}} entity - entity passed on to scrapeAll as the channel for the listed scenes
 * @returns {Promise<object>} profile with description, nationality, banner, avatar and releases
 */
async function scrapeProfile({ query }, entity) {
  const profile = {};

  profile.description = query.content('.content-description .content-text > p, .content-description .full p'); // different structure for overflowing vs short text
  profile.nationality = query.content('.nationality');

  profile.banner = query.img('.header img:not([src*="actor/banner"])'); // ignore stock banner

  // the fallback must look for the same "actorsquare" path fragment as the source selector
  profile.avatar = extractSources(query.sourceSets('.banner source[data-srcset*="actorsquare"]', 'data-srcset'))
    || query.img('.banner img[src*="actorsquare"]'); // usually banner, but worth trying

  profile.releases = scrapeAll(unprint.initAll(query.all('.scene')), entity);

  return profile;
}
async function beforeFetchLatest(channel) {
// scene page only seems to accept language preferences from session
const session = qu.session();
await qu.getAll(`${channel.url}/en/news-videos-x-marc-dorcel`, '.scene', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
}, { session });
return session;
}
async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) {
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
const res = await qu.getAll(url, '.scene', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
}, { session });
if (res.ok) {
return scrapeAll(res.items, channel);
async function getActorUrl(baseActor, entity) {
if (baseActor.url) {
return baseActor.url;
}
return res.status;
}
async function fetchMovies(channel, page = 1) {
const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`;
const res = await qu.getAll(url, '.movie', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting
});
if (res.ok && res.items) {
return scrapeMovies(res.items, channel);
}
return res.status;
}
async function fetchScene(url, channel) {
const res = await qu.get(url, null, {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
});
if (res.ok) {
return scrapeScene(res.item, url, channel);
}
return res.status;
}
async function fetchMovie(url, channel) {
const res = await qu.get(url, '.content', {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
Referer: `${channel.url}/en/porn-movie`,
});
if (res.ok && res.item) {
return scrapeMovie(res.item, url, channel);
}
return res.status;
}
async function fetchProfile(baseActor, { entity }) {
// URL slugs are unpredictable: /jessie-volt, /aleska_diamond, /liza-del_sierra
const searchRes = await qu.postAll(`${entity.url}/en/search`, { s: baseActor.name }, '.actors .actor', { 'Accept-Language': 'en-US,en' });
// AJAX API at /search/ajax/display doesn't actually return results unless an actor ID is passed
const searchRes = await unprint.post(`${entity.url}/en/search`, new URLSearchParams({ s: baseActor.name }), {
selectAll: '#search .actor',
headers: {
// 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
'Accept-Language': 'en-US,en',
},
});
if (!searchRes.ok) {
return searchRes.status;
}
const actorItem = searchRes.items.find(({ query }) => slugify(query.cnt('.name')) === baseActor.slug);
const actorItem = searchRes.context.find(({ query }) => query.content('.name') === baseActor.name);
if (!actorItem) {
return null;
}
const actorUrl = actorItem.query.url('a', 'href', { origin: entity.url });
const actorAvatar = actorItem.query.img();
return actorItem.query.url('a', { origin: entity.url });
}
const actorRes = await qu.get(actorUrl, null, { 'Accept-Language': 'en-US,en' });
async function fetchProfile(baseActor, { entity }) {
const actorUrl = await getActorUrl(baseActor, entity);
if (!actorUrl) {
return null;
}
const actorRes = await unprint.get(actorUrl, {
headers: {
'Accept-Language': 'en-US,en',
},
});
if (actorRes.ok) {
return scrapeProfile(actorRes.item, entity, actorAvatar);
return scrapeProfile(actorRes.context, entity);
}
return null;

View File

@ -198,14 +198,19 @@ async function scrapeUpcomingReleases(scraper, entity, preData) {
return emptyReleases;
}
async function scrapeMovies(scraper, entity) {
async function scrapeMovies(scraper, entity, preData) {
if (!argv.movies || !scraper.fetchMovies) {
return [];
}
try {
const context = {
...preData,
include,
parameters: getRecursiveParameters(entity),
};
// return await scrapeReleases(scraper, entity, preData, true);
return await scraper.fetchMovies(entity);
return await scraper.fetchMovies(entity, 1, context); // TODO: implement pagination
} catch (error) {
logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
}