Fixed and refactored Dorcel scraper.
This commit is contained in:
parent
bce340e3c2
commit
42b5c0c150
|
@ -89,7 +89,7 @@
|
|||
"tunnel": "0.0.6",
|
||||
"ua-parser-js": "^1.0.37",
|
||||
"undici": "^5.28.1",
|
||||
"unprint": "^0.15.0",
|
||||
"unprint": "^0.15.5",
|
||||
"url-pattern": "^1.0.3",
|
||||
"v-tooltip": "^2.1.3",
|
||||
"video.js": "^8.6.1",
|
||||
|
@ -18312,9 +18312,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/unprint": {
|
||||
"version": "0.15.0",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.0.tgz",
|
||||
"integrity": "sha512-F/nfsSAPoQFfZCYGsxOxaNX05jfzQTP/lLo3BUeOPotp9RaRfcI6ylf6ts6GqFoMAD1Y6I7M31MiriDc+SgNDQ==",
|
||||
"version": "0.15.5",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.5.tgz",
|
||||
"integrity": "sha512-Zc3aZeQ26zvrOdvJ4RjuHdVHD8JsDfqMR626JtQWpsymljq6mWMgSQh6rdMBXLYfv3eGPzQdbo0NPnu5KAerRA==",
|
||||
"dependencies": {
|
||||
"axios": "^0.27.2",
|
||||
"bottleneck": "^2.19.5",
|
||||
|
|
|
@ -148,7 +148,7 @@
|
|||
"tunnel": "0.0.6",
|
||||
"ua-parser-js": "^1.0.37",
|
||||
"undici": "^5.28.1",
|
||||
"unprint": "^0.15.0",
|
||||
"unprint": "^0.15.5",
|
||||
"url-pattern": "^1.0.3",
|
||||
"v-tooltip": "^2.1.3",
|
||||
"video.js": "^8.6.1",
|
||||
|
|
|
@ -11548,7 +11548,7 @@ const sites = [
|
|||
},
|
||||
{
|
||||
slug: 'creamher',
|
||||
name: 'Goth Girlfriends',
|
||||
name: 'Cream Her',
|
||||
url: 'https://www.creamher.com',
|
||||
parent: 'spizoo',
|
||||
},
|
||||
|
|
|
@ -1,232 +1,278 @@
|
|||
'use strict';
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
const slugify = require('../utils/slugify');
|
||||
const unprint = require('unprint');
|
||||
const cookie = require('cookie');
|
||||
|
||||
/**
 * Flattens a (possibly nested) list of source set URLs and orders it from the
 * largest to the smallest size hint found in each URL.
 *
 * The size hint is the 3-4 digit number directly following an underscore in
 * the URL; it is treated as the image height. URLs without such a number are
 * kept, but sorted last, so they can still serve as a fallback.
 *
 * @param {Array<string|string[]>|null|undefined} sources - source URLs, optionally nested one level deep
 * @returns {string[]|null} URLs sorted by descending size hint, or null when there is no usable URL,
 *   so callers can chain a fallback with `||`
 */
function extractSources(sources) {
	if (!Array.isArray(sources)) {
		return null;
	}

	const ranked = sources
		.flat()
		// drop empty slots and non-string entries instead of crashing on src.match()
		.filter((src) => typeof src === 'string' && src.length > 0)
		.map((src) => {
			const [, height] = src.match(/(\d{3,4})?_(\d{3,4})/)?.slice(1) || [];

			return {
				src,
				// a missing size hint must not yield NaN, as that makes the comparator inconsistent
				height: Number(height) || 0,
			};
		})
		.sort((sourceA, sourceB) => sourceB.height - sourceA.height);

	if (ranked.length === 0) {
		// an empty array is truthy and would suppress the caller's fallback image
		return null;
	}

	return ranked.map(({ src }) => src);
}
|
||||
|
||||
/**
 * Builds a release object for every scene tile in a listing.
 *
 * @param {Array<{query: object}>} scenes - initialized scene contexts exposing a `query` helper
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object[]} one release per scene, with url, entryId, title, actors, poster and teaser
 */
function scrapeAll(scenes, channel) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('.title', { origin: channel.url });

		// a tile without a title link must not abort the whole page, new URL() throws on an empty value
		release.entryId = release.url
			? new URL(release.url).pathname.match(/\/scene\/(\d+)/)?.[1]
			: null;

		release.title = query.content('.title');

		release.actors = query.all('.actors a').map((actorEl) => ({
			name: unprint.query.content(actorEl),
			url: unprint.query.url(actorEl, null, { origin: channel.url }),
		}));

		// prefer the sorted <source> set, fall back to the plain thumbnail
		release.poster = extractSources(query.sourceSets('.thumb source', 'data-srcset')) || query.img('.thumb img');

		// high quality preview first
		release.teaser = [
			query.video('.thumb-ratio', { attribute: 'data-hq-preview' }),
			query.video('.thumb-ratio', { attribute: 'data-preview' }),
		];

		return release;
	});
}
|
||||
|
||||
/**
 * Requests the English news page once and returns the session cookie it sets,
 * formatted as a Cookie header value.
 *
 * @param {{url: string}} channel - channel whose URL is the base of the request
 * @returns {Promise<string>} `dorcelclub=<value>` built from the first Set-Cookie header
 */
async function beforeFetchLatest(channel) {
	// the scene page only seems to accept language preferences from the session
	const landingUrl = `${channel.url}/en/news-videos-x-marc-dorcel`;

	const requestHeaders = {
		'X-Requested-With': 'XMLHttpRequest',
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
	};

	const response = await unprint.get(landingUrl, { headers: requestHeaders });
	const parsedCookies = cookie.parse(response.res.headers['set-cookie'][0]);

	return `dorcelclub=${parsedCookies?.dorcelclub}`;
}
|
||||
|
||||
/**
 * Fetches one page of the latest scenes and scrapes every `.scene` tile.
 *
 * @param {{url: string}} channel - channel to fetch scenes for
 * @param {number} [page=1] - listing page number
 * @param {object} _options - unused
 * @param {{beforeFetchLatest: string}} context - carries the Cookie header value under `beforeFetchLatest`
 *   (presumably the return value of the beforeFetchLatest hook; confirm against the caller)
 * @returns {Promise<object[]|number>} scraped releases, or the response status when the request failed
 */
async function fetchLatest(channel, page = 1, _options, { beforeFetchLatest: sessionCookie }) {
	const listUrl = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;

	const requestHeaders = {
		'X-Requested-With': 'XMLHttpRequest',
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
		Cookie: sessionCookie,
	};

	const res = await unprint.post(listUrl, null, {
		selectAll: '.scene',
		headers: requestHeaders,
	});

	if (!res.ok) {
		return res.status;
	}

	return scrapeAll(res.context, channel);
}
|
||||
|
||||
/**
 * Scrapes a single scene page into a release object.
 *
 * @param {{query: object}} context - initialized page context exposing a `query` helper
 * @param {string} url - URL of the scene page, the entry ID is taken from its path
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object} release with entryId, title, description, date, duration, actors,
 *   director, poster and, when the scene links to one, its movie
 */
function scrapeScene({ query }, url, channel) {
	const release = {};

	release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)?.[1];

	release.title = query.content('h1.title');
	release.description = query.content('.content-description .full p');

	const publishDate = query.date('.publish_date', 'MMM DD, YYYY');

	if (publishDate) {
		release.date = publishDate;
	} else {
		// mark the precision based on the date actually used, so an unparseable
		// publish date is not stored as a full-precision date
		release.date = query.date('.out_date', 'YYYY', { match: /\d{4}/ });
		release.datePrecision = 'year';
	}

	release.duration = query.duration('.duration');

	release.actors = query.all('.actress a').map((actorEl) => ({
		name: unprint.query.content(actorEl),
		url: unprint.query.url(actorEl, null, { origin: channel.url }),
	}));

	// the text is split on the colon and the part after it is kept
	release.director = query.content('.director')?.split(/\s*:\s*/)[1];

	// prefer the sorted <source> set, fall back to the plain player image
	release.poster = extractSources(query.sourceSets('.player source', 'data-srcset')) || query.img('.player img');

	const movieUrl = query.url('.movie a', { origin: channel.url });

	if (movieUrl) {
		release.movie = {
			entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
			title: query.content('.movie a'),
			url: movieUrl, // reuse the resolved URL rather than querying the page a second time
		};
	}

	return release;
}
|
||||
|
||||
/**
 * Fetches a scene page and scrapes it.
 *
 * @param {string} url - scene page URL
 * @param {{url: string}} channel - channel the scene belongs to, also used to build the Referer
 * @returns {Promise<object|number>} scraped release, or the response status when the request failed
 */
async function fetchScene(url, channel) {
	const requestHeaders = {
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
		Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
	};

	const res = await unprint.get(url, { headers: requestHeaders });

	if (!res.ok) {
		return res.status;
	}

	return scrapeScene(res.context, url, channel);
}
|
||||
|
||||
/**
 * Builds a release object for every movie tile in a listing.
 *
 * @param {Array<{query: object}>} movies - initialized movie contexts exposing a `query` helper
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object[]} one release per movie, with url, entryId, title and covers
 */
function scrapeMovies(movies, channel) {
	return movies.map(({ query }) => {
		const release = {};

		const rawUrl = query.url(null, { origin: channel.url });

		// French -> English fallback in case language headers didn't work
		release.url = rawUrl ? rawUrl.replace('/film-x', '/en/porn-movie') : null;

		// a tile without a link must not abort the whole page, new URL() throws on an empty value
		release.entryId = release.url
			? new URL(release.url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1]
			: null;

		release.title = query.content('h2');

		// prefer the sorted <source> set, fall back to the plain image; omit the entry when neither exists
		release.covers = [
			extractSources(query.sourceSets('.thumb-ratio source', 'data-srcset')) || query.img('.thumb-ratio img'),
		].filter(Boolean);

		return release;
	});
}
|
||||
|
||||
function scrapeMovie({ query, el }, url, channel) {
|
||||
/**
 * Fetches one page of the newest movies and scrapes every `.items .movie` tile.
 *
 * @param {{url: string}} channel - channel to fetch movies for
 * @param {number} [page=1] - listing page number
 * @param {{beforeFetchLatest: string}} context - carries the Cookie header value under `beforeFetchLatest`
 *   (presumably the return value of the beforeFetchLatest hook; confirm against the caller)
 * @returns {Promise<object[]|number>} scraped movies, or the response status when the request
 *   failed or yielded no context
 */
async function fetchMovies(channel, page = 1, { beforeFetchLatest: sessionCookie }) {
	const listUrl = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`;

	const requestHeaders = {
		'X-Requested-With': 'XMLHttpRequest',
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
		Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting
		Cookie: sessionCookie, // seems necessary for English results
	};

	const res = await unprint.post(listUrl, null, {
		selectAll: '.items .movie',
		headers: requestHeaders,
	});

	if (!res.ok || !res.context) {
		return res.status;
	}

	return scrapeMovies(res.context, channel);
}
|
||||
|
||||
/**
 * Scrapes a single movie page into a release object.
 *
 * @param {{query: object}} context - initialized page context exposing a `query` helper
 * @param {string} url - URL of the movie page, the entry ID is taken from its path
 * @param {{url: string}} channel - channel whose URL is used as origin for relative links
 * @returns {object} release with title, description, entryId, year-precision date, duration,
 *   actors, poster, covers and the scenes listed on the page
 */
function scrapeMovie({ query }, url, channel) {
	const release = {};

	release.title = query.content('.header h1');
	release.description = query.content('.content-text p');

	release.entryId = new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1];

	// only a four-digit year is matched, hence the explicit precision
	release.date = query.date('.out_date', 'YYYY', { match: /\d{4}/ });
	release.datePrecision = 'year';

	release.duration = query.duration('.duration');

	release.actors = query.all('.actors .actor').map((actorEl) => ({
		name: unprint.query.content(actorEl, '.name'),
		url: unprint.query.url(actorEl, 'a', { origin: channel.url }),
		avatar: extractSources(unprint.query.sourceSets(actorEl, '.thumbnail source', 'data-srcset')) || unprint.query.img(actorEl, '.thumbnail img'),
	}));

	// the <source> elements are siblings of the image inside <picture>, so they are selected through the parent
	release.poster = extractSources(query.sourceSets('//picture[img[contains(@class, \'banner\')]]//source', 'data-srcset')) || query.img('img.banner');

	// omit the entry when neither a source set nor a plain cover image exists
	release.covers = [
		extractSources(query.sourceSets('//picture[img[contains(@class, \'cover\')]]//source', 'data-srcset')) || query.img('img.cover'),
	].filter(Boolean);

	release.scenes = scrapeAll(unprint.initAll(query.all('.scene')), channel);

	return release;
}
|
||||
|
||||
async function scrapeProfile({ query, el }, entity, avatar) {
|
||||
/**
 * Fetches a movie page, narrowed to its `.content` element, and scrapes it.
 *
 * @param {string} url - movie page URL
 * @param {{url: string}} channel - channel the movie belongs to, also used to build the Referer
 * @returns {Promise<object|number>} scraped movie, or the response status when the request
 *   failed or yielded no context
 */
async function fetchMovie(url, channel) {
	const requestHeaders = {
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
		Referer: `${channel.url}/en/porn-movie`,
	};

	const res = await unprint.get(url, {
		select: '.content',
		headers: requestHeaders,
	});

	if (!res.ok || !res.context) {
		return res.status;
	}

	return scrapeMovie(res.context, url, channel);
}
|
||||
|
||||
/**
 * Scrapes an actor page into a profile object.
 *
 * @param {{query: object}} context - initialized page context exposing a `query` helper
 * @param {{url: string}} entity - entity passed on to scrapeAll as the channel for the listed scenes
 * @returns {Promise<object>} profile with description, nationality, banner, avatar and releases
 */
async function scrapeProfile({ query }, entity) {
	const profile = {};

	// different structure for overflowing vs short text
	profile.description = query.content('.content-description .content-text > p, .content-description .full p');
	profile.nationality = query.content('.nationality');

	profile.banner = query.img('.header img:not([src*="actor/banner"])'); // ignore stock banner

	// usually banner, but worth trying; the fallback selector must use the same
	// 'actorsquare' marker as the source set selector, or it can never match
	profile.avatar = extractSources(query.sourceSets('.banner source[data-srcset*="actorsquare"]', 'data-srcset'))
		|| query.img('.banner img[src*="actorsquare"]');

	profile.releases = scrapeAll(unprint.initAll(query.all('.scene')), entity);

	return profile;
}
|
||||
|
||||
async function beforeFetchLatest(channel) {
|
||||
// scene page only seems to accept language preferences from session
|
||||
const session = qu.session();
|
||||
|
||||
await qu.getAll(`${channel.url}/en/news-videos-x-marc-dorcel`, '.scene', {
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||
}, { session });
|
||||
|
||||
return session;
|
||||
}
|
||||
|
||||
async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) {
|
||||
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
|
||||
|
||||
const res = await qu.getAll(url, '.scene', {
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||
}, { session });
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.items, channel);
|
||||
async function getActorUrl(baseActor, entity) {
|
||||
if (baseActor.url) {
|
||||
return baseActor.url;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchMovies(channel, page = 1) {
|
||||
const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`;
|
||||
|
||||
const res = await qu.getAll(url, '.movie', {
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||
Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting
|
||||
});
|
||||
|
||||
if (res.ok && res.items) {
|
||||
return scrapeMovies(res.items, channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchScene(url, channel) {
|
||||
const res = await qu.get(url, null, {
|
||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||
Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeScene(res.item, url, channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchMovie(url, channel) {
|
||||
const res = await qu.get(url, '.content', {
|
||||
'Accept-Language': 'en-US,en', // fetch English rather than French titles
|
||||
Referer: `${channel.url}/en/porn-movie`,
|
||||
});
|
||||
|
||||
if (res.ok && res.item) {
|
||||
return scrapeMovie(res.item, url, channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchProfile(baseActor, { entity }) {
|
||||
// URL slugs are unpredictable: /jessie-volt, /aleska_diamond, /liza-del_sierra
|
||||
const searchRes = await qu.postAll(`${entity.url}/en/search`, { s: baseActor.name }, '.actors .actor', { 'Accept-Language': 'en-US,en' });
|
||||
// AJAX API at /search/ajax/display doesn't actually return results unless an actor ID is passed
|
||||
const searchRes = await unprint.post(`${entity.url}/en/search`, new URLSearchParams({ s: baseActor.name }), {
|
||||
selectAll: '#search .actor',
|
||||
headers: {
|
||||
// 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
|
||||
'Accept-Language': 'en-US,en',
|
||||
},
|
||||
});
|
||||
|
||||
if (!searchRes.ok) {
|
||||
return searchRes.status;
|
||||
}
|
||||
|
||||
const actorItem = searchRes.items.find(({ query }) => slugify(query.cnt('.name')) === baseActor.slug);
|
||||
const actorItem = searchRes.context.find(({ query }) => query.content('.name') === baseActor.name);
|
||||
|
||||
if (!actorItem) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const actorUrl = actorItem.query.url('a', 'href', { origin: entity.url });
|
||||
const actorAvatar = actorItem.query.img();
|
||||
return actorItem.query.url('a', { origin: entity.url });
|
||||
}
|
||||
|
||||
const actorRes = await qu.get(actorUrl, null, { 'Accept-Language': 'en-US,en' });
|
||||
async function fetchProfile(baseActor, { entity }) {
|
||||
const actorUrl = await getActorUrl(baseActor, entity);
|
||||
|
||||
if (!actorUrl) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const actorRes = await unprint.get(actorUrl, {
|
||||
headers: {
|
||||
'Accept-Language': 'en-US,en',
|
||||
},
|
||||
});
|
||||
|
||||
if (actorRes.ok) {
|
||||
return scrapeProfile(actorRes.item, entity, actorAvatar);
|
||||
return scrapeProfile(actorRes.context, entity);
|
||||
}
|
||||
|
||||
return null;
|
||||
|
|
|
@ -198,14 +198,19 @@ async function scrapeUpcomingReleases(scraper, entity, preData) {
|
|||
return emptyReleases;
|
||||
}
|
||||
|
||||
async function scrapeMovies(scraper, entity) {
|
||||
async function scrapeMovies(scraper, entity, preData) {
|
||||
if (!argv.movies || !scraper.fetchMovies) {
|
||||
return [];
|
||||
}
|
||||
|
||||
try {
|
||||
const context = {
|
||||
...preData,
|
||||
include,
|
||||
parameters: getRecursiveParameters(entity),
|
||||
};
|
||||
// return await scrapeReleases(scraper, entity, preData, true);
|
||||
return await scraper.fetchMovies(entity);
|
||||
return await scraper.fetchMovies(entity, 1, context); // TODO: implement pagination
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue