From 42b5c0c15089c6678a5eb71fc402054559ca7de1 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 5 Mar 2025 02:48:43 +0100 Subject: [PATCH] Fixed and refactored Dorcel scraper. --- package-lock.json | 8 +- package.json | 2 +- seeds/02_sites.js | 2 +- src/scrapers/dorcel.js | 310 +++++++++++++++++++++++------------------ src/updates.js | 9 +- 5 files changed, 191 insertions(+), 140 deletions(-) diff --git a/package-lock.json b/package-lock.json index 76277b87..85f1b6ed 100644 --- a/package-lock.json +++ b/package-lock.json @@ -89,7 +89,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.15.0", + "unprint": "^0.15.5", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -18312,9 +18312,9 @@ } }, "node_modules/unprint": { - "version": "0.15.0", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.0.tgz", - "integrity": "sha512-F/nfsSAPoQFfZCYGsxOxaNX05jfzQTP/lLo3BUeOPotp9RaRfcI6ylf6ts6GqFoMAD1Y6I7M31MiriDc+SgNDQ==", + "version": "0.15.5", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.5.tgz", + "integrity": "sha512-Zc3aZeQ26zvrOdvJ4RjuHdVHD8JsDfqMR626JtQWpsymljq6mWMgSQh6rdMBXLYfv3eGPzQdbo0NPnu5KAerRA==", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", diff --git a/package.json b/package.json index 531f7267..70360454 100755 --- a/package.json +++ b/package.json @@ -148,7 +148,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.15.0", + "unprint": "^0.15.5", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 556e7bb0..df6fa2e1 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -11548,7 +11548,7 @@ const sites = [ }, { slug: 'creamher', - name: 'Goth Girlfriends', + name: 'Cream Her', url: 'https://www.creamher.com', parent: 'spizoo', }, diff --git a/src/scrapers/dorcel.js b/src/scrapers/dorcel.js index 5e849362..c726316a 100755 --- a/src/scrapers/dorcel.js +++ b/src/scrapers/dorcel.js @@ -1,232 +1,278 @@ 'use strict'; -const qu = require('../utils/qu'); -const slugify = require('../utils/slugify'); +const unprint = require('unprint'); +const cookie = require('cookie'); + +function extractSources(sources) { + if (sources?.length > 0) { + return sources + .flat() + .map((src) => { + const [width, height] = src.match(/(\d{3,4})?_(\d{3,4})/)?.slice(1) || []; + + return { + src, + width, + height, + }; + }) + .toSorted((posterA, posterB) => { + return posterB.height - posterA.height; + }) + .map(({ src }) => src); + } + + return null; +} function scrapeAll(scenes, channel) { return scenes.map(({ query }) => { const release = {}; - release.url = query.url('.title', 'href', { origin: channel.url }); + release.url = query.url('.title', { origin: channel.url }); release.entryId = new URL(release.url).pathname.match(/\/scene\/(\d+)/)?.[1]; - release.title = query.cnt('.title'); + release.title = query.content('.title'); release.actors = query.all('.actors a').map((actorEl) => ({ - name: query.cnt(actorEl), - url: query.url(actorEl, null, 'href', { origin: channel.url }), + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null, { origin: channel.url }), })); - const fallbackPoster = query.img('.thumb img'); - release.poster = query.sourceSet('.thumb img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster]; + release.poster = extractSources(query.sourceSets('.thumb source', 'data-srcset')) || query.img('.thumb img'); release.teaser = [ - query.video('.thumb-ratio', 'data-hq-preview'), - query.video('.thumb-ratio', 'data-preview'), + query.video('.thumb-ratio', { attribute: 'data-hq-preview' }), + query.video('.thumb-ratio', { attribute: 'data-preview' }), ]; return release; }); } +async function beforeFetchLatest(channel) { + // scene page only seems to accept language preferences from session + const { res } = await unprint.get(`${channel.url}/en/news-videos-x-marc-dorcel`, { + headers: { + 'X-Requested-With': 'XMLHttpRequest', + 'Accept-Language': 'en-US,en', // fetch English rather than French titles + }, + }); + + const sessionCookie = cookie.parse(res.headers['set-cookie'][0])?.dorcelclub; + + return `dorcelclub=${sessionCookie}`; +} + +async function fetchLatest(channel, page = 1, _options, { beforeFetchLatest: sessionCookie }) { + const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`; + + const res = await unprint.post(url, null, { + selectAll: '.scene', + headers: { + 'X-Requested-With': 'XMLHttpRequest', + 'Accept-Language': 'en-US,en', // fetch English rather than French titles + Cookie: sessionCookie, + }, + }); + + if (res.ok) { + return scrapeAll(res.context, channel); + } + + return res.status; +} + function scrapeScene({ query }, url, channel) { const release = {}; release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)?.[1]; - release.title = query.cnt('h1.title'); - release.description = query.cnt('.content-description .full p'); + release.title = query.content('h1.title'); + release.description = query.content('.content-description .full p'); - release.date = query.date('.publish_date', 'MMMM DD, YYYY'); - release.duration = query.dur('.duration'); + release.date = query.date('.publish_date', 'MMM DD, YYYY') || query.date('.out_date', 'YYYY', { match: /\d{4}/ }); + + if (!query.exists('.publish_date')) { + release.datePrecision = 'year'; + } + + release.duration = query.duration('.duration'); release.actors = query.all('.actress a').map((actorEl) => ({ - name: query.cnt(actorEl), - url: query.url(actorEl, null, 'href', { origin: channel.url }), + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null, { origin: channel.url }), })); - release.director = query.cnt('.director')?.split(/\s*:\s*/)[1]; + release.director = query.content('.director')?.split(/\s*:\s*/)[1]; - const fallbackPoster = query.img('.player img'); - release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster?.replace('_crop', ''), fallbackPoster]; + release.poster = extractSources(query.sourceSets('.player source', 'data-srcset')) || query.img('.player img'); - const movieUrl = query.url('.movie a', 'href', { origin: channel.url }); + const movieUrl = query.url('.movie a', { origin: channel.url }); if (movieUrl) { release.movie = { entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1], - title: query.cnt('.movie a'), - url: query.url('.movie a', 'href', { origin: channel.url }), + title: query.content('.movie a'), + url: query.url('.movie a', { origin: channel.url }), }; } return release; } +async function fetchScene(url, channel) { + const res = await unprint.get(url, { + headers: { + 'Accept-Language': 'en-US,en', // fetch English rather than French titles + Referer: `${channel.url}/en/news-videos-x-marc-dorcel`, + }, + }); + + if (res.ok) { + return scrapeScene(res.context, url, channel); + } + + return res.status; +} + function scrapeMovies(movies, channel) { return movies.map(({ query }) => { const release = {}; - release.url = query.url(null, 'href', { origin: channel.url }); + release.url = query.url(null, { origin: channel.url })?.replace('/film-x', '/en/porn-movie'); // French -> English fallback in case language headers didn't work release.entryId = new URL(release.url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1]; - release.title = query.cnt('h2'); + release.title = query.content('h2'); - release.covers = [query.sourceSet('img', 'data-srcset')]; + release.covers = [extractSources(query.sourceSets('.thumb-ratio source', 'data-srcset')) || query.img('.thumb-ratio img')]; return release; }); } -function scrapeMovie({ query, el }, url, channel) { +async function fetchMovies(channel, page = 1, { beforeFetchLatest: sessionCookie }) { + const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`; + + const res = await unprint.post(url, null, { + selectAll: '.items .movie', + headers: { + 'X-Requested-With': 'XMLHttpRequest', + 'Accept-Language': 'en-US,en', // fetch English rather than French titles + Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting + Cookie: sessionCookie, // seems necessary for English results + }, + }); + + if (res.ok && res.context) { + return scrapeMovies(res.context, channel); + } + + return res.status; +} + +function scrapeMovie({ query }, url, channel) { const release = {}; - release.title = query.cnt('.header h1'); - release.description = query.cnt('.content-text p'); + release.title = query.content('.header h1'); + release.description = query.content('.content-text p'); release.entryId = new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1]; - release.date = query.date('.out_date', 'YYYY'); + release.date = query.date('.out_date', 'YYYY', { match: /\d{4}/ }); release.datePrecision = 'year'; - release.duration = query.dur('.duration'); + release.duration = query.duration('.duration'); release.actors = query.all('.actors .actor').map((actorEl) => ({ - name: query.cnt(actorEl, '.name'), - url: query.url(actorEl, 'a', 'href', { origin: channel.url }), - avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'), + name: unprint.query.content(actorEl, '.name'), + url: unprint.query.url(actorEl, 'a', { origin: channel.url }), + avatar: extractSources(unprint.query.sourceSets(actorEl, '.thumbnail source', 'data-srcset')) || unprint.query.img(actorEl, '.thumbnail img'), })); - release.poster = query.sourceSet('.banner', 'data-src')?.[0]; - release.covers = [query.all(query.el('.cover').parentElement, 'source') - ?.map((coverEl) => query.sourceSet(coverEl, null, 'data-srcset')) - .flat() - .sort((coverA, coverB) => { - const resA = Number(coverA.match(/_(\d{3,})_/)?.[1]); - const resB = Number(coverB.match(/_(\d{3,})_/)?.[1]); + release.poster = extractSources(query.sourceSets('//picture[img[contains(@class, \'banner\')]]//source', 'data-srcset')) || query.img('img.banner'); + release.covers = [extractSources(query.sourceSets('//picture[img[contains(@class, \'cover\')]]//source', 'data-srcset')) || query.img('img.cover')]; - if (resA < resB) return 1; - if (resA > resB) return -1; - - return 0; - }) - .concat(query.sourceSet('.cover', 'data-src')?.[0])]; - - release.scenes = scrapeAll(qu.initAll(el, '.scene'), channel); + release.scenes = scrapeAll(unprint.initAll(query.all('.scene')), channel); return release; } -async function scrapeProfile({ query, el }, entity, avatar) { +async function fetchMovie(url, channel) { + const res = await unprint.get(url, { + select: '.content', + headers: { + 'Accept-Language': 'en-US,en', // fetch English rather than French titles + Referer: `${channel.url}/en/porn-movie`, + }, + }); + + if (res.ok && res.context) { + return scrapeMovie(res.context, url, channel); + } + + return res.status; +} + +async function scrapeProfile({ query }, entity) { const profile = {}; - profile.description = query.cnt('.content-description .content-text > p, .content-description .full p'); // different structure for overflowing vs short text - profile.nationality = query.cnt('.nationality'); + profile.description = query.content('.content-description .content-text > p, .content-description .full p'); // different structure for overflowing vs short text + profile.nationality = query.content('.nationality'); profile.banner = query.img('.header img:not([src*="actor/banner"])'); // ignore stock banner - if (avatar) { - profile.avatar = [ - avatar.replace('crop_', ''), - avatar, - ]; - } + profile.avatar = extractSources(query.sourceSets('.banner source[data-srcset*="actorsquare"]', 'data-srcset')) + || query.img('.banner img[src*="actorsqure"]'); // usually banner, but worth trying - profile.releases = scrapeAll(qu.initAll(el, '.scene'), entity); + profile.releases = scrapeAll(unprint.initAll(query.all('.scene')), entity); return profile; } -async function beforeFetchLatest(channel) { - // scene page only seems to accept language preferences from session - const session = qu.session(); - - await qu.getAll(`${channel.url}/en/news-videos-x-marc-dorcel`, '.scene', { - 'X-Requested-With': 'XMLHttpRequest', - 'Accept-Language': 'en-US,en', // fetch English rather than French titles - }, { session }); - - return session; -} - -async function fetchLatest(channel, page = 1, options, { beforeFetchLatest: session }) { - const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`; - - const res = await qu.getAll(url, '.scene', { - 'X-Requested-With': 'XMLHttpRequest', - 'Accept-Language': 'en-US,en', // fetch English rather than French titles - }, { session }); - - if (res.ok) { - return scrapeAll(res.items, channel); +async function getActorUrl(baseActor, entity) { + if (baseActor.url) { + return baseActor.url; } - return res.status; -} - -async function fetchMovies(channel, page = 1) { - const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`; - - const res = await qu.getAll(url, '.movie', { - 'X-Requested-With': 'XMLHttpRequest', - 'Accept-Language': 'en-US,en', // fetch English rather than French titles - Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting - }); - - if (res.ok && res.items) { - return scrapeMovies(res.items, channel); - } - - return res.status; -} - -async function fetchScene(url, channel) { - const res = await qu.get(url, null, { - 'Accept-Language': 'en-US,en', // fetch English rather than French titles - Referer: `${channel.url}/en/news-videos-x-marc-dorcel`, - }); - - if (res.ok) { - return scrapeScene(res.item, url, channel); - } - - return res.status; -} - -async function fetchMovie(url, channel) { - const res = await qu.get(url, '.content', { - 'Accept-Language': 'en-US,en', // fetch English rather than French titles - Referer: `${channel.url}/en/porn-movie`, - }); - - if (res.ok && res.item) { - return scrapeMovie(res.item, url, channel); - } - - return res.status; -} - -async function fetchProfile(baseActor, { entity }) { // URL slugs are unpredictable: /jessie-volt, /aleska_diamond, /liza-del_sierra - const searchRes = await qu.postAll(`${entity.url}/en/search`, { s: baseActor.name }, '.actors .actor', { 'Accept-Language': 'en-US,en' }); + // AJAX API at /search/ajax/display doesn't actually return results unless an actor ID is passed + const searchRes = await unprint.post(`${entity.url}/en/search`, new URLSearchParams({ s: baseActor.name }), { + selectAll: '#search .actor', + headers: { + // 'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8', + 'Accept-Language': 'en-US,en', + }, + }); if (!searchRes.ok) { return searchRes.status; } - const actorItem = searchRes.items.find(({ query }) => slugify(query.cnt('.name')) === baseActor.slug); + const actorItem = searchRes.context.find(({ query }) => query.content('.name') === baseActor.name); if (!actorItem) { return null; } - const actorUrl = actorItem.query.url('a', 'href', { origin: entity.url }); - const actorAvatar = actorItem.query.img(); + return actorItem.query.url('a', { origin: entity.url }); +} - const actorRes = await qu.get(actorUrl, null, { 'Accept-Language': 'en-US,en' }); +async function fetchProfile(baseActor, { entity }) { + const actorUrl = await getActorUrl(baseActor, entity); + + if (!actorUrl) { + return null; + } + + const actorRes = await unprint.get(actorUrl, { + headers: { + 'Accept-Language': 'en-US,en', + }, + }); if (actorRes.ok) { - return scrapeProfile(actorRes.item, entity, actorAvatar); + return scrapeProfile(actorRes.context, entity); } return null; diff --git a/src/updates.js b/src/updates.js index 181db656..7aca1528 100755 --- a/src/updates.js +++ b/src/updates.js @@ -198,14 +198,19 @@ async function scrapeUpcomingReleases(scraper, entity, preData) { return emptyReleases; } -async function scrapeMovies(scraper, entity) { +async function scrapeMovies(scraper, entity, preData) { if (!argv.movies || !scraper.fetchMovies) { return []; } try { + const context = { + ...preData, + include, + parameters: getRecursiveParameters(entity), + }; // return await scrapeReleases(scraper, entity, preData, true); - return await scraper.fetchMovies(entity); + return await scraper.fetchMovies(entity, 1, context); // TODO: implement pagination } catch (error) { logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`); }