traxxx/src/scrapers/dorcel.js

292 lines
8.2 KiB
JavaScript
Executable File

'use strict';
const unprint = require('unprint');
const cookie = require('cookie');
const slugify = require('../utils/slugify');
/**
 * Orders candidate image URLs from tallest to shortest, based on the
 * `<width>_<height>` fragment embedded in each URL.
 *
 * @param {Array<string|string[]>|null|undefined} sources - image URLs, optionally nested one level deep.
 * @returns {string[]|null} URLs sorted by descending height, with URLs lacking a recognisable
 *   size placed last in their original order; null when there are no sources.
 */
function extractSources(sources) {
  if (!(sources?.length > 0)) {
    return null;
  }

  return sources
    .flat()
    .map((src) => {
      // The width is optional in the pattern and only the height is used for ranking.
      // A URL without a size gets height 0, so the comparator never yields NaN.
      const height = Number(src.match(/(\d{3,4})?_(\d{3,4})/)?.[2]) || 0;

      return { src, height };
    })
    // sorting in place is safe here: map() above already produced a fresh array
    .sort((sourceA, sourceB) => sourceB.height - sourceA.height)
    .map(({ src }) => src);
}
/**
 * Converts scene tiles into release objects.
 *
 * @param {Array<{ query: object }>} scenes - one queryable context per scene tile.
 * @param {{ url: string }} channel - entity whose URL is used as origin for the links on the tile.
 * @returns {object[]} one release per tile, in input order.
 */
function scrapeAll(scenes, channel) {
  return scenes.map(({ query }) => {
    const release = {};

    release.url = query.url('.title', { origin: channel.url });

    // A tile without a title link must not abort the whole page;
    // it is returned without an entry ID instead.
    release.entryId = release.url
      ? new URL(release.url).pathname.match(/\/scene\/(\d+)/)?.[1]
      : undefined;

    release.title = query.content('.title');

    release.actors = query.all('.actors a').map((actorEl) => ({
      name: unprint.query.content(actorEl),
      url: unprint.query.url(actorEl, null, { origin: channel.url }),
    }));

    // prefer the sized <source> candidates, fall back to the plain <img>
    release.poster = extractSources(query.sourceSets('.thumb source', 'data-srcset')) || query.img('.thumb img');

    // high-quality preview first, regular preview second
    release.teaser = [
      query.video('.thumb-ratio', { attribute: 'data-hq-preview' }),
      query.video('.thumb-ratio', { attribute: 'data-preview' }),
    ];

    return release;
  });
}
/**
 * Requests the English news page to obtain a `dorcelclub` session cookie.
 *
 * @param {{ url: string }} channel - entity whose URL is the site root.
 * @returns {Promise<string>} a `dorcelclub=<value>` Cookie header value, or an empty string
 *   when the response did not set that cookie.
 */
async function beforeFetchLatest(channel) {
  // scene page only seems to accept language preferences from session
  const { res } = await unprint.get(`${channel.url}/en/news-videos-x-marc-dorcel`, {
    headers: {
      'X-Requested-With': 'XMLHttpRequest',
      'Accept-Language': 'en-US,en', // fetch English rather than French titles
    },
  });

  // Set-Cookie may be missing, a single string or a list,
  // and the session cookie is not guaranteed to be the first entry.
  const setCookieHeaders = [res?.headers?.['set-cookie'] ?? []].flat();

  const sessionId = setCookieHeaders
    .map((header) => cookie.parse(header).dorcelclub)
    .find(Boolean);

  return sessionId ? `dorcelclub=${sessionId}` : '';
}
/**
 * Fetches one page of the newest scenes.
 *
 * @param {{ url: string }} channel - entity whose URL is the site root.
 * @param {number} [page=1] - page number to request.
 * @param {object} [_options] - unused.
 * @param {{ beforeFetchLatest?: string }} [context] - `beforeFetchLatest` carries the session
 *   Cookie header value; the Cookie header is omitted when it is missing or empty.
 * @returns {Promise<object[]|number>} scraped releases, or the HTTP status when the request failed.
 */
async function fetchLatest(channel, page = 1, _options, { beforeFetchLatest: sessionCookie } = {}) {
  const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;

  const headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Accept-Language': 'en-US,en', // fetch English rather than French titles
    // avoid sending a Cookie header without a value when no session was obtained
    ...(sessionCookie ? { Cookie: sessionCookie } : {}),
  };

  const res = await unprint.post(url, null, {
    selectAll: '.scene',
    headers,
  });

  if (res.ok) {
    return scrapeAll(res.context, channel);
  }

  return res.status;
}
/**
 * Extracts a release from a scene page.
 *
 * @param {{ query: object }} context - queryable context of the scene page.
 * @param {string} url - absolute URL of the scene page; the entry ID is taken from its path.
 * @param {{ url: string }} channel - entity whose URL is used as origin for links.
 * @returns {object} the scraped release.
 */
function scrapeScene({ query }, url, channel) {
  const release = {};

  release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)?.[1];
  release.title = query.content('h1.title');
  release.description = query.content('.content-description .full p');

  const publishDate = query.date('.publish_date', 'MMM DD, YYYY');

  if (publishDate) {
    release.date = publishDate;
  } else {
    // Only the release year is left to go by. This also covers a publish date that is
    // present on the page but could not be parsed, which must not pass as an exact date.
    release.date = query.date('.out_date', 'YYYY', { match: /\d{4}/ });
    release.datePrecision = 'year';
  }

  release.duration = query.duration('.duration');

  release.actors = query.all('.actress a').map((actorEl) => ({
    name: unprint.query.content(actorEl),
    url: unprint.query.url(actorEl, null, { origin: channel.url }),
  }));

  // the director element holds "<label> : <name>", keep the part after the colon
  release.director = query.content('.director')?.split(/\s*:\s*/)[1];

  // prefer the sized <source> candidates, fall back to the plain <img>
  release.poster = extractSources(query.sourceSets('.player source', 'data-srcset')) || query.img('.player img');

  const movieUrl = query.url('.movie a', { origin: channel.url });

  if (movieUrl) {
    release.movie = {
      entryId: new URL(movieUrl).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
      title: query.content('.movie a'),
      url: movieUrl,
    };
  }

  return release;
}
/**
 * Fetches a scene page and scrapes it.
 *
 * @param {string} url - absolute URL of the scene page.
 * @param {{ url: string }} channel - entity whose URL is the site root.
 * @returns {Promise<object|number>} the scraped release, or the HTTP status when the request failed.
 */
async function fetchScene(url, channel) {
  const headers = {
    'Accept-Language': 'en-US,en', // fetch English rather than French titles
    Referer: `${channel.url}/en/news-videos-x-marc-dorcel`,
  };

  const response = await unprint.get(url, { headers });

  return response.ok
    ? scrapeScene(response.context, url, channel)
    : response.status;
}
/**
 * Converts movie tiles from the movie listing into release objects.
 *
 * @param {Array<{ query: object }>} movies - one queryable context per movie tile.
 * @param {{ url: string }} channel - entity whose URL is used as origin for the tile link.
 * @returns {object[]} one release per tile, in input order.
 */
function scrapeMovies(movies, channel) {
  return movies.map(({ query }) => {
    const release = {};

    release.url = query.url(null, { origin: channel.url })?.replace('/film-x', '/en/porn-movie'); // French -> English fallback in case language headers didn't work

    // A tile without a link must not abort the whole page;
    // it is returned without an entry ID instead.
    release.entryId = release.url
      ? new URL(release.url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1]
      : undefined;

    release.title = query.content('h2');

    // prefer the sized <source> candidates, fall back to the plain <img>
    release.covers = [extractSources(query.sourceSets('.thumb-ratio source', 'data-srcset')) || query.img('.thumb-ratio img')];

    return release;
  });
}
/**
 * Fetches one page of the newest movies.
 *
 * NOTE(review): the session cookie is read from the third argument here, whereas fetchLatest
 * reads it from the fourth — confirm against the caller that this is intended.
 *
 * @param {{ url: string }} channel - entity whose URL is the site root.
 * @param {number} [page=1] - page number to request.
 * @param {{ beforeFetchLatest?: string }} [context] - `beforeFetchLatest` carries the session
 *   Cookie header value; the Cookie header is omitted when it is missing or empty.
 * @returns {Promise<object[]|number>} scraped movies, or the HTTP status when the request
 *   failed or yielded no context.
 */
async function fetchMovies(channel, page = 1, { beforeFetchLatest: sessionCookie } = {}) {
  const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`;

  const headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Accept-Language': 'en-US,en', // fetch English rather than French titles
    // derived from the channel like the other fetchers, so the referer always matches the requested site
    Referer: `${channel.url}/en/porn-movie?sorting=new`, // might be used to derive sorting
    // seems necessary for English results; left out when no session was obtained
    ...(sessionCookie ? { Cookie: sessionCookie } : {}),
  };

  const res = await unprint.post(url, null, {
    selectAll: '.items .movie',
    headers,
  });

  if (res.ok && res.context) {
    return scrapeMovies(res.context, channel);
  }

  return res.status;
}
/**
 * Extracts a movie release, including its cast and scenes, from a movie page.
 *
 * @param {{ query: object }} context - queryable context of the movie page.
 * @param {string} url - absolute URL of the movie page; the entry ID is taken from its path.
 * @param {{ url: string }} channel - entity whose URL is used as origin for links.
 * @returns {object} the scraped movie.
 */
function scrapeMovie({ query }, url, channel) {
  // sized <source> candidates first, plain <img> as fallback
  const pickImage = (sourceSelector, imgSelector) => extractSources(query.sourceSets(sourceSelector, 'data-srcset')) || query.img(imgSelector);

  const toActor = (actorEl) => ({
    name: unprint.query.content(actorEl, '.name'),
    url: unprint.query.url(actorEl, 'a', { origin: channel.url }),
    avatar: extractSources(unprint.query.sourceSets(actorEl, '.thumbnail source', 'data-srcset')) || unprint.query.img(actorEl, '.thumbnail img'),
  });

  return {
    title: query.content('.header h1'),
    description: query.content('.content-text p'),
    entryId: new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
    // only the release year is read from the page
    date: query.date('.out_date', 'YYYY', { match: /\d{4}/ }),
    datePrecision: 'year',
    duration: query.duration('.duration'),
    actors: query.all('.actors .actor').map(toActor),
    poster: pickImage('//picture[img[contains(@class, \'banner\')]]//source', 'img.banner'),
    covers: [pickImage('//picture[img[contains(@class, \'cover\')]]//source', 'img.cover')],
    scenes: scrapeAll(unprint.initAll(query.all('.scene')), channel),
  };
}
/**
 * Fetches a movie page and scrapes it.
 *
 * @param {string} url - absolute URL of the movie page.
 * @param {{ url: string }} channel - entity whose URL is the site root.
 * @returns {Promise<object|number>} the scraped movie, or the HTTP status when the request
 *   failed or yielded no context.
 */
async function fetchMovie(url, channel) {
  const requestOptions = {
    select: '.content',
    headers: {
      'Accept-Language': 'en-US,en', // fetch English rather than French titles
      Referer: `${channel.url}/en/porn-movie`,
    },
  };

  const response = await unprint.get(url, requestOptions);

  if (!response.ok || !response.context) {
    return response.status;
  }

  return scrapeMovie(response.context, url, channel);
}
/**
 * Extracts an actor profile, including the scenes listed on the page.
 *
 * @param {{ query: object }} context - queryable context of the actor page.
 * @param {{ url: string }} entity - entity passed on to scrapeAll for resolving scene links.
 * @returns {Promise<object>} the scraped profile.
 */
async function scrapeProfile({ query }, entity) {
  const profile = {};

  profile.description = query.content('.content-description .content-text > p, .content-description .full p'); // different structure for overflowing vs short text
  profile.nationality = query.content('.nationality');

  profile.banner = query.img('.header img:not([src*="actor/banner"])'); // ignore stock banner

  // The fallback selector previously read "actorsqure" and therefore could never match
  // the same "actorsquare" images the <source> selector targets.
  profile.avatar = extractSources(query.sourceSets('.banner source[data-srcset*="actorsquare"]', 'data-srcset'))
    || query.img('.banner img[src*="actorsquare"]'); // usually banner, but worth trying

  profile.releases = scrapeAll(unprint.initAll(query.all('.scene')), entity);

  return profile;
}
/**
 * Resolves the profile page URL of an actor, searching the site by name when the
 * actor has no known URL.
 *
 * @param {{ url?: string, name: string, slug: string }} baseActor - actor to look up.
 * @param {{ url: string }} entity - entity whose URL is the site root.
 * @returns {Promise<string|null>} the profile URL, or null when the search failed or no
 *   result matches the actor's slug.
 */
async function getActorUrl(baseActor, entity) {
  if (baseActor.url) {
    return baseActor.url;
  }

  // URL slugs are unpredictable: /jessie-volt, /aleska_diamond, /liza-del_sierra
  // AJAX API at /search/ajax/display doesn't actually return results unless an actor ID is passed
  const searchRes = await unprint.post(`${entity.url}/en/search`, { s: baseActor.name }, {
    selectAll: '#search .actor',
    form: true, // presumably sends the body url-encoded, so no explicit Content-Type is set — confirm
    headers: {
      'Accept-Language': 'en-US,en',
    },
  });

  // The result is used as a URL by the caller, so a failed search must yield null:
  // a numeric HTTP status is truthy and would be requested as if it were an address.
  if (!searchRes.ok || !searchRes.context) {
    return null;
  }

  const actorItem = searchRes.context.find(({ query }) => slugify(query.content('.name')) === baseActor.slug);

  if (!actorItem) {
    return null;
  }

  return actorItem.query.url('a', { origin: entity.url });
}
/**
 * Locates an actor's profile page and scrapes it.
 *
 * @param {object} baseActor - actor to look up, passed on to getActorUrl.
 * @param {{ entity: { url: string } }} context - `entity` is the site the profile is fetched from.
 * @returns {Promise<object|null>} the scraped profile, or null when no profile URL was found
 *   or the profile request failed.
 */
async function fetchProfile(baseActor, { entity }) {
  const profileUrl = await getActorUrl(baseActor, entity);

  if (!profileUrl) {
    return null;
  }

  const headers = { 'Accept-Language': 'en-US,en' };
  const response = await unprint.get(profileUrl, { headers });

  return response.ok
    ? scrapeProfile(response.context, entity)
    : null;
}
// Functions exposed by this scraper module. extractSources, getActorUrl and the
// scrape* helpers are not exported and remain private to this file.
module.exports = {
beforeFetchLatest,
fetchLatest,
fetchScene,
fetchMovie,
fetchMovies,
fetchProfile,
};