Updated Dorcel scraper, added movie support.

This commit is contained in:
DebaucheryLibrarian
2020-11-19 02:01:13 +01:00
parent ecc90be12c
commit 77f9193669
16 changed files with 240 additions and 73 deletions

View File

@@ -1,93 +1,124 @@
'use strict';
const qu = require('../utils/q');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
/**
 * Turn a list of scene tiles into release objects.
 *
 * @param {Array<{ query: object }>} scenes - wrapped tile elements; each `query` must expose `url`, `cnt`, `all`, `img` and `sourceSet`
 * @param {{ url: string }} channel - entity whose `url` is handed to `query.url` as the `origin` option
 * @returns {object[]} one release (url, entryId, title, actors, poster) per tile
 */
function scrapeAll(scenes, channel) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('.title', 'href', { origin: channel.url });

		// new URL() throws on a missing link; leave the ID undefined instead of aborting the whole list
		release.entryId = release.url
			? new URL(release.url).pathname.match(/\/scene\/(\d+)/)?.[1]
			: undefined;

		release.title = query.cnt('.title');

		release.actors = query.all('.actors a').map(actorEl => ({
			name: query.cnt(actorEl),
			url: query.url(actorEl, null, 'href', { origin: channel.url }),
		}));

		const fallbackPoster = query.img('.thumb img');
		const posterFallbacks = fallbackPoster
			? [fallbackPoster.replace('_crop', ''), fallbackPoster] // try the name without '_crop' first, then the thumbnail as served
			: null;

		// prefer the source set; only fall back to the plain thumbnail when there is none
		release.poster = query.sourceSet('.thumb img', 'data-srcset') || posterFallbacks;

		return release;
	});
}
/**
 * Scrape a single scene page into a release object.
 *
 * @param {{ query: object }} item - wrapped page element; `query` must expose `cnt`, `date`, `dur`, `all`, `url` and `sourceSet`
 * @param {string} url - absolute URL of the scene page; the numeric ID after `/scene/` becomes the entry ID
 * @param {{ url: string }} channel - entity whose `url` is handed to `query.url` as the `origin` option
 * @returns {object} release with entryId, title, description, date, duration, actors, director, poster and movie
 * @throws {TypeError} when `url` is not a valid absolute URL
 */
function scrapeScene({ query }, url, channel) {
	const release = {};

	// stays undefined when the path carries no /scene/<digits> segment
	release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)?.[1];

	release.title = query.cnt('h1.title');
	release.description = query.cnt('.content-description .full p');

	release.date = query.date('.publish_date', 'MMMM DD, YYYY');
	release.duration = query.dur('.duration');

	release.actors = query.all('.actress a').map(actorEl => ({
		name: query.cnt(actorEl),
		url: query.url(actorEl, null, 'href', { origin: channel.url }),
	}));

	// the element text is split on the colon and the part after it is kept
	release.director = query.cnt('.director')?.split(/\s*:\s*/)[1];
	release.poster = query.sourceSet('.player img', 'data-srcset');

	release.movie = {
		title: query.cnt('.movie a'),
		url: query.url('.movie a', 'href', { origin: channel.url }),
	};

	return release;
}
/**
 * Turn a list of movie tiles into release objects.
 *
 * @param {Array<{ query: object }>} movies - wrapped tile elements; each `query` must expose `url`, `cnt` and `sourceSet`
 * @param {{ url: string }} channel - entity whose `url` is handed to `query.url` as the `origin` option
 * @returns {object[]} one release (url, entryId, title, covers) per tile
 */
function scrapeMovies(movies, channel) {
	return movies.map(({ query }) => {
		const release = {};

		// null selector: the link is read from the tile element itself
		release.url = query.url(null, 'href', { origin: channel.url });

		// new URL() throws on a missing link; leave the ID undefined instead of aborting the whole list
		release.entryId = release.url
			? new URL(release.url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1]
			: undefined;

		release.title = query.cnt('h2');
		release.covers = [query.sourceSet('img', 'data-srcset')];

		return release;
	});
}
/**
 * Scrape a single movie page, including the scenes listed on it.
 *
 * @param {{ query: object, el: object }} item - wrapped page element
 * @param {string} url - absolute URL of the movie page; the slug after `/porn-movie/` becomes the entry ID
 * @param {{ url: string }} channel - entity whose `url` is handed to `query.url` as the `origin` option
 * @returns {object} release with title, description, entryId, date, duration, actors, poster, covers and scenes
 */
function scrapeMovie({ query, el }, url, channel) {
	const toActor = actorEl => ({
		name: query.cnt(actorEl, '.name'),
		url: query.url(actorEl, 'a', 'href', { origin: channel.url }),
		avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
	});

	return {
		title: query.cnt('.header h1'),
		description: query.cnt('.content-text p'),
		entryId: new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1],
		// parsed with a year-only format, so the precision is recorded alongside the date
		date: query.date('.out_date', 'YYYY'),
		datePrecision: 'year',
		duration: query.dur('.duration'),
		actors: query.all('.actors .actor').map(toActor),
		poster: query.sourceSet('.banner', 'data-srcset'),
		covers: [query.sourceSet('.cover', 'data-srcset')],
		// scene tiles on the movie page go through the regular scene list scraper
		scenes: scrapeAll(qu.initAll(el, '.scene'), channel),
	};
}
/**
 * Scrape an actor page into a profile object.
 *
 * @param {{ query: object, el: object }} item - wrapped actor page element
 * @param {{ url: string }} entity - passed through to scrapeAll for the scenes listed on the page
 * @param {string} [avatar] - avatar URL, if one is available; skipped when falsy
 * @returns {Promise<object>} profile with description, nationality, banner, releases and optionally avatar
 */
async function scrapeProfile({ query, el }, entity, avatar) {
	const profile = {};

	profile.description = query.cnt('.content-description .content-text > p, .content-description .full p'); // different structure for overflowing vs short text
	profile.nationality = query.cnt('.nationality');

	profile.banner = query.img('.header img:not([src*="actor/banner"])'); // ignore stock banner

	if (avatar) {
		// try the URL without the 'crop_' marker first, then the avatar as given
		profile.avatar = [
			avatar.replace('crop_', ''),
			avatar,
		];
	}

	// TODO: add pagination
	profile.releases = scrapeAll(qu.initAll(el, '.scene'), entity);

	return profile;
}
// TODO: add movies
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/en/news-videos-x-marc-dorcel-ajax?page=${page}&sorting=publish_date`;
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
const res = await qu.getAll(url, '.scene', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
});
if (res.ok) {
@@ -97,8 +128,26 @@ async function fetchLatest(channel, page = 1) {
return res.status;
}
/**
 * Fetch one page of the movie listing and scrape its tiles.
 *
 * @param {{ url: string }} channel - entity whose `url` is the base of the listing endpoint
 * @param {number} [page=1] - page number placed in the query string
 * @returns {Promise<object[]|number>} scraped movies, or the response status when the request was not OK
 */
async function fetchMovies(channel, page = 1) {
	const headers = {
		'X-Requested-With': 'XMLHttpRequest',
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
		Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting
	};

	const res = await qu.getAll(`${channel.url}/movies/more?lang=en&page=${page}&sorting=new`, '.movie', headers);

	if (!res.ok) {
		return res.status;
	}

	return scrapeMovies(res.items, channel);
}
async function fetchScene(url, channel) {
const res = await qu.get(url);
const res = await qu.get(url, '.content', {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
});
if (res.ok) {
return scrapeScene(res.item, url, channel);
@@ -107,25 +156,48 @@ async function fetchScene(url, channel) {
return res.status;
}
/**
 * Fetch a movie page and scrape it.
 *
 * @param {string} url - URL of the movie page
 * @param {{ url: string }} channel - passed through to scrapeMovie
 * @returns {Promise<object|number>} scraped movie, or the response status when the request was not OK
 */
async function fetchMovie(url, channel) {
	const res = await qu.get(url, '.content', {
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
	});

	if (res.ok) {
		return scrapeMovie(res.item, url, channel);
	}

	return res.status;
}
/**
 * Look an actor up through the site search and scrape the matching profile page.
 *
 * @param {{ name: string, slug: string }} baseActor - `name` is sent as the search term, `slug` is compared against the slugified result names
 * @param {{ entity: { url: string } }} context - only `entity` is read; its `url` is the base for the search and profile links
 * @returns {Promise<object|number|null>} scraped profile; the search response status when the search failed;
 *   null when no result matches or the profile page request was not OK
 */
async function fetchProfile(baseActor, { entity }) {
	// URL slugs are unpredictable: /jessie-volt, /aleska_diamond, /liza-del_sierra
	const searchUrl = `${entity.url}/en/search`;
	const searchRes = await qu.postAll(searchUrl, { s: baseActor.name }, '.actors .actor', { 'Accept-Language': 'en-US,en' });

	if (!searchRes.ok) {
		return searchRes.status;
	}

	const match = searchRes.items.find(item => slugify(item.query.cnt('.name')) === baseActor.slug);

	if (!match) {
		return null;
	}

	const profileUrl = match.query.url('a', 'href', { origin: entity.url });
	const avatar = match.query.img();

	const profileRes = await qu.get(profileUrl, null, { 'Accept-Language': 'en-US,en' });

	return profileRes.ok
		? scrapeProfile(profileRes.item, entity, avatar)
		: null;
}
// public scraper interface: latest scenes, single scene, single movie, movie listing, actor profile
module.exports = { fetchLatest, fetchScene, fetchMovie, fetchMovies, fetchProfile };