Updated Dorcel scraper, added movie support.

This commit is contained in:
DebaucheryLibrarian 2020-11-19 02:01:13 +01:00
parent ecc90be12c
commit 77f9193669
16 changed files with 240 additions and 73 deletions

View File

@ -22,6 +22,8 @@ function initActorActions(store, router) {
$offset:Int = 0,
$after:Datetime = "1900-01-01",
$before:Datetime = "2100-01-01",
$afterTime:Datetime = "1900-01-01",
$beforeTime:Datetime = "2100-01-01",
$orderBy:[ReleasesOrderBy!]
$selectableTags: [String],
$includedTags: [String!],
@ -173,10 +175,23 @@ function initActorActions(store, router) {
}
scenesConnection(
filter: {
date: {
lessThan: $before,
greaterThan: $after,
}
or: [
{
date: {
lessThan: $before,
greaterThan: $after
}
},
{
date: {
isNull: true
},
createdAt: {
lessThan: $beforeTime,
greaterThan: $afterTime,
}
}
]
and: [
{
or: $includedEntities
@ -205,6 +220,8 @@ function initActorActions(store, router) {
offset: Math.max(0, (pageNumber - 1)) * limit,
after,
before,
afterTime: store.getters.after,
beforeTime: store.getters.before,
selectableTags: config.selectableTags,
orderBy,
excludeTags: store.state.ui.filter,

View File

@ -835,12 +835,14 @@ exports.up = knex => Promise.resolve()
table.integer('movie_id', 16)
.notNullable()
.references('id')
.inTable('movies');
.inTable('movies')
.onDelete('cascade');
table.integer('scene_id', 16)
.notNullable()
.references('id')
.inTable('releases');
.inTable('releases')
.onDelete('cascade');
table.unique(['movie_id', 'scene_id']);
@ -874,6 +876,20 @@ exports.up = knex => Promise.resolve()
.references('id')
.inTable('media');
}))
.then(() => knex.schema.createTable('movies_posters', (table) => {
table.integer('movie_id', 16)
.notNullable()
.references('id')
.inTable('movies')
.onDelete('cascade');
table.text('media_id', 21)
.notNullable()
.references('id')
.inTable('media');
table.unique(['movie_id', 'media_id']);
}))
.then(() => knex.schema.createTable('clips', (table) => {
table.increments('id', 16);
@ -1126,7 +1142,7 @@ exports.up = knex => Promise.resolve()
FROM movies_scenes
LEFT JOIN
releases ON releases.id = movies_scenes.scene_id
LEFT JOIN
INNER JOIN
releases_photos ON releases_photos.release_id = releases.id
LEFT JOIN
media ON media.id = releases_photos.media_id
@ -1172,8 +1188,9 @@ exports.down = (knex) => { // eslint-disable-line arrow-body-style
DROP TABLE IF EXISTS releases_tags CASCADE;
DROP TABLE IF EXISTS releases_search CASCADE;
DROP TABLE IF EXISTS movies_covers CASCADE;
DROP TABLE IF EXISTS movies_scenes CASCADE;
DROP TABLE IF EXISTS movies_covers CASCADE;
DROP TABLE IF EXISTS movies_posters CASCADE;
DROP TABLE IF EXISTS movies_trailers CASCADE;
DROP TABLE IF EXISTS clips_tags CASCADE;

Binary file not shown.

Before

Width:  |  Height:  |  Size: 301 KiB

After

Width:  |  Height:  |  Size: 529 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 277 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 440 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 8.2 KiB

After

Width:  |  Height:  |  Size: 5.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.6 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

View File

@ -386,6 +386,11 @@ const tags = [
priority: 8,
group: 'penetration',
},
{
name: 'double barrel blowjob',
slug: 'double-barrel-blowjob',
group: 'oral',
},
{
name: 'double blowjob',
slug: 'double-blowjob',

View File

@ -626,7 +626,7 @@ const tagPosters = [
['ebony', 2, 'Nia Nacci for Sweetheart Video'],
['facefucking', 5, 'Mia Moore B for Throated'],
['facial', 0, 'Brooklyn Gray in "All About Ass 4" for Evil Angel'],
['fake-boobs', 14, 'Rikki Six for Dream Dolls'],
['fake-boobs', 7, 'Charley Atwell for iCandiGirls'],
['fake-cum', 2, 'Mimi Allen for Fucked Up Facials'],
['family', 0, 'Teanna Trump in "A Family Appear: Part One" for Brazzers'],
['femdom', 0, 'Alina Li in "Asian Domination… She Holds Jules Jordan\'s Cock Hostage!" for Jules Jordan'],
@ -786,6 +786,7 @@ const tagPhotos = [
['facefucking', 2, 'Jynx Maze for Throated'],
['facefucking', 4, 'Brooklyn Gray in "Throats Fucks 6" for Evil Angel'],
['facefucking', 3, 'Adriana Chechik in "Performing Magic Butt Tricks With Jules Jordan. What Will Disappear In Her Ass?" for Jules Jordan'],
['fake-boobs', 14, 'Rikki Six for Dream Dolls'],
['fake-boobs', 2, 'Gia Milana in "Hot Anal Latina" for HardX'],
['fake-boobs', 1, 'Lela Star in "Thick" for Jules Jordan'],
['fake-boobs', 16, 'Marsha May in "Once You Go Black 7" for Jules Jordan'],
@ -798,7 +799,6 @@ const tagPhotos = [
['fake-boobs', 8, 'Amber Alena for Score'],
['fake-boobs', 4, 'Capri Cavanni for Big Tits in Sports'],
// ['fake-boobs', 6, 'Cathy Heaven in "Heavenly Ass" for Big Wett Butts'],
['fake-boobs', 7, 'Madison Ivy for Baby Got Boobs (Brazzers)'],
['fake-boobs', 12, 'Nikki Monroe and Kortney Kane for Big Tits In Uniform'],
['fake-cum', 0, 'Jynx Maze for Cumshot Surprise (Porn Pros)'],
['fake-cum', 1, 'Ricki White for Fucked Up Facials'],

View File

@ -1,93 +1,124 @@
'use strict';
const qu = require('../utils/q');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
function scrapeAll(scenes) {
function scrapeAll(scenes, channel) {
return scenes.map(({ query }) => {
const release = {};
release.url = query.url('.title a');
release.entryId = new URL(release.url).pathname.match(/\/scene\/(\d+)/)[1];
release.url = query.url('.title', 'href', { origin: channel.url });
release.entryId = new URL(release.url).pathname.match(/\/scene\/(\d+)/)?.[1];
release.title = query.cnt('.title a');
release.date = query.date('.date', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/);
release.duration = query.number('.length') * 60;
release.title = query.cnt('.title');
release.actors = query.all('.actors a').map(actorEl => ({
name: query.cnt(actorEl),
url: query.url(actorEl, null),
url: query.url(actorEl, null, 'href', { origin: channel.url }),
}));
release.poster = query.img('.poster noscript img');
release.stars = query.count('.rating .star1');
release.tags = [query.cnt('.collection a')];
const fallbackPoster = query.img('.thumb img');
release.poster = query.sourceSet('.thumb img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
return release;
});
}
function scrapeScene({ query }, url) {
function scrapeScene({ query }, url, channel) {
const release = {};
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)[1];
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)?.[1];
release.title = query.cnt('.infos .title h1');
release.description = query.cnt('#description p:nth-child(2)');
release.title = query.cnt('h1.title');
release.description = query.cnt('.content-description .full p');
release.date = query.date('.infos .date', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/);
release.duration = query.number('.infos .length') * 60;
release.date = query.date('.publish_date', 'MMMM DD, YYYY');
release.duration = query.dur('.duration');
release.actors = query.all('.infos .actors a').map(actorEl => ({
release.actors = query.all('.actress a').map(actorEl => ({
name: query.cnt(actorEl),
url: query.url(actorEl, null),
url: query.url(actorEl, null, 'href', { origin: channel.url }),
}));
release.poster = query.img('.poster noscript img');
release.stars = query.count('.infos .rating .star1');
release.director = query.cnt('.director')?.split(/\s*:\s*/)[1];
release.poster = query.sourceSet('.player img', 'data-srcset');
if (query.exists('.movie')) {
release.movie = {
name: query.cnt('.movie a'),
url: query.url('.movie a'),
};
release.movie.entryId = new URL(release.movie.url).pathname.split('/').slice(-1)[0];
}
release.movie = {
title: query.cnt('.movie a'),
url: query.url('.movie a', 'href', { origin: channel.url }),
};
return release;
}
function scrapeProfile({ query, el }, avatar) {
/**
 * Turn a list of movie tile contexts into release objects.
 *
 * @param {Array<{ query: Object }>} movies - contexts exposing a `query` helper with `url`, `cnt` and `sourceSet`
 * @param {{ url: string }} channel - its `url` is passed as the origin when resolving each movie link
 * @returns {Array<Object>} one `{ url, entryId, title, covers }` object per movie
 */
function scrapeMovies(movies, channel) {
	return movies.map(({ query }) => {
		const url = query.url(null, 'href', { origin: channel.url });
		// the entry ID is the slug following /porn-movie/ in the path; undefined when the path has another shape
		const slugMatch = new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/);
		const title = query.cnt('h2');
		const cover = query.sourceSet('img', 'data-srcset');

		return {
			url,
			entryId: slugMatch?.[1],
			title,
			covers: [cover],
		};
	});
}
/**
 * Scrape a single movie page into a release object.
 *
 * @param {{ query: Object, el: Object }} context - query helper and root element of the movie page
 * @param {string} url - URL of the movie page; the slug after /porn-movie/ becomes the entry ID
 * @param {{ url: string }} channel - its `url` is used as the origin for actor links and handed on to scrapeAll
 * @returns {Object} release with title, description, entryId, date, datePrecision, duration, actors, poster, covers and scenes
 */
function scrapeMovie({ query, el }, url, channel) {
	const title = query.cnt('.header h1');
	const description = query.cnt('.content-text p');
	const entryId = new URL(url).pathname.match(/\/porn-movie\/([\w-]+)/)?.[1];

	// only a year is parsed from the page, so the precision is recorded alongside the date
	const date = query.date('.out_date', 'YYYY');
	const duration = query.dur('.duration');

	const toActor = actorEl => ({
		name: query.cnt(actorEl, '.name'),
		url: query.url(actorEl, 'a', 'href', { origin: channel.url }),
		avatar: query.sourceSet(actorEl, '.thumbnail img', 'data-srcset'),
	});
	const actors = query.all('.actors .actor').map(toActor);

	const poster = query.sourceSet('.banner', 'data-srcset');
	const cover = query.sourceSet('.cover', 'data-srcset');

	// scenes listed on the movie page go through the same scraper as scene listings
	const scenes = scrapeAll(qu.initAll(el, '.scene'), channel);

	return {
		title,
		description,
		entryId,
		date,
		datePrecision: 'year',
		duration,
		actors,
		poster,
		covers: [cover],
		scenes,
	};
}
async function scrapeProfile({ query, el }, entity, avatar) {
const profile = {};
profile.birthdate = qu.parseDate(query.text('.birthdate'), 'MMMM DD, YYYY');
profile.nationality = query.text('.nationality');
profile.hairColor = query.text('.hair');
profile.description = query.cnt('.content-description .content-text > p, .content-description .full p'); // different structure for overflowing vs short text
profile.nationality = query.cnt('.nationality');
profile.description = query.cnt('.bio_results p');
profile.banner = query.img('.header img:not([src*="actor/banner"])'); // ignore stock banner
if (avatar) {
profile.avatar = [
avatar.replace('_crop', ''),
avatar.replace('crop_', ''),
avatar,
];
}
// TODO: add pagination
profile.releases = scrapeAll(qu.initAll(el, '.scene'));
profile.releases = scrapeAll(qu.initAll(el, '.scene'), entity);
return profile;
}
// TODO: add movies
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/en/news-videos-x-marc-dorcel-ajax?page=${page}&sorting=publish_date`;
const url = `${channel.url}/scene/list/more/?lang=en&page=${page}&sorting=new`;
const res = await qu.getAll(url, '.scene', {
'X-Requested-With': 'XMLHttpRequest',
'Accept-Language': 'en-US,en', // fetch English rather than French titles
});
if (res.ok) {
@ -97,8 +128,26 @@ async function fetchLatest(channel, page = 1) {
return res.status;
}
/**
 * Fetch one page of the channel's movie listing and scrape it.
 *
 * @param {{ url: string }} channel - channel whose `url` is the base of the listing endpoint
 * @param {number} [page=1] - listing page to request
 * @returns {Promise<Array<Object>|number>} scraped movies when the response is ok, otherwise the response status
 */
async function fetchMovies(channel, page = 1) {
	const url = `${channel.url}/movies/more?lang=en&page=${page}&sorting=new`;

	const headers = {
		'X-Requested-With': 'XMLHttpRequest',
		'Accept-Language': 'en-US,en', // fetch English rather than French titles
		Referer: 'https://www.dorcelclub.com/en/porn-movie?sorting=new', // might be used to derive sorting
	};

	const res = await qu.getAll(url, '.movie', headers);

	return res.ok
		? scrapeMovies(res.items, channel)
		: res.status;
}
async function fetchScene(url, channel) {
const res = await qu.get(url);
const res = await qu.get(url, '.content', {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
});
if (res.ok) {
return scrapeScene(res.item, url, channel);
@ -107,25 +156,48 @@ async function fetchScene(url, channel) {
return res.status;
}
async function fetchProfile({ name: actorName, url: actorUrl }, entity, include) {
const searchRes = await qu.getAll(`${entity.url}/en/pornstars?search=${slugify(actorName, '+')}`, '.actor');
const actorItem = searchRes.ok && searchRes.items.find(actor => slugify(actor.query.cnt('h2')) === slugify(actorName));
const actorItemUrl = actorItem?.query.url();
const actorItemAvatar = actorItem?.query.img();
const url = actorUrl || actorItemUrl || `${entity.url}/en/pornstar/${slugify(actorName, '-')}`;
const res = await qu.get(url);
async function fetchMovie(url, channel) {
const res = await qu.get(url, '.content', {
'Accept-Language': 'en-US,en', // fetch English rather than French titles
});
if (res.ok) {
return scrapeProfile(res.item, actorItemAvatar, entity, include);
return scrapeMovie(res.item, url, channel);
}
return res.status;
}
/**
 * Look an actor up through the site search, then fetch and scrape their profile page.
 *
 * @param {{ name: string, slug: string }} baseActor - `name` is sent as the search term, `slug` is matched against slugified result names
 * @param {{ entity: { url: string } }} context - `entity.url` is the base for the search endpoint and profile links
 * @returns {Promise<Object|number|null>} the scraped profile; the search response status when the search fails;
 *   null when no result matches or the profile page request is not ok
 */
async function fetchProfile(baseActor, { entity }) {
	// URL slugs are unpredictable: /jessie-volt, /aleska_diamond, /liza-del_sierra
	const searchUrl = `${entity.url}/en/search`;
	const searchRes = await qu.postAll(searchUrl, { s: baseActor.name }, '.actors .actor', { 'Accept-Language': 'en-US,en' });

	if (!searchRes.ok) {
		return searchRes.status;
	}

	const match = searchRes.items.find(item => slugify(item.query.cnt('.name')) === baseActor.slug);

	if (!match) {
		return null;
	}

	const { query } = match;
	const profileUrl = query.url('a', 'href', { origin: entity.url });
	const avatar = query.img();

	const profileRes = await qu.get(profileUrl, null, { 'Accept-Language': 'en-US,en' });

	return profileRes.ok
		? scrapeProfile(profileRes.item, entity, avatar)
		: null;
}
// Functions this scraper module exposes to its callers.
module.exports = {
fetchLatest,
fetchScene,
fetchMovie,
fetchMovies,
fetchProfile,
};

View File

@ -139,7 +139,7 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
}
async function scrapeLatestReleases(scraper, entity, preData) {
if ((!argv.latest && !argv.last) || !scraper.fetchLatest) {
if (!argv.latest || !scraper.fetchLatest) {
return emptyReleases;
}

View File

@ -242,6 +242,42 @@ function urls(context, selector = 'a', attr = 'href', { origin, protocol = 'http
return attr ? urlEls.map(urlEl => prefixUrl(urlEl, origin, protocol)) : urlEls;
}
/**
 * Read a srcset-style attribute and return its candidate URLs, largest descriptor first.
 *
 * Candidates without a descriptor are labelled 'fallback' and always sorted last;
 * candidates with equal descriptors keep their document order.
 *
 * @param {*} context - passed through to `q`
 * @param {string} selector - passed through to `q`
 * @param {string} attr - passed through to `q`; the attribute holding the srcset string
 * @param {Object} [options]
 * @param {string} [options.origin] - forwarded to `prefixUrl`
 * @param {string} [options.protocol] - forwarded to `prefixUrl`
 * @param {boolean} [options.includeDescriptor] - return `{ descriptor, url }` objects instead of bare URLs
 * @returns {Array<string>|Array<{ descriptor: string, url: string }>|null} null when the attribute is missing or empty
 */
function sourceSet(context, selector, attr, options = {}) {
	const srcset = q(context, selector, attr);

	if (!srcset) {
		return null;
	}

	// rank descriptors numerically ('2x', '640w'); anything unparseable, including 'fallback', ranks below every real descriptor
	const rank = (descriptor) => {
		const value = Number.parseFloat(descriptor);

		return Number.isNaN(value) ? -1 : value;
	};

	const sources = srcset
		.split(',')
		.map(candidate => candidate.trim()) // srcset attributes are often spread over several indented lines
		.filter(Boolean) // a trailing comma would otherwise yield an empty candidate
		.map((candidate) => {
			const [link, descriptor] = candidate.split(/\s+/);

			return {
				descriptor: descriptor || 'fallback',
				url: prefixUrl(link, options.origin, options.protocol),
			};
		})
		// a plain numeric difference is a consistent comparator, so the result does not depend on the engine's sort algorithm
		.sort((sourceA, sourceB) => rank(sourceB.descriptor) - rank(sourceA.descriptor));

	if (options.includeDescriptor) {
		return sources;
	}

	return sources.map(source => source.url);
}
function poster(context, selector = 'video', attr = 'poster', { origin, protocol = 'https' } = {}) {
const posterEl = q(context, selector, attr);
@ -267,17 +303,17 @@ function duration(context, selector, match, attr = 'textContent') {
const durationMatch = durationString.match(match || /(\d+:)?\d+:\d+/);
if (durationMatch) {
const segments = ['00'].concat(durationMatch[0].split(':')).slice(-3);
const segments = ['00'].concat(durationMatch[0].split(/[:hm]/)).slice(-3);
return moment.duration(segments.join(':')).asSeconds();
}
const timestampMatch = durationString.match(/T(\d+H)?(\d+M)?\d+S/);
const timestampMatch = durationString.match(/(\d+H)?\s*(\d+M)?\s*\d+S?/i);
if (timestampMatch) {
const hours = timestampMatch[0].match(/(\d+)H/)?.[1] || 0;
const minutes = timestampMatch[0].match(/(\d+)M/)?.[1] || 0;
const seconds = timestampMatch[0].match(/(\d+)S/)?.[1] || 0;
const hours = timestampMatch[0].match(/(\d+)H/i)?.[1] || 0;
const minutes = timestampMatch[0].match(/(\d+)M/i)?.[1] || 0;
const seconds = timestampMatch[0].match(/(\d+)(S|$)/i)?.[1] || 0;
return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds);
}
@ -345,6 +381,10 @@ const quFuncs = {
num: number,
poster,
q,
sourceSet,
sources: sourceSet,
srcs: sourceSet,
srcset: sourceSet,
style,
styles,
text,
@ -415,10 +455,12 @@ function extractAll(htmlValue, selector) {
return initAll(window.document, selector, window);
}
async function get(urlValue, selector, headers, options, queryAll = false) {
const res = await http.get(urlValue, headers, options);
async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) {
const res = await (method === 'post'
? http.post(urlValue, body, headers, options)
: http[method](urlValue, headers, options));
if (res.statusCode === 200) {
if (res.ok) {
const item = queryAll
? extractAll(res.body.toString(), selector)
: extract(res.body.toString(), selector);
@ -443,8 +485,20 @@ async function get(urlValue, selector, headers, options, queryAll = false) {
};
}
/**
 * GET a page and query it for a single match.
 * Delegates to `request` with no body and `queryAll` disabled.
 *
 * @param {string} urlValue - URL to fetch
 * @param {string} [selector] - forwarded to `request`
 * @param {Object} [headers] - forwarded to `request`
 * @param {Object} [options] - forwarded to `request`
 * @returns {Promise<Object>} whatever `request` resolves to
 */
async function get(urlValue, selector, headers, options) {
	const body = null;
	const queryAll = false;

	return request('get', urlValue, body, selector, headers, options, queryAll);
}
/**
 * POST a body to a page and query the response for a single match.
 * Delegates to `request` with `queryAll` disabled.
 *
 * @param {string} urlValue - URL to post to
 * @param {*} body - request body, forwarded to `request`
 * @param {string} [selector] - forwarded to `request`
 * @param {Object} [headers] - forwarded to `request`
 * @param {Object} [options] - forwarded to `request`
 * @returns {Promise<Object>} whatever `request` resolves to
 */
async function post(urlValue, body, selector, headers, options) {
	const queryAll = false;

	return request('post', urlValue, body, selector, headers, options, queryAll);
}
/**
 * GET a page and query it for every match of the selector.
 * Delegates to `request` with no body and `queryAll` enabled.
 *
 * @param {string} urlValue - URL to fetch
 * @param {string} [selector] - forwarded to `request`
 * @param {Object} [headers] - forwarded to `request`
 * @param {Object} [options] - forwarded to `request`
 * @returns {Promise<Object>} whatever `request` resolves to
 */
async function getAll(urlValue, selector, headers, options) {
	// the method must be exactly 'get', and the body slot must be filled explicitly so that
	// selector, headers, options and the queryAll flag land on the right parameters of request
	return request('get', urlValue, null, selector, headers, options, true);
}
/**
 * POST a body to a page and query the response for every match of the selector.
 * Delegates to `request` with `queryAll` enabled.
 *
 * @param {string} urlValue - URL to post to
 * @param {*} body - request body, forwarded to `request`
 * @param {string} [selector] - forwarded to `request`
 * @param {Object} [headers] - forwarded to `request`
 * @param {Object} [options] - forwarded to `request`
 * @returns {Promise<Object>} whatever `request` resolves to
 */
async function postAll(urlValue, body, selector, headers, options) {
	const queryAll = true;

	return request('post', urlValue, body, selector, headers, options, queryAll);
}
module.exports = {
@ -470,6 +524,8 @@ module.exports = {
geta: getAll,
qu: quFuncs,
query: quFuncs,
post,
postAll,
prefixUrl,
...legacyFuncs,
};