Fixed various Kelly Madison scraper issues.

This commit is contained in:
DebaucheryLibrarian 2020-12-17 02:05:01 +01:00
parent d0f8e21466
commit cd8e810c35
25 changed files with 43 additions and 32 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 1017 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 874 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.5 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 729 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 44 KiB

View File

Before

Width:  |  Height:  |  Size: 5.7 MiB

After

Width:  |  Height:  |  Size: 5.7 MiB

View File

Before

Width:  |  Height:  |  Size: 550 KiB

After

Width:  |  Height:  |  Size: 550 KiB

View File

Before

Width:  |  Height:  |  Size: 839 KiB

After

Width:  |  Height:  |  Size: 839 KiB

View File

Before

Width:  |  Height:  |  Size: 3.0 MiB

After

Width:  |  Height:  |  Size: 3.0 MiB

View File

Before

Width:  |  Height:  |  Size: 3.9 MiB

After

Width:  |  Height:  |  Size: 3.9 MiB

View File

Before

Width:  |  Height:  |  Size: 90 KiB

After

Width:  |  Height:  |  Size: 90 KiB

View File

Before

Width:  |  Height:  |  Size: 699 KiB

After

Width:  |  Height:  |  Size: 699 KiB

View File

Before

Width:  |  Height:  |  Size: 699 KiB

After

Width:  |  Height:  |  Size: 699 KiB

View File

Before

Width:  |  Height:  |  Size: 706 KiB

After

Width:  |  Height:  |  Size: 706 KiB

View File

Before

Width:  |  Height:  |  Size: 708 KiB

After

Width:  |  Height:  |  Size: 708 KiB

View File

Before

Width:  |  Height:  |  Size: 821 KiB

After

Width:  |  Height:  |  Size: 821 KiB

View File

Before

Width:  |  Height:  |  Size: 682 KiB

After

Width:  |  Height:  |  Size: 682 KiB

View File

Before

Width:  |  Height:  |  Size: 843 KiB

After

Width:  |  Height:  |  Size: 843 KiB

View File

Before

Width:  |  Height:  |  Size: 6.5 MiB

After

Width:  |  Height:  |  Size: 6.5 MiB

View File

@ -788,8 +788,9 @@ const tagPhotos = [
['ebony', 1, 'Ana Foxxx in "DP Me 4" for HardX'], ['ebony', 1, 'Ana Foxxx in "DP Me 4" for HardX'],
['facial', 2, 'Ashly Anderson for Hookup Hotshot'], ['facial', 2, 'Ashly Anderson for Hookup Hotshot'],
['facial', 'poster', 'Jynx Maze'], ['facial', 'poster', 'Jynx Maze'],
['facefucking', 0, 'Ashly Anderson in "Rough Love" for Hookup Hotshot'], ['facefucking', 6, 'Halle Hayes in "Towering Temptress" for 5K Porn'],
['facefucking', 1, 'Paige Owens in "Dark Meat 12" for Evil Angel'], ['facefucking', 1, 'Paige Owens in "Dark Meat 12" for Evil Angel'],
['facefucking', 0, 'Ashly Anderson in "Rough Love" for Hookup Hotshot'],
['facefucking', 2, 'Jynx Maze for Throated'], ['facefucking', 2, 'Jynx Maze for Throated'],
['facefucking', 4, 'Brooklyn Gray in "Throats Fucks 6" for Evil Angel'], ['facefucking', 4, 'Brooklyn Gray in "Throats Fucks 6" for Evil Angel'],
['facefucking', 3, 'Adriana Chechik in "Performing Magic Butt Tricks With Jules Jordan. What Will Disappear In Her Ass?" for Jules Jordan'], ['facefucking', 3, 'Adriana Chechik in "Performing Magic Butt Tricks With Jules Jordan. What Will Disappear In Her Ass?" for Jules Jordan'],

View File

@ -17,9 +17,9 @@ const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) =>
function scrapeLatest(scenes, site) { function scrapeLatest(scenes, site) {
return scenes.map(({ query }) => { return scenes.map(({ query }) => {
const release = { site }; const release = {};
release.shootId = query.q('.card-meta .text-right, .row .text-right', true); release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
const siteId = release.shootId.match(/\d?\w{2}/)[0]; const siteId = release.shootId.match(/\d?\w{2}/)[0];
const siteSlug = siteMapByKey[siteId]; const siteSlug = siteMapByKey[siteId];
@ -29,24 +29,24 @@ function scrapeLatest(scenes, site) {
return null; return null;
} }
const { pathname } = new URL(query.url('h5 a, .ep-title a')); const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
[release.entryId] = pathname.match(/\d+$/); [release.entryId] = pathname.match(/\d+$/);
release.url = `${site.url}${pathname}`; release.url = `${site.url}${pathname}`;
release.title = query.q('h5 a, .ep-title a', true); release.title = query.cnt('h5 a, .ep-title a, .title a');
release.date = query.date('.card-meta .text-left, .row .col-4:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/); release.date = query.date('.card-meta .text-left, .row .col-4:first-child, .card-footer-item:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
release.actors = query.all('.models a, .ep-models a', true); release.actors = query.cnts('.models a, .ep-models a, a[href*="models/"]');
release.duration = query.dur('.content a'); release.duration = query.dur('.content a');
const duration = query.q('.content a, .ep-runtime strong', true).match(/(\d+) min/)[1]; const duration = query.cnt('.content a, .ep-runtime strong, .subtitle:last-child a')?.match(/(\d+) min/)?.[1];
if (duration) release.duration = Number(duration) * 60; if (duration) release.duration = Number(duration) * 60;
if (query.exists('.episodes-preview')) { if (query.exists('.episodes-preview')) {
[release.poster, ...release.photos] = query.imgs('.episodes-preview img'); [release.poster, ...release.photos] = query.imgs('.episodes-preview img');
} else { } else {
release.poster = query.img('.card-img-top'); release.poster = query.img('.card-img-top, .image img');
release.teaser = { release.teaser = {
src: query.video('video'), src: query.video('video'),
}; };
@ -56,32 +56,32 @@ function scrapeLatest(scenes, site) {
}).filter(scene => scene); }).filter(scene => scene);
} }
async function scrapeScene({ query, html }, url, baseRelease) { async function scrapeScene({ query, html }, url, baseRelease, channel, session) {
const { pathname, origin } = new URL(url); const { pathname } = new URL(url);
const release = {}; const release = {};
[release.entryId] = pathname.match(/\d+$/); [release.entryId] = pathname.match(/\d+$/);
const titleString = query.q('.card-header.row h4, .trailer-starring span', true); const titleString = query.cnt('.card-header.row h4, .trailer-starring span, .level-left .level-item');
const episode = titleString.match(/#\d+$/)[0]; const episode = titleString?.match(/#\d+$/)?.[0];
release.title = query.q('.trailer-title', true) || titleString.match(/Trailer: ([\w\s]+) -/)[1]; release.title = query.cnt('.trailer-title') || titleString?.match(/(?:Trailer: )?([\w\s]+) -/)?.[1];
release.channel = slugify(titleString.match(/([\w\s]+) #\d+$/)[1], ''); release.channel = slugify(titleString?.match(/([\w\s]+) #\d+$/)?.[1], '');
const siteKey = siteMapBySlug[release.channel]; const siteKey = siteMapBySlug[release.channel];
release.shootId = `${siteKey} ${episode}`; release.shootId = `${siteKey} ${episode}`;
release.description = query.q('p.card-text, h5.heavy + p', true); release.description = query.text('p.card-text, h5.heavy + p, .card-content .is-three-fifths');
// order not reliable, get keys // order not reliable, get keys
const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5').reduce((acc, rowEl) => ({ const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5, .columns li').reduce((acc, rowEl) => ({
...acc, ...acc,
[slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl, [slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl,
}), {}); }), {});
release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD'); release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/);
release.duration = query.dur(detailElsByKey.episode); release.duration = query.dur(detailElsByKey.episode);
release.actors = query.all(detailElsByKey.starring, 'a', true); release.actors = query.cnts(detailElsByKey.starring, 'a');
const posterPrefix = html.indexOf('poster:'); const posterPrefix = html.indexOf('poster:');
const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4); const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4);
@ -94,20 +94,20 @@ async function scrapeScene({ query, html }, url, baseRelease) {
} }
} }
const token = query.meta('name=_token'); // const token = query.meta('name=_token');
const trailerInfoUrl = `${origin}/episodes/trailer/sources/${release.entryId}?type=trailer`; // const trailerInfoUrl = `${channel.url}/episodes/trailer/sources/${release.entryId}?type=trailer`;
const trailerInfoRes = await http.post(trailerInfoUrl, null, { const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];
headers: { const trailerInfoRes = await http.post(trailerInfoUrl, null, { session });
'X-CSRF-Token': token,
'X-Requested-With': 'XMLHttpRequest',
},
});
if (trailerInfoRes.ok && trailerInfoRes.body.sources.length > 0) { if (trailerInfoRes.ok && trailerInfoRes.body.sources?.length > 0) {
release.trailer = trailerInfoRes.body.sources.map(trailer => ({ release.trailer = trailerInfoRes.body.sources.map(trailer => ({
src: trailer.src, src: trailer.src,
type: trailer.type, type: trailer.type,
quality: trailer.res.replace(4000, 2160), /* unreliable, sometimes actual video is 720p
quality: trailer.res
.replace(4000, 2160)
.replace(5000, 2880),
*/
})); }));
} }
@ -151,11 +151,13 @@ async function fetchLatest(channel, page = 1) {
} }
async function fetchScene(url, channel, baseRelease) { async function fetchScene(url, channel, baseRelease) {
const session = http.session();
const res = await qu.get(url, null, { const res = await qu.get(url, null, {
'X-Requested-With': 'XMLHttpRequest', 'X-Requested-With': 'XMLHttpRequest',
}); }, { session });
return res.ok ? scrapeScene(res.item, url, baseRelease) : res.status; return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status;
} }
async function fetchProfile({ name: actorName }) { async function fetchProfile({ name: actorName }) {

View File

@ -79,6 +79,10 @@ async function request(method = 'get', url, body, requestOptions = {}, limiter)
const options = { const options = {
...defaultOptions, ...defaultOptions,
...requestOptions, ...requestOptions,
headers: {
...defaultOptions.headers,
...requestOptions.headers,
},
responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000, responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
stream: !!requestOptions.destination, stream: !!requestOptions.destination,
session: null, session: null,
@ -158,7 +162,7 @@ async function head(url, options) {
} }
function getSession(options) { function getSession(options) {
return bhttp.session(options); return bhttp.session({ ...defaultOptions, ...options });
} }
module.exports = { module.exports = {

View File

@ -69,6 +69,10 @@ function prefixUrl(urlValue, origin, protocol = 'https') {
} }
function q(context, selector, attrArg, applyTrim = true) { function q(context, selector, attrArg, applyTrim = true) {
if (!selector && context.nodeName === '#document') {
return null;
}
const attr = attrArg === true ? 'textContent' : attrArg; const attr = attrArg === true ? 'textContent' : attrArg;
if (attr) { if (attr) {