Fixed various Kelly Madison scraper issues.

Added selector fallbacks for the redesigned site layout, pulled the trailer sources URL from the page markup instead of constructing it, reused a single HTTP session for the scene page and trailer requests, and merged default and per-request HTTP headers instead of overwriting them.

DebaucheryLibrarian 2020-12-17 02:05:01 +01:00
parent d0f8e21466
commit cd8e810c35
25 changed files with 43 additions and 32 deletions

7 binary image files added (not shown): 1017 KiB, 874 KiB, 8.6 KiB, 8.5 KiB, 729 KiB, 45 KiB, 44 KiB.

14 binary image files modified (before/after sizes identical): 5.7 MiB, 550 KiB, 839 KiB, 3.0 MiB, 3.9 MiB, 90 KiB, 699 KiB, 699 KiB, 706 KiB, 708 KiB, 821 KiB, 682 KiB, 843 KiB, 6.5 MiB.

View File

@@ -788,8 +788,9 @@ const tagPhotos = [
 	['ebony', 1, 'Ana Foxxx in "DP Me 4" for HardX'],
 	['facial', 2, 'Ashly Anderson for Hookup Hotshot'],
 	['facial', 'poster', 'Jynx Maze'],
-	['facefucking', 0, 'Ashly Anderson in "Rough Love" for Hookup Hotshot'],
+	['facefucking', 6, 'Halle Hayes in "Towering Temptress" for 5K Porn'],
 	['facefucking', 1, 'Paige Owens in "Dark Meat 12" for Evil Angel'],
+	['facefucking', 0, 'Ashly Anderson in "Rough Love" for Hookup Hotshot'],
 	['facefucking', 2, 'Jynx Maze for Throated'],
 	['facefucking', 4, 'Brooklyn Gray in "Throats Fucks 6" for Evil Angel'],
 	['facefucking', 3, 'Adriana Chechik in "Performing Magic Butt Tricks With Jules Jordan. What Will Disappear In Her Ass?" for Jules Jordan'],

View File

@@ -17,9 +17,9 @@ const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) =>
 function scrapeLatest(scenes, site) {
 	return scenes.map(({ query }) => {
-		const release = { site };
+		const release = {};

-		release.shootId = query.q('.card-meta .text-right, .row .text-right', true);
+		release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);

 		const siteId = release.shootId.match(/\d?\w{2}/)[0];
 		const siteSlug = siteMapByKey[siteId];
@@ -29,24 +29,24 @@ function scrapeLatest(scenes, site) {
 			return null;
 		}

-		const { pathname } = new URL(query.url('h5 a, .ep-title a'));
+		const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
 		[release.entryId] = pathname.match(/\d+$/);
 		release.url = `${site.url}${pathname}`;

-		release.title = query.q('h5 a, .ep-title a', true);
+		release.title = query.cnt('h5 a, .ep-title a, .title a');

-		release.date = query.date('.card-meta .text-left, .row .col-4:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
-		release.actors = query.all('.models a, .ep-models a', true);
+		release.date = query.date('.card-meta .text-left, .row .col-4:first-child, .card-footer-item:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
+		release.actors = query.cnts('.models a, .ep-models a, a[href*="models/"]');
 		release.duration = query.dur('.content a');

-		const duration = query.q('.content a, .ep-runtime strong', true).match(/(\d+) min/)[1];
+		const duration = query.cnt('.content a, .ep-runtime strong, .subtitle:last-child a')?.match(/(\d+) min/)?.[1];
 		if (duration) release.duration = Number(duration) * 60;

 		if (query.exists('.episodes-preview')) {
 			[release.poster, ...release.photos] = query.imgs('.episodes-preview img');
 		} else {
-			release.poster = query.img('.card-img-top');
+			release.poster = query.img('.card-img-top, .image img');

 			release.teaser = {
 				src: query.video('video'),
 			};
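
Aside: most of these fixes follow one pattern — each selector gains a comma-separated fallback for the redesigned markup. querySelector accepts a selector list and returns the first match in document order, so a single call covers both layouts. A minimal sketch, assuming jsdom and made-up markup (the real pages and the project's query helpers aren't shown here):

const { JSDOM } = require('jsdom');

// Hypothetical snippets standing in for the old and new page layouts.
const oldLayout = new JSDOM('<div class="card-meta"><span class="text-right">KM #100</span></div>').window.document;
const newLayout = new JSDOM('<div><span class="card-footer-item">F5K #7</span></div>').window.document;

// One selector list matches whichever layout variant is present.
const selector = '.card-meta .text-right, .card-footer-item:last-child';

console.log(oldLayout.querySelector(selector).textContent); // 'KM #100'
console.log(newLayout.querySelector(selector).textContent); // 'F5K #7'
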
@@ -56,32 +56,32 @@ function scrapeLatest(scenes, site) {
 	}).filter(scene => scene);
 }

-async function scrapeScene({ query, html }, url, baseRelease) {
-	const { pathname, origin } = new URL(url);
+async function scrapeScene({ query, html }, url, baseRelease, channel, session) {
+	const { pathname } = new URL(url);

 	const release = {};
 	[release.entryId] = pathname.match(/\d+$/);

-	const titleString = query.q('.card-header.row h4, .trailer-starring span', true);
-	const episode = titleString.match(/#\d+$/)[0];
+	const titleString = query.cnt('.card-header.row h4, .trailer-starring span, .level-left .level-item');
+	const episode = titleString?.match(/#\d+$/)?.[0];

-	release.title = query.q('.trailer-title', true) || titleString.match(/Trailer: ([\w\s]+) -/)[1];
-	release.channel = slugify(titleString.match(/([\w\s]+) #\d+$/)[1], '');
+	release.title = query.cnt('.trailer-title') || titleString?.match(/(?:Trailer: )?([\w\s]+) -/)?.[1];
+	release.channel = slugify(titleString?.match(/([\w\s]+) #\d+$/)?.[1], '');

 	const siteKey = siteMapBySlug[release.channel];
 	release.shootId = `${siteKey} ${episode}`;

-	release.description = query.q('p.card-text, h5.heavy + p', true);
+	release.description = query.text('p.card-text, h5.heavy + p, .card-content .is-three-fifths');

 	// order not reliable, get keys
-	const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5').reduce((acc, rowEl) => ({
+	const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5, .columns li').reduce((acc, rowEl) => ({
 		...acc,
 		[slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl,
 	}), {});

-	release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD');
+	release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/);
 	release.duration = query.dur(detailElsByKey.episode);
-	release.actors = query.all(detailElsByKey.starring, 'a', true);
+	release.actors = query.cnts(detailElsByKey.starring, 'a');

 	const posterPrefix = html.indexOf('poster:');
 	const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4);
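
Aside: the switch to optional chaining here is what keeps a missed selector from crashing the whole scrape. The old code indexed match results directly, which throws on null. With a plain value standing in for a missed lookup:

// titleString is null when the selector matches nothing, e.g. on a page
// variant the scraper hasn't seen yet.
const titleString = null;

// Before: titleString.match(/#\d+$/)[0] would throw a TypeError on null.
// After: each step short-circuits to undefined instead.
const episode = titleString?.match(/#\d+$/)?.[0];

console.log(episode); // undefined; the scraper can continue and fill gaps later
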
@@ -94,20 +94,20 @@ async function scrapeScene({ query, html }, url, baseRelease) {
 		}
 	}

-	const token = query.meta('name=_token');
-	const trailerInfoUrl = `${origin}/episodes/trailer/sources/${release.entryId}?type=trailer`;
-	const trailerInfoRes = await http.post(trailerInfoUrl, null, {
-		headers: {
-			'X-CSRF-Token': token,
-			'X-Requested-With': 'XMLHttpRequest',
-		},
-	});
+	// const token = query.meta('name=_token');
+	// const trailerInfoUrl = `${channel.url}/episodes/trailer/sources/${release.entryId}?type=trailer`;
+	const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];
+	const trailerInfoRes = await http.post(trailerInfoUrl, null, { session });

-	if (trailerInfoRes.ok && trailerInfoRes.body.sources.length > 0) {
+	if (trailerInfoRes.ok && trailerInfoRes.body.sources?.length > 0) {
 		release.trailer = trailerInfoRes.body.sources.map(trailer => ({
 			src: trailer.src,
 			type: trailer.type,
-			quality: trailer.res.replace(4000, 2160),
+			/* unreliable, sometimes actual video is 720p
+			quality: trailer.res
+				.replace(4000, 2160)
+				.replace(5000, 2880),
+			*/
 		}));
 	}

@@ -151,11 +151,13 @@ async function fetchLatest(channel, page = 1) {
 }

 async function fetchScene(url, channel, baseRelease) {
+	const session = http.session();
+
 	const res = await qu.get(url, null, {
 		'X-Requested-With': 'XMLHttpRequest',
-	});
+	}, { session });

-	return res.ok ? scrapeScene(res.item, url, baseRelease) : res.status;
+	return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status;
 }

 async function fetchProfile({ name: actorName }) {
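
Aside: threading one session through fetchScene and scrapeScene means the trailer POST reuses whatever cookies the scene-page GET established. A minimal sketch of the same pattern using bhttp directly — the project's http/qu wrappers are bypassed here, and fetchTrailerSources is a made-up name:

const bhttp = require('bhttp');

async function fetchTrailerSources(sceneUrl) {
	// One session carries cookies across both requests.
	const session = bhttp.session({
		headers: { 'X-Requested-With': 'XMLHttpRequest' },
	});

	const pageRes = await session.get(sceneUrl);
	const html = pageRes.body.toString();

	// Read the trailer endpoint out of the page itself, as the scraper now
	// does, rather than reconstructing it from the origin.
	const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];

	if (!trailerInfoUrl) {
		return null;
	}

	const trailerRes = await session.post(trailerInfoUrl, null);

	return trailerRes.statusCode === 200 ? trailerRes.body : null;
}
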

View File

@@ -79,6 +79,10 @@ async function request(method = 'get', url, body, requestOptions = {}, limiter)
 	const options = {
 		...defaultOptions,
 		...requestOptions,
+		headers: {
+			...defaultOptions.headers,
+			...requestOptions.headers,
+		},
 		responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
 		stream: !!requestOptions.destination,
 		session: null,
@@ -158,7 +162,7 @@ async function head(url, options) {
 }

 function getSession(options) {
-	return bhttp.session(options);
+	return bhttp.session({ ...defaultOptions, ...options });
 }

 module.exports = {
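
Aside: the headers change fixes a classic shallow-spread bug — spreading requestOptions after defaultOptions replaced the default headers object wholesale instead of merging it. A self-contained illustration (the header values are placeholders, not the module's actual defaults):

const defaultOptions = { headers: { 'user-agent': 'traxxx' } };
const requestOptions = { headers: { 'x-requested-with': 'XMLHttpRequest' } };

// Shallow spread: requestOptions.headers wins outright, dropping the user-agent.
const naive = { ...defaultOptions, ...requestOptions };
console.log(naive.headers); // { 'x-requested-with': 'XMLHttpRequest' }

// One-level-deep merge, as in the fix: both headers survive.
const merged = {
	...defaultOptions,
	...requestOptions,
	headers: { ...defaultOptions.headers, ...requestOptions.headers },
};
console.log(merged.headers); // { 'user-agent': 'traxxx', 'x-requested-with': 'XMLHttpRequest' }
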

View File

@@ -69,6 +69,10 @@ function prefixUrl(urlValue, origin, protocol = 'https') {
 }

 function q(context, selector, attrArg, applyTrim = true) {
+	if (!selector && context.nodeName === '#document') {
+		return null;
+	}
+
 	const attr = attrArg === true ? 'textContent' : attrArg;

 	if (attr) {
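
Aside: the new guard in q() matters when a scraper calls the helper against the document root without a selector; previously it would try to read attributes off the document node itself. A condensed re-implementation to show the effect, assuming jsdom (the real helper handles more attribute and trimming options):

const { JSDOM } = require('jsdom');

function q(context, selector, attrArg) {
	// The guard: a bare document with no selector has nothing to extract.
	if (!selector && context.nodeName === '#document') {
		return null;
	}

	const target = selector ? context.querySelector(selector) : context;
	const attr = attrArg === true ? 'textContent' : attrArg;

	return attr ? target?.[attr]?.trim() ?? null : target;
}

const { document } = new JSDOM('<h5 class="title"><a>Scene #42</a></h5>').window;

console.log(q(document, '.title a', true)); // 'Scene #42'
console.log(q(document, null, true)); // null instead of the document's full text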