Fixed various Kelly Madison scraper issues.

Added selector fallbacks for the redesigned site layout, pulled the trailer sources URL from the page markup instead of constructing it, reused a single HTTP session for the scene page and trailer requests, and merged default and per-request HTTP headers instead of overwriting them.

DebaucheryLibrarian 2020-12-17 02:05:01 +01:00
parent d0f8e21466
commit cd8e810c35
25 changed files with 43 additions and 32 deletions

7 binary image files added (not shown): 1017 KiB, 874 KiB, 8.6 KiB, 8.5 KiB, 729 KiB, 45 KiB, 44 KiB.

14 binary image files modified (before/after sizes identical): 5.7 MiB, 550 KiB, 839 KiB, 3.0 MiB, 3.9 MiB, 90 KiB, 699 KiB, 699 KiB, 706 KiB, 708 KiB, 821 KiB, 682 KiB, 843 KiB, 6.5 MiB.

View File

@@ -788,8 +788,9 @@ const tagPhotos = [
 	['ebony', 1, 'Ana Foxxx in "DP Me 4" for HardX'],
 	['facial', 2, 'Ashly Anderson for Hookup Hotshot'],
 	['facial', 'poster', 'Jynx Maze'],
-	['facefucking', 0, 'Ashly Anderson in "Rough Love" for Hookup Hotshot'],
+	['facefucking', 6, 'Halle Hayes in "Towering Temptress" for 5K Porn'],
 	['facefucking', 1, 'Paige Owens in "Dark Meat 12" for Evil Angel'],
+	['facefucking', 0, 'Ashly Anderson in "Rough Love" for Hookup Hotshot'],
 	['facefucking', 2, 'Jynx Maze for Throated'],
 	['facefucking', 4, 'Brooklyn Gray in "Throats Fucks 6" for Evil Angel'],
 	['facefucking', 3, 'Adriana Chechik in "Performing Magic Butt Tricks With Jules Jordan. What Will Disappear In Her Ass?" for Jules Jordan'],

View File

@@ -17,9 +17,9 @@ const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) =>
 function scrapeLatest(scenes, site) {
 	return scenes.map(({ query }) => {
-		const release = { site };
+		const release = {};

-		release.shootId = query.q('.card-meta .text-right, .row .text-right', true);
+		release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);

 		const siteId = release.shootId.match(/\d?\w{2}/)[0];
 		const siteSlug = siteMapByKey[siteId];
@@ -29,24 +29,24 @@ function scrapeLatest(scenes, site) {
 			return null;
 		}

-		const { pathname } = new URL(query.url('h5 a, .ep-title a'));
+		const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
 		[release.entryId] = pathname.match(/\d+$/);
 		release.url = `${site.url}${pathname}`;

-		release.title = query.q('h5 a, .ep-title a', true);
+		release.title = query.cnt('h5 a, .ep-title a, .title a');

-		release.date = query.date('.card-meta .text-left, .row .col-4:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
-		release.actors = query.all('.models a, .ep-models a', true);
+		release.date = query.date('.card-meta .text-left, .row .col-4:first-child, .card-footer-item:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
+		release.actors = query.cnts('.models a, .ep-models a, a[href*="models/"]');
 		release.duration = query.dur('.content a');

-		const duration = query.q('.content a, .ep-runtime strong', true).match(/(\d+) min/)[1];
+		const duration = query.cnt('.content a, .ep-runtime strong, .subtitle:last-child a')?.match(/(\d+) min/)?.[1];
 		if (duration) release.duration = Number(duration) * 60;

 		if (query.exists('.episodes-preview')) {
 			[release.poster, ...release.photos] = query.imgs('.episodes-preview img');
 		} else {
-			release.poster = query.img('.card-img-top');
+			release.poster = query.img('.card-img-top, .image img');

 			release.teaser = {
 				src: query.video('video'),
 			};
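
Aside: most of these fixes follow one pattern — each selector gains a comma-separated fallback for the redesigned markup. querySelector accepts a selector list and returns the first match in document order, so a single call covers both layouts. A minimal sketch, assuming jsdom and made-up markup (the real pages and the project's query helpers aren't shown here):

const { JSDOM } = require('jsdom');

// Hypothetical snippets standing in for the old and new page layouts.
const oldLayout = new JSDOM('<div class="card-meta"><span class="text-right">KM #100</span></div>').window.document;
const newLayout = new JSDOM('<div><span class="card-footer-item">F5K #7</span></div>').window.document;

// One selector list matches whichever layout variant is present.
const selector = '.card-meta .text-right, .card-footer-item:last-child';

console.log(oldLayout.querySelector(selector).textContent); // 'KM #100'
console.log(newLayout.querySelector(selector).textContent); // 'F5K #7'
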
@@ -56,32 +56,32 @@ function scrapeLatest(scenes, site) {
 	}).filter(scene => scene);
 }

-async function scrapeScene({ query, html }, url, baseRelease) {
-	const { pathname, origin } = new URL(url);
+async function scrapeScene({ query, html }, url, baseRelease, channel, session) {
+	const { pathname } = new URL(url);

 	const release = {};
 	[release.entryId] = pathname.match(/\d+$/);

-	const titleString = query.q('.card-header.row h4, .trailer-starring span', true);
-	const episode = titleString.match(/#\d+$/)[0];
+	const titleString = query.cnt('.card-header.row h4, .trailer-starring span, .level-left .level-item');
+	const episode = titleString?.match(/#\d+$/)?.[0];

-	release.title = query.q('.trailer-title', true) || titleString.match(/Trailer: ([\w\s]+) -/)[1];
-	release.channel = slugify(titleString.match(/([\w\s]+) #\d+$/)[1], '');
+	release.title = query.cnt('.trailer-title') || titleString?.match(/(?:Trailer: )?([\w\s]+) -/)?.[1];
+	release.channel = slugify(titleString?.match(/([\w\s]+) #\d+$/)?.[1], '');

 	const siteKey = siteMapBySlug[release.channel];
 	release.shootId = `${siteKey} ${episode}`;

-	release.description = query.q('p.card-text, h5.heavy + p', true);
+	release.description = query.text('p.card-text, h5.heavy + p, .card-content .is-three-fifths');

 	// order not reliable, get keys
-	const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5').reduce((acc, rowEl) => ({
+	const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5, .columns li').reduce((acc, rowEl) => ({
 		...acc,
 		[slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl,
 	}), {});

-	release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD');
+	release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/);
 	release.duration = query.dur(detailElsByKey.episode);
-	release.actors = query.all(detailElsByKey.starring, 'a', true);
+	release.actors = query.cnts(detailElsByKey.starring, 'a');

 	const posterPrefix = html.indexOf('poster:');
 	const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4);
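
Aside: the switch to optional chaining here is what keeps a missed selector from crashing the whole scrape. The old code indexed match results directly, which throws on null. With a plain value standing in for a missed lookup:

// titleString is null when the selector matches nothing, e.g. on a page
// variant the scraper hasn't seen yet.
const titleString = null;

// Before: titleString.match(/#\d+$/)[0] would throw a TypeError on null.
// After: each step short-circuits to undefined instead.
const episode = titleString?.match(/#\d+$/)?.[0];

console.log(episode); // undefined; the scraper can continue and fill gaps later
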
@@ -94,20 +94,20 @@ async function scrapeScene({ query, html }, url, baseRelease) {
 		}
 	}

-	const token = query.meta('name=_token');
-	const trailerInfoUrl = `${origin}/episodes/trailer/sources/${release.entryId}?type=trailer`;
-	const trailerInfoRes = await http.post(trailerInfoUrl, null, {
-		headers: {
-			'X-CSRF-Token': token,
-			'X-Requested-With': 'XMLHttpRequest',
-		},
-	});
+	// const token = query.meta('name=_token');
+	// const trailerInfoUrl = `${channel.url}/episodes/trailer/sources/${release.entryId}?type=trailer`;
+	const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];
+	const trailerInfoRes = await http.post(trailerInfoUrl, null, { session });

-	if (trailerInfoRes.ok && trailerInfoRes.body.sources.length > 0) {
+	if (trailerInfoRes.ok && trailerInfoRes.body.sources?.length > 0) {
 		release.trailer = trailerInfoRes.body.sources.map(trailer => ({
 			src: trailer.src,
 			type: trailer.type,
-			quality: trailer.res.replace(4000, 2160),
+			/* unreliable, sometimes actual video is 720p
+			quality: trailer.res
+				.replace(4000, 2160)
+				.replace(5000, 2880),
+			*/
 		}));
 	}

@@ -151,11 +151,13 @@ async function fetchLatest(channel, page = 1) {
 }

 async function fetchScene(url, channel, baseRelease) {
+	const session = http.session();
+
 	const res = await qu.get(url, null, {
 		'X-Requested-With': 'XMLHttpRequest',
-	});
+	}, { session });

-	return res.ok ? scrapeScene(res.item, url, baseRelease) : res.status;
+	return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status;
 }

 async function fetchProfile({ name: actorName }) {
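
Aside: threading one session through fetchScene and scrapeScene means the trailer POST reuses whatever cookies the scene-page GET established. A minimal sketch of the same pattern using bhttp directly — the project's http/qu wrappers are bypassed here, and fetchTrailerSources is a made-up name:

const bhttp = require('bhttp');

async function fetchTrailerSources(sceneUrl) {
	// One session carries cookies across both requests.
	const session = bhttp.session({
		headers: { 'X-Requested-With': 'XMLHttpRequest' },
	});

	const pageRes = await session.get(sceneUrl);
	const html = pageRes.body.toString();

	// Read the trailer endpoint out of the page itself, as the scraper now
	// does, rather than reconstructing it from the origin.
	const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];

	if (!trailerInfoUrl) {
		return null;
	}

	const trailerRes = await session.post(trailerInfoUrl, null);

	return trailerRes.statusCode === 200 ? trailerRes.body : null;
}
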

View File

@@ -79,6 +79,10 @@ async function request(method = 'get', url, body, requestOptions = {}, limiter)
 	const options = {
 		...defaultOptions,
 		...requestOptions,
+		headers: {
+			...defaultOptions.headers,
+			...requestOptions.headers,
+		},
 		responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
 		stream: !!requestOptions.destination,
 		session: null,
@@ -158,7 +162,7 @@ async function head(url, options) {
 }

 function getSession(options) {
-	return bhttp.session(options);
+	return bhttp.session({ ...defaultOptions, ...options });
 }

 module.exports = {
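
Aside: the headers change fixes a classic shallow-spread bug — spreading requestOptions after defaultOptions replaced the default headers object wholesale instead of merging it. A self-contained illustration (the header values are placeholders, not the module's actual defaults):

const defaultOptions = { headers: { 'user-agent': 'traxxx' } };
const requestOptions = { headers: { 'x-requested-with': 'XMLHttpRequest' } };

// Shallow spread: requestOptions.headers wins outright, dropping the user-agent.
const naive = { ...defaultOptions, ...requestOptions };
console.log(naive.headers); // { 'x-requested-with': 'XMLHttpRequest' }

// One-level-deep merge, as in the fix: both headers survive.
const merged = {
	...defaultOptions,
	...requestOptions,
	headers: { ...defaultOptions.headers, ...requestOptions.headers },
};
console.log(merged.headers); // { 'user-agent': 'traxxx', 'x-requested-with': 'XMLHttpRequest' }
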

View File

@@ -69,6 +69,10 @@ function prefixUrl(urlValue, origin, protocol = 'https') {
 }

 function q(context, selector, attrArg, applyTrim = true) {
+	if (!selector && context.nodeName === '#document') {
+		return null;
+	}
+
 	const attr = attrArg === true ? 'textContent' : attrArg;

 	if (attr) {
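
Aside: the new guard in q() matters when a scraper calls the helper against the document root without a selector; previously it would try to read attributes off the document node itself. A condensed re-implementation to show the effect, assuming jsdom (the real helper handles more attribute and trimming options):

const { JSDOM } = require('jsdom');

function q(context, selector, attrArg) {
	// The guard: a bare document with no selector has nothing to extract.
	if (!selector && context.nodeName === '#document') {
		return null;
	}

	const target = selector ? context.querySelector(selector) : context;
	const attr = attrArg === true ? 'textContent' : attrArg;

	return attr ? target?.[attr]?.trim() ?? null : target;
}

const { document } = new JSDOM('<h5 class="title"><a>Scene #42</a></h5>').window;

console.log(q(document, '.title a', true)); // 'Scene #42'
console.log(q(document, null, true)); // null instead of the document's full text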