Expanded puppeteer options. Fixed Mike Adriano scraper. Fixed convert utility.

2024-06-12 03:21:45 +02:00 · 2024-06-12 03:21:45 +02:00 · 8c37071145
parent 54b2cd1209
commit 8c37071145
11 changed files with 180 additions and 93 deletions
--- a/.eslintrc
+++ b/.eslintrc
@ -11,6 +11,7 @@
        "no-tabs": "off",
        "no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
        "no-console": 0,
+		"arrow-body-style": 0,
 		"default-param-last": 0,
        "template-curly-spacing": "off",
        "max-len": 0,
--- a/package-lock.json
+++ b/package-lock.json
@ -88,7 +88,7 @@
                "tunnel": "0.0.6",
                "ua-parser-js": "^1.0.37",
                "undici": "^5.28.1",
-                "unprint": "^0.10.12",
+                "unprint": "^0.11.2",
                "url-pattern": "^1.0.3",
                "v-tooltip": "^2.1.3",
                "video.js": "^8.6.1",
@ -18293,9 +18293,9 @@
            }
        },
        "node_modules/unprint": {
-            "version": "0.10.12",
-            "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.10.12.tgz",
-            "integrity": "sha512-EbRGhkoOcmnMmQBaKZA6Tky6gpEwrhy4tDB1KeajSGhqli7zhlNe3WqsTQPtLBNKa/4M2PJZS8l0GOOjvTLndQ==",
+            "version": "0.11.2",
+            "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.11.2.tgz",
+            "integrity": "sha512-i4WJxmEnd6LKYbcnKAjX8bkaPRdyDlhAAqpxej0qIX0pjK5d17hp51x/RGDMfEe63dlcJtGCn9bhZrGcMY4PXQ==",
            "dependencies": {
                "axios": "^0.27.2",
                "bottleneck": "^2.19.5",
--- a/package.json
+++ b/package.json
@ -147,7 +147,7 @@
        "tunnel": "0.0.6",
        "ua-parser-js": "^1.0.37",
        "undici": "^5.28.1",
-        "unprint": "^0.10.12",
+        "unprint": "^0.11.2",
        "url-pattern": "^1.0.3",
        "v-tooltip": "^2.1.3",
        "video.js": "^8.6.1",
--- a/seeds/00_tags.js
+++ b/seeds/00_tags.js
@ -166,6 +166,10 @@ const tags = [
 		name: 'ass worship',
 		slug: 'ass-worship',
 	},
+	{
+		name: 'audition',
+		slug: 'audition',
+	},
 	{
 		name: 'babe',
 		slug: 'babe',
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@ -6026,7 +6026,7 @@ const sites = [
 	{
 		slug: 'trueanal',
 		name: 'True Anal',
-		url: 'https://trueanal.com',
+		url: 'https://tour.trueanal.com',
 		description: 'TrueAnal is the hottest site with all hardcore Anal content and only the most popular pornstars getting their asses pounded and gapped with huge cock and more!',
 		tags: ['anal'],
 		parent: 'mikeadriano',
@ -6034,14 +6034,14 @@ const sites = [
 	{
 		slug: 'analonly',
 		name: 'Anal Only',
-		url: 'https://analonly.com',
+		url: 'https://tour.analonly.com',
 		tags: ['anal'],
 		parent: 'mikeadriano',
 	},
 	{
 		slug: 'allanal',
 		name: 'All Anal',
-		url: 'https://allanal.com',
+		url: 'https://tour.allanal.com',
 		description: 'Popular babes getting their tight asses filled with cock! Pure anal fucking only at AllAnal!',
 		tags: ['anal', 'mff'],
 		parent: 'mikeadriano',
@ -6049,18 +6049,29 @@ const sites = [
 	{
 		slug: 'nympho',
 		name: 'Nympho',
-		url: 'https://nympho.com',
+		url: 'https://tour.nympho.com',
 		description: 'These Babes have an appetite for nasty, sloppy fucking!',
 		parent: 'mikeadriano',
 	},
 	{
 		slug: 'swallowed',
 		name: 'Swallowed',
-		url: 'https://swallowed.com',
+		url: 'https://tour.swallowed.com',
 		description: 'Swallowed is a Premium adult website for the hottest Blowjobs content online with only the most popular pornstars swallowing cock!',
 		tags: ['blowjob', 'deepthroat', 'facefucking'],
 		parent: 'mikeadriano',
 	},
+	{
+		slug: 'dirtyauditions',
+		name: 'Dirty Auditions',
+		url: 'https://dirtyauditions.com',
+		description: 'Watch hot pornstars tryout for DirtyAuditions in exclusive and extreme HD videos!',
+		tags: ['audition'],
+		parent: 'mikeadriano',
+		parameters: {
+			useBrowser: true,
+		},
+	},
 	// MILE HIGH MEDIA
 	{
 		slug: 'doghousedigital',
--- a/src/.eslintrc
+++ b/src/.eslintrc
@ -14,6 +14,7 @@
        "no-underscore-dangle": 0,
 		"default-param-last": 0,
        "prefer-destructuring": "off",
+		"arrow-body-style": 0,
        "template-curly-spacing": "off",
        "object-curly-newline": "off"
    }
--- a/src/actors.js
+++ b/src/actors.js
@ -443,13 +443,13 @@ async function curateProfile(profile, actor) {
 		curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null;

 		// combined measurement value
-		const measurements = profile.measurements?.match(/(\d+)(\w+)\s*[-x]\s*(\d+)\s*[-x]\s*(\d+)/); // ExCoGi uses x, Jules Jordan has spaces between the dashes
+		const measurements = profile.measurements?.match(/(\d+)(\w+)(?:\s*[-x]\s*(\d+)\s*[-x]\s*(\d+))?/); // ExCoGi uses x, Jules Jordan has spaces between the dashes

 		if (measurements) {
-			curatedProfile.bust = Number(measurements[1]);
-			curatedProfile.cup = measurements[2];
-			curatedProfile.waist = Number(measurements[3]);
-			curatedProfile.hip = Number(measurements[4]);
+			curatedProfile.bust = Number(measurements[1]) || null;
+			curatedProfile.cup = measurements[2] || null;
+			curatedProfile.waist = Number(measurements[3]) || null;
+			curatedProfile.hip = Number(measurements[4]) || null;
 		}

 		curatedProfile.penisLength = Number(profile.penisLength) || profile.penisLength?.match?.(/\d+/)?.[0] || null;
--- a/src/scrapers/mikeadriano.js
+++ b/src/scrapers/mikeadriano.js
@ -1,88 +1,166 @@
 'use strict';

-const qu = require('../utils/qu');
-const http = require('../utils/http');
+const unprint = require('unprint');

-function scrapeAll(scenes) {
+const http = require('../utils/http');
+const slugify = require('../utils/slugify');
+const { convert } = require('../utils/convert');
+
+function scrapeAll(scenes, channel) {
 	return scenes.map(({ query }) => {
 		const release = {};

-		release.title = query.cnt('h3.title a, .content-title-wrap a');
-		release.url = query.url('h3.title a, .content-title-wrap a');
+		release.title = query.content('h3.title a, .content-title-wrap a');
+		release.url = query.url('h3.title a, h1.title a, .content-title-wrap a', { origin: channel.url });

 		const pathname = new URL(release.url).pathname;
-		release.entryId = pathname.match(/\/view\/(\d+)/)?.[1] || pathname.match(/\/view\/([\w-]+)/)?.[1];

-		release.description = query.cnt('.desc, .content-description');
-		release.date = query.date('.date, time, .hide', 'Do MMM YYYY');
+		release.entryId = pathname.match(/\/scenes\/([\w-]+)/)?.[1];

-		release.actors = query.cnts('h4.models a, .content-models a');
-		release.duration = query.dur('.total-time');
+		release.description = query.content('.desc, .content-description');
+		release.date = query.date('.date, time, .hide', 'Do MMM YYYY', { match: null });
+
+		release.actors = query.contents('h4.models a, .content-models a');
+		release.duration = query.duration('//span[contains(@class, "total-time") and text()[contains(., ":")]]'); // total-time is also used for photo counts on True Anal

 		const [poster, ...primaryPhotos] = query.imgs('a img');
-		const secondaryPhotos = query.styles('.thumb-top, .thumb-bottom, .thumb-mouseover', 'background-image').map((style) => style.match(/url\((.*)\)/)[1]);
+		const secondaryPhotos = query.styles('.thumb-top, .thumb-bottom, .thumb-mouseover', { styleAttribute: 'background-image' }).map((style) => style.match(/url\((.*)\)/)?.[1]);
+
+		release.poster = [
+			poster.replace(/-c\d+x\d+/, ''),
+			poster,
+		];

-		release.poster = poster;
 		release.photos = primaryPhotos.concat(secondaryPhotos);

 		return release;
 	});
 }

-async function scrapeScene({ query }, url) {
+async function scrapeScene({ query }, url, channel) {
 	const release = {};

 	const pathname = new URL(url).pathname;
-	release.entryId = pathname.match(/\/view\/(\d+)/)?.[1] || pathname.match(/\/view\/([\w-]+)/)?.[1];
+	const data = query.json('#__NEXT_DATA__')?.props?.pageProps?.content;

-	release.title = query.cnt('.content-page-info .title');
-	release.description = query.cnt('.content-page-info .desc');
-	release.date = query.date('.content-page-info .date, .content-page-info .hide, .post-date', 'Do MMM YYYY');
+	release.entryId = data?.slug || pathname.match(/\/scenes\/([\w-]+)/)?.[1];

-	release.actors = query.cnts('.content-page-info .models a');
-	release.duration = query.dur('.content-page-info .total-time:last-child');
+	release.title = data?.title || query.content('.content-page-info .title');
+	release.description = data?.description || query.content('.content-page-info .desc');
+	release.date = data?.formatted_date
+		? unprint.extractDate(data.formatted_date, 'Do MMM YYYY', { match: null })
+		: query.date('.content-page-info .date, .content-page-info .hide, .post-date', 'Do MMM YYYY', { match: null });

-	release.poster = query.poster('.content-page-header video, .content-page-header-inner video') || query.poster('#main-player', 'data-screenshot');
-	release.trailer = query.video('.content-page-header source, .content-page-header-inner source') || query.q('#main-player', 'data-url');
+	release.actors = data?.models_thumbs?.map((actor) => ({
+		name: actor.name,
+		url: actor.slug && `${channel.url}/models/${actor.slug}`,
+		avatar: actor.thumb,
+	}))
+		|| query.elements('.content-page-info .models a').map((actorEl) => ({
+			name: unprint.query(actorEl), // NOTE(review): scrapeProfile uses unprint.query as a namespace (unprint.query.content) - confirm it is callable here
+			url: unprint.url(actorEl, null),
+		}));
+
+	release.duration = data?.seconds_duration || query.duration('.content-page-info .total-time:last-child');
+
+	release.poster = [data?.trailer_screencap, data?.thumb, data?.extra_thumbnails?.[0]].filter(Boolean); // candidate list; missing values are dropped
+	release.photos = data?.extra_thumbnails?.slice(1); // first photo is poster
+
+	release.trailer = data?.trailer_url || null;
+	release.caps = data?.thumbs;
+
+	release.tags = data?.tags;
+
+	release.qualities = data?.videos && Object.values(data.videos).map((video) => video.height);

 	return release;
 }

-async function fetchLatest(channel, page = 1) {
-	const { host } = new URL(channel.url);
-	const url = `https://tour.${host}/videos?page=${page}`;
+async function scrapeProfile({ query }) {
+	const profile = {};

-	const res = await qu.get(url);
+	const bio = Object.fromEntries(query.all('.model-info li, .model-desc li').map((el) => [
+		slugify(unprint.query.content(el, 'span')),
+		unprint.query.text(el),
+	]));
+
+	const avatar = query.img('.model-photo img, img[alt="model"]');
+
+	if (avatar) {
+		profile.avatar = [
+			avatar.replace(/-\d+x\d+/, ''),
+			avatar,
+		];
+	}
+
+	if (bio && Object.keys(bio).length > 0) {
+		profile.description = bio.bio;
+
+		profile.dateOfBirth = bio.birthdate && unprint.extractDate(bio.birthdate, 'YYYY-MM-DD');
+		profile.birthPlace = bio.born;
+
+		profile.measurements = bio.measurements;
+
+		profile.height = convert(bio.height, 'cm');
+		profile.weight = convert(bio.weight, 'lb', 'kg');
+
+		profile.eyes = bio.eyes;
+		profile.hairColor = bio.hair;
+	}
+
+	return profile;
+}
+
+async function fetchLatestContent(url, parameters) {
+	if (parameters.useBrowser) {
+		const res = await http.get(url, {
+			bypassBrowser: 'shared',
+			bypass: {
+				evaluate: async () => {
+					// images lazy loaded by JS, gradually scroll through page
+					return Array.from(document.querySelectorAll('.content-item ')).reduce(async (chain, el) => {
+						await chain;
+
+						return new Promise((resolve) => {
+							el.scrollIntoView();
+							setTimeout(resolve, 20);
+						});
+					}, Promise.resolve());
+				},
+			},
+		});
+
+		if (res.statusCode !== 200) {
+			return {
+				ok: false,
+				status: res.statusCode,
+			};
+		}
+
+		const context = unprint.init(res.body);
+
+		return {
+			ok: true,
+			status: res.statusCode,
+			context,
+		};
+	}
+
+	const res = await unprint.get(url);
+
+	return res;
+}
+
+async function fetchLatest(channel, page = 1, { parameters }) {
+	const url = `${channel.url}/scenes?page=${page}`;
+	const res = await fetchLatestContent(url, parameters);

 	if (res.ok) {
-		if (res.item.query.exists('a[href*="stackpath.com"]')) {
+		if (res.context.query.exists('a[href*="stackpath.com"]')) {
 			throw new Error('URL blocked by StackPath');
 		}

-		return scrapeAll(qu.initAll(res.item.el, '.content-item-large, .content-item, .content-border'), channel);
-	}
-
-	return res.status;
-}
-
-async function fetchUpcoming(channel) {
-	const { host } = new URL(channel.url);
-	const url = `https://tour.${host}`;
-
-	const res = await qu.get(url);
-
-	if (res.ok) {
-		if (res.item.query.exists('a[href*="stackpath.com"]')) {
-			throw new Error('URL blocked by StackPath');
-		}
-
-		const sceneItem = qu.init(res.item.el, '#upcoming-content');
-
-		if (sceneItem) {
-			return scrapeAll([sceneItem], channel);
-		}
-
-		return null;
+		return scrapeAll(unprint.initAll(res.context.query.all('.content-item-large, .content-item, .content-border')), channel);
 	}

 	return res.status;
@ -92,58 +170,40 @@ async function fetchScene(url, channel) {
 	const cookieJar = http.cookieJar();
 	const session = http.session({ cookieJar });

-	/* not working
-	const resA = await http.get(url, {
-		session,
-		extract: {
-			runScripts: 'dangerously',
-		},
-	});
-
-	cookieJar.setCookieSync(http.toughCookie.Cookie.parse(resA.document.cookie), url);
-
-	console.log(res.req);
-	*/
-
 	const res = await http.get(url, {
 		session,
 	});

 	if (res.ok) {
-		const item = qu.init(res.document);
+		const context = unprint.init(res.body);

-		if (item.query.exists('a[href*="stackpath.com"]')) {
+		if (context.query.exists('a[href*="stackpath.com"]')) {
 			throw new Error('URL blocked by StackPath');
 		}

-		return scrapeScene(item, url, channel);
+		return scrapeScene(context, url, channel);
 	}

 	return res.status;
 }

-/* API protected
-async function fetchProfile({ name: actorName }, context , site) {
+async function fetchProfile(actor, context) {
 	const session = http.session();

-	await http.get(`https://tour.${site.slug}.com`, { session });
+	await http.get(context.channel.url, { session });

-	const url = `https://tour.${site.slug}.com/search-preview`;
-	const res = await http.post(url, { q: actorName }, {
-		session,
-		headers: {
-			'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
-			origin: `https://tour.${site.slug}.com`,
-		},
-	});
+	const url = `${context.channel.url}/models/${actor.slug}`;
+	const res = await unprint.get(url);

-	console.log(res.body.toString());
+	if (res.ok) {
+		return scrapeProfile(res.context, context.channel);
+	}
+
+	return res.status;
 }
-*/

 module.exports = {
 	fetchLatest,
-	fetchUpcoming,
-	// fetchProfile,
+	fetchProfile,
 	fetchScene,
 };
--- a/src/scrapers/scrapers.js
+++ b/src/scrapers/scrapers.js
@ -211,6 +211,7 @@ const scrapers = {
 		deeplush: nubiles,
 		devilsfilm: famedigital,
 		digitalplayground: aylo,
+		dirtyauditions: mikeadriano,
 		dorcelclub: dorcel,
 		doubleviewcasting: firstanalquest,
 		dtfsluts: fullpornnetwork,
--- a/src/utils/convert.js
+++ b/src/utils/convert.js
@ -79,7 +79,7 @@ function convertApi(input, fromOrTo, to) {

 		const inputNumber = Number(typeof input === 'string' ? input.match(/\d+(\.\d+)?/)?.[0] : input);

-		return Math.round(convert(inputNumber).from(fromOrTo).to(to)) || null;
+		return Math.round(convert(inputNumber, fromOrTo).to(to)) || null;
 	} catch (error) {
 		logger.error(error);
 		return null;
--- a/src/utils/http.js
+++ b/src/utils/http.js
@ -162,7 +162,7 @@ async function getBrowserSession(identifier, options = {}) {
 	return limiters.bypass.schedule(async () => {
 		if (!browser) {
 			browser = await puppeteer.launch({
-				headless: typeof options.headless === 'undefined' ? 'new' : options.headless,
+				headless: typeof options.bypass?.headless === 'undefined' ? 'new' : options.bypass.headless,
 				// headless: false,
 			});

@ -177,10 +177,19 @@ async function getBrowserSession(identifier, options = {}) {
 	});
 }

-async function bypassBrowserRequest(url, _options) {
-	const { tab } = await getBrowserSession(new URL(url).hostname);
+async function bypassBrowserRequest(url, options) {
+	const { tab } = await getBrowserSession(new URL(url).hostname, options);

 	const res = await tab.goto(url);
+
+	if (options.bypass?.delay) {
+		await Promise.delay(options.bypass.delay);
+	}
+
+	if (typeof options.bypass?.evaluate === 'function') {
+		await tab.evaluate(options.bypass.evaluate, options.bypass);
+	}
+
 	const rawBody = await tab.content();

 	const headers = res.headers();