Refactored Kelly Madison scraper, using API.

2025-12-14 00:43:56 +01:00
parent c1d548c3df
commit 81b2d25f13
7 changed files with 183 additions and 181 deletions
--- a/package-lock.json
+++ b/package-lock.json
@@ -90,7 +90,7 @@
                "tunnel": "0.0.6",
                "ua-parser-js": "^1.0.37",
                "undici": "^5.28.1",
-                "unprint": "^0.16.1",
+                "unprint": "^0.16.3",
                "url-pattern": "^1.0.3",
                "v-tooltip": "^2.1.3",
                "video.js": "^8.6.1",
@@ -18376,9 +18376,10 @@
            }
        },
        "node_modules/unprint": {
-            "version": "0.16.1",
-            "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.1.tgz",
-            "integrity": "sha512-vOT6kdoZwVae9iHS5H+eBOqTZaVJRJWrBJrfnAEIzqPO8KseFvajd+kLZSL9iCE6Al5S0hi2TuMW89c8YK3Baw==",
+            "version": "0.16.3",
+            "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.3.tgz",
+            "integrity": "sha512-PnToOzQhneDFzf4FzOVQciWWtFTk/Xx7ZkngM+S8n8wfeRfOH7YiYa4EhbD6ZJdEcR2xfVRtlMl3w2fI7nRgPw==",
+            "license": "ISC",
            "dependencies": {
                "axios": "^0.27.2",
                "bottleneck": "^2.19.5",
--- a/package.json
+++ b/package.json
@@ -149,7 +149,7 @@
        "tunnel": "0.0.6",
        "ua-parser-js": "^1.0.37",
        "undici": "^5.28.1",
-        "unprint": "^0.16.1",
+        "unprint": "^0.16.3",
        "url-pattern": "^1.0.3",
        "v-tooltip": "^2.1.3",
        "video.js": "^8.6.1",
--- a/seeds/00_tags.js
+++ b/seeds/00_tags.js
@@ -324,6 +324,14 @@ const tags = [
 		name: 'choking',
 		slug: 'choking',
 	},
+	{
+		name: 'condom',
+		slug: 'condom',
+	},
+	{
+		name: 'no condom',
+		slug: 'no-condom',
+	},
 	{
 		name: 'corporal punishment',
 		slug: 'corporal-punishment',
@@ -1645,6 +1653,10 @@ const aliases = [
 		for: 'enhanced-boobs',
 		secondary: true,
 	},
+	{
+		name: 'implants',
+		for: 'enhanced-boobs',
+	},
 	{
 		name: 'boob job',
 		for: 'titty-fucking',
--- a/seeds/01_networks.js
+++ b/seeds/01_networks.js
@@ -114,7 +114,6 @@ const networks = [
 		name: '5K Vids',
 		url: 'https://www.5kvids.com',
 		parameters: {
-			// layout: 'api',
 			apiKey: 'fiveKCash',
 			apiAddress: 'https://www.8kmilfs.com/api',
 		},
@@ -464,7 +463,6 @@ const networks = [
 		url: 'https://www.kellymadison.com',
 		description: 'Home of Kelly Madison and Ryan Madison',
 		parameters: {
-			// layout: 'api',
 			apiKey: 'kellyCash',
 			apiAddress: 'https://www.pornfidelity.com/api',
 		},
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@@ -5329,6 +5329,7 @@ const sites = [
 			siteId: 3,
 			// older scene pages are only available on PF, even though they are categorized on TS or KM
 			archive: 'https://www.pornfidelity.com',
+			short: 'TF',
 		},
 	},
 	{
@@ -5339,6 +5340,7 @@ const sites = [
 		parent: 'kellymadison',
 		parameters: {
 			siteId: 2,
+			short: 'PF',
 		},
 	},
 	{
@@ -5350,6 +5352,7 @@ const sites = [
 		parameters: {
 			siteId: 1,
 			archive: 'https://www.pornfidelity.com',
+			short: 'KM',
 		},
 	},
 	{
@@ -5360,6 +5363,7 @@ const sites = [
 		parent: '5kvids',
 		parameters: {
 			siteId: 1,
+			short: '5KP',
 		},
 	},
 	{
@@ -5370,6 +5374,7 @@ const sites = [
 		parent: '5kvids',
 		parameters: {
 			siteId: 2,
+			short: '5KT',
 		},
 	},
 	{
@@ -5379,6 +5384,7 @@ const sites = [
 		parent: '5kvids',
 		parameters: {
 			siteId: 3,
+			short: '8KM',
 		},
 	},
 	{
@@ -5388,6 +5394,7 @@ const sites = [
 		parent: '5kvids',
 		parameters: {
 			siteId: 4,
+			short: '8KT',
 		},
 	},
 	// KILLERGRAM
--- a/seeds/06_affiliates.js
+++ b/seeds/06_affiliates.js
@@ -57,6 +57,13 @@ const affiliates = [
 		parameters: 'nats=OTczLjEuMy4zLjAuMC4wLjAuMA',
 		comment: '50% rev share',
 	},
+	{
+		id: 'evilangel',
+		network: 'evilangel',
+		url: 'https://www.g2fame.com/evilangel/go.php?pr=8&su=2&si=128&ad=277470&pa=index&ar=&buffer=',
+		parameters: 'nats=OTczLjEuMy4zLjAuMC4wLjAuMA',
+		comment: '50% rev share',
+	},
 	{
 		id: '_kellymadison',
 		network: 'kellymadison',
--- a/src/scrapers/kellymadison.js
+++ b/src/scrapers/kellymadison.js
@@ -2,86 +2,81 @@

 const config = require('config');
 const unprint = require('unprint');
-const { parse } = require('csv-parse/sync');
+// const { parse } = require('csv-parse/sync');

 const slugify = require('../utils/slugify');
-const qu = require('../utils/qu');
 const http = require('../utils/http');
 const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert');

-const siteMapByKey = {
-	PF: 'pornfidelity',
-	TF: 'teenfidelity',
-	KM: 'kellymadison',
-	'5KP': '5kporn',
-	'5KT': '5kteens',
+const thumbKeyRegex = /(thumb\d+_url)|(episode_thumb_image_\d+_url)/; // numbered thumbnail URL keys, e.g. thumb1_url or episode_thumb_image_1_url
+
+const qualityMap = { // trailer resolution label (looked up in lower case) to vertical resolution in pixels
+	'480p': 480,
+	mobile: 720, // as of recently; may have been lower in the past
+	'720p': 720,
+	'1080p': 1080,
+	'2k': 1440,
+	'4k': 2160,
+	'5k': 2880, // 5K is 5120x2880
+	'8k': 4320,
 };

-const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {});
+function scrapeSceneApi(data, channel) { // builds a release from one episode object of the API response; channel supplies url and parameters.short
+	const release = {};

-function scrapeLatest(scenes, site) {
-	return scenes.map(({ query }) => {
-		const release = {};
+	release.entryId = data.id;

-		release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
+	if (data.url) {
+		// provided URL works but always points to 8KMilfs instead of dedicated site
+		const { pathname } = new URL(data.url);

-		const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
-		[release.entryId] = pathname.match(/\d+$/);
-
-		release.title = query.cnt('h5 a, .ep-title a, .title a');
-
-		release.date = query.date('.card-meta .text-left, .row .col-4:first-child, .card-footer-item:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
-		release.actors = query.cnts('.models a, .ep-models a, a[href*="models/"]');
-
-		// older scenes do not have a working scene page on their native site, but they (often, not always) do on Porn Fidelity
-		// scenes older than year do not show a date; this is not when the URLs stop working, but it's a rough guideline
-		release.url = site.parameters.archive && !release.date
-			? `${site.parameters.archive}${pathname}`
-			: `${site.url}${pathname}`;
-
-		release.duration = query.dur('.content a');
-
-		const duration = query.cnt('.content a, .ep-runtime strong, .subtitle:last-child a')?.match(/(\d+) min/)?.[1];
-		if (duration) release.duration = Number(duration) * 60;
-
-		if (query.exists('.episodes-preview')) {
-			[release.poster, ...release.photos] = query.imgs('.episodes-preview img');
-		} else {
-			release.poster = query.img('.card-img-top, .image img');
-			release.teaser = {
-				src: query.video('video'),
-			};
-		}
-
-		/* using site ID, filter no longer needed
-		const siteId = release.shootId.match(/\d?\w{2}/)[0];
-		const siteSlug = siteMapByKey[siteId];
-
-		if (site.slug !== siteSlug) {
-			// using generic network overview, scene is not from the site we want
-			return { ...acc, unextracted: [...acc.unextracted, release] };
-		}
-
-		return { ...acc, scenes: [...acc.scenes, release] };
-		*/
-
-		return release;
-	});
-}
-
-async function fetchLatest(channel, page = 1) {
-	const url = `${channel.url}/episodes/search?page=${page}&site=${channel.parameters.siteId || ''}`; // TLS issues with teenfidelity.com, same overview on all sites
-	const res = await http.get(url, {
-		headers: {
-			'X-Requested-With': 'XMLHttpRequest',
-		},
-	});
-
-	if (res.ok && res.body.status === 'success') {
-		return scrapeLatest(qu.extractAll(res.body.html, '.episode, .ep'), channel);
+		release.url = unprint.prefixUrl(pathname, channel.url);
 	}

-	return res.status;
+	if (channel.parameters.short && data.sequence_number) {
+		release.shootId = `${channel.parameters.short} #${data.sequence_number}`;
+	}
+
+	release.title = data.title;
+	release.description = data.short_description;
+
+	release.date = new Date(data.publish_on);
+
+	if (data.fullEpisodeLength) {
+		release.duration = data.fullEpisodeLength;
+	} else if (data.full_episode_minutes) {
+		// only the minutes are scaled to seconds; assumes full_episode_seconds is the seconds part of a minutes:seconds length (present so far, but not relied on)
+		release.duration = (data.full_episode_minutes * 60) + (data.full_episode_seconds || 0);
+	}
+
+	release.actors = data.models.map((model) => ({
+		name: model.name,
+		gender: model.sex?.toLowerCase(),
+		url: unprint.prefixUrl(`/models/${model.slug}`, channel.url),
+	}));
+
+	release.poster = data.thumb_url || data.thumb_image_url;
+
+	release.photos = [
+		data.poster_image_url,
+		...Object.entries(data).filter(([key]) => thumbKeyRegex.test(key)).map(([_key, url]) => url),
+	].filter(Boolean); // photo thumbs include poster, don't filter here but in client
+
+	const trailers = data.trailerVideos || data.trailer;
+
+	if (trailers) {
+		release.trailer = Object.entries(trailers)
+			.filter(([key, trailer]) => !key.toLowerCase().includes('_sfw') && !trailer.url?.toLowerCase().includes('_sfw'))
+			.map(([_key, trailer]) => ({
+				src: trailer.url,
+				quality: qualityMap[trailer.resolution?.toLowerCase()] || null,
+			}));
+	}
+
+	release.tags = data.categories.map((category) => category.name);
+	release.photoCount = data.photosetPhotoCount || data.episode_photoset_photo_count;
+
+	return release;
 }

 async function fetchLatestApi(channel, page = 1, { parameters }) {
@@ -92,126 +87,112 @@ async function fetchLatestApi(channel, page = 1, { parameters }) {
 		},
 	});

-	console.log(res.body.data[1]);
-
 	if (res.ok) {
-		const data = parse(res.body, {
-			columns: true,
-			skip_empty_lines: true,
-		});
-
-		console.log(data);
-
-		return null;
+		return res.body.data.map((data) => scrapeSceneApi(data, channel));
 	}

 	return res.status;
 }

-async function scrapeScene({ query, html }, url, baseRelease, channel, session) {
-	const { pathname } = new URL(url);
-	const release = {};
+/* not practical via API, updates endpoint contains all necessary data
+async function fetchSceneApi(url, entity, baseRelease, { parameters }) {
+	// const episodeId = new URL(url).pathname.match(/\/episodes\/\w+\/(\d+)/)?.[1];
+	const episodeId = new URL(url).pathname.match(/\/episodes\/(\d+)/)?.[1];

-	[release.entryId] = pathname.match(/\d+$/);
-
-	const titleString = query.cnt('.card-header.row h4, .trailer-starring span, .level-left .level-item');
-	const episode = titleString?.match(/#\d+$/)?.[0];
-
-	release.title = query.cnt('.trailer-title') || titleString?.match(/(?:Trailer: )?(.+) -/)?.[1];
-	release.channel = slugify(titleString?.match(/([\w\s]+) #\d+$/)?.[1], '');
-
-	const siteKey = siteMapBySlug[release.channel];
-
-	release.shootId = `${siteKey} ${episode}`;
-	release.description = query.text('p.card-text, h5.heavy + p, .card-content .is-three-fifths');
-
-	// order not reliable, get keys
-	const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5, .columns li').reduce((acc, rowEl) => ({
-		...acc,
-		[slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl,
-	}), {});
-
-	release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/);
-	release.duration = query.dur(detailElsByKey.episode);
-	release.actors = query.cnts(detailElsByKey.starring, 'a');
-
-	const posterPrefix = html.indexOf('poster:');
-	const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4);
-
-	if (poster) {
-		if (baseRelease?.poster) {
-			release.photos = [poster, ...(baseRelease.photos || [])];
-		} else {
-			release.poster = poster;
-		}
+	if (!episodeId) {
+		return null;
 	}

-	// const token = query.meta('name=_token');
-	// const trailerInfoUrl = `${channel.url}/episodes/trailer/sources/${release.entryId}?type=trailer`;
-	const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];
-
-	if (trailerInfoUrl) {
-		const trailerInfoRes = await http.post(trailerInfoUrl, null, { session });
-
-		if (trailerInfoRes.ok && trailerInfoRes.body.sources?.length > 0) {
-			release.trailer = trailerInfoRes.body.sources.map((trailer) => ({
-				src: trailer.src,
-				type: trailer.type,
-				/* unreliable, sometimes actual video is 720p
-				quality: trailer.res
-					.replace(4000, 2160)
-					.replace(5000, 2880),
-				*/
-			}));
-		}
-	}
-
-	return release;
-}
-
-async function fetchScene(url, channel, baseRelease) {
-	const session = http.session();
-
-	const res = await qu.get(url, null, {
-		'X-Requested-With': 'XMLHttpRequest',
-	}, {
-		session,
-		followRedirects: false, // redirects to sign-up page if scene not found
+	// JSON API doesn't return poster images, CSV API doesn't have pagination. UPDATE: requested and received both, yet to test
+	const res = await http.get(`${parameters.apiAddress}/affiliates/episodes/${episodeId}`, {
+		headers: {
+			Authorization: `Bearer ${config.apiKeys[parameters.apiKey]}`,
+		},
 	});

-	return res.ok
-		? scrapeScene(res.item, url, baseRelease, channel, session)
-		: res.status;
+	console.log(res.body);
+
+	return;
+
+	if (res.ok) {
+		return scrapeSceneApi(res.body.data, entity);
+	}
+
+	return res.status;
+}
+*/
+
+function composeBio(bioKeys, bioValues) {
+	// pair every label with the value at the same index, keyed by the underscore-slugified label
+	const entries = bioKeys.map((label, position) => [slugify(label, '_'), bioValues[position]]);
+
+	return Object.fromEntries(entries);
+}
+
+function getBio(query) {
+	// per layout, in priority order: selector proving the layout is present, key selector, value selector, query method reading the values
+	const layouts = [
+		{ // Kelly Madison, Fidelity
+			probe: '.profile-stats',
+			keys: '.profile-stats li strong',
+			values: '.profile-stats li',
+			extract: 'texts',
+		},
+		{ // 8K
+			probe: '//h4[contains(text(), "Stats")]',
+			keys: '(//h4[contains(text(), "Stats")])[1]//following-sibling::div//strong',
+			values: '(//h4[contains(text(), "Stats")])[1]//following-sibling::div//p/text()',
+			extract: 'contents',
+		},
+		{ // 5K
+			probe: '.bio-overlay-1',
+			keys: '.bio-overlay-1 td:first-child',
+			values: '.bio-overlay-1 td:last-child',
+			extract: 'contents',
+		},
+	];

+	const layout = layouts.find(({ probe }) => query.exists(probe));

+	return layout ? composeBio(query.contents(layout.keys), query[layout.extract](layout.values)) : null;
 }

 function scrapeProfile({ query }) {
 	const profile = {};
+	const bio = getBio(query);

-	const bioKeys = query.contents('table.table td:nth-child(1), table.table th');
-	const bioValues = query.contents('table.table td:nth-child(2)');
+	const questions = query.contents('.model-faq .content-body .accordion-header, .card .card-header button');
+	const answers = query.contents('.model-faq .content-body .accordion-body, .card .collapse .card-body');

-	const bio = bioKeys.reduce((acc, key, index) => ({
-		...acc,
-		[slugify(key, '_')]: bioValues[index],
-	}), {});
-
-	if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
-	if (bio.measurements) profile.measurements = bio.measurements;
-	if (bio.birthplace) profile.birthPlace = bio.birthplace;
-	if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);
-
-	if (bio.height) {
-		const [feet, inches] = bio.height.match(/\d+/g);
-		profile.height = feetInchesToCm(feet, inches);
+	if (questions.length > 0 && questions.length === answers.length) {
+		profile.description = questions.map((question, index) => `**${question}**\n${answers[index]}`).join('\n');
 	}

-	if (bio.birthday) {
-		const [month, day] = bio.birthday.split('/');
-		const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
+	if (bio) {
+		if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
+		if (bio.measurements) profile.measurements = bio.measurements;
+		if (bio.birthplace) profile.birthPlace = bio.birthplace;
+		if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);

-		birthday.setUTCFullYear(0); // indicate birth year is unknown
+		if (bio.height) {
+			const [feet, inches] = bio.height.match(/\d+/g);
+			profile.height = feetInchesToCm(feet, inches);
+		}

-		profile.dateOfBirth = new Date(birthday);
+		if (bio.age) profile.age = Number(bio.age);
+
+		if (bio.birthday) {
+			const [month, day] = bio.birthday.split('/');
+			const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
+
+			if (profile.age) {
+				birthday.setUTCFullYear(new Date().getFullYear() - profile.age); // approximate the birth year from the listed age
+			} else {
+				birthday.setUTCFullYear(0); // indicate birth year is unknown
+			}
+
+			profile.dateOfBirth = new Date(birthday);
+		}
 	}

 	profile.avatar = query.img('img[src*="model"][src*="headshot"]');
@@ -223,7 +204,8 @@ function scrapeProfile({ query }) {
 async function fetchProfile({ name: actorName }, { entity }) {
 	const actorSlug = slugify(actorName);

-	const res = await unprint.get(`${entity.url}/models/${actorSlug}`, {
+	// 8K sites don't have avatar or interview on model page, always use 5K site
+	const res = await unprint.get(`${entity.slug === '5kvids' ? 'https://www.5kporn.com' : entity.url}/models/${actorSlug}`, {
 		headers: {
 			'X-Requested-With': 'XMLHttpRequest',
 		},
@@ -237,11 +219,6 @@ async function fetchProfile({ name: actorName }, { entity }) {
 }

 module.exports = {
-	fetchLatest,
+	fetchLatest: fetchLatestApi, // the API-backed fetcher replaces the removed HTML fetchLatest
 	fetchProfile,
-	fetchScene,
-	api: {
-		fetchLatest: fetchLatestApi,
-		// fetchScene, fetchSceneApi,
-	},
 };