Refactored Score.

2026-01-15 04:53:31 +01:00
parent 003aff49f9
commit 37275f8930
10 changed files with 1200 additions and 188 deletions
--- a/src/entities.js
+++ b/src/entities.js
@@ -36,7 +36,7 @@ function curateEntity(entity, includeParameters = false) {
 		id: entity.id,
 		name: entity.name,
 		url: entity.url,
-		origin: new URL(entity.url).origin,
+		origin: entity.url && new URL(entity.url).origin,
 		description: entity.description,
 		slug: entity.slug,
 		type: entity.type,
--- a/src/scrapers/kellymadison.js
+++ b/src/scrapers/kellymadison.js
@@ -189,7 +189,7 @@ function scrapeProfile({ query }) {
 			const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));

 			if (profile.age) {
-				birthday.setUTCFullYear(new Date().getFullYear() - profile.age); // indicate birth year is unknown
+				birthday.setUTCFullYear(new Date().getFullYear() - profile.age);
 			} else {
 				birthday.setUTCFullYear(0); // indicate birth year is unknown
 			}
--- a/src/scrapers/score.js
+++ b/src/scrapers/score.js
@@ -1,253 +1,276 @@
 'use strict';

-const { ex, exa, get } = require('../utils/q');
+const unprint = require('unprint');
+
 const slugify = require('../utils/slugify');
-const http = require('../utils/http');
-const { heightToCm, lbsToKg } = require('../utils/convert');
+const { stripQuery } = require('../utils/url');
+const { convert } = require('../utils/convert');

-function scrapePhotos(html) {
-	const { qis } = ex(html, '#photos-page');
-	const photos = qis('img');
+const sizeRegex = /_lg|_xl|_tn/;

-	return photos.map((photo) => [
-		photo
-			.replace('x_800', 'x_xl')
-			.replace('_tn', ''),
-		photo,
-	]);
-}
-
-async function fetchPhotos(url) {
-	const res = await http.get(url);
-
-	if (res.statusCode === 200) {
-		return scrapePhotos(res.body.toString(), url);
+function resizeSrc(src) {
+	if (!src) {
+		return null;
 	}

-	return [];
+	return Array.from(new Set([
+		src.replace(sizeRegex, '_1280'),
+		src.replace(sizeRegex, '_800'),
+		src.replace(sizeRegex, '_xl'),
+		src,
+	]));
 }

-function scrapeAll(html, site) {
-	return exa(html, '.container .video, .container-fluid .video').map(({ q, qa, qd, ql }) => {
+function scrapeAll(scenes, channel, parameters) {
+	return scenes.map(({ query }) => {
 		const release = {};
+		const poster = query.img('.item-img img');

-		release.title = q('.title, .i-title', true);
+		const url = stripQuery(query.url('a.i-title, .item-img a'));

-		const linkEl = q('a');
-		const url = new URL(linkEl.href);
-		release.url = `${url.origin}${url.pathname}`;
+		release.title = query.content('a.i-title, h2.i-title');
+		release.duration = query.duration('.time-ol');

-		// this is a photo album, not a scene (used for profiles)
-		if (/photos\//.test(url)) return null;
+		release.date = query.date('.i-date', ['MMM. Do', 'MMM. YYYY'], { match: /(\w+\.? \d{1,2}\w+)|(\w+\.? \d{4})/ });

-		[release.entryId] = url.pathname.split('/').slice(-2);
+		if (!release.date) {
+			const date = query.dateAgo('.i-date');

-		release.date = qd('.i-date', 'MMM DD', /\w+ \d{1,2}$/)
-            || qd('.dt-box', 'MMM.DD YYYY');
-		release.actors = site?.parameters?.actors || qa('.model, .i-model', true);
-		release.duration = ql('.i-amount, .amount');
-
-		const posterEl = q('.item-img img');
-
-		if (posterEl) {
-			release.poster = `https:${posterEl.src}`;
+			if (date) {
+				release.date = date.date;
+				release.datePrecision = date.precision === 'week' ? 'month' : date.precision;
+			}
 		}

-		if (posterEl?.dataset.gifPreview) {
-			release.teaser = {
-				src: `https:${posterEl.dataset.gifPreview}`,
-			};
+		release.actors = query.content('.i-model').split(',').map((actor) => actor.trim());
+
+		if (url.includes('join.') || url.includes('/join')) {
+			// no link available, attempt to reconstruct from poster URL
+			const entryId = poster?.match(/posting_(\d+)/)?.[1];
+
+			if (entryId) {
+				// we can get deep data from this
+				release.entryId = entryId;
+				release.url = `${channel.origin}${parameters.path}/${slugify(release.actors[0], '-', { lower: false })}/${entryId}/`;
+			} else {
+				// lost cause, make up entryId to register shallow data
+				release.entryId = slugify(release.title);
+			}
+		} else {
+			release.url = url;
+			release.entryId = new URL(release.url).pathname.match(/\/(\d+)\/?$/)[1];
 		}

+		if (poster) {
+			const caps = Array.from(new Set(Array.from({ length: 6 }, (_src, index) => {
+				const file = `${String(index + 1).padStart(2, '0')}_lg`;
+
+				return poster.replace(/0\d_lg/, file);
+			}))).map((src) => resizeSrc(src));
+
+			release.poster = Array.from({ length: caps[0].length }).flatMap((_value, index) => caps.map((src) => src[index])); // try all the best sources first
+
+			if (caps.length > 1) {
+				release.caps = caps;
+			}
+		}
+
+		release.photos = query.imgs('.thumbs img'); // cards layout
+
+		release.teaser = [
+			query.video('.preview-clip source[type="video/mp4"]'),
+			query.video('.preview-clip source[type="video/webm"]'),
+		].filter(Boolean);
+
 		return release;
-	}).filter(Boolean);
+	});
 }

-async function scrapeScene(html, url, site) {
-	const { qu } = ex(html, '#videos-page, #content section');
+async function fetchLatest(channel, page = 1, { parameters }) {
+	const res = await unprint.get(`${channel.origin}${parameters.path}/?page=${page}`, {
+		interface: 'request', // seemingly less prone to HTTPParserError: Response does not match the HTTP/1.1 protocol (Invalid character in chunk size)
+		selectAll: '.videos .video, .video-wide', // video-wide for cards layout e.g. Big Boobs POV
+	});
+
+	if (res.ok) {
+		return scrapeAll(res.context, channel, parameters);
+	}
+
+	return res.status;
+}
+
+function scrapeScene({ query }, url) {
 	const release = {};

-	[release.entryId] = new URL(url).pathname.split('/').slice(-2);
+	const info = Object.fromEntries(query.all('.stat').map((infoEl) => [
+		slugify(unprint.query.content(infoEl, '.label')),
+		unprint.query.content(infoEl, '.value'),
+	]));

-	release.title = qu.q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true)
-        || qu.q('h1.m-title', true)?.split(/»|\//).slice(-1)[0].trim();
-	release.description = qu.text('.p-desc, .desc');
+	release.url = stripQuery(url);
+	release.entryId = new URL(url).pathname.match(/\/(\d+)\/?$/)[1];

-	release.actors = qu.all('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true);
+	release.title = query.content('.p-desc h2, #videos_page-page h1');
+	release.description = query.text('.p-desc, .desc');

-	if (release.actors.length === 0) {
-		const actorEl = qu.all('.stat').find((stat) => /Featuring/.test(stat.textContent));
-		const actorString = qu.text(actorEl);
+	release.date = unprint.extractDate(info.date, 'MMMM Do, YYYY', { match: /\w+ \d{1,2}\w+, \d{4}/ });
+	release.duration = unprint.extractDuration(info.duration) || Number(info.duration) * 60 || null;

-		release.actors = actorString?.split(/,\band\b|,/g).map((actor) => actor.trim()) || [];
-	}
+	release.actors = query.all('//span[contains(text(), "Featuring")]/following-sibling::span/a').map((actorEl) => ({
+		name: unprint.query.content(actorEl),
+		url: stripQuery(unprint.query.url(actorEl, null)),
+	}));

-	if (release.actors.length === 0 && site.parameters?.actors) release.actors = site.parameters.actors;
+	release.tags = query.contents('.p-desc a[href*="tag/"], .desc a[href*="tag/"]');

-	release.tags = qu.all('a[href*=tag]', true);
+	const style = query.content('.vp style');
+	const poster = query.img('#videos_page-page .item-img img') || style?.match(/background-image: url\('(http[\w.:/_-]+)'\);/)?.[1];
+	const fallbackPoster = resizeSrc(query.img('meta[itemprop="image"]', { attribute: 'content' })); // usually a different image

-	const dateEl = qu.all('.value').find((el) => /\w+ \d+\w+, \d{4}/.test(el.textContent));
-	release.date = qu.date(dateEl, null, 'MMMM Do, YYYY')
-        || qu.date('.date', 'MMMM Do, YYYY', /\w+ \d{1,2}\w+, \d{4}/)
-        || qu.date('.info .holder', 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
+	const photos = query.all('.gallery .thumb').map((imgEl) => {
+		const link = unprint.query.url(imgEl, 'a');
+		const img = unprint.query.img(imgEl, 'img');
+		const isJoin = !link || link.includes('join.') || link.includes('/join');

-	const durationEl = qu.all('value').find((el) => /\d{1,3}:\d{2}/.test(el.textContent));
-	release.duration = qu.dur(durationEl);
+		return Array.from(new Set([
+			...isJoin ? [] : [link],
+			img.replace('_tn', ''),
+			img,
+		]));
+	});

-	release.poster = qu.poster('video') || qu.img('.flowplayer img') || html.match(/posterImage: '(.*\.jpg)'/)?.[1] || null; // _800.jpg is larger than _xl.jpg in landscape
-	const photosUrl = qu.url('.stat a[href*=photos]');
+	if (poster) {
+		release.poster = resizeSrc(poster);

-	if (photosUrl) {
-		release.photos = await fetchPhotos(photosUrl);
+		if (fallbackPoster?.includes(poster)) {
+			release.photos = [fallbackPoster, ...photos]; // fallback poster isn't usually in photoset, prepend
+		} else {
+			release.photos = photos;
+		}
 	} else {
-		release.photos = qu.imgs('img[src*=ThumbNails], .p-photos .tn img').map((photo) => [
-			photo.replace('_tn', ''),
-			photo,
-		]);
+		release.poster = fallbackPoster;
+		release.photos = photos;
 	}

-	const trailers = qu.all('a[href*=Trailers]');
-
-	if (trailers) {
-		release.trailer = trailers.map((trailer) => {
-			const src = `https:${trailer.href}`;
-			const format = trailer.textContent.trim().match(/^\w+/)[0].toLowerCase();
-			const quality = parseInt(trailer.textContent.trim().match(/\d+([a-zA-Z]+)?$/)[0], 10);
-
-			return format === 'mp4' ? { src, quality } : null;
-		}).filter(Boolean);
-	}
-
-	const stars = qu.q('.rate-box').dataset.score;
-	if (stars) release.rating = { stars };
+	release.trailer = query.all('.vp video source').map((videoEl) => ({
+		src: unprint.query.video(videoEl, null),
+		quality: parseInt(unprint.query.attribute(videoEl, null, 'res'), 10) || null,
+	}));

 	return release;
 }

-function scrapeModels(html, actorName) {
-	const { qa } = ex(html);
-	const model = qa('.model a').find((link) => link.title === actorName);
-
-	return model?.href || null;
-}
-
-async function fetchActorReleases(url, accReleases = []) {
-	const res = await get(url);
+async function fetchScene(url, channel, baseRelease) {
+	const res = await unprint.get(url, {
+		interface: 'request',
+	});

 	if (res.ok) {
-		const releases = accReleases.concat(scrapeAll(res.item.document.body.outerHTML));
-		const nextPage = res.item.qu.url('.next-pg');
-
-		if (nextPage && new URL(nextPage).searchParams.has('page')) { // last page has 'next' button linking to join page
-			return fetchActorReleases(nextPage, releases);
-		}
-
-		return releases;
+		return scrapeScene(res.context, url, channel, baseRelease);
 	}

-	return null;
+	return res.status;
 }

-async function scrapeProfile(html, actorUrl, withReleases) {
-	const { q, qa, qi } = ex(html, '#model-page');
-	const profile = { gender: 'female' };
+function scrapeProfile({ query }, url) {
+	const profile = { url };
+	const { pathname } = new URL(url);

-	const bio = qa('.stat').reduce((acc, el) => {
-		const prop = q(el, '.label', true).slice(0, -1);
-		const key = slugify(prop, '_');
-		const value = q(el, '.value', true);
+	const bio = Object.fromEntries(query.all('.m-info .stat').map((bioEl) => [
+		slugify(unprint.query.content(bioEl, '.label'), '_'),
+		unprint.query.content(bioEl, '.value'),
+	]));

-		return {
-			...acc,
-			[key]: value,
-		};
-	}, {});
-
-	if (bio.location) profile.residencePlace = bio.location.replace('Czech Repulic', 'Czech Republic'); // see Laura Lion
-
-	if (bio.birthday) {
-		const birthMonth = bio.birthday.match(/^\w+/)[0].toLowerCase();
-		const [birthDay] = bio.birthday.match(/\d+/);
-
-		profile.birthday = [birthMonth, birthDay]; // currently unused, not to be confused with birthdate
+	if (pathname.includes('big-boob-models')) {
+		profile.gender = 'female';
 	}

-	if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
-	if (bio.hair_color) profile.hair = bio.hair_color;
+	if (pathname.includes('male-performer')) {
+		profile.gender = 'male';
+	}

-	if (bio.height) profile.height = heightToCm(bio.height);
-	if (bio.weight) profile.weight = lbsToKg(bio.weight);
+	profile.avatar = query.img('.item-img a img:not([src*="posting"])');

-	if (bio.bra_size) profile.bust = bio.bra_size;
-	if (bio.measurements) [, profile.waist, profile.hip] = bio.measurements.split('-');
+	profile.placeOfResidence = bio.location;
+	profile.ethnicity = bio.ethnicity;

-	if (bio.occupation) profile.occupation = bio.occupation;
+	profile.height = convert(bio.height, 'cm');
+	profile.weight = convert(bio.weight, 'lb', 'kg');

-	const avatar = qi('img');
-	if (avatar) profile.avatar = avatar;
+	if (bio.bra_size && bio.measurements) {
+		profile.measurements = bio.measurements.replace(/^\d+-/, `${bio.bra_size}-`);
+	} else {
+		profile.measurements = bio.measurements || bio.bra_size;
+	}

-	if (withReleases) {
-		const { origin, pathname } = new URL(actorUrl);
-		profile.releases = await fetchActorReleases(`${origin}${pathname}/scenes?page=1`);
+	profile.hairColor = bio.hair_color;
+
+	const birthday = unprint.extractDate(bio.birthday, 'MMMM D', { match: /\w+.?\s+\d{1,2}/ });
+
+	if (birthday) {
+		birthday.setFullYear(0); // indicate birth year is unknown
+		profile.dateOfBirth = birthday;
 	}

 	return profile;
 }

-async function fetchLatest(site, page = 1) {
-	const latestPath = site.parameters?.path || '/big-boob-videos';
-	const url = `${site.url}${latestPath}?page=${page}`;
-	const res = await http.get(url);
-
-	if (res.statusCode === 200) {
-		return scrapeAll(res.body.toString(), site);
+async function getActorUrl(actor) {
+	if (actor.url) {
+		return actor.url;
 	}

-	return res.statusCode;
-}
+	const searchRes = await unprint.post('https://www.scoreland.com/search-es/', {
+		keywords: actor.name,
+		's_filters[site]': 'all',
+		's_filters[type]': 'models',
+	}, {
+		interface: 'request',
+		form: true,
+		followRedirects: false,
+	});

-async function fetchScene(url, site) {
-	const res = await http.get(url);
+	const res = await unprint.get(searchRes.headers.location, {
+		interface: 'request',
+		cookies: {
+			cisession: searchRes.cookies.cisession,
+		},
+		// followRedirects: false,
+		selectAll: '.li-item.model',
+	});

-	if (res.statusCode === 200) {
-		return scrapeScene(res.body.toString(), url, site);
+	if (res.ok) {
+		const actorEl = res.context.find(({ query }) => slugify(query.content('.i-model')) === actor.slug);
+		const url = actorEl?.query.url('.i-model');
+
+		if (url) {
+			// messy nats link pointing to unpredictable sites, all data seems to be available on scoreland
+			const { pathname } = new URL(url);
+			const actorPath = pathname.match(/\/[\w-]+\/\d+\/?$/);
+
+			if (actorPath) {
+				return `https://www.scoreland.com/big-boob-models${actorPath[0]}`;
+			}
+		}
 	}

 	return null;
 }

-async function fetchProfile({ name: actorName }, context, include, page = 1, source = 0) {
-	const letter = actorName.charAt(0).toUpperCase();
+async function fetchProfile(actor) {
+	const url = await getActorUrl(actor);

-	const sources = [
-		`https://www.scoreland.com/big-boob-models/browse/${letter}/?page=${page}`,
-		`https://www.50plusmilfs.com/xxx-milf-models/browse/${letter}/?page=${page}`,
-	];
+	if (url) {
+		const res = await unprint.get(url, {
+			interface: 'request',
+			select: '#model-page',
+		});

-	const url = sources[source];
-
-	const res = await http.get(url, {
-		followRedirects: false,
-	});
-
-	if (res.statusCode === 200) {
-		const actorUrl = scrapeModels(res.body.toString(), actorName);
-
-		if (actorUrl) {
-			const actorRes = await http.get(actorUrl);
-
-			if (actorRes.statusCode === 200) {
-				return scrapeProfile(actorRes.body.toString(), actorUrl, include.scenes);
-			}
-
-			return null;
+		if (res.ok) {
+			return scrapeProfile(res.context, url);
 		}

-		return fetchProfile({ name: actorName }, context, include, page + 1, source);
-	}
-
-	if (sources[source + 1]) {
-		return fetchProfile({ name: actorName }, context, include, 1, source + 1);
+		return res.status;
 	}

 	return null;