Refactored PornHub scraper to use unprint; added www. prefix to the profile URL.

2026-02-09 01:43:12 +01:00
parent 5bea829acb
commit 75c2a77aea
1 changed files with 26 additions and 48 deletions
--- a/src/scrapers/pornhub.js
+++ b/src/scrapers/pornhub.js
@@ -1,31 +1,22 @@
 'use strict';

-const { JSDOM } = require('jsdom');
-const moment = require('moment');
+const unprint = require('unprint');

-const http = require('../utils/http');
 const slugify = require('../utils/slugify');

-async function scrapeProfile(html, _url, actorName) {
-	const { document } = new JSDOM(html).window;
+async function scrapeProfile({ query }, _url) {
+	const profile = {};

-	const entries = Array.from(document.querySelectorAll('.infoPiece'), (el) => el.textContent.replace(/\n|\t/g, '').split(':'));
+	const entries = query.contents('.infoPiece').map((content) => content.split(':'));
 	const bio = entries.reduce((acc, [key, value]) => (key ? { ...acc, [slugify(key, '_')]: value.trim() } : acc), {});

-	const profile = {
-		name: actorName,
-	};
-
-	const descriptionString = document.querySelector('div[itemprop="description"]') || document.querySelector('.longBio');
-	const avatarEl = document.querySelector('#getAvatar') || document.querySelector('.thumbImage img');
+	profile.description = query.content('div[itemprop="description"]') || query.content('.longBio');

 	if (bio.gender) profile.gender = bio.gender;
 	if (bio.ethnicity) profile.ethnicity = bio.ethnicity;

-	if (descriptionString) profile.description = descriptionString.textContent;
-
-	if (bio.birthday && !/-0001/.test(bio.birthday)) profile.birthdate = moment.utc(bio.Birthday, 'MMM D, YYYY').toDate(); // birthyear sometimes -0001, see Spencer Bradley as of january 2020
-	if (bio.born) profile.birthdate = moment.utc(bio.born, 'YYYY-MM-DD').toDate();
+	if (bio.birthday && !/-0001/.test(bio.birthday)) profile.dateOfBirth = unprint.extractDate(bio.Birthday, 'MMM D, YYYY'); // birthyear sometimes -0001, see Spencer Bradley as of january 2020
+	if (bio.born) profile.dateOfBirth = unprint.extractDate(bio.born, 'YYYY-MM-DD');

 	profile.birthPlace = bio.birth_place || bio.birthplace;
 	profile.residencePlace = bio.city_and_country;
@@ -33,46 +24,33 @@ async function scrapeProfile(html, _url, actorName) {
 	if (bio.measurements && bio.measurements !== '--') profile.measurements = bio.measurements;
 	if (bio.fake_boobs) profile.naturalBoobs = bio.fake_boobs.toLowerCase() === 'no';

-	if (bio.height) profile.height = Number(bio.height.match(/\(\d+/)[0].slice(1));
-	if (bio.weight) profile.weight = Number(bio.weight.match(/\(\d+/)[0].slice(1));
+	if (bio.height) profile.height = unprint.extractNumber(bio.height, { match: /\((\d+)\s*cm\)/, matchIndex: 1 });
+	if (bio.weight) profile.weight = unprint.extractNumber(bio.weight, { match: /\((\d+)\s*kg\)/, matchIndex: 1 });
 	if (bio.hair_color) profile.hairColor = bio.hair_color;
 	if (bio.eyes) profile.eyeColor = bio.eye_color;
-	if (bio.piercings) profile.hasPiercings = bio.piercings.toLowerCase() === 'yes';
-	if (bio.tattoos) profile.hasTattoos = bio.tattoos.toLowerCase() === 'yes';

-	if (avatarEl && !/default\//.test(avatarEl.src)) profile.avatar = avatarEl.src;
-	profile.social = Array.from(document.querySelectorAll('.socialList a'), (el) => el.href).filter((link) => link !== 'https://www.twitter.com/'); // PH links to Twitter itself for some reason
+	if (/yes/i.test(bio.piercings)) profile.hasPiercings = true;
+	if (/no/i.test(bio.piercings)) profile.hasPiercings = false;
+
+	if (/yes/i.test(bio.tattoos)) profile.hasTattoos = true;
+	if (/no/i.test(bio.tattoos)) profile.hasTattoos = false;
+
+	const avatar = query.img('#getAvatar') || query.img('.thumbImage img');
+
+	if (avatar && !/default\//.test(avatar)) {
+		profile.avatar = avatar;
+	}
+
+	profile.socials = query.urls('.socialList a').filter((link) => link !== 'https://www.twitter.com/'); // PH links to Twitter itself for some reason

 	return profile;
 }

-async function fetchProfile({ name: actorName }) {
-	const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-');
+async function fetchProfile(actor) {
+	const url = `https://www.pornhub.com/pornstar/${actor.slug}`;
+	const res = await unprint.get(url);

-	/* Model pages are not reliably associated with actual porn stars
-    const modelUrl = `https://pornhub.com/model/${actorSlug}`;
-    const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
-
-    const [modelRes, pornstarRes] = await Promise.all([
-        http.get(modelUrl),
-        http.get(pornstarUrl),
-    ]);
-
-    const model = modelRes.statusCode === 200 && await scrapeProfile(modelRes.body.toString(), modelUrl, actorName);
-    const pornstar = pornstarRes.statusCode === 200 && await scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName);
-
-    if (model && pornstar) {
-        return {
-            ...model,
-            ...pornstar,
-        };
-    }
-    */
-
-	const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
-	const pornstarRes = await http.get(pornstarUrl);
-
-	return scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName);
+	return scrapeProfile(res.context, url);
 }

 module.exports = {