diff --git a/src/scrapers/pornhub.js b/src/scrapers/pornhub.js
index f9486013..fd86dc3c 100755
--- a/src/scrapers/pornhub.js
+++ b/src/scrapers/pornhub.js
@@ -1,31 +1,24 @@
 'use strict';
 
-const { JSDOM } = require('jsdom');
-const moment = require('moment');
+const unprint = require('unprint');
 
-const http = require('../utils/http');
 const slugify = require('../utils/slugify');
 
-async function scrapeProfile(html, _url, actorName) {
-	const { document } = new JSDOM(html).window;
+// Builds a profile object from the page's '.infoPiece' "key: value" entries,
+// description, avatar and social links, read through the supplied query helper.
+async function scrapeProfile({ query }, _url) {
+	const profile = {};
 
-	const entries = Array.from(document.querySelectorAll('.infoPiece'), (el) => el.textContent.replace(/\n|\t/g, '').split(':'));
+	const entries = query.contents('.infoPiece').map((content) => content.split(':'));
 	const bio = entries.reduce((acc, [key, value]) => (key ? { ...acc, [slugify(key, '_')]: value.trim() } : acc), {});
 
-	const profile = {
-		name: actorName,
-	};
-
-	const descriptionString = document.querySelector('div[itemprop="description"]') || document.querySelector('.longBio');
-	const avatarEl = document.querySelector('#getAvatar') || document.querySelector('.thumbImage img');
+	profile.description = query.content('div[itemprop="description"]') || query.content('.longBio');
 
 	if (bio.gender) profile.gender = bio.gender;
 	if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
 
-	if (descriptionString) profile.description = descriptionString.textContent;
-
-	if (bio.birthday && !/-0001/.test(bio.birthday)) profile.birthdate = moment.utc(bio.Birthday, 'MMM D, YYYY').toDate(); // birthyear sometimes -0001, see Spencer Bradley as of january 2020
-	if (bio.born) profile.birthdate = moment.utc(bio.born, 'YYYY-MM-DD').toDate();
+	if (bio.birthday && !/-0001/.test(bio.birthday)) profile.dateOfBirth = unprint.extractDate(bio.birthday, 'MMM D, YYYY'); // birthyear sometimes -0001, see Spencer Bradley as of january 2020
+	if (bio.born) profile.dateOfBirth = unprint.extractDate(bio.born, 'YYYY-MM-DD');
 
 	profile.birthPlace = bio.birth_place || bio.birthplace;
 	profile.residencePlace = bio.city_and_country;
@@ -33,46 +26,34 @@ async function scrapeProfile(html, _url, actorName) {
 	if (bio.measurements && bio.measurements !== '--') profile.measurements = bio.measurements;
 	if (bio.fake_boobs) profile.naturalBoobs = bio.fake_boobs.toLowerCase() === 'no';
 
-	if (bio.height) profile.height = Number(bio.height.match(/\(\d+/)[0].slice(1));
-	if (bio.weight) profile.weight = Number(bio.weight.match(/\(\d+/)[0].slice(1));
+	if (bio.height) profile.height = unprint.extractNumber(bio.height, { match: /\((\d+)\s*cm\)/, matchIndex: 1 });
+	if (bio.weight) profile.weight = unprint.extractNumber(bio.weight, { match: /\((\d+)\s*kg\)/, matchIndex: 1 });
 
 	if (bio.hair_color) profile.hairColor = bio.hair_color;
-	if (bio.eyes) profile.eyeColor = bio.eye_color;
-	if (bio.piercings) profile.hasPiercings = bio.piercings.toLowerCase() === 'yes';
-	if (bio.tattoos) profile.hasTattoos = bio.tattoos.toLowerCase() === 'yes';
-	if (avatarEl && !/default\//.test(avatarEl.src)) profile.avatar = avatarEl.src;
-	profile.social = Array.from(document.querySelectorAll('.socialList a'), (el) => el.href).filter((link) => link !== 'https://www.twitter.com/'); // PH links to Twitter itself for some reason
+	if (bio.eye_color || bio.eyes) profile.eyeColor = bio.eye_color || bio.eyes; // NOTE(review): guard read 'eyes' but value read 'eye_color'; accept either until the label is confirmed
+	if (/yes/i.test(bio.piercings)) profile.hasPiercings = true;
+	if (/no/i.test(bio.piercings)) profile.hasPiercings = false;
+
+	if (/yes/i.test(bio.tattoos)) profile.hasTattoos = true;
+	if (/no/i.test(bio.tattoos)) profile.hasTattoos = false;
+
+	const avatar = query.img('#getAvatar') || query.img('.thumbImage img');
+
+	if (avatar && !/default\//.test(avatar)) {
+		profile.avatar = avatar;
+	}
+
+	profile.socials = query.urls('.socialList a').filter((link) => link !== 'https://www.twitter.com/'); // PH links to Twitter itself for some reason
 
 	return profile;
 }
 
-async function fetchProfile({ name: actorName }) {
-	const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-');
+// Fetches the pornstar page for actor.slug and scrapes it into a profile.
+async function fetchProfile(actor) {
+	const url = `https://www.pornhub.com/pornstar/${actor.slug}`;
+	const res = await unprint.get(url);
 
-	/* Model pages are not reliably associated with actual porn stars
-	const modelUrl = `https://pornhub.com/model/${actorSlug}`;
-	const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
-
-	const [modelRes, pornstarRes] = await Promise.all([
-		http.get(modelUrl),
-		http.get(pornstarUrl),
-	]);
-
-	const model = modelRes.statusCode === 200 && await scrapeProfile(modelRes.body.toString(), modelUrl, actorName);
-	const pornstar = pornstarRes.statusCode === 200 && await scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName);
-
-	if (model && pornstar) {
-		return {
-			...model,
-			...pornstar,
-		};
-	}
-	*/
-
-	const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
-	const pornstarRes = await http.get(pornstarUrl);
-
-	return scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName);
+	return scrapeProfile(res.context, url);
 }
 
 module.exports = {