From 0ae7d2669a1c3f2644e513035be7b2d4a0634961 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Mon, 24 Feb 2020 00:31:36 +0100 Subject: [PATCH] Added profile scraping to Private. --- config/default.js | 7 ++-- src/scrapers/private.js | 91 +++++++++++++++++++++++++++++++++++++--- src/scrapers/scrapers.js | 1 + src/utils/q.js | 2 + 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/config/default.js b/config/default.js index f7df591a..7d79b13d 100644 --- a/config/default.js +++ b/config/default.js @@ -91,9 +91,10 @@ module.exports = { 'pervertgallery', 'povperverts', ], - 'kellymadison', - 'bangbros', + 'private', 'ddfnetwork', + 'bangbros', + 'kellymadison', 'legalporno', 'score', 'boobpedia', @@ -107,7 +108,7 @@ module.exports = { path: './media', thumbnailSize: 320, // width for 16:9 will be exactly 576px thumbnailQuality: 100, - videoQuality: [480, 360, 320, 540, 720, 1080, 2160, 240, 180], + videoQuality: [480, 360, 320, 540, 720, 1080, 2160, 240, 180], limit: 25, // max number of photos per release }, titleSlugLength: 50, diff --git a/src/scrapers/private.js b/src/scrapers/private.js index 7e586304..ba5dcb26 100644 --- a/src/scrapers/private.js +++ b/src/scrapers/private.js @@ -5,6 +5,9 @@ const bhttp = require('bhttp'); const cheerio = require('cheerio'); const moment = require('moment'); +const { get, geta } = require('../utils/q'); +const slugify = require('../utils/slugify'); + async function getPhotos(entryId, site) { const { hostname } = new URL(site.url); @@ -23,17 +26,20 @@ function scrapeLatest(html, site) { return sceneElements.map((element) => { const sceneLinkElement = $(element).find('h3 a'); + const thumbnailElement = $(element).find('a img'); const url = sceneLinkElement.attr('href'); - const title = sceneLinkElement.text(); + // const title = sceneLinkElement.text(); const entryId = url.split('/').slice(-1)[0]; - const date = moment.utc($(element).find('.scene-date'), 'MM/DD/YYYY').toDate(); + const titleText = thumbnailElement.attr('alt'); + const title = titleText.slice(titleText.indexOf(':') + 1).trim(); + + const date = moment.utc($(element).find('.scene-date'), ['MM/DD/YYYY', 'YYYY-MM-DD']).toDate(); const actors = $(element).find('.scene-models a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); const likes = Number($(element).find('.scene-votes').text()); - const thumbnailElement = $(element).find('img.img-responsive'); const photoCount = Number(thumbnailElement.attr('thumbs_num')); const poster = thumbnailElement.attr('src'); const photos = Array.from({ length: photoCount }, (val, index) => thumbnailElement.attr(`src${index + 1}`)); @@ -58,7 +64,7 @@ function scrapeLatest(html, site) { async function scrapeScene(html, url, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); - const release = {}; + const release = { url }; [release.entryId] = url.split('/').slice(-1); release.title = $('.video-wrapper meta[itemprop="name"]').attr('content'); @@ -89,11 +95,66 @@ async function scrapeScene(html, url, site) { release.movie = $('a[data-track="FULL MOVIE"]').attr('href'); const siteElement = $('.content-wrapper .logos-sites a'); - if (siteElement) release.channel = siteElement.text().replace(/\s+/g, '').toLowerCase(); + if (siteElement) release.channel = slugify(siteElement.text(), { delimiter: '' }); return release; } +function scrapeProfile({ html, q, qa, qtx }) { + const profile = {}; + + const bio = qa('.model-facts li:not(.model-facts-long)', true).reduce((acc, fact) => { + const [key, value] = fact.split(':'); + const trimmedValue = value.trim(); + + if (trimmedValue.length === 0 || trimmedValue === '-') return acc; + return { ...acc, [slugify(key, { delimiter: '_' })]: trimmedValue }; + }, {}); + + const description = q('.model-facts-long', true); + if (description) profile.description = description; + + const aliases = qtx('.aka')?.split(/,\s*/); + if (aliases) profile.aliases = aliases; + + if (bio.birth_place) profile.birthPlace = bio.birth_place; + if (bio.nationality) profile.nationality = bio.nationality; + + if (bio.measurements) { + const [bust, waist, hip] = bio.measurements.split('-'); + + if (bust) profile.bust = bust; + if (waist) profile.waist = Number(waist); + if (hip) profile.hip = Number(hip); + } + + if (bio.weight) profile.weight = Number(bio.weight.match(/^\d+/)[0]); + if (bio.height) profile.height = Number(bio.height.match(/^\d+/)[0]); + + if (bio.hair_color) profile.hair = bio.hair_color; + if (bio.eye_color) profile.eye = bio.eye_color; + + if (bio.tattoos) { + profile.hasTattoos = true; + profile.tattoos = bio.tattoos; + } + + if (bio.tattoos) { + profile.hasTattoos = true; + profile.tattoos = bio.tattoos; + } + + if (bio.piercings) { + profile.hasPiercings = true; + profile.piercings = bio.piercings; + } + + profile.avatar = q('.img-pornstar img').dataset.src; + profile.releases = scrapeLatest(html); + + return profile; +} + async function fetchLatest(site, page = 1) { const { hostname } = new URL(site.url); @@ -114,7 +175,27 @@ async function fetchScene(url, site) { return scrapeScene(res.body.toString(), url, site); } +async function fetchProfile(actorName) { + const actorSearchSlug = slugify(actorName, { delimiter: '+' }); + const url = `https://www.private.com/search.php?query=${actorSearchSlug}`; + const modelLinks = await geta(url, '.model h3 a'); + + if (modelLinks) { + const actorSlug = slugify(actorName); + const model = modelLinks.find(({ text }) => slugify(text) === actorSlug); + + if (model) { + const qProfile = await get(model.el.href); + + return qProfile && scrapeProfile(qProfile); + } + } + + return null; +} + module.exports = { fetchLatest, fetchScene, + fetchProfile, }; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 4781243b..22f23c0a 100644 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -163,6 +163,7 @@ module.exports = { pimpxxx: cherrypimps, pornhub, povperverts: fullpornnetwork, + private: privateNetwork, realitykings, score, thatsitcomshow: nubiles, diff --git a/src/utils/q.js b/src/utils/q.js index dacae2ca..67568422 100644 --- a/src/utils/q.js +++ b/src/utils/q.js @@ -5,6 +5,7 @@ const moment = require('moment'); const http = require('./http'); function trim(str) { + if (!str) return null; return str.trim().replace(/\s+/g, ' '); } @@ -194,6 +195,7 @@ function init(element, window) { element, el: element, html: element.outerHTML || element.body.outerHTML, + text: trim(element.textContent), ...(window && { window, document: window.document,