From 915eb75719d8538e3214a20463684ff16a504419 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sat, 22 Feb 2020 23:25:10 +0100 Subject: [PATCH] Refactored Vixen scraper, using API endpoint and added actor profile and releases scraper. Release scraper will return base release when present and 'deep' argument is false. --- assets/js/actors/actions.js | 1 + config/default.js | 8 ++ src/actors.js | 2 +- src/releases.js | 2 +- src/scrape-releases.js | 7 ++ src/scrapers/scrapers.js | 32 +++-- src/scrapers/vixen.js | 232 ++++++++++++++++++++++-------------- 7 files changed, 182 insertions(+), 102 deletions(-) diff --git a/assets/js/actors/actions.js b/assets/js/actors/actions.js index cf51be12..f4d54721 100644 --- a/assets/js/actors/actions.js +++ b/assets/js/actors/actions.js @@ -140,6 +140,7 @@ function initActorActions(store, _router) { url title date + slug ${releaseActorsFragment} ${releaseTagsFragment} ${releasePosterFragment} diff --git a/config/default.js b/config/default.js index 22cb8714..f7df591a 100644 --- a/config/default.js +++ b/config/default.js @@ -43,6 +43,14 @@ module.exports = { 'burningangel', 'brazzers', 'milehighmedia', + [ + 'vixen', + 'tushy', + 'blacked', + 'tushyraw', + 'blackedraw', + 'deeper', + ], [ // Nubiles 'nubiles', diff --git a/src/actors.js b/src/actors.js index 9087d12b..5fa99877 100644 --- a/src/actors.js +++ b/src/actors.js @@ -381,7 +381,7 @@ async function scrapeActors(actorNames) { logger.verbose(`Searching '${actorName}' on ${scraperSlug}`); - const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug); + const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, argv.withReleases); if (profile) { logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`); diff --git a/src/releases.js b/src/releases.js index 084025ea..3b8a8eef 100644 --- a/src/releases.js +++ b/src/releases.js @@ -153,7 +153,7 @@ function curateReleases(releases) { } async function attachChannelSite(release) { - if (!release.site.isFallback) { + if (!release.site?.isFallback) { return release; } diff --git a/src/scrape-releases.js b/src/scrape-releases.js index 56333332..15f1cd39 100644 --- a/src/scrape-releases.js +++ b/src/scrape-releases.js @@ -45,6 +45,13 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli throw new Error(`Could not find site ${url} in database`); } + if (!argv.deep && release) { + return { + ...release, + site, + }; + } + const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; if (!scraper) { diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 21e2f646..4781243b 100644 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -114,9 +114,15 @@ module.exports = { }, actors: { '21sextury': sextury, + analbbc: fullpornnetwork, + analized: fullpornnetwork, + analviolation: fullpornnetwork, anilos: nubiles, babes, + baddaddypov: fullpornnetwork, bangbros, + blacked: vixen, + blackedraw: vixen, blowpass, boobpedia, brattysis: nubiles, @@ -124,47 +130,47 @@ module.exports = { burningangel, cherrypimps, ddfnetwork, + deeper: vixen, deeplush: nubiles, digitalplayground, + dtfsluts: fullpornnetwork, evilangel, fakehub, famedigital, freeones, freeonesLegacy, + girlfaction: fullpornnetwork, + hergape: fullpornnetwork, + homemadeanalwhores: fullpornnetwork, hotcrazymess: nubiles, iconmale, + jamesdeen: fullpornnetwork, julesjordan, kellymadison, legalporno, men, - analbbc: fullpornnetwork, - analized: fullpornnetwork, - analviolation: fullpornnetwork, - baddaddypov: fullpornnetwork, - dtfsluts: fullpornnetwork, - girlfaction: fullpornnetwork, - hergape: fullpornnetwork, - homemadeanalwhores: fullpornnetwork, - jamesdeen: fullpornnetwork, - mugfucked: fullpornnetwork, - onlyprince: fullpornnetwork, - pervertgallery: fullpornnetwork, - povperverts: fullpornnetwork, metrohd, milehighmedia, mofos, + mugfucked: fullpornnetwork, naughtyamerica, nfbusty: nubiles, nubilefilms: nubiles, nubiles, nubilesporn: nubiles, + onlyprince: fullpornnetwork, + pervertgallery: fullpornnetwork, pimpxxx: cherrypimps, pornhub, + povperverts: fullpornnetwork, realitykings, score, thatsitcomshow: nubiles, transangels, + tushy: vixen, + tushyraw: vixen, twistys, + vixen, wicked, xempire, }, diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js index b6030bbd..2cae3a77 100644 --- a/src/scrapers/vixen.js +++ b/src/scrapers/vixen.js @@ -1,10 +1,18 @@ 'use strict'; /* eslint-disable newline-per-chained-call */ -const bhttp = require('bhttp'); -const cheerio = require('cheerio'); +const Promise = require('bluebird'); const moment = require('moment'); +const { get, post } = require('../utils/http'); +const slugify = require('../utils/slugify'); + +const genderMap = { + F: 'female', + M: 'male', + T: 'transsexual', // not yet observed +}; + function getPosterFallbacks(poster) { return poster .filter(image => /landscape/i.test(image.name)) @@ -17,54 +25,46 @@ function getPosterFallbacks(poster) { .flat(); } -function scrapeLatest(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); +function getTeaserFallbacks(teaser) { + return teaser + .filter(video => /landscape/i.test(video.name)) + .map(video => ({ + src: video.src, + type: video.type, + quality: Number(String(video.height).replace('353', '360')), + })); +} - const stateScript = $('script:contains("INITIAL_STATE")').html(); - const { videos: scenes } = JSON.parse(stateScript.slice(stateScript.indexOf('{'), stateScript.indexOf('};') + 1)); +function getAvatarFallbacks(avatar) { + return avatar + .sort((imageA, imageB) => imageB.height - imageA.height) + .map(image => [image.highdpi?.['3x'], image.highdpi?.['2x'], image.src]) + .flat(); +} +function scrapeAll(scenes, site, origin) { return scenes.map((scene) => { - const entryId = String(scene.newId); + const release = {}; - const { - title, - models: actors, - } = scene; + release.title = scene.title; - const url = `${site.url}${scene.targetUrl}`; - const date = moment.utc(scene.releaseDateFormatted, 'MMMM DD, YYYY').toDate(); - const stars = Number(scene.textRating) / 2; + release.entryId = String(scene.newId); + release.url = `${site?.url || origin}${scene.targetUrl}`; - // largest thumbnail. poster is the same image but bigger, too large for storage space efficiency - const poster = scene.images.listing.slice(-1)[0].src; - const teaser = scene.previews.listing.slice(-1)[0]; + release.date = moment.utc(scene.releaseDate).toDate(); + release.shootDate = moment.utc(scene.shootDate).toDate(); - return { - url, - entryId, - title, - actors, - date, - poster, - teaser: { - src: teaser.src, - type: teaser.type, - quality: teaser.height, - }, - rating: { - stars, - }, - site, - }; + release.actors = scene.models; + release.stars = Number(scene.textRating) / 2; + + release.poster = getPosterFallbacks(scene.images.poster); + release.teaser = getTeaserFallbacks(scene.previews.poster); + + return release; }); } -function scrapeUpcoming(html, site) { - const statePrefix = html.indexOf('__INITIAL_STATE__'); - const stateString = html.slice(html.indexOf('{', statePrefix), html.indexOf('};', statePrefix) + 1); - const data = JSON.parse(stateString); - - const scene = data.page.data['/'].data?.nextScene; +function scrapeUpcoming(scene, site) { if (!scene || scene.isPreReleasePeriod) return null; const release = {}; @@ -75,33 +75,23 @@ function scrapeUpcoming(html, site) { .map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`) .join(' '); - release.date = moment.utc(scene.releaseDate).toDate(); release.url = `${site.url}${scene.targetUrl}`; + release.date = moment.utc(scene.releaseDate).toDate(); + release.shootDate = moment.utc(scene.shootDate).toDate(); + release.actors = scene.models; release.poster = getPosterFallbacks(scene.images.poster); - release.teaser = scene.previews.poster - .filter(teaser => /landscape/i.test(teaser.name)) - .map(teaser => ({ - src: teaser.src, - type: teaser.type, - quality: Number(String(teaser.height).replace('353', '360')), - })); + release.teaser = getTeaserFallbacks(scene.previews.poster); release.entryId = (release.poster[0] || release.teaser[0])?.match(/\/(\d+)/)?.[1]; return [release]; } -async function scrapeScene(html, url) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - - const stateObject = $('script:contains("INITIAL_STATE")'); - const data = JSON.parse(stateObject.html().trim().slice(27, -1)); - - const pageData = data.page.data[data.location.pathname].data; - const scene = data.videos.find(video => video.newId === pageData.video); +async function scrapeScene(data, url, site, baseRelease) { + const scene = data.video; const release = { url, @@ -114,64 +104,132 @@ async function scrapeScene(html, url) { tags: scene.tags, }; - release.entryId = pageData.video; - release.actors = scene.models; + release.entryId = scene.newId; + + release.date = moment.utc(scene.releaseDate).toDate(); + release.shootDate = moment.utc(scene.shootDate).toDate(); + + release.actors = baseRelease?.actors || scene.models; - // release.poster = scene.rotatingThumbsUrlSizes[0]['1040w']; release.poster = getPosterFallbacks(scene.images.poster); - release.photos = pageData.pictureset.map(photo => photo.main[0].src); + release.photos = data.pictureset.map(photo => photo.main[0].src); - const trailer = scene.previews.listing.find(preview => preview.height === 353); - if (trailer) release.trailer = { src: trailer }; + release.teaser = getTeaserFallbacks(scene.previews.poster); - // trailer must exist! + const qualities = [360, 480, 720, 1080, 2160]; + const trailersUrl = `${site.url}/api/__tkn/${scene.previewVideoUrl1080P}/trailer/${qualities.join('+')}`; + const trailersRes = await post(trailersUrl, null, { headers: { referer: url } }); - release.teaser = scene.previews.poster - .filter(teaser => /landscape/i.test(teaser.name)) - .map(teaser => ({ - src: teaser.src, - type: teaser.type, - quality: Number(String(teaser.height).replace('353', '360')), - })); - - release.date = new Date(scene.releaseDate); + if (trailersRes.code === 200) { + release.trailer = qualities.map(quality => (trailersRes.body[quality] ? { + src: trailersRes.body[quality].token, + quality, + } : null)).filter(Boolean); + } return release; } -async function fetchLatest(site, page = 1) { - const url = `${site.url}/videos?page=${page}&size=7`; - const res = await bhttp.get(url); +async function fetchActorReleases(pages, model, origin) { + const releasesPerPage = await Promise.map(pages, async (page) => { + const url = `${origin}/api${model.targetUrl}?page=${page}`; + const res = await get(url); - if (res.statusCode === 200) { - return scrapeLatest(res.body.toString(), site); + if (res.code === 200) { + return scrapeAll(res.body.data.videos.videos, null, origin); + } + + return []; + }, { concurrency: 3 }); + + return releasesPerPage.flat(); +} + +async function scrapeProfile(data, origin, withReleases) { + const model = data.model; + const profile = {}; + + profile.birthdate = new Date(model.dateOfBirth); + profile.gender = genderMap[model.sex]; + + profile.hair = model.hairColour; + profile.nationality = model.nationality; + + if (model.biography.trim().length > 0) profile.description = model.biography; + + if (model.cupSize && model.bustMeasurment) profile.bust = `${model.bustMeasurment}${model.cupSize}`; + if (model.waistMeasurment) profile.waist = model.waistMeasurment; + if (model.hipMeasurment) profile.hip = model.hipMeasurment; + + profile.avatar = getAvatarFallbacks(model.images.listing); + profile.poster = getAvatarFallbacks(model.images.profile); + profile.banner = getAvatarFallbacks(model.images.poster); + + const releases = scrapeAll(data.videos.videos, null, origin); + + if (withReleases) { + const pageCount = Math.ceil(data.videos.count / 6); + const otherReleases = await fetchActorReleases((Array.from({ length: pageCount - 1 }, (value, index) => index + 2)), model, origin); + + profile.releases = [...releases, ...otherReleases]; + } else { + profile.releases = releases; } - return res.statusCode; + return profile; +} + +async function fetchLatest(site, page = 1) { + const url = `${site.url}/api/videos?page=${page}`; + const res = await get(url); + + if (res.code === 200) { + return scrapeAll(res.body.data.videos, site); + } + + return res.code; } async function fetchUpcoming(site) { - const res = await bhttp.get(site.url); + const apiUrl = `${site.url}/api`; + const res = await get(apiUrl); - if (res.statusCode === 200) { - return scrapeUpcoming(res.body.toString(), site); + if (res.code === 200) { + return scrapeUpcoming(res.body.data.nextScene, site); } - return res.statusCode; + return res.code; } -async function fetchScene(url, site) { - const res = await bhttp.get(url); +async function fetchScene(url, site, baseRelease) { + const { origin, pathname } = new URL(url); + const apiUrl = `${origin}/api${pathname}`; - if (res.statusCode === 200) { - return scrapeScene(res.body.toString(), url, site); + const res = await get(apiUrl); + + if (res.code === 200) { + return scrapeScene(res.body.data, url, site, baseRelease); } - throw new Error(`Vixen response not OK for scene (${url}): ${res.statusCode}`); + return res.code; +} + +async function fetchProfile(actorName, scraperSlug, withReleases) { + const origin = `https://www.${scraperSlug}.com`; + const actorSlug = slugify(actorName); + const url = `${origin}/api/${actorSlug}`; + const res = await get(url); + + if (res.code === 200) { + return scrapeProfile(res.body.data, origin, withReleases); + } + + return null; } module.exports = { fetchLatest, fetchUpcoming, fetchScene, + fetchProfile, };