Refactored the Vixen scraper to use the API endpoint, and added actor profile and actor releases scrapers. The release scraper now returns the base release when one is present and the 'deep' argument is false.

This commit is contained in:
ThePendulum 2020-02-22 23:25:10 +01:00
parent d977a5e712
commit 915eb75719
7 changed files with 182 additions and 102 deletions

View File

@ -140,6 +140,7 @@ function initActorActions(store, _router) {
url url
title title
date date
slug
${releaseActorsFragment} ${releaseActorsFragment}
${releaseTagsFragment} ${releaseTagsFragment}
${releasePosterFragment} ${releasePosterFragment}

View File

@ -43,6 +43,14 @@ module.exports = {
'burningangel', 'burningangel',
'brazzers', 'brazzers',
'milehighmedia', 'milehighmedia',
[
'vixen',
'tushy',
'blacked',
'tushyraw',
'blackedraw',
'deeper',
],
[ [
// Nubiles // Nubiles
'nubiles', 'nubiles',

View File

@ -381,7 +381,7 @@ async function scrapeActors(actorNames) {
logger.verbose(`Searching '${actorName}' on ${scraperSlug}`); logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug); const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, argv.withReleases);
if (profile) { if (profile) {
logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`); logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

View File

@ -153,7 +153,7 @@ function curateReleases(releases) {
} }
async function attachChannelSite(release) { async function attachChannelSite(release) {
if (!release.site.isFallback) { if (!release.site?.isFallback) {
return release; return release;
} }

View File

@ -45,6 +45,13 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
throw new Error(`Could not find site ${url} in database`); throw new Error(`Could not find site ${url} in database`);
} }
if (!argv.deep && release) {
return {
...release,
site,
};
}
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
if (!scraper) { if (!scraper) {

View File

@ -114,9 +114,15 @@ module.exports = {
}, },
actors: { actors: {
'21sextury': sextury, '21sextury': sextury,
analbbc: fullpornnetwork,
analized: fullpornnetwork,
analviolation: fullpornnetwork,
anilos: nubiles, anilos: nubiles,
babes, babes,
baddaddypov: fullpornnetwork,
bangbros, bangbros,
blacked: vixen,
blackedraw: vixen,
blowpass, blowpass,
boobpedia, boobpedia,
brattysis: nubiles, brattysis: nubiles,
@ -124,47 +130,47 @@ module.exports = {
burningangel, burningangel,
cherrypimps, cherrypimps,
ddfnetwork, ddfnetwork,
deeper: vixen,
deeplush: nubiles, deeplush: nubiles,
digitalplayground, digitalplayground,
dtfsluts: fullpornnetwork,
evilangel, evilangel,
fakehub, fakehub,
famedigital, famedigital,
freeones, freeones,
freeonesLegacy, freeonesLegacy,
girlfaction: fullpornnetwork,
hergape: fullpornnetwork,
homemadeanalwhores: fullpornnetwork,
hotcrazymess: nubiles, hotcrazymess: nubiles,
iconmale, iconmale,
jamesdeen: fullpornnetwork,
julesjordan, julesjordan,
kellymadison, kellymadison,
legalporno, legalporno,
men, men,
analbbc: fullpornnetwork,
analized: fullpornnetwork,
analviolation: fullpornnetwork,
baddaddypov: fullpornnetwork,
dtfsluts: fullpornnetwork,
girlfaction: fullpornnetwork,
hergape: fullpornnetwork,
homemadeanalwhores: fullpornnetwork,
jamesdeen: fullpornnetwork,
mugfucked: fullpornnetwork,
onlyprince: fullpornnetwork,
pervertgallery: fullpornnetwork,
povperverts: fullpornnetwork,
metrohd, metrohd,
milehighmedia, milehighmedia,
mofos, mofos,
mugfucked: fullpornnetwork,
naughtyamerica, naughtyamerica,
nfbusty: nubiles, nfbusty: nubiles,
nubilefilms: nubiles, nubilefilms: nubiles,
nubiles, nubiles,
nubilesporn: nubiles, nubilesporn: nubiles,
onlyprince: fullpornnetwork,
pervertgallery: fullpornnetwork,
pimpxxx: cherrypimps, pimpxxx: cherrypimps,
pornhub, pornhub,
povperverts: fullpornnetwork,
realitykings, realitykings,
score, score,
thatsitcomshow: nubiles, thatsitcomshow: nubiles,
transangels, transangels,
tushy: vixen,
tushyraw: vixen,
twistys, twistys,
vixen,
wicked, wicked,
xempire, xempire,
}, },

View File

@ -1,10 +1,18 @@
'use strict'; 'use strict';
/* eslint-disable newline-per-chained-call */ /* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp'); const Promise = require('bluebird');
const cheerio = require('cheerio');
const moment = require('moment'); const moment = require('moment');
const { get, post } = require('../utils/http');
const slugify = require('../utils/slugify');
// Translates the single-letter `sex` code of an API model into the gender label
// stored on the scraped profile (looked up as genderMap[model.sex] in scrapeProfile).
// A code that is not listed here results in an undefined gender.
const genderMap = {
F: 'female',
M: 'male',
T: 'transsexual', // not yet observed
};
function getPosterFallbacks(poster) { function getPosterFallbacks(poster) {
return poster return poster
.filter(image => /landscape/i.test(image.name)) .filter(image => /landscape/i.test(image.name))
@ -17,54 +25,46 @@ function getPosterFallbacks(poster) {
.flat(); .flat();
} }
function scrapeLatest(html, site) { function getTeaserFallbacks(teaser) {
const $ = cheerio.load(html, { normalizeWhitespace: true }); return teaser
.filter(video => /landscape/i.test(video.name))
.map(video => ({
src: video.src,
type: video.type,
quality: Number(String(video.height).replace('353', '360')),
}));
}
const stateScript = $('script:contains("INITIAL_STATE")').html(); function getAvatarFallbacks(avatar) {
const { videos: scenes } = JSON.parse(stateScript.slice(stateScript.indexOf('{'), stateScript.indexOf('};') + 1)); return avatar
.sort((imageA, imageB) => imageB.height - imageA.height)
.map(image => [image.highdpi?.['3x'], image.highdpi?.['2x'], image.src])
.flat();
}
function scrapeAll(scenes, site, origin) {
return scenes.map((scene) => { return scenes.map((scene) => {
const entryId = String(scene.newId); const release = {};
const { release.title = scene.title;
title,
models: actors,
} = scene;
const url = `${site.url}${scene.targetUrl}`; release.entryId = String(scene.newId);
const date = moment.utc(scene.releaseDateFormatted, 'MMMM DD, YYYY').toDate(); release.url = `${site?.url || origin}${scene.targetUrl}`;
const stars = Number(scene.textRating) / 2;
// largest thumbnail. poster is the same image but bigger, too large for storage space efficiency release.date = moment.utc(scene.releaseDate).toDate();
const poster = scene.images.listing.slice(-1)[0].src; release.shootDate = moment.utc(scene.shootDate).toDate();
const teaser = scene.previews.listing.slice(-1)[0];
return { release.actors = scene.models;
url, release.stars = Number(scene.textRating) / 2;
entryId,
title, release.poster = getPosterFallbacks(scene.images.poster);
actors, release.teaser = getTeaserFallbacks(scene.previews.poster);
date,
poster, return release;
teaser: {
src: teaser.src,
type: teaser.type,
quality: teaser.height,
},
rating: {
stars,
},
site,
};
}); });
} }
function scrapeUpcoming(html, site) { function scrapeUpcoming(scene, site) {
const statePrefix = html.indexOf('__INITIAL_STATE__');
const stateString = html.slice(html.indexOf('{', statePrefix), html.indexOf('};', statePrefix) + 1);
const data = JSON.parse(stateString);
const scene = data.page.data['/'].data?.nextScene;
if (!scene || scene.isPreReleasePeriod) return null; if (!scene || scene.isPreReleasePeriod) return null;
const release = {}; const release = {};
@ -75,33 +75,23 @@ function scrapeUpcoming(html, site) {
.map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`) .map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
.join(' '); .join(' ');
release.date = moment.utc(scene.releaseDate).toDate();
release.url = `${site.url}${scene.targetUrl}`; release.url = `${site.url}${scene.targetUrl}`;
release.date = moment.utc(scene.releaseDate).toDate();
release.shootDate = moment.utc(scene.shootDate).toDate();
release.actors = scene.models; release.actors = scene.models;
release.poster = getPosterFallbacks(scene.images.poster); release.poster = getPosterFallbacks(scene.images.poster);
release.teaser = scene.previews.poster release.teaser = getTeaserFallbacks(scene.previews.poster);
.filter(teaser => /landscape/i.test(teaser.name))
.map(teaser => ({
src: teaser.src,
type: teaser.type,
quality: Number(String(teaser.height).replace('353', '360')),
}));
release.entryId = (release.poster[0] || release.teaser[0])?.match(/\/(\d+)/)?.[1]; release.entryId = (release.poster[0] || release.teaser[0])?.match(/\/(\d+)/)?.[1];
return [release]; return [release];
} }
async function scrapeScene(html, url) { async function scrapeScene(data, url, site, baseRelease) {
const $ = cheerio.load(html, { normalizeWhitespace: true }); const scene = data.video;
const stateObject = $('script:contains("INITIAL_STATE")');
const data = JSON.parse(stateObject.html().trim().slice(27, -1));
const pageData = data.page.data[data.location.pathname].data;
const scene = data.videos.find(video => video.newId === pageData.video);
const release = { const release = {
url, url,
@ -114,64 +104,132 @@ async function scrapeScene(html, url) {
tags: scene.tags, tags: scene.tags,
}; };
release.entryId = pageData.video; release.entryId = scene.newId;
release.actors = scene.models;
release.date = moment.utc(scene.releaseDate).toDate();
release.shootDate = moment.utc(scene.shootDate).toDate();
release.actors = baseRelease?.actors || scene.models;
// release.poster = scene.rotatingThumbsUrlSizes[0]['1040w'];
release.poster = getPosterFallbacks(scene.images.poster); release.poster = getPosterFallbacks(scene.images.poster);
release.photos = pageData.pictureset.map(photo => photo.main[0].src); release.photos = data.pictureset.map(photo => photo.main[0].src);
const trailer = scene.previews.listing.find(preview => preview.height === 353); release.teaser = getTeaserFallbacks(scene.previews.poster);
if (trailer) release.trailer = { src: trailer };
// trailer must exist! const qualities = [360, 480, 720, 1080, 2160];
const trailersUrl = `${site.url}/api/__tkn/${scene.previewVideoUrl1080P}/trailer/${qualities.join('+')}`;
const trailersRes = await post(trailersUrl, null, { headers: { referer: url } });
release.teaser = scene.previews.poster if (trailersRes.code === 200) {
.filter(teaser => /landscape/i.test(teaser.name)) release.trailer = qualities.map(quality => (trailersRes.body[quality] ? {
.map(teaser => ({ src: trailersRes.body[quality].token,
src: teaser.src, quality,
type: teaser.type, } : null)).filter(Boolean);
quality: Number(String(teaser.height).replace('353', '360')), }
}));
release.date = new Date(scene.releaseDate);
return release; return release;
} }
async function fetchLatest(site, page = 1) { async function fetchActorReleases(pages, model, origin) {
const url = `${site.url}/videos?page=${page}&size=7`; const releasesPerPage = await Promise.map(pages, async (page) => {
const res = await bhttp.get(url); const url = `${origin}/api${model.targetUrl}?page=${page}`;
const res = await get(url);
if (res.statusCode === 200) { if (res.code === 200) {
return scrapeLatest(res.body.toString(), site); return scrapeAll(res.body.data.videos.videos, null, origin);
} }
return res.statusCode; return [];
}, { concurrency: 3 });
return releasesPerPage.flat();
}
/**
 * Build an actor profile from the model API payload.
 *
 * @param {Object} data - API payload; `data.model` holds the actor details and
 *   `data.videos` holds the first page of scenes (`videos`) plus the total `count`.
 * @param {string} origin - Site origin, passed on to scrapeAll and fetchActorReleases.
 * @param {boolean} withReleases - When truthy, the scenes of the remaining pages are fetched as well.
 * @returns {Promise<Object>} The profile; `releases` is always set, optional
 *   fields are only set when the payload provides them.
 */
async function scrapeProfile(data, origin, withReleases) {
  // NOTE(review): the page count is derived assuming 6 scenes per page — confirm against the API
  const releasesPerPage = 6;

  const { model } = data;
  const profile = {};

  // new Date(null) is the Unix epoch and new Date(undefined) is an Invalid Date,
  // so only store a birthdate that is present and parseable
  const birthdate = model.dateOfBirth ? new Date(model.dateOfBirth) : null;
  if (birthdate && !Number.isNaN(birthdate.getTime())) profile.birthdate = birthdate;

  profile.gender = genderMap[model.sex];

  profile.hair = model.hairColour;
  profile.nationality = model.nationality;

  // the biography may be missing altogether, just like the measurements below
  if (model.biography?.trim().length > 0) profile.description = model.biography;

  // property names follow the API's own spelling ('Measurment')
  if (model.cupSize && model.bustMeasurment) profile.bust = `${model.bustMeasurment}${model.cupSize}`;
  if (model.waistMeasurment) profile.waist = model.waistMeasurment;
  if (model.hipMeasurment) profile.hip = model.hipMeasurment;

  profile.avatar = getAvatarFallbacks(model.images.listing);
  profile.poster = getAvatarFallbacks(model.images.profile);
  profile.banner = getAvatarFallbacks(model.images.poster);

  // the first page of scenes is included in the profile payload itself
  profile.releases = scrapeAll(data.videos.videos, null, origin);

  if (withReleases) {
    const pageCount = Math.ceil(data.videos.count / releasesPerPage);
    // page 1 is already covered above, so request pages 2 through pageCount
    const remainingPages = Array.from({ length: Math.max(pageCount - 1, 0) }, (value, index) => index + 2);
    const otherReleases = await fetchActorReleases(remainingPages, model, origin);

    profile.releases = [...profile.releases, ...otherReleases];
  }

  return profile;
}
async function fetchLatest(site, page = 1) {
const url = `${site.url}/api/videos?page=${page}`;
const res = await get(url);
if (res.code === 200) {
return scrapeAll(res.body.data.videos, site);
}
return res.code;
} }
async function fetchUpcoming(site) { async function fetchUpcoming(site) {
const res = await bhttp.get(site.url); const apiUrl = `${site.url}/api`;
const res = await get(apiUrl);
if (res.statusCode === 200) { if (res.code === 200) {
return scrapeUpcoming(res.body.toString(), site); return scrapeUpcoming(res.body.data.nextScene, site);
} }
return res.statusCode; return res.code;
} }
async function fetchScene(url, site) { async function fetchScene(url, site, baseRelease) {
const res = await bhttp.get(url); const { origin, pathname } = new URL(url);
const apiUrl = `${origin}/api${pathname}`;
if (res.statusCode === 200) { const res = await get(apiUrl);
return scrapeScene(res.body.toString(), url, site);
if (res.code === 200) {
return scrapeScene(res.body.data, url, site, baseRelease);
} }
throw new Error(`Vixen response not OK for scene (${url}): ${res.statusCode}`); return res.code;
}
async function fetchProfile(actorName, scraperSlug, withReleases) {
const origin = `https://www.${scraperSlug}.com`;
const actorSlug = slugify(actorName);
const url = `${origin}/api/${actorSlug}`;
const res = await get(url);
if (res.code === 200) {
return scrapeProfile(res.body.data, origin, withReleases);
}
return null;
} }
module.exports = { module.exports = {
fetchLatest, fetchLatest,
fetchUpcoming, fetchUpcoming,
fetchScene, fetchScene,
fetchProfile,
}; };