Refactored Vixen scraper to use the API endpoint, and added actor profile and releases scraper. The release scraper returns the base release when one is present and the 'deep' argument is false.

This commit is contained in:
ThePendulum 2020-02-22 23:25:10 +01:00
parent d977a5e712
commit 915eb75719
7 changed files with 182 additions and 102 deletions
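The refactor hinges on the Vixen-network sites serving the same data as JSON under an /api path that mirrors the page path; a minimal sketch of that mapping, matching the fetchScene change further down (the example URL is made up):

function toApiUrl(pageUrl) {
  // Same derivation the new fetchScene uses: keep origin and pathname, prefix /api.
  const { origin, pathname } = new URL(pageUrl);
  return `${origin}/api${pathname}`;
}

// toApiUrl('https://www.blacked.com/videos/example-scene')
// => 'https://www.blacked.com/api/videos/example-scene'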


@@ -140,6 +140,7 @@ function initActorActions(store, _router) {
url
title
date
slug
${releaseActorsFragment}
${releaseTagsFragment}
${releasePosterFragment}


@@ -43,6 +43,14 @@ module.exports = {
'burningangel',
'brazzers',
'milehighmedia',
[
'vixen',
'tushy',
'blacked',
'tushyraw',
'blackedraw',
'deeper',
],
[
// Nubiles
'nubiles',


@@ -381,7 +381,7 @@ async function scrapeActors(actorNames) {
logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug);
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, argv.withReleases);
if (profile) {
logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);
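A hedged sketch of what the new third argument does for the Vixen scraper added in this commit; the actor name and site slug are placeholders, and the CLI flag backing argv.withReleases is not part of this diff:

// Inside an async context, with `scraper` resolved to the vixen module for the 'blacked' slug:
const profile = await scraper.fetchProfile('Example Actor', 'blacked', true);
// With `true`, the scraper pages through the actor's /api listing and attaches every
// release; with `false`, only the first page of releases is attached to the profile.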


@@ -153,7 +153,7 @@ function curateReleases(releases) {
}
async function attachChannelSite(release) {
if (!release.site.isFallback) {
if (!release.site?.isFallback) {
return release;
}
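A one-line illustration of the optional chaining added above, assuming a release can come back without any site attached (hypothetical object):

const orphan = { title: 'Example release' }; // no site property
// orphan.site.isFallback  -> TypeError, since orphan.site is undefined
// orphan.site?.isFallback -> undefined, so attachChannelSite returns the release unchanged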


@@ -45,6 +45,13 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
throw new Error(`Could not find site ${url} in database`);
}
if (!argv.deep && release) {
return {
...release,
site,
};
}
const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
if (!scraper) {
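A hedged usage sketch of the early return added above; the URL is made up, and whatever CLI flag sets argv.deep is assumed rather than shown here:

// Assuming a matching release is already stored as `baseRelease`:
//   await scrapeRelease('https://www.blacked.com/videos/example-scene', baseRelease)
//   => { ...baseRelease, site }     when argv.deep is false (no scraper call is made)
//   => a freshly scraped release    when argv.deep is true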


@@ -114,9 +114,15 @@ module.exports = {
},
actors: {
'21sextury': sextury,
analbbc: fullpornnetwork,
analized: fullpornnetwork,
analviolation: fullpornnetwork,
anilos: nubiles,
babes,
baddaddypov: fullpornnetwork,
bangbros,
blacked: vixen,
blackedraw: vixen,
blowpass,
boobpedia,
brattysis: nubiles,
@@ -124,47 +130,47 @@ module.exports = {
burningangel,
cherrypimps,
ddfnetwork,
deeper: vixen,
deeplush: nubiles,
digitalplayground,
dtfsluts: fullpornnetwork,
evilangel,
fakehub,
famedigital,
freeones,
freeonesLegacy,
girlfaction: fullpornnetwork,
hergape: fullpornnetwork,
homemadeanalwhores: fullpornnetwork,
hotcrazymess: nubiles,
iconmale,
jamesdeen: fullpornnetwork,
julesjordan,
kellymadison,
legalporno,
men,
analbbc: fullpornnetwork,
analized: fullpornnetwork,
analviolation: fullpornnetwork,
baddaddypov: fullpornnetwork,
dtfsluts: fullpornnetwork,
girlfaction: fullpornnetwork,
hergape: fullpornnetwork,
homemadeanalwhores: fullpornnetwork,
jamesdeen: fullpornnetwork,
mugfucked: fullpornnetwork,
onlyprince: fullpornnetwork,
pervertgallery: fullpornnetwork,
povperverts: fullpornnetwork,
metrohd,
milehighmedia,
mofos,
mugfucked: fullpornnetwork,
naughtyamerica,
nfbusty: nubiles,
nubilefilms: nubiles,
nubiles,
nubilesporn: nubiles,
onlyprince: fullpornnetwork,
pervertgallery: fullpornnetwork,
pimpxxx: cherrypimps,
pornhub,
povperverts: fullpornnetwork,
realitykings,
score,
thatsitcomshow: nubiles,
transangels,
tushy: vixen,
tushyraw: vixen,
twistys,
vixen,
wicked,
xempire,
},
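A hedged sketch of how these Vixen-network entries are consumed by the scrapeActors change earlier in this diff; the actor name is a placeholder:

// Inside an async context: each network site slug maps to the shared vixen module,
// and the slug itself is forwarded so the scraper can build the matching origin.
const scraper = scrapers.actors['blackedraw']; // -> vixen
const profile = await scraper.fetchProfile('Example Actor', 'blackedraw', false);
// fetchProfile then requests https://www.blackedraw.com/api/<slugified actor name>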


@@ -1,10 +1,18 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const Promise = require('bluebird');
const moment = require('moment');
const { get, post } = require('../utils/http');
const slugify = require('../utils/slugify');
const genderMap = {
F: 'female',
M: 'male',
T: 'transsexual', // not yet observed
};
function getPosterFallbacks(poster) {
return poster
.filter(image => /landscape/i.test(image.name))
@@ -17,54 +25,46 @@ function getPosterFallbacks(poster) {
.flat();
}
function scrapeLatest(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
function getTeaserFallbacks(teaser) {
return teaser
.filter(video => /landscape/i.test(video.name))
.map(video => ({
src: video.src,
type: video.type,
quality: Number(String(video.height).replace('353', '360')),
}));
}
const stateScript = $('script:contains("INITIAL_STATE")').html();
const { videos: scenes } = JSON.parse(stateScript.slice(stateScript.indexOf('{'), stateScript.indexOf('};') + 1));
function getAvatarFallbacks(avatar) {
return avatar
.sort((imageA, imageB) => imageB.height - imageA.height)
.map(image => [image.highdpi?.['3x'], image.highdpi?.['2x'], image.src])
.flat();
}
function scrapeAll(scenes, site, origin) {
return scenes.map((scene) => {
const entryId = String(scene.newId);
const release = {};
const {
title,
models: actors,
} = scene;
release.title = scene.title;
const url = `${site.url}${scene.targetUrl}`;
const date = moment.utc(scene.releaseDateFormatted, 'MMMM DD, YYYY').toDate();
const stars = Number(scene.textRating) / 2;
release.entryId = String(scene.newId);
release.url = `${site?.url || origin}${scene.targetUrl}`;
// largest thumbnail. poster is the same image but bigger, too large for storage space efficiency
const poster = scene.images.listing.slice(-1)[0].src;
const teaser = scene.previews.listing.slice(-1)[0];
release.date = moment.utc(scene.releaseDate).toDate();
release.shootDate = moment.utc(scene.shootDate).toDate();
return {
url,
entryId,
title,
actors,
date,
poster,
teaser: {
src: teaser.src,
type: teaser.type,
quality: teaser.height,
},
rating: {
stars,
},
site,
};
release.actors = scene.models;
release.stars = Number(scene.textRating) / 2;
release.poster = getPosterFallbacks(scene.images.poster);
release.teaser = getTeaserFallbacks(scene.previews.poster);
return release;
});
}
function scrapeUpcoming(html, site) {
const statePrefix = html.indexOf('__INITIAL_STATE__');
const stateString = html.slice(html.indexOf('{', statePrefix), html.indexOf('};', statePrefix) + 1);
const data = JSON.parse(stateString);
const scene = data.page.data['/'].data?.nextScene;
function scrapeUpcoming(scene, site) {
if (!scene || scene.isPreReleasePeriod) return null;
const release = {};
@@ -75,33 +75,23 @@ function scrapeUpcoming(html, site) {
.map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
.join(' ');
release.date = moment.utc(scene.releaseDate).toDate();
release.url = `${site.url}${scene.targetUrl}`;
release.date = moment.utc(scene.releaseDate).toDate();
release.shootDate = moment.utc(scene.shootDate).toDate();
release.actors = scene.models;
release.poster = getPosterFallbacks(scene.images.poster);
release.teaser = scene.previews.poster
.filter(teaser => /landscape/i.test(teaser.name))
.map(teaser => ({
src: teaser.src,
type: teaser.type,
quality: Number(String(teaser.height).replace('353', '360')),
}));
release.teaser = getTeaserFallbacks(scene.previews.poster);
release.entryId = (release.poster[0] || release.teaser[0])?.match(/\/(\d+)/)?.[1];
return [release];
}
async function scrapeScene(html, url) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const stateObject = $('script:contains("INITIAL_STATE")');
const data = JSON.parse(stateObject.html().trim().slice(27, -1));
const pageData = data.page.data[data.location.pathname].data;
const scene = data.videos.find(video => video.newId === pageData.video);
async function scrapeScene(data, url, site, baseRelease) {
const scene = data.video;
const release = {
url,
@@ -114,64 +104,132 @@ async function scrapeScene(html, url) {
tags: scene.tags,
};
release.entryId = pageData.video;
release.actors = scene.models;
release.entryId = scene.newId;
release.date = moment.utc(scene.releaseDate).toDate();
release.shootDate = moment.utc(scene.shootDate).toDate();
release.actors = baseRelease?.actors || scene.models;
// release.poster = scene.rotatingThumbsUrlSizes[0]['1040w'];
release.poster = getPosterFallbacks(scene.images.poster);
release.photos = pageData.pictureset.map(photo => photo.main[0].src);
release.photos = data.pictureset.map(photo => photo.main[0].src);
const trailer = scene.previews.listing.find(preview => preview.height === 353);
if (trailer) release.trailer = { src: trailer };
release.teaser = getTeaserFallbacks(scene.previews.poster);
// trailer must exist!
const qualities = [360, 480, 720, 1080, 2160];
const trailersUrl = `${site.url}/api/__tkn/${scene.previewVideoUrl1080P}/trailer/${qualities.join('+')}`;
const trailersRes = await post(trailersUrl, null, { headers: { referer: url } });
release.teaser = scene.previews.poster
.filter(teaser => /landscape/i.test(teaser.name))
.map(teaser => ({
src: teaser.src,
type: teaser.type,
quality: Number(String(teaser.height).replace('353', '360')),
}));
release.date = new Date(scene.releaseDate);
if (trailersRes.code === 200) {
release.trailer = qualities.map(quality => (trailersRes.body[quality] ? {
src: trailersRes.body[quality].token,
quality,
} : null)).filter(Boolean);
}
return release;
}
async function fetchLatest(site, page = 1) {
const url = `${site.url}/videos?page=${page}&size=7`;
const res = await bhttp.get(url);
async function fetchActorReleases(pages, model, origin) {
const releasesPerPage = await Promise.map(pages, async (page) => {
const url = `${origin}/api${model.targetUrl}?page=${page}`;
const res = await get(url);
if (res.statusCode === 200) {
return scrapeLatest(res.body.toString(), site);
if (res.code === 200) {
return scrapeAll(res.body.data.videos.videos, null, origin);
}
return [];
}, { concurrency: 3 });
return releasesPerPage.flat();
}
async function scrapeProfile(data, origin, withReleases) {
const model = data.model;
const profile = {};
profile.birthdate = new Date(model.dateOfBirth);
profile.gender = genderMap[model.sex];
profile.hair = model.hairColour;
profile.nationality = model.nationality;
if (model.biography.trim().length > 0) profile.description = model.biography;
if (model.cupSize && model.bustMeasurment) profile.bust = `${model.bustMeasurment}${model.cupSize}`;
if (model.waistMeasurment) profile.waist = model.waistMeasurment;
if (model.hipMeasurment) profile.hip = model.hipMeasurment;
profile.avatar = getAvatarFallbacks(model.images.listing);
profile.poster = getAvatarFallbacks(model.images.profile);
profile.banner = getAvatarFallbacks(model.images.poster);
const releases = scrapeAll(data.videos.videos, null, origin);
if (withReleases) {
const pageCount = Math.ceil(data.videos.count / 6);
const otherReleases = await fetchActorReleases((Array.from({ length: pageCount - 1 }, (value, index) => index + 2)), model, origin);
profile.releases = [...releases, ...otherReleases];
} else {
profile.releases = releases;
}
return res.statusCode;
return profile;
}
async function fetchLatest(site, page = 1) {
const url = `${site.url}/api/videos?page=${page}`;
const res = await get(url);
if (res.code === 200) {
return scrapeAll(res.body.data.videos, site);
}
return res.code;
}
async function fetchUpcoming(site) {
const res = await bhttp.get(site.url);
const apiUrl = `${site.url}/api`;
const res = await get(apiUrl);
if (res.statusCode === 200) {
return scrapeUpcoming(res.body.toString(), site);
if (res.code === 200) {
return scrapeUpcoming(res.body.data.nextScene, site);
}
return res.statusCode;
return res.code;
}
async function fetchScene(url, site) {
const res = await bhttp.get(url);
async function fetchScene(url, site, baseRelease) {
const { origin, pathname } = new URL(url);
const apiUrl = `${origin}/api${pathname}`;
if (res.statusCode === 200) {
return scrapeScene(res.body.toString(), url, site);
const res = await get(apiUrl);
if (res.code === 200) {
return scrapeScene(res.body.data, url, site, baseRelease);
}
throw new Error(`Vixen response not OK for scene (${url}): ${res.statusCode}`);
return res.code;
}
async function fetchProfile(actorName, scraperSlug, withReleases) {
const origin = `https://www.${scraperSlug}.com`;
const actorSlug = slugify(actorName);
const url = `${origin}/api/${actorSlug}`;
const res = await get(url);
if (res.code === 200) {
return scrapeProfile(res.body.data, origin, withReleases);
}
return null;
}
module.exports = {
fetchLatest,
fetchUpcoming,
fetchScene,
fetchProfile,
};
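A minimal usage sketch of the rewritten module, assuming it is required from a sibling path and that a site object only needs a url property here (both assumptions, not established by this diff):

const vixen = require('./vixen'); // path assumed

async function example() {
  const site = { url: 'https://www.blacked.com' }; // minimal stand-in for a database site row

  const latest = await vixen.fetchLatest(site, 1); // scenes from /api/videos?page=1
  const upcoming = await vixen.fetchUpcoming(site); // nextScene from /api, or null during pre-release
  const profile = await vixen.fetchProfile('Example Actor', 'blacked', false); // first page of releases only

  return { latest, upcoming, profile };
}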