Updated Vixen scraper with more informative API query.

This commit is contained in:
DebaucheryLibrarian 2023-07-06 04:24:47 +02:00
parent 43d8b93953
commit 18744372b3
7 changed files with 274 additions and 175 deletions

View File

@ -845,6 +845,10 @@ const tags = [
name: 'POV', name: 'POV',
slug: 'pov', slug: 'pov',
}, },
{
name: 'prone bone',
slug: 'prone-bone',
},
{ {
name: 'pussy eating', name: 'pussy eating',
slug: 'pussy-eating', slug: 'pussy-eating',
@ -2281,6 +2285,14 @@ const aliases = [
name: 'pijpen', name: 'pijpen',
for: 'blowjob', for: 'blowjob',
}, },
{
name: 'pronebone',
slug: 'prone-bone',
},
{
name: 'prone',
slug: 'prone-bone',
},
]; ];
const priorities = [ // higher index is higher priority const priorities = [ // higher index is higher priority

View File

@ -936,8 +936,8 @@ async function associatePeople(releases, batchId, type = 'actor') {
acc[release.id] = toBaseActors(release.actors, release); acc[release.id] = toBaseActors(release.actors, release);
} }
if (type === 'directors' && release.director) { if (type === 'directors' && (release.director || release.directors)) {
acc[release.id] = toBaseActors([release.director], release); acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release);
} }
return acc; return acc;

View File

@ -353,6 +353,8 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
if (typeof baseSource.defer === 'function') { if (typeof baseSource.defer === 'function') {
const src = await baseSource.defer(); const src = await baseSource.defer();
console.log(baseSource, src);
return { return {
...baseSource, ...baseSource,
...toBaseSource(src), ...toBaseSource(src),

View File

@ -316,6 +316,8 @@ function scrapeProfile(html, url, actorName, entity) {
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), (el) => el.href); profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), (el) => el.href);
console.log(profile);
return profile; return profile;
} }

View File

@ -17,48 +17,62 @@ function random(array) {
function femaleAdjective() { function femaleAdjective() {
return random([ return random([
'hot',
'young',
'new',
'busty',
'insatiable',
'depraved',
'horny',
'flexible',
'bubble butt',
'voluptuous',
'curvy',
'skinny',
'nerdy',
'oiled',
'tied up',
'bound',
'Asian', 'Asian',
'Russian',
'Latina', 'Latina',
'Russian',
'bound',
'bubble butt',
'busty',
'cock-hungry',
'cum-hungry',
'adickted',
'curvy',
'depraved',
'ebony', 'ebony',
'flexible',
'greedy',
'horny',
'hot',
'insatiable',
'nerdy',
'new',
'oiled',
'shy',
'skinny',
'tied up',
'voluptuous',
'young',
]); ]);
} }
function maleAdjective() { function maleAdjective() {
return random([ return random([
'toned', 'toned',
'bulky',
'nerdy', 'nerdy',
'strong',
'shy',
]); ]);
} }
function sceneAdjective() { function sceneAdjective() {
return random([ return random([
'first', 'first',
'hot',
'hottest', 'hottest',
'wild',
'wildest', 'wildest',
'deep',
'deepest', 'deepest',
]); ]);
} }
function groupSceneAdjective() { function groupSceneAdjective() {
return random([ return random([
'big',
'biggest', 'biggest',
`${Math.floor(Math.random() * 20) + 4}-guy`,
`${Math.floor(Math.random() * 20) + 4}-man`,
]); ]);
} }
@ -69,6 +83,8 @@ function dickAdjective() {
'throbbing', 'throbbing',
'thick', 'thick',
'long', 'long',
'huge',
'girthy',
'monster', 'monster',
`${Math.floor(Math.random() * 12) + 9} inch`, `${Math.floor(Math.random() * 12) + 9} inch`,
]); ]);
@ -77,20 +93,22 @@ function dickAdjective() {
function femaleNoun() { function femaleNoun() {
return random([ return random([
'MILF', 'MILF',
'teen',
'spinner',
'coed',
'redhead',
'beauty', 'beauty',
'blonde', 'blonde',
'nympho',
'brunette', 'brunette',
'maid', 'coed',
'student',
'dominatrix', 'dominatrix',
'stepsister', 'maid',
'schoolgirl',
'nurse', 'nurse',
'nympho',
'redhead',
'schoolgirl',
'slut',
'spinner',
'stepsister',
'student',
'teen',
'whore',
]); ]);
} }

View File

@ -3,6 +3,7 @@
/* eslint-disable newline-per-chained-call */ /* eslint-disable newline-per-chained-call */
const Promise = require('bluebird'); const Promise = require('bluebird');
const moment = require('moment'); const moment = require('moment');
const unprint = require('unprint');
const qu = require('../utils/qu'); const qu = require('../utils/qu');
const http = require('../utils/http'); const http = require('../utils/http');
@ -41,11 +42,60 @@ function curateSources(sources, type = 'image/jpeg') {
- Math.abs(1.8 - Number((resB.width / resB.height).toFixed(1)))); - Math.abs(1.8 - Number((resB.width / resB.height).toFixed(1))));
} }
async function getTrailer(scene, channel, url) { function scrapeAll(scenes, channel) {
return scenes.map((data) => {
const release = {};
release.entryId = data.videoId;
release.url = `${channel.url}/videos/${data.slug}`;
release.title = data.title;
release.date = qu.extractDate(data.releaseDate);
release.actors = data.modelsSlugged.map((model) => ({
name: model.name,
url: `${channel.url}/models/${model.slugged}`,
}));
release.poster = curateSources(data.images.listing);
release.teaser = curateSources(data.previews.listing, 'video/mp4');
release.stars = data.rating;
return release;
});
}
/**
 * Turn the upcoming-scene payload into a single-item release list.
 *
 * @param {Object} scene - upcoming scene data; may be missing
 * @param {Object} site - entity the scene belongs to; only `url` is read
 * @returns {Array<Object>|null} one-element array holding the release, or null when there is
 *   no scene or the scene is flagged `isPreReleasePeriod`
 */
function scrapeUpcoming(scene, site) {
	const isUnavailable = !scene || scene.isPreReleasePeriod;

	if (isUnavailable) {
		return null;
	}

	// no title is read from the payload; it is rebuilt from the slug by upper-casing the first letter of every dash-separated word
	const capitalize = (word) => `${word.charAt(0).toUpperCase()}${word.slice(1)}`;

	return [{
		entryId: scene.videoId,
		url: `${site.url}/videos/${scene.slug}`,
		title: scene.slug.split('-').map(capitalize).join(' '),
		date: moment.utc(scene.releaseDate).toDate(),
		datePrecision: 'minute',
		actors: scene.models.map(({ name }) => name),
		poster: curateSources(scene.images.poster),
		// NOTE(review): no media type is passed for the teaser, unlike the listing scraper which passes 'video/mp4' — confirm this is intended
		teaser: curateSources(scene.previews.poster),
	}];
}
async function getTrailer(videoId, channel, url) {
const res = await http.post(`${channel.url}/graphql`, { const res = await http.post(`${channel.url}/graphql`, {
operationName: 'getToken', operationName: 'getToken',
variables: { variables: {
videoId: scene.newId, videoId,
device: 'trailer', device: 'trailer',
}, },
query: ` query: `
@ -134,120 +184,7 @@ async function getTrailer(scene, channel, url) {
return null; return null;
} }
function scrapeAll(scenes, channel) { async function scrapeScene(data, url, channel, options) {
return scenes.map((data) => {
const release = {};
release.entryId = data.videoId;
release.url = `${channel.url}/videos/${data.slug}`;
release.title = data.title;
release.date = qu.extractDate(data.releaseDate);
release.actors = data.modelsSlugged.map((model) => ({
name: model.name,
url: `${channel.url}/models/${model.slugged}`,
}));
release.poster = curateSources(data.images.listing);
release.teaser = curateSources(data.previews.listing, 'video/mp4');
release.stars = data.rating;
return release;
});
}
function scrapeUpcoming(scene, site) {
if (!scene || scene.isPreReleasePeriod) {
return null;
}
const release = {};
release.entryId = scene.videoId;
release.url = `${site.url}/videos/${scene.slug}`;
release.title = scene.slug
.split('-')
.map((component) => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
.join(' ');
release.date = moment.utc(scene.releaseDate).toDate();
release.datePrecision = 'minute';
release.actors = scene.models.map((model) => model.name);
release.poster = curateSources(scene.images.poster);
release.teaser = curateSources(scene.previews.poster);
return [release];
}
async function fetchGraphqlDetails(release, channel, session) {
const query = `
query($query: String!, $site: Site!) {
searchVideos(input: {
query: $query
site: $site
}) {
edges {
node {
videoId
title
slug
description
releaseDate
categories {
name
}
chapters {
video {
title
seconds
}
}
models {
name
}
images {
poster {
...ImageInfo
}
}
}
}
}
}
fragment ImageInfo on Image {
src
highdpi {
double
}
}
`;
const variables = JSON.stringify({
site: channel.slug.toUpperCase(),
query: release.title,
});
const res = await http.get(`${channel.url}/graphql?query=${encodeURI(query)}&variables=${variables}`, {
session,
headers: {
referer: channel.url,
accept: '*/*',
},
});
if (res.ok) {
return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.videoId === release.entryId)?.node || null;
}
return null;
}
async function scrapeScene(data, url, channel, options, session) {
const release = { const release = {
url, url,
entryId: data.video.videoId || data.video.newId, entryId: data.video.videoId || data.video.newId,
@ -273,30 +210,171 @@ async function scrapeScene(data, url, channel, options, session) {
: data.video.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean); : data.video.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean);
if (options.includeTrailers) { if (options.includeTrailers) {
const trailer = await getTrailer(data.video, channel, url); release.trailer = await getTrailer(release.entryId, channel, release.url);
if (trailer) {
release.trailer = trailer;
}
} }
release.qualities = data.video?.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height release.qualities = data.video?.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height
const graphqlDetails = await fetchGraphqlDetails(release, channel, session);
if (graphqlDetails) {
release.tags = graphqlDetails.categories?.map((category) => category.name);
release.chapters = graphqlDetails.chapters?.video?.map((chapter) => ({
time: chapter.seconds,
tags: [chapter.title],
}));
}
release.channel = data.video?.id.split(':')[0]; release.channel = data.video?.id.split(':')[0];
return release; return release;
} }
/**
 * Build a release from a video node of the GraphQL search response.
 *
 * @param {Object} data - video node (videoId, slug, title, models, directors, categories, ...)
 * @param {Object} channel - channel entity; only `url` is read here
 * @param {Object} options - scrape options; `includeTrailers` enables the trailer lookup
 * @returns {Promise<Object>} the assembled release
 */
async function scrapeSceneData(data, channel, options) {
	const release = {};

	release.entryId = data.videoId;
	release.url = `${channel.url}/videos/${data.slug}`;

	release.title = data.title;
	release.description = data.description;

	release.date = new Date(data.releaseDate);
	release.duration = unprint.extractDuration(data.runLength);

	release.actors = data.models;

	// list fields are guarded the same way poster and photos are, so a node lacking one
	// leaves the property undefined instead of aborting the whole scene with a TypeError
	release.directors = data.directors?.map((director) => ({
		entryId: director.directorId,
		name: director.name,
	}));

	release.poster = curateSources(data.images?.poster);
	release.photos = data.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean);

	if (options.includeTrailers) {
		release.trailer = await getTrailer(release.entryId, channel, release.url);
	}

	release.tags = data.categories?.map((category) => category.name);
	release.qualities = data.downloadResolutions?.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height

	release.chapters = data.chapters?.video?.map((chapter) => ({
		time: chapter.seconds,
		tags: [chapter.title],
	}));

	release.channel = data.site;
	release.stars = data.rating;

	return release;
}
/**
 * Look a scene up through the site's GraphQL search endpoint, keyed by the slug in its URL.
 *
 * @param {Object} release - base release; only `url` is read
 * @param {Object} channel - channel entity; `url` and `slug` are read
 * @returns {Promise<Object|null>} the result node whose slug equals the URL slug, or null when
 *   there is no URL, the URL has no /videos/ slug, the request fails, or nothing matches
 */
async function fetchGraphqlScene(release, channel) {
	if (!release?.url) {
		return null;
	}

	const slug = new URL(release.url).pathname.match(/\/videos\/(.*)/)?.[1];

	// must run before the slug is split: it is undefined for URLs without a /videos/ segment
	if (!slug) {
		return null;
	}

	// the API won't reliably return results when the query is over ~30 characters for some reason,
	// so any word that would push the query past that length is left out
	const query = slug.split('-').reduce((acc, word) => {
		const newAcc = `${acc} ${word}`;

		if (newAcc.length > 30) {
			return acc;
		}

		return newAcc;
	}, '').trim();

	const res = await http.post(`${channel.url}/graphql`, {
		operationName: 'searchVideos',
		variables: {
			site: channel.slug.toUpperCase(),
			query,
		},
		query: `
			query searchVideos($site: Site!, $query: String!) {
				searchVideos(input: {
					query: $query
					site: $site
				}) {
					edges {
						node {
							videoId
							title
							slug
							description
							releaseDate
							runLength
							site
							rating
							models {
								name
							}
							directors {
								directorId
								name
							}
							categories {
								name
							}
							chapters {
								video {
									title
									seconds
								}
							}
							downloadResolutions {
								width
							}
							carousel {
								main {
									src
								}
							}
							images {
								poster {
									...ImageInfo
								}
							}
						}
					}
				}
			}
			fragment ImageInfo on Image {
				src
				width
				height
				highdpi {
					double
				}
			}
		`,
	}, {
		headers: {
			referer: release.url,
			origin: channel.url,
		},
	});

	if (res.ok) {
		// the search is fuzzy and the query may be truncated, so only an exact slug match is accepted
		return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.slug === slug)?.node || null;
	}

	return null;
}
/**
 * Fetch one scene, preferring the GraphQL search lookup and falling back to the JSON
 * embedded in the scene page's #__NEXT_DATA__ element.
 *
 * @param {string} url - scene page URL
 * @param {Object} channel - channel entity
 * @param {Object} [baseRelease] - release from the listing, if any; only `url` is used
 * @param {Object} options - scrape options, passed through to the scene scrapers
 * @returns {Promise<Object|number|null>} the release, the HTTP status when the page request
 *   fails, or null when the page carries no usable embedded data
 */
async function fetchScene(url, channel, baseRelease, options) {
	// the lookup only reads `url`; fall back to the requested URL so a missing base release does not throw
	const graphqlData = await fetchGraphqlScene({ url: baseRelease?.url || url }, channel);

	if (graphqlData) {
		return scrapeSceneData(graphqlData, channel, options);
	}

	const session = qu.session();
	const res = await qu.get(url, null, null, { session });

	if (!res.ok) {
		return res.status;
	}

	const dataString = res.item.query.html('#__NEXT_DATA__');
	const pageProps = dataString ? JSON.parse(dataString)?.props?.pageProps : null;

	if (!pageProps) {
		return null;
	}

	return scrapeScene(pageProps, url, channel, options);
}
async function fetchActorReleases(pages, model, origin) { async function fetchActorReleases(pages, model, origin) {
const releasesPerPage = await Promise.map(pages, async (page) => { const releasesPerPage = await Promise.map(pages, async (page) => {
const url = `${origin}/api${model.targetUrl}?page=${page}`; const url = `${origin}/api${model.targetUrl}?page=${page}`;
@ -451,20 +529,6 @@ async function fetchUpcoming(channel) {
return res.status; return res.status;
} }
async function fetchScene(url, channel, baseRelease, options) {
const session = qu.session();
const res = await qu.get(url, null, null, { session });
if (res.ok) {
const dataString = res.item.query.html('#__NEXT_DATA__');
const data = dataString && JSON.parse(dataString);
return scrapeScene(data.props.pageProps, url, channel, options, session);
}
return res.status;
}
async function fetchProfile({ name: actorName }, { site }, include) { async function fetchProfile({ name: actorName }, { site }, include) {
const origin = site.url; const origin = site.url;
const actorSlug = slugify(actorName); const actorSlug = slugify(actorName);

View File

@ -98,7 +98,8 @@ async function matchReleaseTags(releases) {
} }
async function getEntityTags(releases) { async function getEntityTags(releases) {
const entityIds = releases.map((release) => release.entity?.id).filter(Boolean); const entityIds = Array.from(new Set(releases.map((release) => release.entity?.id).filter(Boolean)));
const entityTags = await knex('entities_tags') const entityTags = await knex('entities_tags')
.select('id', 'name', 'entity_id') .select('id', 'name', 'entity_id')
.whereIn('entity_id', entityIds) .whereIn('entity_id', entityIds)