Updated Vixen scraper with more informative API query.

This commit is contained in:
DebaucheryLibrarian 2023-07-06 04:24:47 +02:00
parent 43d8b93953
commit 18744372b3
7 changed files with 274 additions and 175 deletions

View File

@ -845,6 +845,10 @@ const tags = [
name: 'POV',
slug: 'pov',
},
{
name: 'prone bone',
slug: 'prone-bone',
},
{
name: 'pussy eating',
slug: 'pussy-eating',
@ -2281,6 +2285,14 @@ const aliases = [
name: 'pijpen',
for: 'blowjob',
},
{
name: 'pronebone',
slug: 'prone-bone',
},
{
name: 'prone',
slug: 'prone-bone',
},
];
const priorities = [ // higher index is higher priority

View File

@ -936,8 +936,8 @@ async function associatePeople(releases, batchId, type = 'actor') {
acc[release.id] = toBaseActors(release.actors, release);
}
if (type === 'directors' && release.director) {
acc[release.id] = toBaseActors([release.director], release);
if (type === 'directors' && (release.director || release.directors)) {
acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release);
}
return acc;

View File

@ -353,6 +353,8 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
if (typeof baseSource.defer === 'function') {
const src = await baseSource.defer();
console.log(baseSource, src);
return {
...baseSource,
...toBaseSource(src),

View File

@ -316,6 +316,8 @@ function scrapeProfile(html, url, actorName, entity) {
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), (el) => el.href);
console.log(profile);
return profile;
}

View File

@ -17,48 +17,62 @@ function random(array) {
function femaleAdjective() {
return random([
'hot',
'young',
'new',
'busty',
'insatiable',
'depraved',
'horny',
'flexible',
'bubble butt',
'voluptuous',
'curvy',
'skinny',
'nerdy',
'oiled',
'tied up',
'bound',
'Asian',
'Russian',
'Latina',
'Russian',
'bound',
'bubble butt',
'busty',
'cock-hungry',
'cum-hungry',
'adickted',
'curvy',
'depraved',
'ebony',
'flexible',
'greedy',
'horny',
'hot',
'insatiable',
'nerdy',
'new',
'oiled',
'shy',
'skinny',
'tied up',
'voluptuous',
'young',
]);
}
// Picks a random descriptor for a male performer (used by the title generator).
function maleAdjective() {
  const options = [
    'toned',
    'bulky',
    'nerdy',
    'strong',
    'shy',
  ];

  return random(options);
}
// Picks a random superlative describing a scene (used by the title generator).
function sceneAdjective() {
  const options = ['first', 'hot', 'hottest', 'wild', 'wildest', 'deep', 'deepest'];

  return random(options);
}
// Picks a random adjective for a group scene; the numeric variants embed a
// random head count between 4 and 23 inclusive.
function groupSceneAdjective() {
  const headCount = () => Math.floor(Math.random() * 20) + 4;

  return random([
    'big',
    'biggest',
    `${headCount()}-guy`,
    `${headCount()}-man`,
  ]);
}
@ -69,6 +83,8 @@ function dickAdjective() {
'throbbing',
'thick',
'long',
'huge',
'girthy',
'monster',
`${Math.floor(Math.random() * 12) + 9} inch`,
]);
@ -77,20 +93,22 @@ function dickAdjective() {
function femaleNoun() {
return random([
'MILF',
'teen',
'spinner',
'coed',
'redhead',
'beauty',
'blonde',
'nympho',
'brunette',
'maid',
'student',
'coed',
'dominatrix',
'stepsister',
'schoolgirl',
'maid',
'nurse',
'nympho',
'redhead',
'schoolgirl',
'slut',
'spinner',
'stepsister',
'student',
'teen',
'whore',
]);
}

View File

@ -3,6 +3,7 @@
/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const moment = require('moment');
const unprint = require('unprint');
const qu = require('../utils/qu');
const http = require('../utils/http');
@ -41,11 +42,60 @@ function curateSources(sources, type = 'image/jpeg') {
- Math.abs(1.8 - Number((resB.width / resB.height).toFixed(1))));
}
async function getTrailer(scene, channel, url) {
// Curates a listing of raw API scene objects into base release objects for the
// given channel.
function scrapeAll(scenes, channel) {
  return scenes.map((data) => ({
    entryId: data.videoId,
    url: `${channel.url}/videos/${data.slug}`,
    title: data.title,
    date: qu.extractDate(data.releaseDate),
    // The listing payload exposes actors as slugged model records.
    actors: data.modelsSlugged.map((model) => ({
      name: model.name,
      url: `${channel.url}/models/${model.slugged}`,
    })),
    poster: curateSources(data.images.listing),
    teaser: curateSources(data.previews.listing, 'video/mp4'),
    stars: data.rating,
  }));
}
// Curates the site's announced upcoming scene. Returns null when no scene is
// announced, or when it is still flagged as pre-release.
function scrapeUpcoming(scene, site) {
  if (!scene || scene.isPreReleasePeriod) {
    return null;
  }

  // Reconstruct a display title by title-casing the slug components.
  const title = scene.slug
    .split('-')
    .map((component) => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
    .join(' ');

  const release = {
    entryId: scene.videoId,
    url: `${site.url}/videos/${scene.slug}`,
    title,
    date: moment.utc(scene.releaseDate).toDate(),
    datePrecision: 'minute',
    actors: scene.models.map((model) => model.name),
    poster: curateSources(scene.images.poster),
    teaser: curateSources(scene.previews.poster),
  };

  // Wrapped in an array to match the scraper's list-returning convention.
  return [release];
}
async function getTrailer(videoId, channel, url) {
const res = await http.post(`${channel.url}/graphql`, {
operationName: 'getToken',
variables: {
videoId: scene.newId,
videoId,
device: 'trailer',
},
query: `
@ -134,120 +184,7 @@ async function getTrailer(scene, channel, url) {
return null;
}
// Curates a listing of raw API scene objects into base release objects for the
// given channel.
function scrapeAll(scenes, channel) {
return scenes.map((data) => {
const release = {};
// Stable per-channel scene identifier from the API payload.
release.entryId = data.videoId;
release.url = `${channel.url}/videos/${data.slug}`;
release.title = data.title;
release.date = qu.extractDate(data.releaseDate);
// The listing payload exposes actors as slugged model records.
release.actors = data.modelsSlugged.map((model) => ({
name: model.name,
url: `${channel.url}/models/${model.slugged}`,
}));
release.poster = curateSources(data.images.listing);
release.teaser = curateSources(data.previews.listing, 'video/mp4');
release.stars = data.rating;
return release;
});
}
// Curates the site's announced upcoming scene. Returns null when no scene is
// announced, or when it is still flagged as pre-release.
function scrapeUpcoming(scene, site) {
if (!scene || scene.isPreReleasePeriod) {
return null;
}
const release = {};
release.entryId = scene.videoId;
release.url = `${site.url}/videos/${scene.slug}`;
// Reconstruct a display title by title-casing the slug components.
release.title = scene.slug
.split('-')
.map((component) => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
.join(' ');
release.date = moment.utc(scene.releaseDate).toDate();
release.datePrecision = 'minute';
release.actors = scene.models.map((model) => model.name);
release.poster = curateSources(scene.images.poster);
release.teaser = curateSources(scene.previews.poster);
// Wrapped in an array to match the scraper's list-returning convention.
return [release];
}
// Searches the channel's GraphQL API by release title and returns the node
// whose videoId matches the release's entryId. Returns null on a failed
// request or when no result matches.
async function fetchGraphqlDetails(release, channel, session) {
const query = `
query($query: String!, $site: Site!) {
searchVideos(input: {
query: $query
site: $site
}) {
edges {
node {
videoId
title
slug
description
releaseDate
categories {
name
}
chapters {
video {
title
seconds
}
}
models {
name
}
images {
poster {
...ImageInfo
}
}
}
}
}
}
fragment ImageInfo on Image {
src
highdpi {
double
}
}
`;
const variables = JSON.stringify({
site: channel.slug.toUpperCase(),
query: release.title,
});
// NOTE(review): the query is sent as a GET query string rather than a POST
// body; presumably required by this endpoint — confirm before changing.
const res = await http.get(`${channel.url}/graphql?query=${encodeURI(query)}&variables=${variables}`, {
session,
headers: {
referer: channel.url,
accept: '*/*',
},
});
if (res.ok) {
// The search is fuzzy; only accept the node with an exact videoId match.
return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.videoId === release.entryId)?.node || null;
}
return null;
}
async function scrapeScene(data, url, channel, options, session) {
async function scrapeScene(data, url, channel, options) {
const release = {
url,
entryId: data.video.videoId || data.video.newId,
@ -273,30 +210,171 @@ async function scrapeScene(data, url, channel, options, session) {
: data.video.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean);
if (options.includeTrailers) {
const trailer = await getTrailer(data.video, channel, url);
if (trailer) {
release.trailer = trailer;
}
release.trailer = await getTrailer(release.entryId, channel, release.url);
}
release.qualities = data.video?.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height
const graphqlDetails = await fetchGraphqlDetails(release, channel, session);
if (graphqlDetails) {
release.tags = graphqlDetails.categories?.map((category) => category.name);
release.chapters = graphqlDetails.chapters?.video?.map((chapter) => ({
time: chapter.seconds,
tags: [chapter.title],
}));
}
release.channel = data.video?.id.split(':')[0];
return release;
}
// Curates a scene node returned by the GraphQL search API (fetchGraphqlScene)
// into a release object.
// data: GraphQL video node; channel: entity the scene belongs to;
// options.includeTrailers: when true, resolve the trailer with an extra API call.
async function scrapeSceneData(data, channel, options) {
  const release = {};

  release.entryId = data.videoId;
  release.url = `${channel.url}/videos/${data.slug}`;
  release.title = data.title;
  release.description = data.description;
  release.date = new Date(data.releaseDate);
  release.duration = unprint.extractDuration(data.runLength);
  release.actors = data.models;

  // GraphQL can return null for optional collections (directors, categories,
  // chapters, downloadResolutions); guard each access with ?. so a missing
  // field doesn't abort the whole scrape with a TypeError.
  release.directors = data.directors?.map((director) => ({
    entryId: director.directorId,
    name: director.name,
  }));

  release.poster = curateSources(data.images?.poster);
  release.photos = data.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean);

  if (options.includeTrailers) {
    release.trailer = await getTrailer(release.entryId, channel, release.url);
  }

  release.tags = data.categories?.map((category) => category.name);
  release.qualities = data.downloadResolutions?.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height
  release.chapters = data.chapters?.video?.map((chapter) => ({
    time: chapter.seconds,
    tags: [chapter.title],
  }));

  release.channel = data.site;
  release.stars = data.rating;

  return release;
}
// Looks up a scene on the channel's GraphQL search API using the slug taken
// from the release URL, and returns the video node whose slug matches exactly.
// Returns null when the URL contains no /videos/ slug, the request fails, or
// no result matches.
async function fetchGraphqlScene(release, channel) {
  const slug = new URL(release.url).pathname.match(/\/videos\/(.*)/)?.[1];

  // Guard BEFORE using the slug: the original checked !slug only after calling
  // slug.split('-'), so a non-scene URL crashed with a TypeError instead of
  // falling back gracefully.
  if (!slug) {
    return null;
  }

  // the API won't reliably return results when the query is over ~30 characters for some reason
  const query = slug.split('-').reduce((acc, word) => {
    const newAcc = `${acc} ${word}`;

    if (newAcc.length > 30) {
      return acc;
    }

    return newAcc;
  }, '').trim();

  const res = await http.post(`${channel.url}/graphql`, {
    operationName: 'searchVideos',
    variables: {
      site: channel.slug.toUpperCase(),
      query,
    },
    query: `
query searchVideos($site: Site!, $query: String!) {
searchVideos(input: {
query: $query
site: $site
}) {
edges {
node {
videoId
title
slug
description
releaseDate
runLength
site
rating
models {
name
}
directors {
directorId
name
}
categories {
name
}
chapters {
video {
title
seconds
}
}
downloadResolutions {
width
}
carousel {
main {
src
}
}
images {
poster {
...ImageInfo
}
}
}
}
}
}
fragment ImageInfo on Image {
src
width
height
highdpi {
double
}
}
`,
  }, {
    headers: {
      referer: release.url,
      origin: channel.url,
    },
  });

  if (res.ok) {
    // The search is fuzzy; only accept the node whose slug matches exactly.
    return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.slug === slug)?.node || null;
  }

  return null;
}
// Fetches scene details, preferring the GraphQL search API; falls back to
// scraping the JSON embedded in the page's #__NEXT_DATA__ script tag when the
// API yields nothing.
async function fetchScene(url, channel, baseRelease, options) {
  const graphqlData = await fetchGraphqlScene(baseRelease, channel);

  if (graphqlData) {
    return scrapeSceneData(graphqlData, channel, options);
  }

  const session = qu.session();
  const res = await qu.get(url, null, null, { session });

  if (res.ok) {
    const dataString = res.item.query.html('#__NEXT_DATA__');
    const data = dataString && JSON.parse(dataString);

    // Guard: a page without embedded Next.js data previously crashed on
    // data.props; treat it as "scene not found" instead.
    if (!data) {
      return null;
    }

    // scrapeScene's signature no longer takes a session; the vestigial fifth
    // argument was dropped.
    return scrapeScene(data.props.pageProps, url, channel, options);
  }

  // Propagate the HTTP status so the caller can report the failure.
  return res.status;
}
async function fetchActorReleases(pages, model, origin) {
const releasesPerPage = await Promise.map(pages, async (page) => {
const url = `${origin}/api${model.targetUrl}?page=${page}`;
@ -451,20 +529,6 @@ async function fetchUpcoming(channel) {
return res.status;
}
// Fetches a scene page and scrapes the JSON embedded in its #__NEXT_DATA__
// script tag. Returns the HTTP status on failure.
async function fetchScene(url, channel, baseRelease, options) {
// NOTE(review): the session is created here and passed both to the page
// request and to scrapeScene — presumably to reuse cookies for follow-up
// requests; confirm against scrapeScene's usage.
const session = qu.session();
const res = await qu.get(url, null, null, { session });
if (res.ok) {
// The scene payload is embedded as JSON in the #__NEXT_DATA__ script tag.
const dataString = res.item.query.html('#__NEXT_DATA__');
const data = dataString && JSON.parse(dataString);
return scrapeScene(data.props.pageProps, url, channel, options, session);
}
// Propagate the HTTP status so the caller can report the failure.
return res.status;
}
async function fetchProfile({ name: actorName }, { site }, include) {
const origin = site.url;
const actorSlug = slugify(actorName);

View File

@ -98,7 +98,8 @@ async function matchReleaseTags(releases) {
}
async function getEntityTags(releases) {
const entityIds = releases.map((release) => release.entity?.id).filter(Boolean);
const entityIds = Array.from(new Set(releases.map((release) => release.entity?.id).filter(Boolean)));
const entityTags = await knex('entities_tags')
.select('id', 'name', 'entity_id')
.whereIn('entity_id', entityIds)