Updated Vixen scraper with more informative API query.
This commit is contained in:
parent
43d8b93953
commit
18744372b3
|
@ -845,6 +845,10 @@ const tags = [
|
|||
name: 'POV',
|
||||
slug: 'pov',
|
||||
},
|
||||
{
|
||||
name: 'prone bone',
|
||||
slug: 'prone-bone',
|
||||
},
|
||||
{
|
||||
name: 'pussy eating',
|
||||
slug: 'pussy-eating',
|
||||
|
@ -2281,6 +2285,14 @@ const aliases = [
|
|||
name: 'pijpen',
|
||||
for: 'blowjob',
|
||||
},
|
||||
{
|
||||
name: 'pronebone',
|
||||
slug: 'prone-bone',
|
||||
},
|
||||
{
|
||||
name: 'prone',
|
||||
slug: 'prone-bone',
|
||||
},
|
||||
];
|
||||
|
||||
const priorities = [ // higher index is higher priority
|
||||
|
|
|
@ -936,8 +936,8 @@ async function associatePeople(releases, batchId, type = 'actor') {
|
|||
acc[release.id] = toBaseActors(release.actors, release);
|
||||
}
|
||||
|
||||
if (type === 'directors' && release.director) {
|
||||
acc[release.id] = toBaseActors([release.director], release);
|
||||
if (type === 'directors' && (release.director || release.directors)) {
|
||||
acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release);
|
||||
}
|
||||
|
||||
return acc;
|
||||
|
|
|
@ -353,6 +353,8 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
|
|||
if (typeof baseSource.defer === 'function') {
|
||||
const src = await baseSource.defer();
|
||||
|
||||
console.log(baseSource, src);
|
||||
|
||||
return {
|
||||
...baseSource,
|
||||
...toBaseSource(src),
|
||||
|
|
|
@ -316,6 +316,8 @@ function scrapeProfile(html, url, actorName, entity) {
|
|||
|
||||
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), (el) => el.href);
|
||||
|
||||
console.log(profile);
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
|
|
|
@ -17,48 +17,62 @@ function random(array) {
|
|||
|
||||
function femaleAdjective() {
|
||||
return random([
|
||||
'hot',
|
||||
'young',
|
||||
'new',
|
||||
'busty',
|
||||
'insatiable',
|
||||
'depraved',
|
||||
'horny',
|
||||
'flexible',
|
||||
'bubble butt',
|
||||
'voluptuous',
|
||||
'curvy',
|
||||
'skinny',
|
||||
'nerdy',
|
||||
'oiled',
|
||||
'tied up',
|
||||
'bound',
|
||||
'Asian',
|
||||
'Russian',
|
||||
'Latina',
|
||||
'Russian',
|
||||
'bound',
|
||||
'bubble butt',
|
||||
'busty',
|
||||
'cock-hungry',
|
||||
'cum-hungry',
|
||||
'adickted',
|
||||
'curvy',
|
||||
'depraved',
|
||||
'ebony',
|
||||
'flexible',
|
||||
'greedy',
|
||||
'horny',
|
||||
'hot',
|
||||
'insatiable',
|
||||
'nerdy',
|
||||
'new',
|
||||
'oiled',
|
||||
'shy',
|
||||
'skinny',
|
||||
'tied up',
|
||||
'voluptuous',
|
||||
'young',
|
||||
]);
|
||||
}
|
||||
|
||||
/**
 * Returns one adjective describing a male performer.
 * The pick from the fixed word list is delegated to the `random` helper.
 */
function maleAdjective() {
  const adjectives = ['toned', 'bulky', 'nerdy', 'strong', 'shy'];

  return random(adjectives);
}
|
||||
|
||||
/**
 * Returns one adjective describing a scene.
 * The pick from the fixed word list is delegated to the `random` helper.
 */
function sceneAdjective() {
  const adjectives = ['first', 'hot', 'hottest', 'wild', 'wildest', 'deep', 'deepest'];

  return random(adjectives);
}
|
||||
|
||||
/**
 * Returns one adjective describing a group scene.
 * Two of the candidates embed a freshly drawn head count; the final pick is
 * delegated to the `random` helper.
 */
function groupSceneAdjective() {
  // integer from 4 up to and including 23, drawn anew on every call
  const headCount = () => Math.floor(Math.random() * 20) + 4;

  const adjectives = [
    'big',
    'biggest',
    `${headCount()}-guy`,
    `${headCount()}-man`,
  ];

  return random(adjectives);
}
|
||||
|
||||
|
@ -69,6 +83,8 @@ function dickAdjective() {
|
|||
'throbbing',
|
||||
'thick',
|
||||
'long',
|
||||
'huge',
|
||||
'girthy',
|
||||
'monster',
|
||||
`${Math.floor(Math.random() * 12) + 9} inch`,
|
||||
]);
|
||||
|
@ -77,20 +93,22 @@ function dickAdjective() {
|
|||
function femaleNoun() {
|
||||
return random([
|
||||
'MILF',
|
||||
'teen',
|
||||
'spinner',
|
||||
'coed',
|
||||
'redhead',
|
||||
'beauty',
|
||||
'blonde',
|
||||
'nympho',
|
||||
'brunette',
|
||||
'maid',
|
||||
'student',
|
||||
'coed',
|
||||
'dominatrix',
|
||||
'stepsister',
|
||||
'schoolgirl',
|
||||
'maid',
|
||||
'nurse',
|
||||
'nympho',
|
||||
'redhead',
|
||||
'schoolgirl',
|
||||
'slut',
|
||||
'spinner',
|
||||
'stepsister',
|
||||
'student',
|
||||
'teen',
|
||||
'whore',
|
||||
]);
|
||||
}
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@
|
|||
/* eslint-disable newline-per-chained-call */
|
||||
const Promise = require('bluebird');
|
||||
const moment = require('moment');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
const http = require('../utils/http');
|
||||
|
@ -41,11 +42,60 @@ function curateSources(sources, type = 'image/jpeg') {
|
|||
- Math.abs(1.8 - Number((resB.width / resB.height).toFixed(1))));
|
||||
}
|
||||
|
||||
async function getTrailer(scene, channel, url) {
|
||||
/**
 * Converts scene records from a listing response into base release objects.
 *
 * @param {Object[]} scenes - Scene records; each is read for videoId, slug, title,
 *   releaseDate, modelsSlugged, images.listing, previews.listing and rating.
 * @param {Object} channel - Channel whose `url` prefixes the scene and model links.
 * @returns {Object[]} One release per scene, in input order.
 */
function scrapeAll(scenes, channel) {
  const toActor = (model) => ({
    name: model.name,
    url: `${channel.url}/models/${model.slugged}`,
  });

  return scenes.map((data) => ({
    entryId: data.videoId,
    url: `${channel.url}/videos/${data.slug}`,
    title: data.title,
    date: qu.extractDate(data.releaseDate),
    actors: data.modelsSlugged.map((model) => toActor(model)),
    poster: curateSources(data.images.listing),
    teaser: curateSources(data.previews.listing, 'video/mp4'),
    stars: data.rating,
  }));
}
|
||||
|
||||
/**
 * Builds the release for an upcoming scene.
 *
 * @param {Object} scene - Upcoming scene record; read for videoId, slug, releaseDate,
 *   models, images.poster and previews.poster.
 * @param {Object} site - Site whose `url` prefixes the scene link.
 * @returns {Object[]|null} A single-element array holding the release, or null when
 *   there is no scene or the scene has `isPreReleasePeriod` set.
 */
function scrapeUpcoming(scene, site) {
  if (!scene || scene.isPreReleasePeriod) {
    return null;
  }

  const capitalize = (word) => `${word.charAt(0).toUpperCase()}${word.slice(1)}`;

  const release = {
    entryId: scene.videoId,
    url: `${site.url}/videos/${scene.slug}`,
    // the title is rebuilt from the slug: hyphen-separated words, each capitalized
    title: scene.slug.split('-').map((word) => capitalize(word)).join(' '),
    date: moment.utc(scene.releaseDate).toDate(),
    datePrecision: 'minute',
    actors: scene.models.map((model) => model.name),
    poster: curateSources(scene.images.poster),
    // NOTE(review): scrapeAll passes 'video/mp4' when curating its teaser; no type is passed here — confirm this is intended
    teaser: curateSources(scene.previews.poster),
  };

  return [release];
}
|
||||
|
||||
async function getTrailer(videoId, channel, url) {
|
||||
const res = await http.post(`${channel.url}/graphql`, {
|
||||
operationName: 'getToken',
|
||||
variables: {
|
||||
videoId: scene.newId,
|
||||
videoId,
|
||||
device: 'trailer',
|
||||
},
|
||||
query: `
|
||||
|
@ -134,120 +184,7 @@ async function getTrailer(scene, channel, url) {
|
|||
return null;
|
||||
}
|
||||
|
||||
/**
 * Converts scene records from a listing response into base release objects.
 *
 * @param {Object[]} scenes - Scene records; each is read for videoId, slug, title,
 *   releaseDate, modelsSlugged, images.listing, previews.listing and rating.
 * @param {Object} channel - Channel whose `url` prefixes the scene and model links.
 * @returns {Object[]} One release per scene, in input order.
 */
function scrapeAll(scenes, channel) {
  const toActor = (model) => ({
    name: model.name,
    url: `${channel.url}/models/${model.slugged}`,
  });

  return scenes.map((data) => ({
    entryId: data.videoId,
    url: `${channel.url}/videos/${data.slug}`,
    title: data.title,
    date: qu.extractDate(data.releaseDate),
    actors: data.modelsSlugged.map((model) => toActor(model)),
    poster: curateSources(data.images.listing),
    teaser: curateSources(data.previews.listing, 'video/mp4'),
    stars: data.rating,
  }));
}
|
||||
|
||||
/**
 * Builds the release for an upcoming scene.
 *
 * @param {Object} scene - Upcoming scene record; read for videoId, slug, releaseDate,
 *   models, images.poster and previews.poster.
 * @param {Object} site - Site whose `url` prefixes the scene link.
 * @returns {Object[]|null} A single-element array holding the release, or null when
 *   there is no scene or the scene has `isPreReleasePeriod` set.
 */
function scrapeUpcoming(scene, site) {
  if (!scene || scene.isPreReleasePeriod) {
    return null;
  }

  const capitalize = (word) => `${word.charAt(0).toUpperCase()}${word.slice(1)}`;

  const release = {
    entryId: scene.videoId,
    url: `${site.url}/videos/${scene.slug}`,
    // the title is rebuilt from the slug: hyphen-separated words, each capitalized
    title: scene.slug.split('-').map((word) => capitalize(word)).join(' '),
    date: moment.utc(scene.releaseDate).toDate(),
    datePrecision: 'minute',
    actors: scene.models.map((model) => model.name),
    poster: curateSources(scene.images.poster),
    // NOTE(review): scrapeAll passes 'video/mp4' when curating its teaser; no type is passed here — confirm this is intended
    teaser: curateSources(scene.previews.poster),
  };

  return [release];
}
|
||||
|
||||
/**
 * Looks up extra scene details through the channel's GraphQL search endpoint.
 *
 * The search is keyed on the release title; the matching result is the one whose
 * videoId equals the release's entryId.
 *
 * @param {Object} release - Release being scraped; `title` is the search query and
 *   `entryId` selects the matching node.
 * @param {Object} channel - Channel; `url` is the endpoint origin and referer, and
 *   `slug` (upper-cased) is sent as the `site` variable.
 * @param {*} session - Passed through to `http.get` as the `session` option.
 * @returns {Promise<Object|null>} The matching search node, or null when the request
 *   fails or no node matches.
 */
async function fetchGraphqlDetails(release, channel, session) {
  const query = `
    query($query: String!, $site: Site!) {
      searchVideos(input: {
        query: $query
        site: $site
      }) {
        edges {
          node {
            videoId
            title
            slug
            description
            releaseDate
            categories {
              name
            }
            chapters {
              video {
                title
                seconds
              }
            }
            models {
              name
            }
            images {
              poster {
                ...ImageInfo
              }
            }
          }
        }
      }
    }

    fragment ImageInfo on Image {
      src
      highdpi {
        double
      }
    }
  `;

  const variables = JSON.stringify({
    site: channel.slug.toUpperCase(),
    query: release.title,
  });

  // encodeURIComponent (not encodeURI) so that '&', '#', '+' or '%' in the title
  // cannot terminate or corrupt the query-string parameters
  const endpoint = `${channel.url}/graphql?query=${encodeURIComponent(query)}&variables=${encodeURIComponent(variables)}`;

  const res = await http.get(endpoint, {
    session,
    headers: {
      referer: channel.url,
      accept: '*/*',
    },
  });

  if (res.ok) {
    return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.videoId === release.entryId)?.node || null;
  }

  return null;
}
|
||||
|
||||
async function scrapeScene(data, url, channel, options, session) {
|
||||
async function scrapeScene(data, url, channel, options) {
|
||||
const release = {
|
||||
url,
|
||||
entryId: data.video.videoId || data.video.newId,
|
||||
|
@ -273,30 +210,171 @@ async function scrapeScene(data, url, channel, options, session) {
|
|||
: data.video.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean);
|
||||
|
||||
if (options.includeTrailers) {
|
||||
const trailer = await getTrailer(data.video, channel, url);
|
||||
|
||||
if (trailer) {
|
||||
release.trailer = trailer;
|
||||
}
|
||||
release.trailer = await getTrailer(release.entryId, channel, release.url);
|
||||
}
|
||||
|
||||
release.qualities = data.video?.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height
|
||||
|
||||
const graphqlDetails = await fetchGraphqlDetails(release, channel, session);
|
||||
|
||||
if (graphqlDetails) {
|
||||
release.tags = graphqlDetails.categories?.map((category) => category.name);
|
||||
release.chapters = graphqlDetails.chapters?.video?.map((chapter) => ({
|
||||
time: chapter.seconds,
|
||||
tags: [chapter.title],
|
||||
}));
|
||||
}
|
||||
|
||||
release.channel = data.video?.id.split(':')[0];
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
/**
 * Builds a release from a scene node returned by the GraphQL search API.
 *
 * Optional list fields (directors, categories, downloadResolutions, chapters,
 * carousel, images) may be null or missing in the response; they are then left
 * undefined on the release instead of aborting the whole scene with a TypeError.
 *
 * @param {Object} data - Scene node from the GraphQL response.
 * @param {Object} channel - Channel whose `url` prefixes the scene link.
 * @param {Object} options - Scrape options; `includeTrailers` enables the trailer lookup.
 * @returns {Promise<Object>} The assembled release.
 */
async function scrapeSceneData(data, channel, options) {
  const release = {};

  release.entryId = data.videoId;
  release.url = `${channel.url}/videos/${data.slug}`;

  release.title = data.title;
  release.description = data.description;

  release.date = new Date(data.releaseDate);
  release.duration = unprint.extractDuration(data.runLength);

  release.actors = data.models;

  release.directors = data.directors?.map((director) => ({
    entryId: director.directorId,
    name: director.name,
  }));

  release.poster = curateSources(data.images?.poster);
  release.photos = data.carousel?.map((photo) => photo.main?.[0]?.src).filter(Boolean);

  if (options?.includeTrailers) {
    release.trailer = await getTrailer(release.entryId, channel, release.url);
  }

  release.tags = data.categories?.map((category) => category.name);
  release.qualities = data.downloadResolutions?.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height

  release.chapters = data.chapters?.video?.map((chapter) => ({
    time: chapter.seconds,
    tags: [chapter.title],
  }));

  release.channel = data.site;
  release.stars = data.rating;

  return release;
}
|
||||
|
||||
/**
 * Fetches a scene's details from the channel's GraphQL search endpoint.
 *
 * The scene slug is taken from the release URL, turned into a short search query,
 * and the search result whose slug matches exactly is returned.
 *
 * @param {Object} release - Base release; `url` must contain a `/videos/<slug>` path.
 * @param {Object} channel - Channel; `url` is the endpoint origin and `slug`
 *   (upper-cased) is sent as the `site` variable.
 * @returns {Promise<Object|null>} The matching search node, or null when the URL has
 *   no scene slug, the request fails, or no result matches.
 */
async function fetchGraphqlScene(release, channel) {
  if (!release?.url) {
    return null;
  }

  const slug = new URL(release.url).pathname.match(/\/videos\/(.*)/)?.[1];

  // must run before the slug is split: a URL without a /videos/ path yields no slug
  if (!slug) {
    return null;
  }

  // the API won't reliably return results when the query is over ~30 characters for some reason,
  // so any word that would push the query past that length is left out
  const query = slug.split('-').reduce((acc, word) => {
    const newAcc = `${acc} ${word}`;

    if (newAcc.length > 30) {
      return acc;
    }

    return newAcc;
  }, '').trim();

  const res = await http.post(`${channel.url}/graphql`, {
    operationName: 'searchVideos',
    variables: {
      site: channel.slug.toUpperCase(),
      query,
    },
    query: `
      query searchVideos($site: Site!, $query: String!) {
        searchVideos(input: {
          query: $query
          site: $site
        }) {
          edges {
            node {
              videoId
              title
              slug
              description
              releaseDate
              runLength
              site
              rating
              models {
                name
              }
              directors {
                directorId
                name
              }
              categories {
                name
              }
              chapters {
                video {
                  title
                  seconds
                }
              }
              downloadResolutions {
                width
              }
              carousel {
                main {
                  src
                }
              }
              images {
                poster {
                  ...ImageInfo
                }
              }
            }
          }
        }
      }

      fragment ImageInfo on Image {
        src
        width
        height
        highdpi {
          double
        }
      }
    `,
  }, {
    headers: {
      referer: release.url,
      origin: channel.url,
    },
  });

  if (res.ok) {
    return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.slug === slug)?.node || null;
  }

  return null;
}
|
||||
|
||||
/**
 * Fetches and scrapes a single scene.
 *
 * The GraphQL search API is tried first; when it yields no match, the scene page is
 * fetched and its embedded `#__NEXT_DATA__` JSON is scraped instead.
 *
 * @param {string} url - Scene page URL, used for the page fallback.
 * @param {Object} channel - Channel the scene belongs to.
 * @param {Object} baseRelease - Base release handed to the GraphQL lookup.
 * @param {Object} options - Scrape options, forwarded to the scene scrapers.
 * @returns {Promise<Object|number|null>} The scraped release; the HTTP status when the
 *   page request fails; or null when the page carries no usable page data.
 */
async function fetchScene(url, channel, baseRelease, options) {
  const graphqlData = await fetchGraphqlScene(baseRelease, channel);

  if (graphqlData) {
    return scrapeSceneData(graphqlData, channel, options);
  }

  const session = qu.session();
  const res = await qu.get(url, null, null, { session });

  if (!res.ok) {
    return res.status;
  }

  const dataString = res.item.query.html('#__NEXT_DATA__');
  const pageProps = dataString && JSON.parse(dataString)?.props?.pageProps;

  // a page without the embedded data blob has nothing to scrape; dereferencing it would throw
  if (!pageProps) {
    return null;
  }

  return scrapeScene(pageProps, url, channel, options, session);
}
|
||||
|
||||
async function fetchActorReleases(pages, model, origin) {
|
||||
const releasesPerPage = await Promise.map(pages, async (page) => {
|
||||
const url = `${origin}/api${model.targetUrl}?page=${page}`;
|
||||
|
@ -451,20 +529,6 @@ async function fetchUpcoming(channel) {
|
|||
return res.status;
|
||||
}
|
||||
|
||||
/**
 * Fetches a scene page and scrapes the `#__NEXT_DATA__` JSON embedded in it.
 *
 * @param {string} url - Scene page URL.
 * @param {Object} channel - Channel the scene belongs to.
 * @param {Object} baseRelease - Base release; not read by this function.
 * @param {Object} options - Scrape options, forwarded to `scrapeScene`.
 * @returns {Promise<Object|number|null>} The scraped release; the HTTP status when the
 *   page request fails; or null when the page carries no usable page data.
 */
async function fetchScene(url, channel, baseRelease, options) {
  const session = qu.session();
  const res = await qu.get(url, null, null, { session });

  if (!res.ok) {
    return res.status;
  }

  const dataString = res.item.query.html('#__NEXT_DATA__');
  const pageProps = dataString && JSON.parse(dataString)?.props?.pageProps;

  // a page without the embedded data blob has nothing to scrape; dereferencing it would throw
  if (!pageProps) {
    return null;
  }

  return scrapeScene(pageProps, url, channel, options, session);
}
|
||||
|
||||
async function fetchProfile({ name: actorName }, { site }, include) {
|
||||
const origin = site.url;
|
||||
const actorSlug = slugify(actorName);
|
||||
|
|
|
@ -98,7 +98,8 @@ async function matchReleaseTags(releases) {
|
|||
}
|
||||
|
||||
async function getEntityTags(releases) {
|
||||
const entityIds = releases.map((release) => release.entity?.id).filter(Boolean);
|
||||
const entityIds = Array.from(new Set(releases.map((release) => release.entity?.id).filter(Boolean)));
|
||||
|
||||
const entityTags = await knex('entities_tags')
|
||||
.select('id', 'name', 'entity_id')
|
||||
.whereIn('entity_id', entityIds)
|
||||
|
|
Loading…
Reference in New Issue