From 18744372b3b4aaa2464b5cf53ad40e29ece085e4 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Thu, 6 Jul 2023 04:24:47 +0200 Subject: [PATCH] Updated Vixen scraper with more informative API query. --- seeds/00_tags.js | 12 ++ src/actors.js | 4 +- src/media.js | 2 + src/scrapers/julesjordan.js | 2 + src/scrapers/traxxx.js | 70 ++++--- src/scrapers/vixen.js | 356 +++++++++++++++++++++--------------- src/tags.js | 3 +- 7 files changed, 274 insertions(+), 175 deletions(-) diff --git a/seeds/00_tags.js b/seeds/00_tags.js index b70957c22..df09cc4c6 100755 --- a/seeds/00_tags.js +++ b/seeds/00_tags.js @@ -845,6 +845,10 @@ const tags = [ name: 'POV', slug: 'pov', }, + { + name: 'prone bone', + slug: 'prone-bone', + }, { name: 'pussy eating', slug: 'pussy-eating', @@ -2281,6 +2285,14 @@ const aliases = [ name: 'pijpen', for: 'blowjob', }, + { + name: 'pronebone', + slug: 'prone-bone', + }, + { + name: 'prone', + slug: 'prone-bone', + }, ]; const priorities = [ // higher index is higher priority diff --git a/src/actors.js b/src/actors.js index 24999fb10..3d00093a0 100755 --- a/src/actors.js +++ b/src/actors.js @@ -936,8 +936,8 @@ async function associatePeople(releases, batchId, type = 'actor') { acc[release.id] = toBaseActors(release.actors, release); } - if (type === 'directors' && release.director) { - acc[release.id] = toBaseActors([release.director], release); + if (type === 'directors' && (release.director || release.directors)) { + acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release); } return acc; diff --git a/src/media.js b/src/media.js index b3164ad5e..c72a55371 100755 --- a/src/media.js +++ b/src/media.js @@ -353,6 +353,8 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) { if (typeof baseSource.defer === 'function') { const src = await baseSource.defer(); + console.log(baseSource, src); + return { ...baseSource, ...toBaseSource(src), diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index 6bef11737..ffe19dbb6 100755 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -316,6 +316,8 @@ function scrapeProfile(html, url, actorName, entity) { profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), (el) => el.href); + console.log(profile); + return profile; } diff --git a/src/scrapers/traxxx.js b/src/scrapers/traxxx.js index 4c0a5a17c..51a54b1ec 100755 --- a/src/scrapers/traxxx.js +++ b/src/scrapers/traxxx.js @@ -17,48 +17,62 @@ function random(array) { function femaleAdjective() { return random([ - 'hot', - 'young', - 'new', - 'busty', - 'insatiable', - 'depraved', - 'horny', - 'flexible', - 'bubble butt', - 'voluptuous', - 'curvy', - 'skinny', - 'nerdy', - 'oiled', - 'tied up', - 'bound', 'Asian', - 'Russian', 'Latina', + 'Russian', + 'bound', + 'bubble butt', + 'busty', + 'cock-hungry', + 'cum-hungry', + 'adickted', + 'curvy', + 'depraved', 'ebony', + 'flexible', + 'greedy', + 'horny', + 'hot', + 'insatiable', + 'nerdy', + 'new', + 'oiled', + 'shy', + 'skinny', + 'tied up', + 'voluptuous', + 'young', ]); } function maleAdjective() { return random([ 'toned', + 'bulky', 'nerdy', + 'strong', + 'shy', ]); } function sceneAdjective() { return random([ 'first', + 'hot', 'hottest', + 'wild', 'wildest', + 'deep', 'deepest', ]); } function groupSceneAdjective() { return random([ + 'big', 'biggest', + `${Math.floor(Math.random() * 20) + 4}-guy`, + `${Math.floor(Math.random() * 20) + 4}-man`, ]); } @@ -69,6 +83,8 @@ function dickAdjective() { 'throbbing', 'thick', 'long', + 'huge', + 'girthy', 'monster', `${Math.floor(Math.random() * 12) + 9} inch`, ]); @@ -77,20 +93,22 @@ function dickAdjective() { function femaleNoun() { return random([ 'MILF', - 'teen', - 'spinner', - 'coed', - 'redhead', 'beauty', 'blonde', - 'nympho', 'brunette', - 'maid', - 'student', + 'coed', 'dominatrix', - 'stepsister', - 'schoolgirl', + 'maid', 'nurse', + 'nympho', + 'redhead', + 'schoolgirl', + 'slut', + 'spinner', + 'stepsister', + 'student', + 'teen', + 'whore', ]); } diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js index 3116551b5..aa9e04261 100755 --- a/src/scrapers/vixen.js +++ b/src/scrapers/vixen.js @@ -3,6 +3,7 @@ /* eslint-disable newline-per-chained-call */ const Promise = require('bluebird'); const moment = require('moment'); +const unprint = require('unprint'); const qu = require('../utils/qu'); const http = require('../utils/http'); @@ -41,11 +42,60 @@ function curateSources(sources, type = 'image/jpeg') { - Math.abs(1.8 - Number((resB.width / resB.height).toFixed(1)))); } -async function getTrailer(scene, channel, url) { +function scrapeAll(scenes, channel) { + return scenes.map((data) => { + const release = {}; + + release.entryId = data.videoId; + release.url = `${channel.url}/videos/${data.slug}`; + release.title = data.title; + + release.date = qu.extractDate(data.releaseDate); + release.actors = data.modelsSlugged.map((model) => ({ + name: model.name, + url: `${channel.url}/models/${model.slugged}`, + })); + + release.poster = curateSources(data.images.listing); + release.teaser = curateSources(data.previews.listing, 'video/mp4'); + + release.stars = data.rating; + + return release; + }); +} + +function scrapeUpcoming(scene, site) { + if (!scene || scene.isPreReleasePeriod) { + return null; + } + + const release = {}; + + release.entryId = scene.videoId; + release.url = `${site.url}/videos/${scene.slug}`; + + release.title = scene.slug + .split('-') + .map((component) => `${component.charAt(0).toUpperCase()}${component.slice(1)}`) + .join(' '); + + release.date = moment.utc(scene.releaseDate).toDate(); + release.datePrecision = 'minute'; + + release.actors = scene.models.map((model) => model.name); + + release.poster = curateSources(scene.images.poster); + release.teaser = curateSources(scene.previews.poster); + + return [release]; +} + +async function getTrailer(videoId, channel, url) { const res = await http.post(`${channel.url}/graphql`, { operationName: 'getToken', variables: { - videoId: scene.newId, + videoId, device: 'trailer', }, query: ` @@ -134,120 +184,7 @@ async function getTrailer(scene, channel, url) { return null; } -function scrapeAll(scenes, channel) { - return scenes.map((data) => { - const release = {}; - - release.entryId = data.videoId; - release.url = `${channel.url}/videos/${data.slug}`; - release.title = data.title; - - release.date = qu.extractDate(data.releaseDate); - release.actors = data.modelsSlugged.map((model) => ({ - name: model.name, - url: `${channel.url}/models/${model.slugged}`, - })); - - release.poster = curateSources(data.images.listing); - release.teaser = curateSources(data.previews.listing, 'video/mp4'); - - release.stars = data.rating; - - return release; - }); -} - -function scrapeUpcoming(scene, site) { - if (!scene || scene.isPreReleasePeriod) { - return null; - } - - const release = {}; - - release.entryId = scene.videoId; - release.url = `${site.url}/videos/${scene.slug}`; - - release.title = scene.slug - .split('-') - .map((component) => `${component.charAt(0).toUpperCase()}${component.slice(1)}`) - .join(' '); - - release.date = moment.utc(scene.releaseDate).toDate(); - release.datePrecision = 'minute'; - - release.actors = scene.models.map((model) => model.name); - - release.poster = curateSources(scene.images.poster); - release.teaser = curateSources(scene.previews.poster); - - return [release]; -} - -async function fetchGraphqlDetails(release, channel, session) { - const query = ` - query($query: String!, $site: Site!) { - searchVideos(input: { - query: $query - site: $site - }) { - edges { - node { - videoId - title - slug - description - releaseDate - categories { - name - } - chapters { - video { - title - seconds - } - } - models { - name - } - images { - poster { - ...ImageInfo - } - } - } - } - } - } - - fragment ImageInfo on Image { - src - highdpi { - double - } - } - `; - - const variables = JSON.stringify({ - site: channel.slug.toUpperCase(), - query: release.title, - }); - - const res = await http.get(`${channel.url}/graphql?query=${encodeURI(query)}&variables=${variables}`, { - session, - headers: { - referer: channel.url, - accept: '*/*', - }, - }); - - if (res.ok) { - return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.videoId === release.entryId)?.node || null; - } - - return null; -} - -async function scrapeScene(data, url, channel, options, session) { +async function scrapeScene(data, url, channel, options) { const release = { url, entryId: data.video.videoId || data.video.newId, @@ -273,30 +210,171 @@ async function scrapeScene(data, url, channel, options, session) { : data.video.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean); if (options.includeTrailers) { - const trailer = await getTrailer(data.video, channel, url); - - if (trailer) { - release.trailer = trailer; - } + release.trailer = await getTrailer(release.entryId, channel, release.url); } release.qualities = data.video?.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height - - const graphqlDetails = await fetchGraphqlDetails(release, channel, session); - - if (graphqlDetails) { - release.tags = graphqlDetails.categories?.map((category) => category.name); - release.chapters = graphqlDetails.chapters?.video?.map((chapter) => ({ - time: chapter.seconds, - tags: [chapter.title], - })); - } - release.channel = data.video?.id.split(':')[0]; return release; } +async function scrapeSceneData(data, channel, options) { + const release = {}; + + release.entryId = data.videoId; + release.url = `${channel.url}/videos/${data.slug}`; + + release.title = data.title; + release.description = data.description; + + release.date = new Date(data.releaseDate); + release.duration = unprint.extractDuration(data.runLength); + + release.actors = data.models; + + release.directors = data.directors.map((director) => ({ + entryId: director.directorId, + name: director.name, + })); + + release.poster = curateSources(data.images?.poster); + release.photos = data.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean); + + if (options.includeTrailers) { + release.trailer = await getTrailer(release.entryId, channel, release.url); + } + + release.tags = data.categories.map((category) => category.name); + release.qualities = data.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height + + release.chapters = data.chapters.video?.map((chapter) => ({ + time: chapter.seconds, + tags: [chapter.title], + })); + + release.channel = data.site; + release.stars = data.rating; + + return release; +} + +async function fetchGraphqlScene(release, channel) { + const slug = new URL(release.url).pathname.match(/\/videos\/(.*)/)?.[1]; + // the API won't reliable return results when the query is over ~30 characters for some reason + const query = slug.split('-').reduce((acc, word) => { + const newAcc = `${acc} ${word}`; + + if (newAcc.length > 30) { + return acc; + } + + return newAcc; + }, '').trim(); + + if (!slug) { + return null; + } + + const res = await http.post(`${channel.url}/graphql`, { + operationName: 'searchVideos', + variables: { + site: channel.slug.toUpperCase(), + query, + }, + query: ` + query searchVideos($site: Site!, $query: String!) { + searchVideos(input: { + query: $query + site: $site + }) { + edges { + node { + videoId + title + slug + description + releaseDate + runLength + site + rating + models { + name + } + directors { + directorId + name + } + categories { + name + } + chapters { + video { + title + seconds + } + } + downloadResolutions { + width + } + carousel { + main { + src + } + } + images { + poster { + ...ImageInfo + } + } + } + } + } + } + + fragment ImageInfo on Image { + src + width + height + highdpi { + double + } + } + `, + }, { + headers: { + referer: release.url, + origin: channel.url, + }, + }); + + if (res.ok) { + return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.slug === slug)?.node || null; + } + + return null; +} + +async function fetchScene(url, channel, baseRelease, options) { + const graphqlData = await fetchGraphqlScene(baseRelease, channel); + + if (graphqlData) { + return scrapeSceneData(graphqlData, channel, options); + } + + const session = qu.session(); + const res = await qu.get(url, null, null, { session }); + + if (res.ok) { + const dataString = res.item.query.html('#__NEXT_DATA__'); + const data = dataString && JSON.parse(dataString); + + return scrapeScene(data.props.pageProps, url, channel, options, session); + } + + return res.status; +} + async function fetchActorReleases(pages, model, origin) { const releasesPerPage = await Promise.map(pages, async (page) => { const url = `${origin}/api${model.targetUrl}?page=${page}`; @@ -451,20 +529,6 @@ async function fetchUpcoming(channel) { return res.status; } -async function fetchScene(url, channel, baseRelease, options) { - const session = qu.session(); - const res = await qu.get(url, null, null, { session }); - - if (res.ok) { - const dataString = res.item.query.html('#__NEXT_DATA__'); - const data = dataString && JSON.parse(dataString); - - return scrapeScene(data.props.pageProps, url, channel, options, session); - } - - return res.status; -} - async function fetchProfile({ name: actorName }, { site }, include) { const origin = site.url; const actorSlug = slugify(actorName); diff --git a/src/tags.js b/src/tags.js index 7bdbd4e07..2c510c105 100755 --- a/src/tags.js +++ b/src/tags.js @@ -98,7 +98,8 @@ async function matchReleaseTags(releases) { } async function getEntityTags(releases) { - const entityIds = releases.map((release) => release.entity?.id).filter(Boolean); + const entityIds = Array.from(new Set(releases.map((release) => release.entity?.id).filter(Boolean))); + const entityTags = await knex('entities_tags') .select('id', 'name', 'entity_id') .whereIn('entity_id', entityIds)