Using better video API for Vixen deep scrape.

This commit is contained in:
DebaucheryLibrarian 2023-07-07 02:20:07 +02:00
parent 10ba67fde1
commit 6f4608ba23
1 changed files with 142 additions and 67 deletions

View File

@ -5,6 +5,7 @@ const Promise = require('bluebird');
const moment = require('moment');
const unprint = require('unprint');
const argv = require('../argv');
const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
@ -259,41 +260,7 @@ async function scrapeSceneData(data, channel, options) {
return release;
}
async function fetchGraphqlScene(release, channel) {
const slug = new URL(release.url).pathname.match(/\/videos\/(.*)/)?.[1];
if (!slug) {
return null;
}
// the API won't reliably return results when the query is over ~30 characters for some reason
// it may still occasionally fail to return the relevant result first, e.g. for Blacked Raw - After the Show
const query = slug.split('-').reduce((acc, word) => {
const newAcc = `${acc} ${word}`;
if (newAcc.length > 30) {
return acc;
}
return newAcc;
}, '').trim();
const res = await http.post(`${channel.url}/graphql`, {
operationName: 'searchVideos',
variables: {
site: channel.slug.toUpperCase(),
query,
},
// ranking can be weird, use higher limit to increase likelihood of finding scene
query: `
query searchVideos($site: Site!, $query: String!) {
searchVideos(input: {
query: $query
site: $site
first: 50
}) {
edges {
node {
const videoFields = `
videoId
title
slug
@ -331,10 +298,52 @@ async function fetchGraphqlScene(release, channel) {
...ImageInfo
}
}
`;
/**
 * Resolve the scene slug for a release.
 *
 * An explicit `release.slug` takes precedence; otherwise the slug is taken
 * from everything after `/videos/` in the path of `release.url`.
 *
 * @param {{ slug?: string, url?: string }} release - release with an optional slug and/or URL
 * @returns {string|null} the slug, or null when none can be determined
 * @throws {TypeError} if `release.url` is set but cannot be parsed by the URL constructor
 */
function getSlug(release) {
	if (release.slug) {
		return release.slug;
	}

	if (release.url) {
		const { pathname } = new URL(release.url);

		// a path without a /videos/ segment yields undefined, and a bare /videos/ yields '';
		// normalize both to null so every "no slug" outcome has the same value
		return pathname.match(/\/videos\/(.*)/)?.[1] || null;
	}

	return null;
}
async function fetchGraphqlScene(release, channel) {
const slug = getSlug(release);
const entryId = argv.entryId || release.entryId;
if (!entryId && !slug) {
return null;
}
const query = entryId
? `
query searchVideos($videoId: ID!) {
video: findOneVideo(input: { videoId: $videoId }) {
${videoFields}
}
}
`
: `
query searchVideos($slug: String!) {
video: findOneVideo(input: { slug: $slug }) {
${videoFields}
}
}
`;
const res = await http.post(`${channel.url}/graphql`, {
operationName: 'searchVideos',
variables: {
videoId: entryId,
slug,
},
query: `
${query}
fragment ImageInfo on Image {
src
@ -353,7 +362,7 @@ async function fetchGraphqlScene(release, channel) {
});
if (res.ok) {
return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.slug === slug)?.node || null;
return res.body.data.video;
}
return null;
@ -556,3 +565,69 @@ module.exports = {
fetchScene,
fetchProfile,
};
/* less reliable search API in case direct video query becomes unavailable
async function findGraphqlScene(release, channel) {
const slug = new URL(release.url).pathname.match(/\/videos\/(.*)/)?.[1];
if (!slug) {
return null;
}
// the API won't reliably return results when the query is over ~30 characters for some reason
// it may still occasionally fail to return the relevant result first, e.g. for Blacked Raw - After the Show
const query = slug.split('-').reduce((acc, word) => {
const newAcc = `${acc} ${word}`;
if (newAcc.length > 30) {
return acc;
}
return newAcc;
}, '').trim();
const res = await http.post(`${channel.url}/graphql`, {
operationName: 'searchVideos',
variables: {
site: channel.slug.toUpperCase(),
query,
},
// ranking can be weird, use higher limit to increase likelihood of finding scene
query: `
query searchVideos($site: Site!, $query: String!) {
searchVideos(input: {
query: $query
site: $site
first: 50
}) {
edges {
node {
${videoFields}
}
}
}
}
fragment ImageInfo on Image {
src
width
height
highdpi {
double
}
}
`,
}, {
headers: {
referer: release.url,
origin: channel.url,
},
});
if (res.ok) {
return res.body.data?.searchVideos?.edges?.find((edge) => edge.node.slug === slug)?.node || null;
}
return null;
}
*/