Refactored Vixen scraper, moved XEmpire to generic Gamma scraper after site update, added BiPhoria.

This commit is contained in:
DebaucheryLibrarian
2022-05-15 23:28:56 +02:00
parent 0d8c92aac9
commit 527112d5da
35 changed files with 213 additions and 156 deletions

View File

@@ -457,7 +457,7 @@ async function scrapeReleaseApi(data, site, options) {
release.trailer = Object.entries(data.trailers).map(([quality, source]) => ({ src: source, quality }));
}
if (data.movie_id && !data.movie_path) {
if (data.movie_id && !data.movie_path && options.parameters.sceneMovies !== false) {
release.movie = {
entryId: data.movie_id,
title: data.movie_title,

View File

@@ -131,7 +131,18 @@ async function scrapeProfile({ query }, actorUrl, include) {
}
async function fetchLatest(site, page = 1) {
const res = await qu.getAll(`${site.url}/latest/page/${page}`, '.shoot-list .shoot');
// const res = await qu.getAll(`${site.url}/latest/page/${page}`, '.shoot-list .shoot', {
const res = await qu.getAll(`https://www.kink.com/channel/bound-gang-bangs/latest/page/${page}`, '.shoot-list .shoot', {
Host: 'www.kink.com',
'User-Agent': 'HTTPie/2.6.0',
'Accept-Encoding': 'gzip, deflate, br',
Accept: '*/*',
Connection: 'keep-alive',
}, {
includeDefaultHeaders: false,
followRedirects: false,
});
if (res.ok) {
return scrapeAll(res.items, site);

View File

@@ -65,7 +65,7 @@ const vixen = require('./vixen');
const vogov = require('./vogov');
const wankzvr = require('./wankzvr');
const whalemember = require('./whalemember');
const xempire = require('./xempire');
// const xempire = require('./xempire');
// profiles
const boobpedia = require('./boobpedia');
@@ -157,7 +157,7 @@ const scrapers = {
wankzvr,
westcoastproductions: adultempire,
whalemember,
xempire,
// xempire,
},
actors: {
'18vr': badoink,
@@ -291,7 +291,7 @@ const scrapers = {
westcoastproductions: adultempire,
wicked: gamma,
wildoncam: cherrypimps,
xempire,
xempire: gamma,
},
};

View File

@@ -4,7 +4,7 @@
const Promise = require('bluebird');
const moment = require('moment');
const logger = require('../logger')(__filename);
const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
@@ -49,6 +49,26 @@ function getAvatarFallbacks(avatar) {
.flat();
}
/**
 * Normalizes a list of media sources and orders them by preference.
 *
 * Each source is reduced to src, width, height and type, where a missing type falls back to
 * the `type` argument. Every entry also gets an `expectType` map from 'binary/octet-stream'
 * to `type` (presumably so a generic binary response is treated as the wanted media type by
 * the consumer; confirm against the media pipeline).
 *
 * Ordering: sources whose aspect ratio, rounded to one decimal, is closest to 1.8 (~16:9)
 * come first; among equally close sources the one with the most pixels comes first.
 *
 * @param {Array<{src: string, width: number, height: number, type?: string}>} sources
 * @param {string} [type='image/jpeg'] - Fallback and expected media type.
 * @returns {?Array<Object>} Curated sources, or null when `sources` is falsy.
 */
function curateSources(sources, type = 'image/jpeg') {
	if (!sources) {
		return null;
	}

	const pixelCount = (item) => item.width * item.height;
	const ratioOffset = (item) => Math.abs(1.8 - Number((item.width / item.height).toFixed(1)));

	const curated = sources.map(({ src, width, height, type: sourceType }) => ({
		src,
		width,
		height,
		type: sourceType || type,
		expectType: {
			'binary/octet-stream': type,
		},
	}));

	// two passes on purpose: the stable second sort keeps the pixel ordering among equal ratios
	curated.sort((itemA, itemB) => pixelCount(itemB) - pixelCount(itemA));
	curated.sort((itemA, itemB) => ratioOffset(itemA) - ratioOffset(itemB));

	return curated;
}
async function getTrailer(scene, channel, url) {
const res = await http.post(`${channel.url}/graphql`, {
operationName: 'getToken',
@@ -142,88 +162,27 @@ async function getTrailer(scene, channel, url) {
return null;
}
/*
async function getPhotosLegacy(url) {
const htmlRes = await http.get(url, {
extract: {
runScripts: 'dangerously',
},
});
try {
const state = htmlRes?.window?.__APOLLO_STATE__;
if (!state) {
return [];
}
const key = Object.values(state?.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];
if (!data) {
return [];
}
return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
} catch (error) {
logger.warn(`Failed to retrieve Vixen images: ${error.message}`);
return [];
}
}
*/
/**
 * Fetches a scene page and pulls the gallery photo URLs out of the Apollo state found on
 * the response's window object.
 *
 * @param {string} url - Scene page URL to request.
 * @returns {Promise<string[]>} Photo source URLs taken from the carousel, minus its first
 * entry. Resolves to an empty array when the state or the referenced record is missing, or
 * when extraction throws (a warning is logged in that case).
 */
async function getPhotos(url) {
	// NOTE(review): runScripts 'dangerously' presumably lets the page's inline scripts populate
	// window.__APOLLO_STATE__ — confirm against the http utility
	const htmlRes = await http.get(url, {
		parse: true,
		extract: {
			runScripts: 'dangerously',
		},
	});

	try {
		const state = htmlRes?.window?.__APOLLO_STATE__;

		if (!state) {
			return [];
		}

		// the first ROOT_QUERY entry carrying a __ref names the record to read the carousel from
		const key = Object.values(state.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
		const data = state[key];

		if (!data) {
			return [];
		}

		// the first carousel item is skipped (presumably the poster — TODO confirm); items with
		// a missing or empty `main` list are dropped instead of failing the whole gallery
		return data.carousel
			.slice(1)
			.map((photo) => photo.main?.[0]?.src)
			.filter(Boolean);
	} catch (error) {
		logger.warn(`Failed to retrieve Vixen images: ${error.message}`);

		return [];
	}
}
function scrapeAll(scenes, site, origin) {
return scenes.map((scene) => {
function scrapeAll(scenes, channel) {
return scenes.map((data) => {
const release = {};
release.title = scene.title;
release.entryId = data.videoId;
release.url = `${channel.url}/videos/${data.slug}`;
release.title = data.title;
release.entryId = String(scene.newId);
release.url = `${site?.url || origin}/videos${scene.targetUrl}`;
release.date = qu.extractDate(data.releaseDate);
release.actors = data.modelsSlugged.map((model) => ({
name: model.name,
url: `${channel.url}/models/${model.slugged}`,
}));
release.date = moment.utc(scene.releaseDate).toDate();
release.datePrecision = 'minute';
release.poster = curateSources(data.images.listing);
release.teaser = curateSources(data.previews.listing, 'video/mp4');
release.actors = scene.models;
release.stars = Number(scene.textRating) / 2;
release.stars = data.rating;
release.poster = getPosterFallbacks(scene.images.poster);
release.teaser = getTeaserFallbacks(scene.previews.poster);
console.log(data);
console.log(release);
return release;
});
@@ -252,47 +211,47 @@ function scrapeUpcoming(scene, site) {
release.entryId = (release.poster[0] || release.teaser[0])?.src?.match(/\/(\d+)/)?.[1];
console.log('upcoming', scene);
return [release];
}
async function scrapeScene(data, url, site, baseRelease, options) {
const scene = data.video;
async function scrapeScene(data, url, channel, options) {
const release = {
url,
title: scene.title,
description: scene.description,
actors: scene.models,
director: scene.directorNames,
duration: scene.runLength,
stars: scene.totalRateVal,
tags: scene.tags,
entryId: data.video.videoId || data.video.newId,
title: data.video.title,
description: data.video.description,
actors: data.video.models,
director: data.video.directorNames,
duration: qu.durationToSeconds(data.video.runLength),
stars: data.video.rating,
};
release.entryId = scene.newId;
release.entryId = data.video.newId;
release.date = qu.extractDate(data.video.releaseDate);
release.date = moment.utc(scene.releaseDate).toDate();
release.productionDate = moment.utc(scene.shootDate).toDate();
release.datePrecision = 'minute';
release.actors = data.video.modelsSlugged.map((model) => ({
name: model.name,
url: `${channel.url}/models/${model.slugged}`,
}));
release.actors = baseRelease?.actors || scene.models;
release.poster = curateSources(data.video.images?.poster) || data.video.videoImage?.src;
release.photos = data.galleryImages?.length > 0
? data.galleryImages.map((image) => image.src)
: data.video.carousel?.map((photo) => photo.main[0]?.src).filter(Boolean);
release.poster = getPosterFallbacks(scene.images.poster);
if (options.includeTrailers) {
const trailer = await getTrailer(data.video, channel, url);
// release.photos = data.pictureset.map(photo => photo.main[0]?.src).filter(Boolean);
if (options.includePhotos) {
release.photos = await getPhotos(url);
if (trailer) {
release.trailer = trailer;
}
}
release.teaser = getTeaserFallbacks(scene.previews.poster);
release.qualities = data.video?.downloadResolutions.map((quality) => Number(quality.width)).filter(Boolean); // width property is actually the height
const trailer = await getTrailer(scene, site, url);
if (trailer) release.trailer = trailer;
release.chapters = data.video.chapters?.video.map((chapter) => ({
tags: [chapter.title],
time: chapter.seconds,
}));
console.log(release);
return release;
}
@@ -346,13 +305,71 @@ async function scrapeProfile(data, origin, withReleases) {
return profile;
}
/**
 * Exploratory GraphQL search request against a channel's /graphql endpoint.
 *
 * Sends a fixed searchVideos query as a GET request and prints the raw body, any errors and
 * the returned nodes to the console. Nothing is returned and `page` is not used yet, so this
 * is not a drop-in replacement for fetchLatest.
 *
 * @param {{url: string, slug: string}} channel - Channel whose upper-cased slug is sent as the `site` variable.
 * @param {number} [page=1] - Currently unused.
 * @returns {Promise<void>}
 */
async function fetchLatestGraphql(channel, page = 1) { // eslint-disable-line no-unused-vars
	const query = `
query($query: String!, $site: Site!) {
searchVideos(input: {
query: $query
site: $site
}) {
edges {
node {
title
slug
description
releaseDate
categories {
name
}
chapters {
video {
title
seconds
}
}
models {
name
}
images {
poster {
...ImageInfo
}
}
}
}
}
}
fragment ImageInfo on Image {
src
highdpi {
double
}
}
`;

	const variables = JSON.stringify({
		site: channel.slug.toUpperCase(),
		query: 'alone at last', // fixed search term used while probing the API
	});

	// both values are encoded as query components; encodeURI would leave reserved characters
	// untouched and the JSON variables would otherwise put raw quotes, braces and spaces in the URL
	const res = await http.get(`${channel.url}/graphql?query=${encodeURIComponent(query)}&variables=${encodeURIComponent(variables)}`);

	console.log(res.body);
	console.log(res.body.errors);
	console.log(res.body.data?.searchVideos?.edges.map((edge) => edge.node));
}
async function fetchLatest(site, page = 1) {
const url = `${site.url}/api/videos?page=${page}`;
const res = await http.get(url);
const url = `${site.url}/videos?page=${page}`;
const res = await qu.get(url);
if (res.ok) {
if (res.body.data.videos) {
return scrapeAll(res.body.data.videos, site);
const dataString = res.item.query.html('#__NEXT_DATA__');
const data = dataString && JSON.parse(dataString);
if (data?.props.pageProps.edges) {
return scrapeAll(data.props.pageProps.edges.map((edge) => edge.node), site);
}
return [];
@@ -376,22 +393,14 @@ async function fetchUpcoming(site) {
return res.status;
}
async function fetchScene(url, site, baseRelease, options) {
const { origin, pathname } = new URL(url);
const apiUrl = `${origin}/api/${pathname.split('/').slice(-1)[0]}`;
const res = await http.get(apiUrl, {
extract: {
runScripts: 'dangerously',
},
});
async function fetchScene(url, channel, baseRelease, options) {
const res = await qu.get(url);
if (res.ok) {
if (res.body.data) {
return scrapeScene(res.body.data, url, site, baseRelease, options);
}
const dataString = res.item.query.html('#__NEXT_DATA__');
const data = dataString && JSON.parse(dataString);
return null;
return scrapeScene(data.props.pageProps, url, channel, options);
}
return res.status;
@@ -415,6 +424,7 @@ async function fetchProfile({ name: actorName }, { site }, include) {
}
module.exports = {
// fetchLatest: fetchLatestGraphql,
fetchLatest,
fetchUpcoming,
fetchScene,

View File

@@ -1,25 +0,0 @@
'use strict';
const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
const qu = require('../utils/qu');
/**
 * Fetches a scene page and scrapes it with the shared Gamma scene scraper, then fills in the
 * channel and director.
 *
 * The channel slug is the first label of the page's twitter:domain meta content, lower-cased.
 *
 * @param {string} url - Scene page URL.
 * @param {Object} site - Site entity passed through to the Gamma scraper.
 * @param {Object} baseRelease - Previously scraped release data passed through to the Gamma scraper.
 * @param {Object} options - Scrape options passed through to the Gamma scraper.
 * @returns {Promise<Object|number>} The scraped release, or the response status when the request failed.
 */
async function fetchScene(url, site, baseRelease, options) {
	const res = await qu.get(url);

	// a failed request has no page to scrape; report the status like the other fetchers do
	if (!res.ok) {
		return res.status;
	}

	const release = await scrapeScene(res.item, url, site, baseRelease, null, options);

	// only AllBlackX has no twitter domain, no other useful hints available
	const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com';

	release.channel = siteDomain.split('.')[0].toLowerCase();
	release.director = 'Mason';

	return release;
}
// fetchLatest, fetchProfile and fetchUpcoming are the Gamma implementations imported above,
// re-exported unchanged; only fetchScene is defined in this module.
module.exports = {
fetchLatest,
fetchProfile,
fetchUpcoming,
fetchScene,
};