Merge branch 'master' into experimental
@@ -14,6 +14,6 @@
    "prefer-destructuring": "off",
    "template-curly-spacing": "off",
    "object-curly-newline": "off",
-   "max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}],
+   "max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
  }
}
@@ -20,6 +20,7 @@ const scrapers = require('./scrapers/scrapers').actors;
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const bulkInsert = require('./utils/bulk-insert');
+const chunk = require('./utils/chunk');
const logger = require('./logger')(__filename);

const { toBaseReleases } = require('./deep');
@@ -1048,33 +1049,42 @@ async function flushProfiles(actorIdsOrNames) {
    logger.info(`Removed ${deleteCount} profiles`);
}

-async function deleteActors(actorIdsOrNames) {
-    const actors = await knex('actors')
-        .whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number'))
-        .orWhere((builder) => {
-            builder
-                .whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string'))
-                .whereNull('entity_id');
-        });
+async function deleteActors(allActorIdsOrNames) {
+    const deleteCounts = await Promise.map(chunk(allActorIdsOrNames), async (actorIdsOrNames) => {
+        const actors = await knex('actors')
+            .whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number'))
+            .orWhere((builder) => {
+                builder
+                    .whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string'))
+                    .whereNull('entity_id');
+            });

-    const actorIds = actors.map((actor) => actor.id);
+        const actorIds = actors.map((actor) => actor.id);

-    const sceneIds = await knex('releases_actors')
-        .select('releases.id')
-        .whereIn('actor_id', actorIds)
-        .leftJoin('releases', 'releases.id', 'releases_actors.release_id')
-        .pluck('id');
+        const sceneIds = await knex('releases_actors')
+            .select('releases.id')
+            .whereIn('actor_id', actorIds)
+            .leftJoin('releases', 'releases.id', 'releases_actors.release_id')
+            .pluck('id');

-    const [deletedScenesCount, deletedActorsCount] = await Promise.all([
-        deleteScenes(sceneIds),
-        knex('actors')
-            .whereIn('id', actorIds)
-            .delete(),
-    ]);
+        const [deletedScenesCount, deletedActorsCount] = await Promise.all([
+            deleteScenes(sceneIds),
+            knex('actors')
+                .whereIn('id', actorIds)
+                .delete(),
+        ]);
+
+        return { deletedScenesCount, deletedActorsCount };
+    }, { concurrency: 10 });
+
+    const deletedActorsCount = deleteCounts.reduce((acc, count) => acc + count.deletedActorsCount, 0);
+    const deletedScenesCount = deleteCounts.reduce((acc, count) => acc + count.deletedScenesCount, 0);

    await flushOrphanedMedia();

    logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`);

    return deletedActorsCount;
}

async function flushActors() {
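The rewritten deleteActors above batches the incoming IDs before querying. For context, a minimal sketch of the `chunk` helper it starts relying on — assuming the utility simply splits an array into fixed-size slices (the default size below is an illustrative assumption, not taken from the repository):

// Hypothetical sketch of ./utils/chunk — the real helper may differ.
// Splits an array into consecutive slices of at most `size` items, so each
// Promise.map iteration only passes a bounded ID list to knex's whereIn.
function chunk(items, size = 1000) { // default size is an assumption
    const chunks = [];

    for (let i = 0; i < items.length; i += size) {
        chunks.push(items.slice(i, i + size));
    }

    return chunks;
}

module.exports = chunk;

Combined with Bluebird's `{ concurrency: 10 }`, at most ten chunks are processed in parallel, which keeps both the `whereIn` parameter lists and the number of simultaneous delete queries bounded.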
95 src/app.js
@@ -1,5 +1,6 @@
'use strict';

const config = require('config');
+const util = require('util');
// const log = require('why-is-node-running');
const Inspector = require('inspector-api');
@@ -24,33 +25,64 @@ const getFileEntries = require('./utils/file-entries');
const inspector = new Inspector();
let done = false;

-function logActive() {
-    setTimeout(() => {
-        log();
-
-        if (!done) {
-            logActive();
-        }
-    }, typeof argv.logActive === 'number' ? argv.logActive : 60000);
-}
+function logActive() {
+    console.log('log active!');
+
+    setTimeout(() => {
+        // log();
+        logActive();
+    }, typeof argv.logActive === 'number' ? argv.logActive : 60000);
+}

/*
function monitorMemory() {
    logger.debug(`Memory usage: ${process.memoryUsage.rss() / 1000000} MB`);

    if (!done) {
        setTimeout(() => monitorMemory(), 10000);
    }
}
*/

+async function snapshotMemory(trigger) {
+    const profile = await inspector.heap.takeSnapshot();
+    const filepath = `traxxx_snapshot_${trigger}M_${dayjs().format('YYYY-MM-DD_HH-mm-ss')}.heapsnapshot`;
+
+    logger.info(`Starting heap snapshot, memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
+
+    await inspector.heap.disable();
+    await fs.writeFile(filepath, JSON.stringify(profile));
+
+    logger.info(`Saved heap snapshot to ${filepath}`);
+}
+
-async function stopMemorySample() {
+async function stopMemorySample(snapshotTriggers) {
+    const usage = process.memoryUsage.rss() / 1000000;
+
    const profile = await inspector.heap.stopSampling();
-    const filepath = `${dayjs().format('YYYY-MM-DD_HH-mm-ss')}.heapprofile`;
+    const filepath = `traxxx_sample_${dayjs().format('YYYY-MM-DD_HH-mm-ss')}.heapprofile`;

    await inspector.heap.disable();
    await fs.writeFile(filepath, JSON.stringify(profile));

    logger.info(`Saved heap sample to ${filepath}`);
+
+    if (usage > snapshotTriggers[0]) {
+        await snapshotMemory(snapshotTriggers[0]);
+        return snapshotTriggers.slice(1);
+    }
+
+    return snapshotTriggers;
}

+async function startMemorySample(snapshotTriggers = []) {
+    await inspector.heap.enable();
+    await inspector.heap.startSampling();
+
+    const usage = process.memoryUsage.rss() / 1000000;
+
+    logger.info(`Start heap sampling, memory usage: ${usage} MB`);
+
+    setTimeout(async () => {
+        const newSnapshotTriggers = await stopMemorySample(snapshotTriggers);
+
+        if (!done) {
+            await startMemorySample(newSnapshotTriggers);
+        }
+    }, config.memorySampling.sampleDuration);
+}
+
-async function startMemorySample() {
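The sampler above is driven entirely by a `memorySampling` config block. A sketch of the shape the code implies — the key names (`enabled`, `sampleDuration`, `snapshotIntervals`) come from the hunk, while the values are illustrative assumptions:

// config sketch; values are illustrative assumptions
module.exports = {
    memorySampling: {
        enabled: false, // default for the new sampleMemory CLI flag
        sampleDuration: 300000, // ms between starting a sample and writing it out
        snapshotIntervals: [500, 1000, 2000], // RSS thresholds in MB; crossing the next one triggers a heap snapshot
    },
};

Each time a sample is written, stopMemorySample compares current RSS against the first remaining threshold and, once exceeded, takes a full heap snapshot and drops that threshold from the list — so each snapshot fires at most once per run.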
@@ -72,19 +104,19 @@ async function startMemorySample() {
async function init() {
    try {
-        if (argv.memory) {
-            await startMemorySample();
-        }
-
-        if (argv.logActive) {
-            logActive();
-        }
-
        if (argv.server) {
            await initServer();
            return;
        }

+        if (argv.sampleMemory) {
+            await startMemorySample(config.memorySampling.snapshotIntervals);
+        }
+
+        if (argv.logActive) {
+            // logActive();
+        }
+
        if (argv.updateSearch) {
            await Promise.all([
                updateSceneSearch(),
@@ -157,8 +189,13 @@ async function init() {
            ? await fetchScenes([...(sceneUrls), ...(updateBaseScenes || []), ...(actorBaseScenes || [])])
            : [...(updateBaseScenes || []), ...(actorBaseScenes || [])];

-        const sceneMovies = deepScenes ? deepScenes.filter((scene) => scene.movie).map((scene) => ({ ...scene.movie, entity: scene.entity })) : [];
-        const deepMovies = argv.sceneMovies || argv.movie ? await fetchMovies([...(argv.movie || []), ...(sceneMovies || [])]) : sceneMovies;
-        const storedScenes = argv.save ? await storeScenes(deepScenes) : [];
+        const moviesFromFile = argv.moviesFile && await getFileEntries(argv.moviesFile);
+        const movieUrls = (argv.movie || []).concat(moviesFromFile || []);
+
+        const sceneMovies = deepScenes && argv.sceneMovies ? deepScenes.filter((scene) => scene.movie).map((scene) => ({ ...scene.movie, entity: scene.entity })) : [];
+        const deepMovies = argv.sceneMovies || argv.movie || movieUrls ? await fetchMovies([...movieUrls, ...(sceneMovies || []), ...[]]) : sceneMovies;

        const movieScenes = argv.movieScenes ? deepMovies.map((movie) => movie.scenes?.map((scene) => ({ ...scene, movie, entity: movie.entity }))).flat().filter(Boolean) : [];
        const deepMovieScenes = argv.deep ? await fetchScenes(movieScenes) : movieScenes;
@@ -169,10 +206,10 @@ async function init() {
        }

        if (argv.save) {
-            const storedMovies = await storeMovies(deepMovies);
+            const storedScenes = await storeScenes([...(deepScenes || []), ...(deepMovieScenes || [])]);
+            const storedMovies = await storeMovies(deepMovies, storedScenes[0]?.batchId);
+            const storedMovieScenes = await storeScenes(deepMovieScenes, storedScenes[0]?.batchId);

-            await associateMovieScenes(storedMovies, storedScenes);
+            await associateMovieScenes(storedMovies, [...storedScenes, ...storedMovieScenes]);
        }
    } catch (error) {
        logger.error(error);
12 src/argv.js
@@ -107,6 +107,11 @@ const { argv } = yargs
            describe: 'Scrape movie info from URL',
            type: 'array',
        })
+        .option('movie-file', {
+            describe: 'Scrape movie info from URLs in a file',
+            type: 'string',
+            alias: 'movies-file',
+        })
        .option('deep', {
            describe: 'Fetch details for all releases',
            type: 'boolean',
@@ -233,6 +238,7 @@ const { argv } = yargs
            default: false,
        })
        .option('level', {
+            alias: 'log-level',
            describe: 'Log level',
            type: 'string',
            default: process.env.NODE_ENV === 'development' ? 'silly' : 'info',
@@ -247,6 +253,12 @@ const { argv } = yargs
            type: 'boolean',
            default: process.env.NODE_ENV === 'development',
        })
+        .option('sampleMemory', {
+            alias: 'memory',
+            describe: 'Take memory allocation samples, and snapshots at configured intervals',
+            type: 'boolean',
+            default: config.memorySampling.enabled,
+        })
        .option('update-search', {
            describe: 'Update search documents for all releases.',
            type: 'boolean',
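Taken together, the new options suggest invocations along these lines (hypothetical examples; the file-based workflow assumes one URL per line, which this diff does not confirm):

node src/app.js --movies-file ./movies.txt --save
node src/app.js --sample-memory --log-level silly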
50 src/deep.js
@@ -1,5 +1,6 @@
'use strict';

+const util = require('util');
const Promise = require('bluebird');
const { mergeAdvanced: merge } = require('object-merge-advanced');
@@ -9,6 +10,9 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename);
const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
+const windows = require('./utils/http-windows');
+
+const waitImmediate = util.promisify(setImmediate);

function toBaseReleases(baseReleasesOrUrls, entity = null) {
    if (!baseReleasesOrUrls) {
@@ -50,12 +54,12 @@ function toBaseReleases(baseReleasesOrUrls, entity = null) {
        .filter(Boolean);
}

-async function fetchScene(scraper, url, entity, baseRelease, options) {
-    if (scraper.fetchScene) {
-        return scraper.fetchScene(baseRelease.url, entity, baseRelease, options, null);
+async function fetchScene(scraper, url, entity, baseRelease, options, type = 'scene') {
+    if ((type === 'scene' && scraper.fetchScene) || (type === 'movie' && scraper.fetchMovie)) {
+        return scraper[type === 'movie' ? 'fetchMovie' : 'fetchScene'](baseRelease.url, entity, baseRelease, options, null);
    }

-    if (scraper.scrapeScene) {
+    if ((type === 'scene' && scraper.scrapeScene) || (type === 'movie' && scraper.scrapeMovie)) {
        const session = qu.session();

        const res = await qu.get(url, null, null, {
@@ -66,7 +70,7 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
        const cookie = await session._sessionOptions.cookieJar.get(url);

        if (res.ok) {
-            return scraper.scrapeScene(res.item, url, entity, baseRelease, options, {
+            return scraper[type === 'movie' ? 'scrapeMovie' : 'scrapeScene'](res.item, url, entity, baseRelease, options, {
                session,
                headers: res.headers,
                cookieJar: session._sessionOptions.cookieJar,
@@ -80,6 +84,10 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
    return null;
}

+function fetchMovie(scraper, url, entity, baseRelease, options) {
+    return fetchScene(scraper, url, entity, baseRelease, options, 'movie');
+}

async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
    const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
@@ -102,7 +110,7 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
        return baseRelease;
    }

-    if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie)) {
+    if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie && !layoutScraper.scrapeMovie)) {
        logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`);
        return baseRelease;
    }
@@ -116,9 +124,28 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
        parameters: getRecursiveParameters(entity),
    };

+    logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
+
    const rawScrapedRelease = type === 'scene'
-        ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
-        : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
+        ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options)
+        : await fetchMovie(layoutScraper, baseRelease.url, entity, baseRelease, options);
+
+    const pathname = baseRelease.path || (baseRelease.url && new URL(baseRelease.url).pathname.replace(/\//g, '_'));
+
+    if (rawScrapedRelease) {
+        delete rawScrapedRelease.query; // some scrapers pass the qu-wrapped window instance to parent scrapers, filling up memory
+    }
+
+    if (windows.has(pathname)) {
+        logger.debug(`Closing window for ${pathname}`);
+
+        windows.get(pathname).close();
+        windows.delete(pathname);
+    }
+
+    await waitImmediate();
+
+    logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

    const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
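The `windows` registry referenced above comes from `./utils/http-windows`; judging purely by the calls made here (`has`, `get(...).close()`, `delete`), a plausible minimal implementation is just a shared map — an assumption, since the real module may do more:

// Hypothetical sketch of ./utils/http-windows: a process-wide registry of
// DOM windows opened during HTTP fetches, keyed by a pathname-derived string,
// so the deep scraper can close them once a release has been scraped.
module.exports = new Map();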
@@ -173,12 +200,13 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
    const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
        if (entity.scraper?.beforeFetchScenes) {
-            const preData = await entity.scraper.beforeFetchScenes(entity);
+            const parameters = getRecursiveParameters(entity);
+            const preData = await entity.scraper.beforeFetchScenes(entity, parameters);

            return [slug, { ...entity, preData }];
        }

-        return null;
+        return [slug, entity];
    }));

    const entitiesWithBeforeDataBySlug = Object.fromEntries(entitiesWithBeforeDataEntries.filter(Boolean));
@@ -186,7 +214,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
    return Promise.map(
        baseReleases,
        async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
-        { concurrency: 10 },
+        { concurrency: 1 },
    );
}
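For orientation, a sketch of how a scraper might plug into the `beforeFetchScenes` hook these hunks extend — the hook name and the `(entity, parameters)` signature come from the code above; the body is an illustrative assumption:

// Hypothetical scraper module. Whatever beforeFetchScenes returns is attached
// to the entity as `preData`, so per-entity setup runs once instead of per scene.
async function beforeFetchScenes(entity, parameters) {
    return {
        apiBase: parameters?.apiBase || entity.url, // derive shared fetch state once
        fetchedAt: Date.now(),
    };
}

module.exports = { beforeFetchScenes };

Note the fix folded into the same hunk: entities without the hook previously mapped to `null` and were dropped by the `.filter(Boolean)` below; they now pass through as `[slug, entity]`.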
@@ -6,7 +6,7 @@ const inquirer = require('inquirer');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const knex = require('./knex');
-const { deleteScenes, deleteMovies } = require('./releases');
+const { deleteScenes, deleteMovies, deleteSeries } = require('./releases');
const { flushOrphanedMedia } = require('./media');
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
@@ -236,7 +236,7 @@ async function fetchReleaseEntities(baseReleases) {
        .filter(Boolean),
    ));

-    return fetchEntitiesBySlug(entitySlugs);
+    return fetchEntitiesBySlug(entitySlugs, 'desc');
}

async function fetchEntity(entityId, type) {
@@ -359,29 +359,39 @@ async function flushEntities(networkSlugs = [], channelSlugs = []) {
        .leftJoin('movies', 'movies.entity_id', 'selected_entities.id')
        .pluck('movies.id');

-    if (sceneIds.length === 0 && movieIds.length === 0) {
-        logger.info(`No scenes or movies found to remove for ${entitySlugs}`);
+    const serieIds = await entityQuery
+        .clone()
+        .select('series.id')
+        .distinct('series.id')
+        .whereNotNull('series.id')
+        .from('selected_entities')
+        .leftJoin('series', 'series.entity_id', 'selected_entities.id')
+        .pluck('series.id');
+
+    if (sceneIds.length === 0 && movieIds.length === 0 && serieIds.length === 0) {
+        logger.info(`No scenes, movies or series found to remove for ${entitySlugs}`);
        return;
    }

    const confirmed = await inquirer.prompt([{
        type: 'confirm',
        name: 'flushEntities',
-        message: `You are about to remove ${sceneIds.length} scenes and ${movieIds.length} movies for ${entitySlugs}. Are you sure?`,
+        message: `You are about to remove ${sceneIds.length} scenes, ${movieIds.length} movies and ${serieIds.length} series for ${entitySlugs}. Are you sure?`,
        default: false,
    }]);

    if (!confirmed.flushEntities) {
-        logger.warn(`Confirmation rejected, not flushing scenes or movies for: ${entitySlugs}`);
+        logger.warn(`Confirmation rejected, not flushing scenes, movies or series for: ${entitySlugs}`);
        return;
    }

-    const [deletedScenesCount, deletedMoviesCount] = await Promise.all([
+    const [deletedScenesCount, deletedMoviesCount, deletedSeriesCount] = await Promise.all([
        deleteScenes(sceneIds),
        deleteMovies(movieIds),
+        deleteSeries(serieIds),
    ]);

-    logger.info(`Removed ${deletedScenesCount} scenes and ${deletedMoviesCount} movies for ${entitySlugs}`);
+    logger.info(`Removed ${deletedScenesCount} scenes, ${deletedMoviesCount} movies and ${deletedSeriesCount} series for ${entitySlugs}`);

    await flushOrphanedMedia();
}
46 src/media.js
@@ -7,7 +7,7 @@ const fs = require('fs');
const fsPromises = require('fs').promises;
const path = require('path');
const stream = require('stream');
-const nanoid = require('nanoid/non-secure');
+const { nanoid } = require('nanoid/non-secure');
const mime = require('mime');
// const fileType = require('file-type');
const ffmpeg = require('fluent-ffmpeg');
@@ -345,12 +345,13 @@ async function writeImage(image, media, info, filepath, isProcessed) {
        return;
    }

-    if (isProcessed) {
-        // convert to JPEG and write to permanent location
-        await image
-            .jpeg()
-            .toFile(path.join(config.media.path, filepath));
-    }
+    await image
+        .resize({
+            height: config.media.maxSize,
+            withoutEnlargement: true,
+        })
+        .jpeg({ quality: config.media.quality })
+        .toFile(path.join(config.media.path, filepath));
}

async function writeThumbnail(image, thumbpath) {
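The chained `.resize().jpeg().toFile()` calls match the API of the `sharp` image library, though this diff never names it. Assuming it is sharp, the new write step amounts to:

// Sketch of the resize-and-reencode step, assuming `image` is a sharp instance.
const sharp = require('sharp');

async function writeResized(srcPath, destPath, maxHeight, quality) {
    await sharp(srcPath)
        .resize({ height: maxHeight, withoutEnlargement: true }) // never upscale small sources
        .jpeg({ quality }) // re-encode as JPEG at the configured quality
        .toFile(destPath);
}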
@@ -377,6 +378,7 @@ async function writeLazy(image, lazypath) {

async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath, options) {
    logger.silly(`Storing permanent media files for ${media.id} from ${media.src} at ${filepath}`);
+    logger.debug(`Memory usage at image storage: ${process.memoryUsage.rss() / 1000000} MB (${media.src})`);

    try {
        const thumbdir = config.s3.enabled ? path.join(media.role, 'thumbs') : path.join(media.role, 'thumbs', hashDir, hashSubDir);
@@ -415,12 +417,14 @@ async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, fil
            });
        }

-        await writeImage(image, media, info, filepath, isProcessed);
-
        await Promise.all([
+            writeImage(image, media, info, filepath, isProcessed),
            writeThumbnail(image, thumbpath),
            writeLazy(image, lazypath),
        ]);

+        /*
        if (isProcessed) {
            // file already stored, remove temporary file
            await fsPromises.unlink(media.file.path);
@@ -428,6 +432,9 @@ async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, fil
            // image not processed, simply move temporary file to final location
            await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
        }
+        */
+
+        await fsPromises.unlink(media.file.path);

        if (config.s3.enabled) {
            await Promise.all([
@@ -580,6 +587,7 @@ async function fetchSource(source, baseMedia) {
    const maxAttempts = source.attempts || 3;

    logger.silly(`Fetching media from ${source.src}`);
+    logger.debug(`Memory usage before media fetch: ${process.memoryUsage.rss() / 1000000} MB (${source.src})`);

    async function attempt(attempts = 1) {
        const hasher = new blake2.Hash('blake2b', { digestLength: 24 });
@@ -746,7 +754,8 @@ async function storeMedias(baseMedias, options) {
    const fetchedMedias = await Promise.map(
        baseMedias,
        async (baseMedia) => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
-        { concurrency: 100 }, // don't overload disk (or network, although this has its own throttling)
+        // { concurrency: 100 }, // don't overload disk (or network, although this has its own throttling)
+        { concurrency: 10 }, // don't overload disk (or network, although this has its own throttling)
    );

    const { uniqueHashMedias, existingHashMedias } = await findHashDuplicates(fetchedMedias);
@@ -823,11 +832,12 @@ async function associateReleaseMedia(releases, type = 'release') {
        .reduce((acc, [releaseId, releaseBaseMedias]) => {
            releaseBaseMedias.forEach((baseMedia) => {
                const media = storedMediasById[baseMedia.id];
+                const mediaId = media?.use || media?.entry?.id;

-                if (media) {
+                if (mediaId) {
                    acc.push({
                        [`${type}_id`]: releaseId,
-                        media_id: media.use || media.entry.id,
+                        media_id: mediaId,
                    });
                }
            });
@@ -840,7 +850,10 @@ async function associateReleaseMedia(releases, type = 'release') {
            await bulkInsert(`${type}s_${role}`, associations, false);
        }
    } catch (error) {
-        logger.error(util.inspect(error.entries, null, null, { color: true }));
+        if (error.entries) {
+            logger.error(util.inspect(error.entries, null, null, { color: true }));
+        }
+
        logger.error(`Failed to store ${type} ${role}: ${error.message}`);
    }
}, Promise.resolve());
@@ -948,9 +961,12 @@ async function flushOrphanedMedia() {
        await deleteS3Objects(orphanedMedia.filter((media) => media.is_s3));
    }

-    await fsPromises.rmdir(path.join(config.media.path, 'temp'), { recursive: true });
-
-    logger.info('Cleared temporary media directory');
+    try {
+        await fsPromises.rm(path.join(config.media.path, 'temp'), { recursive: true });
+        logger.info('Cleared temporary media directory');
+    } catch (error) {
+        logger.warn(`Failed to clear temporary media directory: ${error.message}`);
+    }
}

module.exports = {

src/releases.js
@@ -59,7 +59,7 @@ const releaseFields = `
            slug
        }
    }
-    poster: chaptersPosterByChapterId {
+    poster: chaptersPoster {
        media {
            id
            path
@@ -82,7 +82,7 @@ const releaseFields = `
            }
        }
    }
-    poster: releasesPosterByReleaseId {
+    poster: releasesPoster {
        media {
            id
            path
@@ -104,7 +104,7 @@ const releaseFields = `
            size
        }
    }
-    trailer: releasesTrailerByReleaseId @include (if: $full) {
+    trailer: releasesTrailer @include (if: $full) {
        media {
            id
            path
@@ -325,6 +325,24 @@ async function deleteMovies(movieIds) {
    return deleteCount;
}

+async function deleteSeries(serieIds) {
+    if (serieIds.length === 0) {
+        return 0;
+    }
+
+    await knex('series_scenes')
+        .whereIn('serie_id', serieIds)
+        .delete();
+
+    const deleteCount = await knex('series')
+        .whereIn('id', serieIds)
+        .delete();
+
+    logger.info(`Removed ${deleteCount}/${serieIds.length} series`);
+
+    return deleteCount;
+}
+
async function flushScenes() {
    const sceneIds = await knex('releases').select('id').pluck('id');
@@ -367,6 +385,27 @@ async function flushMovies() {
    logger.info(`Removed ${deleteCount}/${movieIds.length} movies`);
}

+async function flushSeries() {
+    const serieIds = await knex('series').select('id').pluck('id');
+
+    const confirmed = await inquirer.prompt([{
+        type: 'confirm',
+        name: 'flushSeries',
+        message: `You are about to remove ${serieIds.length} series. Are you sure?`,
+        default: false,
+    }]);
+
+    if (!confirmed.flushSeries) {
+        logger.warn('Confirmation rejected, not flushing series');
+        return;
+    }
+
+    const deleteCount = await deleteSeries(serieIds);
+
+    await flushOrphanedMedia();
+
+    logger.info(`Removed ${deleteCount}/${serieIds.length} series`);
+}
+
async function flushBatches(batchIds) {
    const [sceneIds, movieIds] = await Promise.all([
        knex('releases')
@@ -407,8 +446,10 @@ module.exports = {
    fetchScenes,
    flushBatches,
    flushMovies,
+    flushSeries,
    flushScenes,
    searchScenes,
    deleteScenes,
    deleteMovies,
+    deleteSeries,
};
@@ -1,20 +1,20 @@
'use strict';

-const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma');
+const { fetchApiLatest, fetchApiUpcoming, fetchSceneApi, fetchApiProfile } = require('./gamma');

function curateRelease(release, site) {
    if (['bubblegumdungeon', 'ladygonzo'].includes(site.slug)) {
        return {
            ...release,
-            title: release.title.split(/:|\|/)[1].trim(),
+            title: release.title.split(/:|\|/)[1]?.trim(),
        };
    }

    return release;
}

-async function networkFetchScene(url, site, release) {
-    const scene = await fetchScene(url, site, release);
+async function networkFetchScene(url, site, release, options) {
+    const scene = await fetchSceneApi(url, site, release, options);

    return curateRelease(scene, site);
}
@@ -34,13 +34,13 @@ function getPoster(posterElement, sceneId) {

    if (typeof posterTimeRange === 'number') {
        // poster time is already a single time value
-        return `https://legalporno.com/casting/${sceneId}/${posterTimeRange}`;
+        return `https://analvids.com/casting/${sceneId}/${posterTimeRange}`;
    }

    const [max, min] = posterTimeRange.split('-');
    const posterTime = Math.floor(Math.random() * (Number(max) - Number(min) + 1) + Number(min));

-    return `https://legalporno.com/casting/${sceneId}/${posterTime}`;
+    return `https://analvids.com/casting/${sceneId}/${posterTime}`;
}

function scrapeAll(html) {
@@ -134,7 +134,7 @@ async function scrapeScene(html, url, site, useGallery) {
    }

    const studioName = $('.watchpage-studioname').first().text().trim();
-    release.studio = slugify(studioName, '');
+    release.studio = slugify(studioName, '', { removePunctuation: true });

    return release;
}
@@ -181,7 +181,7 @@ async function fetchScene(url, site) {
}

async function fetchProfile({ name: actorName }) {
-    const res = await http.get(`https://www.legalporno.com/api/autocomplete/search?q=${actorName.replace(' ', '+')}`);
+    const res = await http.get(`https://www.analvids.com/api/autocomplete/search?q=${actorName.replace(' ', '+')}`);
    const data = res.body;

    const result = data.terms.find((item) => item.type === 'model');
@@ -5,6 +5,7 @@ const qu = require('../utils/qu');
const { extractDate } = require('../utils/qu');
const { inchesToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
+const capitalize = require('../utils/capitalize');

const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
@@ -15,6 +16,10 @@ const genderMap = {
};

+function getScreenUrl(item, scene) {
+    if (!scene.dvd?.id || !item?.screenId) {
+        return null;
+    }
+
+    return `https://i.bang.com/screenshots/${scene.dvd.id}/${scene.type}/${scene.order}/${item.screenId}.jpg`;
+}
@@ -57,7 +62,7 @@ async function fetchPhotos(scene) {
async function scrapeScene(scene, entity, options) {
    const release = {
        entryId: scene.id,
-        title: scene.name,
+        title: scene.name || (scene.dvd?.name && scene.type === 'bonus' && capitalize(`${scene.dvd.name} - Bonus Scene ${scene.order || 1}`)) || null,
        description: scene.description,
        tags: scene.genres.concat(scene.actions).map((genre) => genre.name),
        duration: scene.duration,
@@ -91,7 +96,7 @@ async function scrapeScene(scene, entity, options) {
        }
    }

-    release.trailer = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`;
+    release.teaser = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`;

    release.channel = scene.series.name
        .replace(/[! .]/g, '')
@@ -352,6 +357,11 @@ async function fetchUpcoming(site, page = 1) {
}

async function fetchScene(url, entity, baseRelease, options) {
+    if (baseRelease?.entryId) {
+        // overview and deep data is the same, don't hit server unnecessarily
+        return baseRelease;
+    }
+
    const encodedId = new URL(url).pathname.split('/')[2];
    const entryId = decodeId(encodedId);
@@ -8,6 +8,7 @@ const logger = require('../logger')(__filename);
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const qu = require('../utils/qu');
+const args = require('../argv');

function scrape(html, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -43,7 +44,7 @@ function scrape(html, site) {
    });
}

-function scrapeLegacy(scenes, site) {
+function scrapeAllLegacy(scenes, site) {
    return scenes.map(({ query }) => {
        const release = {};
@@ -63,6 +64,38 @@ function scrapeLegacy(scenes, site) {
    });
}

+function scrapeAllMembers(scenes, _channel) {
+    return scenes.map(({ query, el }) => {
+        const release = {};
+        const data = JSON.parse(query.q(el, null, 'data-shoot'));
+
+        release.entryId = data?.id || query.url('a.etLnk')?.match(/\d+$/)?.[0];
+        release.shootId = data?.code;
+        release.url = data.url ? qu.prefixUrl(data.url, 'https://members.bangbros.com') : query.url('a.etLnk');
+
+        release.title = data?.title || query.cnt('.etl-hdd');
+        release.description = data?.description || query.cnt('.etl-desc');
+
+        release.date = query.date('.etl-dt', 'MMM DD, YYYY', /\w{3} \d{1,2}, \d{4}/);
+        release.actors = data?.model.map((actor) => ({
+            name: actor.name,
+            url: qu.prefixUrl(actor.url, 'https://members.bangbros.com'),
+        }));
+
+        const rolloverUrl = query.q('.rollover-image', 'data-rollover-url');
+        release.poster = data?.image || query.img('.rollover-image', 'data-initial-image-url');
+
+        if (rolloverUrl) {
+            release.photos = Array.from({ length: 15 }, (value, index) => `${rolloverUrl}${index + 1}.jpg`);
+        }
+
+        release.trailer = data?.trailer;
+        release.tags = data?.tag.map((tag) => tag.name);
+
+        return release;
+    });
+}
+
/* no dates available, breaks database
function scrapeUpcoming(html, site) {
    const { document } = ex(html);
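The members-area cards carry their metadata in a `data-shoot` attribute that scrapeAllMembers parses as JSON. Based solely on the properties the function reads, the payload looks roughly like this — only the key names are implied by the code; all values are invented placeholders:

// Illustrative shape of the data-shoot JSON, inferred from the reads above.
const exampleShootData = {
    id: 12345,
    code: 'bb12345',
    url: '/movie/12345',
    title: '…',
    description: '…',
    image: 'https://…',
    trailer: 'https://…',
    model: [{ name: '…', url: '/model/…' }],
    tag: [{ name: '…' }],
};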
@@ -147,6 +180,30 @@ function scrapeSceneLegacy({ query }, url) {
    return release;
}

+function scrapeSceneMembers({ query }, url) {
+    const release = {};
+
+    release.entryId = new URL(url).pathname.match(/(\d+)\/?$/)[1];
+    release.shootId = query.img('.player img')?.match(/\/shoots\/(\w+)\//)?.[1];
+
+    release.title = query.cnt('.vdo-hdd1');
+    release.description = query.cnt('.ndcp');
+
+    release.actors = query.all('.vdsc a[href*="/model"]').map((actorEl) => ({
+        name: query.cnt(actorEl, 'span'),
+        url: query.url(actorEl, null, 'href', { origin: 'https://members.bangbros.com' }),
+        avatar: query.img(actorEl, 'img'),
+    }));
+
+    release.date = query.date('.ran:nth-child(2)', 'MMM DD, YYYY', /\w{3} \d{1,2}, \d{4}/);
+    release.duration = query.duration('.ran:nth-child(3)');
+
+    release.tags = query.cnts('.tag a[href*="/tags"]');
+    release.channel = slugify(query.cnt('.tag a[href*="/site"]'), '');
+
+    return release;
+}
+
function scrapeProfile(html, scope) {
    const { query } = qu.ex(html);
    const profile = {};
@@ -167,17 +224,6 @@ function scrapeProfileSearch(html, actorName) {
}

async function fetchLatest(site, page = 1) {
-    if (site.parameters?.legacy) {
-        const url = `${site.parameters?.latest || site.url}/videos/${page}`;
-        const res = await qu.getAll(url, '.videoList');
-
-        if (res.ok) {
-            return scrapeLegacy(res.items, site);
-        }
-
-        return res.status;
-    }
-
    const res = await qu.get(`${site.parameters?.latest || site.url}/${page}`);

    if (res.ok) {
@@ -187,6 +233,39 @@ async function fetchLatest(site, page = 1) {
    return res.status;
}

+async function fetchLatestMembers(channel, page = 1, { parameters }) {
+    if (!parameters.product) {
+        throw new Error(`No member area product ID known for '${channel.name}'`);
+    }
+
+    if (!args.cookie) {
+        throw new Error(`Please specify --cookie "PHPSESSID=xxx" to access the '${channel.name}' members area.`);
+    }
+
+    const url = `https://members.bangbros.com/product/${parameters.product}/videos/latest/${page}`;
+
+    const res = await qu.getAll(url, '.thumbHolder .echThumb', {
+        cookie: args.cookie,
+    });
+
+    if (res.ok) {
+        return scrapeAllMembers(res.items, channel);
+    }
+
+    return res.status;
+}
+
+async function fetchLatestLegacy(site, page = 1) {
+    const url = `${site.parameters?.latest || site.url}/videos/${page}`;
+    const res = await qu.getAll(url, '.videoList');
+
+    if (res.ok) {
+        return scrapeAllLegacy(res.items, site);
+    }
+
+    return res.status;
+}
+
/*
async function fetchUpcoming(site) {
    const res = await http.get('https://www.bangbros.com');
@@ -218,6 +297,26 @@ async function fetchScene(url, site, release) {
    return scrapeScene(res.item.html, url, site);
}

+async function fetchSceneMembers(url, baseRelease, channel, { parameters }) {
+    if (!parameters.product) {
+        throw new Error(`No member area product ID known for '${channel.name}'`);
+    }
+
+    if (!args.cookie) {
+        throw new Error(`Please specify --cookie "PHPSESSID=xxx" to access the '${channel.name}' members area.`);
+    }
+
+    const res = await qu.get(url, null, {
+        cookie: args.cookie,
+    });
+
+    if (res.ok) {
+        return scrapeSceneMembers(res.item, url, channel);
+    }
+
+    return res.status;
+}
+
async function fetchProfile({ name: actorName }, scope) {
    const actorSlug = slugify(actorName);
    const url = `https://bangbros.com/search/${actorSlug}`;
@@ -242,5 +341,12 @@ module.exports = {
    fetchLatest,
    fetchScene,
    fetchProfile,
+    legacy: {
+        fetchLatest: fetchLatestLegacy,
+    },
+    members: {
+        fetchLatest: fetchLatestMembers,
+        fetchScene: fetchSceneMembers,
+    },
    // fetchUpcoming, no dates available
};
@@ -3,7 +3,6 @@
const Promise = require('bluebird');
const util = require('util');
-const { JSDOM } = require('jsdom');
const cheerio = require('cheerio');
const moment = require('moment');
const format = require('template-format');
@@ -25,6 +24,19 @@ function getApiUrl(appId, apiKey) {
    };
}

+function getAvatarFallbacks(avatar) {
+    if (!avatar) {
+        return null;
+    }
+
+    return [
+        avatar.replace(/\d+x\d+/, '500x750'),
+        avatar.replace(/\d+x\d+/, '240x360'),
+        avatar.replace(/\d+x\d+/, '200x300'),
+        avatar,
+    ];
+}
+
async function fetchApiCredentials(referer, site) {
    if (site?.parameters?.appId && site?.parameters?.apiKey) {
        return getApiUrl(site.parameters.appId, site.parameters.apiKey);
@@ -62,21 +74,19 @@ function getAlbumUrl(albumPath, site) {
}

async function fetchPhotos(url) {
-    const res = await http.get(url);
+    const res = await qu.get(url);

-    return res.body.toString();
+    return res.item;
}

-function scrapePhotos(html, includeThumbnails = true) {
-    const $ = cheerio.load(html, { normalizeWhitespace: true });
-
-    return $('.preview .imgLink, .pgFooterThumb a').toArray().map((linkEl) => {
-        const url = $(linkEl).attr('href');
+function scrapePhotos({ query }, includeThumbnails = true) {
+    return query.all('.preview .imgLink, .pgFooterThumb a').map((linkEl) => {
+        const url = linkEl.href;

        if (/\/join|\/createaccount/.test(url)) {
            // URL links to join page instead of full photo, extract thumbnail
            // /createaccount is used by e.g. Tricky Spa native site
-            const src = $(linkEl).find('img').attr('src');
+            const src = query.img(linkEl);

            if (/previews\//.test(src)) {
                // resource often serves full photo at a modifier URL anyway, add as primary source
@@ -106,20 +116,18 @@ async function getPhotos(albumPath, site, includeThumbnails = true) {
    }

    try {
-        const html = await fetchPhotos(albumUrl);
-        const $ = cheerio.load(html, { normalizeWhitespace: true });
-        const photos = scrapePhotos(html, includeThumbnails);
+        const item = await fetchPhotos(albumUrl);
+        const photos = scrapePhotos(item, includeThumbnails);

-        const lastPage = $('.Gamma_Paginator a.last').attr('href')?.match(/\d+$/)[0];
+        const lastPage = item.query.url('.Gamma_Paginator a.last')?.match(/\d+$/)[0];

        if (lastPage) {
            const otherPages = Array.from({ length: Number(lastPage) }, (_value, index) => index + 1).slice(1);

            const otherPhotos = await Promise.map(otherPages, async (page) => {
-                const pageUrl = `${albumUrl}/${page}`;
-                const pageHtml = await fetchPhotos(pageUrl);
-
-                return scrapePhotos(pageHtml, includeThumbnails);
+                const pageItem = await fetchPhotos(`${albumUrl}/${page}`);
+
+                return scrapePhotos(pageItem, includeThumbnails);
            }, {
                concurrency: 2,
            });
@@ -169,10 +177,15 @@ async function getThumbs(entryId, site, parameters) {
    });

    if (res.ok && res.body.results?.[0]?.hits[0]?.set_pictures) {
-        return res.body.results[0].hits[0].set_pictures.map((img) => ([
-            `https://transform.gammacdn.com/photo_set${img.thumb_path}`,
-        ]));
+        return res.body.results[0].hits[0].set_pictures.map((img) => img.thumb_path && ([
+            `https://images-fame.gammacdn.com/photo_set${img.thumb_path}`,
+            `https://images01-fame.gammacdn.com/photo_set${img.thumb_path}`,
+            `https://images02-fame.gammacdn.com/photo_set${img.thumb_path}`,
+            `https://images03-fame.gammacdn.com/photo_set${img.thumb_path}`,
+            `https://images04-fame.gammacdn.com/photo_set${img.thumb_path}`,
+            `https://images-evilangel.gammacdn.com/photo_set${img.thumb_path}`,
+            `https://transform.gammacdn.com/photo_set${img.thumb_path}`,
+        ])).filter(Boolean);
    }

    return [];
@@ -187,6 +200,18 @@ async function getPhotosApi(entryId, site, parameters) {
    return photos.concat(thumbs.slice(photos.length));
}

+function getImageSources(source) {
+    return [
+        `https://images-fame.gammacdn.com/movies${source}`,
+        `https://images01-fame.gammacdn.com/movies${source}`,
+        `https://images02-fame.gammacdn.com/movies${source}`,
+        `https://images03-fame.gammacdn.com/movies${source}`,
+        `https://images04-fame.gammacdn.com/movies${source}`,
+        `https://images-evilangel.gammacdn.com/movies${source}`,
+        `https://transform.gammacdn.com/movies${source}`,
+    ];
+}
+
async function scrapeApiReleases(json, site) {
    return json.map((scene) => {
        if (site.parameters?.extract && scene.sitename !== site.parameters.extract) {
@@ -225,9 +250,17 @@ async function scrapeApiReleases(json, site) {
        ],
    }));

+    /* master categories include e.g. 'transgender' for non-trans Wicked scenes
    release.tags = scene.master_categories
        .concat(scene.categories?.map((category) => category.name))
        .filter(Boolean); // some categories don't have a name
+    */
+
+    release.tags = scene.categories?.map((category) => category.name).filter(Boolean); // some categories don't have a name

    if (scene.availableOnSite.length > 1) {
        release.comment = `Also available on ${scene.availableOnSite.filter((sisterSite) => sisterSite !== site.slug).join(', ')}`;
    }

    const posterPath = scene.pictures.resized || (scene.pictures.nsfw?.top && Object.values(scene.pictures.nsfw.top)[0]);
@@ -244,41 +277,21 @@ async function scrapeApiReleases(json, site) {
    }).filter(Boolean);
}

-function scrapeAll(html, site, networkUrl, hasTeaser = true) {
-    const $ = cheerio.load(html, { normalizeWhitespace: true });
-    const scenesElements = $('li[data-itemtype=scene], div[data-itemtype=scenes]').toArray();
-
-    return scenesElements.map((element) => {
-        const release = {};
-
-        const sceneLinkElement = $(element).find('.sceneTitle a, .tlcTitle a');
-
-        if (site) release.url = `${networkUrl ? site.parent.url : site.url}${sceneLinkElement.attr('href')}`;
-        else release.url = `${networkUrl}${sceneLinkElement.attr('href')}`;
-
-        release.title = sceneLinkElement.attr('title');
-        release.entryId = $(element).attr('data-itemid');
-
-        const dateEl = $(element).find('.sceneDate, .tlcSpecsDate .tlcDetailsValue').text() || null;
-        if (dateEl) {
-            release.date = moment
-                .utc(dateEl, ['MM-DD-YYYY', 'YYYY-MM-DD'])
-                .toDate();
-        }
-
-        release.actors = $(element).find('.sceneActors a, .tlcActors a')
-            .map((actorIndex, actorElement) => $(actorElement).attr('title'))
-            .toArray();
-
-        [release.likes, release.dislikes] = $(element).find('.value')
-            .toArray()
-            .map((value) => Number($(value).text()));
-
-        const posterEl = $(element).find('.imgLink img, .tlcImageItem');
-        if (posterEl) release.poster = posterEl.attr('data-original') || posterEl.attr('src');
-
-        const channelEl = $(element).find('.fromSite a');
-        if (channelEl.attr('title')) release.channel = channelEl.attr('title').replace('.com', '');
+function scrapeAll(scenes, site, networkUrl, hasTeaser = true) {
+    return scenes.map(({ query, el }) => {
+        const release = {};
+
+        release.url = query.url('.sceneTitle a, .tlcTitle a', 'href', { origin: networkUrl ? site.parent.url : site.url });
+        release.title = query.cnt('.sceneTitle a', 'tlcTitle a', 'title');
+        release.entryId = el.dataset.itemid;
+
+        release.date = query.date('.sceneDate, .tlcSpecsDate .tlcDetailsValue', ['MM-DD-YYYY', 'YYYY-MM-DD']);
+        release.actors = query.cnts('.sceneActors a, .tlcActors a', 'title');
+
+        [release.likes, release.dislikes] = query.all('.value').map((likeEl) => query.number(likeEl));
+
+        release.poster = query.img('.imgLink img, .tlcImageItem', 'data-original') || query.img('.imgLink img, .tlcImageItem');

        if (hasTeaser) {
            release.teaser = [
@@ -287,76 +300,66 @@ function scrapeAll(html, site, networkUrl, hasTeaser = true) {
            ];
        }

+        release.channel = query.el('.fromSite a', 'title')?.replace('.com', '');
+
        return release;
    });
}

-async function scrapeScene(html, url, site, baseRelease, mobileHtml, options) {
-    const $ = cheerio.load(html, { normalizeWhitespace: true });
-    const m$ = mobileHtml && cheerio.load(mobileHtml, { normalizeWhitespace: true });
-    const release = { $, url };
+async function scrapeScene({ query }, url, channel, baseRelease, mobileItem, options) {
+    const release = { query }; // used by XEmpire scraper to resolve channel-specific details

-    const json = $('script[type="application/ld+json"]').html();
-    const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
+    const json = query.html('script[type="application/ld+json"]');
+    const videoJson = query.htmls('script').find((script) => /ScenePlayerOptions/i.test(script));

    const [data, data2] = json ? JSON.parse(json) : [];
    const videoData = videoJson && JSON.parse(videoJson.slice(videoJson.indexOf('{'), videoJson.indexOf('};') + 1));

    release.entryId = (baseRelease?.path || new URL(url).pathname).match(/\/(\d{2,})(\/|$)/)?.[1];
    release.title = videoData?.playerOptions?.sceneInfos.sceneTitle || data?.name;
+    release.description = data?.description;

    // date in data object is not the release date of the scene, but the date the entry was added; only use as fallback
-    const dateString = $('.updatedDate').first().text().trim();
-    const dateMatch = dateString.match(/\d{2,4}[-/]\d{2}[-/]\d{2,4}/)?.[0];
-
-    if (dateMatch) release.date = moment.utc(dateMatch, ['MM-DD-YYYY', 'YYYY-MM-DD']).toDate();
-    else if (data?.dateCreated) release.date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();
-    else release.date = videoData?.playerOptions?.sceneInfos.sceneReleaseDate;
+    release.date = query.date('.updatedDate', ['MM-DD-YYYY', 'YYYY-MM-DD'])
+        || qu.extractDate(data?.dateCreated, 'YYYY-MM-DD')
+        || videoData?.playerOptions?.sceneInfos.sceneReleaseDate;

-    const actors = data?.actor || data2?.actor;
-
-    if (actors) {
-        release.actors = actors.map((actor) => ({
-            name: actor.name,
-            gender: actor.gender,
-        }));
-    }
+    release.actors = (data?.actor || data2?.actor)?.map((actor) => ({
+        name: actor.name,
+        gender: actor.gender,
+    })) || [];

-    if (data) {
-        release.description = data.description;
-        if (data.director?.[0]?.name) release.director = data.director[0].name;
-        else if (data2?.director?.[0]?.name) release.director = data2.director[0].name;
-
-        const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
-        if (stars) release.rating = { stars };
-
-        release.duration = moment.duration(data.duration.slice(2)).asSeconds();
-    }
+    release.duration = qu.durationToSeconds(data.duration);
+    release.director = data?.director?.[0]?.name || data2?.director?.[0]?.name;
+
+    release.tags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
+    release.stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5 || null;

-    const hasTrans = release.actors?.some((actor) => actor.gender === 'shemale');
-    const rawTags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
-    release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
-
-    const channel = data?.productionCompany?.name
-        || $('.studioLink a, .siteLink a').attr('title')?.trim()
-        || $('.siteNameSpan').text()
-            ?.trim()
-            .toLowerCase()
-            .replace('.com', '')
-        || $('meta[name="twitter:domain"]').attr('content')?.replace('.com', '');
-
-    if (channel) release.channel = slugify(channel, '');
+    release.channel = slugify(data?.productionCompany?.name
+        || query.el('.studioLink a, .siteLink a', 'title')
+        || query.cnt('.siteNameSpan')?.toLowerCase().replace('.com', '')
+        || query.meta('meta[name="twitter:domain"]')?.replace('.com', ''), '');

-    if (videoData?.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
+    if (videoData?.picPreview && new URL(videoData.picPreview).pathname.length > 1) {
+        // sometimes links to just https://images02-fame.gammacdn.com/
+        const poster = new URL(videoData.picPreview);
+
+        release.poster = [
+            videoData.picPreview, // prefer original URL with width and height parameters, without may give a square crop on e.g. XEmpire
+            `${poster.origin}${poster.pathname}`,
+        ];
+    }

-    const photoLink = $('.picturesItem a').attr('href');
-    const mobilePhotos = m$ ? m$('.preview-displayer a img').map((photoIndex, photoEl) => $(photoEl).attr('src')).toArray() : [];
+    const photoLink = query.url('.picturesItem a');
+    const mobilePhotos = mobileItem?.query.imgs('.preview-displayer a img') || [];

    if (photoLink && options.includePhotos) {
-        const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available
+        const photos = await getPhotos(photoLink, channel, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available

-        if (photos.length < 7) release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
-        else release.photos = photos;
+        if (photos.length < 7) {
+            release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
+        } else {
+            release.photos = photos;
+        }
    } else {
        release.photos = mobilePhotos;
    }
@@ -397,28 +400,28 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml, options) {
        ];
    }

-    const movie = $('.dvdLink');
-    const movieUrl = qu.prefixUrl(movie.attr('href'), site.url);
+    const movieUrl = query.url('.dvdLink', 'href', { origin: channel.url });

    if (movieUrl) {
        release.movie = {
            url: movieUrl,
-            title: movie.attr('title'),
+            title: query.el('.dvdLink', 'title'),
            entryId: movieUrl.match(/\/(\d+)(\/|$)/)?.[1],
-            covers: [movie.find('img').attr('src')],
+            covers: [query.imgs('.dvdLink img')],
        };
    }

    return release;
}

-async function scrapeSceneApi(data, site, options) {
+async function scrapeReleaseApi(data, site, options) {
    const release = {};

-    release.entryId = data.clip_id;
+    release.entryId = data.clip_id || data.movie_id;
    release.title = data.title;
    release.duration = data.length;
-    release.date = new Date(data.date * 1000) || qu.parseDate(data.release_date, 'YYYY-MM-DD');
+    release.date = (data.date && new Date(data.date * 1000)) || qu.parseDate(data.release_date || data.last_modified, 'YYYY-MM-DD');
+    release.director = data.directors[0]?.name || null;

    release.actors = data.actors.map((actor) => ({
        entryId: actor.actor_id,
@@ -433,10 +436,9 @@ async function scrapeSceneApi(data, site, options) {

    if (data.pictures) {
        release.poster = [
-            `https://transform.gammacdn.com/movies${data.pictures['1920x1080']}`,
-            `https://images-evilangel.gammacdn.com/movies${data.pictures['1920x1080']}`,
-            `https://transform.gammacdn.com/movies${data.pictures.resized}`,
-            `https://images-evilangel.gammacdn.com/movies${data.pictures.resized}`,
+            ...(data.pictures['1920x1080'] ? getImageSources(data.pictures['1920x1080']) : []),
+            ...(data.pictures.resized ? getImageSources(data.pictures.resized) : []),
+            ...(data.pictures['960x544'] ? getImageSources(data.pictures['960x544']) : []),
        ];
    }
@@ -444,15 +446,22 @@ async function scrapeSceneApi(data, site, options) {
        release.photos = await getPhotosApi(data.photoset_id, site, options.parameters);
    }

+    if (data.cover_path) {
+        release.covers = [
+            getImageSources(`${data.cover_path}_front_400x625.jpg?width=450&height=636&format=webp`),
+            getImageSources(`${data.cover_path}_back_400x625.jpg?width=450&height=636&format=webp`),
+        ];
+    }
+
    if (data.trailers) {
        release.trailer = Object.entries(data.trailers).map(([quality, source]) => ({ src: source, quality }));
    }

-    if (data.movie_id) {
+    if (data.movie_id && !data.movie_path) {
        release.movie = {
            entryId: data.movie_id,
            title: data.movie_title,
-            url: qu.prefixUrl(`/en/movie/${data.url_movie_title}/${data.movie_id}`, site.url),
+            url: qu.prefixUrl(`${data.url_movie_title}/${data.movie_id}`, options.parameters.movie ? options.parameters.movie : `${site.url}/en/movie`),
        };
    }
@@ -484,11 +493,16 @@ async function fetchMovieTrailer(release) {
    return null;
}

-async function scrapeMovie({ query, html }, window, url, entity, options) {
+async function scrapeMovie({ query, el }, url, entity, baseRelease, options) {
    const release = {};
-    const data = window.dataLayer[0]?.dvdDetails;
+
+    // const options = html.match(/options = {.*};/);
+    const { dataLayer } = query.exec('//script[contains(text(), "dataLayer")]', ['dataLayer']);
+    const rawData = dataLayer?.[0]?.dvdDetails;
+    const data = rawData?.dvdId && rawData; // dvdDetails is mostly empty in some cache states
+
+    if (query.exists('.NotFound-Title')) {
+        return null;
+    }

    release.entryId = new URL(url).pathname.match(/\/(\d+)(\/|$)/)?.[1];
@@ -498,13 +512,20 @@ async function scrapeMovie({ query, html }, window, url, entity, options) {
    ];

    release.description = query.cnt('.descriptionText');
-    release.date = qu.extractDate(data.dvdReleaseDate);
-    release.title = data.dvdName;
+    release.date = qu.extractDate(data?.dvdReleaseDate) || query.date('.updatedOn', 'YYYY-MM-DD');
+    release.title = data?.dvdName || query.cnt('.dvdTitle');
+    release.director = query.el('.directedBy a', 'title');

-    release.actors = data.dvdActors.map((actor) => ({ name: actor.actorName, entryId: actor.actorId }));
+    release.actors = data?.dvdActors.map((actor) => ({ name: actor.actorName, entryId: actor.actorId }))
+        || query.all('.actorCarousel a[href*="/pornstar"]').map((actorEl) => ({
+            entryId: query.url(actorEl, null).match(/\/(\d+)/)?.[1],
+            name: query.cnt(actorEl, 'span'),
+            href: query.url(actorEl, null, 'href', { origin: entity.url }),
+            avatar: getAvatarFallbacks(query.img(actorEl)),
+        }));
+
+    release.tags = query.cnts('.dvdCol a');

-    release.scenes = scrapeAll(html, entity, entity.url);
+    release.scenes = scrapeAll(qu.initAll(el, 'div[data-itemtype*=scene], li[data-itemtype*=scene]'), entity, entity.url);

    if (options.includeTrailers) {
        release.trailer = await fetchMovieTrailer(release);
@@ -547,10 +568,8 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
|
||||
return accReleases.concat(releases);
|
||||
}
|
||||
|
||||
async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl, withReleases, context) {
|
||||
const { query } = qu.extract(html);
|
||||
|
||||
const avatar = query.el('img.actorPicture');
|
||||
async function scrapeProfile({ query }, url, actorName, _siteSlug, getActorReleasesUrl, withReleases, context) {
|
||||
const avatar = query.img('img.actorPicture');
|
||||
const hair = query.cnt('.actorProfile .attribute_hair_color');
|
||||
const height = query.cnt('.actorProfile .attribute_height');
|
||||
const weight = query.cnt('.actorProfile .attribute_weight');
|
||||
@@ -563,12 +582,7 @@ async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUr
|
||||
|
||||
if (avatar) {
|
||||
// larger sizes usually available, provide fallbacks
|
||||
const avatars = [
|
||||
avatar.src.replace(/\d+x\d+/, '500x750'),
|
||||
avatar.src.replace(/\d+x\d+/, '240x360'),
|
||||
avatar.src.replace(/\d+x\d+/, '200x300'),
|
||||
avatar.src,
|
||||
];
|
||||
const avatars = getAvatarFallbacks(avatar);
|
||||
|
||||
profile.avatar = avatars;
|
||||
}
|
||||
@@ -617,7 +631,7 @@ async function fetchLatestApi(site, page = 1, preData, include, upcoming = false
|
||||
requests: [
|
||||
{
|
||||
indexName: 'all_scenes',
|
||||
params: `query=&hitsPerPage=36&maxValuesPerFacet=100&page=${page - 1}&facetFilters=[["lesbian:"],["bisex:"],["shemale:"],["upcoming:${upcoming ? 1 : 0}"]]&filters=sitename:${site.slug} OR channels.id:${site.slug}`,
|
||||
params: `query=&hitsPerPage=36&maxValuesPerFacet=100&page=${page - 1}&facetFilters=[["lesbian:"],["bisex:"],["shemale:"],["upcoming:${upcoming ? 1 : 0}"]]&filters=sitename:${site.slug}`, // OR channels.id:${site.slug}`,
|
||||
},
|
||||
],
|
||||
}, {
|
||||
@@ -664,8 +678,48 @@ async function fetchSceneApi(url, site, baseRelease, options) {
|
||||
encodeJSON: true,
|
||||
});
|
||||
|
||||
if (res.status === 200 && res.body.results?.[0]?.hits) {
|
||||
return scrapeSceneApi(res.body.results[0].hits[0], site, options);
|
||||
if (res.status === 200 && res.body.results?.[0]?.hits.length > 0) {
|
||||
return scrapeReleaseApi(res.body.results[0].hits[0], site, options);
|
||||
}
|
||||
|
||||
if (res.status === 200) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchMovieApi(url, site, baseRelease, options) {
|
||||
const referer = options.parameters?.referer || `${site.parameters?.networkReferer ? site.parent.url : site.url}/en/movies`;
|
||||
const { apiUrl } = await fetchApiCredentials(referer, site);
|
||||
|
||||
const entryId = (baseRelease?.path || new URL(url).pathname).match(/\/(\d{2,})(\/|$)/)?.[1];
|
||||
|
||||
const res = await http.post(apiUrl, {
|
||||
requests: [
|
||||
{
|
||||
indexName: 'all_movies',
|
||||
params: `query=&page=0&facets=[]&tagFilters=&facetFilters=[["movie_id:${entryId}"]]`,
|
||||
},
|
||||
{
|
||||
indexName: 'all_movies',
|
||||
params: 'query=&page=0&hitsPerPage=1&attributesToRetrieve=[]&attributesToHighlight=[]&attributesToSnippet=[]&tagFilters=&analytics=false&clickAnalytics=false&facets=clip_id',
|
||||
},
|
||||
],
|
||||
}, {
|
||||
headers: {
|
||||
Referer: referer,
|
||||
},
|
||||
}, {
|
||||
encodeJSON: true,
|
||||
});
|
||||
|
||||
if (res.status === 200 && res.body.results?.[0]?.hits.length > 0) {
|
||||
return scrapeReleaseApi(res.body.results[0].hits[0], site, options);
|
||||
}
|
||||
|
||||
if (res.status === 200) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
@@ -699,10 +753,10 @@ function getUpcomingUrl(site) {
|
||||
|
||||
async function fetchLatest(site, page = 1) {
|
||||
const url = getLatestUrl(site, page);
|
||||
const res = await http.get(url);
|
||||
const res = await qu.getAll(url, 'li[data-itemtype=scene], div[data-itemtype*=scene]');
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.body.toString(), site);
|
||||
return scrapeAll(res.items, site);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
@@ -710,10 +764,10 @@ async function fetchLatest(site, page = 1) {
|
||||
|
||||
async function fetchUpcoming(site) {
|
||||
const url = getUpcomingUrl(site);
|
||||
const res = await http.get(url);
|
||||
const res = await qu.getAll(url, 'li[data-itemtype=scene], div[data-itemtype*=scene]');
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.body.toString(), site, null, false);
|
||||
return scrapeAll(res.items, site, null, false);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
@@ -749,12 +803,12 @@ async function fetchScene(url, site, baseRelease, options) {
|
||||
}
|
||||
|
||||
const deepUrl = getDeepUrl(url, site, baseRelease);
|
||||
const mobileUrl = getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.parent?.parameters?.mobile);
|
||||
const mobileUrl = options.includePhotos && getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.parent?.parameters?.mobile);
|
||||
|
||||
if (deepUrl) {
|
||||
const [res, mobileRes] = await Promise.all([
|
||||
http.get(deepUrl),
|
||||
mobileUrl && http.get(mobileUrl, {
|
||||
qu.get(deepUrl),
|
||||
mobileUrl && qu.get(mobileUrl, null, {
|
||||
headers: {
|
||||
// don't redirect to main site
|
||||
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36',
|
||||
@@ -763,8 +817,8 @@ async function fetchScene(url, site, baseRelease, options) {
|
||||
]);
|
||||
|
||||
if (res.status === 200) {
|
||||
const mobileBody = mobileRes?.status === 200 ? mobileRes.body.toString() : null;
|
||||
const scene = await scrapeScene(res.body.toString(), url, site, baseRelease, mobileBody, options);
|
||||
const mobileItem = mobileRes?.status === 200 ? mobileRes.item : null;
|
||||
const scene = await scrapeScene(res.item, url, site, baseRelease, mobileItem, options);
|
||||
|
||||
return { ...scene, deepUrl };
|
||||
}
|
||||
@@ -773,20 +827,6 @@ async function fetchScene(url, site, baseRelease, options) {
|
||||
return null;
|
||||
}
|
||||
|
||||
async function fetchMovie(url, channel, baseRelease, options) {
|
||||
const res = await qu.get(url, null, null, {
|
||||
extract: {
|
||||
runScripts: 'dangerously',
|
||||
},
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeMovie(res.item, res.window, url, channel, options);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchActorScenes(actorName, apiUrl, siteSlug) {
|
||||
const res = await http.post(apiUrl, {
|
||||
requests: [
|
||||
@@ -827,13 +867,13 @@ async function fetchProfile({ name: actorName }, context, include, altSearchUrl,
|
||||
|
||||
if (actorUrl) {
|
||||
const url = `https://${siteSlug}.com${actorUrl}`;
|
||||
const actorRes = await http.get(url);
|
||||
const actorRes = await qu.get(url);
|
||||
|
||||
if (actorRes.status !== 200) {
|
||||
return null;
|
||||
}
|
||||
|
||||
return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug, getActorReleasesUrl, include.scenes, context);
|
||||
return scrapeProfile(actorRes.item, url, actorName, siteSlug, getActorReleasesUrl, include.scenes, context);
|
||||
}
|
||||
|
||||
return null;
|
||||
@@ -881,7 +921,6 @@ module.exports = {
|
||||
fetchApiUpcoming: fetchUpcomingApi,
|
||||
fetchLatest,
|
||||
fetchLatestApi,
|
||||
fetchMovie,
|
||||
fetchProfile,
|
||||
fetchScene,
|
||||
fetchSceneApi,
|
||||
@@ -893,12 +932,14 @@ module.exports = {
|
||||
fetchProfile: fetchApiProfile,
|
||||
// fetchScene,
|
||||
fetchScene: fetchSceneApi,
|
||||
fetchMovie,
|
||||
// scrapeMovie,
|
||||
fetchMovie: fetchMovieApi,
|
||||
},
|
||||
getPhotos,
|
||||
scrapeApiProfile,
|
||||
scrapeApiReleases,
|
||||
scrapeProfile,
|
||||
scrapeAll,
|
||||
scrapeMovie,
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -136,14 +136,18 @@ function getEntryId(html) {
function scrapeAll(scenes, site, entryIdFromTitle) {
return scenes.map(({ el, query }) => {
const release = {};
const title = query.cnt('.content_img div, .dvd_info > a, a.update_title, a[title] + a[title]') || query.cnt('a[title*=" "]');

release.url = query.url('.update_title a, .dvd_info > a, a ~ a');
release.title = query.q('.update_title a, .dvd_info > a, a ~ a', true);
release.title = title?.slice(0, title.match(/starring:/i)?.index || Infinity).trim();
release.url = query.url('.content_img a, .dvd_info > a, a.update_title, a[title*=" "]');
release.date = query.date('.update_date', 'MM/DD/YYYY');

release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || query.q('.rating_box')?.dataset.id;

release.actors = query.all('.update_models a', true);
release.actors = query.all('.content_img .update_models a, .update_models a').map((actorEl) => ({
name: query.cnt(actorEl),
url: query.url(actorEl, null),
}));

const dvdPhotos = query.imgs('.dvd_preview_thumb');
const photoCount = Number(query.q('a img.thumbs', 'cnt')) || 1;
@@ -183,9 +187,9 @@ function scrapeAll(scenes, site, entryIdFromTitle) {
}).filter(Boolean);

const teaserScript = query.html('script');

if (teaserScript) {
const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
if (src) release.teaser = { src };
release.teaser = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
}

return release;
@@ -235,17 +239,21 @@ function scrapeUpcoming(html, site) {
});
}

async function scrapeScene({ html, query }, url, site, include) {
const release = { url, site };
async function scrapeScene({ html, query }, url, site, options) {
const release = {};

release.entryId = getEntryId(html);
release.title = query.q('.title_bar_hilite', true);
release.description = query.q('.update_description', true);
release.title = query.cnt('.title_bar_hilite');
release.description = query.cnt('.update_description');

release.date = query.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');

release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
release.tags = query.all('.update_tags a', true);
release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a').map((actorEl) => ({
name: query.cnt(actorEl),
url: query.url(actorEl, null),
}));

release.tags = query.cnts('.update_tags a');

const posterPath = html.match(/useimage = "(.*)"/)?.[1];

@@ -260,7 +268,7 @@ async function scrapeScene({ html, query }, url, site, include) {
}
}

if (include.trailer && site.slug !== 'manuelferrara') {
if (options.includeTrailers && site.slug !== 'manuelferrara') {
const trailerLines = html.split('\n').filter((line) => /movie\["trailer\w*"\]\[/i.test(line));

if (trailerLines.length) {
@@ -277,19 +285,20 @@ async function scrapeScene({ html, query }, url, site, include) {
}
}

if (include.photos) release.photos = await getPhotos(release.entryId, site);
if (options.includePhotos) {
release.photos = await getPhotos(release.entryId, site);
}

if (query.exists('.update_dvds a')) {
release.movie = {
url: query.url('.update_dvds a'),
title: query.q('.update_dvds a', true),
title: query.cnt('.update_dvds a'),
};

release.movie.entryId = new URL(release.movie.url).pathname.split('/').slice(-1)[0]?.replace('.html', '');
}

const stars = Number(query.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
if (stars) release.stars = stars;
release.stars = query.number('.avg_rating');

return release;
}
@@ -298,7 +307,7 @@ function scrapeMovie({ el, query }, url, site) {
const movie = { url, site };

movie.entryId = new URL(url).pathname.split('/').slice(-1)[0]?.replace('.html', '');
movie.title = query.q('.title_bar span', true);
movie.title = query.cnt('.title_bar span');
movie.covers = query.urls('#dvd-cover-flip > a');
movie.channel = slugify(query.q('.update_date a', true), '');

@@ -310,7 +319,7 @@ function scrapeMovie({ el, query }, url, site) {
?.map((scene) => ({ ...scene, movie }))
.sort((sceneA, sceneB) => sceneA.date - sceneB.date);

movie.date = curatedScenes?.[0].date;
movie.date = curatedScenes?.[0]?.date;

return {
...movie,

@@ -140,16 +140,6 @@ async function fetchLatest(site, page = 1) {
return res.status;
}

async function fetchScene(url, site) {
const res = await qu.get(url);

if (res.ok) {
return scrapeScene(res.item, url, site);
}

return res.status;
}

async function fetchProfile({ name: actorName }, entity, include) {
const searchRes = await qu.getAll(`https://kink.com/search?type=performers&q=${actorName}`, '.model');

@@ -176,6 +166,6 @@ async function fetchProfile({ name: actorName }, entity, include) {

module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};

@@ -11,6 +11,12 @@ const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { inchesToCm, lbsToKg } = require('../utils/convert');

function getBasePath(channel, path = '/scene') {
return channel.parameters?.scene
|| ((channel.parameters?.native || channel.type === 'network') && `${channel.url}${path}`)
|| `${channel.parent.url}${path}`;
}

function getThumbs(scene) {
if (scene.images.poster) {
return Object.values(scene.images.poster) // can be { 0: {}, 1: {}, ... } instead of array
@@ -18,7 +24,7 @@ function getThumbs(scene) {
.map((image) => image.xl.url);
}

if (scene.images.card_main_rect) {
if (Array.isArray(scene.images.card_main_rect)) {
return scene.images.card_main_rect
.concat(scene.images.card_secondary_rect || [])
.map((image) => image.xl.url.replace('.thumb', ''));
@@ -27,6 +33,28 @@ function getThumbs(scene) {
return [];
}

function getCovers(images, target = 'cover') {
if (!images[target]) {
return [];
}

const covers = [
images[target][0].md?.url,
images[target][0].sm?.url,
images[target][0].xs?.url,
// bigger but usually upscaled
images[target][0].xx?.url,
images[target][0].xl?.url,
images[target][0].lg?.url,
];

if (target === 'poster') {
return covers;
}

return [covers];
}

function getVideos(data) {
const teaserSources = data.videos.mediabook?.files;
const trailerSources = data.children.find((child) => child.type === 'trailer')?.videos.full?.files;
@@ -51,9 +79,7 @@ function scrapeLatestX(data, site, filterChannel) {
description: data.description,
};

const basepath = site.parameters?.scene
|| (site.parameters?.native && `${site.url}/scene`)
|| `${site.parent.url}/scene`;
const basepath = getBasePath(site);

release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`;
release.date = new Date(data.dateReleased);
@@ -84,6 +110,9 @@ function scrapeLatestX(data, site, filterChannel) {
};
}

const siteName = data.collections[0]?.name || data.brand;
release.channel = slugify(siteName, '');

return release;
}

@@ -96,7 +125,7 @@ async function scrapeLatest(items, site, filterChannel) {
};
}

function scrapeScene(data, url, _site, networkName) {
function scrapeRelease(data, url, channel, networkName) {
const release = {};

const { id: entryId, title, description } = data;
@@ -129,6 +158,29 @@ function scrapeScene(data, url, _site, networkName) {

release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;

if (data.parent?.type === 'movie' || data.parent?.type === 'serie') {
release[data.parent.type] = {
entryId: data.parent.id,
url: `${getBasePath(channel, data.parent.type === 'movie' ? '/movie' : '/series')}/${data.parent.id}/${slugify(data.parent.title, '-', { removePunctuation: true })}`,
title: data.parent.title,
description: data.parent.description,
date: new Date(data.parent.dateReleased),
channel: slugify(data.parent.collections?.name || data.parent.brand),
poster: getCovers(data.parent.images, 'poster'),
shallow: true,
};
}

if (data.type === 'movie') {
release.covers = getCovers(data.images);
release.scenes = data.children?.map((scene) => ({
entryId: scene.id,
url: `${getBasePath(channel)}/${scene.id}/${slugify(scene.title)}`,
title: scene.title,
shallow: true,
}));
}

return release;
}

@@ -155,17 +207,24 @@ function getUrl(site) {
throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`);
}

async function getSession(site, parameters) {
async function getSession(site, parameters, url) {
if (site.slug === 'mindgeek' || site.parameters?.parentSession === false) {
// most MG sites have a parent network to acquire a session from, don't try to acquire session from mindgeek.com for independent channels
return null;
}

const cookieJar = new CookieJar();
const session = http.session({ cookieJar });

// const res = await session.get(url);
const sessionUrl = site.parameters?.siteId && !(site.parameters?.native || site.parameters?.childSession || site.parent?.parameters?.childSession)
? site.parent.url
: site.url;
: (url || site.url);

const res = await http.get(sessionUrl, {
session,
headers: {
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
interval: parameters?.interval,
concurrency: parameters?.concurrency,
parse: false,
@@ -175,7 +234,9 @@ async function getSession(site, parameters) {
const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
const { instance_token: instanceToken } = cookie.parse(cookieString);

return { session, instanceToken };
if (instanceToken) {
return { session, instanceToken };
}
}

throw new Error(`Failed to acquire MindGeek session (${res.statusCode})`);
@@ -224,7 +285,7 @@ function scrapeProfile(data, html, releases = [], networkName) {
profile.naturalBoobs = false;
}

profile.releases = releases.map((release) => scrapeScene(release, null, null, networkName));
profile.releases = releases.map((release) => scrapeRelease(release, null, null, networkName));

return profile;
}
@@ -234,7 +295,9 @@ async function fetchLatest(site, page = 1, options) {
const { searchParams } = new URL(url);
const siteId = searchParams.get('site');

const { session, instanceToken } = options.beforeNetwork || await getSession(site, options.parameters);
const { session, instanceToken } = options.beforeNetwork?.headers?.Instance
? options.beforeNetwork
: await getSession(site, options.parameters, url);

const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');
const limit = 24;
@@ -250,6 +313,7 @@ async function fetchLatest(site, page = 1, options) {
Instance: instanceToken,
Origin: site.url,
Referer: url,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});

@@ -274,6 +338,7 @@ async function fetchUpcoming(site, page, options) {
Instance: instanceToken,
Origin: site.url,
Referer: url,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});

@@ -284,8 +349,8 @@ async function fetchUpcoming(site, page, options) {
return res.statusCode;
}

async function fetchScene(url, site, baseScene, options) {
if (baseScene?.entryId) {
async function fetchRelease(url, site, baseScene, options) {
if (baseScene?.entryId && !baseScene.shallow && !options.parameters.forceDeep) {
// overview and deep data is the same, don't hit server unnecessarily
return baseScene;
}
@@ -299,12 +364,13 @@ async function fetchScene(url, site, baseScene, options) {
concurrency: options.parameters.concurrency,
headers: {
Instance: instanceToken,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});

if (res.status === 200 && res.body.result) {
return {
scene: scrapeScene(res.body.result, url, site),
scene: scrapeRelease(res.body.result, url, site),
};
}

@@ -321,6 +387,7 @@ async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, para
concurrency: parameters.concurrency,
headers: {
Instance: instanceToken,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});

@@ -362,9 +429,11 @@ async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, para
module.exports = {
beforeNetwork: getSession,
beforeFetchScenes: getSession,
requireBeforeNetwork: false,
scrapeLatestX,
fetchLatest,
fetchUpcoming,
fetchScene,
fetchScene: fetchRelease,
fetchMovie: fetchRelease,
fetchProfile,
};

src/scrapers/purgatoryx.js (new file, 172 lines)
@@ -0,0 +1,172 @@
'use strict';

const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const { feetInchesToCm, lbsToKg } = require('../utils/convert');

function scrapeAll(scenes) {
return scenes.map(({ query }) => {
const release = {};

release.title = query.cnt('.title');
release.url = query.url('.title a');
release.entryId = new URL(release.url).pathname.match(/\/view\/(\d+)/)[1];

release.date = query.date('.pub-date', 'MMM DD, YYYY');
release.duration = query.duration('.video-duration');

release.actors = query.all('.models a').map((el) => ({
name: query.cnt(el),
url: query.url(el, null),
}));

if (query.exists('.thumb-big')) { // updates page
release.poster = query.img('.thumb-big', 'data-image') || JSON.parse(query.el('.thumbnails-wrap a', 'data-images'));
release.photos = [query.img('.thumb-top', 'data-image'), query.img('.thumb-bottom', 'data-image')];
}

if (query.exists('.thumbnails-wrap')) { // actor page
try {
const images = JSON.parse(query.el('.thumbnails-wrap a', 'data-images'));

release.poster = images.slice(0, 1)[0];
release.photos = images.slice(1);
} catch (error) {
// images probably not available
}
}

return release;
});
}

function scrapeUpcoming({ query }) {
const release = {};

release.url = query.url('.bottom-info a');
release.entryId = new URL(release.url).pathname.match(/\/view\/(\d+)/)?.[1];
release.title = query.cnt('.title');

release.actors = query.all('.model-wrap li').map((el) => ({
name: query.cnt(el, 'h5'),
url: query.url(el, '.model-thumb a'),
avatar: query.img(el, '.model-thumb img'),
}));

return release;
}

function scrapeScene({ query }, url) {
const release = {};

release.title = query.cnt('.title');
release.entryId = new URL(url).pathname.match(/\/view\/(\d+)/)[1];
release.date = query.date('.date', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/);

release.description = query.cnt('.description p');
release.duration = query.duration('.total-time');

release.actors = query.all('.model-wrap li').map((el) => ({
name: query.cnt(el, 'h5'),
url: query.url(el, 'a'),
avatar: query.img(el),
}));

release.poster = query.poster();
release.photos = query.urls('.photos-slider a');
release.trailer = query.video();

release.comment = query.cnt('.series');

return release;
}

async function fetchLatest(channel, page) {
const res = await qu.getAll(`${channel.url}/episodes?page=${page}`, '.content-item');

if (res.ok) {
return scrapeAll(res.items, channel);
}

return res.status;
}

async function fetchUpcoming(channel) {
const res = await qu.get(channel.url, '.upcoming-info-wrap');

if (res.ok && res.item) {
return [scrapeUpcoming(res.item, channel)];
}

return res.status;
}

function scrapeProfile({ query }, url) {
const profile = { url };

const bio = Object.fromEntries(query.all('.model-desc li').map((el) => [slugify(query.cnt(el, 'span'), '_'), query.text(el)]));

profile.description = bio.bio;

profile.dateOfBirth = qu.extractDate(bio.birthdate, 'YYYY-MM-DD');
profile.birthPlace = bio.birthplace;

profile.hairColor = bio.hair_color;
profile.eyes = bio.eye_color;

profile.height = feetInchesToCm(bio.height);
profile.weight = lbsToKg(bio.weight);
profile.measurements = bio.measurements;

profile.avatar = query.img('.model-pic img');

profile.scenes = scrapeAll(qu.initAll(query.all('.content-item')));

return profile;
}

async function searchActor(baseActor, channel) {
const searchRes = await http.post(`${channel.url}/search-preview`, { q: slugify(baseActor.name, ' ') }, {
encodeJSON: false,
headers: {
'Accept-Language': 'en-US,en;',
},
});

if (searchRes.ok) {
const actorUrl = searchRes.body.find((item) => item.type === 'model' && slugify(item.title) === baseActor.slug)?.url;

return actorUrl || null;
}

return null;
}

async function fetchProfile(baseActor, context, include, retry = false) {
const actorUrl = (!retry && baseActor.url) || await searchActor(baseActor, context.entity);

if (!actorUrl) {
return null;
}

const res = await qu.get(actorUrl);

if (res.ok) {
return scrapeProfile(res.item, actorUrl);
}

if (baseActor.url) {
return fetchProfile(baseActor, context, include, true);
}

return res.status;
}

module.exports = {
fetchLatest,
fetchProfile,
fetchUpcoming,
scrapeAll,
scrapeScene,
};
src/scrapers/radical.js (new file, 143 lines)
@@ -0,0 +1,143 @@
'use strict';

const http = require('../utils/http');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const { lbsToKg, feetInchesToCm } = require('../utils/convert');

function scrapeSceneMetadata(data, channel) {
const release = {};

release.entryId = data.id;
release.url = `${channel.url}/tour/videos/${data.id}/${slugify(data.title, '-', { removePunctuation: true })}`;

release.title = data.title;
release.description = data.description;

release.date = new Date(data.release_date);
release.duration = qu.durationToSeconds(data.videos_duration);

release.actors = data.models.map((model) => ({
entryId: model.id,
name: model.name,
gender: model.gender,
avatar: model.thumb,
url: `${channel.url}/tour/models/${model.id}/${slugify(model.name, '-', { removePunctuation: true })}`,
}));

release.poster = data.trailer?.poster || [data.thumb?.replace('mobile.jpg', '.jpg'), data.thumb];
release.photos = [
data.extra_thumbs?.find((url) => /portrait1.jpg/.test(url)),
data.extra_thumbs?.find((url) => /scene.jpg/.test(url)),
data.extra_thumbs?.find((url) => /portrait2.jpg/.test(url)),
]; // ordered by chronology: portrait1.jpg and scene.jpg are usually pre-shoot poses, portrait2.jpg is the cumshot aftermath

release.trailer = data.trailer && {
src: data.trailer.src,
type: data.trailer.type,
};

release.teaser = data.special_thumbs;

release.tags = [].concat(data.tags?.map((tag) => tag.name));
release.qualities = data.downloads && Object.values(data.downloads)?.map((download) => download.meta_data.height);
release.stars = data.rating;

return release;
}

function scrapeAllMetadata(scenes, channel) {
return scenes.map((data) => scrapeSceneMetadata(data, channel));
}

function scrapeProfileMetadata(data, channel) {
const profile = {};

profile.entryId = data.id;
profile.url = `${channel.url}/tour/models/${data.id}/${slugify(data.name, '-', { removePunctuation: true })}`;

profile.description = data.attributes.bio?.value;
profile.dateOfBirth = qu.parseDate(data.attributes.birthdate?.value, 'YYYY-MM-DD');
profile.gender = data.gender;
profile.age = data.attributes.age?.value;
profile.birthPlace = data.attributes.born?.value;

profile.measurements = data.attributes.measurements?.value;
profile.height = feetInchesToCm(data.attributes.height?.value);
profile.weight = lbsToKg(data.attributes.weight?.value);

profile.eyes = data.attributes.eyes?.value;
profile.hairColor = data.attributes.hair?.value;

profile.avatar = data.thumb;
profile.date = new Date(data.publish_date);

return profile;
}

async function fetchLatestMetadata(channel, page = 1) {
const url = `${channel.url}/tour/videos?page=${page}`;
const res = await http.get(url, {
parse: true,
extract: {
runScripts: 'dangerously',
},
});

if (res.ok && res.window.__DATA__) {
return scrapeAllMetadata(res.window.__DATA__.videos.items, channel);
}

if (res.ok) {
return res.window.__DATA__?.error || null;
}

return res.status;
}

async function fetchSceneMetadata(url, channel) {
const res = await http.get(url, {
parse: true,
extract: {
runScripts: 'dangerously',
},
});

if (res.ok && res.window.__DATA__?.video) {
return scrapeSceneMetadata(res.window.__DATA__.video, channel);
}

if (res.ok) {
return res.window.__DATA__?.error || null;
}

return res.status;
}

async function fetchProfileMetadata(actor, channel) {
const res = await http.get(`${channel.url}/tour/search-preview/${actor.name}`, {
headers: {
'X-Requested-With': 'XMLHttpRequest',
},
});

if (res.ok) {
const model = res.body.models?.items.find((modelX) => slugify(modelX.name) === actor.slug);

if (model) {
return scrapeProfileMetadata(model, channel);
}

return null;
}

return res.status;
}

module.exports = {
metadata: {
fetchLatest: fetchLatestMetadata,
fetchScene: fetchSceneMetadata,
fetchProfile: fetchProfileMetadata,
},
};
@@ -35,7 +35,7 @@ const karups = require('./karups');
const kellymadison = require('./kellymadison');
const killergram = require('./killergram');
const kink = require('./kink');
const legalporno = require('./legalporno');
const analvids = require('./analvids');
const littlecapricedreams = require('./littlecapricedreams');
const mikeadriano = require('./mikeadriano');
const mindgeek = require('./mindgeek');
@@ -51,6 +51,8 @@ const pascalssubsluts = require('./pascalssubsluts'); // reserved keyword
const pierrewoodman = require('./pierrewoodman');
const pinkyxxx = require('./pinkyxxx');
const privateNetwork = require('./private'); // reserved keyword
const purgatoryx = require('./purgatoryx'); // reserved keyword
const radical = require('./radical');
const score = require('./score');
const spizoo = require('./spizoo');
const teamskeet = require('./teamskeet');
@@ -116,7 +118,7 @@ const scrapers = {
killergram,
kink,
kinkvr: badoink,
legalporno,
analvids,
letsdoeit: porndoe,
littlecapricedreams,
mamacitaz: porndoe,
@@ -136,6 +138,8 @@ const scrapers = {
porncz,
pornpros: whalemember,
private: privateNetwork,
purgatoryx,
radical,
score,
sexyhub: mindgeek,
spizoo,
@@ -206,6 +210,7 @@ const scrapers = {
gaywire: bangbros,
girlfaction: fullpornnetwork,
gloryholesecrets: aziani,
gotfilled: radical,
hergape: fullpornnetwork,
hitzefrei,
homemadeanalwhores: fullpornnetwork,
@@ -214,6 +219,7 @@ const scrapers = {
hushpass: hush,
hussiepass: hush,
iconmale: mindgeek,
inserted: radical,
interracialpass: hush,
interracialpovs: hush,
inthecrack,
@@ -224,7 +230,7 @@ const scrapers = {
killergram,
kink,
kinkvr: badoink,
legalporno,
analvids,
letsdoeit: porndoe,
littlecapricedreams,
mamacitaz: porndoe,
@@ -255,6 +261,7 @@ const scrapers = {
povperverts: fullpornnetwork,
povpornstars: hush,
private: privateNetwork,
purgatoryx,
realitykings: mindgeek,
realvr: badoink,
roccosiffredi: famedigital,

@@ -14,13 +14,16 @@ function scrapeAll(scenes) {
release.url = query.url('a');
release.entryId = getEntryId(release.url);

release.title = query.cnt('.title-label a');
release.actors = query.all('.update_models a').map((el) => ({
release.title = query.cnt('.title-label a, .thumb-title a, .p-7, .text h3');
release.date = query.date('.date-label', 'MM/DD/YYYY');

release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({
name: query.cnt(el),
url: query.url(el, null),
}));

release.poster = query.img('a img');
release.teaser = query.video('.leVideo source');

return release;
});
@@ -30,21 +33,21 @@ function scrapeScene({ query }, url) {
const release = {};

release.entryId = getEntryId(url);
release.title = query.cnt('#media-holder .title');
release.title = query.cnt(['#media-holder .title', '.content-holder h1', '#scene h1', 'h2.titular', 'title'])?.replace(/\s+-$/, '');

release.date = query.date('#sceneInfo .date', 'YYYY-MM-DD');
release.duration = query.duration('#sceneInfo .data-others', /\d+:\d+/);
release.date = query.date('#sceneInfo .date, #trailer-data .date', 'YYYY-MM-DD');
release.duration = query.duration('#sceneInfo .data-others, #trailer-data', /\d+:\d+/);

release.description = query.cnt('#sceneInfo .description');
release.description = query.cnt('#sceneInfo .description, #trailer-data > div:first-child p');

release.actors = query.all('#sceneInfo .data-others a[href*="/models"]').map((el) => ({
release.actors = query.all('#sceneInfo .data-others a[href*="/models"], #trailer-data a[href*="/models"]').map((el) => ({
name: query.el(el, null, 'title'),
url: query.url(el, null),
}));

release.tags = query.cnts('.categories-holder a');
release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]');

const poster = query.img('#video-holder .update_thumb') || query.poster('#trailervideo');
const poster = query.img(['#video-holder .update_thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
const posterPathname = poster && new URL(poster)?.pathname;

release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')];
@@ -56,7 +59,8 @@ function scrapeScene({ query }, url) {
src,
]);

release.trailer = query.video('#trailervideo source');
release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic
release.teaser = query.video('#trailer-video source[src*="/videothumbs"]');

return release;
}
@@ -127,7 +131,7 @@ function scrapeProfile({ query, el }) {
}

async function fetchLatest(channel, page) {
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big');
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');

if (res.ok) {
return scrapeAll(res.items, channel);

@@ -35,7 +35,7 @@ function scrapeScene(scene, channel) {
}));

release.poster = [
scene.img.replace('med.jpg', 'hi.jpg'),
// scene.img.replace('med.jpg', 'hi.jpg'), // this image is not always from the same scene! for example on Petite Teens 18
scene.img,
];

@@ -129,6 +129,11 @@ async function fetchLatest(channel, page = 1, { parameters }) {
}

async function fetchScene(url, channel, baseScene, { parameters }) {
if (baseScene?.entryId) {
// overview and deep data is the same, don't hit server unnecessarily
return baseScene;
}

const sceneSlug = new URL(url).pathname.match(/\/([\w-]+$)/)[1];
const res = await http.get(`${parameters.videos}/${sceneSlug}`);


@@ -4,7 +4,7 @@

const config = require('config');
const faker = require('faker');
const nanoid = require('nanoid');
const { nanoid } = require('nanoid');
const moment = require('moment');

const knex = require('../knex');
@@ -232,7 +232,7 @@ function actors(release) {
}

async function fetchLatest(entity, page, options) {
return Promise.all(Array.from({ length: 100 }, async (value, index) => {
return Promise.all(Array.from({ length: 10000 }, async (value, index) => {
const release = {};

release.entryId = nanoid();
@@ -249,7 +249,8 @@ async function fetchLatest(entity, page, options) {
.where('is_sfw', true)
.pluck('path')
.orderByRaw('random()')
.limit(Math.floor(Math.random() * 10) + 1);
// .limit(Math.floor(Math.random() * 10) + 1)
.limit(100);

// const poster = 'sfw/kittens/thumbs/iNEXVlX-RLs.jpeg';

@@ -261,7 +262,7 @@ async function fetchLatest(entity, page, options) {
.select('name')
.where('priority', '>', 7)
.orderByRaw('random()')
.limit(faker.random.number({ min: 2, max: 15 }))
.limit(faker.datatype.number({ min: 15, max: 25 }))
.pluck('name');

release.actors = [...actors(release), null]; // include empty actor to ensure proper handling

@@ -4,6 +4,7 @@
const Promise = require('bluebird');
const moment = require('moment');

const logger = require('../logger')(__filename);
const http = require('../utils/http');
const slugify = require('../utils/slugify');

@@ -141,24 +142,69 @@ async function getTrailer(scene, channel, url) {
return null;
}

async function getPhotos(url) {
/*
async function getPhotosLegacy(url) {
const htmlRes = await http.get(url, {
extract: {
runScripts: 'dangerously',
},
});

const state = htmlRes?.window.__APOLLO_STATE__;
const key = Object.values(state.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];
try {
const state = htmlRes?.window?.__APOLLO_STATE__;

console.log(data);
if (!state) {
return [];
}

if (!data) {
const key = Object.values(state?.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];

if (!data) {
return [];
}

return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
} catch (error) {
logger.warn(`Failed to retrieve Vixen images: ${error.message}`);
return [];
}
}
*/

return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
async function getPhotos(url) {
const htmlRes = await http.get(url, {
parse: true,
extract: {
runScripts: 'dangerously',
},
});

try {
const state = htmlRes?.window?.__APOLLO_STATE__;

console.log('state', state);

if (!state) {
return [];
}

const key = Object.values(state?.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];

console.log('data', data);

if (!data) {
return [];
}

console.log(data.carousel);

return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
} catch (error) {
logger.warn(`Failed to retrieve Vixen images: ${error.message}`);
return [];
}
}

function scrapeAll(scenes, site, origin) {

@@ -7,7 +7,7 @@ const http = require('../utils/http');

function scrapeLatest(html, site) {
const { document } = new JSDOM(html).window;
const { origin } = new URL(site.url);
const { origin } = new URL(site.parameters?.latest || site.url);

const videos = Array.from(document.querySelectorAll('.video-releases-list')).slice(-1)[0];

@@ -119,7 +119,7 @@ function scrapeScene(html, site, url) {
}

async function fetchLatest(site, page = 1) {
const url = `${site.url}?page=${page}`;
const url = `${site.parameters?.latest || site.url}?page=${page}`;
const res = await http.get(url);

if (res.statusCode === 200) {

@@ -1,14 +1,13 @@
'use strict';

const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
const http = require('../utils/http');
const qu = require('../utils/qu');

async function fetchScene(url, site, baseRelease, options) {
const res = await http.get(url);
const res = await qu.get(url);
const release = await scrapeScene(res.item, url, site, baseRelease, null, options);

const release = await scrapeScene(res.body.toString(), url, site, baseRelease, null, options);

const siteDomain = release.$('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
// const siteUrl = siteDomain && `https://www.${siteDomain}`;

@@ -1,6 +1,7 @@
|
||||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
|
||||
const argv = require('./argv');
|
||||
const logger = require('./logger')(__filename);
|
||||
@@ -8,6 +9,7 @@ const knex = require('./knex');
|
||||
const slugify = require('./utils/slugify');
|
||||
const bulkInsert = require('./utils/bulk-insert');
|
||||
const resolvePlace = require('./utils/resolve-place');
|
||||
const chunk = require('./utils/chunk');
|
||||
const { formatDate } = require('./utils/qu');
|
||||
const { associateActors, associateDirectors, scrapeActors, toBaseActors } = require('./actors');
|
||||
const { associateReleaseTags } = require('./tags');
|
||||
@@ -134,7 +136,7 @@ async function attachStudios(releases) {
|
||||
return releasesWithStudio;
|
||||
}
|
||||
|
||||
function attachReleaseIds(releases, storedReleases) {
|
||||
function attachReleaseIds(releases, storedReleases, batchId) {
|
||||
const storedReleaseIdsByEntityIdAndEntryId = storedReleases.reduce((acc, release) => {
|
||||
if (!acc[release.entity_id]) acc[release.entity_id] = {};
|
||||
acc[release.entity_id][release.entry_id] = release.id;
|
||||
@@ -144,7 +146,7 @@ function attachReleaseIds(releases, storedReleases) {
|
||||
|
||||
const releasesWithId = releases.map((release) => {
|
||||
if (!release.entity) {
|
||||
logger.error(`No entitity available for ${release.url}`);
|
||||
logger.error(`No entity available for ${release.url}`);
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -155,6 +157,7 @@ function attachReleaseIds(releases, storedReleases) {
|
||||
return {
|
||||
...release,
|
||||
id,
|
||||
batchId,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -192,13 +195,16 @@ function filterInternalDuplicateReleases(releases) {
|
||||
|
||||
async function filterDuplicateReleases(releases) {
|
||||
const internalUniqueReleases = filterInternalDuplicateReleases(releases);
|
||||
const internalUniqueReleaseChunks = chunk(internalUniqueReleases);
|
||||
|
||||
const duplicateReleaseEntries = await knex('releases')
|
||||
.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map((release) => [release.entryId, release.entity.id]))
|
||||
.orWhereIn(['entry_id', 'entity_id'], internalUniqueReleases
|
||||
const duplicateReleaseEntryChunks = await Promise.map(internalUniqueReleaseChunks, async (internalUniqueReleasesChunk) => knex('releases')
|
||||
.whereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk.map((release) => [release.entryId, release.entity.id]))
|
||||
.orWhereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk
|
||||
// scene IDs shared across network, mark as duplicate so scene can be updated with channel if only available on release day (i.e. Perv City)
|
||||
.filter((release) => release.entity.parent?.parameters?.networkEntryIds)
|
||||
.map((release) => [release.entryId, release.entity.parent.id]));
|
||||
.map((release) => [release.entryId, release.entity.parent.id])), { concurrency: 10 });
|
||||
|
||||
const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
|
||||
|
||||
const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
|
||||
if (!acc[release.entity_id]) acc[release.entity_id] = {};
|
||||
@@ -229,6 +235,7 @@ async function updateSceneSearch(releaseIds) {
|
||||
TO_TSVECTOR(
|
||||
'english',
|
||||
COALESCE(releases.title, '') || ' ' ||
|
||||
releases.entry_id || ' ' ||
|
||||
entities.name || ' ' ||
|
||||
entities.slug || ' ' ||
|
||||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
|
||||
@@ -308,12 +315,148 @@ async function storeChapters(releases) {
|
||||
await associateReleaseMedia(chaptersWithId, 'chapter');
|
||||
}
|
||||
|
||||
async function storeScenes(releases) {
|
||||
async function associateMovieScenes(movies, movieScenes) {
|
||||
const moviesByEntityIdAndEntryId = movies.reduce((acc, movie) => ({
|
||||
...acc,
|
||||
[movie.entity.id]: {
|
||||
...acc[movie.entity.id],
|
||||
[movie.entryId]: movie,
|
||||
},
|
||||
}), {});
|
||||
|
||||
const associations = movieScenes.map((scene) => {
|
||||
if (!scene.movie) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId]
|
||||
|| moviesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.movie.entryId];
|
||||
|
||||
if (sceneMovie?.id) {
|
||||
return {
|
||||
movie_id: sceneMovie.id,
|
||||
scene_id: scene.id,
|
||||
};
|
||||
}
|
||||
|
||||
return null;
|
||||
}).filter(Boolean);
|
||||
|
||||
await bulkInsert('movies_scenes', associations, false);
|
||||
}
|
||||
|
||||
async function associateSerieScenes(series, serieScenes) {
|
||||
const seriesByEntityIdAndEntryId = series.reduce((acc, serie) => ({
|
||||
...acc,
|
||||
[serie.entity.id]: {
|
||||
...acc[serie.entity.id],
|
||||
[serie.entryId]: serie,
|
||||
},
|
||||
}), {});
|
||||
|
||||
const associations = serieScenes.map((scene) => {
|
||||
if (!scene.serie) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const sceneSerie = seriesByEntityIdAndEntryId[scene.entity.id]?.[scene.serie.entryId]
|
||||
|| seriesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.serie.entryId];
|
||||
|
||||
if (sceneSerie?.id) {
|
||||
return {
|
||||
serie_id: sceneSerie.id,
|
||||
scene_id: scene.id,
|
||||
};
|
||||
}
|
||||
|
||||
return null;
|
||||
}).filter(Boolean);
|
||||
|
||||
await bulkInsert('series_scenes', associations, false);
|
||||
}
|
||||
|
||||
async function updateMovieSearch(movieIds, target = 'movie') {
|
||||
logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } ${target}s`);
|
||||
|
||||
const documents = await knex.raw(`
|
||||
SELECT
|
||||
${target}s.id AS ${target}_id,
|
||||
TO_TSVECTOR(
|
||||
'english',
|
||||
COALESCE(${target}s.title, '') || ' ' ||
|
||||
entities.name || ' ' ||
|
||||
entities.slug || ' ' ||
|
||||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
|
||||
COALESCE(parents.name, '') || ' ' ||
|
||||
COALESCE(parents.slug, '') || ' ' ||
|
||||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
|
||||
COALESCE(TO_CHAR(${target}s.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
|
||||
STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
|
||||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
|
||||
STRING_AGG(COALESCE(tags.name, ''), ' ')
|
||||
) as document
|
||||
FROM ${target}s
|
||||
LEFT JOIN entities ON ${target}s.entity_id = entities.id
|
||||
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
|
||||
LEFT JOIN ${target}s_scenes ON ${target}s_scenes.${target}_id = ${target}s.id
|
||||
LEFT JOIN releases ON releases.id = ${target}s_scenes.scene_id
|
||||
LEFT JOIN releases_actors ON releases_actors.release_id = ${target}s_scenes.scene_id
|
||||
LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
|
||||
LEFT JOIN actors ON actors.id = releases_actors.actor_id
|
||||
LEFT JOIN tags ON tags.id = releases_tags.tag_id
|
||||
${movieIds ? `WHERE ${target}s.id = ANY(?)` : ''}
|
||||
GROUP BY ${target}s.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
|
||||
`, movieIds && [movieIds]);
|
||||
|
||||
if (documents.rows?.length > 0) {
|
||||
await bulkInsert(`${target}s_search`, documents.rows, [`${target}_id`]);
|
||||
}
|
||||
}
|
||||
|
||||
async function storeMovies(movies, useBatchId) {
|
||||
if (!movies || movies.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const { uniqueReleases } = await filterDuplicateReleases(movies);
|
||||
const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');
|
||||
|
||||
const curatedMovieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'movie')));
|
||||
|
||||
const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);
|
||||
const moviesWithId = attachReleaseIds(movies, storedMovies);
|
||||
|
||||
await updateMovieSearch(moviesWithId.map((movie) => movie.id));
|
||||
await associateReleaseMedia(moviesWithId, 'movie');
|
||||
|
||||
return moviesWithId;
|
||||
}
|
||||
|
||||
async function storeSeries(series, useBatchId) {
|
||||
if (!series || series.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const { uniqueReleases } = await filterDuplicateReleases(series);
|
||||
const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');
|
||||
|
||||
const curatedSerieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'serie')));
|
||||
|
||||
const storedSeries = await bulkInsert('series', curatedSerieEntries, ['entity_id', 'entry_id'], true);
|
||||
const seriesWithId = attachReleaseIds(series, storedSeries);
|
||||
|
||||
await updateMovieSearch(seriesWithId.map((serie) => serie.id), 'serie');
|
||||
await associateReleaseMedia(seriesWithId, 'serie');
|
||||
|
||||
return seriesWithId;
|
||||
}
|
||||
|
||||
async function storeScenes(releases, useBatchId) {
|
||||
if (!releases || releases.length === 0) {
|
||||
return [];
|
||||
}
|
||||
|
||||
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
|
||||
const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');
|
||||
|
||||
const releasesWithChannels = await attachChannelEntities(releases);
|
||||
const releasesWithBaseActors = releasesWithChannels.map((release) => ({ ...release, actors: toBaseActors(release.actors) }));
|
||||
@@ -327,8 +470,8 @@ async function storeScenes(releases) {
|
||||
|
||||
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
|
||||
|
||||
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries);
|
||||
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries);
|
||||
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries, batchId);
|
||||
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries, batchId);
|
||||
const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);
|
||||
|
||||
const updated = await knex.raw(`
|
||||
@@ -348,12 +491,14 @@ async function storeScenes(releases) {
		scenes: JSON.stringify(duplicateReleasesWithId),
	});

	const [actors] = await Promise.all([
	const [actors, storedSeries] = await Promise.all([
		associateActors(releasesWithId, batchId),
		storeSeries(releasesWithId.map((release) => release.serie && { ...release.serie, entity: release.entity }).filter(Boolean), batchId),
		associateReleaseTags(releasesWithId),
		storeChapters(releasesWithId),
	]);

	await associateSerieScenes(storedSeries, releasesWithId);
	await associateDirectors(releasesWithId, batchId); // some directors may also be actors, don't associate at the same time
	await updateSceneSearch(releasesWithId.map((release) => release.id));
@@ -371,92 +516,6 @@ async function storeScenes(releases) {
	return releasesWithId;
}

async function associateMovieScenes(movies, movieScenes) {
	const moviesByEntityIdAndEntryId = movies.reduce((acc, movie) => ({
		...acc,
		[movie.entity.id]: {
			...acc[movie.entity.id],
			[movie.entryId]: movie,
		},
	}), {});

	const associations = movieScenes.map((scene) => {
		if (!scene.movie) {
			return null;
		}

		const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId];

		if (sceneMovie?.id) {
			return {
				movie_id: sceneMovie.id,
				scene_id: scene.id,
			};
		}

		return null;
	}).filter(Boolean);

	await bulkInsert('movies_scenes', associations, false);
}

async function updateMovieSearch(movieIds) {
	logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all'} movies`);

	const documents = await knex.raw(`
		SELECT
			movies.id AS movie_id,
			TO_TSVECTOR(
				'english',
				COALESCE(movies.title, '') || ' ' ||
				entities.name || ' ' ||
				entities.slug || ' ' ||
				COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
				COALESCE(parents.name, '') || ' ' ||
				COALESCE(parents.slug, '') || ' ' ||
				COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
				COALESCE(TO_CHAR(movies.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
				STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(tags.name, ''), ' ')
			) as document
		FROM movies
		LEFT JOIN entities ON movies.entity_id = entities.id
		LEFT JOIN entities AS parents ON parents.id = entities.parent_id
		LEFT JOIN movies_scenes ON movies_scenes.movie_id = movies.id
		LEFT JOIN releases ON releases.id = movies_scenes.scene_id
		LEFT JOIN releases_actors ON releases_actors.release_id = movies_scenes.scene_id
		LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
		LEFT JOIN actors ON actors.id = releases_actors.actor_id
		LEFT JOIN tags ON tags.id = releases_tags.tag_id
		${movieIds ? 'WHERE movies.id = ANY(?)' : ''}
		GROUP BY movies.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
	`, movieIds && [movieIds]);

	if (documents.rows?.length > 0) {
		await bulkInsert('movies_search', documents.rows, ['movie_id']);
	}
}

async function storeMovies(movies) {
	if (!movies || movies.length === 0) {
		return [];
	}

	const { uniqueReleases } = await filterDuplicateReleases(movies);
	const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

	const curatedMovieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'movie')));

	const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);
	const moviesWithId = attachReleaseIds(movies, storedMovies);

	await updateMovieSearch(moviesWithId.map((movie) => movie.id));
	await associateReleaseMedia(moviesWithId, 'movie');

	return moviesWithId;
}

module.exports = {
	associateMovieScenes,
	storeScenes,
16
src/tags.js
@@ -99,14 +99,20 @@ async function matchReleaseTags(releases) {
async function getEntityTags(releases) {
	const entityIds = releases.map((release) => release.entity?.id).filter(Boolean);
	const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
	const entityTags = await knex('entities_tags')
		.select('id', 'name', 'entity_id')
		.whereIn('entity_id', entityIds)
		.leftJoin('tags', 'tags.id', 'entities_tags.tag_id');

	const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
		if (!acc[entityTag.entity_id]) {
			acc[entityTag.entity_id] = [];
		}

		acc[entityTag.entity_id].push(entityTag.tag_id);
		acc[entityTag.entity_id].push({
			id: entityTag.id,
			name: entityTag.name,
		});

		return acc;
	}, {});
@@ -117,7 +123,7 @@ async function getEntityTags(releases) {
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId, type) {
	const tagAssociations = releases
		.map((release) => {
			const entityTagIds = entityTagIdsByEntityId[release.entity?.id]?.map((tag) => ({ id: tag.id, origin: tag.name })) || [];
			const entityTagIds = entityTagIdsByEntityId[release.entity?.id]?.map((tag) => ({ id: tag.id, original: tag.name })) || [];
			const releaseTags = release.tags?.filter(Boolean) || [];

			const releaseTagsWithIds = releaseTags.every((tag) => typeof tag === 'number')
@@ -152,9 +158,9 @@ async function associateReleaseTags(releases, type = 'release') {
	}

	const tagIdsBySlug = await matchReleaseTags(releases);
	const EntityTagIdsByEntityId = await getEntityTags(releases);
	const entityTagIdsByEntityId = await getEntityTags(releases);

	const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId, type);
	const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId, type);

	await bulkInsert(`${type}s_tags`, tagAssociations, false);
}
1721
src/tools/analvids.js
Normal file
File diff suppressed because it is too large
18
src/tools/knex-update.js
Normal file
@@ -0,0 +1,18 @@
'use strict';

const knex = require('../knex');

async function update() {
	const query = knex('bans')
		.update('type', {
			type: 'mute',
			username_original: 'charles',
		})
		.where('id', 2754);

	console.log(query.toSQL());

	await query;
}

update();
40
src/tools/realitykings.js
Normal file
@@ -0,0 +1,40 @@
'use strict';

const fetch = require('node-fetch');
const express = require('express');

async function init() {
	const res = await fetch('https://www.realitykings.com/scenes?site=45', {
		method: 'HEAD',
		headers: {
			'user-agent': 'HTTPie/2.6.0',
			'accept-encoding': 'gzip, deflate, br',
			accept: '*/*',
			connection: 'keep-alive',
		},
	});

	console.log(res.status, res.headers);

	const app = express();

	app.get('/', (appReq, appRes) => {
		console.log(appReq.headers);
		appRes.status(204).send();
	});

	app.listen(8000, () => {
		console.log('Listening on port 8000');

		fetch('http://127.0.0.1:8000', {
			headers: {
				'user-agent': 'HTTPie/2.6.0',
				'accept-encoding': 'gzip, deflate, br',
				accept: '*/*',
				connection: 'keep-alive',
			},
		});
	});
}

init();
@@ -8,6 +8,7 @@ const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { curateRelease } = require('./releases');
const chunk = require('./utils/chunk');
const include = require('./utils/argv-include')(argv);
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
const { fetchIncludedEntities } = require('./entities');
@@ -38,22 +39,27 @@ function filterLocalUniqueReleases(releases, accReleases) {
}

async function filterUniqueReleases(releases) {
	const releaseIdentifiers = releases
		.map((release) => [release.entity.id, release.entryId]);
	const releaseIdentifierChunks = chunk(releases.map((release) => [release.entity.id, release.entryId.toString()]));

	const duplicateReleaseEntries = await knex('releases')
		.select(knex.raw('releases.*, row_to_json(entities) as entity'))
		.leftJoin('entities', 'entities.id', 'releases.entity_id')
		.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
		.where((builder) => {
			// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
			builder
				.where('deep', true) // scene is already deep scraped
				.orWhereNull('date')
				.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
				.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
				.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
		});
	const duplicateReleaseEntryChunks = await Promise.map(releaseIdentifierChunks, async (releaseIdentifiers) => {
		const duplicateReleaseEntriesQuery = knex('releases')
			.select(knex.raw('releases.*, row_to_json(entities) as entity'))
			.leftJoin('entities', 'entities.id', 'releases.entity_id')
			.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
			.where((builder) => {
				// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
				builder
					.where('deep', true) // scene is already deep scraped
					.orWhereNull('date')
					.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
					.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
					.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
			});

		return duplicateReleaseEntriesQuery;
	}, { concurrency: 10 });

	const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();

	const duplicateReleases = duplicateReleaseEntries.map((release) => curateRelease(release));
	const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
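
The chunk helper required above is not included in this diff. A minimal sketch of what such a utility might look like, assuming it splits an array into fixed-size slices (the real default size is not visible here):

'use strict';

// Hypothetical sketch of src/utils/chunk.js; the default size of 1000 is an assumption.
function chunk(items, size = 1000) {
	return Array.from(
		{ length: Math.ceil(items.length / size) },
		(value, index) => items.slice(index * size, (index + 1) * size),
	);
}

module.exports = chunk;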
@@ -261,8 +267,21 @@ async function scrapeNetworkSequential(networkEntity) {
	return releases.uniqueReleases;
}

async function getBeforeNetwork(networkEntity) {
	try {
		const parameters = getRecursiveParameters(networkEntity);
		return await networkEntity.scraper?.beforeNetwork?.(networkEntity, parameters);
	} catch (error) {
		if (networkEntity.scraper?.requireBeforeNetwork === false) {
			return null;
		}

		throw error;
	}
}

async function scrapeNetworkParallel(networkEntity) {
	const beforeNetwork = await networkEntity.scraper.beforeNetwork?.(networkEntity);
	const beforeNetwork = await getBeforeNetwork(networkEntity);

	return Promise.map(
		networkEntity.includedChildren,
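
getBeforeNetwork treats the beforeNetwork hook as optional unless the scraper insists on it. A hypothetical scraper using this hook might be shaped like the sketch below; only the property names beforeNetwork and requireBeforeNetwork come from the diff, everything else is illustrative:

// Hypothetical scraper module; the two hook properties are taken from the diff above.
module.exports = {
	// with requireBeforeNetwork set to false, a throwing beforeNetwork yields null instead of aborting the network scrape
	requireBeforeNetwork: false,
	async beforeNetwork(networkEntity, parameters) {
		// e.g. prepare session state shared by all of the network's child channels
		return { entity: networkEntity.slug, parameters, startedAt: new Date() };
	},
};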
3
src/utils/http-windows.js
Normal file
@@ -0,0 +1,3 @@
'use strict';

module.exports = new Map();
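
The new http-windows module is just a shared Map, keyed by pathname below. A consumer that releases the JSDOM windows registered during deep scrapes might look roughly like this sketch (function name and call site are assumptions):

'use strict';

const windows = require('./http-windows');

// Close and forget every registered JSDOM window to release its memory.
function flushWindows() {
	for (const [pathname, window] of windows.entries()) {
		window.close();
		windows.delete(pathname);
	}
}

module.exports = flushWindows;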
@@ -3,12 +3,15 @@
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom');

const windows = require('./http-windows');

const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
@@ -78,8 +81,6 @@ function getLimiter(options = {}, url) {
		});
	}

	limiters[interval][concurrency].on('queued', () => logger.silly(`Queued ${url}`));

	return limiters[interval][concurrency];
}
@@ -116,12 +117,23 @@ async function finalizeResult(res, options) {
	if (Buffer.isBuffer(res.body)) {
		const html = res.body.toString();
		const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
		const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');

		// allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper
		if (window && /fetchScene|fetchMovie/.test(new Error().stack)) {
			windows.set(pathname, window);
		}

		if (argv.saveHtml) {
			await fs.writeFile(`./html/${pathname}.html`, html);
		}

		return {
			...res,
			body: html,
			html,
			status: res.statusCode,
			headers: res.headers,
			document: window?.document || null,
			window,
			ok: res.statusCode >= 200 && res.statusCode <= 299,
@@ -132,6 +144,7 @@ async function finalizeResult(res, options) {
		...res,
		body: res.body,
		status: res.statusCode,
		headers: res.headers,
		ok: res.statusCode >= 200 && res.statusCode <= 299,
	};
}
38
src/utils/jsdom-perf.js
Normal file
@@ -0,0 +1,38 @@
'use strict';

const util = require('util');
const fs = require('fs').promises;
const Promise = require('bluebird');
const { JSDOM } = require('jsdom');

const waitImmediate = util.promisify(setImmediate);

async function init() {
	let peak = 0;
	const files = await fs.readdir('./html');

	// const dom = new JSDOM('<html><body></body></html>', { runScripts: 'dangerously' });

	await Promise.map(Array.from({ length: 10 }).map(() => files).flat(), async (filename) => {
		const html = await fs.readFile(`./html/${filename}`, 'utf8');
		const dom = new JSDOM(html);

		// dom.window.document.body.innerHTML = html;
		dom.window.close();

		const usage = process.memoryUsage.rss() / 1000000;
		peak = Math.max(usage, peak);

		console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);

		await waitImmediate();
	}, {
		concurrency: 1,
	});

	await Promise.delay(2000);

	console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}

init();
@@ -6,7 +6,7 @@ const fsPromises = require('fs').promises;
const Promise = require('bluebird');
const blake2 = require('blake2');
const sharp = require('sharp');
const nanoid = require('nanoid');
const { nanoid } = require('nanoid');
const { PassThrough } = require('stream');

const http = require('./http');
103
src/utils/qu.js
@@ -16,6 +16,10 @@ function trim(str) {
}

function extractDate(dateString, format, match) {
	if (!dateString) {
		return null;
	}

	if (match) {
		const dateStamp = trim(dateString).match(match);
@@ -80,32 +84,69 @@ function prefixUrl(urlValue, origin, protocol = 'https') {
	return urlValue;
}

function q(context, selector, attrArg, applyTrim = true) {
	if (!selector && context.nodeName === '#document') {
function iterateXPathResult(iterator, results = []) {
	const element = iterator.iterateNext();

	if (element) {
		return iterateXPathResult(iterator, [...results, element]);
	}

	return results;
}

function getElements(context, selector, first = false) {
	if (!selector) {
		return context;
	}

	if (/^\/\//.test(selector)) {
		// XPath selector
		const iterator = globalWindow.document.evaluate(selector, context, null, globalWindow.XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);

		if (first) {
			return iterator.iterateNext();
		}

		return iterateXPathResult(iterator);
	}

	if (first) {
		return context.querySelector(selector);
	}

	return Array.from(context.querySelectorAll(selector));
}

function q(context, selectors, attrArg, applyTrim = true) {
	if (!selectors && context.nodeName === '#document') {
		return null;
	}

	const attr = attrArg === true ? 'textContent' : attrArg;
	const element = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector, true), null);

	if (!element) {
		return null;
	}

	if (attr) {
		const value = selector
			? context.querySelector(selector)?.[attr] || context.querySelector(selector)?.attributes[attr]?.value
			: context[attr] || context.getAttribute(attr);
		const value = element[attr] || element.getAttribute(attr);

		return applyTrim && typeof value === 'string' ? trim(value) : value;
	}

	return selector ? context.querySelector(selector) : context;
	return element;
}

function all(context, selector, attrArg, applyTrim = true) {
function all(context, selectors, attrArg, applyTrim = true) {
	const attr = attrArg === true ? 'textContent' : attrArg;
	const elements = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector), null);

	if (attr) {
		return Array.from(context.querySelectorAll(selector), (el) => q(el, null, attr, applyTrim));
		return elements.map((el) => q(el, null, attr, applyTrim));
	}

	return Array.from(context.querySelectorAll(selector));
	return elements;
}
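
After this rewrite, q and all accept either a single selector or an array of fallback selectors, and any selector starting with // is evaluated as XPath via getElements. A hedged usage sketch (the selectors and attribute names are illustrative, not from this codebase):

// Assuming a parsed JSDOM document; the first selector in the array that matches wins.
const title = q(document, ['h1.title', '//h1'], true); // `true` extracts textContent
const urls = all(document, 'a.scene-link', 'href'); // returns the href of every match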

function exists(context, selector) {
@@ -130,6 +171,42 @@ function html(context, selector) {
	return el && el.innerHTML;
}

function htmls(context, selector) {
	const els = all(context, selector, null, true);

	return els.map((el) => el.innerHTML);
}

function execute(context, selector = 'script') {
	const scripts = htmls(context, selector);
	const originalGlobal = Object.fromEntries(Object.entries(global));

	const errors = scripts?.reduce((accErrors, script) => {
		try {
			Function(script)(); /* eslint-disable-line no-new-func */

			return accErrors;
		} catch (error) {
			// the script failed
			return [...accErrors, error];
		}
	}, []);

	const data = Object.fromEntries(Object.entries(global).filter(([key, value]) => {
		if (originalGlobal[key] !== value) {
			delete global[key];
			return true;
		}

		return false;
	}));

	return {
		...data,
		errors,
	};
}
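
execute runs each matched inline script through the Function constructor and collects whatever new globals the scripts leave behind, along with any errors. A hypothetical example, assuming a page whose script assigns an implicit global:

// If the document contains <script>__INITIAL_STATE__ = { scene: { id: 1 } };</script>,
// the leaked global is captured and removed from `global` again:
const { __INITIAL_STATE__, errors } = execute(document);

if (errors.length === 0) {
	console.log(__INITIAL_STATE__.scene.id); // 1
}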

function json(context, selector) {
	const el = q(context, selector, null, true);
@@ -152,12 +229,6 @@ function jsons(context, selector) {
	});
}

function htmls(context, selector) {
	const els = all(context, selector, null, true);

	return els.map((el) => el.innerHTML);
}

function texts(context, selector, applyTrim = true, filter = true) {
	const el = q(context, selector, null, applyTrim);
	if (!el) return null;
@@ -425,6 +496,8 @@ const quFuncs = {
	duration,
	el: q,
	element: q,
	execute,
	exec: execute,
	exists,
	html,
	htmls,
@@ -3,7 +3,7 @@
const config = require('config');
const AWS = require('aws-sdk');
const fs = require('fs');
const nanoid = require('nanoid');
const { nanoid } = require('nanoid');

async function init() {
	const filepath = './public/img/sfw/animals/j0iiByCxGfA.jpeg';
@@ -33,8 +33,8 @@ async function upsert(table, items, identifier = ['id'], _knex) {
	logger.debug(`${table}: Updating ${update.length}`);

	const [inserted, updated] = await Promise.all([
		knex(table).returning('*').insert(insert),
		knex.transaction(async (trx) => Promise.all(update.map((item) => {
		insert.length > 0 ? knex(table).returning('*').insert(insert) : [],
		update.length > 0 ? knex.transaction(async (trx) => Promise.all(update.map((item) => {
			const clause = identifiers.reduce((acc, identifierX) => ({ ...acc, [identifierX]: item[identifierX] }), {});

			return trx
@@ -42,7 +42,7 @@ async function upsert(table, items, identifier = ['id'], _knex) {
			.update(item)
			.into(table)
			.returning('*');
	}))),
	}))) : [],
	]);

	return {