Merge branch 'master' into experimental

DebaucheryLibrarian committed 2022-03-30 23:00:29 +02:00
1105 changed files with 6700 additions and 1561 deletions

View File

@@ -14,6 +14,6 @@
"prefer-destructuring": "off",
"template-curly-spacing": "off",
"object-curly-newline": "off",
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}],
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
}
}

View File

@@ -20,6 +20,7 @@ const scrapers = require('./scrapers/scrapers').actors;
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const bulkInsert = require('./utils/bulk-insert');
+const chunk = require('./utils/chunk');
const logger = require('./logger')(__filename);
const { toBaseReleases } = require('./deep');
@@ -1048,33 +1049,42 @@ async function flushProfiles(actorIdsOrNames) {
logger.info(`Removed ${deleteCount} profiles`);
}
-async function deleteActors(actorIdsOrNames) {
-	const actors = await knex('actors')
-		.whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number'))
-		.orWhere((builder) => {
-			builder
-				.whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string'))
-				.whereNull('entity_id');
-		});
+async function deleteActors(allActorIdsOrNames) {
+	const deleteCounts = await Promise.map(chunk(allActorIdsOrNames), async (actorIdsOrNames) => {
+		const actors = await knex('actors')
+			.whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number'))
+			.orWhere((builder) => {
+				builder
+					.whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string'))
+					.whereNull('entity_id');
+			});
-	const actorIds = actors.map((actor) => actor.id);
+		const actorIds = actors.map((actor) => actor.id);
-	const sceneIds = await knex('releases_actors')
-		.select('releases.id')
-		.whereIn('actor_id', actorIds)
-		.leftJoin('releases', 'releases.id', 'releases_actors.release_id')
-		.pluck('id');
+		const sceneIds = await knex('releases_actors')
+			.select('releases.id')
+			.whereIn('actor_id', actorIds)
+			.leftJoin('releases', 'releases.id', 'releases_actors.release_id')
+			.pluck('id');
-	const [deletedScenesCount, deletedActorsCount] = await Promise.all([
-		deleteScenes(sceneIds),
-		knex('actors')
-			.whereIn('id', actorIds)
-			.delete(),
-	]);
+		const [deletedScenesCount, deletedActorsCount] = await Promise.all([
+			deleteScenes(sceneIds),
+			knex('actors')
+				.whereIn('id', actorIds)
+				.delete(),
+		]);
+
+		return { deletedScenesCount, deletedActorsCount };
+	}, { concurrency: 10 });
+
+	const deletedActorsCount = deleteCounts.reduce((acc, count) => acc + count.deletedActorsCount, 0);
+	const deletedScenesCount = deleteCounts.reduce((acc, count) => acc + count.deletedScenesCount, 0);
await flushOrphanedMedia();
logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`);
return deletedActorsCount;
}
async function flushActors() {

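Note: the `chunk` helper required above is not included in this diff. A minimal sketch of what `./utils/chunk` presumably provides, assuming the conventional split-into-slices behaviour (the default slice size of 1000 is illustrative, not taken from the source):

'use strict';

// split an array into slices of at most chunkSize items, so large ID lists
// can be processed in batches by Promise.map and knex whereIn()
module.exports = function chunk(items, chunkSize = 1000) {
	return Array.from(
		{ length: Math.ceil(items.length / chunkSize) },
		(value, index) => items.slice(index * chunkSize, (index + 1) * chunkSize),
	);
};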
View File

@@ -1,5 +1,6 @@
'use strict';
const config = require('config');
const util = require('util');
// const log = require('why-is-node-running');
const Inspector = require('inspector-api');
@@ -24,33 +25,64 @@ const getFileEntries = require('./utils/file-entries');
const inspector = new Inspector();
let done = false;
-function logActive() {
-	console.log('log active!');
-
-	setTimeout(() => {
-		// log();
-		logActive();
-	}, typeof argv.logActive === 'number' ? argv.logActive : 60000);
-}
+/*
+function monitorMemory() {
+	logger.debug(`Memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
+
+	if (!done) {
+		setTimeout(() => monitorMemory(), 10000);
+	}
+}
+
+function logActive() {
+	setTimeout(() => {
+		log();
+
+		if (!done) {
+			logActive();
+		}
+	}, typeof argv.logActive === 'number' ? argv.logActive : 60000);
+}
+*/
-async function stopMemorySample() {
+async function snapshotMemory(trigger) {
+	const profile = await inspector.heap.takeSnapshot();
+	const filepath = `traxxx_snapshot_${trigger}M_${dayjs().format('YYYY-MM-DD_HH-mm-ss')}.heapsnapshot`;
+
+	logger.info(`Starting heap snapshot, memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
+
+	await inspector.heap.disable();
+	await fs.writeFile(filepath, JSON.stringify(profile));
+
+	logger.info(`Saved heap snapshot to ${filepath}`);
+}
+
+async function stopMemorySample(snapshotTriggers) {
+	const usage = process.memoryUsage.rss() / 1000000;
const profile = await inspector.heap.stopSampling();
-	const filepath = `${dayjs().format('YYYY-MM-DD_HH-mm-ss')}.heapprofile`;
+	const filepath = `traxxx_sample_${dayjs().format('YYYY-MM-DD_HH-mm-ss')}.heapprofile`;
await inspector.heap.disable();
await fs.writeFile(filepath, JSON.stringify(profile));
logger.info(`Saved heap sample to ${filepath}`);
+	if (usage > snapshotTriggers[0]) {
+		await snapshotMemory(snapshotTriggers[0]);
+
+		return snapshotTriggers.slice(1);
+	}
+
+	return snapshotTriggers;
-async function startMemorySample() {
+async function startMemorySample(snapshotTriggers = []) {
+	await inspector.heap.enable();
+	await inspector.heap.startSampling();
+
+	const usage = process.memoryUsage.rss() / 1000000;
+
+	logger.info(`Start heap sampling, memory usage: ${usage} MB`);
+
+	setTimeout(async () => {
+		const newSnapshotTriggers = await stopMemorySample(snapshotTriggers);
+
+		if (!done) {
+			await startMemorySample(newSnapshotTriggers);
+		}
+	}, config.memorySampling.sampleDuration);
+}
@@ -72,19 +104,19 @@ async function startMemorySample() {
async function init() {
try {
-		if (argv.memory) {
-			await startMemorySample();
-		}
-
-		if (argv.logActive) {
-			logActive();
-		}
if (argv.server) {
await initServer();
return;
}
+		if (argv.sampleMemory) {
+			await startMemorySample(config.memorySampling.snapshotIntervals);
+		}
+
+		if (argv.logActive) {
+			// logActive();
+		}
if (argv.updateSearch) {
await Promise.all([
updateSceneSearch(),
@@ -157,8 +189,13 @@ async function init() {
? await fetchScenes([...(sceneUrls), ...(updateBaseScenes || []), ...(actorBaseScenes || [])])
: [...(updateBaseScenes || []), ...(actorBaseScenes || [])];
-		const sceneMovies = deepScenes ? deepScenes.filter((scene) => scene.movie).map((scene) => ({ ...scene.movie, entity: scene.entity })) : [];
-		const deepMovies = argv.sceneMovies || argv.movie ? await fetchMovies([...(argv.movie || []), ...(sceneMovies || [])]) : sceneMovies;
+		const storedScenes = argv.save ? await storeScenes(deepScenes) : [];
+
+		const moviesFromFile = argv.moviesFile && await getFileEntries(argv.moviesFile);
+		const movieUrls = (argv.movie || []).concat(moviesFromFile || []);
+
+		const sceneMovies = deepScenes && argv.sceneMovies ? deepScenes.filter((scene) => scene.movie).map((scene) => ({ ...scene.movie, entity: scene.entity })) : [];
+		const deepMovies = argv.sceneMovies || argv.movie || movieUrls ? await fetchMovies([...movieUrls, ...(sceneMovies || [])]) : sceneMovies;
const movieScenes = argv.movieScenes ? deepMovies.map((movie) => movie.scenes?.map((scene) => ({ ...scene, movie, entity: movie.entity }))).flat().filter(Boolean) : [];
const deepMovieScenes = argv.deep ? await fetchScenes(movieScenes) : movieScenes;
@@ -169,10 +206,10 @@ async function init() {
}
if (argv.save) {
-			const storedMovies = await storeMovies(deepMovies);
-			const storedScenes = await storeScenes([...(deepScenes || []), ...(deepMovieScenes || [])]);
+			const storedMovies = await storeMovies(deepMovies, storedScenes[0]?.batchId);
+			const storedMovieScenes = await storeScenes(deepMovieScenes, storedScenes[0]?.batchId);

-			await associateMovieScenes(storedMovies, storedScenes);
+			await associateMovieScenes(storedMovies, [...storedScenes, ...storedMovieScenes]);
}
} catch (error) {
logger.error(error);

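Note: the memory sampling code above reads `config.memorySampling`, which is not part of this diff. A plausible shape for that config block, inferred from the keys used (`enabled`, `sampleDuration`, `snapshotIntervals`); the values shown are illustrative only:

module.exports = {
	memorySampling: {
		enabled: false, // default for the --sampleMemory CLI option
		sampleDuration: 300000, // ms before a heap sample is stopped and written out
		snapshotIntervals: [1000, 2000, 4000], // RSS thresholds in MB that trigger a full heap snapshot
	},
};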
View File

@@ -107,6 +107,11 @@ const { argv } = yargs
describe: 'Scrape movie info from URL',
type: 'array',
})
+	.option('movie-file', {
+		describe: 'Scrape movie info from URLs in a file',
+		type: 'string',
+		alias: 'movies-file',
+	})
.option('deep', {
describe: 'Fetch details for all releases',
type: 'boolean',
@@ -233,6 +238,7 @@ const { argv } = yargs
default: false,
})
.option('level', {
+		alias: 'log-level',
describe: 'Log level',
type: 'string',
default: process.env.NODE_ENV === 'development' ? 'silly' : 'info',
@@ -247,6 +253,12 @@ const { argv } = yargs
type: 'boolean',
default: process.env.NODE_ENV === 'development',
})
+	.option('sampleMemory', {
+		alias: 'memory',
+		describe: 'Take memory allocation samples, and snapshots at configured intervals',
+		type: 'boolean',
+		default: config.memorySampling.enabled,
+	})
.option('update-search', {
describe: 'Update search documents for all releases.',
type: 'boolean',

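Note: a hypothetical invocation combining the options added above (the entry point path and file name are illustrative; --movies-file and --log-level are the aliases defined here, and --sampleMemory can also be passed as --memory):

node src/init.js --movies-file ./movies.txt --sampleMemory --log-level verbose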
View File

@@ -1,5 +1,6 @@
'use strict';
+const util = require('util');
const Promise = require('bluebird');
const { mergeAdvanced: merge } = require('object-merge-advanced');
@@ -9,6 +10,9 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename);
const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
+const windows = require('./utils/http-windows');
+
+const waitImmediate = util.promisify(setImmediate);
function toBaseReleases(baseReleasesOrUrls, entity = null) {
if (!baseReleasesOrUrls) {
@@ -50,12 +54,12 @@ function toBaseReleases(baseReleasesOrUrls, entity = null) {
.filter(Boolean);
}
-async function fetchScene(scraper, url, entity, baseRelease, options) {
-	if (scraper.fetchScene) {
-		return scraper.fetchScene(baseRelease.url, entity, baseRelease, options, null);
+async function fetchScene(scraper, url, entity, baseRelease, options, type = 'scene') {
+	if ((type === 'scene' && scraper.fetchScene) || (type === 'movie' && scraper.fetchMovie)) {
+		return scraper[type === 'movie' ? 'fetchMovie' : 'fetchScene'](baseRelease.url, entity, baseRelease, options, null);
}
-	if (scraper.scrapeScene) {
+	if ((type === 'scene' && scraper.scrapeScene) || (type === 'movie' && scraper.scrapeMovie)) {
const session = qu.session();
const res = await qu.get(url, null, null, {
@@ -66,7 +70,7 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
const cookie = await session._sessionOptions.cookieJar.get(url);
if (res.ok) {
-			return scraper.scrapeScene(res.item, url, entity, baseRelease, options, {
+			return scraper[type === 'movie' ? 'scrapeMovie' : 'scrapeScene'](res.item, url, entity, baseRelease, options, {
session,
headers: res.headers,
cookieJar: session._sessionOptions.cookieJar,
@@ -80,6 +84,10 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
return null;
}
+function fetchMovie(scraper, url, entity, baseRelease, options) {
+	return fetchScene(scraper, url, entity, baseRelease, options, 'movie');
+}
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
@@ -102,7 +110,7 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
return baseRelease;
}
-	if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie)) {
+	if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie && !layoutScraper.scrapeMovie)) {
logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`);
return baseRelease;
}
@@ -116,9 +124,28 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
parameters: getRecursiveParameters(entity),
};
+	logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
const rawScrapedRelease = type === 'scene'
-		? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
-		: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
+		? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options)
+		: await fetchMovie(layoutScraper, baseRelease.url, entity, baseRelease, options);
+	const pathname = baseRelease.path || (baseRelease.url && new URL(baseRelease.url).pathname.replace(/\//g, '_'));
+
+	if (rawScrapedRelease) {
+		delete rawScrapedRelease.query; // some scrapers pass the qu-wrapped window instance to parent scrapers, filling up memory
+	}
+
+	if (windows.has(pathname)) {
+		logger.debug(`Closing window for ${pathname}`);
+
+		windows.get(pathname).close();
+		windows.delete(pathname);
+	}
+
+	await waitImmediate(); // waitImmediate is a promisified function and must be called
+
+	logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
@@ -173,12 +200,13 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
if (entity.scraper?.beforeFetchScenes) {
-			const preData = await entity.scraper.beforeFetchScenes(entity);
+			const parameters = getRecursiveParameters(entity);
+			const preData = await entity.scraper.beforeFetchScenes(entity, parameters);
return [slug, { ...entity, preData }];
}
-		return null;
+		return [slug, entity];
}));
const entitiesWithBeforeDataBySlug = Object.fromEntries(entitiesWithBeforeDataEntries.filter(Boolean));
@@ -186,7 +214,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
return Promise.map(
baseReleases,
async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
-		{ concurrency: 10 },
+		{ concurrency: 1 },
);
}

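Note: `./utils/http-windows` is required above but not shown in this diff. Judging from the calls made on it (`has`, `get(...).close()`, `delete`), a minimal sketch would be a shared Map of open window instances keyed by URL pathname:

'use strict';

// shared registry of qu/JSDOM window instances opened while scraping,
// keyed by pathname so deep.js can close them once a release is done
module.exports = new Map();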
View File

@@ -6,7 +6,7 @@ const inquirer = require('inquirer');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const knex = require('./knex');
-const { deleteScenes, deleteMovies } = require('./releases');
+const { deleteScenes, deleteMovies, deleteSeries } = require('./releases');
const { flushOrphanedMedia } = require('./media');
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
@@ -236,7 +236,7 @@ async function fetchReleaseEntities(baseReleases) {
.filter(Boolean),
));
-	return fetchEntitiesBySlug(entitySlugs);
+	return fetchEntitiesBySlug(entitySlugs, 'desc');
}
async function fetchEntity(entityId, type) {
@@ -359,29 +359,39 @@ async function flushEntities(networkSlugs = [], channelSlugs = []) {
.leftJoin('movies', 'movies.entity_id', 'selected_entities.id')
.pluck('movies.id');
-	if (sceneIds.length === 0 && movieIds.length === 0) {
-		logger.info(`No scenes or movies found to remove for ${entitySlugs}`);
+	const serieIds = await entityQuery
+		.clone()
+		.select('series.id')
+		.distinct('series.id')
+		.whereNotNull('series.id')
+		.from('selected_entities')
+		.leftJoin('series', 'series.entity_id', 'selected_entities.id')
+		.pluck('series.id');
+
+	if (sceneIds.length === 0 && movieIds.length === 0 && serieIds.length === 0) {
+		logger.info(`No scenes, movies or series found to remove for ${entitySlugs}`);
return;
}
const confirmed = await inquirer.prompt([{
type: 'confirm',
name: 'flushEntities',
-		message: `You are about to remove ${sceneIds.length} scenes and ${movieIds.length} movies for ${entitySlugs}. Are you sure?`,
+		message: `You are about to remove ${sceneIds.length} scenes, ${movieIds.length} movies and ${serieIds.length} series for ${entitySlugs}. Are you sure?`,
default: false,
}]);
if (!confirmed.flushEntities) {
-		logger.warn(`Confirmation rejected, not flushing scenes or movies for: ${entitySlugs}`);
+		logger.warn(`Confirmation rejected, not flushing scenes, movies or series for: ${entitySlugs}`);
return;
}
-	const [deletedScenesCount, deletedMoviesCount] = await Promise.all([
+	const [deletedScenesCount, deletedMoviesCount, deletedSeriesCount] = await Promise.all([
deleteScenes(sceneIds),
deleteMovies(movieIds),
+		deleteSeries(serieIds),
]);
-	logger.info(`Removed ${deletedScenesCount} scenes and ${deletedMoviesCount} movies for ${entitySlugs}`);
+	logger.info(`Removed ${deletedScenesCount} scenes, ${deletedMoviesCount} movies and ${deletedSeriesCount} series for ${entitySlugs}`);
await flushOrphanedMedia();
}

View File

@@ -7,7 +7,7 @@ const fs = require('fs');
const fsPromises = require('fs').promises;
const path = require('path');
const stream = require('stream');
-const nanoid = require('nanoid/non-secure');
+const { nanoid } = require('nanoid/non-secure');
const mime = require('mime');
// const fileType = require('file-type');
const ffmpeg = require('fluent-ffmpeg');
@@ -345,12 +345,13 @@ async function writeImage(image, media, info, filepath, isProcessed) {
return;
}
-	if (isProcessed) {
-		// convert to JPEG and write to permanent location
-		await image
-			.jpeg()
-			.toFile(path.join(config.media.path, filepath));
-	}
+	await image
+		.resize({
+			height: config.media.maxSize,
+			withoutEnlargement: true,
+		})
+		.jpeg({ quality: config.media.quality })
+		.toFile(path.join(config.media.path, filepath));
}
async function writeThumbnail(image, thumbpath) {
@@ -377,6 +378,7 @@ async function writeLazy(image, lazypath) {
async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath, options) {
logger.silly(`Storing permanent media files for ${media.id} from ${media.src} at ${filepath}`);
+	logger.debug(`Memory usage at image storage: ${process.memoryUsage.rss() / 1000000} MB (${media.src})`);
try {
const thumbdir = config.s3.enabled ? path.join(media.role, 'thumbs') : path.join(media.role, 'thumbs', hashDir, hashSubDir);
@@ -415,12 +417,14 @@ async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, fil
});
}
-		await writeImage(image, media, info, filepath, isProcessed);
+		await Promise.all([
+			writeImage(image, media, info, filepath, isProcessed),
+			writeThumbnail(image, thumbpath),
+			writeLazy(image, lazypath),
+		]);
+		/*
if (isProcessed) {
// file already stored, remove temporary file
await fsPromises.unlink(media.file.path);
@@ -428,6 +432,9 @@ async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, fil
// image not processed, simply move temporary file to final location
await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
}
+		*/
+		await fsPromises.unlink(media.file.path);
if (config.s3.enabled) {
await Promise.all([
@@ -580,6 +587,7 @@ async function fetchSource(source, baseMedia) {
const maxAttempts = source.attempts || 3;
logger.silly(`Fetching media from ${source.src}`);
+	logger.debug(`Memory usage before media fetch: ${process.memoryUsage.rss() / 1000000} MB (${source.src})`);
async function attempt(attempts = 1) {
const hasher = new blake2.Hash('blake2b', { digestLength: 24 });
@@ -746,7 +754,8 @@ async function storeMedias(baseMedias, options) {
const fetchedMedias = await Promise.map(
baseMedias,
async (baseMedia) => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
-		{ concurrency: 100 }, // don't overload disk (or network, although this has its own throttling)
+		// { concurrency: 100 }, // don't overload disk (or network, although this has its own throttling)
+		{ concurrency: 10 }, // don't overload disk (or network, although this has its own throttling)
);
const { uniqueHashMedias, existingHashMedias } = await findHashDuplicates(fetchedMedias);
@@ -823,11 +832,12 @@ async function associateReleaseMedia(releases, type = 'release') {
.reduce((acc, [releaseId, releaseBaseMedias]) => {
releaseBaseMedias.forEach((baseMedia) => {
const media = storedMediasById[baseMedia.id];
+					const mediaId = media?.use || media?.entry?.id;
+
-					if (media) {
+					if (mediaId) {
acc.push({
[`${type}_id`]: releaseId,
-							media_id: media.use || media.entry.id,
+							media_id: mediaId,
});
}
});
@@ -840,7 +850,10 @@ async function associateReleaseMedia(releases, type = 'release') {
await bulkInsert(`${type}s_${role}`, associations, false);
}
} catch (error) {
-				logger.error(util.inspect(error.entries, null, null, { color: true }));
+				if (error.entries) {
+					logger.error(util.inspect(error.entries, null, null, { color: true }));
+				}
+
logger.error(`Failed to store ${type} ${role}: ${error.message}`);
}
}, Promise.resolve());
@@ -948,9 +961,12 @@ async function flushOrphanedMedia() {
await deleteS3Objects(orphanedMedia.filter((media) => media.is_s3));
}
-	await fsPromises.rmdir(path.join(config.media.path, 'temp'), { recursive: true });
-	logger.info('Cleared temporary media directory');
+	try {
+		await fsPromises.rm(path.join(config.media.path, 'temp'), { recursive: true });
+		logger.info('Cleared temporary media directory');
+	} catch (error) {
+		logger.warn(`Failed to clear temporary media directory: ${error.message}`);
+	}
}
module.exports = {

View File

@@ -59,7 +59,7 @@ const releaseFields = `
slug
}
}
-		poster: chaptersPosterByChapterId {
+		poster: chaptersPoster {
media {
id
path
@@ -82,7 +82,7 @@ const releaseFields = `
}
}
}
-		poster: releasesPosterByReleaseId {
+		poster: releasesPoster {
media {
id
path
@@ -104,7 +104,7 @@ const releaseFields = `
size
}
}
-		trailer: releasesTrailerByReleaseId @include (if: $full) {
+		trailer: releasesTrailer @include (if: $full) {
media {
id
path
@@ -325,6 +325,24 @@ async function deleteMovies(movieIds) {
return deleteCount;
}
+async function deleteSeries(serieIds) {
+	if (serieIds.length === 0) {
+		return 0;
+	}
+
+	await knex('series_scenes')
+		.whereIn('serie_id', serieIds)
+		.delete();
+
+	const deleteCount = await knex('series')
+		.whereIn('id', serieIds)
+		.delete();
+
+	logger.info(`Removed ${deleteCount}/${serieIds.length} series`);
+
+	return deleteCount;
+}
async function flushScenes() {
const sceneIds = await knex('releases').select('id').pluck('id');
@@ -367,6 +385,27 @@ async function flushMovies() {
logger.info(`Removed ${deleteCount}/${movieIds.length} movies`);
}
+async function flushSeries() {
+	const serieIds = await knex('series').select('id').pluck('id');
+
+	const confirmed = await inquirer.prompt([{
+		type: 'confirm',
+		name: 'flushSeries',
+		message: `You are about to remove ${serieIds.length} series. Are you sure?`,
+		default: false,
+	}]);
+
+	if (!confirmed.flushSeries) {
+		logger.warn('Confirmation rejected, not flushing series');
+		return;
+	}
+
+	const deleteCount = await deleteSeries(serieIds);
+
+	await flushOrphanedMedia();
+
+	logger.info(`Removed ${deleteCount}/${serieIds.length} series`);
+}
async function flushBatches(batchIds) {
const [sceneIds, movieIds] = await Promise.all([
knex('releases')
@@ -407,8 +446,10 @@ module.exports = {
fetchScenes,
flushBatches,
flushMovies,
+	flushSeries,
flushScenes,
searchScenes,
deleteScenes,
deleteMovies,
+	deleteSeries,
};

View File

@@ -1,20 +1,20 @@
'use strict';
-const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma');
+const { fetchApiLatest, fetchApiUpcoming, fetchSceneApi, fetchApiProfile } = require('./gamma');
function curateRelease(release, site) {
if (['bubblegumdungeon', 'ladygonzo'].includes(site.slug)) {
return {
...release,
-			title: release.title.split(/:|\|/)[1].trim(),
+			title: release.title.split(/:|\|/)[1]?.trim(),
};
}
return release;
}
-async function networkFetchScene(url, site, release) {
-	const scene = await fetchScene(url, site, release);
+async function networkFetchScene(url, site, release, options) {
+	const scene = await fetchSceneApi(url, site, release, options);
return curateRelease(scene, site);
}

View File

@@ -34,13 +34,13 @@ function getPoster(posterElement, sceneId) {
if (typeof posterTimeRange === 'number') {
// poster time is already a single time value
-		return `https://legalporno.com/casting/${sceneId}/${posterTimeRange}`;
+		return `https://analvids.com/casting/${sceneId}/${posterTimeRange}`;
}
const [max, min] = posterTimeRange.split('-');
const posterTime = Math.floor(Math.random() * (Number(max) - Number(min) + 1) + Number(min));
-	return `https://legalporno.com/casting/${sceneId}/${posterTime}`;
+	return `https://analvids.com/casting/${sceneId}/${posterTime}`;
}
function scrapeAll(html) {
@@ -134,7 +134,7 @@ async function scrapeScene(html, url, site, useGallery) {
}
const studioName = $('.watchpage-studioname').first().text().trim();
-	release.studio = slugify(studioName, '');
+	release.studio = slugify(studioName, '', { removePunctuation: true });
return release;
}
@@ -181,7 +181,7 @@ async function fetchScene(url, site) {
}
async function fetchProfile({ name: actorName }) {
-	const res = await http.get(`https://www.legalporno.com/api/autocomplete/search?q=${actorName.replace(' ', '+')}`);
+	const res = await http.get(`https://www.analvids.com/api/autocomplete/search?q=${actorName.replace(' ', '+')}`);
const data = res.body;
const result = data.terms.find((item) => item.type === 'model');

View File

@@ -5,6 +5,7 @@ const qu = require('../utils/qu');
const { extractDate } = require('../utils/qu');
const { inchesToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
+const capitalize = require('../utils/capitalize');
const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
@@ -15,6 +16,10 @@ const genderMap = {
};
function getScreenUrl(item, scene) {
+	if (!scene.dvd?.id || !item?.screenId) {
+		return null;
+	}
+
return `https://i.bang.com/screenshots/${scene.dvd.id}/${scene.type}/${scene.order}/${item.screenId}.jpg`;
}
@@ -57,7 +62,7 @@ async function fetchPhotos(scene) {
async function scrapeScene(scene, entity, options) {
const release = {
entryId: scene.id,
-		title: scene.name,
+		title: scene.name || (scene.dvd?.name && scene.type === 'bonus' && capitalize(`${scene.dvd.name} - Bonus Scene ${scene.order || 1}`)) || null,
description: scene.description,
tags: scene.genres.concat(scene.actions).map((genre) => genre.name),
duration: scene.duration,
@@ -91,7 +96,7 @@ async function scrapeScene(scene, entity, options) {
}
}
-	release.trailer = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`;
+	release.teaser = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`;
release.channel = scene.series.name
.replace(/[! .]/g, '')
@@ -352,6 +357,11 @@ async function fetchUpcoming(site, page = 1) {
}
async function fetchScene(url, entity, baseRelease, options) {
+	if (baseRelease?.entryId) {
+		// overview and deep data is the same, don't hit server unnecessarily
+		return baseRelease;
+	}
+
const encodedId = new URL(url).pathname.split('/')[2];
const entryId = decodeId(encodedId);

View File

@@ -8,6 +8,7 @@ const logger = require('../logger')(__filename);
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const qu = require('../utils/qu');
+const args = require('../argv');
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -43,7 +44,7 @@ function scrape(html, site) {
});
}
-function scrapeLegacy(scenes, site) {
+function scrapeAllLegacy(scenes, site) {
return scenes.map(({ query }) => {
const release = {};
@@ -63,6 +64,38 @@ function scrapeLegacy(scenes, site) {
});
}
+function scrapeAllMembers(scenes, _channel) {
+	return scenes.map(({ query, el }) => {
+		const release = {};
+
+		const data = JSON.parse(query.q(el, null, 'data-shoot'));
+
+		release.entryId = data?.id || query.url('a.etLnk')?.match(/\d+$/)?.[0];
+		release.shootId = data?.code;
+
+		release.url = data.url ? qu.prefixUrl(data.url, 'https://members.bangbros.com') : query.url('a.etLnk');
+		release.title = data?.title || query.cnt('.etl-hdd');
+		release.description = data?.description || query.cnt('.etl-desc');
+
+		release.date = query.date('.etl-dt', 'MMM DD, YYYY', /\w{3} \d{1,2}, \d{4}/);
+
+		release.actors = data?.model.map((actor) => ({
+			name: actor.name,
+			url: qu.prefixUrl(actor.url, 'https://members.bangbros.com'),
+		}));
+
+		const rolloverUrl = query.q('.rollover-image', 'data-rollover-url');
+
+		release.poster = data?.image || query.img('.rollover-image', 'data-initial-image-url');
+
+		if (rolloverUrl) {
+			release.photos = Array.from({ length: 15 }, (value, index) => `${rolloverUrl}${index + 1}.jpg`);
+		}
+
+		release.trailer = data?.trailer;
+		release.tags = data?.tag.map((tag) => tag.name);
+
+		return release;
+	});
+}
/* no dates available, breaks database
function scrapeUpcoming(html, site) {
const { document } = ex(html);
@@ -147,6 +180,30 @@ function scrapeSceneLegacy({ query }, url) {
return release;
}
+function scrapeSceneMembers({ query }, url) {
+	const release = {};
+
+	release.entryId = new URL(url).pathname.match(/(\d+)\/?$/)[1];
+	release.shootId = query.img('.player img')?.match(/\/shoots\/(\w+)\//)?.[1];
+
+	release.title = query.cnt('.vdo-hdd1');
+	release.description = query.cnt('.ndcp');
+
+	release.actors = query.all('.vdsc a[href*="/model"]').map((actorEl) => ({
+		name: query.cnt(actorEl, 'span'),
+		url: query.url(actorEl, null, 'href', { origin: 'https://members.bangbros.com' }),
+		avatar: query.img(actorEl, 'img'),
+	}));
+
+	release.date = query.date('.ran:nth-child(2)', 'MMM DD, YYYY', /\w{3} \d{1,2}, \d{4}/);
+	release.duration = query.duration('.ran:nth-child(3)');
+
+	release.tags = query.cnts('.tag a[href*="/tags"]');
+	release.channel = slugify(query.cnt('.tag a[href*="/site"]'), '');
+
+	return release;
+}
function scrapeProfile(html, scope) {
const { query } = qu.ex(html);
const profile = {};
@@ -167,17 +224,6 @@ function scrapeProfileSearch(html, actorName) {
}
async function fetchLatest(site, page = 1) {
-	if (site.parameters?.legacy) {
-		const url = `${site.parameters?.latest || site.url}/videos/${page}`;
-		const res = await qu.getAll(url, '.videoList');
-
-		if (res.ok) {
-			return scrapeLegacy(res.items, site);
-		}
-
-		return res.status;
-	}
const res = await qu.get(`${site.parameters?.latest || site.url}/${page}`);
if (res.ok) {
@@ -187,6 +233,39 @@ async function fetchLatest(site, page = 1) {
return res.status;
}
+async function fetchLatestMembers(channel, page = 1, { parameters }) {
+	if (!parameters.product) {
+		throw new Error(`No member area product ID known for '${channel.name}'`);
+	}
+
+	if (!args.cookie) {
+		throw new Error(`Please specify --cookie "PHPSESSID=xxx" to access the '${channel.name}' members area.`);
+	}
+
+	const url = `https://members.bangbros.com/product/${parameters.product}/videos/latest/${page}`;
+	const res = await qu.getAll(url, '.thumbHolder .echThumb', {
+		cookie: args.cookie,
+	});
+
+	if (res.ok) {
+		return scrapeAllMembers(res.items, channel);
+	}
+
+	return res.status;
+}
+async function fetchLatestLegacy(site, page = 1) {
+	const url = `${site.parameters?.latest || site.url}/videos/${page}`;
+	const res = await qu.getAll(url, '.videoList');
+
+	if (res.ok) {
+		return scrapeAllLegacy(res.items, site);
+	}
+
+	return res.status;
+}
/*
async function fetchUpcoming(site) {
const res = await http.get('https://www.bangbros.com');
@@ -218,6 +297,26 @@ async function fetchScene(url, site, release) {
return scrapeScene(res.item.html, url, site);
}
+async function fetchSceneMembers(url, baseRelease, channel, { parameters }) {
+	if (!parameters.product) {
+		throw new Error(`No member area product ID known for '${channel.name}'`);
+	}
+
+	if (!args.cookie) {
+		throw new Error(`Please specify --cookie "PHPSESSID=xxx" to access the '${channel.name}' members area.`);
+	}
+
+	const res = await qu.get(url, null, {
+		cookie: args.cookie,
+	});
+
+	if (res.ok) {
+		return scrapeSceneMembers(res.item, url, channel);
+	}
+
+	return res.status;
+}
async function fetchProfile({ name: actorName }, scope) {
const actorSlug = slugify(actorName);
const url = `https://bangbros.com/search/${actorSlug}`;
@@ -242,5 +341,12 @@ module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
+	legacy: {
+		fetchLatest: fetchLatestLegacy,
+	},
+	members: {
+		fetchLatest: fetchLatestMembers,
+		fetchScene: fetchSceneMembers,
+	},
// fetchUpcoming, no dates available
};

View File

@@ -3,7 +3,6 @@
const Promise = require('bluebird');
const util = require('util');
const { JSDOM } = require('jsdom');
-const cheerio = require('cheerio');
const moment = require('moment');
const format = require('template-format');
@@ -25,6 +24,19 @@ function getApiUrl(appId, apiKey) {
};
}
+function getAvatarFallbacks(avatar) {
+	if (!avatar) {
+		return null;
+	}
+
+	return [
+		avatar.replace(/\d+x\d+/, '500x750'),
+		avatar.replace(/\d+x\d+/, '240x360'),
+		avatar.replace(/\d+x\d+/, '200x300'),
+		avatar,
+	];
+}
async function fetchApiCredentials(referer, site) {
if (site?.parameters?.appId && site?.parameters?.apiKey) {
return getApiUrl(site.parameters.appId, site.parameters.apiKey);
@@ -62,21 +74,19 @@ function getAlbumUrl(albumPath, site) {
}
async function fetchPhotos(url) {
-	const res = await http.get(url);
-
-	return res.body.toString();
+	const res = await qu.get(url);
+
+	return res.item;
}
-function scrapePhotos(html, includeThumbnails = true) {
-	const $ = cheerio.load(html, { normalizeWhitespace: true });
-
-	return $('.preview .imgLink, .pgFooterThumb a').toArray().map((linkEl) => {
-		const url = $(linkEl).attr('href');
+function scrapePhotos({ query }, includeThumbnails = true) {
+	return query.all('.preview .imgLink, .pgFooterThumb a').map((linkEl) => {
+		const url = linkEl.href;
if (/\/join|\/createaccount/.test(url)) {
// URL links to join page instead of full photo, extract thumbnail
// /createaccount is used by e.g. Tricky Spa native site
-			const src = $(linkEl).find('img').attr('src');
+			const src = query.img(linkEl);
if (/previews\//.test(src)) {
// resource often serves full photo at a modifier URL anyway, add as primary source
@@ -106,20 +116,18 @@ async function getPhotos(albumPath, site, includeThumbnails = true) {
}
try {
-		const html = await fetchPhotos(albumUrl);
-		const $ = cheerio.load(html, { normalizeWhitespace: true });
-		const photos = scrapePhotos(html, includeThumbnails);
+		const item = await fetchPhotos(albumUrl);
+		const photos = scrapePhotos(item, includeThumbnails);

-		const lastPage = $('.Gamma_Paginator a.last').attr('href')?.match(/\d+$/)[0];
+		const lastPage = item.query.url('.Gamma_Paginator a.last')?.match(/\d+$/)[0];
if (lastPage) {
const otherPages = Array.from({ length: Number(lastPage) }, (_value, index) => index + 1).slice(1);
const otherPhotos = await Promise.map(otherPages, async (page) => {
-				const pageUrl = `${albumUrl}/${page}`;
-				const pageHtml = await fetchPhotos(pageUrl);
-
-				return scrapePhotos(pageHtml, includeThumbnails);
+				const pageItem = await fetchPhotos(`${albumUrl}/${page}`);
+
+				return scrapePhotos(pageItem, includeThumbnails);
}, {
concurrency: 2,
});
@@ -169,10 +177,15 @@ async function getThumbs(entryId, site, parameters) {
});
if (res.ok && res.body.results?.[0]?.hits[0]?.set_pictures) {
-		return res.body.results[0].hits[0].set_pictures.map((img) => ([
-			`https://transform.gammacdn.com/photo_set${img.thumb_path}`,
-		]));
+		return res.body.results[0].hits[0].set_pictures.map((img) => img.thumb_path && ([
+			`https://images-fame.gammacdn.com/photo_set${img.thumb_path}`,
+			`https://images01-fame.gammacdn.com/photo_set${img.thumb_path}`,
+			`https://images02-fame.gammacdn.com/photo_set${img.thumb_path}`,
+			`https://images03-fame.gammacdn.com/photo_set${img.thumb_path}`,
+			`https://images04-fame.gammacdn.com/photo_set${img.thumb_path}`,
+			`https://images-evilangel.gammacdn.com/photo_set${img.thumb_path}`,
+			`https://transform.gammacdn.com/photo_set${img.thumb_path}`,
+		])).filter(Boolean);
}
return [];
@@ -187,6 +200,18 @@ async function getPhotosApi(entryId, site, parameters) {
return photos.concat(thumbs.slice(photos.length));
}
+function getImageSources(source) {
+	return [
+		`https://images-fame.gammacdn.com/movies${source}`,
+		`https://images01-fame.gammacdn.com/movies${source}`,
+		`https://images02-fame.gammacdn.com/movies${source}`,
+		`https://images03-fame.gammacdn.com/movies${source}`,
+		`https://images04-fame.gammacdn.com/movies${source}`,
+		`https://images-evilangel.gammacdn.com/movies${source}`,
+		`https://transform.gammacdn.com/movies${source}`,
+	];
+}
async function scrapeApiReleases(json, site) {
return json.map((scene) => {
if (site.parameters?.extract && scene.sitename !== site.parameters.extract) {
@@ -225,9 +250,17 @@ async function scrapeApiReleases(json, site) {
],
}));
+		/* master categories include e.g. 'transgender' for non-trans Wicked scenes
		release.tags = scene.master_categories
			.concat(scene.categories?.map((category) => category.name))
			.filter(Boolean); // some categories don't have a name
+		*/
+		release.tags = scene.categories?.map((category) => category.name).filter(Boolean); // some categories don't have a name
+
+		if (scene.availableOnSite.length > 1) {
+			release.comment = `Also available on ${scene.availableOnSite.filter((sisterSite) => sisterSite !== site.slug).join(', ')}`;
+		}
const posterPath = scene.pictures.resized || (scene.pictures.nsfw?.top && Object.values(scene.pictures.nsfw.top)[0]);
@@ -244,41 +277,21 @@ async function scrapeApiReleases(json, site) {
}).filter(Boolean);
}
-function scrapeAll(html, site, networkUrl, hasTeaser = true) {
-	const $ = cheerio.load(html, { normalizeWhitespace: true });
-	const scenesElements = $('li[data-itemtype=scene], div[data-itemtype=scenes]').toArray();
-
-	return scenesElements.map((element) => {
+function scrapeAll(scenes, site, networkUrl, hasTeaser = true) {
+	return scenes.map(({ query, el }) => {
		const release = {};

-		const sceneLinkElement = $(element).find('.sceneTitle a, .tlcTitle a');
-
-		if (site) release.url = `${networkUrl ? site.parent.url : site.url}${sceneLinkElement.attr('href')}`;
-		else release.url = `${networkUrl}${sceneLinkElement.attr('href')}`;
-
-		release.title = sceneLinkElement.attr('title');
-		release.entryId = $(element).attr('data-itemid');
-
-		const dateEl = $(element).find('.sceneDate, .tlcSpecsDate .tlcDetailsValue').text() || null;
-
-		if (dateEl) {
-			release.date = moment
-				.utc(dateEl, ['MM-DD-YYYY', 'YYYY-MM-DD'])
-				.toDate();
-		}
-
-		release.actors = $(element).find('.sceneActors a, .tlcActors a')
-			.map((actorIndex, actorElement) => $(actorElement).attr('title'))
-			.toArray();
-
-		[release.likes, release.dislikes] = $(element).find('.value')
-			.toArray()
-			.map((value) => Number($(value).text()));
-
-		const posterEl = $(element).find('.imgLink img, .tlcImageItem');
-		if (posterEl) release.poster = posterEl.attr('data-original') || posterEl.attr('src');
-
-		const channelEl = $(element).find('.fromSite a');
-		if (channelEl.attr('title')) release.channel = channelEl.attr('title').replace('.com', '');
+		release.url = query.url('.sceneTitle a, .tlcTitle a', 'href', { origin: networkUrl ? site.parent.url : site.url });
+		release.title = query.cnt('.sceneTitle a', 'tlcTitle a', 'title');
+		release.entryId = el.dataset.itemid;
+
+		release.date = query.date('.sceneDate, .tlcSpecsDate .tlcDetailsValue', ['MM-DD-YYYY', 'YYYY-MM-DD']);
+		release.actors = query.cnts('.sceneActors a, .tlcActors a', ' title');
+
+		[release.likes, release.dislikes] = query.all('.value').map((likeEl) => query.number(likeEl));
+
+		release.poster = query.img('.imgLink img, .tlcImageItem', 'data-original') || query.img('.imgLink img, .tlcImageItem');
if (hasTeaser) {
release.teaser = [
@@ -287,76 +300,66 @@ function scrapeAll(html, site, networkUrl, hasTeaser = true) {
];
}
+		release.channel = query.el('.fromSite a', 'title')?.replace('.com', '');
return release;
});
}
-async function scrapeScene(html, url, site, baseRelease, mobileHtml, options) {
-	const $ = cheerio.load(html, { normalizeWhitespace: true });
-	const m$ = mobileHtml && cheerio.load(mobileHtml, { normalizeWhitespace: true });
-
-	const release = { $, url };
-
-	const json = $('script[type="application/ld+json"]').html();
-	const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
+async function scrapeScene({ query }, url, channel, baseRelease, mobileItem, options) {
+	const release = { query }; // used by XEmpire scraper to resolve channel-specific details
+
+	const json = query.html('script[type="application/ld+json"]');
+	const videoJson = query.htmls('script').find((script) => /ScenePlayerOptions/i.test(script));
const [data, data2] = json ? JSON.parse(json) : [];
const videoData = videoJson && JSON.parse(videoJson.slice(videoJson.indexOf('{'), videoJson.indexOf('};') + 1));
release.entryId = (baseRelease?.path || new URL(url).pathname).match(/\/(\d{2,})(\/|$)/)?.[1];
release.title = videoData?.playerOptions?.sceneInfos.sceneTitle || data?.name;
-	const dateString = $('.updatedDate').first().text().trim();
-	const dateMatch = dateString.match(/\d{2,4}[-/]\d{2}[-/]\d{2,4}/)?.[0];
-
-	if (dateMatch) release.date = moment.utc(dateMatch, ['MM-DD-YYYY', 'YYYY-MM-DD']).toDate();
-	else if (data?.dateCreated) release.date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();
-	else release.date = videoData?.playerOptions?.sceneInfos.sceneReleaseDate;
+	release.description = data?.description;
+
+	// date in data object is not the release date of the scene, but the date the entry was added; only use as fallback
+	release.date = query.date('.updatedDate', ['MM-DD-YYYY', 'YYYY-MM-DD'])
+		|| qu.extractDate(data?.dateCreated, 'YYYY-MM-DD')
+		|| videoData?.playerOptions?.sceneInfos.sceneReleaseDate;
+
+	release.actors = (data?.actor || data2?.actor)?.map((actor) => ({
+		name: actor.name,
+		gender: actor.gender,
+	})) || [];
if (data) {
-		release.description = data.description;
-
-		if (data.director?.[0]?.name) release.director = data.director[0].name;
-		else if (data2?.director?.[0]?.name) release.director = data2.director[0].name;
-
-		const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
-		if (stars) release.rating = { stars };
-
-		release.duration = moment.duration(data.duration.slice(2)).asSeconds();
+		release.duration = qu.durationToSeconds(data.duration);
+		release.director = data?.director?.[0]?.name || data2?.director?.[0]?.name;
+		release.tags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
+		release.stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5 || null;
+
+		release.channel = slugify(data?.productionCompany?.name
+			|| query.el('.studioLink a, .siteLink a', 'title')
+			|| query.cnt('.siteNameSpan')?.toLowerCase().replace('.com', '')
+			|| query.meta('meta[name="twitter:domain"]')?.replace('.com', ''), '');
+
+		if (videoData?.picPreview && new URL(videoData.picPreview).pathname.length > 1) {
+			// sometimes links to just https://images02-fame.gammacdn.com/
+			const poster = new URL(videoData.picPreview);
+
+			release.poster = [
+				videoData.picPreview, // prefer original URL with width and height parameters, without may give a square crop on e.g. XEmpire
+				`${poster.origin}${poster.pathname}`,
+			];
+		}
-	const actors = data?.actor || data2?.actor;
-
-	if (actors) {
-		release.actors = actors.map((actor) => ({
-			name: actor.name,
-			gender: actor.gender,
-		}));
-	}
-
-	const hasTrans = release.actors?.some((actor) => actor.gender === 'shemale');
-	const rawTags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
-
-	release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
-
-	const channel = data?.productionCompany?.name
-		|| $('.studioLink a, .siteLink a').attr('title')?.trim()
-		|| $('.siteNameSpan').text()
-			?.trim()
-			.toLowerCase()
-			.replace('.com', '')
-		|| $('meta[name="twitter:domain"]').attr('content')?.replace('.com', '');
-
-	if (channel) release.channel = slugify(channel, '');
-
-	if (videoData?.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
-
-	const photoLink = $('.picturesItem a').attr('href');
-	const mobilePhotos = m$ ? m$('.preview-displayer a img').map((photoIndex, photoEl) => $(photoEl).attr('src')).toArray() : [];
+	const photoLink = query.url('.picturesItem a');
+	const mobilePhotos = mobileItem?.query.imgs('.preview-displayer a img') || [];
if (photoLink && options.includePhotos) {
-		const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available
+		const photos = await getPhotos(photoLink, channel, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available

-		if (photos.length < 7) release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
-		else release.photos = photos;
+		if (photos.length < 7) {
+			release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
+		} else {
+			release.photos = photos;
+		}
} else {
release.photos = mobilePhotos;
}
@@ -397,28 +400,28 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml, options) {
];
}
-	const movie = $('.dvdLink');
-	const movieUrl = qu.prefixUrl(movie.attr('href'), site.url);
+	const movieUrl = query.url('.dvdLink', 'href', { origin: channel.url });
if (movieUrl) {
release.movie = {
url: movieUrl,
-			title: movie.attr('title'),
+			title: query.el('.dvdLink', 'title'),
entryId: movieUrl.match(/\/(\d+)(\/|$)/)?.[1],
-			covers: [movie.find('img').attr('src')],
+			covers: [qu.imgs('.dvdLink img')],
};
}
return release;
}
-async function scrapeSceneApi(data, site, options) {
+async function scrapeReleaseApi(data, site, options) {
const release = {};
-	release.entryId = data.clip_id;
+	release.entryId = data.clip_id || data.movie_id;
release.title = data.title;
release.duration = data.length;
-	release.date = new Date(data.date * 1000) || qu.parseDate(data.release_date, 'YYYY-MM-DD');
+	release.date = (data.date && new Date(data.date * 1000)) || qu.parseDate(data.release_date || data.last_modified, 'YYYY-MM-DD');
release.director = data.directors[0]?.name || null;
release.actors = data.actors.map((actor) => ({
entryId: actor.actor_id,
@@ -433,10 +436,9 @@ async function scrapeSceneApi(data, site, options) {
if (data.pictures) {
release.poster = [
-			`https://transform.gammacdn.com/movies${data.pictures['1920x1080']}`,
-			`https://images-evilangel.gammacdn.com/movies${data.pictures['1920x1080']}`,
-			`https://transform.gammacdn.com/movies${data.pictures.resized}`,
-			`https://images-evilangel.gammacdn.com/movies${data.pictures.resized}`,
+			...(data.pictures['1920x1080'] ? getImageSources(data.pictures['1920x1080']) : []),
+			...(data.pictures.resized ? getImageSources(data.pictures.resized) : []),
+			...(data.pictures['960x544'] ? getImageSources(data.pictures['960x544']) : []),
];
}
@@ -444,15 +446,22 @@ async function scrapeSceneApi(data, site, options) {
release.photos = await getPhotosApi(data.photoset_id, site, options.parameters);
}
+	if (data.cover_path) {
+		release.covers = [
+			getImageSources(`${data.cover_path}_front_400x625.jpg?width=450&height=636&format=webp`),
+			getImageSources(`${data.cover_path}_back_400x625.jpg?width=450&height=636&format=webp`),
+		];
+	}
+
if (data.trailers) {
release.trailer = Object.entries(data.trailers).map(([quality, source]) => ({ src: source, quality }));
}
-	if (data.movie_id) {
+	if (data.movie_id && !data.movie_path) {
release.movie = {
entryId: data.movie_id,
title: data.movie_title,
-			url: qu.prefixUrl(`/en/movie/${data.url_movie_title}/${data.movie_id}`, site.url),
+			url: qu.prefixUrl(`${data.url_movie_title}/${data.movie_id}`, options.parameters.movie ? options.parameters.movie : `${site.url}/en/movie`),
};
}
@@ -484,11 +493,16 @@ async function fetchMovieTrailer(release) {
return null;
}
-async function scrapeMovie({ query, html }, window, url, entity, options) {
+async function scrapeMovie({ query, el }, url, entity, baseRelease, options) {
const release = {};
-	const data = window.dataLayer[0]?.dvdDetails;
-	// const options = html.match(/options = {.*};/);
+	const { dataLayer } = query.exec('//script[contains(text(), "dataLayer")]', ['dataLayer']);
+	const rawData = dataLayer?.[0]?.dvdDetails;
+	const data = rawData?.dvdId && rawData; // dvdDetails is mostly empty in some cache states
+
+	if (query.exists('.NotFound-Title')) {
+		return null;
+	}
release.entryId = new URL(url).pathname.match(/\/(\d+)(\/|$)/)?.[1];
@@ -498,13 +512,20 @@ async function scrapeMovie({ query, html }, window, url, entity, options) {
];
release.description = query.cnt('.descriptionText');
-	release.date = qu.extractDate(data.dvdReleaseDate);
-	release.title = data.dvdName;
+	release.date = qu.extractDate(data?.dvdReleaseDate) || query.date('.updatedOn', 'YYYY-MM-DD');
+	release.title = data?.dvdName || query.cnt('.dvdTitle');
	release.director = query.el('.directedBy a', 'title');

-	release.actors = data.dvdActors.map((actor) => ({ name: actor.actorName, entryId: actor.actorId }));
+	release.actors = data?.dvdActors.map((actor) => ({ name: actor.actorName, entryId: actor.actorId }))
+		|| query.all('.actorCarousel a[href*="/pornstar"]').map((actorEl) => ({
+			entryId: query.url(actorEl, null).match(/\/(\d+)/)?.[1],
+			name: query.cnt(actorEl, 'span'),
+			href: query.url(actorEl, null, 'href', { origin: entity.url }),
+			avatar: getAvatarFallbacks(query.img(actorEl)),
+		}));
	release.tags = query.cnts('.dvdCol a');

-	release.scenes = scrapeAll(html, entity, entity.url);
+	release.scenes = scrapeAll(qu.initAll(el, 'div[data-itemtype*=scene], li[data-itemtype*=scene]'), entity, entity.url);
if (options.includeTrailers) {
release.trailer = await fetchMovieTrailer(release);
@@ -547,10 +568,8 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
return accReleases.concat(releases);
}
-async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl, withReleases, context) {
-	const { query } = qu.extract(html);
-
-	const avatar = query.el('img.actorPicture');
+async function scrapeProfile({ query }, url, actorName, _siteSlug, getActorReleasesUrl, withReleases, context) {
+	const avatar = query.img('img.actorPicture');
const hair = query.cnt('.actorProfile .attribute_hair_color');
const height = query.cnt('.actorProfile .attribute_height');
const weight = query.cnt('.actorProfile .attribute_weight');
@@ -563,12 +582,7 @@ async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUr
if (avatar) {
// larger sizes usually available, provide fallbacks
-		const avatars = [
-			avatar.src.replace(/\d+x\d+/, '500x750'),
-			avatar.src.replace(/\d+x\d+/, '240x360'),
-			avatar.src.replace(/\d+x\d+/, '200x300'),
-			avatar.src,
-		];
+		const avatars = getAvatarFallbacks(avatar);
profile.avatar = avatars;
}
@@ -617,7 +631,7 @@ async function fetchLatestApi(site, page = 1, preData, include, upcoming = false
requests: [
{
indexName: 'all_scenes',
-				params: `query=&hitsPerPage=36&maxValuesPerFacet=100&page=${page - 1}&facetFilters=[["lesbian:"],["bisex:"],["shemale:"],["upcoming:${upcoming ? 1 : 0}"]]&filters=sitename:${site.slug} OR channels.id:${site.slug}`,
+				params: `query=&hitsPerPage=36&maxValuesPerFacet=100&page=${page - 1}&facetFilters=[["lesbian:"],["bisex:"],["shemale:"],["upcoming:${upcoming ? 1 : 0}"]]&filters=sitename:${site.slug}`, // OR channels.id:${site.slug}`,
},
],
}, {
@@ -664,8 +678,48 @@ async function fetchSceneApi(url, site, baseRelease, options) {
encodeJSON: true,
});
-	if (res.status === 200 && res.body.results?.[0]?.hits) {
-		return scrapeSceneApi(res.body.results[0].hits[0], site, options);
+	if (res.status === 200 && res.body.results?.[0]?.hits.length > 0) {
+		return scrapeReleaseApi(res.body.results[0].hits[0], site, options);
}
+	if (res.status === 200) {
+		return null;
+	}
+
return res.status;
}
+async function fetchMovieApi(url, site, baseRelease, options) {
+	const referer = options.parameters?.referer || `${site.parameters?.networkReferer ? site.parent.url : site.url}/en/movies`;
+	const { apiUrl } = await fetchApiCredentials(referer, site);
+
+	const entryId = (baseRelease?.path || new URL(url).pathname).match(/\/(\d{2,})(\/|$)/)?.[1];
+
+	const res = await http.post(apiUrl, {
+		requests: [
+			{
+				indexName: 'all_movies',
+				params: `query=&page=0&facets=[]&tagFilters=&facetFilters=[["movie_id:${entryId}"]]`,
+			},
+			{
+				indexName: 'all_movies',
+				params: 'query=&page=0&hitsPerPage=1&attributesToRetrieve=[]&attributesToHighlight=[]&attributesToSnippet=[]&tagFilters=&analytics=false&clickAnalytics=false&facets=clip_id',
+			},
+		],
+	}, {
+		headers: {
+			Referer: referer,
+		},
+	}, {
+		encodeJSON: true,
+	});
+
+	if (res.status === 200 && res.body.results?.[0]?.hits.length > 0) {
+		return scrapeReleaseApi(res.body.results[0].hits[0], site, options);
+	}
+
+	if (res.status === 200) {
+		return null;
+	}
+
+	return res.status;
@@ -699,10 +753,10 @@ function getUpcomingUrl(site) {
async function fetchLatest(site, page = 1) {
const url = getLatestUrl(site, page);
-	const res = await http.get(url);
+	const res = await qu.getAll(url, 'li[data-itemtype=scene], div[data-itemtype*=scene]');
if (res.ok) {
-		return scrapeAll(res.body.toString(), site);
+		return scrapeAll(res.items, site);
}
return res.status;
@@ -710,10 +764,10 @@ async function fetchLatest(site, page = 1) {
async function fetchUpcoming(site) {
const url = getUpcomingUrl(site);
-	const res = await http.get(url);
+	const res = await qu.getAll(url, 'li[data-itemtype=scene], div[data-itemtype*=scene]');
if (res.ok) {
-		return scrapeAll(res.body.toString(), site, null, false);
+		return scrapeAll(res.items, site, null, false);
}
return res.status;
@@ -749,12 +803,12 @@ async function fetchScene(url, site, baseRelease, options) {
}
const deepUrl = getDeepUrl(url, site, baseRelease);
-	const mobileUrl = getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.parent?.parameters?.mobile);
+	const mobileUrl = options.includePhotos && getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.parent?.parameters?.mobile);
if (deepUrl) {
const [res, mobileRes] = await Promise.all([
-			http.get(deepUrl),
-			mobileUrl && http.get(mobileUrl, {
+			qu.get(deepUrl),
+			mobileUrl && qu.get(mobileUrl, null, {
headers: {
// don't redirect to main site
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36',
@@ -763,8 +817,8 @@ async function fetchScene(url, site, baseRelease, options) {
]);
if (res.status === 200) {
-		const mobileBody = mobileRes?.status === 200 ? mobileRes.body.toString() : null;
-		const scene = await scrapeScene(res.body.toString(), url, site, baseRelease, mobileBody, options);
+		const mobileItem = mobileRes?.status === 200 ? mobileRes.item : null;
+		const scene = await scrapeScene(res.item, url, site, baseRelease, mobileItem, options);
return { ...scene, deepUrl };
}
@@ -773,20 +827,6 @@ async function fetchScene(url, site, baseRelease, options) {
return null;
}
-async function fetchMovie(url, channel, baseRelease, options) {
-	const res = await qu.get(url, null, null, {
-		extract: {
-			runScripts: 'dangerously',
-		},
-	});
-
-	if (res.ok) {
-		return scrapeMovie(res.item, res.window, url, channel, options);
-	}
-
-	return res.status;
-}
async function fetchActorScenes(actorName, apiUrl, siteSlug) {
const res = await http.post(apiUrl, {
requests: [
@@ -827,13 +867,13 @@ async function fetchProfile({ name: actorName }, context, include, altSearchUrl,
if (actorUrl) {
const url = `https://${siteSlug}.com${actorUrl}`;
-		const actorRes = await http.get(url);
+		const actorRes = await qu.get(url);
if (actorRes.status !== 200) {
return null;
}
-		return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug, getActorReleasesUrl, include.scenes, context);
+		return scrapeProfile(actorRes.item, url, actorName, siteSlug, getActorReleasesUrl, include.scenes, context);
}
return null;
@@ -881,7 +921,6 @@ module.exports = {
fetchApiUpcoming: fetchUpcomingApi,
fetchLatest,
fetchLatestApi,
-	fetchMovie,
fetchProfile,
fetchScene,
fetchSceneApi,
@@ -893,12 +932,14 @@ module.exports = {
fetchProfile: fetchApiProfile,
// fetchScene,
fetchScene: fetchSceneApi,
-		fetchMovie,
		// scrapeMovie,
+		fetchMovie: fetchMovieApi,
},
getPhotos,
scrapeApiProfile,
scrapeApiReleases,
scrapeProfile,
scrapeAll,
scrapeMovie,
scrapeScene,
};

View File

@@ -136,14 +136,18 @@ function getEntryId(html) {
function scrapeAll(scenes, site, entryIdFromTitle) {
return scenes.map(({ el, query }) => {
const release = {};
-		release.url = query.url('.update_title a, .dvd_info > a, a ~ a');
-		release.title = query.q('.update_title a, .dvd_info > a, a ~ a', true);
+		const title = query.cnt('.content_img div, .dvd_info > a, a.update_title, a[title] + a[title]') || query.cnt('a[title*=" "]');
+
+		release.url = query.url('.content_img a, .dvd_info > a, a.update_title, a[title*=" "]');
+		release.title = title?.slice(0, title.match(/starring:/i)?.index || Infinity).trim();
release.date = query.date('.update_date', 'MM/DD/YYYY');
release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || query.q('.rating_box')?.dataset.id;
-		release.actors = query.all('.update_models a', true);
+		release.actors = query.all('.content_img .update_models a, .update_models a').map((actorEl) => ({
+			name: query.cnt(actorEl),
+			url: query.url(actorEl, null),
+		}));
const dvdPhotos = query.imgs('.dvd_preview_thumb');
const photoCount = Number(query.q('a img.thumbs', 'cnt')) || 1;
@@ -183,9 +187,9 @@ function scrapeAll(scenes, site, entryIdFromTitle) {
}).filter(Boolean);
const teaserScript = query.html('script');
if (teaserScript) {
-			const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
-			if (src) release.teaser = { src };
+			release.teaser = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
}
return release;
@@ -235,17 +239,21 @@ function scrapeUpcoming(html, site) {
});
}
-async function scrapeScene({ html, query }, url, site, include) {
-	const release = { url, site };
+async function scrapeScene({ html, query }, url, site, options) {
+	const release = {};
release.entryId = getEntryId(html);
-	release.title = query.q('.title_bar_hilite', true);
-	release.description = query.q('.update_description', true);
+	release.title = query.cnt('.title_bar_hilite');
+	release.description = query.cnt('.update_description');
release.date = query.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
-	release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
-	release.tags = query.all('.update_tags a', true);
+	release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a').map((actorEl) => ({
+		name: query.cnt(actorEl),
+		url: query.url(actorEl, null),
+	}));
+	release.tags = query.cnts('.update_tags a');
const posterPath = html.match(/useimage = "(.*)"/)?.[1];
@@ -260,7 +268,7 @@ async function scrapeScene({ html, query }, url, site, include) {
}
}
if (include.trailer && site.slug !== 'manuelferrara') {
if (options.includeTrailers && site.slug !== 'manuelferrara') {
const trailerLines = html.split('\n').filter((line) => /movie\["trailer\w*"\]\[/i.test(line));
if (trailerLines.length) {
@@ -277,19 +285,20 @@ async function scrapeScene({ html, query }, url, site, include) {
}
}
if (include.photos) release.photos = await getPhotos(release.entryId, site);
if (options.includePhotos) {
release.photos = await getPhotos(release.entryId, site);
}
if (query.exists('.update_dvds a')) {
release.movie = {
url: query.url('.update_dvds a'),
title: query.q('.update_dvds a', true),
title: query.cnt('.update_dvds a'),
};
release.movie.entryId = new URL(release.movie.url).pathname.split('/').slice(-1)[0]?.replace('.html', '');
}
const stars = Number(query.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
if (stars) release.stars = stars;
release.stars = query.number('.avg_rating');
return release;
}
@@ -298,7 +307,7 @@ function scrapeMovie({ el, query }, url, site) {
const movie = { url, site };
movie.entryId = new URL(url).pathname.split('/').slice(-1)[0]?.replace('.html', '');
movie.title = query.q('.title_bar span', true);
movie.title = query.cnt('.title_bar span');
movie.covers = query.urls('#dvd-cover-flip > a');
movie.channel = slugify(query.q('.update_date a', true), '');
@@ -310,7 +319,7 @@ function scrapeMovie({ el, query }, url, site) {
?.map((scene) => ({ ...scene, movie }))
.sort((sceneA, sceneB) => sceneA.date - sceneB.date);
movie.date = curatedScenes?.[0].date;
movie.date = curatedScenes?.[0]?.date;
return {
...movie,


@@ -140,16 +140,6 @@ async function fetchLatest(site, page = 1) {
return res.status;
}
async function fetchScene(url, site) {
const res = await qu.get(url);
if (res.ok) {
return scrapeScene(res.item, url, site);
}
return res.status;
}
async function fetchProfile({ name: actorName }, entity, include) {
const searchRes = await qu.getAll(`https://kink.com/search?type=performers&q=${actorName}`, '.model');
@@ -176,6 +166,6 @@ async function fetchProfile({ name: actorName }, entity, include) {
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};


@@ -11,6 +11,12 @@ const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { inchesToCm, lbsToKg } = require('../utils/convert');
function getBasePath(channel, path = '/scene') {
return channel.parameters?.scene
|| ((channel.parameters?.native || channel.type === 'network') && `${channel.url}${path}`)
|| `${channel.parent.url}${path}`;
}
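For illustration, a minimal sketch of what getBasePath resolves, assuming hypothetical channel objects shaped like the entities used elsewhere in this scraper:
// hypothetical channels, for illustration only
const nativeChannel = { type: 'channel', url: 'https://www.example.com', parameters: { native: true }, parent: { url: 'https://www.example-network.com' } };
const childChannel = { type: 'channel', url: 'https://www.example.com', parameters: null, parent: { url: 'https://www.example-network.com' } };
getBasePath(nativeChannel); // 'https://www.example.com/scene': native channels use their own URL
getBasePath(childChannel); // 'https://www.example-network.com/scene': falls back to the parent network
getBasePath(childChannel, '/movie'); // the path argument swaps /scene for /movie or /series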
function getThumbs(scene) {
if (scene.images.poster) {
return Object.values(scene.images.poster) // can be { 0: {}, 1: {}, ... } instead of array
@@ -18,7 +24,7 @@ function getThumbs(scene) {
.map((image) => image.xl.url);
}
if (scene.images.card_main_rect) {
if (Array.isArray(scene.images.card_main_rect)) {
return scene.images.card_main_rect
.concat(scene.images.card_secondary_rect || [])
.map((image) => image.xl.url.replace('.thumb', ''));
@@ -27,6 +33,28 @@ function getThumbs(scene) {
return [];
}
function getCovers(images, target = 'cover') {
if (!images[target]) {
return [];
}
const covers = [
images[target][0].md?.url,
images[target][0].sm?.url,
images[target][0].xs?.url,
// bigger but usually upscaled
images[target][0].xx?.url,
images[target][0].xl?.url,
images[target][0].lg?.url,
];
if (target === 'poster') {
return covers;
}
return [covers];
}
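A sketch of the shapes this returns, with hypothetical image data: posters yield a flat list of size variants ordered by preference (medium first, upscaled variants last), while covers are wrapped in an extra array so the result reads as one cover with fallbacks.
// hypothetical image data, for illustration only
const images = { poster: [{ md: { url: 'md.jpg' }, xl: { url: 'xl.jpg' } }] };
getCovers(images, 'poster'); // ['md.jpg', undefined, undefined, undefined, 'xl.jpg', undefined]
getCovers(images, 'cover'); // [] since images.cover is missing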
function getVideos(data) {
const teaserSources = data.videos.mediabook?.files;
const trailerSources = data.children.find((child) => child.type === 'trailer')?.videos.full?.files;
@@ -51,9 +79,7 @@ function scrapeLatestX(data, site, filterChannel) {
description: data.description,
};
const basepath = site.parameters?.scene
|| (site.parameters?.native && `${site.url}/scene`)
|| `${site.parent.url}/scene`;
const basepath = getBasePath(site);
release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`;
release.date = new Date(data.dateReleased);
@@ -84,6 +110,9 @@ function scrapeLatestX(data, site, filterChannel) {
};
}
const siteName = data.collections[0]?.name || data.brand;
release.channel = slugify(siteName, '');
return release;
}
@@ -96,7 +125,7 @@ async function scrapeLatest(items, site, filterChannel) {
};
}
function scrapeScene(data, url, _site, networkName) {
function scrapeRelease(data, url, channel, networkName) {
const release = {};
const { id: entryId, title, description } = data;
@@ -129,6 +158,29 @@ function scrapeScene(data, url, _site, networkName) {
release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;
if (data.parent?.type === 'movie' || data.parent?.type === 'serie') {
release[data.parent.type] = {
entryId: data.parent.id,
url: `${getBasePath(channel, data.parent.type === 'movie' ? '/movie' : '/series')}/${data.parent.id}/${slugify(data.parent.title, '-', { removePunctuation: true })}`,
title: data.parent.title,
description: data.parent.description,
date: new Date(data.parent.dateReleased),
channel: slugify(data.parent.collections?.name || data.parent.brand),
poster: getCovers(data.parent.images, 'poster'),
shallow: true,
};
}
if (data.type === 'movie') {
release.covers = getCovers(data.images);
release.scenes = data.children?.map((scene) => ({
entryId: scene.id,
url: `${getBasePath(channel)}/${scene.id}/${slugify(scene.title)}`,
title: scene.title,
shallow: true,
}));
}
return release;
}
@@ -155,17 +207,24 @@ function getUrl(site) {
throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`);
}
async function getSession(site, parameters) {
async function getSession(site, parameters, url) {
if (site.slug === 'mindgeek' || site.parameters?.parentSession === false) {
// most MG sites have a parent network to acquire a session from; don't try to acquire a session from mindgeek.com for independent channels
return null;
}
const cookieJar = new CookieJar();
const session = http.session({ cookieJar });
// const res = await session.get(url);
const sessionUrl = site.parameters?.siteId && !(site.parameters?.native || site.parameters?.childSession || site.parent?.parameters?.childSession)
? site.parent.url
: site.url;
: (url || site.url);
const res = await http.get(sessionUrl, {
session,
headers: {
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
interval: parameters?.interval,
concurrency: parameters?.concurrency,
parse: false,
@@ -175,7 +234,9 @@ async function getSession(site, parameters) {
const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
const { instance_token: instanceToken } = cookie.parse(cookieString);
return { session, instanceToken };
if (instanceToken) {
return { session, instanceToken };
}
}
throw new Error(`Failed to acquire MindGeek session (${res.statusCode})`);
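A hedged sketch of how the acquired session feeds the API requests below (inside an async fetch function; apiUrl is illustrative):
const { session, instanceToken } = await getSession(site, options.parameters, url);
const res = await http.get(apiUrl, {
session,
headers: {
Instance: instanceToken, // the instance_token cookie doubles as an API header
Origin: site.url,
Referer: url,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});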
@@ -224,7 +285,7 @@ function scrapeProfile(data, html, releases = [], networkName) {
profile.naturalBoobs = false;
}
profile.releases = releases.map((release) => scrapeScene(release, null, null, networkName));
profile.releases = releases.map((release) => scrapeRelease(release, null, null, networkName));
return profile;
}
@@ -234,7 +295,9 @@ async function fetchLatest(site, page = 1, options) {
const { searchParams } = new URL(url);
const siteId = searchParams.get('site');
const { session, instanceToken } = options.beforeNetwork || await getSession(site, options.parameters);
const { session, instanceToken } = options.beforeNetwork?.headers?.Instance
? options.beforeNetwork
: await getSession(site, options.parameters, url);
const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');
const limit = 24;
@@ -250,6 +313,7 @@ async function fetchLatest(site, page = 1, options) {
Instance: instanceToken,
Origin: site.url,
Referer: url,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});
@@ -274,6 +338,7 @@ async function fetchUpcoming(site, page, options) {
Instance: instanceToken,
Origin: site.url,
Referer: url,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});
@@ -284,8 +349,8 @@ async function fetchUpcoming(site, page, options) {
return res.statusCode;
}
async function fetchScene(url, site, baseScene, options) {
if (baseScene?.entryId) {
async function fetchRelease(url, site, baseScene, options) {
if (baseScene?.entryId && !baseScene.shallow && !options.parameters.forceDeep) {
// overview and deep data are the same, don't hit the server unnecessarily
return baseScene;
}
@@ -299,12 +364,13 @@ async function fetchScene(url, site, baseScene, options) {
concurrency: options.parameters.concurrency,
headers: {
Instance: instanceToken,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});
if (res.status === 200 && res.body.result) {
return {
scene: scrapeScene(res.body.result, url, site),
scene: scrapeRelease(res.body.result, url, site),
};
}
@@ -321,6 +387,7 @@ async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, para
concurrency: parameters.concurrency,
headers: {
Instance: instanceToken,
'Accept-Language': 'en-US,en;', // somehow seems essential for some MG sites
},
});
@@ -362,9 +429,11 @@ async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, para
module.exports = {
beforeNetwork: getSession,
beforeFetchScenes: getSession,
requireBeforeNetwork: false,
scrapeLatestX,
fetchLatest,
fetchUpcoming,
fetchScene,
fetchScene: fetchRelease,
fetchMovie: fetchRelease,
fetchProfile,
};

src/scrapers/purgatoryx.js (new file, 172 lines)

@@ -0,0 +1,172 @@
'use strict';
const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const { feetInchesToCm, lbsToKg } = require('../utils/convert');
function scrapeAll(scenes) {
return scenes.map(({ query }) => {
const release = {};
release.title = query.cnt('.title');
release.url = query.url('.title a');
release.entryId = new URL(release.url).pathname.match(/\/view\/(\d+)/)[1];
release.date = query.date('.pub-date', 'MMM DD, YYYY');
release.duration = query.duration('.video-duration');
release.actors = query.all('.models a').map((el) => ({
name: query.cnt(el),
url: query.url(el, null),
}));
if (query.exists('.thumb-big')) { // updates page
release.poster = query.img('.thumb-big', 'data-image') || JSON.parse(query.el('.thumbnails-wrap a', 'data-images'));
release.photos = [query.img('.thumb-top', 'data-image'), query.img('.thumb-bottom', 'data-image')];
}
if (query.exists('.thumbnails-wrap')) { // actor page
try {
const images = JSON.parse(query.el('.thumbnails-wrap a', 'data-images'));
release.poster = images.slice(0, 1)[0];
release.photos = images.slice(1);
} catch (error) {
// images probably not available
}
}
return release;
});
}
function scrapeUpcoming({ query }) {
const release = {};
release.url = query.url('.bottom-info a');
release.entryId = new URL(release.url).pathname.match(/\/view\/(\d+)/)?.[1];
release.title = query.cnt('.title');
release.actors = query.all('.model-wrap li').map((el) => ({
name: query.cnt(el, 'h5'),
url: query.url(el, '.model-thumb a'),
avatar: query.img(el, '.model-thumb img'),
}));
return release;
}
function scrapeScene({ query }, url) {
const release = {};
release.title = query.cnt('.title');
release.entryId = new URL(url).pathname.match(/\/view\/(\d+)/)[1];
release.date = query.date('.date', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/);
release.description = query.cnt('.description p');
release.duration = query.duration('.total-time');
release.actors = query.all('.model-wrap li').map((el) => ({
name: query.cnt(el, 'h5'),
url: query.url(el, 'a'),
avatar: query.img(el),
}));
release.poster = query.poster();
release.photos = query.urls('.photos-slider a');
release.trailer = query.video();
release.comment = query.cnt('.series');
return release;
}
async function fetchLatest(channel, page) {
const res = await qu.getAll(`${channel.url}/episodes?page=${page}`, '.content-item');
if (res.ok) {
return scrapeAll(res.items, channel);
}
return res.status;
}
async function fetchUpcoming(channel) {
const res = await qu.get(channel.url, '.upcoming-info-wrap');
if (res.ok && res.item) {
return [scrapeUpcoming(res.item, channel)];
}
return res.status;
}
function scrapeProfile({ query }, url) {
const profile = { url };
const bio = Object.fromEntries(query.all('.model-desc li').map((el) => [slugify(query.cnt(el, 'span'), '_'), query.text(el)]));
profile.description = bio.bio;
profile.dateOfBirth = qu.extractDate(bio.birthdate, 'YYYY-MM-DD');
profile.birthPlace = bio.birthplace;
profile.hairColor = bio.hair_color;
profile.eyes = bio.eye_color;
profile.height = feetInchesToCm(bio.height);
profile.weight = lbsToKg(bio.weight);
profile.measurements = bio.measurements;
profile.avatar = query.img('.model-pic img');
profile.scenes = scrapeAll(qu.initAll(query.all('.content-item')));
return profile;
}
async function searchActor(baseActor, channel) {
const searchRes = await http.post(`${channel.url}/search-preview`, { q: slugify(baseActor.name, ' ') }, {
encodeJSON: false,
headers: {
'Accept-Language': 'en-US,en;',
},
});
if (searchRes.ok) {
const actorUrl = searchRes.body.find((item) => item.type === 'model' && slugify(item.title) === baseActor.slug)?.url;
return actorUrl || null;
}
return null;
}
async function fetchProfile(baseActor, context, include, retry = false) {
const actorUrl = (!retry && baseActor.url) || await searchActor(baseActor, context.entity);
if (!actorUrl) {
return null;
}
const res = await qu.get(actorUrl);
if (res.ok) {
return scrapeProfile(res.item, actorUrl);
}
if (baseActor.url) {
return fetchProfile(baseActor, context, include, true);
}
return res.status;
}
module.exports = {
fetchLatest,
fetchProfile,
fetchUpcoming,
scrapeAll,
scrapeScene,
};
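A usage sketch of the profile flow, with hypothetical channel and actor objects:
const channel = { url: 'https://www.purgatoryx.com', slug: 'purgatoryx' }; // hypothetical
const baseActor = { name: 'Jane Doe', slug: 'janedoe', url: null }; // hypothetical
const profile = await fetchProfile(baseActor, { entity: channel }, {});
// searchActor POSTs to /search-preview to find the actor URL; if a stored
// baseActor.url goes stale, fetchProfile retries once via the search endpoint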

src/scrapers/radical.js (new file, 143 lines)

@@ -0,0 +1,143 @@
'use strict';
const http = require('../utils/http');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const { lbsToKg, feetInchesToCm } = require('../utils/convert');
function scrapeSceneMetadata(data, channel) {
const release = {};
release.entryId = data.id;
release.url = `${channel.url}/tour/videos/${data.id}/${slugify(data.title, '-', { removePunctuation: true })}`;
release.title = data.title;
release.description = data.description;
release.date = new Date(data.release_date);
release.duration = qu.durationToSeconds(data.videos_duration);
release.actors = data.models.map((model) => ({
entryId: model.id,
name: model.name,
gender: model.gender,
avatar: model.thumb,
url: `${channel.url}/tour/models/${model.id}/${slugify(model.name, '-', { removePunctuation: true })}`,
}));
release.poster = data.trailer?.poster || [data.thumb?.replace('mobile.jpg', '.jpg'), data.thumb];
release.photos = [
data.extra_thumbs?.find((url) => /portrait1.jpg/.test(url)),
data.extra_thumbs?.find((url) => /scene.jpg/.test(url)),
data.extra_thumbs?.find((url) => /portrait2.jpg/.test(url)),
]; // ordered by chronology: portrait1.jpg and scene.jpg are usually pre-shoot poses, portrait2.jpg is the cumshot aftermath
release.trailer = data.trailer && {
src: data.trailer.src,
type: data.trailer.type,
};
release.teaser = data.special_thumbs;
release.tags = [].concat(data.tags?.map((tag) => tag.name));
release.qualities = data.downloads && Object.values(data.downloads)?.map((download) => download.meta_data.height);
release.stars = data.rating;
return release;
}
function scrapeAllMetadata(scenes, channel) {
return scenes.map((data) => scrapeSceneMetadata(data, channel));
}
function scrapeProfileMetadata(data, channel) {
const profile = {};
profile.entryId = data.id;
profile.url = `${channel.url}/tour/models/${data.id}/${slugify(data.name, '-', { removePunctuation: true })}`;
profile.description = data.attributes.bio?.value;
profile.dateOfBirth = qu.parseDate(data.attributes.birthdate?.value, 'YYYY-MM-DD');
profile.gender = data.gender;
profile.age = data.attributes.age?.value;
profile.birthPlace = data.attributes.born?.value;
profile.measurements = data.attributes.measurements?.value;
profile.height = feetInchesToCm(data.attributes.height?.value);
profile.weight = lbsToKg(data.attributes.weight?.value);
profile.eyes = data.attributes.eyes?.value;
profile.hairColor = data.attributes.hair?.value;
profile.avatar = data.thumb;
profile.date = new Date(data.publish_date);
return profile;
}
async function fetchLatestMetadata(channel, page = 1) {
const url = `${channel.url}/tour/videos?page=${page}`;
const res = await http.get(url, {
parse: true,
extract: {
runScripts: 'dangerously',
},
});
if (res.ok && res.window.__DATA__) {
return scrapeAllMetadata(res.window.__DATA__.videos.items, channel);
}
if (res.ok) {
return res.window.__DATA__?.error || null;
}
return res.status;
}
async function fetchSceneMetadata(url, channel) {
const res = await http.get(url, {
parse: true,
extract: {
runScripts: 'dangerously',
},
});
if (res.ok && res.window.__DATA__?.video) {
return scrapeSceneMetadata(res.window.__DATA__.video, channel);
}
if (res.ok) {
return res.window.__DATA__?.error || null;
}
return res.status;
}
async function fetchProfileMetadata(actor, channel) {
const res = await http.get(`${channel.url}/tour/search-preview/${actor.name}`, {
headers: {
'X-Requested-With': 'XMLHttpRequest',
},
});
if (res.ok) {
const model = res.body.models?.items.find((modelX) => slugify(modelX.name) === actor.slug);
if (model) {
return scrapeProfileMetadata(model, channel);
}
return null;
}
return res.status;
}
module.exports = {
metadata: {
fetchLatest: fetchLatestMetadata,
fetchScene: fetchSceneMetadata,
fetchProfile: fetchProfileMetadata,
},
};
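The scraper is exported under a metadata layout key rather than top-level; a minimal sketch of calling it directly, with an assumed channel object:
const { metadata } = require('./radical');
const channel = { url: 'https://www.example-radical-site.com' }; // hypothetical Radical Cash channel
const releases = await metadata.fetchLatest(channel, 1);
// releases are scraped from window.__DATA__, populated by running the page scripts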


@@ -35,7 +35,7 @@ const karups = require('./karups');
const kellymadison = require('./kellymadison');
const killergram = require('./killergram');
const kink = require('./kink');
const legalporno = require('./legalporno');
const analvids = require('./analvids');
const littlecapricedreams = require('./littlecapricedreams');
const mikeadriano = require('./mikeadriano');
const mindgeek = require('./mindgeek');
@@ -51,6 +51,8 @@ const pascalssubsluts = require('./pascalssubsluts'); // reserved keyword
const pierrewoodman = require('./pierrewoodman');
const pinkyxxx = require('./pinkyxxx');
const privateNetwork = require('./private'); // reserved keyword
const purgatoryx = require('./purgatoryx'); // reserved keyword
const radical = require('./radical');
const score = require('./score');
const spizoo = require('./spizoo');
const teamskeet = require('./teamskeet');
@@ -116,7 +118,7 @@ const scrapers = {
killergram,
kink,
kinkvr: badoink,
legalporno,
analvids,
letsdoeit: porndoe,
littlecapricedreams,
mamacitaz: porndoe,
@@ -136,6 +138,8 @@ const scrapers = {
porncz,
pornpros: whalemember,
private: privateNetwork,
purgatoryx,
radical,
score,
sexyhub: mindgeek,
spizoo,
@@ -206,6 +210,7 @@ const scrapers = {
gaywire: bangbros,
girlfaction: fullpornnetwork,
gloryholesecrets: aziani,
gotfilled: radical,
hergape: fullpornnetwork,
hitzefrei,
homemadeanalwhores: fullpornnetwork,
@@ -214,6 +219,7 @@ const scrapers = {
hushpass: hush,
hussiepass: hush,
iconmale: mindgeek,
inserted: radical,
interracialpass: hush,
interracialpovs: hush,
inthecrack,
@@ -224,7 +230,7 @@ const scrapers = {
killergram,
kink,
kinkvr: badoink,
legalporno,
analvids,
letsdoeit: porndoe,
littlecapricedreams,
mamacitaz: porndoe,
@@ -255,6 +261,7 @@ const scrapers = {
povperverts: fullpornnetwork,
povpornstars: hush,
private: privateNetwork,
purgatoryx,
realitykings: mindgeek,
realvr: badoink,
roccosiffredi: famedigital,


@@ -14,13 +14,16 @@ function scrapeAll(scenes) {
release.url = query.url('a');
release.entryId = getEntryId(release.url);
release.title = query.cnt('.title-label a');
release.actors = query.all('.update_models a').map((el) => ({
release.title = query.cnt('.title-label a, .thumb-title a, .p-7, .text h3');
release.date = query.date('.date-label', 'MM/DD/YYYY');
release.actors = query.all(['.update_models a', '.tour_update_models a', '.pornstar-label span']).map((el) => ({
name: query.cnt(el),
url: query.url(el, null),
}));
release.poster = query.img('a img');
release.teaser = query.video('.leVideo source');
return release;
});
@@ -30,21 +33,21 @@ function scrapeScene({ query }, url) {
const release = {};
release.entryId = getEntryId(url);
release.title = query.cnt('#media-holder .title');
release.title = query.cnt(['#media-holder .title', '.content-holder h1', '#scene h1', 'h2.titular', 'title'])?.replace(/\s+-$/, '');
release.date = query.date('#sceneInfo .date', 'YYYY-MM-DD');
release.duration = query.duration('#sceneInfo .data-others', /\d+:\d+/);
release.date = query.date('#sceneInfo .date, #trailer-data .date', 'YYYY-MM-DD');
release.duration = query.duration('#sceneInfo .data-others, #trailer-data', /\d+:\d+/);
release.description = query.cnt('#sceneInfo .description');
release.description = query.cnt('#sceneInfo .description, #trailer-data > div:first-child p');
release.actors = query.all('#sceneInfo .data-others a[href*="/models"]').map((el) => ({
release.actors = query.all('#sceneInfo .data-others a[href*="/models"], #trailer-data a[href*="/models"]').map((el) => ({
name: query.el(el, null, 'title'),
url: query.url(el, null),
}));
release.tags = query.cnts('.categories-holder a');
release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]');
const poster = query.img('#video-holder .update_thumb') || query.poster('#trailervideo');
const poster = query.img(['#video-holder .update_thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo');
const posterPathname = poster && new URL(poster)?.pathname;
release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')];
@@ -56,7 +59,8 @@ function scrapeScene({ query }, url) {
src,
]);
release.trailer = query.video('#trailervideo source');
release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic
release.teaser = query.video('#trailer-video source[src*="/videothumbs"]');
return release;
}
@@ -127,7 +131,7 @@ function scrapeProfile({ query, el }) {
}
async function fetchLatest(channel, page) {
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big');
const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail');
if (res.ok) {
return scrapeAll(res.items, channel);


@@ -35,7 +35,7 @@ function scrapeScene(scene, channel) {
}));
release.poster = [
scene.img.replace('med.jpg', 'hi.jpg'),
// scene.img.replace('med.jpg', 'hi.jpg'), // this image is not always from the same scene! for example on Petite Teens 18
scene.img,
];
@@ -129,6 +129,11 @@ async function fetchLatest(channel, page = 1, { parameters }) {
}
async function fetchScene(url, channel, baseScene, { parameters }) {
if (baseScene?.entryId) {
// overview and deep data are the same, don't hit the server unnecessarily
return baseScene;
}
const sceneSlug = new URL(url).pathname.match(/\/([\w-]+$)/)[1];
const res = await http.get(`${parameters.videos}/${sceneSlug}`);


@@ -4,7 +4,7 @@
const config = require('config');
const faker = require('faker');
const nanoid = require('nanoid');
const { nanoid } = require('nanoid');
const moment = require('moment');
const knex = require('../knex');
@@ -232,7 +232,7 @@ function actors(release) {
}
async function fetchLatest(entity, page, options) {
return Promise.all(Array.from({ length: 100 }, async (value, index) => {
return Promise.all(Array.from({ length: 10000 }, async (value, index) => {
const release = {};
release.entryId = nanoid();
@@ -249,7 +249,8 @@ async function fetchLatest(entity, page, options) {
.where('is_sfw', true)
.pluck('path')
.orderByRaw('random()')
.limit(Math.floor(Math.random() * 10) + 1);
// .limit(Math.floor(Math.random() * 10) + 1)
.limit(100);
// const poster = 'sfw/kittens/thumbs/iNEXVlX-RLs.jpeg';
@@ -261,7 +262,7 @@ async function fetchLatest(entity, page, options) {
.select('name')
.where('priority', '>', 7)
.orderByRaw('random()')
.limit(faker.random.number({ min: 2, max: 15 }))
.limit(faker.datatype.number({ min: 15, max: 25 }))
.pluck('name');
release.actors = [...actors(release), null]; // include empty actor to ensure proper handling


@@ -4,6 +4,7 @@
const Promise = require('bluebird');
const moment = require('moment');
const logger = require('../logger')(__filename);
const http = require('../utils/http');
const slugify = require('../utils/slugify');
@@ -141,24 +142,69 @@ async function getTrailer(scene, channel, url) {
return null;
}
async function getPhotos(url) {
/*
async function getPhotosLegacy(url) {
const htmlRes = await http.get(url, {
extract: {
runScripts: 'dangerously',
},
});
const state = htmlRes?.window.__APOLLO_STATE__;
const key = Object.values(state.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];
try {
const state = htmlRes?.window?.__APOLLO_STATE__;
console.log(data);
if (!state) {
return [];
}
if (!data) {
const key = Object.values(state?.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];
if (!data) {
return [];
}
return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
} catch (error) {
logger.warn(`Failed to retrieve Vixen images: ${error.message}`);
return [];
}
}
*/
return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
async function getPhotos(url) {
const htmlRes = await http.get(url, {
parse: true,
extract: {
runScripts: 'dangerously',
},
});
try {
const state = htmlRes?.window?.__APOLLO_STATE__;
console.log('state', state);
if (!state) {
return [];
}
const key = Object.values(state?.ROOT_QUERY).find((query) => query?.__ref)?.__ref;
const data = state[key];
console.log('data', data);
if (!data) {
return [];
}
console.log(data.carousel);
return data.carousel.slice(1).map((photo) => photo.main?.[0].src).filter(Boolean);
} catch (error) {
logger.warn(`Failed to retrieve Vixen images: ${error.message}`);
return [];
}
}
function scrapeAll(scenes, site, origin) {


@@ -7,7 +7,7 @@ const http = require('../utils/http');
function scrapeLatest(html, site) {
const { document } = new JSDOM(html).window;
const { origin } = new URL(site.url);
const { origin } = new URL(site.parameters?.latest || site.url);
const videos = Array.from(document.querySelectorAll('.video-releases-list')).slice(-1)[0];
@@ -119,7 +119,7 @@ function scrapeScene(html, site, url) {
}
async function fetchLatest(site, page = 1) {
const url = `${site.url}?page=${page}`;
const url = `${site.parameters?.latest || site.url}?page=${page}`;
const res = await http.get(url);
if (res.statusCode === 200) {


@@ -1,14 +1,13 @@
'use strict';
const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
const http = require('../utils/http');
const qu = require('../utils/qu');
async function fetchScene(url, site, baseRelease, options) {
const res = await http.get(url);
const res = await qu.get(url);
const release = await scrapeScene(res.item, url, site, baseRelease, null, options);
const release = await scrapeScene(res.body.toString(), url, site, baseRelease, null, options);
const siteDomain = release.$('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
// const siteUrl = siteDomain && `https://www.${siteDomain}`;


@@ -1,6 +1,7 @@
'use strict';
const config = require('config');
const Promise = require('bluebird');
const argv = require('./argv');
const logger = require('./logger')(__filename);
@@ -8,6 +9,7 @@ const knex = require('./knex');
const slugify = require('./utils/slugify');
const bulkInsert = require('./utils/bulk-insert');
const resolvePlace = require('./utils/resolve-place');
const chunk = require('./utils/chunk');
const { formatDate } = require('./utils/qu');
const { associateActors, associateDirectors, scrapeActors, toBaseActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
@@ -134,7 +136,7 @@ async function attachStudios(releases) {
return releasesWithStudio;
}
function attachReleaseIds(releases, storedReleases) {
function attachReleaseIds(releases, storedReleases, batchId) {
const storedReleaseIdsByEntityIdAndEntryId = storedReleases.reduce((acc, release) => {
if (!acc[release.entity_id]) acc[release.entity_id] = {};
acc[release.entity_id][release.entry_id] = release.id;
@@ -144,7 +146,7 @@ function attachReleaseIds(releases, storedReleases) {
const releasesWithId = releases.map((release) => {
if (!release.entity) {
logger.error(`No entitity available for ${release.url}`);
logger.error(`No entity available for ${release.url}`);
return null;
}
@@ -155,6 +157,7 @@ function attachReleaseIds(releases, storedReleases) {
return {
...release,
id,
batchId,
};
}
@@ -192,13 +195,16 @@ function filterInternalDuplicateReleases(releases) {
async function filterDuplicateReleases(releases) {
const internalUniqueReleases = filterInternalDuplicateReleases(releases);
const internalUniqueReleaseChunks = chunk(internalUniqueReleases);
const duplicateReleaseEntries = await knex('releases')
.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map((release) => [release.entryId, release.entity.id]))
.orWhereIn(['entry_id', 'entity_id'], internalUniqueReleases
const duplicateReleaseEntryChunks = await Promise.map(internalUniqueReleaseChunks, async (internalUniqueReleasesChunk) => knex('releases')
.whereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk.map((release) => [release.entryId, release.entity.id]))
.orWhereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk
// scene IDs are shared across the network; mark as duplicate so the scene can be updated with its channel if it only becomes available on release day (e.g. Perv City)
.filter((release) => release.entity.parent?.parameters?.networkEntryIds)
.map((release) => [release.entryId, release.entity.parent.id]));
.map((release) => [release.entryId, release.entity.parent.id])), { concurrency: 10 });
const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
if (!acc[release.entity_id]) acc[release.entity_id] = {};
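The Promise.map over chunks bounds both the bind-parameter count per query and the number of parallel queries; a sketch of the pattern, assuming chunk() splits an array into fixed-size slices with an optional size argument:
const chunk = require('./utils/chunk');
// e.g. chunk([1, 2, 3, 4, 5], 2) -> [[1, 2], [3, 4], [5]] (size shown for illustration)
const pairChunks = chunk(releases.map((release) => [release.entryId, release.entity.id]));
const resultChunks = await Promise.map(pairChunks, async (pairs) => knex('releases').whereIn(['entry_id', 'entity_id'], pairs), { concurrency: 10 });
const results = resultChunks.flat();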
@@ -229,6 +235,7 @@ async function updateSceneSearch(releaseIds) {
TO_TSVECTOR(
'english',
COALESCE(releases.title, '') || ' ' ||
releases.entry_id || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
@@ -308,12 +315,148 @@ async function storeChapters(releases) {
await associateReleaseMedia(chaptersWithId, 'chapter');
}
async function storeScenes(releases) {
async function associateMovieScenes(movies, movieScenes) {
const moviesByEntityIdAndEntryId = movies.reduce((acc, movie) => ({
...acc,
[movie.entity.id]: {
...acc[movie.entity.id],
[movie.entryId]: movie,
},
}), {});
const associations = movieScenes.map((scene) => {
if (!scene.movie) {
return null;
}
const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId]
|| moviesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.movie.entryId];
if (sceneMovie?.id) {
return {
movie_id: sceneMovie.id,
scene_id: scene.id,
};
}
return null;
}).filter(Boolean);
await bulkInsert('movies_scenes', associations, false);
}
async function associateSerieScenes(series, serieScenes) {
const seriesByEntityIdAndEntryId = series.reduce((acc, serie) => ({
...acc,
[serie.entity.id]: {
...acc[serie.entity.id],
[serie.entryId]: serie,
},
}), {});
const associations = serieScenes.map((scene) => {
if (!scene.serie) {
return null;
}
const sceneSerie = seriesByEntityIdAndEntryId[scene.entity.id]?.[scene.serie.entryId]
|| seriesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.serie.entryId];
if (sceneSerie?.id) {
return {
serie_id: sceneSerie.id,
scene_id: scene.id,
};
}
return null;
}).filter(Boolean);
await bulkInsert('series_scenes', associations, false);
}
async function updateMovieSearch(movieIds, target = 'movie') {
logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } ${target}s`);
const documents = await knex.raw(`
SELECT
${target}s.id AS ${target}_id,
TO_TSVECTOR(
'english',
COALESCE(${target}s.title, '') || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(TO_CHAR(${target}s.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ')
) as document
FROM ${target}s
LEFT JOIN entities ON ${target}s.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN ${target}s_scenes ON ${target}s_scenes.${target}_id = ${target}s.id
LEFT JOIN releases ON releases.id = ${target}s_scenes.scene_id
LEFT JOIN releases_actors ON releases_actors.release_id = ${target}s_scenes.scene_id
LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
LEFT JOIN actors ON actors.id = releases_actors.actor_id
LEFT JOIN tags ON tags.id = releases_tags.tag_id
${movieIds ? `WHERE ${target}s.id = ANY(?)` : ''}
GROUP BY ${target}s.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, movieIds && [movieIds]);
if (documents.rows?.length > 0) {
await bulkInsert(`${target}s_search`, documents.rows, [`${target}_id`]);
}
}
async function storeMovies(movies, useBatchId) {
if (!movies || movies.length === 0) {
return [];
}
const { uniqueReleases } = await filterDuplicateReleases(movies);
const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');
const curatedMovieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'movie')));
const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);
const moviesWithId = attachReleaseIds(movies, storedMovies);
await updateMovieSearch(moviesWithId.map((movie) => movie.id));
await associateReleaseMedia(moviesWithId, 'movie');
return moviesWithId;
}
async function storeSeries(series, useBatchId) {
if (!series || series.length === 0) {
return [];
}
const { uniqueReleases } = await filterDuplicateReleases(series);
const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');
const curatedSerieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'serie')));
const storedSeries = await bulkInsert('series', curatedSerieEntries, ['entity_id', 'entry_id'], true);
const seriesWithId = attachReleaseIds(series, storedSeries);
await updateMovieSearch(seriesWithId.map((serie) => serie.id), 'serie');
await associateReleaseMedia(seriesWithId, 'serie');
return seriesWithId;
}
async function storeScenes(releases, useBatchId) {
if (!releases || releases.length === 0) {
return [];
}
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');
const releasesWithChannels = await attachChannelEntities(releases);
const releasesWithBaseActors = releasesWithChannels.map((release) => ({ ...release, actors: toBaseActors(release.actors) }));
@@ -327,8 +470,8 @@ async function storeScenes(releases) {
const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries);
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries);
const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries, batchId);
const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries, batchId);
const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);
const updated = await knex.raw(`
@@ -348,12 +491,14 @@ async function storeScenes(releases) {
scenes: JSON.stringify(duplicateReleasesWithId),
});
const [actors] = await Promise.all([
const [actors, storedSeries] = await Promise.all([
associateActors(releasesWithId, batchId),
storeSeries(releasesWithId.map((release) => release.serie && { ...release.serie, entity: release.entity }).filter(Boolean), batchId),
associateReleaseTags(releasesWithId),
storeChapters(releasesWithId),
]);
await associateSerieScenes(storedSeries, releasesWithId);
await associateDirectors(releasesWithId, batchId); // some directors may also be actors, don't associate at the same time
await updateSceneSearch(releasesWithId.map((release) => release.id));
@@ -371,92 +516,6 @@ async function storeScenes(releases) {
return releasesWithId;
}
async function associateMovieScenes(movies, movieScenes) {
const moviesByEntityIdAndEntryId = movies.reduce((acc, movie) => ({
...acc,
[movie.entity.id]: {
...acc[movie.entity.id],
[movie.entryId]: movie,
},
}), {});
const associations = movieScenes.map((scene) => {
if (!scene.movie) {
return null;
}
const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId];
if (sceneMovie?.id) {
return {
movie_id: sceneMovie.id,
scene_id: scene.id,
};
}
return null;
}).filter(Boolean);
await bulkInsert('movies_scenes', associations, false);
}
async function updateMovieSearch(movieIds) {
logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } movies`);
const documents = await knex.raw(`
SELECT
movies.id AS movie_id,
TO_TSVECTOR(
'english',
COALESCE(movies.title, '') || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(TO_CHAR(movies.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ')
) as document
FROM movies
LEFT JOIN entities ON movies.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN movies_scenes ON movies_scenes.movie_id = movies.id
LEFT JOIN releases ON releases.id = movies_scenes.scene_id
LEFT JOIN releases_actors ON releases_actors.release_id = movies_scenes.scene_id
LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
LEFT JOIN actors ON actors.id = releases_actors.actor_id
LEFT JOIN tags ON tags.id = releases_tags.tag_id
${movieIds ? 'WHERE movies.id = ANY(?)' : ''}
GROUP BY movies.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, movieIds && [movieIds]);
if (documents.rows?.length > 0) {
await bulkInsert('movies_search', documents.rows, ['movie_id']);
}
}
async function storeMovies(movies) {
if (!movies || movies.length === 0) {
return [];
}
const { uniqueReleases } = await filterDuplicateReleases(movies);
const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const curatedMovieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'movie')));
const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);
const moviesWithId = attachReleaseIds(movies, storedMovies);
await updateMovieSearch(moviesWithId.map((movie) => movie.id));
await associateReleaseMedia(moviesWithId, 'movie');
return moviesWithId;
}
module.exports = {
associateMovieScenes,
storeScenes,


@@ -99,14 +99,20 @@ async function matchReleaseTags(releases) {
async function getEntityTags(releases) {
const entityIds = releases.map((release) => release.entity?.id).filter(Boolean);
const entityTags = await knex('entities_tags').whereIn('entity_id', entityIds);
const entityTags = await knex('entities_tags')
.select('id', 'name', 'entity_id')
.whereIn('entity_id', entityIds)
.leftJoin('tags', 'tags.id', 'entities_tags.tag_id');
const entityTagIdsByEntityId = entityTags.reduce((acc, entityTag) => {
if (!acc[entityTag.entity_id]) {
acc[entityTag.entity_id] = [];
}
acc[entityTag.entity_id].push(entityTag.tag_id);
acc[entityTag.entity_id].push({
id: entityTag.id,
name: entityTag.name,
});
return acc;
}, {});
@@ -117,7 +123,7 @@ async function getEntityTags(releases) {
function buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId, type) {
const tagAssociations = releases
.map((release) => {
const entityTagIds = entityTagIdsByEntityId[release.entity?.id]?.map((tag) => ({ id: tag.id, origin: tag.name })) || [];
const entityTagIds = entityTagIdsByEntityId[release.entity?.id]?.map((tag) => ({ id: tag.id, original: tag.name })) || [];
const releaseTags = release.tags?.filter(Boolean) || [];
const releaseTagsWithIds = releaseTags.every((tag) => typeof tag === 'number')
@@ -152,9 +158,9 @@ async function associateReleaseTags(releases, type = 'release') {
}
const tagIdsBySlug = await matchReleaseTags(releases);
const EntityTagIdsByEntityId = await getEntityTags(releases);
const entityTagIdsByEntityId = await getEntityTags(releases);
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, EntityTagIdsByEntityId, type);
const tagAssociations = buildReleaseTagAssociations(releases, tagIdsBySlug, entityTagIdsByEntityId, type);
await bulkInsert(`${type}s_tags`, tagAssociations, false);
}

src/tools/analvids.js (new file, 1721 lines; diff suppressed because it is too large)

src/tools/knex-update.js (new file, 18 lines)

@@ -0,0 +1,18 @@
'use strict';
const knex = require('../knex');
async function update() {
const query = knex('bans')
.update('type', {
type: 'mute',
username_original: 'charles',
})
.where('id', 2754);
console.log(query.toSQL());
await query;
}
update();

src/tools/realitykings.js (new file, 40 lines)

@@ -0,0 +1,40 @@
'use strict';
const fetch = require('node-fetch');
const express = require('express');
async function init() {
const res = await fetch('https://www.realitykings.com/scenes?site=45', {
method: 'HEAD',
headers: {
'user-agent': 'HTTPie/2.6.0',
'accept-encoding': 'gzip, deflate, br',
accept: '*/*',
connection: 'keep-alive',
},
});
console.log(res.status, res.headers);
const app = express();
app.get('/', (appReq, appRes) => {
console.log(appReq.headers);
appRes.status(204).send();
});
app.listen(8000, () => {
console.log('Listening on port 8000');
fetch('http://127.0.0.1:8000', {
headers: {
'user-agent': 'HTTPie/2.6.0',
'accept-encoding': 'gzip, deflate, br',
accept: '*/*',
connection: 'keep-alive',
},
});
});
}
init();


@@ -8,6 +8,7 @@ const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { curateRelease } = require('./releases');
const chunk = require('./utils/chunk');
const include = require('./utils/argv-include')(argv);
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
const { fetchIncludedEntities } = require('./entities');
@@ -38,22 +39,27 @@ function filterLocalUniqueReleases(releases, accReleases) {
}
async function filterUniqueReleases(releases) {
const releaseIdentifiers = releases
.map((release) => [release.entity.id, release.entryId]);
const releaseIdentifierChunks = chunk(releases.map((release) => [release.entity.id, release.entryId.toString()]));
const duplicateReleaseEntries = await knex('releases')
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
.leftJoin('entities', 'entities.id', 'releases.entity_id')
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
.where((builder) => {
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
builder
.where('deep', true) // scene is already deep scraped
.orWhereNull('date')
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
});
const duplicateReleaseEntryChunks = await Promise.map(releaseIdentifierChunks, async (releaseIdentifiers) => {
const duplicateReleaseEntriesQuery = knex('releases')
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
.leftJoin('entities', 'entities.id', 'releases.entity_id')
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
.where((builder) => {
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
builder
.where('deep', true) // scene is already deep scraped
.orWhereNull('date')
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
});
return duplicateReleaseEntriesQuery;
}, { concurrency: 10 });
const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
const duplicateReleases = duplicateReleaseEntries.map((release) => curateRelease(release));
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
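Note the composite whereIn above matches (entity_id, entry_id) tuples in a single query; a minimal sketch with hypothetical values:
// matches rows whose (entity_id, entry_id) pair equals any of the given tuples
knex('releases').whereIn(['entity_id', 'entry_id'], [[1, 'abc'], [2, 'def']]);
// on PostgreSQL this compiles to roughly:
// select * from "releases" where ("entity_id", "entry_id") in ((1, 'abc'), (2, 'def'))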
@@ -261,8 +267,21 @@ async function scrapeNetworkSequential(networkEntity) {
return releases.uniqueReleases;
}
async function getBeforeNetwork(networkEntity) {
try {
const parameters = getRecursiveParameters(networkEntity);
return await networkEntity.scraper?.beforeNetwork?.(networkEntity, parameters);
} catch (error) {
if (networkEntity.scraper?.requireBeforeNetwork === false) {
return null;
}
throw error;
}
}
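A sketch of the contract this introduces, mirroring the mindgeek module above:
// illustrative scraper module: a failing beforeNetwork no longer aborts the run
module.exports = {
beforeNetwork: getSession, // runs once per network; the result is shared with child channels
requireBeforeNetwork: false, // on error, getBeforeNetwork returns null instead of rethrowing
fetchLatest,
};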
async function scrapeNetworkParallel(networkEntity) {
const beforeNetwork = await networkEntity.scraper.beforeNetwork?.(networkEntity);
const beforeNetwork = await getBeforeNetwork(networkEntity);
return Promise.map(
networkEntity.includedChildren,


@@ -0,0 +1,3 @@
'use strict';
module.exports = new Map();
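A sketch of the intended use of this shared map; the http utility below registers JSDOM windows by pathname during deep scrapes, and the cleanup side is assumed:
const windows = require('./http-windows');
// mirror the key used by the http utility when it stored the window
const pathname = new URL(url).pathname.replace(/\//g, '_');
const window = windows.get(pathname);
if (window) {
window.close(); // free JSDOM memory once the deep scrape is done
windows.delete(pathname);
}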


@@ -3,12 +3,15 @@
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom');
const windows = require('./http-windows');
const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
@@ -78,8 +81,6 @@ function getLimiter(options = {}, url) {
});
}
limiters[interval][concurrency].on('queued', () => logger.silly(`Queued ${url}`));
return limiters[interval][concurrency];
}
@@ -116,12 +117,23 @@ async function finalizeResult(res, options) {
if (Buffer.isBuffer(res.body)) {
const html = res.body.toString();
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
// allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper
if (window && /fetchScene|fetchMovie/.test(new Error().stack)) {
windows.set(pathname, window);
}
if (argv.saveHtml) {
await fs.writeFile(`./html/${pathname}.html`, html);
}
return {
...res,
body: html,
html,
status: res.statusCode,
headers: res.headers,
document: window?.document || null,
window,
ok: res.statusCode >= 200 && res.statusCode <= 299,
@@ -132,6 +144,7 @@ async function finalizeResult(res, options) {
...res,
body: res.body,
status: res.statusCode,
headers: res.headers,
ok: res.statusCode >= 200 && res.statusCode <= 299,
};
}

src/utils/jsdom-perf.js (new file, 38 lines)

@@ -0,0 +1,38 @@
'use strict';
const util = require('util');
const fs = require('fs').promises;
const Promise = require('bluebird');
const { JSDOM } = require('jsdom');
const waitImmediate = util.promisify(setImmediate);
async function init() {
let peak = 0;
const files = await fs.readdir('./html');
// const dom = new JSDOM('<html><body></body></html>', { runScripts: 'dangerously' });
await Promise.map(Array.from({ length: 10 }).map(() => files).flat(), async (filename) => {
const html = await fs.readFile(`./html/${filename}`, 'utf8');
const dom = new JSDOM(html);
// dom.window.document.body.innerHTML = html;
dom.window.close();
const usage = process.memoryUsage.rss() / 1000000;
peak = Math.max(usage, peak);
console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);
await waitImmediate;
}, {
concurrency: 1,
});
await Promise.delay(2000);
console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}
init();


@@ -6,7 +6,7 @@ const fsPromises = require('fs').promises;
const Promise = require('bluebird');
const blake2 = require('blake2');
const sharp = require('sharp');
const nanoid = require('nanoid');
const { nanoid } = require('nanoid');
const { PassThrough } = require('stream');
const http = require('./http');


@@ -16,6 +16,10 @@ function trim(str) {
}
function extractDate(dateString, format, match) {
if (!dateString) {
return null;
}
if (match) {
const dateStamp = trim(dateString).match(match);
@@ -80,32 +84,69 @@ function prefixUrl(urlValue, origin, protocol = 'https') {
return urlValue;
}
function q(context, selector, attrArg, applyTrim = true) {
if (!selector && context.nodeName === '#document') {
function iterateXPathResult(iterator, results = []) {
const element = iterator.iterateNext();
if (element) {
return iterateXPathResult(iterator, [...results, element]);
}
return results;
}
function getElements(context, selector, first = false) {
if (!selector) {
return context;
}
if (/^\/\//.test(selector)) {
// XPath selector
const iterator = globalWindow.document.evaluate(selector, context, null, globalWindow.XPathResult.ORDERED_NODE_ITERATOR_TYPE, null);
if (first) {
return iterator.iterateNext();
}
return iterateXPathResult(iterator);
}
if (first) {
return context.querySelector(selector);
}
return Array.from(context.querySelectorAll(selector));
}
function q(context, selectors, attrArg, applyTrim = true) {
if (!selectors && context.nodeName === '#document') {
return null;
}
const attr = attrArg === true ? 'textContent' : attrArg;
const element = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector, true), null);
if (!element) {
return null;
}
if (attr) {
const value = selector
? context.querySelector(selector)?.[attr] || context.querySelector(selector)?.attributes[attr]?.value
: context[attr] || context.getAttribute(attr);
const value = element[attr] || element.getAttribute(attr);
return applyTrim && typeof value === 'string' ? trim(value) : value;
}
return selector ? context.querySelector(selector) : context;
return element;
}
function all(context, selector, attrArg, applyTrim = true) {
function all(context, selectors, attrArg, applyTrim = true) {
const attr = attrArg === true ? 'textContent' : attrArg;
const elements = [].concat(selectors).reduce((acc, selector) => acc || getElements(context, selector), null);
if (attr) {
return Array.from(context.querySelectorAll(selector), (el) => q(el, null, attr, applyTrim));
return elements.map((el) => q(el, null, attr, applyTrim));
}
return Array.from(context.querySelectorAll(selector));
return elements;
}
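With getElements in place, q accepts XPath selectors (detected by a leading //) alongside CSS, and an array of selectors falls through to the first element found, as scrapers above already rely on; a sketch with assumed markup:
// CSS selector, as before
query.cnt('.title');
// XPath selector, routed through document.evaluate
query.q('//div[@class="update_models"]/a', 'textContent');
// selector array: the first selector that yields an element wins
query.q(['.title-label a', '.thumb-title a'], 'href');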
function exists(context, selector) {
@@ -130,6 +171,42 @@ function html(context, selector) {
return el && el.innerHTML;
}
function htmls(context, selector) {
const els = all(context, selector, null, true);
return els.map((el) => el.innerHTML);
}
function execute(context, selector = 'script') {
const scripts = htmls(context, selector);
const originalGlobal = Object.fromEntries(Object.entries(global));
const errors = scripts?.reduce((accErrors, script) => {
try {
Function(script)(); /* eslint-disable-line no-new-func */
return accErrors;
} catch (error) {
// the script failed
return [...accErrors, error];
}
}, []);
const data = Object.fromEntries(Object.entries(global).filter(([key, value]) => {
if (originalGlobal[key] !== value) {
delete global[key];
return true;
}
return false;
}));
return {
...data,
errors,
};
}
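A sketch of execute in use, assuming a page whose inline script assigns globals (as the Jules Jordan trailer scripts above do):
// page contains e.g. <script>movie = { trailer: 'https://cdn.example.com/trailer.mp4' };</script>
const { movie, errors } = query.execute('script');
if (errors.length === 0 && movie) {
release.trailer = movie.trailer;
}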
function json(context, selector) {
const el = q(context, selector, null, true);
@@ -152,12 +229,6 @@ function jsons(context, selector) {
});
}
function htmls(context, selector) {
const els = all(context, selector, null, true);
return els.map((el) => el.innerHTML);
}
function texts(context, selector, applyTrim = true, filter = true) {
const el = q(context, selector, null, applyTrim);
if (!el) return null;
@@ -425,6 +496,8 @@ const quFuncs = {
duration,
el: q,
element: q,
execute,
exec: execute,
exists,
html,
htmls,


@@ -3,7 +3,7 @@
const config = require('config');
const AWS = require('aws-sdk');
const fs = require('fs');
const nanoid = require('nanoid');
const { nanoid } = require('nanoid');
async function init() {
const filepath = './public/img/sfw/animals/j0iiByCxGfA.jpeg';


@@ -33,8 +33,8 @@ async function upsert(table, items, identifier = ['id'], _knex) {
logger.debug(`${table}: Updating ${update.length}`);
const [inserted, updated] = await Promise.all([
knex(table).returning('*').insert(insert),
knex.transaction(async (trx) => Promise.all(update.map((item) => {
insert.length > 0 ? knex(table).returning('*').insert(insert) : [],
update.length > 0 ? knex.transaction(async (trx) => Promise.all(update.map((item) => {
const clause = identifiers.reduce((acc, identifierX) => ({ ...acc, [identifierX]: item[identifierX] }), {});
return trx
@@ -42,7 +42,7 @@ async function upsert(table, items, identifier = ['id'], _knex) {
.update(item)
.into(table)
.returning('*');
}))),
}))) : [],
]);
return {
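Finally, a sketch of the guarded upsert behavior above, with hypothetical rows: when every incoming row already exists, the insert branch resolves to an empty array without issuing a query.
// illustrative: rows matched by id are routed to the update branch only
await upsert('actors', [{ id: 5, name: 'Jane Doe' }], ['id']);
// insert = [] -> no insert query; update = [{ id: 5, ... }] -> one transactional update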