// traxxx/src/deep.js
// 218 lines, 5.8 KiB, JavaScript

'use strict';
const Promise = require('bluebird');
// 2021-08-15 11:16:48 +00:00 (timestamp left over from the web view this file was copied from)
const { mergeAdvanced: merge } = require('object-merge-advanced');
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename);
const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
/**
 * Normalizes a mixed list of base release objects and plain URLs into base release objects.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - Base release objects and/or URL strings; a falsy value yields an empty list.
 * @param {Object|null} [entity=null] - Fallback entity for items that do not carry their own `entity`.
 * @returns {Object[]} Base releases, each flagged `deep: false`. Malformed items are logged and dropped.
 */
function toBaseReleases(baseReleasesOrUrls, entity = null) {
	if (!baseReleasesOrUrls) {
		return [];
	}

	// shared shape for object items, with or without a URL
	const withEntity = (baseRelease) => ({
		...baseRelease,
		entity: baseRelease.entity || entity,
		deep: false,
	});

	return baseReleasesOrUrls
		.map((baseReleaseOrUrl) => {
			// optional chaining lets null/undefined items fall through to the warning instead of throwing
			if (baseReleaseOrUrl?.url) {
				// base release with URL
				return withEntity(baseReleaseOrUrl);
			}

			if (/^http/.test(baseReleaseOrUrl)) {
				// URL
				return {
					url: baseReleaseOrUrl,
					entity,
					deep: false,
				};
			}

			// typeof null is 'object', so null has to be excluded explicitly
			if (baseReleaseOrUrl !== null && typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
				// base release without URL, prepare for passthrough
				return withEntity(baseReleaseOrUrl);
			}

			logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);

			return null;
		})
		.filter(Boolean);
}
/**
 * Fetches a single scene through whichever entry point the scraper exposes.
 *
 * A scraper-provided `fetchScene` takes precedence and is handed `baseRelease.url`.
 * Otherwise the page is requested via `qu` and passed to `scrapeScene`.
 *
 * @param {Object} scraper - Scraper exposing `fetchScene` and/or `scrapeScene`.
 * @param {string} url - Scene URL requested when `scrapeScene` is used.
 * @param {Object} entity - Entity the release belongs to.
 * @param {Object} baseRelease - Base release being deepened.
 * @param {Object} options - Options forwarded to the scraper.
 * @returns {Promise<*>} The scraper's result, the response status when the request was not ok,
 *   or null when the scraper exposes neither method.
 */
async function fetchScene(scraper, url, entity, baseRelease, options) {
	if (scraper.fetchScene) {
		return scraper.fetchScene(baseRelease.url, entity, baseRelease, options, null);
	}

	if (!scraper.scrapeScene) {
		return null;
	}

	const session = qu.session();

	// NOTE(review): rejectUnauthorized: false presumably turns off TLS certificate checks — confirm against qu
	const response = await qu.get(url, null, null, {
		session,
		rejectUnauthorized: false,
	});

	// the cookie is read before the status check, regardless of the outcome of the request
	const cookie = await session._sessionOptions.cookieJar.get(url);

	if (!response.ok) {
		return response.status;
	}

	const context = {
		session,
		headers: response.headers,
		cookieJar: session._sessionOptions.cookieJar,
		cookie,
	};

	return scraper.scrapeScene(response.item, url, entity, baseRelease, options, context);
}
/**
 * Deep-scrapes a single base release and merges the scraped data into it.
 *
 * @param {Object} baseRelease - Base release; its `entity` is used when present, otherwise the
 *   entity is looked up in `entitiesBySlug` by the slug derived from `baseRelease.url`.
 * @param {Object} entitiesBySlug - Entities keyed by slug.
 * @param {string} [type='scene'] - 'scene' or 'movie'.
 * @returns {Promise<Object|null>} One of:
 *   - the merged release, flagged `deep: true`;
 *   - the base release with `entity` attached, when it has neither URL nor path or deep scraping is disabled;
 *   - the untouched base release, when no entity or suitable scraper is available or the scrape fails;
 *   - null, when the merged release has no entry ID.
 */
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
	const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];

	if (!entity) {
		logger.warn(`No entity available for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
		return {
			...baseRelease,
			entity,
		};
	}

	const layoutScraper = entity.scraper;

	if (!layoutScraper) {
		logger.warn(`Could not find scraper for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie)) {
		logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`);
		return baseRelease;
	}

	try {
		logger.verbose(`Fetching ${type} ${baseRelease.url}`);

		const options = {
			...include,
			beforeFetchScenes: entity.preData,
			parameters: getRecursiveParameters(entity),
		};

		const rawScrapedRelease = type === 'scene'
			? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
			: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);

		// a scraper may wrap its result in a { scene } envelope
		const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;

		if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
			// scraper is unable to fetch the releases and returned a HTTP code or null
			throw new Error(`Scraper returned '${scrapedRelease}' when fetching latest from '${entity.name}' (${entity.parent?.name})`);
		}

		// object-merge-advanced treats null as an explicit false on hard merged keys, even when that behavior is disabled;
		// drop null and undefined values so the original base value is used instead
		const curatedScrapedRelease = Object.fromEntries(
			Object.entries(scrapedRelease).filter(([, value]) => value !== null && value !== undefined),
		);

		const mergedRelease = {
			...merge(baseRelease, curatedScrapedRelease, {
				dedupeStringsInArrayValues: true,
				hardMergeKeys: ['actors', 'covers', 'poster', 'trailer', 'teaser'],
			}),
			deep: true,
			entity,
		};

		if (!mergedRelease.entryId) {
			throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' });
		}

		if (baseRelease.tags) {
			// accumulate all available tags; fall back to an empty list so a scraper
			// without tags does not append a literal undefined/null entry
			mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags ?? []);
		}

		return mergedRelease;
	} catch (error) {
		logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);

		if (argv.debug) {
			console.error(error);
		}

		if (error.code === 'NO_ENTRY_ID') {
			// a release without entry ID is discarded rather than passed through
			return null;
		}

		return baseRelease;
	}
}
/**
 * Deep-scrapes a list of base releases, at most 10 at a time.
 *
 * Before scraping, each entity whose scraper exposes `beforeFetchScenes` has that hook called
 * once, and the result is attached to a copy of the entity as `preData`.
 *
 * @param {Object[]} baseReleases - Base releases to scrape.
 * @param {Object} entitiesBySlug - Entities keyed by slug.
 * @param {string} type - Release type forwarded to `scrapeRelease`.
 * @returns {Promise<Array>} One `scrapeRelease` result per base release, in input order.
 */
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
	const entityEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
		if (entity.scraper?.beforeFetchScenes) {
			const preData = await entity.scraper.beforeFetchScenes(entity);

			return [slug, { ...entity, preData }];
		}

		// entities without the hook must stay in the lookup table, otherwise
		// releases that rely on the slug lookup cannot resolve their entity
		return [slug, entity];
	}));

	const entitiesWithBeforeDataBySlug = Object.fromEntries(entityEntries);

	return Promise.map(
		baseReleases,
		async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
		{ concurrency: 10 },
	);
}
/**
 * Normalizes the input to base releases, resolves their entities and deep-scrapes them.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - Base release objects and/or URLs.
 * @param {string} [type='scene'] - Release type forwarded to the scrape step.
 * @returns {Promise<Object[]>} The scraped releases, with falsy results removed.
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
	const baseReleases = toBaseReleases(baseReleasesOrUrls);
	const entitiesBySlug = await fetchReleaseEntities(baseReleases);
	const deepReleases = await scrapeReleases(baseReleases, entitiesBySlug, type);

	// falsy results are filtered out here
	return deepReleases.filter(Boolean);
}
/**
 * Convenience wrapper: fetches the given base releases or URLs as scenes.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - Base release objects and/or URLs.
 * @returns {Promise<*>} Whatever `fetchReleases` resolves to for type 'scene'.
 */
async function fetchScenes(baseReleasesOrUrls) {
	const scenes = await fetchReleases(baseReleasesOrUrls, 'scene');

	return scenes;
}
/**
 * Convenience wrapper: fetches the given base releases or URLs as movies.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - Base release objects and/or URLs.
 * @returns {Promise<*>} Whatever `fetchReleases` resolves to for type 'movie'.
 */
async function fetchMovies(baseReleasesOrUrls) {
	return fetchReleases(baseReleasesOrUrls, 'movie');
}
module.exports = {
fetchReleases,
fetchScenes,
fetchMovies,
2020-05-15 02:40:59 +00:00
toBaseReleases,
};