'use strict';

const util = require('util');
const Promise = require('bluebird');
const { mergeAdvanced: merge } = require('object-merge-advanced');

const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename);
const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
const windows = require('./utils/http-windows');

const waitImmediate = util.promisify(setImmediate);

// Normalizes a mixed list of URLs and partial release objects into base releases.
function toBaseReleases(baseReleasesOrUrls, entity = null) {
	if (!baseReleasesOrUrls) {
		return [];
	}

	return baseReleasesOrUrls
		.map((baseReleaseOrUrl) => {
			if (baseReleaseOrUrl.url) {
				// base release with URL
				return {
					...baseReleaseOrUrl,
					entity: baseReleaseOrUrl.entity || entity,
					deep: false,
				};
			}

			if (/^http/.test(baseReleaseOrUrl)) {
				// URL
				return {
					url: baseReleaseOrUrl,
					entity,
					deep: false,
				};
			}

			if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
				// base release without URL, prepare for passthrough
				return {
					...baseReleaseOrUrl,
					entity: baseReleaseOrUrl.entity || entity,
					deep: false,
				};
			}

			logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);
			return null;
		})
		.filter(Boolean);
}

// Fetches and scrapes a single scene or movie page using the entity's scraper.
async function fetchScene(scraper, url, entity, baseRelease, options, type = 'scene') {
	if ((type === 'scene' && scraper.fetchScene) || (type === 'movie' && scraper.fetchMovie)) {
		return scraper[type === 'movie' ? 'fetchMovie' : 'fetchScene'](baseRelease.url, entity, baseRelease, options, null);
	}

	if ((type === 'scene' && scraper.scrapeScene) || (type === 'movie' && scraper.scrapeMovie)) {
		const session = qu.session();

		const res = await qu.get(url, null, null, {
			session,
			rejectUnauthorized: false,
		});

		const cookie = await session._sessionOptions.cookieJar.get(url);

		if (res.ok) {
			return scraper[type === 'movie' ? 'scrapeMovie' : 'scrapeScene'](res.item, url, entity, baseRelease, options, {
				session,
				headers: res.headers,
				cookieJar: session._sessionOptions.cookieJar,
				cookie,
			});
		}

		return res.status;
	}

	return null;
}

function fetchMovie(scraper, url, entity, baseRelease, options) {
	return fetchScene(scraper, url, entity, baseRelease, options, 'movie');
}

// Deep-scrapes a single base release and merges the scraped data into it.
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
	const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];

	if (!entity) {
		logger.warn(`No entity available for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
		return {
			...baseRelease,
			entity,
		};
	}

	const layoutScraper = entity.scraper;

	if (!layoutScraper) {
		logger.warn(`Could not find scraper for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie && !layoutScraper.scrapeMovie)) {
		logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`);
		return baseRelease;
	}

	try {
		logger.verbose(`Fetching ${type} ${baseRelease.url}`);

		const options = {
			...include,
			beforeFetchScenes: entity.preData,
			parameters: getRecursiveParameters(entity),
		};

		logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

		const rawScrapedRelease = type === 'scene'
			? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options)
			: await fetchMovie(layoutScraper, baseRelease.url, entity, baseRelease, options);

		const pathname = baseRelease.path || (baseRelease.url && new URL(baseRelease.url).pathname.replace(/\//g, '_'));

		if (rawScrapedRelease) {
			delete rawScrapedRelease.query; // some scrapers pass the qu-wrapped window instance to parent scrapers, filling up memory
		}

		if (windows.has(pathname)) {
			logger.debug(`Closing window for ${pathname}`);

			windows.get(pathname).close();
			windows.delete(pathname);
		}

		await waitImmediate();

		logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

		const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;

		if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
			// the scraper was unable to fetch the release and returned an HTTP status code or null
			throw new Error(`Scraper returned '${scrapedRelease}' when fetching latest from '${entity.name}' (${entity.parent?.name})`);
		}

		// object-merge-advanced will use null as an explicit false on hard merged keys, even when "null as explicit false" is disabled;
		// filter out keys with null values to ensure the original base value is used instead
		const curatedScrapedRelease = Object.entries(scrapedRelease).reduce((acc, [key, value]) => ({
			...acc,
			...(value !== null && value !== undefined && !(Array.isArray(value) && value.filter(Boolean).length === 0) && {
				[key]: Array.isArray(value)
					? value.filter(Boolean)
					: value,
			}),
		}), {});

		const mergedRelease = {
			...merge(baseRelease, curatedScrapedRelease, {
				dedupeStringsInArrayValues: true,
				hardMergeKeys: ['actors', 'covers', 'poster', 'trailer', 'teaser'],
			}),
			deep: !!scrapedRelease,
			entity,
		};

		if (!mergedRelease.entryId) {
			throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' });
		}

		if (scrapedRelease && baseRelease?.tags) {
			// accumulate all available tags
			mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags || []);
		}

		return mergedRelease;
	} catch (error) {
		logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);

		if (argv.debug) {
			console.error(error);
		}

		if (error.code === 'NO_ENTRY_ID') {
			return null;
		}

		return baseRelease;
	}
}

// Runs each entity's beforeFetchScenes hook once, then deep-scrapes the base releases sequentially.
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
	const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
		if (entity.scraper?.beforeFetchScenes) {
			const parameters = getRecursiveParameters(entity);
			const preData = await entity.scraper.beforeFetchScenes(entity, parameters);

			return [slug, { ...entity, preData }];
		}

		return [slug, entity];
	}));

	const entitiesWithBeforeDataBySlug = Object.fromEntries(entitiesWithBeforeDataEntries.filter(Boolean));

	return Promise.map(
		baseReleases,
		async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
		{ concurrency: 1 },
	);
}

// Resolves entities for the base releases, deep-scrapes them and discards failures.
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
	const baseReleases = toBaseReleases(baseReleasesOrUrls);
	const entitiesBySlug = await fetchReleaseEntities(baseReleases);

	const deepReleases = await scrapeReleases(baseReleases, entitiesBySlug, type);

	return deepReleases.filter(Boolean);
}

async function fetchScenes(baseReleasesOrUrls) {
	return fetchReleases(baseReleasesOrUrls, 'scene');
}

async function fetchMovies(baseReleasesOrUrls) {
	return fetchReleases(baseReleasesOrUrls, 'movie');
}

module.exports = {
	fetchReleases,
	fetchScenes,
	fetchMovies,
	toBaseReleases,
};
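
// Usage sketch (the URL and module path below are hypothetical; assumes the project's
// entities, scrapers and CLI arguments are configured as usual):
//
//   const { fetchScenes } = require('./deep');
//
//   fetchScenes(['https://www.example.com/scene/12345'])
//     .then((releases) => console.log(releases.map((release) => release.entryId)));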