'use strict'; const Promise = require('bluebird'); const merge = require('object-merge-advanced'); const argv = require('./argv'); const include = require('./utils/argv-include')(argv); const logger = require('./logger')(__filename); const knex = require('./knex'); const qu = require('./utils/qu'); const scrapers = require('./scrapers/scrapers'); function urlToSiteSlug(url) { try { const slug = new URL(url) .hostname .match(/([\w-]+)\.\w+$/)?.[1] .replace(/[-_]+/g, ''); return slug; } catch (error) { logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`); return null; } } async function findEntities(baseReleases) { const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity); const entitySlugs = Array.from(new Set( baseReleasesWithoutEntity .map(baseRelease => urlToSiteSlug(baseRelease.url)) .filter(Boolean), )); const entities = await knex('entities') .select(knex.raw('entities.*, row_to_json(parents) as parent, json_agg(children) as children')) .leftJoin('entities as parents', 'parents.id', 'entities.parent_id') .leftJoin('entities as children', 'children.parent_id', 'entities.id') .whereIn('entities.slug', entitySlugs) .groupBy('entities.id', 'parents.id') .orderBy('entities.type', 'asc'); // channel entity will overwrite network entity const entitiesBySlug = entities.reduce((accEntities, entity) => ({ ...accEntities, [entity.slug]: accEntities[entity.slug] || entity }), {}); return entitiesBySlug; } function toBaseReleases(baseReleasesOrUrls, entity = null) { if (!baseReleasesOrUrls) { return []; } return baseReleasesOrUrls .map((baseReleaseOrUrl) => { if (baseReleaseOrUrl.url) { // base release with URL return { ...baseReleaseOrUrl, entity: baseReleaseOrUrl.entity || entity, deep: false, }; } if (/^http/.test(baseReleaseOrUrl)) { // URL return { url: baseReleaseOrUrl, entity, deep: false, }; } if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) { // base release without URL, prepare for passthrough return { ...baseReleaseOrUrl, entity: baseReleaseOrUrl.entity || entity, deep: false, }; } logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`); return null; }) .filter(Boolean); } async function fetchScene(scraper, url, entity, baseRelease, options) { if (scraper.fetchScene) { return scraper.fetchScene(baseRelease.url, entity, baseRelease, options, null); } if (scraper.scrapeScene) { const res = await qu.get(url); if (res.ok) { return scraper.scrapeScene(res.item, url, entity, baseRelease, options); } return res.status; } return null; } async function scrapeRelease(baseRelease, entities, type = 'scene') { const entity = baseRelease.entity || entities[urlToSiteSlug(baseRelease.url)]; if (!entity) { logger.warn(`No entity available for ${baseRelease.url}`); return baseRelease; } if ((!baseRelease.url && !baseRelease.path) || !argv.deep) { return { ...baseRelease, entity, }; } const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug] || scrapers.releases[entity.parent?.parent?.slug]; const layoutScraper = scraper?.[entity.parameters?.layout] || scraper?.[entity.parent?.parameters?.layout] || scraper?.[entity.parent?.parent?.parameters?.layout] || scraper; if (!layoutScraper) { logger.warn(`Could not find scraper for ${baseRelease.url}`); return baseRelease; } if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie)) { logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`); return baseRelease; } try { logger.verbose(`Fetching ${type} ${baseRelease.url}`); const scrapedRelease = type === 'scene' ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, include, null) : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, include, null); // object-merge-advance will use null as explicit false on hard merged keys, even when null as explicit falls is disabled // filter out keys with null values to ensure original base value is used instead const curatedScrapedRelease = Object.entries(scrapedRelease).reduce((acc, [key, value]) => ({ ...acc, ...(value !== null && value !== undefined && { [key]: value, }), }), {}); const mergedRelease = { ...merge(baseRelease, curatedScrapedRelease, { dedupeStringsInArrayValues: true, hardMergeKeys: ['actors', 'poster', 'trailer', 'teaser'], }), deep: !!scrapedRelease, entity, }; if (!mergedRelease.entryId) { throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' }); } if (scrapedRelease && baseRelease?.tags) { // accumulate all available tags mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags); } return mergedRelease; } catch (error) { logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`); if (argv.debug) { console.error(error); } if (error.code === 'NO_ENTRY_ID') { return null; } return baseRelease; } } async function scrapeReleases(baseReleases, entities, type) { return Promise.map( baseReleases, async baseRelease => scrapeRelease(baseRelease, entities, type), { concurrency: 10 }, ); } async function fetchReleases(baseReleasesOrUrls, type = 'scene') { const baseReleases = toBaseReleases(baseReleasesOrUrls); const entities = await findEntities(baseReleases); const deepReleases = await scrapeReleases(baseReleases, entities, type); return deepReleases.filter(Boolean); } async function fetchScenes(baseReleasesOrUrls) { return fetchReleases(baseReleasesOrUrls, 'scene'); } async function fetchMovies(baseReleasesOrUrls) { const movies = await fetchReleases(baseReleasesOrUrls, 'movie'); return movies; } module.exports = { fetchReleases, fetchScenes, fetchMovies, toBaseReleases, };