'use strict'; const Promise = require('bluebird'); const argv = require('./argv'); const include = require('./utils/argv-include')(argv); const logger = require('./logger')(__filename); const knex = require('./knex'); const scrapers = require('./scrapers/scrapers'); const { curateSites } = require('./sites'); const { curateNetworks } = require('./networks'); function urlToSiteSlug(url) { try { const slug = new URL(url) .hostname .match(/([\w-]+)\.\w+$/)?.[1]; return slug; } catch (error) { logger.warn(`Failed to derive site slug from '${url}': ${error.message}`); return null; } } async function findSites(baseReleases) { const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site); const siteSlugs = Array.from(new Set( baseReleasesWithoutSite .map(baseRelease => urlToSiteSlug(baseRelease.url)) .filter(Boolean), )); const siteEntries = await knex('sites') .leftJoin('networks', 'networks.id', 'sites.network_id') .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description') .whereIn('sites.slug', siteSlugs); const networkEntries = await knex('networks').whereIn('slug', siteSlugs); const sites = await curateSites(siteEntries, true, false); const networks = await curateNetworks(networkEntries, true, false, false); const markedNetworks = networks.map(network => ({ ...network, isNetwork: true })); const sitesBySlug = [] .concat(markedNetworks, sites) .reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {}); return sitesBySlug; } function toBaseReleases(baseReleasesOrUrls) { return baseReleasesOrUrls .map((baseReleaseOrUrl) => { if (baseReleaseOrUrl.url) { // base release with URL return { ...baseReleaseOrUrl, deep: false, }; } if (/^http/.test(baseReleaseOrUrl)) { // URL return { url: baseReleaseOrUrl, deep: false, }; } if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) { // base release without URL, prepare for passthrough return { ...baseReleaseOrUrl, deep: false, }; } logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`); return null; }) .filter(Boolean); } async function scrapeRelease(baseRelease, sites, type = 'scene') { const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)]; const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback if (!site) { logger.warn(`No site available for ${baseRelease.url}`); return baseRelease; } if ((!baseRelease.url && !baseRelease.path) || !argv.deep) { return { ...baseRelease, site, }; } const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; if (!scraper) { logger.warn(`Could not find scraper for ${baseRelease.url}`); return baseRelease; } if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) { logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`); return baseRelease; } try { logger.verbose(`Fetching ${type} ${baseRelease.url}`); const scrapedRelease = type === 'scene' ? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include) : await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include); const mergedRelease = { ...baseRelease, ...scrapedRelease, deep: !!scrapedRelease, site, }; if (scrapedRelease && baseRelease?.tags) { // accumulate all available tags mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags); } return mergedRelease; } catch (error) { logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`); return baseRelease; } } async function scrapeReleases(baseReleases, sites, type) { return Promise.map( baseReleases, async baseRelease => scrapeRelease(baseRelease, sites, type), { concurrency: 10 }, ); } async function fetchReleases(baseReleasesOrUrls, type = 'scene') { const baseReleases = toBaseReleases(baseReleasesOrUrls); const sites = await findSites(baseReleases); const deepReleases = await scrapeReleases(baseReleases, sites, type); return deepReleases; } async function fetchScenes(baseReleasesOrUrls) { return fetchReleases(baseReleasesOrUrls, 'scene'); } async function fetchMovies(baseReleasesOrUrls) { return fetchReleases(baseReleasesOrUrls, 'movie'); } module.exports = { fetchReleases, fetchScenes, fetchMovies, };