// traxxx/src/deep.js
// (171 lines, 4.6 KiB, JavaScript)
'use strict';
const Promise = require('bluebird');
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const logger = require('./logger')(__filename);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { curateSites } = require('./sites');
const { curateNetworks } = require('./networks');
/**
 * Derives a site slug from a release URL, e.g. 'https://www.brazzers.com/x' -> 'brazzers'.
 * Takes the second-level label of the hostname (the part before the TLD).
 *
 * @param {string} url - absolute URL of a release.
 * @returns {string|null|undefined} the slug, undefined when the hostname has no
 *   matching label, or null when the URL cannot be parsed at all.
 */
function urlToSiteSlug(url) {
  try {
    const { hostname } = new URL(url);
    const match = hostname.match(/([\w-]+)\.\w+$/);

    return match?.[1];
  } catch (error) {
    logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);
    return null;
  }
}
/**
 * Resolves the sites (or networks, as a fallback) referenced by a batch of base
 * releases, keyed by slug derived from each release's URL.
 *
 * @param {Array<Object>} baseReleases - releases, possibly lacking a `site`.
 * @returns {Promise<Object>} map of slug -> curated site or network entity;
 *   network entries carry `isNetwork: true`.
 */
async function findSites(baseReleases) {
  const baseReleasesWithoutSite = baseReleases.filter((release) => release.url && !release.site);

  const siteSlugs = Array.from(new Set(
    baseReleasesWithoutSite
      .map((baseRelease) => urlToSiteSlug(baseRelease.url))
      .filter(Boolean),
  ));

  // The two lookups are independent; run them in parallel instead of serially.
  const [siteEntries, networkEntries] = await Promise.all([
    knex('sites')
      .leftJoin('networks', 'networks.id', 'sites.network_id')
      .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
      .whereIn('sites.slug', siteSlugs),
    knex('networks').whereIn('slug', siteSlugs),
  ]);

  const sites = await curateSites(siteEntries, true, false);
  const networks = await curateNetworks(networkEntries, true, false, false);

  // Flag networks so scrapeRelease can supply the network-as-site fallback.
  const markedNetworks = networks.map((network) => ({ ...network, isNetwork: true }));

  // Build the slug map in one pass (avoids O(n^2) spread-in-reduce);
  // sites come last so they win over a network with the same slug.
  return Object.fromEntries(
    [...markedNetworks, ...sites].map((site) => [site.slug, site]),
  );
}
/**
 * Normalizes a mixed list of base-release objects and plain URL strings into
 * base releases, each marked `deep: false` (not yet deep-scraped).
 * Malformed entries are logged and dropped.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - releases and/or URL strings.
 * @returns {Array<Object>} well-formed base releases.
 */
function toBaseReleases(baseReleasesOrUrls) {
  return baseReleasesOrUrls
    .map((baseReleaseOrUrl) => {
      // Guard first: property access on null/undefined used to throw here.
      if (baseReleaseOrUrl == null) {
        logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);
        return null;
      }

      if (baseReleaseOrUrl.url) {
        // base release with URL
        return {
          ...baseReleaseOrUrl,
          deep: false,
        };
      }

      // Only test strings against the URL pattern; the bare regex test used to
      // coerce arrays/objects to strings and could misclassify them.
      if (typeof baseReleaseOrUrl === 'string' && /^http/.test(baseReleaseOrUrl)) {
        // URL
        return {
          url: baseReleaseOrUrl,
          deep: false,
        };
      }

      if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
        // base release without URL, prepare for passthrough
        return {
          ...baseReleaseOrUrl,
          deep: false,
        };
      }

      logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);
      return null;
    })
    .filter(Boolean);
}
/**
 * Deep-scrapes a single base release using the scraper registered for its site
 * (or its site's network). Falls back to returning the base release unchanged
 * when no site, scraper, or fetch method is available, or on scrape failure.
 *
 * @param {Object} baseRelease - release with at least a `url` or `site`.
 * @param {Object} sites - slug -> site/network map from findSites().
 * @param {string} [type='scene'] - 'scene' or 'movie'.
 * @returns {Promise<Object>} merged release; `deep` is true only when the
 *   scraper returned data.
 */
async function scrapeRelease(baseRelease, sites, type = 'scene') {
  const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];

  // Must check before dereferencing site below; previously site.isNetwork was
  // read first, which threw and made this guard unreachable.
  if (!site) {
    logger.warn(`No site available for ${baseRelease.url}`);
    return baseRelease;
  }

  // make site.network available, even when site is a network fallback
  const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site;

  if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
    return {
      ...baseRelease,
      site,
    };
  }

  // Optional chaining: a network fallback has no .network of its own.
  const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network?.slug];

  if (!scraper) {
    logger.warn(`Could not find scraper for ${baseRelease.url}`);
    return baseRelease;
  }

  if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
    logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
    return baseRelease;
  }

  try {
    logger.verbose(`Fetching ${type} ${baseRelease.url}`);

    const scrapedRelease = type === 'scene'
      ? await scraper.fetchScene(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include)
      : await scraper.fetchMovie(baseRelease.url, siteWithFallbackNetwork, baseRelease, null, include);

    const mergedRelease = {
      ...baseRelease,
      ...scrapedRelease,
      deep: !!scrapedRelease,
      site,
    };

    if (scrapedRelease && baseRelease?.tags) {
      // accumulate all available tags; guard so a scraper without tags does
      // not append a literal `undefined` element
      mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags ?? []);
    }

    return mergedRelease;
  } catch (error) {
    logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);
    return baseRelease;
  }
}
/**
 * Deep-scrapes a batch of base releases, at most 10 concurrently
 * (bluebird's concurrency-limited map).
 *
 * @param {Array<Object>} baseReleases - normalized base releases.
 * @param {Object} sites - slug -> site/network map from findSites().
 * @param {string} type - 'scene' or 'movie'.
 * @returns {Promise<Array<Object>>} scraped (or passthrough) releases.
 */
async function scrapeReleases(baseReleases, sites, type) {
  const scrapeOne = (baseRelease) => scrapeRelease(baseRelease, sites, type);

  return Promise.map(baseReleases, scrapeOne, { concurrency: 10 });
}
/**
 * Entry point: normalizes URLs/base releases, resolves their sites, then
 * deep-scrapes the batch.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - releases and/or URL strings.
 * @param {string} [type='scene'] - 'scene' or 'movie'.
 * @returns {Promise<Array<Object>>} deep-scraped releases.
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
  const baseReleases = toBaseReleases(baseReleasesOrUrls);
  const sites = await findSites(baseReleases);

  return scrapeReleases(baseReleases, sites, type);
}
/**
 * Convenience wrapper: deep-fetches scene-type releases.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - releases and/or URL strings.
 * @returns {Promise<Array<Object>>} deep-scraped scenes.
 */
async function fetchScenes(baseReleasesOrUrls) {
  const scenes = await fetchReleases(baseReleasesOrUrls, 'scene');

  return scenes;
}
/**
 * Convenience wrapper: deep-fetches movie-type releases.
 *
 * @param {Array<Object|string>} baseReleasesOrUrls - releases and/or URL strings.
 * @returns {Promise<Array<Object>>} deep-scraped movies.
 */
async function fetchMovies(baseReleasesOrUrls) {
  const movies = await fetchReleases(baseReleasesOrUrls, 'movie');

  return movies;
}
// Public API: generic deep fetch plus per-type convenience wrappers.
module.exports = {
fetchReleases,
fetchScenes,
fetchMovies,
};