2020-03-16 03:10:52 +00:00
|
|
|
'use strict';
|
|
|
|
|
2020-03-16 23:58:03 +00:00
|
|
|
const Promise = require('bluebird');
|
|
|
|
|
2020-03-16 03:10:52 +00:00
|
|
|
const argv = require('./argv');
|
|
|
|
const logger = require('./logger')(__filename);
|
|
|
|
const knex = require('./knex');
|
|
|
|
const scrapers = require('./scrapers/scrapers');
|
|
|
|
const { curateSites } = require('./sites');
|
|
|
|
const { curateNetworks } = require('./networks');
|
|
|
|
|
|
|
|
/**
 * Derives a site slug from a URL's second-level domain,
 * e.g. 'https://www.brazzers.com/scene/123' -> 'brazzers'.
 *
 * @param {string} url - Release URL to derive the slug from.
 * @returns {string|null} The slug, or null when the URL is malformed or the
 *   hostname does not contain a matchable domain label.
 */
function urlToSiteSlug(url) {
	try {
		const slug = new URL(url)
			.hostname
			.match(/([\w-]+)\.\w+$/)?.[1]
			?? null; // normalize a failed match (undefined) to null for a consistent return type

		return slug;
	} catch (error) {
		// new URL() throws a TypeError on malformed input; treat as "no slug"
		logger.warn(`Failed to derive site slug from '${url}': ${error.message}`);

		return null;
	}
}
|
|
|
|
|
|
|
|
/**
 * Resolves site (or fallback network) entities for base releases that carry a
 * URL but no site, keyed by the slug derived from each URL.
 *
 * @param {Array<Object>} baseReleases - Normalized base releases.
 * @returns {Promise<Object>} Map of slug -> curated site or network; networks
 *   are marked with `isFallback: true` and, sharing a slug with a site,
 *   overwrite it (preserves the original accumulator order).
 */
async function findSites(baseReleases) {
	// Only releases that have a URL but no site assigned yet need a lookup.
	const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site);

	const siteSlugs = Array.from(new Set(
		baseReleasesWithoutSite
			.map(baseRelease => urlToSiteSlug(baseRelease.url))
			.filter(Boolean),
	));

	// The two lookups are independent; run them concurrently instead of sequentially.
	const [siteEntries, networkEntries] = await Promise.all([
		knex('sites').whereIn('slug', siteSlugs),
		knex('networks').whereIn('slug', siteSlugs),
	]);

	// Curation of sites and networks is independent as well.
	const [sites, networks] = await Promise.all([
		curateSites(siteEntries, true, false),
		curateNetworks(networkEntries, true, false, false),
	]);

	// Networks only act as a fallback when no dedicated site scraper matches.
	const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));

	// Index by slug; later entries (networks) overwrite earlier ones (sites),
	// matching the original reduce-based accumulation order.
	const sitesBySlug = Object.fromEntries(
		[...sites, ...markedNetworks].map(site => [site.slug, site]),
	);

	return sitesBySlug;
}
|
|
|
|
|
|
|
|
/**
 * Normalizes a mixed list of URLs and base-release objects into base releases
 * marked as not-yet-deep-scraped. Malformed entries are warned about and dropped.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base releases.
 * @returns {Array<Object>} Base releases, each with `deep: false`.
 */
function toBaseReleases(baseReleasesOrUrls) {
	return baseReleasesOrUrls
		.map((baseReleaseOrUrl) => {
			// guard: a null/undefined entry would crash the property access below
			if (!baseReleaseOrUrl) {
				logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);

				return null;
			}

			if (baseReleaseOrUrl.url) {
				// base release with URL
				return {
					...baseReleaseOrUrl,
					deep: false,
				};
			}

			if (/^http/.test(baseReleaseOrUrl)) {
				// plain URL string
				return {
					url: baseReleaseOrUrl,
					deep: false,
				};
			}

			if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
				// base release without URL, prepare for passthrough
				return {
					...baseReleaseOrUrl,
					deep: false,
				};
			}

			logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);

			return null;
		})
		.filter(Boolean);
}
|
|
|
|
|
|
|
|
/**
 * Deep-scrapes a single base release when possible, otherwise returns the base
 * release (optionally enriched with its site) unchanged.
 *
 * @param {Object} baseRelease - Base release, possibly with `url`/`site`/`tags`.
 * @param {Object} sites - Slug-keyed map of sites/networks from findSites().
 * @param {string} [type='scene'] - Entity type: 'scene' or 'movie'.
 * @returns {Promise<Object>} Merged release, or the base release on any failure.
 */
async function scrapeRelease(baseRelease, sites, type = 'scene') {
	// Prefer an explicitly assigned site; otherwise derive one from the URL.
	const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)];

	if (!site) {
		logger.warn(`No site available for ${baseRelease.url}`);
		return baseRelease;
	}

	// Without a URL or path, or when deep scraping is disabled, pass through.
	if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
		return {
			...baseRelease,
			site,
		};
	}

	const scraper = scrapers.releases[site.slug];

	if (!scraper) {
		logger.warn(`Could not find scraper for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
		logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
		return baseRelease;
	}

	try {
		const scrapedRelease = type === 'scene'
			? await scraper.fetchScene(baseRelease.url, site, baseRelease)
			: await scraper.fetchMovie(baseRelease.url, site, baseRelease);

		const mergedRelease = {
			...baseRelease,
			...scrapedRelease,
			deep: !!scrapedRelease,
			site,
		};

		// Merge tags rather than letting the scraped set overwrite the base set.
		// Guard with || []: concat(undefined) would append a literal undefined element.
		if (scrapedRelease && baseRelease?.tags) {
			mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags || []);
		}

		return mergedRelease;
	} catch (error) {
		logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);
		return baseRelease;
	}
}
|
|
|
|
|
|
|
|
/**
 * Deep-scrapes a batch of base releases with bounded concurrency.
 *
 * @param {Array<Object>} baseReleases - Normalized base releases.
 * @param {Object} sites - Slug-keyed site/network map from findSites().
 * @returns {Promise<Array<Object>>} Scraped (or passed-through) releases.
 */
async function scrapeReleases(baseReleases, sites) {
	// Limit simultaneous deep scrapes (bluebird Promise.map concurrency option).
	const concurrency = 10;

	return Promise.map(
		baseReleases,
		baseRelease => scrapeRelease(baseRelease, sites),
		{ concurrency },
	);
}
|
|
|
|
|
|
|
|
/**
 * Entry point: normalizes the input, resolves sites for each release, then
 * deep-scrapes the batch.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base releases.
 * @returns {Promise<Array<Object>>} Deep-scraped releases.
 */
async function fetchReleases(baseReleasesOrUrls) {
	const baseReleases = toBaseReleases(baseReleasesOrUrls);
	const sites = await findSites(baseReleases);

	return scrapeReleases(baseReleases, sites);
}
|
|
|
|
|
|
|
|
module.exports = fetchReleases;
|