'use strict';

const Promise = require('bluebird');
const merge = require('object-merge-advanced');

const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const logger = require('./logger')(__filename);
const knex = require('./knex');
const qu = require('./utils/qu');
const scrapers = require('./scrapers/scrapers');

function urlToSiteSlug(url) {
  try {
    const slug = new URL(url)
      .hostname
      .match(/([\w-]+)\.\w+$/)?.[1]
      .replace(/[-_]+/g, '');

    return slug;
  } catch (error) {
    logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);

    return null;
  }
}
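
// Example (hypothetical URL): urlToSiteSlug('https://www.kink-site.com/scenes/123')
// takes the hostname 'www.kink-site.com', captures the second-level domain
// 'kink-site' and strips the separator, yielding the entity slug 'kinksite'.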

async function findEntities(baseReleases) {
  const baseReleasesWithoutEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);

  const entitySlugs = Array.from(new Set(
    baseReleasesWithoutEntity
      .map(baseRelease => urlToSiteSlug(baseRelease.url))
      .filter(Boolean),
  ));

  const entities = await knex('entities')
    .select(knex.raw('entities.*, row_to_json(parents) as parent, json_agg(children) as children'))
    .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
    .leftJoin('entities as children', 'children.parent_id', 'entities.id')
    .whereIn('entities.slug', entitySlugs)
    .groupBy('entities.id', 'parents.id')
    .orderBy('entities.type', 'asc');

  // entities are ordered by type and the first entry per slug is kept, so a
  // channel entity takes precedence over a network entity with the same slug
  const entitiesBySlug = entities.reduce((accEntities, entity) => ({ ...accEntities, [entity.slug]: accEntities[entity.slug] || entity }), {});

  return entitiesBySlug;
}
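
// Result sketch (hypothetical slug): for base releases pointing at
// 'https://examplesite.com/...', this resolves to a map keyed by slug, e.g.
//   { examplesite: { id, slug: 'examplesite', parent: {...}, children: [...] } }
// with parent and children populated by the self-joins above.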

function toBaseReleases(baseReleasesOrUrls, entity = null) {
  if (!baseReleasesOrUrls) {
    return [];
  }

  return baseReleasesOrUrls
    .map((baseReleaseOrUrl) => {
      if (baseReleaseOrUrl?.url) {
        // base release with URL
        return {
          ...baseReleaseOrUrl,
          entity: baseReleaseOrUrl.entity || entity,
          deep: false,
        };
      }

      if (/^http/.test(baseReleaseOrUrl)) {
        // plain URL string
        return {
          url: baseReleaseOrUrl,
          entity,
          deep: false,
        };
      }

      if (baseReleaseOrUrl && typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
        // base release without URL, prepare for passthrough
        return {
          ...baseReleaseOrUrl,
          entity: baseReleaseOrUrl.entity || entity,
          deep: false,
        };
      }

      logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`);
      return null;
    })
    .filter(Boolean);
}
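
// Normalization sketch (hypothetical inputs): plain URL strings and base release
// objects may be mixed, e.g.
//   toBaseReleases(['https://examplesite.com/scene/1', { url: 'https://examplesite.com/scene/2' }])
// yields two objects of the shape { url, entity: null, deep: false }.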

async function fetchScene(scraper, url, entity, baseRelease, options) {
  if (scraper.fetchScene) {
    return scraper.fetchScene(url, entity, baseRelease, options, null);
  }

  if (scraper.scrapeScene) {
    const res = await qu.get(url);

    if (res.ok) {
      return scraper.scrapeScene(res.item, url, entity, baseRelease, options);
    }

    // a failed fetch yields no scraped data, so the release will not be marked deep
    return null;
  }

  return null;
}
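
// Interface sketch, inferred from the dispatch above (details hypothetical): a
// scraper either fetches the page itself via fetchScene(url, entity, baseRelease,
// options, null), or exposes scrapeScene(item, url, entity, baseRelease, options)
// and receives the response item that qu.get already fetched and parsed.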

async function scrapeRelease(baseRelease, entities, type = 'scene') {
  const entity = baseRelease.entity || entities[urlToSiteSlug(baseRelease.url)];

  if (!entity) {
    logger.warn(`No entity available for ${baseRelease.url}`);
    return baseRelease;
  }

  if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
    return {
      ...baseRelease,
      entity,
    };
  }

  const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug];
  const layoutScraper = scraper?.[entity.parameters?.layout] || scraper;

  if (!layoutScraper) {
    logger.warn(`Could not find scraper for ${baseRelease.url}`);
    return baseRelease;
  }

  if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie)) {
    logger.warn(`The '${entity.name}' scraper cannot scrape individual ${type}s`);
    return baseRelease;
  }

  try {
    logger.verbose(`Fetching ${type} ${baseRelease.url}`);

    const scrapedRelease = type === 'scene'
      ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, include)
      : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, include, null);

    const mergedRelease = {
      ...merge(baseRelease, scrapedRelease, {
        dedupeStringsInArrayValues: true,
        hardMergeKeys: ['actors', 'poster', 'trailer', 'teaser'],
      }),
      deep: !!scrapedRelease,
      entity,
    };

    if (!mergedRelease.entryId) {
      throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' });
    }

    if (scrapedRelease && baseRelease?.tags) {
      // accumulate all available tags
      mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags || []);
    }

    return mergedRelease;
  } catch (error) {
    logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);

    if (argv.debug) {
      console.error(error);
    }

    if (error.code === 'NO_ENTRY_ID') {
      return null;
    }

    return baseRelease;
  }
}
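
// Merge sketch (hypothetical values): hardMergeKeys makes scraped values replace
// base values outright instead of being merged, e.g.
//   merge({ poster: 'thumb.jpg', tags: ['a'] }, { poster: 'full.jpg', tags: ['a', 'b'] },
//     { hardMergeKeys: ['poster'], dedupeStringsInArrayValues: true })
// yields { poster: 'full.jpg', tags: ['a', 'b'] }: the scraped poster wins, while
// the tag arrays are combined with duplicate strings removed.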

async function scrapeReleases(baseReleases, entities, type) {
  return Promise.map(
    baseReleases,
    async baseRelease => scrapeRelease(baseRelease, entities, type),
    { concurrency: 10 },
  );
}

async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
  const baseReleases = toBaseReleases(baseReleasesOrUrls);
  const entities = await findEntities(baseReleases);

  const deepReleases = await scrapeReleases(baseReleases, entities, type);

  return deepReleases.filter(Boolean);
}

async function fetchScenes(baseReleasesOrUrls) {
  return fetchReleases(baseReleasesOrUrls, 'scene');
}

async function fetchMovies(baseReleasesOrUrls) {
  return fetchReleases(baseReleasesOrUrls, 'movie');
}

module.exports = {
  fetchReleases,
  fetchScenes,
  fetchMovies,
  toBaseReleases,
};
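
// Usage sketch (module path and URL hypothetical):
//   const { fetchScenes } = require('./deep');
//   const releases = await fetchScenes(['https://www.examplesite.com/scene/123']);
// Each result is the base release enriched by the matching entity's scraper,
// with deep set to true when the deep scrape succeeded.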