// traxxx/src/scrape-releases.js
'use strict';
const config = require('config');
const Promise = require('bluebird');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const scrapers = require('./scrapers/scrapers');
const { findSiteByUrl } = require('./sites');
const { findNetworkByUrl } = require('./networks');
const { storeReleases } = require('./releases');
/**
 * Resolve the site a release belongs to.
 *
 * Prefers a site already attached to the release, then a direct site match
 * on the URL, then the owning network dressed up as a fallback site.
 *
 * @param {string|undefined} url - Release URL to match against known sites.
 * @param {Object|null} release - Pre-scraped release data, possibly with a site.
 * @returns {Promise<Object|null>} The matched site (or network fallback), or null.
 */
async function findSite(url, release) {
  // a pre-scraped release may already carry its site
  if (release?.site) {
    return release.site;
  }

  if (!url) {
    return null;
  }

  const site = await findSiteByUrl(url);
  if (site) return site;

  // no direct site match; expose the whole network as a fallback "site"
  const network = await findNetworkByUrl(url);
  return network
    ? { ...network, network, isFallback: true }
    : null;
}
/**
 * Scrape a single release (scene or movie) from a URL or pre-scraped data.
 *
 * @param {string|Object|undefined} source - Release URL, or a pre-scraped release object.
 * @param {Object|null} [basicRelease] - Pre-scraped release data when source is a URL.
 * @param {string} [type] - 'scene' or 'movie'.
 * @param {*} [preflight] - Opaque data passed through to the scraper.
 * @returns {Promise<Object|null>} The release with its site attached, or null when
 *   the scraper cannot fetch this type but pre-scraped data exists.
 * @throws {Error} When no site can be resolved, no scraper exists for the site,
 *   or the scraper cannot fetch this type and no pre-scraped data is available.
 */
async function scrapeRelease(source, basicRelease = null, type = 'scene', preflight) {
  // profile scraper may return either URLs or pre-scraped scenes
  const sourceIsUrlOrEmpty = typeof source === 'string' || source === undefined;
  const url = sourceIsUrlOrEmpty ? source : source?.url;
  const release = sourceIsUrlOrEmpty ? basicRelease : source;

  const site = basicRelease?.site || await findSite(url, release);

  if (!site) {
    throw new Error(`Could not find site ${url} in database`);
  }

  // without --deep, pre-scraped data is returned as-is (site attached)
  if (!argv.deep && release) {
    return {
      ...release,
      site,
    };
  }

  // fall back to the network-level scraper when the site has none of its own
  const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

  if (!scraper) {
    throw new Error('Could not find scraper for URL');
  }

  if (type === 'scene' && !scraper.fetchScene) {
    if (release) {
      logger.warn(`The '${site.name}'-scraper cannot fetch individual scenes`);
      return null;
    }

    throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
  }

  if (type === 'movie' && !scraper.fetchMovie) {
    if (release) {
      logger.warn(`The '${site.name}'-scraper cannot fetch individual movies`);
      return null;
    }

    throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
  }

  if (!release) {
    logger.info(`Scraping release from ${url}`);
  }

  const scrapedRelease = type === 'scene'
    ? await scraper.fetchScene(url, site, release, preflight)
    : await scraper.fetchMovie(url, site, release, preflight);

  return {
    ...release,
    ...scrapedRelease,
    // merge pre-scraped tags with scraped tags instead of overwriting;
    // guard against a scraper that returns no tags at all
    ...(scrapedRelease && release?.tags && {
      tags: release.tags.concat(scrapedRelease.tags || []),
    }),
    site,
  };
}
/**
 * Scrape multiple releases concurrently, then optionally inspect and store them.
 *
 * @param {Array<string|Object>} sources - Release URLs and/or pre-scraped releases.
 * @param {Object|null} [release] - Basic release data applied to every source.
 * @param {string} [type] - 'scene' or 'movie', stamped onto each result.
 * @param {*} [preflight] - Opaque data passed through to the scrapers.
 * @returns {Promise<void>}
 */
async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
  // bluebird's Promise.map caps scraper concurrency; .filter drops the nulls
  // returned for releases a scraper could not fetch
  const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), {
    concurrency: 5,
  }).filter(Boolean);

  const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));

  if (argv.scene && argv.inspect) {
    // only show when fetching from URL
    console.log(curatedReleases);
  }

  if (argv.save) {
    // TODO: scrape an associated movie release and store it as the parent
    // (set curatedRelease.parentId from the stored movie's id)
    const { releases: storedReleases } = await storeReleases(curatedReleases);

    if (storedReleases) {
      logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
    }
  }
}
2020-01-02 23:59:02 +00:00
module.exports = {
scrapeRelease,
scrapeReleases,
};