traxxx/src/scrape-releases.js

200 lines
5.6 KiB
JavaScript

'use strict';
const config = require('config');
const Promise = require('bluebird');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { findSiteByUrl } = require('./sites');
const { findNetworkByUrl } = require('./networks');
const { storeReleases } = require('./releases');
/**
 * Resolve the site a release belongs to.
 *
 * Lookup order: the site already attached to the release, then a site matched
 * by URL, then a network matched by URL. A network match is returned as a
 * site-shaped object: the network's own properties, a `network` property
 * pointing at the network itself, and `isFallback: true`.
 *
 * @param {string} [url] - Release URL used for the site and network lookups.
 * @param {Object} [release] - Release that may already carry a `site`.
 * @returns {Promise<Object|null>} The resolved site, or `null` when no URL is
 *   given or neither a site nor a network matches it.
 */
async function findSite(url, release) {
  const attachedSite = release?.site;

  if (attachedSite) {
    return attachedSite;
  }

  if (!url) {
    return null;
  }

  const matchedSite = await findSiteByUrl(url);

  if (matchedSite) {
    return matchedSite;
  }

  // no dedicated site for this URL; fall back to the network it belongs to
  const matchedNetwork = await findNetworkByUrl(url);

  return matchedNetwork
    ? { ...matchedNetwork, network: matchedNetwork, isFallback: true }
    : null;
}
/**
 * Scrape a single release (scene or movie).
 *
 * `source` is either a URL, or a release object that was already (partially)
 * scraped. When `source` is a URL or absent, `basicRelease` serves as the base
 * release instead. Unless `argv.deep` is set, an available base release is
 * returned as-is (with its site attached) without invoking the scraper.
 *
 * @param {string|Object} [source] - Release URL, or a pre-scraped release
 *   object with an optional `url`.
 * @param {Object} [basicRelease=null] - Base release used when `source` is a
 *   URL or absent; its `site`, when present, skips the site lookup.
 * @param {string} [type='scene'] - Either 'scene' or 'movie'.
 * @param {*} [beforeFetchLatest] - Passed through untouched to the scraper.
 * @returns {Promise<Object|null>} The base release merged with the scraped
 *   data and the resolved site, or `null` when a base release exists but the
 *   scraper cannot fetch individual releases of this type.
 * @throws {Error} When no site or scraper can be found, or when the scraper
 *   cannot fetch this type and there is no base release to fall back on.
 */
async function scrapeRelease(source, basicRelease = null, type = 'scene', beforeFetchLatest) {
  // profile scraper may return either URLs or pre-scraped scenes; a null
  // source carries no data of its own, so it is treated like an absent URL
  const sourceIsUrlOrEmpty = typeof source === 'string' || source === undefined || source === null;
  const url = sourceIsUrlOrEmpty ? (source ?? undefined) : source.url;
  const release = sourceIsUrlOrEmpty ? basicRelease : source;

  const site = basicRelease?.site || await findSite(url, release);

  if (!site) {
    throw new Error(`Could not find site for ${url} in database`);
  }

  if (!argv.deep && release) {
    return {
      ...release,
      site,
    };
  }

  // a site without a network simply has no network-level scraper to fall back on
  const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network?.slug];

  if (!scraper) {
    throw new Error(`Could not find scraper for ${url}`);
  }

  if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
    if (release) {
      // the base release is still usable by the caller, so this is not fatal
      logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
      return null;
    }

    throw new Error(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
  }

  if (!release) {
    logger.info(`Scraping release from ${url}`);
  }

  const scrapedRelease = type === 'scene'
    ? await scraper.fetchScene(url, site, release, beforeFetchLatest, include)
    : await scraper.fetchMovie(url, site, release, beforeFetchLatest, include);

  return {
    ...release,
    ...scrapedRelease,
    // keep the base tags alongside the scraped ones; a scraper that returns no
    // tags must not add an `undefined` entry to the list
    ...(scrapedRelease && release?.tags && {
      tags: release.tags.concat(scrapedRelease.tags || []),
    }),
    site,
  };
}
/**
 * Scrape and store the movies referenced by a set of scenes, and link every
 * stored movie to the scenes that referenced it in `releases_movies`.
 *
 * Does nothing unless `argv.withMovies` is set.
 *
 * @param {Object[]} releases - Scenes with an `id`; each may carry `movie` as
 *   either a URL string or an object with a `url`.
 * @returns {Promise<Object[]>} The scraped movies; empty when the feature is
 *   disabled, no scene references a movie, or no movie could be scraped.
 */
async function accumulateMovies(releases) {
  if (!argv.withMovies || !releases) return [];

  // group the referencing scene IDs by movie URL, so each movie is scraped once
  const moviesByUrl = new Map();

  releases.forEach((release) => {
    if (!release.movie) return;

    const movie = release.movie.url ? release.movie : { url: release.movie };

    if (!moviesByUrl.has(movie.url)) {
      moviesByUrl.set(movie.url, {
        ...movie,
        type: 'movie',
        sceneIds: [],
      });
    }

    moviesByUrl.get(movie.url).sceneIds.push(release.id);
  });

  if (moviesByUrl.size === 0) return [];

  const scrapedMovies = await Promise.map([...moviesByUrl.values()], async movie => scrapeRelease(movie, null, 'movie'));

  // scrapeRelease resolves to null when the scraper cannot fetch movies; such
  // entries must not reach the database
  const movies = scrapedMovies.filter(Boolean);

  if (movies.length === 0) return [];

  const { releases: storedMovies } = await storeReleases(movies);

  const movieAssociations = (storedMovies || []).flatMap(movie => movie.sceneIds.map(sceneId => ({
    movie_id: movie.id,
    scene_id: sceneId,
  })));

  if (movieAssociations.length > 0) {
    await knex('releases_movies').insert(movieAssociations);
  }

  return movies;
}
/**
 * Scrape a list of releases and, when `argv.save` is set, store them together
 * with the movies they reference.
 *
 * @param {Array<string|Object>} sources - URLs or pre-scraped releases, each
 *   handed to `scrapeRelease`.
 * @param {string} [type='scene'] - Release type assigned to every result.
 * @returns {Promise<Object[]|undefined>} Without `argv.save`, the scraped
 *   releases tagged with `type`; with `argv.save`, the `releases` returned by
 *   `storeReleases`.
 */
async function scrapeReleases(sources, type = 'scene') {
  const scrapeResults = await Promise.map(sources, async source => scrapeRelease(source, null, type), {
    concurrency: 5,
  });

  // scrapeRelease resolves to null for releases the scraper cannot fetch
  const curatedReleases = scrapeResults
    .filter(Boolean)
    .map(scrapedRelease => ({ ...scrapedRelease, type }));

  if (!argv.save) {
    return curatedReleases;
  }

  const { releases: storedReleases } = await storeReleases(curatedReleases);

  // only follow up on movies and report links when something was stored
  if (storedReleases) {
    await accumulateMovies(storedReleases);

    logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
  }

  return storedReleases;
}
/**
 * Scrape the given sources as scenes.
 *
 * @param {Array<string|Object>} sources - Forwarded unchanged to `scrapeReleases`.
 * @returns {Promise<*>} Whatever `scrapeReleases` resolves to for type 'scene'.
 */
async function scrapeScenes(sources) {
  const scenes = await scrapeReleases(sources, 'scene');

  return scenes;
}
/**
 * Scrape the given sources as movies.
 *
 * @param {Array<string|Object>} sources - Forwarded unchanged to `scrapeReleases`.
 * @returns {Promise<*>} Whatever `scrapeReleases` resolves to for type 'movie'.
 */
async function scrapeMovies(sources) {
  const movies = await scrapeReleases(sources, 'movie');

  return movies;
}
/**
 * Enrich base releases with the details from their individual release pages.
 *
 * A release is fetched when it has a `url`, or both a `path` and a `site`;
 * anything else is passed through untouched. Failures are logged rather than
 * thrown, so one broken release does not abort the whole batch.
 *
 * @param {Object[]} baseReleases - Releases to enrich.
 * @param {*} [beforeFetchLatest] - Passed through untouched to `scrapeRelease`.
 * @returns {Promise<Object[]>} One entry per base release: merged with the
 *   scraped data and `deep: true` on success, marked `deep: false` when
 *   scraping threw, and unchanged when there was nothing to fetch or the
 *   scraper returned nothing.
 */
async function deepFetchReleases(baseReleases, beforeFetchLatest) {
  const deepReleases = await Promise.map(baseReleases, async (release) => {
    const isFetchable = release.url || (release.path && release.site);

    if (!isFetchable) {
      return release;
    }

    // releases may be addressed by path instead of URL; log whichever is set
    const releaseLabel = release.url || release.path;

    try {
      const fullRelease = await scrapeRelease(release.url, release, 'scene', beforeFetchLatest);

      if (fullRelease) {
        return {
          ...release,
          ...fullRelease,
          deep: true,
        };
      }

      logger.warn(`Release scraper returned empty result for ${releaseLabel}`);

      return release;
    } catch (error) {
      logger.error(`Failed to scrape ${releaseLabel}: ${error}`);

      return {
        ...release,
        deep: false,
      };
    }
  }, {
    concurrency: 2,
  });

  return deepReleases;
}
// Public API: the release scraping entry points. `findSite` and
// `accumulateMovies` are not exported and stay internal to this module.
module.exports = {
deepFetchReleases,
scrapeMovies,
scrapeRelease,
scrapeReleases,
scrapeScenes,
};