'use strict';

const config = require('config');
const Promise = require('bluebird');

const logger = require('./logger')(__filename);
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const knex = require('./knex');
const scrapers = require('./scrapers/scrapers');
const { findSiteByUrl } = require('./sites');
const { findNetworkByUrl } = require('./networks');
const { storeReleases } = require('./releases');

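// Resolves the entity a URL belongs to: a pre-attached release.site wins,
// then an exact site match, then the parent network as a fallback entry
// (marked with isFallback so callers can tell the difference)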
async function findSite(url, release) {
	if (release?.site) return release.site;
	if (!url) return null;

	const site = await findSiteByUrl(url);

	if (site) {
		return site;
	}

	const network = await findNetworkByUrl(url);

	if (network) {
		return {
			...network,
			network,
			isFallback: true,
		};
	}

	return null;
}

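// Scrapes a single release; `source` is either a URL or a pre-scraped release
// object, and `type` selects the scene or movie scraper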
async function scrapeRelease(source, basicRelease = null, type = 'scene', beforeFetchLatest) {
	// profile scraper may return either URLs or pre-scraped scenes
	const sourceIsUrlOrEmpty = typeof source === 'string' || source === undefined;
	const url = sourceIsUrlOrEmpty ? source : source?.url;
	const release = sourceIsUrlOrEmpty ? basicRelease : source;

	const site = basicRelease?.site || await findSite(url, release);

	if (!site) {
		throw new Error(`Could not find site for ${url} in database`);
	}

	if (!argv.deep && release) {
		return {
			...release,
			site,
		};
	}

	const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];

	if (!scraper) {
		throw new Error(`Could not find scraper for ${url}`);
	}

	if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
		if (release) {
			logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
			return null;
		}

		throw new Error(`The '${site.name}'-scraper cannot fetch individual ${type}s`);
	}

	if (!release) {
		logger.info(`Scraping release from ${url}`);
	}

	const scrapedRelease = type === 'scene'
		? await scraper.fetchScene(url, site, release, beforeFetchLatest, include)
		: await scraper.fetchMovie(url, site, release, beforeFetchLatest, include);

	return {
		...release,
		...scrapedRelease,
		...(scrapedRelease && release?.tags && {
			// guard against scrapers that return no tags, so concat does not append undefined
			tags: release.tags.concat(scrapedRelease.tags || []),
		}),
		site,
	};
}

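// Collects the movies referenced by a batch of stored scenes, scrapes and
// stores them, and links each movie to its member scenes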
async function accumulateMovies(releases) {
	if (!argv.withMovies) return [];

	const moviesByUrl = releases.reduce((acc, release) => {
		if (!release.movie) return acc;
		const movie = release.movie.url ? release.movie : { url: release.movie };

		if (!acc[movie.url]) {
			acc[movie.url] = {
				...movie,
				type: 'movie',
				sceneIds: [],
			};
		}

		acc[movie.url].sceneIds = acc[movie.url].sceneIds.concat(release.id);

		return acc;
	}, {});

	const movies = await Promise.map(Object.values(moviesByUrl), async movie => scrapeRelease(movie, null, 'movie'));
	const { releases: storedMovies } = await storeReleases(movies);

	const movieAssociations = storedMovies.reduce((acc, movie) => acc.concat(movie.sceneIds.map(sceneId => ({
		movie_id: movie.id,
		scene_id: sceneId,
	}))), []);

	// avoid calling insert with an empty array, which some knex versions mishandle
	if (movieAssociations.length > 0) {
		await knex('releases_movies').insert(movieAssociations);
	}

	return movies;
}

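// Scrapes a batch of scene or movie sources with limited concurrency and,
// with --save, persists them along with any associated movies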
async function scrapeReleases(sources, type = 'scene') {
	const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, null, type), {
		concurrency: 5,
	}).filter(Boolean);

	const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));

	if ((argv.scene || argv.movie) && argv.inspect) {
		// only show when fetching from URL
		console.log(curatedReleases);
	}

	if (argv.save) {
		const { releases: storedReleases } = await storeReleases(curatedReleases);

		await accumulateMovies(storedReleases);

		if (storedReleases) {
			logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));
		}

		return storedReleases;
	}

	return curatedReleases;
}

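// Convenience wrappers around scrapeReleases for each release type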
async function scrapeScenes(sources) {
	return scrapeReleases(sources, 'scene');
}

async function scrapeMovies(sources) {
	return scrapeReleases(sources, 'movie');
}

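// Re-scrapes base releases in place to fill in full details; failures are
// logged and the release is kept with deep: false rather than dropped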
async function deepFetchReleases(baseReleases, beforeFetchLatest) {
	const deepReleases = await Promise.map(baseReleases, async (release) => {
		if (release.url || (release.path && release.site)) {
			try {
				const fullRelease = await scrapeRelease(release.url, release, 'scene', beforeFetchLatest);

				if (fullRelease) {
					return {
						...release,
						...fullRelease,
						deep: true,
					};
				}

				logger.warn(`Release scraper returned empty result for ${release.url}`);

				return release;
			} catch (error) {
				logger.error(`Failed to scrape ${release.url}: ${error}`);

				return {
					...release,
					deep: false,
				};
			}
		}

		return release;
	}, {
		concurrency: 2,
	});

	return deepReleases;
}

module.exports = {
	deepFetchReleases,
	scrapeMovies,
	scrapeRelease,
	scrapeReleases,
	scrapeScenes,
};
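
// Example usage sketch (hypothetical URL and module path; assumes the scene's
// site or network exists in the database and a matching scraper is registered):
//   const { scrapeScenes } = require('./scrape-releases');
//   scrapeScenes(['https://www.example.com/scene/12345']).then(console.log);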