2020-03-16 03:10:52 +00:00
|
|
|
'use strict';
|
|
|
|
|
2021-12-01 22:30:10 +00:00
|
|
|
const util = require('util');
|
2020-03-16 23:58:03 +00:00
|
|
|
const Promise = require('bluebird');
|
2022-11-27 03:22:58 +00:00
|
|
|
const unprint = require('unprint');
|
2021-08-15 11:16:48 +00:00
|
|
|
const { mergeAdvanced: merge } = require('object-merge-advanced');
|
2020-03-16 23:58:03 +00:00
|
|
|
|
2020-03-16 03:10:52 +00:00
|
|
|
const argv = require('./argv');
|
2020-03-21 01:48:24 +00:00
|
|
|
const include = require('./utils/argv-include')(argv);
|
2021-02-01 00:45:30 +00:00
|
|
|
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
2020-03-16 03:10:52 +00:00
|
|
|
const logger = require('./logger')(__filename);
|
2021-01-13 20:29:05 +00:00
|
|
|
const qu = require('./utils/qu');
|
2021-02-10 02:23:48 +00:00
|
|
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
2021-12-01 16:26:13 +00:00
|
|
|
const windows = require('./utils/http-windows');
|
2020-03-16 03:10:52 +00:00
|
|
|
|
2021-12-01 22:30:10 +00:00
|
|
|
const waitImmediate = util.promisify(setImmediate);
|
|
|
|
|
2020-11-15 22:50:04 +00:00
|
|
|
/**
 * Normalize a mixed list of release URLs and partial release objects into base releases.
 *
 * Every usable entry becomes an object flagged `deep: false`:
 * - a string starting with `http` becomes `{ url, entity, deep: false }`;
 * - a plain object (with or without a `url`) is shallow-copied, keeping its own
 *   `entity` when it has one and falling back to the `entity` argument otherwise.
 * Anything else (null, undefined, numbers, arrays, non-URL strings) is logged
 * as malformed and dropped from the result.
 *
 * @param {Array<string|Object>} [baseReleasesOrUrls] - URLs and/or base releases; a falsy value yields `[]`.
 * @param {Object|null} [entity=null] - Fallback entity for entries that do not carry their own.
 * @returns {Object[]} Base releases in input order, with malformed entries removed.
 */
function toBaseReleases(baseReleasesOrUrls, entity = null) {
  if (!baseReleasesOrUrls) {
    return [];
  }

  return baseReleasesOrUrls
    .map((baseReleaseOrUrl) => {
      // only test genuine strings: RegExp#test coerces its argument, so an
      // array like ['http://…'] would otherwise be mistaken for a URL
      if (typeof baseReleaseOrUrl === 'string' && /^http/.test(baseReleaseOrUrl)) {
        return {
          url: baseReleaseOrUrl,
          entity,
          deep: false,
        };
      }

      // typeof null === 'object', so null has to be excluded explicitly
      const isPlainRelease = baseReleaseOrUrl !== null
        && typeof baseReleaseOrUrl === 'object'
        && !Array.isArray(baseReleaseOrUrl);

      if (isPlainRelease) {
        // base release, with a URL or prepared for passthrough without one
        return {
          ...baseReleaseOrUrl,
          entity: baseReleaseOrUrl.entity || entity,
          deep: false,
        };
      }

      logger.warn(`Malformed base release, discarding '${String(baseReleaseOrUrl)}'`);

      return null;
    })
    .filter(Boolean);
}
|
|
|
|
|
2022-11-27 03:22:58 +00:00
|
|
|
/**
 * Fetch a release page with `unprint` and pass the response to the scraper.
 *
 * On an OK response the scraper's `scrapeMovie` (when `type` is 'movie') or
 * `scrapeScene` (any other type) is called with the response context, a
 * details object (`url`, `entity`, `baseRelease`, response `headers`) and
 * `options`. Otherwise the response status is returned as-is.
 *
 * @param {Object} scraper - Scraper exposing `scrapeScene` and/or `scrapeMovie`.
 * @param {string} url - Address of the release page.
 * @param {Object} entity - Entity the release belongs to.
 * @param {Object} baseRelease - Release data known before the deep fetch.
 * @param {Object} options - Passed to the scraper unchanged.
 * @param {string} type - 'movie' selects `scrapeMovie`; anything else `scrapeScene`.
 * @returns {Promise<*>} The scraper's result, or the response status on failure.
 */
async function fetchUnprintScene(scraper, url, entity, baseRelease, options, type) {
  // NOTE(review): rejectUnauthorized presumably disables TLS certificate checks — confirm against unprint
  const requestOptions = { rejectUnauthorized: false };
  const response = await unprint.get(url, requestOptions);

  if (!response.ok) {
    return response.status;
  }

  const scrapeMethod = type === 'movie' ? 'scrapeMovie' : 'scrapeScene';

  return scraper[scrapeMethod](
    response.context,
    {
      url,
      entity,
      baseRelease,
      headers: response.headers,
    },
    options,
  );
}
|
|
|
|
|
2021-12-20 01:22:10 +00:00
|
|
|
/**
 * Fetch a single scene or movie through the given scraper.
 *
 * Resolution order:
 * 1. If the scraper has a dedicated fetch method (`fetchScene` / `fetchMovie`),
 *    it is called with `baseRelease.url`, the entity, the base release,
 *    `options` and `null`.
 * 2. Otherwise, if it has a scrape method (`scrapeScene` / `scrapeMovie`), the
 *    page is fetched here (via `fetchUnprintScene` when `scraper.useUnprint`
 *    is set, else via `qu` with a fresh session) and handed to that method.
 *    A non-OK `qu` response resolves to the response status.
 * 3. Otherwise, or for an unknown `type`, the result is `null`.
 *
 * @param {Object} scraper - Scraper module for the entity's layout.
 * @param {string} url - Address of the release page.
 * @param {Object} entity - Entity the release belongs to.
 * @param {Object} baseRelease - Release data known before the deep fetch.
 * @param {Object} options - Passed to the scraper unchanged.
 * @param {string} [type='scene'] - Either 'scene' or 'movie'.
 * @returns {Promise<*>} Scraper result, a response status, or `null`.
 */
async function fetchScene(scraper, url, entity, baseRelease, options, type = 'scene') {
  const isMovie = type === 'movie';

  if (!isMovie && type !== 'scene') {
    return null;
  }

  const fetchMethod = isMovie ? 'fetchMovie' : 'fetchScene';
  const scrapeMethod = isMovie ? 'scrapeMovie' : 'scrapeScene';

  if (scraper[fetchMethod]) {
    return scraper[fetchMethod](baseRelease.url, entity, baseRelease, options, null);
  }

  if (!scraper[scrapeMethod]) {
    return null;
  }

  if (scraper.useUnprint) {
    return fetchUnprintScene(scraper, url, entity, baseRelease, options, type);
  }

  const session = qu.session();
  const response = await qu.get(url, null, null, {
    session,
    rejectUnauthorized: false,
  });

  // the cookie is read before the status check, whether or not the request succeeded
  const cookie = await session._sessionOptions.cookieJar.get(url);

  if (!response.ok) {
    return response.status;
  }

  const context = {
    session,
    headers: response.headers,
    cookieJar: session._sessionOptions.cookieJar,
    cookie,
  };

  return scraper[scrapeMethod](response.item, url, entity, baseRelease, options, context);
}
|
|
|
|
|
2021-12-20 01:22:10 +00:00
|
|
|
/**
 * Movie counterpart of `fetchScene`: delegates with the release type fixed to 'movie'.
 *
 * @param {Object} scraper - Scraper module for the entity's layout.
 * @param {string} url - Address of the movie page.
 * @param {Object} entity - Entity the movie belongs to.
 * @param {Object} baseRelease - Movie data known before the deep fetch.
 * @param {Object} options - Passed to the scraper unchanged.
 * @returns {Promise<*>} Whatever `fetchScene` resolves to for type 'movie'.
 */
function fetchMovie(scraper, url, entity, baseRelease, options) {
  const releaseType = 'movie';

  return fetchScene(scraper, url, entity, baseRelease, options, releaseType);
}
|
|
|
|
|
2021-02-01 00:45:30 +00:00
|
|
|
/**
 * Remove empty values from a scraped release so they cannot override base values when merged.
 *
 * Arrays are stripped of falsy items; keys whose value is null, undefined or
 * an array that ends up empty are omitted entirely.
 *
 * @param {Object} scrapedRelease - Release object returned by a scraper.
 * @returns {Object} Copy of the release without empty values.
 */
function curateScrapedRelease(scrapedRelease) {
  return Object.fromEntries(Object.entries(scrapedRelease)
    .map(([key, value]) => [key, Array.isArray(value) ? value.filter(Boolean) : value])
    .filter(([, value]) => value !== null && value !== undefined && !(Array.isArray(value) && value.length === 0)));
}

/**
 * Deep-scrape a single release and merge the result onto its base release.
 *
 * The entity is taken from the base release, or looked up in `entitiesBySlug`
 * by the site slug derived from the release URL. The base release is returned
 * untouched when no entity or scraper is available, when the scraper cannot
 * handle the requested type, or when the scrape fails. When the release has
 * neither `url` nor `path`, or `argv.deep` is off, the base release is
 * returned with the entity attached and nothing is fetched.
 *
 * @param {Object} baseRelease - Release data known before the deep fetch.
 * @param {Object} entitiesBySlug - Entities keyed by site slug.
 * @param {string} [type='scene'] - Either 'scene' or 'movie'.
 * @returns {Promise<Object|null>} The merged release (`deep: true`), the base
 *   release as a fallback, or `null` when the merged release has no `entryId`.
 */
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
  const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];

  if (!entity) {
    logger.warn(`No entity available for ${baseRelease.url}`);
    return baseRelease;
  }

  if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
    return {
      ...baseRelease,
      entity,
    };
  }

  const layoutScraper = entity.scraper;

  if (!layoutScraper) {
    logger.warn(`Could not find scraper for ${baseRelease.url}`);
    return baseRelease;
  }

  if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie && !layoutScraper.scrapeMovie)) {
    logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`);
    return baseRelease;
  }

  try {
    logger.verbose(`Fetching ${type} ${baseRelease.url}`);

    const options = {
      ...include,
      beforeFetchScenes: entity.preData,
      parameters: getRecursiveParameters(entity),
    };

    logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

    const rawScrapedRelease = type === 'scene'
      ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options)
      : await fetchMovie(layoutScraper, baseRelease.url, entity, baseRelease, options);

    const pathname = baseRelease.path || (baseRelease.url && new URL(baseRelease.url).pathname.replace(/\//g, '_'));

    if (rawScrapedRelease && typeof rawScrapedRelease === 'object') {
      // some scrapers pass the qu-wrapped window instance to parent scrapers, filling up memory
      delete rawScrapedRelease.query;
    }

    if (windows.has(pathname)) {
      logger.debug(`Closing window for ${pathname}`);

      windows.get(pathname).close();
      windows.delete(pathname);
    }

    // the promisified setImmediate must be *called*; awaiting the bare function
    // resolves on the microtask queue and never yields to the event loop
    await waitImmediate();

    logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

    const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;

    if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
      // scraper is unable to fetch the releases and returned a HTTP code or null
      throw new Error(`Scraper returned '${scrapedRelease}' when fetching latest from '${entity.name}' (${entity.parent?.name})`);
    }

    // object-merge-advanced will use null as explicit false on hard merged keys, even when null as explicit false is disabled
    // filter out keys with null values to ensure original base value is used instead
    const curatedScrapedRelease = curateScrapedRelease(scrapedRelease);

    const mergedRelease = {
      ...merge(baseRelease, curatedScrapedRelease, {
        dedupeStringsInArrayValues: true,
        hardMergeKeys: ['actors', 'covers', 'poster', 'trailer', 'teaser'],
      }),
      photos: curatedScrapedRelease.photos?.length > 0
        ? curatedScrapedRelease.photos
        : baseRelease.photos,
      deep: true, // a valid scraped release is guaranteed past the check above
      entity,
    };

    if (!mergedRelease.entryId) {
      throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' });
    }

    if (baseRelease.tags) {
      // accumulate all available tags; fall back to an empty list so a scraper
      // without tags does not append a literal `undefined` entry
      mergedRelease.tags = baseRelease.tags.concat(curatedScrapedRelease.tags ?? []);
    }

    return mergedRelease;
  } catch (error) {
    logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);

    if (argv.debug) {
      console.error(error);
    }

    if (error.code === 'NO_ENTRY_ID') {
      // a release without entry ID cannot be stored, drop it altogether
      return null;
    }

    return baseRelease;
  }
}
|
|
|
|
|
2021-02-01 00:45:30 +00:00
|
|
|
/**
 * Deep-scrape a list of base releases, one at a time.
 *
 * Before scraping, every entity whose scraper defines `beforeFetchScenes` has
 * that hook called once with the entity and its recursive parameters; the
 * result is attached to a copy of the entity as `preData`. Entities without
 * the hook are passed through unchanged.
 *
 * @param {Object[]} baseReleases - Releases to deep-scrape.
 * @param {Object} entitiesBySlug - Entities keyed by site slug.
 * @param {string} type - Release type forwarded to `scrapeRelease`.
 * @returns {Promise<Array>} One `scrapeRelease` result per base release.
 */
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
  const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
    if (!entity.scraper?.beforeFetchScenes) {
      return [slug, entity];
    }

    const parameters = getRecursiveParameters(entity);
    const preData = await entity.scraper.beforeFetchScenes(entity, parameters);

    return [slug, { ...entity, preData }];
  }));

  const entitiesWithBeforeDataBySlug = Object.fromEntries(entitiesWithBeforeDataEntries);

  // concurrency is capped at 1 so only a single release is fetched at any time
  return Promise.map(
    baseReleases,
    (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
    { concurrency: 1 },
  );
}
|
|
|
|
|
2020-03-21 01:48:24 +00:00
|
|
|
/**
 * Resolve URLs and/or base releases into deep-scraped releases.
 *
 * The input is normalized with `toBaseReleases`, the matching entities are
 * fetched, every release is scraped, and falsy results are dropped.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base releases.
 * @param {string} [type='scene'] - Release type, 'scene' or 'movie'.
 * @returns {Promise<Object[]>} The scraped releases.
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
  const normalizedReleases = toBaseReleases(baseReleasesOrUrls);
  const entityLookup = await fetchReleaseEntities(normalizedReleases);
  const scrapedReleases = await scrapeReleases(normalizedReleases, entityLookup, type);

  // drop the falsy results scrapeReleases may produce
  return scrapedReleases.filter((release) => Boolean(release));
}
|
|
|
|
|
2020-03-21 01:48:24 +00:00
|
|
|
/**
 * Deep-scrape the given URLs and/or base releases as scenes.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base releases.
 * @returns {Promise<Object[]>} The scraped scenes.
 */
async function fetchScenes(baseReleasesOrUrls) {
  const releaseType = 'scene';

  return fetchReleases(baseReleasesOrUrls, releaseType);
}
|
|
|
|
|
|
|
|
/**
 * Deep-scrape the given URLs and/or base releases as movies.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base releases.
 * @returns {Promise<Object[]>} The scraped movies.
 */
async function fetchMovies(baseReleasesOrUrls) {
  return fetchReleases(baseReleasesOrUrls, 'movie');
}
|
|
|
|
|
|
|
|
// public API of the deep-scrape module
module.exports = { fetchReleases, fetchScenes, fetchMovies, toBaseReleases };
|