Added before-scene-fetch method to prevent e.g. unnecessary session requests; moved scraper assignment to entity lookup. Removed channel URL hostname matching.

This commit is contained in:
DebaucheryLibrarian 2021-10-26 23:42:32 +02:00
parent 6c5d4389fe
commit 100a35b4e8
3 changed files with 28 additions and 9 deletions

View File

@ -5,7 +5,6 @@ const { mergeAdvanced: merge } = require('object-merge-advanced');
const argv = require('./argv'); const argv = require('./argv');
const include = require('./utils/argv-include')(argv); const include = require('./utils/argv-include')(argv);
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename); const logger = require('./logger')(__filename);
const qu = require('./utils/qu'); const qu = require('./utils/qu');
@ -96,10 +95,9 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
}; };
} }
const scraper = resolveScraper(entity); const layoutScraper = entity.scraper;
const layoutScraper = resolveLayoutScraper(entity, scraper);
if (!layoutScraper) { if (!entity.scraper) {
logger.warn(`Could not find scraper for ${baseRelease.url}`); logger.warn(`Could not find scraper for ${baseRelease.url}`);
return baseRelease; return baseRelease;
} }
@ -114,13 +112,16 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
const options = { const options = {
...include, ...include,
beforeFetchScene: entity.preData,
parameters: getRecursiveParameters(entity), parameters: getRecursiveParameters(entity),
}; };
const scrapedRelease = type === 'scene' const rawScrapedRelease = type === 'scene'
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
// scraper is unable to fetch the releases and returned a HTTP code or null // scraper is unable to fetch the releases and returned a HTTP code or null
throw new Error(`Scraper returned '${scrapedRelease}' when fetching latest from '${entity.name}' (${entity.parent?.name})`); throw new Error(`Scraper returned '${scrapedRelease}' when fetching latest from '${entity.name}' (${entity.parent?.name})`);
@ -170,9 +171,21 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
} }
async function scrapeReleases(baseReleases, entitiesBySlug, type) { async function scrapeReleases(baseReleases, entitiesBySlug, type) {
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
if (entity.scraper?.beforeFetchScene) {
const preData = await entity.scraper.beforeFetchScene(entity);
return [slug, { ...entity, preData }];
}
return null;
}));
const entitiesWithBeforeDataBySlug = Object.fromEntries(entitiesWithBeforeDataEntries.filter(Boolean));
return Promise.map( return Promise.map(
baseReleases, baseReleases,
async baseRelease => scrapeRelease(baseRelease, entitiesBySlug, type), async baseRelease => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
{ concurrency: 10 }, { concurrency: 10 },
); );
} }

View File

@ -8,6 +8,7 @@ const argv = require('./argv');
const knex = require('./knex'); const knex = require('./knex');
const { deleteScenes, deleteMovies } = require('./releases'); const { deleteScenes, deleteMovies } = require('./releases');
const { flushOrphanedMedia } = require('./media'); const { flushOrphanedMedia } = require('./media');
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
function getRecursiveParent(entity) { function getRecursiveParent(entity) {
if (!entity) { if (!entity) {
@ -71,6 +72,9 @@ function curateEntity(entity, includeParameters = false) {
}, includeParameters)); }, includeParameters));
} }
const scraper = resolveScraper(curatedEntity);
curatedEntity.scraper = resolveLayoutScraper(entity, scraper);
return curatedEntity; return curatedEntity;
} }
@ -187,7 +191,6 @@ async function fetchEntitiesBySlug(entitySlugs, sort = 'asc') {
array['parent'] as parent_path array['parent'] as parent_path
FROM entities FROM entities
WHERE slug = ANY(:entitySlugs) WHERE slug = ANY(:entitySlugs)
OR substring(url from 'https%://%#"[a-z0-9-]+#".(com|net)%' for '#') = ANY(:entitySlugs)
UNION ALL UNION ALL

View File

@ -294,7 +294,7 @@ async function fetchScene(url, site, baseScene, options) {
} }
const entryId = new URL(url).pathname.match(/\/(\d+)/)?.[1]; const entryId = new URL(url).pathname.match(/\/(\d+)/)?.[1];
const { session, instanceToken } = await getSession(site, options.parameters); const { session, instanceToken } = options.beforeFetchScene || await getSession(site, options.parameters);
const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, { const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
session, session,
@ -306,7 +306,9 @@ async function fetchScene(url, site, baseScene, options) {
}); });
if (res.status === 200 && res.body.result) { if (res.status === 200 && res.body.result) {
return scrapeScene(res.body.result, url, site); return {
scene: scrapeScene(res.body.result, url, site),
};
} }
return null; return null;
@ -361,6 +363,7 @@ async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, para
} }
module.exports = { module.exports = {
beforeFetchScene: getSession,
scrapeLatestX, scrapeLatestX,
fetchLatest, fetchLatest,
fetchUpcoming, fetchUpcoming,