From 100a35b4e8d7924fd1fabef27b0b987a45edba7f Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 26 Oct 2021 23:42:32 +0200 Subject: [PATCH] Added before scene fetch method to prevent e.g. unnecessary session requests, moved scraper assignment to entity lookup. Removed channel URL hostname matching.. --- src/deep.js | 25 +++++++++++++++++++------ src/entities.js | 5 ++++- src/scrapers/mindgeek.js | 7 +++++-- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/src/deep.js b/src/deep.js index d7d3d25d..e09bc5bd 100644 --- a/src/deep.js +++ b/src/deep.js @@ -5,7 +5,6 @@ const { mergeAdvanced: merge } = require('object-merge-advanced'); const argv = require('./argv'); const include = require('./utils/argv-include')(argv); -const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); const logger = require('./logger')(__filename); const qu = require('./utils/qu'); @@ -96,10 +95,9 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { }; } - const scraper = resolveScraper(entity); - const layoutScraper = resolveLayoutScraper(entity, scraper); + const layoutScraper = entity.scraper; - if (!layoutScraper) { + if (!entity.scraper) { logger.warn(`Could not find scraper for ${baseRelease.url}`); return baseRelease; } @@ -114,13 +112,16 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { const options = { ...include, + beforeFetchScene: entity.preData, parameters: getRecursiveParameters(entity), }; - const scrapedRelease = type === 'scene' + const rawScrapedRelease = type === 'scene' ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); + const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease; + if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { // scraper is unable to fetch the releases and returned a HTTP code or null throw new Error(`Scraper returned '${scrapedRelease}' when fetching latest from '${entity.name}' (${entity.parent?.name})`); @@ -170,9 +171,21 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { } async function scrapeReleases(baseReleases, entitiesBySlug, type) { + const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => { + if (entity.scraper?.beforeFetchScene) { + const preData = await entity.scraper.beforeFetchScene(entity); + + return [slug, { ...entity, preData }]; + } + + return null; + })); + + const entitiesWithBeforeDataBySlug = Object.fromEntries(entitiesWithBeforeDataEntries.filter(Boolean)); + return Promise.map( baseReleases, - async baseRelease => scrapeRelease(baseRelease, entitiesBySlug, type), + async baseRelease => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type), { concurrency: 10 }, ); } diff --git a/src/entities.js b/src/entities.js index 787eb8db..f94f63f0 100644 --- a/src/entities.js +++ b/src/entities.js @@ -8,6 +8,7 @@ const argv = require('./argv'); const knex = require('./knex'); const { deleteScenes, deleteMovies } = require('./releases'); const { flushOrphanedMedia } = require('./media'); +const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); function getRecursiveParent(entity) { if (!entity) { @@ -71,6 +72,9 @@ function curateEntity(entity, includeParameters = false) { }, includeParameters)); } + const scraper = resolveScraper(curatedEntity); + curatedEntity.scraper = resolveLayoutScraper(entity, scraper); + return curatedEntity; } @@ -187,7 +191,6 @@ async function fetchEntitiesBySlug(entitySlugs, sort = 'asc') { array['parent'] as parent_path FROM entities WHERE slug = ANY(:entitySlugs) - OR substring(url from 'https%://%#"[a-z0-9-]+#".(com|net)%' for '#') = ANY(:entitySlugs) UNION ALL diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js index 6f7dbb92..36198d45 100644 --- a/src/scrapers/mindgeek.js +++ b/src/scrapers/mindgeek.js @@ -294,7 +294,7 @@ async function fetchScene(url, site, baseScene, options) { } const entryId = new URL(url).pathname.match(/\/(\d+)/)?.[1]; - const { session, instanceToken } = await getSession(site, options.parameters); + const { session, instanceToken } = options.beforeFetchScene || await getSession(site, options.parameters); const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, { session, @@ -306,7 +306,9 @@ async function fetchScene(url, site, baseScene, options) { }); if (res.status === 200 && res.body.result) { - return scrapeScene(res.body.result, url, site); + return { + scene: scrapeScene(res.body.result, url, site), + }; } return null; @@ -361,6 +363,7 @@ async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, para } module.exports = { + beforeFetchScene: getSession, scrapeLatestX, fetchLatest, fetchUpcoming,