From 7ff222ce259d146d31aa75738727d725b6f119c8 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 10 Feb 2021 03:23:48 +0100 Subject: [PATCH] Passing recursive parameters to all scraper methods. Using throttle parameters in MindGeek scraper, fixed missing slug breaking scene and actor URLs. --- seeds/01_networks.js | 4 +++ src/actors.js | 13 +++----- src/deep.js | 10 +++++-- src/media.js | 2 -- src/scrapers/mindgeek.js | 43 +++++++++++++++++++-------- src/updates.js | 2 ++ src/utils/get-recursive-parameters.js | 11 +++++++ 7 files changed, 59 insertions(+), 26 deletions(-) create mode 100644 src/utils/get-recursive-parameters.js diff --git a/seeds/01_networks.js b/seeds/01_networks.js index 07ae1015..79ed17ab 100644 --- a/seeds/01_networks.js +++ b/seeds/01_networks.js @@ -17,6 +17,10 @@ const grandParentNetworks = [ name: 'Mind Geek', url: 'https://www.mindgeek.com', description: '', + parameters: { + interval: 1000, + concurrency: 1, + }, }, { slug: 'whalemember', diff --git a/src/actors.js b/src/actors.js index 16c3263f..bb8c71c8 100644 --- a/src/actors.js +++ b/src/actors.js @@ -30,6 +30,8 @@ const { deleteScenes } = require('./releases'); const slugify = require('./utils/slugify'); const capitalize = require('./utils/capitalize'); const resolvePlace = require('./utils/resolve-place'); +const { resolveLayoutScraper } = require('./scrapers/resolve'); +const getRecursiveParameters = require('./utils/get-recursive-parameters'); const hairColors = { 'jet-black': 'black', @@ -637,10 +639,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy const entity = entitiesBySlug[scraperSlug] || null; const scraper = scrapers[scraperSlug]; - const layoutScraper = scraper?.[entity.parameters?.layout] - || scraper?.[entity.parent?.parameters?.layout] - || scraper?.[entity.parent?.parent?.parameters?.layout] - || scraper; + const layoutScraper = resolveLayoutScraper(entity, scraper); const context = { ...entity, @@ -649,11 +648,7 @@ 
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy network: entity?.parent, entity, scraper: scraperSlug, - parameters: { - ...entity?.parent?.parent?.parameters, - ...entity?.parent?.parameters, - ...entity?.parameters, - }, + parameters: getRecursiveParameters(entity), }; const label = context.entity?.name; diff --git a/src/deep.js b/src/deep.js index 501b2f20..56104297 100644 --- a/src/deep.js +++ b/src/deep.js @@ -9,6 +9,7 @@ const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); const logger = require('./logger')(__filename); const qu = require('./utils/qu'); +const getRecursiveParameters = require('./utils/get-recursive-parameters'); function toBaseReleases(baseReleasesOrUrls, entity = null) { if (!baseReleasesOrUrls) { @@ -106,9 +107,14 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { try { logger.verbose(`Fetching ${type} ${baseRelease.url}`); + const options = { + ...include, + parameters: getRecursiveParameters(entity), + }; + const scrapedRelease = type === 'scene' - ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, include, null) - : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, include, null); + ? 
await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) + : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); if (typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { // scraper is unable to fetch the releases and returned a HTTP code or null diff --git a/src/media.js b/src/media.js index 06105463..e7ccb23c 100644 --- a/src/media.js +++ b/src/media.js @@ -168,8 +168,6 @@ function sortBaseTrailersByQuality(sources, role) { return 0; }); - console.log(sortedSources); - return sortedSources; } diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js index 491efd43..f92faa5e 100644 --- a/src/scrapers/mindgeek.js +++ b/src/scrapers/mindgeek.js @@ -68,7 +68,7 @@ function scrapeLatestX(data, site, filterChannel) { || (site.parameters?.native && `${site.url}/scene`) || `${site.parent.url}/scene`; - release.url = `${basepath}/${release.entryId}/`; + release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`; release.date = new Date(data.dateReleased); release.actors = data.actors.map(actor => ({ name: actor.name, gender: actor.gender })); @@ -143,7 +143,7 @@ function getUrl(site) { throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`); } -async function getSession(site) { +async function getSession(site, parameters) { const cookieJar = new CookieJar(); const session = http.session({ cookieJar }); @@ -152,7 +152,11 @@ async function getSession(site) { ? 
site.parent.url : site.url; - const res = await http.get(sessionUrl, { session }); + const res = await http.get(sessionUrl, { + session, + interval: parameters?.interval, + concurrency: parameters?.concurrency, + }); if (res.statusCode === 200) { const cookieString = await cookieJar.getCookieStringAsync(sessionUrl); @@ -212,12 +216,12 @@ function scrapeProfile(data, html, releases = [], networkName) { return profile; } -async function fetchLatest(site, page = 1) { +async function fetchLatest(site, page = 1, options) { const url = getUrl(site); const { searchParams } = new URL(url); const siteId = searchParams.get('site'); - const { session, instanceToken } = await getSession(site); + const { session, instanceToken } = await getSession(site, options.parameters); const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD'); const limit = 10; @@ -227,6 +231,8 @@ async function fetchLatest(site, page = 1) { const res = await http.get(apiUrl, { session, + interval: options.parameters.interval, + concurrency: options.parameters.concurrency, headers: { Instance: instanceToken, Origin: site.url, @@ -241,14 +247,16 @@ async function fetchLatest(site, page = 1) { return null; } -async function fetchUpcoming(site) { +async function fetchUpcoming(site, page, options) { const url = getUrl(site); - const { session, instanceToken } = await getSession(site); + const { session, instanceToken } = await getSession(site, options.parameters); const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases'; const res = await http.get(apiUrl, { session, + interval: options.parameters.interval, + concurrency: options.parameters.concurrency, headers: { Instance: instanceToken, Origin: site.url, @@ -263,17 +271,19 @@ async function fetchUpcoming(site) { return null; } -async function fetchScene(url, site, baseScene) { +async function fetchScene(url, site, baseScene, options) { if (baseScene?.entryId) { // overview and deep data is the same, don't hit server unnecessarily 
return baseScene; } const entryId = url.match(/\d+/)[0]; - const { session, instanceToken } = await getSession(site); + const { session, instanceToken } = await getSession(site, options.parameters); const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, { session, + interval: options.parameters.interval, + concurrency: options.parameters.concurrency, headers: { Instance: instanceToken, }, @@ -286,12 +296,14 @@ async function fetchScene(url, site, baseScene) { return null; } -async function fetchProfile({ name: actorName }, { entity }) { +async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, parameters }) { // const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`; - const { session, instanceToken } = await getSession(entity); + const { session, instanceToken } = await getSession(entity, parameters); const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, { session, + interval: parameters.interval, + concurrency: parameters.concurrency, headers: { Instance: instanceToken, }, @@ -301,13 +313,18 @@ async function fetchProfile({ name: actorName }, { entity }) { const actorData = res.body.result.find(actor => actor.name.toLowerCase() === actorName.toLowerCase()); if (actorData) { - const actorUrl = `https://www.${entity.slug}.com/${entity.parameters?.actorPath || 'model'}/${actorData.id}/`; + const actorUrl = `https://www.${entity.slug}.com/${entity.parameters?.actorPath || 'model'}/${actorData.id}/${actorSlug}`; const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`; const [actorRes, actorReleasesRes] = await Promise.all([ - http.get(actorUrl), + http.get(actorUrl, { + interval: parameters.interval, + concurrency: parameters.concurrency, + }), http.get(actorReleasesUrl, { session, + interval: parameters.interval, + concurrency: 
parameters.concurrency, headers: { Instance: instanceToken, }, diff --git a/src/updates.js b/src/updates.js index 3c0002f8..cfab379c 100644 --- a/src/updates.js +++ b/src/updates.js @@ -11,6 +11,7 @@ const { curateRelease } = require('./releases'); const include = require('./utils/argv-include')(argv); const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); const { fetchIncludedEntities } = require('./entities'); +const getRecursiveParameters = require('./utils/get-recursive-parameters'); const emptyReleases = { uniqueReleases: [], duplicateReleases: [] }; @@ -97,6 +98,7 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) { const options = { ...config.options[scraper.slug], ...include, + parameters: getRecursiveParameters(entity), }; const pageReleases = isUpcoming diff --git a/src/utils/get-recursive-parameters.js b/src/utils/get-recursive-parameters.js new file mode 100644 index 00000000..7aed4655 --- /dev/null +++ b/src/utils/get-recursive-parameters.js @@ -0,0 +1,11 @@ +'use strict'; + +function getRecursiveParameters(entity, parameters) { + if (entity.parent) { + return getRecursiveParameters(entity.parent, { ...entity.parameters, ...parameters }); + } + + return { ...entity.parameters, ...parameters }; +} + +module.exports = getRecursiveParameters;