Passing recursively merged entity parameters to all scraper methods. Using throttle (interval/concurrency) parameters in the MindGeek scraper; fixed a missing slug that was breaking scene and actor URLs.

This commit is contained in:
DebaucheryLibrarian 2021-02-10 03:23:48 +01:00
parent 62ad786318
commit 7ff222ce25
7 changed files with 59 additions and 26 deletions

View File

@ -17,6 +17,10 @@ const grandParentNetworks = [
name: 'Mind Geek', name: 'Mind Geek',
url: 'https://www.mindgeek.com', url: 'https://www.mindgeek.com',
description: '', description: '',
parameters: {
interval: 1000,
concurrency: 1,
},
}, },
{ {
slug: 'whalemember', slug: 'whalemember',

View File

@ -30,6 +30,8 @@ const { deleteScenes } = require('./releases');
const slugify = require('./utils/slugify'); const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize'); const capitalize = require('./utils/capitalize');
const resolvePlace = require('./utils/resolve-place'); const resolvePlace = require('./utils/resolve-place');
const { resolveLayoutScraper } = require('./scrapers/resolve');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
const hairColors = { const hairColors = {
'jet-black': 'black', 'jet-black': 'black',
@ -637,10 +639,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
const entity = entitiesBySlug[scraperSlug] || null; const entity = entitiesBySlug[scraperSlug] || null;
const scraper = scrapers[scraperSlug]; const scraper = scrapers[scraperSlug];
const layoutScraper = scraper?.[entity.parameters?.layout] const layoutScraper = resolveLayoutScraper(entity, scraper);
|| scraper?.[entity.parent?.parameters?.layout]
|| scraper?.[entity.parent?.parent?.parameters?.layout]
|| scraper;
const context = { const context = {
...entity, ...entity,
@ -649,11 +648,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
network: entity?.parent, network: entity?.parent,
entity, entity,
scraper: scraperSlug, scraper: scraperSlug,
parameters: { parameters: getRecursiveParameters(entity),
...entity?.parent?.parent?.parameters,
...entity?.parent?.parameters,
...entity?.parameters,
},
}; };
const label = context.entity?.name; const label = context.entity?.name;

View File

@ -9,6 +9,7 @@ const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename); const logger = require('./logger')(__filename);
const qu = require('./utils/qu'); const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
function toBaseReleases(baseReleasesOrUrls, entity = null) { function toBaseReleases(baseReleasesOrUrls, entity = null) {
if (!baseReleasesOrUrls) { if (!baseReleasesOrUrls) {
@ -106,9 +107,14 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
try { try {
logger.verbose(`Fetching ${type} ${baseRelease.url}`); logger.verbose(`Fetching ${type} ${baseRelease.url}`);
const options = {
...include,
parameters: getRecursiveParameters(entity),
};
const scrapedRelease = type === 'scene' const scrapedRelease = type === 'scene'
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, include, null) ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, include, null); : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
if (typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { if (typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
// scraper is unable to fetch the releases and returned a HTTP code or null // scraper is unable to fetch the releases and returned a HTTP code or null

View File

@ -168,8 +168,6 @@ function sortBaseTrailersByQuality(sources, role) {
return 0; return 0;
}); });
console.log(sortedSources);
return sortedSources; return sortedSources;
} }

View File

@ -68,7 +68,7 @@ function scrapeLatestX(data, site, filterChannel) {
|| (site.parameters?.native && `${site.url}/scene`) || (site.parameters?.native && `${site.url}/scene`)
|| `${site.parent.url}/scene`; || `${site.parent.url}/scene`;
release.url = `${basepath}/${release.entryId}/`; release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`;
release.date = new Date(data.dateReleased); release.date = new Date(data.dateReleased);
release.actors = data.actors.map(actor => ({ name: actor.name, gender: actor.gender })); release.actors = data.actors.map(actor => ({ name: actor.name, gender: actor.gender }));
@ -143,7 +143,7 @@ function getUrl(site) {
throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`); throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`);
} }
async function getSession(site) { async function getSession(site, parameters) {
const cookieJar = new CookieJar(); const cookieJar = new CookieJar();
const session = http.session({ cookieJar }); const session = http.session({ cookieJar });
@ -152,7 +152,11 @@ async function getSession(site) {
? site.parent.url ? site.parent.url
: site.url; : site.url;
const res = await http.get(sessionUrl, { session }); const res = await http.get(sessionUrl, {
session,
interval: parameters?.interval,
concurrency: parameters?.concurrency,
});
if (res.statusCode === 200) { if (res.statusCode === 200) {
const cookieString = await cookieJar.getCookieStringAsync(sessionUrl); const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
@ -212,12 +216,12 @@ function scrapeProfile(data, html, releases = [], networkName) {
return profile; return profile;
} }
async function fetchLatest(site, page = 1) { async function fetchLatest(site, page = 1, options) {
const url = getUrl(site); const url = getUrl(site);
const { searchParams } = new URL(url); const { searchParams } = new URL(url);
const siteId = searchParams.get('site'); const siteId = searchParams.get('site');
const { session, instanceToken } = await getSession(site); const { session, instanceToken } = await getSession(site, options.parameters);
const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD'); const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');
const limit = 10; const limit = 10;
@ -227,6 +231,8 @@ async function fetchLatest(site, page = 1) {
const res = await http.get(apiUrl, { const res = await http.get(apiUrl, {
session, session,
interval: options.parameters.interval,
concurrency: options.parameters.concurrency,
headers: { headers: {
Instance: instanceToken, Instance: instanceToken,
Origin: site.url, Origin: site.url,
@ -241,14 +247,16 @@ async function fetchLatest(site, page = 1) {
return null; return null;
} }
async function fetchUpcoming(site) { async function fetchUpcoming(site, page, options) {
const url = getUrl(site); const url = getUrl(site);
const { session, instanceToken } = await getSession(site); const { session, instanceToken } = await getSession(site, options.parameters);
const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases'; const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases';
const res = await http.get(apiUrl, { const res = await http.get(apiUrl, {
session, session,
interval: options.parameters.interval,
concurrency: options.parameters.concurrency,
headers: { headers: {
Instance: instanceToken, Instance: instanceToken,
Origin: site.url, Origin: site.url,
@ -263,17 +271,19 @@ async function fetchUpcoming(site) {
return null; return null;
} }
async function fetchScene(url, site, baseScene) { async function fetchScene(url, site, baseScene, options) {
if (baseScene?.entryId) { if (baseScene?.entryId) {
// overview and deep data is the same, don't hit server unnecessarily // overview and deep data is the same, don't hit server unnecessarily
return baseScene; return baseScene;
} }
const entryId = url.match(/\d+/)[0]; const entryId = url.match(/\d+/)[0];
const { session, instanceToken } = await getSession(site); const { session, instanceToken } = await getSession(site, options.parameters);
const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, { const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
session, session,
interval: options.parameters.interval,
concurrency: options.parameters.concurrency,
headers: { headers: {
Instance: instanceToken, Instance: instanceToken,
}, },
@ -286,12 +296,14 @@ async function fetchScene(url, site, baseScene) {
return null; return null;
} }
async function fetchProfile({ name: actorName }, { entity }) { async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, parameters }) {
// const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`; // const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`;
const { session, instanceToken } = await getSession(entity); const { session, instanceToken } = await getSession(entity, parameters);
const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, { const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
session, session,
interval: parameters.interval,
concurrency: parameters.concurrency,
headers: { headers: {
Instance: instanceToken, Instance: instanceToken,
}, },
@ -301,13 +313,18 @@ async function fetchProfile({ name: actorName }, { entity }) {
const actorData = res.body.result.find(actor => actor.name.toLowerCase() === actorName.toLowerCase()); const actorData = res.body.result.find(actor => actor.name.toLowerCase() === actorName.toLowerCase());
if (actorData) { if (actorData) {
const actorUrl = `https://www.${entity.slug}.com/${entity.parameters?.actorPath || 'model'}/${actorData.id}/`; const actorUrl = `https://www.${entity.slug}.com/${entity.parameters?.actorPath || 'model'}/${actorData.id}/${actorSlug}`;
const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`; const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;
const [actorRes, actorReleasesRes] = await Promise.all([ const [actorRes, actorReleasesRes] = await Promise.all([
http.get(actorUrl), http.get(actorUrl, {
interval: parameters.interval,
concurrency: parameters.concurrency,
}),
http.get(actorReleasesUrl, { http.get(actorReleasesUrl, {
session, session,
interval: parameters.interval,
concurrency: parameters.concurrency,
headers: { headers: {
Instance: instanceToken, Instance: instanceToken,
}, },

View File

@ -11,6 +11,7 @@ const { curateRelease } = require('./releases');
const include = require('./utils/argv-include')(argv); const include = require('./utils/argv-include')(argv);
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
const { fetchIncludedEntities } = require('./entities'); const { fetchIncludedEntities } = require('./entities');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
const emptyReleases = { uniqueReleases: [], duplicateReleases: [] }; const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
@ -97,6 +98,7 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
const options = { const options = {
...config.options[scraper.slug], ...config.options[scraper.slug],
...include, ...include,
parameters: getRecursiveParameters(entity),
}; };
const pageReleases = isUpcoming const pageReleases = isUpcoming

View File

@ -0,0 +1,11 @@
'use strict';
/**
 * Collects scraper parameters up the entity hierarchy (entity -> parent -> grandparent -> ...),
 * merging them into a single flat object.
 *
 * Precedence: parameters defined closer to the entity itself override those of its
 * ancestors — matching the inline merge this helper replaces
 * ({ ...grandparent.parameters, ...parent.parameters, ...entity.parameters }).
 *
 * @param {Object} entity - Entity with optional `parameters` object and optional `parent` link.
 * @param {Object} [parameters] - Accumulator of parameters gathered from deeper levels;
 *   omitted on the initial call (spreading undefined is a harmless no-op).
 * @returns {Object} Merged parameters from the entity and all of its ancestors.
 */
function getRecursiveParameters(entity, parameters) {
	if (entity.parent) {
		// Spread this level's parameters FIRST so the already-accumulated
		// (deeper, more specific) parameters take precedence over the ancestor's.
		return getRecursiveParameters(entity.parent, { ...entity.parameters, ...parameters });
	}

	// Root of the hierarchy: root-level parameters are the lowest priority.
	return { ...entity.parameters, ...parameters };
}
module.exports = getRecursiveParameters;