Passing recursive parameters to all scraper methods. Using throttle parameters in MindGeek scraper, fixed missing slug breaking scene and actor URLs.
This commit is contained in:
parent
62ad786318
commit
7ff222ce25
|
@ -17,6 +17,10 @@ const grandParentNetworks = [
|
||||||
name: 'Mind Geek',
|
name: 'Mind Geek',
|
||||||
url: 'https://www.mindgeek.com',
|
url: 'https://www.mindgeek.com',
|
||||||
description: '',
|
description: '',
|
||||||
|
parameters: {
|
||||||
|
interval: 1000,
|
||||||
|
concurrency: 1,
|
||||||
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
slug: 'whalemember',
|
slug: 'whalemember',
|
||||||
|
|
|
@ -30,6 +30,8 @@ const { deleteScenes } = require('./releases');
|
||||||
const slugify = require('./utils/slugify');
|
const slugify = require('./utils/slugify');
|
||||||
const capitalize = require('./utils/capitalize');
|
const capitalize = require('./utils/capitalize');
|
||||||
const resolvePlace = require('./utils/resolve-place');
|
const resolvePlace = require('./utils/resolve-place');
|
||||||
|
const { resolveLayoutScraper } = require('./scrapers/resolve');
|
||||||
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||||
|
|
||||||
const hairColors = {
|
const hairColors = {
|
||||||
'jet-black': 'black',
|
'jet-black': 'black',
|
||||||
|
@ -637,10 +639,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
|
||||||
const entity = entitiesBySlug[scraperSlug] || null;
|
const entity = entitiesBySlug[scraperSlug] || null;
|
||||||
|
|
||||||
const scraper = scrapers[scraperSlug];
|
const scraper = scrapers[scraperSlug];
|
||||||
const layoutScraper = scraper?.[entity.parameters?.layout]
|
const layoutScraper = resolveLayoutScraper(entity, scraper);
|
||||||
|| scraper?.[entity.parent?.parameters?.layout]
|
|
||||||
|| scraper?.[entity.parent?.parent?.parameters?.layout]
|
|
||||||
|| scraper;
|
|
||||||
|
|
||||||
const context = {
|
const context = {
|
||||||
...entity,
|
...entity,
|
||||||
|
@ -649,11 +648,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
|
||||||
network: entity?.parent,
|
network: entity?.parent,
|
||||||
entity,
|
entity,
|
||||||
scraper: scraperSlug,
|
scraper: scraperSlug,
|
||||||
parameters: {
|
parameters: getRecursiveParameters(entity),
|
||||||
...entity?.parent?.parent?.parameters,
|
|
||||||
...entity?.parent?.parameters,
|
|
||||||
...entity?.parameters,
|
|
||||||
},
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const label = context.entity?.name;
|
const label = context.entity?.name;
|
||||||
|
|
10
src/deep.js
10
src/deep.js
|
@ -9,6 +9,7 @@ const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
|
||||||
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
||||||
const logger = require('./logger')(__filename);
|
const logger = require('./logger')(__filename);
|
||||||
const qu = require('./utils/qu');
|
const qu = require('./utils/qu');
|
||||||
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||||
|
|
||||||
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
||||||
if (!baseReleasesOrUrls) {
|
if (!baseReleasesOrUrls) {
|
||||||
|
@ -106,9 +107,14 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||||
try {
|
try {
|
||||||
logger.verbose(`Fetching ${type} ${baseRelease.url}`);
|
logger.verbose(`Fetching ${type} ${baseRelease.url}`);
|
||||||
|
|
||||||
|
const options = {
|
||||||
|
...include,
|
||||||
|
parameters: getRecursiveParameters(entity),
|
||||||
|
};
|
||||||
|
|
||||||
const scrapedRelease = type === 'scene'
|
const scrapedRelease = type === 'scene'
|
||||||
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, include, null)
|
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
|
||||||
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, include, null);
|
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
|
||||||
|
|
||||||
if (typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
|
if (typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
|
||||||
// scraper is unable to fetch the releases and returned a HTTP code or null
|
// scraper is unable to fetch the releases and returned a HTTP code or null
|
||||||
|
|
|
@ -168,8 +168,6 @@ function sortBaseTrailersByQuality(sources, role) {
|
||||||
return 0;
|
return 0;
|
||||||
});
|
});
|
||||||
|
|
||||||
console.log(sortedSources);
|
|
||||||
|
|
||||||
return sortedSources;
|
return sortedSources;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -68,7 +68,7 @@ function scrapeLatestX(data, site, filterChannel) {
|
||||||
|| (site.parameters?.native && `${site.url}/scene`)
|
|| (site.parameters?.native && `${site.url}/scene`)
|
||||||
|| `${site.parent.url}/scene`;
|
|| `${site.parent.url}/scene`;
|
||||||
|
|
||||||
release.url = `${basepath}/${release.entryId}/`;
|
release.url = `${basepath}/${release.entryId}/${slugify(release.title)}`;
|
||||||
release.date = new Date(data.dateReleased);
|
release.date = new Date(data.dateReleased);
|
||||||
release.actors = data.actors.map(actor => ({ name: actor.name, gender: actor.gender }));
|
release.actors = data.actors.map(actor => ({ name: actor.name, gender: actor.gender }));
|
||||||
|
|
||||||
|
@ -143,7 +143,7 @@ function getUrl(site) {
|
||||||
throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`);
|
throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function getSession(site) {
|
async function getSession(site, parameters) {
|
||||||
const cookieJar = new CookieJar();
|
const cookieJar = new CookieJar();
|
||||||
const session = http.session({ cookieJar });
|
const session = http.session({ cookieJar });
|
||||||
|
|
||||||
|
@ -152,7 +152,11 @@ async function getSession(site) {
|
||||||
? site.parent.url
|
? site.parent.url
|
||||||
: site.url;
|
: site.url;
|
||||||
|
|
||||||
const res = await http.get(sessionUrl, { session });
|
const res = await http.get(sessionUrl, {
|
||||||
|
session,
|
||||||
|
interval: parameters?.interval,
|
||||||
|
concurrency: parameters?.concurrency,
|
||||||
|
});
|
||||||
|
|
||||||
if (res.statusCode === 200) {
|
if (res.statusCode === 200) {
|
||||||
const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
|
const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
|
||||||
|
@ -212,12 +216,12 @@ function scrapeProfile(data, html, releases = [], networkName) {
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchLatest(site, page = 1) {
|
async function fetchLatest(site, page = 1, options) {
|
||||||
const url = getUrl(site);
|
const url = getUrl(site);
|
||||||
const { searchParams } = new URL(url);
|
const { searchParams } = new URL(url);
|
||||||
const siteId = searchParams.get('site');
|
const siteId = searchParams.get('site');
|
||||||
|
|
||||||
const { session, instanceToken } = await getSession(site);
|
const { session, instanceToken } = await getSession(site, options.parameters);
|
||||||
|
|
||||||
const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');
|
const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');
|
||||||
const limit = 10;
|
const limit = 10;
|
||||||
|
@ -227,6 +231,8 @@ async function fetchLatest(site, page = 1) {
|
||||||
|
|
||||||
const res = await http.get(apiUrl, {
|
const res = await http.get(apiUrl, {
|
||||||
session,
|
session,
|
||||||
|
interval: options.parameters.interval,
|
||||||
|
concurrency: options.parameters.concurrency,
|
||||||
headers: {
|
headers: {
|
||||||
Instance: instanceToken,
|
Instance: instanceToken,
|
||||||
Origin: site.url,
|
Origin: site.url,
|
||||||
|
@ -241,14 +247,16 @@ async function fetchLatest(site, page = 1) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchUpcoming(site) {
|
async function fetchUpcoming(site, page, options) {
|
||||||
const url = getUrl(site);
|
const url = getUrl(site);
|
||||||
const { session, instanceToken } = await getSession(site);
|
const { session, instanceToken } = await getSession(site, options.parameters);
|
||||||
|
|
||||||
const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases';
|
const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases';
|
||||||
|
|
||||||
const res = await http.get(apiUrl, {
|
const res = await http.get(apiUrl, {
|
||||||
session,
|
session,
|
||||||
|
interval: options.parameters.interval,
|
||||||
|
concurrency: options.parameters.concurrency,
|
||||||
headers: {
|
headers: {
|
||||||
Instance: instanceToken,
|
Instance: instanceToken,
|
||||||
Origin: site.url,
|
Origin: site.url,
|
||||||
|
@ -263,17 +271,19 @@ async function fetchUpcoming(site) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchScene(url, site, baseScene) {
|
async function fetchScene(url, site, baseScene, options) {
|
||||||
if (baseScene?.entryId) {
|
if (baseScene?.entryId) {
|
||||||
// overview and deep data is the same, don't hit server unnecessarily
|
// overview and deep data is the same, don't hit server unnecessarily
|
||||||
return baseScene;
|
return baseScene;
|
||||||
}
|
}
|
||||||
|
|
||||||
const entryId = url.match(/\d+/)[0];
|
const entryId = url.match(/\d+/)[0];
|
||||||
const { session, instanceToken } = await getSession(site);
|
const { session, instanceToken } = await getSession(site, options.parameters);
|
||||||
|
|
||||||
const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
|
const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
|
||||||
session,
|
session,
|
||||||
|
interval: options.parameters.interval,
|
||||||
|
concurrency: options.parameters.concurrency,
|
||||||
headers: {
|
headers: {
|
||||||
Instance: instanceToken,
|
Instance: instanceToken,
|
||||||
},
|
},
|
||||||
|
@ -286,12 +296,14 @@ async function fetchScene(url, site, baseScene) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile({ name: actorName }, { entity }) {
|
async function fetchProfile({ name: actorName, slug: actorSlug }, { entity, parameters }) {
|
||||||
// const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`;
|
// const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`;
|
||||||
const { session, instanceToken } = await getSession(entity);
|
const { session, instanceToken } = await getSession(entity, parameters);
|
||||||
|
|
||||||
const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
|
const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
|
||||||
session,
|
session,
|
||||||
|
interval: parameters.interval,
|
||||||
|
concurrency: parameters.concurrency,
|
||||||
headers: {
|
headers: {
|
||||||
Instance: instanceToken,
|
Instance: instanceToken,
|
||||||
},
|
},
|
||||||
|
@ -301,13 +313,18 @@ async function fetchProfile({ name: actorName }, { entity }) {
|
||||||
const actorData = res.body.result.find(actor => actor.name.toLowerCase() === actorName.toLowerCase());
|
const actorData = res.body.result.find(actor => actor.name.toLowerCase() === actorName.toLowerCase());
|
||||||
|
|
||||||
if (actorData) {
|
if (actorData) {
|
||||||
const actorUrl = `https://www.${entity.slug}.com/${entity.parameters?.actorPath || 'model'}/${actorData.id}/`;
|
const actorUrl = `https://www.${entity.slug}.com/${entity.parameters?.actorPath || 'model'}/${actorData.id}/${actorSlug}`;
|
||||||
const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;
|
const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;
|
||||||
|
|
||||||
const [actorRes, actorReleasesRes] = await Promise.all([
|
const [actorRes, actorReleasesRes] = await Promise.all([
|
||||||
http.get(actorUrl),
|
http.get(actorUrl, {
|
||||||
|
interval: parameters.interval,
|
||||||
|
concurrency: parameters.concurrency,
|
||||||
|
}),
|
||||||
http.get(actorReleasesUrl, {
|
http.get(actorReleasesUrl, {
|
||||||
session,
|
session,
|
||||||
|
interval: parameters.interval,
|
||||||
|
concurrency: parameters.concurrency,
|
||||||
headers: {
|
headers: {
|
||||||
Instance: instanceToken,
|
Instance: instanceToken,
|
||||||
},
|
},
|
||||||
|
|
|
@ -11,6 +11,7 @@ const { curateRelease } = require('./releases');
|
||||||
const include = require('./utils/argv-include')(argv);
|
const include = require('./utils/argv-include')(argv);
|
||||||
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
|
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
|
||||||
const { fetchIncludedEntities } = require('./entities');
|
const { fetchIncludedEntities } = require('./entities');
|
||||||
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||||
|
|
||||||
const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
|
const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
|
||||||
|
|
||||||
|
@ -97,6 +98,7 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
|
||||||
const options = {
|
const options = {
|
||||||
...config.options[scraper.slug],
|
...config.options[scraper.slug],
|
||||||
...include,
|
...include,
|
||||||
|
parameters: getRecursiveParameters(entity),
|
||||||
};
|
};
|
||||||
|
|
||||||
const pageReleases = isUpcoming
|
const pageReleases = isUpcoming
|
||||||
|
|
|
@ -0,0 +1,11 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
/**
 * Merge the `parameters` of an entity with those of all of its ancestors.
 *
 * Walks up the `parent` chain starting at `entity` and combines every
 * `parameters` object found along the way into one new object. The most
 * specific entity wins: a key set on the entity itself overrides the same key
 * on its parent, which in turn overrides the grandparent, and so on.
 *
 * @param {object|null|undefined} entity - Entity to start from; may carry
 *   `parameters` and a `parent` entity of the same shape.
 * @param {object} [parameters] - Parameters already collected from more
 *   specific entities (used by the recursion). These take precedence over
 *   anything found on `entity` or its ancestors.
 * @returns {object} A new plain object with the merged parameters; neither
 *   the entity chain nor `parameters` is mutated.
 */
function getRecursiveParameters(entity, parameters) {
	if (!entity) {
		// a caller may pass null when no entity was found; treat it as having no parameters
		return { ...parameters };
	}

	// what has been gathered so far comes from more specific entities, so it is
	// spread last and overrides this (less specific) entity's values
	const merged = { ...entity.parameters, ...parameters };

	if (entity.parent) {
		return getRecursiveParameters(entity.parent, merged);
	}

	return merged;
}
|
||||||
|
|
||||||
|
module.exports = getRecursiveParameters;
|
Loading…
Reference in New Issue