Added optional sequential scraping and acc release injection. Added Hush Pass and Interracial Pass logos.

This commit is contained in:
ThePendulum 2020-03-05 23:01:03 +01:00
parent 6719d805d3
commit 3889faee26
4 changed files with 88 additions and 60 deletions
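
In short: networks whose parameters include sequential: true now have their sites scraped one after another instead of concurrently, and the releases accumulated so far (accSiteReleases) are handed to each subsequent scraper, so a site flagged accFilter: true can skip scenes already picked up from its categorized sibling channels. A minimal sketch of that filtering idea, with hypothetical entry IDs rather than anything from the real scrapers:

// Sketch only: release shapes and entry IDs here are hypothetical.
const accSiteReleases = [
  { entryId: '2020-03-05-scene-a' }, // scraped earlier from a channel site
];

const latestReleases = [
  { entryId: '2020-03-05-scene-a' }, // duplicate of the channel release
  { entryId: '2020-03-05-scene-b' },
];

const seen = new Set(accSiteReleases.map(release => release.entryId));
const uniqueReleases = latestReleases.filter(release => !seen.has(release.entryId));
// uniqueReleases now only contains '2020-03-05-scene-b'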

View File

@@ -181,12 +181,18 @@ const networks = [
     name: 'Hush Pass',
     url: 'http://www.hushpass.com',
     parent: 'hush',
+    parameters: {
+      sequential: true,
+    },
   },
   {
     slug: 'interracialpass',
     name: 'Interracial Pass',
     url: 'http://www.interracialpass.com',
     parent: 'hush',
+    parameters: {
+      sequential: true,
+    },
   },
   {
     slug: 'insex',
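
The scraper runner (third file in this commit) reads this sequential flag per network; non-sequential networks instead honour an optional concurrency parameter, defaulting to two sites at a time. A hypothetical seed entry showing both options (the slug, name, and URL are illustrative, not part of this commit; only the parameter names are real):

// Hypothetical network entry for illustration.
const exampleNetwork = {
  slug: 'examplenetwork',
  name: 'Example Network',
  url: 'http://www.example.com',
  parent: 'example',
  parameters: {
    sequential: true, // scrape this network's sites one at a time, in seed order
    // concurrency: 4, // read only when sequential is not set
  },
};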

View File

@@ -2101,15 +2101,6 @@ const sites = [
     },
   },
   // HUSH PASS
-  {
-    slug: 'hushpass',
-    name: 'Hush Pass',
-    url: 'https://hushpass.com',
-    network: 'hushpass',
-    parameters: {
-      t1: true,
-    },
-  },
   {
     slug: 'shotherfirst',
     name: 'Shot Her First',
@@ -2264,17 +2255,17 @@ const sites = [
       t1: true,
     },
   },
-  // INTERRACIAL PASS
   {
-    slug: 'interracialpass',
-    name: 'Interracial Pass',
-    url: 'https://www.interracialpass.com',
-    tags: ['interracial'],
-    network: 'interracialpass',
+    slug: 'hushpass',
+    name: 'Hush Pass',
+    url: 'https://hushpass.com',
+    network: 'hushpass',
     parameters: {
       t1: true,
-      accFilter: true,
     },
   },
+  // INTERRACIAL PASS
   {
     slug: '2bigtobetrue',
     name: '2 Big To Be True',
@@ -2355,9 +2346,21 @@ const sites = [
     parameters: {
       latest: 'https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_%d_d.html',
       media: 'https://www.interracialpass.com',
+      match: 'My Wifes First Monster Cock',
       t1: true,
     },
   },
+  {
+    slug: 'interracialpass',
+    name: 'Interracial Pass',
+    url: 'https://www.interracialpass.com',
+    tags: ['interracial'],
+    network: 'interracialpass',
+    parameters: {
+      t1: true,
+      accFilter: true,
+    },
+  },
   // INSEX
   {
     slug: 'sexuallybroken',
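
Two seed parameters drive the new behaviour here: match overrides a channel's name when the scraper builds its channel regular expression (getChannelRegExp in the fourth file), and accFilter marks the umbrella site whose listing mixes in scenes from all channels, so duplicates can be dropped. Note that the umbrella entries were also moved below their channel entries, presumably because sequential scraping processes sites in seed order and the channels must be scraped first. A sketch of how the pattern is assembled; the channel rows below are illustrative, only the match fallback mirrors the commit:

// Illustrative channel rows.
const channels = [
  { name: 'Shot Her First', parameters: {} },
  { name: "My Wife's First Monster Cock", parameters: { match: 'My Wifes First Monster Cock' } },
];

const channelRegExp = new RegExp(
  channels.map(channel => channel.parameters?.match || channel.name).join('|'),
  'i',
);

channelRegExp.test('My Wifes First Monster Cock - Episode 12'); // true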

View File

@@ -37,12 +37,12 @@ async function findDuplicateReleaseIds(latestReleases, accReleases) {
     .concat(accReleases.map(release => String(release.entryId))));
 }
 
-async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate = getAfterDate(), accReleases = [], page = argv.page) {
+async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate = getAfterDate(), accReleases = [], page = argv.page) {
   if (!argv.latest || !scraper.fetchLatest) {
     return [];
   }
 
-  const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest);
+  const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases);
 
   if (!Array.isArray(latestReleases)) {
     logger.warn(`Scraper returned ${latestReleases || 'null'} when fetching latest from '${site.name}' on '${site.network.name}'`);
@@ -73,7 +73,7 @@ async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate
     || (argv.last && accReleases.length + uniqueReleases.length < argv.last))
   ) {
     // oldest release on page is newer than the specified date range, or the latest count has not yet been met; fetch the next page
-    return scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate, accReleases.concat(uniqueReleases), page + 1);
+    return scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate, accReleases.concat(uniqueReleases), page + 1);
   }
 
   if (argv.last && uniqueReleases.length >= argv.last) {
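
scrapeUniqueReleases pages recursively: it keeps fetching while the oldest release on a page is still inside the date range, or while the --last count has not been met, accumulating results as it goes. A self-contained sketch of that accumulate-and-recurse pattern, with a stubbed page fetcher (names and stopping rules simplified, not the real code):

// Sketch of recursive pagination with an accumulator.
async function fetchPage(page) {
  // pretend each page yields two releases, five pages total
  return page <= 5 ? [`release-${page}a`, `release-${page}b`] : [];
}

async function scrapeAll(wanted, acc = [], page = 1) {
  const releases = await fetchPage(page);

  if (releases.length === 0 || acc.length + releases.length >= wanted) {
    return acc.concat(releases).slice(0, wanted);
  }

  // accumulate this page and recurse into the next
  return scrapeAll(wanted, acc.concat(releases), page + 1);
}

scrapeAll(3).then(releases => console.log(releases));
// [ 'release-1a', 'release-1b', 'release-2a' ]
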
@@ -132,12 +132,12 @@ async function deepFetchReleases(baseReleases, beforeFetchLatest) {
   });
 }
 
-async function scrapeSiteReleases(scraper, site) {
-  const beforeFetchLatest = await scraper.beforeFetchLatest?.(site);
+async function scrapeSiteReleases(scraper, site, accSiteReleases) {
+  const beforeFetchLatest = await scraper.beforeFetchLatest?.(site, accSiteReleases);
 
   const [newReleases, upcomingReleases] = await Promise.all([
-    scrapeUniqueReleases(scraper, site, beforeFetchLatest), // fetch basic release info from scene overview
-    scrapeUpcomingReleases(scraper, site, beforeFetchLatest), // fetch basic release info from upcoming overview
+    scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from scene overview
+    scrapeUpcomingReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from upcoming overview
   ]);
 
   if (argv.upcoming) {
@@ -154,10 +154,7 @@ async function scrapeSiteReleases(scraper, site) {
   return baseReleases;
 }
 
-async function scrapeSites() {
-  const networks = await fetchIncludedSites();
-
-  const scrapedNetworks = await Promise.map(networks, async network => Promise.map(network.sites, async (site) => {
+async function scrapeSite(site, network, accSiteReleases = []) {
   if (site.parameters?.ignore) {
     logger.warn(`Ignoring ${network.name}: ${site.name}`);
     return [];
@@ -171,7 +168,7 @@ async function scrapeSites() {
   }
 
   try {
-    const siteReleases = await scrapeSiteReleases(scraper, site);
+    const siteReleases = await scrapeSiteReleases(scraper, site, accSiteReleases);
 
     return siteReleases.map(release => ({ ...release, site }));
   } catch (error) {
@@ -179,10 +176,27 @@ async function scrapeSites() {
     return [];
   }
-  }, {
-    // 2 network sites at a time
-    concurrency: 2,
-  }),
+}
+
+async function scrapeSites() {
+  const networks = await fetchIncludedSites();
+
+  const scrapedNetworks = await Promise.map(networks, async (network) => {
+    if (network.parameters?.sequential) {
+      logger.info(`Scraping '${network.name}' sequentially`);
+
+      return Promise.reduce(network.sites, async (acc, site) => {
+        const accSiteReleases = await acc;
+        const siteReleases = await scrapeSite(site, network, accSiteReleases);
+
+        return accSiteReleases.concat(siteReleases);
+      }, Promise.resolve([]));
+    }
+
+    return Promise.map(network.sites, async site => scrapeSite(site, network), {
+      concurrency: network.parameters?.concurrency || 2,
+    });
+  },
   {
     // 5 networks at a time
     concurrency: 5,
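
Promise.map and Promise.reduce here appear to be Bluebird's promise utilities; the reduce awaits the accumulator before scraping the next site, which is exactly what makes accSiteReleases visible to later sites in the chain. The same flow written with a plain loop, for clarity (scrapeSite stands in for the real function above):

// Plain async/await equivalent of the Promise.reduce chain.
async function scrapeNetworkSequentially(network, scrapeSite) {
  let accSiteReleases = [];

  for (const site of network.sites) {
    // each site sees every release scraped from the sites before it
    const siteReleases = await scrapeSite(site, network, accSiteReleases);
    accSiteReleases = accSiteReleases.concat(siteReleases);
  }

  return accSiteReleases;
}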

View File

@@ -6,6 +6,14 @@ const knex = require('../knex');
 const { get, geta, fd } = require('../utils/q');
 const slugify = require('../utils/slugify');
 
+async function getChannelRegExp(site) {
+  if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
+
+  const sites = await knex('sites').where('network_id', site.network.id);
+
+  return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
+}
+
 function deriveEntryId(release) {
   return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
 }
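
One caveat worth noting: the channel names and match values are interpolated into the pattern unescaped, which is fine for the names in these seeds but would misbehave for a name containing RegExp metacharacters. If that ever matters, escaping first keeps the alternation literal; the helper below is a common idiom, not part of this commit, and the channel name is hypothetical:

// Common escape idiom (assumption, not in the commit).
function escapeRegExp(string) {
  return string.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

// Hypothetical name whose parentheses would otherwise break the pattern.
new RegExp(['Example (Channel)'].map(escapeRegExp).join('|'), 'i');
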
@@ -54,7 +62,7 @@ function scrapeLatest(scenes, site) {
   });
 }
 
-function scrapeLatestT1(scenes, site) {
+function scrapeLatestT1(scenes, site, accSiteReleases) {
   return scenes.map(({ q, qi, qd, ql, qu }) => {
     const release = {};
@@ -79,8 +87,13 @@ function scrapeLatestT1(scenes, site) {
     // release.entryId = q('.img-div img', 'id')?.match(/set-target-(\d+)/)[1];
     release.entryId = deriveEntryId(release);
 
+    if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
+      // filter out releases that were already scraped from a categorized site
+      return null;
+    }
+
     return release;
-  });
+  }).filter(Boolean);
 }
 
 function scrapeLatestTour(scenes) {
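
The accFilter check maps accSiteReleases once per scene, which is quadratic over a large accumulator; an optional refinement, not part of this commit, would precompute the known entry IDs as a Set once per call:

// Optional refinement sketch; entry IDs below are hypothetical.
function buildAccFilter(accSiteReleases = []) {
  const accEntryIds = new Set(accSiteReleases.map(release => release.entryId));
  return release => !accEntryIds.has(release.entryId);
}

const keep = buildAccFilter([{ entryId: '2020-03-05-scene-a' }]);
keep({ entryId: '2020-03-05-scene-a' }); // false, already scraped
keep({ entryId: '2020-03-05-scene-b' }); // true
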
@@ -199,15 +212,7 @@ function scrapeSceneTour({ html, q, qd, qa, qis }, site, url) {
   return release;
 }
 
-async function getChannelRegExp(site) {
-  if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
-
-  const sites = await knex('sites').where('network_id', site.network.id);
-
-  return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
-}
-
-async function fetchLatest(site, page = 1) {
+async function fetchLatest(site, page = 1, _beforeFetchLatest, accSiteReleases) {
   const url = (site.parameters?.latest && util.format(site.parameters.latest, page))
     || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
     || `${site.url}/categories/movies_${page}_d.html`;
@@ -215,10 +220,10 @@ async function fetchLatest(site, page = 1) {
   const qLatest = await geta(url, '.modelfeature, .item-video, .updateItem');
 
   if (!qLatest) return null;
 
-  if (site.parameters?.t1) return scrapeLatestT1(qLatest, site);
-  if (site.parameters?.tour) return scrapeLatestTour(qLatest, site);
+  if (site.parameters?.t1) return scrapeLatestT1(qLatest, site, accSiteReleases);
+  if (site.parameters?.tour) return scrapeLatestTour(qLatest, site, accSiteReleases);
 
-  return scrapeLatest(qLatest, site);
+  return scrapeLatest(qLatest, site, accSiteReleases);
 }
 
 async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
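
For reference, fetchLatest resolves its listing URL from three sources in priority order: an explicit latest template expanded with util.format (%d is the page number), the T1 layout path, or the default layout path. Under these assumptions about a site object (this one exercises the template branch, using the template from the seeds above):

const util = require('util');

// Hypothetical site object for illustration.
const site = {
  url: 'https://www.interracialpass.com',
  parameters: {
    latest: 'https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_%d_d.html',
  },
};
const page = 2;

const url = (site.parameters?.latest && util.format(site.parameters.latest, page))
  || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
  || `${site.url}/categories/movies_${page}_d.html`;
// => https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_2_d.html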