Added optional sequential scraping and acc release injection. Added Hush Pass and Interracial Pass logos.
commit 3889faee26
parent 6719d805d3
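In short: a network can now set sequential: true in its parameters, which makes its sites scrape one after another instead of concurrently, with each site receiving the releases accumulated ("acc") from the sites scraped before it; a site can set accFilter: true to drop releases that were already collected by an earlier, categorized site on the same network. A minimal sketch of the two new flags, shown on abridged entries from this commit:

// Sketch only: abridged network and site entries illustrating the new parameters.
const networks = [
  {
    slug: 'hushpass',
    parent: 'hush',
    parameters: {
      sequential: true, // scrape this network's sites one after another, accumulating releases
    },
  },
];

const sites = [
  {
    slug: 'interracialpass',
    network: 'interracialpass',
    parameters: {
      t1: true,
      accFilter: true, // drop releases already collected by a previously scraped (categorized) site
    },
  },
];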
@@ -181,12 +181,18 @@ const networks = [
     name: 'Hush Pass',
     url: 'http://www.hushpass.com',
     parent: 'hush',
+    parameters: {
+      sequential: true,
+    },
   },
   {
     slug: 'interracialpass',
     name: 'Interracial Pass',
     url: 'http://www.interracialpass.com',
     parent: 'hush',
+    parameters: {
+      sequential: true,
+    },
   },
   {
     slug: 'insex',
@@ -2101,15 +2101,6 @@ const sites = [
     },
   },
   // HUSH PASS
-  {
-    slug: 'hushpass',
-    name: 'Hush Pass',
-    url: 'https://hushpass.com',
-    network: 'hushpass',
-    parameters: {
-      t1: true,
-    },
-  },
   {
     slug: 'shotherfirst',
     name: 'Shot Her First',
@@ -2264,17 +2255,17 @@ const sites = [
       t1: true,
     },
   },
-  // INTERRACIAL PASS
   {
-    slug: 'interracialpass',
-    name: 'Interracial Pass',
-    url: 'https://www.interracialpass.com',
-    tags: ['interracial'],
-    network: 'interracialpass',
+    slug: 'hushpass',
+    name: 'Hush Pass',
+    url: 'https://hushpass.com',
+    network: 'hushpass',
     parameters: {
       t1: true,
+      accFilter: true,
     },
   },
+  // INTERRACIAL PASS
   {
     slug: '2bigtobetrue',
     name: '2 Big To Be True',
@@ -2355,9 +2346,21 @@ const sites = [
     parameters: {
       latest: 'https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_%d_d.html',
       media: 'https://www.interracialpass.com',
+      match: 'My Wifes First Monster Cock',
       t1: true,
     },
   },
+  {
+    slug: 'interracialpass',
+    name: 'Interracial Pass',
+    url: 'https://www.interracialpass.com',
+    tags: ['interracial'],
+    network: 'interracialpass',
+    parameters: {
+      t1: true,
+      accFilter: true,
+    },
+  },
   // INSEX
   {
     slug: 'sexuallybroken',
@@ -37,12 +37,12 @@ async function findDuplicateReleaseIds(latestReleases, accReleases) {
     .concat(accReleases.map(release => String(release.entryId))));
 }
 
-async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate = getAfterDate(), accReleases = [], page = argv.page) {
+async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate = getAfterDate(), accReleases = [], page = argv.page) {
   if (!argv.latest || !scraper.fetchLatest) {
     return [];
   }
 
-  const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest);
+  const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases);
 
   if (!Array.isArray(latestReleases)) {
     logger.warn(`Scraper returned ${latestReleases || 'null'} when fetching latest from '${site.name}' on '${site.network.name}'`);
@@ -73,7 +73,7 @@ async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate
       || (argv.last && accReleases.length + uniqueReleases.length < argv.last))
   ) {
     // oldest release on page is newer that specified date range, or latest count has not yet been met, fetch next page
-    return scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate, accReleases.concat(uniqueReleases), page + 1);
+    return scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate, accReleases.concat(uniqueReleases), page + 1);
   }
 
   if (argv.last && uniqueReleases.length >= argv.last) {
@@ -132,12 +132,12 @@ async function deepFetchReleases(baseReleases, beforeFetchLatest) {
   });
 }
 
-async function scrapeSiteReleases(scraper, site) {
-  const beforeFetchLatest = await scraper.beforeFetchLatest?.(site);
+async function scrapeSiteReleases(scraper, site, accSiteReleases) {
+  const beforeFetchLatest = await scraper.beforeFetchLatest?.(site, accSiteReleases);
 
   const [newReleases, upcomingReleases] = await Promise.all([
-    scrapeUniqueReleases(scraper, site, beforeFetchLatest), // fetch basic release info from scene overview
-    scrapeUpcomingReleases(scraper, site, beforeFetchLatest), // fetch basic release info from upcoming overview
+    scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from scene overview
+    scrapeUpcomingReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from upcoming overview
   ]);
 
   if (argv.upcoming) {
@@ -154,35 +154,49 @@ async function scrapeSiteReleases(scraper, site) {
   return baseReleases;
 }
 
+async function scrapeSite(site, network, accSiteReleases = []) {
+  if (site.parameters?.ignore) {
+    logger.warn(`Ignoring ${network.name}: ${site.name}`);
+    return [];
+  }
+
+  const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
+
+  if (!scraper) {
+    logger.warn(`No scraper found for '${site.name}' (${site.slug})`);
+    return [];
+  }
+
+  try {
+    const siteReleases = await scrapeSiteReleases(scraper, site, accSiteReleases);
+
+    return siteReleases.map(release => ({ ...release, site }));
+  } catch (error) {
+    logger.error(`${site.name}: Failed to scrape releases: ${error.message}`);
+
+    return [];
+  }
+}
+
 async function scrapeSites() {
   const networks = await fetchIncludedSites();
 
-  const scrapedNetworks = await Promise.map(networks, async network => Promise.map(network.sites, async (site) => {
-    if (site.parameters?.ignore) {
-      logger.warn(`Ignoring ${network.name}: ${site.name}`);
-      return [];
-    }
+  const scrapedNetworks = await Promise.map(networks, async (network) => {
+    if (network.parameters?.sequential) {
+      logger.info(`Scraping '${network.name}' sequentially`);
+
+      return Promise.reduce(network.sites, async (acc, site) => {
+        const accSiteReleases = await acc;
+        const siteReleases = await scrapeSite(site, network, accSiteReleases);
+
+        return accSiteReleases.concat(siteReleases);
+      }, Promise.resolve([]));
+    }
 
-    const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug];
-
-    if (!scraper) {
-      logger.warn(`No scraper found for '${site.name}' (${site.slug})`);
-      return [];
-    }
-
-    try {
-      const siteReleases = await scrapeSiteReleases(scraper, site);
-
-      return siteReleases.map(release => ({ ...release, site }));
-    } catch (error) {
-      logger.error(`${site.name}: Failed to scrape releases: ${error.message}`);
-
-      return [];
-    }
-  }, {
-    // 2 network sites at a time
-    concurrency: 2,
-  }),
+    return Promise.map(network.sites, async site => scrapeSite(site, network), {
+      concurrency: network.parameters?.concurrency || 2,
+    });
+  },
   {
     // 5 networks at a time
     concurrency: 5,
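For readability, the bluebird Promise.reduce in scrapeSites above is equivalent to scraping a network's sites in a plain sequential loop, threading the accumulated releases into each call; a sketch, assuming scrapeSite as introduced in this commit:

// Sketch: the sequential accumulation performed by the Promise.reduce call above.
async function scrapeNetworkSequentially(network) {
  let accSiteReleases = [];

  for (const site of network.sites) {
    // each site scrape can see (and filter against) releases gathered from earlier sites
    const siteReleases = await scrapeSite(site, network, accSiteReleases);

    accSiteReleases = accSiteReleases.concat(siteReleases);
  }

  return accSiteReleases;
}

Networks without the sequential flag keep the concurrent behaviour, with the per-network site concurrency now configurable via parameters.concurrency (default 2).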
@@ -6,6 +6,14 @@ const knex = require('../knex');
 const { get, geta, fd } = require('../utils/q');
 const slugify = require('../utils/slugify');
 
+async function getChannelRegExp(site) {
+  if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
+
+  const sites = await knex('sites').where('network_id', site.network.id);
+
+  return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
+}
+
 function deriveEntryId(release) {
   return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
 }
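getChannelRegExp builds a case-insensitive alternation of each sibling channel's match parameter (falling back to its name) for the hushpass and interracialpass networks, presumably so releases scraped from an umbrella site can be attributed to a channel; an illustrative example, with made-up rows standing in for the knex query result:

// Illustration only: rows are made up, but mirror what the knex query above returns.
const channels = [
  { name: 'Shot Her First', parameters: {} },
  { name: 'My Wifes First Monster Cock', parameters: { match: 'My Wifes First Monster Cock' } },
];

const channelRegExp = new RegExp(channels.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');

channelRegExp.test('shot her first scene 42'); // true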
@@ -54,7 +62,7 @@ function scrapeLatest(scenes, site) {
   });
 }
 
-function scrapeLatestT1(scenes, site) {
+function scrapeLatestT1(scenes, site, accSiteReleases) {
   return scenes.map(({ q, qi, qd, ql, qu }) => {
     const release = {};
 
@@ -79,8 +87,13 @@ function scrapeLatestT1(scenes, site) {
     // release.entryId = q('.img-div img', 'id')?.match(/set-target-(\d+)/)[1];
     release.entryId = deriveEntryId(release);
 
+    if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
+      // filter out releases that were already scraped from a categorized site
+      return null;
+    }
+
     return release;
-  });
+  }).filter(Boolean);
 }
 
 function scrapeLatestTour(scenes) {
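The new accFilter check keys on entryId, which deriveEntryId builds from the release date and slugified title, so a scene first collected from a categorized channel and later encountered on the umbrella site resolves to the same id and is dropped the second time; a sketch of that comparison with illustrative data:

// Sketch: the accFilter comparison above, with illustrative entryIds
// (deriveEntryId produces date plus slugified title).
const accSiteReleases = [
  { entryId: '2020-01-15-example-scene-title' }, // collected earlier from a categorized site
];

const release = { entryId: '2020-01-15-example-scene-title' }; // same scene, encountered again

const alreadyScraped = accSiteReleases.map(accRelease => accRelease.entryId).includes(release.entryId); // true, so the release is filtered out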
@@ -199,15 +212,7 @@ function scrapeSceneTour({ html, q, qd, qa, qis }, site, url) {
   return release;
 }
 
-async function getChannelRegExp(site) {
-  if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null;
-
-  const sites = await knex('sites').where('network_id', site.network.id);
-
-  return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
-}
-
-async function fetchLatest(site, page = 1) {
+async function fetchLatest(site, page = 1, _beforeFetchLatest, accSiteReleases) {
   const url = (site.parameters?.latest && util.format(site.parameters.latest, page))
     || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
     || `${site.url}/categories/movies_${page}_d.html`;
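fetchLatest builds the overview URL from the site's latest template (a util.format pattern where %d is the page number), falling back to the t1 or plain movies listing; for example, with the Interracial Pass category template added earlier in this commit:

const util = require('util');

// The 'latest' parameter of the category site defined above.
const template = 'https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_%d_d.html';

util.format(template, 2);
// => 'https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_2_d.html'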
@@ -215,10 +220,10 @@ async function fetchLatest(site, page = 1) {
   const qLatest = await geta(url, '.modelfeature, .item-video, .updateItem');
 
   if (!qLatest) return null;
-  if (site.parameters?.t1) return scrapeLatestT1(qLatest, site);
-  if (site.parameters?.tour) return scrapeLatestTour(qLatest, site);
+  if (site.parameters?.t1) return scrapeLatestT1(qLatest, site, accSiteReleases);
+  if (site.parameters?.tour) return scrapeLatestTour(qLatest, site, accSiteReleases);
 
-  return scrapeLatest(qLatest, site);
+  return scrapeLatest(qLatest, site, accSiteReleases);
 }
 
 async function fetchScene(url, site, baseRelease, beforeFetchLatest) {