From 3889faee26618a70d3860d1d748ff639f7398957 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Thu, 5 Mar 2020 23:01:03 +0100 Subject: [PATCH] Added optional sequential scraping and acc release injection. Added Hush Pass and Interracial Pass logos. --- seeds/01_networks.js | 6 ++++ seeds/02_sites.js | 33 ++++++++++--------- src/scrape-sites.js | 76 ++++++++++++++++++++++++++------------------ src/scrapers/hush.js | 33 +++++++++++-------- 4 files changed, 88 insertions(+), 60 deletions(-) diff --git a/seeds/01_networks.js b/seeds/01_networks.js index 76a87a5c..b00893ae 100644 --- a/seeds/01_networks.js +++ b/seeds/01_networks.js @@ -181,12 +181,18 @@ const networks = [ name: 'Hush Pass', url: 'http://www.hushpass.com', parent: 'hush', + parameters: { + sequential: true, + }, }, { slug: 'interracialpass', name: 'Interracial Pass', url: 'http://www.interracialpass.com', parent: 'hush', + parameters: { + sequential: true, + }, }, { slug: 'insex', diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 9fcf96fa..fcc764c9 100644 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -2101,15 +2101,6 @@ const sites = [ }, }, // HUSH PASS - { - slug: 'hushpass', - name: 'Hush Pass', - url: 'https://hushpass.com', - network: 'hushpass', - parameters: { - t1: true, - }, - }, { slug: 'shotherfirst', name: 'Shot Her First', @@ -2264,17 +2255,17 @@ const sites = [ t1: true, }, }, - // INTERRACIAL PASS { - slug: 'interracialpass', - name: 'Interracial Pass', - url: 'https://www.interracialpass.com', - tags: ['interracial'], - network: 'interracialpass', + slug: 'hushpass', + name: 'Hush Pass', + url: 'https://hushpass.com', + network: 'hushpass', parameters: { t1: true, + accFilter: true, }, }, + // INTERRACIAL PASS { slug: '2bigtobetrue', name: '2 Big To Be True', @@ -2355,9 +2346,21 @@ const sites = [ parameters: { latest: 'https://www.interracialpass.com/t1/categories/my-wifes-first-monster-cock_%d_d.html', media: 'https://www.interracialpass.com', + match: 'My Wifes First Monster Cock', t1: true, }, }, + { + slug: 'interracialpass', + name: 'Interracial Pass', + url: 'https://www.interracialpass.com', + tags: ['interracial'], + network: 'interracialpass', + parameters: { + t1: true, + accFilter: true, + }, + }, // INSEX { slug: 'sexuallybroken', diff --git a/src/scrape-sites.js b/src/scrape-sites.js index 91be1fab..950a0947 100644 --- a/src/scrape-sites.js +++ b/src/scrape-sites.js @@ -37,12 +37,12 @@ async function findDuplicateReleaseIds(latestReleases, accReleases) { .concat(accReleases.map(release => String(release.entryId)))); } -async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate = getAfterDate(), accReleases = [], page = argv.page) { +async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate = getAfterDate(), accReleases = [], page = argv.page) { if (!argv.latest || !scraper.fetchLatest) { return []; } - const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest); + const latestReleases = await scraper.fetchLatest(site, page, beforeFetchLatest, accSiteReleases); if (!Array.isArray(latestReleases)) { logger.warn(`Scraper returned ${latestReleases || 'null'} when fetching latest from '${site.name}' on '${site.network.name}'`); @@ -73,7 +73,7 @@ async function scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate || (argv.last && accReleases.length + uniqueReleases.length < argv.last)) ) { // oldest release on page is newer that specified date range, or latest count has not yet been met, fetch next 
page - return scrapeUniqueReleases(scraper, site, beforeFetchLatest, afterDate, accReleases.concat(uniqueReleases), page + 1); + return scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases, afterDate, accReleases.concat(uniqueReleases), page + 1); } if (argv.last && uniqueReleases.length >= argv.last) { @@ -132,12 +132,12 @@ async function deepFetchReleases(baseReleases, beforeFetchLatest) { }); } -async function scrapeSiteReleases(scraper, site) { - const beforeFetchLatest = await scraper.beforeFetchLatest?.(site); +async function scrapeSiteReleases(scraper, site, accSiteReleases) { + const beforeFetchLatest = await scraper.beforeFetchLatest?.(site, accSiteReleases); const [newReleases, upcomingReleases] = await Promise.all([ - scrapeUniqueReleases(scraper, site, beforeFetchLatest), // fetch basic release info from scene overview - scrapeUpcomingReleases(scraper, site, beforeFetchLatest), // fetch basic release info from upcoming overview + scrapeUniqueReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from scene overview + scrapeUpcomingReleases(scraper, site, beforeFetchLatest, accSiteReleases), // fetch basic release info from upcoming overview ]); if (argv.upcoming) { @@ -154,35 +154,49 @@ async function scrapeSiteReleases(scraper, site) { return baseReleases; } +async function scrapeSite(site, network, accSiteReleases = []) { + if (site.parameters?.ignore) { + logger.warn(`Ignoring ${network.name}: ${site.name}`); + return []; + } + + const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; + + if (!scraper) { + logger.warn(`No scraper found for '${site.name}' (${site.slug})`); + return []; + } + + try { + const siteReleases = await scrapeSiteReleases(scraper, site, accSiteReleases); + + return siteReleases.map(release => ({ ...release, site })); + } catch (error) { + logger.error(`${site.name}: Failed to scrape releases: ${error.message}`); + + return []; + } +} + async function scrapeSites() { const networks = await fetchIncludedSites(); - const scrapedNetworks = await Promise.map(networks, async network => Promise.map(network.sites, async (site) => { - if (site.parameters?.ignore) { - logger.warn(`Ignoring ${network.name}: ${site.name}`); - return []; + const scrapedNetworks = await Promise.map(networks, async (network) => { + if (network.parameters?.sequential) { + logger.info(`Scraping '${network.name}' sequentially`); + + return Promise.reduce(network.sites, async (acc, site) => { + const accSiteReleases = await acc; + const siteReleases = await scrapeSite(site, network, accSiteReleases); + + return accSiteReleases.concat(siteReleases); + }, Promise.resolve([])); } - const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; - - if (!scraper) { - logger.warn(`No scraper found for '${site.name}' (${site.slug})`); - return []; - } - - try { - const siteReleases = await scrapeSiteReleases(scraper, site); - - return siteReleases.map(release => ({ ...release, site })); - } catch (error) { - logger.error(`${site.name}: Failed to scrape releases: ${error.message}`); - - return []; - } - }, { - // 2 network sites at a time - concurrency: 2, - }), + return Promise.map(network.sites, async site => scrapeSite(site, network), { + concurrency: network.parameters?.concurrency || 2, + }); + }, { // 5 networks at a time concurrency: 5, diff --git a/src/scrapers/hush.js b/src/scrapers/hush.js index 1cc6a665..f3951862 100644 --- a/src/scrapers/hush.js +++ b/src/scrapers/hush.js @@ -6,6 
+6,14 @@ const knex = require('../knex'); const { get, geta, fd } = require('../utils/q'); const slugify = require('../utils/slugify'); +async function getChannelRegExp(site) { + if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null; + + const sites = await knex('sites').where('network_id', site.network.id); + + return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i'); +} + function deriveEntryId(release) { return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`; } @@ -54,7 +62,7 @@ function scrapeLatest(scenes, site) { }); } -function scrapeLatestT1(scenes, site) { +function scrapeLatestT1(scenes, site, accSiteReleases) { return scenes.map(({ q, qi, qd, ql, qu }) => { const release = {}; @@ -79,8 +87,13 @@ function scrapeLatestT1(scenes, site) { // release.entryId = q('.img-div img', 'id')?.match(/set-target-(\d+)/)[1]; release.entryId = deriveEntryId(release); + if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) { + // filter out releases that were already scraped from a categorized site + return null; + } + return release; - }); + }).filter(Boolean); } function scrapeLatestTour(scenes) { @@ -199,15 +212,7 @@ function scrapeSceneTour({ html, q, qd, qa, qis }, site, url) { return release; } -async function getChannelRegExp(site) { - if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null; - - const sites = await knex('sites').where('network_id', site.network.id); - - return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i'); -} - -async function fetchLatest(site, page = 1) { +async function fetchLatest(site, page = 1, _beforeFetchLatest, accSiteReleases) { const url = (site.parameters?.latest && util.format(site.parameters.latest, page)) || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`) || `${site.url}/categories/movies_${page}_d.html`; @@ -215,10 +220,10 @@ async function fetchLatest(site, page = 1) { const qLatest = await geta(url, '.modelfeature, .item-video, .updateItem'); if (!qLatest) return null; - if (site.parameters?.t1) return scrapeLatestT1(qLatest, site); - if (site.parameters?.tour) return scrapeLatestTour(qLatest, site); + if (site.parameters?.t1) return scrapeLatestT1(qLatest, site, accSiteReleases); + if (site.parameters?.tour) return scrapeLatestTour(qLatest, site, accSiteReleases); - return scrapeLatest(qLatest, site); + return scrapeLatest(qLatest, site, accSiteReleases); } async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
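
Reviewer note (not part of the patch): below is a minimal sketch of the sequential-accumulator flow this patch adds to scrapeSites(), assuming bluebird's Promise.reduce as used in src/scrape-sites.js. The network/site shape, the sequential and accFilter parameters, and the entryId field come from the diff; fetchSiteReleases is a hypothetical stand-in for scrapeSiteReleases(), and the sketch folds the accFilter dedupe into the reducer for brevity, whereas the patch itself applies it inside scrapeLatestT1 in src/scrapers/hush.js.

const Promise = require('bluebird');

// Hypothetical stand-in for scrapeSiteReleases(); assumed to resolve to
// releases that each carry an entryId, as deriveEntryId() produces.
async function fetchSiteReleases(site, accSiteReleases) {
  return site.releases || [];
}

async function scrapeNetworkSequentially(network) {
  // Sites are visited one at a time so releases gathered from earlier
  // (categorized) sites can inform the later (generic) ones.
  return Promise.reduce(network.sites, async (accSiteReleases, site) => {
    const siteReleases = await fetchSiteReleases(site, accSiteReleases);

    // Sites seeded with accFilter (the generic Hush Pass and Interracial Pass
    // entries in 02_sites.js) drop releases whose entryId was already
    // collected from a sibling site earlier in the reduction.
    const newReleases = site.parameters?.accFilter
      ? siteReleases.filter(release => !accSiteReleases
        .some(accRelease => accRelease.entryId === release.entryId))
      : siteReleases;

    return accSiteReleases.concat(newReleases);
  }, []);
}

Networks seeded with { sequential: true } (hushpass and interracialpass in 01_networks.js) would take this path; all other networks keep the concurrent Promise.map branch with its per-network site concurrency of 2.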