diff --git a/config/default.js b/config/default.js
index 8ec5c143..c96c49a6 100644
--- a/config/default.js
+++ b/config/default.js
@@ -12,9 +12,44 @@ module.exports = {
   // include: [],
   // exclude: [],
   exclude: [
+    ['21sextreme', [
+      // no longer updated
+      'mightymistress',
+      'dominatedgirls',
+      'homepornreality',
+      'peeandblow',
+      'cummingmatures',
+      'mandyiskinky',
+      'speculumplays',
+      'creampiereality',
+    ]],
+    ['blowpass', ['sunlustxxx']],
+    ['ddfnetwork', [
+      'fuckinhd',
+      'bustylover',
+    ]],
     ['famedigital', [
+      'daringsex',
       'lowartfilms',
     ]],
+    ['pornpros', [
+      'milfhumiliation',
+      'humiliated',
+      'flexiblepositions',
+      'publicviolations',
+      'amateurviolations',
+      'squirtdisgrace',
+      'cumdisgrace',
+      'webcamhackers',
+      'collegeteens',
+    ]],
+    ['score', [
+      'bigboobbundle',
+      'milfbundle',
+      'pornmegaload',
+      'scorelandtv',
+      'scoretv',
+    ]],
   ],
   profiles: [
     [
diff --git a/migrations/20190325001339_releases.js b/migrations/20190325001339_releases.js
index f123e93d..0db84781 100644
--- a/migrations/20190325001339_releases.js
+++ b/migrations/20190325001339_releases.js
@@ -162,10 +162,9 @@ exports.up = knex => Promise.resolve()
 
       table.integer('priority', 3)
         .defaultTo(0);
+
       table.boolean('show')
         .defaultTo(true);
-      table.boolean('scrape')
-        .defaultTo(true);
 
       table.datetime('created_at')
         .defaultTo(knex.fn.now());
diff --git a/seeds/02_sites.js b/seeds/02_sites.js
index ad7f5b91..bf61e964 100644
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@@ -81,7 +81,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'dominatedgirls',
@@ -91,7 +90,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'homepornreality',
@@ -101,7 +99,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'peeandblow',
@@ -111,7 +108,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'cummingmatures',
@@ -121,7 +117,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'mandyiskinky',
@@ -131,7 +126,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'speculumplays',
@@ -141,7 +135,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   {
     slug: 'creampiereality',
@@ -151,7 +144,6 @@ const sites = [
     parameters: {
       scene: 'https://21sextreme.com/en/video',
     },
-    scrape: false, // no longer updated
   },
   // 21SEXTURY
   {
@@ -1098,7 +1090,6 @@ const sites = [
     url: 'https://www.sunlustxxx.com',
     description: '',
     network: 'blowpass',
-    scrape: false,
     show: true, // site offline, use only for indexing old scenes
   },
   // BRAZZERS
   {
@@ -1467,8 +1458,6 @@ const sites = [
     description: 'HD Hardcore Sex & XXX Fantasy Porn Videos and Photos Produced in full HD featuring a Variety of Hardcore Porn Niches.',
     network: 'ddfnetwork',
     parameters: { native: true },
-    scrape: false,
-    show: true, // appears to be re-releases only
   },
   {
     slug: 'bustylover',
@@ -1476,8 +1465,6 @@ const sites = [
     url: 'https://bustylover.com',
     network: 'ddfnetwork',
     parameters: { native: true },
-    scrape: false,
-    show: true, // appears to be re-releases only
   },
   // DIGITAL PLAYGROUND
   {
@@ -1815,7 +1802,6 @@ const sites = [
     description: 'Welcome the official Daring Sex site, home of high quality erotica, sensual porn and hardcore exploration of the darker side of sexuality. Here you will find a variety of videos for lovers looking for a bit of extra, or something darker with an element of control.',
     network: 'famedigital',
     parameters: { api: true },
-    scrape: false,
     show: false, // no data sources
   },
   {
@@ -4141,14 +4127,12 @@ const sites = [
     url: 'https://milfhumiliation.com',
     network: 'pornpros',
     tags: ['milf'],
-    scrape: false,
   },
   {
     name: 'Humiliated',
     slug: 'humiliated',
     url: 'https://humiliated.com',
     network: 'pornpros',
-    scrape: false,
   },
   {
     name: 'Flexible Positions',
@@ -4158,7 +4142,6 @@ const sites = [
     parameters: {
       network: true,
     },
-    scrape: false,
   },
   {
     name: 'Public Violations',
@@ -4168,41 +4151,35 @@ const sites = [
     parameters: {
       network: true,
     },
-    scrape: false,
   },
   {
     name: 'Amateur Violations',
     slug: 'amateurviolations',
     url: 'https://amateurviolations.com',
     network: 'pornpros',
-    scrape: false,
   },
   {
     name: 'Squirt Disgrace',
     slug: 'squirtdisgrace',
     url: 'https://squirtdisgrace.com',
     network: 'pornpros',
-    scrape: false,
   },
   {
     name: 'Cum Disgrace',
     slug: 'cumdisgrace',
     url: 'https://cumdisgrace.com',
     network: 'pornpros',
-    scrape: false,
   },
   {
     name: 'Webcam Hackers',
     slug: 'webcamhackers',
     url: 'https://webcamhackers.com',
     network: 'pornpros',
-    scrape: false,
   },
   {
     name: 'College Teens',
     slug: 'collegeteens',
     network: 'pornpros',
-    scrape: false,
   },
   // PRIVATE
   {
@@ -4838,7 +4815,6 @@ const sites = [
     slug: 'bigboobbundle',
     url: 'https://www.bigboobbundle.com',
     network: 'score',
-    scrape: false,
     show: false, // all content appears to be on subsites
   },
   {
@@ -5132,7 +5108,6 @@ const sites = [
     url: 'https://www.milfbundle.com',
     network: 'score',
     show: false,
-    scrape: false,
   },
   {
     name: 'Teaming Cock',
@@ -5200,7 +5175,6 @@ const sites = [
     url: 'https://www.pornmegaload.com',
     network: 'score',
     show: false,
-    scrape: false,
   },
   {
     name: 'SaRennas World',
@@ -5238,7 +5212,6 @@ const sites = [
     url: 'https://www.scorepass.com/scorelandtv',
     network: 'score',
     priority: 1,
-    scrape: false,
     show: false, // appears to be streaming service for other sites
   },
   {
@@ -5247,7 +5220,6 @@ const sites = [
     url: 'https://www.scoretv.tv',
     network: 'score',
     priority: 1,
-    scrape: false,
     show: false, // similar to or same as Scoreland TV
   },
   {
@@ -6114,7 +6086,6 @@ exports.seed = knex => Promise.resolve()
       parameters: site.parameters,
       network_id: networksMap[site.network],
       priority: site.priority,
-      scrape: site.scrape,
       show: site.show,
     }));
 
diff --git a/src/app.js b/src/app.js
index 645d6f4c..99f57d58 100644
--- a/src/app.js
+++ b/src/app.js
@@ -1,9 +1,23 @@
 'use strict';
 
+// const knex = require('./knex');
 const argv = require('./argv');
-const knex = require('./knex');
 const initServer = require('./web/server');
+const knex = require('./knex');
+const fetchUpdates = require('./fetch-updates');
+
+async function init() {
+  if (argv.server) {
+    await initServer();
+    return;
+  }
+
+  await fetchUpdates();
+  knex.destroy();
+}
 
+/*
 const scrapeSites = require('./scrape-sites');
 const { scrapeScenes, scrapeMovies, deepFetchReleases } = require('./scrape-releases');
 const { storeReleases, updateReleasesSearch } = require('./releases');
@@ -16,7 +30,7 @@ if (process.env.NODE_ENV === 'development') {
 async function init() {
   if (argv.scene) {
     await scrapeScenes(argv.scene);
-  }
+
 
   if (argv.movie) {
     await scrapeMovies(argv.movie);
@@ -52,5 +66,6 @@ async function init() {
 
   knex.destroy();
 }
+*/
 
 module.exports = init;
diff --git a/src/fetch-updates.js b/src/fetch-updates.js
new file mode 100644
index 00000000..bed6ff6c
--- /dev/null
+++ b/src/fetch-updates.js
@@ -0,0 +1,210 @@
+'use strict';
+
+const Promise = require('bluebird');
+const moment = require('moment');
+
+const argv = require('./argv');
+const logger = require('./logger')(__filename);
+const knex = require('./knex');
+const include = require('./utils/argv-include')(argv);
+const scrapers = require('./scrapers/scrapers');
+const { fetchSitesFromArgv, fetchSitesFromConfig } = require('./sites');
+
+const afterDate = (() => {
+  if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) {
+    // using date
+    return moment
+      .utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY'])
+      .toDate();
+  }
+
+  // using time distance (e.g. "1 month")
+  return moment
+    .utc()
+    .subtract(...argv.after.split(' '))
+    .toDate();
+})();
+
+async function extractUniqueReleases(latestReleases, accReleases) {
+  const latestReleaseEntryIds = latestReleases.map(release => release.entryId);
+  const duplicateReleases = await knex('releases')
+    .whereIn('entry_id', latestReleaseEntryIds);
+
+  // add entry IDs of accumulated releases to prevent an infinite loop
+  // when one page contains the same release as the previous
+  const duplicateReleaseEntryIds = new Set(duplicateReleases
+    .map(release => String(release.entry_id))
+    .concat(accReleases.map(release => String(release.entryId))));
+
+  const uniqueReleases = latestReleases
+    .filter(release => !duplicateReleaseEntryIds.has(String(release.entryId)));
+
+  return uniqueReleases;
+}
+
+function getNextPage(uniqueReleases, pageAccReleases, oldestReleaseOnPage) {
+  if (uniqueReleases.length === 0) {
+    return false;
+  }
+
+  if (argv.last && pageAccReleases.length < argv.last) {
+    // request for last N releases not yet satisfied
+    return true;
+  }
+
+  if (oldestReleaseOnPage && moment(oldestReleaseOnPage.date).isAfter(afterDate)) {
+    // oldest release on page is newer than the specified date cut-off
+    return true;
+  }
+
+  // dates missing, and limit for scenes without dates not yet reached
+  return pageAccReleases.length <= argv.nullDateLimit;
+}
+
+async function scrapeLatestReleases(scraper, site, preData) {
+  if (!scraper.fetchLatest) {
+    return [];
+  }
+
+  const scrapePage = async (page = 1, accReleases = []) => {
+    const latestReleases = await scraper.fetchLatest(site, page, preData, include);
+
+    if (!Array.isArray(latestReleases)) {
+      // scraper is unable to fetch the releases and returned an HTTP code or null
+      logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.network.name})`);
+      return accReleases;
+    }
+
+    if (latestReleases.length === 0) {
+      // scraper successfully requested releases, but found none
+      return accReleases;
+    }
+
+    const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach the site the release is assigned to when stored
+    const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0];
+
+    const uniqueReleases = await extractUniqueReleases(latestReleasesWithSite, accReleases);
+    const pageAccReleases = accReleases.concat(uniqueReleases);
+
+    logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`);
+
+    if (getNextPage(uniqueReleases, pageAccReleases, oldestReleaseOnPage)) {
+      return scrapePage(page + 1, accReleases.concat(uniqueReleases));
+    }
+
+    if (argv.last) {
+      return pageAccReleases.slice(0, argv.last);
+    }
+
+    if (oldestReleaseOnPage) {
+      const recentReleases = uniqueReleases
+        .filter(release => moment(release.date).isAfter(afterDate));
+
+      return accReleases.concat(recentReleases);
+    }
+
+    return pageAccReleases.slice(0, argv.nullDateLimit);
+  };
+
+  return scrapePage(1, []);
+}
+
+async function scrapeUpcomingReleases(scraper, site, preData) {
+  if (!scraper.fetchUpcoming) {
+    return [];
+  }
+
+  try {
+    const upcomingReleases = await scraper.fetchUpcoming(site, 1, preData, include);
+
+    if (upcomingReleases) {
+      return upcomingReleases.map(release => ({
+        ...release,
+        site,
+        upcoming: true,
+      }));
+    }
+  } catch (error) {
+    logger.warn(`Failed to scrape upcoming releases for '${site.slug}' (${site.network.slug})`);
+  }
+
+  return [];
+}
+
+async function scrapeSiteReleases(scraper, site, preData) {
+  const [latestReleases, upcomingReleases] = await Promise.all([
+    argv.latest
+      ? scrapeLatestReleases(scraper, site, preData)
+      : [],
+    argv.upcoming
+      ? scrapeUpcomingReleases(scraper, site, preData)
+      : [],
+  ]);
+
+  return [...latestReleases, ...upcomingReleases];
+}
+
+async function scrapeSite(site, accSiteReleases) {
+  const scraper = scrapers.releases[site.slug]
+    || scrapers.releases[site.network.slug]
+    || scrapers.releases[site.network.parent?.slug];
+
+  if (!scraper) {
+    logger.warn(`No scraper found for '${site.name}' (${site.network.name})`);
+    return [];
+  }
+
+  try {
+    const beforeFetchLatest = await scraper.beforeFetchLatest?.(site);
+
+    const siteReleases = await scrapeSiteReleases(scraper, site, {
+      accSiteReleases,
+      beforeFetchLatest,
+    });
+
+    return siteReleases.map(release => ({ ...release, site }));
+  } catch (error) {
+    logger.error(`Failed to scrape releases from ${site.name} using ${scraper.slug}: ${error.message}`);
+
+    return [];
+  }
+}
+
+async function scrapeNetworkSequential(network) {
+  return Promise.reduce(
+    network.sites,
+    async (chain, site) => {
+      const accSiteReleases = await chain;
+      const siteReleases = await scrapeSite(site, accSiteReleases);
+
+      return accSiteReleases.concat(siteReleases);
+    },
+    Promise.resolve([]),
+  );
+}
+
+async function scrapeNetworkParallel(network) {
+  return Promise.map(
+    network.sites,
+    async site => scrapeSite(site),
+    { concurrency: 3 },
+  );
+}
+
+async function fetchUpdates() {
+  const includedNetworks = argv.sites || argv.networks
+    ? await fetchSitesFromArgv()
+    : await fetchSitesFromConfig();
+
+  const scrapedNetworks = await Promise.map(
+    includedNetworks,
+    async network => (network.parameters?.sequential
+      ? scrapeNetworkSequential(network)
+      : scrapeNetworkParallel(network)),
+    { concurrency: 5 },
+  );
+
+  return scrapedNetworks;
+}
+
+module.exports = fetchUpdates;
diff --git a/src/sites.js b/src/sites.js
index 85f0b3cf..5854edff 100644
--- a/src/sites.js
+++ b/src/sites.js
@@ -113,7 +113,6 @@ async function fetchSitesFromArgv() {
     )
     .whereIn('sites.slug', argv.sites || [])
     .orWhereIn('networks.slug', argv.networks || [])
-    .where('sites.scrape', true)
    .leftJoin('networks', 'sites.network_id', 'networks.id');
 
   const curatedSites = await curateSites(rawSites, true);
@@ -132,7 +131,6 @@ async function fetchSitesFromConfig() {
       'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
     )
     .leftJoin('networks', 'sites.network_id', 'networks.id')
-    .where('sites.scrape', true)
     .where((builder) => {
       if (config.include) {
         builder
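
Reviewer note: the new src/fetch-updates.js derives its date cut-off from argv.after, which may be either an explicit date or a time distance. Below is a minimal standalone sketch of that parsing, not part of the patch itself; the parseAfter helper name and the sample values are illustrative only.

```js
// Mirrors the afterDate IIFE in src/fetch-updates.js; sample values only.
const moment = require('moment');

function parseAfter(after) {
  if (/\d{2,4}-\d{2}-\d{2,4}/.test(after)) {
    // explicit date, e.g. "2020-01-01" or "01-01-2020"
    return moment.utc(after, ['YYYY-MM-DD', 'DD-MM-YYYY']).toDate();
  }

  // time distance, e.g. "1 month" becomes subtract('1', 'month') from now
  return moment.utc().subtract(...after.split(' ')).toDate();
}

console.log(parseAfter('2020-01-01')); // fixed cut-off date
console.log(parseAfter('1 month')); // rolling one-month window
```

Either form yields the cut-off that getNextPage compares against the oldest release on each page to decide whether another page should be fetched.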