From 3d427f7e1d05beb70e4325c85321580cd87e366f Mon Sep 17 00:00:00 2001
From: DebaucheryLibrarian
Date: Sun, 22 Nov 2020 23:50:24 +0100
Subject: [PATCH] Allow HTTP rate limits to be set by configuration or argument

---
 config/default.js     |  5 +++++
 src/argv.js           | 10 ++++++++++
 src/scrapers/vixen.js | 16 ++++++++--------
 src/utils/http.js     | 40 +++++++++++++++++++++++++++++++---------
 4 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/config/default.js b/config/default.js
index 27c02549..6f32ed84 100644
--- a/config/default.js
+++ b/config/default.js
@@ -202,6 +202,11 @@ module.exports = {
 			interval: 50,
 			concurrency: 20,
 		},
+		'www.deeper.com': {
+			enable: false, // omit or set to true to enable this limit
+			interval: 1000,
+			concurrency: 1,
+		},
 	},
 	fetchAfter: [1, 'week'],
 	missingDateLimit: 3,
diff --git a/src/argv.js b/src/argv.js
index 707948be..dcb02cfc 100644
--- a/src/argv.js
+++ b/src/argv.js
@@ -153,6 +153,16 @@ const { argv } = yargs
 		type: 'number',
 		default: 1,
 	})
+	.option('interval', {
+		describe: 'Minimum wait time between HTTP requests in milliseconds',
+		type: 'number',
+		// no default: a CLI argument must override the config, and the config must override the built-in default
+	})
+	.option('concurrency', {
+		describe: 'Maximum number of parallel HTTP requests',
+		type: 'number',
+		// no default: a CLI argument must override the config, and the config must override the built-in default
+	})
 	.option('save', {
 		describe: 'Save fetched releases to database',
 		type: 'boolean',
diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js
index 47a2c92f..26c6cc4e 100644
--- a/src/scrapers/vixen.js
+++ b/src/scrapers/vixen.js
@@ -159,7 +159,7 @@ async function fetchActorReleases(pages, model, origin) {
 		const url = `${origin}/api${model.targetUrl}?page=${page}`;
 		const res = await http.get(url);
 
-		if (res.code === 200) {
+		if (res.status === 200) {
 			return scrapeAll(res.body.data.videos.videos, null, origin);
 		}
 
@@ -207,22 +207,22 @@ async function fetchLatest(site, page = 1) {
 	const url = `${site.url}/api/videos?page=${page}`;
 	const res = await http.get(url);
 
-	if (res.code === 200) {
+	if (res.status === 200) {
 		return scrapeAll(res.body.data.videos, site);
 	}
 
-	return res.code;
+	return res.status;
 }
 
 async function fetchUpcoming(site) {
 	const apiUrl = `${site.url}/api`;
 	const res = await http.get(apiUrl);
 
-	if (res.code === 200) {
+	if (res.status === 200) {
 		return scrapeUpcoming(res.body.data.nextScene, site);
 	}
 
-	return res.code;
+	return res.status;
 }
 
 async function fetchScene(url, site, baseRelease) {
@@ -231,11 +231,11 @@ async function fetchScene(url, site, baseRelease) {
 
 	const res = await http.get(apiUrl);
 
-	if (res.code === 200) {
+	if (res.status === 200) {
 		return scrapeScene(res.body.data, url, site, baseRelease);
 	}
 
-	return res.code;
+	return res.status;
 }
 
 async function fetchProfile({ name: actorName }, { site }, include) {
@@ -244,7 +244,7 @@ async function fetchProfile({ name: actorName }, { site }, include) {
 	const url = `${origin}/api/${actorSlug}`;
 	const res = await http.get(url);
 
-	if (res.code === 200) {
+	if (res.status === 200) {
 		return scrapeProfile(res.body.data, origin, include.scenes);
 	}
 
diff --git a/src/utils/http.js b/src/utils/http.js
index b62a45bf..90366d77 100644
--- a/src/utils/http.js
+++ b/src/utils/http.js
@@ -9,6 +9,7 @@ const Bottleneck = require('bottleneck');
 const { JSDOM } = require('jsdom');
 
 const logger = require('../logger')(__filename);
+const argv = require('../argv');
 
 const pipeline = util.promisify(stream.pipeline);
 const limiters = {};
@@ -33,12 +34,31 @@ function useProxy(url) {
 	}
 
 	const { hostname } = new URL(url);
+
 	return config.proxy.hostnames.includes(hostname);
 }
 
-function getLimiter(limit = {}) {
-	const interval = limit.interval === undefined ? config.limits.default.interval : limit.interval;
-	const concurrency = limit.concurrency === undefined ? config.limits.default.concurrency : limit.concurrency;
+function getLimiterValue(prop, options, hostname) {
+	if (argv[prop] !== undefined) {
+		return argv[prop];
+	}
+
+	if (options[prop] !== undefined) {
+		return options[prop];
+	}
+
+	if (config.limits[hostname]?.enable !== false && config.limits[hostname]?.[prop] !== undefined) {
+		return config.limits[hostname][prop];
+	}
+
+	return config.limits.default[prop];
+}
+
+function getLimiter(options = {}, url) {
+	const { hostname } = new URL(url);
+
+	const interval = getLimiterValue('interval', options, hostname);
+	const concurrency = getLimiterValue('concurrency', options, hostname);
 
 	if (!limiters[interval]?.[concurrency]) {
 		limiters[interval] = limiters[interval] || {};
@@ -52,7 +72,7 @@ function getLimiter(limit = {}) {
 	return limiters[interval][concurrency];
 }
 
-async function request(method = 'get', url, body, requestOptions = {}) {
+async function request(method = 'get', url, body, requestOptions = {}, limiter) {
 	const http = requestOptions.session || bhttp;
 
 	const options = {
@@ -60,16 +80,16 @@ async function request(method = 'get', url, body, requestOptions = {}) {
 		...requestOptions,
 		responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
 		stream: !!requestOptions.destination,
-		interval: requestOptions.interval || config.limits.default.interval,
-		concurrency: requestOptions.concurrency || config.limits.default.concurrency,
 		session: null,
 	};
 
-	if (useProxy(url)) {
+	const withProxy = useProxy(url);
+
+	if (withProxy) {
 		options.agent = proxyAgent;
 	}
 
-	logger.debug(`GET (${options.interval}ms/${options.concurrency}p) ${url}`);
+	logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}) ${url}`);
 
 	const res = await (body
 		? http[method](url, body, options)
@@ -107,7 +127,9 @@ async function request(method = 'get', url, body, requestOptions = {}) {
 }
 
 async function scheduleRequest(method = 'get', url, body, options) {
-	return getLimiter(options || {}).schedule(() => request(method, url, body, options));
+	const limiter = getLimiter(options, url);
+
+	return limiter.schedule(() => request(method, url, body, options, limiter));
 }
 
 async function get(url, options) {
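-- 
Note (outside the patch): limiter values now resolve in a fixed order. A
--interval or --concurrency CLI argument wins over per-request options, which
win over a matching per-host entry in config.limits (skipped when that entry
sets enable: false), which wins over config.limits.default. A minimal
standalone sketch of that precedence, where resolveLimit and
'www.example.com' are hypothetical names used only for illustration:

const config = {
	limits: {
		default: { interval: 50, concurrency: 20 },
		'www.example.com': { interval: 1000, concurrency: 1 }, // hypothetical host
	},
};
const argv = {}; // as parsed by yargs; empty when neither flag is passed

function resolveLimit(prop, options, hostname) {
	if (argv[prop] !== undefined) return argv[prop]; // 1. CLI argument
	if (options[prop] !== undefined) return options[prop]; // 2. per-request option

	const hostLimit = config.limits[hostname];

	if (hostLimit?.enable !== false && hostLimit?.[prop] !== undefined) {
		return hostLimit[prop]; // 3. per-host config, unless enable: false
	}

	return config.limits.default[prop]; // 4. global default
}

console.log(resolveLimit('interval', {}, 'www.example.com')); // 1000
console.log(resolveLimit('interval', {}, 'other.example.com')); // 50 (default)

Passing --interval 500 --concurrency 2 on the command line would populate
argv and override both values for every request, regardless of config.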
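A related detail: getLimiter() keys its cache on the resolved values, so all
hosts that resolve to the same interval and concurrency share one Bottleneck
instance. A minimal sketch of that caching, using Bottleneck's documented
minTime and maxConcurrent options; getCachedLimiter is an illustrative name,
and the cache shape mirrors the patch:

const Bottleneck = require('bottleneck');

const limiters = {};

// One limiter per (interval, concurrency) pair; later lookups reuse the
// same instance, so the rate limit applies across all of its callers.
function getCachedLimiter(interval, concurrency) {
	if (!limiters[interval]?.[concurrency]) {
		limiters[interval] = limiters[interval] || {};
		limiters[interval][concurrency] = new Bottleneck({
			minTime: interval, // at least `interval` ms between request starts
			maxConcurrent: concurrency, // at most `concurrency` requests in flight
		});
	}

	return limiters[interval][concurrency];
}

// Both lookups return the same instance, so their jobs queue together:
console.log(getCachedLimiter(1000, 1) === getCachedLimiter(1000, 1)); // true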