From ea02ec394396307cf5d7864bdc7b7f767f95f30d Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 29 Oct 2024 22:42:30 +0100 Subject: [PATCH] Separated media request limits. --- config/default.js | 5 +++++ package-lock.json | 11 ++++++----- package.json | 2 +- src/app.js | 5 +++-- src/argv.js | 10 ++++++++++ src/media.js | 1 + src/scrapers/analvids.js | 37 ++++++++++++++++++++++++++++++++++++- src/utils/http.js | 17 +++++++++++++---- 8 files changed, 75 insertions(+), 13 deletions(-) diff --git a/config/default.js b/config/default.js index a8d98985..09651e56 100755 --- a/config/default.js +++ b/config/default.js @@ -301,6 +301,7 @@ module.exports = { }, proxy: { enable: false, + protocol: 'http', host: '', port: 8888, hostnames: [ @@ -376,6 +377,10 @@ module.exports = { interval: 50, concurrency: 20, }, + media: { + interval: 50, + concurrency: 20, + }, 'www.kink.com': { interval: 1000, concurrency: 1, diff --git a/package-lock.json b/package-lock.json index c82ee58a..2e878f0b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -89,7 +89,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.13.3", + "unprint": "^0.14.1", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -18312,9 +18312,9 @@ } }, "node_modules/unprint": { - "version": "0.13.3", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.13.3.tgz", - "integrity": "sha512-HRpW+OdKmtW+cLnvLqYNVL2voH3aGvene8fxzAQzw2O0zPQrgv2iz5YivfQpxNyKsF+2jeUUma2ttWH8IttkHg==", + "version": "0.14.1", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.14.1.tgz", + "integrity": "sha512-LpsktR7NK3iDaYfy1HpNOiYoKGzLSq6wDhQN7RcwTQVJMz9kE0qQ8DS+ru2L76j52lq4v6oBktpnghbe//s3Mw==", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", @@ -18323,7 +18323,8 @@ "eslint-config-airbnb": "^19.0.4", "eslint-config-airbnb-base": "^15.0.0", "jsdom": "^17.0.0", - "moment-timezone": "^0.5.34" + "moment-timezone": "^0.5.34", + "tunnel": "^0.0.6" } }, "node_modules/unprint/node_modules/@tootallnate/once": { diff --git a/package.json b/package.json index f5d2825d..b224857c 100755 --- a/package.json +++ b/package.json @@ -148,7 +148,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.13.3", + "unprint": "^0.14.1", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/src/app.js b/src/app.js index 3a90760f..b4ea4532 100755 --- a/src/app.js +++ b/src/app.js @@ -36,10 +36,11 @@ unprint.options({ 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36', }, limits: config.limits, + proxyAddress: `http://${config.proxy.host}:${config.proxy.port}`, }); -unprint.on('requestInit', (event) => logger.debug(`Unprint ${event.method} (${event.interval}ms/${event.concurrency}p) ${event.url}`)); -unprint.on('requestError', (event) => logger.error(`Unprint failed ${event.method} ${event.url} (${event.status}): ${event.statusText}`)); +unprint.on('requestInit', (event) => logger.debug(`Unprint ${event.method} (${event.interval}ms/${event.concurrency}p${event.proxy ? '' : ' proxied'}) ${event.url}`)); +unprint.on('requestError', (event) => logger.error(`Unprint failed ${event.proxied ? ' proxied' : ''}${event.method} ${event.url} (${event.status}): ${event.statusText}`)); function logActive() { setTimeout(() => { diff --git a/src/argv.js b/src/argv.js index 4bff43c0..8a861306 100755 --- a/src/argv.js +++ b/src/argv.js @@ -189,6 +189,16 @@ const { argv } = yargs type: 'number', // don't set default, because argument has to override config, but config has to override default }) + .option('media-interval', { + describe: 'Minimum wait time between HTTP media requests', + type: 'number', + // don't set default, because argument has to override config, but config has to override default + }) + .option('media-concurrency', { + describe: 'Maximum amount of parallel HTTP media requests', + type: 'number', + // don't set default, because argument has to override config, but config has to override default + }) .option('save', { describe: 'Save fetched releases to database', type: 'boolean', diff --git a/src/media.js b/src/media.js index 26017546..af553626 100755 --- a/src/media.js +++ b/src/media.js @@ -619,6 +619,7 @@ async function storeFile(media, options) { async function fetchHttpSource(source, tempFileTarget, hashStream) { const res = await http.get(source.src, { + limits: 'media', headers: { ...(source.referer && { referer: source.referer }), ...(source.host && { host: source.host }), diff --git a/src/scrapers/analvids.js b/src/scrapers/analvids.js index 0c386893..c0c70087 100644 --- a/src/scrapers/analvids.js +++ b/src/scrapers/analvids.js @@ -91,7 +91,9 @@ function scrapeProfile({ query }, url, channel) { async function fetchLatest(channel, page) { // const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel - const res = await unprint.get(`${channel.url}/latest/${page}`, { selectAll: '.card-scene' }); // studios as channels + const res = await unprint.get(`${channel.url}/latest/${page}`, { + selectAll: '.card-scene', // studios as channels + }); if (res.ok) { return scrapeAll(res.context, channel); @@ -100,6 +102,39 @@ async function fetchLatest(channel, page) { return res.status; } +/* +async function fetchLatest(channel, page) { + // const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel + // const res = await unprint.get(`${channel.url}/latest/${page}`, { selectAll: '.card-scene' }); // studios as channels + const url = `${channel.url}/latest/${page}`; // studios as channels + + const { tab } = await http.getBrowserSession('analvids', { + bypass: { + headless: false, + }, + }); + + const res = await tab.goto(url); + + const status = res.status(); + + console.log('STATUS', status); + + if (status === 200) { + const html = await tab.content(); + const context = unprint.initAll(html, '.card-scene'); // studios as channels + + const scenes = scrapeAll(context, channel); + + tab.close(); + + return scenes; + } + + return res.status; +} +*/ + async function getActorUrl(actor, channel) { if (actor.url) { return actor.url; diff --git a/src/utils/http.js b/src/utils/http.js index 62bef48f..c5024124 100755 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -108,20 +108,29 @@ function useCloudflareBypass(url, options) { return null; } +const propMap = { + media: { + interval: 'mediaInterval', + concurrency: 'mediaConcurrency', + }, +}; + function getLimiterValue(prop, options, hostname) { - if (argv[prop] !== undefined) { + const mappedProp = propMap[options.limits]?.[prop] || prop; + + if (typeof argv[mappedProp] !== 'undefined') { return argv[prop]; } - if (options[prop] !== undefined) { + if (typeof options[prop] !== 'undefined') { return options[prop]; } - if (config.limits[hostname]?.enable !== false && config.limits[hostname]?.[prop] !== undefined) { + if (config.limits[hostname]?.enable !== false && typeof config.limits[hostname]?.[prop] !== 'undefined') { return config.limits[hostname][prop]; } - return config.limits.default[prop]; + return config.limits[options.limits || 'default'][prop]; } function getLimiter(options = {}, url) {