
239 lines
5.8 KiB
Raw Normal View History

'use strict';
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
2020-12-29 01:05:22 +00:00
const { JSDOM, toughCookie } = require('jsdom');
const windows = require('./http-windows');
const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
const pipeline = util.promisify(stream.pipeline);
// Cache of rate limiters. getLimiter() fills it as limiters[interval][concurrency].
const limiters = {};
// NOTE(review): orphaned fragment. The statement that opens this object literal is not present in this
// extract. Presumably it is a Bluebird Promise.config({ ... }) call, because getTimeout() relies on the
// onCancel executor argument. Confirm against the original file.
cancellation: true,
// Baseline request options: spread into new sessions by getSession() and used as the timeout fallback
// in getLimiter(), getTimeout() and scheduleRequest().
const defaultOptions = {
2021-10-19 23:46:56 +00:00
// Timeout value read from the ../argv module.
timeout: argv.requestTimeout,
encodeJSON: true,
// finalizeResult() only builds a JSDOM window when options.parse is truthy, so parsing is off by default.
parse: false,
headers: {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
// NOTE(review): the closing braces of headers and defaultOptions are missing from this extract.
// HTTPS-over-HTTP tunnelling agent. request() assigns it to options.agent when useProxy(url) returns true.
const proxyAgent = tunnel.httpsOverHttp({
proxy: {
// NOTE(review): only the port is visible here. Other proxy fields (for example a host) and the closing
// braces appear to be missing from this extract. Confirm against the original file.
port: config.proxy.port,
/**
 * Decides whether a request to the given URL should be routed through the proxy agent.
 *
 * @param {string} url - Absolute URL of the outgoing request. It is only parsed when the
 *   proxy is enabled; `new URL` throws a TypeError for an invalid URL.
 * @returns {boolean} `false` when `config.proxy.enable` is falsy, otherwise whether the
 *   URL's hostname is an exact member of `config.proxy.hostnames`.
 */
function useProxy(url) {
  if (!config.proxy.enable) {
    return false;
  }

  const { hostname } = new URL(url);

  // Exact hostname match only; subdomains must be listed explicitly.
  return config.proxy.hostnames.includes(hostname);
}
/**
 * Resolves one rate-limit setting by precedence.
 *
 * Lookup order, first defined value wins:
 *   1. `argv[prop]`
 *   2. `options[prop]`
 *   3. `config.limits[hostname][prop]`, unless that host entry has `enable === false`
 *   4. `config.limits.default[prop]`
 *
 * Values are compared against `undefined`, so `0` and other falsy values are honoured.
 *
 * @param {string} prop - Setting name; getLimiter() passes 'interval' and 'concurrency'.
 * @param {object} options - Per-request options that may carry an override for `prop`.
 * @param {string} hostname - Hostname used to look up a host-specific entry in `config.limits`.
 * @returns {*} The resolved value; `undefined` only if the default entry lacks `prop` too.
 */
function getLimiterValue(prop, options, hostname) {
  if (argv[prop] !== undefined) {
    return argv[prop];
  }

  if (options[prop] !== undefined) {
    return options[prop];
  }

  if (config.limits[hostname]?.enable !== false && config.limits[hostname]?.[prop] !== undefined) {
    return config.limits[hostname][prop];
  }

  return config.limits.default[prop];
}
/**
 * Returns the Bottleneck limiter for the interval/concurrency that applies to `url`,
 * creating and caching it on first use.
 *
 * Limiters are cached in `limiters[interval][concurrency]`. The hostname only influences
 * which interval and concurrency are selected, so different hosts that resolve to the same
 * pair share one limiter. The limiter's `timeout` is fixed by whichever call created it.
 *
 * @param {object} [options={}] - Request options; may override `interval`, `concurrency`
 *   (via getLimiterValue) and `timeout`.
 * @param {string} url - Absolute request URL; `new URL` throws a TypeError if it is invalid.
 * @returns {Bottleneck} The cached or newly created limiter.
 */
function getLimiter(options = {}, url) {
  const { hostname } = new URL(url);

  const interval = getLimiterValue('interval', options, hostname);
  const concurrency = getLimiterValue('concurrency', options, hostname);

  if (!limiters[interval]?.[concurrency]) {
    limiters[interval] = limiters[interval] || {};

    limiters[interval][concurrency] = new Bottleneck({
      minTime: interval,
      maxConcurrent: concurrency,
      timeout: (options.timeout || defaultOptions.timeout) + 10000, // timeout 10 seconds after bhttp should
    });
  }

  return limiters[interval][concurrency];
}
2021-03-17 01:09:34 +00:00
/**
 * Performs a single HTTP request through bhttp or a caller-supplied session.
 * Called by scheduleRequest() from inside the limiter job.
 *
 * @param {string} [method] - Name of the method to call on the http object; defaults to get.
 * @param {string} url - Request URL; also passed to useProxy() to decide on proxying.
 * @param {*} body - Request body; when falsy the two-argument call form is used.
 * @param {object} [requestOptions] - Options; requestOptions.session replaces bhttp when set.
 * @param {object} limiter - Limiter chosen by the caller; only read here for the debug log line.
 * @returns {Promise<object>} The raw response returned by the http call.
 */
async function request(method = 'get', url, body, requestOptions = {}, limiter) {
// Prefer the caller-provided session object and fall back to the bhttp module itself.
const http = requestOptions.session || bhttp;
2021-03-23 14:25:21 +00:00
const options = {
// NOTE(review): this object literal is never closed in this extract and nothing is copied from
// requestOptions in the visible lines. Presumably a spread of requestOptions precedes this
// property. Confirm against the original file.
session: null,
const withProxy = useProxy(url);
if (withProxy) {
options.agent = proxyAgent;
// Reads the underscore-prefixed _store.storeOptions of the limiter purely to log its
// minTime and maxConcurrent. NOTE(review): this looks like an internal field of the limiter; confirm it is stable.
logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}) ${url}`);
// Only pass a body argument when one was given.
const res = await (body
? http[method](url, body, options)
: http[method](url, options));
2021-03-17 01:09:34 +00:00
return res;
2021-03-17 01:09:34 +00:00
/**
 * Converts a raw HTTP response into the result object handed back to callers.
 *
 * Steps, in order:
 *   1. When `options.destination` is set, the response is piped through
 *      `options.transforms` (if any) into the destination and awaited.
 *   2. When `res.body` is a Buffer, it is decoded to a string, optionally parsed with JSDOM
 *      (`options.parse`), optionally registered in the `windows` map, and optionally
 *      written to `./html/<pathname>.html` (`argv.saveHtml`).
 *   3. Otherwise the body is returned untouched.
 *
 * @param {object} res - Raw response; reads `body`, `statusCode`, `headers` and, for
 *   Buffer bodies, `request.url`.
 * @param {object} options - Request options; reads `destination`, `transforms`, `parse`
 *   and `extract` (spread into the JSDOM constructor options).
 * @returns {Promise<object>} `{ body, status, headers, ok }`, plus `document` (a DOM
 *   document or `null`) when the body was a Buffer. `ok` is true for status 200-299.
 */
async function finalizeResult(res, options) {
  if (options.destination) {
    // res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
    await pipeline(res, ...(options.transforms || []), options.destination);
  }

  if (Buffer.isBuffer(res.body)) {
    const html = res.body.toString();
    const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
    // Slashes are replaced so the pathname can serve as a flat map key and file name.
    const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');

    // allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper
    if (window && /fetchScene|fetchMovie/.test(new Error().stack)) {
      windows.set(pathname, window);
    }

    if (argv.saveHtml) {
      await fs.writeFile(`./html/${pathname}.html`, html);
    }

    return {
      body: html,
      status: res.statusCode,
      headers: res.headers,
      document: window?.document || null,
      ok: res.statusCode >= 200 && res.statusCode <= 299,
    };
  }

  return {
    body: res.body,
    status: res.statusCode,
    headers: res.headers,
    ok: res.statusCode >= 200 && res.statusCode <= 299,
  };
}
/**
 * Creates a watchdog promise that rejects once the request timeout plus a fixed 10000 ms
 * margin has elapsed. scheduleRequest() races it against the actual request.
 *
 * @param {object} [options] - Request options; options.timeout overrides defaultOptions.timeout.
 * @param {string} url - Only used in the log and error messages.
 * @returns {Promise} Rejects with an Error on timeout; never resolved in the visible lines.
 */
function getTimeout(options, url) {
// The third executor argument (onCancel) is provided by the bluebird Promise required at the top of the file.
return new Promise((resolve, reject, onCancel) => {
2021-03-17 01:09:34 +00:00
// The timer starts as soon as this function is called, not when the request starts.
const timeout = setTimeout(() => {
2021-10-19 23:46:56 +00:00
logger.debug(`Canceled timed out request to ${url}`);
reject(new Error(`URL ${url} timed out`));
}, (options?.timeout || defaultOptions.timeout) + 10000);
2021-03-17 01:09:34 +00:00
onCancel(() => {
// NOTE(review): the body of this cancel handler and all closing delimiters are missing from this
// extract. Presumably it clears the timer created above. Confirm against the original file.
2021-03-17 01:09:34 +00:00
/**
 * Entry point shared by the verb helpers (get, post, put, patch, del, head): builds the
 * request options, runs the request through the matching limiter while racing it against
 * a timeout, and post-processes the response with finalizeResult().
 *
 * @param {string} [method] - HTTP method name passed on to request(); defaults to get.
 * @param {string} url - Request URL.
 * @param {*} body - Request body, or null for methods without one.
 * @param {object} [requestOptions] - Caller options; responseTimeout, timeout and destination are read here.
 * @returns {Promise<object>} The object produced by finalizeResult().
 */
async function scheduleRequest(method = 'get', url, body, requestOptions = {}) {
const options = {
// NOTE(review): lines are missing from this object literal in this extract (nothing closes headers
// or options). As shown, responseTimeout and stream would sit inside headers, which is presumably not
// the intent. Confirm against the original file.
headers: {
// First truthy value of responseTimeout, timeout, then the default timeout.
responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || defaultOptions.timeout,
// Streaming is switched on exactly when a destination was supplied.
stream: !!requestOptions.destination,
const limiter = getLimiter(options, url);
// NOTE(review): the timeout promise is created before the job is scheduled, so its timer already runs
// while the request waits in the limiter queue. Confirm this is intended.
const timeout = getTimeout(options, url);
2021-03-17 01:09:34 +00:00
// Whichever settles first wins: the actual request or the timeout rejection.
const result = await limiter.schedule(async () => Promise.race([request(method, url, body, options, limiter), timeout]));
const curatedResult = await finalizeResult(result, options);
2021-10-19 23:46:56 +00:00
logger.silly(`Response ${curatedResult.status} for ${method.toUpperCase()} ${url}`);
2021-03-17 01:09:34 +00:00
return curatedResult;
/**
 * Issues a rate-limited GET request.
 *
 * @param {string} url - Request URL.
 * @param {object} [options] - Request options forwarded to scheduleRequest().
 * @returns {Promise<object>} Whatever scheduleRequest() resolves to.
 */
async function get(url, options) {
  return scheduleRequest('get', url, null, options);
}
/**
 * Issues a rate-limited POST request.
 *
 * @param {string} url - Request URL.
 * @param {*} body - Request body forwarded to scheduleRequest().
 * @param {object} [options] - Request options forwarded to scheduleRequest().
 * @returns {Promise<object>} Whatever scheduleRequest() resolves to.
 */
async function post(url, body, options) {
  return scheduleRequest('post', url, body, options);
}
/**
 * Issues a rate-limited PUT request.
 *
 * @param {string} url - Request URL.
 * @param {*} body - Request body forwarded to scheduleRequest().
 * @param {object} [options] - Request options forwarded to scheduleRequest().
 * @returns {Promise<object>} Whatever scheduleRequest() resolves to.
 */
async function put(url, body, options) {
  return scheduleRequest('put', url, body, options);
}
/**
 * Issues a rate-limited PATCH request.
 *
 * @param {string} url - Request URL.
 * @param {*} body - Request body forwarded to scheduleRequest().
 * @param {object} [options] - Request options forwarded to scheduleRequest().
 * @returns {Promise<object>} Whatever scheduleRequest() resolves to.
 */
async function patch(url, body, options) {
  return scheduleRequest('patch', url, body, options);
}
/**
 * Issues a rate-limited DELETE request. Exported under the key `delete`; the local name
 * is `del` because `delete` is a reserved word.
 *
 * @param {string} url - Request URL.
 * @param {object} [options] - Request options forwarded to scheduleRequest().
 * @returns {Promise<object>} Whatever scheduleRequest() resolves to.
 */
async function del(url, options) {
  return scheduleRequest('delete', url, null, options);
}
2020-07-22 02:12:20 +00:00
/**
 * Issues a rate-limited HEAD request.
 *
 * @param {string} url - Request URL.
 * @param {object} [options] - Request options forwarded to scheduleRequest().
 * @returns {Promise<object>} Whatever scheduleRequest() resolves to.
 */
async function head(url, options) {
  return scheduleRequest('head', url, null, options);
}
/**
 * Creates a bhttp session whose options are `defaultOptions` overlaid with `options`.
 * The merge is shallow: a nested object in `options` (for example `headers`) replaces
 * the default one wholesale.
 *
 * @param {object} [options] - Session options; keys here win over `defaultOptions`.
 * @returns {object} The session returned by `bhttp.session`.
 */
function getSession(options) {
  return bhttp.session({ ...defaultOptions, ...options });
}
/**
 * Builds a tough-cookie CookieJar (via the toughCookie export of jsdom) with looseMode enabled.
 * Exported as cookieJar.
 *
 * @param {object} store - Passed as the first argument to the CookieJar constructor.
 * @param {object} [options] - NOTE(review): unused in the visible lines. Presumably it is spread into the
 *   jar options on a line missing from this extract. Confirm against the original file.
 * @returns {object} The new CookieJar instance.
 */
function getCookieJar(store, options) {
2020-12-29 01:05:22 +00:00
return new toughCookie.CookieJar(store, {
looseMode: true,
// NOTE(review): the remainder of this options object and the closing delimiters are missing from this extract.
// Public API of this module.
// NOTE(review): only three entries are visible. The verb helpers defined above (get, head, post, put,
// patch) are presumably exported on lines missing from this extract, as is the closing brace. Confirm.
module.exports = {
2020-12-29 01:05:22 +00:00
2020-07-22 02:12:20 +00:00
// del is exposed under the key delete.
delete: del,
session: getSession,
cookieJar: getCookieJar,