'use strict'; const config = require('config'); const Promise = require('bluebird'); const bhttp = require('bhttp'); const fs = require('fs').promises; const util = require('util'); const stream = require('stream'); const tunnel = require('tunnel'); const Bottleneck = require('bottleneck'); const { JSDOM, toughCookie } = require('jsdom'); const windows = require('./http-windows'); const logger = require('../logger')(__filename); const virtualConsole = require('./virtual-console')(__filename); const argv = require('../argv'); const pipeline = util.promisify(stream.pipeline); const limiters = { bypass: new Bottleneck({ minTime: 1000, maxConcurrent: 1, timeout: 60000, }), }; const bypassSessions = new Map(); Promise.config({ cancellation: true, }); const defaultOptions = { timeout: argv.requestTimeout, encodeJSON: true, parse: false, headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36', }, }; const proxyAgent = tunnel.httpsOverHttp({ proxy: { host: config.proxy.host, port: config.proxy.port, }, }); function useProxy(url) { if (!config.proxy.enable) { return false; } const { hostname } = new URL(url); return config.proxy.hostnames.includes(hostname); } function useCloudflareBypass(url, options) { if (!config.bypass.cloudflare.enable) { return null; } const { hostname } = new URL(url); if (options.bypassCloudflare === 'shared') { return 'shared'; } if (options.bypassCloudflare === 'independent') { return hostname; } if (config.bypass.cloudflare.sharedHostnames.includes(hostname)) { return 'shared'; } if (config.bypass.cloudflare.independentHostnames.includes(hostname)) { return hostname; } return null; } function getLimiterValue(prop, options, hostname) { if (argv[prop] !== undefined) { return argv[prop]; } if (options[prop] !== undefined) { return options[prop]; } if (config.limits[hostname]?.enable !== false && config.limits[hostname]?.[prop] !== undefined) { return config.limits[hostname][prop]; } return config.limits.default[prop]; } function getLimiter(options = {}, url) { const { hostname } = new URL(url); const interval = getLimiterValue('interval', options, hostname); const concurrency = getLimiterValue('concurrency', options, hostname); if (!limiters[interval]?.[concurrency]) { limiters[interval] = limiters[interval] || {}; limiters[interval][concurrency] = new Bottleneck({ minTime: interval, maxConcurrent: concurrency, timeout: (options.timeout || defaultOptions.timeout) + 10000, // timeout 10 seconds after bhttp should }); } return limiters[interval][concurrency]; } function extractJson(solution) { if (solution.headers['content-type'].includes('application/json')) { const { document } = new JSDOM(solution.response, { virtualConsole }).window; const dataString = document.querySelector('body > pre')?.textContent; if (dataString) { const data = JSON.parse(dataString); return data; } } return solution.response; } async function getBypassSession(url, hostname) { if (bypassSessions.has(hostname)) { return bypassSessions.get(hostname); } const sessionRes = await bhttp.post(config.bypass.cloudflare.path, { cmd: 'sessions.create', proxy: useProxy(url) ? { url: `${config.proxy.host}:${config.proxy.port}`, } : null, }, { encodeJSON: true, }); if (sessionRes.statusCode !== 200 || sessionRes.body.status !== 'ok') { throw new Error(`Could not acquire CloudFlare bypass session for ${url} (${sessionRes.statusCode}): ${sessionRes.body?.message}`); } bypassSessions.set(hostname, sessionRes.body.session); return sessionRes.body.session; } async function destroyBypassSession(sessionId) { const sessionDestroyRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, { cmd: 'sessions.destroy', session: sessionId, }, { encodeJSON: true, })); if (sessionDestroyRes.statusCode === 200 && sessionDestroyRes.body.status === 'ok') { bypassSessions.delete(sessionId); logger.verbose(`Destroyed bypass session ${sessionId}`); return true; } logger.warn(`Failed to destroy bypass session ${sessionId} (${sessionDestroyRes.statusCode}): ${sessionDestroyRes.body?.message}`); return false; } async function destroyBypassSessions() { const sessionListRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, { cmd: 'sessions.list', }, { encodeJSON: true, })); if (sessionListRes.statusCode !== 200 && sessionListRes.body.status !== 'ok') { logger.warn(`Failed to remove bypass sessions (${sessionListRes.statusCode}): ${sessionListRes.body?.message}`); } await Promise.map(sessionListRes.body.sessions, async (sessionId) => destroyBypassSession(sessionId), { concurrency: 5 }); } async function bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts = 0) { const sessionId = await limiters.bypass.schedule(async () => getBypassSession(url, cloudflareBypass)); // the bypass proxy opens a new browser for each request, throttle beyond default limits for this URL const res = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, { cmd: `request.${method}`, url, session: sessionId, maxTimeout: options.timeout, proxy: useProxy(url) ? { url: `${config.proxy.host}:${config.proxy.port}`, } : null, }, { encodeJSON: true, })); if (!res.statusCode === 200 || res.body?.status !== 'ok') { if (/session closed/i.test(res.body?.message) && attempts < 3) { await destroyBypassSession(sessionId); return bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts + 1); } throw new Error(`CloudFlare bypass failed for ${url} (${res.statusCode}): ${res.body?.message}`); } const resBody = extractJson(res.body.solution); return { body: resBody, statusCode: res.body.solution.status, headers: res.body.solution.headers, }; } async function request(method = 'get', url, body, requestOptions = {}, limiter) { const http = requestOptions.session || bhttp; const options = { ...requestOptions, session: null, }; const withProxy = useProxy(url); const withCloudflareBypass = useCloudflareBypass(url, options); if (withProxy) { options.agent = proxyAgent; } logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}${withCloudflareBypass ? ' bypass' : ''}) ${url}`); if (withCloudflareBypass) { return bypassCloudflareRequest(url, method, body, withCloudflareBypass, options); } const res = await (body ? http[method](url, body, options) : http[method](url, options)); return res; } async function finalizeResult(res, options) { if (options.destination) { // res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`)); await pipeline(res, ...(options.transforms || []), options.destination); } if (Buffer.isBuffer(res.body)) { const html = res.body.toString(); const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; const pathname = new URL(res.request.url).pathname.replace(/\//g, '_'); // allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper if (window && /fetchScene|fetchMovie/.test(new Error().stack)) { windows.set(pathname, window); } if (argv.saveHtml) { await fs.writeFile(`./html/${pathname}.html`, html); } return { ...res, body: html, html, status: res.statusCode, headers: res.headers, document: window?.document || null, window, ok: res.statusCode >= 200 && res.statusCode <= 299, }; } return { ...res, body: res.body, status: res.statusCode, headers: res.headers, ok: res.statusCode >= 200 && res.statusCode <= 299, }; } function getTimeout(options, url) { return new Promise((resolve, reject, onCancel) => { const timeout = setTimeout(() => { logger.debug(`Canceled timed out request to ${url}`); reject(new Error(`URL ${url} timed out`)); }, (options?.timeout || defaultOptions.timeout) + 10000); onCancel(() => { clearTimeout(timeout); }); }); } async function scheduleRequest(method = 'get', url, body, requestOptions = {}) { if (typeof url !== 'string') { console.trace(`Bad URL: ${JSON.stringify(url)}`); } const options = { ...defaultOptions, ...requestOptions, headers: { ...(requestOptions.includeDefaultHeaders === false ? {} : defaultOptions.headers), ...requestOptions.headers, }, responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || defaultOptions.timeout, stream: !!requestOptions.destination, }; const limiter = getLimiter(options, url); const timeout = getTimeout(options, url); const result = await limiter.schedule(async () => Promise.race([request(method, url, body, options, limiter), timeout])); timeout.cancel(); const curatedResult = await finalizeResult(result, options); logger.silly(`Response ${curatedResult.status} for ${method.toUpperCase()} ${url}`); return curatedResult; } async function get(url, options) { return scheduleRequest('get', url, null, options); } async function post(url, body, options) { return scheduleRequest('post', url, body, options); } async function put(url, body, options) { return scheduleRequest('put', url, body, options); } async function patch(url, body, options) { return scheduleRequest('patch', url, body, options); } async function del(url, options) { return scheduleRequest('delete', url, null, options); } async function head(url, options) { return scheduleRequest('head', url, null, options); } function getSession(options) { return bhttp.session({ ...defaultOptions, ...options }); } function getCookieJar(store, options) { return new toughCookie.CookieJar(store, { looseMode: true, ...options, }); } module.exports = { toughCookie, get, head, post, delete: del, put, patch, session: getSession, cookieJar: getCookieJar, getSession, getCookieJar, destroyBypassSessions, };