// traxxx/src/utils/http.js
'use strict';
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom');
const puppeteer = require('puppeteer');
const windows = require('./http-windows');
const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
const pipeline = util.promisify(stream.pipeline);
const limiters = {
bypass: new Bottleneck({
minTime: 1000,
maxConcurrent: 1,
timeout: 60000,
}),
};
const bypassSessions = new Map();
let browser = null;
const browserSessions = new Map();
Promise.config({
cancellation: true,
});
const defaultOptions = {
timeout: argv.requestTimeout,
encodeJSON: true,
parse: false,
headers: {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36',
},
};
const proxyAgent = tunnel.httpsOverHttp({
proxy: {
host: config.proxy.host,
port: config.proxy.port,
},
});
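// route a request through the tunnel proxy only when proxying is enabled and the URL's hostname is listed in config.proxy.hostnames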
function useProxy(url) {
if (!config.proxy.enable) {
return false;
}
const { hostname } = new URL(url);
return config.proxy.hostnames.includes(hostname);
}
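// decide whether a URL should be fetched through a real browser, either because the scraper asked for it via options.bypassBrowser or because the hostname is listed in config.bypass.browser.hostnames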
function useBrowserBypass(url, options) {
if (!config.bypass.browser.enable) {
return null;
}
const { hostname } = new URL(url);
if (options.bypassBrowser === 'shared') {
return true;
}
if (config.bypass.browser.hostnames.includes(hostname)) {
return true;
}
return false;
}
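// decide whether a URL should go through the CloudFlare bypass proxy: 'shared' for a session shared across hostnames, the hostname itself for an independent session, or null when no bypass applies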
function useCloudflareBypass(url, options) {
if (!config.bypass.cloudflare.enable) {
return null;
}
const { hostname } = new URL(url);
if (options.bypassCloudflare === 'shared') {
return 'shared';
}
if (options.bypassCloudflare === 'independent') {
return hostname;
}
if (config.bypass.cloudflare.sharedHostnames.includes(hostname)) {
return 'shared';
}
if (config.bypass.cloudflare.independentHostnames.includes(hostname)) {
return hostname;
}
return null;
}
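// resolve a rate limit setting (interval or concurrency) by precedence: command line argument, per-request option, per-hostname config, global default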
function getLimiterValue(prop, options, hostname) {
if (argv[prop] !== undefined) {
return argv[prop];
}
if (options[prop] !== undefined) {
return options[prop];
}
if (config.limits[hostname]?.enable !== false && config.limits[hostname]?.[prop] !== undefined) {
return config.limits[hostname][prop];
}
return config.limits.default[prop];
}
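// get or lazily create the Bottleneck limiter for this URL's interval/concurrency combination, so hosts with identical limits share a queue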
function getLimiter(options = {}, url) {
const { hostname } = new URL(url);
const interval = getLimiterValue('interval', options, hostname);
const concurrency = getLimiterValue('concurrency', options, hostname);
if (!limiters[interval]?.[concurrency]) {
limiters[interval] = limiters[interval] || {};
limiters[interval][concurrency] = new Bottleneck({
minTime: interval,
maxConcurrent: concurrency,
timeout: (options.timeout || defaultOptions.timeout) + 10000, // time out 10 seconds after bhttp itself should have
});
}
return limiters[interval][concurrency];
}
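// the bypass proxy returns rendered HTML even for JSON endpoints; when the solved response is application/json, pull the payload out of the <pre> body and parse it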
function extractJson(solution) {
if (solution.headers['content-type']?.includes('application/json')) {
const { document } = new JSDOM(solution.response, { virtualConsole }).window;
const dataString = document.querySelector('body > pre')?.textContent;
if (dataString) {
const data = JSON.parse(dataString);
return data;
}
}
return solution.response;
}
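// reuse one Puppeteer page per hostname, launching the shared browser on first use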
async function getBrowserSession(hostname) {
logger.debug(`Browser sessions: ${Array.from(browserSessions.keys()).join(', ') || 'none'}`);
if (browserSessions.has(hostname)) {
return browserSessions.get(hostname);
}
if (!browser) {
browser = await puppeteer.launch({ headless: false });
}
const page = await browser.newPage();
browserSessions.set(hostname, page);
return page;
}
async function bypassBrowserRequest(url, _options) {
const page = await limiters.bypass.schedule(async () => getBrowserSession(new URL(url).hostname));
const res = await page.goto(url);
const body = await page.content();
// page.goto resolves with the main resource's HTTPResponse (or null); surface its status and headers
logger.debug(`Browser bypass ${res?.status()} for ${url}`);
return {
body,
statusCode: res?.status(),
headers: res?.headers(),
};
}
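// get or create a bypass proxy session for this hostname, optionally routed through the tunnel proxy; session IDs are cached in bypassSessions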
async function getBypassSession(url, hostname) {
if (bypassSessions.has(hostname)) {
return bypassSessions.get(hostname);
}
const sessionRes = await bhttp.post(config.bypass.cloudflare.path, {
cmd: 'sessions.create',
proxy: useProxy(url) ? {
url: `${config.proxy.host}:${config.proxy.port}`,
} : null,
}, {
encodeJSON: true,
});
if (sessionRes.statusCode !== 200 || sessionRes.body.status !== 'ok') {
throw new Error(`Could not acquire CloudFlare bypass session for ${url} (${sessionRes.statusCode}): ${sessionRes.body?.message}`);
}
bypassSessions.set(hostname, sessionRes.body.session);
return sessionRes.body.session;
}
async function destroyBypassSession(sessionId) {
const sessionDestroyRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
cmd: 'sessions.destroy',
session: sessionId,
}, {
encodeJSON: true,
}));
if (sessionDestroyRes.statusCode === 200 && sessionDestroyRes.body.status === 'ok') {
// bypassSessions is keyed by hostname, so drop any entry that stores this session ID
bypassSessions.forEach((storedSessionId, hostname) => {
if (storedSessionId === sessionId) {
bypassSessions.delete(hostname);
}
});
logger.verbose(`Destroyed bypass session ${sessionId}`);
return true;
}
logger.warn(`Failed to destroy bypass session ${sessionId} (${sessionDestroyRes.statusCode}): ${sessionDestroyRes.body?.message}`);
return false;
}
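// list all sessions known to the bypass proxy and destroy them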
async function destroyBypassSessions() {
if (!config.bypass.cloudflare.enable) {
return;
}
const sessionListRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
cmd: 'sessions.list',
}, {
encodeJSON: true,
}));
if (sessionListRes.statusCode !== 200 || sessionListRes.body.status !== 'ok') {
logger.warn(`Failed to list bypass sessions (${sessionListRes.statusCode}): ${sessionListRes.body?.message}`);
return;
}
await Promise.map(sessionListRes.body.sessions, async (sessionId) => destroyBypassSession(sessionId), { concurrency: 5 });
}
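// perform a request through the CloudFlare bypass proxy using a cached session, recreating the session and retrying up to three times if it was closed remotely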
async function bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts = 0) {
const sessionId = await limiters.bypass.schedule(async () => getBypassSession(url, cloudflareBypass));
// the bypass proxy opens a new browser for each request, throttle beyond default limits for this URL
const res = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
cmd: `request.${method}`,
url,
session: sessionId,
maxTimeout: options.timeout,
proxy: useProxy(url) ? {
url: `${config.proxy.host}:${config.proxy.port}`,
} : null,
}, {
encodeJSON: true,
}));
if (res.statusCode !== 200 || res.body?.status !== 'ok') {
if (/session closed/i.test(res.body?.message) && attempts < 3) {
await destroyBypassSession(sessionId);
return bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts + 1);
}
throw new Error(`CloudFlare bypass failed for ${url} (${res.statusCode}): ${res.body?.message}`);
}
const resBody = extractJson(res.body.solution);
return {
body: resBody,
statusCode: res.body.solution.status,
headers: res.body.solution.headers,
};
}
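// dispatch a single request: directly via bhttp (or a bhttp session), through the browser bypass, or through the CloudFlare bypass, attaching the proxy agent when applicable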
async function request(method = 'get', url, body, requestOptions = {}, limiter) {
const http = requestOptions.session || bhttp;
const options = {
...requestOptions,
session: null,
};
const withProxy = useProxy(url);
const withBrowserBypass = useBrowserBypass(url, options);
const withCloudflareBypass = useCloudflareBypass(url, options);
if (withProxy) {
options.agent = proxyAgent;
}
logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}${withBrowserBypass || withCloudflareBypass ? ' bypass' : ''}) ${url}`);
if (withBrowserBypass) {
if (method !== 'get') {
throw new Error('Browser bypass only supports GET');
}
return bypassBrowserRequest(url, options);
}
if (withCloudflareBypass) {
return bypassCloudflareRequest(url, method, body, withCloudflareBypass, options);
}
const res = await (body
? http[method](url, body, options)
: http[method](url, options));
return res;
}
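// normalize a response into { body, status, headers, ok, ... }, streaming to options.destination when set, parsing HTML with JSDOM when options.parse is set, and saving the raw HTML to disk when argv.saveHtml is set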
async function finalizeResult(res, options) {
if (options.destination) {
// res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
await pipeline(res, ...(options.transforms || []), options.destination);
}
if (Buffer.isBuffer(res.body)) {
const html = res.body.toString();
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
// allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper
if (window && /fetchScene|fetchMovie/.test(new Error().stack)) {
windows.set(pathname, window);
}
if (argv.saveHtml) {
await fs.writeFile(`./html/${pathname}.html`, html);
}
return {
...res,
body: html,
html,
status: res.statusCode,
headers: res.headers,
document: window?.document || null,
window,
ok: res.statusCode >= 200 && res.statusCode <= 299,
};
}
return {
...res,
body: res.body,
status: res.statusCode,
headers: res.headers,
ok: res.statusCode >= 200 && res.statusCode <= 299,
};
}
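// cancellable timer that rejects shortly after the request's own timeout, as a safety net in case the underlying request never settles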
function getTimeout(options, url) {
return new Promise((resolve, reject, onCancel) => {
const timeout = setTimeout(() => {
logger.debug(`Canceled timed out request to ${url}`);
reject(new Error(`URL ${url} timed out`));
}, (options?.timeout || defaultOptions.timeout) + 10000);
onCancel(() => {
clearTimeout(timeout);
});
});
}
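// entry point for the exported verbs: merge options with defaults, pick the appropriate limiter, and race the request against the safety timeout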
async function scheduleRequest(method = 'get', url, body, requestOptions = {}) {
if (typeof url !== 'string') {
console.trace(`Bad URL: ${JSON.stringify(url)}`);
}
const options = {
...defaultOptions,
...requestOptions,
headers: {
...(requestOptions.includeDefaultHeaders === false ? {} : defaultOptions.headers),
...requestOptions.headers,
},
responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || defaultOptions.timeout,
stream: !!requestOptions.destination,
};
const limiter = getLimiter(options, url);
const timeout = getTimeout(options, url);
const result = await limiter.schedule(async () => Promise.race([request(method, url, body, options, limiter), timeout]));
timeout.cancel();
const curatedResult = await finalizeResult(result, options);
logger.silly(`Response ${curatedResult.status} for ${method.toUpperCase()} ${url}`);
return curatedResult;
}
async function get(url, options) {
return scheduleRequest('get', url, null, options);
}
async function post(url, body, options) {
return scheduleRequest('post', url, body, options);
}
async function put(url, body, options) {
return scheduleRequest('put', url, body, options);
}
async function patch(url, body, options) {
return scheduleRequest('patch', url, body, options);
}
async function del(url, options) {
return scheduleRequest('delete', url, null, options);
}
async function head(url, options) {
return scheduleRequest('head', url, null, options);
}
function getSession(options) {
return bhttp.session({ ...defaultOptions, ...options });
}
function getCookieJar(store, options) {
return new toughCookie.CookieJar(store, {
looseMode: true,
...options,
});
}
module.exports = {
toughCookie,
get,
head,
post,
delete: del,
put,
patch,
session: getSession,
cookieJar: getCookieJar,
getBrowserSession,
getBypassSession,
getSession,
getCookieJar,
destroyBypassSessions,
};