// forked from DebaucheryLibrarian/traxxx
'use strict';
|
|
|
|
const config = require('config');
|
|
const Promise = require('bluebird');
|
|
const bhttp = require('bhttp');
|
|
const fs = require('fs').promises;
|
|
const util = require('util');
|
|
const stream = require('stream');
|
|
const tunnel = require('tunnel');
|
|
const Bottleneck = require('bottleneck');
|
|
const { JSDOM, toughCookie } = require('jsdom');
|
|
const puppeteer = require('puppeteer-extra');
|
|
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
|
|
|
const windows = require('./http-windows');
|
|
|
|
const logger = require('../logger')(__filename);
|
|
const virtualConsole = require('./virtual-console')(__filename);
|
|
const argv = require('../argv');
|
|
|
|
// Promisified stream.pipeline, used to await streaming downloads.
const pipeline = util.promisify(stream.pipeline);

// Rate limiters, keyed dynamically as limiters[interval][concurrency] by
// getLimiter(); the dedicated `bypass` limiter serializes all traffic that
// goes through the bypass browser/proxy.
const limiters = {
	bypass: new Bottleneck({
		minTime: 1000,
		maxConcurrent: 1,
		timeout: 60000,
	}),
};

// CloudFlare bypass session IDs, keyed by hostname (or 'shared').
const bypassSessions = new Map();

// Lazily-initialized shared puppeteer browser (see getBrowserSession).
let browser = null;

// Enable bluebird cancellation so pending request timeouts can be cancelled.
Promise.config({
	cancellation: true,
});

puppeteer.use(StealthPlugin());

// Defaults merged into every request's options in scheduleRequest().
const defaultOptions = {
	timeout: argv.requestTimeout,
	encodeJSON: true,
	parse: false,
	headers: {
		'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36',
	},
};

// HTTPS-over-HTTP tunneling agent, attached to requests whose hostname
// matches useProxy().
const proxyAgent = tunnel.httpsOverHttp({
	proxy: {
		host: config.proxy.host,
		port: config.proxy.port,
	},
});
|
|
|
|
// Whether the configured proxy should be used for this URL: only when
// proxying is enabled and the URL's hostname is whitelisted in config.
function useProxy(url) {
	if (!config.proxy.enable) {
		return false;
	}

	const targetHost = new URL(url).hostname;

	return config.proxy.hostnames.includes(targetHost);
}
|
|
|
|
/**
 * Whether the puppeteer browser bypass should be used for this URL.
 *
 * @param {string} url - Target URL.
 * @param {Object} options - Request options; `bypassBrowser: 'shared'` forces the bypass.
 * @returns {boolean} True when the bypass browser should handle the request.
 */
function useBrowserBypass(url, options) {
	// fixed: previously returned null here but true/false below; callers only
	// test truthiness, so a consistent boolean is backward-compatible
	if (!config.bypass.browser.enable) {
		return false;
	}

	// Explicit per-request override wins over configured hostnames.
	if (options.bypassBrowser === 'shared') {
		return true;
	}

	const { hostname } = new URL(url);

	return config.bypass.browser.hostnames.includes(hostname);
}
|
|
|
|
/**
 * Whether the CloudFlare bypass proxy should be used for this URL, and with
 * which session scope.
 *
 * @param {string} url - Target URL.
 * @param {Object} options - Request options; `bypassCloudflare` may force 'shared' or 'independent'.
 * @returns {?string} 'shared' for the shared session, the hostname for an
 *   independent session, or null when no bypass applies.
 */
function useCloudflareBypass(url, options) {
	if (!config.bypass.cloudflare.enable) {
		return null;
	}

	const { hostname } = new URL(url);

	// Explicit per-request override takes precedence over configuration.
	switch (options.bypassCloudflare) {
		case 'shared':
			return 'shared';
		case 'independent':
			return hostname;
		default:
			break;
	}

	if (config.bypass.cloudflare.sharedHostnames.includes(hostname)) {
		return 'shared';
	}

	return config.bypass.cloudflare.independentHostnames.includes(hostname)
		? hostname
		: null;
}
|
|
|
|
/**
 * Resolve a limiter setting (e.g. 'interval', 'concurrency') with precedence:
 * CLI argument > per-request option > per-host config > global default.
 *
 * @param {string} prop - Setting name.
 * @param {Object} options - Per-request options.
 * @param {string} hostname - Hostname used for per-host config lookup.
 * @returns {*} The resolved value.
 */
function getLimiterValue(prop, options, hostname) {
	const fromArgv = argv[prop];

	if (fromArgv !== undefined) {
		return fromArgv;
	}

	const fromOptions = options[prop];

	if (fromOptions !== undefined) {
		return fromOptions;
	}

	const hostLimits = config.limits[hostname];

	// Per-host limits apply unless explicitly disabled for that host.
	if (hostLimits?.enable !== false && hostLimits?.[prop] !== undefined) {
		return hostLimits[prop];
	}

	return config.limits.default[prop];
}
|
|
|
|
/**
 * Get (or lazily create) the Bottleneck limiter for this URL's resolved
 * interval/concurrency pair. Limiters are cached in limiters[interval][concurrency].
 *
 * @param {Object} [options] - Per-request options.
 * @param {string} url - Target URL (hostname drives per-host limits).
 * @returns {Bottleneck} The shared limiter instance.
 */
function getLimiter(options = {}, url) {
	const { hostname } = new URL(url);

	const interval = getLimiterValue('interval', options, hostname);
	const concurrency = getLimiterValue('concurrency', options, hostname);

	if (!limiters[interval]) {
		limiters[interval] = {};
	}

	if (!limiters[interval][concurrency]) {
		limiters[interval][concurrency] = new Bottleneck({
			minTime: interval,
			maxConcurrent: concurrency,
			timeout: (options.timeout || defaultOptions.timeout) + 10000, // timeout 10 seconds after bhttp should
		});
	}

	return limiters[interval][concurrency];
}
|
|
|
|
/**
 * When a bypass response is JSON rendered by the browser, pull the raw JSON
 * text out of the <body><pre> wrapper and parse it; otherwise return the body
 * unchanged.
 *
 * @param {string} body - HTML (or passthrough) response body.
 * @param {Object} headers - Response headers.
 * @returns {*} Parsed JSON data, or the original body.
 * @throws {SyntaxError} When the extracted text is not valid JSON.
 */
function extractJson(body, headers) {
	// fixed: some responses carry no content-type header at all; the previous
	// unguarded .includes() call threw a TypeError in that case
	if (headers['content-type']?.includes('application/json')) {
		const { document } = new JSDOM(body, { virtualConsole }).window;
		const dataString = document.querySelector('body > pre')?.textContent;

		if (dataString) {
			return JSON.parse(dataString);
		}
	}

	return body;
}
|
|
|
|
/**
 * Open a new tab in the shared puppeteer browser, launching the browser on
 * first use. Scheduled on the bypass limiter so only one launch can happen.
 *
 * @param {string} [identifier] - Label used in log output (e.g. hostname).
 * @param {Object} [options] - Request options; options.bypass.headless overrides headless mode.
 * @returns {Promise<{browser: Object, tab: Object}>} The shared browser and a fresh tab.
 */
async function getBrowserSession(identifier, options = {}) {
	return limiters.bypass.schedule(async () => {
		if (browser === null) {
			// Default to the new headless mode unless explicitly overridden.
			const headless = options.bypass?.headless === undefined ? 'new' : options.bypass.headless;

			browser = await puppeteer.launch({ headless });

			logger.info('Initialized puppeteer browser');
		}

		const tab = await browser.newPage();

		logger.verbose(`Opened puppeteer tab${identifier ? ` for ${identifier}` : ''}`);

		return { browser, tab };
	});
}
|
|
|
|
/**
 * Fetch a URL through the puppeteer bypass browser (GET only) and return a
 * bhttp-like { body, statusCode, headers } result.
 *
 * @param {string} url - Target URL.
 * @param {Object} options - Request options; options.bypass may provide `delay` (ms) and `evaluate` (page function).
 * @returns {Promise<{body: *, statusCode: number, headers: Object}>}
 * @throws {Error} When the navigation does not return HTTP 200.
 */
async function bypassBrowserRequest(url, options) {
	const { tab } = await getBrowserSession(new URL(url).hostname, options);

	try {
		const res = await tab.goto(url);

		if (options.bypass?.delay) {
			await Promise.delay(options.bypass.delay);
		}

		if (typeof options.bypass?.evaluate === 'function') {
			await tab.evaluate(options.bypass.evaluate, options.bypass);
		}

		const rawBody = await tab.content();

		const headers = res.headers();
		const body = extractJson(rawBody, headers);
		const statusCode = res.status();

		// fixed: was `!statusCode === 200`, which compares a boolean to 200 and
		// therefore never threw, silently returning error pages
		if (statusCode !== 200) {
			throw new Error(`Puppeteer bypass failed for ${url} (${statusCode}): ${body?.message}`);
		}

		return {
			body,
			statusCode,
			headers,
		};
	} finally {
		// fixed: tabs were never closed, accumulating pages in the shared browser
		await tab.close().catch(() => {});
	}
}
|
|
|
|
/**
 * Get (or create and cache) a CloudFlare bypass proxy session for a hostname.
 *
 * @param {string} url - Target URL (decides whether the proxy is used).
 * @param {string} hostname - Cache key ('shared' or a hostname).
 * @returns {Promise<string>} The bypass session ID.
 * @throws {Error} When the bypass service refuses to create a session.
 */
async function getBypassSession(url, hostname) {
	// Reuse an existing session for this hostname when available.
	if (bypassSessions.has(hostname)) {
		return bypassSessions.get(hostname);
	}

	const payload = {
		cmd: 'sessions.create',
		proxy: useProxy(url)
			? { url: `${config.proxy.host}:${config.proxy.port}` }
			: null,
	};

	const sessionRes = await bhttp.post(config.bypass.cloudflare.path, payload, { encodeJSON: true });

	if (sessionRes.statusCode !== 200 || sessionRes.body.status !== 'ok') {
		throw new Error(`Could not acquire CloudFlare bypass session for ${url} (${sessionRes.statusCode}): ${sessionRes.body?.message}`);
	}

	bypassSessions.set(hostname, sessionRes.body.session);

	return sessionRes.body.session;
}
|
|
|
|
/**
 * Ask the bypass service to destroy one session and drop it from the local
 * cache on success.
 *
 * @param {string} sessionId - Bypass session ID to destroy.
 * @returns {Promise<boolean>} True when the service confirmed destruction.
 */
async function destroyBypassSession(sessionId) {
	const sessionDestroyRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
		cmd: 'sessions.destroy',
		session: sessionId,
	}, {
		encodeJSON: true,
	}));

	if (sessionDestroyRes.statusCode === 200 && sessionDestroyRes.body.status === 'ok') {
		// fixed: bypassSessions is keyed by hostname with session IDs as values,
		// so delete(sessionId) never matched; remove matching entries by value
		for (const [hostname, storedSessionId] of bypassSessions.entries()) {
			if (storedSessionId === sessionId) {
				bypassSessions.delete(hostname);
			}
		}

		logger.verbose(`Destroyed bypass session ${sessionId}`);

		return true;
	}

	logger.warn(`Failed to destroy bypass session ${sessionId} (${sessionDestroyRes.statusCode}): ${sessionDestroyRes.body?.message}`);

	return false;
}
|
|
|
|
/**
 * List and destroy all sessions held by the CloudFlare bypass service.
 * No-op when the bypass is disabled.
 *
 * @returns {Promise<void>}
 */
async function destroyBypassSessions() {
	// fixed: config key is `enable` everywhere else in this file, not `enabled`;
	// the typo made this function always bail out
	if (!config.bypass.cloudflare.enable) {
		return;
	}

	const sessionListRes = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
		cmd: 'sessions.list',
	}, {
		encodeJSON: true,
	}));

	// fixed: was `&&`, which only warned when BOTH checks failed; also return
	// early, since body.sessions is not usable on failure
	if (sessionListRes.statusCode !== 200 || sessionListRes.body.status !== 'ok') {
		logger.warn(`Failed to remove bypass sessions (${sessionListRes.statusCode}): ${sessionListRes.body?.message}`);

		return;
	}

	await Promise.map(sessionListRes.body.sessions, async (sessionId) => destroyBypassSession(sessionId), { concurrency: 5 });
}
|
|
|
|
// Close the shared puppeteer browser, if one was ever launched.
async function destroyBrowserSessions() {
	if (browser) {
		await browser.close();
	}
}
|
|
|
|
/**
 * Perform a request through the CloudFlare bypass service, retrying with a
 * fresh session (up to 3 times) when the service reports the session closed.
 *
 * @param {string} url - Target URL.
 * @param {string} method - HTTP verb (lowercase, e.g. 'get').
 * @param {*} body - Request body (unused by the bypass service itself).
 * @param {string} cloudflareBypass - Session key ('shared' or a hostname).
 * @param {Object} options - Request options (timeout is forwarded as maxTimeout).
 * @param {number} [attempts] - Internal retry counter.
 * @returns {Promise<{body: *, statusCode: number, headers: Object}>}
 * @throws {Error} When the bypass service fails and retries are exhausted.
 */
async function bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts = 0) {
	const sessionId = await limiters.bypass.schedule(async () => getBypassSession(url, cloudflareBypass));

	// the bypass proxy opens a new browser for each request, throttle beyond default limits for this URL
	const res = await limiters.bypass.schedule(async () => bhttp.post(config.bypass.cloudflare.path, {
		cmd: `request.${method}`,
		url,
		session: sessionId,
		maxTimeout: options.timeout,
		proxy: useProxy(url) ? {
			url: `${config.proxy.host}:${config.proxy.port}`,
		} : null,
	}, {
		encodeJSON: true,
	}));

	// fixed: was `!res.statusCode === 200`, which compares a boolean to 200 and
	// made this branch depend solely on body.status
	if (res.statusCode !== 200 || res.body?.status !== 'ok') {
		// The service lost this session; discard it and retry with a fresh one.
		if (/session closed/i.test(res.body?.message) && attempts < 3) {
			await destroyBypassSession(sessionId);

			return bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts + 1);
		}

		throw new Error(`CloudFlare bypass failed for ${url} (${res.statusCode}): ${res.body?.message}`);
	}

	const resBody = extractJson(res.body.solution.response, res.body.solution.headers);

	return {
		body: resBody,
		statusCode: res.body.solution.status,
		headers: res.body.solution.headers,
	};
}
|
|
|
|
/**
 * Dispatch a single request, routing it through the browser bypass, the
 * CloudFlare bypass, or plain bhttp (optionally via the proxy agent).
 *
 * @param {string} [method] - HTTP verb, default 'get'.
 * @param {string} url - Target URL.
 * @param {*} body - Request body, or falsy for body-less requests.
 * @param {Object} [requestOptions] - Options; may carry a bhttp `session`.
 * @param {Bottleneck} limiter - Limiter this request was scheduled on (for logging).
 * @returns {Promise<Object>} Raw response from bhttp or a bypass helper.
 */
async function request(method = 'get', url, body, requestOptions = {}, limiter) {
	// Use the caller-provided bhttp session when present, otherwise plain bhttp.
	const http = requestOptions.session || bhttp;

	// Strip the session from the options forwarded downstream.
	const options = {
		...requestOptions,
		session: null,
	};

	const withProxy = useProxy(url);
	const withBrowserBypass = useBrowserBypass(url, options);
	const withCloudflareBypass = useCloudflareBypass(url, options);

	if (withProxy) {
		options.agent = proxyAgent;
	}

	logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}${withBrowserBypass || withCloudflareBypass ? ' bypass' : ''}) ${url}`);

	if (withBrowserBypass) {
		if (method !== 'get') {
			throw new Error('Browser bypass only supports GET');
		}

		return bypassBrowserRequest(url, options);
	}

	if (withCloudflareBypass) {
		return bypassCloudflareRequest(url, method, body, withCloudflareBypass, options);
	}

	if (body) {
		return http[method](url, body, options);
	}

	return http[method](url, options);
}
|
|
|
|
// Normalize a raw bhttp/bypass response into the shape scrapers expect,
// optionally streaming it to a destination and/or parsing HTML into a JSDOM
// window. Buffer bodies are decoded to a string; other bodies pass through.
async function finalizeResult(res, options) {
	if (options.destination) {
		// res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));

		// Stream the response through any transforms into the destination.
		await pipeline(res, ...(options.transforms || []), options.destination);
	}

	if (Buffer.isBuffer(res.body)) {
		const html = res.body.toString();
		const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
		const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');

		// allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper
		// (detected by inspecting the current call stack for the scraper entry points)
		if (window && /fetchScene|fetchMovie/.test(new Error().stack)) {
			windows.set(pathname, window);
		}

		// Optionally persist the raw HTML for debugging (--save-html).
		if (argv.saveHtml) {
			await fs.writeFile(`./html/${pathname}.html`, html);
		}

		return {
			...res,
			body: html,
			html,
			status: res.statusCode,
			headers: res.headers,
			document: window?.document || null,
			window,
			ok: res.statusCode >= 200 && res.statusCode <= 299,
		};
	}

	// Non-buffer body (e.g. stream or bypass result): pass through unchanged.
	return {
		...res,
		body: res.body,
		status: res.statusCode,
		headers: res.headers,
		ok: res.statusCode >= 200 && res.statusCode <= 299,
	};
}
|
|
|
|
/**
 * Build a cancellable (bluebird) promise that rejects once the request
 * deadline plus a 10 second grace period has passed. Cancelling it clears
 * the underlying timer.
 *
 * @param {Object} [options] - Request options (options.timeout in ms).
 * @param {string} url - URL, for the log and error messages.
 * @returns {Promise<never>} A promise that only ever rejects or is cancelled.
 */
function getTimeout(options, url) {
	return new Promise((resolve, reject, onCancel) => {
		const delay = (options?.timeout || defaultOptions.timeout) + 10000;

		const timer = setTimeout(() => {
			logger.debug(`Canceled timed out request to ${url}`);

			reject(new Error(`URL ${url} timed out`));
		}, delay);

		onCancel(() => clearTimeout(timer));
	});
}
|
|
|
|
/**
 * Schedule a request on the appropriate limiter, race it against a timeout,
 * and normalize the result.
 *
 * @param {string} [method] - HTTP verb, default 'get'.
 * @param {string} url - Target URL.
 * @param {*} body - Request body, or null.
 * @param {Object} [requestOptions] - Per-request options merged over defaults.
 * @returns {Promise<Object>} Normalized response (see finalizeResult).
 * @throws {Error} On request failure or timeout.
 */
async function scheduleRequest(method = 'get', url, body, requestOptions = {}) {
	if (typeof url !== 'string') {
		console.trace(`Bad URL: ${JSON.stringify(url)}`);
	}

	const options = {
		...defaultOptions,
		...requestOptions,
		headers: {
			...(requestOptions.includeDefaultHeaders === false ? {} : defaultOptions.headers),
			...requestOptions.headers,
		},
		responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || defaultOptions.timeout,
		stream: !!requestOptions.destination, // stream when downloading to a destination
	};

	const limiter = getLimiter(options, url);
	const timeout = getTimeout(options, url);

	let result;

	try {
		result = await limiter.schedule(async () => Promise.race([request(method, url, body, options, limiter), timeout]));
	} finally {
		// fixed: previously only cancelled on success, so a rejected request left
		// the timer running (and rejecting unobserved) for the full duration
		timeout.cancel();
	}

	const curatedResult = await finalizeResult(result, options);

	logger.silly(`Response ${curatedResult.status} for ${method.toUpperCase()} ${url}`);

	return curatedResult;
}
|
|
|
|
// Thin convenience wrappers that bind the HTTP verb for scheduleRequest.
const get = async (url, options) => scheduleRequest('get', url, null, options);

const post = async (url, body, options) => scheduleRequest('post', url, body, options);

const put = async (url, body, options) => scheduleRequest('put', url, body, options);

const patch = async (url, body, options) => scheduleRequest('patch', url, body, options);

const del = async (url, options) => scheduleRequest('delete', url, null, options);

const head = async (url, options) => scheduleRequest('head', url, null, options);
|
|
|
|
// Create a bhttp session that inherits this module's default options.
function getSession(options) {
	const sessionOptions = {
		...defaultOptions,
		...options,
	};

	return bhttp.session(sessionOptions);
}
|
|
|
|
// Build a tough-cookie jar in loose mode by default (overridable via options).
function getCookieJar(store, options) {
	const jarOptions = {
		looseMode: true,
		...options,
	};

	return new toughCookie.CookieJar(store, jarOptions);
}
|
|
|
|
module.exports = {
|
|
toughCookie,
|
|
get,
|
|
head,
|
|
post,
|
|
delete: del,
|
|
put,
|
|
patch,
|
|
session: getSession,
|
|
cookieJar: getCookieJar,
|
|
browser,
|
|
getBrowserSession,
|
|
getBypassSession,
|
|
getSession,
|
|
getCookieJar,
|
|
destroyBypassSessions,
|
|
destroyBrowserSessions,
|
|
proxyAgent,
|
|
};
|