Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5.

This commit is contained in:
DebaucheryLibrarian
2021-12-01 17:26:13 +01:00
parent 08f725a0b6
commit e29cbc9fea
6 changed files with 59 additions and 2 deletions

View File

@@ -0,0 +1,3 @@
'use strict';
// A single Map instance exported as the module value; Node caches modules, so
// every require() of this file receives the same Map object.
// NOTE(review): this appears to be the './http-windows' registry in which the
// HTTP utility stores JSDOM windows keyed by a sanitized URL pathname,
// presumably so they can be closed after a deep scrape — confirm against callers.
module.exports = new Map();

View File

@@ -3,12 +3,15 @@
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom');
const windows = require('./http-windows');
const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
@@ -114,6 +117,15 @@ async function finalizeResult(res, options) {
if (Buffer.isBuffer(res.body)) {
const html = res.body.toString();
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
if (window) {
windows.set(pathname, window);
}
if (argv.saveHtml) {
await fs.writeFile(`./html/${pathname}.html`, html);
}
return {
...res,

32
src/utils/jsdom-perf.js Normal file
View File

@@ -0,0 +1,32 @@
'use strict';
const fs = require('fs').promises;
const Promise = require('bluebird');
const { JSDOM } = require('jsdom');
/**
 * JSDOM memory benchmark.
 *
 * Reads every file in ./html, parses each one with JSDOM ten times over
 * (at most 10 parses in flight), closes the window right after parsing and
 * logs the process's resident set size (RSS, in units of 1,000,000 bytes)
 * together with the highest value observed so far. After all parses finish
 * it waits two seconds and logs the final RSS next to the recorded peak.
 *
 * @returns {Promise<void>} Resolves once the final usage line has been logged.
 * @throws Rejects if ./html cannot be listed or one of its entries cannot be
 *   read as a UTF-8 file.
 */
async function init() {
	let peak = 0;
	const files = await fs.readdir('./html');

	// Repeat the full file list ten times so each document is parsed repeatedly.
	const workload = Array.from({ length: 10 }, () => files).flat();

	await Promise.map(workload, async (filename) => {
		const html = await fs.readFile(`./html/${filename}`, 'utf8');
		const dom = new JSDOM(html);

		// Close the window immediately; the point of the benchmark is to observe
		// memory usage when windows are closed after use.
		dom.window.close();

		const usage = process.memoryUsage.rss() / 1000000;

		peak = Math.max(usage, peak);
		console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);

		// NOTE(review): presumably a short pause to let memory settle between
		// parses — confirm intent.
		await Promise.delay(100);
	}, {
		concurrency: 10,
	});

	await Promise.delay(2000);

	console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}

// Never leave the promise floating: report the failure and signal it through
// the exit code instead of relying on the runtime's unhandled-rejection policy.
init().catch((error) => {
	console.error('JSDOM memory benchmark failed:', error);
	process.exitCode = 1;
});