Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5.

This commit is contained in:
DebaucheryLibrarian 2021-12-01 17:26:13 +01:00
parent 08f725a0b6
commit e29cbc9fea
6 changed files with 59 additions and 2 deletions

1
.gitignore vendored
View File

@ -2,6 +2,7 @@ node_modules/
dist/
log/
media/
html/
public/js/*
public/css/*
config/*

View File

@ -9,6 +9,7 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename);
const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters');
const windows = require('./utils/http-windows');
function toBaseReleases(baseReleasesOrUrls, entity = null) {
if (!baseReleasesOrUrls) {
@ -116,10 +117,19 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
parameters: getRecursiveParameters(entity),
};
logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
const rawScrapedRelease = type === 'scene'
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
const pathname = new URL(baseRelease.url).pathname.replace(/\//g, '_');
windows.get(pathname)?.close();
windows.delete(pathname);
logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
@ -186,7 +196,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
return Promise.map(
baseReleases,
async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
{ concurrency: 10 },
{ concurrency: 5 },
);
}

View File

@ -5,7 +5,6 @@ const qu = require('../utils/qu');
async function fetchScene(url, site, baseRelease, options) {
const res = await qu.get(url);
const release = await scrapeScene(res.item, url, site, baseRelease, null, options);
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available

View File

@ -0,0 +1,3 @@
'use strict';
// Shared registry of JSDOM windows. The export is a single Map instance, so every
// module that requires this file reads and writes the same collection.
// In this commit the HTTP utility stores each parsed window under the request URL's
// pathname with every '/' replaced by '_', and the deep scrape looks the window up
// under the same key to close() it and delete the entry once scraping has finished.
module.exports = new Map();

View File

@ -3,12 +3,15 @@
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom');
const windows = require('./http-windows');
const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
@ -114,6 +117,15 @@ async function finalizeResult(res, options) {
if (Buffer.isBuffer(res.body)) {
const html = res.body.toString();
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
if (window) {
windows.set(pathname, window);
}
if (argv.saveHtml) {
await fs.writeFile(`./html/${pathname}.html`, html);
}
return {
...res,

32
src/utils/jsdom-perf.js Normal file
View File

@ -0,0 +1,32 @@
'use strict';
const fs = require('fs').promises;
const Promise = require('bluebird');
const { JSDOM } = require('jsdom');
/**
 * Memory benchmark for JSDOM: parses every file found in ./html ten times over,
 * closing each window right after construction, and logs the resident set size
 * (in MB) after every parse along with the highest value seen so far.
 *
 * @returns {Promise<void>} resolves after the final memory report has been logged
 */
async function init() {
  let peak = 0;

  const files = await fs.readdir('./html');
  // Ten back-to-back copies of the directory listing form the workload.
  const workload = Array.from({ length: 10 }, () => files).flat();

  // Parse a single saved page, dispose of its window, then sample memory usage.
  const measure = async (filename) => {
    const html = await fs.readFile(`./html/${filename}`, 'utf8');
    const { window } = new JSDOM(html);

    window.close();

    const usage = process.memoryUsage.rss() / 1000000;

    peak = Math.max(usage, peak);
    console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);

    await Promise.delay(100);
  };

  await Promise.map(workload, measure, { concurrency: 10 });

  // Wait two seconds before taking the final reading.
  await Promise.delay(2000);

  const finalUsage = process.memoryUsage.rss() / 1000000;

  console.log(`Final memory usage: ${finalUsage.toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}
init();