Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5.

This commit is contained in:
DebaucheryLibrarian 2021-12-01 17:26:13 +01:00
parent 08f725a0b6
commit e29cbc9fea
6 changed files with 59 additions and 2 deletions

1
.gitignore vendored
View File

@@ -2,6 +2,7 @@ node_modules/
dist/ dist/
log/ log/
media/ media/
html/
public/js/* public/js/*
public/css/* public/css/*
config/* config/*

View File

@@ -9,6 +9,7 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename); const logger = require('./logger')(__filename);
const qu = require('./utils/qu'); const qu = require('./utils/qu');
const getRecursiveParameters = require('./utils/get-recursive-parameters'); const getRecursiveParameters = require('./utils/get-recursive-parameters');
const windows = require('./utils/http-windows');
function toBaseReleases(baseReleasesOrUrls, entity = null) { function toBaseReleases(baseReleasesOrUrls, entity = null) {
if (!baseReleasesOrUrls) { if (!baseReleasesOrUrls) {
@@ -116,10 +117,19 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
parameters: getRecursiveParameters(entity), parameters: getRecursiveParameters(entity),
}; };
logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
const rawScrapedRelease = type === 'scene' const rawScrapedRelease = type === 'scene'
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
const pathname = new URL(baseRelease.url).pathname.replace(/\//g, '_');
windows.get(pathname)?.close();
windows.delete(pathname);
logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease; const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
@@ -186,7 +196,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
return Promise.map( return Promise.map(
baseReleases, baseReleases,
async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type), async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
{ concurrency: 10 }, { concurrency: 5 },
); );
} }

View File

@@ -5,7 +5,6 @@ const qu = require('../utils/qu');
async function fetchScene(url, site, baseRelease, options) { async function fetchScene(url, site, baseRelease, options) {
const res = await qu.get(url); const res = await qu.get(url);
const release = await scrapeScene(res.item, url, site, baseRelease, null, options); const release = await scrapeScene(res.item, url, site, baseRelease, null, options);
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available

View File

@@ -0,0 +1,3 @@
'use strict';

// Shared registry of open JSDOM windows, keyed by request pathname with
// slashes replaced by underscores (populated in utils/http.js when a
// response body is parsed). Consumers look up and close() a window after
// a deep scrape, then delete the entry, to release JSDOM memory.
module.exports = new Map();

View File

@@ -3,12 +3,15 @@
const config = require('config'); const config = require('config');
const Promise = require('bluebird'); const Promise = require('bluebird');
const bhttp = require('bhttp'); const bhttp = require('bhttp');
const fs = require('fs').promises;
const util = require('util'); const util = require('util');
const stream = require('stream'); const stream = require('stream');
const tunnel = require('tunnel'); const tunnel = require('tunnel');
const Bottleneck = require('bottleneck'); const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom'); const { JSDOM, toughCookie } = require('jsdom');
const windows = require('./http-windows');
const logger = require('../logger')(__filename); const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename); const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv'); const argv = require('../argv');
@@ -114,6 +117,15 @@ async function finalizeResult(res, options) {
if (Buffer.isBuffer(res.body)) { if (Buffer.isBuffer(res.body)) {
const html = res.body.toString(); const html = res.body.toString();
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
if (window) {
windows.set(pathname, window);
}
if (argv.saveHtml) {
await fs.writeFile(`./html/${pathname}.html`, html);
}
return { return {
...res, ...res,

32
src/utils/jsdom-perf.js Normal file
View File

@@ -0,0 +1,32 @@
'use strict';

const fs = require('fs').promises;
const Promise = require('bluebird');
const { JSDOM } = require('jsdom');

/**
 * JSDOM memory benchmark: parses every saved HTML file in ./html ten times
 * over (10 parses in flight at once), closes each window immediately, and
 * logs RSS after each parse so the effect of window.close() on memory
 * growth can be observed.
 *
 * Fix: the file path interpolation was garbled (`$(unknown)`); it must
 * interpolate the directory entry (`${filename}`) or every read targets a
 * nonexistent literal path and the script throws ENOENT.
 */
async function init() {
  let peak = 0;

  const files = await fs.readdir('./html');

  // Repeat the file list 10x to stress repeated parsing of the same documents.
  await Promise.map(Array.from({ length: 10 }).map(() => files).flat(), async (filename) => {
    const html = await fs.readFile(`./html/${filename}`, 'utf8');

    const dom = new JSDOM(html);
    dom.window.close(); // release JSDOM resources right after parsing

    const usage = process.memoryUsage.rss() / 1000000;
    peak = Math.max(usage, peak);

    console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);

    await Promise.delay(100); // pace the loop so readings are distinguishable
  }, {
    concurrency: 10,
  });

  await Promise.delay(2000); // give GC a moment to settle before the final reading

  console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}

init();