From e29cbc9fea3dedbe6efdac6ccbd99ae5d2b06bab Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 1 Dec 2021 17:26:13 +0100 Subject: [PATCH] Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5. --- .gitignore | 1 + src/deep.js | 12 +++++++++++- src/scrapers/xempire.js | 1 - src/utils/http-windows.js | 3 +++ src/utils/http.js | 12 ++++++++++++ src/utils/jsdom-perf.js | 32 ++++++++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 2 deletions(-) create mode 100644 src/utils/http-windows.js create mode 100644 src/utils/jsdom-perf.js diff --git a/.gitignore b/.gitignore index 74d116246..d87f39073 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ node_modules/ dist/ log/ media/ +html/ public/js/* public/css/* config/* diff --git a/src/deep.js b/src/deep.js index af3e43836..953cee3de 100644 --- a/src/deep.js +++ b/src/deep.js @@ -9,6 +9,7 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); const logger = require('./logger')(__filename); const qu = require('./utils/qu'); const getRecursiveParameters = require('./utils/get-recursive-parameters'); +const windows = require('./utils/http-windows'); function toBaseReleases(baseReleasesOrUrls, entity = null) { if (!baseReleasesOrUrls) { @@ -116,10 +117,19 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { parameters: getRecursiveParameters(entity), }; + logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`); + const rawScrapedRelease = type === 'scene' ? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) : await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); + const pathname = new URL(baseRelease.url).pathname.replace(/\//g, '_'); + + windows.get(pathname)?.close(); + windows.delete(pathname); + + logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`); + const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease; if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { @@ -186,7 +196,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) { return Promise.map( baseReleases, async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type), - { concurrency: 10 }, + { concurrency: 5 }, ); } diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js index 45ef5c3c7..e74cb576c 100644 --- a/src/scrapers/xempire.js +++ b/src/scrapers/xempire.js @@ -5,7 +5,6 @@ const qu = require('../utils/qu'); async function fetchScene(url, site, baseRelease, options) { const res = await qu.get(url); - const release = await scrapeScene(res.item, url, site, baseRelease, null, options); const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available diff --git a/src/utils/http-windows.js b/src/utils/http-windows.js new file mode 100644 index 000000000..4fd928aa7 --- /dev/null +++ b/src/utils/http-windows.js @@ -0,0 +1,3 @@ +'use strict'; + +module.exports = new Map(); diff --git a/src/utils/http.js b/src/utils/http.js index 4010fd6b8..17ef74ec4 100644 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -3,12 +3,15 @@ const config = require('config'); const Promise = require('bluebird'); const bhttp = require('bhttp'); +const fs = require('fs').promises; const util = require('util'); const stream = require('stream'); const tunnel = require('tunnel'); const Bottleneck = require('bottleneck'); const { JSDOM, toughCookie } = require('jsdom'); +const windows = require('./http-windows'); + const logger = require('../logger')(__filename); const virtualConsole = require('./virtual-console')(__filename); const argv = require('../argv'); @@ -114,6 +117,15 @@ async function finalizeResult(res, options) { if (Buffer.isBuffer(res.body)) { const html = res.body.toString(); const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; + const pathname = new URL(res.request.url).pathname.replace(/\//g, '_'); + + if (window) { + windows.set(pathname, window); + } + + if (argv.saveHtml) { + await fs.writeFile(`./html/${pathname}.html`, html); + } return { ...res, diff --git a/src/utils/jsdom-perf.js b/src/utils/jsdom-perf.js new file mode 100644 index 000000000..74336ff38 --- /dev/null +++ b/src/utils/jsdom-perf.js @@ -0,0 +1,32 @@ +'use strict'; + +const fs = require('fs').promises; +const Promise = require('bluebird'); +const { JSDOM } = require('jsdom'); + +async function init() { + let peak = 0; + const files = await fs.readdir('./html'); + + await Promise.map(Array.from({ length: 10 }).map(() => files).flat(), async (filename) => { + const html = await fs.readFile(`./html/${filename}`, 'utf8'); + const dom = new JSDOM(html); + + dom.window.close(); + + const usage = process.memoryUsage.rss() / 1000000; + peak = Math.max(usage, peak); + + console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`); + + await Promise.delay(100); + }, { + concurrency: 10, + }); + + await Promise.delay(2000); + + console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`); +} + +init();