Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5.
This commit is contained in:
parent
08f725a0b6
commit
e29cbc9fea
|
@ -2,6 +2,7 @@ node_modules/
|
|||
dist/
|
||||
log/
|
||||
media/
|
||||
html/
|
||||
public/js/*
|
||||
public/css/*
|
||||
config/*
|
||||
|
|
12
src/deep.js
12
src/deep.js
|
@ -9,6 +9,7 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
|||
const logger = require('./logger')(__filename);
|
||||
const qu = require('./utils/qu');
|
||||
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||
const windows = require('./utils/http-windows');
|
||||
|
||||
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
||||
if (!baseReleasesOrUrls) {
|
||||
|
@ -116,10 +117,19 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
|||
parameters: getRecursiveParameters(entity),
|
||||
};
|
||||
|
||||
logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
|
||||
|
||||
const rawScrapedRelease = type === 'scene'
|
||||
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
|
||||
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
|
||||
|
||||
const pathname = new URL(baseRelease.url).pathname.replace(/\//g, '_');
|
||||
|
||||
windows.get(pathname)?.close();
|
||||
windows.delete(pathname);
|
||||
|
||||
logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
|
||||
|
||||
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
|
||||
|
||||
if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
|
||||
|
@ -186,7 +196,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
|||
return Promise.map(
|
||||
baseReleases,
|
||||
async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
|
||||
{ concurrency: 10 },
|
||||
{ concurrency: 5 },
|
||||
);
|
||||
}
|
||||
|
||||
|
|
|
@ -5,7 +5,6 @@ const qu = require('../utils/qu');
|
|||
|
||||
async function fetchScene(url, site, baseRelease, options) {
|
||||
const res = await qu.get(url);
|
||||
|
||||
const release = await scrapeScene(res.item, url, site, baseRelease, null, options);
|
||||
|
||||
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
|
||||
|
|
|
@ -0,0 +1,3 @@
|
|||
'use strict';
|
||||
|
||||
// Module-level Map of JSDOM windows. The HTTP utility stores each parsed window
// under a key derived from the request URL's pathname (slashes replaced by
// underscores); the deep scraper looks the window up by the same key, closes it
// and deletes the entry once a release has been scraped.
module.exports = new Map();
|
|
@ -3,12 +3,15 @@
|
|||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const bhttp = require('bhttp');
|
||||
const fs = require('fs').promises;
|
||||
const util = require('util');
|
||||
const stream = require('stream');
|
||||
const tunnel = require('tunnel');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const { JSDOM, toughCookie } = require('jsdom');
|
||||
|
||||
const windows = require('./http-windows');
|
||||
|
||||
const logger = require('../logger')(__filename);
|
||||
const virtualConsole = require('./virtual-console')(__filename);
|
||||
const argv = require('../argv');
|
||||
|
@ -114,6 +117,15 @@ async function finalizeResult(res, options) {
|
|||
if (Buffer.isBuffer(res.body)) {
|
||||
const html = res.body.toString();
|
||||
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
|
||||
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
|
||||
|
||||
if (window) {
|
||||
windows.set(pathname, window);
|
||||
}
|
||||
|
||||
if (argv.saveHtml) {
|
||||
await fs.writeFile(`./html/${pathname}.html`, html);
|
||||
}
|
||||
|
||||
return {
|
||||
...res,
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
'use strict';
|
||||
|
||||
const fs = require('fs').promises;
|
||||
const Promise = require('bluebird');
|
||||
const { JSDOM } = require('jsdom');
|
||||
|
||||
/**
 * Memory experiment: repeatedly parse the saved HTML pages with JSDOM and
 * close each window right away, logging resident set size along the way.
 *
 * Every file in ./html is processed ten times (the directory listing is
 * repeated ten times and flattened), with at most ten files in flight at
 * once. After each file the current RSS and the running peak are printed in
 * MB (bytes / 1,000,000), followed by a 100 ms pause. Once all files are
 * done, the function waits two seconds and prints the final RSS and peak.
 *
 * @returns {Promise<void>} resolves after the final usage line is logged
 */
async function init() {
	const filenames = await fs.readdir('./html');
	// ten passes over the same listing
	const workload = Array.from({ length: 10 }, () => filenames).flat();

	let peak = 0;

	const parseAndMeasure = async (filename) => {
		const html = await fs.readFile(`./html/${filename}`, 'utf8');
		const { window } = new JSDOM(html);

		// close immediately; the point is to observe memory after the window is closed
		window.close();

		const usage = process.memoryUsage.rss() / 1000000;

		peak = Math.max(peak, usage);

		console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);

		await Promise.delay(100);
	};

	await Promise.map(workload, parseAndMeasure, { concurrency: 10 });

	// pause before taking the final reading
	await Promise.delay(2000);

	const finalUsage = process.memoryUsage.rss() / 1000000;

	console.log(`Final memory usage: ${finalUsage.toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}
|
||||
|
||||
// Start the memory experiment as soon as the script is loaded.
init();
|
Loading…
Reference in New Issue