Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5.
This commit is contained in:
parent
08f725a0b6
commit
e29cbc9fea
|
@ -2,6 +2,7 @@ node_modules/
|
||||||
dist/
|
dist/
|
||||||
log/
|
log/
|
||||||
media/
|
media/
|
||||||
|
html/
|
||||||
public/js/*
|
public/js/*
|
||||||
public/css/*
|
public/css/*
|
||||||
config/*
|
config/*
|
||||||
|
|
12
src/deep.js
12
src/deep.js
|
@ -9,6 +9,7 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
||||||
const logger = require('./logger')(__filename);
|
const logger = require('./logger')(__filename);
|
||||||
const qu = require('./utils/qu');
|
const qu = require('./utils/qu');
|
||||||
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
||||||
|
const windows = require('./utils/http-windows');
|
||||||
|
|
||||||
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
||||||
if (!baseReleasesOrUrls) {
|
if (!baseReleasesOrUrls) {
|
||||||
|
@ -116,10 +117,19 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||||
parameters: getRecursiveParameters(entity),
|
parameters: getRecursiveParameters(entity),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
|
||||||
|
|
||||||
const rawScrapedRelease = type === 'scene'
|
const rawScrapedRelease = type === 'scene'
|
||||||
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
|
? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null)
|
||||||
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
|
: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null);
|
||||||
|
|
||||||
|
const pathname = new URL(baseRelease.url).pathname.replace(/\//g, '_');
|
||||||
|
|
||||||
|
windows.get(pathname)?.close();
|
||||||
|
windows.delete(pathname);
|
||||||
|
|
||||||
|
logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);
|
||||||
|
|
||||||
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
|
const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;
|
||||||
|
|
||||||
if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
|
if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
|
||||||
|
@ -186,7 +196,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
||||||
return Promise.map(
|
return Promise.map(
|
||||||
baseReleases,
|
baseReleases,
|
||||||
async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
|
async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type),
|
||||||
{ concurrency: 10 },
|
{ concurrency: 5 },
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,6 @@ const qu = require('../utils/qu');
|
||||||
|
|
||||||
async function fetchScene(url, site, baseRelease, options) {
|
async function fetchScene(url, site, baseRelease, options) {
|
||||||
const res = await qu.get(url);
|
const res = await qu.get(url);
|
||||||
|
|
||||||
const release = await scrapeScene(res.item, url, site, baseRelease, null, options);
|
const release = await scrapeScene(res.item, url, site, baseRelease, null, options);
|
||||||
|
|
||||||
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
|
const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
|
||||||
|
|
|
@ -0,0 +1,3 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
// Process-wide registry of open JSDOM windows, shared by every module that requires this file.
// The HTTP result finalizer stores each parsed window under the response URL's pathname
// (with '/' replaced by '_'); the deep scraper looks the window up under the same key,
// closes it and deletes the entry once a release has been scraped.
// NOTE(review): keys are derived from the pathname only, so two URLs with the same path on
// different hosts or with different query strings would presumably share an entry — confirm.
module.exports = new Map();
|
|
@ -3,12 +3,15 @@
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
const bhttp = require('bhttp');
|
const bhttp = require('bhttp');
|
||||||
|
const fs = require('fs').promises;
|
||||||
const util = require('util');
|
const util = require('util');
|
||||||
const stream = require('stream');
|
const stream = require('stream');
|
||||||
const tunnel = require('tunnel');
|
const tunnel = require('tunnel');
|
||||||
const Bottleneck = require('bottleneck');
|
const Bottleneck = require('bottleneck');
|
||||||
const { JSDOM, toughCookie } = require('jsdom');
|
const { JSDOM, toughCookie } = require('jsdom');
|
||||||
|
|
||||||
|
const windows = require('./http-windows');
|
||||||
|
|
||||||
const logger = require('../logger')(__filename);
|
const logger = require('../logger')(__filename);
|
||||||
const virtualConsole = require('./virtual-console')(__filename);
|
const virtualConsole = require('./virtual-console')(__filename);
|
||||||
const argv = require('../argv');
|
const argv = require('../argv');
|
||||||
|
@ -114,6 +117,15 @@ async function finalizeResult(res, options) {
|
||||||
if (Buffer.isBuffer(res.body)) {
|
if (Buffer.isBuffer(res.body)) {
|
||||||
const html = res.body.toString();
|
const html = res.body.toString();
|
||||||
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
|
const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
|
||||||
|
const pathname = new URL(res.request.url).pathname.replace(/\//g, '_');
|
||||||
|
|
||||||
|
if (window) {
|
||||||
|
windows.set(pathname, window);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (argv.saveHtml) {
|
||||||
|
await fs.writeFile(`./html/${pathname}.html`, html);
|
||||||
|
}
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...res,
|
...res,
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
const fs = require('fs').promises;
|
||||||
|
const Promise = require('bluebird');
|
||||||
|
const { JSDOM } = require('jsdom');
|
||||||
|
|
||||||
|
/**
 * JSDOM memory benchmark.
 *
 * Reads every file in ./html, parses each one with JSDOM ten times over
 * (at most ten parses in flight, via bluebird's Promise.map), closes the
 * window right after construction, and logs the resident set size after
 * every parse together with the highest value seen so far. After the whole
 * workload has finished it waits two seconds and logs the final RSS.
 *
 * @returns {Promise<void>} resolves once the final report has been printed;
 *   rejects if the directory or any file cannot be read.
 */
async function init() {
	const filenames = await fs.readdir('./html');

	// Repeat the full file list ten times so memory growth across parses shows up.
	const workload = Array.from({ length: 10 }, () => filenames).flat();

	let peak = 0;

	const parseAndMeasure = async (filename) => {
		const markup = await fs.readFile(`./html/${filename}`, 'utf8');
		const { window } = new JSDOM(markup);

		// Close immediately; the benchmark measures what remains after closing.
		window.close();

		const usage = process.memoryUsage.rss() / 1000000;

		peak = Math.max(usage, peak);

		console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`);

		// Short pause after each parse before this slot picks up the next file.
		await Promise.delay(100);
	};

	await Promise.map(workload, parseAndMeasure, { concurrency: 10 });

	// Pause before taking the final reading.
	await Promise.delay(2000);

	console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`);
}
|
||||||
|
|
||||||
|
// Start the benchmark as soon as the script is loaded. init() is async, so its
// promise is handled explicitly: a failure (for example an unreadable ./html
// directory) is printed and reflected in the exit code instead of being left
// as a floating rejection.
init().catch((error) => {
	console.error(error);
	process.exitCode = 1;
});
|
Loading…
Reference in New Issue