Closing JSDOM window after deep scrape in an attempt to save memory. Reduced deep scrape concurrency to 5.
This commit is contained in:
		
							parent
							
								
									08f725a0b6
								
							
						
					
					
						commit
						e29cbc9fea
					
				|  | @ -2,6 +2,7 @@ node_modules/ | ||||||
| dist/ | dist/ | ||||||
| log/ | log/ | ||||||
| media/ | media/ | ||||||
|  | html/ | ||||||
| public/js/* | public/js/* | ||||||
| public/css/* | public/css/* | ||||||
| config/* | config/* | ||||||
|  |  | ||||||
							
								
								
									
										12
									
								
								src/deep.js
								
								
								
								
							
							
						
						
									
										12
									
								
								src/deep.js
								
								
								
								
							|  | @ -9,6 +9,7 @@ const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); | ||||||
| const logger = require('./logger')(__filename); | const logger = require('./logger')(__filename); | ||||||
| const qu = require('./utils/qu'); | const qu = require('./utils/qu'); | ||||||
| const getRecursiveParameters = require('./utils/get-recursive-parameters'); | const getRecursiveParameters = require('./utils/get-recursive-parameters'); | ||||||
|  | const windows = require('./utils/http-windows'); | ||||||
| 
 | 
 | ||||||
| function toBaseReleases(baseReleasesOrUrls, entity = null) { | function toBaseReleases(baseReleasesOrUrls, entity = null) { | ||||||
| 	if (!baseReleasesOrUrls) { | 	if (!baseReleasesOrUrls) { | ||||||
|  | @ -116,10 +117,19 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { | ||||||
| 			parameters: getRecursiveParameters(entity), | 			parameters: getRecursiveParameters(entity), | ||||||
| 		}; | 		}; | ||||||
| 
 | 
 | ||||||
|  | 		logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`); | ||||||
|  | 
 | ||||||
| 		const rawScrapedRelease = type === 'scene' | 		const rawScrapedRelease = type === 'scene' | ||||||
| 			? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) | 			? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options, null) | ||||||
| 			: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); | 			: await layoutScraper.fetchMovie(baseRelease.url, entity, baseRelease, options, null); | ||||||
| 
 | 
 | ||||||
|  | 		const pathname = new URL(baseRelease.url).pathname.replace(/\//g, '_'); | ||||||
|  | 
 | ||||||
|  | 		windows.get(pathname)?.close(); | ||||||
|  | 		windows.delete(pathname); | ||||||
|  | 
 | ||||||
|  | 		logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`); | ||||||
|  | 
 | ||||||
| 		const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease; | 		const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease; | ||||||
| 
 | 
 | ||||||
| 		if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { | 		if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) { | ||||||
|  | @ -186,7 +196,7 @@ async function scrapeReleases(baseReleases, entitiesBySlug, type) { | ||||||
| 	return Promise.map( | 	return Promise.map( | ||||||
| 		baseReleases, | 		baseReleases, | ||||||
| 		async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type), | 		async (baseRelease) => scrapeRelease(baseRelease, entitiesWithBeforeDataBySlug, type), | ||||||
| 		{ concurrency: 10 }, | 		{ concurrency: 5 }, | ||||||
| 	); | 	); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -5,7 +5,6 @@ const qu = require('../utils/qu'); | ||||||
| 
 | 
 | ||||||
| async function fetchScene(url, site, baseRelease, options) { | async function fetchScene(url, site, baseRelease, options) { | ||||||
| 	const res = await qu.get(url); | 	const res = await qu.get(url); | ||||||
| 
 |  | ||||||
| 	const release = await scrapeScene(res.item, url, site, baseRelease, null, options); | 	const release = await scrapeScene(res.item, url, site, baseRelease, null, options); | ||||||
| 
 | 
 | ||||||
| 	const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
 | 	const siteDomain = release.query.el('meta[name="twitter:domain"]', 'content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
 | ||||||
|  |  | ||||||
|  | @ -0,0 +1,3 @@ | ||||||
|  | 'use strict'; | ||||||
|  | 
 | ||||||
|  | module.exports = new Map(); | ||||||
|  | @ -3,12 +3,15 @@ | ||||||
| const config = require('config'); | const config = require('config'); | ||||||
| const Promise = require('bluebird'); | const Promise = require('bluebird'); | ||||||
| const bhttp = require('bhttp'); | const bhttp = require('bhttp'); | ||||||
|  | const fs = require('fs').promises; | ||||||
| const util = require('util'); | const util = require('util'); | ||||||
| const stream = require('stream'); | const stream = require('stream'); | ||||||
| const tunnel = require('tunnel'); | const tunnel = require('tunnel'); | ||||||
| const Bottleneck = require('bottleneck'); | const Bottleneck = require('bottleneck'); | ||||||
| const { JSDOM, toughCookie } = require('jsdom'); | const { JSDOM, toughCookie } = require('jsdom'); | ||||||
| 
 | 
 | ||||||
|  | const windows = require('./http-windows'); | ||||||
|  | 
 | ||||||
| const logger = require('../logger')(__filename); | const logger = require('../logger')(__filename); | ||||||
| const virtualConsole = require('./virtual-console')(__filename); | const virtualConsole = require('./virtual-console')(__filename); | ||||||
| const argv = require('../argv'); | const argv = require('../argv'); | ||||||
|  | @ -114,6 +117,15 @@ async function finalizeResult(res, options) { | ||||||
| 	if (Buffer.isBuffer(res.body)) { | 	if (Buffer.isBuffer(res.body)) { | ||||||
| 		const html = res.body.toString(); | 		const html = res.body.toString(); | ||||||
| 		const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; | 		const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; | ||||||
|  | 		const pathname = new URL(res.request.url).pathname.replace(/\//g, '_'); | ||||||
|  | 
 | ||||||
|  | 		if (window) { | ||||||
|  | 			windows.set(pathname, window); | ||||||
|  | 		} | ||||||
|  | 
 | ||||||
|  | 		if (argv.saveHtml) { | ||||||
|  | 			await fs.writeFile(`./html/${pathname}.html`, html); | ||||||
|  | 		} | ||||||
| 
 | 
 | ||||||
| 		return { | 		return { | ||||||
| 			...res, | 			...res, | ||||||
|  |  | ||||||
|  | @ -0,0 +1,32 @@ | ||||||
|  | 'use strict'; | ||||||
|  | 
 | ||||||
|  | const fs = require('fs').promises; | ||||||
|  | const Promise = require('bluebird'); | ||||||
|  | const { JSDOM } = require('jsdom'); | ||||||
|  | 
 | ||||||
|  | async function init() { | ||||||
|  | 	let peak = 0; | ||||||
|  | 	const files = await fs.readdir('./html'); | ||||||
|  | 
 | ||||||
|  | 	await Promise.map(Array.from({ length: 10 }).map(() => files).flat(), async (filename) => { | ||||||
|  | 		const html = await fs.readFile(`./html/${filename}`, 'utf8'); | ||||||
|  | 		const dom = new JSDOM(html); | ||||||
|  | 
 | ||||||
|  | 		dom.window.close(); | ||||||
|  | 
 | ||||||
|  | 		const usage = process.memoryUsage.rss() / 1000000; | ||||||
|  | 		peak = Math.max(usage, peak); | ||||||
|  | 
 | ||||||
|  | 		console.log(`Memory usage: ${usage.toFixed(2)} MB, peak ${peak.toFixed(2)} MB`); | ||||||
|  | 
 | ||||||
|  | 		await Promise.delay(100); | ||||||
|  | 	}, { | ||||||
|  | 		concurrency: 10, | ||||||
|  | 	}); | ||||||
|  | 
 | ||||||
|  | 	await Promise.delay(2000); | ||||||
|  | 
 | ||||||
|  | 	console.log(`Final memory usage: ${(process.memoryUsage.rss() / 1000000).toFixed(2)} MB, max ${peak.toFixed(2)} MB`); | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | init(); | ||||||
		Loading…
	
		Reference in New Issue