Chunked the duplicate release check queries to prevent PostgreSQL "stack depth limit exceeded" errors.

2022-01-10 02:17:17 +01:00 · 2022-01-10 02:17:17 +01:00 · 506971b44b
parent 43a0bc8a2c
commit 506971b44b
3 changed files with 33 additions and 22 deletions
--- a/src/scrapers/traxxx.js
+++ b/src/scrapers/traxxx.js
@ -232,7 +232,7 @@ function actors(release) {
 }

 async function fetchLatest(entity, page, options) {
-	return Promise.all(Array.from({ length: 100 }, async (value, index) => {
+	return Promise.all(Array.from({ length: 10000 }, async (value, index) => {
 		const release = {};

 		release.entryId = nanoid();
@ -262,7 +262,7 @@ async function fetchLatest(entity, page, options) {
 			.select('name')
 			.where('priority', '>', 7)
 			.orderByRaw('random()')
-			.limit(faker.random.number({ min: 2, max: 15 }))
+			.limit(faker.datatype.number({ min: 2, max: 15 }))
 			.pluck('name');

 		release.actors = [...actors(release), null]; // include empty actor to ensure proper handling
--- a/src/store-releases.js
+++ b/src/store-releases.js
@ -1,6 +1,7 @@
 'use strict';

 const config = require('config');
+const Promise = require('bluebird');

 const argv = require('./argv');
 const logger = require('./logger')(__filename);
@ -8,6 +9,7 @@ const knex = require('./knex');
 const slugify = require('./utils/slugify');
 const bulkInsert = require('./utils/bulk-insert');
 const resolvePlace = require('./utils/resolve-place');
+const chunk = require('./utils/chunk');
 const { formatDate } = require('./utils/qu');
 const { associateActors, associateDirectors, scrapeActors, toBaseActors } = require('./actors');
 const { associateReleaseTags } = require('./tags');
@ -192,13 +194,16 @@ function filterInternalDuplicateReleases(releases) {

 async function filterDuplicateReleases(releases) {
 	const internalUniqueReleases = filterInternalDuplicateReleases(releases);
+	const internalUniqueReleaseChunks = chunk(internalUniqueReleases);

-	const duplicateReleaseEntries = await knex('releases')
-		.whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map((release) => [release.entryId, release.entity.id]))
-		.orWhereIn(['entry_id', 'entity_id'], internalUniqueReleases
+	const duplicateReleaseEntryChunks = await Promise.map(internalUniqueReleaseChunks, async (internalUniqueReleasesChunk) => knex('releases')
+		.whereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk.map((release) => [release.entryId, release.entity.id]))
+		.orWhereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk
 			// scene IDs shared across network, mark as duplicate so scene can be updated with channel if only available on release day (i.e. Perv City)
 			.filter((release) => release.entity.parent?.parameters?.networkEntryIds)
-			.map((release) => [release.entryId, release.entity.parent.id]));
+			.map((release) => [release.entryId, release.entity.parent.id])), { concurrency: 10 });
+
+	const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();

 	const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
 		if (!acc[release.entity_id]) acc[release.entity_id] = {};
--- a/src/updates.js
+++ b/src/updates.js
@ -8,6 +8,7 @@ const argv = require('./argv');
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
 const { curateRelease } = require('./releases');
+const chunk = require('./utils/chunk');
 const include = require('./utils/argv-include')(argv);
 const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
 const { fetchIncludedEntities } = require('./entities');
@ -38,24 +39,29 @@ function filterLocalUniqueReleases(releases, accReleases) {
 }

 async function filterUniqueReleases(releases) {
-	const releaseIdentifiers = releases
-		.map((release) => [release.entity.id, release.entryId.toString()]);
+	const releaseIdentifierChunks = chunk(releases.map((release) => [release.entity.id, release.entryId.toString()]));

-	const duplicateReleaseEntriesQuery = knex('releases')
-		.select(knex.raw('releases.*, row_to_json(entities) as entity'))
-		.leftJoin('entities', 'entities.id', 'releases.entity_id')
-		.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
-		.where((builder) => {
-			// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
-			builder
-				.where('deep', true) // scene is already deep scraped
-				.orWhereNull('date')
-				.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
-				.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
-				.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
-		});
+	const duplicateReleaseEntryChunks = await Promise.map(releaseIdentifierChunks, async (releaseIdentifiers) => {
+		const duplicateReleaseEntriesQuery = knex('releases')
+			.select(knex.raw('releases.*, row_to_json(entities) as entity'))
+			.leftJoin('entities', 'entities.id', 'releases.entity_id')
+			.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
+			.where((builder) => {
+				// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
+				builder
+					.where('deep', true) // scene is already deep scraped
+					.orWhereNull('date')
+					.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
+					.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
+					.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
+			});

-	const duplicateReleaseEntries = await duplicateReleaseEntriesQuery;
+		return duplicateReleaseEntriesQuery;
+	}, { concurrency: 10 });
+
+	const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
+
+	console.log(duplicateReleaseEntries);

 	const duplicateReleases = duplicateReleaseEntries.map((release) => curateRelease(release));
 	const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});