From 506971b44b9f27b609a7b33d272a96779ddfdae8 Mon Sep 17 00:00:00 2001
From: DebaucheryLibrarian
Date: Mon, 10 Jan 2022 02:17:17 +0100
Subject: [PATCH] Chunked duplicate check to prevent postgres stack depth
 errors.

---
 src/scrapers/traxxx.js |  4 ++--
 src/store-releases.js  | 13 +++++++++----
 src/updates.js         | 38 ++++++++++++++++++++++----------------
 3 files changed, 33 insertions(+), 22 deletions(-)

diff --git a/src/scrapers/traxxx.js b/src/scrapers/traxxx.js
index ca504451..cc3db3d6 100644
--- a/src/scrapers/traxxx.js
+++ b/src/scrapers/traxxx.js
@@ -232,7 +232,7 @@ function actors(release) {
 }
 
 async function fetchLatest(entity, page, options) {
-  return Promise.all(Array.from({ length: 100 }, async (value, index) => {
+  return Promise.all(Array.from({ length: 10000 }, async (value, index) => {
     const release = {};
 
     release.entryId = nanoid();
@@ -262,7 +262,7 @@ async function fetchLatest(entity, page, options) {
       .select('name')
       .where('priority', '>', 7)
       .orderByRaw('random()')
-      .limit(faker.random.number({ min: 2, max: 15 }))
+      .limit(faker.datatype.number({ min: 2, max: 15 }))
       .pluck('name');
 
     release.actors = [...actors(release), null]; // include empty actor to ensure proper handling
diff --git a/src/store-releases.js b/src/store-releases.js
index 04f0a334..bdb0a8af 100644
--- a/src/store-releases.js
+++ b/src/store-releases.js
@@ -1,6 +1,7 @@
 'use strict';
 
 const config = require('config');
+const Promise = require('bluebird');
 
 const argv = require('./argv');
 const logger = require('./logger')(__filename);
@@ -8,6 +9,7 @@ const knex = require('./knex');
 const slugify = require('./utils/slugify');
 const bulkInsert = require('./utils/bulk-insert');
 const resolvePlace = require('./utils/resolve-place');
+const chunk = require('./utils/chunk');
 const { formatDate } = require('./utils/qu');
 const { associateActors, associateDirectors, scrapeActors, toBaseActors } = require('./actors');
 const { associateReleaseTags } = require('./tags');
@@ -192,13 +194,16 @@ function filterInternalDuplicateReleases(releases) {
 
 async function filterDuplicateReleases(releases) {
   const internalUniqueReleases = filterInternalDuplicateReleases(releases);
+  const internalUniqueReleaseChunks = chunk(internalUniqueReleases);
 
-  const duplicateReleaseEntries = await knex('releases')
-    .whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map((release) => [release.entryId, release.entity.id]))
-    .orWhereIn(['entry_id', 'entity_id'], internalUniqueReleases
+  const duplicateReleaseEntryChunks = await Promise.map(internalUniqueReleaseChunks, async (internalUniqueReleasesChunk) => knex('releases')
+    .whereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk.map((release) => [release.entryId, release.entity.id]))
+    .orWhereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk
     // scene IDs shared across network, mark as duplicate so scene can be updated with channel if only available on release day (i.e. Perv City)
       .filter((release) => release.entity.parent?.parameters?.networkEntryIds)
-      .map((release) => [release.entryId, release.entity.parent.id]));
+      .map((release) => [release.entryId, release.entity.parent.id])), { concurrency: 10 });
+
+  const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
 
   const duplicateReleasesByEntityIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
     if (!acc[release.entity_id]) acc[release.entity_id] = {};
diff --git a/src/updates.js b/src/updates.js
index e85d2157..355ac92d 100644
--- a/src/updates.js
+++ b/src/updates.js
@@ -8,6 +8,7 @@ const argv = require('./argv');
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
 const { curateRelease } = require('./releases');
+const chunk = require('./utils/chunk');
 const include = require('./utils/argv-include')(argv);
 const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
 const { fetchIncludedEntities } = require('./entities');
@@ -38,24 +39,29 @@ function filterLocalUniqueReleases(releases, accReleases) {
 }
 
 async function filterUniqueReleases(releases) {
-  const releaseIdentifiers = releases
-    .map((release) => [release.entity.id, release.entryId.toString()]);
+  const releaseIdentifierChunks = chunk(releases.map((release) => [release.entity.id, release.entryId.toString()]));
 
-  const duplicateReleaseEntriesQuery = knex('releases')
-    .select(knex.raw('releases.*, row_to_json(entities) as entity'))
-    .leftJoin('entities', 'entities.id', 'releases.entity_id')
-    .whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
-    .where((builder) => {
-      // check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
-      builder
-        .where('deep', true) // scene is already deep scraped
-        .orWhereNull('date')
-        .orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
-        .orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
-        .orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
-    });
+  const duplicateReleaseEntryChunks = await Promise.map(releaseIdentifierChunks, async (releaseIdentifiers) => {
+    const duplicateReleaseEntriesQuery = knex('releases')
+      .select(knex.raw('releases.*, row_to_json(entities) as entity'))
+      .leftJoin('entities', 'entities.id', 'releases.entity_id')
+      .whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
+      .where((builder) => {
+        // check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
+        builder
+          .where('deep', true) // scene is already deep scraped
+          .orWhereNull('date')
+          .orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
+          .orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
+          .orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
+      });
 
-  const duplicateReleaseEntries = await duplicateReleaseEntriesQuery;
+    return duplicateReleaseEntriesQuery;
+  }, { concurrency: 10 });
+
+  const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
+
+  console.log(duplicateReleaseEntries);
 
   const duplicateReleases = duplicateReleaseEntries.map((release) => curateRelease(release));
   const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
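
Both call sites import src/utils/chunk.js, which is not part of this patch. A minimal sketch of what that helper presumably looks like, assuming a chunk(array, chunkSize) signature; the default of 1000 items per chunk is a hypothetical value, not taken from this diff:

    'use strict';

    // Hypothetical reconstruction of src/utils/chunk.js (imported above but not
    // included in this patch): split an array into subarrays of at most
    // chunkSize items each. The default size of 1000 is an assumption.
    function chunk(array, chunkSize = 1000) {
      return Array.from(
        { length: Math.ceil(array.length / chunkSize) },
        (value, index) => array.slice(index * chunkSize, (index + 1) * chunkSize),
      );
    }

    module.exports = chunk;

Splitting the identifier lists this way keeps each whereIn small: PostgreSQL parses a multi-column IN list with tens of thousands of row values into a deeply nested expression tree, and once that tree outgrows max_stack_depth (2MB by default) the query aborts with a "stack depth limit exceeded" error, which is what the chunking prevents. Bluebird's Promise.map with { concurrency: 10 } then bounds how many of the per-chunk queries run against the connection pool at once.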