Chunked duplicate check to prevent PostgreSQL stack depth errors.
This commit is contained in:
@@ -8,6 +8,7 @@ const argv = require('./argv');
|
||||
const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const { curateRelease } = require('./releases');
|
||||
const chunk = require('./utils/chunk');
|
||||
const include = require('./utils/argv-include')(argv);
|
||||
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
|
||||
const { fetchIncludedEntities } = require('./entities');
|
||||
@@ -38,24 +39,29 @@ function filterLocalUniqueReleases(releases, accReleases) {
|
||||
}
|
||||
|
||||
async function filterUniqueReleases(releases) {
|
||||
const releaseIdentifiers = releases
|
||||
.map((release) => [release.entity.id, release.entryId.toString()]);
|
||||
const releaseIdentifierChunks = chunk(releases.map((release) => [release.entity.id, release.entryId.toString()]));
|
||||
|
||||
const duplicateReleaseEntriesQuery = knex('releases')
|
||||
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
|
||||
.leftJoin('entities', 'entities.id', 'releases.entity_id')
|
||||
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
|
||||
.where((builder) => {
|
||||
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
|
||||
builder
|
||||
.where('deep', true) // scene is already deep scraped
|
||||
.orWhereNull('date')
|
||||
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
|
||||
.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
|
||||
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
|
||||
});
|
||||
const duplicateReleaseEntryChunks = await Promise.map(releaseIdentifierChunks, async (releaseIdentifiers) => {
|
||||
const duplicateReleaseEntriesQuery = knex('releases')
|
||||
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
|
||||
.leftJoin('entities', 'entities.id', 'releases.entity_id')
|
||||
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
|
||||
.where((builder) => {
|
||||
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
|
||||
builder
|
||||
.where('deep', true) // scene is already deep scraped
|
||||
.orWhereNull('date')
|
||||
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
|
||||
.orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
|
||||
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
|
||||
});
|
||||
|
||||
const duplicateReleaseEntries = await duplicateReleaseEntriesQuery;
|
||||
return duplicateReleaseEntriesQuery;
|
||||
}, { concurrency: 10 });
|
||||
|
||||
const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
|
||||
|
||||
console.log(duplicateReleaseEntries);
|
||||
|
||||
const duplicateReleases = duplicateReleaseEntries.map((release) => curateRelease(release));
|
||||
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});
|
||||
|
||||
Reference in New Issue
Block a user