Rescraping upcoming scenes. Fixed language and scene deep scraping for Dorcel scraper.

This commit is contained in:
DebaucheryLibrarian
2021-06-02 03:27:32 +02:00
parent 42791c528e
commit c979173422
15 changed files with 105 additions and 15 deletions

View File

@@ -44,7 +44,17 @@ async function filterUniqueReleases(releases) {
const duplicateReleaseEntries = await knex('releases')
.select(knex.raw('releases.*, row_to_json(entities) as entity'))
.leftJoin('entities', 'entities.id', 'releases.entity_id')
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers);
.whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
.where((builder) => {
// check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
builder
.where('deep', true) // scene is already deep scraped
.orWhereNull('date')
.orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
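.orWhere(knex.raw('NOW() - date > INTERVAL \'12 hours\'')) // matches scenes dated more than 12 hours in the past; the rough offset is meant to wait for the end of the day west of UTC. NOTE(review): this was described as "scene is still upcoming", which would be a date in the future - confirm the comparison direction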
.orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated more than a day after the release date, no further update expected
});
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});