Added revised next page determination.

This commit is contained in:
DebaucheryLibrarian 2020-10-12 04:08:22 +02:00
parent 8aefb8eddb
commit 7c856c267d
1 changed files with 39 additions and 57 deletions

View File

@ -24,9 +24,18 @@ function mapReleasesToSiteIdAndEntryId(acc, release) {
}
/**
 * Partitions a page of releases into those not yet seen in the accumulated
 * releases and those already collected during this scrape run.
 *
 * @param {Array} releases - Releases from the current page.
 * @param {Array} accReleases - Releases accumulated from previous pages.
 * @returns {{ localUniqueReleases: Array, localDuplicateReleases: Array }}
 */
function filterLocalUniqueReleases(releases, accReleases) {
  // Index accumulated releases by entity ID and entry ID for O(1) duplicate lookups
  const seenByEntityAndEntryId = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {});

  const localUniqueReleases = [];
  const localDuplicateReleases = [];

  // Single pass preserves the original relative ordering of both partitions
  for (const release of releases) {
    if (seenByEntityAndEntryId[release.entity.id]?.[release.entryId]) {
      localDuplicateReleases.push(release);
    } else {
      localUniqueReleases.push(release);
    }
  }

  return {
    localUniqueReleases,
    localDuplicateReleases,
  };
}
/*
async function filterUniqueReleases(releases) {
const releaseIdentifiers = releases
.map(release => [release.entity.id, release.entryId]);
@ -38,7 +47,9 @@ async function filterUniqueReleases(releases) {
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
}
*/
/*
async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
const latestReleaseIdentifiers = latestReleases
.map(release => [release.entity.id, release.entryId]);
@ -71,32 +82,31 @@ async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
duplicateReleases,
};
}
*/
function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) {
if (releasesOnPage.length === 0) {
function needNextPage(pageReleases, accReleases, isUpcoming) {
const { localUniqueReleases: uniquePageReleases } = filterLocalUniqueReleases(pageReleases, accReleases);
if (uniquePageReleases.length === 0) {
// page is empty, or only contains scenes from previous page
return false;
}
if (upcoming) {
return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming;
if (isUpcoming) {
return uniquePageReleases.length > 0 && argv.paginateUpcoming;
}
// no longer works when there are no unique releases, need to keep track of /all/ releases regardless uniqueness
// console.log(localUniqueReleasesOnPage.length);
if (localUniqueReleasesOnPage.length > 0) {
if (uniquePageReleases.length > 0) {
if (argv.last) {
return totalReleases + releasesOnPage.length < argv.last;
return accReleases.length + pageReleases.length < argv.last;
}
if (!hasDates) {
return totalReleases + releasesOnPage.length < argv.nullDateLimit;
if (!pageReleases.every(release => !!release.date)) { // some scenes don't have dates
return accReleases.length + pageReleases.length < argv.nullDateLimit;
}
if (argv.after) {
// this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020)
// checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
const oldestReleaseOnPage = releasesOnPage
const oldestReleaseOnPage = pageReleases
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
.slice(-1)[0];
@ -110,47 +120,9 @@ function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueRel
return false;
}
// Legacy paginated scrape: fetches one page of latest/upcoming releases,
// attaches the owning entity, trims the page to the configured limits,
// splits unique from duplicate releases, and recurses while more pages
// are needed. Accumulates results in `acc` ({ uniqueReleases, duplicateReleases }).
async function scrapeReleasesLegacy(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) {
// fetch the requested page from the scraper's upcoming or latest feed
const releases = upcoming
? await scraper.fetchUpcoming(entity, page, include, preData)
: await scraper.fetchLatest(entity, page, include, preData);
if (!Array.isArray(releases)) {
// scraper is unable to fetch the releases and returned a HTTP code or null
logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
return acc;
}
const releasesWithEntity = releases.map(release => ({
...release,
entity: release.entity || entity, // allow override
})); // attach entity the release is assigned to when stored
// true only when every release on this page carries a date
const hasDates = releasesWithEntity.every(release => !!release.date);
// trim the page: --last takes a fixed count, dated pages are filtered by --after,
// undated pages fall back to the null-date limit
// NOTE(review): `argv.last && ...slice(...)` yields `[]` (truthy) when the
// remaining quota is 0, so the later fallbacks are skipped — presumably intended; confirm
const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0)))
|| (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
|| releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));
// --force skips deduplication entirely and treats the whole page as unique
const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
: await filterUniqueReleases(limitedReleases, acc);
const accReleases = {
uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
};
// NOTE(review): recurses into scrapeReleases rather than scrapeReleasesLegacy —
// looks like a rename artifact from this commit's diff; verify which was intended
if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
}
return accReleases;
}
async function scrapeReleases(scraper, entity, preData, upcoming) {
async function scrapeReleases(scraper, entity, preData, isUpcoming) {
async function scrapeReleasesPage(page, accReleases) {
const pageReleases = upcoming
const pageReleases = isUpcoming
? await scraper.fetchUpcoming(entity, page, include, preData)
: await scraper.fetchLatest(entity, page, include, preData);
@ -160,16 +132,25 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
return accReleases;
}
return accReleases.concat(pageReleases);
const pageReleasesWithEntity = pageReleases.map(release => ({ ...release, entity }));
if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming)) {
return scrapeReleasesPage(page + 1, accReleases.concat(pageReleasesWithEntity), isUpcoming);
}
const releases = await scrapeReleasesPage(1, emptyReleases);
return accReleases.concat(pageReleasesWithEntity);
}
const releases = await scrapeReleasesPage(argv.page || 1, []);
const hasDates = releases.every(release => !!release.date);
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|| (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after)))
|| releases.slice(0, Math.max(argv.nullDateLimit, 0));
console.log(releases.length, limitedReleases.length);
/*
// attach entity the release is assigned to when stored
const releasesWithEntity = limitedReleases.map(release => ({
...release,
@ -179,6 +160,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
const { uniqueReleases, duplicateReleases } = argv.force
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
: await filterUniqueReleases(releasesWithEntity);
*/
}
async function scrapeLatestReleases(scraper, entity, preData) {
@ -187,7 +169,7 @@ async function scrapeLatestReleases(scraper, entity, preData) {
}
try {
return await scrapeReleases(scraper, entity, preData, false, argv.page || 1);
return await scrapeReleases(scraper, entity, preData, false);
} catch (error) {
if (argv.debug) {
console.trace(error);