Added revised next page determination.

This commit is contained in:
DebaucheryLibrarian 2020-10-12 04:08:22 +02:00
parent 8aefb8eddb
commit 7c856c267d
1 changed file with 39 additions and 57 deletions

View File

@ -24,9 +24,18 @@ function mapReleasesToSiteIdAndEntryId(acc, release) {
} }
function filterLocalUniqueReleases(releases, accReleases) { function filterLocalUniqueReleases(releases, accReleases) {
const localDuplicateReleasesBySiteIdAndEntryId = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {});
const localUniqueReleases = releases.filter(release => !localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
const localDuplicateReleases = releases.filter(release => localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
return {
localUniqueReleases,
localDuplicateReleases,
};
} }
/*
async function filterUniqueReleases(releases) { async function filterUniqueReleases(releases) {
const releaseIdentifiers = releases const releaseIdentifiers = releases
.map(release => [release.entity.id, release.entryId]); .map(release => [release.entity.id, release.entryId]);
@ -38,7 +47,9 @@ async function filterUniqueReleases(releases) {
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release)); const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
} }
*/
/*
async function filterUniqueReleasesLegacy(latestReleases, accReleases) { async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
const latestReleaseIdentifiers = latestReleases const latestReleaseIdentifiers = latestReleases
.map(release => [release.entity.id, release.entryId]); .map(release => [release.entity.id, release.entryId]);
@ -71,32 +82,31 @@ async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
duplicateReleases, duplicateReleases,
}; };
} }
*/
function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) { function needNextPage(pageReleases, accReleases, isUpcoming) {
if (releasesOnPage.length === 0) { const { localUniqueReleases: uniquePageReleases } = filterLocalUniqueReleases(pageReleases, accReleases);
if (uniquePageReleases.length === 0) {
// page is empty, or only contains scenes from previous page
return false; return false;
} }
if (upcoming) { if (isUpcoming) {
return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming; return uniquePageReleases.length > 0 && argv.paginateUpcoming;
} }
// no longer works when there are no unique releases, need to keep track of /all/ releases regardless uniqueness if (uniquePageReleases.length > 0) {
// console.log(localUniqueReleasesOnPage.length);
if (localUniqueReleasesOnPage.length > 0) {
if (argv.last) { if (argv.last) {
return totalReleases + releasesOnPage.length < argv.last; return accReleases.length + pageReleases.length < argv.last;
} }
if (!hasDates) { if (!pageReleases.every(release => !!release.date)) { // some scenes don't have dates
return totalReleases + releasesOnPage.length < argv.nullDateLimit; return accReleases.length + pageReleases.length < argv.nullDateLimit;
} }
if (argv.after) { if (argv.after) {
// this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020) const oldestReleaseOnPage = pageReleases
// checking unqiueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
const oldestReleaseOnPage = releasesOnPage
.sort((releaseA, releaseB) => releaseB.date - releaseA.date) .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
.slice(-1)[0]; .slice(-1)[0];
@ -110,47 +120,9 @@ function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueRel
return false; return false;
} }
async function scrapeReleasesLegacy(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) { async function scrapeReleases(scraper, entity, preData, isUpcoming) {
const releases = upcoming
? await scraper.fetchUpcoming(entity, page, include, preData)
: await scraper.fetchLatest(entity, page, include, preData);
if (!Array.isArray(releases)) {
// scraper is unable to fetch the releases and returned a HTTP code or null
logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
return acc;
}
const releasesWithEntity = releases.map(release => ({
...release,
entity: release.entity || entity, // allow override
})); // attach entity the release is assigned to when stored
const hasDates = releasesWithEntity.every(release => !!release.date);
const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0)))
|| (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
|| releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));
const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
: await filterUniqueReleases(limitedReleases, acc);
const accReleases = {
uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
};
if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
}
return accReleases;
}
async function scrapeReleases(scraper, entity, preData, upcoming) {
async function scrapeReleasesPage(page, accReleases) { async function scrapeReleasesPage(page, accReleases) {
const pageReleases = upcoming const pageReleases = isUpcoming
? await scraper.fetchUpcoming(entity, page, include, preData) ? await scraper.fetchUpcoming(entity, page, include, preData)
: await scraper.fetchLatest(entity, page, include, preData); : await scraper.fetchLatest(entity, page, include, preData);
@ -160,16 +132,25 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
return accReleases; return accReleases;
} }
return accReleases.concat(pageReleases); const pageReleasesWithEntity = pageReleases.map(release => ({ ...release, entity }));
if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming)) {
return scrapeReleasesPage(page + 1, accReleases.concat(pageReleasesWithEntity), isUpcoming);
}
return accReleases.concat(pageReleasesWithEntity);
} }
const releases = await scrapeReleasesPage(1, emptyReleases); const releases = await scrapeReleasesPage(argv.page || 1, []);
const hasDates = releases.every(release => !!release.date); const hasDates = releases.every(release => !!release.date);
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0))) const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|| (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after))) || (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after)))
|| releases.slice(0, Math.max(argv.nullDateLimit, 0)); || releases.slice(0, Math.max(argv.nullDateLimit, 0));
console.log(releases.length, limitedReleases.length);
/*
// attach entity the release is assigned to when stored // attach entity the release is assigned to when stored
const releasesWithEntity = limitedReleases.map(release => ({ const releasesWithEntity = limitedReleases.map(release => ({
...release, ...release,
@ -179,6 +160,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
const { uniqueReleases, duplicateReleases } = argv.force const { uniqueReleases, duplicateReleases } = argv.force
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] } ? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
: await filterUniqueReleases(releasesWithEntity); : await filterUniqueReleases(releasesWithEntity);
*/
} }
async function scrapeLatestReleases(scraper, entity, preData) { async function scrapeLatestReleases(scraper, entity, preData) {
@ -187,7 +169,7 @@ async function scrapeLatestReleases(scraper, entity, preData) {
} }
try { try {
return await scrapeReleases(scraper, entity, preData, false, argv.page || 1); return await scrapeReleases(scraper, entity, preData, false);
} catch (error) { } catch (error) {
if (argv.debug) { if (argv.debug) {
console.trace(error); console.trace(error);