Added revised next page determination.
This commit is contained in:
parent
8aefb8eddb
commit
7c856c267d
|
@ -24,9 +24,18 @@ function mapReleasesToSiteIdAndEntryId(acc, release) {
|
||||||
}
|
}
|
||||||
|
|
||||||
function filterLocalUniqueReleases(releases, accReleases) {
|
function filterLocalUniqueReleases(releases, accReleases) {
|
||||||
|
const localDuplicateReleasesBySiteIdAndEntryId = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {});
|
||||||
|
|
||||||
|
const localUniqueReleases = releases.filter(release => !localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
|
||||||
|
const localDuplicateReleases = releases.filter(release => localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
|
||||||
|
|
||||||
|
return {
|
||||||
|
localUniqueReleases,
|
||||||
|
localDuplicateReleases,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
async function filterUniqueReleases(releases) {
|
async function filterUniqueReleases(releases) {
|
||||||
const releaseIdentifiers = releases
|
const releaseIdentifiers = releases
|
||||||
.map(release => [release.entity.id, release.entryId]);
|
.map(release => [release.entity.id, release.entryId]);
|
||||||
|
@ -38,7 +47,9 @@ async function filterUniqueReleases(releases) {
|
||||||
|
|
||||||
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
|
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
|
async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
|
||||||
const latestReleaseIdentifiers = latestReleases
|
const latestReleaseIdentifiers = latestReleases
|
||||||
.map(release => [release.entity.id, release.entryId]);
|
.map(release => [release.entity.id, release.entryId]);
|
||||||
|
@ -71,32 +82,31 @@ async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
|
||||||
duplicateReleases,
|
duplicateReleases,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) {
|
function needNextPage(pageReleases, accReleases, isUpcoming) {
|
||||||
if (releasesOnPage.length === 0) {
|
const { localUniqueReleases: uniquePageReleases } = filterLocalUniqueReleases(pageReleases, accReleases);
|
||||||
|
|
||||||
|
if (uniquePageReleases.length === 0) {
|
||||||
|
// page is empty, or only contains scenes from previous page
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (upcoming) {
|
if (isUpcoming) {
|
||||||
return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming;
|
return uniquePageReleases.length > 0 && argv.paginateUpcoming;
|
||||||
}
|
}
|
||||||
|
|
||||||
// no longer works when there are no unique releases; need to keep track of /all/ releases regardless of uniqueness
|
if (uniquePageReleases.length > 0) {
|
||||||
// console.log(localUniqueReleasesOnPage.length);
|
|
||||||
|
|
||||||
if (localUniqueReleasesOnPage.length > 0) {
|
|
||||||
if (argv.last) {
|
if (argv.last) {
|
||||||
return totalReleases + releasesOnPage.length < argv.last;
|
return accReleases.length + pageReleases.length < argv.last;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!hasDates) {
|
if (!pageReleases.every(release => !!release.date)) { // some scenes don't have dates
|
||||||
return totalReleases + releasesOnPage.length < argv.nullDateLimit;
|
return accReleases.length + pageReleases.length < argv.nullDateLimit;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv.after) {
|
if (argv.after) {
|
||||||
// this will keep paginating infinitely on sites that keep serving the last page if you exceed the last page number (e.g. HardX as of September 2020)
|
const oldestReleaseOnPage = pageReleases
|
||||||
// checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
|
|
||||||
const oldestReleaseOnPage = releasesOnPage
|
|
||||||
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
|
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
|
||||||
.slice(-1)[0];
|
.slice(-1)[0];
|
||||||
|
|
||||||
|
@ -110,47 +120,9 @@ function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueRel
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeReleasesLegacy(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) {
|
async function scrapeReleases(scraper, entity, preData, isUpcoming) {
|
||||||
const releases = upcoming
|
|
||||||
? await scraper.fetchUpcoming(entity, page, include, preData)
|
|
||||||
: await scraper.fetchLatest(entity, page, include, preData);
|
|
||||||
|
|
||||||
if (!Array.isArray(releases)) {
|
|
||||||
// scraper is unable to fetch the releases and returned an HTTP code or null
|
|
||||||
logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
|
|
||||||
return acc;
|
|
||||||
}
|
|
||||||
|
|
||||||
const releasesWithEntity = releases.map(release => ({
|
|
||||||
...release,
|
|
||||||
entity: release.entity || entity, // allow override
|
|
||||||
})); // attach entity the release is assigned to when stored
|
|
||||||
|
|
||||||
const hasDates = releasesWithEntity.every(release => !!release.date);
|
|
||||||
|
|
||||||
const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0)))
|
|
||||||
|| (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
|
|
||||||
|| releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));
|
|
||||||
|
|
||||||
const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
|
|
||||||
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
|
|
||||||
: await filterUniqueReleases(limitedReleases, acc);
|
|
||||||
|
|
||||||
const accReleases = {
|
|
||||||
uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
|
|
||||||
duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
|
|
||||||
};
|
|
||||||
|
|
||||||
if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
|
|
||||||
return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
|
|
||||||
}
|
|
||||||
|
|
||||||
return accReleases;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function scrapeReleases(scraper, entity, preData, upcoming) {
|
|
||||||
async function scrapeReleasesPage(page, accReleases) {
|
async function scrapeReleasesPage(page, accReleases) {
|
||||||
const pageReleases = upcoming
|
const pageReleases = isUpcoming
|
||||||
? await scraper.fetchUpcoming(entity, page, include, preData)
|
? await scraper.fetchUpcoming(entity, page, include, preData)
|
||||||
: await scraper.fetchLatest(entity, page, include, preData);
|
: await scraper.fetchLatest(entity, page, include, preData);
|
||||||
|
|
||||||
|
@ -160,16 +132,25 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
|
||||||
return accReleases;
|
return accReleases;
|
||||||
}
|
}
|
||||||
|
|
||||||
return accReleases.concat(pageReleases);
|
const pageReleasesWithEntity = pageReleases.map(release => ({ ...release, entity }));
|
||||||
|
|
||||||
|
if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming)) {
|
||||||
|
return scrapeReleasesPage(page + 1, accReleases.concat(pageReleasesWithEntity), isUpcoming);
|
||||||
|
}
|
||||||
|
|
||||||
|
return accReleases.concat(pageReleasesWithEntity);
|
||||||
}
|
}
|
||||||
|
|
||||||
const releases = await scrapeReleasesPage(1, emptyReleases);
|
const releases = await scrapeReleasesPage(argv.page || 1, []);
|
||||||
const hasDates = releases.every(release => !!release.date);
|
const hasDates = releases.every(release => !!release.date);
|
||||||
|
|
||||||
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|
||||||
|| (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after)))
|
|| (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after)))
|
||||||
|| releases.slice(0, Math.max(argv.nullDateLimit, 0));
|
|| releases.slice(0, Math.max(argv.nullDateLimit, 0));
|
||||||
|
|
||||||
|
console.log(releases.length, limitedReleases.length);
|
||||||
|
|
||||||
|
/*
|
||||||
// attach entity the release is assigned to when stored
|
// attach entity the release is assigned to when stored
|
||||||
const releasesWithEntity = limitedReleases.map(release => ({
|
const releasesWithEntity = limitedReleases.map(release => ({
|
||||||
...release,
|
...release,
|
||||||
|
@ -179,6 +160,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
|
||||||
const { uniqueReleases, duplicateReleases } = argv.force
|
const { uniqueReleases, duplicateReleases } = argv.force
|
||||||
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
|
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
|
||||||
: await filterUniqueReleases(releasesWithEntity);
|
: await filterUniqueReleases(releasesWithEntity);
|
||||||
|
*/
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeLatestReleases(scraper, entity, preData) {
|
async function scrapeLatestReleases(scraper, entity, preData) {
|
||||||
|
@ -187,7 +169,7 @@ async function scrapeLatestReleases(scraper, entity, preData) {
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
return await scrapeReleases(scraper, entity, preData, false, argv.page || 1);
|
return await scrapeReleases(scraper, entity, preData, false);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (argv.debug) {
|
if (argv.debug) {
|
||||||
console.trace(error);
|
console.trace(error);
|
||||||
|
|
Loading…
Reference in New Issue