forked from DebaucheryLibrarian/traxxx
Added revised next page determination.
This commit is contained in:
parent
8aefb8eddb
commit
7c856c267d
|
@ -24,9 +24,18 @@ function mapReleasesToSiteIdAndEntryId(acc, release) {
|
|||
}
|
||||
|
||||
/**
 * Partition a batch of releases into those not yet present among the
 * accumulated releases and those that are, matching on the combination of
 * `entity.id` and `entryId`.
 *
 * @param {Array<Object>} releases - releases to classify; each is read for `entity.id` and `entryId`
 * @param {Array<Object>} accReleases - previously accumulated releases to compare against
 * @returns {{ localUniqueReleases: Array<Object>, localDuplicateReleases: Array<Object> }}
 *   the input releases split into unseen and already-seen, each in input order
 */
function filterLocalUniqueReleases(releases, accReleases) {
	// lookup produced by mapReleasesToSiteIdAndEntryId; queried below as [entity.id][entryId]
	const seenByEntityAndEntry = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {});
	const isAlreadySeen = ({ entity, entryId }) => Boolean(seenByEntityAndEntry[entity.id]?.[entryId]);

	const localUniqueReleases = [];
	const localDuplicateReleases = [];

	// single pass: every release lands in exactly one of the two buckets
	releases.forEach((release) => {
		if (isAlreadySeen(release)) {
			localDuplicateReleases.push(release);
		} else {
			localUniqueReleases.push(release);
		}
	});

	return { localUniqueReleases, localDuplicateReleases };
}
|
||||
|
||||
/*
|
||||
async function filterUniqueReleases(releases) {
|
||||
const releaseIdentifiers = releases
|
||||
.map(release => [release.entity.id, release.entryId]);
|
||||
|
@ -38,7 +47,9 @@ async function filterUniqueReleases(releases) {
|
|||
|
||||
const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));
|
||||
}
|
||||
*/
|
||||
|
||||
/*
|
||||
async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
|
||||
const latestReleaseIdentifiers = latestReleases
|
||||
.map(release => [release.entity.id, release.entryId]);
|
||||
|
@ -71,32 +82,31 @@ async function filterUniqueReleasesLegacy(latestReleases, accReleases) {
|
|||
duplicateReleases,
|
||||
};
|
||||
}
|
||||
*/
|
||||
|
||||
function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) {
|
||||
if (releasesOnPage.length === 0) {
|
||||
function needNextPage(pageReleases, accReleases, isUpcoming) {
|
||||
const { localUniqueReleases: uniquePageReleases } = filterLocalUniqueReleases(pageReleases, accReleases);
|
||||
|
||||
if (uniquePageReleases.length === 0) {
|
||||
// page is empty, or only contains scenes from previous page
|
||||
return false;
|
||||
}
|
||||
|
||||
if (upcoming) {
|
||||
return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming;
|
||||
if (isUpcoming) {
|
||||
return uniquePageReleases.length > 0 && argv.paginateUpcoming;
|
||||
}
|
||||
|
||||
// no longer works when there are no unique releases, need to keep track of /all/ releases regardless of uniqueness
|
||||
// console.log(localUniqueReleasesOnPage.length);
|
||||
|
||||
if (localUniqueReleasesOnPage.length > 0) {
|
||||
if (uniquePageReleases.length > 0) {
|
||||
if (argv.last) {
|
||||
return totalReleases + releasesOnPage.length < argv.last;
|
||||
return accReleases.length + pageReleases.length < argv.last;
|
||||
}
|
||||
|
||||
if (!hasDates) {
|
||||
return totalReleases + releasesOnPage.length < argv.nullDateLimit;
|
||||
if (!pageReleases.every(release => !!release.date)) { // some scenes don't have dates
|
||||
return accReleases.length + pageReleases.length < argv.nullDateLimit;
|
||||
}
|
||||
|
||||
if (argv.after) {
|
||||
// this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020)
|
||||
// checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
|
||||
const oldestReleaseOnPage = releasesOnPage
|
||||
const oldestReleaseOnPage = pageReleases
|
||||
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
|
||||
.slice(-1)[0];
|
||||
|
||||
|
@ -110,47 +120,9 @@ function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueRel
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
 * Legacy recursive paginator: fetches one page of releases from the scraper,
 * limits and de-duplicates it, merges it into the accumulator, and recurses
 * to the next page for as long as needNextPageLegacy() asks for one.
 *
 * @param {Object} scraper - provides fetchUpcoming() and fetchLatest()
 * @param {Object} entity - entity being scraped; attached to each release that has none of its own
 * @param {*} preData - passed through to the scraper unchanged
 * @param {boolean} [upcoming=false] - fetch upcoming instead of latest releases
 * @param {number} [page=1] - page number to fetch
 * @param {Object} [acc=emptyReleases] - accumulator with `uniqueReleases` and `duplicateReleases` arrays
 * @param {number} [totalReleases=0] - number of releases seen on previous pages
 * @returns {Promise<Object>} accumulator with `uniqueReleases` and `duplicateReleases`
 */
async function scrapeReleasesLegacy(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) {
	const releases = upcoming
		? await scraper.fetchUpcoming(entity, page, include, preData)
		: await scraper.fetchLatest(entity, page, include, preData);

	if (!Array.isArray(releases)) {
		// scraper is unable to fetch the releases and returned a HTTP code or null
		logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
		return acc;
	}

	const releasesWithEntity = releases.map(release => ({
		...release,
		entity: release.entity || entity, // allow override
	})); // attach entity the release is assigned to when stored

	const hasDates = releasesWithEntity.every(release => !!release.date);

	// precedence: explicit --last cap, then date cut-off (only when every release is dated), then the null-date cap
	const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0)))
		|| (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
		|| releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));

	// NOTE(review): filterUniqueReleases appears to be commented out in the visible part of this file — confirm it is still defined before re-enabling this path
	const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
		? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
		: await filterUniqueReleases(limitedReleases, acc);

	const accReleases = {
		uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
		duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
	};

	// use the legacy helper: its six-parameter signature matches these arguments
	if (needNextPageLegacy(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
		// recurse into this function, not scrapeReleases(), so page, accumulator and running total carry over
		return scrapeReleasesLegacy(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
	}

	return accReleases;
}
|
||||
|
||||
async function scrapeReleases(scraper, entity, preData, upcoming) {
|
||||
async function scrapeReleases(scraper, entity, preData, isUpcoming) {
|
||||
async function scrapeReleasesPage(page, accReleases) {
|
||||
const pageReleases = upcoming
|
||||
const pageReleases = isUpcoming
|
||||
? await scraper.fetchUpcoming(entity, page, include, preData)
|
||||
: await scraper.fetchLatest(entity, page, include, preData);
|
||||
|
||||
|
@ -160,16 +132,25 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
|
|||
return accReleases;
|
||||
}
|
||||
|
||||
return accReleases.concat(pageReleases);
|
||||
const pageReleasesWithEntity = pageReleases.map(release => ({ ...release, entity }));
|
||||
|
||||
if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming)) {
|
||||
return scrapeReleasesPage(page + 1, accReleases.concat(pageReleasesWithEntity), isUpcoming);
|
||||
}
|
||||
|
||||
const releases = await scrapeReleasesPage(1, emptyReleases);
|
||||
return accReleases.concat(pageReleasesWithEntity);
|
||||
}
|
||||
|
||||
const releases = await scrapeReleasesPage(argv.page || 1, []);
|
||||
const hasDates = releases.every(release => !!release.date);
|
||||
|
||||
const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
|
||||
|| (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after)))
|
||||
|| releases.slice(0, Math.max(argv.nullDateLimit, 0));
|
||||
|
||||
console.log(releases.length, limitedReleases.length);
|
||||
|
||||
/*
|
||||
// attach entity the release is assigned to when stored
|
||||
const releasesWithEntity = limitedReleases.map(release => ({
|
||||
...release,
|
||||
|
@ -179,6 +160,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming) {
|
|||
const { uniqueReleases, duplicateReleases } = argv.force
|
||||
? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
|
||||
: await filterUniqueReleases(releasesWithEntity);
|
||||
*/
|
||||
}
|
||||
|
||||
async function scrapeLatestReleases(scraper, entity, preData) {
|
||||
|
@ -187,7 +169,7 @@ async function scrapeLatestReleases(scraper, entity, preData) {
|
|||
}
|
||||
|
||||
try {
|
||||
return await scrapeReleases(scraper, entity, preData, false, argv.page || 1);
|
||||
return await scrapeReleases(scraper, entity, preData, false);
|
||||
} catch (error) {
|
||||
if (argv.debug) {
|
||||
console.trace(error);
|
||||
|
|
Loading…
Reference in New Issue