diff --git a/src/updates.js b/src/updates.js index bf9fef01..84a03613 100644 --- a/src/updates.js +++ b/src/updates.js @@ -24,9 +24,18 @@ function mapReleasesToSiteIdAndEntryId(acc, release) { } function filterLocalUniqueReleases(releases, accReleases) { + const localDuplicateReleasesBySiteIdAndEntryId = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {}); + const localUniqueReleases = releases.filter(release => !localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]); + const localDuplicateReleases = releases.filter(release => localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]); + + return { + localUniqueReleases, + localDuplicateReleases, + }; } +/* async function filterUniqueReleases(releases) { const releaseIdentifiers = releases .map(release => [release.entity.id, release.entryId]); @@ -38,7 +47,9 @@ async function filterUniqueReleases(releases) { const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release)); } +*/ +/* async function filterUniqueReleasesLegacy(latestReleases, accReleases) { const latestReleaseIdentifiers = latestReleases .map(release => [release.entity.id, release.entryId]); @@ -71,32 +82,31 @@ async function filterUniqueReleasesLegacy(latestReleases, accReleases) { duplicateReleases, }; } +*/ -function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) { - if (releasesOnPage.length === 0) { +function needNextPage(pageReleases, accReleases, isUpcoming) { + const { localUniqueReleases: uniquePageReleases } = filterLocalUniqueReleases(pageReleases, accReleases); + + if (uniquePageReleases.length === 0) { + // page is empty, or only contains scenes from previous page return false; } - if (upcoming) { - return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming; + if (isUpcoming) { + return uniquePageReleases.length > 0 && argv.paginateUpcoming; } - // no longer works when there are no unique releases, need to keep track of /all/ releases regardless uniqueness - // console.log(localUniqueReleasesOnPage.length); - - if (localUniqueReleasesOnPage.length > 0) { + if (uniquePageReleases.length > 0) { if (argv.last) { - return totalReleases + releasesOnPage.length < argv.last; + return accReleases.length + pageReleases.length < argv.last; } - if (!hasDates) { - return totalReleases + releasesOnPage.length < argv.nullDateLimit; + if (!pageReleases.every(release => !!release.date)) { // some scenes don't have dates + return accReleases.length + pageReleases.length < argv.nullDateLimit; } if (argv.after) { - // this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020) - // checking unqiueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier - const oldestReleaseOnPage = releasesOnPage + const oldestReleaseOnPage = pageReleases .sort((releaseA, releaseB) => releaseB.date - releaseA.date) .slice(-1)[0]; @@ -110,47 +120,9 @@ function needNextPageLegacy(releasesOnPage, uniqueReleasesOnPage, localUniqueRel return false; } -async function scrapeReleasesLegacy(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) { - const releases = upcoming - ? await scraper.fetchUpcoming(entity, page, include, preData) - : await scraper.fetchLatest(entity, page, include, preData); - - if (!Array.isArray(releases)) { - // scraper is unable to fetch the releases and returned a HTTP code or null - logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`); - return acc; - } - - const releasesWithEntity = releases.map(release => ({ - ...release, - entity: release.entity || entity, // allow override - })); // attach entity the release is assigned to when stored - - const hasDates = releasesWithEntity.every(release => !!release.date); - - const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0))) - || (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after))) - || releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0)); - - const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force - ? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] } - : await filterUniqueReleases(limitedReleases, acc); - - const accReleases = { - uniqueReleases: acc.uniqueReleases.concat(uniqueReleases), - duplicateReleases: acc.duplicateReleases.concat(duplicateReleases), - }; - - if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) { - return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length); - } - - return accReleases; -} - -async function scrapeReleases(scraper, entity, preData, upcoming) { +async function scrapeReleases(scraper, entity, preData, isUpcoming) { async function scrapeReleasesPage(page, accReleases) { - const pageReleases = upcoming + const pageReleases = isUpcoming ? await scraper.fetchUpcoming(entity, page, include, preData) : await scraper.fetchLatest(entity, page, include, preData); @@ -160,16 +132,25 @@ async function scrapeReleases(scraper, entity, preData, upcoming) { return accReleases; } - return accReleases.concat(pageReleases); + const pageReleasesWithEntity = pageReleases.map(release => ({ ...release, entity })); + + if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming)) { + return scrapeReleasesPage(page + 1, accReleases.concat(pageReleasesWithEntity), isUpcoming); + } + + return accReleases.concat(pageReleasesWithEntity); } - const releases = await scrapeReleasesPage(1, emptyReleases); + const releases = await scrapeReleasesPage(argv.page || 1, []); const hasDates = releases.every(release => !!release.date); const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0))) || (hasDates && releases.filter(release => moment(release.date).isAfter(argv.after))) || releases.slice(0, Math.max(argv.nullDateLimit, 0)); + console.log(releases.length, limitedReleases.length); + + /* // attach entity the release is assigned to when stored const releasesWithEntity = limitedReleases.map(release => ({ ...release, @@ -179,6 +160,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming) { const { uniqueReleases, duplicateReleases } = argv.force ? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] } : await filterUniqueReleases(releasesWithEntity); + */ } async function scrapeLatestReleases(scraper, entity, preData) { @@ -187,7 +169,7 @@ async function scrapeLatestReleases(scraper, entity, preData) { } try { - return await scrapeReleases(scraper, entity, preData, false, argv.page || 1); + return await scrapeReleases(scraper, entity, preData, false); } catch (error) { if (argv.debug) { console.trace(error);