Added various tag photos. Renamed some toy tags.
This commit is contained in:
@@ -13,6 +13,16 @@ const { fetchIncludedEntities } = require('./entities');
|
||||
|
||||
// Empty result shape for the release-filtering step.
// NOTE(review): the updated filterUniqueReleases in this commit returns
// { uniqueReleases, localUniqueReleases, duplicateReleases }, but this
// constant has no localUniqueReleases field — verify callers that fall back
// to emptyReleases don't read .localUniqueReleases (would be undefined).
const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
|
||||
|
||||
/**
 * Reducer that indexes releases into a two-level lookup table:
 * acc[entityId][entryId] = true.
 *
 * Accepts releases in either shape seen in this file: scraped releases carry
 * a nested `entity.id`, accumulated/stored ones may carry a flat `entityId`.
 *
 * @param {Object} acc - accumulator object, mutated in place
 * @param {Object} release - release with `entityId` or `entity.id`, plus `entryId`
 * @returns {Object} the same accumulator, for use with Array#reduce
 */
function mapReleasesToSiteIdAndEntryId(acc, release) {
	const entityId = release.entityId || release.entity.id;
	// original read `release.entryId || release.entryId` — a redundant
	// self-fallback (copy-paste artifact of the entityId line); `x || x`
	// is just `x`, so this simplification is behavior-identical
	const entryId = release.entryId;

	if (!acc[entityId]) acc[entityId] = {};
	acc[entityId][entryId] = true;

	return acc;
}
|
||||
|
||||
async function filterUniqueReleases(latestReleases, accReleases) {
|
||||
const latestReleaseIdentifiers = latestReleases
|
||||
.map(release => [release.entity.id, release.entryId]);
|
||||
@@ -26,52 +36,54 @@ async function filterUniqueReleases(latestReleases, accReleases) {
|
||||
|
||||
// add entry IDs of accumulated releases to prevent an infinite scrape loop
|
||||
// when one page contains the same release as the previous
|
||||
const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
|
||||
const duplicateReleasesBySiteIdAndEntryId = duplicateReleases
|
||||
.concat(accReleases)
|
||||
.reduce((acc, release) => {
|
||||
const entityId = release.entityId || release.entity.id;
|
||||
const entryId = release.entryId || release.entryId;
|
||||
.reduce(mapReleasesToSiteIdAndEntryId, {});
|
||||
|
||||
if (!acc[entityId]) acc[entityId] = {};
|
||||
acc[entityId][entryId] = true;
|
||||
const localDuplicateReleasesBySiteIdAndEntryId = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {});
|
||||
|
||||
return acc;
|
||||
}, {});
|
||||
const uniqueReleases = latestReleases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
|
||||
const localUniqueReleases = latestReleases.filter(release => !localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
|
||||
|
||||
const uniqueReleases = latestReleases.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);
|
||||
|
||||
return { uniqueReleases, duplicateReleases };
|
||||
return {
|
||||
uniqueReleases,
|
||||
localUniqueReleases,
|
||||
duplicateReleases,
|
||||
};
|
||||
}
|
||||
|
||||
function needNextPage(releasesOnPage, uniqueReleases, totalReleases, hasDates, upcoming) {
|
||||
function needNextPage(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) {
|
||||
if (releasesOnPage.length === 0) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (upcoming) {
|
||||
return uniqueReleases.length > 0 && argv.paginateUpcoming;
|
||||
return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming;
|
||||
}
|
||||
|
||||
if (argv.last) {
|
||||
// this will keep paginating until the second condition is met on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020)
|
||||
	// checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
|
||||
return releasesOnPage.length > 0 && totalReleases + releasesOnPage.length < argv.last;
|
||||
}
|
||||
// no longer works when there are no unique releases, need to keep track of /all/ releases regardless uniqueness
|
||||
console.log(localUniqueReleasesOnPage.length);
|
||||
|
||||
if (!hasDates) {
|
||||
return totalReleases + releasesOnPage.length < argv.nullDateLimit;
|
||||
}
|
||||
if (localUniqueReleasesOnPage.length > 0) {
|
||||
if (argv.last) {
|
||||
return totalReleases + releasesOnPage.length < argv.last;
|
||||
}
|
||||
|
||||
if (argv.after) {
|
||||
// this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020)
|
||||
	// checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
|
||||
const oldestReleaseOnPage = releasesOnPage
|
||||
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
|
||||
.slice(-1)[0];
|
||||
if (!hasDates) {
|
||||
return totalReleases + releasesOnPage.length < argv.nullDateLimit;
|
||||
}
|
||||
|
||||
if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
|
||||
// oldest release on page is newer than the specified date cut-off
|
||||
return true;
|
||||
if (argv.after) {
|
||||
// this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of september 2020)
|
||||
	// checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
|
||||
const oldestReleaseOnPage = releasesOnPage
|
||||
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
|
||||
.slice(-1)[0];
|
||||
|
||||
if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
|
||||
// oldest release on page is newer than the specified date cut-off
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -100,8 +112,8 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false, page =
|
||||
|| (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
|
||||
|| releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));
|
||||
|
||||
const { uniqueReleases, duplicateReleases } = argv.force
|
||||
? { uniqueReleases: limitedReleases, duplicateReleases: [] }
|
||||
const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
|
||||
? { uniqueReleases: limitedReleases, localUniqueReleases: limitedReleases, duplicateReleases: [] }
|
||||
: await filterUniqueReleases(limitedReleases, acc.uniqueReleases);
|
||||
|
||||
const accReleases = {
|
||||
@@ -109,7 +121,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false, page =
|
||||
duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
|
||||
};
|
||||
|
||||
if (needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcoming)) {
|
||||
if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
|
||||
return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user