Added various tag photos. Renamed some toy tags.

DebaucheryLibrarian
2020-09-21 05:11:24 +02:00
parent a9c1a91571
commit 566c20ea7e
99 changed files with 249 additions and 151 deletions

@@ -200,6 +200,8 @@ async function findSourceDuplicates(baseMedias) {
     knex('media').whereIn('source_page', extractUrls),
   ]);
 
+  console.log(sourceUrls, existingSourceMedia);
+
   const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
   const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');
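The itemsByKey helper is not shown in this diff; a minimal sketch of what it presumably does, indexing rows by a column value in the spirit of lodash's keyBy, so duplicate checks become O(1) lookups by source URL:

// assumed shape of itemsByKey, not part of this commit
function itemsByKey(items, key) {
  return items.reduce((acc, item) => {
    acc[item[key]] = item;
    return acc;
  }, {});
}

// e.g. itemsByKey([{ source: 'https://a.example/1.jpg', id: 3 }], 'source')
// → { 'https://a.example/1.jpg': { source: 'https://a.example/1.jpg', id: 3 } }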
@@ -279,86 +281,94 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
 }
 
 async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath) {
-  const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
-  const thumbpath = path.join(thumbdir, filename);
+  try {
+    const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
+    const thumbpath = path.join(thumbdir, filename);
 
-  const lazydir = path.join(media.role, 'lazy', hashDir, hashSubDir);
-  const lazypath = path.join(lazydir, filename);
+    const lazydir = path.join(media.role, 'lazy', hashDir, hashSubDir);
+    const lazypath = path.join(lazydir, filename);
 
-  await Promise.all([
-    fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }),
-    fsPromises.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
-    fsPromises.mkdir(path.join(config.media.path, lazydir), { recursive: true }),
-  ]);
+    await Promise.all([
+      fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }),
+      fsPromises.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
+      fsPromises.mkdir(path.join(config.media.path, lazydir), { recursive: true }),
+    ]);
 
-  const image = sharp(media.file.path);
-  const info = await image.metadata();
-  const isProcessed = media.meta.subtype !== 'jpeg' || media.process;
+    const image = sharp(media.file.path);
+    const info = await image.metadata();
+    const isProcessed = media.meta.subtype !== 'jpeg' || media.process;
 
-  if (media.process) {
-    Object.entries(media.process).forEach(([operation, options]) => {
-      if (image[operation]) {
-        image[operation](...(Array.isArray(options) ? options : [options]));
-        return;
-      }
+    if (media.process) {
+      Object.entries(media.process).forEach(([operation, options]) => {
+        if (image[operation]) {
+          image[operation](...(Array.isArray(options) ? options : [options]));
+          return;
+        }
 
-      if (operation === 'crop') {
-        image.extract(...(Array.isArray(options) ? options : [options]));
-        return;
-      }
+        if (operation === 'crop') {
+          image.extract(...(Array.isArray(options) ? options : [options]));
+          return;
+        }
 
-      logger.warn(`Unknown image operation on ${media.id} (${media.src}): ${operation}`);
-    });
-  }
+        logger.warn(`Unknown image operation on ${media.id} (${media.src}): ${operation}`);
+      });
+    }
 
-  if (isProcessed) {
-    // convert to JPEG and write to permanent location
-    await image
-      .jpeg()
-      .toFile(path.join(config.media.path, filepath));
-  }
+    if (isProcessed) {
+      // convert to JPEG and write to permanent location
+      await image
+        .jpeg()
+        .toFile(path.join(config.media.path, filepath));
+    }
 
-  // generate thumbnail and lazy
-  await Promise.all([
-    image
-      .resize({
-        height: config.media.thumbnailSize,
-        withoutEnlargement: true,
-      })
-      .jpeg({ quality: config.media.thumbnailQuality })
-      .toFile(path.join(config.media.path, thumbpath)),
-    image
-      .resize({
-        height: config.media.lazySize,
-        withoutEnlargement: true,
-      })
-      .jpeg({ quality: config.media.lazyQuality })
-      .toFile(path.join(config.media.path, lazypath)),
-  ]);
+    // generate thumbnail and lazy
+    await Promise.all([
+      image
+        .resize({
+          height: config.media.thumbnailSize,
+          withoutEnlargement: true,
+        })
+        .jpeg({ quality: config.media.thumbnailQuality })
+        .toFile(path.join(config.media.path, thumbpath)),
+      image
+        .resize({
+          height: config.media.lazySize,
+          withoutEnlargement: true,
+        })
+        .jpeg({ quality: config.media.lazyQuality })
+        .toFile(path.join(config.media.path, lazypath)),
+    ]);
 
-  if (isProcessed) {
-    // remove temp file
-    await fsPromises.unlink(media.file.path);
-  } else {
-    // move temp file to permanent location
-    await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
-  }
+    if (isProcessed) {
+      // remove temp file
+      await fsPromises.unlink(media.file.path);
+    } else {
+      // move temp file to permanent location
+      await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
+    }
 
-  logger.silly(`Stored thumbnail, lazy and permanent media file for ${media.id} from ${media.src} at ${filepath}`);
+    logger.silly(`Stored thumbnail, lazy and permanent media file for ${media.id} from ${media.src} at ${filepath}`);
 
-  return {
-    ...media,
-    file: {
-      path: filepath,
-      thumbnail: thumbpath,
-      lazy: lazypath,
-    },
-    meta: {
-      ...media.meta,
-      width: info.width,
-      height: info.height,
-    },
-  };
+    return {
+      ...media,
+      file: {
+        path: filepath,
+        thumbnail: thumbpath,
+        lazy: lazypath,
+      },
+      meta: {
+        ...media.meta,
+        width: info.width,
+        height: info.height,
+      },
+    };
+  } catch (error) {
+    logger.error(`Failed to store ${media.id} from ${media.src} at ${filepath}: ${error.message}`);
 
+    await fsPromises.unlink(media.file.path);
+
+    return null;
+  }
 }
async function storeFile(media) {
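For context, the sharp resize pattern used in the hunk above, as a standalone sketch. It assumes only that sharp is installed; the sizes and qualities are illustrative stand-ins for the config.media values, and .clone() is used here so the two outputs do not share mutable resize state (the hunk itself reuses a single instance):

const sharp = require('sharp'); // npm install sharp

// hypothetical helper, not traxxx code: derive thumbnail and lazy variants
async function makeDerivatives(sourcePath, thumbPath, lazyPath) {
  const image = sharp(sourcePath);

  await Promise.all([
    image.clone()
      .resize({ height: 320, withoutEnlargement: true }) // stand-in for thumbnailSize
      .jpeg({ quality: 80 }) // stand-in for thumbnailQuality
      .toFile(thumbPath),
    image.clone()
      .resize({ height: 90, withoutEnlargement: true }) // stand-in for lazySize
      .jpeg({ quality: 50 }) // stand-in for lazyQuality
      .toFile(lazyPath),
  ]);
}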
@@ -374,6 +384,15 @@ async function storeFile(media) {
   const filedir = path.join(media.role, hashDir, hashSubDir);
   const filepath = path.join(filedir, filename);
 
+  if (argv.force) {
+    try {
+      // remove old file in case rename() does not overwrite (possibly on NFS setups)
+      await fsPromises.unlink(path.join(config.media.path, filepath));
+    } catch (error) {
+      // file probably didn't exist
+    }
+  }
+
   if (media.meta.type === 'image') {
     return storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath);
   }
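The empty catch above swallows every unlink error, which is fine for a best-effort cleanup; a slightly stricter variant of the same idiom (an alternative, not what the commit does) ignores only the missing-file case:

const fsPromises = require('fs').promises;

async function removeIfPresent(filepath) {
  try {
    await fsPromises.unlink(filepath);
  } catch (error) {
    // the file not existing is the expected case; anything else is a real error
    if (error.code !== 'ENOENT') throw error;
  }
}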
@@ -383,6 +402,7 @@ async function storeFile(media) {
     fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }),
   ]);
 
+  // move temp file to permanent location
   await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
 
   logger.silly(`Stored permanent media file for ${media.id} from ${media.src} at ${filepath}`);
@@ -13,6 +13,16 @@ const { fetchIncludedEntities } = require('./entities');
 const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
 
+function mapReleasesToSiteIdAndEntryId(acc, release) {
+  const entityId = release.entityId || release.entity.id;
+  const entryId = release.entryId || release.entryId;
+
+  if (!acc[entityId]) acc[entityId] = {};
+  acc[entityId][entryId] = true;
+
+  return acc;
+}
+
 async function filterUniqueReleases(latestReleases, accReleases) {
   const latestReleaseIdentifiers = latestReleases
     .map(release => [release.entity.id, release.entryId]);
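To illustrate the reducer added above: it builds a two-level presence map keyed by entity (site) ID, then entry ID. A quick run with made-up releases:

const releases = [
  { entity: { id: 5 }, entryId: 'abc' },
  { entityId: 5, entryId: 'def' },
  { entity: { id: 9 }, entryId: 'abc' },
];

const seen = releases.reduce(mapReleasesToSiteIdAndEntryId, {});
// → { 5: { abc: true, def: true }, 9: { abc: true } }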
@@ -26,52 +36,54 @@ async function filterUniqueReleases(latestReleases, accReleases) {
   // add entry IDs of accumulated releases to prevent an infinite scrape loop
   // when one page contains the same release as the previous
-  const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
+  const duplicateReleasesBySiteIdAndEntryId = duplicateReleases
     .concat(accReleases)
-    .reduce((acc, release) => {
-      const entityId = release.entityId || release.entity.id;
-      const entryId = release.entryId || release.entryId;
-
-      if (!acc[entityId]) acc[entityId] = {};
-      acc[entityId][entryId] = true;
-
-      return acc;
-    }, {});
+    .reduce(mapReleasesToSiteIdAndEntryId, {});
 
+  const localDuplicateReleasesBySiteIdAndEntryId = accReleases.reduce(mapReleasesToSiteIdAndEntryId, {});
+
-  const uniqueReleases = latestReleases.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);
+  const uniqueReleases = latestReleases.filter(release => !duplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
+  const localUniqueReleases = latestReleases.filter(release => !localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
 
-  return { uniqueReleases, duplicateReleases };
+  return {
+    uniqueReleases,
+    localUniqueReleases,
+    duplicateReleases,
+  };
 }
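The filters above lean on optional chaining so a site that has never been seen and a new entry on a known site both count as unique:

const seen = { 5: { abc: true } };

console.log(!seen[5]?.abc); // false → duplicate, filtered out
console.log(!seen[5]?.def); // true → new entry on a known site
console.log(!seen[9]?.abc); // true → unknown site entirely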
-function needNextPage(releasesOnPage, uniqueReleases, totalReleases, hasDates, upcoming) {
+function needNextPage(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) {
   if (releasesOnPage.length === 0) {
     return false;
   }
 
   if (upcoming) {
-    return uniqueReleases.length > 0 && argv.paginateUpcoming;
+    return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming;
   }
 
-  if (argv.last) {
-    // this will keep paginating until the second condition is met on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of September 2020)
-    // checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
-    return releasesOnPage.length > 0 && totalReleases + releasesOnPage.length < argv.last;
-  }
+  // no longer works when there are no unique releases, need to keep track of /all/ releases regardless of uniqueness
+  console.log(localUniqueReleasesOnPage.length);
 
-  if (!hasDates) {
-    return totalReleases + releasesOnPage.length < argv.nullDateLimit;
-  }
+  if (localUniqueReleasesOnPage.length > 0) {
+    if (argv.last) {
+      return totalReleases + releasesOnPage.length < argv.last;
+    }
 
-  if (argv.after) {
-    // this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of September 2020)
-    // checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
-    const oldestReleaseOnPage = releasesOnPage
-      .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
-      .slice(-1)[0];
-
-    if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
-      // oldest release on page is newer than the specified date cut-off
-      return true;
-    }
-  }
+    if (!hasDates) {
+      return totalReleases + releasesOnPage.length < argv.nullDateLimit;
+    }
+
+    if (argv.after) {
+      // this will keep paginating infinitely on sites that will keep serving the last page if you exceed the last page number (e.g. HardX as of September 2020)
+      // checking uniqueReleases > 0 could prevent that, but this would stop pagination prematurely if we already scraped a full page of data earlier
+      const oldestReleaseOnPage = releasesOnPage
+        .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
+        .slice(-1)[0];
+
+      if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
+        // oldest release on page is newer than the specified date cut-off
+        return true;
+      }
+    }
+  }
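The argv.after cut-off logic above, isolated into a small predicate for clarity (a sketch, not part of the commit): sort newest first, take the last element as the oldest release on the page, and keep paginating only while it is still newer than the cut-off date.

const moment = require('moment');

function pageStillAfterCutoff(releasesOnPage, after) {
  const oldestReleaseOnPage = releasesOnPage
    .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
    .slice(-1)[0];

  return moment(oldestReleaseOnPage.date).isAfter(after);
}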
@@ -100,8 +112,8 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false, page =
     || (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
     || releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));
 
-  const { uniqueReleases, duplicateReleases } = argv.force
-    ? { uniqueReleases: limitedReleases, duplicateReleases: [] }
+  const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
+    ? { uniqueReleases: limitedReleases, localUniqueReleases: limitedReleases, duplicateReleases: [] }
     : await filterUniqueReleases(limitedReleases, acc.uniqueReleases);
 
   const accReleases = {
@@ -109,7 +121,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false, page =
     duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
   };
 
-  if (needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcoming)) {
+  if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
     return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
   }
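The overall control flow of scrapeReleases, reduced to a toy model (not traxxx code): fetch a page, merge it into the accumulator, and recurse while a stop predicate such as needNextPage allows. fetchPage and shouldContinue are hypothetical stand-ins.

async function scrapeAllPages(fetchPage, shouldContinue, page = 1, acc = []) {
  const releasesOnPage = await fetchPage(page);
  const accumulated = acc.concat(releasesOnPage);

  // stop on an empty page, or when the predicate says the cut-off is reached
  if (releasesOnPage.length > 0 && shouldContinue(releasesOnPage, accumulated)) {
    return scrapeAllPages(fetchPage, shouldContinue, page + 1, accumulated);
  }

  return accumulated;
}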