Added self-hash filtering to media module. Moved Girl Girl back to Jules Jordan.

commit ad7874649f
parent 9a712e7371
date   2020-04-02 01:10:50 +02:00

8 changed files with 54 additions and 43 deletions


@@ -1,7 +1,7 @@
 'use strict';
 
 const config = require('config');
-const util = require('util');
+// const util = require('util');
 const Promise = require('bluebird');
 const fs = require('fs').promises;
 const path = require('path');
@@ -177,10 +177,7 @@ async function findSourceDuplicates(baseMedias) {
   const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
   const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');
 
-  return {
-    existingSourceMediaByUrl,
-    existingExtractMediaByUrl,
-  };
+  return [existingSourceMediaByUrl, existingExtractMediaByUrl];
 }
 
 async function findHashDuplicates(medias) {
@@ -189,16 +186,37 @@ async function findHashDuplicates(medias) {
   const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
   const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
 
-  const uniqueHashMedia = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);
+  const uniqueHashMedias = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);
 
-  const existingHashMedia = medias
+  const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => {
+    if (acc.selfUniqueMediasByHash[media.meta.hash]) {
+      acc.selfDuplicateMedias.push({
+        ...media,
+        use: acc.selfUniqueMediasByHash[media.meta.hash].id,
+      });
+
+      return acc;
+    }
+
+    acc.selfUniqueMediasByHash[media.meta.hash] = media;
+
+    return acc;
+  }, {
+    selfDuplicateMedias: [],
+    selfUniqueMediasByHash: {},
+  });
+
+  const selfUniqueHashMedias = Object.values(selfUniqueMediasByHash);
+
+  const existingHashMedias = medias
     .filter(media => existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash])
     .map(media => ({
       ...media,
       entry: existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash],
-    }));
+    }))
+    .concat(selfDuplicateMedias);
 
-  return { uniqueHashMedia, existingHashMedia };
+  return [selfUniqueHashMedias, existingHashMedias];
 }
 
 async function extractSource(baseSource, { existingExtractMediaByUrl }) {
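The hunk above is the core of the new self-hash filtering: within a single batch, the first media item with a given hash is kept as unique, and every later item with the same hash is rerouted to it via a `use` pointer rather than being stored again. A minimal standalone sketch of that reduce, with made-up ids and hashes (the real objects carry more fields):

// Sketch of the self-deduplication pass; input objects are simplified stand-ins.
function partitionSelfDuplicates(medias) {
  const { selfDuplicateMedias, selfUniqueMediasByHash } = medias.reduce((acc, media) => {
    if (acc.selfUniqueMediasByHash[media.meta.hash]) {
      // Hash already seen in this batch: keep a pointer to the first occurrence.
      acc.selfDuplicateMedias.push({ ...media, use: acc.selfUniqueMediasByHash[media.meta.hash].id });
      return acc;
    }

    acc.selfUniqueMediasByHash[media.meta.hash] = media;
    return acc;
  }, { selfDuplicateMedias: [], selfUniqueMediasByHash: {} });

  return [Object.values(selfUniqueMediasByHash), selfDuplicateMedias];
}

const [uniques, duplicates] = partitionSelfDuplicates([
  { id: 'a', meta: { hash: 'f00d' } },
  { id: 'b', meta: { hash: 'f00d' } }, // same hash as 'a'
  { id: 'c', meta: { hash: 'cafe' } },
]);
// uniques: a and c; duplicates: b with use === 'a'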
@@ -337,7 +355,7 @@ async function fetchSource(source) {
   return attempt(1);
 }
 
-async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex) {
+async function trySource(baseSource, existingMedias) {
   // catch error and try the next source
   const extractedSource = await extractSource(baseSource, existingMedias);
   const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];
@@ -350,7 +368,7 @@ async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex)
   }
 
   if (existingSourceMedia) {
-    logger.silly(`Media source URL already in database, skipping ${baseSource.url}`);
+    logger.silly(`Media source URL already in database, skipping ${baseSource.src}`);
 
     // media entry found by source URL, don't fetch
     return {
@@ -359,7 +377,7 @@ async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex)
     };
   }
 
-  return fetchSource(extractedSource, baseMedia, baseSourceIndex, 1);
+  return fetchSource(extractedSource);
 }
 
 async function fetchMedia(baseMedia, existingMedias) {
@@ -420,21 +438,21 @@ function curateMediaEntry(media, index) {
 }
 
 async function storeMedias(baseMedias) {
-  const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias);
+  const [existingSourceMediaByUrl, existingExtractMediaByUrl] = await findSourceDuplicates(baseMedias);
 
   const savedMedias = await Promise.map(
     baseMedias,
     async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
   );
 
-  const { uniqueHashMedia, existingHashMedia } = await findHashDuplicates(savedMedias);
+  const [uniqueHashMedias, existingHashMedias] = await findHashDuplicates(savedMedias);
 
-  const newMediaWithEntries = uniqueHashMedia.map((media, index) => curateMediaEntry(media, index));
-  const newMediaEntries = newMediaWithEntries.filter(media => !media.newEntry).map(media => media.entry);
+  const newMediaWithEntries = uniqueHashMedias.map((media, index) => curateMediaEntry(media, index));
+  const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);
 
   await knex('media').insert(newMediaEntries);
 
-  return [...newMediaWithEntries, ...existingHashMedia];
+  return [...newMediaWithEntries, ...existingHashMedias];
 }
 
 async function associateReleaseMedia(releases) {
@@ -445,12 +463,13 @@ async function associateReleaseMedia(releases) {
   // TODO: internal duplicate filtering
   // TODO: media count limits
   // TODO: catch errors
+  // TODO: stage by role
 
   const baseMediasByReleaseId = releases.reduce((acc, release) => ({
     ...acc,
     [release.id]: [
       ...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []),
-      ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos').slice(0, 5) : []),
+      ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos') : []),
       ...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []),
       ...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []),
     ],
@@ -473,17 +492,15 @@ async function associateReleaseMedia(releases) {
       acc[media.role].push({
         release_id: releaseId,
-        media_id: media.entry.id,
+        media_id: media.use || media.entry.id,
       });
     });
 
     return acc;
   }, {});
 
-  console.log(util.inspect(associationsByRole, null, null));
-
   await Promise.all(Object.entries(associationsByRole)
-    .map(async ([role, associations]) => knex(`releases_${role}`).insert(associations)));
+    .map(async ([role, associations]) => knex.raw(`${knex(`releases_${role}`).insert(associations)} ON CONFLICT DO NOTHING`)));
 }
 
 module.exports = {
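The last hunk makes the association insert idempotent: a Knex query builder stringifies to its SQL, so interpolating it into a template string and appending ON CONFLICT DO NOTHING via knex.raw turns duplicate associations into no-ops. A minimal sketch, assuming PostgreSQL and a unique constraint on (release_id, media_id); the table name pattern follows the diff, the helper name is invented:

// Sketch: idempotent bulk insert by stringifying the Knex builder into raw SQL.
// Assumes PostgreSQL and a unique index on (release_id, media_id).
async function insertAssociations(knex, role, associations) {
  const insertSql = knex(`releases_${role}`).insert(associations).toString();
  return knex.raw(`${insertSql} ON CONFLICT DO NOTHING`);
}

// e.g. insertAssociations(knex, 'photos', [{ release_id: 1, media_id: 'm1' }]);

Later Knex releases added an equivalent query-builder API (.onConflict().ignore()), which avoids assembling raw SQL by hand.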