// traxxx/src/media.js

'use strict';
const config = require('config');
// 2020-03-31 02:05:31 +00:00
const util = require('util');
const Promise = require('bluebird');
// 2020-03-31 02:05:31 +00:00
const fs = require('fs').promises;
const path = require('path');
const nanoid = require('nanoid/non-secure');
const mime = require('mime');
const sharp = require('sharp');
const blake2 = require('blake2');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const knex = require('./knex');
const http = require('./utils/http');
const { get } = require('./utils/qu');
/**
 * Hash file contents with BLAKE2b.
 *
 * @param {Buffer} buffer - data to hash (the fetched response body in this file)
 * @returns {string} hex-encoded digest, created with a digest length of 24 bytes
 */
function getHash(buffer) {
  const hasher = blake2.createHash('blake2b', { digestLength: 24 });

  hasher.update(buffer);

  const hexDigest = hasher.digest('hex');

  return hexDigest;
}
// 2020-03-31 02:05:31 +00:00
/**
 * Read the entropy of an image through sharp's stats().
 *
 * Never rejects: when sharp cannot process the data, a warning is logged and
 * a fixed entropy of 7.5 is returned instead.
 *
 * @param {Buffer} buffer - image data handed to sharp
 * @param {object} [source] - source the data came from; only `src` is read, for the log message
 * @returns {Promise<number>} entropy reported by sharp, or 7.5 on failure
 */
async function getEntropy(buffer, source) {
  try {
    const { entropy } = await sharp(buffer).stats();

    return entropy;
  } catch (error) {
    // source is optional; dereferencing it unguarded here would throw from
    // inside the catch block and defeat the fallback value
    logger.warn(`Failed to retrieve image entropy, using 7.5 for ${source?.src ?? 'unknown source'}: ${error.message}`);

    return 7.5;
  }
}
// 2020-03-31 02:05:31 +00:00
/**
 * Read width, height and size of an image through sharp's metadata().
 *
 * Never rejects: when sharp cannot process the data, a warning is logged and
 * an empty object is returned, leaving all three properties undefined.
 *
 * @param {Buffer} buffer - image data handed to sharp
 * @param {object} [source] - source the data came from; only `src` is read, for the log message
 * @returns {Promise<{width?: number, height?: number, size?: number}>} selected metadata fields
 */
async function getMeta(buffer, source) {
  try {
    const { width, height, size } = await sharp(buffer).metadata();

    return { width, height, size };
  } catch (error) {
    // source is optional; dereferencing it unguarded here would throw from
    // inside the catch block and defeat the empty-object fallback
    logger.warn(`Failed to retrieve image metadata from ${source?.src ?? 'unknown source'}: ${error.message}`);

    return {};
  }
}
/**
 * Render a JPEG thumbnail of an image with sharp.
 *
 * The image is resized to the given height without enlarging it, and encoded
 * with the JPEG quality from config.media.thumbnailQuality.
 *
 * @param {Buffer} buffer - image data handed to sharp
 * @param {number} [height=config.media.thumbnailSize] - target height passed to sharp's resize()
 * @returns {Promise<Buffer|null>} thumbnail data, or null when sharp fails (the error is logged)
 */
async function getThumbnail(buffer, height = config.media.thumbnailSize) {
  try {
    // await inside the try block: returning the pending promise would let a
    // rejection from toBuffer() bypass the catch below entirely
    const thumbnail = await sharp(buffer)
      .resize({
        height,
        withoutEnlargement: true,
      })
      .jpeg({
        quality: config.media.thumbnailQuality,
      })
      .toBuffer();

    return thumbnail;
  } catch (error) {
    logger.error(`Failed to create thumbnail: ${error.message}`);

    return null;
  }
}
/**
 * Index a list of objects by the value of one of their properties.
 *
 * When several items share the same key value, the last one wins.
 * Builds the lookup in a single pass rather than copying the accumulator
 * for every item.
 *
 * @param {object[]} items - objects to index
 * @param {string} key - name of the property whose value becomes the lookup key
 * @returns {object} plain object mapping each key value to its item
 */
function itemsByKey(items, key) {
  return Object.fromEntries(items.map((item) => [item[key], item]));
}
/**
 * Normalize a raw source, as supplied for a release's media, into a base source.
 *
 * A non-empty string is treated as a direct URL and becomes `{ src }`. An
 * object is accepted when it has a `src`, or both an `extract` value and a
 * `url`; only its known, truthy properties are copied over. Anything else,
 * including null, undefined and the empty string, yields null.
 *
 * @param {string|object|null|undefined} rawSource - raw source description
 * @returns {object|null} base source, or null when the input is not usable
 */
function toBaseSource(rawSource) {
  // fallback arrays may contain empty slots; reading properties off
  // null/undefined would throw
  if (!rawSource) {
    return null;
  }

  if (typeof rawSource === 'string') {
    return { src: rawSource };
  }

  const isDirect = Boolean(rawSource.src);
  const isExtractable = Boolean(rawSource.extract && rawSource.url);

  if (!isDirect && !isExtractable) {
    return null;
  }

  const copiedKeys = ['src', 'quality', 'type', 'url', 'extract', 'referer', 'host', 'copyright', 'comment', 'group'];
  const baseSource = {};

  copiedKeys.forEach((key) => {
    if (rawSource[key]) {
      baseSource[key] = rawSource[key];
    }
  });

  return baseSource;
}
/**
 * Wrap one base source, or a list of fallback base sources, in a base media
 * object with a freshly generated id.
 *
 * @param {object|object[]|null} baseSource - a single source, or an array of fallback sources
 * @param {string} role - role stored on the media
 * @returns {{id: string, role: string, sources: object[]}|null} base media, or null
 *   for an empty array or a falsy source
 */
function baseSourceToBaseMedia(baseSource, role) {
  let sources;

  if (Array.isArray(baseSource)) {
    // an array of fallbacks is used as-is
    sources = baseSource;
  } else if (baseSource) {
    sources = [baseSource];
  } else {
    sources = [];
  }

  if (sources.length === 0) {
    return null;
  }

  return { id: nanoid(), role, sources };
}
/**
 * Build one base media from a list of alternative raw sources.
 *
 * Each raw source is normalized with toBaseSource; the ones that come back
 * falsy are left out before the rest is handed to baseSourceToBaseMedia.
 *
 * @param {Array} rawMedia - alternative raw sources for the same media
 * @param {string} role - role passed through to baseSourceToBaseMedia
 * @returns {object|null} whatever baseSourceToBaseMedia returns for the usable sources
 */
function fallbackMediaToBaseMedia(rawMedia, role) {
  const usableSources = rawMedia.reduce((collected, rawSource) => {
    const baseSource = toBaseSource(rawSource);

    if (baseSource) {
      collected.push(baseSource);
    }

    return collected;
  }, []);

  return baseSourceToBaseMedia(usableSources, role);
}
/**
 * Convert a list of raw media into base media for the given role.
 *
 * Falsy items are skipped. An array item is treated as a set of fallback
 * sources for one media; any other item is treated as a single source.
 * Items that do not produce a base media are left out of the result.
 *
 * @param {Array|null|undefined} rawMedias - raw media items
 * @param {string} role - role assigned to every resulting base media
 * @returns {object[]} base media; empty when rawMedias is missing or empty
 */
function toBaseMedias(rawMedias, role) {
  const hasRawMedias = Boolean(rawMedias) && rawMedias.length !== 0;

  if (!hasRawMedias) {
    return [];
  }

  return rawMedias.reduce((baseMedias, rawMedia) => {
    if (!rawMedia) {
      return baseMedias;
    }

    const baseMedia = Array.isArray(rawMedia)
      ? fallbackMediaToBaseMedia(rawMedia, role) // fallback sources provided
      : baseSourceToBaseMedia(toBaseSource(rawMedia), role);

    if (baseMedia) {
      baseMedias.push(baseMedia);
    }

    return baseMedias;
  }, []);
}
/**
 * Look up media rows that were already stored for any of the given sources.
 *
 * Direct source URLs (`src`) are matched against the `source` column and
 * extraction page URLs (`url`) against the `source_page` column of the
 * `media` table.
 *
 * @param {object[]} baseMedias - base media, each with a `sources` array
 * @returns {Promise<{existingSourceMediaByUrl: object, existingExtractMediaByUrl: object}>}
 *   matching rows indexed by `source` and by `source_page` respectively
 */
async function findSourceDuplicates(baseMedias) {
  const allSources = baseMedias.flatMap((baseMedia) => baseMedia.sources);

  const sourceUrls = allSources.map((source) => source.src).filter(Boolean);
  const extractUrls = allSources.map((source) => source.url).filter(Boolean);

  const sourceQuery = knex('media').whereIn('source', sourceUrls);
  const extractQuery = knex('media').whereIn('source_page', extractUrls);

  const [existingSourceMedia, existingExtractMedia] = await Promise.all([sourceQuery, extractQuery]);

  return {
    existingSourceMediaByUrl: itemsByKey(existingSourceMedia, 'source'),
    existingExtractMediaByUrl: itemsByKey(existingExtractMedia, 'source_page'),
  };
}
/**
 * Split media into those whose hash is new and those already in the database.
 *
 * A media's hash is taken from its existing `entry` if it has one, otherwise
 * from the `meta` produced when it was fetched. Media without any hash (for
 * example because every source failed to fetch) appear in neither list, so
 * they are never curated or inserted.
 *
 * @param {object[]} medias - media as returned by fetchMedia
 * @returns {Promise<{uniqueHashMedia: object[], existingHashMedia: object[]}>}
 *   `uniqueHashMedia` holds fetched media whose hash is not in the `media`
 *   table; `existingHashMedia` holds media whose hash is, with `entry` set to
 *   the matching row
 */
async function findHashDuplicates(medias) {
  const getMediaHash = (media) => media.entry?.hash || media.meta?.hash;

  const hashes = medias.map(getMediaHash).filter(Boolean);

  const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
  const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');

  // require a hash: media that failed to fetch has neither entry nor meta,
  // and must not be mistaken for new media
  const uniqueHashMedia = medias.filter((media) => !media.entry
    && media.meta?.hash
    && !existingHashMediaEntriesByHash[media.meta.hash]);

  const existingHashMedia = medias
    .filter((media) => getMediaHash(media) && existingHashMediaEntriesByHash[getMediaHash(media)])
    .map((media) => ({
      ...media,
      entry: existingHashMediaEntriesByHash[getMediaHash(media)],
    }));

  return { uniqueHashMedia, existingHashMedia };
}
/**
 * Resolve the direct source URL of a source that has to be extracted from a page.
 *
 * Sources without an `extract` function or without a `url` are returned
 * untouched. When the page URL already has a stored media entry, that entry
 * is attached and no request is made. Otherwise the page is requested and the
 * result of `extract` becomes the source's `src`.
 *
 * @param {object} baseSource - source to resolve
 * @param {object} existingMedias - lookups of stored media
 * @param {object} existingMedias.existingExtractMediaByUrl - media entries by page URL
 * @returns {Promise<object>} the source itself, or a copy with `entry` or `src` added
 * @throws {Error} when the page response is not OK
 */
async function extractSource(baseSource, { existingExtractMediaByUrl }) {
  const needsExtraction = typeof baseSource.extract === 'function' && Boolean(baseSource.url);

  if (!needsExtraction) {
    return baseSource;
  }

  const knownEntry = existingExtractMediaByUrl[baseSource.url];

  if (knownEntry) {
    // media entry found by extract URL
    return { ...baseSource, entry: knownEntry };
  }

  const res = await get(baseSource.url);

  if (!res.ok) {
    throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
  }

  const src = await baseSource.extract(res.item);

  return { ...baseSource, src };
}
// 2020-03-31 02:05:31 +00:00
/**
 * Write data to a path relative to config.media.path, creating the
 * containing directory first.
 *
 * @param {string} relativePath - destination relative to config.media.path
 * @param {Buffer} data - file contents
 * @returns {Promise<void>}
 */
async function writeMediaFile(relativePath, data) {
  const absolutePath = path.join(config.media.path, relativePath);

  await fs.mkdir(path.dirname(absolutePath), { recursive: true });
  await fs.writeFile(absolutePath, data);
}

/**
 * Store fetched media on disk under config.media.path.
 *
 * The location is derived from the role and the hash:
 * `<role>/<hash[0..2]>/<hash[2..4]>/<rest of hash>[_<quality>].<extension>`.
 * For images a thumbnail is written to the same location under
 * `<role>/thumbs/`. When no thumbnail could be generated, the media is saved
 * without one and `file.thumbnail` is left out.
 *
 * @param {object} media - fetched media with `role`, optional `quality`,
 *   `meta` (`hash`, `extension`, `type`) and `file.buffer`
 * @returns {Promise<object>} copy of the media whose `file` holds the relative
 *   `path` (and `thumbnail`, if written) instead of the buffer
 */
async function saveMedia(media) {
  const { hash, extension, type } = media.meta;

  const hashDir = hash.slice(0, 2);
  const hashSubDir = hash.slice(2, 4);
  const hashFilename = hash.slice(4);

  const filename = media.quality
    ? `${hashFilename}_${media.quality}.${extension}`
    : `${hashFilename}.${extension}`;

  const filepath = path.join(media.role, hashDir, hashSubDir, filename);
  const thumbpath = path.join(media.role, 'thumbs', hashDir, hashSubDir, filename);

  // getThumbnail resolves to null when the thumbnail cannot be generated
  const thumbnail = type === 'image'
    ? await getThumbnail(media.file.buffer)
    : null;

  const writes = [writeMediaFile(filepath, media.file.buffer)];

  if (thumbnail) {
    writes.push(writeMediaFile(thumbpath, thumbnail));
  }

  await Promise.all(writes);

  return {
    ...media,
    file: {
      // buffer is no longer needed, discard to free up memory
      path: filepath,
      ...(thumbnail && { thumbnail: thumbpath }),
    },
  };
}
/**
 * Derive MIME type, file extension and general type from a source URL's path.
 *
 * @param {string} src - source URL
 * @returns {{mimetype: string, extension: string, type: string}} `type` is the
 *   part of the MIME type before the slash
 * @throws {Error} when the URL is invalid or no MIME type is known for its path
 */
function getSourceFileType(src) {
  const { pathname } = new URL(src);
  const mimetype = mime.getType(pathname);

  if (!mimetype) {
    throw new Error(`Could not determine MIME type from ${pathname}`);
  }

  return {
    mimetype,
    extension: mime.getExtension(mimetype),
    type: mimetype.split('/')[0],
  };
}

/**
 * Download a source and gather its metadata.
 *
 * The request is attempted up to 3 times with a second in between. The file
 * type is derived from the URL up front, so a URL that can never succeed
 * fails immediately instead of going through the retries.
 *
 * @param {object} source - source with `src`, and optionally `referer` and `host`
 *   which are passed along to the request
 * @returns {Promise<object>} copy of the source with `file.buffer` (the response
 *   body) and `meta` (`mimetype`, `extension`, `type`, `hash`, and for images
 *   `entropy`, `size`, `width` and `height`)
 * @throws {Error} when the file type cannot be determined or every attempt fails
 */
async function fetchSource(source) {
  logger.silly(`Fetching media from ${source.src}`);

  const maxAttempts = 3;
  let fileType;

  try {
    fileType = getSourceFileType(source.src);
  } catch (error) {
    throw new Error(`Failed to fetch ${source.src}: ${error.message}`);
  }

  const { mimetype, extension, type } = fileType;

  async function attempt(attempts = 1) {
    try {
      const res = await http.get(source.src, {
        ...(source.referer && { referer: source.referer }),
        ...(source.host && { host: source.host }),
      });

      if (!res.ok) {
        throw new Error(`Response ${res.status} not OK`);
      }

      const isImage = type === 'image';

      const hash = getHash(res.body);
      // pass the source along, it is used to identify the file in warnings
      const entropy = isImage ? await getEntropy(res.body, source) : null;
      const { size, width, height } = isImage ? await getMeta(res.body, source) : {};

      logger.silly(`Fetched media from ${source.src}`);

      return {
        ...source,
        file: {
          buffer: res.body,
        },
        meta: {
          mimetype,
          extension,
          type,
          hash,
          entropy,
          size,
          width,
          height,
        },
      };
    } catch (error) {
      logger.warn(`Failed attempt ${attempts}/${maxAttempts} to fetch ${source.src}: ${error.message}`);

      if (attempts < maxAttempts) {
        await Promise.delay(1000);

        return attempt(attempts + 1);
      }

      throw new Error(`Failed to fetch ${source.src}: ${error.message}`);
    }
  }

  return attempt(1);
}
/**
 * Resolve a single source into media, fetching it only when it is not stored yet.
 *
 * The source is first passed through extractSource. If that, or a lookup of
 * the resulting `src`, turns up a stored media entry, the source is returned
 * with that `entry` attached and nothing is fetched. Errors from extraction
 * or fetching are not caught here; they reject the returned promise.
 *
 * @param {object} baseSource - source to resolve
 * @param {object} existingMedias - lookups of stored media; `existingSourceMediaByUrl`
 *   is read here, the whole object is passed on to extractSource
 * @param {object} [baseMedia] - media the source belongs to; currently unused
 * @param {number} [baseSourceIndex] - position of the source within the media; currently unused
 * @returns {Promise<object>} source with either `entry`, or `file` and `meta` from fetchSource
 */
async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex) { // eslint-disable-line no-unused-vars
  const extractedSource = await extractSource(baseSource, existingMedias);

  if (extractedSource.entry) {
    logger.silly(`Media page URL already in database, not extracting ${baseSource.url}`);

    // media entry found during extraction, don't fetch
    return extractedSource;
  }

  const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];

  if (existingSourceMedia) {
    // report the URL that was matched; baseSource.url is only set for extracted sources
    logger.silly(`Media source URL already in database, skipping ${extractedSource.src}`);

    // media entry found by source URL, don't fetch
    return {
      ...baseSource,
      entry: existingSourceMedia,
    };
  }

  return fetchSource(extractedSource);
}
/**
 * Fetch and store a media, trying its sources in order until one works.
 *
 * When the working source resolves to a stored media entry, nothing is saved.
 * Never rejects: if every source fails, or saving fails, a warning is logged
 * and the base media is returned as it came in.
 *
 * @param {object} baseMedia - base media with `id`, `role` and `sources`
 * @param {object} existingMedias - lookups of stored media, passed on to trySource
 * @returns {Promise<object>} base media merged with the working source (and
 *   saved to disk unless it has an `entry`), or the unchanged base media on failure
 */
async function fetchMedia(baseMedia, existingMedias) {
  try {
    let source = null;
    let lastError = new Error(`No sources to fetch for media ${baseMedia.id}`);

    // try each source until success
    for (const [baseSourceIndex, baseSource] of baseMedia.sources.entries()) {
      try {
        // sources are fallbacks for one another, so they are tried one at a time
        source = await trySource(baseSource, existingMedias, baseMedia, baseSourceIndex); // eslint-disable-line no-await-in-loop
        break;
      } catch (error) {
        lastError = error;
      }
    }

    if (!source) {
      throw lastError;
    }

    if (source.entry) {
      // don't save media, already in database
      return {
        ...baseMedia,
        ...source,
      };
    }

    // await inside the try block, so a failed save is handled below as well
    return await saveMedia({
      ...baseMedia,
      ...source,
    });
  } catch (error) {
    logger.warn(error.message);

    return baseMedia;
  }
}
// 2020-03-31 02:05:31 +00:00
/**
 * Attach a row for the `media` table to a fetched and saved media.
 *
 * Media that already has an `entry` is returned untouched.
 *
 * @param {object} media - saved media with `id`, `file`, `meta` and source details
 * @param {number} index - position stored in the entry's `index` field
 * @returns {object} the media itself when it has an entry; otherwise a copy
 *   with `entry` set to the new row and `newEntry` set to true
 */
function curateMediaEntry(media, index) {
  if (media.entry) {
    return media;
  }

  const { file, meta } = media;

  const curatedMediaEntry = {
    id: media.id,
    path: file.path,
    thumbnail: file.thumbnail,
    index,
    mime: meta.mimetype,
    hash: meta.hash,
    size: meta.size,
    width: meta.width,
    height: meta.height,
    entropy: meta.entropy,
    source: media.src,
    source_page: media.url,
    scraper: media.scraper,
    copyright: media.copyright,
    comment: media.comment,
  };

  return {
    ...media,
    newEntry: true,
    entry: curatedMediaEntry,
  };
}
// 2020-03-31 02:05:31 +00:00
/**
 * Fetch, save and register a batch of base media.
 *
 * Media already known by source URL, extraction page URL or content hash are
 * linked to their stored entry; only media with a new hash get a row inserted
 * into the `media` table.
 *
 * @param {object[]} baseMedias - base media to store
 * @returns {Promise<object[]>} newly inserted media followed by media that
 *   matched a stored entry, each with an `entry`
 */
async function storeMedias(baseMedias) {
  const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias);

  const savedMedias = await Promise.map(
    baseMedias,
    async (baseMedia) => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
  );

  const { uniqueHashMedia, existingHashMedia } = await findHashDuplicates(savedMedias);

  const newMediaWithEntries = uniqueHashMedia.map((media, index) => curateMediaEntry(media, index));

  // only entries created just now go into the database
  const newMediaEntries = newMediaWithEntries
    .filter((media) => media.newEntry)
    .map((media) => media.entry);

  if (newMediaEntries.length > 0) {
    await knex('media').insert(newMediaEntries);
  }

  return [...newMediaWithEntries, ...existingHashMedia];
}
/**
 * Store the media of a batch of releases and link it to them.
 *
 * Which media is handled depends on the command line arguments: nothing at
 * all without `argv.media`; posters and photos need `argv.images` plus
 * `argv.poster` / `argv.photos`; trailers and teasers need `argv.videos` plus
 * `argv.trailer` / `argv.teaser`. At most 5 photos are taken per release.
 * Associations are inserted into the `releases_<role>` table of each role.
 *
 * @param {object[]} releases - releases with an `id`, and optionally `poster`,
 *   `photos`, `trailer` and `teaser`
 * @returns {Promise<void>}
 */
async function associateReleaseMedia(releases) {
  if (!argv.media) {
    return;
  }

  // TODO: internal duplicate filtering
  // TODO: media count limits
  // TODO: catch errors
  const baseMediasByReleaseId = Object.fromEntries(releases.map((release) => [
    release.id,
    [
      ...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []),
      ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos').slice(0, 5) : []),
      ...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []),
      ...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []),
    ],
  ]));

  const baseMedias = Object.values(baseMediasByReleaseId).flat();

  if (baseMedias.length === 0) {
    // nothing to fetch or associate
    return;
  }

  const storedMedias = await storeMedias(baseMedias);
  const storedMediasById = itemsByKey(storedMedias, 'id');

  const associationsByRole = {};

  Object.entries(baseMediasByReleaseId).forEach(([releaseId, releaseBaseMedias]) => {
    releaseBaseMedias.forEach((baseMedia) => {
      const media = storedMediasById[baseMedia.id];

      // media that could not be stored is left unassociated
      if (!media) return;

      if (!associationsByRole[media.role]) associationsByRole[media.role] = [];

      associationsByRole[media.role].push({
        release_id: releaseId,
        media_id: media.entry.id,
      });
    });
  });

  logger.silly(`Associating release media: ${util.inspect(associationsByRole, { depth: null })}`);

  await Promise.all(Object.entries(associationsByRole)
    .map(async ([role, associations]) => knex(`releases_${role}`).insert(associations)));
}
// only the release media association is exposed, everything else is internal
module.exports = { associateReleaseMedia };