'use strict';

const config = require('config');
const Promise = require('bluebird');
const fs = require('fs').promises;
const path = require('path');
const nanoid = require('nanoid/non-secure');
const mime = require('mime');
// const fileType = require('file-type');
const sharp = require('sharp');
const blake2 = require('blake2');

const logger = require('./logger')(__filename);
const argv = require('./argv');
const knex = require('./knex');
const http = require('./utils/http');
const { get } = require('./utils/qu');
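
// media files are content-addressed: the BLAKE2b digest, truncated to 24 bytes
// (48 hex characters), drives both the on-disk path and hash-based deduplication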
function getHash(buffer) {
  const hash = blake2.createHash('blake2b', { digestLength: 24 });
  hash.update(buffer);

  return hash.digest('hex');
}

async function getEntropy(buffer, source) {
  try {
    const { entropy } = await sharp(buffer).stats();

    return entropy;
  } catch (error) {
    logger.warn(`Failed to retrieve image entropy, using 7.5 for ${source.src}: ${error.message}`);

    return 7.5;
  }
}

async function getMeta(buffer, source) {
  try {
    const { width, height, size } = await sharp(buffer).metadata();

    return {
      width,
      height,
      size,
    };
  } catch (error) {
    logger.warn(`Failed to retrieve image metadata from ${source.src}: ${error.message}`);

    return {};
  }
}

async function getThumbnail(buffer, height = config.media.thumbnailSize) {
  try {
    // await here so resize failures are caught and logged below
    const thumbnail = await sharp(buffer)
      .resize({
        height,
        withoutEnlargement: true,
      })
      .jpeg({
        quality: config.media.thumbnailQuality,
      })
      .toBuffer();

    return thumbnail;
  } catch (error) {
    logger.error(`Failed to create thumbnail: ${error.message}`);
  }

  return null;
}
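
// Worked example: 7 medias with limit 3 gives chunkSize 2 and rest 1, so the
// chunks are sized 3/2/2; each chunk collapses into a single media whose sources
// are ordered by fallback priority across the chunk's members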
function sampleMedias(medias, limit = config.media.limit, preferLast = true) {
  // limit the media set, using the extras as fallbacks
  if (medias.length <= limit) {
    return medias;
  }

  const chunkSize = Math.floor(medias.length / limit);
  const rest = medias.length - (limit * chunkSize);

  const chunks = Array.from(
    { length: limit },
    (value, index) => {
      const start = (chunkSize * index) + Math.min(index, rest);

      return medias.slice(
        start,
        start + chunkSize + (index < rest ? 1 : 0),
      );
    },
  );

  // flip the last chunk so the very last image (often the best cumshot) is tried first
  const lastPreferredChunks = preferLast
    ? chunks.slice(0, -1).concat(chunks.slice(-1).reverse())
    : chunks;

  const groupedMedias = lastPreferredChunks.map((chunk) => {
    // merge the chunked medias into a single media with grouped fallback priorities,
    // so the first source of each media is preferred over all second sources, etc.
    const sources = chunk
      .reduce((accSources, media) => {
        media.sources.forEach((source, index) => {
          if (!accSources[index]) {
            accSources.push([source]);
            return;
          }

          accSources[index].push(source);
        });

        return accSources;
      }, [])
      .flat();

    return {
      id: chunk[0].id,
      role: chunk[0].role,
      sources,
    };
  });

  return groupedMedias;
}

function itemsByKey(items, key) {
  return items.reduce((acc, item) => ({ ...acc, [item[key]]: item }), {});
}
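
// scrapers may provide a source as a plain URL string, or as an object carrying
// either a direct src or an extract callback plus the page URL to apply it to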
function toBaseSource(rawSource) {
  if (rawSource.src || (rawSource.extract && rawSource.url)) {
    const baseSource = {};

    if (rawSource.src) baseSource.src = rawSource.src;
    if (rawSource.quality) baseSource.quality = rawSource.quality;
    if (rawSource.type) baseSource.type = rawSource.type;

    if (rawSource.url) baseSource.url = rawSource.url;
    if (rawSource.extract) baseSource.extract = rawSource.extract;

    if (rawSource.referer) baseSource.referer = rawSource.referer;
    if (rawSource.host) baseSource.host = rawSource.host;

    if (rawSource.copyright) baseSource.copyright = rawSource.copyright;
    if (rawSource.comment) baseSource.comment = rawSource.comment;
    if (rawSource.group) baseSource.group = rawSource.group;

    return baseSource;
  }

  if (typeof rawSource === 'string') {
    return {
      src: rawSource,
    };
  }

  return null;
}

function baseSourceToBaseMedia(baseSource, role) {
  if (Array.isArray(baseSource)) {
    if (baseSource.length > 0) {
      return {
        id: nanoid(),
        role,
        sources: baseSource,
      };
    }

    return null;
  }

  if (baseSource) {
    return {
      id: nanoid(),
      role,
      sources: [baseSource],
    };
  }

  return null;
}

function fallbackMediaToBaseMedia(rawMedia, role) {
  const baseSources = rawMedia
    .map(source => toBaseSource(source))
    .filter(Boolean);

  return baseSourceToBaseMedia(baseSources, role);
}

function toBaseMedias(rawMedias, role) {
  if (!rawMedias || rawMedias.length === 0) {
    return [];
  }

  const baseMedias = rawMedias.map((rawMedia) => {
    if (!rawMedia) {
      return null;
    }

    if (Array.isArray(rawMedia)) {
      // fallback sources provided
      return fallbackMediaToBaseMedia(rawMedia, role);
    }

    const baseSource = toBaseSource(rawMedia);

    return baseSourceToBaseMedia(baseSource, role);
  }).filter(Boolean);

  const sampledBaseMedias = sampleMedias(baseMedias);

  return sampledBaseMedias;
}
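
// look up media already in the database by direct source URL and by extract
// page URL, so known sources are never downloaded twice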
async function findSourceDuplicates(baseMedias) {
  const sourceUrls = baseMedias
    .map(baseMedia => baseMedia.sources.map(source => source.src))
    .flat()
    .filter(Boolean);

  const extractUrls = baseMedias
    .map(baseMedia => baseMedia.sources.map(source => source.url))
    .flat()
    .filter(Boolean);

  const [existingSourceMedia, existingExtractMedia] = await Promise.all([
    knex('media').whereIn('source', sourceUrls),
    knex('media').whereIn('source_page', extractUrls),
  ]);

  const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
  const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');

  return [existingSourceMediaByUrl, existingExtractMediaByUrl];
}
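
// split fetched medias into unique entries and duplicates, deduplicating by
// content hash both against the database and within the batch itself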
async function findHashDuplicates(medias) {
  const hashes = medias.map(media => media.meta?.hash || media.entry?.hash).filter(Boolean);

  const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
  const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');

  const uniqueHashMedias = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);

  const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => {
    if (!media.meta?.hash) {
      return acc;
    }

    if (acc.selfUniqueMediasByHash[media.meta.hash]) {
      acc.selfDuplicateMedias.push({
        ...media,
        use: acc.selfUniqueMediasByHash[media.meta.hash].id,
      });

      return acc;
    }

    acc.selfUniqueMediasByHash[media.meta.hash] = media;

    return acc;
  }, {
    selfDuplicateMedias: [],
    selfUniqueMediasByHash: {},
  });

  const selfUniqueHashMedias = Object.values(selfUniqueMediasByHash);

  const existingHashMedias = medias
    .filter(media => existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash])
    .map(media => ({
      ...media,
      entry: existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash],
    }))
    .concat(selfDuplicateMedias);

  return [selfUniqueHashMedias, existingHashMedias];
}
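
// an extract callback is only applied once its page actually has to be fetched;
// known extract URLs short-circuit with the existing database entry instead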
async function extractSource(baseSource, { existingExtractMediaByUrl }) {
  if (typeof baseSource.extract !== 'function' || !baseSource.url) {
    return baseSource;
  }

  const existingExtractMedia = existingExtractMediaByUrl[baseSource.url];

  if (existingExtractMedia) {
    // media entry found by extract URL
    return {
      ...baseSource,
      entry: existingExtractMedia,
    };
  }

  const res = await get(baseSource.url);

  if (res.ok) {
    const src = await baseSource.extract(res.item);

    return {
      ...baseSource,
      src,
    };
  }

  throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
}
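
// files are sharded by hash to keep directories small: a photo with hash
// 'abcdef…' is written to photos/ab/cd/ef….jpg, with its thumbnail under
// photos/thumbs/ab/cd/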
async function saveMedia(media) {
  const hashDir = media.meta.hash.slice(0, 2);
  const hashSubDir = media.meta.hash.slice(2, 4);
  const hashFilename = media.meta.hash.slice(4);

  const filename = media.quality
    ? `${hashFilename}_${media.quality}.${media.meta.extension}`
    : `${hashFilename}.${media.meta.extension}`;

  const filedir = path.join(media.role, hashDir, hashSubDir);
  const filepath = path.join(filedir, filename);

  if (media.meta.type === 'image') {
    const thumbnail = await getThumbnail(media.file.buffer);

    const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
    const thumbpath = path.join(thumbdir, filename);

    await Promise.all([
      fs.mkdir(path.join(config.media.path, filedir), { recursive: true }),
      fs.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
    ]);

    await Promise.all([
      fs.writeFile(path.join(config.media.path, filepath), media.file.buffer),
      fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
    ]);

    return {
      ...media,
      file: {
        // buffer is no longer needed, discard to free up memory
        path: filepath,
        thumbnail: thumbpath,
      },
    };
  }

  await fs.mkdir(path.join(config.media.path, filedir), { recursive: true });
  await fs.writeFile(path.join(config.media.path, filepath), media.file.buffer);

  return {
    ...media,
    file: {
      // buffer is no longer needed, discard to free up memory
      path: filepath,
    },
  };
}
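
// download a single source and derive its metadata: mimetype from the URL path,
// hash, entropy and dimensions from the response body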
async function fetchSource(source) {
  logger.silly(`Fetching media from ${source.src}`);

  // retry failed downloads up to three times, with a one-second delay in between
  async function attempt(attempts = 1) {
    try {
      const { pathname } = new URL(source.src);
      const mimetype = mime.getType(pathname);
      const extension = mime.getExtension(mimetype);
      const type = mimetype?.split('/')[0] || 'image';

      const res = await http.get(source.src, {
        ...(source.referer && { referer: source.referer }),
        ...(source.host && { host: source.host }),
      });

      if (!res.ok) {
        throw new Error(`Response ${res.status} not OK`);
      }

      const hash = getHash(res.body);
      const entropy = type === 'image' ? await getEntropy(res.body, source) : null;
      const { size, width, height } = type === 'image' ? await getMeta(res.body, source) : {};

      logger.silly(`Fetched media from ${source.src}`);

      return {
        ...source,
        file: {
          buffer: res.body,
        },
        meta: {
          mimetype,
          extension,
          type,
          hash,
          entropy,
          size,
          width,
          height,
        },
      };
    } catch (error) {
      logger.warn(`Failed attempt ${attempts}/3 to fetch ${source.src}: ${error.message}`);

      if (attempts < 3) {
        await Promise.delay(1000);
        return attempt(attempts + 1);
      }

      throw new Error(`Failed to fetch ${source.src}: ${error.message}`);
    }
  }

  return attempt(1);
}

async function trySource(baseSource, existingMedias) {
  // any error thrown here makes fetchMedia move on to the next source
  const extractedSource = await extractSource(baseSource, existingMedias);
  const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];

  if (extractedSource.entry) {
    logger.silly(`Media page URL already in database, not extracting ${baseSource.url}`);

    // media entry found during extraction, don't fetch
    return extractedSource;
  }

  if (existingSourceMedia) {
    logger.silly(`Media source URL already in database, skipping ${baseSource.src}`);

    // media entry found by source URL, don't fetch
    return {
      ...baseSource,
      entry: existingSourceMedia,
    };
  }

  return fetchSource(extractedSource);
}
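
// fetch a media item by trying its sources in order; if every source fails,
// the base media is returned unsaved so the rest of the batch can proceed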
async function fetchMedia(baseMedia, existingMedias) {
  try {
    const source = await baseMedia.sources.reduce(
      // try each source until one succeeds
      (result, baseSource) => result.catch(async () => trySource(baseSource, existingMedias)),
      Promise.reject(new Error()),
    );

    if (source.entry) {
      // don't save media, already in database
      return {
        ...baseMedia,
        ...source,
      };
    }

    return saveMedia({
      ...baseMedia,
      ...source,
    });
  } catch (error) {
    logger.warn(error.message);

    return baseMedia;
  }
}

function curateMediaEntry(media, index) {
  if (media.entry) {
    return media;
  }

  const curatedMediaEntry = {
    id: media.id,
    path: media.file.path,
    thumbnail: media.file.thumbnail,
    index,
    mime: media.meta.mimetype,
    hash: media.meta.hash,
    size: media.meta.size,
    width: media.meta.width,
    height: media.meta.height,
    entropy: media.meta.entropy,
    source: media.src,
    source_page: media.url,
    scraper: media.scraper,
    copyright: media.copyright,
    comment: media.comment,
  };

  return {
    ...media,
    newEntry: true,
    entry: curatedMediaEntry,
  };
}
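
// full media pipeline: dedupe by URL, fetch and save what's new, dedupe the
// results by hash, then persist the remaining entries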
async function storeMedias(baseMedias) {
  const [existingSourceMediaByUrl, existingExtractMediaByUrl] = await findSourceDuplicates(baseMedias);

  const savedMedias = await Promise.map(
    baseMedias,
    async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
    { concurrency: 10 },
  );

  const [uniqueHashMedias, existingHashMedias] = await findHashDuplicates(savedMedias);

  const newMediaWithEntries = uniqueHashMedias.map((media, index) => curateMediaEntry(media, index));
  const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);

  await knex('media').insert(newMediaEntries);

  return [...newMediaWithEntries, ...existingHashMedias];
}
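
// collect release media per role (as enabled via CLI flags), store them role by
// role, and link them to their releases through the releases_<role> join tables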
async function associateReleaseMedia(releases) {
  if (!argv.media) {
    return;
  }

  const baseMediasByReleaseId = releases.reduce((acc, release) => ({
    ...acc,
    [release.id]: [
      ...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []),
      ...(argv.images && argv.poster ? toBaseMedias(release.covers, 'covers') : []),
      ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos') : []),
      ...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []),
      ...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []),
    ],
  }), {});

  const baseMediasByRole = Object.values(baseMediasByReleaseId)
    .flat()
    .filter(Boolean)
    .reduce((acc, baseMedia) => {
      if (!acc[baseMedia.role]) acc[baseMedia.role] = [];
      acc[baseMedia.role].push(baseMedia);

      return acc;
    }, {});

  await Promise.reduce(['posters', 'covers', 'photos', 'teasers', 'trailers'], async (chain, role) => {
    // stage by role so posters are prioritized over photos and videos
    await chain;

    const baseMedias = baseMediasByRole[role];

    if (!baseMedias) {
      return;
    }

    const storedMedias = await storeMedias(baseMedias);
    const storedMediasById = itemsByKey(storedMedias, 'id');

    const associations = Object
      .entries(baseMediasByReleaseId)
      .reduce((acc, [releaseId, releaseBaseMedias]) => {
        releaseBaseMedias.forEach((baseMedia) => {
          const media = storedMediasById[baseMedia.id];

          if (media) {
            acc.push({
              release_id: releaseId,
              media_id: media.use || media.entry.id,
            });
          }
        });

        return acc;
      }, [])
      .filter(Boolean);

    // skip roles with no surviving associations rather than running an empty insert
    if (associations.length > 0) {
      await knex.raw(`${knex(`releases_${role}`).insert(associations)} ON CONFLICT DO NOTHING`);
    }
  }, Promise.resolve());
}

module.exports = {
  associateReleaseMedia,
};
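
// Hypothetical usage sketch (not part of the original module): the scraper
// pipeline is expected to hand curated releases to associateReleaseMedia,
// along the lines of:
//
//   const { associateReleaseMedia } = require('./media');
//
//   await associateReleaseMedia([{
//     id: 123, // release id from the database
//     poster: 'https://example.com/poster.jpg', // plain string source
//     photos: [
//       // an inner array holds fallback sources for a single photo
//       ['https://cdn-a.example.com/1.jpg', 'https://cdn-b.example.com/1.jpg'],
//     ],
//   }]);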