'use strict'; const config = require('config'); const Promise = require('bluebird'); const fs = require('fs'); const fsPromises = require('fs').promises; const path = require('path'); const { PassThrough } = require('stream'); const nanoid = require('nanoid/non-secure'); const mime = require('mime'); // const fileType = require('file-type'); const sharp = require('sharp'); const blake2 = require('blake2'); const logger = require('./logger')(__filename); const argv = require('./argv'); const knex = require('./knex'); const http = require('./utils/http'); const { get } = require('./utils/qu'); function sampleMedias(medias, limit = config.media.limit, preferLast = true) { // limit media sets, use extras as fallbacks if (medias.length <= limit) { return medias; } const chunkSize = Math.floor(medias.length / limit); const rest = medias.length - (limit * chunkSize); const chunks = Array.from( { length: limit }, (value, index) => { const start = (chunkSize * index) + Math.min(index, rest); return medias.slice( start, start + chunkSize + (index < rest ? 1 : 0), ); }, ); // flip last chunk so the very last image (often the best cumshot) is tried first const lastPreferredChunks = preferLast ? chunks.slice(0, -1).concat(chunks.slice(-1).reverse()) : chunks; const groupedMedias = lastPreferredChunks.map((chunk) => { // merge chunked medias into single media with grouped fallback priorities, // so the first sources of each media is preferred over all second sources, etc. const sources = chunk .reduce((accSources, media) => { media.sources.forEach((source, index) => { if (!accSources[index]) { accSources.push([source]); return; } accSources[index].push(source); }); return accSources; }, []) .flat(); return { id: chunk[0].id, role: chunk[0].role, sources, }; }); return groupedMedias; } function itemsByKey(items, key) { return items.reduce((acc, item) => ({ ...acc, [item[key]]: item }), {}); } function toBaseSource(rawSource) { if (rawSource.src || (rawSource.extract && rawSource.url)) { const baseSource = {}; if (rawSource.src) baseSource.src = rawSource.src; if (rawSource.quality) baseSource.quality = rawSource.quality; if (rawSource.type) baseSource.type = rawSource.type; if (rawSource.url) baseSource.url = rawSource.url; if (rawSource.extract) baseSource.extract = rawSource.extract; if (rawSource.referer) baseSource.referer = rawSource.referer; if (rawSource.host) baseSource.host = rawSource.host; if (rawSource.copyright) baseSource.copyright = rawSource.copyright; if (rawSource.comment) baseSource.comment = rawSource.comment; if (rawSource.group) baseSource.group = rawSource.group; return baseSource; } if (typeof rawSource === 'string') { return { src: rawSource, }; } return null; } function baseSourceToBaseMedia(baseSource, role) { if (Array.isArray(baseSource)) { if (baseSource.length > 0) { return { id: nanoid(), role, sources: baseSource, }; } return null; } if (baseSource) { return { id: nanoid(), role, sources: [baseSource], }; } return null; } function fallbackMediaToBaseMedia(rawMedia, role) { const baseSources = rawMedia .map(source => toBaseSource(source)) .filter(Boolean); return baseSourceToBaseMedia(baseSources, role); } function toBaseMedias(rawMedias, role) { if (!rawMedias || rawMedias.length === 0) { return []; } const baseMedias = rawMedias.map((rawMedia) => { if (!rawMedia) { return null; } if (Array.isArray(rawMedia)) { // fallback sources provided return fallbackMediaToBaseMedia(rawMedia, role); } const baseSource = toBaseSource(rawMedia); return baseSourceToBaseMedia(baseSource, role); }).filter(Boolean); const sampledBaseMedias = sampleMedias(baseMedias); return sampledBaseMedias; } async function findSourceDuplicates(baseMedias) { const sourceUrls = baseMedias .map(baseMedia => baseMedia.sources.map(source => source.src)) .flat() .filter(Boolean); const extractUrls = baseMedias .map(baseMedia => baseMedia.sources.map(source => source.url)) .flat() .filter(Boolean); const [existingSourceMedia, existingExtractMedia] = await Promise.all([ knex('media').whereIn('source', sourceUrls), knex('media').whereIn('source_page', extractUrls), ]); const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source'); const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page'); return [existingSourceMediaByUrl, existingExtractMediaByUrl]; } async function findHashDuplicates(medias) { const hashes = medias.map(media => media.meta?.hash || media.entry?.hash).filter(Boolean); const existingHashMediaEntries = await knex('media').whereIn('hash', hashes); const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash'); const uniqueHashMedias = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]); const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => { if (!media.meta?.hash) { return acc; } if (acc.selfUniqueMediasByHash[media.meta.hash]) { acc.selfDuplicateMedias.push({ ...media, use: acc.selfUniqueMediasByHash[media.meta.hash].id, }); return acc; } acc.selfUniqueMediasByHash[media.meta.hash] = media; return acc; }, { selfDuplicateMedias: [], selfUniqueMediasByHash: {}, }); const selfUniqueHashMedias = Object.values(selfUniqueMediasByHash); const existingHashMedias = medias .filter(media => existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash]) .map(media => ({ ...media, entry: existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash], })) .concat(selfDuplicateMedias); return [selfUniqueHashMedias, existingHashMedias]; } async function extractSource(baseSource, { existingExtractMediaByUrl }) { if (typeof baseSource.extract !== 'function' || !baseSource.url) { return baseSource; } const existingExtractMedia = existingExtractMediaByUrl[baseSource.url]; if (existingExtractMedia) { // media entry found by extract URL return { ...baseSource, entry: existingExtractMedia, }; } const res = await get(baseSource.url); if (res.ok) { const src = await baseSource.extract(res.item); return { ...baseSource, src, }; } throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`); } async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath) { const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir); const thumbpath = path.join(thumbdir, filename); const lazydir = path.join(media.role, 'lazy', hashDir, hashSubDir); const lazypath = path.join(lazydir, filename); await Promise.all([ fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }), fsPromises.mkdir(path.join(config.media.path, thumbdir), { recursive: true }), fsPromises.mkdir(path.join(config.media.path, lazydir), { recursive: true }), ]); const image = sharp(media.file.path); const info = await image.metadata(); // generate thumbnail and lazy await Promise.all([ image .resize({ height: config.media.thumbnailSize, withoutEnlargement: true, }) .jpeg({ quality: config.media.thumbnailQuality }) .toFile(path.join(config.media.path, thumbpath)), image .resize({ height: config.media.lazySize, withoutEnlargement: true, }) .jpeg({ quality: config.media.lazyQuality }) .toFile(path.join(config.media.path, lazypath)), ]); if (media.meta.subtype === 'jpeg') { // move temp file to permanent location await fsPromises.rename(media.file.path, path.join(config.media.path, filepath)); } else { // convert to JPEG and write to permanent location await sharp(media.file.path) .jpeg() .toFile(path.join(config.media.path, filepath)); // remove temp file await fsPromises.unlink(media.file.path); } logger.silly(`Stored thumbnail, lazy and permanent media file for ${media.id} from ${media.src} at ${filepath}`); return { ...media, file: { path: filepath, thumbnail: thumbpath, lazy: lazypath, }, meta: { ...media.meta, width: info.width, height: info.height, }, }; } async function storeFile(media) { try { const hashDir = media.meta.hash.slice(0, 2); const hashSubDir = media.meta.hash.slice(2, 4); const hashFilename = media.meta.hash.slice(4); const filename = media.quality ? `${hashFilename}_${media.quality}.${media.meta.extension}` : `${hashFilename}.${media.meta.extension}`; const filedir = path.join(media.role, hashDir, hashSubDir); const filepath = path.join(filedir, filename); if (media.meta.type === 'image') { return storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath); } const [stat] = await Promise.all([ fsPromises.stat(media.file.path), fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }), ]); await fsPromises.rename(media.file.path, path.join(config.media.path, filepath)); logger.silly(`Stored permanent media file for ${media.id} from ${media.src} at ${filepath}`); return { ...media, file: { path: filepath, }, meta: { ...media.meta, size: stat.size, }, }; } catch (error) { logger.warn(`Failed to store ${media.src}: ${error.message}`); return null; } } async function fetchSource(source, baseMedia) { logger.silly(`Fetching media from ${source.src}`); // attempts async function attempt(attempts = 1) { try { const tempFilePath = path.join(config.media.path, 'temp', `${baseMedia.id}`); const hasher = new blake2.Hash('blake2b'); hasher.setEncoding('hex'); const tempFileTarget = fs.createWriteStream(tempFilePath); const hashStream = new PassThrough(); let size = 0; hashStream.on('data', (chunk) => { size += chunk.length; hasher.write(chunk); }); const res = await http.get(source.src, { ...(source.referer && { referer: source.referer }), ...(source.host && { host: source.host }), }, { stream: true, // sources are fetched in parallel, don't gobble up memory transforms: [hashStream], destination: tempFileTarget, }); hasher.end(); const hash = hasher.read(); const { pathname } = new URL(source.src); const mimetype = res.headers['content-type'] || mime.getType(pathname); const [type, subtype] = mimetype.split('/'); const extension = mime.getExtension(mimetype); if (!res.ok) { throw new Error(`Response ${res.status} not OK`); } return { ...source, file: { path: tempFilePath, }, meta: { hash, mimetype, extension, type, subtype, size, }, }; } catch (error) { logger.warn(`Failed attempt ${attempts}/3 to fetch ${source.src}: ${error.message}`); if (attempts < 3) { await Promise.delay(1000); return attempt(attempts + 1); } throw new Error(`Failed to fetch ${source.src}: ${error.message}`); } } return attempt(1); } async function trySource(baseSource, existingMedias, baseMedia) { // catch error and try the next source const extractedSource = await extractSource(baseSource, existingMedias); const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src]; if (!argv.force && extractedSource.entry) { logger.silly(`Media page URL already in database, not extracting ${baseSource.url}`); // media entry found during extraction, don't fetch return extractedSource; } if (!argv.force && existingSourceMedia) { logger.silly(`Media source URL already in database, skipping ${baseSource.src}`); // media entry found by source URL, don't fetch return { ...baseSource, entry: existingSourceMedia, }; } return fetchSource(extractedSource, baseMedia); } async function fetchMedia(baseMedia, existingMedias) { try { const source = await baseMedia.sources.reduce( // try each source until success (result, baseSource, baseSourceIndex) => result.catch(async (error) => { if (error.message) { logger.warn(error.message); } return trySource(baseSource, existingMedias, baseMedia, baseSourceIndex); }), Promise.reject(new Error()), ); return { ...baseMedia, ...source, }; } catch (error) { logger.warn(error.message); return baseMedia; } } function curateMediaEntry(media, index) { if (media.entry) { return media; } const curatedMediaEntry = { id: media.id, path: media.file.path, thumbnail: media.file.thumbnail, lazy: media.file.lazy, index, mime: media.meta.mimetype, hash: media.meta.hash, size: media.meta.size, width: media.meta.width, height: media.meta.height, entropy: media.meta.entropy, source: media.src, source_page: media.url, scraper: media.scraper, copyright: media.copyright, comment: media.comment, }; return { ...media, newEntry: true, entry: curatedMediaEntry, }; } async function storeMedias(baseMedias) { await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true }); const [existingSourceMediaByUrl, existingExtractMediaByUrl] = await findSourceDuplicates(baseMedias); const fetchedMedias = await Promise.map( baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }), ); const [uniqueHashMedias, existingHashMedias] = await findHashDuplicates(fetchedMedias); const savedMedias = await Promise.map( uniqueHashMedias, async baseMedia => storeFile(baseMedia), ); const newMediaWithEntries = savedMedias.map((media, index) => curateMediaEntry(media, index)); const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry); await knex('media').insert(newMediaEntries); return [...newMediaWithEntries, ...existingHashMedias]; } async function associateReleaseMedia(releases) { if (!argv.media) { return; } const baseMediasByReleaseId = releases.reduce((acc, release) => ({ ...acc, [release.id]: [ ...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []), ...(argv.images && argv.poster ? toBaseMedias(release.covers, 'covers') : []), ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos') : []), ...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []), ...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []), ], }), {}); const baseMediasByRole = Object.values(baseMediasByReleaseId) .flat() .filter(Boolean) .reduce((acc, baseMedia) => { if (!acc[baseMedia.role]) acc[baseMedia.role] = []; acc[baseMedia.role].push(baseMedia); return acc; }, {}); await Promise.reduce(['posters', 'covers', 'photos', 'teasers', 'trailers'], async (chain, role) => { // stage by role so posters are prioritized over photos and videos await chain; const baseMedias = baseMediasByRole[role]; if (!baseMedias) { return; } const storedMedias = await storeMedias(baseMedias); const storedMediasById = itemsByKey(storedMedias, 'id'); const associations = Object .entries(baseMediasByReleaseId) .reduce((acc, [releaseId, releaseBaseMedias]) => { releaseBaseMedias.forEach((baseMedia) => { const media = storedMediasById[baseMedia.id]; if (media) { acc.push({ release_id: releaseId, media_id: media.use || media.entry.id, }); } }); return acc; }, []) .filter(Boolean); if (associations.length > 0) { await knex.raw(`${knex(`releases_${role}`).insert(associations)} ON CONFLICT DO NOTHING`); } }, Promise.resolve()); } module.exports = { associateReleaseMedia, };