From 9a712e73717ce84e039cf492a752391e78d6e6b2 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Tue, 31 Mar 2020 04:05:31 +0200 Subject: [PATCH] Media module saves files. --- assets/components/tags/tags.vue | 1 + src/media.js | 231 +++++++++++++++++++++++--------- src/utils/http.js | 3 +- 3 files changed, 170 insertions(+), 65 deletions(-) diff --git a/assets/components/tags/tags.vue b/assets/components/tags/tags.vue index ee1b96e2..130bf893 100644 --- a/assets/components/tags/tags.vue +++ b/assets/components/tags/tags.vue @@ -43,6 +43,7 @@ async function mounted() { 'double-vaginal', 'da-tp', 'dv-tp', + 'triple-anal', ], oral: [ 'deepthroat', diff --git a/src/media.js b/src/media.js index 621789b6..ae64c50c 100644 --- a/src/media.js +++ b/src/media.js @@ -1,9 +1,9 @@ 'use strict'; const config = require('config'); +const util = require('util'); const Promise = require('bluebird'); -const fs = require('fs'); -const fsPromises = require('fs').promises; +const fs = require('fs').promises; const path = require('path'); const nanoid = require('nanoid/non-secure'); const mime = require('mime'); @@ -23,19 +23,19 @@ function getHash(buffer) { return hash.digest('hex'); } -async function getEntropy(buffer) { +async function getEntropy(buffer, source) { try { const { entropy } = await sharp(buffer).stats(); return entropy; } catch (error) { - logger.warn(`Failed to retrieve image entropy, using 7.5: ${error.message}`); + logger.warn(`Failed to retrieve image entropy, using 7.5 for ${source.src}: ${error.message}`); return 7.5; } } -async function getMeta(buffer) { +async function getMeta(buffer, source) { try { const { width, height, size } = await sharp(buffer).metadata(); @@ -45,7 +45,7 @@ async function getMeta(buffer) { size, }; } catch (error) { - logger.warn(`Failed to retrieve image metadata: ${error.message}`); + logger.warn(`Failed to retrieve image metadata from ${source.src}: ${error.message}`); return {}; } @@ -89,6 +89,10 @@ function toBaseSource(rawSource) { 
if (rawSource.referer) baseSource.referer = rawSource.referer; if (rawSource.host) baseSource.host = rawSource.host; + if (rawSource.copyright) baseSource.copyright = rawSource.copyright; + if (rawSource.comment) baseSource.comment = rawSource.comment; + if (rawSource.group) baseSource.group = rawSource.group; + return baseSource; } @@ -180,10 +184,21 @@ async function findSourceDuplicates(baseMedias) { } async function findHashDuplicates(medias) { - const mediaHashes = medias.map(media => media.file?.hash).filter(Boolean); - const existingHashMedia = await knex('media').whereIn('hash', mediaHashes); + const hashes = medias.map(media => media.meta?.hash || media.entry?.hash).filter(Boolean); - return itemsByKey(existingHashMedia, 'hash'); + const existingHashMediaEntries = await knex('media').whereIn('hash', hashes); + const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash'); + + const uniqueHashMedia = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]); + + const existingHashMedia = medias + .filter(media => existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash]) + .map(media => ({ + ...media, + entry: existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash], + })); + + return { uniqueHashMedia, existingHashMedia }; } async function extractSource(baseSource, { existingExtractMediaByUrl }) { @@ -198,7 +213,6 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) { return { ...baseSource, entry: existingExtractMedia, - src: existingExtractMedia.source, }; } @@ -216,7 +230,57 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) { throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`); } -async function fetchSource(source, baseMedia, baseSourceIndex) { +async function saveMedia(media) { + const hashDir = media.meta.hash.slice(0, 2); + const hashSubDir = media.meta.hash.slice(2, 4); + const hashFilename = 
media.meta.hash.slice(4); + + const filename = media.quality + ? `${hashFilename}_${media.quality}.${media.meta.extension}` + : `${hashFilename}.${media.meta.extension}`; + + const filedir = path.join(media.role, hashDir, hashSubDir); + const filepath = path.join(filedir, filename); + + if (media.meta.type === 'image') { + const thumbnail = await getThumbnail(media.file.buffer); + + const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir); + const thumbpath = path.join(thumbdir, filename); + + await Promise.all([ + fs.mkdir(path.join(config.media.path, filedir), { recursive: true }), + fs.mkdir(path.join(config.media.path, thumbdir), { recursive: true }), + ]); + + await Promise.all([ + fs.writeFile(path.join(config.media.path, filepath), media.file.buffer), + fs.writeFile(path.join(config.media.path, thumbpath), thumbnail), + ]); + + return { + ...media, + file: { + // buffer is no longer needed, discard to free up memory + path: filepath, + thumbnail: thumbpath, + }, + }; + } + + await fs.mkdir(path.join(config.media.path, filedir), { recursive: true }); + await fs.writeFile(path.join(config.media.path, filepath), media.file.buffer); + + return { + ...media, + file: { + // buffer is no longer needed, discard to free up memory + path: filepath, + }, + }; +} + +async function fetchSource(source) { logger.silly(`Fetching media from ${source.src}`); // attempts @@ -225,39 +289,32 @@ async function fetchSource(source, baseMedia, baseSourceIndex) { const { pathname } = new URL(source.src); const mimetype = mime.getType(pathname); const extension = mime.getExtension(mimetype); - const isImage = /image/.test(mimetype); - - const tempPath = path.join(config.media.path, 'temp', `${baseMedia.id}-${baseSourceIndex}.${extension}`); + const type = mimetype.split('/')[0]; const res = await http.get(source.src, { ...(source.referer && { referer: source.referer }), ...(source.host && { host: source.host }), - }, { - stream: true, }); if (!res.ok) { throw new 
Error(`Response ${res.status} not OK`); } - res.res.pipe(fs.createWriteStream(tempPath)); - - const buffer = res.body; - - console.log(res.body); - - const hash = getHash(buffer); - const entropy = isImage ? await getEntropy(buffer) : null; - const { size, width, height } = isImage ? await getMeta(buffer) : {}; + const hash = getHash(res.body); + const entropy = type === 'image' ? await getEntropy(res.body, source) : null; + const { size, width, height } = type === 'image' ? await getMeta(res.body, source) : {}; logger.silly(`Fetched media from ${source.src}`); return { ...source, file: { - temp: tempPath, + buffer: res.body, + }, + meta: { mimetype, extension, + type, hash, entropy, size, @@ -270,7 +327,6 @@ async function fetchSource(source, baseMedia, baseSourceIndex) { if (attempts < 3) { await Promise.delay(1000); - return attempt(attempts + 1); } @@ -287,16 +343,19 @@ async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex) { const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src]; if (extractedSource.entry) { + logger.silly(`Media page URL already in database, not extracting ${baseSource.url}`); + // media entry found during extraction, don't fetch return extractedSource; } if (existingSourceMedia) { + logger.silly(`Media source URL already in database, skipping ${baseSource.url}`); + // media entry found by source URL, don't fetch return { ...baseSource, entry: existingSourceMedia, - src: existingSourceMedia.source, }; } @@ -311,10 +370,18 @@ async function fetchMedia(baseMedia, existingMedias) { Promise.reject(new Error()), ); - return { + if (source.entry) { + // don't save media, already in database + return { + ...baseMedia, + ...source, + }; + } + + return saveMedia({ ...baseMedia, ...source, - }; + }); } catch (error) { logger.warn(error.message); @@ -322,40 +389,52 @@ } } -function saveMedia(media, existingHashMediaByHash) { - const existingHashMedia = 
existingHashMediaByHash[media.file.hash]; - - if (existingHashMedia) { - return { - ...media, - entry: existingHashMedia, - }; +function curateMediaEntry(media, index) { + if (media.entry) { + return media; } - const hashDir = media.file.hash.slice(0, 2); - const hashSubDir = media.file.hash.slice(2, 4); - const hashFilename = media.file.hash.slice(4); + const curatedMediaEntry = { + id: media.id, + path: media.file.path, + thumbnail: media.file.thumbnail, + index, + mime: media.meta.mimetype, + hash: media.meta.hash, + size: media.meta.size, + width: media.meta.width, + height: media.meta.height, + entropy: media.meta.entropy, + source: media.src, + source_page: media.url, + scraper: media.scraper, + copyright: media.copyright, + comment: media.comment, + }; - const filename = media.quality - ? `${hashFilename}_${media.quality}.${media.file.extension}` - : `${hashFilename}.${media.file.extension}`; - - const filedir = path.join(media.role, hashDir, hashSubDir); - const filepath = path.join(filedir, filename); - - console.log(filedir, filepath); - - return media; + return { + ...media, + newEntry: true, + entry: curatedMediaEntry, + }; } async function storeMedias(baseMedias) { - await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true }); - const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias); - const fetchedMedias = await Promise.map(baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl })); - const existingHashMediaByHash = await findHashDuplicates(fetchedMedias); - const savedMedias = await Promise.map(fetchedMedias, async fetchedMedia => saveMedia(fetchedMedia, existingHashMediaByHash)); + const savedMedias = await Promise.map( + baseMedias, + async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }), + ); + + const { uniqueHashMedia, existingHashMedia } = await findHashDuplicates(savedMedias); + + 
const newMediaWithEntries = uniqueHashMedia.map((media, index) => curateMediaEntry(media, index)); + const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry); + + await knex('media').insert(newMediaEntries); + + return [...newMediaWithEntries, ...existingHashMedia]; } async function associateReleaseMedia(releases) { @@ -363,14 +442,18 @@ async function associateReleaseMedia(releases) { if (!argv.media) { return; } + // TODO: internal duplicate filtering + // TODO: media count limits + // TODO: catch errors + const baseMediasByReleaseId = releases.reduce((acc, release) => ({ ...acc, - [release.id]: { - poster: argv.images && argv.poster && toBaseMedias([release.poster], 'posters'), - photos: argv.images && argv.photos && toBaseMedias(release.photos, 'photos').slice(0, 5), - trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer], 'trailers'), - teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser], 'teasers'), - }, + [release.id]: [ + ...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []), + ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos').slice(0, 5) : []), + ...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []), + ...(argv.videos && argv.teaser ? 
toBaseMedias([release.teaser], 'teasers') : []), + ], }), {}); const baseMedias = Object.values(baseMediasByReleaseId) @@ -378,7 +461,29 @@ async function associateReleaseMedia(releases) { .flat(2) .filter(Boolean); - await storeMedias(baseMedias); + const storedMedias = await storeMedias(baseMedias); + const storedMediasById = itemsByKey(storedMedias, 'id'); + + const associationsByRole = Object.entries(baseMediasByReleaseId).reduce((acc, [releaseId, releaseBaseMedias]) => { + releaseBaseMedias.forEach((baseMedia) => { + const media = storedMediasById[baseMedia.id]; + + if (!media) return; + if (!acc[media.role]) acc[media.role] = []; + + acc[media.role].push({ + release_id: releaseId, + media_id: media.entry.id, + }); + }); + + return acc; + }, {}); + + console.log(util.inspect(associationsByRole, null, null)); + + await Promise.all(Object.entries(associationsByRole) + .map(async ([role, associations]) => knex(`releases_${role}`).insert(associations))); } module.exports = { diff --git a/src/utils/http.js b/src/utils/http.js index 281baac0..71acd3f9 100644 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -72,10 +72,9 @@ queue.define('http', async ({ const json = Buffer.isBuffer(res.body) ? null : res.body; return { - res, + ...res, html, json, - pipe: res.pipe, ok: res.statusCode >= 200 && res.statusCode <= 299, code: res.statusCode, status: res.statusCode,