Media module saves fetched files to hash-based paths, records new media entries, and associates them with releases.

ThePendulum 2020-03-31 04:05:31 +02:00
parent 4eaacf5697
commit 9a712e7371
3 changed files with 170 additions and 65 deletions

View File

@@ -43,6 +43,7 @@ async function mounted() {
       'double-vaginal',
       'da-tp',
       'dv-tp',
+      'triple-anal',
     ],
     oral: [
       'deepthroat',

View File

@@ -1,9 +1,9 @@
 'use strict';

 const config = require('config');
+const util = require('util');
 const Promise = require('bluebird');
-const fs = require('fs');
-const fsPromises = require('fs').promises;
+const fs = require('fs').promises;
 const path = require('path');
 const nanoid = require('nanoid/non-secure');
 const mime = require('mime');
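The import change above swaps the callback-style fs module for its promise-based counterpart, which the new saveMedia function below relies on. A minimal sketch of the pattern, with a hypothetical target directory:

    'use strict';

    const fs = require('fs').promises;
    const path = require('path');

    async function demo() {
      // { recursive: true } creates missing parents and tolerates existing
      // directories, which makes hash-sharded trees safe to create concurrently
      const dir = path.join('media', 'photos', '9a', '71'); // hypothetical path
      await fs.mkdir(dir, { recursive: true });
      await fs.writeFile(path.join(dir, 'example.jpeg'), Buffer.from('not a real image'));
    }

    demo().catch(console.error);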
@@ -23,19 +23,19 @@ function getHash(buffer) {
   return hash.digest('hex');
 }

-async function getEntropy(buffer) {
+async function getEntropy(buffer, source) {
   try {
     const { entropy } = await sharp(buffer).stats();

     return entropy;
   } catch (error) {
-    logger.warn(`Failed to retrieve image entropy, using 7.5: ${error.message}`);
+    logger.warn(`Failed to retrieve image entropy, using 7.5 for ${source.src}: ${error.message}`);

     return 7.5;
   }
 }

-async function getMeta(buffer) {
+async function getMeta(buffer, source) {
   try {
     const { width, height, size } = await sharp(buffer).metadata();
@@ -45,7 +45,7 @@ async function getMeta(buffer) {
       size,
     };
   } catch (error) {
-    logger.warn(`Failed to retrieve image metadata: ${error.message}`);
+    logger.warn(`Failed to retrieve image metadata from ${source.src}: ${error.message}`);

     return {};
   }
@@ -89,6 +89,10 @@ function toBaseSource(rawSource) {
   if (rawSource.referer) baseSource.referer = rawSource.referer;
   if (rawSource.host) baseSource.host = rawSource.host;
+  if (rawSource.copyright) baseSource.copyright = rawSource.copyright;
+  if (rawSource.comment) baseSource.comment = rawSource.comment;
+  if (rawSource.group) baseSource.group = rawSource.group;

   return baseSource;
 }
@@ -180,10 +184,21 @@ async function findSourceDuplicates(baseMedias) {
 }

 async function findHashDuplicates(medias) {
-  const mediaHashes = medias.map(media => media.file?.hash).filter(Boolean);
-  const existingHashMedia = await knex('media').whereIn('hash', mediaHashes);
-
-  return itemsByKey(existingHashMedia, 'hash');
+  const hashes = medias.map(media => media.meta?.hash || media.entry?.hash).filter(Boolean);
+
+  const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
+  const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
+
+  const uniqueHashMedia = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);
+
+  const existingHashMedia = medias
+    .filter(media => existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash])
+    .map(media => ({
+      ...media,
+      entry: existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash],
+    }));
+
+  return { uniqueHashMedia, existingHashMedia };
 }

 async function extractSource(baseSource, { existingExtractMediaByUrl }) {
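findHashDuplicates now partitions the fetched media into new and already-known items instead of returning a bare hash lookup. A self-contained sketch of the same partitioning, with the knex query stubbed out and itemsByKey reimplemented as an assumption:

    // assumed shape of itemsByKey: build a { [key]: item } lookup from an array
    const itemsByKey = (items, key) => Object.fromEntries(items.map(item => [item[key], item]));

    function partitionByHash(medias, existingEntries) {
      const entriesByHash = itemsByKey(existingEntries, 'hash');

      // never seen before: no entry attached and hash not in the database
      const uniqueHashMedia = medias.filter(media => !media.entry && !entriesByHash[media.meta?.hash]);

      // already known: attach and reuse the existing database entry
      const existingHashMedia = medias
        .filter(media => entriesByHash[media.entry?.hash || media.meta?.hash])
        .map(media => ({ ...media, entry: entriesByHash[media.entry?.hash || media.meta?.hash] }));

      return { uniqueHashMedia, existingHashMedia };
    }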
@@ -198,7 +213,6 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
     return {
       ...baseSource,
       entry: existingExtractMedia,
-      src: existingExtractMedia.source,
     };
   }
@@ -216,7 +230,57 @@
   throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
 }

-async function fetchSource(source, baseMedia, baseSourceIndex) {
+async function saveMedia(media) {
+  const hashDir = media.meta.hash.slice(0, 2);
+  const hashSubDir = media.meta.hash.slice(2, 4);
+  const hashFilename = media.meta.hash.slice(4);
+
+  const filename = media.quality
+    ? `${hashFilename}_${media.quality}.${media.meta.extension}`
+    : `${hashFilename}.${media.meta.extension}`;
+
+  const filedir = path.join(media.role, hashDir, hashSubDir);
+  const filepath = path.join(filedir, filename);
+
+  if (media.meta.type === 'image') {
+    const thumbnail = await getThumbnail(media.file.buffer);
+
+    const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
+    const thumbpath = path.join(thumbdir, filename);
+
+    await Promise.all([
+      fs.mkdir(path.join(config.media.path, filedir), { recursive: true }),
+      fs.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
+    ]);
+
+    await Promise.all([
+      fs.writeFile(path.join(config.media.path, filepath), media.file.buffer),
+      fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
+    ]);
+
+    return {
+      ...media,
+      file: {
+        // buffer is no longer needed, discard to free up memory
+        path: filepath,
+        thumbnail: thumbpath,
+      },
+    };
+  }
+
+  await fs.mkdir(path.join(config.media.path, filedir), { recursive: true });
+  await fs.writeFile(path.join(config.media.path, filepath), media.file.buffer);
+
+  return {
+    ...media,
+    file: {
+      // buffer is no longer needed, discard to free up memory
+      path: filepath,
+    },
+  };
+}
+
+async function fetchSource(source) {
   logger.silly(`Fetching media from ${source.src}`);

   // attempts
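saveMedia derives the storage location from the content hash: the first two pairs of hex characters become nested directories, which keeps any single directory from accumulating an unbounded number of files. A sketch of the resulting path (hash shortened for illustration):

    const path = require('path');

    const hash = '9a712e7371'; // content hash, truncated for illustration
    const role = 'photos';     // e.g. posters, photos, trailers, teasers

    const filedir = path.join(role, hash.slice(0, 2), hash.slice(2, 4));
    const filename = `${hash.slice(4)}.jpeg`;

    console.log(path.join(filedir, filename)); // photos/9a/71/2e7371.jpeg

Thumbnails land in a parallel role/thumbs/... tree under the same filename, so a file and its thumbnail can both be located from the hash alone.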
@@ -225,39 +289,32 @@ async function fetchSource(source, baseMedia, baseSourceIndex) {
   const { pathname } = new URL(source.src);
   const mimetype = mime.getType(pathname);
   const extension = mime.getExtension(mimetype);
-  const isImage = /image/.test(mimetype);
-
-  const tempPath = path.join(config.media.path, 'temp', `${baseMedia.id}-${baseSourceIndex}.${extension}`);
+  const type = mimetype.split('/')[0];

   const res = await http.get(source.src, {
     ...(source.referer && { referer: source.referer }),
     ...(source.host && { host: source.host }),
-  }, {
-    stream: true,
   });

   if (!res.ok) {
     throw new Error(`Response ${res.status} not OK`);
   }

-  res.res.pipe(fs.createWriteStream(tempPath));
-
-  const buffer = res.body;
-
-  console.log(res.body);
-
-  const hash = getHash(buffer);
-  const entropy = isImage ? await getEntropy(buffer) : null;
-  const { size, width, height } = isImage ? await getMeta(buffer) : {};
+  const hash = getHash(res.body);
+  const entropy = type === 'image' ? await getEntropy(res.body, source) : null;
+  const { size, width, height } = type === 'image' ? await getMeta(res.body, source) : {};

   logger.silly(`Fetched media from ${source.src}`);

   return {
     ...source,
     file: {
-      temp: tempPath,
+      buffer: res.body,
+    },
+    meta: {
       mimetype,
       extension,
+      type,
       hash,
       entropy,
       size,
@@ -270,7 +327,6 @@ async function fetchSource(source, baseMedia, baseSourceIndex) {
     if (attempts < 3) {
       await Promise.delay(1000);
-
       return attempt(attempts + 1);
     }
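The unchanged lines around this hunk belong to fetchSource's retry loop; a minimal standalone sketch of that pattern, assuming Bluebird's Promise.delay and the three-attempt limit shown above:

    const Promise = require('bluebird');

    async function withRetries(fetchOnce) {
      async function attempt(attempts) {
        try {
          return await fetchOnce();
        } catch (error) {
          if (attempts < 3) {
            await Promise.delay(1000); // wait a second before retrying
            return attempt(attempts + 1);
          }

          throw error; // give up after the third failure
        }
      }

      return attempt(1);
    }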
@@ -287,16 +343,19 @@ async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex)
   const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];

   if (extractedSource.entry) {
+    logger.silly(`Media page URL already in database, not extracting ${baseSource.url}`);
+
     // media entry found during extraction, don't fetch
     return extractedSource;
   }

   if (existingSourceMedia) {
+    logger.silly(`Media source URL already in database, skipping ${baseSource.url}`);
+
     // media entry found by source URL, don't fetch
     return {
       ...baseSource,
       entry: existingSourceMedia,
-      src: existingSourceMedia.source,
     };
   }
@@ -311,10 +370,18 @@ async function fetchMedia(baseMedia, existingMedias) {
       Promise.reject(new Error()),
     );

-    return {
+    if (source.entry) {
+      // don't save media, already in database
+      return {
+        ...baseMedia,
+        ...source,
+      };
+    }
+
+    return saveMedia({
       ...baseMedia,
       ...source,
-    };
+    });
   } catch (error) {
     logger.warn(error.message);
@@ -322,40 +389,52 @@
   }
 }

-function saveMedia(media, existingHashMediaByHash) {
-  const existingHashMedia = existingHashMediaByHash[media.file.hash];
-
-  if (existingHashMedia) {
-    return {
-      ...media,
-      entry: existingHashMedia,
-    };
-  }
-
-  const hashDir = media.file.hash.slice(0, 2);
-  const hashSubDir = media.file.hash.slice(2, 4);
-  const hashFilename = media.file.hash.slice(4);
-
-  const filename = media.quality
-    ? `${hashFilename}_${media.quality}.${media.file.extension}`
-    : `${hashFilename}.${media.file.extension}`;
-
-  const filedir = path.join(media.role, hashDir, hashSubDir);
-  const filepath = path.join(filedir, filename);
-
-  console.log(filedir, filepath);
-
-  return media;
+function curateMediaEntry(media, index) {
+  if (media.entry) {
+    return media;
+  }
+
+  const curatedMediaEntry = {
+    id: media.id,
+    path: media.file.path,
+    thumbnail: media.file.thumbnail,
+    index,
+    mime: media.meta.mimetype,
+    hash: media.meta.hash,
+    size: media.meta.size,
+    width: media.meta.width,
+    height: media.meta.height,
+    entropy: media.meta.entropy,
+    source: media.src,
+    source_page: media.url,
+    scraper: media.scraper,
+    copyright: media.copyright,
+    comment: media.comment,
+  };
+
+  return {
+    ...media,
+    newEntry: true,
+    entry: curatedMediaEntry,
+  };
 }

 async function storeMedias(baseMedias) {
-  await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true });
-
   const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias);

-  const fetchedMedias = await Promise.map(baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }));
-  const existingHashMediaByHash = await findHashDuplicates(fetchedMedias);
-
-  const savedMedias = await Promise.map(fetchedMedias, async fetchedMedia => saveMedia(fetchedMedia, existingHashMediaByHash));
+  const savedMedias = await Promise.map(
+    baseMedias,
+    async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
+  );
+
+  const { uniqueHashMedia, existingHashMedia } = await findHashDuplicates(savedMedias);
+
+  const newMediaWithEntries = uniqueHashMedia.map((media, index) => curateMediaEntry(media, index));
+  const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);
+
+  await knex('media').insert(newMediaEntries);
+
+  return [...newMediaWithEntries, ...existingHashMedia];
 }

 async function associateReleaseMedia(releases) {
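After this change, storeMedias runs as a fixed pipeline: fetch and save each base media, partition by hash, curate database entries for the genuinely new files, and batch-insert them. A curated entry maps onto the media table roughly as follows (all values invented for illustration):

    // illustrative entry as produced by curateMediaEntry
    const entry = {
      id: 'V1StGXR8_Z5jdHi6B-myT',                  // nanoid assigned to the base media
      path: 'photos/9a/71/2e7371.jpeg',             // relative to config.media.path
      thumbnail: 'photos/thumbs/9a/71/2e7371.jpeg', // images only
      index: 0,                                     // position within this batch
      mime: 'image/jpeg',
      hash: '9a712e7371',                           // truncated for illustration
      size: 123456,
      width: 1920,
      height: 1080,
      entropy: 7.2,
      source: 'https://example.com/media/1.jpeg',
      source_page: 'https://example.com/release/1',
      scraper: 'example',
      copyright: null,
      comment: null,
    };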
@@ -363,14 +442,18 @@ async function associateReleaseMedia(releases) {
     return;
   }

+  // TODO: internal duplicate filtering
+  // TODO: media count limits
+  // TODO: catch errors
+
   const baseMediasByReleaseId = releases.reduce((acc, release) => ({
     ...acc,
-    [release.id]: {
-      poster: argv.images && argv.poster && toBaseMedias([release.poster], 'posters'),
-      photos: argv.images && argv.photos && toBaseMedias(release.photos, 'photos').slice(0, 5),
-      trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer], 'trailers'),
-      teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser], 'teasers'),
-    },
+    [release.id]: [
+      ...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []),
+      ...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos').slice(0, 5) : []),
+      ...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []),
+      ...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []),
+    ],
   }), {});

   const baseMedias = Object.values(baseMediasByReleaseId)
@@ -378,7 +461,29 @@
     .flat(2)
     .filter(Boolean);

-  await storeMedias(baseMedias);
+  const storedMedias = await storeMedias(baseMedias);
+  const storedMediasById = itemsByKey(storedMedias, 'id');
+
+  const associationsByRole = Object.entries(baseMediasByReleaseId).reduce((acc, [releaseId, releaseBaseMedias]) => {
+    releaseBaseMedias.forEach((baseMedia) => {
+      const media = storedMediasById[baseMedia.id];
+
+      if (!media) return;
+      if (!acc[media.role]) acc[media.role] = [];
+
+      acc[media.role].push({
+        release_id: releaseId,
+        media_id: media.entry.id,
+      });
+    });
+
+    return acc;
+  }, {});
+
+  console.log(util.inspect(associationsByRole, null, null));
+
+  await Promise.all(Object.entries(associationsByRole)
+    .map(async ([role, associations]) => knex(`releases_${role}`).insert(associations)));
 }

 module.exports = {
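The reduce in associateReleaseMedia groups join rows by role so that each releases_* table receives a single batch insert. The intermediate structure looks roughly like this (ids invented for illustration):

    // illustrative shape of associationsByRole before the inserts
    const associationsByRole = {
      posters: [
        { release_id: 101, media_id: 'V1StGXR8_Z5jdHi6B-myT' },
      ],
      photos: [
        { release_id: 101, media_id: '7oNY3yr9kQbfMpWzxCdeF' },
        { release_id: 102, media_id: 'Kq2hZ8wTnR4xVbGcJdAsU' },
      ],
    };

    // each role then becomes one query, e.g.
    // knex('releases_photos').insert(associationsByRole.photos);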

View File

@@ -72,10 +72,9 @@ queue.define('http', async ({
   const json = Buffer.isBuffer(res.body) ? null : res.body;

   return {
-    res,
+    ...res,
     html,
     json,
-    pipe: res.pipe,
     ok: res.statusCode >= 200 && res.statusCode <= 299,
     code: res.statusCode,
     status: res.statusCode,
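Spreading ...res into the return value keeps the raw response fields available alongside the convenience properties, replacing the dropped pipe passthrough. A sketch of a consumer, assuming a local module path for this queue helper:

    const http = require('./http'); // hypothetical path to this queue module

    async function fetchJson(url) {
      const res = await http.get(url);

      if (!res.ok) {
        throw new Error(`Response ${res.status} not OK`);
      }

      // res.json is the parsed body when it isn't a Buffer, null otherwise
      return res.json || res.body;
    }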