Media module saves files.

This commit is contained in:
ThePendulum 2020-03-31 04:05:31 +02:00
parent 4eaacf5697
commit 9a712e7371
3 changed files with 170 additions and 65 deletions

View File

@ -43,6 +43,7 @@ async function mounted() {
'double-vaginal',
'da-tp',
'dv-tp',
'triple-anal',
],
oral: [
'deepthroat',

View File

@ -1,9 +1,9 @@
'use strict';
const config = require('config');
const util = require('util');
const Promise = require('bluebird');
const fs = require('fs');
const fsPromises = require('fs').promises;
const fs = require('fs').promises;
const path = require('path');
const nanoid = require('nanoid/non-secure');
const mime = require('mime');
@ -23,19 +23,19 @@ function getHash(buffer) {
return hash.digest('hex');
}
/**
 * Read the `entropy` value that sharp reports for an image buffer.
 *
 * @param {Buffer} buffer - Image data, passed straight to sharp.
 * @param {Object} [source] - Optional source the buffer came from; only `source.src` is read, to label the warning.
 * @returns {Promise<number>} The entropy from `sharp(buffer).stats()`, or 7.5 when sharp fails.
 */
async function getEntropy(buffer, source) {
  try {
    const { entropy } = await sharp(buffer).stats();

    return entropy;
  } catch (error) {
    // source is optional: a caller passing only the buffer must still receive the
    // fallback value rather than a TypeError raised while building the log message
    const origin = source?.src ? ` for ${source.src}` : '';

    logger.warn(`Failed to retrieve image entropy, using 7.5${origin}: ${error.message}`);

    return 7.5;
  }
}
async function getMeta(buffer) {
async function getMeta(buffer, source) {
try {
const { width, height, size } = await sharp(buffer).metadata();
@ -45,7 +45,7 @@ async function getMeta(buffer) {
size,
};
} catch (error) {
logger.warn(`Failed to retrieve image metadata: ${error.message}`);
logger.warn(`Failed to retrieve image metadata from ${source.src}: ${error.message}`);
return {};
}
@ -89,6 +89,10 @@ function toBaseSource(rawSource) {
if (rawSource.referer) baseSource.referer = rawSource.referer;
if (rawSource.host) baseSource.host = rawSource.host;
if (rawSource.copyright) baseSource.copyright = rawSource.copyright;
if (rawSource.comment) baseSource.comment = rawSource.comment;
if (rawSource.group) baseSource.group = rawSource.group;
return baseSource;
}
@ -180,10 +184,21 @@ async function findSourceDuplicates(baseMedias) {
}
/**
 * Split media items into those whose hash is new and those whose hash already
 * has a row in the `media` table.
 *
 * @param {Object[]} medias - Media items; the hash is read from `meta.hash` and/or `entry.hash`.
 * @returns {Promise<{ uniqueHashMedia: Object[], existingHashMedia: Object[] }>}
 *   `uniqueHashMedia` holds the items without an `entry` whose `meta.hash` matched no row;
 *   `existingHashMedia` holds the items whose hash matched a row, with `entry` replaced by that row.
 */
async function findHashDuplicates(medias) {
  const hashes = medias.map(media => media.meta?.hash || media.entry?.hash).filter(Boolean);

  const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
  const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');

  // items that already carry an entry were matched earlier and are never treated as new
  const uniqueHashMedia = medias.filter(media => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);

  const existingHashMedia = medias
    .filter(media => existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash])
    .map(media => ({
      ...media,
      entry: existingHashMediaEntriesByHash[media.entry?.hash || media.meta?.hash],
    }));

  return { uniqueHashMedia, existingHashMedia };
}
async function extractSource(baseSource, { existingExtractMediaByUrl }) {
@ -198,7 +213,6 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
return {
...baseSource,
entry: existingExtractMedia,
src: existingExtractMedia.source,
};
}
@ -216,7 +230,57 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
}
async function fetchSource(source, baseMedia, baseSourceIndex) {
/**
 * Write a fetched media file, plus a thumbnail when it is an image, below `config.media.path`.
 *
 * The relative location is derived from the hash:
 * `<role>/<hash[0..2]>/<hash[2..4]>/<hash[4..]>[_<quality>].<extension>`;
 * thumbnails use the same layout under `<role>/thumbs/`.
 *
 * @param {Object} media - Fetched media; reads `role`, `quality`, `meta.hash`,
 *   `meta.extension`, `meta.type` and `file.buffer`.
 * @returns {Promise<Object>} Copy of `media` whose `file` holds only the relative
 *   `path` (and `thumbnail` for images).
 */
async function saveMedia(media) {
  const { hash, extension, type } = media.meta;
  const shard = [hash.slice(0, 2), hash.slice(2, 4)];
  const qualitySuffix = media.quality ? `_${media.quality}` : '';
  const filename = `${hash.slice(4)}${qualitySuffix}.${extension}`;

  const filedir = path.join(media.role, ...shard);
  const filepath = path.join(filedir, filename);

  const inMediaRoot = relativePath => path.join(config.media.path, relativePath);

  if (type !== 'image') {
    await fs.mkdir(inMediaRoot(filedir), { recursive: true });
    await fs.writeFile(inMediaRoot(filepath), media.file.buffer);

    // the buffer is left out of the result on purpose, it is no longer needed once written
    return {
      ...media,
      file: { path: filepath },
    };
  }

  const thumbnail = await getThumbnail(media.file.buffer);
  const thumbdir = path.join(media.role, 'thumbs', ...shard);
  const thumbpath = path.join(thumbdir, filename);

  await Promise.all([filedir, thumbdir].map(dir => fs.mkdir(inMediaRoot(dir), { recursive: true })));

  await Promise.all([
    fs.writeFile(inMediaRoot(filepath), media.file.buffer),
    fs.writeFile(inMediaRoot(thumbpath), thumbnail),
  ]);

  // the buffer is left out of the result on purpose, it is no longer needed once written
  return {
    ...media,
    file: {
      path: filepath,
      thumbnail: thumbpath,
    },
  };
}
async function fetchSource(source) {
logger.silly(`Fetching media from ${source.src}`);
// attempts
@ -225,39 +289,32 @@ async function fetchSource(source, baseMedia, baseSourceIndex) {
const { pathname } = new URL(source.src);
const mimetype = mime.getType(pathname);
const extension = mime.getExtension(mimetype);
const isImage = /image/.test(mimetype);
const tempPath = path.join(config.media.path, 'temp', `${baseMedia.id}-${baseSourceIndex}.${extension}`);
const type = mimetype.split('/')[0];
const res = await http.get(source.src, {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
}, {
stream: true,
});
if (!res.ok) {
throw new Error(`Response ${res.status} not OK`);
}
res.res.pipe(fs.createWriteStream(tempPath));
const buffer = res.body;
console.log(res.body);
const hash = getHash(buffer);
const entropy = isImage ? await getEntropy(buffer) : null;
const { size, width, height } = isImage ? await getMeta(buffer) : {};
const hash = getHash(res.body);
const entropy = type === 'image' ? await getEntropy(res.body) : null;
const { size, width, height } = type === 'image' ? await getMeta(res.body) : {};
logger.silly(`Fetched media from ${source.src}`);
return {
...source,
file: {
temp: tempPath,
buffer: res.body,
},
meta: {
mimetype,
extension,
type,
hash,
entropy,
size,
@ -270,7 +327,6 @@ async function fetchSource(source, baseMedia, baseSourceIndex) {
if (attempts < 3) {
await Promise.delay(1000);
return attempt(attempts + 1);
}
@ -287,16 +343,19 @@ async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex)
const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];
if (extractedSource.entry) {
logger.silly(`Media page URL already in database, not extracting ${baseSource.url}`);
// media entry found during extraction, don't fetch
return extractedSource;
}
if (existingSourceMedia) {
logger.silly(`Media source URL already in database, skipping ${baseSource.url}`);
// media entry found by source URL, don't fetch
return {
...baseSource,
entry: existingSourceMedia,
src: existingSourceMedia.source,
};
}
@ -311,10 +370,18 @@ async function fetchMedia(baseMedia, existingMedias) {
Promise.reject(new Error()),
);
if (source.entry) {
// don't save media, already in database
return {
...baseMedia,
...source,
};
}
return saveMedia({
...baseMedia,
...source,
});
} catch (error) {
logger.warn(error.message);
@ -322,40 +389,52 @@ async function fetchMedia(baseMedia, existingMedias) {
}
}
/**
 * Build the row to insert into the `media` table for a media item that has no
 * database entry yet.
 *
 * @param {Object} media - Saved media item; reads `id`, `src`, `url`, `scraper`,
 *   `copyright`, `comment`, `file.path`, `file.thumbnail` and the `meta` fields.
 * @param {number} index - Stored unchanged in the row's `index` field.
 * @returns {Object} `media` itself when it already has an `entry`; otherwise a copy
 *   with `newEntry: true` and `entry` set to the freshly built row.
 */
function curateMediaEntry(media, index) {
  if (media.entry) {
    // already matched to a database row, nothing to build
    return media;
  }

  const curatedMediaEntry = {
    id: media.id,
    path: media.file.path,
    thumbnail: media.file.thumbnail,
    index,
    mime: media.meta.mimetype,
    hash: media.meta.hash,
    size: media.meta.size,
    width: media.meta.width,
    height: media.meta.height,
    entropy: media.meta.entropy,
    source: media.src,
    source_page: media.url,
    scraper: media.scraper,
    copyright: media.copyright,
    comment: media.comment,
  };

  return {
    ...media,
    // marks the entry as not yet present in the database
    newEntry: true,
    entry: curatedMediaEntry,
  };
}
/**
 * Fetch and save a batch of media, then insert `media` rows for the items whose
 * hash is not in the database yet.
 *
 * @param {Object[]} baseMedias - Media descriptions handed to `fetchMedia`.
 * @returns {Promise<Object[]>} The newly curated media followed by the media that
 *   matched an existing row; each item carries its row under `entry`.
 */
async function storeMedias(baseMedias) {
  const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias);

  const savedMedias = await Promise.map(
    baseMedias,
    async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
  );

  const { uniqueHashMedia, existingHashMedia } = await findHashDuplicates(savedMedias);

  const newMediaWithEntries = uniqueHashMedia.map((media, index) => curateMediaEntry(media, index));

  // curateMediaEntry sets newEntry on the rows it builds; those are the rows that
  // still have to be inserted (the previous negated check selected none of them)
  const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);

  if (newMediaEntries.length > 0) {
    await knex('media').insert(newMediaEntries);
  }

  return [...newMediaWithEntries, ...existingHashMedia];
}
async function associateReleaseMedia(releases) {
@ -363,14 +442,18 @@ async function associateReleaseMedia(releases) {
return;
}
// TODO: internal duplicate filtering
// TODO: media count limits
// TODO: catch errors
const baseMediasByReleaseId = releases.reduce((acc, release) => ({
...acc,
[release.id]: {
poster: argv.images && argv.poster && toBaseMedias([release.poster], 'posters'),
photos: argv.images && argv.photos && toBaseMedias(release.photos, 'photos').slice(0, 5),
trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer], 'trailers'),
teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser], 'teasers'),
},
[release.id]: [
...(argv.images && argv.poster ? toBaseMedias([release.poster], 'posters') : []),
...(argv.images && argv.photos ? toBaseMedias(release.photos, 'photos').slice(0, 5) : []),
...(argv.videos && argv.trailer ? toBaseMedias([release.trailer], 'trailers') : []),
...(argv.videos && argv.teaser ? toBaseMedias([release.teaser], 'teasers') : []),
],
}), {});
const baseMedias = Object.values(baseMediasByReleaseId)
@ -378,7 +461,29 @@ async function associateReleaseMedia(releases) {
.flat(2)
.filter(Boolean);
await storeMedias(baseMedias);
const storedMedias = await storeMedias(baseMedias);
const storedMediasById = itemsByKey(storedMedias, 'id');
const associationsByRole = Object.entries(baseMediasByReleaseId).reduce((acc, [releaseId, releaseBaseMedias]) => {
releaseBaseMedias.forEach((baseMedia) => {
const media = storedMediasById[baseMedia.id];
if (!media) return;
if (!acc[media.role]) acc[media.role] = [];
acc[media.role].push({
release_id: releaseId,
media_id: media.entry.id,
});
});
return acc;
}, {});
console.log(util.inspect(associationsByRole, null, null));
await Promise.all(Object.entries(associationsByRole)
.map(async ([role, associations]) => knex(`releases_${role}`).insert(associations)));
}
module.exports = {

View File

@ -72,10 +72,9 @@ queue.define('http', async ({
const json = Buffer.isBuffer(res.body) ? null : res.body;
return {
res,
...res,
html,
json,
pipe: res.pipe,
ok: res.statusCode >= 200 && res.statusCode <= 299,
code: res.statusCode,
status: res.statusCode,