Fixed Vixen scraper, using new token URL for trailers.

2020-04-26 03:51:59 +02:00
parent 2cfbd21560
commit 2ac879d276
5 changed files with 313 additions and 97 deletions
--- a/src/media.js
+++ b/src/media.js
@@ -2,9 +2,12 @@

 const config = require('config');
 const Promise = require('bluebird');
-const fs = require('fs').promises;
+const util = require('util');
+const fs = require('fs');
+const fsPromises = require('fs').promises;
 const path = require('path');
-const bhttp = require('bhttp');
+const stream = require('stream');
+const { once } = require('events');
 const nanoid = require('nanoid/non-secure');
 const mime = require('mime');
 // const fileType = require('file-type');
@@ -17,63 +20,17 @@ const knex = require('./knex');
 const http = require('./utils/http');
 const { get } = require('./utils/qu');

-function getHash(buffer) {
-    const hash = blake2.createHash('blake2b', { digestLength: 24 });
-    hash.update(buffer);
+const PassThrough = stream.PassThrough;
+const pipeline = util.promisify(stream.pipeline);

-    return hash.digest('hex');
+function getMemoryUsage() {
+    return process.memoryUsage().rss / (10 ** 6);
 }

-async function getEntropy(buffer, source) {
-    try {
-        const { entropy } = await sharp(buffer).stats();
-
-        return entropy;
-    } catch (error) {
-        logger.warn(`Failed to retrieve image entropy, using 7.5 for ${source.src}: ${error.message}`);
-
-        return 7.5;
-    }
-}
-
-async function getMeta(buffer, source) {
-    try {
-        const { width, height, size } = await sharp(buffer).metadata();
-
-        return {
-            width,
-            height,
-            size,
-        };
-    } catch (error) {
-        logger.warn(`Failed to retrieve image metadata from ${source.src}: ${error.message}`);
-
-        return {};
-    }
-}
-
-async function getThumbnail(buffer, height = config.media.thumbnailSize) {
-    try {
-        const thumbnail = sharp(buffer)
-            .resize({
-                height,
-                withoutEnlargement: true,
-            })
-            .jpeg({
-                quality: config.media.thumbnailQuality,
-            })
-            .toBuffer();
-
-        return thumbnail;
-    } catch (error) {
-        logger.error(`Failed to create thumbnail: ${error.message}`);
-    }
-
-    return null;
-}
+let peakMemoryUsage = getMemoryUsage();

 function sampleMedias(medias, limit = config.media.limit, preferLast = true) {
-    // limit media sets, use extrax as fallbacks
+    // limit media sets, use extras as fallbacks
    if (medias.length <= limit) {
        return medias;
    }
@@ -311,7 +268,7 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
    throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
 }

-async function saveMedia(media) {
+async function storeFile(media) {
    const hashDir = media.meta.hash.slice(0, 2);
    const hashSubDir = media.meta.hash.slice(2, 4);
    const hashFilename = media.meta.hash.slice(4);
@@ -324,83 +281,137 @@ async function saveMedia(media) {
    const filepath = path.join(filedir, filename);

    if (media.meta.type === 'image') {
-        const thumbnail = await getThumbnail(media.file.buffer);
-
        const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
        const thumbpath = path.join(thumbdir, filename);

        await Promise.all([
-            fs.mkdir(path.join(config.media.path, filedir), { recursive: true }),
-            fs.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
+            fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }),
+            fsPromises.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
        ]);

        await Promise.all([
-            fs.writeFile(path.join(config.media.path, filepath), media.file.buffer),
-            fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
+            fsPromises.rename(media.file.path, path.join(config.media.path, filepath)),
+            fsPromises.rename(media.file.thumbnail, path.join(config.media.path, thumbpath)),
        ]);

        return {
            ...media,
            file: {
-                // buffer is no longer needed, discard to free up memory
                path: filepath,
                thumbnail: thumbpath,
            },
        };
    }

-    await fs.mkdir(path.join(config.media.path, filedir), { recursive: true });
-    await fs.writeFile(path.join(config.media.path, filepath), media.file.buffer);
+    await fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true });
+    await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
+
+    logger.silly(`Stored media file at permanent location ${filepath}`);

    return {
        ...media,
        file: {
-            // buffer is no longer needed, discard to free up memory
            path: filepath,
        },
    };
 }

-async function fetchSource(source) {
+async function fetchSource(source, baseMedia) {
    logger.silly(`Fetching media from ${source.src}`);
    // attempts

    async function attempt(attempts = 1) {
        try {
-            const { pathname } = new URL(source.src);
-            const mimetype = mime.getType(pathname);
-            const extension = mime.getExtension(mimetype);
-            const type = mimetype?.split('/')[0] || 'image';
-
-            const res = await bhttp.get(source.src, {
-                headers: {
-                    ...(source.referer && { referer: source.referer }),
-                    ...(source.host && { host: source.host }),
-                },
-                stream: true,
+            const res = await http.get(source.src, {
+                ...(source.referer && { referer: source.referer }),
+                ...(source.host && { host: source.host }),
+            }, {
+                stream: true, // sources are fetched in parallel, don't gobble up memory
            });

            if (!res.ok) {
                throw new Error(`Response ${res.status} not OK`);
            }

-            const hash = getHash(res.body);
-            const entropy = type === 'image' ? await getEntropy(res.body) : null;
-            const { size, width, height } = type === 'image' ? await getMeta(res.body) : {};
+            const { pathname } = new URL(source.src);
+            const mimetype = mime.getType(pathname);
+            const extension = mime.getExtension(mimetype);
+            const type = mimetype?.split('/')[0] || 'image';

-            logger.silly(`Fetched media from ${source.src}`);
+            const hasher = new blake2.Hash('blake2b');
+            hasher.setEncoding('hex');
+
+            const hashStream = new PassThrough();
+            const metaStream = type === 'image' ? sharp() : new PassThrough();
+
+            const tempFilePath = path.join(config.media.path, 'temp', `${baseMedia.id}.${extension}`);
+            const tempThumbPath = path.join(config.media.path, 'temp', `${baseMedia.id}_thumb.${extension}`);
+
+            const tempFileTarget = fs.createWriteStream(tempFilePath);
+            const tempThumbTarget = fs.createWriteStream(tempThumbPath);
+
+            hashStream.on('data', chunk => hasher.write(chunk));
+
+            if (type === 'image') {
+                metaStream
+                    .clone()
+                    .resize({
+                        height: config.media.thumbnailSize,
+                        withoutEnlargement: true,
+                    })
+                    .jpeg({ quality: config.media.thumbnailQuality })
+                    .pipe(tempThumbTarget)
+                    .on('error', error => logger.error(error));
+            }
+
+            // create the info and stats promises before running the pipeline, as pipeline destroys the streams
+            const infoPromise = type === 'image' ? once(metaStream, 'info') : Promise.resolve([{}]);
+            const metaPromise = type === 'image' ? metaStream.stats() : Promise.resolve();
+
+            await pipeline(
+                res.originalRes,
+                metaStream,
+                hashStream,
+                tempFileTarget,
+            );
+
+            /*
+            res.originalRes
+                .pipe(metaStream)
+                .pipe(hashStream)
+                .pipe(tempFileTarget);
+            */
+
+            logger.silly(`Temporarily saved media from ${source.src}`);
+
+            const [stats, info] = await Promise.all([
+                metaPromise,
+                infoPromise,
+            ]);
+
+            logger.silly(`Ended pipeline for ${source.src}`);
+
+            hasher.end();
+
+            const hash = hasher.read();
+            const [{ size, width, height }] = info;
+
+            peakMemoryUsage = Math.max(getMemoryUsage(), peakMemoryUsage);
+
+            logger.silly(`Retrieved metadata from ${source.src}`);

            return {
                ...source,
                file: {
-                    buffer: res.body,
+                    path: tempFilePath,
+                    thumbnail: tempThumbPath,
                },
                meta: {
                    mimetype,
                    extension,
                    type,
                    hash,
-                    entropy,
+                    entropy: stats?.entropy,
                    size,
                    width,
                    height,
@@ -421,7 +432,7 @@ async function fetchSource(source) {
    return attempt(1);
 }

-async function trySource(baseSource, existingMedias) {
+async function trySource(baseSource, existingMedias, baseMedia) {
    // catch error and try the next source
    const extractedSource = await extractSource(baseSource, existingMedias);
    const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];
@@ -443,7 +454,7 @@ async function trySource(baseSource, existingMedias) {
        };
    }

-    return fetchSource(extractedSource);
+    return fetchSource(extractedSource, baseMedia);
 }

 async function fetchMedia(baseMedia, existingMedias) {
@@ -462,10 +473,17 @@ async function fetchMedia(baseMedia, existingMedias) {
            };
        }

+        return storeFile({
+            ...baseMedia,
+            ...source,
+        });
+
+        /*
        return saveMedia({
            ...baseMedia,
            ...source,
        });
+        */
    } catch (error) {
        logger.warn(error.message);

@@ -504,6 +522,8 @@ function curateMediaEntry(media, index) {
 }

 async function storeMedias(baseMedias) {
+    await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true });
+
    const [existingSourceMediaByUrl, existingExtractMediaByUrl] = await findSourceDuplicates(baseMedias);

    const savedMedias = await Promise.map(
@@ -581,6 +601,8 @@ async function associateReleaseMedia(releases) {

        await knex.raw(`${knex(`releases_${role}`).insert(associations)} ON CONFLICT DO NOTHING`);
    }, Promise.resolve());
+
+    logger.debug(`Peak media fetching memory usage: ${peakMemoryUsage.toFixed(2)} MB`);
 }

 module.exports = {