Fixed Vixen scraper, using new token URL for trailers.

This commit is contained in:
2020-04-26 03:51:59 +02:00
parent 2cfbd21560
commit 2ac879d276
5 changed files with 313 additions and 97 deletions

View File

@@ -2,9 +2,12 @@
const config = require('config');
const Promise = require('bluebird');
const fs = require('fs').promises;
const util = require('util');
const fs = require('fs');
const fsPromises = require('fs').promises;
const path = require('path');
const bhttp = require('bhttp');
const stream = require('stream');
const { once } = require('events');
const nanoid = require('nanoid/non-secure');
const mime = require('mime');
// const fileType = require('file-type');
@@ -17,63 +20,17 @@ const knex = require('./knex');
const http = require('./utils/http');
const { get } = require('./utils/qu');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
hash.update(buffer);
const PassThrough = stream.PassThrough;
const pipeline = util.promisify(stream.pipeline);
return hash.digest('hex');
/**
 * Report the resident set size of the current process.
 *
 * @returns {number} RSS in megabytes (bytes divided by 10^6)
 */
function getMemoryUsage() {
  const bytesPerMegabyte = 10 ** 6;
  const { rss } = process.memoryUsage();

  return rss / bytesPerMegabyte;
}
/**
 * Compute the entropy of an image via sharp's stats().
 *
 * Never rejects: when sharp fails, a warning is logged and 7.5 is returned
 * as a stand-in value.
 *
 * @param {Buffer} buffer - image data handed to sharp
 * @param {{ src?: string }} [source] - origin of the image, only used in the warning
 * @returns {Promise<number>} entropy reported by sharp, or 7.5 on failure
 */
async function getEntropy(buffer, source) {
  try {
    const { entropy } = await sharp(buffer).stats();

    return entropy;
  } catch (error) {
    // source is optional; dereferencing it unguarded would throw from inside
    // the error handler and defeat the fallback value
    const src = source?.src ?? 'unknown source';

    logger.warn(`Failed to retrieve image entropy, using 7.5 for ${src}: ${error.message}`);

    return 7.5;
  }
}
/**
 * Read width, height and size of an image via sharp's metadata().
 *
 * Never rejects: when sharp fails, a warning is logged and an empty object
 * is returned, so destructuring the result yields undefined fields.
 *
 * @param {Buffer} buffer - image data handed to sharp
 * @param {{ src?: string }} [source] - origin of the image, only used in the warning
 * @returns {Promise<{ width?: number, height?: number, size?: number }>}
 */
async function getMeta(buffer, source) {
  try {
    const { width, height, size } = await sharp(buffer).metadata();

    return { width, height, size };
  } catch (error) {
    // source is optional; dereferencing it unguarded would throw from inside
    // the error handler and defeat the empty-object fallback
    const src = source?.src ?? 'unknown source';

    logger.warn(`Failed to retrieve image metadata from ${src}: ${error.message}`);

    return {};
  }
}
/**
 * Render a JPEG thumbnail of an image with sharp.
 *
 * The image is resized to the given height without enlarging it, and encoded
 * at config.media.thumbnailQuality.
 *
 * @param {Buffer} buffer - image data handed to sharp
 * @param {number} [height=config.media.thumbnailSize] - target height
 * @returns {Promise<Buffer|null>} thumbnail data, or null when sharp fails
 */
async function getThumbnail(buffer, height = config.media.thumbnailSize) {
  try {
    // await inside the try block: returning the pending promise directly would
    // let a toBuffer() rejection escape the catch below
    return await sharp(buffer)
      .resize({
        height,
        withoutEnlargement: true,
      })
      .jpeg({
        quality: config.media.thumbnailQuality,
      })
      .toBuffer();
  } catch (error) {
    logger.error(`Failed to create thumbnail: ${error.message}`);

    return null;
  }
}
// highest getMemoryUsage() reading (MB) seen so far, seeded with the reading at module load;
// raised after a source has been fetched and logged once release media has been associated
let peakMemoryUsage = getMemoryUsage();
function sampleMedias(medias, limit = config.media.limit, preferLast = true) {
// limit media sets, use extrax as fallbacks
// limit media sets, use extras as fallbacks
if (medias.length <= limit) {
return medias;
}
@@ -311,7 +268,7 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
}
async function saveMedia(media) {
async function storeFile(media) {
const hashDir = media.meta.hash.slice(0, 2);
const hashSubDir = media.meta.hash.slice(2, 4);
const hashFilename = media.meta.hash.slice(4);
@@ -324,83 +281,137 @@ async function saveMedia(media) {
const filepath = path.join(filedir, filename);
if (media.meta.type === 'image') {
const thumbnail = await getThumbnail(media.file.buffer);
const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
const thumbpath = path.join(thumbdir, filename);
await Promise.all([
fs.mkdir(path.join(config.media.path, filedir), { recursive: true }),
fs.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true }),
fsPromises.mkdir(path.join(config.media.path, thumbdir), { recursive: true }),
]);
await Promise.all([
fs.writeFile(path.join(config.media.path, filepath), media.file.buffer),
fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
fsPromises.rename(media.file.path, path.join(config.media.path, filepath)),
fsPromises.rename(media.file.thumbnail, path.join(config.media.path, thumbpath)),
]);
return {
...media,
file: {
// buffer is no longer needed, discard to free up memory
path: filepath,
thumbnail: thumbpath,
},
};
}
await fs.mkdir(path.join(config.media.path, filedir), { recursive: true });
await fs.writeFile(path.join(config.media.path, filepath), media.file.buffer);
await fsPromises.mkdir(path.join(config.media.path, filedir), { recursive: true });
await fsPromises.rename(media.file.path, path.join(config.media.path, filepath));
logger.silly(`Stored media file at permanent location ${filepath}`);
return {
...media,
file: {
// buffer is no longer needed, discard to free up memory
path: filepath,
},
};
}
async function fetchSource(source) {
async function fetchSource(source, baseMedia) {
logger.silly(`Fetching media from ${source.src}`);
// attempts
async function attempt(attempts = 1) {
try {
const { pathname } = new URL(source.src);
const mimetype = mime.getType(pathname);
const extension = mime.getExtension(mimetype);
const type = mimetype?.split('/')[0] || 'image';
const res = await bhttp.get(source.src, {
headers: {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
},
stream: true,
const res = await http.get(source.src, {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
}, {
stream: true, // sources are fetched in parallel, don't gobble up memory
});
if (!res.ok) {
throw new Error(`Response ${res.status} not OK`);
}
const hash = getHash(res.body);
const entropy = type === 'image' ? await getEntropy(res.body) : null;
const { size, width, height } = type === 'image' ? await getMeta(res.body) : {};
const { pathname } = new URL(source.src);
const mimetype = mime.getType(pathname);
const extension = mime.getExtension(mimetype);
const type = mimetype?.split('/')[0] || 'image';
logger.silly(`Fetched media from ${source.src}`);
const hasher = new blake2.Hash('blake2b');
hasher.setEncoding('hex');
const hashStream = new PassThrough();
const metaStream = type === 'image' ? sharp() : new PassThrough();
const tempFilePath = path.join(config.media.path, 'temp', `${baseMedia.id}.${extension}`);
const tempThumbPath = path.join(config.media.path, 'temp', `${baseMedia.id}_thumb.${extension}`);
const tempFileTarget = fs.createWriteStream(tempFilePath);
const tempThumbTarget = fs.createWriteStream(tempThumbPath);
hashStream.on('data', chunk => hasher.write(chunk));
if (type === 'image') {
metaStream
.clone()
.resize({
height: config.media.thumbnailSize,
withoutEnlargement: true,
})
.jpeg({ quality: config.media.thumbnailQuality })
.pipe(tempThumbTarget)
.on('error', error => logger.error(error));
}
// pipeline destroys streams
const infoPromise = type === 'image' ? once(metaStream, 'info') : Promise.resolve([{}]);
const metaPromise = type === 'image' ? metaStream.stats() : Promise.resolve();
await pipeline(
res.originalRes,
metaStream,
hashStream,
tempFileTarget,
);
/*
res.originalRes
.pipe(metaStream)
.pipe(hashStream)
.pipe(tempFileTarget);
*/
logger.silly(`Temporarily saved media from ${source.src}`);
const [stats, info] = await Promise.all([
metaPromise,
infoPromise,
]);
logger.silly(`Ended pipeline for ${source.src}`);
hasher.end();
const hash = hasher.read();
const [{ size, width, height }] = info;
peakMemoryUsage = Math.max(getMemoryUsage(), peakMemoryUsage);
logger.silly(`Retrieved metadata from ${source.src}`);
return {
...source,
file: {
buffer: res.body,
path: tempFilePath,
thumbnail: tempThumbPath,
},
meta: {
mimetype,
extension,
type,
hash,
entropy,
entropy: stats?.entropy,
size,
width,
height,
@@ -421,7 +432,7 @@ async function fetchSource(source) {
return attempt(1);
}
async function trySource(baseSource, existingMedias) {
async function trySource(baseSource, existingMedias, baseMedia) {
// catch error and try the next source
const extractedSource = await extractSource(baseSource, existingMedias);
const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];
@@ -443,7 +454,7 @@ async function trySource(baseSource, existingMedias) {
};
}
return fetchSource(extractedSource);
return fetchSource(extractedSource, baseMedia);
}
async function fetchMedia(baseMedia, existingMedias) {
@@ -462,10 +473,17 @@ async function fetchMedia(baseMedia, existingMedias) {
};
}
return storeFile({
...baseMedia,
...source,
});
/*
return saveMedia({
...baseMedia,
...source,
});
*/
} catch (error) {
logger.warn(error.message);
@@ -504,6 +522,8 @@ function curateMediaEntry(media, index) {
}
async function storeMedias(baseMedias) {
await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true });
const [existingSourceMediaByUrl, existingExtractMediaByUrl] = await findSourceDuplicates(baseMedias);
const savedMedias = await Promise.map(
@@ -581,6 +601,8 @@ async function associateReleaseMedia(releases) {
await knex.raw(`${knex(`releases_${role}`).insert(associations)} ON CONFLICT DO NOTHING`);
}, Promise.resolve());
logger.debug(`Peak media fetching memory usage: ${peakMemoryUsage.toFixed(2)} MB`);
}
module.exports = {