Expanded new media module. Added network to channel site to fix actor glitch.

This commit is contained in:
ThePendulum 2020-03-30 03:01:08 +02:00
parent a5d859471d
commit 4eaacf5697
6 changed files with 263 additions and 40 deletions

View File

@ -1,13 +1,76 @@
'use strict';
const config = require('config');
const Promise = require('bluebird');
const fs = require('fs');
const fsPromises = require('fs').promises;
const path = require('path');
const nanoid = require('nanoid/non-secure');
const mime = require('mime');
const sharp = require('sharp');
const blake2 = require('blake2');
const logger = require('./logger')(__filename);
const argv = require('./argv');
const knex = require('./knex');
const http = require('./utils/http');
const { get } = require('./utils/qu');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
hash.update(buffer);
return hash.digest('hex');
}
async function getEntropy(buffer) {
try {
const { entropy } = await sharp(buffer).stats();
return entropy;
} catch (error) {
logger.warn(`Failed to retrieve image entropy, using 7.5: ${error.message}`);
return 7.5;
}
}
async function getMeta(buffer) {
try {
const { width, height, size } = await sharp(buffer).metadata();
return {
width,
height,
size,
};
} catch (error) {
logger.warn(`Failed to retrieve image metadata: ${error.message}`);
return {};
}
}
async function getThumbnail(buffer, height = config.media.thumbnailSize) {
try {
const thumbnail = sharp(buffer)
.resize({
height,
withoutEnlargement: true,
})
.jpeg({
quality: config.media.thumbnailQuality,
})
.toBuffer();
return thumbnail;
} catch (error) {
logger.error(`Failed to create thumbnail: ${error.message}`);
}
return null;
}
function itemsByKey(items, key) {
return items.reduce((acc, item) => ({ ...acc, [item[key]]: item }), {});
}
@ -23,6 +86,9 @@ function toBaseSource(rawSource) {
if (rawSource.url) baseSource.url = rawSource.url;
if (rawSource.extract) baseSource.extract = rawSource.extract;
if (rawSource.referer) baseSource.referer = rawSource.referer;
if (rawSource.host) baseSource.host = rawSource.host;
return baseSource;
}
@ -35,11 +101,12 @@ function toBaseSource(rawSource) {
return null;
}
function baseSourceToBaseMedia(baseSource) {
function baseSourceToBaseMedia(baseSource, role) {
if (Array.isArray(baseSource)) {
if (baseSource.length > 0) {
return {
id: nanoid(),
role,
sources: baseSource,
};
}
@ -50,6 +117,7 @@ function baseSourceToBaseMedia(baseSource) {
if (baseSource) {
return {
id: nanoid(),
role,
sources: [baseSource],
};
}
@ -57,15 +125,15 @@ function baseSourceToBaseMedia(baseSource) {
return null;
}
function fallbackMediaToBaseMedia(rawMedia) {
function fallbackMediaToBaseMedia(rawMedia, role) {
const baseSources = rawMedia
.map(source => toBaseSource(source))
.filter(Boolean);
return baseSourceToBaseMedia(baseSources);
return baseSourceToBaseMedia(baseSources, role);
}
function toBaseMedias(rawMedias) {
function toBaseMedias(rawMedias, role) {
if (!rawMedias || rawMedias.length === 0) {
return [];
}
@ -77,12 +145,12 @@ function toBaseMedias(rawMedias) {
if (Array.isArray(rawMedia)) {
// fallback sources provided
return fallbackMediaToBaseMedia(rawMedia);
return fallbackMediaToBaseMedia(rawMedia, role);
}
const baseSource = toBaseSource(rawMedia);
return baseSourceToBaseMedia(baseSource);
return baseSourceToBaseMedia(baseSource, role);
}).filter(Boolean);
}
@ -97,7 +165,6 @@ async function findSourceDuplicates(baseMedias) {
.flat()
.filter(Boolean);
const [existingSourceMedia, existingExtractMedia] = await Promise.all([
knex('media').whereIn('source', sourceUrls),
knex('media').whereIn('source_page', extractUrls),
@ -112,37 +179,183 @@ async function findSourceDuplicates(baseMedias) {
};
}
async function extractSource(baseSource) {
if (!baseSource.extract || !baseSource.url) {
async function findHashDuplicates(medias) {
const mediaHashes = medias.map(media => media.file?.hash).filter(Boolean);
const existingHashMedia = await knex('media').whereIn('hash', mediaHashes);
return itemsByKey(existingHashMedia, 'hash');
}
async function extractSource(baseSource, { existingExtractMediaByUrl }) {
if (typeof baseSource.extract !== 'function' || !baseSource.url) {
return baseSource;
}
const existingExtractMedia = existingExtractMediaByUrl[baseSource.url];
if (existingExtractMedia) {
// media entry found by extract URL
return {
...baseSource,
entry: existingExtractMedia,
src: existingExtractMedia.source,
};
}
const res = await get(baseSource.url);
console.log(res);
return baseSource;
if (res.ok) {
const src = await baseSource.extract(res.item);
return {
...baseSource,
src,
};
}
async function fetchSource(baseSource, { existingSourceMediaByUrl, existingExtractMediaByUrl }) {
// attempts
// extract
const extractedSource = await extractSource(baseSource, existingExtractMediaByUrl);
throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`);
}
console.log(extractedSource);
async function fetchSource(source, baseMedia, baseSourceIndex) {
logger.silly(`Fetching media from ${source.src}`);
// attempts
async function attempt(attempts = 1) {
try {
const { pathname } = new URL(source.src);
const mimetype = mime.getType(pathname);
const extension = mime.getExtension(mimetype);
const isImage = /image/.test(mimetype);
const tempPath = path.join(config.media.path, 'temp', `${baseMedia.id}-${baseSourceIndex}.${extension}`);
const res = await http.get(source.src, {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
}, {
stream: true,
});
if (!res.ok) {
throw new Error(`Response ${res.status} not OK`);
}
res.res.pipe(fs.createWriteStream(tempPath));
const buffer = res.body;
console.log(res.body);
const hash = getHash(buffer);
const entropy = isImage ? await getEntropy(buffer) : null;
const { size, width, height } = isImage ? await getMeta(buffer) : {};
logger.silly(`Fetched media from ${source.src}`);
return {
...source,
file: {
temp: tempPath,
mimetype,
extension,
hash,
entropy,
size,
width,
height,
},
};
} catch (error) {
logger.warn(`Failed attempt ${attempts}/3 to fetch ${source.src}: ${error.message}`);
if (attempts < 3) {
await Promise.delay(1000);
return attempt(attempts + 1);
}
throw new Error(`Failed to fetch ${source.src}: ${error.message}`);
}
}
return attempt(1);
}
async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex) {
// catch error and try the next source
const extractedSource = await extractSource(baseSource, existingMedias);
const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src];
if (extractedSource.entry) {
// media entry found during extraction, don't fetch
return extractedSource;
}
if (existingSourceMedia) {
// media entry found by source URL, don't fetch
return {
...baseSource,
entry: existingSourceMedia,
src: existingSourceMedia.source,
};
}
return fetchSource(extractedSource, baseMedia, baseSourceIndex, 1);
}
async function fetchMedia(baseMedia, existingMedias) {
await baseMedia.sources.reduce((result, baseSource, _baseSourceIndex) => result.catch(async () => {
await fetchSource(baseSource, existingMedias);
}), Promise.reject(new Error()));
try {
const source = await baseMedia.sources.reduce(
// try each source until success
(result, baseSource, baseSourceIndex) => result.catch(async () => trySource(baseSource, existingMedias, baseMedia, baseSourceIndex)),
Promise.reject(new Error()),
);
return {
...baseMedia,
...source,
};
} catch (error) {
logger.warn(error.message);
return baseMedia;
}
}
function saveMedia(media, existingHashMediaByHash) {
const existingHashMedia = existingHashMediaByHash[media.file.hash];
if (existingHashMedia) {
return {
...media,
entry: existingHashMedia,
};
}
const hashDir = media.file.hash.slice(0, 2);
const hashSubDir = media.file.hash.slice(2, 4);
const hashFilename = media.file.hash.slice(4);
const filename = media.quality
? `${hashFilename}_${media.quality}.${media.file.extension}`
: `${hashFilename}.${media.file.extension}`;
const filedir = path.join(media.role, hashDir, hashSubDir);
const filepath = path.join(filedir, filename);
console.log(filedir, filepath);
return media;
}
async function storeMedias(baseMedias) {
await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true });
const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias);
const fetchedMedias = await Promise.map(baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }));
await Promise.map(baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }));
console.log(existingSourceMediaByUrl, existingExtractMediaByUrl);
const existingHashMediaByHash = await findHashDuplicates(fetchedMedias);
const savedMedias = await Promise.map(fetchedMedias, async fetchedMedia => saveMedia(fetchedMedia, existingHashMediaByHash));
}
async function associateReleaseMedia(releases) {
@ -153,14 +366,17 @@ async function associateReleaseMedia(releases) {
const baseMediasByReleaseId = releases.reduce((acc, release) => ({
...acc,
[release.id]: {
poster: argv.images && argv.poster && toBaseMedias([release.poster]),
photos: argv.images && argv.photos && toBaseMedias(release.photos),
trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer]),
teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser]),
poster: argv.images && argv.poster && toBaseMedias([release.poster], 'posters'),
photos: argv.images && argv.photos && toBaseMedias(release.photos, 'photos').slice(0, 5),
trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer], 'trailers'),
teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser], 'teasers'),
},
}), {});
const baseMedias = Object.values(baseMediasByReleaseId).map(releaseMedia => Object.values(releaseMedia)).flat(2);
const baseMedias = Object.values(baseMediasByReleaseId)
.map(releaseMedia => Object.values(releaseMedia))
.flat(2)
.filter(Boolean);
await storeMedias(baseMedias);
}

View File

@ -19,7 +19,7 @@ async function getPhotos(albumUrl) {
return {
url: pageUrl,
extract: q => q('.scenes-module img', 'src'),
extract: ({ qu }) => qu.q('.scenes-module img', 'src'),
};
});

View File

@ -36,7 +36,7 @@ async function curateSite(site, includeParameters = false, includeTags = true) {
return curatedSite;
}
function curateSites(sites, includeParameters) {
async function curateSites(sites, includeParameters) {
return Promise.all(sites.map(async site => curateSite(site, includeParameters)));
}

View File

@ -7,10 +7,11 @@ const knex = require('./knex');
const slugify = require('./utils/slugify');
const { associateActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
const { curateSite } = require('./sites');
const { associateReleaseMedia } = require('./media');
function curateReleaseEntry(release, batchId, existingRelease) {
const slug = slugify(release.title, '-', {
const slug = slugify(release.title || release.actors?.join('-') || null, '-', {
encode: true,
limit: config.titleSlugLength,
});
@ -46,29 +47,34 @@ function curateReleaseEntry(release, batchId, existingRelease) {
async function attachChannelSites(releases) {
const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isFallback));
const channelSites = await knex('sites').whereIn('slug', releasesWithoutSite.map(release => release.channel));
const channelSites = await knex('sites')
.leftJoin('networks', 'networks.id', 'sites.network_id')
.select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
.whereIn('sites.slug', releasesWithoutSite.map(release => release.channel));
const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const releasesWithChannelSite = releases
.map((release) => {
const releasesWithChannelSite = await Promise.all(releases
.map(async (release) => {
if (release.site && !release.site.isFallback) {
return release;
}
if (release.channel && channelSitesBySlug[release.channel]) {
const curatedSite = await curateSite(channelSitesBySlug[release.channel]);
return {
...release,
site: channelSitesBySlug[release.channel],
site: curatedSite,
};
}
logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
return null;
})
.filter(Boolean);
}));
return releasesWithChannelSite;
return releasesWithChannelSite.filter(Boolean);
}
async function attachStudios(releases) {
@ -201,8 +207,6 @@ async function storeReleases(releases) {
const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
// console.log(curatedNewReleaseEntries);
const storedReleases = await knex('releases').insert(curatedNewReleaseEntries).returning('*');
// TODO: update duplicate releases

View File

@ -72,10 +72,13 @@ queue.define('http', async ({
const json = Buffer.isBuffer(res.body) ? null : res.body;
return {
...res,
res,
html,
json,
pipe: res.pipe,
ok: res.statusCode >= 200 && res.statusCode <= 299,
code: res.statusCode,
status: res.statusCode,
};
}, {
concurrency: 20,

View File

@ -5,7 +5,7 @@ function slugify(string, delimiter = '-', {
limit = 1000,
} = {}) {
if (!string) {
return '';
return string;
}
const slugComponents = string.trim().toLowerCase().match(/\w+/g);