From 4eaacf56973257dec0c06a575ed610d25010d8c5 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Mon, 30 Mar 2020 03:01:08 +0200 Subject: [PATCH] Expanded new media module. Added network to channel site to fix actor glitch. --- src/media.js | 268 ++++++++++++++++++++++++++++++++++++---- src/scrapers/dogfart.js | 2 +- src/sites.js | 2 +- src/store-releases.js | 24 ++-- src/utils/http.js | 5 +- src/utils/slugify.js | 2 +- 6 files changed, 263 insertions(+), 40 deletions(-) diff --git a/src/media.js b/src/media.js index c0b534f1..621789b6 100644 --- a/src/media.js +++ b/src/media.js @@ -1,13 +1,76 @@ 'use strict'; +const config = require('config'); const Promise = require('bluebird'); +const fs = require('fs'); +const fsPromises = require('fs').promises; +const path = require('path'); const nanoid = require('nanoid/non-secure'); +const mime = require('mime'); +const sharp = require('sharp'); +const blake2 = require('blake2'); const logger = require('./logger')(__filename); const argv = require('./argv'); const knex = require('./knex'); +const http = require('./utils/http'); const { get } = require('./utils/qu'); +function getHash(buffer) { + const hash = blake2.createHash('blake2b', { digestLength: 24 }); + hash.update(buffer); + + return hash.digest('hex'); +} + +async function getEntropy(buffer) { + try { + const { entropy } = await sharp(buffer).stats(); + + return entropy; + } catch (error) { + logger.warn(`Failed to retrieve image entropy, using 7.5: ${error.message}`); + + return 7.5; + } +} + +async function getMeta(buffer) { + try { + const { width, height, size } = await sharp(buffer).metadata(); + + return { + width, + height, + size, + }; + } catch (error) { + logger.warn(`Failed to retrieve image metadata: ${error.message}`); + + return {}; + } +} + +async function getThumbnail(buffer, height = config.media.thumbnailSize) { + try { + const thumbnail = sharp(buffer) + .resize({ + height, + withoutEnlargement: true, + }) + .jpeg({ + quality: config.media.thumbnailQuality, + }) + .toBuffer(); + + return thumbnail; + } catch (error) { + logger.error(`Failed to create thumbnail: ${error.message}`); + } + + return null; +} + function itemsByKey(items, key) { return items.reduce((acc, item) => ({ ...acc, [item[key]]: item }), {}); } @@ -23,6 +86,9 @@ function toBaseSource(rawSource) { if (rawSource.url) baseSource.url = rawSource.url; if (rawSource.extract) baseSource.extract = rawSource.extract; + if (rawSource.referer) baseSource.referer = rawSource.referer; + if (rawSource.host) baseSource.host = rawSource.host; + return baseSource; } @@ -35,11 +101,12 @@ function toBaseSource(rawSource) { return null; } -function baseSourceToBaseMedia(baseSource) { +function baseSourceToBaseMedia(baseSource, role) { if (Array.isArray(baseSource)) { if (baseSource.length > 0) { return { id: nanoid(), + role, sources: baseSource, }; } @@ -50,6 +117,7 @@ function baseSourceToBaseMedia(baseSource) { if (baseSource) { return { id: nanoid(), + role, sources: [baseSource], }; } @@ -57,15 +125,15 @@ function baseSourceToBaseMedia(baseSource) { return null; } -function fallbackMediaToBaseMedia(rawMedia) { +function fallbackMediaToBaseMedia(rawMedia, role) { const baseSources = rawMedia .map(source => toBaseSource(source)) .filter(Boolean); - return baseSourceToBaseMedia(baseSources); + return baseSourceToBaseMedia(baseSources, role); } -function toBaseMedias(rawMedias) { +function toBaseMedias(rawMedias, role) { if (!rawMedias || rawMedias.length === 0) { return []; } @@ -77,12 +145,12 @@ function toBaseMedias(rawMedias) { if (Array.isArray(rawMedia)) { // fallback sources provided - return fallbackMediaToBaseMedia(rawMedia); + return fallbackMediaToBaseMedia(rawMedia, role); } const baseSource = toBaseSource(rawMedia); - return baseSourceToBaseMedia(baseSource); + return baseSourceToBaseMedia(baseSource, role); }).filter(Boolean); } @@ -97,7 +165,6 @@ async function findSourceDuplicates(baseMedias) { .flat() .filter(Boolean); - const [existingSourceMedia, existingExtractMedia] = await Promise.all([ knex('media').whereIn('source', sourceUrls), knex('media').whereIn('source_page', extractUrls), @@ -112,37 +179,183 @@ async function findSourceDuplicates(baseMedias) { }; } -async function extractSource(baseSource) { - if (!baseSource.extract || !baseSource.url) { +async function findHashDuplicates(medias) { + const mediaHashes = medias.map(media => media.file?.hash).filter(Boolean); + const existingHashMedia = await knex('media').whereIn('hash', mediaHashes); + + return itemsByKey(existingHashMedia, 'hash'); +} + +async function extractSource(baseSource, { existingExtractMediaByUrl }) { + if (typeof baseSource.extract !== 'function' || !baseSource.url) { return baseSource; } + const existingExtractMedia = existingExtractMediaByUrl[baseSource.url]; + + if (existingExtractMedia) { + // media entry found by extract URL + return { + ...baseSource, + entry: existingExtractMedia, + src: existingExtractMedia.source, + }; + } + const res = await get(baseSource.url); - console.log(res); - return baseSource; + if (res.ok) { + const src = await baseSource.extract(res.item); + + return { + ...baseSource, + src, + }; + } + + throw new Error(`Could not extract source from ${baseSource.url}: ${res.status}`); } -async function fetchSource(baseSource, { existingSourceMediaByUrl, existingExtractMediaByUrl }) { +async function fetchSource(source, baseMedia, baseSourceIndex) { + logger.silly(`Fetching media from ${source.src}`); // attempts - // extract - const extractedSource = await extractSource(baseSource, existingExtractMediaByUrl); - console.log(extractedSource); + async function attempt(attempts = 1) { + try { + const { pathname } = new URL(source.src); + const mimetype = mime.getType(pathname); + const extension = mime.getExtension(mimetype); + const isImage = /image/.test(mimetype); + + const tempPath = path.join(config.media.path, 'temp', `${baseMedia.id}-${baseSourceIndex}.${extension}`); + + const res = await http.get(source.src, { + ...(source.referer && { referer: source.referer }), + ...(source.host && { host: source.host }), + }, { + stream: true, + }); + + if (!res.ok) { + throw new Error(`Response ${res.status} not OK`); + } + + res.res.pipe(fs.createWriteStream(tempPath)); + + const buffer = res.body; + + console.log(res.body); + + const hash = getHash(buffer); + const entropy = isImage ? await getEntropy(buffer) : null; + const { size, width, height } = isImage ? await getMeta(buffer) : {}; + + logger.silly(`Fetched media from ${source.src}`); + + return { + ...source, + file: { + temp: tempPath, + mimetype, + extension, + hash, + entropy, + size, + width, + height, + }, + }; + } catch (error) { + logger.warn(`Failed attempt ${attempts}/3 to fetch ${source.src}: ${error.message}`); + + if (attempts < 3) { + await Promise.delay(1000); + + return attempt(attempts + 1); + } + + throw new Error(`Failed to fetch ${source.src}: ${error.message}`); + } + } + + return attempt(1); +} + +async function trySource(baseSource, existingMedias, baseMedia, baseSourceIndex) { + // catch error and try the next source + const extractedSource = await extractSource(baseSource, existingMedias); + const existingSourceMedia = existingMedias.existingSourceMediaByUrl[extractedSource.src]; + + if (extractedSource.entry) { + // media entry found during extraction, don't fetch + return extractedSource; + } + + if (existingSourceMedia) { + // media entry found by source URL, don't fetch + return { + ...baseSource, + entry: existingSourceMedia, + src: existingSourceMedia.source, + }; + } + + return fetchSource(extractedSource, baseMedia, baseSourceIndex, 1); } async function fetchMedia(baseMedia, existingMedias) { - await baseMedia.sources.reduce((result, baseSource, _baseSourceIndex) => result.catch(async () => { - await fetchSource(baseSource, existingMedias); - }), Promise.reject(new Error())); + try { + const source = await baseMedia.sources.reduce( + // try each source until success + (result, baseSource, baseSourceIndex) => result.catch(async () => trySource(baseSource, existingMedias, baseMedia, baseSourceIndex)), + Promise.reject(new Error()), + ); + + return { + ...baseMedia, + ...source, + }; + } catch (error) { + logger.warn(error.message); + + return baseMedia; + } +} + +function saveMedia(media, existingHashMediaByHash) { + const existingHashMedia = existingHashMediaByHash[media.file.hash]; + + if (existingHashMedia) { + return { + ...media, + entry: existingHashMedia, + }; + } + + const hashDir = media.file.hash.slice(0, 2); + const hashSubDir = media.file.hash.slice(2, 4); + const hashFilename = media.file.hash.slice(4); + + const filename = media.quality + ? `${hashFilename}_${media.quality}.${media.file.extension}` + : `${hashFilename}.${media.file.extension}`; + + const filedir = path.join(media.role, hashDir, hashSubDir); + const filepath = path.join(filedir, filename); + + console.log(filedir, filepath); + + return media; } async function storeMedias(baseMedias) { + await fsPromises.mkdir(path.join(config.media.path, 'temp'), { recursive: true }); + const { existingSourceMediaByUrl, existingExtractMediaByUrl } = await findSourceDuplicates(baseMedias); + const fetchedMedias = await Promise.map(baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl })); - await Promise.map(baseMedias, async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl })); - - console.log(existingSourceMediaByUrl, existingExtractMediaByUrl); + const existingHashMediaByHash = await findHashDuplicates(fetchedMedias); + const savedMedias = await Promise.map(fetchedMedias, async fetchedMedia => saveMedia(fetchedMedia, existingHashMediaByHash)); } async function associateReleaseMedia(releases) { @@ -153,14 +366,17 @@ async function associateReleaseMedia(releases) { const baseMediasByReleaseId = releases.reduce((acc, release) => ({ ...acc, [release.id]: { - poster: argv.images && argv.poster && toBaseMedias([release.poster]), - photos: argv.images && argv.photos && toBaseMedias(release.photos), - trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer]), - teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser]), + poster: argv.images && argv.poster && toBaseMedias([release.poster], 'posters'), + photos: argv.images && argv.photos && toBaseMedias(release.photos, 'photos').slice(0, 5), + trailer: argv.videos && argv.trailer && toBaseMedias([release.trailer], 'trailers'), + teaser: argv.videos && argv.teaser && toBaseMedias([release.teaser], 'teasers'), }, }), {}); - const baseMedias = Object.values(baseMediasByReleaseId).map(releaseMedia => Object.values(releaseMedia)).flat(2); + const baseMedias = Object.values(baseMediasByReleaseId) + .map(releaseMedia => Object.values(releaseMedia)) + .flat(2) + .filter(Boolean); await storeMedias(baseMedias); } diff --git a/src/scrapers/dogfart.js b/src/scrapers/dogfart.js index acdcb80a..1d9252e4 100644 --- a/src/scrapers/dogfart.js +++ b/src/scrapers/dogfart.js @@ -19,7 +19,7 @@ async function getPhotos(albumUrl) { return { url: pageUrl, - extract: q => q('.scenes-module img', 'src'), + extract: ({ qu }) => qu.q('.scenes-module img', 'src'), }; }); diff --git a/src/sites.js b/src/sites.js index bbe3f6e0..1c9a591f 100644 --- a/src/sites.js +++ b/src/sites.js @@ -36,7 +36,7 @@ async function curateSite(site, includeParameters = false, includeTags = true) { return curatedSite; } -function curateSites(sites, includeParameters) { +async function curateSites(sites, includeParameters) { return Promise.all(sites.map(async site => curateSite(site, includeParameters))); } diff --git a/src/store-releases.js b/src/store-releases.js index 4d11c74d..28da0269 100644 --- a/src/store-releases.js +++ b/src/store-releases.js @@ -7,10 +7,11 @@ const knex = require('./knex'); const slugify = require('./utils/slugify'); const { associateActors } = require('./actors'); const { associateReleaseTags } = require('./tags'); +const { curateSite } = require('./sites'); const { associateReleaseMedia } = require('./media'); function curateReleaseEntry(release, batchId, existingRelease) { - const slug = slugify(release.title, '-', { + const slug = slugify(release.title || release.actors?.join('-') || null, '-', { encode: true, limit: config.titleSlugLength, }); @@ -46,29 +47,34 @@ function curateReleaseEntry(release, batchId, existingRelease) { async function attachChannelSites(releases) { const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isFallback)); - const channelSites = await knex('sites').whereIn('slug', releasesWithoutSite.map(release => release.channel)); + const channelSites = await knex('sites') + .leftJoin('networks', 'networks.id', 'sites.network_id') + .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description') + .whereIn('sites.slug', releasesWithoutSite.map(release => release.channel)); + const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {}); - const releasesWithChannelSite = releases - .map((release) => { + const releasesWithChannelSite = await Promise.all(releases + .map(async (release) => { if (release.site && !release.site.isFallback) { return release; } if (release.channel && channelSitesBySlug[release.channel]) { + const curatedSite = await curateSite(channelSitesBySlug[release.channel]); + return { ...release, - site: channelSitesBySlug[release.channel], + site: curatedSite, }; } logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`); return null; - }) - .filter(Boolean); + })); - return releasesWithChannelSite; + return releasesWithChannelSite.filter(Boolean); } async function attachStudios(releases) { @@ -201,8 +207,6 @@ async function storeReleases(releases) { const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId)); - // console.log(curatedNewReleaseEntries); - const storedReleases = await knex('releases').insert(curatedNewReleaseEntries).returning('*'); // TODO: update duplicate releases diff --git a/src/utils/http.js b/src/utils/http.js index f83da774..281baac0 100644 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -72,10 +72,13 @@ queue.define('http', async ({ const json = Buffer.isBuffer(res.body) ? null : res.body; return { - ...res, + res, html, json, + pipe: res.pipe, + ok: res.statusCode >= 200 && res.statusCode <= 299, code: res.statusCode, + status: res.statusCode, }; }, { concurrency: 20, diff --git a/src/utils/slugify.js b/src/utils/slugify.js index 97f02870..ae5a6d38 100644 --- a/src/utils/slugify.js +++ b/src/utils/slugify.js @@ -5,7 +5,7 @@ function slugify(string, delimiter = '-', { limit = 1000, } = {}) { if (!string) { - return ''; + return string; } const slugComponents = string.trim().toLowerCase().match(/\w+/g);