'use strict';

const config = require('config');
const Promise = require('bluebird');
const path = require('path');
const fs = require('fs-extra');
const bhttp = require('bhttp');
const mime = require('mime');
const sharp = require('sharp');
const blake2 = require('blake2');

const logger = require('./logger')(__filename);
const knex = require('./knex');
const upsert = require('./utils/upsert');
const { ex } = require('./utils/q');
// Compute a 24-byte BLAKE2b digest of a buffer, hex-encoded; used as the
// content identity for media deduplication.
function getHash(buffer) {
  const hasher = blake2.createHash('blake2b', { digestLength: 24 });
  hasher.update(buffer);

  return hasher.digest('hex');
}
// Reduce a photo set to an evenly spread selection of at most `limit`
// photos, always keeping the first one. Falls back to config.media.limit
// when no limit is specified.
function pluckPhotos(photos, specifiedLimit) {
  const limit = specifiedLimit || config.media.limit;

  if (photos.length <= limit) {
    return photos;
  }

  // keep photo 1, then sample the remainder at a regular interval
  const interval = photos.length / (limit - 1);
  const pluckedIndexes = new Set([1]);

  for (let step = 1; step < limit; step += 1) {
    pluckedIndexes.add(Math.round(step * interval));
  }

  // the Set removes duplicate indexes, which may happen when photo total and photo limit are close
  return [...pluckedIndexes].map(photoIndex => photos[photoIndex - 1]);
}
// Measure the entropy of an image buffer via sharp's stats; used downstream
// to filter out low-detail placeholder images. Returns a permissive default
// of 7.5 when the image cannot be analyzed.
async function getEntropy(buffer) {
  try {
    const stats = await sharp(buffer).stats();

    return stats.entropy;
  } catch (error) {
    logger.warn(`Failed to retrieve image entropy, using 7.5: ${error.message}`);

    return 7.5;
  }
}
// Create a JPEG thumbnail from an image buffer, bounded by the configured
// thumbnail height and quality. Returns the thumbnail buffer, or null when
// thumbnailing fails.
async function createThumbnail(buffer) {
  try {
    // await here so sharp failures are caught below; previously the pending
    // promise was returned un-awaited, making the catch/null fallback dead
    // code and leaking rejections to the caller
    const thumbnail = await sharp(buffer)
      .resize({
        height: config.media.thumbnailSize,
        withoutEnlargement: true,
      })
      .jpeg({
        quality: config.media.thumbnailQuality,
      })
      .toBuffer();

    return thumbnail;
  } catch (error) {
    logger.error(`Failed to create thumbnail: ${error.message}`);
  }

  return null;
}
// Ensure a media directory exists under the configured base path and return
// its full path.
async function createMediaDirectory(domain, subpath) {
  const target = path.join(config.media.path, domain, subpath);
  await fs.mkdir(target, { recursive: true });

  return target;
}
// Map saved photo files to row objects for the `media` table, preserving
// their order via the index column.
function curatePhotoEntries(files) {
  return files.map(({ filepath, thumbpath, mimetype, hash, source }, index) => ({
    path: filepath,
    thumbnail: thumbpath,
    mime: mimetype,
    hash,
    source,
    index,
  }));
}
// Split photo sources into [duplicates, originals] by checking the given
// `media` column (identifier) against existing rows. `prop` optionally names
// the property on each source that holds the comparable value; otherwise a
// source's `src` (extracted-photo objects) or the source itself (plain URL
// strings) is used. Array entries are fallback lists: they count as
// duplicate when any of their candidates is already present.
async function findDuplicates(photos, identifier, prop = null) {
  // resolve the value a source is compared by; shared between the query and
  // the originals filter so both sides always agree
  const getComparable = (source) => {
    if (prop) return source[prop];
    if (source.src) return source.src;
    return source;
  };

  const duplicates = await knex('media')
    .whereIn(identifier, photos.flat().map(getComparable));

  const duplicateLookup = new Set(duplicates.map(photo => photo[prop || identifier]));

  const originals = photos.filter((source) => {
    if (Array.isArray(source)) {
      return !source.some(sourceX => duplicateLookup.has(getComparable(sourceX)));
    }

    // previously compared `(source.src && source)` — the object itself, never
    // a match — and yielded undefined for plain strings, so deduplication
    // silently failed when no `prop` was given
    return !duplicateLookup.has(getComparable(source));
  });

  return [duplicates, originals];
}
// Fetch the page a tokenized photo lives on and run the scraper-provided
// extractor over its parsed body. Returns null when the page cannot be
// retrieved.
async function extractPhoto(source) {
  const res = await bhttp.get(source.src);

  if (res.statusCode !== 200) {
    return null;
  }

  const { q } = ex(res.body.toString());

  return source.extract(q);
}
// Retrieve a single photo buffer plus metadata (mimetype, extension, hash,
// entropy, source). Accepts a plain URL string, a { src, extract } page
// source, or an array of fallback URLs tried in order. Retries a failing
// URL up to 3 times with a 5s delay; resolves to null when the photo cannot
// be fetched at all. NOTE(review): when every entry of a fallback array
// fails, the reduce chain resolves to a rejected promise that propagates to
// the caller — confirm callers handle that rejection.
async function fetchPhoto(photoUrl, index, label, attempt = 1) {
  if (photoUrl.src && photoUrl.extract) {
    // source links to page containing a (presumably) tokenized photo
    const photo = await extractPhoto(photoUrl);

    return fetchPhoto(photo, index, label);
  }

  if (Array.isArray(photoUrl)) {
    // fallback sources: chain catches so each URL is tried only after the
    // previous one failed; the seed rejection starts the chain
    return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => {
      const photo = await fetchPhoto(url, index, label);

      if (photo) {
        return photo;
      }

      throw new Error('Photo not available');
    }), Promise.reject(new Error()));
  }

  try {
    const { pathname } = new URL(photoUrl);
    const res = await bhttp.get(photoUrl);

    if (res.statusCode === 200) {
      // derive the mimetype from the URL path, not the response headers
      const mimetype = mime.getType(pathname);
      const extension = mime.getExtension(mimetype);
      const hash = getHash(res.body);
      const entropy = await getEntropy(res.body);

      return {
        photo: res.body,
        mimetype,
        extension,
        hash,
        entropy,
        source: photoUrl,
      };
    }

    throw new Error(`Response ${res.statusCode} not OK`);
  } catch (error) {
    logger.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} for ${label} (${photoUrl}): ${error}`);

    if (attempt < 3) {
      // back off before retrying the same URL (bluebird's Promise.delay)
      await Promise.delay(5000);

      return fetchPhoto(photoUrl, index, label, attempt + 1);
    }

    return null;
  }
}
// Write photo buffers and their generated thumbnails to disk under the
// configured media path. Resolves to per-file metadata (input fields plus
// thumbnail, filepath, thumbpath); a file that fails to store yields null.
async function savePhotos(files, {
  domain = 'release',
  subpath,
  role = 'photo',
  naming = 'index',
}) {
  const storeFile = async (file, index) => {
    try {
      const timestamp = new Date().getTime();
      const thumbnail = await createThumbnail(file.photo);

      // sequential naming ("photo1", "photo2", ...) or timestamp-based
      const filename = naming === 'index'
        ? `${file.role || role}${index + 1}`
        : `${timestamp + index}`;

      const filepath = path.join(`${domain}s`, subpath, `${filename}.${file.extension}`);
      const thumbpath = path.join(`${domain}s`, subpath, `${filename}_thumb.${file.extension}`);

      await Promise.all([
        fs.writeFile(path.join(config.media.path, filepath), file.photo),
        fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
      ]);

      return {
        ...file,
        thumbnail,
        filepath,
        thumbpath,
      };
    } catch (error) {
      logger.error(`Failed to store ${domain} ${role} to ${subpath}: ${error.message}`);

      return null;
    }
  };

  return Promise.all(files.map(storeFile));
}
// Orchestrate photo storage for a target entity: dedupe by source URL,
// fetch, filter by entropy, dedupe by content hash, save to disk, insert
// into `media`, then associate the photos with the target via the
// `<domain>s_<role>s` join table (and optionally a primary-role table).
async function storePhotos(photos, {
  domain = 'release',
  role = 'photo',
  naming = 'index',
  targetId,
  subpath,
  primaryRole, // role to assign to first photo if not already in database, used mainly for avatars
  entropyFilter = 2.5, // filter out fallback avatars and other generic clipart
}, label) {
  if (!photos || photos.length === 0) {
    logger.info(`No ${role}s available for ${label}`);
    return;
  }

  const pluckedPhotos = pluckPhotos(Array.from(new Set(photos))); // pre-filter link duplicates, limit total per configuration
  const [sourceDuplicates, sourceOriginals] = await findDuplicates(pluckedPhotos, 'source', null, label);

  logger.info(`Fetching ${sourceOriginals.length} new ${role}s, ${sourceDuplicates.length} already present by source for ${label}`);

  // bluebird Promise.map with bounded concurrency; drop fetch failures and
  // low-entropy (placeholder) images
  const metaFiles = await Promise.map(sourceOriginals, async (photoUrl, index) => fetchPhoto(photoUrl, index, label), {
    concurrency: 10,
  }).filter(photo => photo && photo.entropy > entropyFilter);

  const metaFilesByHash = metaFiles.reduce((acc, photo) => ({ ...acc, [photo.hash]: photo }), {}); // pre-filter hash duplicates within set; may occur through fallbacks
  const [hashDuplicates, hashOriginals] = await findDuplicates(Object.values(metaFilesByHash), 'hash', 'hash', label);

  logger.info(`Saving ${hashOriginals.length} new ${role}s, ${hashDuplicates.length} already present by hash for ${label}`);

  const savedPhotos = await savePhotos(hashOriginals, {
    domain,
    role,
    targetId,
    subpath,
    naming,
  });

  // files that failed to save are null and are filtered out here
  const curatedPhotoEntries = curatePhotoEntries(savedPhotos.filter(Boolean), domain, role, targetId);

  const newPhotos = await knex('media').insert(curatedPhotoEntries).returning('*');
  const photoEntries = Array.isArray(newPhotos)
    ? [...sourceDuplicates, ...hashDuplicates, ...newPhotos]
    : [...sourceDuplicates, ...hashDuplicates];

  // rows for the many-to-many join table, e.g. { release_id, media_id }
  const photoAssociations = photoEntries
    .map(photoEntry => ({
      [`${domain}_id`]: targetId,
      media_id: photoEntry.id,
    }));

  if (primaryRole) {
    // store one photo as a 'primary' photo, such as an avatar or cover
    const primaryPhoto = await knex(`${domain}s_${primaryRole}s`)
      .where(`${domain}_id`, targetId)
      .first();

    if (primaryPhoto) {
      // a primary photo already exists: keep it, associate the rest as regular
      const remainingAssociations = photoAssociations.filter(association => association.media_id !== primaryPhoto.media_id);

      await upsert(`${domain}s_${role}s`, remainingAssociations, [`${domain}_id`, 'media_id']);
      return;
    }

    // no primary yet: first association becomes primary, the rest regular
    await Promise.all([
      upsert(`${domain}s_${primaryRole}s`, photoAssociations.slice(0, 1), [`${domain}_id`, 'media_id']),
      upsert(`${domain}s_${role}s`, photoAssociations.slice(1), [`${domain}_id`, 'media_id']),
    ]);

    return;
  }

  await upsert(`${domain}s_${role}s`, photoAssociations, [`${domain}_id`, 'media_id']);
}
/ *
async function storeReleasePhotos ( releases , label ) {
const sources = releases . map ( release => pluckPhotos ( release . photos ) ) . flat ( ) ;
const uniqueSources = Array . from ( new Set ( sources ) ) ;
const [ sourceDuplicates , sourceOriginals ] = await findDuplicates ( uniqueSources , 'source' , null , label ) ;
const metaFiles = await Promise . map (
sourceOriginals ,
async ( photoUrl , index ) => fetchPhoto ( photoUrl , index , label ) ,
{ concurrency : 10 } ,
)
. filter ( photo => photo ) ;
const hashUniques = Object . values ( metaFiles . reduce ( ( acc , file ) => {
if ( ! acc [ file . hash ] ) acc [ file . hash ] = file ;
return acc ;
} , { } ) ) ;
const [ hashDuplicates , hashOriginals ] = await findDuplicates ( hashUniques , 'hash' , 'hash' , label ) ;
const sourceHashes = metaFiles . concat ( sourceDuplicates ) . reduce ( ( acc , file ) => {
acc [ file . source ] = file . hash ;
return acc ;
} , { } ) ;
const associations = releases . map ( release => release . photos . map ( source => [ release . id , sourceHashes [ source ] ] ) ) . flat ( ) ;
console . log ( associations ) ;
}
* /
2019-12-13 02:28:52 +00:00
// Fetch and store a trailer video for a release: pick the configured quality
// when multiple qualities are supplied, dedupe by source and content hash,
// write to disk, insert into `media`, and associate via releases_trailers.
async function storeTrailer(trailers, {
  domain = 'releases',
  role = 'trailer',
  targetId,
  subpath,
}, label) {
  // support scrapers supplying multiple qualities
  const trailer = Array.isArray(trailers)
    ? trailers.find(trailerX => config.media.trailerQuality.includes(trailerX.quality)) || trailers[0]
    : trailers;

  if (!trailer || !trailer.src) {
    logger.info(`No ${role} available for ${label}`);
    return;
  }

  const [sourceDuplicates, sourceOriginals] = await findDuplicates([trailer], 'source', 'src', label);

  const metaFiles = await Promise.map(sourceOriginals, async (trailerX) => {
    const { pathname } = new URL(trailerX.src);
    // prefer the scraper-declared type, fall back to the URL's extension
    const mimetype = trailerX.type || mime.getType(pathname);

    const res = await bhttp.get(trailerX.src);
    const hash = getHash(res.body);
    // e.g. releases/<subpath>/trailer_720p.mp4
    const filepath = path.join(domain, subpath, `${role}${trailerX.quality ? `_${trailerX.quality}` : ''}.${mime.getExtension(mimetype)}`);

    return {
      trailer: res.body,
      path: filepath,
      mime: mimetype,
      source: trailerX.src,
      quality: trailerX.quality || null,
      hash,
    };
  });

  const [hashDuplicates, hashOriginals] = await findDuplicates(metaFiles, 'hash', 'hash', label);

  const newTrailers = await knex('media')
    .insert(hashOriginals.map(trailerX => ({
      path: trailerX.path,
      mime: trailerX.mime,
      source: trailerX.source,
      quality: trailerX.quality,
      hash: trailerX.hash,
      type: role,
    })))
    .returning('*');

  await Promise.all(hashOriginals.map(trailerX => fs.writeFile(path.join(config.media.path, trailerX.path), trailerX.trailer)));

  const trailerEntries = Array.isArray(newTrailers)
    ? [...sourceDuplicates, ...hashDuplicates, ...newTrailers]
    : [...sourceDuplicates, ...hashDuplicates];

  await upsert(`releases_${role}s`, trailerEntries.map(trailerEntry => ({
    release_id: targetId,
    media_id: trailerEntry.id,
  })), ['release_id', 'media_id']);
}
// Public interface of the media module.
module.exports = {
  createMediaDirectory,
  storePhotos,
  // storeReleasePhotos,
  storeTrailer,
};