2019-11-10 03:20:22 +00:00
'use strict' ;
2020-05-14 02:26:05 +00:00
const config = require ( 'config' ) ;
const Promise = require ( 'bluebird' ) ;
2020-05-17 01:00:44 +00:00
const moment = require ( 'moment' ) ;
2020-05-19 02:46:49 +00:00
const blake2 = require ( 'blake2' ) ;
const DOMPurify = require ( 'dompurify' ) ;
const { JSDOM } = require ( 'jsdom' ) ;
const { window } = new JSDOM ( '' ) ;
const domPurify = DOMPurify ( window ) ;
2020-05-14 02:26:05 +00:00
2020-05-13 00:56:20 +00:00
// const logger = require('./logger')(__filename);
2020-03-26 02:32:07 +00:00
const knex = require ( './knex' ) ;
2020-05-15 02:40:59 +00:00
const scrapers = require ( './scrapers/scrapers' ) . actors ;
2020-05-14 02:26:05 +00:00
const argv = require ( './argv' ) ;
2020-05-15 02:40:59 +00:00
const include = require ( './utils/argv-include' ) ( argv ) ;
const logger = require ( './logger' ) ( _ _filename ) ;
2020-05-17 01:00:44 +00:00
const { toBaseReleases } = require ( './deep' ) ;
const { associateAvatars } = require ( './media' ) ;
2020-01-07 03:23:28 +00:00
const slugify = require ( './utils/slugify' ) ;
2020-03-26 02:32:07 +00:00
const capitalize = require ( './utils/capitalize' ) ;
2020-05-15 02:40:59 +00:00
const resolvePlace = require ( './utils/resolve-place' ) ;
2020-05-18 23:10:32 +00:00
// Lookup tables used by curateProfile to normalize free-form scraper values.
// Keys are the lower-cased values to recognise, values are the canonical form that gets stored.
const hairColors = {
  'jet-black': 'black',
  'red-head': 'red',
  'soft-black': 'black',
  black: 'black',
  blonde: 'blonde',
  blondie: 'blonde',
  brown: 'brown',
  brunette: 'brown',
  fair: 'blonde',
  raven: 'black',
  red: 'red',
  redhead: 'red',
  blue: 'blue',
  green: 'green',
  purple: 'purple',
  pink: 'pink',
};

const eyeColors = {
  blue: 'blue',
  brown: 'brown',
  dark: 'brown',
  gray: 'gray',
  green: 'green',
  grey: 'gray',
  hazel: 'hazel',
};

const ethnicities = {
  'african american': 'black',
  'african-american': 'black',
  'native american': 'native american',
  african: 'black',
  arabic: 'arabic',
  aravic: 'arabic', // misspelled key, kept so values that matched before still match
  asian: 'asian',
  black: 'black',
  caucasian: 'white',
  european: 'white',
  hispanic: 'latin',
  indian: 'indian',
  japanese: 'japanese',
  latin: 'latin',
  latina: 'latina',
  latino: 'latino',
  white: 'white',
};
2020-05-17 01:00:44 +00:00
/**
 * Pick the value that occurs most often in a list.
 *
 * Values are counted by their slug, so items that slugify to the same string are
 * counted together; the first item seen for the winning slug is the one returned.
 * On a tie the value that reached the winning count first is kept.
 *
 * @param {Array} items - Values to count; anything that is not an array yields null.
 * @returns {*} The most frequent item, or null when there are no items.
 */
function getMostFrequent(items) {
  if (!Array.isArray(items)) {
    return null;
  }

  const counts = {};
  let mostFrequent = null;
  let mostFrequentSlug = null;

  for (const item of items) {
    const slug = slugify(item);

    counts[slug] = (counts[slug] || 0) + 1;

    // only a missing winner counts as "unset"; a falsy winner such as 0 (January) or false
    // must not be displaced by every value that follows it
    if (mostFrequent == null || counts[slug] > counts[mostFrequentSlug]) {
      mostFrequent = item;
      mostFrequentSlug = slug;
    }
  }

  return mostFrequent;
}
/**
 * Compose a date from the most frequent year, month and day of month in a list of dates.
 *
 * Each part is selected independently with getMostFrequent, so the result does not have
 * to equal any single input date.
 *
 * @param {Date[]} dates - Dates to assess.
 * @returns {Date|null} The composed date, or null when a part is missing or the parts
 *   do not form an existing calendar date.
 */
function getMostFrequentDate(dates) {
  const year = getMostFrequent(dates.map(dateX => dateX.getFullYear()));
  const month = getMostFrequent(dates.map(dateX => dateX.getMonth()));
  const date = getMostFrequent(dates.map(dateX => dateX.getDate()));

  if (year === null || month === null || date === null) {
    return null;
  }

  const mostFrequentDate = moment({ year, month, date });

  // independently chosen parts can add up to an impossible date (e.g. February 30th);
  // return null instead of handing an Invalid Date to the caller
  return mostFrequentDate.isValid() ? mostFrequentDate.toDate() : null;
}
/**
 * Return the item with the greatest `length`.
 *
 * @param {Array<string|Array>} items - Items to compare; left untouched.
 * @returns {*} The longest item (the earliest one on a tie), or null when the list is
 *   empty or the longest item is falsy (e.g. an empty string).
 */
function getLongest(items) {
  // sort a copy: Array.prototype.sort works in place and would reorder the caller's array
  return [...items].sort((itemA, itemB) => itemB.length - itemA.length)[0] || null;
}
/**
 * Compute the rounded arithmetic mean of a list of numbers.
 *
 * @param {number[]} items - Values to average.
 * @returns {number|null} The mean rounded to the nearest integer; null when the list is
 *   empty or the rounded mean is 0.
 */
function getAverage(items) {
  let total = 0;

  for (const item of items) {
    total += item;
  }

  const average = Math.round(total / items.length);

  // NaN (empty list) and 0 both fall through to null
  return average || null;
}
2019-11-10 03:20:22 +00:00
2020-03-26 02:32:07 +00:00
/**
 * Normalize scraped actors into base actors with a capitalized name and a slug.
 *
 * An actor is either a name string or an object with a `name` property. The name may
 * carry an entry ID after a colon (`name:entryId`). Extra properties of actor objects
 * are preserved, but name, slug, entryId and entity are always overwritten.
 *
 * @param {Array<string|Object>} actorsOrNames - Actor names or actor objects.
 * @param {Object} [release] - Release the actors belong to; used to derive the entity.
 * @returns {Object[]} Base actors shaped `{ name, slug, entryId, entity, ...rest }`.
 */
function toBaseActors(actorsOrNames, release) {
  // prefer the network of the release's site, then the entity's parent, then the entity itself
  const entity = release?.site?.network || release?.entity?.parent || release?.entity || null;

  return actorsOrNames.map((actorOrName) => {
    const rawName = actorOrName.name || actorOrName;
    const [baseName, entryId] = rawName.split(':');
    const name = capitalize(baseName);

    const baseActor = {
      name,
      slug: slugify(name),
      entryId: entryId || null,
      entity,
    };

    return actorOrName.name
      ? { ...actorOrName, ...baseActor }
      : baseActor;
  });
}
2020-07-17 01:39:13 +00:00
/**
 * Curate a joined country row plus state and city into a place object.
 *
 * @param {Object|null} country - Country row with `alpha2`, `name` and `alias`.
 * @param {string|null} state - State name.
 * @param {string|null} city - City name.
 * @returns {Object|null} The place, or the falsy `country` value when no country is known.
 */
function curatePlace(country, state, city) {
  return country && {
    country: {
      alpha2: country.alpha2,
      name: country.name,
      alias: country.alias,
    },
    state,
    city,
  };
}

/**
 * Convert an actor (or actor profile) database row into the camelCased shape returned to callers.
 *
 * @param {Object|null} actor - Row with snake_cased columns and optionally joined
 *   `alias`, `entity`, `birth_country`, `residence_country`, `avatar` and `profiles`.
 * @param {boolean} [withDetails=false] - Also include alias, entity, physical attributes,
 *   places, avatar and profiles.
 * @param {boolean} [isProfile=false] - Also include the description; only effective
 *   together with `withDetails`.
 * @returns {Object|null} The curated actor, or null when no actor was given.
 */
function curateActor(actor, withDetails = false, isProfile = false) {
  if (!actor) {
    return null;
  }

  const curatedActor = {
    id: actor.id,
    name: actor.name,
    slug: actor.slug,
    gender: actor.gender,
    entityId: actor.entity_id,
    aliasFor: actor.alias_for,
    dateOfBirth: actor.date_of_birth,
    birthCountry: actor.birth_country_alpha2,
    ...(withDetails && {
      alias: actor.alias && {
        id: actor.alias.id,
        name: actor.alias.name,
        slug: actor.alias.slug, // the alias' own slug, not the slug of the actor pointing at it
        gender: actor.alias.gender,
      },
      entity: actor.entity && {
        id: actor.entity.id,
        name: actor.entity.name,
        slug: actor.entity.slug,
      },
      dateOfDeath: actor.date_of_death,
      cup: actor.cup,
      bust: actor.bust,
      waist: actor.waist,
      hip: actor.hip,
      naturalBoobs: actor.natural_boobs,
      height: actor.height,
      weight: actor.weight,
      eyes: actor.eyes,
      hairColor: actor.hair_color,
      hasTattoos: actor.has_tattoos,
      hasPiercings: actor.has_piercings,
      tattoos: actor.tattoos,
      piercings: actor.piercings,
      ...(isProfile && { description: actor.description }),
      placeOfBirth: curatePlace(actor.birth_country, actor.birth_state, actor.birth_city),
      placeOfResidence: curatePlace(actor.residence_country, actor.residence_state, actor.residence_city),
      avatar: actor.avatar && {
        id: actor.avatar.id,
        path: actor.avatar.path,
        width: actor.avatar.width,
        height: actor.avatar.height,
        size: actor.avatar.size,
        source: actor.avatar.source,
      },
      // profiles are curated with the same function, as detailed profile entries
      ...(actor.profiles && { profiles: actor.profiles.map(profile => curateActor(profile, true, true)) }),
    }),
  };

  return curatedActor;
}
2020-05-13 00:56:20 +00:00
/**
 * Build the row to insert into the actors table for a base actor.
 *
 * @param {Object} baseActor - Base actor with `name`, `slug` and `entryId`.
 * @param {number} batchId - ID of the batch the actor is created in.
 * @returns {Object} Row with snake_cased columns; `entity_id` is always null here.
 */
function curateActorEntry(baseActor, batchId) {
  const { name, slug, entryId } = baseActor;

  return {
    name,
    slug,
    entity_id: null,
    entry_id: entryId,
    batch_id: batchId,
  };
}
2020-05-13 00:56:20 +00:00
/**
 * Build actors table rows for a list of base actors.
 *
 * @param {Object[]} baseActors - Base actors to convert.
 * @param {number} batchId - ID of the batch assigned to every row.
 * @returns {Object[]} One row per base actor, in the same order.
 */
function curateActorEntries(baseActors, batchId) {
  const actorEntries = [];

  for (const baseActor of baseActors) {
    actorEntries.push(curateActorEntry(baseActor, batchId));
  }

  return actorEntries;
}
2020-05-15 02:40:59 +00:00
/**
 * Build the actors_profiles row for a curated profile.
 *
 * @param {Object} profile - Curated profile; `profile.id` is the actor ID and
 *   `profile.update` is the ID of the profile row to update, or false for a new row.
 * @returns {Object} Row with snake_cased columns. `id` is left out only when
 *   `profile.update` is exactly false.
 */
function curateProfileEntry(profile) {
  const { placeOfBirth, placeOfResidence } = profile;
  const idColumn = profile.update === false ? {} : { id: profile.update };

  return {
    ...idColumn,
    actor_id: profile.id,
    entity_id: profile.entity?.id || null,
    date_of_birth: profile.dateOfBirth,
    date_of_death: profile.dateOfDeath,
    gender: profile.gender,
    ethnicity: profile.ethnicity,
    description: profile.description,
    description_hash: profile.descriptionHash,
    birth_city: placeOfBirth?.city || null,
    birth_state: placeOfBirth?.state || null,
    birth_country_alpha2: placeOfBirth?.country || null,
    residence_city: placeOfResidence?.city || null,
    residence_state: placeOfResidence?.state || null,
    residence_country_alpha2: placeOfResidence?.country || null,
    cup: profile.cup,
    bust: profile.bust,
    waist: profile.waist,
    hip: profile.hip,
    natural_boobs: profile.naturalBoobs,
    height: profile.height,
    weight: profile.weight,
    hair_color: profile.hairColor,
    eyes: profile.eyes,
    has_tattoos: profile.hasTattoos,
    has_piercings: profile.hasPiercings,
    piercings: profile.piercings,
    tattoos: profile.tattoos,
    avatar_media_id: profile.avatarMediaId || null,
  };
}
/**
 * Parse a body measurement into a number.
 *
 * @param {number|string} value - Raw value, e.g. `34`, `'34'` or `'34C'`.
 * @returns {number|null} The value as a number, else the first run of digits in a
 *   string as a number, else null.
 */
function curateMeasurement(value) {
  return Number(value) || Number(value?.match?.(/\d+/)?.[0]) || null;
}

/**
 * Normalize a raw scraper profile into a curated profile.
 *
 * Sanitizes and hashes the description, maps ethnicity, hair and eye color onto the
 * canonical values from the lookup tables, normalizes gender, dates and measurements,
 * resolves places (when `argv.resolvePlace` is set), derives the country of birth from
 * the nationality when no place of birth is known, and validates social links.
 *
 * @param {Object|null} profile - Scraped profile merged with the actor, `entity` and `update`.
 * @returns {Promise<Object|null>} The curated profile; null when no profile was given
 *   or when curation failed (the failure is logged).
 */
async function curateProfile(profile) {
  if (!profile) {
    return null;
  }

  try {
    // only used for log messages; must not throw when a profile comes without entity
    const scraperName = profile.entity?.name;

    const curatedProfile = {
      id: profile.id,
      name: profile.name,
      avatar: profile.avatar,
      scraper: profile.scraper,
      entity: profile.entity,
      update: profile.update,
    };

    // collapse whitespace and strip all markup from the description
    curatedProfile.description = domPurify.sanitize(profile.description?.replace(/\s+/g, ' '), { ALLOWED_TAGS: [] }).trim() || null;

    // hash of the slugified description
    curatedProfile.descriptionHash = curatedProfile.description && blake2
      .createHash('blake2b', { digestLength: 24 })
      .update(Buffer.from(slugify(curatedProfile.description)))
      .digest('hex');

    curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available

    curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null;
    curatedProfile.hairColor = hairColors[(profile.hairColor || profile.hair)?.toLowerCase().replace('hair', '').trim()] || null;
    curatedProfile.eyes = eyeColors[profile.eyes?.trim().toLowerCase()] || null;

    curatedProfile.tattoos = profile.tattoos?.trim() || null;
    curatedProfile.piercings = profile.piercings?.trim() || null;

    // order matters: 'female' and 'shemale' both contain 'male'
    curatedProfile.gender = (/female/i.test(profile.gender) && 'female')
      || (/shemale/i.test(profile.gender) && 'transsexual')
      || (/male/i.test(profile.gender) && 'male')
      || (/trans/i.test(profile.gender) && 'transsexual')
      || null;

    const dateOfBirth = profile.dateOfBirth || profile.birthdate;

    curatedProfile.dateOfBirth = (!Number.isNaN(Number(dateOfBirth)) // possibly valid date
      && new Date() - dateOfBirth > 567648000000 // over 18
      && dateOfBirth)
      || null;

    curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;

    // the cup size may be embedded in the bust value, e.g. '34C'
    curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null;

    // measurements always end up as numbers, also when parsed out of a string
    curatedProfile.bust = curateMeasurement(profile.bust);
    curatedProfile.waist = curateMeasurement(profile.waist);
    curatedProfile.hip = curateMeasurement(profile.hip);
    curatedProfile.height = curateMeasurement(profile.height);
    curatedProfile.weight = curateMeasurement(profile.weight);

    curatedProfile.naturalBoobs = typeof profile.naturalBoobs === 'boolean' ? profile.naturalBoobs : null;
    curatedProfile.hasTattoos = typeof profile.hasTattoos === 'boolean' ? profile.hasTattoos : null;
    curatedProfile.hasPiercings = typeof profile.hasPiercings === 'boolean' ? profile.hasPiercings : null;

    if (argv.resolvePlace) {
      const [placeOfBirth, placeOfResidence] = await Promise.all([
        resolvePlace(profile.birthPlace),
        resolvePlace(profile.residencePlace),
      ]);

      curatedProfile.placeOfBirth = placeOfBirth;
      curatedProfile.placeOfResidence = placeOfResidence;
    }

    if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
      // wildcards wrap the nationality directly; padding inside the pattern would only match values surrounded by spaces
      const nationalityPattern = `%${curatedProfile.nationality}%`;

      const country = await knex('countries')
        .where('nationality', 'ilike', nationalityPattern)
        .orWhere('alpha3', 'ilike', nationalityPattern)
        .orWhere('alpha2', 'ilike', nationalityPattern)
        .orderBy('priority', 'desc')
        .first();

      if (country) {
        curatedProfile.placeOfBirth = {
          country: country.alpha2,
        };
      }
    }

    // keep only social links that parse as a URL, in normalized form
    curatedProfile.social = Array.isArray(profile.social)
      ? profile.social.map((social) => {
        try {
          const { href } = new URL(social);

          return href;
        } catch (error) {
          logger.warn(`Profile scraper for '${scraperName}' returned invalid social link: ${social}`);
          return null;
        }
      }).filter(Boolean)
      : [];

    curatedProfile.releases = toBaseReleases(profile.releases);

    if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${scraperName}' scraper: ${profile.ethnicity}`);
    if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${scraperName}' scraper: ${profile.hairColor || profile.hair}`);
    if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${scraperName}' scraper: ${profile.eyes}`);

    return curatedProfile;
  } catch (error) {
    logger.error(`Failed to curate '${profile.name}': ${error.message}`);

    return null;
  }
}
2020-05-17 01:00:44 +00:00
/**
 * Bundle location parts into one object, leaving out the parts that are not known.
 *
 * @param {string|null} country - Country code.
 * @param {string|null} state - State name.
 * @param {string|null} city - City name.
 * @returns {Object} Object holding only the truthy parts; empty when none is known.
 */
function toLocation(country, state, city) {
  return {
    ...(country && { country }),
    ...(state && { state }),
    ...(city && { city }),
  };
}

/**
 * Select the most frequent country, then the most frequent state within that country,
 * then the most frequent city within that state, so the three parts match up.
 *
 * @param {Object[]} locations - Locations as produced by toLocation.
 * @returns {{country: *, state: *, city: *}} The selected parts; null where unknown.
 */
function getMostFrequentLocation(locations) {
  const country = getMostFrequent(locations.map(location => location.country));
  const locationsInCountry = locations.filter(location => location.country === country);

  const state = getMostFrequent(locationsInCountry.map(location => location.state));
  // when no state is known, every location in the country remains a candidate for the city
  const locationsInState = locationsInCountry.filter(location => !state || location.state === state);

  const city = getMostFrequent(locationsInState.map(location => location.city));

  return { country, state, city };
}

/**
 * Merge the profiles stored for each actor into a single set of values and write
 * those values to the actors table.
 *
 * Per property either the most frequent value (weighted by profile priority), the
 * average (weight), the longest value (tattoos, piercings) or the tallest avatar is
 * chosen. All updates run in one transaction; on failure the error is logged and the
 * transaction is rolled back.
 *
 * @param {Object[]} actors - Actors, each with an `id`, whose profiles to interpolate.
 * @returns {Promise<void>}
 */
async function interpolateProfiles(actors) {
  const profiles = await knex('actors_profiles')
    .select(['actors_profiles.*', 'media.width as avatar_width', 'media.height as avatar_height', 'media.size as avatar_size'])
    .whereIn('actor_id', actors.map(actor => actor.id))
    .leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id');

  const profilesByActorId = profiles.reduce((acc, profile) => ({
    ...acc,
    [profile.actor_id]: [
      ...(acc[profile.actor_id] || []),
      profile,
    ],
  }), {});

  const interpolatedProfiles = Object.entries(profilesByActorId).map(([actorId, actorProfiles]) => {
    // group values from each profile
    const valuesByProperty = actorProfiles.reduce((acc, profile) => Object
      .entries(profile)
      .reduce((profileAcc, [property, value]) => ({
        ...profileAcc,
        [property]: [
          ...(acc[property] || []),
          ...(value === null ? [] : Array.from({ length: profile.priority }, () => value)), // multiply by priority, increasing the odds of being the most frequent value
        ],
      }), {
        // bundle location values so they can be assessed together, to ensure the most frequent city is in the most frequent state is in most frequent country
        origin: [
          ...(acc.origin || []),
          toLocation(profile.birth_country_alpha2, profile.birth_state, profile.birth_city),
        ].filter(location => Object.keys(location).length > 0),
        residence: [
          ...(acc.residence || []),
          toLocation(profile.residence_country_alpha2, profile.residence_state, profile.residence_city),
        ].filter(location => Object.keys(location).length > 0),
      }), {});

    const avatars = actorProfiles.map(profile => profile.avatar_media_id && ({
      id: profile.avatar_media_id,
      width: profile.avatar_width,
      height: profile.avatar_height,
      size: profile.avatar_size,
    })).filter(Boolean);

    const mostFrequentValues = Object.fromEntries([
      'gender',
      'ethnicity',
      'cup',
      'bust',
      'waist',
      'hip',
      'natural_boobs',
      'height',
      'hair_color',
      'eyes',
      'has_tattoos',
      'has_piercings',
    ].map(property => [property, getMostFrequent(valuesByProperty[property])]));

    const profile = {
      id: actorId,
      ...mostFrequentValues,
    };

    profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth);
    profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death);

    // ensure most frequent country, city and state match up
    const origin = getMostFrequentLocation(valuesByProperty.origin);

    profile.birth_country_alpha2 = origin.country;
    profile.birth_state = origin.state;
    profile.birth_city = origin.city;

    const residence = getMostFrequentLocation(valuesByProperty.residence);

    profile.residence_country_alpha2 = residence.country;
    profile.residence_state = residence.state;
    profile.residence_city = residence.city;

    profile.weight = getAverage(valuesByProperty.weight);

    profile.tattoos = getLongest(valuesByProperty.tattoos);
    profile.piercings = getLongest(valuesByProperty.piercings);

    // prefer the tallest avatar
    profile.avatar_media_id = avatars.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;

    return profile;
  });

  const transaction = await knex.transaction();

  try {
    await Promise.all(interpolatedProfiles.map(profile => knex('actors')
      .where('id', profile.id)
      .update(profile)
      .transacting(transaction)));

    await transaction.commit();
  } catch (error) {
    // saving stays best-effort, but a failed update must not go unnoticed
    logger.error(`Failed to save interpolated actor profiles: ${error.message}`);

    await transaction.rollback(error);
  }
}
/**
 * Store curated profiles in the actors_profiles table.
 *
 * Profiles without `update` are inserted as new rows. Profiles with `update` (the ID of
 * an existing row) are only written when `argv.force` is set; those updates run in one
 * transaction that is rolled back, with the error logged, when any of them fails.
 *
 * @param {Object[]} profiles - Curated profiles, including avatar media IDs.
 * @returns {Promise<void>}
 */
async function upsertProfiles(profiles) {
  const newProfileEntries = profiles.filter(profile => !profile.update).map(profile => curateProfileEntry(profile));
  const updatingProfileEntries = profiles.filter(profile => profile.update).map(profile => curateProfileEntry(profile));

  if (newProfileEntries.length > 0) {
    await knex.batchInsert('actors_profiles', newProfileEntries);

    logger.info(`Saved ${newProfileEntries.length} actor profiles`);
  }

  if (argv.force && updatingProfileEntries.length > 0) {
    const transaction = await knex.transaction();

    try {
      await Promise.all(updatingProfileEntries.map(profileEntry => knex('actors_profiles')
        .where('id', profileEntry.id)
        .update(profileEntry)
        .returning(['id', 'actor_id'])
        .transacting(transaction)));

      await transaction.commit();

      // only report the update once it has actually been committed
      logger.info(`Updated ${updatingProfileEntries.length} actor profiles`);
    } catch (error) {
      logger.error(`Failed to update ${updatingProfileEntries.length} actor profiles: ${error.message}`);

      await transaction.rollback(error);
    }
  }
}
2020-06-25 00:26:25 +00:00
/**
 * Fetch and curate the profile of one actor from one scraper.
 *
 * @param {Object} actor - Actor entry to find a profile for.
 * @param {string} scraperSlug - Slug identifying both the scraper and its entity.
 * @param {Object} entitiesBySlug - Entities keyed by slug.
 * @param {Object} existingProfilesByActorEntityId - Stored profiles keyed by actor ID, then entity ID.
 * @returns {Promise<Object|null>} The curated profile; null when a profile is already
 *   stored and `argv.force` is not set, or when curation failed.
 * @throws {Error} When the scraper or entity is missing, or no profile is available
 *   (the latter with code `PROFILE_NOT_AVAILABLE`).
 */
async function scrapeProfileFromSource(actor, scraperSlug, entitiesBySlug, existingProfilesByActorEntityId) {
  const scraper = scrapers[scraperSlug];
  const entity = entitiesBySlug[scraperSlug] || null;

  const context = {
    ...entity,
    // legacy
    site: entity,
    network: entity?.parent,
    entity,
    scraper: scraperSlug,
  };

  const label = context.entity?.name;

  if (!scraper?.fetchProfile) {
    logger.warn(`No profile scraper available for ${scraperSlug}`);
    throw new Error(`No profile scraper available for ${scraperSlug}`);
  }

  if (!context.entity) {
    logger.warn(`No entity found for ${scraperSlug}`);
    throw new Error(`No entity found for ${scraperSlug}`);
  }

  const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];

  if (existingProfile && !argv.force) {
    logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);

    return null;
  }

  logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);

  const profile = await scraper.fetchProfile(actor, context, include);

  if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
    logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
    throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
  }

  logger.verbose(`Found profile for '${actor.name}' on '${label}'`);

  return curateProfile({
    ...actor,
    ...profile,
    entity,
    update: existingProfile?.id || false,
  });
}

/**
 * Scrape the profiles of one actor from all configured sources.
 *
 * A source is either a single scraper slug or a group of slugs; the slugs in a group
 * are tried in order until one of them does not fail.
 *
 * @param {Object} actor - Actor entry to find profiles for.
 * @param {Array<string|string[]>} sources - Scraper slugs, or groups of scraper slugs.
 * @param {Object} entitiesBySlug - Entities keyed by slug.
 * @param {Object} existingProfilesByActorEntityId - Stored profiles keyed by actor ID, then entity ID.
 * @returns {Promise<Object[]>} The curated profiles that were found.
 */
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
  const profiles = await Promise.map(sources, async (source) => {
    // reported when the source does not list any scraper at all
    let lastError = new Error();

    // config may group sources to try until success
    for (const scraperSlug of [].concat(source)) {
      try {
        // deliberately sequential: the next scraper is only a fallback for the previous one
        return await scrapeProfileFromSource(actor, scraperSlug, entitiesBySlug, existingProfilesByActorEntityId);
      } catch (error) {
        if (error.code !== 'PROFILE_NOT_AVAILABLE') {
          logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
        }

        // remember the error and try the next source
        lastError = error;
      }
    }

    if (lastError.code !== 'PROFILE_NOT_AVAILABLE') {
      logger.error(`Failed to fetch profile for '${actor.name}': ${lastError.message}`);
    }

    return null;
  });

  return profiles.filter(Boolean);
}
2020-08-12 18:51:08 +00:00
/**
 * Determine the names of the actors to scrape.
 *
 * @param {string[]} actorNames - Explicitly requested actor names.
 * @returns {Promise<string[]>} The requested names when any were given; otherwise the
 *   names of all actors for which no profile exists that was updated at or before
 *   `argv.actorsUpdate` (defaulting to now).
 */
async function getActorNames(actorNames) {
  if (actorNames.length > 0) {
    return actorNames;
  }

  // NOTE(review): with the default cutoff this selects actors without any profile; confirm
  // that `<=` is the intended comparison when an explicit cutoff date is passed
  const actorsWithoutProfiles = await knex.raw(`
    SELECT actors.name
    FROM actors
    WHERE NOT EXISTS (
      SELECT *
      FROM actors_profiles
      WHERE actors_profiles.actor_id = actors.id
      AND actors_profiles.updated_at <= (?)
    )
  `, [argv.actorsUpdate || new Date()]);

  return actorsWithoutProfiles.rows.map(actor => actor.name);
}
/**
 * Scrape profiles for the given actors, creating actor entries that do not exist yet.
 *
 * Sources are taken from `argv.sources`, then `config.profiles`, then every available
 * actor scraper. With `argv.inspect` the profiles are printed; with `argv.save` avatars
 * are associated, profiles are stored and the actors' values are interpolated from them.
 *
 * @param {string[]} argNames - Actor names, optionally as `name:entryId`; when empty,
 *   getActorNames selects the actors.
 * @returns {Promise<Object[]>} The curated profiles that were scraped.
 */
async function scrapeActors(argNames) {
  const actorNames = await getActorNames(argNames);
  const baseActors = toBaseActors(actorNames);

  logger.info(`Scraping profiles for ${actorNames.length} actors`);

  // `scrapers` already is the map of actor scrapers, keyed by slug
  const sources = argv.sources || config.profiles || Object.keys(scrapers);
  const entitySlugs = sources.flat();

  const [entities, existingActorEntries] = await Promise.all([
    knex('entities')
      .select(knex.raw('entities.*, row_to_json(parents) as parent'))
      .whereIn('entities.slug', entitySlugs)
      .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
      .orderBy('entities.type'),
    knex('actors')
      .select(['id', 'name', 'slug', 'entry_id'])
      .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
      .whereNull('alias_for'),
  ]);

  const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});

  const existingActorEntriesBySlugAndEntryId = existingActorEntries.reduce((acc, actorEntry) => ({
    ...acc,
    [actorEntry.slug]: {
      ...acc[actorEntry.slug],
      // rows carry the column name `entry_id`; base actors use `entryId`
      [actorEntry.entry_id || null]: actorEntry,
    },
  }), {});

  const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlugAndEntryId[baseActor.slug]?.[baseActor.entryId]);

  // only open a batch when there is something to insert
  const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
  const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);

  // TODO: associate entity when entry ID is provided
  const newActorEntries = batchId && await knex('actors')
    .insert(curatedActorEntries)
    .returning(['id', 'name', 'slug', 'entry_id']);

  const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

  const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
  const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
    ...acc,
    [profile.actor_id]: {
      ...acc[profile.actor_id],
      [profile.entity_id]: profile,
    },
  }), {});

  const profilesPerActor = await Promise.map(
    actors,
    async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
    { concurrency: 10 },
  );

  const profiles = profilesPerActor.flat().filter(Boolean);

  logger.info(`Scraped ${profiles.length} profiles`);

  if (argv.inspect) {
    console.log(profiles);
  }

  if (argv.save) {
    const profilesWithAvatarIds = await associateAvatars(profiles);

    await upsertProfiles(profilesWithAvatarIds);
    await interpolateProfiles(actors);
  }

  return profiles;
}
2020-05-13 00:56:20 +00:00
/**
 * Look up the actor entries for a list of base actors and insert the ones that are missing.
 *
 * An existing actor matches a base actor when the slugs are equal and the actor either
 * has no entity or has the base actor's entity.
 *
 * @param {Object[]} baseActors - Base actors with `slug` and `entity` (which may be null).
 * @param {number} batchId - ID of the batch newly created actors are assigned to.
 * @returns {Promise<Object[]>} Newly inserted actors followed by the existing ones, each
 *   with `id`, `alias_for`, `name`, `slug` and `entity_id`.
 */
async function getOrCreateActors(baseActors, batchId) {
  // base actors can come without entity; those can only match entity-less actors
  const slugEntityPairs = baseActors
    .filter(baseActor => baseActor.entity)
    .map(baseActor => [baseActor.slug, baseActor.entity.id]);

  const existingActors = await knex('actors')
    .select('id', 'alias_for', 'name', 'slug', 'entity_id')
    .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
    .whereNull('entity_id')
    .modify((queryBuilder) => {
      if (slugEntityPairs.length > 0) {
        queryBuilder.orWhereIn(['slug', 'entity_id'], slugEntityPairs);
      }
    });

  // existing slugs grouped by entity ID; entity-less actors end up under the key 'null'
  const existingActorSlugs = existingActors.reduce((acc, actor) => ({
    ...acc,
    [actor.entity_id]: {
      ...acc[actor.entity_id],
      [actor.slug]: true,
    },
  }), {});

  const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity?.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);

  const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
  const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);

  if (Array.isArray(newActors)) {
    return newActors.concat(existingActors);
  }

  return existingActors;
}
2020-05-13 00:56:20 +00:00
/**
 * Link releases to their actors, creating actor entries where needed.
 *
 * @param {Object[]} releases - Stored releases with an `id` and optionally `actors`.
 * @param {number} batchId - ID of the batch newly created actors are assigned to.
 * @returns {Promise<Object[]|null>} The actor entries involved, or null when none of
 *   the releases lists any actors.
 */
async function associateActors(releases, batchId) {
  const baseActorsByReleaseId = {};

  for (const release of releases) {
    if (release.actors) {
      baseActorsByReleaseId[release.id] = toBaseActors(release.actors, release);
    }
  }

  const baseActors = Object.values(baseActorsByReleaseId).flat();

  if (baseActors.length === 0) {
    return null;
  }

  // one base actor per slug; the last occurrence wins
  const baseActorsBySlug = Object.fromEntries(baseActors.map(baseActor => [baseActor.slug, baseActor]));
  const actors = await getOrCreateActors(Object.values(baseActorsBySlug), batchId);

  // releases of an alias are linked to the actor the alias points to
  const actorIdsBySlug = Object.fromEntries(actors.map(actor => [actor.slug, actor.alias_for || actor.id]));

  const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
    .flatMap(([releaseId, releaseActors]) => releaseActors.map(releaseActor => ({
      release_id: releaseId,
      actor_id: actorIdsBySlug[releaseActor.slug],
    })));

  await knex.raw(`${knex('releases_actors').insert(releaseActorAssociations).toString()} ON CONFLICT DO NOTHING;`);

  return actors;
}
2020-05-19 23:11:32 +00:00
/**
 * Fetch one actor with alias, entity, countries, avatar and profiles.
 *
 * @param {number|string} actorId - Actor ID; a value that is not numeric is treated as a slug.
 * @returns {Promise<Object|null>} The curated actor with details, or null when not found.
 */
async function fetchActor(actorId) {
  const identifierColumn = Number.isNaN(Number(actorId)) ? 'actors.slug' : 'actors.id';

  const actor = await knex('actors')
    .select(knex.raw(`
      actors.*,
      row_to_json(entities) as entity,
      row_to_json(actor_alias) as alias,
      row_to_json(birth_country) as birth_country,
      row_to_json(residence_country) as residence_country,
      row_to_json(media) as avatar,
      json_agg(actors_profiles) as profiles
    `))
    .where(identifierColumn, actorId)
    .leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
    .leftJoin('actors_profiles', 'actors.id', 'actors_profiles.actor_id')
    .leftJoin('entities', 'entities.id', 'actors.entity_id')
    .leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
    .leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
    .leftJoin('media', 'media.id', 'actors.avatar_media_id')
    // profiles are aggregated, so every other selected row has to be grouped
    .groupBy('actors.id', 'entities.id', 'actor_alias.id', 'birth_country.alpha2', 'residence_country.alpha2', 'media.id')
    .first();

  return curateActor(actor, true);
}
/**
 * Search actors through the `search_actors` database function.
 *
 * @param {string} query - Search query, passed to the database as a bound parameter.
 * @returns {Promise<Object[]>} At most 10 curated actors, without details.
 */
async function searchActors(query) {
  const searchResults = knex.raw('search_actors(?) as actors', [query]);

  const actors = await knex
    .select('*')
    .from(searchResults)
    .limit(10);

  return actors.map(actor => curateActor(actor));
}
2019-11-10 03:20:22 +00:00
module . exports = {
2020-05-14 02:26:05 +00:00
associateActors ,
2020-05-19 23:11:32 +00:00
fetchActor ,
2020-05-14 02:26:05 +00:00
scrapeActors ,
2020-05-19 23:38:58 +00:00
searchActors ,
2019-11-10 03:20:22 +00:00
} ;