2019-11-10 03:20:22 +00:00
'use strict' ;
2020-05-14 02:26:05 +00:00
const config = require ( 'config' ) ;
2020-12-02 20:26:55 +00:00
const util = require ( 'util' ) ;
2020-05-14 02:26:05 +00:00
const Promise = require ( 'bluebird' ) ;
2020-05-17 01:00:44 +00:00
const moment = require ( 'moment' ) ;
2020-05-19 02:46:49 +00:00
const blake2 = require ( 'blake2' ) ;
const DOMPurify = require ( 'dompurify' ) ;
const { JSDOM } = require ( 'jsdom' ) ;
2020-12-30 01:23:43 +00:00
const omit = require ( 'object.omit' ) ;
2021-02-05 03:23:13 +00:00
const inquirer = require ( 'inquirer' ) ;
2020-05-19 02:46:49 +00:00
const { window } = new JSDOM ( '' ) ;
const domPurify = DOMPurify ( window ) ;
2020-05-14 02:26:05 +00:00
2020-05-13 00:56:20 +00:00
// const logger = require('./logger')(__filename);
2020-03-26 02:32:07 +00:00
const knex = require ( './knex' ) ;
2020-05-15 02:40:59 +00:00
const scrapers = require ( './scrapers/scrapers' ) . actors ;
2020-05-14 02:26:05 +00:00
const argv = require ( './argv' ) ;
2020-05-15 02:40:59 +00:00
const include = require ( './utils/argv-include' ) ( argv ) ;
2020-08-14 21:21:53 +00:00
const bulkInsert = require ( './utils/bulk-insert' ) ;
2022-03-04 22:31:59 +00:00
const chunk = require ( './utils/chunk' ) ;
2020-05-15 02:40:59 +00:00
const logger = require ( './logger' ) ( _ _filename ) ;
2020-05-17 01:00:44 +00:00
const { toBaseReleases } = require ( './deep' ) ;
2020-12-30 01:23:43 +00:00
const { associateAvatars , flushOrphanedMedia } = require ( './media' ) ;
2021-02-27 20:59:33 +00:00
const { fetchEntitiesBySlug } = require ( './entities' ) ;
2020-12-30 02:19:09 +00:00
const { deleteScenes } = require ( './releases' ) ;
2020-05-17 01:00:44 +00:00
2020-01-07 03:23:28 +00:00
const slugify = require ( './utils/slugify' ) ;
2020-03-26 02:32:07 +00:00
const capitalize = require ( './utils/capitalize' ) ;
2020-05-15 02:40:59 +00:00
const resolvePlace = require ( './utils/resolve-place' ) ;
2021-02-10 02:23:48 +00:00
const { resolveLayoutScraper } = require ( './scrapers/resolve' ) ;
const getRecursiveParameters = require ( './utils/get-recursive-parameters' ) ;
2020-05-15 02:40:59 +00:00
2020-05-18 23:10:32 +00:00
/**
 * Lookup from hair colour terms as supplied by scrapers to the value stored on a profile.
 * Keys are matched after lower-casing and stripping the word 'hair' (see curateProfile).
 * Includes a handful of Dutch terms (bruin, rood, zwart).
 */
const hairColors = {
	// hyphenated and compound spellings
	'jet-black': 'black',
	'red-head': 'red',
	'soft-black': 'black',
	'brunette/raven': 'brown',
	// single-word terms
	black: 'black',
	blond: 'blond',
	blonde: 'blonde',
	blondie: 'blonde',
	brown: 'brown',
	bruin: 'brown',
	brunette: 'brown',
	fair: 'blonde',
	grey: 'gray',
	gray: 'gray',
	raven: 'black',
	red: 'red',
	redhead: 'red',
	'red head': 'red',
	rood: 'red',
	// dyed colours
	blue: 'blue',
	green: 'green',
	purple: 'purple',
	pink: 'pink',
	zwart: 'black',
};
/**
 * Lookup from eye colour terms as supplied by scrapers (trimmed and lower-cased by
 * curateProfile) to the value stored on a profile. Includes Dutch terms.
 */
const eyeColors = {
	blauw: 'blue',
	blue: 'blue',
	brown: 'brown',
	bruin: 'brown', // Dutch for brown; previously mapped to the untranslated 'bruin'
	dark: 'brown',
	gray: 'gray',
	green: 'green',
	groen: 'green',
	grey: 'gray',
	hazel: 'hazel',
};
2023-07-01 19:46:44 +00:00
/**
 * Lookup from sexual orientation terms as supplied by scrapers to the stored value.
 * Covers English and Dutch spellings; all keys are lower-case.
 */
const orientations = {
	// bisexual
	bi: 'bisexual',
	biseksueel: 'bisexual',
	bisexual: 'bisexual',
	// gay
	gay: 'gay',
	// straight
	hetero: 'straight',
	heteroseksueel: 'straight',
	heterosexual: 'straight',
	// gay, formal spellings
	homoseksueel: 'gay',
	homosexual: 'gay',
	straight: 'straight',
};
2020-05-18 23:10:32 +00:00
/**
 * Lookup from ethnicity terms as supplied by scrapers (trimmed and lower-cased by
 * curateProfile) to the value stored on a profile.
 */
const ethnicities = {
	'african american': 'black',
	'african-american': 'black',
	'native american': 'native american',
	african: 'black',
	arabic: 'arabic', // the correctly spelled term was missing, so 'Arabic' was reported as unrecognized
	aravic: 'arabic', // misspelling kept for backward compatibility
	asian: 'asian',
	black: 'black',
	caucasian: 'white',
	european: 'white',
	hispanic: 'latin',
	indian: 'indian',
	japanese: 'japanese',
	latin: 'latin',
	latina: 'latina',
	latino: 'latino',
	white: 'white',
};
2023-07-25 01:03:41 +00:00
/**
 * Identity lookup of accepted blood types: each ABO group on its own and with a
 * '+' or '-' suffix. Used to validate scraped values after trimming and upper-casing.
 */
const bloodTypes = Object.fromEntries(
	['A', 'B', 'AB', 'O']
		.flatMap((group) => [group, `${group}+`, `${group}-`])
		.map((type) => [type, type]),
);
2020-12-29 22:44:38 +00:00
/**
 * Interpret a scraped value as a boolean.
 *
 * @param {*} value - A boolean, or a string containing 'yes' or 'no' (case-insensitive, anywhere in the string).
 * @returns {boolean|null} The boolean, or null when the value cannot be interpreted.
 */
function getBoolean(value) {
	if (typeof value === 'boolean') {
		return value;
	}

	if (typeof value === 'string') {
		// 'yes' is tested first, so a string containing both resolves to true
		if (/yes/i.test(value)) {
			return true;
		}

		// previously returned true, making every recognised string truthy
		if (/no/i.test(value)) {
			return false;
		}
	}

	return null;
}
2020-05-17 01:00:44 +00:00
/**
 * Pick the value that occurs most often in a list, comparing values by their slug.
 * null and undefined entries are ignored. On a tie the value that reached the
 * highest count first wins.
 *
 * @param {Array} items - Values to tally.
 * @returns {*} The most frequent item as it appeared in the list, or null if there were no usable items.
 */
function getMostFrequent(items) {
	const { mostFrequent } = items.reduce((acc, item) => {
		if (item === undefined || item === null) {
			return acc;
		}

		const slug = slugify(item);

		acc.counts[slug] = (acc.counts[slug] || 0) + 1;

		// compare against the tracked count rather than the truthiness of the current leader,
		// otherwise falsy leaders such as 0 (January) or false are replaced by any later value
		if (acc.counts[slug] > acc.highestCount) {
			acc.highestCount = acc.counts[slug];
			acc.mostFrequent = item;
		}

		return acc;
	}, {
		counts: {},
		highestCount: 0,
		mostFrequent: null,
	});

	return mostFrequent;
}
/**
 * Derive a date from the most frequent year, month and day of month across a list of dates.
 * Each component is voted on independently.
 *
 * @param {Date[]} dates - Dates to combine.
 * @returns {Date|null} The combined date, or null when a component is missing or the
 * components do not form an existing calendar date.
 */
function getMostFrequentDate(dates) {
	const year = getMostFrequent(dates.map((dateX) => dateX.getFullYear()));
	const month = getMostFrequent(dates.map((dateX) => dateX.getMonth()));
	const date = getMostFrequent(dates.map((dateX) => dateX.getDate()));

	if (year === null || month === null || date === null) {
		return null;
	}

	const combined = moment({ year, month, date });

	// independently voted components may not exist together (e.g. 30 February); avoid returning an Invalid Date
	return combined.isValid() ? combined.toDate() : null;
}
2020-11-29 02:59:47 +00:00
/**
 * Find the largest item in a list using the `>` operator.
 *
 * @param {Array} items - Values to compare.
 * @returns {*} The largest item, or null when no item compares greater than null.
 */
function getHighest(items) {
	let highest = null;

	for (const candidate of items) {
		if (candidate > highest) {
			highest = candidate;
		}
	}

	return highest;
}
2020-05-17 01:00:44 +00:00
/**
 * Return the item with the greatest `length`; on a tie the earliest item wins.
 *
 * @param {Array<string|Array>} items - Items exposing a `length` property.
 * @returns {*} The longest item, or null when the list is empty or the longest item is falsy.
 */
function getLongest(items) {
	// sort a copy; sorting in place reordered the caller's array
	return [...items].sort((itemA, itemB) => itemB.length - itemA.length)[0] || null;
}
/**
 * Compute the rounded mean of a list of numbers.
 *
 * @param {number[]} items - Values to average.
 * @returns {number|null} The rounded mean, or null when the list is empty or the mean rounds to 0.
 */
function getAverage(items) {
	let total = 0;

	for (const value of items) {
		total += value;
	}

	const mean = Math.round(total / items.length);

	return mean || null;
}
2019-11-10 03:20:22 +00:00
2020-03-26 02:32:07 +00:00
/**
 * Normalize scraped actors, given as plain names or as objects with a `name`, to base actors.
 * A name may carry an entry ID after a colon ('Jane Doe:123').
 *
 * @param {Array<string|Object>} actorsOrNames - Actor names or actor objects; entries without a name are dropped.
 * @param {Object} [release] - Release the actors belong to; its entity scopes the actors.
 * @returns {Object[]} Base actors with name, slug, entryId, suppliedEntryId, entity and hasProfile,
 * merged over the original object when one was supplied.
 */
function toBaseActors(actorsOrNames, release) {
	if (!actorsOrNames) {
		return [];
	}

	return actorsOrNames
		.filter((actorOrName) => actorOrName && (typeof actorOrName === 'string' || actorOrName.name))
		.map((actorOrName) => {
			const [baseName, entryId] = (actorOrName.name || actorOrName).split(':');

			const name = capitalize(baseName);
			const slug = slugify(name);

			// using top level parent widens the scope too much, e.g. different Gamma sites may not use the same actor database
			// const entity = getRecursiveParent(release?.entity);
			// NOTE(review): the flag was previously read as the misspelled 'indepdendent', so this branch
			// never applied and entities flagged independent fell through to their parent
			const entity = (release?.entity?.independent && release?.entity)
				|| release?.entity?.parent
				|| release?.entity
				|| null;

			const baseActor = {
				name,
				slug,
				// an entry ID is only meaningful within the scope of an entity
				entryId: (entity && (entryId || actorOrName.entryId)) || null,
				suppliedEntryId: entryId,
				entity,
				hasProfile: !!actorOrName.name, // actor contains profile information
			};

			if (actorOrName.name) {
				return {
					...actorOrName,
					...baseActor,
				};
			}

			return baseActor;
		});
}
2021-02-16 02:37:52 +00:00
/**
 * Decide whether an actor name is likely to collide with another actor's name.
 * A name is considered collision-prone when it consists of exactly one word (`\w+` run).
 *
 * @param {{ name: string }} actor - Actor with a name.
 * @returns {boolean} True for single-word names.
 */
function getCollisionLikely(actor) {
	// match() yields null when the name has no \w characters (e.g. non-Latin scripts); treat that as no words instead of throwing
	return actor.name.match(/\w+/g)?.length === 1;
}
2020-07-17 01:39:13 +00:00
/**
 * Convert an actor database row (snake_case) to the camelCase shape used by the application.
 *
 * @param {Object} actor - Actor row, optionally with joined alias, entity, countries, avatar and profiles.
 * @param {boolean} [withDetails=false] - Include physical details, places, avatar and nested profiles.
 * @param {boolean} [isProfile=false] - Include the description; only applies when withDetails is set.
 * @returns {Object|null} The curated actor, or null when no actor was given.
 */
function curateActor(actor, withDetails = false, isProfile = false) {
	if (!actor) {
		return null;
	}

	const curatedActor = {
		id: actor.id,
		name: actor.name,
		slug: actor.slug,
		url: actor.url,
		gender: actor.gender,
		orientation: actor.orientation,
		entityId: actor.entity_id,
		aliasFor: actor.alias_for,
		dateOfBirth: actor.date_of_birth,
		age: actor.age,
		birthCountry: actor.birth_country_alpha2,
		...(withDetails && {
			alias: actor.alias && {
				id: actor.alias.id,
				name: actor.alias.name,
				slug: actor.alias.slug, // previously copied the actor's own slug instead of the alias'
				gender: actor.alias.gender,
			},
			entity: actor.entity && {
				id: actor.entity.id,
				name: actor.entity.name,
				slug: actor.entity.slug,
			},
			dateOfDeath: actor.date_of_death,
			cup: actor.cup,
			bust: actor.bust,
			waist: actor.waist,
			hip: actor.hip,
			naturalBoobs: actor.natural_boobs,
			penisLength: actor.penis_length,
			penisGirth: actor.penis_girth,
			circumcised: actor.circumcised,
			height: actor.height,
			weight: actor.weight,
			shoeSize: actor.shoe_size,
			eyes: actor.eyes,
			hairColor: actor.hair_color,
			hairType: actor.hair_type,
			hasTattoos: actor.has_tattoos,
			hasPiercings: actor.has_piercings,
			tattoos: actor.tattoos,
			piercings: actor.piercings,
			bloodType: actor.blood_type,
			...(isProfile && { description: actor.description }),
			placeOfBirth: actor.birth_country && {
				country: {
					alpha2: actor.birth_country.alpha2,
					name: actor.birth_country.name,
					alias: actor.birth_country.alias,
				},
				state: actor.birth_state,
				city: actor.birth_city,
			},
			placeOfResidence: actor.residence_country && {
				country: {
					alpha2: actor.residence_country.alpha2,
					name: actor.residence_country.name,
					alias: actor.residence_country.alias,
				},
				state: actor.residence_state,
				city: actor.residence_city,
			},
			avatar: actor.avatar && {
				id: actor.avatar.id,
				path: actor.avatar.path,
				width: actor.avatar.width,
				height: actor.avatar.height,
				size: actor.avatar.size,
				source: actor.avatar.source,
			},
			// profiles are curated recursively as detailed profiles, so they include their description
			...(actor.profiles && { profiles: actor.profiles.map((profile) => curateActor(profile, true, true)) }),
		}),
	};

	return curatedActor;
}
2020-05-13 00:56:20 +00:00
/**
 * Build the database entry for a base actor.
 * The entity and entry ID are only stored when the name is collision-prone;
 * otherwise both are null.
 *
 * @param {Object} baseActor - Base actor with name, slug, entity and entryId.
 * @param {number} batchId - Batch the entry is stored under.
 * @returns {Object} Entry with name, slug, entity_id, entry_id and batch_id.
 */
function curateActorEntry(baseActor, batchId) {
	const { name, slug } = baseActor;
	const isScoped = getCollisionLikely(baseActor);

	const entityId = isScoped ? baseActor.entity?.id : null;
	const entryId = isScoped ? baseActor.entryId : null;

	return {
		name,
		slug,
		entity_id: entityId || null,
		entry_id: entryId || null,
		batch_id: batchId,
	};
}
2020-05-13 00:56:20 +00:00
/**
 * Build database entries for a list of base actors, all under the same batch.
 *
 * @param {Object[]} baseActors - Base actors to convert.
 * @param {number} batchId - Batch the entries are stored under.
 * @returns {Object[]} One entry per base actor, in the same order.
 */
function curateActorEntries(baseActors, batchId) {
	const toEntry = (baseActor) => curateActorEntry(baseActor, batchId);

	return baseActors.map(toEntry);
}
2020-05-15 02:40:59 +00:00
/**
 * Convert a curated profile (camelCase) to an `actors_profiles` row (snake_case).
 *
 * @param {Object} profile - Curated profile; `id` is the actor ID, and `update` holds the ID
 * of the existing profile row, or false when there is none.
 * @returns {Object|null} The row, or null when the profile has no actor ID.
 */
function curateProfileEntry(profile) {
	if (!profile.id) {
		return null;
	}

	const { placeOfBirth: birth, placeOfResidence: residence } = profile;

	// the row ID is included unless update is explicitly false
	const rowId = profile.update === false ? {} : { id: profile.update };

	return {
		...rowId,
		actor_id: profile.id,
		entity_id: profile.entity?.id || null,
		date_of_birth: profile.dateOfBirth,
		date_of_death: profile.dateOfDeath,
		age: profile.age,
		url: profile.url,
		gender: profile.gender,
		orientation: profile.orientation,
		ethnicity: profile.ethnicity,
		description: profile.description,
		description_hash: profile.descriptionHash,
		birth_city: birth?.city || null,
		birth_state: birth?.state || null,
		birth_country_alpha2: birth?.country || null,
		residence_city: residence?.city || null,
		residence_state: residence?.state || null,
		residence_country_alpha2: residence?.country || null,
		cup: profile.cup,
		bust: profile.bust,
		waist: profile.waist,
		hip: profile.hip,
		penis_length: profile.penisLength,
		penis_girth: profile.penisGirth,
		circumcised: profile.circumcised,
		natural_boobs: profile.naturalBoobs,
		height: profile.height,
		weight: profile.weight,
		shoe_size: profile.shoeSize,
		hair_color: profile.hairColor,
		hair_type: profile.hairType,
		eyes: profile.eyes,
		has_tattoos: profile.hasTattoos,
		has_piercings: profile.hasPiercings,
		piercings: profile.piercings,
		tattoos: profile.tattoos,
		blood_type: profile.bloodType,
		avatar_media_id: profile.avatarMediaId || null,
	};
}
2020-12-15 23:50:58 +00:00
/**
 * Normalize a scraped actor profile: sanitize the description, map free-text attributes
 * to known values, parse measurements, resolve places and attach base scenes.
 *
 * @param {Object} profile - Raw profile as returned by a profile scraper.
 * @param {Object} [actor] - Actor the profile belongs to; added to scenes that do not list them.
 * @returns {Promise<Object|null>} The curated profile, or null when no profile was given or curation failed.
 */
async function curateProfile(profile, actor) {
	if (!profile) {
		return null;
	}

	// numeric value as-is, otherwise the first run of digits from a string such as '170 cm'
	const toNumeric = (value) => Number(value) || value?.match?.(/\d+/)?.[0] || null;

	try {
		// the entity may be missing; reading its name must not throw while reporting another problem
		const entityName = profile.entity?.name;

		const curatedProfile = {
			id: profile.id,
			name: profile.name,
			url: profile.url,
			avatar: profile.avatar,
			scraper: profile.scraper,
			entity: profile.entity,
			update: profile.update,
		};

		// collapse whitespace and strip all markup
		curatedProfile.description = domPurify.sanitize(profile.description?.replace(/\s+/g, ' '), { ALLOWED_TAGS: [] }).trim() || null;

		const hasher = curatedProfile.description && blake2
			.createHash('blake2b', { digestLength: 24 })
			.update(Buffer.from(slugify(curatedProfile.description)));

		curatedProfile.descriptionHash = curatedProfile.description && hasher.digest('hex');

		curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available

		curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null;
		curatedProfile.hairType = profile.hairType?.trim() || null;
		curatedProfile.hairColor = hairColors[(profile.hairColor || profile.hair)?.toLowerCase().replace('hair', '').trim()] || null;
		curatedProfile.eyes = eyeColors[profile.eyes?.trim().toLowerCase()] || null;

		curatedProfile.tattoos = profile.tattoos?.trim() || null;
		curatedProfile.piercings = profile.piercings?.trim() || null;

		// 'female' must be tested before 'male', which it contains
		curatedProfile.gender = (/female/i.test(profile.gender) && 'female')
			|| (/shemale|trans/i.test(profile.gender) && 'transsexual')
			|| (/male/i.test(profile.gender) && 'male')
			|| null;

		// lower-cased like the other lookups; the table only has lower-case keys, so 'Straight' was dropped before
		curatedProfile.orientation = orientations[profile.orientation?.trim().toLowerCase()] || null;

		const dateOfBirth = profile.dateOfBirth || profile.birthdate;

		curatedProfile.dateOfBirth = (!Number.isNaN(Number(dateOfBirth)) // possibly valid date
			&& new Date() - dateOfBirth > 567648000000 // over 18 (18 * 365 days in ms)
			&& dateOfBirth)
			|| null;

		curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;
		curatedProfile.age = Number(profile.age) || null;

		curatedProfile.height = toNumeric(profile.height);
		curatedProfile.weight = toNumeric(profile.weight);
		curatedProfile.shoeSize = toNumeric(profile.shoeSize);

		// separate measurement values
		curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null;
		curatedProfile.bust = toNumeric(profile.bust);
		curatedProfile.waist = toNumeric(profile.waist);
		curatedProfile.hip = toNumeric(profile.hip);

		// combined measurement value; match is called optionally so a non-string value is ignored rather than failing the profile
		const measurements = profile.measurements?.match?.(/(\d+)(\w+)\s*[-x]\s*(\d+)\s*[-x]\s*(\d+)/); // ExCoGi uses x, Jules Jordan has spaces between the dashes

		if (measurements) {
			curatedProfile.bust = Number(measurements[1]);
			curatedProfile.cup = measurements[2];
			curatedProfile.waist = Number(measurements[3]);
			curatedProfile.hip = Number(measurements[4]);
		}

		curatedProfile.penisLength = toNumeric(profile.penisLength);
		curatedProfile.penisGirth = toNumeric(profile.penisGirth);

		curatedProfile.circumcised = getBoolean(profile.circumcised);
		curatedProfile.naturalBoobs = getBoolean(profile.naturalBoobs);
		curatedProfile.hasTattoos = getBoolean(profile.hasTattoos);
		curatedProfile.hasPiercings = getBoolean(profile.hasPiercings);

		curatedProfile.bloodType = bloodTypes[profile.bloodType?.trim().toUpperCase()] || null;

		if (argv.resolvePlace) {
			const [placeOfBirth, placeOfResidence] = await Promise.all([
				resolvePlace(profile.birthPlace),
				resolvePlace(profile.residencePlace),
			]);

			curatedProfile.placeOfBirth = placeOfBirth;
			curatedProfile.placeOfResidence = placeOfResidence;
		}

		// fall back to the nationality to find a country of birth
		if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
			const country = await knex('countries')
				.where('nationality', 'ilike', `%${curatedProfile.nationality}%`)
				.orWhere('alpha3', 'ilike', `%${curatedProfile.nationality}%`)
				.orWhere('alpha2', 'ilike', `%${curatedProfile.nationality}%`)
				.orderBy('priority', 'desc')
				.first();

			if (country) {
				curatedProfile.placeOfBirth = {
					country: country.alpha2,
				};
			}
		}

		// keep only origin and path of each social link, dropping query and hash
		curatedProfile.social = [].concat(profile.social).map((social) => {
			if (!social) {
				return null;
			}

			try {
				const { origin, pathname } = new URL(social);

				return `${origin}${pathname}`;
			} catch (error) {
				logger.warn(`Profile scraper for '${entityName}' returned invalid social link: ${social}`);
				return null;
			}
		}).filter(Boolean);

		curatedProfile.scenes = toBaseReleases(profile.scenes || profile.releases, profile.entity, actor)
			// attach actor to base scene, in case it was not scraped
			.map((scene) => {
				if (actor && !scene.actors?.find((sceneActor) => slugify(sceneActor) === actor.slug || slugify(sceneActor.name) === actor.slug)) {
					return {
						...scene,
						actors: [actor, ...(scene.actors || [])],
					};
				}

				return scene;
			});

		if (profile.ethnicity && !curatedProfile.ethnicity) {
			logger.warn(`Unrecognized ethnicity returned by '${entityName}' scraper: ${profile.ethnicity}`);
		}

		if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) {
			logger.warn(`Unrecognized hair color returned by '${entityName}' scraper: ${profile.hairColor || profile.hair}`);
		}

		if (profile.eyes && !curatedProfile.eyes) {
			logger.warn(`Unrecognized eye color returned by '${entityName}' scraper: ${profile.eyes}`);
		}

		return curatedProfile;
	} catch (error) {
		logger.error(`Failed to curate '${profile.name}': ${error.message}`);

		return null;
	}
}
2020-12-30 01:23:43 +00:00
/**
 * Fetch actor profiles joined with their actor and avatar media as JSON columns.
 *
 * @param {Array<number|string>} [actorIdsOrNames] - Actor IDs (numbers) and/or names (strings) to
 * restrict to; names only match actors without an entity. All profiles are fetched when omitted.
 * @returns {Promise<Object[]>} Profile rows with additional `actor` and `avatar` columns.
 */
async function fetchProfiles(actorIdsOrNames) {
	const restrictToActors = (query) => {
		if (!actorIdsOrNames) {
			return;
		}

		const actorIds = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number');
		const actorNames = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string');

		query
			.whereIn('actor_id', actorIds)
			.orWhere((builder) => {
				builder
					.whereIn('actors.name', actorNames)
					.whereNull('actors.entity_id');
			});
	};

	return knex('actors_profiles')
		.select(knex.raw('actors_profiles.*, row_to_json(actors) as actor, row_to_json(media) as avatar'))
		.leftJoin('actors', 'actors.id', 'actors_profiles.actor_id')
		.modify(restrictToActors)
		.leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id');
}
/**
 * Merge the profiles of each actor into a single set of values and store it on the actor.
 * Values are weighed by profile priority. Existing interpolated values of the targeted
 * actors are cleared first; clearing and updating share one transaction.
 *
 * @param {Array<number|string>} [actorIdsOrNames] - Actor IDs and/or names to interpolate;
 * every actor is processed (and cleared) when omitted.
 * @returns {Promise<void>} Resolves once the transaction was committed or rolled back.
 * Query failures are logged and rolled back rather than thrown.
 */
async function interpolateProfiles(actorIdsOrNames) {
	const profiles = await fetchProfiles(actorIdsOrNames);

	// group in place; copying the accumulator on every step was quadratic
	const profilesByActorId = profiles.reduce((acc, profile) => {
		if (!acc[profile.actor_id]) {
			acc[profile.actor_id] = [];
		}

		acc[profile.actor_id].push(profile);

		return acc;
	}, {});

	logger.info(`Interpolating ${profiles.length} profiles from ${Object.keys(profilesByActorId).length} actors`);

	const interpolatedProfiles = Object.entries(profilesByActorId).map(([actorId, actorProfiles]) => {
		// group values from each profile
		const valuesByProperty = actorProfiles.reduce((acc, profile) => Object
			.entries(profile)
			.reduce((profileAcc, [property, value]) => ({
				...profileAcc,
				[property]: [
					...(acc[property] || []),
					...(value === null ? [] : Array.from({ length: profile.priority }, () => value)), // multiply by priority, increasing the odds of being the most frequent value
				],
			}), {
				// bundle location values so they can be assessed together, to ensure the most frequent city is in the most frequent state is in most frequent country
				origin: [...acc.origin || [], {
					...(profile.birth_country_alpha2 && { country: profile.birth_country_alpha2 }),
					...(profile.birth_state && { state: profile.birth_state }),
					...(profile.birth_city && { city: profile.birth_city }),
				}].filter((location) => Object.keys(location).length > 0),
				residence: [...acc.residence || [], {
					...(profile.residence_country_alpha2 && { country: profile.residence_country_alpha2 }),
					...(profile.residence_state && { state: profile.residence_state }),
					...(profile.residence_city && { city: profile.residence_city }),
				}].filter((location) => Object.keys(location).length > 0),
			}), {});

		const mostFrequentValues = [
			'gender',
			'orientation',
			'ethnicity',
			'cup',
			'bust',
			'waist',
			'hip',
			'shoe_size',
			'penis_length',
			'penis_girth',
			'circumcised',
			'hair_color',
			'eyes',
			'has_tattoos',
			'has_piercings',
			'blood_type',
		].reduce((acc, property) => ({
			...acc,
			[property]: getMostFrequent(valuesByProperty[property]),
		}), {});

		const profile = {
			id: actorId,
			...mostFrequentValues,
		};

		profile.height = getMostFrequent(valuesByProperty.height.filter((height) => height > 50 && height < 300)); // remove unlikely values

		profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth);
		profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death);
		profile.age = getHighest(valuesByProperty.age);

		profile.natural_boobs = profile.gender === 'male' ? null : getMostFrequent(valuesByProperty.natural_boobs);

		// ensure most frequent country, city and state match up
		profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.origin.map((location) => location.country));
		const remainingOriginCountries = valuesByProperty.origin.filter((location) => location.country === profile.birth_country_alpha2);

		profile.birth_state = getMostFrequent(remainingOriginCountries.map((location) => location.state));
		const remainingOriginStates = remainingOriginCountries.filter((location) => !profile.birth_state || location.state === profile.birth_state);

		profile.birth_city = getMostFrequent(remainingOriginStates.map((location) => location.city));

		profile.residence_country_alpha2 = getMostFrequent(valuesByProperty.residence.map((location) => location.country));
		const remainingResidenceCountries = valuesByProperty.residence.filter((location) => location.country === profile.residence_country_alpha2);

		profile.residence_state = getMostFrequent(remainingResidenceCountries.map((location) => location.state));
		const remainingResidenceStates = remainingResidenceCountries.filter((location) => !profile.residence_state || location.state === profile.residence_state);

		profile.residence_city = getMostFrequent(remainingResidenceStates.map((location) => location.city));

		profile.weight = getAverage(valuesByProperty.weight);

		profile.tattoos = getLongest(valuesByProperty.tattoos);
		profile.piercings = getLongest(valuesByProperty.piercings);

		// prefer the tallest avatar with unknown or sufficient entropy
		profile.avatar_media_id = actorProfiles
			.map((actorProfile) => actorProfile.avatar)
			.filter((avatar) => avatar && (avatar.entropy === null || avatar.entropy > 5.5))
			.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;

		if (!profile.avatar_media_id) {
			// try to settle for low quality avatar
			profile.avatar_media_id = actorProfiles
				.map((actorProfile) => actorProfile.avatar)
				.filter((avatar) => avatar)
				.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;
		}

		return profile;
	});

	// every interpolated column set to null, derived from the columns of a profile entry
	const emptyProfile = Object.fromEntries(Object
		.keys(omit(curateProfileEntry({ id: 1 }), ['id', 'actor_id', 'entity_id', 'url', 'description_hash']))
		.map((key) => [key, null]));

	const transaction = await knex.transaction();

	// a single try covers both steps, so a failure while clearing no longer leaves the transaction open
	try {
		// clear existing interpolated data
		await knex('actors')
			.modify((modifyBuilder) => {
				if (actorIdsOrNames) {
					modifyBuilder
						.whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number'))
						.orWhere((whereBuilder) => {
							whereBuilder
								.whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string'))
								.whereNull('entity_id');
						});
				}
			})
			.update(emptyProfile)
			.transacting(transaction);

		// insert new interpolated data
		await Promise.all(interpolatedProfiles.map((profile) => knex('actors')
			.where('id', profile.id)
			.update(profile)
			.transacting(transaction)));

		await transaction.commit();
	} catch (error) {
		// failures were previously rolled back without any trace
		logger.error(`Failed to interpolate profiles, rolling back: ${error.message}`);

		await transaction.rollback(error);
	}
}
/**
 * Store curated profiles: profiles without an `update` ID are inserted, profiles with one
 * are updated in a single transaction, but only when the force argument is set.
 *
 * @param {Object[]} profiles - Curated profiles; profiles without an actor ID are skipped.
 * @returns {Promise<void>} Resolves when done. A failed update is logged and rolled back rather than thrown.
 */
async function upsertProfiles(profiles) {
	const newProfileEntries = profiles
		.filter((profile) => !profile.update)
		.map((profile) => curateProfileEntry(profile))
		.filter(Boolean);

	const updatingProfileEntries = profiles
		.filter((profile) => profile.update)
		.map((profile) => curateProfileEntry(profile))
		.filter(Boolean);

	if (newProfileEntries.length > 0) {
		await bulkInsert('actors_profiles', newProfileEntries);

		logger.info(`Saved ${newProfileEntries.length} actor profiles`);
	}

	if (argv.force && updatingProfileEntries.length > 0) {
		const transaction = await knex.transaction();

		try {
			await Promise.all(updatingProfileEntries.map((profileEntry) => knex('actors_profiles')
				.where('id', profileEntry.id)
				.update(profileEntry)
				.returning(['id', 'actor_id'])
				.transacting(transaction)));

			await transaction.commit();

			// only report success once committed; this used to be logged after a rollback as well
			logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`);
		} catch (error) {
			logger.error(`Failed to update ${updatingProfileEntries.length} actor profiles, rolling back: ${error.message}`);

			await transaction.rollback(error);
		}
	}
}
2020-06-25 00:26:25 +00:00
/**
 * Attempt to scrape one actor profile through a single scraper.
 *
 * @param {Object} actor - Actor entry; `id` and `name` are read here, and the whole object is merged into the scraper input and the result.
 * @param {string} scraperSlug - Key into both `scrapers` and `entitiesBySlug`.
 * @param {Object} entitiesBySlug - Entities keyed by slug.
 * @param {Object} existingProfilesByActorEntityId - Existing profiles keyed by actor ID, then entity ID.
 * @returns {Promise<Object|null>} The result of `curateProfile`, or null when a profile already exists and `argv.force` is not set.
 * @throws {Error} When no scraper or entity is available, or when the scraper returns nothing or a number (code `PROFILE_NOT_AVAILABLE`).
 */
async function scrapeProfileFromSource(actor, scraperSlug, entitiesBySlug, existingProfilesByActorEntityId) {
	try {
		const entity = entitiesBySlug[scraperSlug] || null;
		const scraper = scrapers[scraperSlug];
		const layoutScraper = resolveLayoutScraper(entity, scraper);

		if (!layoutScraper?.fetchProfile) {
			logger.warn(`No profile scraper available for ${scraperSlug}`);
			throw new Error(`No profile scraper available for ${scraperSlug}`);
		}

		const context = {
			...entity,
			// legacy
			site: entity,
			channel: entity,
			network: entity?.parent,
			entity,
			include,
			scraper: scraperSlug,
			parameters: getRecursiveParameters(entity),
		};

		const label = context.entity?.name;

		if (!context.entity) {
			logger.warn(`No entity found for ${scraperSlug}`);
			throw new Error(`No entity found for ${scraperSlug}`);
		}

		const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];

		if (existingProfile && !argv.force) {
			logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);

			return null;
		}

		logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);

		const profile = await layoutScraper.fetchProfile(curateActor({
			...existingProfile,
			...actor,
		}), context, include);

		if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
			logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
			throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
		}

		logger.verbose(`Found profile for '${actor.name}' on '${label}'`);

		// await inside the try so curation failures are logged by the catch below
		return await curateProfile({
			...actor,
			...profile,
			entity,
			update: existingProfile?.id || false,
		}, actor);
	} catch (error) {
		if (error.code !== 'PROFILE_NOT_AVAILABLE') {
			logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
		}

		// throw error to try next source
		throw error;
	}
}

/**
 * Scrape profiles for one actor from every applicable source.
 *
 * A source is either a scraper slug or a group (array) of slugs. Within a
 * group, scrapers are tried in order until one resolves; later ones are
 * skipped. When the actor has an entity, only sources equal to that entity's
 * slug are used.
 *
 * @param {Object} actor - Actor entry to scrape profiles for.
 * @param {Array<string|string[]>} sources - Scraper slugs or groups of slugs.
 * @param {Object} entitiesBySlug - Entities keyed by slug.
 * @param {Object} existingProfilesByActorEntityId - Existing profiles keyed by actor ID, then entity ID.
 * @returns {Promise<Object[]>} Successfully scraped profiles; failed or skipped sources are left out.
 */
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
	const validSources = actor.entity ? sources.filter((source) => source === actor.entity.slug) : sources;

	const profiles = await Promise.map(validSources, async (source) => {
		try {
			// config may group sources to try until success; the chain starts rejected so the first scraper always runs
			return await [].concat(source).reduce(
				(outcome, scraperSlug) => outcome.catch(() => scrapeProfileFromSource(actor, scraperSlug, entitiesBySlug, existingProfilesByActorEntityId)),
				Promise.reject(new Error()),
			);
		} catch (error) {
			if (error.code !== 'PROFILE_NOT_AVAILABLE') {
				logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
			}
		}

		return null;
	});

	return profiles.filter(Boolean);
}
2023-07-21 22:49:56 +00:00
/**
 * Store the social media URLs of scraped profiles in `actors_social`.
 *
 * Only profiles that already have a matching `actors_profiles` row (same actor
 * ID and entity ID) are processed; the others are skipped. Inserts run one
 * profile at a time, and conflicting rows are ignored.
 *
 * @param {Object[]} profiles - Profiles with `id` (actor ID), `entity.id` and an optional `social` array of URL strings.
 * @returns {Promise<void>} Resolves once every insert has completed.
 * @throws {TypeError} When a social entry is not a valid URL.
 */
async function associateSocials(profiles) {
	if (profiles.length === 0) {
		return;
	}

	const profileEntries = await knex('actors_profiles').whereIn(['actor_id', 'entity_id'], profiles.map((profile) => [profile.id, profile.entity.id]));

	const profileEntriesByActorIdAndEntityId = profileEntries.reduce((acc, profileEntry) => {
		if (!acc[profileEntry.actor_id]) {
			acc[profileEntry.actor_id] = {};
		}

		acc[profileEntry.actor_id][profileEntry.entity_id] = profileEntry.id;

		return acc;
	}, {});

	// await the chain so callers only continue after all inserts finished, and insert errors reach them
	await profiles.reduce(async (chain, profile) => {
		await chain;

		if (!Array.isArray(profile.social) || profile.social.length === 0) {
			return;
		}

		const profileId = profileEntriesByActorIdAndEntityId[profile.id]?.[profile.entity.id];

		if (!profileId) {
			return;
		}

		await knex('actors_social')
			.insert(profile.social.map((url) => ({
				url,
				// label directly preceding the final hostname label, e.g. 'twitter' for twitter.com
				platform: new URL(url).hostname.match(/([\w-]+)?\.(\w+)$/)?.[1],
				actor_id: profile.id,
				profile_id: profileId,
			})))
			.onConflict()
			.ignore();
	}, Promise.resolve());
}
2020-08-12 18:51:08 +00:00
/**
 * Resolve the list of actor names to work on.
 *
 * Returns the supplied names unchanged when there are any. Otherwise selects
 * the names of all actors that have no profile whose `updated_at` is at or
 * before the cutoff (`argv.actorsUpdate`, or the current time when unset).
 *
 * @param {string[]} actorNames - Explicitly requested actor names; may be empty.
 * @returns {Promise<string[]>} Actor names.
 */
async function getActorNames(actorNames) {
	const hasExplicitNames = actorNames.length > 0;

	if (hasExplicitNames) {
		return actorNames;
	}

	const cutoff = argv.actorsUpdate || new Date();

	const { rows } = await knex.raw(`
		SELECT actors.name
		FROM actors
		WHERE NOT EXISTS (
			SELECT *
			FROM actors_profiles
			WHERE actors_profiles.actor_id = actors.id
			AND actors_profiles.updated_at <= (?)
		)
	`, [cutoff]);

	return rows.map(({ name }) => name);
}
2020-08-30 02:18:47 +00:00
/**
 * Store scraped profiles: attach avatars, record social links, upsert the
 * profile rows, then re-interpolate the affected actors.
 *
 * @param {Object[]} profiles - Curated profiles; `id` is the actor ID.
 * @returns {Promise<void>}
 */
async function storeProfiles(profiles) {
	const profilesWithAvatarIds = await associateAvatars(profiles);
	const actorIds = [...new Set(profiles.map(({ id }) => id))];

	// NOTE(review): associateSocials looks up existing actors_profiles rows, and runs before
	// upsertProfiles inserts new ones — confirm this ordering is intended
	await associateSocials(profiles);
	await upsertProfiles(profilesWithAvatarIds);
	await interpolateProfiles(actorIds);
}
2020-08-12 18:51:08 +00:00
/**
 * Scrape profiles for the given actors, creating actor entries that do not
 * exist yet.
 *
 * Sources come from `argv.profileSources`, then `config.profiles`, then every
 * key of the actor scraper map. Results are printed when `argv.report` is set
 * and stored when `argv.save` is set.
 *
 * @param {string[]} argNames - Actor names; resolved through `getActorNames`.
 * @returns {Promise<Object[]>} The scraped profiles.
 */
async function scrapeActors(argNames) {
	const actorNames = await getActorNames(argNames);
	const baseActors = toBaseActors(actorNames);

	logger.info(`Scraping profiles for ${actorNames.length} actors`);

	// `scrapers` is already the actor scraper map (see its require), so its own keys are the fallback sources
	const sources = argv.profileSources || config.profiles || Object.keys(scrapers);
	const entitySlugs = sources.flat();

	const [entitiesBySlug, existingActorEntries] = await Promise.all([
		fetchEntitiesBySlug(entitySlugs, 'desc'),
		knex('actors')
			.select(knex.raw('actors.id, actors.name, actors.slug, actors.entry_id, actors.entity_id, row_to_json(entities) as entity'))
			.whereIn('actors.slug', baseActors.map((baseActor) => baseActor.slug))
			.whereNull('actors.alias_for')
			.leftJoin('entities', 'entities.id', 'actors.entity_id')
			.groupBy('actors.id', 'entities.id'),
	]);

	// NOTE(review): the query selects `entry_id`, but `entryId` is read here — confirm which key the rows carry
	const existingActorEntriesBySlugAndEntryId = existingActorEntries.reduce((acc, actorEntry) => ({
		...acc,
		[actorEntry.slug]: {
			...acc[actorEntry.slug],
			[actorEntry.entryId || null]: actorEntry,
		},
	}), {});

	const newBaseActors = baseActors.filter((baseActor) => !existingActorEntriesBySlugAndEntryId[baseActor.slug]?.[baseActor.entryId]);

	// a batch is only created when there are actors to insert
	const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
	const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);

	// TODO: associate entity when entry ID is provided
	const newActorEntries = batchId && await bulkInsert('actors', curatedActorEntries);

	const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

	const existingProfiles = await knex('actors_profiles')
		.select(knex.raw('actors_profiles.*, row_to_json(avatars) as avatar'))
		.whereIn('actor_id', actors.map((actor) => actor.id))
		.leftJoin('media as avatars', 'avatars.id', 'actors_profiles.avatar_media_id');

	const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
		...acc,
		[profile.actor_id]: {
			...acc[profile.actor_id],
			[profile.entity_id]: profile,
		},
	}), {});

	const profilesPerActor = await Promise.map(
		actors,
		async (actor) => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
		{ concurrency: 10 },
	);

	const profiles = profilesPerActor.flat().filter(Boolean);

	logger.info(`Scraped ${profiles.length} profiles`);

	if (argv.report) {
		console.log(util.inspect(profiles, { depth: Infinity, colors: true }));
	}

	if (argv.save) {
		await storeProfiles(profiles);
	}

	return profiles;
}
2020-05-13 00:56:20 +00:00
/**
 * Look up the actor entries matching the given base actors, insert the ones
 * that do not exist yet, and store profiles for newly inserted actors flagged
 * with `hasProfile`.
 *
 * An existing actor matches when the slug is equal and either the actor has no
 * entity and no collision is likely, or entity and entry ID are both equal.
 *
 * @param {Object[]} baseActors - Base actors with `slug`, `entity`, `entryId` and optionally `hasProfile`.
 * @param {number} batchId - Batch ID passed on to `curateActorEntries`.
 * @returns {Promise<Object[]>} Newly inserted actor rows followed by the existing ones.
 */
async function getOrCreateActors(baseActors, batchId) {
	if (baseActors.length === 0) {
		// an empty VALUES list is invalid SQL, and there is nothing to look up or create
		return [];
	}

	// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
	const actorValues = baseActors.map((actor) => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
		slug: actor.slug,
		entityId: actor.entity.id,
		entryId: actor.entryId,
		collisionLikely: getCollisionLikely(actor),
	})).join(', ');

	const existingActors = await knex
		.select('actors.*')
		.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
		.whereRaw(`
			actors.slug = base_actors.slug
			AND actors.entity_id IS NULL
			AND NOT base_actors.collision_likely
		`)
		.orWhereRaw(`
			actors.slug = base_actors.slug
			AND actors.entity_id = base_actors.entity_id
			AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
				OR actors.entry_id = base_actors.entry_id)
		`);

	// entity ID -> entry ID -> slug -> true; null IDs end up under the 'null' key
	const existingActorSlugs = existingActors.reduce((acc, actor) => ({
		...acc,
		[actor.entity_id]: {
			...acc[actor.entity_id],
			[actor.entry_id]: {
				...acc[actor.entity_id]?.[actor.entry_id],
				[actor.slug]: true,
			},
		},
	}), {});

	const uniqueBaseActors = baseActors.filter((baseActor) => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
	const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);

	const insertedActors = await bulkInsert('actors', curatedActorEntries);

	// normalise up front: the return path has always guarded against a non-array insert result,
	// and the reduce below needs the same protection
	const newActors = Array.isArray(insertedActors) ? insertedActors : [];

	const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
		...acc,
		[actor.entity_id]: {
			...acc[actor.entity_id],
			[actor.entry_id]: {
				...acc[actor.entity_id]?.[actor.entry_id],
				[actor.slug]: actor.id,
			},
		},
	}), {});

	// only actors inserted just now get a profile here; base actors without a new ID are dropped
	const newActorProfiles = await Promise.all(baseActors
		.filter((actor) => actor.hasProfile)
		.map((actor) => ({
			...actor,
			id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
		}))
		.filter((actor) => !!actor.id)
		.map((actor) => curateProfile(actor)));

	await storeProfiles(newActorProfiles);

	return newActors.concat(existingActors);
}
2021-03-06 23:01:02 +00:00
/**
 * Associate the actors or directors of the given releases with their actor
 * entries, creating missing entries, and insert the links into
 * `releases_actors` or `releases_directors`.
 *
 * Any failure is logged and results in an empty array; it is not thrown.
 *
 * @param {Object[]} releases - Releases with `id` and `actors`, or `director`/`directors`.
 * @param {number} batchId - Batch ID passed on to `getOrCreateActors`.
 * @param {string} [type='actors'] - Either 'actors' or 'directors'.
 * @returns {Promise<Object[]>} The actor entries involved, or an empty array when there is nothing to associate or on failure.
 */
async function associatePeople(releases, batchId, type = 'actors') {
	try {
		const baseActorsByReleaseId = releases.reduce((acc, release) => {
			if (type === 'actors' && release.actors) {
				acc[release.id] = toBaseActors(release.actors, release);
			}

			if (type === 'directors' && (release.director || release.directors)) {
				acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release);
			}

			return acc;
		}, {});

		const baseActors = Object.values(baseActorsByReleaseId).flat();

		if (baseActors.length === 0) {
			return [];
		}

		// deduplicate by slug, the last occurrence wins
		const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
			...acc,
			[baseActor.slug]: baseActor,
		}), {});

		const uniqueBaseActors = Object.values(baseActorsBySlug);
		const actors = await getOrCreateActors(uniqueBaseActors, batchId);

		const personKey = ({
			actors: 'actor_id',
			directors: 'director_id',
		})[type];

		// entity ID -> entry ID -> slug -> association columns; an alias links to the actor it is an alias for
		const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
			...acc,
			[actor.entity_id]: {
				...acc[actor.entity_id],
				[actor.entry_id]: {
					...acc[actor.entity_id]?.[actor.entry_id],
					[actor.slug]: {
						[personKey]: actor.alias_for || actor.id,
						alias_id: actor.alias_for ? actor.id : null,
					},
				},
			},
		}), {});

		const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
			.map(([releaseId, releaseActors]) => releaseActors
				.map((releaseActor) => ({
					release_id: releaseId,
					// fall back to an entity-less actor; optional chaining keeps one unmatched actor from aborting all associations
					...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null?.null?.[releaseActor.slug]),
				})))
			.flat();

		const validReleaseActorAssociations = releaseActorAssociations.filter((association) => association.release_id && association[personKey]);

		if (releaseActorAssociations.length > validReleaseActorAssociations.length) {
			const invalidReleaseActorAssociations = releaseActorAssociations.filter((association) => !association.release_id || !association[personKey]);

			logger.error(invalidReleaseActorAssociations);
		}

		await bulkInsert(`releases_${type}`, validReleaseActorAssociations, false);

		logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);

		return actors;
	} catch (error) {
		logger.error(`Failed to associate actors: ${error.message}`);

		return [];
	}
}
2021-03-06 23:01:02 +00:00
/**
 * Associate the actors of the given releases; delegates to `associatePeople`
 * with type 'actors'.
 *
 * @param {Object[]} releases - Releases to associate actors for.
 * @param {number} batchId - Batch ID passed on to `associatePeople`.
 * @returns {Promise<Object[]>} Whatever `associatePeople` resolves to.
 */
async function associateActors(releases, batchId) {
	const actors = await associatePeople(releases, batchId, 'actors');

	return actors;
}
/**
 * Associate the directors of the given releases; delegates to
 * `associatePeople` with type 'directors'.
 *
 * @param {Object[]} releases - Releases to associate directors for.
 * @param {number} batchId - Batch ID passed on to `associatePeople`.
 * @returns {Promise<Object[]>} Whatever `associatePeople` resolves to.
 */
async function associateDirectors(releases, batchId) {
	const directors = await associatePeople(releases, batchId, 'directors');

	return directors;
}
2020-05-19 23:11:32 +00:00
/**
 * Fetch a single actor together with its entity, alias, countries, avatar and
 * profiles.
 *
 * @param {number|string} actorId - Matched against `actors.id` when it converts to a number, against `actors.slug` otherwise.
 * @returns {Promise<Object>} The result of `curateActor` for the first matching row.
 */
async function fetchActor(actorId) {
	// anything that does not convert to a number is treated as a slug
	const isSlug = Number.isNaN(Number(actorId));
	const identifierColumn = isSlug ? 'actors.slug' : 'actors.id';

	const selection = knex.raw(`
		actors.*,
		row_to_json(entities) as entity,
		row_to_json(actor_alias) as alias,
		row_to_json(birth_country) as birth_country,
		row_to_json(residence_country) as residence_country,
		row_to_json(media) as avatar,
		json_agg(actors_profiles) as profiles
	`);

	const actor = await knex('actors')
		.select(selection)
		.where(identifierColumn, actorId)
		.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
		.leftJoin('actors_profiles', 'actors.id', 'actors_profiles.actor_id')
		.leftJoin('entities', 'entities.id', 'actors.entity_id')
		.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
		.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
		.leftJoin('media', 'media.id', 'actors.avatar_media_id')
		.groupBy('actors.id', 'entities.id', 'actor_alias.id', 'birth_country.alpha2', 'residence_country.alpha2', 'media.id')
		.first();

	return curateActor(actor, true);
}
/**
 * Search actors through the `search_actors` database function, limited to 100
 * rows.
 *
 * @param {string} query - Search term bound as the function's argument.
 * @returns {Promise<Object[]>} Each row passed through `curateActor`.
 */
async function searchActors(query) {
	const searchResults = knex.raw('search_actors(?) as actors', [query]);

	const rows = await knex
		.select('*')
		.from(searchResults)
		.limit(100);

	// explicit arrow so the map index is not passed on as curateActor's second argument
	return rows.map((row) => curateActor(row));
}
2020-12-30 02:19:09 +00:00
/**
 * Delete the profiles of the given actors, re-interpolate those actors and
 * flush media that is no longer referenced.
 *
 * @param {Array<number|string>} actorIdsOrNames - Actor IDs or names, passed to `fetchProfiles` and `interpolateProfiles`.
 * @returns {Promise<void>}
 */
async function flushProfiles(actorIdsOrNames) {
	const profiles = await fetchProfiles(actorIdsOrNames);

	const actorNames = [...new Set(profiles.map(({ actor }) => actor.name))];
	const profileIds = profiles.map(({ id }) => id);

	const deleteCount = await knex('actors_profiles')
		.whereIn('id', profileIds)
		.delete();

	await interpolateProfiles(actorIdsOrNames);
	await flushOrphanedMedia(); // don't flush until main avatar is detached by re-interpolating

	// list the names only for a manageable number of actors
	let summary = `Removed ${deleteCount} profiles`;

	if (actorNames.length > 20) {
		summary = `Removed ${deleteCount} profiles for ${actorNames.length} actors`;
	} else if (deleteCount > 0) {
		summary = `Removed ${deleteCount} profiles for ${actorNames.join(', ')}`;
	}

	logger.info(summary);
}
2022-03-04 22:31:59 +00:00
/**
 * Delete actors, and every scene they are linked to, in chunks.
 *
 * Numbers are matched against actor IDs; strings are matched against the names
 * of actors without an entity. Orphaned media is flushed afterwards.
 *
 * @param {Array<number|string>} allActorIdsOrNames - Actor IDs and/or names.
 * @returns {Promise<number>} Total number of deleted actors.
 */
async function deleteActors(allActorIdsOrNames) {
	const countsPerChunk = await Promise.map(chunk(allActorIdsOrNames), async (actorIdsOrNames) => {
		const ids = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number');
		const names = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string');

		const actors = await knex('actors')
			.whereIn('id', ids)
			.orWhere((builder) => {
				builder
					.whereIn('name', names)
					.whereNull('entity_id');
			});

		const actorIds = actors.map(({ id }) => id);

		const sceneIds = await knex('releases_actors')
			.select('releases.id')
			.whereIn('actor_id', actorIds)
			.leftJoin('releases', 'releases.id', 'releases_actors.release_id')
			.pluck('id');

		const [deletedScenesCount, deletedActorsCount] = await Promise.all([
			deleteScenes(sceneIds),
			knex('actors')
				.whereIn('id', actorIds)
				.delete(),
		]);

		return { deletedScenesCount, deletedActorsCount };
	}, { concurrency: 10 });

	const totals = countsPerChunk.reduce((acc, counts) => ({
		actors: acc.actors + counts.deletedActorsCount,
		scenes: acc.scenes + counts.deletedScenesCount,
	}), { actors: 0, scenes: 0 });

	await flushOrphanedMedia();

	logger.info(`Removed ${totals.actors} actors with ${totals.scenes} scenes`);

	return totals.actors;
}
2021-02-05 03:23:13 +00:00
/**
 * Delete every actor after an interactive confirmation prompt.
 *
 * Nothing is deleted when the prompt is declined; the default answer is no.
 *
 * @returns {Promise<void>}
 */
async function flushActors() {
	const actorIds = await knex('actors').select('id').pluck('id');

	const { flushActors: isConfirmed } = await inquirer.prompt([{
		type: 'confirm',
		name: 'flushActors',
		message: `You are about to remove ${actorIds.length} actors. Are you sure?`,
		default: false,
	}]);

	if (!isConfirmed) {
		logger.warn('Confirmation rejected, not flushing actors');
		return;
	}

	const deleteCount = await deleteActors(actorIds);

	await flushOrphanedMedia();

	logger.info(`Removed ${deleteCount}/${actorIds.length} actors`);
}
2019-11-10 03:20:22 +00:00
// Functions this module exposes to the rest of the application.
module . exports = {
2020-05-14 02:26:05 +00:00
associateActors ,
2021-03-06 23:01:02 +00:00
associateDirectors ,
2021-02-05 03:23:13 +00:00
deleteActors ,
2020-05-19 23:11:32 +00:00
fetchActor ,
2020-12-30 02:19:09 +00:00
flushActors ,
2020-12-30 01:23:43 +00:00
flushProfiles ,
interpolateProfiles ,
2020-05-14 02:26:05 +00:00
scrapeActors ,
2020-05-19 23:38:58 +00:00
searchActors ,
2020-10-29 15:06:20 +00:00
toBaseActors ,
2019-11-10 03:20:22 +00:00
} ;