2020-03-16 03:10:52 +00:00
'use strict' ;
const config = require ( 'config' ) ;
2020-05-17 02:59:09 +00:00
const argv = require ( './argv' ) ;
2020-03-21 01:48:24 +00:00
const logger = require ( './logger' ) ( _ _filename ) ;
2020-03-16 03:10:52 +00:00
const knex = require ( './knex' ) ;
const slugify = require ( './utils/slugify' ) ;
2020-05-16 02:36:45 +00:00
const { associateActors , scrapeActors } = require ( './actors' ) ;
2020-03-29 02:00:46 +00:00
const { associateReleaseTags } = require ( './tags' ) ;
2020-03-30 01:01:08 +00:00
const { curateSite } = require ( './sites' ) ;
2020-03-29 02:00:46 +00:00
const { associateReleaseMedia } = require ( './media' ) ;
2020-03-16 03:10:52 +00:00
/**
 * Translate a scraped release into a row object using the snake_case
 * column names written to the `releases` table.
 *
 * @param {object} release - scraped release, optionally carrying `site` and `studio` objects
 * @param {*} batchId - ID of the batch this store run belongs to
 * @param {object} [existingRelease] - previously stored counterpart, if any
 * @returns {object} row object; `created_batch_id` is only present for new releases
 */
function curateReleaseEntry(release, batchId, existingRelease) {
  // fall back to the joined actor list when the release has no title
  const slugSource = release.title || release.actors?.join('-') || null;
  const slug = slugify(slugSource, '-', {
    encode: true,
    limit: config.titleSlugLength,
  });

  // anything that does not coerce to a non-zero number is stored as NULL
  const date = Number(release.date) ? release.date : null;
  const isNewRelease = !(existingRelease || release.id);

  const entry = {
    title: release.title,
    entry_id: release.entryId || null,
    entity_id: release.site?.id,
    shoot_id: release.shootId || null,
    studio_id: release.studio?.id || null,
    url: release.url,
    date,
    slug,
    description: release.description,
    duration: release.duration,
    type: release.type,
    // director: release.director,
    // likes: release.rating && release.rating.likes,
    // dislikes: release.rating && release.rating.dislikes,
    // rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
    deep: release.deep === true,
    deep_url: release.deepUrl,
    updated_batch_id: batchId,
  };

  // the creation batch is only stamped on releases that were not stored before
  return isNewRelease
    ? { ...entry, created_batch_id: batchId }
    : entry;
}
2020-03-16 23:58:03 +00:00
/**
 * Resolve the `channel` slug of each release to a curated site.
 *
 * Releases that name a channel but have no site, a network-level site, or a
 * site whose slug differs from the channel are looked up in `entities`
 * (joined with their parent). Each release then resolves as follows:
 *  - channel found: `site` is replaced by the curated channel entity;
 *  - otherwise, a non-network `site` is kept as is;
 *  - otherwise, a network `site` is moved to `network` and `site` set to null;
 *  - otherwise the release is logged as unmatched and dropped.
 *
 * @param {object[]} releases - scraped releases
 * @returns {Promise<object[]>} releases that could be resolved, in input order
 */
async function attachChannelSites(releases) {
  const releasesWithoutSite = releases.filter(release => release.channel
    && (!release.site || release.site.isNetwork || release.site.slug !== release.channel));

  // query every slug once, and not at all when nothing needs resolving
  const channelSlugs = [...new Set(releasesWithoutSite.map(release => release.channel))];

  const channelSites = channelSlugs.length > 0
    ? await knex('entities')
      .leftJoin('entities AS parents', 'parents.id', 'entities.parent_id')
      .select('entities.*', 'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.parameters as network_parameters', 'parents.description as network_description')
      .whereIn('entities.slug', channelSlugs)
    : [];

  // a Map is built in linear time and, unlike a plain object, never resolves
  // slugs such as 'constructor' or 'toString' to inherited properties
  const channelSitesBySlug = new Map(channelSites.map(site => [site.slug, site]));

  const releasesWithChannelSite = await Promise.all(releases.map(async (release) => {
    const channelSite = release.channel && channelSitesBySlug.get(release.channel);

    if (channelSite) {
      const curatedSite = await curateSite(channelSite);

      return {
        ...release,
        site: curatedSite,
      };
    }

    if (release.site && !release.site.isNetwork) {
      return release;
    }

    if (release.site && release.site.isNetwork) {
      return {
        ...release,
        site: null,
        network: release.site,
      };
    }

    logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);

    return null;
  }));

  return releasesWithChannelSite.filter(Boolean);
}
/**
 * Replace the studio slug on each release with the matching `studios` row.
 *
 * Releases without a studio are returned untouched. Releases whose studio
 * slug has no matching row keep the slug and a warning is logged.
 *
 * @param {object[]} releases - releases whose `studio`, when set, is a studio slug
 * @returns {Promise<object[]>} releases in input order, with matched studios attached
 */
async function attachStudios(releases) {
  // look every slug up once, and skip the query when no release names a studio
  const studioSlugs = [...new Set(releases.map(release => release.studio).filter(Boolean))];

  const studios = studioSlugs.length > 0
    ? await knex('studios').whereIn('slug', studioSlugs)
    : [];

  // a Map is built in linear time and, unlike a plain object, never resolves
  // slugs such as 'constructor' to inherited properties
  const studiosBySlug = new Map(studios.map(studio => [studio.slug, studio]));

  return releases.map((release) => {
    if (!release.studio) {
      return release;
    }

    const studio = studiosBySlug.get(release.studio);

    if (studio) {
      return {
        ...release,
        studio,
      };
    }

    logger.warn(`Unable to match studio '${release.studio}' for ${release.url}`);

    return release;
  });
}
2020-03-22 02:50:24 +00:00
/**
 * Attach the database ID of the matching stored row to each release.
 *
 * A stored row matches when its `entity_id` equals the release's `site.id`
 * and its `entry_id` equals the release's `entryId`; both are compared as
 * object keys, i.e. after string coercion. Releases without a matching row
 * get `id: undefined` instead of raising a TypeError.
 *
 * @param {object[]} releases - releases carrying `site.id` and `entryId`
 * @param {object[]} storedReleases - rows carrying `id`, `entity_id` and `entry_id`
 * @returns {object[]} copies of the releases with an `id` property
 */
function attachReleaseIds(releases, storedReleases) {
  // null-prototype dictionaries so entry IDs like 'constructor' can never
  // resolve to inherited properties
  const storedIdsByEntityAndEntry = Object.create(null);

  for (const stored of storedReleases) {
    if (!storedIdsByEntityAndEntry[stored.entity_id]) {
      storedIdsByEntityAndEntry[stored.entity_id] = Object.create(null);
    }

    storedIdsByEntityAndEntry[stored.entity_id][stored.entry_id] = stored.id;
  }

  return releases.map(release => ({
    ...release,
    // the site may have no stored rows at all, e.g. when nothing was inserted
    id: storedIdsByEntityAndEntry[release.site.id]?.[release.entryId],
  }));
}
2020-03-28 03:37:04 +00:00
/**
 * Remove duplicates within a list of releases, where two releases are
 * duplicates when they share both `site.id` and `entryId`. The last
 * occurrence wins. Releases without a site are left out of the result.
 *
 * The result is grouped per site and follows object key enumeration order,
 * so it is not guaranteed to match the input order.
 *
 * @param {object[]} releases - releases, optionally carrying `site` and `entryId`
 * @returns {object[]} releases that are unique per site ID and entry ID
 */
function filterInternalDuplicateReleases(releases) {
  // null-prototype dictionaries: entry IDs are scraped strings, and on a
  // regular object a key like '__proto__' would be swallowed silently
  const releasesBySiteIdAndEntryId = Object.create(null);

  for (const release of releases) {
    if (release.site) {
      if (!releasesBySiteIdAndEntryId[release.site.id]) {
        releasesBySiteIdAndEntryId[release.site.id] = Object.create(null);
      }

      releasesBySiteIdAndEntryId[release.site.id][release.entryId] = release;
    }
  }

  return Object.values(releasesBySiteIdAndEntryId)
    .flatMap(siteReleases => Object.values(siteReleases));
}
/**
 * Split releases into those that are new and those already present in the
 * `releases` table, matching on entry ID plus entity (site) ID.
 *
 * Duplicates within the input itself are removed first through
 * filterInternalDuplicateReleases.
 *
 * @param {object[]} releases - releases carrying `site.id` and `entryId`
 * @returns {Promise<{uniqueReleases: object[], duplicateReleases: object[], duplicateReleaseEntries: object[]}>}
 *   releases not yet stored, releases already stored, and the stored rows that matched
 */
async function filterDuplicateReleases(releases) {
  const internalUniqueReleases = filterInternalDuplicateReleases(releases);

  if (internalUniqueReleases.length === 0) {
    // nothing to compare, no need to query the database
    return {
      uniqueReleases: [],
      duplicateReleases: [],
      duplicateReleaseEntries: [],
    };
  }

  const duplicateReleaseEntries = await knex('releases')
    .whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map(release => [release.entryId, release.site.id]));

  // null-prototype dictionaries, so an entry ID such as 'constructor' is not
  // mistaken for a stored release through an inherited property
  const storedBySiteIdAndEntryId = Object.create(null);

  for (const stored of duplicateReleaseEntries) {
    if (!storedBySiteIdAndEntryId[stored.entity_id]) {
      storedBySiteIdAndEntryId[stored.entity_id] = Object.create(null);
    }

    storedBySiteIdAndEntryId[stored.entity_id][stored.entry_id] = true;
  }

  const uniqueReleases = [];
  const duplicateReleases = [];

  // single pass instead of two complementary filters
  for (const release of internalUniqueReleases) {
    if (storedBySiteIdAndEntryId[release.site.id]?.[release.entryId]) {
      duplicateReleases.push(release);
    } else {
      uniqueReleases.push(release);
    }
  }

  return {
    uniqueReleases,
    duplicateReleases,
    duplicateReleaseEntries,
  };
}
2020-03-29 02:00:46 +00:00
/**
 * Rebuild the full-text search documents in `releases_search`.
 *
 * One document is built per release from its title, entity and parent
 * entity details, shoot ID, date, actors, tags and tag aliases, and is
 * upserted on `release_id`.
 *
 * @param {Array} [releaseIds] - IDs of the releases to refresh; when omitted,
 *   every release is refreshed, and an empty list is a no-op
 * @returns {Promise<void>}
 */
async function updateReleasesSearch(releaseIds) {
  logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all'} releases`);

  if (Array.isArray(releaseIds) && releaseIds.length === 0) {
    // nothing can match an empty ID list, skip the round trip
    return;
  }

  // Every joined column is wrapped in COALESCE: entities and parents come from
  // LEFT JOINs, and a single NULL operand turns the whole || chain, and with it
  // the search document, into NULL.
  const documents = await knex.raw(`
    SELECT
      releases.id AS release_id,
      TO_TSVECTOR(
        'traxxx',
        COALESCE(releases.title, '') || ' ' ||
        COALESCE(parents.name, '') || ' ' ||
        COALESCE(parents.slug, '') || ' ' ||
        COALESCE(parents.url, '') || ' ' ||
        COALESCE(entities.name, '') || ' ' ||
        COALESCE(entities.slug, '') || ' ' ||
        COALESCE(entities.url, '') || ' ' ||
        COALESCE(entities.alias, '') || ' ' ||
        COALESCE(releases.shoot_id, '') || ' ' ||
        COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
        STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
        STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
        STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
      ) as document
    FROM releases
    LEFT JOIN entities ON releases.entity_id = entities.id
    LEFT JOIN entities AS parents ON parents.id = entities.parent_id
    LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
    LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
    LEFT JOIN actors ON local_actors.actor_id = actors.id
    LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
    LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
    ${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
    GROUP BY releases.id, entities.name, entities.slug, entities.alias, entities.url, parents.name, parents.slug, parents.url;
  `, releaseIds && [releaseIds]);

  if (documents.rows?.length > 0) {
    // the insert is rendered to SQL so the upsert clause can be appended
    const query = knex('releases_search').insert(documents.rows).toString();

    await knex.raw(`${query} ON CONFLICT (release_id) DO UPDATE SET document = EXCLUDED.document`);
  }
}
2020-03-16 03:10:52 +00:00
/**
 * Store a list of scraped releases together with their associations.
 *
 * Steps, in order: create a batch row, attach channel sites and studios,
 * separate new releases from ones already stored, insert the new ones,
 * attach database IDs, associate actors and tags, refresh the search
 * documents, associate media and, when `argv.withActors` is set, scrape the
 * associated actors.
 *
 * @param {object[]} releases - scraped releases
 * @returns {Promise<object[]>} new and already stored releases with their `id` attached;
 *   an empty array when no releases were passed
 */
async function storeReleases(releases) {
  if (releases.length === 0) {
    return [];
  }

  const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

  const releasesWithStudios = await attachStudios(await attachChannelSites(releases));

  // uniqueness is site ID + entry ID, filter uniques after adding sites
  const {
    uniqueReleases,
    duplicateReleases,
    duplicateReleaseEntries,
  } = await filterDuplicateReleases(releasesWithStudios);

  const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
  const insertResult = await knex.batchInsert('releases', curatedNewReleaseEntries).returning('*');

  // TODO: update duplicate releases

  // anything other than an array of rows counts as nothing stored
  const storedReleaseEntries = Array.isArray(insertResult) ? insertResult : [];

  const releasesWithId = attachReleaseIds(
    uniqueReleases.concat(duplicateReleases),
    storedReleaseEntries.concat(duplicateReleaseEntries),
  );

  const [actors] = await Promise.all([
    associateActors(releasesWithId, batchId),
    associateReleaseTags(releasesWithId),
  ]);

  await updateReleasesSearch(releasesWithId.map(release => release.id));

  // media is more error-prone, associate separately
  await associateReleaseMedia(releasesWithId);

  if (argv.withActors) {
    await scrapeActors(actors.map(actor => actor.name));
  }

  logger.info(`Stored ${storedReleaseEntries.length} releases`);

  return releasesWithId;
}
module . exports = {
2020-05-14 02:26:05 +00:00
storeReleases ,
updateReleasesSearch ,
2020-03-16 03:10:52 +00:00
} ;