2020-03-16 03:10:52 +00:00
'use strict' ;
2021-12-01 22:30:10 +00:00
const util = require ( 'util' ) ;
2020-03-16 23:58:03 +00:00
const Promise = require ( 'bluebird' ) ;
2022-11-27 03:22:58 +00:00
const unprint = require ( 'unprint' ) ;
2021-08-15 11:16:48 +00:00
const { mergeAdvanced : merge } = require ( 'object-merge-advanced' ) ;
2020-03-16 23:58:03 +00:00
2020-03-16 03:10:52 +00:00
const argv = require ( './argv' ) ;
2020-03-21 01:48:24 +00:00
const include = require ( './utils/argv-include' ) ( argv ) ;
2023-06-04 19:50:59 +00:00
const { fetchReleaseEntities , urlToHostname } = require ( './entities' ) ;
2020-03-16 03:10:52 +00:00
const logger = require ( './logger' ) ( _ _filename ) ;
2021-01-13 20:29:05 +00:00
const qu = require ( './utils/qu' ) ;
2021-02-10 02:23:48 +00:00
const getRecursiveParameters = require ( './utils/get-recursive-parameters' ) ;
2021-12-01 16:26:13 +00:00
const windows = require ( './utils/http-windows' ) ;
2020-03-16 03:10:52 +00:00
2021-12-01 22:30:10 +00:00
const waitImmediate = util . promisify ( setImmediate ) ;
2020-11-15 22:50:04 +00:00
/**
 * Normalise a mixed list of base releases and URLs into base release objects.
 *
 * Each entry may be:
 * - an object with a truthy `url` (base release with URL),
 * - a value whose string form starts with `http` (plain URL),
 * - any other non-array object (base release without URL, passed through).
 *
 * Every resulting release gets `deep: false` and an `entity`; an entity already
 * present on the entry takes precedence over the `entity` argument.
 * Anything else, including `null` and `undefined` entries, is logged and dropped.
 *
 * @param {Array|Object|string|null|undefined} baseReleasesOrUrls - entries to normalise; a single value is treated as a one-element list
 * @param {Object|null} [entity=null] - fallback entity for entries that do not carry their own
 * @returns {Object[]} normalised base releases; empty when the input is falsy
 */
function toBaseReleases(baseReleasesOrUrls, entity = null) {
	if (!baseReleasesOrUrls) {
		return [];
	}

	const discard = (malformed) => {
		logger.warn(`Malformed base release, discarding '${malformed}'`);
		return null;
	};

	// [].concat() leaves arrays as they are and wraps a single URL or base release in a list
	return [].concat(baseReleasesOrUrls)
		.map((baseReleaseOrUrl) => {
			if (baseReleaseOrUrl === null || baseReleaseOrUrl === undefined) {
				// property access below would throw on these, so discard them up front
				return discard(baseReleaseOrUrl);
			}

			if (baseReleaseOrUrl.url) {
				// base release with URL
				return {
					...baseReleaseOrUrl,
					entity: baseReleaseOrUrl.entity || entity,
					deep: false,
				};
			}

			if (/^http/.test(baseReleaseOrUrl)) {
				// URL
				return {
					url: baseReleaseOrUrl,
					entity,
					deep: false,
				};
			}

			if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
				// base release without URL, prepare for passthrough
				return {
					...baseReleaseOrUrl,
					entity: baseReleaseOrUrl.entity || entity,
					deep: false,
				};
			}

			return discard(baseReleaseOrUrl);
		})
		.filter(Boolean);
}
2022-11-27 03:22:58 +00:00
/**
 * Fetch a release page through unprint and hand the response to the scraper.
 *
 * The scraper entry (`scrapeMovie` for movies, `scrapeScene` otherwise) is either
 * a function, or an object with a `scraper` function; in the latter case the
 * object itself is also spread into the request options.
 *
 * @param {Object} scraper - scraper module exposing `scrapeScene` and/or `scrapeMovie`
 * @param {string} url - page to fetch
 * @param {Object} entity - entity the release belongs to
 * @param {Object} baseRelease - release data known before deep scraping
 * @param {Object} options - scrape options; `beforeFetchScenes` and `parameters` are forwarded
 * @param {string} type - `'movie'` selects `scrapeMovie`, anything else `scrapeScene`
 * @returns {Promise<*>} the scraper's result, or the response status when the request is not OK
 */
async function fetchUnprintScene(scraper, url, entity, baseRelease, options, type) {
	const scraperKey = type === 'movie' ? 'scrapeMovie' : 'scrapeScene';
	const releaseScraper = scraper[scraperKey];

	// an object-style scraper doubles as the request options object
	const scraperOptions = releaseScraper.scraper ? releaseScraper : null;

	const res = await unprint.get(url, {
		rejectUnauthorized: false,
		...scraperOptions,
	});

	if (!res.ok) {
		return res.status;
	}

	const scrape = releaseScraper.scraper || releaseScraper;

	const context = {
		url,
		entity,
		baseRelease,
		headers: res.headers,
		include,
		beforeFetchScenes: options.beforeFetchScenes,
		parameters: options.parameters,
	};

	// options parameter should probably be retired
	return scrape(res.context, context, options);
}
2021-12-20 01:22:10 +00:00
/**
 * Deep-fetch a single scene or movie using whatever the scraper supports.
 *
 * Order of preference:
 * 1. the scraper's own `fetchScene` / `fetchMovie`, which does its own request;
 * 2. `scrapeScene` / `scrapeMovie` through unprint, unless the scraper is flagged `deprecated`;
 * 3. `scrapeScene` / `scrapeMovie` through a qu session, for deprecated scrapers.
 *
 * @param {Object} scraper - scraper module
 * @param {string} url - page to fetch
 * @param {Object} entity - entity the release belongs to
 * @param {Object} baseRelease - release data known before deep scraping
 * @param {Object} options - scrape options passed through to the scraper
 * @param {string} [type='scene'] - `'scene'` or `'movie'`
 * @returns {Promise<*>} the scraper's result, a response status when the qu request is not OK,
 *   or null when the scraper has no suitable method for this type
 */
async function fetchScene(scraper, url, entity, baseRelease, options, type = 'scene') {
	const isMovie = type === 'movie';

	if (!isMovie && type !== 'scene') {
		return null;
	}

	const fetchMethod = isMovie ? 'fetchMovie' : 'fetchScene';
	const scrapeMethod = isMovie ? 'scrapeMovie' : 'scrapeScene';

	if (scraper[fetchMethod]) {
		return scraper[fetchMethod](baseRelease.url, entity, baseRelease, options, null);
	}

	if (!scraper[scrapeMethod]) {
		return null;
	}

	if (!scraper.deprecated) {
		return fetchUnprintScene(scraper, url, entity, baseRelease, options, type);
	}

	// deprecated scrapers still receive a qu item together with the session and its cookies
	const session = qu.session();
	const requestOptions = {
		session,
		rejectUnauthorized: false,
	};

	const res = await qu.get(url, null, null, requestOptions);
	const cookie = await session._sessionOptions.cookieJar.get(url);

	if (!res.ok) {
		return res.status;
	}

	return scraper[scrapeMethod](res.item, url, entity, baseRelease, options, {
		session,
		headers: res.headers,
		cookieJar: session._sessionOptions.cookieJar,
		cookie,
	});
}
2021-12-20 01:22:10 +00:00
/**
 * Deep-fetch a movie; delegates to fetchScene with the release type set to 'movie'.
 *
 * @param {Object} scraper - scraper module
 * @param {string} url - page to fetch
 * @param {Object} entity - entity the release belongs to
 * @param {Object} baseRelease - release data known before deep scraping
 * @param {Object} options - scrape options passed through to the scraper
 * @returns {Promise<*>} whatever fetchScene resolves to
 */
function fetchMovie(scraper, url, entity, baseRelease, options) {
	const type = 'movie';

	return fetchScene(scraper, url, entity, baseRelease, options, type);
}
2023-06-04 19:50:59 +00:00
// Drop null/undefined values and arrays without truthy entries from a scraped release,
// and strip falsy entries from the arrays that remain.
// object-merge-advanced treats null as an explicit false on hard merged keys, so these
// keys are removed to make sure the original base value is used instead.
function curateScrapedRelease(scrapedRelease) {
	return Object.fromEntries(
		Object.entries(scrapedRelease)
			.map(([key, value]) => [key, Array.isArray(value) ? value.filter(Boolean) : value])
			.filter(([, value]) => value !== null && value !== undefined && !(Array.isArray(value) && value.length === 0)),
	);
}

/**
 * Deep-scrape a single base release and merge the result into it.
 *
 * The base release is returned as-is when no entity or scraper is available or
 * the scraper cannot handle this release type. It is returned with only the
 * entity attached when there is nothing to fetch (no url, path or forceDeep)
 * or when `argv.deep` is off.
 *
 * @param {Object} baseRelease - release data known before deep scraping
 * @param {Object} entitiesByHostname - entities keyed by hostname, used when the release carries no entity
 * @param {string} [type='scene'] - `'scene'` or `'movie'`
 * @returns {Promise<Object|null>} the merged release; the base release when scraping is skipped or
 *   fails; null when the merged release has no entry ID
 */
async function scrapeRelease(baseRelease, entitiesByHostname, type = 'scene') {
	const entity = baseRelease.entity || entitiesByHostname[urlToHostname(baseRelease.url)];

	if (!entity) {
		logger.warn(`No entity available for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((!baseRelease.url && !baseRelease.path && !baseRelease.forceDeep) || !argv.deep) {
		return {
			...baseRelease,
			entity,
		};
	}

	const layoutScraper = entity.scraper;

	if (!layoutScraper) {
		logger.warn(`Could not find scraper for ${baseRelease.url}`);
		return baseRelease;
	}

	if ((type === 'scene' && !layoutScraper.fetchScene && !layoutScraper.scrapeScene) || (type === 'movie' && !layoutScraper.fetchMovie && !layoutScraper.scrapeMovie)) {
		logger.warn(`The '${entity.name}'-scraper cannot scrape individual ${type}s`);
		return baseRelease;
	}

	try {
		logger.verbose(`Fetching ${type} ${baseRelease.url || baseRelease.path}`);

		const options = {
			...include,
			beforeFetchScenes: entity.preData,
			parameters: getRecursiveParameters(entity),
		};

		logger.debug(`Memory usage before: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

		const rawScrapedRelease = type === 'scene'
			? await fetchScene(layoutScraper, baseRelease.url, entity, baseRelease, options)
			: await fetchMovie(layoutScraper, baseRelease.url, entity, baseRelease, options);

		const pathname = baseRelease.path || (baseRelease.url && new URL(baseRelease.url).pathname.replace(/\//g, '_'));

		if (rawScrapedRelease) {
			delete rawScrapedRelease.query; // some scrapers pass the qu-wrapped window instance to parent scrapers, filling up memory
		}

		if (windows.has(pathname)) {
			logger.debug(`Closing window for ${pathname}`);

			windows.get(pathname).close();
			windows.delete(pathname);
		}

		// the promisified setImmediate must be called; awaiting the function itself resolves
		// right away without yielding to the event loop
		await waitImmediate();

		logger.debug(`Memory usage after: ${process.memoryUsage.rss() / 1000000} MB (${baseRelease.url})`);

		const scrapedRelease = rawScrapedRelease?.scene || rawScrapedRelease;

		if (!scrapedRelease || typeof scrapedRelease !== 'object' || Array.isArray(scrapedRelease)) {
			// scraper is unable to fetch the releases and returned a HTTP code or null
			throw new Error(`Scraper returned '${scrapedRelease}' when deep fetching (${entity.name}, ${entity.parent?.name}) ${baseRelease.url || baseRelease.path}`);
		}

		const curatedScrapedRelease = curateScrapedRelease(scrapedRelease);

		const mergedRelease = {
			...merge(baseRelease, curatedScrapedRelease, {
				dedupeStringsInArrayValues: true,
				hardMergeKeys: ['actors', 'covers', 'poster', 'trailer', 'teaser'],
				ignoreKeys: ['poster'],
			}),
			datePrecision: curatedScrapedRelease.date // don't inherit date precision from base release
				? curatedScrapedRelease.datePrecision
				: baseRelease.datePrecision,
			// poster is excluded from the merge above; scraped posters go first, then base posters, deduplicated
			poster: Array.from(new Set([
				...[].concat(curatedScrapedRelease.poster),
				...[].concat(baseRelease.poster),
			])).filter(Boolean),
			photos: curatedScrapedRelease.photos?.length > 0
				? curatedScrapedRelease.photos
				: baseRelease.photos,
			deep: true, // scrapedRelease is known to be an object at this point
			entity,
		};

		if (!mergedRelease.entryId) {
			throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' });
		}

		if (baseRelease.tags) {
			// accumulate all available tags; fall back to an empty list so a scraper
			// without tags does not append a literal undefined
			mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags ?? []);
		}

		return mergedRelease;
	} catch (error) {
		logger.error(`Deep scrape failed for ${baseRelease.url || baseRelease.path}: ${error.message}`);

		if (argv.debug) {
			console.error(error);
		}

		if (error.code === 'NO_ENTRY_ID') {
			// a release without entry ID cannot be stored, drop it entirely
			return null;
		}

		return baseRelease;
	}
}
2023-06-04 19:50:59 +00:00
/**
 * Deep-scrape a list of base releases, one at a time.
 *
 * Before scraping, every entity whose scraper exposes `beforeFetchScenes` has
 * that hook called once; its result is attached to a copy of the entity as `preData`.
 *
 * @param {Object[]} baseReleases - releases to deep-scrape
 * @param {Object} entitiesByHostname - entities keyed as provided by the caller
 * @param {string} type - release type passed on to scrapeRelease
 * @returns {Promise<Array>} one scrapeRelease result per base release
 */
async function scrapeReleases(baseReleases, entitiesByHostname, type) {
	const withPreData = async ([slug, entity]) => {
		if (!entity.scraper?.beforeFetchScenes) {
			return [slug, entity];
		}

		const parameters = getRecursiveParameters(entity);
		const preData = await entity.scraper.beforeFetchScenes(entity, parameters);

		return [slug, { ...entity, preData }];
	};

	const entityEntries = await Promise.all(Object.entries(entitiesByHostname).map((entry) => withPreData(entry)));
	const entitiesWithPreData = Object.fromEntries(entityEntries.filter(Boolean));

	const scrapeOne = async (baseRelease) => scrapeRelease(baseRelease, entitiesWithPreData, type);

	// Promise is the module-level Promise binding here, which provides map() with a concurrency option
	return Promise.map(baseReleases, scrapeOne, { concurrency: 1 });
}
2020-03-21 01:48:24 +00:00
/**
 * Deep-fetch releases from a list of base releases and/or URLs.
 *
 * @param {Array} baseReleasesOrUrls - base releases and/or URLs, normalised through toBaseReleases
 * @param {string} [type='scene'] - release type passed on to scrapeReleases
 * @returns {Promise<Object[]>} the scraped releases, with falsy results removed
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
	const normalizedReleases = toBaseReleases(baseReleasesOrUrls);
	const entities = await fetchReleaseEntities(normalizedReleases);
	const scrapedReleases = await scrapeReleases(normalizedReleases, entities, type);

	// scrapeRelease yields null for releases that must be dropped
	return scrapedReleases.filter((release) => Boolean(release));
}
2020-03-21 01:48:24 +00:00
/**
 * Deep-fetch scenes; shorthand for fetchReleases with type 'scene'.
 *
 * @param {Array} baseReleasesOrUrls - base releases and/or URLs
 * @returns {Promise<Object[]>} the scraped scenes
 */
async function fetchScenes(baseReleasesOrUrls) {
	const scenes = await fetchReleases(baseReleasesOrUrls, 'scene');

	return scenes;
}
/**
 * Deep-fetch movies; shorthand for fetchReleases with type 'movie'.
 *
 * @param {Array} baseReleasesOrUrls - base releases and/or URLs
 * @returns {Promise<Object[]>} the scraped movies
 */
async function fetchMovies(baseReleasesOrUrls) {
	return fetchReleases(baseReleasesOrUrls, 'movie');
}
module . exports = {
2020-05-14 02:26:05 +00:00
fetchReleases ,
fetchScenes ,
fetchMovies ,
2020-05-15 02:40:59 +00:00
toBaseReleases ,
2020-03-21 01:48:24 +00:00
} ;