2020-03-16 03:10:52 +00:00
'use strict' ;
2020-03-16 23:58:03 +00:00
const Promise = require ( 'bluebird' ) ;
2020-03-16 03:10:52 +00:00
const argv = require ( './argv' ) ;
2020-03-21 01:48:24 +00:00
const include = require ( './utils/argv-include' ) ( argv ) ;
2020-03-16 03:10:52 +00:00
const logger = require ( './logger' ) ( _ _filename ) ;
const knex = require ( './knex' ) ;
const scrapers = require ( './scrapers/scrapers' ) ;
const { curateSites } = require ( './sites' ) ;
const { curateNetworks } = require ( './networks' ) ;
/**
 * Derive a site slug from a release URL.
 *
 * The slug is the hostname label that directly precedes the last
 * dot-separated label, e.g. 'https://www.example.com/scene/1' yields 'example'.
 *
 * @param {string} url - Absolute URL of a release.
 * @returns {string|null} The slug, or null when the URL cannot be parsed or
 *   its hostname does not contain a label followed by a dot-separated suffix.
 */
function urlToSiteSlug(url) {
  try {
    const { hostname } = new URL(url);
    const slug = hostname.match(/([\w-]+)\.\w+$/)?.[1];

    // A hostname without a dot (e.g. 'localhost') does not match; report it
    // with the same null the parse-failure path returns instead of undefined.
    return slug ?? null;
  } catch (error) {
    logger.warn(` Failed to derive site slug from ' ${ url } ': ${ error . message } `);

    return null;
  }
}
/**
 * Load the sites (and networks) referenced by the URLs of the given base
 * releases and index them by slug.
 *
 * Only releases that have a URL and no site attached yet are considered.
 * Their URL-derived slugs are looked up in both the 'sites' table (joined with
 * its network columns) and the 'networks' table. Network matches are marked
 * with `isFallback: true`; a site with the same slug takes precedence over a
 * network.
 *
 * @param {Array<Object>} baseReleases - Base releases, as produced by toBaseReleases.
 * @returns {Promise<Object>} Curated site and network entries keyed by slug;
 *   an empty object when no slug could be derived.
 */
async function findSites(baseReleases) {
  const siteSlugs = Array.from(new Set(
    baseReleases
      .filter(release => release.url && !release.site)
      .map(release => urlToSiteSlug(release.url))
      .filter(Boolean),
  ));

  // Nothing to look up; skip the database round trips entirely.
  if (siteSlugs.length === 0) {
    return {};
  }

  // The two lookups are independent of each other, so run them concurrently.
  const [siteEntries, networkEntries] = await Promise.all([
    knex('sites')
      .leftJoin('networks', 'networks.id', 'sites.network_id')
      .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
      .whereIn('sites.slug', siteSlugs),
    knex('networks').whereIn('slug', siteSlugs),
  ]);

  const [sites, networks] = await Promise.all([
    curateSites(siteEntries, true, false),
    curateNetworks(networkEntries, true, false, false),
  ]);

  const markedNetworks = networks.map(network => ({ ...network, isFallback: true }));

  // Networks go first so that a site sharing a slug overwrites the fallback.
  return Object.fromEntries(
    [...markedNetworks, ...sites].map(site => [site.slug, site]),
  );
}
/**
 * Normalize a mixed list of base releases and plain URLs into base releases.
 *
 * - A string starting with 'http' becomes `{ url, deep: false }`.
 * - A non-null, non-array object (with or without a URL) is copied and gets
 *   `deep: false`.
 * - Anything else (null, undefined, numbers, arrays, other strings) is
 *   discarded with a warning instead of throwing.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base release objects.
 * @returns {Array<Object>} Base releases, each flagged `deep: false`.
 */
function toBaseReleases(baseReleasesOrUrls) {
  return baseReleasesOrUrls
    .map((baseReleaseOrUrl) => {
      if (typeof baseReleaseOrUrl === 'string') {
        if (/^http/.test(baseReleaseOrUrl)) {
          // URL
          return {
            url: baseReleaseOrUrl,
            deep: false,
          };
        }
      } else if (baseReleaseOrUrl !== null && typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) {
        // base release, with or without URL; `typeof null` is 'object', hence the explicit null check
        return {
          ...baseReleaseOrUrl,
          deep: false,
        };
      }

      logger.warn(` Malformed base release, discarding ' ${ baseReleaseOrUrl } ' `);

      return null;
    })
    .filter(Boolean);
}
/**
 * Resolve the site of a base release and, when deep scraping is enabled,
 * fetch the full release through the site's scraper.
 *
 * The base release is returned unchanged when no site is known, when no
 * scraper is registered for the site's slug, when the scraper lacks the fetch
 * method for `type`, or when the fetch throws. When the release has neither a
 * URL nor a path, or `argv.deep` is falsy, the base release is returned with
 * the site attached and no scrape is attempted.
 *
 * @param {Object} baseRelease - Base release; may carry `site`, `url`, `path` and `tags`.
 * @param {Object} sites - Sites keyed by slug, as returned by findSites.
 * @param {string} [type='scene'] - 'scene' uses scraper.fetchScene, anything else scraper.fetchMovie.
 * @returns {Promise<Object>} The base release, or the base release merged with the scraped data.
 */
async function scrapeRelease(baseRelease, sites, type = 'scene') {
  // Only derive a slug when there is a URL to derive it from; this avoids a
  // spurious parse-failure warning for releases that have none.
  const site = baseRelease.site
    || (baseRelease.url ? sites[urlToSiteSlug(baseRelease.url)] : undefined);

  if (!site) {
    logger.warn(` No site available for ${ baseRelease . url } `);
    return baseRelease;
  }

  if ((!baseRelease.url && !baseRelease.path) || !argv.deep) {
    return {
      ...baseRelease,
      site,
    };
  }

  const scraper = scrapers.releases[site.slug];

  if (!scraper) {
    logger.warn(` Could not find scraper for ${ baseRelease . url } `);
    return baseRelease;
  }

  if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) {
    logger.warn(` The ' ${ site . name } '-scraper cannot fetch individual ${ type } s `);
    return baseRelease;
  }

  try {
    const scrapedRelease = type === 'scene'
      ? await scraper.fetchScene(baseRelease.url, site, baseRelease, null, include)
      : await scraper.fetchMovie(baseRelease.url, site, baseRelease, null, include);

    const mergedRelease = {
      ...baseRelease,
      ...scrapedRelease,
      // only flag the release as deep when the scraper actually returned something
      deep: !!scrapedRelease,
      site,
    };

    if (scrapedRelease && baseRelease.tags) {
      // Keep the base tags alongside the scraped ones. Fall back to an empty
      // list so a scrape without tags does not append a literal `undefined`.
      mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags || []);
    }

    return mergedRelease;
  } catch (error) {
    logger.error(` Deep scrape failed for ${ baseRelease . url } : ${ error . message } `);
    return baseRelease;
  }
}
2020-03-21 01:48:24 +00:00
/**
 * Run scrapeRelease over every base release, at most 10 at a time.
 *
 * Relies on `Promise.map`, which is provided by the bluebird `Promise`
 * required at the top of this file (native promises have no `map`).
 *
 * @param {Array<Object>} baseReleases - Base releases to process.
 * @param {Object} sites - Sites keyed by slug, as returned by findSites.
 * @param {string} [type] - Release type handed through to scrapeRelease.
 * @returns {Promise<Array<Object>>} One result per base release, in input order.
 */
async function scrapeReleases(baseReleases, sites, type) {
  return Promise.map(
    baseReleases,
    baseRelease => scrapeRelease(baseRelease, sites, type),
    { concurrency: 10 },
  );
}
2020-03-21 01:48:24 +00:00
/**
 * Turn a list of URLs and/or base releases into scraped releases.
 *
 * The input is normalized with toBaseReleases, the sites involved are loaded
 * with findSites, and every release is then handed to scrapeReleases.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base release objects.
 * @param {string} [type='scene'] - Release type handed through to scrapeReleases.
 * @returns {Promise<Array<Object>>} The releases returned by scrapeReleases.
 */
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
  const baseReleases = toBaseReleases(baseReleasesOrUrls);
  const sites = await findSites(baseReleases);

  return scrapeReleases(baseReleases, sites, type);
}
2020-03-21 01:48:24 +00:00
/**
 * Fetch releases of type 'scene' for the given URLs and/or base releases.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base release objects.
 * @returns {Promise<Array<Object>>} Whatever fetchReleases resolves to for type 'scene'.
 */
async function fetchScenes(baseReleasesOrUrls) {
  const scenes = await fetchReleases(baseReleasesOrUrls, 'scene');

  return scenes;
}
/**
 * Fetch releases of type 'movie' for the given URLs and/or base releases.
 *
 * @param {Array<string|Object>} baseReleasesOrUrls - URLs and/or base release objects.
 * @returns {Promise<Array<Object>>} Whatever fetchReleases resolves to for type 'movie'.
 */
async function fetchMovies(baseReleasesOrUrls) {
  const movies = await fetchReleases(baseReleasesOrUrls, 'movie');

  return movies;
}
// Public API: the generic fetcher plus its scene and movie shorthands.
module.exports = { fetchReleases, fetchScenes, fetchMovies };