2020-03-14 01:56:28 +00:00
'use strict' ;
const Promise = require ( 'bluebird' ) ;
const moment = require ( 'moment' ) ;
const argv = require ( './argv' ) ;
const logger = require ( './logger' ) ( _ _filename ) ;
const knex = require ( './knex' ) ;
const include = require ( './utils/argv-include' ) ( argv ) ;
const scrapers = require ( './scrapers/scrapers' ) ;
2020-08-13 22:32:59 +00:00
const { fetchIncludedEntities } = require ( './entities' ) ;
2020-03-14 01:56:28 +00:00
2020-03-28 03:37:04 +00:00
async function filterUniqueReleases(latestReleases, accReleases) {
  // [entity ID, entry ID] pairs identifying each freshly scraped release
  const latestReleaseIdentifiers = latestReleases.map(release => [release.entity.id, release.entryId]);

  // releases already stored in the database with a matching identifier pair
  const duplicateReleases = await knex('releases')
    .whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);

  // Build a two-level lookup { entityId: { entryId: true } }. Entry IDs of
  // accumulated releases are included to prevent an infinite scrape loop
  // when one page contains the same release as the previous.
  const duplicateReleasesSiteIdAndEntryIds = {};

  for (const release of [...duplicateReleases, ...accReleases]) {
    // stored rows use snake_case columns, scraped releases use camelCase
    const entityId = release.entity_id || release.entity.id;
    const entryId = release.entry_id || release.entryId;

    if (!duplicateReleasesSiteIdAndEntryIds[entityId]) {
      duplicateReleasesSiteIdAndEntryIds[entityId] = {};
    }

    duplicateReleasesSiteIdAndEntryIds[entityId][entryId] = true;
  }

  // keep only releases not seen in the database or earlier pages
  return latestReleases.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);
}
2020-03-21 01:48:24 +00:00
// Decides whether the paginated scrape should fetch another page, based on
// the unique releases found on the current page and the releases accumulated
// so far. Returns true when more pages are needed.
function needNextPage(uniqueReleases, pageAccReleases) {
  if (uniqueReleases.length === 0) {
    // the entire page was already known; no point in going further back
    return false;
  }

  if (argv.last && pageAccReleases.length < argv.last) {
    // TODO: find a way to paginate if scraper filters page with multiple channels, see Kelly Madison
    return true;
  }

  if (uniqueReleases.every(release => !!release.date)) {
    // Find the oldest release without sorting, so the caller's array is not
    // mutated in place (Array#sort sorts in place) and the scan stays O(n).
    const oldestReleaseOnPage = uniqueReleases.reduce(
      (oldest, release) => (release.date < oldest.date ? release : oldest),
    );

    if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
      // oldest release on page is newer than the specified date cut-off
      return true;
    }
  }

  // dates missing, and limit for scenes without dates not yet reached
  return pageAccReleases.length <= argv.nullDateLimit;
}
2020-06-27 00:57:30 +00:00
// Scrapes latest or upcoming releases for a single entity, paginating until
// needNextPage() says stop, then applies the --last / --after / null-date
// limits. Returns an array of release objects with the entity attached.
async function scrapeReleases(scraper, entity, preData, upcoming = false) {
  const scrapePage = async (page = 1, accReleases = []) => {
    const latestReleases = upcoming
      ? await scraper.fetchUpcoming(entity, page, include, preData)
      : await scraper.fetchLatest(entity, page, include, preData);

    if (!Array.isArray(latestReleases)) {
      // scraper is unable to fetch the releases and returned a HTTP code or null
      logger.warn(`Scraper returned ${latestReleases} when fetching ${upcoming ? 'upcoming' : 'latest'} from '${entity.name}' (${entity.parent?.name})`);
      return accReleases;
    }

    // attach entity the release is assigned to when stored
    const latestReleasesWithEntity = latestReleases.map(release => ({
      ...release,
      entity: release.entity || entity, // allow override
    }));

    // with --redownload, known releases are kept rather than filtered out
    const uniqueReleases = argv.redownload
      ? latestReleasesWithEntity
      : await filterUniqueReleases(latestReleasesWithEntity, accReleases);

    const pageAccReleases = accReleases.concat(uniqueReleases);

    logger.verbose(`Scraped '${entity.name}' (${entity.parent?.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);

    if (needNextPage(uniqueReleases, pageAccReleases)) {
      return scrapePage(page + 1, pageAccReleases);
    }

    return pageAccReleases;
  };

  const rawReleases = await scrapePage(argv.page || 1, []);

  // mark upcoming scenes so they can be distinguished downstream
  const releases = upcoming
    ? rawReleases.map(rawRelease => ({ ...rawRelease, upcoming: true }))
    : rawReleases;

  if (argv.last) {
    return releases.slice(0, argv.last);
  }

  if (releases.every(release => release.date)) {
    // all releases are dated: apply the --after date cut-off
    return releases.filter(release => moment(release.date).isAfter(argv.after));
  }

  // some releases lack dates: fall back to the null-date count limit
  return releases.slice(0, argv.nullDateLimit);
}
2020-06-27 00:57:30 +00:00
// Scrapes the latest releases for an entity; returns [] when the scraper
// doesn't support latest updates or when scraping fails (failure is logged,
// not rethrown, so one channel can't abort a whole network run).
async function scrapeLatestReleases(scraper, entity, preData) {
  if (!scraper.fetchLatest) {
    return [];
  }

  try {
    return await scrapeReleases(scraper, entity, preData, false);
  } catch (error) {
    // console.trace debug leftover removed; the warning below carries the message
    logger.warn(`Failed to scrape latest updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return [];
}
2020-06-27 00:57:30 +00:00
// Scrapes upcoming releases for an entity; yields [] when the scraper has no
// upcoming support or when scraping fails (the failure is logged as a warning).
async function scrapeUpcomingReleases(scraper, entity, preData) {
  if (!scraper.fetchUpcoming) {
    return [];
  }

  try {
    return await scrapeReleases(scraper, entity, preData, true);
  } catch (error) {
    logger.warn(`Failed to scrape upcoming updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
    return [];
  }
}
2020-08-01 13:11:07 +00:00
// Scrapes movies for an entity via the scraper's fetchMovies hook; returns []
// when the scraper has no movie support or when scraping fails.
async function scrapeMovies(scraper, entity) {
  if (!scraper.fetchMovies) {
    return [];
  }

  try {
    return await scraper.fetchMovies(entity);
  } catch (error) {
    logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return [];
}
2020-06-27 00:57:30 +00:00
// Fetches latest, upcoming and movie updates for one channel in parallel,
// gated by the --latest / --upcoming / --movies flags. Returns the combined
// latest + upcoming releases.
async function scrapeChannelReleases(scraper, channelEntity, preData) {
  const [latestReleases, upcomingReleases, movies] = await Promise.all([
    argv.latest
      ? scrapeLatestReleases(scraper, channelEntity, preData)
      : [],
    argv.upcoming
      ? scrapeUpcomingReleases(scraper, channelEntity, preData)
      : [],
    argv.movies
      ? scrapeMovies(scraper, channelEntity, preData)
      : [],
  ]);

  // console.log debug leftover replaced with a proper log line
  // TODO: movies are scraped but not yet included in the returned releases
  logger.verbose(`Fetched ${movies.length} movies for '${channelEntity.name}' (${channelEntity.parent?.name})`);

  logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);

  return [...latestReleases, ...upcomingReleases];
}
2020-06-27 00:57:30 +00:00
// Resolves the release scraper for a channel and scrapes its releases.
// Returns [] when no scraper exists or scraping fails; on success each
// release is tagged with the channel entity it came from.
async function scrapeChannel(channelEntity, accNetworkReleases) {
  // try the channel's own slug first, then its parent and grandparent slugs
  const scraper = scrapers.releases[channelEntity.slug]
    || scrapers.releases[channelEntity.parent?.slug]
    || scrapers.releases[channelEntity.parent?.parent?.slug];

  if (!scraper) {
    logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);
    return [];
  }

  try {
    // optional scraper hook preparing shared state before fetching
    const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity);

    const releases = await scrapeChannelReleases(scraper, channelEntity, {
      accNetworkReleases,
      beforeFetchLatest,
    });

    return releases.map(release => ({ ...release, channelEntity }));
  } catch (error) {
    logger.error(`Failed to scrape releases from ${channelEntity.name} using ${scraper.slug}: ${error.message}`);
    return [];
  }
}
2020-06-25 00:26:25 +00:00
// Scrapes a network's channels strictly one after another, feeding the
// releases accumulated so far into each channel scrape (used by
// filterUniqueReleases to deduplicate across channels of the same network).
// Replaces the bluebird Promise.reduce construct with a plain sequential
// loop; the accumulation order and semantics are unchanged.
async function scrapeNetworkSequential(networkEntity) {
  let accNetworkReleases = [];

  for (const channelEntity of networkEntity.children) {
    // sequential on purpose: each channel must see earlier channels' releases
    // eslint-disable-next-line no-await-in-loop
    const channelReleases = await scrapeChannel(channelEntity, accNetworkReleases);

    accNetworkReleases = accNetworkReleases.concat(channelReleases);
  }

  return accNetworkReleases;
}
2020-06-25 00:26:25 +00:00
// Scrapes a network's channels concurrently, at most three at a time
// (bluebird Promise.map provides the concurrency limit).
// NOTE(review): the second argument passed to scrapeChannel is the network
// entity here, whereas scrapeNetworkSequential passes an array of accumulated
// releases — confirm this asymmetry is intended.
async function scrapeNetworkParallel(networkEntity) {
  return Promise.map(
    networkEntity.children,
    channelEntity => scrapeChannel(channelEntity, networkEntity),
    { concurrency: 3 },
  );
}
// Entry point: fetches updates for all included networks, scraping up to five
// networks at once. Networks whose parameters request sequential scraping use
// the sequential channel strategy, others scrape channels concurrently.
// Returns a flat array of all scraped releases.
async function fetchUpdates() {
  const includedNetworks = await fetchIncludedEntities();

  const scrapedNetworks = await Promise.map(
    includedNetworks,
    async networkEntity => (networkEntity.parameters?.sequential
      ? scrapeNetworkSequential(networkEntity)
      : scrapeNetworkParallel(networkEntity)),
    { concurrency: 5 },
  );

  // flatten network -> channel -> release nesting into one release list
  return scrapedNetworks.flat(2);
}

module.exports = fetchUpdates;