'use strict';

const Promise = require('bluebird');
const moment = require('moment');

const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { curateRelease } = require('./releases');
const include = require('./utils/argv-include')(argv);
const scrapers = require('./scrapers/scrapers');
const { fetchIncludedEntities } = require('./entities');

const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
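
// Separates scraped releases into releases that are new and releases that
// already exist in the database, matched on the (entity_id, entry_id) pair.
// Accumulated releases from earlier pages count as duplicates as well, to
// prevent an infinite scrape loop when a site repeats releases across pages.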
async function filterUniqueReleases(latestReleases, accReleases) {
  const latestReleaseIdentifiers = latestReleases
    .map(release => [release.entity.id, release.entryId]);

  const duplicateReleaseEntries = await knex('releases')
    .select(knex.raw('releases.*, row_to_json(entities) as entity'))
    .leftJoin('entities', 'entities.id', 'releases.entity_id')
    .whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);

  const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));

  // add entry IDs of accumulated releases to prevent an infinite scrape loop
  // when one page contains the same release as the previous
  const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
    .concat(accReleases)
    .reduce((acc, release) => {
      const entityId = release.entityId || release.entity.id;
      const entryId = release.entryId;

      if (!acc[entityId]) acc[entityId] = {};
      acc[entityId][entryId] = true;

      return acc;
    }, {});

  const uniqueReleases = latestReleases.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);

  return { uniqueReleases, duplicateReleases };
}
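
// Decides whether the next page should be scraped. Upcoming scrapes paginate
// only when argv.paginateUpcoming is set; latest scrapes paginate until the
// argv.last, argv.nullDateLimit or argv.after limit is reached, or until a
// page no longer yields unique releases.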
function needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcoming) {
  if (upcoming && uniqueReleases.length > 0) {
    return argv.paginateUpcoming;
  }

  if (argv.last) {
    return totalReleases + releases.length < argv.last;
  }

  if (!hasDates) {
    return totalReleases + releases.length < argv.nullDateLimit;
  }

  if (uniqueReleases.length === 0) {
    return false;
  }

  const oldestReleaseOnPage = releases
    .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
    .slice(-1)[0];

  if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
    // oldest release on page is newer than the specified date cut-off
    return true;
  }

  return false;
}
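
// Fetches a single page of latest or upcoming releases from an entity and
// recurses to the next page while needNextPage() allows it, accumulating
// unique and duplicate releases. When argv.force is set, deduplication
// against the database is skipped entirely.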
async function scrapeReleases(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) {
  const releases = upcoming
    ? await scraper.fetchUpcoming(entity, page, include, preData)
    : await scraper.fetchLatest(entity, page, include, preData);

  if (!Array.isArray(releases)) {
    // scraper was unable to fetch the releases and returned an HTTP code or null
    logger.warn(`Scraper returned ${releases} when fetching ${upcoming ? 'upcoming' : 'latest'} releases from '${entity.name}' (${entity.parent?.name})`);

    return acc;
  }

  const releasesWithEntity = releases.map(release => ({
    ...release,
    entity: release.entity || entity, // allow override
  })); // attach entity the release is assigned to when stored

  const hasDates = releasesWithEntity.every(release => !!release.date);

  const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0)))
    || (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
    || releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));

  const { uniqueReleases, duplicateReleases } = argv.force
    ? { uniqueReleases: limitedReleases, duplicateReleases: [] }
    : await filterUniqueReleases(limitedReleases, acc.uniqueReleases);

  const accReleases = {
    uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
    duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
  };

  if (needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcoming)) {
    return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
  }

  return accReleases;
}
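
// Scrapes an entity's latest releases, starting from argv.page when provided;
// returns empty results when latest scraping is disabled or unsupported.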
async function scrapeLatestReleases(scraper, entity, preData) {
  if (!argv.latest || !scraper.fetchLatest) {
    return emptyReleases;
  }

  try {
    return await scrapeReleases(scraper, entity, preData, false, argv.page || 1);
  } catch (error) {
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape latest updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return emptyReleases;
}
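
// Scrapes an entity's upcoming releases; returns empty results when upcoming
// scraping is disabled or the scraper does not support it.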
async function scrapeUpcomingReleases(scraper, entity, preData) {
  if (!argv.upcoming || !scraper.fetchUpcoming) {
    return emptyReleases;
  }

  try {
    return await scrapeReleases(scraper, entity, preData, true);
  } catch (error) {
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape upcoming updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return emptyReleases;
}
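
// Scrapes an entity's movies in a single pass; unlike scene releases, movies
// are not paginated or deduplicated here.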
async function scrapeMovies(scraper, entity) {
  if (!argv.movies || !scraper.fetchMovies) {
    return [];
  }

  try {
    return await scraper.fetchMovies(entity);
  } catch (error) {
    logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return [];
}
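
// Scrapes latest, upcoming and movie releases for a single channel in
// parallel, then merges the latest and upcoming results.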
async function scrapeChannelReleases(scraper, channelEntity, preData) {
  const [latestReleases, upcomingReleases] = await Promise.all([
    scrapeLatestReleases(scraper, channelEntity, preData),
    scrapeUpcomingReleases(scraper, channelEntity, preData),
    scrapeMovies(scraper, channelEntity), // result not yet included in the returned releases
  ]);

  logger.info(`Fetching ${latestReleases.uniqueReleases.length} latest and ${upcomingReleases.uniqueReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);

  return {
    uniqueReleases: [...latestReleases.uniqueReleases, ...upcomingReleases.uniqueReleases],
    duplicateReleases: [...latestReleases.duplicateReleases, ...upcomingReleases.duplicateReleases],
  };
}
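
// Resolves the scraper for a channel, falling back to its parent or
// grandparent entity's scraper, and runs scraper.beforeFetchLatest() first
// when the scraper defines it, passing the result along as preData.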
async function scrapeChannel(channelEntity, accNetworkReleases) {
  const scraper = scrapers.releases[channelEntity.slug]
    || scrapers.releases[channelEntity.parent?.slug]
    || scrapers.releases[channelEntity.parent?.parent?.slug];

  if (!scraper) {
    logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);

    return emptyReleases;
  }

  try {
    const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity);

    return await scrapeChannelReleases(scraper, channelEntity, {
      ...accNetworkReleases,
      beforeFetchLatest,
    });
  } catch (error) {
    logger.error(`Failed to scrape releases from ${channelEntity.name} using ${scraper.slug}: ${error.message}`);

    return emptyReleases;
  }
}
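
// Scrapes a network's channels one at a time, passing the releases
// accumulated so far into each subsequent channel scrape.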
async function scrapeNetworkSequential(networkEntity) {
  const releases = await Promise.reduce(
    networkEntity.children,
    async (chain, channelEntity) => {
      const accNetworkReleases = await chain;
      const { uniqueReleases, duplicateReleases } = await scrapeChannel(channelEntity, accNetworkReleases);

      return {
        uniqueReleases: accNetworkReleases.uniqueReleases.concat(uniqueReleases),
        duplicateReleases: accNetworkReleases.duplicateReleases.concat(duplicateReleases),
      };
    },
    Promise.resolve(emptyReleases),
  );

  return releases.uniqueReleases;
}
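
// Scrapes a network's channels concurrently, at most three at a time.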
async function scrapeNetworkParallel(networkEntity) {
  return Promise.map(
    networkEntity.children,
    async (channelEntity) => {
      const { uniqueReleases } = await scrapeChannel(channelEntity, networkEntity);

      return uniqueReleases;
    },
    { concurrency: 3 },
  );
}
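
// Entry point: scrapes every included network, sequentially when the
// network's parameters request it and in parallel otherwise, and returns
// the flattened list of unique releases.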
async function fetchUpdates() {
  const includedNetworks = await fetchIncludedEntities();

  const scrapedNetworks = await Promise.map(
    includedNetworks,
    async networkEntity => (networkEntity.parameters?.sequential
      ? scrapeNetworkSequential(networkEntity)
      : scrapeNetworkParallel(networkEntity)),
    { concurrency: 5 },
  );

  const releases = scrapedNetworks.flat(2);

  return releases;
}

module.exports = fetchUpdates;