'use strict';

const Promise = require('bluebird');
const moment = require('moment');

const argv = require('./argv');
// __filename was garbled into "_ _filename" in the mangled source; the logger
// factory expects the current module's filename
const logger = require('./logger')(__filename);
const knex = require('./knex');
const include = require('./utils/argv-include')(argv);
const scrapers = require('./scrapers/scrapers');
const { fetchChannelsFromArgv, fetchChannelsFromConfig } = require('./entities');
// Cut-off date for scraped releases, computed once from the --after argument.
// Accepts either an explicit date ("2020-01-01" or "01-01-2020") or a
// relative distance such as "1 month".
const afterDate = (() => {
  const looksLikeDate = /\d{2,4}-\d{2}-\d{2,4}/.test(argv.after);

  if (looksLikeDate) {
    // using date
    return moment.utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY']).toDate();
  }

  // using time distance (e.g. "1 month")
  const [amount, unit] = argv.after.split(' ');

  return moment.utc().subtract(amount, unit).toDate();
})();
/**
 * Filters out releases that are already stored in the database or already
 * present in the accumulated releases of earlier pages.
 *
 * @param {Array} latestReleases - Releases scraped from the current page.
 * @param {Array} accReleases - Releases accumulated from previous pages.
 * @returns {Promise<Array>} Releases not seen before.
 */
async function filterUniqueReleases(latestReleases, accReleases) {
  // identify each scraped release by its (entity ID, entry ID) pair
  const identifiers = latestReleases.map(release => [release.entity.id, release.entryId]);

  // releases with these identifiers that are already stored
  const duplicateReleases = await knex('releases')
    .whereIn(['entity_id', 'entry_id'], identifiers);

  // index entry IDs per entity; accumulated releases are included to prevent
  // an infinite scrape loop when one page contains the same release as the
  // previous
  const seenByEntity = {};

  for (const release of duplicateReleases.concat(accReleases)) {
    // database rows use snake_case, scraped releases use camelCase
    const entityId = release.entity_id || release.entity.id;
    const entryId = release.entry_id || release.entryId;

    if (!seenByEntity[entityId]) seenByEntity[entityId] = {};
    seenByEntity[entityId][entryId] = true;
  }

  return latestReleases.filter(release => !seenByEntity[release.entity.id]?.[release.entryId]);
}
/**
 * Decides whether the scraper should fetch another page of releases.
 *
 * @param {Array} uniqueReleases - Unique releases found on the current page.
 * @param {Array} pageAccReleases - All unique releases accumulated so far, including this page.
 * @returns {boolean} Whether another page should be fetched.
 */
function needNextPage(uniqueReleases, pageAccReleases) {
  if (uniqueReleases.length === 0) {
    // the whole page was already seen; going further would only repeat
    return false;
  }

  if (argv.last && pageAccReleases.length < argv.last) {
    // request for last N releases not yet satisfied
    return true;
  }

  if (uniqueReleases.every(release => !!release.date)) {
    // find the oldest release without sorting, so the caller's array is not
    // mutated (Array#sort sorts in place) and the scan stays O(n)
    const oldestReleaseOnPage = uniqueReleases.reduce(
      (oldest, release) => (release.date < oldest.date ? release : oldest),
    );

    if (moment(oldestReleaseOnPage.date).isAfter(afterDate)) {
      // oldest release on page is newer than the specified date cut-off
      return true;
    }
  }

  // dates missing, and limit for scenes without dates not yet reached
  return pageAccReleases.length <= argv.nullDateLimit;
}
/**
 * Scrapes latest or upcoming releases for an entity, page by page, until the
 * date cut-off, the --last limit or the null-date limit is satisfied.
 *
 * @param {Object} scraper - Scraper providing fetchLatest/fetchUpcoming.
 * @param {Object} entity - Channel or network entity being scraped.
 * @param {Object} preData - Data prepared before fetching (e.g. beforeFetchLatest result).
 * @param {boolean} upcoming - Whether to fetch upcoming instead of latest releases.
 * @returns {Promise<Array>} Scraped releases, trimmed to the requested window.
 */
async function scrapeReleases(scraper, entity, preData, upcoming = false) {
  let accReleases = [];
  let page = argv.page || 1;
  let keepPaging = true;

  while (keepPaging) {
    const latestReleases = upcoming
      ? await scraper.fetchUpcoming(entity, page, preData, include)
      : await scraper.fetchLatest(entity, page, preData, include);

    if (!Array.isArray(latestReleases)) {
      // scraper is unable to fetch the releases and returned a HTTP code or null
      logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
      break;
    }

    if (latestReleases.length === 0) {
      // scraper successfully requested releases, but found none
      break;
    }

    // attach entity the release is assigned to when stored
    const latestReleasesWithEntity = latestReleases.map(release => ({
      ...release,
      entity: release.entity || entity, // allow override
    }));

    const uniqueReleases = argv.redownload
      ? latestReleasesWithEntity
      : await filterUniqueReleases(latestReleasesWithEntity, accReleases);

    const pageAccReleases = accReleases.concat(uniqueReleases);

    logger.verbose(`Scraped '${entity.name}' (${entity.parent?.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);

    accReleases = pageAccReleases;
    keepPaging = needNextPage(uniqueReleases, pageAccReleases);
    page += 1;
  }

  const releases = upcoming
    ? accReleases.map(rawRelease => ({ ...rawRelease, upcoming: true }))
    : accReleases;

  if (argv.last) {
    return releases.slice(0, argv.last);
  }

  if (releases.every(release => release.date)) {
    return releases.filter(release => moment(release.date).isAfter(afterDate));
  }

  return releases.slice(0, argv.nullDateLimit);
}
/**
 * Scrapes the latest releases for an entity, if its scraper supports it.
 * Failures are logged and swallowed so one channel cannot abort the run.
 *
 * @param {Object} scraper - Scraper that may provide fetchLatest.
 * @param {Object} entity - Channel or network entity being scraped.
 * @param {Object} preData - Data prepared before fetching.
 * @returns {Promise<Array>} Latest releases, or an empty array on failure or when unsupported.
 */
async function scrapeLatestReleases(scraper, entity, preData) {
  if (!scraper.fetchLatest) {
    return [];
  }

  try {
    return await scrapeReleases(scraper, entity, preData, false);
  } catch (error) {
    // stray console.trace(error) debug leftover removed for consistency with
    // scrapeUpcomingReleases; the warning below already reports the failure
    logger.warn(`Failed to scrape latest updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return [];
}
/**
 * Scrapes upcoming releases for an entity, if its scraper supports it.
 * Failures are logged and swallowed so one channel cannot abort the run.
 *
 * @param {Object} scraper - Scraper that may provide fetchUpcoming.
 * @param {Object} entity - Channel or network entity being scraped.
 * @param {Object} preData - Data prepared before fetching.
 * @returns {Promise<Array>} Upcoming releases, or an empty array on failure or when unsupported.
 */
async function scrapeUpcomingReleases(scraper, entity, preData) {
  if (!scraper.fetchUpcoming) {
    return [];
  }

  try {
    const upcomingReleases = await scrapeReleases(scraper, entity, preData, true);

    return upcomingReleases;
  } catch (error) {
    logger.warn(`Failed to scrape upcoming updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);

    return [];
  }
}
/**
 * Scrapes the latest and/or upcoming releases of a channel in parallel,
 * depending on the --latest and --upcoming arguments.
 *
 * @param {Object} scraper - Scraper for the channel.
 * @param {Object} channelEntity - Channel entity being scraped.
 * @param {Object} preData - Data prepared before fetching.
 * @returns {Promise<Array>} Latest releases followed by upcoming releases.
 */
async function scrapeChannelReleases(scraper, channelEntity, preData) {
  const latestPromise = argv.latest
    ? scrapeLatestReleases(scraper, channelEntity, preData)
    : [];

  const upcomingPromise = argv.upcoming
    ? scrapeUpcomingReleases(scraper, channelEntity, preData)
    : [];

  const [latestReleases, upcomingReleases] = await Promise.all([latestPromise, upcomingPromise]);

  logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);

  return [...latestReleases, ...upcomingReleases];
}
/**
 * Scrapes a single channel, resolving the scraper from the channel itself,
 * its parent or its grandparent.
 *
 * @param {Object} channelEntity - Channel entity to scrape.
 * @param {Array|Object} accNetworkReleases - Releases accumulated for the network so far.
 * @returns {Promise<Array>} Releases tagged with the channel they came from.
 */
async function scrapeChannel(channelEntity, accNetworkReleases) {
  // prefer the most specific scraper available
  const scraper = scrapers.releases[channelEntity.slug]
    || scrapers.releases[channelEntity.parent?.slug]
    || scrapers.releases[channelEntity.parent?.parent?.slug];

  if (!scraper) {
    logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);

    return [];
  }

  try {
    // allow the scraper to prepare shared state before fetching, if it wants to
    const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity);

    const preData = {
      accNetworkReleases,
      beforeFetchLatest,
    };

    const channelEntityReleases = await scrapeChannelReleases(scraper, channelEntity, preData);

    // tag each release with the channel it was scraped from
    return channelEntityReleases.map(release => ({ ...release, channelEntity }));
  } catch (error) {
    logger.error(`Failed to scrape releases from ${channelEntity.name} using ${scraper.slug}: ${error.message}`);

    return [];
  }
}
/**
 * Scrapes a network's channels one at a time, feeding releases accumulated
 * from earlier channels into each subsequent scrape.
 *
 * @param {Object} networkEntity - Network entity with a children array of channels.
 * @returns {Promise<Array>} All releases scraped across the network's channels.
 */
async function scrapeNetworkSequential(networkEntity) {
  let accNetworkReleases = [];

  for (const channelEntity of networkEntity.children) {
    // awaiting inside the loop is deliberate: channels must run sequentially
    const channelReleases = await scrapeChannel(channelEntity, accNetworkReleases);

    accNetworkReleases = accNetworkReleases.concat(channelReleases);
  }

  return accNetworkReleases;
}
/**
 * Scrapes a network's channels concurrently, at most three at a time
 * (bluebird Promise.map concurrency).
 *
 * @param {Object} networkEntity - Network entity with a children array of channels.
 * @returns {Promise<Array>} Per-channel arrays of scraped releases.
 */
async function scrapeNetworkParallel(networkEntity) {
  // NOTE(review): the network entity is passed where scrapeChannel expects
  // accumulated releases (unlike the sequential variant) — presumably
  // intentional since parallel scrapes cannot accumulate; confirm with callers
  const scrapeOne = channelEntity => scrapeChannel(channelEntity, networkEntity);

  return Promise.map(networkEntity.children, scrapeOne, { concurrency: 3 });
}
/**
 * Entry point: fetches latest and upcoming updates for all requested or
 * configured networks, scraping up to five networks concurrently.
 *
 * @returns {Promise<Array>} Flat list of all scraped releases.
 */
async function fetchUpdates() {
  const useArgvSelection = argv.channels || argv.networks;

  const includedNetworks = useArgvSelection
    ? await fetchChannelsFromArgv()
    : await fetchChannelsFromConfig();

  // networks flagged as sequential scrape their channels one by one
  const scrapeNetwork = async networkEntity => (networkEntity.parameters?.sequential
    ? scrapeNetworkSequential(networkEntity)
    : scrapeNetworkParallel(networkEntity));

  const scrapedNetworks = await Promise.map(includedNetworks, scrapeNetwork, { concurrency: 5 });

  // flatten network -> channel -> release nesting into a single list
  const releases = scrapedNetworks.flat(2);

  return releases;
}

module.exports = fetchUpdates;