2020-03-14 01:56:28 +00:00
'use strict' ;
const Promise = require ( 'bluebird' ) ;
const moment = require ( 'moment' ) ;
const argv = require ( './argv' ) ;
const logger = require ( './logger' ) ( _ _filename ) ;
const knex = require ( './knex' ) ;
2020-08-23 00:43:10 +00:00
const { curateRelease } = require ( './releases' ) ;
2020-03-14 01:56:28 +00:00
const include = require ( './utils/argv-include' ) ( argv ) ;
const scrapers = require ( './scrapers/scrapers' ) ;
2020-08-13 22:32:59 +00:00
const { fetchIncludedEntities } = require ( './entities' ) ;
2020-03-14 01:56:28 +00:00
2020-08-22 02:22:56 +00:00
const emptyReleases = { uniqueReleases : [ ] , duplicateReleases : [ ] } ;
2020-09-21 03:11:24 +00:00
/**
 * Reducer that indexes releases by entity ID and entry ID, for O(1) duplicate
 * lookups while filtering scraped pages.
 *
 * @param {Object} acc - Accumulator mapping entity ID -> { [entryId]: true }.
 * @param {Object} release - Release carrying entityId (or a nested entity.id) and entryId.
 * @returns {Object} The same accumulator, mutated.
 */
function mapReleasesToSiteIdAndEntryId(acc, release) {
  const entityId = release.entityId || release.entity.id;
  // original read `release.entryId || release.entryId` — a redundant self-fallback
  const { entryId } = release;

  if (!acc[entityId]) acc[entityId] = {};

  acc[entityId][entryId] = true;

  return acc;
}
2020-03-28 03:37:04 +00:00
/**
 * Partitions freshly scraped releases into unique and already-known releases.
 *
 * Queries the database for (entity_id, entry_id) pairs stored by previous
 * runs, and also checks releases accumulated earlier in the current scrape so
 * pagination cannot loop forever when a site serves the same release on
 * consecutive pages.
 *
 * @param {Array} latestReleases - Releases scraped from the current page.
 * @param {Object} accReleases - Accumulator: { uniqueReleases, duplicateReleases }.
 * @returns {Promise<Object>} { uniqueReleases, localUniqueReleases, duplicateReleases };
 *   localUniqueReleases ignores database duplicates and is used for pagination decisions.
 */
async function filterUniqueReleases(latestReleases, accReleases) {
  const latestReleaseIdentifiers = latestReleases
    .map((release) => [release.entity.id, release.entryId]);

  // releases already stored in the database by earlier runs
  const duplicateReleaseEntries = await knex('releases')
    .select(knex.raw('releases.*, row_to_json(entities) as entity'))
    .leftJoin('entities', 'entities.id', 'releases.entity_id')
    .whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);

  const duplicateReleases = duplicateReleaseEntries.map((release) => curateRelease(release));

  // add entry IDs of accumulated releases to prevent an infinite scrape loop
  // when one page contains the same release as the previous
  const duplicateReleasesBySiteIdAndEntryId = duplicateReleases
    .concat(accReleases.uniqueReleases)
    .reduce(mapReleasesToSiteIdAndEntryId, {});

  // duplicates seen during this scrape only (database ignored); lets the
  // paginator distinguish "page repeats this run's data" from "page holds
  // releases we stored in an earlier run"
  const localDuplicateReleasesBySiteIdAndEntryId = accReleases.uniqueReleases
    .concat(accReleases.duplicateReleases)
    .reduce(mapReleasesToSiteIdAndEntryId, {});

  const uniqueReleases = latestReleases.filter((release) => !duplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);
  const localUniqueReleases = latestReleases.filter((release) => !localDuplicateReleasesBySiteIdAndEntryId[release.entity.id]?.[release.entryId]);

  return {
    uniqueReleases,
    localUniqueReleases,
    duplicateReleases,
  };
}
2020-09-21 03:11:24 +00:00
/**
 * Decides whether the scraper should fetch another page of releases.
 *
 * @param {Array} releasesOnPage - All releases scraped from the current page.
 * @param {Array} uniqueReleasesOnPage - Releases unique against the database and this run.
 * @param {Array} localUniqueReleasesOnPage - Releases unique within this run only.
 * @param {number} totalReleases - Releases fetched on previous pages.
 * @param {boolean} hasDates - Whether every release on the page carries a date.
 * @param {boolean} upcoming - Whether upcoming (rather than latest) releases are being scraped.
 * @returns {boolean} True when the next page should be fetched.
 */
function needNextPage(releasesOnPage, uniqueReleasesOnPage, localUniqueReleasesOnPage, totalReleases, hasDates, upcoming) {
  if (releasesOnPage.length === 0) {
    return false;
  }

  if (upcoming) {
    return uniqueReleasesOnPage.length > 0 && argv.paginateUpcoming;
  }

  // use locally-unique releases (unique within this run) so that database
  // duplicates alone do not stop pagination prematurely
  if (localUniqueReleasesOnPage.length > 0) {
    if (argv.last) {
      return totalReleases + releasesOnPage.length < argv.last;
    }

    if (!hasDates) {
      return totalReleases + releasesOnPage.length < argv.nullDateLimit;
    }

    if (argv.after) {
      // this will keep paginating infinitely on sites that keep serving the last
      // page when you exceed the last page number (e.g. HardX as of September 2020);
      // checking uniqueReleases > 0 could prevent that, but would stop pagination
      // prematurely if a full page of data was already scraped earlier
      const oldestReleaseOnPage = [...releasesOnPage] // copy: sort() mutates, and the caller owns this array
        .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
        .slice(-1)[0];

      if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
        // oldest release on page is newer than the specified date cut-off
        return true;
      }
    }
  }

  return false;
}
2020-08-22 02:22:56 +00:00
/**
 * Recursively scrapes release pages for an entity, accumulating unique and
 * duplicate releases until needNextPage says to stop.
 *
 * @param {Object} scraper - Scraper module providing fetchLatest/fetchUpcoming.
 * @param {Object} entity - Channel or network entity being scraped.
 * @param {Object} preData - Pre-fetched context passed through to the scraper.
 * @param {boolean} [upcoming=false] - Scrape upcoming instead of latest releases.
 * @param {number} [page=1] - Page number to fetch.
 * @param {Object} [acc=emptyReleases] - Accumulated { uniqueReleases, duplicateReleases }.
 * @param {number} [totalReleases=0] - Count of releases fetched on previous pages.
 * @returns {Promise<Object>} Accumulated { uniqueReleases, duplicateReleases }.
 */
async function scrapeReleases(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) {
  const releases = upcoming
    ? await scraper.fetchUpcoming(entity, page, include, preData)
    : await scraper.fetchLatest(entity, page, include, preData);

  if (!Array.isArray(releases)) {
    // scraper is unable to fetch the releases and returned a HTTP code or null
    logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
    return acc;
  }

  const releasesWithEntity = releases.map(release => ({
    ...release,
    entity: release.entity || entity, // allow override
  })); // attach entity the release is assigned to when stored

  const hasDates = releasesWithEntity.every(release => !!release.date);

  // cap this page's releases by --last, by the --after date cut-off, or by the
  // null-date limit when the scraper supplies no dates
  const limitedReleases = (argv.last && releasesWithEntity.slice(0, Math.max(argv.last - totalReleases, 0)))
    || (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
    || releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));

  // --force skips deduplication entirely
  // NOTE(review): the forced branch passes the raw `releases` (not limitedReleases)
  // as localUniqueReleases — confirm this asymmetry is intentional
  const { uniqueReleases, localUniqueReleases, duplicateReleases } = argv.force
    ? { uniqueReleases: limitedReleases, localUniqueReleases: releases, duplicateReleases: [] }
    : await filterUniqueReleases(limitedReleases, acc);

  const accReleases = {
    uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
    duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
  };

  if (needNextPage(releases, uniqueReleases, localUniqueReleases, totalReleases, hasDates, upcoming)) {
    return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
  }

  return accReleases;
}
2020-06-27 00:57:30 +00:00
// Scrapes the latest-release pages for an entity when any of the latest-release
// CLI modes (--latest, --last, --after) is enabled and the scraper supports it.
// Returns the empty result set when disabled or when the scrape fails.
async function scrapeLatestReleases(scraper, entity, preData) {
  const latestEnabled = argv.latest || argv.last || argv.after;

  if (!latestEnabled || !scraper.fetchLatest) {
    return emptyReleases;
  }

  const startPage = argv.page || 1;

  try {
    return await scrapeReleases(scraper, entity, preData, false, startPage);
  } catch (error) {
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape latest updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return emptyReleases;
}
2020-06-27 00:57:30 +00:00
// Scrapes upcoming (scheduled) release pages for an entity when --upcoming is
// enabled and the scraper supports it. Returns the empty result set when
// disabled or when the scrape fails.
async function scrapeUpcomingReleases(scraper, entity, preData) {
  const canScrapeUpcoming = Boolean(argv.upcoming && scraper.fetchUpcoming);

  if (!canScrapeUpcoming) {
    return emptyReleases;
  }

  try {
    return await scrapeReleases(scraper, entity, preData, true);
  } catch (error) {
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape upcoming updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return emptyReleases;
}
2020-08-01 13:11:07 +00:00
/**
 * Scrapes movie releases for an entity when --movies is enabled and the
 * scraper supports it.
 *
 * @param {Object} scraper - Scraper module, optionally providing fetchMovies.
 * @param {Object} entity - Channel or network entity being scraped.
 * @returns {Promise<Array>} The scraped movies, or an empty array when
 *   disabled, unsupported, or on failure.
 */
async function scrapeMovies(scraper, entity) {
  if (!argv.movies || !scraper.fetchMovies) {
    return [];
  }

  try {
    return await scraper.fetchMovies(entity);
  } catch (error) {
    // match the debug behavior of scrapeLatestReleases/scrapeUpcomingReleases
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
  }

  return [];
}
2020-06-27 00:57:30 +00:00
/**
 * Scrapes latest, upcoming and movie releases for a single channel in parallel
 * and merges the latest and upcoming results.
 *
 * @param {Object} scraper - Scraper module for this channel.
 * @param {Object} channelEntity - The channel being scraped.
 * @param {Object} preData - Pre-fetched context passed through to the scrapers.
 * @returns {Promise<Object>} Merged { uniqueReleases, duplicateReleases }.
 */
async function scrapeChannelReleases(scraper, channelEntity, preData) {
  // NOTE(review): scrapeMovies is started here but its result (the third
  // element) is never destructured or returned, and scrapeMovies declares only
  // two parameters — confirm whether movies are intentionally discarded (WIP?)
  const [latestReleases, upcomingReleases] = await Promise.all([
    scrapeLatestReleases(scraper, channelEntity, preData),
    scrapeUpcomingReleases(scraper, channelEntity, preData),
    scrapeMovies(scraper, channelEntity, preData),
  ]);

  logger.info(`Fetching ${latestReleases.uniqueReleases.length} latest and ${upcomingReleases.uniqueReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);

  return {
    uniqueReleases: [...latestReleases.uniqueReleases, ...upcomingReleases.uniqueReleases],
    duplicateReleases: [...latestReleases.duplicateReleases, ...upcomingReleases.duplicateReleases],
  };
}
2020-06-27 00:57:30 +00:00
// Resolves the appropriate release scraper for a channel — falling back to the
// parent or grandparent network's scraper when the channel has no dedicated
// one — and runs a full channel scrape. Returns the empty result set when no
// scraper exists or the scrape fails.
async function scrapeChannel(channelEntity, accNetworkReleases) {
  const candidateSlugs = [
    channelEntity.slug,
    channelEntity.parent?.slug,
    channelEntity.parent?.parent?.slug,
  ];

  const scraperSlug = candidateSlugs.find((slug) => scrapers.releases[slug]);
  const scraper = scrapers.releases[scraperSlug];

  if (!scraper) {
    logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);
    return emptyReleases;
  }

  try {
    // optional per-scraper hook, run once before fetching any pages
    const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity);
    const preData = { ...accNetworkReleases, beforeFetchLatest };

    return await scrapeChannelReleases(scraper, channelEntity, preData);
  } catch (error) {
    logger.error(`Failed to scrape releases from ${channelEntity.name} using ${scraper.slug}: ${error.message}`);

    return emptyReleases;
  }
}
2020-06-25 00:26:25 +00:00
// Scrapes a network's channels one at a time, threading the accumulated
// releases through each channel scrape so later channels can deduplicate
// against earlier results. Returns only the unique releases.
async function scrapeNetworkSequential(networkEntity) {
  const { uniqueReleases } = await Promise.reduce(
    networkEntity.children,
    async (chain, channelEntity) => {
      const networkAcc = await chain;
      const channelReleases = await scrapeChannel(channelEntity, networkAcc);

      return {
        uniqueReleases: networkAcc.uniqueReleases.concat(channelReleases.uniqueReleases),
        duplicateReleases: networkAcc.duplicateReleases.concat(channelReleases.duplicateReleases),
      };
    },
    Promise.resolve(emptyReleases),
  );

  return uniqueReleases;
}
2020-06-25 00:26:25 +00:00
// Scrapes up to three of a network's channels concurrently. Channels cannot
// deduplicate against each other's results in this mode, so each receives the
// network entity itself as the accumulator argument.
async function scrapeNetworkParallel(networkEntity) {
  const scrapeOne = async (channelEntity) => {
    const channelReleases = await scrapeChannel(channelEntity, networkEntity);

    return channelReleases.uniqueReleases;
  };

  return Promise.map(networkEntity.children, scrapeOne, { concurrency: 3 });
}
// Entry point: scrapes new releases for every network selected by the CLI
// arguments — sequentially or in parallel per network, depending on the
// network's `sequential` parameter — and returns a flat list of releases.
async function fetchUpdates() {
  const includedNetworks = await fetchIncludedEntities();

  const scrapeNetwork = async (networkEntity) => {
    if (networkEntity.parameters?.sequential) {
      return scrapeNetworkSequential(networkEntity);
    }

    return scrapeNetworkParallel(networkEntity);
  };

  const scrapedNetworks = await Promise.map(includedNetworks, scrapeNetwork, { concurrency: 5 });

  // parallel networks yield nested arrays of per-channel results
  return scrapedNetworks.flat(2);
}

module.exports = fetchUpdates;