2020-03-14 01:56:28 +00:00
'use strict' ;
2020-10-28 00:36:13 +00:00
const config = require ( 'config' ) ;
2020-03-14 01:56:28 +00:00
const Promise = require ( 'bluebird' ) ;
const moment = require ( 'moment' ) ;
const argv = require ( './argv' ) ;
const logger = require ( './logger' ) ( _ _filename ) ;
const knex = require ( './knex' ) ;
2020-08-23 00:43:10 +00:00
const { curateRelease } = require ( './releases' ) ;
2022-01-10 01:17:17 +00:00
const chunk = require ( './utils/chunk' ) ;
2020-03-14 01:56:28 +00:00
const include = require ( './utils/argv-include' ) ( argv ) ;
2021-02-02 00:31:12 +00:00
const { resolveScraper , resolveLayoutScraper } = require ( './scrapers/resolve' ) ;
2020-08-13 22:32:59 +00:00
const { fetchIncludedEntities } = require ( './entities' ) ;
2021-02-10 02:23:48 +00:00
const getRecursiveParameters = require ( './utils/get-recursive-parameters' ) ;
2020-03-14 01:56:28 +00:00
2020-08-22 02:22:56 +00:00
// Result returned by the scrape functions below when there is nothing to scrape or scraping failed.
// The same object instance is returned every time, so callers must not mutate it.
const emptyReleases = {
  uniqueReleases: [],
  duplicateReleases: [],
};
2020-10-14 01:17:03 +00:00
/**
 * Reducer that indexes releases in a two-level lookup: entity ID first, entry ID second.
 * A later release with the same entity ID and entry ID overwrites the earlier one.
 *
 * @param {Object} acc - Lookup built so far; mutated in place and returned.
 * @param {Object} release - Release carrying `entryId` and either `entityId` or `entity.id`.
 * @returns {Object} The same `acc` object, now containing `release`.
 */
function mapReleasesToEntityIdAndEntryId(acc, release) {
  const entityId = release.entityId || release.entity.id;
  const { entryId } = release;

  if (!acc[entityId]) {
    acc[entityId] = {};
  }

  acc[entityId][entryId] = release;

  return acc;
}
2020-10-07 01:40:19 +00:00
/**
 * Splits `releases` into those not yet present in `accReleases` and those that are,
 * matching on entity ID plus entry ID. The input order is preserved in both lists.
 *
 * @param {Object[]} releases - Releases to classify.
 * @param {Object[]} accReleases - Releases accumulated so far.
 * @returns {{ localUniqueReleases: Object[], localDuplicateReleases: Object[] }}
 */
function filterLocalUniqueReleases(releases, accReleases) {
  const accReleasesByEntityIdAndEntryId = accReleases.reduce(mapReleasesToEntityIdAndEntryId, {});

  const localUniqueReleases = [];
  const localDuplicateReleases = [];

  // single pass: every release lands in exactly one of the two lists
  for (const release of releases) {
    // derive the entity key the same way the index does, so both sides always agree
    const entityId = release.entityId || release.entity.id;

    if (accReleasesByEntityIdAndEntryId[entityId]?.[release.entryId]) {
      localDuplicateReleases.push(release);
    } else {
      localUniqueReleases.push(release);
    }
  }

  return {
    localUniqueReleases,
    localDuplicateReleases,
  };
}
/**
 * Separates scraped releases into ones that still need to be processed and ones already
 * stored in the `releases` table.
 *
 * Stored releases are looked up by (entity_id, entry_id) in chunks. A stored release only
 * counts as a duplicate when one of the conditions in the query below holds; otherwise it is
 * left out of the duplicates so it gets scraped again.
 *
 * @param {Object[]} releases - Scraped releases, each with `entity.id` and `entryId`.
 * @returns {Promise<{ uniqueReleases: Object[], duplicateReleases: Object[] }>}
 *   `uniqueReleases` are input releases (deduplicated among themselves) without a stored
 *   duplicate; `duplicateReleases` are the stored rows passed through `curateRelease`.
 */
async function filterUniqueReleases(releases) {
  if (releases.length === 0) {
    // nothing to look up, skip the database entirely
    return { uniqueReleases: [], duplicateReleases: [] };
  }

  const releaseIdentifierChunks = chunk(releases.map((release) => [release.entity.id, release.entryId.toString()]));

  const duplicateReleaseEntryChunks = await Promise.map(releaseIdentifierChunks, async (releaseIdentifiers) => {
    const duplicateReleaseEntriesQuery = knex('releases')
      .select(knex.raw('releases.*, row_to_json(entities) as entity'))
      .leftJoin('entities', 'entities.id', 'releases.entity_id')
      .whereIn(['entity_id', 'entry_id'], releaseIdentifiers)
      .where((builder) => {
        // check if previously upcoming scenes can be excluded from duplicates to be rescraped for release day updates
        builder
          .where('deep', true) // scene is already deep scraped
          .orWhereNull('date')
          .orWhereNotIn('date_precision', ['day', 'minute']) // don't worry about scenes without (accurate) dates for now
          .orWhere(knex.raw('date > NOW() - INTERVAL \'12 hours\'')) // scene is still upcoming, with a rough offset to wait for the end of the day west of UTC
          .orWhere(knex.raw('updated_at - date > INTERVAL \'1 day\'')); // scene was updated after the release date, no updates expected
      });

    return duplicateReleaseEntriesQuery;
  }, { concurrency: 10 });

  const duplicateReleases = duplicateReleaseEntryChunks
    .flat()
    .map((release) => curateRelease(release));

  const duplicateReleasesByEntityIdAndEntryId = duplicateReleases.reduce(mapReleasesToEntityIdAndEntryId, {});

  // indexing the scraped releases collapses repeats of the same entity and entry ID within this batch
  const internalUniqueReleasesByEntityIdAndEntryId = releases.reduce(mapReleasesToEntityIdAndEntryId, {});
  const internalUniqueReleases = Object.values(internalUniqueReleasesByEntityIdAndEntryId)
    .flatMap((releasesByEntryId) => Object.values(releasesByEntryId));

  const uniqueReleases = internalUniqueReleases
    .filter((release) => !duplicateReleasesByEntityIdAndEntryId[release.entity.id]?.[release.entryId]);

  return { uniqueReleases, duplicateReleases };
}
2021-10-28 00:10:30 +00:00
/**
 * Decides whether another page should be fetched after the current one.
 *
 * @param {Object[]} pageReleases - Releases found on the current page.
 * @param {Object[]} accReleases - Releases accumulated from earlier pages.
 * @param {boolean} isUpcoming - Whether upcoming (rather than latest) releases are being paged.
 * @param {Object[]|null} [unextracted=[]] - Extra page items reported by the scraper that are not
 *   part of `pageReleases`; they count towards "page is not empty" and towards the date checks.
 * @returns {boolean} True when the next page should be fetched.
 */
function needNextPage(pageReleases, accReleases, isUpcoming, unextracted = []) {
  // the parameter default only covers undefined; treat an explicit null the same way
  const pendingReleases = unextracted ?? [];
  const { localUniqueReleases: uniquePageReleases } = filterLocalUniqueReleases(pageReleases, accReleases);

  if (uniquePageReleases.length + pendingReleases.length === 0) {
    // page is empty, or only contains scenes from previous page
    return false;
  }

  if (isUpcoming) {
    return uniquePageReleases.length > 0 && Boolean(argv.paginateUpcoming);
  }

  const scrapedCount = accReleases.length + pageReleases.length;

  if (argv.last) {
    return scrapedCount < argv.last;
  }

  const pageAndPendingReleases = pageReleases.concat(pendingReleases);

  if (!pageAndPendingReleases.every((release) => !!release.date)) { // some scenes don't have dates
    return scrapedCount < argv.missingDateLimit;
  }

  if (argv.after) {
    // sort a copy newest-first; the last element is the oldest release on the page
    const releasesNewestFirst = [...pageAndPendingReleases].sort((releaseA, releaseB) => releaseB.date - releaseA.date);
    const oldestReleaseOnPage = releasesNewestFirst[releasesNewestFirst.length - 1];

    if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
      // oldest release on page is newer than the specified date cut-off
      return true;
    }
  }

  return false;
}
2020-10-12 02:08:22 +00:00
/**
 * Pages through a scraper's latest or upcoming releases, limits the result according to the
 * command line arguments and separates new releases from already stored ones.
 *
 * @param {Object} scraper - Scraper exposing `slug` and `fetchLatest` / `fetchUpcoming`.
 * @param {Object} entity - Entity being scraped; attached to every release lacking its own `entity`.
 * @param {Object} preData - Extra data merged into the scraper options and passed as last argument.
 * @param {boolean} isUpcoming - Use `fetchUpcoming` instead of `fetchLatest`.
 * @returns {Promise<{ uniqueReleases: Object[], duplicateReleases: Object[] }>}
 *   With `argv.force` every limited release is returned as unique and no duplicate lookup is made.
 */
async function scrapeReleases(scraper, entity, preData, isUpcoming) {
  // Fetches `page` and recurses into the next page for as long as needNextPage() says so.
  async function scrapeReleasesPage(page, accReleases, pageContext) {
    const options = {
      ...config.options[scraper.slug],
      ...include,
      ...preData,
      ...pageContext,
      parameters: getRecursiveParameters(entity),
    };

    const rawPageReleases = isUpcoming
      ? await scraper.fetchUpcoming(entity, page, options, preData)
      : await scraper.fetchLatest(entity, page, options, preData);

    // scrapers return either an array or { scenes, unextracted, context }; a null result must
    // reach the array check below instead of throwing on the property access
    const pageReleases = rawPageReleases?.scenes || rawPageReleases;

    if (!Array.isArray(pageReleases)) {
      // scraper is unable to fetch the releases and returned a HTTP code or null
      logger.warn(`Scraper returned ${pageReleases} when fetching ${isUpcoming ? 'upcoming' : 'latest'} from '${entity.name}' (${entity.parent?.name})`);

      return accReleases;
    }

    const validPageReleases = pageReleases.filter((release) => release?.entryId); // filter out empty and unidentified releases
    const pageReleasesWithEntity = validPageReleases.map((release) => ({ ...release, entity: release.entity || entity }));

    if (pageReleases.length > validPageReleases.length) {
      logger.warn(`Found ${pageReleases.length - validPageReleases.length} empty or unidentified releases on page ${page} for '${entity.name}'`);
    }

    const nextAccReleases = accReleases.concat(pageReleasesWithEntity);

    if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming, rawPageReleases.unextracted)) {
      // the context returned with this page is merged into the options of the next page
      return scrapeReleasesPage(page + 1, nextAccReleases, rawPageReleases.context);
    }

    return nextAccReleases;
  }

  const releases = await scrapeReleasesPage(argv.page || 1, []);

  let limitedReleases;

  if (argv.last) {
    limitedReleases = releases.slice(0, Math.max(argv.last, 0));
  } else if (releases.every((release) => !!release.date)) {
    limitedReleases = releases.filter((release) => moment(release.date).isAfter(argv.after));
  } else {
    // some releases lack a date, so the date cut-off can't be applied reliably
    limitedReleases = releases.slice(0, Math.max(argv.missingDateLimit, 0));
  }

  if (argv.force) {
    return { uniqueReleases: limitedReleases, duplicateReleases: [] };
  }

  const { uniqueReleases, duplicateReleases } = await filterUniqueReleases(limitedReleases);

  return { uniqueReleases, duplicateReleases };
}
2020-06-27 00:57:30 +00:00
/**
 * Scrapes the latest releases of an entity. Failures are logged as a warning instead of
 * being propagated.
 *
 * @param {Object} scraper - Scraper; skipped when it has no `fetchLatest`.
 * @param {Object} entity - Entity to scrape; `slug` and `parent.slug` are used for logging.
 * @param {Object} preData - Passed through to `scrapeReleases`.
 * @returns {Promise<{ uniqueReleases: Object[], duplicateReleases: Object[] }>}
 *   The shared `emptyReleases` object when `argv.latest` is off, the scraper can't fetch
 *   latest releases, or scraping threw.
 */
async function scrapeLatestReleases(scraper, entity, preData) {
  if (!argv.latest || !scraper.fetchLatest) {
    return emptyReleases;
  }

  try {
    return await scrapeReleases(scraper, entity, preData, false);
  } catch (error) {
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape latest updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);

    return emptyReleases;
  }
}
2020-06-27 00:57:30 +00:00
/**
 * Scrapes the upcoming releases of an entity. Failures are logged as a warning instead of
 * being propagated.
 *
 * @param {Object} scraper - Scraper; skipped when it has no `fetchUpcoming`.
 * @param {Object} entity - Entity to scrape; `slug` and `parent.slug` are used for logging.
 * @param {Object} preData - Passed through to `scrapeReleases`.
 * @returns {Promise<{ uniqueReleases: Object[], duplicateReleases: Object[] }>}
 *   The shared `emptyReleases` object when `argv.upcoming` is off, the scraper can't fetch
 *   upcoming releases, or scraping threw.
 */
async function scrapeUpcomingReleases(scraper, entity, preData) {
  if (!argv.upcoming || !scraper.fetchUpcoming) {
    return emptyReleases;
  }

  try {
    return await scrapeReleases(scraper, entity, preData, true);
  } catch (error) {
    if (argv.debug) {
      console.trace(error);
    }

    logger.warn(`Failed to scrape upcoming updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);

    return emptyReleases;
  }
}
2020-08-01 13:11:07 +00:00
/**
 * Fetches an entity's movies through the scraper. Failures are logged as a warning instead
 * of being propagated.
 *
 * @param {Object} scraper - Scraper; skipped when it has no `fetchMovies`.
 * @param {Object} entity - Entity to scrape; `slug` and `parent.slug` are used for logging.
 * @returns {Promise<*>} Whatever `scraper.fetchMovies(entity)` resolves to, or an empty array
 *   when `argv.movies` is off, the scraper can't fetch movies, or fetching threw.
 */
async function scrapeMovies(scraper, entity) {
  if (!argv.movies || !scraper.fetchMovies) {
    return [];
  }

  try {
    return await scraper.fetchMovies(entity);
  } catch (error) {
    logger.warn(`Failed to scrape movies for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);

    return [];
  }
}
2020-06-27 00:57:30 +00:00
/**
 * Scrapes latest releases, upcoming releases and movies of a channel concurrently and merges
 * the latest and upcoming results.
 *
 * @param {Object} scraper - Scraper handed to the individual scrape functions.
 * @param {Object} channelEntity - Channel to scrape; `name` and `parent.name` are used for logging.
 * @param {Object} preData - Passed through to the individual scrape functions.
 * @returns {Promise<{ uniqueReleases: Object[], duplicateReleases: Object[] }>}
 *   Latest releases followed by upcoming releases, for both lists.
 */
async function scrapeChannelReleases(scraper, channelEntity, preData) {
  // NOTE(review): the movies result is awaited but not part of the returned value - confirm this is intended
  const [latestReleases, upcomingReleases] = await Promise.all([
    scrapeLatestReleases(scraper, channelEntity, preData),
    scrapeUpcomingReleases(scraper, channelEntity, preData),
    scrapeMovies(scraper, channelEntity, preData),
  ]);

  const latestCount = argv.latest ? latestReleases.uniqueReleases.length : 'no';
  const upcomingCount = argv.upcoming ? upcomingReleases.uniqueReleases.length : 'no';

  logger.info(`Fetching ${latestCount} latest and ${upcomingCount} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);

  return {
    uniqueReleases: [...latestReleases.uniqueReleases, ...upcomingReleases.uniqueReleases],
    duplicateReleases: [...latestReleases.duplicateReleases, ...upcomingReleases.duplicateReleases],
  };
}
2021-10-27 15:19:23 +00:00
/**
 * Resolves the scraper for a channel, runs its optional `beforeFetchLatest` hook and scrapes
 * the channel's releases. Failures are logged as an error instead of being propagated.
 *
 * @param {Object} channelEntity - Channel to scrape; `name` and `parent.name` are used for logging.
 * @param {Object|null} accNetworkReleases - Releases gathered so far for the network; its
 *   properties are spread into the data handed to the scraper. May be null.
 * @param {*} [beforeNetwork] - Result of the network's `beforeNetwork` hook, if any.
 * @returns {Promise<{ uniqueReleases: Object[], duplicateReleases: Object[] }>}
 *   The shared `emptyReleases` object when no layout scraper is found or scraping threw.
 */
async function scrapeChannel(channelEntity, accNetworkReleases, beforeNetwork) {
  const scraper = resolveScraper(channelEntity);
  const layoutScraper = resolveLayoutScraper(channelEntity, scraper);

  if (!layoutScraper) {
    logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);

    return emptyReleases;
  }

  try {
    // the hook is looked up on the base scraper, while the releases are fetched through the layout scraper
    const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity, { beforeNetwork });

    return await scrapeChannelReleases(layoutScraper, channelEntity, {
      ...accNetworkReleases,
      beforeFetchLatest,
      beforeNetwork,
    });
  } catch (error) {
    logger.error(`Failed to scrape releases from ${channelEntity.name} using ${scraper.slug}: ${error.message}`);

    return emptyReleases;
  }
}
2020-06-25 00:26:25 +00:00
/**
 * Scrapes the included channels of a network one after another. Each channel receives the
 * releases accumulated from the channels before it.
 *
 * NOTE(review): unlike the parallel variant, no `beforeNetwork` result is passed to the
 * channels here - confirm this is intended.
 *
 * @param {Object} networkEntity - Network whose `includedChildren` are scraped in order.
 * @returns {Promise<Object[]>} Unique releases of all channels, in channel order.
 */
async function scrapeNetworkSequential(networkEntity) {
  // start from a fresh accumulator so the first channel is not handed a shared sentinel object
  let accNetworkReleases = { uniqueReleases: [], duplicateReleases: [] };

  for (const channelEntity of networkEntity.includedChildren) {
    // channels must be scraped strictly in sequence, each one seeing the results so far
    const { uniqueReleases, duplicateReleases } = await scrapeChannel(channelEntity, accNetworkReleases);

    accNetworkReleases = {
      uniqueReleases: accNetworkReleases.uniqueReleases.concat(uniqueReleases),
      duplicateReleases: accNetworkReleases.duplicateReleases.concat(duplicateReleases),
    };
  }

  return accNetworkReleases.uniqueReleases;
}
2022-02-11 21:14:44 +00:00
/**
 * Runs the optional `beforeNetwork` hook of a network's scraper.
 *
 * @param {Object} networkEntity - Network whose `scraper.beforeNetwork` is called with the
 *   network and its recursive parameters.
 * @returns {Promise<*>} The hook's result; undefined when there is no scraper or no hook;
 *   null when the hook failed and the scraper sets `requireBeforeNetwork` to false.
 * @throws Rethrows the original error unless `requireBeforeNetwork === false`.
 */
async function getBeforeNetwork(networkEntity) {
  try {
    const parameters = getRecursiveParameters(networkEntity);

    return await networkEntity.scraper?.beforeNetwork?.(networkEntity, parameters);
  } catch (error) {
    if (networkEntity.scraper?.requireBeforeNetwork === false) {
      // the scraper declared the hook optional: continue without it, but leave a trace of the failure
      logger.warn(`beforeNetwork hook failed for '${networkEntity.name}', continuing without it: ${error.message}`);

      return null;
    }

    throw error;
  }
}
2020-06-25 00:26:25 +00:00
/**
 * Scrapes the included channels of a network concurrently, at most three at a time, after
 * running the network's `beforeNetwork` hook once.
 *
 * @param {Object} networkEntity - Network whose `includedChildren` are scraped.
 * @returns {Promise<Object[][]>} One list of unique releases per channel.
 */
async function scrapeNetworkParallel(networkEntity) {
  const beforeNetwork = await getBeforeNetwork(networkEntity);

  // channels don't see each other's results here, hence no accumulated releases are passed
  const scrapeUniqueChannelReleases = async (channelEntity) => {
    const channelReleases = await scrapeChannel(channelEntity, null, beforeNetwork);

    return channelReleases.uniqueReleases;
  };

  return Promise.map(networkEntity.includedChildren, scrapeUniqueChannelReleases, { concurrency: 3 });
}
/**
 * Scrapes updates for all included networks, at most five networks at a time. Networks whose
 * `parameters.sequential` is truthy are scraped channel by channel, all others in parallel.
 *
 * @returns {Promise<Object[]>} Unique releases of all networks in a single flat list.
 */
async function fetchUpdates() {
  const includedNetworks = await fetchIncludedEntities();

  const scrapeNetwork = async (networkEntity) => {
    if (networkEntity.parameters?.sequential) {
      return scrapeNetworkSequential(networkEntity);
    }

    return scrapeNetworkParallel(networkEntity);
  };

  const scrapedNetworks = await Promise.map(includedNetworks, scrapeNetwork, { concurrency: 5 });

  // sequential networks yield a list of releases, parallel networks a list of lists per channel
  return scrapedNetworks.flat(2);
}
module . exports = fetchUpdates ;