2018-04-29 00:02:34 +00:00
'use strict' ;
const config = require ( 'config' ) ;
2018-05-27 23:42:46 +00:00
const omit = require ( 'object.omit' ) ;
2018-06-16 23:11:10 +00:00
const dissectLink = require ( '../dissectLink.js' ) ;
2018-06-10 00:48:49 +00:00
const hashPost = require ( './hashPost.js' ) ;
2018-04-29 00:02:34 +00:00
2018-06-30 22:07:12 +00:00
const { isAfter , isBefore , isEqual } = require ( 'date-fns' ) ;
2021-12-17 01:04:25 +00:00
const logger = require ( '../logger' ) ( _ _filename ) ;
2018-06-30 22:07:12 +00:00
/**
 * Log a summary of the posts that were skipped while curating one user's posts.
 *
 * One info line is written per non-empty skip category, in a fixed order:
 * already indexed, ignored by ID, too old, too recent, older than the
 * reference indexed post, newer than the reference indexed post, duplicates.
 *
 * @param {Object} curatedPosts - accumulator produced by reducing the user's posts;
 *   reads indexedUpdated, requestedIgnored and duplicates (arrays) and the
 *   tooOldCount, tooRecentCount, beforeIndexedCount, afterIndexedCount counters
 * @param {Object} indexed - indexed entries; looked up by the key held in
 *   args.afterIndexed / args.beforeIndexed
 * @param {Object} user - user whose name is mentioned in every message
 * @param {Object} args - reads after, before, afterIndexed and beforeIndexed
 * @returns {void}
 */
function report(curatedPosts, indexed, user, args) {
    // The message is built lazily: some messages dereference values that are
    // only guaranteed to be present when the matching counter is non-zero.
    const logWhenAny = (count, describe) => {
        if (count > 0) {
            logger.info(describe());
        }
    };

    logWhenAny(
        curatedPosts.indexedUpdated.length,
        () => `Ignoring ${curatedPosts.indexedUpdated.length} indexed posts for '${user.name}'`,
    );

    logWhenAny(
        curatedPosts.requestedIgnored.length,
        () => `Ignoring ${curatedPosts.requestedIgnored.length} posts because their IDs are specified to be ignored for '${user.name}'`,
    );

    logWhenAny(
        curatedPosts.tooOldCount,
        () => `Ignoring ${curatedPosts.tooOldCount} older posts for '${user.name}' for specified date limit '${args.after}'`,
    );

    logWhenAny(
        curatedPosts.tooRecentCount,
        () => `Ignoring ${curatedPosts.tooRecentCount} newer posts for '${user.name}' for specified date limit '${args.before}'`,
    );

    logWhenAny(curatedPosts.beforeIndexedCount, () => {
        const reference = indexed[args.afterIndexed];

        return `Ignoring ${curatedPosts.beforeIndexedCount} posts older than the ${args.afterIndexed} indexed post (${reference.id}, ${reference.date}) for '${user.name}'`;
    });

    logWhenAny(curatedPosts.afterIndexedCount, () => {
        const reference = indexed[args.beforeIndexed];

        return `Ignoring ${curatedPosts.afterIndexedCount} posts newer than the ${args.beforeIndexed} indexed post (${reference.id}, ${reference.date}) for '${user.name}'`;
    });

    logWhenAny(
        curatedPosts.duplicates.length,
        () => `Ignoring ${curatedPosts.duplicates.length} duplicate posts for '${user.name}'`,
    );
}
2018-06-30 22:07:12 +00:00
2018-07-02 00:33:34 +00:00
/** True when `date` is earlier than, or exactly equal to, `limit`. */
function isAtOrBefore(date, limit) {
    return isBefore(date, limit) || isEqual(date, limit);
}

/** True when `date` is later than, or exactly equal to, `limit`. */
function isAtOrAfter(date, limit) {
    return isAfter(date, limit) || isEqual(date, limit);
}

/**
 * Reducer step that classifies a single fetched post for a user.
 *
 * The post is normalised into a curated record and then either appended to
 * `acc.posts` or routed to one of the skip buckets/counters on the
 * accumulator (indexedUpdated, beforeIndexedCount, afterIndexedCount,
 * tooOldCount, tooRecentCount, requestedIgnored, duplicates). Posts dropped
 * because of the limit, an ignored property, or an included/excluded host
 * leave the accumulator untouched.
 *
 * @param {Object} acc - accumulator carried through the reduce
 * @param {Object} post - raw post being examined
 * @param {Object} user - owner of the post; stored on the record without its `posts`
 * @param {number} index - position of the post in the user's post list
 * @param {Object} indexed - `{ entries, oldest, latest }` of already indexed posts
 * @param {Set<string>} ignoreIds - lower-cased host content IDs to skip
 * @param {Set} processed - host content IDs already accepted; added to when a post is kept
 * @param {Object} args - reads redownload, afterIndexed, beforeIndexed, after,
 *   before, limit, ignore, include and exclude
 * @returns {Object} the next accumulator (a new object, or `acc` itself when nothing changed)
 */
function curatePost(acc, post, user, index, indexed, ignoreIds, processed, args) {
    const host = dissectLink(post.url);
    const permalink = `https://reddit.com${post.permalink}`;

    let preview = null;

    if (post.preview) {
        preview = post.preview.images.map(({ source }) => source);
    }

    const curatedPost = {
        id: post.id,
        index,
        title: post.title,
        text: post.selftext,
        user: omit(user, ['posts']),
        permalink,
        url: post.url,
        datetime: new Date(post.created_utc * 1000),
        subreddit: post.subreddit.display_name,
        score: post.score,
        preview,
        host,
        direct: post.direct,
        comments: post.comments,
        hash: hashPost(post),
    };

    const postedAt = curatedPost.datetime;

    if (indexed.entries.length) {
        const knownEntry = indexed.entries.find(({ id }) => id === post.id);

        if (knownEntry && !args.redownload) {
            // already indexed: keep the stored preview around as a fallback
            const refreshedPost = { ...curatedPost, previewFallback: knownEntry.preview };

            return { ...acc, indexedUpdated: [...acc.indexedUpdated, refreshedPost] };
        }

        if (args.afterIndexed && isAtOrBefore(postedAt, indexed[args.afterIndexed].date)) {
            return { ...acc, beforeIndexedCount: acc.beforeIndexedCount + 1 };
        }

        if (args.beforeIndexed && isAtOrAfter(postedAt, indexed[args.beforeIndexed].date)) {
            return { ...acc, afterIndexedCount: acc.afterIndexedCount + 1 };
        }
    }

    if (args.after && isAtOrBefore(postedAt, args.after)) {
        return { ...acc, tooOldCount: acc.tooOldCount + 1 };
    }

    if (args.before && isAtOrAfter(postedAt, args.before)) {
        return { ...acc, tooRecentCount: acc.tooRecentCount + 1 };
    }

    // cut-off at limit, but don't count posts requested directly by ID
    if (acc.posts.length >= args.limit && !post.direct) {
        return acc;
    }

    // first property listed in args.ignore that is truthy on the post, if any
    let ignoredProperty = null;

    if (args.ignore) {
        ignoredProperty = args.ignore.find(prop => post[prop]);
    }

    if (ignoredProperty) {
        logger.verbose(`Ignoring ${ignoredProperty} post '${post.title}' (${permalink})`);

        return acc;
    }

    const withPostAccepted = () => ({
        ...acc,
        processed,
        posts: [...acc.posts, curatedPost],
    });

    if (!host) {
        return withPostAccepted();
    }

    const missingFromIncludes = args.include && !args.include.includes(host.label);
    const listedInExcludes = args.exclude && args.exclude.includes(host.label);

    if (ignoreIds.has(String(host.id).toLowerCase())) {
        return { ...acc, requestedIgnored: [...acc.requestedIgnored, curatedPost] };
    }

    if (missingFromIncludes || listedInExcludes) {
        logger.info(`Ignoring source '${host.label}' from post '${post.url}' (${permalink})`);

        return acc;
    }

    if (config.fetch.avoidDuplicates && processed.has(host.id)) {
        logger.verbose(`Ignoring duplicate content '${post.url}' (cross-post, repost or superfluous --post ID) (${permalink})`);

        return { ...acc, duplicates: [...acc.duplicates, curatedPost] };
    }

    processed.add(host.id);

    return withPostAccepted();
}
2018-05-05 15:08:40 +00:00
2018-07-02 00:33:34 +00:00
/**
 * Curate the fetched posts of every user.
 *
 * For each user the posts are reduced through `curatePost`, the skipped posts
 * are summarised via `report`, and the user's `indexed` data is rebuilt from
 * the outcome.
 *
 * @param {Object<string, Object>} userPosts - users keyed by name; each user
 *   provides `name`, `posts` and `indexed: { original, profile }`
 * @param {Array} [ignoreIdsArray=[]] - IDs to ignore; compared as lower-cased strings
 * @param {Object} args - options passed through to `curatePost` and `report`
 * @returns {Object<string, Object>} users keyed by name, each with curated
 *   `posts` and `indexed: { profile, original, updated, ignored, oldest, latest }`,
 *   where `original` is sorted by ascending date and excludes the entries
 *   listed in `updated`
 */
const curatePosts = (userPosts, ignoreIdsArray = [], args) => {
    // Identical for every user, so build it once instead of once per user.
    const ignoreIds = new Set(ignoreIdsArray.map(postId => String(postId).toLowerCase()));

    return Object.values(userPosts).reduce((accPosts, user) => {
        const processed = new Set();

        // Array.prototype.sort() sorts in place; sort a copy so the caller's
        // `user.indexed.original` array is not reordered as a side effect.
        const indexedByDate = [...user.indexed.original]
            .sort((entryA, entryB) => new Date(entryA.date) - new Date(entryB.date));

        const indexed = {
            entries: indexedByDate,
            oldest: indexedByDate[0],
            latest: indexedByDate[indexedByDate.length - 1],
        };

        const curatedPosts = user.posts.reduce(
            (accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, indexed, ignoreIds, processed, args),
            {
                posts: [],
                indexedUpdated: [],
                requestedIgnored: [],
                duplicates: [],
                tooOldCount: 0,
                tooRecentCount: 0,
                beforeIndexedCount: 0,
                afterIndexedCount: 0,
            },
        );

        report(curatedPosts, indexed, user, args);

        // Entries that were re-curated move to `updated`; a Set keeps the
        // exclusion linear instead of scanning the updated list per entry.
        const updatedIds = new Set(curatedPosts.indexedUpdated.map(post => post.id));
        const indexedOriginal = indexedByDate.filter(entry => !updatedIds.has(entry.id));

        return {
            ...accPosts,
            [user.name]: {
                ...user,
                posts: curatedPosts.posts,
                indexed: {
                    profile: user.indexed.profile,
                    original: indexedOriginal,
                    updated: curatedPosts.indexedUpdated,
                    ignored: curatedPosts.requestedIgnored,
                    oldest: indexed.oldest,
                    latest: indexed.latest,
                },
            },
        };
    }, {});
};
2018-04-29 00:02:34 +00:00
module . exports = curatePosts ;