'use strict';

const config = require('config');
const omit = require('object.omit');

const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js');

const { isAfter, isBefore, isEqual } = require('date-fns');

const logger = require('../logger')(__filename);

/**
 * True when `date` is before or exactly equal to `reference`.
 *
 * @param {Date} date
 * @param {Date|string|number} reference - passed straight to date-fns.
 * @returns {boolean}
 */
function isOnOrBefore(date, reference) {
    return isBefore(date, reference) || isEqual(date, reference);
}

/**
 * True when `date` is after or exactly equal to `reference`.
 *
 * @param {Date} date
 * @param {Date|string|number} reference - passed straight to date-fns.
 * @returns {boolean}
 */
function isOnOrAfter(date, reference) {
    return isAfter(date, reference) || isEqual(date, reference);
}

/**
 * Logs one summary line for every category of posts that was skipped while
 * curating a single user's posts. Categories with nothing skipped are silent.
 *
 * @param {Object} curatedPosts - accumulator produced by reducing the user's posts with `curatePost`.
 * @param {Object} indexed - `{ entries, oldest, latest }` for the user's indexed posts.
 * @param {Object} user - user record; only `user.name` is read here.
 * @param {Object} args - options; `after`, `before`, `afterIndexed` and `beforeIndexed` are read for the messages.
 */
function report(curatedPosts, indexed, user, args) {
    const {
        indexedUpdated,
        tooOldCount,
        tooRecentCount,
        beforeIndexedCount,
        afterIndexedCount,
        requestedIgnored,
        duplicates,
    } = curatedPosts;

    if (indexedUpdated.length > 0) {
        logger.info(`Ignoring ${indexedUpdated.length} indexed posts for '${user.name}'`);
    }

    if (requestedIgnored.length > 0) {
        logger.info(`Ignoring ${requestedIgnored.length} posts because their IDs are specified to be ignored for '${user.name}'`);
    }

    if (tooOldCount > 0) {
        logger.info(`Ignoring ${tooOldCount} older posts for '${user.name}' for specified date limit '${args.after}'`);
    }

    if (tooRecentCount > 0) {
        logger.info(`Ignoring ${tooRecentCount} newer posts for '${user.name}' for specified date limit '${args.before}'`);
    }

    // The indexed-relative counters are only incremented when indexed entries exist,
    // so the referenced indexed post is expected to be defined in the two branches below.
    if (beforeIndexedCount > 0) {
        logger.info(`Ignoring ${beforeIndexedCount} posts older than the ${args.afterIndexed} indexed post (${indexed[args.afterIndexed].id}, ${indexed[args.afterIndexed].date}) for '${user.name}'`);
    }

    if (afterIndexedCount > 0) {
        logger.info(`Ignoring ${afterIndexedCount} posts newer than the ${args.beforeIndexed} indexed post (${indexed[args.beforeIndexed].id}, ${indexed[args.beforeIndexed].date}) for '${user.name}'`);
    }

    if (duplicates.length > 0) {
        logger.info(`Ignoring ${duplicates.length} duplicate posts for '${user.name}'`);
    }
}

/**
 * Reducer step: decides whether a single raw post is kept, and returns the
 * next accumulator. The accumulator itself is never mutated; `processed` is
 * (host IDs of kept posts are added to it for duplicate detection).
 *
 * Filters are applied in this order: already indexed, relative-to-indexed
 * date limits, absolute date limits, post limit, ignored properties,
 * ignored IDs, included/excluded sources, duplicates.
 *
 * @param {Object} acc - accumulator with `posts`, `indexedUpdated`, `requestedIgnored`, `duplicates` and the skip counters.
 * @param {Object} post - raw post as fetched (NOTE(review): looks like a Reddit submission object — confirm against the fetcher).
 * @param {Object} user - owning user; copied into the curated post without its `posts` property.
 * @param {number} index - position of the post in the user's post list.
 * @param {Object} indexed - `{ entries, oldest, latest }` for the user's indexed posts.
 * @param {Set<string>} ignoreIds - lower-cased IDs that were requested to be ignored.
 * @param {Set} processed - host IDs already accepted for this user.
 * @param {Object} args - options (`redownload`, `afterIndexed`, `beforeIndexed`, `after`, `before`, `limit`, `ignore`, `include`, `exclude`).
 * @returns {Object} the next accumulator.
 */
function curatePost(acc, post, user, index, indexed, ignoreIds, processed, args) {
    const host = dissectLink(post.url);
    const permalink = `https://reddit.com${post.permalink}`;

    const curatedPost = {
        id: post.id,
        index,
        title: post.title,
        text: post.selftext,
        user: omit(user, ['posts']),
        permalink,
        url: post.url,
        // created_utc is multiplied by 1000, i.e. treated as seconds since the epoch
        datetime: new Date(post.created_utc * 1000),
        subreddit: post.subreddit.display_name,
        score: post.score,
        preview: post.preview ? post.preview.images.map(image => image.source) : null,
        host,
        direct: post.direct,
        comments: post.comments,
        hash: hashPost(post),
    };

    if (indexed.entries.length) {
        const indexedPost = indexed.entries.find(entry => entry.id === post.id);

        if (indexedPost && !args.redownload) {
            curatedPost.previewFallback = indexedPost.preview;

            return { ...acc, indexedUpdated: [...acc.indexedUpdated, curatedPost] };
        }

        // args.afterIndexed / args.beforeIndexed are used as keys into `indexed`
        // (presumably 'oldest' or 'latest' — verify against the argument parser)
        if (args.afterIndexed && isOnOrBefore(curatedPost.datetime, indexed[args.afterIndexed].date)) {
            return { ...acc, beforeIndexedCount: acc.beforeIndexedCount + 1 };
        }

        if (args.beforeIndexed && isOnOrAfter(curatedPost.datetime, indexed[args.beforeIndexed].date)) {
            return { ...acc, afterIndexedCount: acc.afterIndexedCount + 1 };
        }
    }

    if (args.after && isOnOrBefore(curatedPost.datetime, args.after)) {
        return { ...acc, tooOldCount: acc.tooOldCount + 1 };
    }

    if (args.before && isOnOrAfter(curatedPost.datetime, args.before)) {
        return { ...acc, tooRecentCount: acc.tooRecentCount + 1 };
    }

    // cut-off at limit, but don't count posts requested directly by ID
    if (acc.posts.length >= args.limit && !post.direct) {
        return acc;
    }

    // first property listed in args.ignore that is truthy on the post, if any
    const ignoring = args.ignore ? args.ignore.find(prop => post[prop]) : null;

    if (ignoring) {
        logger.verbose(`Ignoring ${ignoring} post '${post.title}' (${permalink})`);

        return acc;
    }

    if (host) {
        const hostIncludes = args.include && !args.include.includes(host.label);
        const hostExcluded = args.exclude && args.exclude.includes(host.label);

        if (ignoreIds.has(String(host.id).toLowerCase())) {
            return { ...acc, requestedIgnored: [...acc.requestedIgnored, curatedPost] };
        }

        if (hostIncludes || hostExcluded) {
            logger.info(`Ignoring source '${host.label}' from post '${post.url}' (${permalink})`);

            return acc;
        }

        if (config.fetch.avoidDuplicates && processed.has(host.id)) {
            logger.verbose(`Ignoring duplicate content '${post.url}' (cross-post, repost or superfluous --post ID) (${permalink})`);

            return { ...acc, duplicates: [...acc.duplicates, curatedPost] };
        }

        processed.add(host.id);
    }

    return {
        ...acc,
        posts: [...acc.posts, curatedPost],
    };
}

/**
 * Curates the fetched posts of every user: filters them according to `args`,
 * logs a summary of what was skipped, and returns the users keyed by name with
 * `posts` replaced by the curated posts and `indexed` rebuilt.
 *
 * The input is not modified: the indexed entries are sorted on a copy.
 *
 * @param {Object} userPosts - user records; each needs `name`, `posts` and `indexed.original` / `indexed.profile`.
 * @param {Array} ignoreIdsArray - IDs to ignore; compared case-insensitively as strings.
 * @param {Object} args - filtering options, forwarded to `curatePost` and `report`.
 * @returns {Object} map of user name to the user record with curated `posts` and
 *   `indexed: { profile, original, updated, ignored, oldest, latest }`.
 */
const curatePosts = (userPosts, ignoreIdsArray, args) => Object.values(userPosts).reduce((accPosts, user) => {
    const processed = new Set();
    const ignoreIds = new Set(ignoreIdsArray.map(postId => String(postId).toLowerCase()));

    // Array.prototype.sort works in place; sort a copy so the caller's array keeps its order
    const indexedByDate = [...user.indexed.original]
        .sort((entryA, entryB) => new Date(entryA.date) - new Date(entryB.date));

    const indexed = {
        entries: indexedByDate,
        oldest: indexedByDate[0],
        latest: indexedByDate[indexedByDate.length - 1],
    };

    const curatedPosts = user.posts.reduce(
        (accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, indexed, ignoreIds, processed, args),
        {
            posts: [],
            indexedUpdated: [],
            requestedIgnored: [],
            duplicates: [],
            tooOldCount: 0,
            tooRecentCount: 0,
            beforeIndexedCount: 0,
            afterIndexedCount: 0,
        },
    );

    report(curatedPosts, indexed, user, args);

    // indexed entries that were matched by a fetched post move from `original` to `updated`
    const updatedIds = new Set(curatedPosts.indexedUpdated.map(post => post.id));
    const indexedOriginal = indexedByDate.filter(entry => !updatedIds.has(entry.id));

    return {
        ...accPosts,
        [user.name]: {
            ...user,
            posts: curatedPosts.posts,
            indexed: {
                profile: user.indexed.profile,
                original: indexedOriginal,
                updated: curatedPosts.indexedUpdated,
                ignored: curatedPosts.requestedIgnored,
                oldest: indexed.oldest,
                latest: indexed.latest,
            },
        },
    };
}, {});

module.exports = curatePosts;