From 24a165cf481d8aa45fe58d2a34cfb43faf0e59ca Mon Sep 17 00:00:00 2001
From: DebaucheryLibrarian
Date: Wed, 11 Sep 2024 05:16:56 +0200
Subject: [PATCH] Added date and indexed oldest/latest limit arguments.

--after and --before take a date (YYYY-MM-DD, optionally HH:mm); getArgs
converts them to Date objects, or null when the option is not given.

--after-indexed and --before-indexed take 'oldest' or 'latest' and
compare each post against the oldest or latest entry of the user's
index, ordered by entry date. They only apply when the index is not
empty.

A post whose datetime is equal to a limit is excluded as well. Skipped
posts are counted per reason and reported once per user by report().
---
Changes from the original patch (per-hunk line counts are unchanged):
 - cli.js: use the yargs `choices` key instead of `options`, which is
   not a yargs option key, so values other than 'oldest'/'latest' are
   rejected by the CLI instead of reaching indexed[...].date in
   curatePost.
 - posts.js: sort a copy of user.indexed.original; Array.prototype.sort
   sorts in place and was reordering the array that is later filtered
   into indexed.original.
 - posts.js: dropped the unused `posts` binding and the stray `;` after
   the report() declaration.
 - NOTE(review): line breaks and indentation were reconstructed from a
   whitespace-collapsed copy (4 spaces assumed); verify context lines
   against the tree before applying. The `index` hashes below are those
   of the original patch.

 src/cli.js          | 21 +++++++++-
 src/curate/posts.js | 93 ++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 102 insertions(+), 12 deletions(-)

diff --git a/src/cli.js b/src/cli.js
index bb3cbf3..b3c265d 100644
--- a/src/cli.js
+++ b/src/cli.js
@@ -4,7 +4,7 @@ const config = require('config');
 const yargs = require('yargs');
 
 function getArgs() {
-    return yargs
+    const args = yargs
         .command('npm start -- --user ')
         .option('users', {
             alias: 'user',
@@ -40,7 +40,18 @@ function getArgs() {
             type: 'array',
         })
         .option('after', {
-            describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.',
+            describe: 'Only include posts after this date (YYYY-MM-DD, optionally HH:mm)',
+        })
+        .option('before', {
+            describe: 'Only include posts before this date (YYYY-MM-DD, optionally HH:mm)',
+        })
+        .option('after-indexed', {
+            describe: 'Only include posts after the oldest or the latest entry in the index',
+            choices: ['oldest', 'latest'],
+        })
+        .option('before-indexed', {
+            describe: 'Only include posts before the oldest or the latest entry in the index',
+            choices: ['oldest', 'latest'],
         })
         .option('archives', {
             describe: 'Search archives for deleted posts',
@@ -48,6 +59,12 @@ function getArgs() {
             default: config.fetch.archives.search,
         })
         .argv;
+
+    return {
+        ...args,
+        after: args.after ? new Date(args.after) : null,
+        before: args.before ? new Date(args.before) : null,
+    };
 }
 
 module.exports = getArgs;
diff --git a/src/curate/posts.js b/src/curate/posts.js
index 149395a..dc9f3c3 100644
--- a/src/curate/posts.js
+++ b/src/curate/posts.js
@@ -6,7 +6,39 @@ const omit = require('object.omit');
 const dissectLink = require('../dissectLink.js');
 const hashPost = require('./hashPost.js');
 
-function curatePost(acc, post, user, index, processed, args) {
+const { isAfter, isBefore, isEqual } = require('date-fns');
+
+function report(curatedPosts, indexed, user, args) {
+    const { indexedUpdated, tooOldCount, tooRecentCount, beforeIndexedCount, afterIndexedCount } = curatedPosts;
+
+    if (indexedUpdated.length > 0) {
+        console.log('\x1b[33m%s\x1b[0m', `Ignoring ${indexedUpdated.length} indexed posts for '${user.name}'`);
+    }
+
+    if (tooOldCount > 0) {
+        console.log('\x1b[33m%s\x1b[0m', `Ignoring ${tooOldCount} older posts for '${user.name}' for specified date limit '${args.after}'`);
+    }
+
+    if (tooRecentCount > 0) {
+        console.log('\x1b[33m%s\x1b[0m', `Ignoring ${tooRecentCount} newer posts for '${user.name}' for specified date limit '${args.before}'`);
+    }
+
+    if (beforeIndexedCount > 0) {
+        console.log(
+            '\x1b[33m%s\x1b[0m',
+            `Ignoring ${beforeIndexedCount} posts older than the ${args.afterIndexed} indexed post (${indexed[args.afterIndexed].id}, ${indexed[args.afterIndexed].date}) for '${user.name}'`
+        );
+    }
+
+    if (afterIndexedCount > 0) {
+        console.log(
+            '\x1b[33m%s\x1b[0m',
+            `Ignoring ${afterIndexedCount} posts newer than the ${args.beforeIndexed} indexed post (${indexed[args.beforeIndexed].id}, ${indexed[args.beforeIndexed].date}) for '${user.name}'`
+        );
+    }
+}
+
+function curatePost(acc, post, user, index, indexed, processed, args) {
     const host = dissectLink(post.url);
     const permalink = `https://reddit.com${post.permalink}`;
 
@@ -25,8 +57,26 @@ function curatePost(acc, post, user, index, processed, args) {
         hash: hashPost(post),
     };
 
-    if (user.indexed.original.find(entry => entry.id === post.id)) {
-        return { ...acc, indexedUpdated: [...acc.indexedUpdated, curatedPost] };
+    if (indexed.entries.length) {
+        if (indexed.entries.find(entry => entry.id === post.id)) {
+            return { ...acc, indexedUpdated: [...acc.indexedUpdated, curatedPost] };
+        }
+
+        if (args.afterIndexed && (isBefore(curatedPost.datetime, indexed[args.afterIndexed].date) || isEqual(curatedPost.datetime, indexed[args.afterIndexed].date))) {
+            return { ...acc, beforeIndexedCount: acc.beforeIndexedCount + 1 };
+        }
+
+        if (args.beforeIndexed && (isAfter(curatedPost.datetime, indexed[args.beforeIndexed].date) || isEqual(curatedPost.datetime, indexed[args.beforeIndexed].date))) {
+            return { ...acc, afterIndexedCount: acc.afterIndexedCount + 1 };
+        }
+    }
+
+    if (args.after && (isBefore(curatedPost.datetime, args.after) || isEqual(curatedPost.datetime, args.after))) {
+        return { ...acc, tooOldCount: acc.tooOldCount + 1 };
+    }
+
+    if (args.before && (isAfter(curatedPost.datetime, args.before) || isEqual(curatedPost.datetime, args.before))) {
+        return { ...acc, tooRecentCount: acc.tooRecentCount + 1 };
     }
 
     // cut-off at limit, but don't count posts requested directly by ID
@@ -72,17 +122,40 @@ function curatePost(acc, post, user, index, processed, args) {
 
 const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
     const processed = new Set();
+    const indexedByDate = [...user.indexed.original].sort((entryA, entryB) => new Date(entryA.date) - new Date(entryB.date));
 
-    const { posts, indexedUpdated } = user.posts.reduce((accUserPosts, post, index) =>
-        curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexedUpdated: [] });
+    const indexed = {
+        entries: indexedByDate,
+        oldest: indexedByDate.slice(0, 1)[0],
+        latest: indexedByDate.slice(-1)[0],
+    };
 
-    if (indexedUpdated.length > 0) {
-        console.log('\x1b[33m%s\x1b[0m', `Ignoring ${indexedUpdated.length} indexed posts for '${user.name}'`);
-    }
+    const curatedPosts = user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, indexed, processed, args), {
+        posts: [],
+        indexedUpdated: [],
+        tooOldCount: 0,
+        tooRecentCount: 0,
+        beforeIndexedCount: 0,
+        afterIndexedCount: 0,
+    });
 
-    const indexedOriginal = user.indexed.original.filter(entry => !indexedUpdated.find(post => post.id === entry.id));
+    report(curatedPosts, indexed, user, args);
 
-    return { ...accPosts, [user.name]: { ...user, posts, indexed: { original: indexedOriginal, updated: indexedUpdated } } };
+    const indexedOriginal = user.indexed.original.filter(entry => !curatedPosts.indexedUpdated.find(post => post.id === entry.id));
+
+    return {
+        ...accPosts,
+        [user.name]: {
+            ...user,
+            posts: curatedPosts.posts,
+            indexed: {
+                original: indexedOriginal,
+                updated: curatedPosts.indexedUpdated,
+                oldest: indexed.oldest,
+                latest: indexed.latest,
+            },
+        },
+    };
 }, {});
 
 module.exports = curatePosts;