From 84836bc8c0da17a613a475aba89bfe2670c98c5a Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 11 Sep 2024 05:16:55 +0200 Subject: [PATCH] Refactored post retrieval so limit is applied per-user and ignores directly requested posts, and to start utilizing async/await. --- README.md | 2 +- package-lock.json | 37 +++++++++++++++++++++++ package.json | 3 ++ src/app.js | 19 ++++-------- src/cli.js | 2 +- src/curate/posts.js | 32 ++++++++++++-------- src/sources/getPosts.js | 60 +++++++++++++++++++++---------------- src/sources/getUserPosts.js | 38 +++++++++++++---------- 8 files changed, 124 insertions(+), 69 deletions(-) diff --git a/README.md b/README.md index e5d3ae7..9f60125 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ reddit-post-dump requires a arbitrarily recent version of Node.js. Before use, d ### Optional arguments * `--users [...]`: You may fetch posts from multiple users by supplying a space-separated list of usernames to `--users`. * `--posts [...]`: Fetch multiple posts by supplying a space-separated list of post IDs to `--posts`. -* `--limit `: Maximum amount posts per user to fetch content from. Limit is applied after fltering out ignored, cross- and reposts. +* `--limit `: Maximum amount posts per user to fetch content from. Limit is applied after filtering out ignored, cross- and reposts. Posts requested directly by ID may be discarded as duplicates, but are not otherwise affected by the limit. * `--sort `: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included. * `--ignore [...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`. * `--exclude [...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`. diff --git a/package-lock.json b/package-lock.json index 5e18a29..c0b7353 100644 --- a/package-lock.json +++ b/package-lock.json @@ -532,11 +532,27 @@ "resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.1.tgz", "integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY=" }, + "is-extendable": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz", + "integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==", + "requires": { + "is-plain-object": "2.0.4" + } + }, "is-fullwidth-code-point": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz", "integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8=" }, + "is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "requires": { + "isobject": "3.0.1" + } + }, "is-regex": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz", @@ -570,6 +586,11 @@ "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", "integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA=" }, + "isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8=" + }, "isstream": { "version": "0.1.2", "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz", @@ -728,6 +749,22 @@ "resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.0.11.tgz", "integrity": "sha1-xUYBd4rVYPEULODgG8yotW0TQm0=" }, + "object.omit": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/object.omit/-/object.omit-3.0.0.tgz", + "integrity": "sha512-EO+BCv6LJfu+gBIF3ggLicFebFLN5zqzz/WWJlMFfkMyGth+oBkhxzDl0wx2W4GkLzuQs/FsSkXZb2IMWQqmBQ==", + "requires": { + "is-extendable": "1.0.1" + } + }, + "object.pick": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz", + "integrity": "sha1-h6EKxMFpS9Lhy/U1kaZhQftd10c=", + "requires": { + "isobject": "3.0.1" + } + }, "options": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/options/-/options-0.0.6.tgz", diff --git a/package.json b/package.json index be82455..6788b29 100644 --- a/package.json +++ b/package.json @@ -28,6 +28,7 @@ "license": "ISC", "dependencies": { "array.prototype.flatten": "^1.2.1", + "bluebird": "^3.5.1", "cheerio": "^1.0.0-rc.2", "config": "^1.30.0", "date-fns": "^1.29.0", @@ -37,6 +38,8 @@ "mime-types": "^2.1.18", "node-exiftool": "^2.3.0", "node-fetch": "^2.1.2", + "object.omit": "^3.0.0", + "object.pick": "^1.3.0", "promise.prototype.finally": "^3.1.0", "snoowrap": "^1.15.2", "url-pattern": "^1.0.3", diff --git a/src/app.js b/src/app.js index 43fdab8..34bd729 100644 --- a/src/app.js +++ b/src/app.js @@ -25,23 +25,16 @@ if(!(args.users && args.users.length) && !(args.posts && args.posts.length)) { return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user or one post with --post . See --help for more options.'); } -Promise.resolve().then(() => { - if(args.users) { - return getUserPosts(args.users); - } +Promise.resolve().then(async () => { + let userPosts = args.users ? await getUserPosts(args.users) : {}; - return []; -}).then(userPosts => { if(args.posts) { - return getPosts(args.posts).then(posts => posts.concat(userPosts)); + userPosts = await getPosts(args.posts, userPosts); } - return userPosts; -}).then(posts => { - return curatePosts(posts, args).slice(0, args.limit); -}).then(posts => { - return attachContentInfo(posts); -}).then(posts => { + const curatedPosts = curatePosts(userPosts, args); + const posts = await attachContentInfo(curatedPosts); + return fetchContent(posts); }).catch(error => { return console.error(error); diff --git a/src/cli.js b/src/cli.js index fcf8b2f..263b1bc 100644 --- a/src/cli.js +++ b/src/cli.js @@ -12,7 +12,7 @@ module.exports = yargs.command('npm start -- --user ').option('users', describe: 'Reddit post IDs to fetch', type: 'array' }).option('limit', { - describe: 'Maximum amount of posts to fetch after filtering out ignored, cross- and reposts', + describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', type: 'number', default: config.fetch.limit }).option('sort', { diff --git a/src/curate/posts.js b/src/curate/posts.js index b176379..7cd3dcd 100644 --- a/src/curate/posts.js +++ b/src/curate/posts.js @@ -2,54 +2,60 @@ const config = require('config'); const dissectLink = require('../dissectLink.js'); +const omit = require('object.omit'); -function curatePosts(posts, args) { +const curatePosts = (userPosts, args) => { const processed = new Set(); - return posts.reduce((acc, post, index) => { + return Object.values(userPosts).reduce((accPosts, user) => accPosts.concat(user.posts.reduce((accUserPosts, post, index) => { + // cut-off at limit, but don't count posts requested directly by ID + if(accUserPosts.length >= args.limit && !post.direct) { + return accUserPosts; + } + const host = dissectLink(post.url); - post.permalink = 'https://reddit.com' + post.permalink; + const permalink = 'https://reddit.com' + post.permalink; const ignoring = args.ignore ? args.ignore.find(prop => { return post[prop]; }) : null; if(ignoring) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${post.permalink})`); + console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`); - return acc; + return accUserPosts; } if(host) { if(config.fetch.avoidDuplicates && processed.has(host.id)) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring cross-post or repost '${post.url}' (${post.permalink})`); + console.log('\x1b[33m%s\x1b[0m', `Ignoring duplicate content '${post.url}' (cross-post, repost, or --post ID was already included) (${permalink})`); - return acc; + return accUserPosts; } if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${post.permalink})`); + console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`); - return acc; + return accUserPosts; } processed.add(host.id); } - return acc.concat({ + return accUserPosts.concat({ id: post.id, index: index, title: post.title, text: post.selftext, - user: post.user, - permalink: post.permalink, + user: omit(user, ['posts']), + permalink, url: post.url, datetime: new Date(post.created_utc * 1000), subreddit: post.subreddit.display_name, preview: post.preview ? post.preview.images.map(image => image.source) : null, host }); - }, []); + }, [])), []); }; module.exports = curatePosts; diff --git a/src/sources/getPosts.js b/src/sources/getPosts.js index eecc8e2..eb52179 100644 --- a/src/sources/getPosts.js +++ b/src/sources/getPosts.js @@ -1,40 +1,48 @@ 'use strict'; +const Promise = require('bluebird'); const config = require('config'); const curateUser = require('../curate/user.js'); const saveProfileDetails = require('../save/profileDetails.js'); -function getPostsWrap(reddit, args) { - return function getPosts(postIds) { - return postIds.reduce((chain, postId) => { - return chain.then(acc => { - return reddit.getSubmission(postId).fetch().then(post => ({post, acc})); - }).then(({post, acc}) => { - if(acc.users[post.author.name]) { - return {post, acc, user: acc.users[post.author.name]} - } +const getUser = async (username, reddit) => { + try { + const user = await reddit.getUser(username).fetch(); - if(post.author.name === '[deleted]') { - return {post, acc, user: {name: '[deleted]'}}; - } + return curateUser(user); + } catch(error) { + console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); - return reddit.getUser(post.author.name).fetch().then(curateUser).then(saveProfileDetails).then(user => ({post, acc, user})); - }).then(({post, acc, user}) => { - post.user = user; - acc.posts.push(post); + return { + name: username, + fallback: true + }; + } +}; - // keep track of users to prevent fetching one user multiple times - acc.users[user.name] = user; +const getPostsWrap = (reddit, args) => { + return function getPosts(postIds, userPosts = {}) { + return Promise.reduce(postIds, (accUserPosts, postId) => Promise.resolve().then(async () => { + const post = await reddit.getSubmission(postId).fetch(); - return acc; - }); - }, Promise.resolve({ - posts: [], - users: {} - })).then(({posts, users}) => { - return posts; - }); + post.direct = true; + + if(accUserPosts[post.author.name]) { + accUserPosts[post.author.name].posts = accUserPosts[post.author.name].posts.concat(post); + + return accUserPosts; + } + + // don't attempt to fetch deleted user + if(post.author.name === '[deleted]') { + return {...accUserPosts, '[deleted]': {name: '[deleted]', deleted: true, posts: [post]}}; + } + + const user = await getUser(post.author.name); + + return {...accUserPosts, [post.author.name]: {...user, posts: [post]}} + }), userPosts); }; }; diff --git a/src/sources/getUserPosts.js b/src/sources/getUserPosts.js index 65c274d..d8fdfcd 100644 --- a/src/sources/getUserPosts.js +++ b/src/sources/getUserPosts.js @@ -1,35 +1,41 @@ 'use strict'; +const Promise = require('bluebird'); const config = require('config'); - const getArchivePostIds = require('../archives/getArchivePostIds.js'); const curateUser = require('../curate/user.js'); const saveProfileDetails = require('../save/profileDetails.js'); -const getUser = (username, reddit) => { - return reddit.getUser(username).fetch().then(user => curateUser(user)).catch(error => { +const getUser = async (username, reddit) => { + try { + const user = await reddit.getUser(username).fetch(); + + return curateUser(user); + } catch(error) { console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); return { name: username, fallback: true }; - }); + } }; -const getPosts = (username, reddit, args) => { - return reddit.getUser(username).getSubmissions({ - sort: args.sort, - limit: Infinity - }).catch(error => { +const getPosts = async (username, reddit, args) => { + try { + return await reddit.getUser(username).getSubmissions({ + sort: args.sort, + limit: Infinity + }); + } catch(error) { console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); return []; - }); + } }; -const getUserPostsWrap = (reddit, args) => usernames => Promise.all( - usernames.map(async username => { +const getUserPostsWrap = (reddit, args) => usernames => Promise.props(usernames.reduce((userPosts, username) => { + userPosts[username] = (async () => { const [user, posts] = await Promise.all([ getUser(username, reddit), getPosts(username, reddit, args) @@ -46,8 +52,10 @@ const getUserPostsWrap = (reddit, args) => usernames => Promise.all( posts.push(...archivedPosts); } - return posts.map(post => Object.assign(post, {user})); - }) -).then(posts => posts.flatten()); + return {...user, posts}; + })(); + + return userPosts; +}, {})); module.exports = getUserPostsWrap;