From 952392e0d966e637a9587a37022cd8cd2a3ac4ea Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 11 Sep 2024 05:16:56 +0200 Subject: [PATCH] Building user posts object after fetching user to ensure user fetched posts and directly fetched posts are added to the same user key. Refactor to make better use of functions. Moved profile detail saving call to content fetch. No longer attempting and failing to save profile details for deleted users (directory would not exist). --- config/default.js | 2 +- package-lock.json | 10 --- package.json | 1 - src/app.js | 61 +++++++++------- src/archives/getArchivePostIds.js | 4 +- src/cli.js | 66 +++++++++-------- src/curate/posts.js | 114 +++++++++++++++++------------- src/fetch/content.js | 13 ++-- src/save/profileDetails.js | 4 +- src/save/writeToIndex.js | 20 ++++++ src/sources/getUserPosts.js | 61 ++++++++-------- 11 files changed, 199 insertions(+), 157 deletions(-) create mode 100644 src/save/writeToIndex.js diff --git a/config/default.js b/config/default.js index 8ed9435..cf08dc2 100644 --- a/config/default.js +++ b/config/default.js @@ -19,7 +19,7 @@ module.exports = { index: { file: '$base/index', format: 'tsv', - keys: ['postId', 'postTitle', 'subreddit', 'postDate', 'url'], + keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'], }, booleans: { extracted: 'extracted-', diff --git a/package-lock.json b/package-lock.json index 9078510..0a094b4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1870,16 +1870,6 @@ "harmony-reflect": "1.6.0" } }, - "promise.prototype.finally": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/promise.prototype.finally/-/promise.prototype.finally-3.1.0.tgz", - "integrity": "sha512-7p/K2f6dI+dM8yjRQEGrTQs5hTQixUAdOGpMEA3+pVxpX5oHKRSKAXyLw9Q9HUWDTdwtoo39dSHGQtN90HcEwQ==", - "requires": { - "define-properties": "1.1.2", - "es-abstract": "1.11.0", - "function-bind": "1.1.1" - } - }, "pseudomap": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", diff --git a/package.json b/package.json index 9d79f5e..8e2046a 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,6 @@ "node-fetch": "^2.1.2", "object.omit": "^3.0.0", "object.pick": "^1.3.0", - "promise.prototype.finally": "^3.1.0", "snoowrap": "^1.15.2", "url-pattern": "^1.0.3", "yargs": "^11.0.0" diff --git a/src/app.js b/src/app.js index c5635ee..c88e656 100644 --- a/src/app.js +++ b/src/app.js @@ -1,49 +1,58 @@ 'use strict'; const config = require('config'); -const util = require('util'); -const fs = require('fs-extra'); -const snoowrap = require('snoowrap'); -const omit = require('object.omit'); +const Snoowrap = require('snoowrap'); const exiftool = require('node-exiftool'); const exiftoolBin = require('dist-exiftool'); -require('promise.prototype.finally').shim(); require('array.prototype.flatten').shim(); -const reddit = new snoowrap(config.reddit.api); -const args = require('./cli.js'); +const reddit = new Snoowrap(config.reddit.api); +const args = require('./cli.js')(); const curatePosts = require('./curate/posts.js'); -const interpolate = require('./interpolate.js'); - const attachContentInfo = require('./fetch/info.js'); const fetchSaveContent = require('./fetch/content.js'); const getPosts = require('./sources/getPosts.js')(reddit, args); const getUserPosts = require('./sources/getUserPosts.js')(reddit, args); -if(!(args.users && args.users.length) && !(args.posts && args.posts.length)) { - return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user or one post with --post . See --help for more options.'); -} +async function getCompleteUserPosts() { + let userPosts = await getUserPosts(args.users); -const ep = new exiftool.ExiftoolProcess(exiftoolBin); - -Promise.resolve().then(async () => { - const initUsers = args.users ? args.users.reduce((acc, username) => ({...acc, [username]: {name: username, posts: []}}), {}) : {}; - let userPosts = await getUserPosts(initUsers); - - if(args.posts) { + if (args.posts) { userPosts = await getPosts(args.posts, userPosts); } const curatedUserPosts = curatePosts(userPosts, args); - const infoUserPosts = await attachContentInfo(curatedUserPosts); - await ep.open(); - await Promise.all(Object.values(infoUserPosts).map(user => fetchSaveContent(user, ep))); - await ep.close(); -}).catch(error => { - return console.error(error); -}); + return attachContentInfo(curatedUserPosts); +} + +function fetchSavePosts(userPosts, ep) { + return Promise.all(Object.values(userPosts).map(user => fetchSaveContent(user, ep))); +} + +async function initApp() { + const usersProvided = args.users && args.users.length; + const postIdsProvided = args.posts && args.posts.length; + + if (!usersProvided && !postIdsProvided) { + return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user or post ID. See --help for more details.'); + } + + try { + const userPosts = await getCompleteUserPosts(); + const ep = new exiftool.ExiftoolProcess(exiftoolBin); + + await ep.open(); + await fetchSavePosts(userPosts, ep); + + return ep.close(); + } catch (error) { + console.error(error); + } +} + +initApp(); diff --git a/src/archives/getArchivePostIds.js b/src/archives/getArchivePostIds.js index 5885a7f..d5f5677 100644 --- a/src/archives/getArchivePostIds.js +++ b/src/archives/getArchivePostIds.js @@ -5,12 +5,12 @@ const config = require('config'); const archives = require('./archives.js'); function getArchivePostIds(username, exclude) { - console.log('Searching archives for posts...'); + console.log(`Finding archived posts for '${username}'...`); return Promise.all(config.fetch.archives.reddit.map(source => archives[source](username))).then(postIds => postIds.flatten()).then(postIds => { return exclude ? postIds.filter(postId => !exclude.includes(postId)) : postIds; }).then(postIds => { - console.log(`Found ${postIds.length} unique archived posts`); + console.log(`Found ${postIds.length} unique archived posts for user '${username}'`); return postIds; }); diff --git a/src/cli.js b/src/cli.js index 263b1bc..fe6b332 100644 --- a/src/cli.js +++ b/src/cli.js @@ -3,34 +3,38 @@ const config = require('config'); const yargs = require('yargs'); -module.exports = yargs.command('npm start -- --user ').option('users', { - alias: 'user', - describe: 'Reddit usernames to fetch posts from', - type: 'array' -}).option('posts', { - alias: 'post', - describe: 'Reddit post IDs to fetch', - type: 'array' -}).option('limit', { - describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', - type: 'number', - default: config.fetch.limit -}).option('sort', { - describe: 'Property to sort posts by', - choices: ['new', 'top', 'hot', 'controversial'], - default: config.fetch.sort -}).option('ignore', { - describe: 'Ignore posts with any of these properties', - type: 'array', - choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'] -}).option('include', { - describe: 'Include only these sources', - type: 'array' -}).option('exclude', { - describe: 'Do not include these sources', - type: 'array' -}).option('archives', { - describe: 'Search archives for deleted posts', - type: 'boolean', - default: config.fetch.archives.search -}).argv; +function getArgs() { + return yargs.command('npm start -- --user ').option('users', { + alias: 'user', + describe: 'Reddit usernames to fetch posts from', + type: 'array', + }).option('posts', { + alias: 'post', + describe: 'Reddit post IDs to fetch', + type: 'array', + }).option('limit', { + describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', + type: 'number', + default: config.fetch.limit, + }).option('sort', { + describe: 'Property to sort posts by', + choices: ['new', 'top', 'hot', 'controversial'], + default: config.fetch.sort, + }).option('ignore', { + describe: 'Ignore posts with any of these properties', + type: 'array', + choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'], + }).option('include', { + describe: 'Include only these sources', + type: 'array', + }).option('exclude', { + describe: 'Do not include these sources', + type: 'array', + }).option('archives', { + describe: 'Search archives for deleted posts', + type: 'boolean', + default: config.fetch.archives.search, + }).argv; +} + +module.exports = getArgs; diff --git a/src/curate/posts.js b/src/curate/posts.js index 97855ce..2cf4fe8 100644 --- a/src/curate/posts.js +++ b/src/curate/posts.js @@ -1,63 +1,75 @@ 'use strict'; const config = require('config'); -const dissectLink = require('../dissectLink.js'); const omit = require('object.omit'); + +const dissectLink = require('../dissectLink.js'); const hashPost = require('./hashPost.js'); +function curatePost(accUserPosts, post, user, index, processed, args) { + // cut-off at limit, but don't count posts requested directly by ID + if (accUserPosts.length >= args.limit && !post.direct) { + return accUserPosts; + } + + const host = dissectLink(post.url); + const permalink = `https://reddit.com${post.permalink}`; + + const ignoring = args.ignore ? args.ignore.find(prop => post[prop]) : null; + + if (ignoring) { + console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`); + + return accUserPosts; + } + + if (host) { + const hostIncludes = args.include && !args.include.includes(host.label); + const hostExcluded = args.exclude && args.exclude.includes(host.label); + + if (hostIncludes || hostExcluded) { + console.log( + '\x1b[33m%s\x1b[0m', + `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`, + ); + + return accUserPosts; + } + + if (config.fetch.avoidDuplicates && processed.has(host.id)) { + console.log( + '\x1b[33m%s\x1b[0m', + `Ignoring duplicate content '${post.url}' (cross-post, repost, or superfluous --post ID) (${permalink})`, + ); + + return accUserPosts; + } + + processed.add(host.id); + } + + return accUserPosts.concat({ + id: post.id, + index, + title: post.title, + text: post.selftext, + user: omit(user, ['posts']), + permalink, + url: post.url, + datetime: new Date(post.created_utc * 1000), + subreddit: post.subreddit.display_name, + preview: post.preview ? post.preview.images.map(image => image.source) : null, + host, + hash: hashPost(post) + }); +} + const curatePosts = (userPosts, args) => { const processed = new Set(); - return Object.values(userPosts).reduce((accPosts, user) => ({...accPosts, [user.name]: {...user, posts: user.posts.reduce((accUserPosts, post, index) => { - // cut-off at limit, but don't count posts requested directly by ID - if(accUserPosts.length >= args.limit && !post.direct) { - return accUserPosts; - } - - const host = dissectLink(post.url); - const permalink = 'https://reddit.com' + post.permalink; - - const ignoring = args.ignore ? args.ignore.find(prop => { - return post[prop]; - }) : null; - - if(ignoring) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`); - - return accUserPosts; - } - - if(host) { - if(config.fetch.avoidDuplicates && processed.has(host.id)) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring duplicate content '${post.url}' (cross-post, repost, or --post ID was already included) (${permalink})`); - - return accUserPosts; - } - - if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`); - - return accUserPosts; - } - - processed.add(host.id); - } - - return accUserPosts.concat({ - id: post.id, - index: index, - title: post.title, - text: post.selftext, - user: omit(user, ['posts']), - permalink, - url: post.url, - datetime: new Date(post.created_utc * 1000), - subreddit: post.subreddit.display_name, - preview: post.preview ? post.preview.images.map(image => image.source) : null, - host, - hash: hashPost(post) - }); - }, [])}}), {}); + return Object.values(userPosts).reduce((accPosts, user) => { + return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } }; + }, {}); }; module.exports = curatePosts; diff --git a/src/fetch/content.js b/src/fetch/content.js index 302711d..ef3e1f3 100644 --- a/src/fetch/content.js +++ b/src/fetch/content.js @@ -1,7 +1,9 @@ -const fs = require('fs-extra'); +'use strict'; + const config = require('config'); const Promise = require('bluebird'); +const saveProfileDetails = require('../save/profileDetails.js'); const fetchItem = require('./item.js'); const interpolate = require('../interpolate.js'); const save = require('../save/save.js'); @@ -25,7 +27,7 @@ async function getStreams(item, post) { return null; } -async function addMeta(filepath, ep, item, post, user) { +async function addMeta(filepath, item, post, user, ep) { const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => { const interpolatedValue = interpolate(value, user, post, item); @@ -46,13 +48,16 @@ function getFilepath(item, post, user) { } async function fetchSaveContent(user, ep) { + // async, nothing depends on its success so don't await + saveProfileDetails(user); + const posts = await Promise.map(user.posts, async (post) => { await Promise.reduce(post.content.items, async (accItems, originalItem, index) => { const item = { ...originalItem, index }; const streams = await getStreams(item, post); // no streams, ignore item - if (streams.length <= 0) { + if (!streams || streams.length <= 0) { return accItems; } @@ -63,7 +68,7 @@ async function fetchSaveContent(user, ep) { await mux(filepath, sourcePaths, item); } - await addMeta(filepath, ep, item, post, user); + await addMeta(filepath, item, post, user, ep); return sourcePaths; }, []); diff --git a/src/save/profileDetails.js b/src/save/profileDetails.js index 320a97f..9c46af1 100644 --- a/src/save/profileDetails.js +++ b/src/save/profileDetails.js @@ -9,7 +9,7 @@ const textToStream = require('./textToStream.js'); const save = require('./save.js'); function saveProfileDetails(user) { - if(config.library.profile.image && !user.fallback) { + if(config.library.profile.image && !user.fallback && !user.deleted) { const image = user.profile ? user.profile.image : user.image; if(config.library.profile.avoidAvatar && new urlPattern('http(s)\\://(www.)redditstatic.com/avatars/:id(.:ext)(?:query)').match(image)) { @@ -26,7 +26,7 @@ function saveProfileDetails(user) { } } - if(config.library.profile.description && !user.fallback) { + if(config.library.profile.description && !user.fallback && !user.deleted) { if(user.profile && user.profile.description) { const filepath = interpolate(config.library.profile.description, user); const stream = textToStream(user.profile.description); diff --git a/src/save/writeToIndex.js b/src/save/writeToIndex.js new file mode 100644 index 0000000..499addb --- /dev/null +++ b/src/save/writeToIndex.js @@ -0,0 +1,20 @@ +'use strict'; + +const config = require('config'); +const fs = require('fs-extra'); +const Promise = require('bluebird'); + +const csvStringify = Promise.promisify(require('csv').stringify); + +const interpolate = require('../interpolate.js'); + +async function writeToIndex(posts, user) { + const filename = interpolate(config.library.index.file, user, null, false); + const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'))); + + const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys }); + + return fs.writeFile(filename, tsvString); +} + +module.exports = writeToIndex; diff --git a/src/sources/getUserPosts.js b/src/sources/getUserPosts.js index d37d667..acd0fe7 100644 --- a/src/sources/getUserPosts.js +++ b/src/sources/getUserPosts.js @@ -1,61 +1,64 @@ 'use strict'; const Promise = require('bluebird'); -const config = require('config'); const getArchivePostIds = require('../archives/getArchivePostIds.js'); const curateUser = require('../curate/user.js'); -const saveProfileDetails = require('../save/profileDetails.js'); -const getUser = async (username, reddit) => { +async function getUser(username, reddit) { try { const user = await reddit.getUser(username).fetch(); return curateUser(user); - } catch(error) { + } catch (error) { console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); return { name: username, - fallback: true + fallback: true, }; } -}; +} -const getPosts = async (username, reddit, args) => { +async function getPosts(username, reddit, args) { try { - return await reddit.getUser(username).getSubmissions({ + const user = await reddit.getUser(username).getSubmissions({ sort: args.sort, - limit: Infinity + limit: Infinity, }); - } catch(error) { + + return user; + } catch (error) { console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); return []; } -}; +} -const getUserPostsWrap = (reddit, args) => users => Promise.props(Object.entries(users).reduce((userPosts, [username, user]) => { - userPosts[username] = (async () => { - const [user, posts] = await Promise.all([ - getUser(username, reddit), - getPosts(username, reddit, args) - ]); +async function getArchivedPosts(username, posts, reddit) { + const postIds = await getArchivePostIds(username, posts.map(post => post.id)); - if(user) { - saveProfileDetails(user); - } + return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); +} - if(args.archives) { - const postIds = await getArchivePostIds(username, posts.map(post => post.id)); - const archivedPosts = await Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); +function getUserPostsWrap(reddit, args) { + return function getUserPosts(usernames) { + return Promise.props(usernames.reduce(async (userPosts, username) => { + const [user, posts] = await Promise.all([ + getUser(username, reddit), + getPosts(username, reddit, args), + ]); - posts.push(...archivedPosts); - } + if (args.archives) { + posts.push(...await getArchivedPosts(username, posts, reddit)); + } - return {...user, posts}; - })(); + if (posts.length) { + return { ...userPosts, [user.name]: { ...user, posts } }; + } - return userPosts; -}, {})); + return userPosts; + }, {})); + }; +} module.exports = getUserPostsWrap;