diff --git a/config/default.js b/config/default.js index 8ed9435..cf08dc2 100644 --- a/config/default.js +++ b/config/default.js @@ -19,7 +19,7 @@ module.exports = { index: { file: '$base/index', format: 'tsv', - keys: ['postId', 'postTitle', 'subreddit', 'postDate', 'url'], + keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'], }, booleans: { extracted: 'extracted-', diff --git a/package-lock.json b/package-lock.json index 9078510..0a094b4 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1870,16 +1870,6 @@ "harmony-reflect": "1.6.0" } }, - "promise.prototype.finally": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/promise.prototype.finally/-/promise.prototype.finally-3.1.0.tgz", - "integrity": "sha512-7p/K2f6dI+dM8yjRQEGrTQs5hTQixUAdOGpMEA3+pVxpX5oHKRSKAXyLw9Q9HUWDTdwtoo39dSHGQtN90HcEwQ==", - "requires": { - "define-properties": "1.1.2", - "es-abstract": "1.11.0", - "function-bind": "1.1.1" - } - }, "pseudomap": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", diff --git a/package.json b/package.json index 9d79f5e..8e2046a 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,6 @@ "node-fetch": "^2.1.2", "object.omit": "^3.0.0", "object.pick": "^1.3.0", - "promise.prototype.finally": "^3.1.0", "snoowrap": "^1.15.2", "url-pattern": "^1.0.3", "yargs": "^11.0.0" diff --git a/src/app.js b/src/app.js index c5635ee..c88e656 100644 --- a/src/app.js +++ b/src/app.js @@ -1,49 +1,58 @@ 'use strict'; const config = require('config'); -const util = require('util'); -const fs = require('fs-extra'); -const snoowrap = require('snoowrap'); -const omit = require('object.omit'); +const Snoowrap = require('snoowrap'); const exiftool = require('node-exiftool'); const exiftoolBin = require('dist-exiftool'); -require('promise.prototype.finally').shim(); require('array.prototype.flatten').shim(); -const reddit = new snoowrap(config.reddit.api); -const args = require('./cli.js'); +const reddit = new Snoowrap(config.reddit.api); +const args = require('./cli.js')(); const curatePosts = require('./curate/posts.js'); -const interpolate = require('./interpolate.js'); - const attachContentInfo = require('./fetch/info.js'); const fetchSaveContent = require('./fetch/content.js'); const getPosts = require('./sources/getPosts.js')(reddit, args); const getUserPosts = require('./sources/getUserPosts.js')(reddit, args); -if(!(args.users && args.users.length) && !(args.posts && args.posts.length)) { - return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user or one post with --post . See --help for more options.'); -} +async function getCompleteUserPosts() { + let userPosts = await getUserPosts(args.users); -const ep = new exiftool.ExiftoolProcess(exiftoolBin); - -Promise.resolve().then(async () => { - const initUsers = args.users ? args.users.reduce((acc, username) => ({...acc, [username]: {name: username, posts: []}}), {}) : {}; - let userPosts = await getUserPosts(initUsers); - - if(args.posts) { + if (args.posts) { userPosts = await getPosts(args.posts, userPosts); } const curatedUserPosts = curatePosts(userPosts, args); - const infoUserPosts = await attachContentInfo(curatedUserPosts); - await ep.open(); - await Promise.all(Object.values(infoUserPosts).map(user => fetchSaveContent(user, ep))); - await ep.close(); -}).catch(error => { - return console.error(error); -}); + return attachContentInfo(curatedUserPosts); +} + +function fetchSavePosts(userPosts, ep) { + return Promise.all(Object.values(userPosts).map(user => fetchSaveContent(user, ep))); +} + +async function initApp() { + const usersProvided = args.users && args.users.length; + const postIdsProvided = args.posts && args.posts.length; + + if (!usersProvided && !postIdsProvided) { + return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user or post ID. See --help for more details.'); + } + + try { + const userPosts = await getCompleteUserPosts(); + const ep = new exiftool.ExiftoolProcess(exiftoolBin); + + await ep.open(); + await fetchSavePosts(userPosts, ep); + + return ep.close(); + } catch (error) { + console.error(error); + } +} + +initApp(); diff --git a/src/archives/getArchivePostIds.js b/src/archives/getArchivePostIds.js index 5885a7f..d5f5677 100644 --- a/src/archives/getArchivePostIds.js +++ b/src/archives/getArchivePostIds.js @@ -5,12 +5,12 @@ const config = require('config'); const archives = require('./archives.js'); function getArchivePostIds(username, exclude) { - console.log('Searching archives for posts...'); + console.log(`Finding archived posts for '${username}'...`); return Promise.all(config.fetch.archives.reddit.map(source => archives[source](username))).then(postIds => postIds.flatten()).then(postIds => { return exclude ? postIds.filter(postId => !exclude.includes(postId)) : postIds; }).then(postIds => { - console.log(`Found ${postIds.length} unique archived posts`); + console.log(`Found ${postIds.length} unique archived posts for user '${username}'`); return postIds; }); diff --git a/src/cli.js b/src/cli.js index 263b1bc..fe6b332 100644 --- a/src/cli.js +++ b/src/cli.js @@ -3,34 +3,38 @@ const config = require('config'); const yargs = require('yargs'); -module.exports = yargs.command('npm start -- --user ').option('users', { - alias: 'user', - describe: 'Reddit usernames to fetch posts from', - type: 'array' -}).option('posts', { - alias: 'post', - describe: 'Reddit post IDs to fetch', - type: 'array' -}).option('limit', { - describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', - type: 'number', - default: config.fetch.limit -}).option('sort', { - describe: 'Property to sort posts by', - choices: ['new', 'top', 'hot', 'controversial'], - default: config.fetch.sort -}).option('ignore', { - describe: 'Ignore posts with any of these properties', - type: 'array', - choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'] -}).option('include', { - describe: 'Include only these sources', - type: 'array' -}).option('exclude', { - describe: 'Do not include these sources', - type: 'array' -}).option('archives', { - describe: 'Search archives for deleted posts', - type: 'boolean', - default: config.fetch.archives.search -}).argv; +function getArgs() { + return yargs.command('npm start -- --user ').option('users', { + alias: 'user', + describe: 'Reddit usernames to fetch posts from', + type: 'array', + }).option('posts', { + alias: 'post', + describe: 'Reddit post IDs to fetch', + type: 'array', + }).option('limit', { + describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', + type: 'number', + default: config.fetch.limit, + }).option('sort', { + describe: 'Property to sort posts by', + choices: ['new', 'top', 'hot', 'controversial'], + default: config.fetch.sort, + }).option('ignore', { + describe: 'Ignore posts with any of these properties', + type: 'array', + choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'], + }).option('include', { + describe: 'Include only these sources', + type: 'array', + }).option('exclude', { + describe: 'Do not include these sources', + type: 'array', + }).option('archives', { + describe: 'Search archives for deleted posts', + type: 'boolean', + default: config.fetch.archives.search, + }).argv; +} + +module.exports = getArgs; diff --git a/src/curate/posts.js b/src/curate/posts.js index 97855ce..2cf4fe8 100644 --- a/src/curate/posts.js +++ b/src/curate/posts.js @@ -1,63 +1,75 @@ 'use strict'; const config = require('config'); -const dissectLink = require('../dissectLink.js'); const omit = require('object.omit'); + +const dissectLink = require('../dissectLink.js'); const hashPost = require('./hashPost.js'); +function curatePost(accUserPosts, post, user, index, processed, args) { + // cut-off at limit, but don't count posts requested directly by ID + if (accUserPosts.length >= args.limit && !post.direct) { + return accUserPosts; + } + + const host = dissectLink(post.url); + const permalink = `https://reddit.com${post.permalink}`; + + const ignoring = args.ignore ? args.ignore.find(prop => post[prop]) : null; + + if (ignoring) { + console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`); + + return accUserPosts; + } + + if (host) { + const hostIncludes = args.include && !args.include.includes(host.label); + const hostExcluded = args.exclude && args.exclude.includes(host.label); + + if (hostIncludes || hostExcluded) { + console.log( + '\x1b[33m%s\x1b[0m', + `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`, + ); + + return accUserPosts; + } + + if (config.fetch.avoidDuplicates && processed.has(host.id)) { + console.log( + '\x1b[33m%s\x1b[0m', + `Ignoring duplicate content '${post.url}' (cross-post, repost, or superfluous --post ID) (${permalink})`, + ); + + return accUserPosts; + } + + processed.add(host.id); + } + + return accUserPosts.concat({ + id: post.id, + index, + title: post.title, + text: post.selftext, + user: omit(user, ['posts']), + permalink, + url: post.url, + datetime: new Date(post.created_utc * 1000), + subreddit: post.subreddit.display_name, + preview: post.preview ? post.preview.images.map(image => image.source) : null, + host, + hash: hashPost(post) + }); +} + const curatePosts = (userPosts, args) => { const processed = new Set(); - return Object.values(userPosts).reduce((accPosts, user) => ({...accPosts, [user.name]: {...user, posts: user.posts.reduce((accUserPosts, post, index) => { - // cut-off at limit, but don't count posts requested directly by ID - if(accUserPosts.length >= args.limit && !post.direct) { - return accUserPosts; - } - - const host = dissectLink(post.url); - const permalink = 'https://reddit.com' + post.permalink; - - const ignoring = args.ignore ? args.ignore.find(prop => { - return post[prop]; - }) : null; - - if(ignoring) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`); - - return accUserPosts; - } - - if(host) { - if(config.fetch.avoidDuplicates && processed.has(host.id)) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring duplicate content '${post.url}' (cross-post, repost, or --post ID was already included) (${permalink})`); - - return accUserPosts; - } - - if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) { - console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`); - - return accUserPosts; - } - - processed.add(host.id); - } - - return accUserPosts.concat({ - id: post.id, - index: index, - title: post.title, - text: post.selftext, - user: omit(user, ['posts']), - permalink, - url: post.url, - datetime: new Date(post.created_utc * 1000), - subreddit: post.subreddit.display_name, - preview: post.preview ? post.preview.images.map(image => image.source) : null, - host, - hash: hashPost(post) - }); - }, [])}}), {}); + return Object.values(userPosts).reduce((accPosts, user) => { + return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } }; + }, {}); }; module.exports = curatePosts; diff --git a/src/fetch/content.js b/src/fetch/content.js index 302711d..ef3e1f3 100644 --- a/src/fetch/content.js +++ b/src/fetch/content.js @@ -1,7 +1,9 @@ -const fs = require('fs-extra'); +'use strict'; + const config = require('config'); const Promise = require('bluebird'); +const saveProfileDetails = require('../save/profileDetails.js'); const fetchItem = require('./item.js'); const interpolate = require('../interpolate.js'); const save = require('../save/save.js'); @@ -25,7 +27,7 @@ async function getStreams(item, post) { return null; } -async function addMeta(filepath, ep, item, post, user) { +async function addMeta(filepath, item, post, user, ep) { const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => { const interpolatedValue = interpolate(value, user, post, item); @@ -46,13 +48,16 @@ function getFilepath(item, post, user) { } async function fetchSaveContent(user, ep) { + // async, nothing depends on its success so don't await + saveProfileDetails(user); + const posts = await Promise.map(user.posts, async (post) => { await Promise.reduce(post.content.items, async (accItems, originalItem, index) => { const item = { ...originalItem, index }; const streams = await getStreams(item, post); // no streams, ignore item - if (streams.length <= 0) { + if (!streams || streams.length <= 0) { return accItems; } @@ -63,7 +68,7 @@ async function fetchSaveContent(user, ep) { await mux(filepath, sourcePaths, item); } - await addMeta(filepath, ep, item, post, user); + await addMeta(filepath, item, post, user, ep); return sourcePaths; }, []); diff --git a/src/save/profileDetails.js b/src/save/profileDetails.js index 320a97f..9c46af1 100644 --- a/src/save/profileDetails.js +++ b/src/save/profileDetails.js @@ -9,7 +9,7 @@ const textToStream = require('./textToStream.js'); const save = require('./save.js'); function saveProfileDetails(user) { - if(config.library.profile.image && !user.fallback) { + if(config.library.profile.image && !user.fallback && !user.deleted) { const image = user.profile ? user.profile.image : user.image; if(config.library.profile.avoidAvatar && new urlPattern('http(s)\\://(www.)redditstatic.com/avatars/:id(.:ext)(?:query)').match(image)) { @@ -26,7 +26,7 @@ function saveProfileDetails(user) { } } - if(config.library.profile.description && !user.fallback) { + if(config.library.profile.description && !user.fallback && !user.deleted) { if(user.profile && user.profile.description) { const filepath = interpolate(config.library.profile.description, user); const stream = textToStream(user.profile.description); diff --git a/src/save/writeToIndex.js b/src/save/writeToIndex.js new file mode 100644 index 0000000..499addb --- /dev/null +++ b/src/save/writeToIndex.js @@ -0,0 +1,20 @@ +'use strict'; + +const config = require('config'); +const fs = require('fs-extra'); +const Promise = require('bluebird'); + +const csvStringify = Promise.promisify(require('csv').stringify); + +const interpolate = require('../interpolate.js'); + +async function writeToIndex(posts, user) { + const filename = interpolate(config.library.index.file, user, null, false); + const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'))); + + const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys }); + + return fs.writeFile(filename, tsvString); +} + +module.exports = writeToIndex; diff --git a/src/sources/getUserPosts.js b/src/sources/getUserPosts.js index d37d667..acd0fe7 100644 --- a/src/sources/getUserPosts.js +++ b/src/sources/getUserPosts.js @@ -1,61 +1,64 @@ 'use strict'; const Promise = require('bluebird'); -const config = require('config'); const getArchivePostIds = require('../archives/getArchivePostIds.js'); const curateUser = require('../curate/user.js'); -const saveProfileDetails = require('../save/profileDetails.js'); -const getUser = async (username, reddit) => { +async function getUser(username, reddit) { try { const user = await reddit.getUser(username).fetch(); return curateUser(user); - } catch(error) { + } catch (error) { console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); return { name: username, - fallback: true + fallback: true, }; } -}; +} -const getPosts = async (username, reddit, args) => { +async function getPosts(username, reddit, args) { try { - return await reddit.getUser(username).getSubmissions({ + const user = await reddit.getUser(username).getSubmissions({ sort: args.sort, - limit: Infinity + limit: Infinity, }); - } catch(error) { + + return user; + } catch (error) { console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); return []; } -}; +} -const getUserPostsWrap = (reddit, args) => users => Promise.props(Object.entries(users).reduce((userPosts, [username, user]) => { - userPosts[username] = (async () => { - const [user, posts] = await Promise.all([ - getUser(username, reddit), - getPosts(username, reddit, args) - ]); +async function getArchivedPosts(username, posts, reddit) { + const postIds = await getArchivePostIds(username, posts.map(post => post.id)); - if(user) { - saveProfileDetails(user); - } + return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); +} - if(args.archives) { - const postIds = await getArchivePostIds(username, posts.map(post => post.id)); - const archivedPosts = await Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); +function getUserPostsWrap(reddit, args) { + return function getUserPosts(usernames) { + return Promise.props(usernames.reduce(async (userPosts, username) => { + const [user, posts] = await Promise.all([ + getUser(username, reddit), + getPosts(username, reddit, args), + ]); - posts.push(...archivedPosts); - } + if (args.archives) { + posts.push(...await getArchivedPosts(username, posts, reddit)); + } - return {...user, posts}; - })(); + if (posts.length) { + return { ...userPosts, [user.name]: { ...user, posts } }; + } - return userPosts; -}, {})); + return userPosts; + }, {})); + }; +} module.exports = getUserPostsWrap;