Refactored post retrieval so that the limit is applied per user and does not count posts requested directly by ID, and started using async/await.
This commit is contained in:
parent
2627ec6207
commit
84836bc8c0
|
@ -38,7 +38,7 @@ reddit-post-dump requires an arbitrarily recent version of Node.js. Before use, d
|
||||||
### Optional arguments
|
### Optional arguments
|
||||||
* `--users <username> [<username>...]`: You may fetch posts from multiple users by supplying a space-separated list of usernames to `--users`.
|
* `--users <username> [<username>...]`: You may fetch posts from multiple users by supplying a space-separated list of usernames to `--users`.
|
||||||
* `--posts <post-id> [<post-id>...]`: Fetch multiple posts by supplying a space-separated list of post IDs to `--posts`.
|
* `--posts <post-id> [<post-id>...]`: Fetch multiple posts by supplying a space-separated list of post IDs to `--posts`.
|
||||||
* `--limit <number>`: Maximum amount posts per user to fetch content from. Limit is applied after fltering out ignored, cross- and reposts.
|
* `--limit <number>`: Maximum number of posts per user to fetch content from. Limit is applied after filtering out ignored, cross- and reposts. Posts requested directly by ID may be discarded as duplicates, but are not otherwise affected by the limit.
|
||||||
* `--sort <method>`: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included.
|
* `--sort <method>`: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included.
|
||||||
* `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`.
|
* `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`.
|
||||||
* `--exclude <source> [<source>...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`.
|
* `--exclude <source> [<source>...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`.
|
||||||
|
|
|
@ -532,11 +532,27 @@
|
||||||
"resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.1.tgz",
|
"resolved": "https://registry.npmjs.org/is-date-object/-/is-date-object-1.0.1.tgz",
|
||||||
"integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY="
|
"integrity": "sha1-mqIOtq7rv/d/vTPnTKAbM1gdOhY="
|
||||||
},
|
},
|
||||||
|
"is-extendable": {
|
||||||
|
"version": "1.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-1.0.1.tgz",
|
||||||
|
"integrity": "sha512-arnXMxT1hhoKo9k1LZdmlNyJdDDfy2v0fXjFlmok4+i8ul/6WlbVge9bhM74OpNPQPMGUToDtz+KXa1PneJxOA==",
|
||||||
|
"requires": {
|
||||||
|
"is-plain-object": "2.0.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
"is-fullwidth-code-point": {
|
"is-fullwidth-code-point": {
|
||||||
"version": "2.0.0",
|
"version": "2.0.0",
|
||||||
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-2.0.0.tgz",
|
||||||
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8="
|
"integrity": "sha1-o7MKXE8ZkYMWeqq5O+764937ZU8="
|
||||||
},
|
},
|
||||||
|
"is-plain-object": {
|
||||||
|
"version": "2.0.4",
|
||||||
|
"resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz",
|
||||||
|
"integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==",
|
||||||
|
"requires": {
|
||||||
|
"isobject": "3.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"is-regex": {
|
"is-regex": {
|
||||||
"version": "1.0.4",
|
"version": "1.0.4",
|
||||||
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz",
|
"resolved": "https://registry.npmjs.org/is-regex/-/is-regex-1.0.4.tgz",
|
||||||
|
@ -570,6 +586,11 @@
|
||||||
"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
|
"resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz",
|
||||||
"integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA="
|
"integrity": "sha1-6PvzdNxVb/iUehDcsFctYz8s+hA="
|
||||||
},
|
},
|
||||||
|
"isobject": {
|
||||||
|
"version": "3.0.1",
|
||||||
|
"resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz",
|
||||||
|
"integrity": "sha1-TkMekrEalzFjaqH5yNHMvP2reN8="
|
||||||
|
},
|
||||||
"isstream": {
|
"isstream": {
|
||||||
"version": "0.1.2",
|
"version": "0.1.2",
|
||||||
"resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
|
"resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
|
||||||
|
@ -728,6 +749,22 @@
|
||||||
"resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.0.11.tgz",
|
"resolved": "https://registry.npmjs.org/object-keys/-/object-keys-1.0.11.tgz",
|
||||||
"integrity": "sha1-xUYBd4rVYPEULODgG8yotW0TQm0="
|
"integrity": "sha1-xUYBd4rVYPEULODgG8yotW0TQm0="
|
||||||
},
|
},
|
||||||
|
"object.omit": {
|
||||||
|
"version": "3.0.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/object.omit/-/object.omit-3.0.0.tgz",
|
||||||
|
"integrity": "sha512-EO+BCv6LJfu+gBIF3ggLicFebFLN5zqzz/WWJlMFfkMyGth+oBkhxzDl0wx2W4GkLzuQs/FsSkXZb2IMWQqmBQ==",
|
||||||
|
"requires": {
|
||||||
|
"is-extendable": "1.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"object.pick": {
|
||||||
|
"version": "1.3.0",
|
||||||
|
"resolved": "https://registry.npmjs.org/object.pick/-/object.pick-1.3.0.tgz",
|
||||||
|
"integrity": "sha1-h6EKxMFpS9Lhy/U1kaZhQftd10c=",
|
||||||
|
"requires": {
|
||||||
|
"isobject": "3.0.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
"options": {
|
"options": {
|
||||||
"version": "0.0.6",
|
"version": "0.0.6",
|
||||||
"resolved": "https://registry.npmjs.org/options/-/options-0.0.6.tgz",
|
"resolved": "https://registry.npmjs.org/options/-/options-0.0.6.tgz",
|
||||||
|
|
|
@ -28,6 +28,7 @@
|
||||||
"license": "ISC",
|
"license": "ISC",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"array.prototype.flatten": "^1.2.1",
|
"array.prototype.flatten": "^1.2.1",
|
||||||
|
"bluebird": "^3.5.1",
|
||||||
"cheerio": "^1.0.0-rc.2",
|
"cheerio": "^1.0.0-rc.2",
|
||||||
"config": "^1.30.0",
|
"config": "^1.30.0",
|
||||||
"date-fns": "^1.29.0",
|
"date-fns": "^1.29.0",
|
||||||
|
@ -37,6 +38,8 @@
|
||||||
"mime-types": "^2.1.18",
|
"mime-types": "^2.1.18",
|
||||||
"node-exiftool": "^2.3.0",
|
"node-exiftool": "^2.3.0",
|
||||||
"node-fetch": "^2.1.2",
|
"node-fetch": "^2.1.2",
|
||||||
|
"object.omit": "^3.0.0",
|
||||||
|
"object.pick": "^1.3.0",
|
||||||
"promise.prototype.finally": "^3.1.0",
|
"promise.prototype.finally": "^3.1.0",
|
||||||
"snoowrap": "^1.15.2",
|
"snoowrap": "^1.15.2",
|
||||||
"url-pattern": "^1.0.3",
|
"url-pattern": "^1.0.3",
|
||||||
|
|
19
src/app.js
19
src/app.js
|
@ -25,23 +25,16 @@ if(!(args.users && args.users.length) && !(args.posts && args.posts.length)) {
|
||||||
return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user <user> or one post with --post <post-id>. See --help for more options.');
|
return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user <user> or one post with --post <post-id>. See --help for more options.');
|
||||||
}
|
}
|
||||||
|
|
||||||
Promise.resolve().then(() => {
|
Promise.resolve().then(async () => {
|
||||||
if(args.users) {
|
let userPosts = args.users ? await getUserPosts(args.users) : {};
|
||||||
return getUserPosts(args.users);
|
|
||||||
}
|
|
||||||
|
|
||||||
return [];
|
|
||||||
}).then(userPosts => {
|
|
||||||
if(args.posts) {
|
if(args.posts) {
|
||||||
return getPosts(args.posts).then(posts => posts.concat(userPosts));
|
userPosts = await getPosts(args.posts, userPosts);
|
||||||
}
|
}
|
||||||
|
|
||||||
return userPosts;
|
const curatedPosts = curatePosts(userPosts, args);
|
||||||
}).then(posts => {
|
const posts = await attachContentInfo(curatedPosts);
|
||||||
return curatePosts(posts, args).slice(0, args.limit);
|
|
||||||
}).then(posts => {
|
|
||||||
return attachContentInfo(posts);
|
|
||||||
}).then(posts => {
|
|
||||||
return fetchContent(posts);
|
return fetchContent(posts);
|
||||||
}).catch(error => {
|
}).catch(error => {
|
||||||
return console.error(error);
|
return console.error(error);
|
||||||
|
|
|
@ -12,7 +12,7 @@ module.exports = yargs.command('npm start -- --user <username>').option('users',
|
||||||
describe: 'Reddit post IDs to fetch',
|
describe: 'Reddit post IDs to fetch',
|
||||||
type: 'array'
|
type: 'array'
|
||||||
}).option('limit', {
|
}).option('limit', {
|
||||||
describe: 'Maximum amount of posts to fetch after filtering out ignored, cross- and reposts',
|
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
|
||||||
type: 'number',
|
type: 'number',
|
||||||
default: config.fetch.limit
|
default: config.fetch.limit
|
||||||
}).option('sort', {
|
}).option('sort', {
|
||||||
|
|
|
@ -2,54 +2,60 @@
|
||||||
|
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
const dissectLink = require('../dissectLink.js');
|
const dissectLink = require('../dissectLink.js');
|
||||||
|
const omit = require('object.omit');
|
||||||
|
|
||||||
function curatePosts(posts, args) {
|
const curatePosts = (userPosts, args) => {
|
||||||
const processed = new Set();
|
const processed = new Set();
|
||||||
|
|
||||||
return posts.reduce((acc, post, index) => {
|
return Object.values(userPosts).reduce((accPosts, user) => accPosts.concat(user.posts.reduce((accUserPosts, post, index) => {
|
||||||
|
// cut-off at limit, but don't count posts requested directly by ID
|
||||||
|
if(accUserPosts.length >= args.limit && !post.direct) {
|
||||||
|
return accUserPosts;
|
||||||
|
}
|
||||||
|
|
||||||
const host = dissectLink(post.url);
|
const host = dissectLink(post.url);
|
||||||
post.permalink = 'https://reddit.com' + post.permalink;
|
const permalink = 'https://reddit.com' + post.permalink;
|
||||||
|
|
||||||
const ignoring = args.ignore ? args.ignore.find(prop => {
|
const ignoring = args.ignore ? args.ignore.find(prop => {
|
||||||
return post[prop];
|
return post[prop];
|
||||||
}) : null;
|
}) : null;
|
||||||
|
|
||||||
if(ignoring) {
|
if(ignoring) {
|
||||||
console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${post.permalink})`);
|
console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`);
|
||||||
|
|
||||||
return acc;
|
return accUserPosts;
|
||||||
}
|
}
|
||||||
|
|
||||||
if(host) {
|
if(host) {
|
||||||
if(config.fetch.avoidDuplicates && processed.has(host.id)) {
|
if(config.fetch.avoidDuplicates && processed.has(host.id)) {
|
||||||
console.log('\x1b[33m%s\x1b[0m', `Ignoring cross-post or repost '${post.url}' (${post.permalink})`);
|
console.log('\x1b[33m%s\x1b[0m', `Ignoring duplicate content '${post.url}' (cross-post, repost, or --post ID was already included) (${permalink})`);
|
||||||
|
|
||||||
return acc;
|
return accUserPosts;
|
||||||
}
|
}
|
||||||
|
|
||||||
if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
|
if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
|
||||||
console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${post.permalink})`);
|
console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`);
|
||||||
|
|
||||||
return acc;
|
return accUserPosts;
|
||||||
}
|
}
|
||||||
|
|
||||||
processed.add(host.id);
|
processed.add(host.id);
|
||||||
}
|
}
|
||||||
|
|
||||||
return acc.concat({
|
return accUserPosts.concat({
|
||||||
id: post.id,
|
id: post.id,
|
||||||
index: index,
|
index: index,
|
||||||
title: post.title,
|
title: post.title,
|
||||||
text: post.selftext,
|
text: post.selftext,
|
||||||
user: post.user,
|
user: omit(user, ['posts']),
|
||||||
permalink: post.permalink,
|
permalink,
|
||||||
url: post.url,
|
url: post.url,
|
||||||
datetime: new Date(post.created_utc * 1000),
|
datetime: new Date(post.created_utc * 1000),
|
||||||
subreddit: post.subreddit.display_name,
|
subreddit: post.subreddit.display_name,
|
||||||
preview: post.preview ? post.preview.images.map(image => image.source) : null,
|
preview: post.preview ? post.preview.images.map(image => image.source) : null,
|
||||||
host
|
host
|
||||||
});
|
});
|
||||||
}, []);
|
}, [])), []);
|
||||||
};
|
};
|
||||||
|
|
||||||
module.exports = curatePosts;
|
module.exports = curatePosts;
|
||||||
|
|
|
@ -1,40 +1,48 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const Promise = require('bluebird');
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
|
|
||||||
const curateUser = require('../curate/user.js');
|
const curateUser = require('../curate/user.js');
|
||||||
const saveProfileDetails = require('../save/profileDetails.js');
|
const saveProfileDetails = require('../save/profileDetails.js');
|
||||||
|
|
||||||
function getPostsWrap(reddit, args) {
|
const getUser = async (username, reddit) => {
|
||||||
return function getPosts(postIds) {
|
try {
|
||||||
return postIds.reduce((chain, postId) => {
|
const user = await reddit.getUser(username).fetch();
|
||||||
return chain.then(acc => {
|
|
||||||
return reddit.getSubmission(postId).fetch().then(post => ({post, acc}));
|
return curateUser(user);
|
||||||
}).then(({post, acc}) => {
|
} catch(error) {
|
||||||
if(acc.users[post.author.name]) {
|
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
||||||
return {post, acc, user: acc.users[post.author.name]}
|
|
||||||
|
return {
|
||||||
|
name: username,
|
||||||
|
fallback: true
|
||||||
|
};
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const getPostsWrap = (reddit, args) => {
|
||||||
|
return function getPosts(postIds, userPosts = {}) {
|
||||||
|
return Promise.reduce(postIds, (accUserPosts, postId) => Promise.resolve().then(async () => {
|
||||||
|
const post = await reddit.getSubmission(postId).fetch();
|
||||||
|
|
||||||
|
post.direct = true;
|
||||||
|
|
||||||
|
if(accUserPosts[post.author.name]) {
|
||||||
|
accUserPosts[post.author.name].posts = accUserPosts[post.author.name].posts.concat(post);
|
||||||
|
|
||||||
|
return accUserPosts;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// don't attempt to fetch deleted user
|
||||||
if(post.author.name === '[deleted]') {
|
if(post.author.name === '[deleted]') {
|
||||||
return {post, acc, user: {name: '[deleted]'}};
|
return {...accUserPosts, '[deleted]': {name: '[deleted]', deleted: true, posts: [post]}};
|
||||||
}
|
}
|
||||||
|
|
||||||
return reddit.getUser(post.author.name).fetch().then(curateUser).then(saveProfileDetails).then(user => ({post, acc, user}));
|
const user = await getUser(post.author.name);
|
||||||
}).then(({post, acc, user}) => {
|
|
||||||
post.user = user;
|
|
||||||
acc.posts.push(post);
|
|
||||||
|
|
||||||
// keep track of users to prevent fetching one user multiple times
|
return {...accUserPosts, [post.author.name]: {...user, posts: [post]}}
|
||||||
acc.users[user.name] = user;
|
}), userPosts);
|
||||||
|
|
||||||
return acc;
|
|
||||||
});
|
|
||||||
}, Promise.resolve({
|
|
||||||
posts: [],
|
|
||||||
users: {}
|
|
||||||
})).then(({posts, users}) => {
|
|
||||||
return posts;
|
|
||||||
});
|
|
||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -1,35 +1,41 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const Promise = require('bluebird');
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
|
|
||||||
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
||||||
const curateUser = require('../curate/user.js');
|
const curateUser = require('../curate/user.js');
|
||||||
const saveProfileDetails = require('../save/profileDetails.js');
|
const saveProfileDetails = require('../save/profileDetails.js');
|
||||||
|
|
||||||
const getUser = (username, reddit) => {
|
const getUser = async (username, reddit) => {
|
||||||
return reddit.getUser(username).fetch().then(user => curateUser(user)).catch(error => {
|
try {
|
||||||
|
const user = await reddit.getUser(username).fetch();
|
||||||
|
|
||||||
|
return curateUser(user);
|
||||||
|
} catch(error) {
|
||||||
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
name: username,
|
name: username,
|
||||||
fallback: true
|
fallback: true
|
||||||
};
|
};
|
||||||
});
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const getPosts = (username, reddit, args) => {
|
const getPosts = async (username, reddit, args) => {
|
||||||
return reddit.getUser(username).getSubmissions({
|
try {
|
||||||
|
return await reddit.getUser(username).getSubmissions({
|
||||||
sort: args.sort,
|
sort: args.sort,
|
||||||
limit: Infinity
|
limit: Infinity
|
||||||
}).catch(error => {
|
});
|
||||||
|
} catch(error) {
|
||||||
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
||||||
|
|
||||||
return [];
|
return [];
|
||||||
});
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const getUserPostsWrap = (reddit, args) => usernames => Promise.all(
|
const getUserPostsWrap = (reddit, args) => usernames => Promise.props(usernames.reduce((userPosts, username) => {
|
||||||
usernames.map(async username => {
|
userPosts[username] = (async () => {
|
||||||
const [user, posts] = await Promise.all([
|
const [user, posts] = await Promise.all([
|
||||||
getUser(username, reddit),
|
getUser(username, reddit),
|
||||||
getPosts(username, reddit, args)
|
getPosts(username, reddit, args)
|
||||||
|
@ -46,8 +52,10 @@ const getUserPostsWrap = (reddit, args) => usernames => Promise.all(
|
||||||
posts.push(...archivedPosts);
|
posts.push(...archivedPosts);
|
||||||
}
|
}
|
||||||
|
|
||||||
return posts.map(post => Object.assign(post, {user}));
|
return {...user, posts};
|
||||||
})
|
})();
|
||||||
).then(posts => posts.flatten());
|
|
||||||
|
return userPosts;
|
||||||
|
}, {}));
|
||||||
|
|
||||||
module.exports = getUserPostsWrap;
|
module.exports = getUserPostsWrap;
|
||||||
|
|
Loading…
Reference in New Issue