Build the user posts object after fetching the user, so that user-fetched posts and directly fetched posts are added under the same user key. Refactored to make better use of functions. Moved the profile-detail saving call into the content fetch. No longer attempts (and fails) to save profile details for deleted users, whose directory would not exist.

This commit is contained in:
ThePendulum 2018-06-17 01:11:10 +02:00
parent 3b5d886da3
commit 7cf1a99915
11 changed files with 199 additions and 157 deletions

View File

@ -19,7 +19,7 @@ module.exports = {
index: { index: {
file: '$base/index', file: '$base/index',
format: 'tsv', format: 'tsv',
keys: ['postId', 'postTitle', 'subreddit', 'postDate', 'url'], keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'],
}, },
booleans: { booleans: {
extracted: 'extracted-', extracted: 'extracted-',

10
package-lock.json generated
View File

@ -1870,16 +1870,6 @@
"harmony-reflect": "1.6.0" "harmony-reflect": "1.6.0"
} }
}, },
"promise.prototype.finally": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/promise.prototype.finally/-/promise.prototype.finally-3.1.0.tgz",
"integrity": "sha512-7p/K2f6dI+dM8yjRQEGrTQs5hTQixUAdOGpMEA3+pVxpX5oHKRSKAXyLw9Q9HUWDTdwtoo39dSHGQtN90HcEwQ==",
"requires": {
"define-properties": "1.1.2",
"es-abstract": "1.11.0",
"function-bind": "1.1.1"
}
},
"pseudomap": { "pseudomap": {
"version": "1.0.2", "version": "1.0.2",
"resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz", "resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz",

View File

@ -42,7 +42,6 @@
"node-fetch": "^2.1.2", "node-fetch": "^2.1.2",
"object.omit": "^3.0.0", "object.omit": "^3.0.0",
"object.pick": "^1.3.0", "object.pick": "^1.3.0",
"promise.prototype.finally": "^3.1.0",
"snoowrap": "^1.15.2", "snoowrap": "^1.15.2",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
"yargs": "^11.0.0" "yargs": "^11.0.0"

View File

@ -1,49 +1,58 @@
'use strict'; 'use strict';
const config = require('config'); const config = require('config');
const util = require('util'); const Snoowrap = require('snoowrap');
const fs = require('fs-extra');
const snoowrap = require('snoowrap');
const omit = require('object.omit');
const exiftool = require('node-exiftool'); const exiftool = require('node-exiftool');
const exiftoolBin = require('dist-exiftool'); const exiftoolBin = require('dist-exiftool');
require('promise.prototype.finally').shim();
require('array.prototype.flatten').shim(); require('array.prototype.flatten').shim();
const reddit = new snoowrap(config.reddit.api); const reddit = new Snoowrap(config.reddit.api);
const args = require('./cli.js'); const args = require('./cli.js')();
const curatePosts = require('./curate/posts.js'); const curatePosts = require('./curate/posts.js');
const interpolate = require('./interpolate.js');
const attachContentInfo = require('./fetch/info.js'); const attachContentInfo = require('./fetch/info.js');
const fetchSaveContent = require('./fetch/content.js'); const fetchSaveContent = require('./fetch/content.js');
const getPosts = require('./sources/getPosts.js')(reddit, args); const getPosts = require('./sources/getPosts.js')(reddit, args);
const getUserPosts = require('./sources/getUserPosts.js')(reddit, args); const getUserPosts = require('./sources/getUserPosts.js')(reddit, args);
if(!(args.users && args.users.length) && !(args.posts && args.posts.length)) { async function getCompleteUserPosts() {
return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user <user> or one post with --post <post-id>. See --help for more options.'); let userPosts = await getUserPosts(args.users);
}
const ep = new exiftool.ExiftoolProcess(exiftoolBin); if (args.posts) {
Promise.resolve().then(async () => {
const initUsers = args.users ? args.users.reduce((acc, username) => ({...acc, [username]: {name: username, posts: []}}), {}) : {};
let userPosts = await getUserPosts(initUsers);
if(args.posts) {
userPosts = await getPosts(args.posts, userPosts); userPosts = await getPosts(args.posts, userPosts);
} }
const curatedUserPosts = curatePosts(userPosts, args); const curatedUserPosts = curatePosts(userPosts, args);
const infoUserPosts = await attachContentInfo(curatedUserPosts);
await ep.open(); return attachContentInfo(curatedUserPosts);
await Promise.all(Object.values(infoUserPosts).map(user => fetchSaveContent(user, ep))); }
await ep.close();
}).catch(error => { function fetchSavePosts(userPosts, ep) {
return console.error(error); return Promise.all(Object.values(userPosts).map(user => fetchSaveContent(user, ep)));
}); }
async function initApp() {
const usersProvided = args.users && args.users.length;
const postIdsProvided = args.posts && args.posts.length;
if (!usersProvided && !postIdsProvided) {
return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user or post ID. See --help for more details.');
}
try {
const userPosts = await getCompleteUserPosts();
const ep = new exiftool.ExiftoolProcess(exiftoolBin);
await ep.open();
await fetchSavePosts(userPosts, ep);
return ep.close();
} catch (error) {
console.error(error);
}
}
initApp();

View File

@ -5,12 +5,12 @@ const config = require('config');
const archives = require('./archives.js'); const archives = require('./archives.js');
function getArchivePostIds(username, exclude) { function getArchivePostIds(username, exclude) {
console.log('Searching archives for posts...'); console.log(`Finding archived posts for '${username}'...`);
return Promise.all(config.fetch.archives.reddit.map(source => archives[source](username))).then(postIds => postIds.flatten()).then(postIds => { return Promise.all(config.fetch.archives.reddit.map(source => archives[source](username))).then(postIds => postIds.flatten()).then(postIds => {
return exclude ? postIds.filter(postId => !exclude.includes(postId)) : postIds; return exclude ? postIds.filter(postId => !exclude.includes(postId)) : postIds;
}).then(postIds => { }).then(postIds => {
console.log(`Found ${postIds.length} unique archived posts`); console.log(`Found ${postIds.length} unique archived posts for user '${username}'`);
return postIds; return postIds;
}); });

View File

@ -3,34 +3,38 @@
const config = require('config'); const config = require('config');
const yargs = require('yargs'); const yargs = require('yargs');
module.exports = yargs.command('npm start -- --user <username>').option('users', { function getArgs() {
alias: 'user', return yargs.command('npm start -- --user <username>').option('users', {
describe: 'Reddit usernames to fetch posts from', alias: 'user',
type: 'array' describe: 'Reddit usernames to fetch posts from',
}).option('posts', { type: 'array',
alias: 'post', }).option('posts', {
describe: 'Reddit post IDs to fetch', alias: 'post',
type: 'array' describe: 'Reddit post IDs to fetch',
}).option('limit', { type: 'array',
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', }).option('limit', {
type: 'number', describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
default: config.fetch.limit type: 'number',
}).option('sort', { default: config.fetch.limit,
describe: 'Property to sort posts by', }).option('sort', {
choices: ['new', 'top', 'hot', 'controversial'], describe: 'Property to sort posts by',
default: config.fetch.sort choices: ['new', 'top', 'hot', 'controversial'],
}).option('ignore', { default: config.fetch.sort,
describe: 'Ignore posts with any of these properties', }).option('ignore', {
type: 'array', describe: 'Ignore posts with any of these properties',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'] type: 'array',
}).option('include', { choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
describe: 'Include only these sources', }).option('include', {
type: 'array' describe: 'Include only these sources',
}).option('exclude', { type: 'array',
describe: 'Do not include these sources', }).option('exclude', {
type: 'array' describe: 'Do not include these sources',
}).option('archives', { type: 'array',
describe: 'Search archives for deleted posts', }).option('archives', {
type: 'boolean', describe: 'Search archives for deleted posts',
default: config.fetch.archives.search type: 'boolean',
}).argv; default: config.fetch.archives.search,
}).argv;
}
module.exports = getArgs;

View File

@ -1,63 +1,75 @@
'use strict'; 'use strict';
const config = require('config'); const config = require('config');
const dissectLink = require('../dissectLink.js');
const omit = require('object.omit'); const omit = require('object.omit');
const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js'); const hashPost = require('./hashPost.js');
function curatePost(accUserPosts, post, user, index, processed, args) {
// cut-off at limit, but don't count posts requested directly by ID
if (accUserPosts.length >= args.limit && !post.direct) {
return accUserPosts;
}
const host = dissectLink(post.url);
const permalink = `https://reddit.com${post.permalink}`;
const ignoring = args.ignore ? args.ignore.find(prop => post[prop]) : null;
if (ignoring) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`);
return accUserPosts;
}
if (host) {
const hostIncludes = args.include && !args.include.includes(host.label);
const hostExcluded = args.exclude && args.exclude.includes(host.label);
if (hostIncludes || hostExcluded) {
console.log(
'\x1b[33m%s\x1b[0m',
`Ignoring source '${host.label}' from post '${post.url}' (${permalink})`,
);
return accUserPosts;
}
if (config.fetch.avoidDuplicates && processed.has(host.id)) {
console.log(
'\x1b[33m%s\x1b[0m',
`Ignoring duplicate content '${post.url}' (cross-post, repost, or superfluous --post ID) (${permalink})`,
);
return accUserPosts;
}
processed.add(host.id);
}
return accUserPosts.concat({
id: post.id,
index,
title: post.title,
text: post.selftext,
user: omit(user, ['posts']),
permalink,
url: post.url,
datetime: new Date(post.created_utc * 1000),
subreddit: post.subreddit.display_name,
preview: post.preview ? post.preview.images.map(image => image.source) : null,
host,
hash: hashPost(post)
});
}
const curatePosts = (userPosts, args) => { const curatePosts = (userPosts, args) => {
const processed = new Set(); const processed = new Set();
return Object.values(userPosts).reduce((accPosts, user) => ({...accPosts, [user.name]: {...user, posts: user.posts.reduce((accUserPosts, post, index) => { return Object.values(userPosts).reduce((accPosts, user) => {
// cut-off at limit, but don't count posts requested directly by ID return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } };
if(accUserPosts.length >= args.limit && !post.direct) { }, {});
return accUserPosts;
}
const host = dissectLink(post.url);
const permalink = 'https://reddit.com' + post.permalink;
const ignoring = args.ignore ? args.ignore.find(prop => {
return post[prop];
}) : null;
if(ignoring) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`);
return accUserPosts;
}
if(host) {
if(config.fetch.avoidDuplicates && processed.has(host.id)) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring duplicate content '${post.url}' (cross-post, repost, or --post ID was already included) (${permalink})`);
return accUserPosts;
}
if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`);
return accUserPosts;
}
processed.add(host.id);
}
return accUserPosts.concat({
id: post.id,
index: index,
title: post.title,
text: post.selftext,
user: omit(user, ['posts']),
permalink,
url: post.url,
datetime: new Date(post.created_utc * 1000),
subreddit: post.subreddit.display_name,
preview: post.preview ? post.preview.images.map(image => image.source) : null,
host,
hash: hashPost(post)
});
}, [])}}), {});
}; };
module.exports = curatePosts; module.exports = curatePosts;

View File

@ -1,7 +1,9 @@
const fs = require('fs-extra'); 'use strict';
const config = require('config'); const config = require('config');
const Promise = require('bluebird'); const Promise = require('bluebird');
const saveProfileDetails = require('../save/profileDetails.js');
const fetchItem = require('./item.js'); const fetchItem = require('./item.js');
const interpolate = require('../interpolate.js'); const interpolate = require('../interpolate.js');
const save = require('../save/save.js'); const save = require('../save/save.js');
@ -25,7 +27,7 @@ async function getStreams(item, post) {
return null; return null;
} }
async function addMeta(filepath, ep, item, post, user) { async function addMeta(filepath, item, post, user, ep) {
const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => { const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => {
const interpolatedValue = interpolate(value, user, post, item); const interpolatedValue = interpolate(value, user, post, item);
@ -46,13 +48,16 @@ function getFilepath(item, post, user) {
} }
async function fetchSaveContent(user, ep) { async function fetchSaveContent(user, ep) {
// async, nothing depends on its success so don't await
saveProfileDetails(user);
const posts = await Promise.map(user.posts, async (post) => { const posts = await Promise.map(user.posts, async (post) => {
await Promise.reduce(post.content.items, async (accItems, originalItem, index) => { await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
const item = { ...originalItem, index }; const item = { ...originalItem, index };
const streams = await getStreams(item, post); const streams = await getStreams(item, post);
// no streams, ignore item // no streams, ignore item
if (streams.length <= 0) { if (!streams || streams.length <= 0) {
return accItems; return accItems;
} }
@ -63,7 +68,7 @@ async function fetchSaveContent(user, ep) {
await mux(filepath, sourcePaths, item); await mux(filepath, sourcePaths, item);
} }
await addMeta(filepath, ep, item, post, user); await addMeta(filepath, item, post, user, ep);
return sourcePaths; return sourcePaths;
}, []); }, []);

View File

@ -9,7 +9,7 @@ const textToStream = require('./textToStream.js');
const save = require('./save.js'); const save = require('./save.js');
function saveProfileDetails(user) { function saveProfileDetails(user) {
if(config.library.profile.image && !user.fallback) { if(config.library.profile.image && !user.fallback && !user.deleted) {
const image = user.profile ? user.profile.image : user.image; const image = user.profile ? user.profile.image : user.image;
if(config.library.profile.avoidAvatar && new urlPattern('http(s)\\://(www.)redditstatic.com/avatars/:id(.:ext)(?:query)').match(image)) { if(config.library.profile.avoidAvatar && new urlPattern('http(s)\\://(www.)redditstatic.com/avatars/:id(.:ext)(?:query)').match(image)) {
@ -26,7 +26,7 @@ function saveProfileDetails(user) {
} }
} }
if(config.library.profile.description && !user.fallback) { if(config.library.profile.description && !user.fallback && !user.deleted) {
if(user.profile && user.profile.description) { if(user.profile && user.profile.description) {
const filepath = interpolate(config.library.profile.description, user); const filepath = interpolate(config.library.profile.description, user);
const stream = textToStream(user.profile.description); const stream = textToStream(user.profile.description);

20
src/save/writeToIndex.js Normal file
View File

@ -0,0 +1,20 @@
'use strict';
const config = require('config');
const fs = require('fs-extra');
const Promise = require('bluebird');
const csvStringify = Promise.promisify(require('csv').stringify);
const interpolate = require('../interpolate.js');
async function writeToIndex(posts, user) {
const filename = interpolate(config.library.index.file, user, null, false);
const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys });
return fs.writeFile(filename, tsvString);
}
module.exports = writeToIndex;

View File

@ -1,61 +1,64 @@
'use strict'; 'use strict';
const Promise = require('bluebird'); const Promise = require('bluebird');
const config = require('config');
const getArchivePostIds = require('../archives/getArchivePostIds.js'); const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js'); const curateUser = require('../curate/user.js');
const saveProfileDetails = require('../save/profileDetails.js');
const getUser = async (username, reddit) => { async function getUser(username, reddit) {
try { try {
const user = await reddit.getUser(username).fetch(); const user = await reddit.getUser(username).fetch();
return curateUser(user); return curateUser(user);
} catch(error) { } catch (error) {
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
return { return {
name: username, name: username,
fallback: true fallback: true,
}; };
} }
}; }
const getPosts = async (username, reddit, args) => { async function getPosts(username, reddit, args) {
try { try {
return await reddit.getUser(username).getSubmissions({ const user = await reddit.getUser(username).getSubmissions({
sort: args.sort, sort: args.sort,
limit: Infinity limit: Infinity,
}); });
} catch(error) {
return user;
} catch (error) {
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
return []; return [];
} }
}; }
const getUserPostsWrap = (reddit, args) => users => Promise.props(Object.entries(users).reduce((userPosts, [username, user]) => { async function getArchivedPosts(username, posts, reddit) {
userPosts[username] = (async () => { const postIds = await getArchivePostIds(username, posts.map(post => post.id));
const [user, posts] = await Promise.all([
getUser(username, reddit),
getPosts(username, reddit, args)
]);
if(user) { return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
saveProfileDetails(user); }
}
if(args.archives) { function getUserPostsWrap(reddit, args) {
const postIds = await getArchivePostIds(username, posts.map(post => post.id)); return function getUserPosts(usernames) {
const archivedPosts = await Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); return Promise.props(usernames.reduce(async (userPosts, username) => {
const [user, posts] = await Promise.all([
getUser(username, reddit),
getPosts(username, reddit, args),
]);
posts.push(...archivedPosts); if (args.archives) {
} posts.push(...await getArchivedPosts(username, posts, reddit));
}
return {...user, posts}; if (posts.length) {
})(); return { ...userPosts, [user.name]: { ...user, posts } };
}
return userPosts; return userPosts;
}, {})); }, {}));
};
}
module.exports = getUserPostsWrap; module.exports = getUserPostsWrap;