Build the user posts object after fetching the user, so that posts fetched via the user and posts fetched directly by ID are added under the same user key. Refactored to make better use of functions. Moved the profile-detail saving call into the content fetch. No longer attempts (and fails) to save profile details for deleted users, whose directory would not exist.

This commit is contained in:
ThePendulum 2018-06-17 01:11:10 +02:00
parent 3b5d886da3
commit 7cf1a99915
11 changed files with 199 additions and 157 deletions

View File

@ -19,7 +19,7 @@ module.exports = {
index: {
file: '$base/index',
format: 'tsv',
keys: ['postId', 'postTitle', 'subreddit', 'postDate', 'url'],
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'],
},
booleans: {
extracted: 'extracted-',

10
package-lock.json generated
View File

@ -1870,16 +1870,6 @@
"harmony-reflect": "1.6.0"
}
},
"promise.prototype.finally": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/promise.prototype.finally/-/promise.prototype.finally-3.1.0.tgz",
"integrity": "sha512-7p/K2f6dI+dM8yjRQEGrTQs5hTQixUAdOGpMEA3+pVxpX5oHKRSKAXyLw9Q9HUWDTdwtoo39dSHGQtN90HcEwQ==",
"requires": {
"define-properties": "1.1.2",
"es-abstract": "1.11.0",
"function-bind": "1.1.1"
}
},
"pseudomap": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/pseudomap/-/pseudomap-1.0.2.tgz",

View File

@ -42,7 +42,6 @@
"node-fetch": "^2.1.2",
"object.omit": "^3.0.0",
"object.pick": "^1.3.0",
"promise.prototype.finally": "^3.1.0",
"snoowrap": "^1.15.2",
"url-pattern": "^1.0.3",
"yargs": "^11.0.0"

View File

@ -1,49 +1,58 @@
'use strict';
const config = require('config');
const util = require('util');
const fs = require('fs-extra');
const snoowrap = require('snoowrap');
const omit = require('object.omit');
const Snoowrap = require('snoowrap');
const exiftool = require('node-exiftool');
const exiftoolBin = require('dist-exiftool');
require('promise.prototype.finally').shim();
require('array.prototype.flatten').shim();
const reddit = new snoowrap(config.reddit.api);
const args = require('./cli.js');
const reddit = new Snoowrap(config.reddit.api);
const args = require('./cli.js')();
const curatePosts = require('./curate/posts.js');
const interpolate = require('./interpolate.js');
const attachContentInfo = require('./fetch/info.js');
const fetchSaveContent = require('./fetch/content.js');
const getPosts = require('./sources/getPosts.js')(reddit, args);
const getUserPosts = require('./sources/getUserPosts.js')(reddit, args);
// NOTE(review): this span looks like a rendered diff with the pre- and
// post-refactor variants interleaved (the +/- markers did not survive
// extraction), so some adjacent lines are alternative versions of each
// other rather than sequential statements. Annotations below are hedged
// accordingly — verify against the committed file.
// Pre-refactor top-level guard; presumably superseded by the input check
// inside initApp() further down — TODO confirm.
if(!(args.users && args.users.length) && !(args.posts && args.posts.length)) {
return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user with --user <user> or one post with --post <post-id>. See --help for more options.');
}
// Post-refactor: gathers posts for all requested users (and any directly
// requested post IDs), curates them, and attaches content info.
async function getCompleteUserPosts() {
// Pre-refactor line: fetched posts keyed directly off the raw --user list.
let userPosts = await getUserPosts(args.users);
// Pre-refactor: the exiftool process was created and driven right here;
// the refactor appears to move that into initApp()/fetchSavePosts().
const ep = new exiftool.ExiftoolProcess(exiftoolBin);
Promise.resolve().then(async () => {
// Post-refactor: seed one {name, posts: []} entry per requested user so
// user-fetched and directly fetched posts land under the same user key.
const initUsers = args.users ? args.users.reduce((acc, username) => ({...acc, [username]: {name: username, posts: []}}), {}) : {};
let userPosts = await getUserPosts(initUsers);
if(args.posts) {
if (args.posts) {
// Merge posts requested directly by ID into the same user map.
userPosts = await getPosts(args.posts, userPosts);
}
const curatedUserPosts = curatePosts(userPosts, args);
// Pre-refactor: content info, fetching and saving all happened inline here.
const infoUserPosts = await attachContentInfo(curatedUserPosts);
await ep.open();
await Promise.all(Object.values(infoUserPosts).map(user => fetchSaveContent(user, ep)));
await ep.close();
}).catch(error => {
return console.error(error);
});
// Post-refactor: return the curated posts with content info attached;
// saving is left to the caller.
return attachContentInfo(curatedUserPosts);
}
// Downloads and saves the content of every user's posts, all users in
// parallel, reusing a single open exiftool process (ep) across them.
function fetchSavePosts(userPosts, ep) {
  const pending = [];

  for (const user of Object.values(userPosts)) {
    pending.push(fetchSaveContent(user, ep));
  }

  return Promise.all(pending);
}
/**
 * Application entry point: validates CLI input, gathers and curates posts,
 * then fetches and saves their content with a shared exiftool process.
 * Errors are logged rather than rethrown so the process ends cleanly.
 */
async function initApp() {
  // At least one --user or --post is required to have anything to do.
  const haveInput = (args.users && args.users.length) || (args.posts && args.posts.length);

  if (!haveInput) {
    return console.log('\x1b[31m%s\x1b[0m', 'Please supply at least one user or post ID. See --help for more details.');
  }

  try {
    const userPosts = await getCompleteUserPosts();
    const ep = new exiftool.ExiftoolProcess(exiftoolBin);

    await ep.open();
    await fetchSavePosts(userPosts, ep);

    return ep.close();
  } catch (error) {
    console.error(error);
  }
}

initApp();

View File

@ -5,12 +5,12 @@ const config = require('config');
const archives = require('./archives.js');
function getArchivePostIds(username, exclude) {
console.log('Searching archives for posts...');
console.log(`Finding archived posts for '${username}'...`);
return Promise.all(config.fetch.archives.reddit.map(source => archives[source](username))).then(postIds => postIds.flatten()).then(postIds => {
return exclude ? postIds.filter(postId => !exclude.includes(postId)) : postIds;
}).then(postIds => {
console.log(`Found ${postIds.length} unique archived posts`);
console.log(`Found ${postIds.length} unique archived posts for user '${username}'`);
return postIds;
});

View File

@ -3,34 +3,38 @@
const config = require('config');
const yargs = require('yargs');
module.exports = yargs.command('npm start -- --user <username>').option('users', {
alias: 'user',
describe: 'Reddit usernames to fetch posts from',
type: 'array'
}).option('posts', {
alias: 'post',
describe: 'Reddit post IDs to fetch',
type: 'array'
}).option('limit', {
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
type: 'number',
default: config.fetch.limit
}).option('sort', {
describe: 'Property to sort posts by',
choices: ['new', 'top', 'hot', 'controversial'],
default: config.fetch.sort
}).option('ignore', {
describe: 'Ignore posts with any of these properties',
type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18']
}).option('include', {
describe: 'Include only these sources',
type: 'array'
}).option('exclude', {
describe: 'Do not include these sources',
type: 'array'
}).option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean',
default: config.fetch.archives.search
}).argv;
/**
 * Builds the CLI option definitions and parses process arguments.
 * Exported as a function so parsing happens on demand (when called),
 * rather than as a side effect of requiring this module.
 * @returns {Object} The parsed yargs argv object.
 */
function getArgs() {
  const options = {
    users: {
      alias: 'user',
      describe: 'Reddit usernames to fetch posts from',
      type: 'array',
    },
    posts: {
      alias: 'post',
      describe: 'Reddit post IDs to fetch',
      type: 'array',
    },
    limit: {
      describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
      type: 'number',
      default: config.fetch.limit,
    },
    sort: {
      describe: 'Property to sort posts by',
      choices: ['new', 'top', 'hot', 'controversial'],
      default: config.fetch.sort,
    },
    ignore: {
      describe: 'Ignore posts with any of these properties',
      type: 'array',
      choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
    },
    include: {
      describe: 'Include only these sources',
      type: 'array',
    },
    exclude: {
      describe: 'Do not include these sources',
      type: 'array',
    },
    archives: {
      describe: 'Search archives for deleted posts',
      type: 'boolean',
      default: config.fetch.archives.search,
    },
  };

  let parser = yargs.command('npm start -- --user <username>');

  // Register each option in declaration order, matching the original chain.
  for (const [name, definition] of Object.entries(options)) {
    parser = parser.option(name, definition);
  }

  return parser.argv;
}

module.exports = getArgs;

View File

@ -1,63 +1,75 @@
'use strict';
const config = require('config');
const dissectLink = require('../dissectLink.js');
const omit = require('object.omit');
const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js');
/**
 * Folds one raw reddit post into the accumulator of curated posts,
 * applying the per-user limit, --ignore, --include/--exclude and
 * duplicate-content filters. Returns the (possibly extended) accumulator.
 * @param {Array} accUserPosts - Curated posts accumulated so far.
 * @param {Object} post - Raw reddit submission.
 * @param {Object} user - Owning user (posts key is stripped before embedding).
 * @param {number} index - Position of the post in the user's post list.
 * @param {Set} processed - Host content IDs already seen (shared across users).
 * @param {Object} args - Parsed CLI arguments.
 * @returns {Array} The accumulator, with the curated post appended if it passed.
 */
function curatePost(accUserPosts, post, user, index, processed, args) {
  // Stop once the per-user limit is reached, but posts requested directly
  // by ID (post.direct) are exempt from the cut-off.
  if (accUserPosts.length >= args.limit && !post.direct) {
    return accUserPosts;
  }

  const host = dissectLink(post.url);
  const permalink = `https://reddit.com${post.permalink}`;

  // First property from --ignore that is truthy on this post, if any.
  const ignoring = args.ignore ? args.ignore.find(prop => post[prop]) : null;

  if (ignoring) {
    console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`);

    return accUserPosts;
  }

  if (host) {
    const notIncluded = args.include && !args.include.includes(host.label);
    const excluded = args.exclude && args.exclude.includes(host.label);

    if (notIncluded || excluded) {
      console.log(
        '\x1b[33m%s\x1b[0m',
        `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`,
      );

      return accUserPosts;
    }

    // Duplicate detection is keyed on the host content ID, so cross-posts
    // and reposts of the same content are caught across users.
    if (config.fetch.avoidDuplicates && processed.has(host.id)) {
      console.log(
        '\x1b[33m%s\x1b[0m',
        `Ignoring duplicate content '${post.url}' (cross-post, repost, or superfluous --post ID) (${permalink})`,
      );

      return accUserPosts;
    }

    processed.add(host.id);
  }

  return [...accUserPosts, {
    id: post.id,
    index,
    title: post.title,
    text: post.selftext,
    user: omit(user, ['posts']),
    permalink,
    url: post.url,
    datetime: new Date(post.created_utc * 1000),
    subreddit: post.subreddit.display_name,
    preview: post.preview ? post.preview.images.map(image => image.source) : null,
    host,
    hash: hashPost(post)
  }];
}
// NOTE(review): the two reduce calls below appear to be the pre- and
// post-refactor bodies of curatePosts interleaved by the diff rendering
// (the +/- markers did not survive extraction); only one of them belongs
// in the final file — verify against the committed version.
// Curates every user's posts, filtering by limit/ignore/source/duplicates.
const curatePosts = (userPosts, args) => {
// Shared across all users so duplicate content is caught run-wide.
const processed = new Set();
// Pre-refactor body: the whole per-post filter pipeline inlined here.
return Object.values(userPosts).reduce((accPosts, user) => ({...accPosts, [user.name]: {...user, posts: user.posts.reduce((accUserPosts, post, index) => {
// cut-off at limit, but don't count posts requested directly by ID
if(accUserPosts.length >= args.limit && !post.direct) {
return accUserPosts;
}
const host = dissectLink(post.url);
const permalink = 'https://reddit.com' + post.permalink;
const ignoring = args.ignore ? args.ignore.find(prop => {
return post[prop];
}) : null;
if(ignoring) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring ${ignoring} post '${post.title}' (${permalink})`);
return accUserPosts;
}
if(host) {
if(config.fetch.avoidDuplicates && processed.has(host.id)) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring duplicate content '${post.url}' (cross-post, repost, or --post ID was already included) (${permalink})`);
return accUserPosts;
}
if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${permalink})`);
return accUserPosts;
}
processed.add(host.id);
}
return accUserPosts.concat({
id: post.id,
index: index,
title: post.title,
text: post.selftext,
user: omit(user, ['posts']),
permalink,
url: post.url,
datetime: new Date(post.created_utc * 1000),
subreddit: post.subreddit.display_name,
preview: post.preview ? post.preview.images.map(image => image.source) : null,
host,
hash: hashPost(post)
});
}, [])}}), {});
// Post-refactor body: per-post logic delegated to curatePost() above.
return Object.values(userPosts).reduce((accPosts, user) => {
return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } };
}, {});
};
module.exports = curatePosts;

View File

@ -1,7 +1,9 @@
const fs = require('fs-extra');
'use strict';
const config = require('config');
const Promise = require('bluebird');
const saveProfileDetails = require('../save/profileDetails.js');
const fetchItem = require('./item.js');
const interpolate = require('../interpolate.js');
const save = require('../save/save.js');
@ -25,7 +27,7 @@ async function getStreams(item, post) {
return null;
}
async function addMeta(filepath, ep, item, post, user) {
async function addMeta(filepath, item, post, user, ep) {
const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => {
const interpolatedValue = interpolate(value, user, post, item);
@ -46,13 +48,16 @@ function getFilepath(item, post, user) {
}
async function fetchSaveContent(user, ep) {
// async, nothing depends on its success so don't await
saveProfileDetails(user);
const posts = await Promise.map(user.posts, async (post) => {
await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
const item = { ...originalItem, index };
const streams = await getStreams(item, post);
// no streams, ignore item
if (streams.length <= 0) {
if (!streams || streams.length <= 0) {
return accItems;
}
@ -63,7 +68,7 @@ async function fetchSaveContent(user, ep) {
await mux(filepath, sourcePaths, item);
}
await addMeta(filepath, ep, item, post, user);
await addMeta(filepath, item, post, user, ep);
return sourcePaths;
}, []);

View File

@ -9,7 +9,7 @@ const textToStream = require('./textToStream.js');
const save = require('./save.js');
function saveProfileDetails(user) {
if(config.library.profile.image && !user.fallback) {
if(config.library.profile.image && !user.fallback && !user.deleted) {
const image = user.profile ? user.profile.image : user.image;
if(config.library.profile.avoidAvatar && new urlPattern('http(s)\\://(www.)redditstatic.com/avatars/:id(.:ext)(?:query)').match(image)) {
@ -26,7 +26,7 @@ function saveProfileDetails(user) {
}
}
if(config.library.profile.description && !user.fallback) {
if(config.library.profile.description && !user.fallback && !user.deleted) {
if(user.profile && user.profile.description) {
const filepath = interpolate(config.library.profile.description, user);
const stream = textToStream(user.profile.description);

20
src/save/writeToIndex.js Normal file
View File

@ -0,0 +1,20 @@
'use strict';

const config = require('config');
const fs = require('fs-extra');
const Promise = require('bluebird');
const csvStringify = Promise.promisify(require('csv').stringify);

const interpolate = require('../interpolate.js');

/**
 * Renders a user's curated posts as a TSV index and writes it to the
 * file path configured at config.library.index.file.
 * @param {Array} posts - Curated posts belonging to a single user.
 * @param {Object} user - The user the posts belong to (used for interpolation).
 * @returns {Promise} Resolves when the index file has been written.
 */
async function writeToIndex(posts, user) {
  const target = interpolate(config.library.index.file, user, null, false);

  // One row per post, one column per configured index key, with dates
  // rendered in the fixed ISO-like format.
  const rows = posts.map(post => config.library.index.keys.map(
    key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'),
  ));

  const tsv = await csvStringify(rows, { delimiter: '\t', header: true, columns: config.library.index.keys });

  return fs.writeFile(target, tsv);
}

module.exports = writeToIndex;

View File

@ -1,61 +1,64 @@
'use strict';
const Promise = require('bluebird');
const config = require('config');
const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js');
const saveProfileDetails = require('../save/profileDetails.js');
// NOTE(review): this span looks like a rendered diff with pre- and
// post-refactor variants interleaved (the +/- markers did not survive
// extraction); adjacent lines are often alternative versions of each
// other rather than sequential statements. Annotations are hedged
// accordingly — verify against the committed file.
// Pre-refactor arrow form of getUser:
const getUser = async (username, reddit) => {
// Post-refactor declaration form; fetches and curates a reddit user,
// falling back to a minimal stub on failure.
async function getUser(username, reddit) {
try {
const user = await reddit.getUser(username).fetch();
return curateUser(user);
} catch(error) {
} catch (error) {
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
// Fallback user so downstream code can still key posts by name.
return {
name: username,
fallback: true
fallback: true,
};
}
};
}
// Pre-refactor arrow form of getPosts:
const getPosts = async (username, reddit, args) => {
// Post-refactor form: fetches a user's submissions, empty array on failure.
async function getPosts(username, reddit, args) {
try {
return await reddit.getUser(username).getSubmissions({
const user = await reddit.getUser(username).getSubmissions({
sort: args.sort,
limit: Infinity
limit: Infinity,
});
} catch(error) {
return user;
} catch (error) {
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
return [];
}
};
}
// Pre-refactor getUserPostsWrap (keyed off a users object, saved profile
// details inline):
const getUserPostsWrap = (reddit, args) => users => Promise.props(Object.entries(users).reduce((userPosts, [username, user]) => {
userPosts[username] = (async () => {
const [user, posts] = await Promise.all([
getUser(username, reddit),
getPosts(username, reddit, args)
]);
// Post-refactor helper: resolves archived post IDs (excluding posts
// already fetched) into full submissions.
async function getArchivedPosts(username, posts, reddit) {
const postIds = await getArchivePostIds(username, posts.map(post => post.id));
if(user) {
saveProfileDetails(user);
}
return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
}
if(args.archives) {
const postIds = await getArchivePostIds(username, posts.map(post => post.id));
const archivedPosts = await Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
// Post-refactor getUserPostsWrap: takes a list of usernames, fetches the
// user and their posts in parallel, and keys the result by user name.
function getUserPostsWrap(reddit, args) {
return function getUserPosts(usernames) {
// NOTE(review): the reduce callback is async, so from the second
// username onward `userPosts` is a pending Promise; spreading it with
// { ...userPosts } would drop earlier users. Confirm the accumulator
// is awaited (or that usernames is effectively length <= 1) in the
// committed version.
return Promise.props(usernames.reduce(async (userPosts, username) => {
const [user, posts] = await Promise.all([
getUser(username, reddit),
getPosts(username, reddit, args),
]);
posts.push(...archivedPosts);
}
if (args.archives) {
posts.push(...await getArchivedPosts(username, posts, reddit));
}
return {...user, posts};
})();
// Only include users that actually have posts.
if (posts.length) {
return { ...userPosts, [user.name]: { ...user, posts } };
}
return userPosts;
}, {}));
return userPosts;
}, {}));
};
}
module.exports = getUserPostsWrap;