Reading index file and ignoring already indexed content.
This commit is contained in:
parent
952392e0d9
commit
c49e1edca0
|
@ -19,7 +19,7 @@ module.exports = {
|
|||
index: {
|
||||
file: '$base/index',
|
||||
format: 'tsv',
|
||||
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'],
|
||||
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'],
|
||||
},
|
||||
booleans: {
|
||||
extracted: 'extracted-',
|
||||
|
|
|
@ -48,11 +48,12 @@ async function initApp() {
|
|||
|
||||
await ep.open();
|
||||
await fetchSavePosts(userPosts, ep);
|
||||
|
||||
return ep.close();
|
||||
await ep.close();
|
||||
} catch (error) {
|
||||
console.error(error);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
initApp();
|
||||
|
|
|
@ -60,16 +60,18 @@ function curatePost(accUserPosts, post, user, index, processed, args) {
|
|||
subreddit: post.subreddit.display_name,
|
||||
preview: post.preview ? post.preview.images.map(image => image.source) : null,
|
||||
host,
|
||||
hash: hashPost(post)
|
||||
hash: hashPost(post),
|
||||
});
|
||||
}
|
||||
|
||||
const curatePosts = (userPosts, args) => {
|
||||
const processed = new Set();
|
||||
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
|
||||
const indexedHostIds = user.indexed.map(entry => entry.hostId); // already downloaded
|
||||
const processed = new Set(indexedHostIds);
|
||||
|
||||
return Object.values(userPosts).reduce((accPosts, user) => {
|
||||
return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } };
|
||||
const posts = user.posts.reduce((accUserPosts, post, index) =>
|
||||
curatePost(accUserPosts, post, user, index, processed, args), []);
|
||||
|
||||
return { ...accPosts, [user.name]: { ...user, posts } };
|
||||
}, {});
|
||||
};
|
||||
|
||||
module.exports = curatePosts;
|
||||
|
|
|
@ -3,16 +3,16 @@
|
|||
const config = require('config');
|
||||
const fs = require('fs-extra');
|
||||
const Promise = require('bluebird');
|
||||
|
||||
const csvStringify = Promise.promisify(require('csv').stringify);
|
||||
|
||||
const interpolate = require('../interpolate.js');
|
||||
|
||||
async function writeToIndex(posts, user) {
|
||||
const filename = interpolate(config.library.index.file, user, null, false);
|
||||
const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
|
||||
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
|
||||
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
|
||||
|
||||
const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys });
|
||||
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys });
|
||||
|
||||
return fs.writeFile(filename, tsvString);
|
||||
}
|
||||
|
|
|
@ -1,8 +1,13 @@
|
|||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const fs = require('fs-extra');
|
||||
const csvParse = Promise.promisify(require('csv').parse);
|
||||
|
||||
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
||||
const curateUser = require('../curate/user.js');
|
||||
const interpolate = require('../interpolate.js');
|
||||
|
||||
async function getUser(username, reddit) {
|
||||
try {
|
||||
|
@ -40,24 +45,42 @@ async function getArchivedPosts(username, posts, reddit) {
|
|||
return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
|
||||
}
|
||||
|
||||
/**
 * Read the TSV index file for a user and return its rows.
 *
 * The path is built by interpolating `config.library.index.file` with the
 * user. Each row is returned as an object keyed by the header line of the
 * file (`columns: true`).
 *
 * Best-effort: any failure while reading or parsing the file is logged as a
 * yellow console message and an empty list is returned instead of throwing.
 *
 * @param {Object} user - user record; `user.name` is used in the log message.
 * @returns {Promise<Object[]>} parsed index rows, or `[]` when the index could not be loaded.
 */
async function getIndexedPosts(user) {
    const indexFilePath = interpolate(config.library.index.file, user, null, null, false);

    // NOTE(review): `cast: true` lets the parser convert values to native
    // types, so numeric-looking IDs may come back as numbers rather than
    // strings - confirm that consumers comparing these values expect that.
    const parseOptions = { delimiter: '\t', columns: true, cast: true };

    try {
        const contents = await fs.readFile(indexFilePath, 'utf8');
        const entries = await csvParse(contents, parseOptions);

        return entries;
    } catch (error) {
        console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);

        return [];
    }
}
|
||||
|
||||
function getUserPostsWrap(reddit, args) {
|
||||
return function getUserPosts(usernames) {
|
||||
return Promise.props(usernames.reduce(async (userPosts, username) => {
|
||||
return async function getUserPosts(usernames) {
|
||||
const users = await Promise.map(usernames, async (username) => {
|
||||
const [user, posts] = await Promise.all([
|
||||
getUser(username, reddit),
|
||||
getPosts(username, reddit, args),
|
||||
]);
|
||||
|
||||
const indexed = await getIndexedPosts(user);
|
||||
|
||||
if (args.archives) {
|
||||
posts.push(...await getArchivedPosts(username, posts, reddit));
|
||||
}
|
||||
|
||||
if (posts.length) {
|
||||
return { ...userPosts, [user.name]: { ...user, posts } };
|
||||
return { ...user, posts, indexed };
|
||||
}
|
||||
|
||||
return userPosts;
|
||||
}, {}));
|
||||
return null;
|
||||
});
|
||||
|
||||
return users.reduce((userPosts, user) => (user ? { ...userPosts, [user.name]: user } : userPosts), {});
|
||||
};
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue