Reading index file and ignoring already indexed content.
This commit is contained in:
parent
952392e0d9
commit
c49e1edca0
|
@ -19,7 +19,7 @@ module.exports = {
|
||||||
index: {
|
index: {
|
||||||
file: '$base/index',
|
file: '$base/index',
|
||||||
format: 'tsv',
|
format: 'tsv',
|
||||||
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'],
|
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'],
|
||||||
},
|
},
|
||||||
booleans: {
|
booleans: {
|
||||||
extracted: 'extracted-',
|
extracted: 'extracted-',
|
||||||
|
|
|
@ -48,11 +48,12 @@ async function initApp() {
|
||||||
|
|
||||||
await ep.open();
|
await ep.open();
|
||||||
await fetchSavePosts(userPosts, ep);
|
await fetchSavePosts(userPosts, ep);
|
||||||
|
await ep.close();
|
||||||
return ep.close();
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error(error);
|
console.error(error);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
initApp();
|
initApp();
|
||||||
|
|
|
@ -60,16 +60,18 @@ function curatePost(accUserPosts, post, user, index, processed, args) {
|
||||||
subreddit: post.subreddit.display_name,
|
subreddit: post.subreddit.display_name,
|
||||||
preview: post.preview ? post.preview.images.map(image => image.source) : null,
|
preview: post.preview ? post.preview.images.map(image => image.source) : null,
|
||||||
host,
|
host,
|
||||||
hash: hashPost(post)
|
hash: hashPost(post),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const curatePosts = (userPosts, args) => {
|
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
|
||||||
const processed = new Set();
|
const indexedHostIds = user.indexed.map(entry => entry.hostId); // already downloaded
|
||||||
|
const processed = new Set(indexedHostIds);
|
||||||
|
|
||||||
return Object.values(userPosts).reduce((accPosts, user) => {
|
const posts = user.posts.reduce((accUserPosts, post, index) =>
|
||||||
return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } };
|
curatePost(accUserPosts, post, user, index, processed, args), []);
|
||||||
}, {});
|
|
||||||
};
|
return { ...accPosts, [user.name]: { ...user, posts } };
|
||||||
|
}, {});
|
||||||
|
|
||||||
module.exports = curatePosts;
|
module.exports = curatePosts;
|
||||||
|
|
|
@ -3,16 +3,16 @@
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
const fs = require('fs-extra');
|
const fs = require('fs-extra');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
|
|
||||||
const csvStringify = Promise.promisify(require('csv').stringify);
|
const csvStringify = Promise.promisify(require('csv').stringify);
|
||||||
|
|
||||||
const interpolate = require('../interpolate.js');
|
const interpolate = require('../interpolate.js');
|
||||||
|
|
||||||
async function writeToIndex(posts, user) {
|
async function writeToIndex(posts, user) {
|
||||||
const filename = interpolate(config.library.index.file, user, null, false);
|
const filename = interpolate(config.library.index.file, user, null, false);
|
||||||
const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
|
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
|
||||||
|
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
|
||||||
|
|
||||||
const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys });
|
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys });
|
||||||
|
|
||||||
return fs.writeFile(filename, tsvString);
|
return fs.writeFile(filename, tsvString);
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,8 +1,13 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const config = require('config');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
|
const fs = require('fs-extra');
|
||||||
|
const csvParse = Promise.promisify(require('csv').parse);
|
||||||
|
|
||||||
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
||||||
const curateUser = require('../curate/user.js');
|
const curateUser = require('../curate/user.js');
|
||||||
|
const interpolate = require('../interpolate.js');
|
||||||
|
|
||||||
async function getUser(username, reddit) {
|
async function getUser(username, reddit) {
|
||||||
try {
|
try {
|
||||||
|
@ -40,24 +45,42 @@ async function getArchivedPosts(username, posts, reddit) {
|
||||||
return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
|
return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function getIndexedPosts(user) {
|
||||||
|
const indexFilePath = interpolate(config.library.index.file, user, null, null, false);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const indexFile = await fs.readFile(indexFilePath, 'utf8');
|
||||||
|
|
||||||
|
return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true });
|
||||||
|
} catch (error) {
|
||||||
|
console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);
|
||||||
|
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
function getUserPostsWrap(reddit, args) {
|
function getUserPostsWrap(reddit, args) {
|
||||||
return function getUserPosts(usernames) {
|
return async function getUserPosts(usernames) {
|
||||||
return Promise.props(usernames.reduce(async (userPosts, username) => {
|
const users = await Promise.map(usernames, async (username) => {
|
||||||
const [user, posts] = await Promise.all([
|
const [user, posts] = await Promise.all([
|
||||||
getUser(username, reddit),
|
getUser(username, reddit),
|
||||||
getPosts(username, reddit, args),
|
getPosts(username, reddit, args),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
const indexed = await getIndexedPosts(user);
|
||||||
|
|
||||||
if (args.archives) {
|
if (args.archives) {
|
||||||
posts.push(...await getArchivedPosts(username, posts, reddit));
|
posts.push(...await getArchivedPosts(username, posts, reddit));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (posts.length) {
|
if (posts.length) {
|
||||||
return { ...userPosts, [user.name]: { ...user, posts } };
|
return { ...user, posts, indexed };
|
||||||
}
|
}
|
||||||
|
|
||||||
return userPosts;
|
return null;
|
||||||
}, {}));
|
});
|
||||||
|
|
||||||
|
return users.reduce((userPosts, user) => (user ? { ...userPosts, [user.name]: user } : userPosts), {});
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue