Reading index file and ignoring already indexed content.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:56 +02:00
parent 952392e0d9
commit c49e1edca0
5 changed files with 44 additions and 18 deletions

View File

@ -19,7 +19,7 @@ module.exports = {
index: {
file: '$base/index',
format: 'tsv',
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'],
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'],
},
booleans: {
extracted: 'extracted-',

View File

@ -48,11 +48,12 @@ async function initApp() {
await ep.open();
await fetchSavePosts(userPosts, ep);
return ep.close();
await ep.close();
} catch (error) {
console.error(error);
}
return true;
}
initApp();

View File

@ -60,16 +60,18 @@ function curatePost(accUserPosts, post, user, index, processed, args) {
subreddit: post.subreddit.display_name,
preview: post.preview ? post.preview.images.map(image => image.source) : null,
host,
hash: hashPost(post)
hash: hashPost(post),
});
}
/**
 * Curates every user's posts by folding `curatePost` over `user.posts`, and returns a new
 * object keyed by `user.name` whose entries are the original user with `posts` replaced by
 * the curated list.
 *
 * NOTE(review): this span comes from a rendered diff, so two versions of the function are
 * interleaved below (two `const curatePosts` and two `const processed` declarations). The
 * annotations on individual lines say which version each one appears to belong to.
 *
 * @param {Object} userPosts - users keyed by name; each value needs `name` and `posts`, and the
 *   newer version additionally reads `indexed` (an array of entries carrying `hostId`).
 * @param {Object} args - passed through untouched to `curatePost`.
 * @returns {Object} users keyed by `user.name` with curated `posts`.
 */
// Older version: block-bodied arrow with a single `processed` Set created once, before iterating users.
const curatePosts = (userPosts, args) => {
const processed = new Set();
// Newer version: the reduce over users is the arrow's expression body.
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
// Seeds the set with host ids read from the user's index entries.
// NOTE(review): `user.indexed` must be an array here or `.map` throws — confirm every caller sets it.
const indexedHostIds = user.indexed.map(entry => entry.hostId); // already downloaded
// NOTE(review): this Set is created inside the reduce callback, i.e. once per user, whereas the
// older version shared one Set across all users. Presumably `curatePost` uses it to skip
// already-seen hosts — confirm that dropping cross-user sharing is intended.
const processed = new Set(indexedHostIds);
// Older version: same reduce, with the per-user post fold inlined in the returned object.
return Object.values(userPosts).reduce((accPosts, user) => {
return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } };
}, {});
};
// Newer version: the per-user fold is pulled out into `posts` before building the result entry.
const posts = user.posts.reduce((accUserPosts, post, index) =>
curatePost(accUserPosts, post, user, index, processed, args), []);
return { ...accPosts, [user.name]: { ...user, posts } };
}, {});
module.exports = curatePosts;

View File

@ -3,16 +3,16 @@
const config = require('config');
const fs = require('fs-extra');
const Promise = require('bluebird');
const csvStringify = Promise.promisify(require('csv').stringify);
const interpolate = require('../interpolate.js');
/**
 * Serialises index rows for a user as tab-separated text (with a header row taken from
 * `config.library.index.keys`) and writes them to the user's index file.
 *
 * NOTE(review): this span comes from a rendered diff, so two versions of the body are
 * interleaved below (`data` vs. `newEntries`/`fullEntries`, and two `const tsvString`
 * declarations). The annotations on individual lines say which version each one appears
 * to belong to.
 *
 * @param {Array} posts - posts to add; each configured key is interpolated against the post.
 * @param {Object} user - user the index belongs to; the newer version also reads `user.indexed`.
 * @returns {Promise} the promise returned by `fs.writeFile`.
 */
async function writeToIndex(posts, user) {
// NOTE(review): `false` is the 4th argument here but the 5th in the `$${key}` calls below —
// confirm against the signature of `interpolate` that this is the intended position.
const filename = interpolate(config.library.index.file, user, null, false);
// Older version: only the posts passed in are written.
const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
// Newer version: one row per post, one cell per configured index key.
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
// Existing index rows are appended after the new ones.
// NOTE(review): `Object.values(entry)` relies on each entry's property order and count matching
// `config.library.index.keys`; an index file written with a different key list would produce
// misaligned columns — verify.
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
// Older version: stringifies `data`.
const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys });
// Newer version: stringifies the combined rows.
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys });
// The whole file is rewritten on each call.
return fs.writeFile(filename, tsvString);
}

View File

@ -1,8 +1,13 @@
'use strict';
const config = require('config');
const Promise = require('bluebird');
const fs = require('fs-extra');
const csvParse = Promise.promisify(require('csv').parse);
const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js');
const interpolate = require('../interpolate.js');
async function getUser(username, reddit) {
try {
@ -40,24 +45,42 @@ async function getArchivedPosts(username, posts, reddit) {
return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
}
/**
 * Loads the rows of a user's existing index file.
 *
 * The path is built by interpolating `config.library.index.file` for the user. The file is
 * read as UTF-8 and parsed as tab-separated text whose first row names the columns, so every
 * row is returned as an object keyed by column name.
 *
 * All values are kept as strings. Letting the parser cast them would turn numeric-looking
 * ids (e.g. `"0123"` or `"1e5"`) into numbers, which no longer compare equal to the string
 * ids they are matched against and would be altered when the rows are written back.
 *
 * This is best-effort: a missing or unparsable index is reported as a warning and treated
 * as an empty index rather than aborting.
 *
 * @param {Object} user - user whose index should be loaded; `user.name` is used in the warning.
 * @returns {Promise<Object[]>} parsed index rows, or an empty array if the file could not be
 *   read or parsed.
 */
async function getIndexedPosts(user) {
    const indexFilePath = interpolate(config.library.index.file, user, null, null, false);

    try {
        const indexFile = await fs.readFile(indexFilePath, 'utf8');

        // Deliberately no `cast`: ids must round-trip through the index unchanged.
        return await csvParse(indexFile, { delimiter: '\t', columns: true });
    } catch (error) {
        // Yellow console output; the index is optional, so fall back to "nothing indexed yet".
        console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);

        return [];
    }
}
/**
 * Creates a `getUserPosts(usernames)` function bound to a reddit client and CLI arguments.
 *
 * For each username the returned function fetches the user and their posts concurrently,
 * optionally appends archived posts when `args.archives` is set, and produces an object keyed
 * by `user.name`. Users that end up with no posts are left out of the result.
 *
 * NOTE(review): this span comes from a rendered diff, so two versions of the body are
 * interleaved below (a `Promise.props` + async `reduce` form and a `Promise.map` + final
 * synchronous `reduce` form). The annotations on individual lines say which version each one
 * appears to belong to.
 *
 * @param {Object} reddit - client passed through to `getUser`, `getPosts` and `getArchivedPosts`.
 * @param {Object} args - options; `args.archives` enables the archive lookup.
 * @returns {Function} `getUserPosts(usernames)` resolving to users keyed by name.
 */
function getUserPostsWrap(reddit, args) {
// Older version: non-async wrapper reducing over usernames with an async callback.
return function getUserPosts(usernames) {
return Promise.props(usernames.reduce(async (userPosts, username) => {
// Newer version: each username is mapped to a user object (or null) first.
return async function getUserPosts(usernames) {
const users = await Promise.map(usernames, async (username) => {
// User details and posts do not depend on each other, so they are requested together.
const [user, posts] = await Promise.all([
getUser(username, reddit),
getPosts(username, reddit, args),
]);
// Newer version: load the rows already recorded in this user's index file.
const indexed = await getIndexedPosts(user);
if (args.archives) {
// Archived posts are appended in place to the fetched posts.
posts.push(...await getArchivedPosts(username, posts, reddit));
}
if (posts.length) {
// Older version: merges the user into the accumulator directly.
return { ...userPosts, [user.name]: { ...user, posts } };
// Newer version: returns the user together with its index rows.
return { ...user, posts, indexed };
}
// Older version: users without posts leave the accumulator unchanged.
return userPosts;
}, {}));
// Newer version: users without posts become null and are filtered out in the reduce below.
return null;
});
return users.reduce((userPosts, user) => (user ? { ...userPosts, [user.name]: user } : userPosts), {});
};
}