Reading index file and ignoring already indexed content.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:56 +02:00
parent 952392e0d9
commit c49e1edca0
5 changed files with 44 additions and 18 deletions

View File

@ -19,7 +19,7 @@ module.exports = {
index: { index: {
file: '$base/index', file: '$base/index',
format: 'tsv', format: 'tsv',
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'], keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'],
}, },
booleans: { booleans: {
extracted: 'extracted-', extracted: 'extracted-',

View File

@ -48,11 +48,12 @@ async function initApp() {
await ep.open(); await ep.open();
await fetchSavePosts(userPosts, ep); await fetchSavePosts(userPosts, ep);
await ep.close();
return ep.close();
} catch (error) { } catch (error) {
console.error(error); console.error(error);
} }
return true;
} }
initApp(); initApp();

View File

@ -60,16 +60,18 @@ function curatePost(accUserPosts, post, user, index, processed, args) {
subreddit: post.subreddit.display_name, subreddit: post.subreddit.display_name,
preview: post.preview ? post.preview.images.map(image => image.source) : null, preview: post.preview ? post.preview.images.map(image => image.source) : null,
host, host,
hash: hashPost(post) hash: hashPost(post),
}); });
} }
/**
 * Curate the fetched posts of every user.
 *
 * Each user's posts are folded through `curatePost`, sharing one `processed`
 * set per user. The set is seeded with the host IDs found in that user's
 * index, i.e. content that was already downloaded.
 *
 * @param {Object} userPosts - Users keyed by name; each user carries `name`,
 *   `posts` and (optionally) `indexed` rows read from the index file.
 * @param {Object} args - Command line arguments, passed through to `curatePost`.
 * @returns {Object} Users keyed by name, with `posts` replaced by the curated list.
 */
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
    // A user without a loaded index simply has nothing pre-processed.
    const indexedHostIds = (user.indexed || []).map(entry => entry.hostId); // already downloaded
    // NOTE(review): presumably curatePost skips posts whose host ID is in this set — confirm there.
    const processed = new Set(indexedHostIds);

    const posts = user.posts.reduce(
        (accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args),
        [],
    );

    return { ...accPosts, [user.name]: { ...user, posts } };
}, {});
module.exports = curatePosts; module.exports = curatePosts;

View File

@ -3,16 +3,16 @@
const config = require('config'); const config = require('config');
const fs = require('fs-extra'); const fs = require('fs-extra');
const Promise = require('bluebird'); const Promise = require('bluebird');
const csvStringify = Promise.promisify(require('csv').stringify); const csvStringify = Promise.promisify(require('csv').stringify);
const interpolate = require('../interpolate.js'); const interpolate = require('../interpolate.js');
/**
 * Rewrite the user's index file with the newly processed posts followed by the
 * entries that were already present in the index.
 *
 * @param {Object[]} posts - Newly processed posts to add to the index.
 * @param {Object} user - User the index belongs to; `user.indexed` holds the
 *   rows previously parsed from the index file, as objects keyed by column name.
 * @returns {Promise} Settles once the index file has been written.
 */
async function writeToIndex(posts, user) {
    const { file, keys } = config.library.index;
    const filename = interpolate(file, user, null, false);

    const newEntries = posts.map(post => keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));

    // Pick values by configured key instead of relying on Object.values(): an index
    // written with a different key set or column order (e.g. before 'hostId' was
    // added) would otherwise end up with values under the wrong header.
    const indexedEntries = (user.indexed || []).map(entry => keys.map(key => entry[key]));

    const fullEntries = newEntries.concat(indexedEntries);
    const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: keys });

    return fs.writeFile(filename, tsvString);
}

View File

@ -1,8 +1,13 @@
'use strict'; 'use strict';
const config = require('config');
const Promise = require('bluebird'); const Promise = require('bluebird');
const fs = require('fs-extra');
const csvParse = Promise.promisify(require('csv').parse);
const getArchivePostIds = require('../archives/getArchivePostIds.js'); const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js'); const curateUser = require('../curate/user.js');
const interpolate = require('../interpolate.js');
async function getUser(username, reddit) { async function getUser(username, reddit) {
try { try {
@ -40,24 +45,42 @@ async function getArchivedPosts(username, posts, reddit) {
return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch()));
} }
/**
 * Load the entries already recorded in a user's index file.
 *
 * The index is read as tab-separated text whose first row supplies the column
 * names, so every entry is returned as an object keyed by column. Any failure
 * to read or parse the file is logged as a warning and yields an empty list.
 *
 * @param {Object} user - User whose index file should be read.
 * @returns {Promise<Object[]>} Parsed index entries, or `[]` when unavailable.
 */
async function getIndexedPosts(user) {
    const indexFilePath = interpolate(config.library.index.file, user, null, null, false);
    const parseOptions = { delimiter: '\t', columns: true, cast: true };

    let entries;

    try {
        const contents = await fs.readFile(indexFilePath, 'utf8');

        entries = await csvParse(contents, parseOptions);
    } catch (error) {
        // Best effort: warn in yellow and carry on without an index.
        console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);

        entries = [];
    }

    return entries;
}
/**
 * Build a `getUserPosts` function bound to a reddit client and CLI arguments.
 *
 * @param {Object} reddit - Reddit API client handed to the fetch helpers.
 * @param {Object} args - Command line arguments; `args.archives` additionally
 *   pulls posts from the archives.
 * @returns {Function} Async `getUserPosts(usernames)` resolving to an object of
 *   users keyed by name, each carrying its `posts` and its `indexed` entries.
 *   Users without any posts are left out.
 */
function getUserPostsWrap(reddit, args) {
    return async function getUserPosts(usernames) {
        // Resolve every user independently, then assemble the result synchronously;
        // accumulating inside an async reducer would spread a pending promise.
        const users = await Promise.map(usernames, async (username) => {
            const [user, posts] = await Promise.all([
                getUser(username, reddit),
                getPosts(username, reddit, args),
            ]);

            const indexed = await getIndexedPosts(user);

            if (args.archives) {
                posts.push(...await getArchivedPosts(username, posts, reddit));
            }

            if (posts.length) {
                return { ...user, posts, indexed };
            }

            // Nothing to process for this user; dropped in the reduce below.
            return null;
        });

        return users.reduce((userPosts, user) => (user ? { ...userPosts, [user.name]: user } : userPosts), {});
    };
}