From c49e1edca0f7c0b12d6be1469d381936757d20f0 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 11 Sep 2024 05:16:56 +0200 Subject: [PATCH] Reading index file and ignoring already indexed content. --- config/default.js | 2 +- src/app.js | 5 +++-- src/curate/posts.js | 16 +++++++++------- src/save/writeToIndex.js | 6 +++--- src/sources/getUserPosts.js | 33 ++++++++++++++++++++++++++++----- 5 files changed, 44 insertions(+), 18 deletions(-) diff --git a/config/default.js b/config/default.js index cf08dc2..d905767 100644 --- a/config/default.js +++ b/config/default.js @@ -19,7 +19,7 @@ module.exports = { index: { file: '$base/index', format: 'tsv', - keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url'], + keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'], }, booleans: { extracted: 'extracted-', diff --git a/src/app.js b/src/app.js index c88e656..327eb53 100644 --- a/src/app.js +++ b/src/app.js @@ -48,11 +48,12 @@ async function initApp() { await ep.open(); await fetchSavePosts(userPosts, ep); - - return ep.close(); + await ep.close(); } catch (error) { console.error(error); } + + return true; } initApp(); diff --git a/src/curate/posts.js b/src/curate/posts.js index 2cf4fe8..1e175a5 100644 --- a/src/curate/posts.js +++ b/src/curate/posts.js @@ -60,16 +60,18 @@ function curatePost(accUserPosts, post, user, index, processed, args) { subreddit: post.subreddit.display_name, preview: post.preview ? post.preview.images.map(image => image.source) : null, host, - hash: hashPost(post) + hash: hashPost(post), }); } -const curatePosts = (userPosts, args) => { - const processed = new Set(); +const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => { + const indexedHostIds = user.indexed.map(entry => entry.hostId); // already downloaded + const processed = new Set(indexedHostIds); - return Object.values(userPosts).reduce((accPosts, user) => { - return { ...accPosts, [user.name]: { ...user, posts: user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, processed, args), []) } }; - }, {}); -}; + const posts = user.posts.reduce((accUserPosts, post, index) => + curatePost(accUserPosts, post, user, index, processed, args), []); + + return { ...accPosts, [user.name]: { ...user, posts } }; +}, {}); module.exports = curatePosts; diff --git a/src/save/writeToIndex.js b/src/save/writeToIndex.js index 499addb..494bf82 100644 --- a/src/save/writeToIndex.js +++ b/src/save/writeToIndex.js @@ -3,16 +3,16 @@ const config = require('config'); const fs = require('fs-extra'); const Promise = require('bluebird'); - const csvStringify = Promise.promisify(require('csv').stringify); const interpolate = require('../interpolate.js'); async function writeToIndex(posts, user) { const filename = interpolate(config.library.index.file, user, null, false); - const data = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'))); + const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'))); + const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry))); - const tsvString = await csvStringify(data, { delimiter: '\t', header: true, columns: config.library.index.keys }); + const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys }); return fs.writeFile(filename, tsvString); } diff --git a/src/sources/getUserPosts.js b/src/sources/getUserPosts.js index acd0fe7..002a953 100644 --- a/src/sources/getUserPosts.js +++ b/src/sources/getUserPosts.js @@ -1,8 +1,13 @@ 'use strict'; +const config = require('config'); const Promise = require('bluebird'); +const fs = require('fs-extra'); +const csvParse = Promise.promisify(require('csv').parse); + const getArchivePostIds = require('../archives/getArchivePostIds.js'); const curateUser = require('../curate/user.js'); +const interpolate = require('../interpolate.js'); async function getUser(username, reddit) { try { @@ -40,24 +45,42 @@ async function getArchivedPosts(username, posts, reddit) { return Promise.all(postIds.map(postId => reddit.getSubmission(postId).fetch())); } +async function getIndexedPosts(user) { + const indexFilePath = interpolate(config.library.index.file, user, null, null, false); + + try { + const indexFile = await fs.readFile(indexFilePath, 'utf8'); + + return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true }); + } catch (error) { + console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`); + + return []; + } +} + function getUserPostsWrap(reddit, args) { - return function getUserPosts(usernames) { - return Promise.props(usernames.reduce(async (userPosts, username) => { + return async function getUserPosts(usernames) { + const users = await Promise.map(usernames, async (username) => { const [user, posts] = await Promise.all([ getUser(username, reddit), getPosts(username, reddit, args), ]); + const indexed = await getIndexedPosts(user); + if (args.archives) { posts.push(...await getArchivedPosts(username, posts, reddit)); } if (posts.length) { - return { ...userPosts, [user.name]: { ...user, posts } }; + return { ...user, posts, indexed }; } - return userPosts; - }, {})); + return null; + }); + + return users.reduce((userPosts, user) => (user ? { ...userPosts, [user.name]: user } : userPosts), {}); }; }