Use YAML rather than TSV for index files; this improves both readability and reindexability.

2018-06-30 03:33:30 +02:00
parent 1c4ec06f68
commit 74e36a6826
9 changed files with 73 additions and 91 deletions
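To illustrate the new on-disk format, here is a minimal sketch of a single index entry, using js-yaml and the entry fields written by writeToIndex.js further down; the values are made up for illustration only.

const yaml = require('js-yaml');

// One entry per post; writeToIndex.js dumps each entry wrapped in a
// single-element array so every post becomes a one-item YAML list.
const entry = {
  id: 'abc123',          // hypothetical Reddit post ID
  subreddit: 'pics',
  permalink: 'https://reddit.com/r/pics/comments/abc123/',
  url: 'https://i.redd.it/abc123.jpg',
  hostId: 'abc123',
  date: '2018-06-29T12:00:00+02:00',
  indexed: '2018-06-30T03:33:30+02:00',
  title: 'Example post',
};

console.log(yaml.safeDump([entry]));
// - id: abc123
//   subreddit: pics
//   ...

Unlike a tab-separated row, every field is labelled, so an entry can be read and hand-edited without counting columns.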

View File

@@ -4,37 +4,50 @@ const config = require('config');
const yargs = require('yargs');
function getArgs() {
return yargs.command('npm start -- --user <username>').option('users', {
alias: 'user',
describe: 'Reddit usernames to fetch posts from',
type: 'array',
}).option('posts', {
alias: 'post',
describe: 'Reddit post IDs to fetch',
type: 'array',
}).option('limit', {
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
type: 'number',
default: config.fetch.limit,
}).option('sort', {
describe: 'Property to sort posts by',
choices: ['new', 'top', 'hot', 'controversial'],
default: config.fetch.sort,
}).option('ignore', {
describe: 'Ignore posts with any of these properties',
type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
}).option('include', {
describe: 'Include only these sources',
type: 'array',
}).option('exclude', {
describe: 'Do not include these sources',
type: 'array',
}).option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean',
default: config.fetch.archives.search,
}).argv;
return yargs
.command('npm start -- --user <username>')
.option('users', {
alias: 'user',
describe: 'Reddit usernames to fetch posts from',
type: 'array',
})
.option('posts', {
alias: 'post',
describe: 'Reddit post IDs to fetch',
type: 'array',
})
.option('limit', {
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
type: 'number',
default: config.fetch.limit,
})
.option('sort', {
describe: 'Property to sort posts by',
choices: ['new', 'top', 'hot', 'controversial'],
default: config.fetch.sort,
})
.option('ignore', {
describe: 'Ignore posts with any of these properties',
type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
})
.option('include', {
describe: 'Include only these sources',
type: 'array',
})
.option('exclude', {
describe: 'Do not include these sources',
type: 'array',
})
.option('after', {
describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.',
})
.option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean',
default: config.fetch.archives.search,
})
.argv;
}
module.exports = getArgs;
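The --after option above is new in this diff. Assuming a hypothetical username, it could be invoked as

npm start -- --user someuser --after 01-01-2018

or as --after index, which takes the cutoff date from the latest post already recorded in that user's index file.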

View File

@@ -6,7 +6,7 @@ const omit = require('object.omit');
const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js');
function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
function curatePost(acc, post, user, index, processed, args) {
const host = dissectLink(post.url);
const permalink = `https://reddit.com${post.permalink}`;
@@ -25,7 +25,7 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
hash: hashPost(post),
};
if (indexedPostIds.includes(post.id)) {
if (user.indexed.find(entry => entry.id === post.id)) {
return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } };
}
@@ -72,10 +72,9 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
const processed = new Set();
const indexedPostIds = user.indexed.map(entry => entry.postId);
const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) =>
curatePost(accUserPosts, post, user, index, indexedPostIds, processed, args), { posts: [], indexed: {} });
curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexed: {} });
const indexedLength = Object.keys(indexed).length;

View File

@@ -5,6 +5,7 @@ const path = require('path');
const url = require('url');
const dateFns = require('date-fns');
const mime = require('mime-types');
const dotty = require('dotty');
function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) {
const vars = {

View File

@@ -2,19 +2,31 @@
const config = require('config');
const fs = require('fs-extra');
const Promise = require('bluebird');
const csvStringify = Promise.promisify(require('csv').stringify);
const yaml = require('js-yaml');
const interpolate = require('../interpolate.js');
async function writeToIndex(posts, user) {
const filename = interpolate(config.library.index.file, user, null, false);
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
const now = new Date();
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys });
// Individual posts are wrapped in [] to get a YAML array value for each individual item, allowing them to be joined manually with a newline
// between each entry to improve human readability of the index while maintaining a valid YAML list
const oldEntries = user.indexed.map(entry => yaml.safeDump([entry]));
const newEntries = posts.map(post => yaml.safeDump([{
id: post.id,
subreddit: post.subreddit,
permalink: post.permalink,
url: post.url,
hostId: post.host.id,
date: post.datetime,
indexed: now,
title: post.title,
}]));
return fs.writeFile(filename, tsvString);
const entries = newEntries.concat(oldEntries).join('\n');
return fs.writeFile(filename, entries);
}
module.exports = writeToIndex;
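A short sketch of the wrapping-and-joining trick described in the comment above, with two hypothetical entries standing in for the curated posts; the entry objects here are placeholders, not the full field set used by writeToIndex.

const yaml = require('js-yaml');

// Dumping each entry inside its own single-element array produces a one-item
// YAML list ('- id: ...'); joining the dumps with '\n' leaves a blank line
// between entries while the file as a whole remains one valid YAML sequence.
const newEntry = { id: 'bbb222', title: 'Newer post' }; // hypothetical
const oldEntry = { id: 'aaa111', title: 'Older post' }; // hypothetical

const joined = [newEntry, oldEntry]
  .map(entry => yaml.safeDump([entry]))
  .join('\n');

// joined now reads:
// - id: bbb222
//   title: Newer post
//
// - id: aaa111
//   title: Older post

// Reading the file back, as getIndexedPosts does below with yaml.safeLoad,
// returns a single flat array containing both entries.
console.log(yaml.safeLoad(joined).length); // 2

This is also why the reading side only needs the one-line yaml.safeLoad call shown in the next file.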

View File

@@ -3,7 +3,7 @@
const config = require('config');
const Promise = require('bluebird');
const fs = require('fs-extra');
const csvParse = Promise.promisify(require('csv').parse);
const yaml = require('js-yaml');
const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js');
@@ -51,7 +51,7 @@ async function getIndexedPosts(user) {
try {
const indexFile = await fs.readFile(indexFilePath, 'utf8');
return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true });
return yaml.safeLoad(indexFile);
} catch (error) {
console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);