Use YAML rather than TSV for index files; this improves both readability and reindexability.

2018-06-30 03:33:30 +02:00
parent 1c4ec06f68
commit 74e36a6826
9 changed files with 73 additions and 91 deletions
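To illustrate the new on-disk format, here is a minimal sketch of a single index entry, using js-yaml and the entry fields written by writeToIndex.js further down; the values are made up for illustration only.

const yaml = require('js-yaml');

// One entry per post; writeToIndex.js dumps each entry wrapped in a
// single-element array so every post becomes a one-item YAML list.
const entry = {
  id: 'abc123',          // hypothetical Reddit post ID
  subreddit: 'pics',
  permalink: 'https://reddit.com/r/pics/comments/abc123/',
  url: 'https://i.redd.it/abc123.jpg',
  hostId: 'abc123',
  date: '2018-06-29T12:00:00+02:00',
  indexed: '2018-06-30T03:33:30+02:00',
  title: 'Example post',
};

console.log(yaml.safeDump([entry]));
// - id: abc123
//   subreddit: pics
//   ...

Unlike a tab-separated row, every field is labelled, so an entry can be read and hand-edited without counting columns.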

View File

@@ -4,37 +4,50 @@ const config = require('config');
const yargs = require('yargs');
function getArgs() {
return yargs.command('npm start -- --user <username>').option('users', {
alias: 'user',
describe: 'Reddit usernames to fetch posts from',
type: 'array',
}).option('posts', {
alias: 'post',
describe: 'Reddit post IDs to fetch',
type: 'array',
}).option('limit', {
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
type: 'number',
default: config.fetch.limit,
}).option('sort', {
describe: 'Property to sort posts by',
choices: ['new', 'top', 'hot', 'controversial'],
default: config.fetch.sort,
}).option('ignore', {
describe: 'Ignore posts with any of these properties',
type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
}).option('include', {
describe: 'Include only these sources',
type: 'array',
}).option('exclude', {
describe: 'Do not include these sources',
type: 'array',
}).option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean',
default: config.fetch.archives.search,
}).argv;
return yargs
.command('npm start -- --user <username>')
.option('users', {
alias: 'user',
describe: 'Reddit usernames to fetch posts from',
type: 'array',
})
.option('posts', {
alias: 'post',
describe: 'Reddit post IDs to fetch',
type: 'array',
})
.option('limit', {
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
type: 'number',
default: config.fetch.limit,
})
.option('sort', {
describe: 'Property to sort posts by',
choices: ['new', 'top', 'hot', 'controversial'],
default: config.fetch.sort,
})
.option('ignore', {
describe: 'Ignore posts with any of these properties',
type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
})
.option('include', {
describe: 'Include only these sources',
type: 'array',
})
.option('exclude', {
describe: 'Do not include these sources',
type: 'array',
})
.option('after', {
describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.',
})
.option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean',
default: config.fetch.archives.search,
})
.argv;
}
module.exports = getArgs;
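The --after option above is new in this diff. Assuming a hypothetical username, it could be invoked as

npm start -- --user someuser --after 01-01-2018

or as --after index, which takes the cutoff date from the latest post already recorded in that user's index file.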

View File

@@ -6,7 +6,7 @@ const omit = require('object.omit');
const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js');
function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
function curatePost(acc, post, user, index, processed, args) {
const host = dissectLink(post.url);
const permalink = `https://reddit.com${post.permalink}`;
@@ -25,7 +25,7 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
hash: hashPost(post),
};
if (indexedPostIds.includes(post.id)) {
if (user.indexed.find(entry => entry.id === post.id)) {
return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } };
}
@@ -72,10 +72,9 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
const processed = new Set();
const indexedPostIds = user.indexed.map(entry => entry.postId);
const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) =>
curatePost(accUserPosts, post, user, index, indexedPostIds, processed, args), { posts: [], indexed: {} });
curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexed: {} });
const indexedLength = Object.keys(indexed).length;

View File

@@ -5,6 +5,7 @@ const path = require('path');
const url = require('url');
const dateFns = require('date-fns');
const mime = require('mime-types');
const dotty = require('dotty');
function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) {
const vars = {

View File

@@ -2,19 +2,31 @@
const config = require('config');
const fs = require('fs-extra');
const Promise = require('bluebird');
const csvStringify = Promise.promisify(require('csv').stringify);
const yaml = require('js-yaml');
const interpolate = require('../interpolate.js');
async function writeToIndex(posts, user) {
const filename = interpolate(config.library.index.file, user, null, false);
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
const now = new Date();
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys });
// Individual posts are wrapped in [] to get a YAML array value for each individual item, allowing them to be joined manually with a newline
// between each entry to improve human readability of the index while maintaining a valid YAML list
const oldEntries = user.indexed.map(entry => yaml.safeDump([entry]));
const newEntries = posts.map(post => yaml.safeDump([{
id: post.id,
subreddit: post.subreddit,
permalink: post.permalink,
url: post.url,
hostId: post.host.id,
date: post.datetime,
indexed: now,
title: post.title,
}]));
return fs.writeFile(filename, tsvString);
const entries = newEntries.concat(oldEntries).join('\n');
return fs.writeFile(filename, entries);
}
module.exports = writeToIndex;
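A short sketch of the wrapping-and-joining trick described in the comment above, with two hypothetical entries standing in for the curated posts; the entry objects here are placeholders, not the full field set used by writeToIndex.

const yaml = require('js-yaml');

// Dumping each entry inside its own single-element array produces a one-item
// YAML list ('- id: ...'); joining the dumps with '\n' leaves a blank line
// between entries while the file as a whole remains one valid YAML sequence.
const newEntry = { id: 'bbb222', title: 'Newer post' }; // hypothetical
const oldEntry = { id: 'aaa111', title: 'Older post' }; // hypothetical

const joined = [newEntry, oldEntry]
  .map(entry => yaml.safeDump([entry]))
  .join('\n');

// joined now reads:
// - id: bbb222
//   title: Newer post
//
// - id: aaa111
//   title: Older post

// Reading the file back, as getIndexedPosts does below with yaml.safeLoad,
// returns a single flat array containing both entries.
console.log(yaml.safeLoad(joined).length); // 2

This is also why the reading side only needs the one-line yaml.safeLoad call shown in the next file.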

View File

@@ -3,7 +3,7 @@
const config = require('config');
const Promise = require('bluebird');
const fs = require('fs-extra');
const csvParse = Promise.promisify(require('csv').parse);
const yaml = require('js-yaml');
const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js');
@@ -51,7 +51,7 @@ async function getIndexedPosts(user) {
try {
const indexFile = await fs.readFile(indexFilePath, 'utf8');
return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true });
return yaml.safeLoad(indexFile);
} catch (error) {
console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);