Use YAML rather than TSV for index files; this improves both readability and the ability to re-index.
This commit is contained in:
75
src/cli.js
75
src/cli.js
@@ -4,37 +4,50 @@ const config = require('config');
|
||||
const yargs = require('yargs');
|
||||
|
||||
function getArgs() {
|
||||
return yargs.command('npm start -- --user <username>').option('users', {
|
||||
alias: 'user',
|
||||
describe: 'Reddit usernames to fetch posts from',
|
||||
type: 'array',
|
||||
}).option('posts', {
|
||||
alias: 'post',
|
||||
describe: 'Reddit post IDs to fetch',
|
||||
type: 'array',
|
||||
}).option('limit', {
|
||||
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
|
||||
type: 'number',
|
||||
default: config.fetch.limit,
|
||||
}).option('sort', {
|
||||
describe: 'Property to sort posts by',
|
||||
choices: ['new', 'top', 'hot', 'controversial'],
|
||||
default: config.fetch.sort,
|
||||
}).option('ignore', {
|
||||
describe: 'Ignore posts with any of these properties',
|
||||
type: 'array',
|
||||
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
|
||||
}).option('include', {
|
||||
describe: 'Include only these sources',
|
||||
type: 'array',
|
||||
}).option('exclude', {
|
||||
describe: 'Do not include these sources',
|
||||
type: 'array',
|
||||
}).option('archives', {
|
||||
describe: 'Search archives for deleted posts',
|
||||
type: 'boolean',
|
||||
default: config.fetch.archives.search,
|
||||
}).argv;
|
||||
return yargs
|
||||
.command('npm start -- --user <username>')
|
||||
.option('users', {
|
||||
alias: 'user',
|
||||
describe: 'Reddit usernames to fetch posts from',
|
||||
type: 'array',
|
||||
})
|
||||
.option('posts', {
|
||||
alias: 'post',
|
||||
describe: 'Reddit post IDs to fetch',
|
||||
type: 'array',
|
||||
})
|
||||
.option('limit', {
|
||||
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
|
||||
type: 'number',
|
||||
default: config.fetch.limit,
|
||||
})
|
||||
.option('sort', {
|
||||
describe: 'Property to sort posts by',
|
||||
choices: ['new', 'top', 'hot', 'controversial'],
|
||||
default: config.fetch.sort,
|
||||
})
|
||||
.option('ignore', {
|
||||
describe: 'Ignore posts with any of these properties',
|
||||
type: 'array',
|
||||
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
|
||||
})
|
||||
.option('include', {
|
||||
describe: 'Include only these sources',
|
||||
type: 'array',
|
||||
})
|
||||
.option('exclude', {
|
||||
describe: 'Do not include these sources',
|
||||
type: 'array',
|
||||
})
|
||||
.option('after', {
|
||||
describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.',
|
||||
})
|
||||
.option('archives', {
|
||||
describe: 'Search archives for deleted posts',
|
||||
type: 'boolean',
|
||||
default: config.fetch.archives.search,
|
||||
})
|
||||
.argv;
|
||||
}
|
||||
|
||||
module.exports = getArgs;
|
||||
|
||||
@@ -6,7 +6,7 @@ const omit = require('object.omit');
|
||||
const dissectLink = require('../dissectLink.js');
|
||||
const hashPost = require('./hashPost.js');
|
||||
|
||||
function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
|
||||
function curatePost(acc, post, user, index, processed, args) {
|
||||
const host = dissectLink(post.url);
|
||||
const permalink = `https://reddit.com${post.permalink}`;
|
||||
|
||||
@@ -25,7 +25,7 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
|
||||
hash: hashPost(post),
|
||||
};
|
||||
|
||||
if (indexedPostIds.includes(post.id)) {
|
||||
if (user.indexed.find(entry => entry.id === post.id)) {
|
||||
return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } };
|
||||
}
|
||||
|
||||
@@ -72,10 +72,9 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
|
||||
|
||||
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
|
||||
const processed = new Set();
|
||||
const indexedPostIds = user.indexed.map(entry => entry.postId);
|
||||
|
||||
const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) =>
|
||||
curatePost(accUserPosts, post, user, index, indexedPostIds, processed, args), { posts: [], indexed: {} });
|
||||
curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexed: {} });
|
||||
|
||||
const indexedLength = Object.keys(indexed).length;
|
||||
|
||||
|
||||
@@ -5,6 +5,7 @@ const path = require('path');
|
||||
const url = require('url');
|
||||
const dateFns = require('date-fns');
|
||||
const mime = require('mime-types');
|
||||
const dotty = require('dotty');
|
||||
|
||||
function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) {
|
||||
const vars = {
|
||||
|
||||
@@ -2,19 +2,31 @@
|
||||
|
||||
const config = require('config');
|
||||
const fs = require('fs-extra');
|
||||
const Promise = require('bluebird');
|
||||
const csvStringify = Promise.promisify(require('csv').stringify);
|
||||
const yaml = require('js-yaml');
|
||||
|
||||
const interpolate = require('../interpolate.js');
|
||||
|
||||
async function writeToIndex(posts, user) {
|
||||
const filename = interpolate(config.library.index.file, user, null, false);
|
||||
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ')));
|
||||
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
|
||||
const now = new Date();
|
||||
|
||||
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys });
|
||||
// Individual posts are wrapped in [] to get a YAML array value for each individual item, allowing them to be joined manually with a newline
|
||||
// between each entry to improve human readability of the index while maintaining a valid YAML list
|
||||
const oldEntries = user.indexed.map(entry => yaml.safeDump([entry]));
|
||||
const newEntries = posts.map(post => yaml.safeDump([{
|
||||
id: post.id,
|
||||
subreddit: post.subreddit,
|
||||
permalink: post.permalink,
|
||||
url: post.url,
|
||||
hostId: post.host.id,
|
||||
date: post.datetime,
|
||||
indexed: now,
|
||||
title: post.title,
|
||||
}]));
|
||||
|
||||
return fs.writeFile(filename, tsvString);
|
||||
const entries = newEntries.concat(oldEntries).join('\n');
|
||||
|
||||
return fs.writeFile(filename, entries);
|
||||
}
|
||||
|
||||
module.exports = writeToIndex;
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const fs = require('fs-extra');
|
||||
const csvParse = Promise.promisify(require('csv').parse);
|
||||
const yaml = require('js-yaml');
|
||||
|
||||
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
||||
const curateUser = require('../curate/user.js');
|
||||
@@ -51,7 +51,7 @@ async function getIndexedPosts(user) {
|
||||
try {
|
||||
const indexFile = await fs.readFile(indexFilePath, 'utf8');
|
||||
|
||||
return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true });
|
||||
return yaml.safeLoad(indexFile);
|
||||
} catch (error) {
|
||||
console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user