Use YAML rather than TSV for index files; this improves both readability and re-indexability.
This commit is contained in:
parent
f41b788183
commit
029351f228
|
@ -6,6 +6,6 @@
|
|||
"rules": {
|
||||
"no-console": 0,
|
||||
"indent": ["error", 4],
|
||||
"max-len": [2, {"code": 125, "tabWidth": 4, "ignoreUrls": true}]
|
||||
"max-len": [2, {"code": 200, "tabWidth": 4, "ignoreUrls": true}]
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,7 +19,7 @@ module.exports = {
|
|||
index: {
|
||||
file: '$base/index',
|
||||
format: 'tsv',
|
||||
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'],
|
||||
keys: ['postId', 'subreddit', 'postDate', 'url', 'hostId', 'postTitle'],
|
||||
},
|
||||
booleans: {
|
||||
extracted: 'extracted-',
|
||||
|
|
|
@ -70,7 +70,6 @@
|
|||
"version": "1.0.10",
|
||||
"resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz",
|
||||
"integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"sprintf-js": "1.0.3"
|
||||
}
|
||||
|
@ -461,35 +460,6 @@
|
|||
"resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz",
|
||||
"integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0="
|
||||
},
|
||||
"csv": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/csv/-/csv-3.1.0.tgz",
|
||||
"integrity": "sha512-SfnePMkhjljB7ehvubZESGjgrnM7V/gBe5ubZWKxeKwgmTl/HtVCdfSaGRgH/i/vG7qJaSLMpP0krNbAuunRBg==",
|
||||
"requires": {
|
||||
"csv-generate": "2.0.2",
|
||||
"csv-parse": "2.5.0",
|
||||
"csv-stringify": "3.1.1",
|
||||
"stream-transform": "1.0.2"
|
||||
}
|
||||
},
|
||||
"csv-generate": {
|
||||
"version": "2.0.2",
|
||||
"resolved": "https://registry.npmjs.org/csv-generate/-/csv-generate-2.0.2.tgz",
|
||||
"integrity": "sha512-oyidhQ/sQcqKOyt+hRnL9oiqFFWsEkOwBE7tEV3pwku6dSuFUQqTGfhYXH/HZ3rKy8xBtcrwsspmXVo+LPijuA=="
|
||||
},
|
||||
"csv-parse": {
|
||||
"version": "2.5.0",
|
||||
"resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-2.5.0.tgz",
|
||||
"integrity": "sha512-4OcjOJQByI0YDU5COYw9HAqjo8/MOLLmT9EKyMCXUzgvh30vS1SlMK+Ho84IH5exN44cSnrYecw/7Zpu2m4lkA=="
|
||||
},
|
||||
"csv-stringify": {
|
||||
"version": "3.1.1",
|
||||
"resolved": "https://registry.npmjs.org/csv-stringify/-/csv-stringify-3.1.1.tgz",
|
||||
"integrity": "sha512-Ni9r/BdQM2cGnWzwAP09zp12LVOAMHLJ86azNHGC7s4OUo2WidGfcM3QwYEjD8c4ELCL/a4AzfIsVCzroeys+g==",
|
||||
"requires": {
|
||||
"lodash.get": "4.4.2"
|
||||
}
|
||||
},
|
||||
"dashdash": {
|
||||
"version": "1.14.1",
|
||||
"resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
|
||||
|
@ -831,8 +801,7 @@
|
|||
"esprima": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.0.tgz",
|
||||
"integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw==",
|
||||
"dev": true
|
||||
"integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw=="
|
||||
},
|
||||
"esquery": {
|
||||
"version": "1.0.1",
|
||||
|
@ -1372,7 +1341,6 @@
|
|||
"version": "3.12.0",
|
||||
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.0.tgz",
|
||||
"integrity": "sha512-PIt2cnwmPfL4hKNwqeiuz4bKfnzHTBv6HyVgjahA6mPLwPDzjDWrplJBMjHUFxku/N3FlmrbyPclad+I+4mJ3A==",
|
||||
"dev": true,
|
||||
"requires": {
|
||||
"argparse": "1.0.10",
|
||||
"esprima": "4.0.0"
|
||||
|
@ -1473,11 +1441,6 @@
|
|||
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.5.tgz",
|
||||
"integrity": "sha512-svL3uiZf1RwhH+cWrfZn3A4+U58wbP0tGVTLQPbjplZxZ8ROD9VLuNgsRniTlLe7OlSqR79RUehXgpBW/s0IQw=="
|
||||
},
|
||||
"lodash.get": {
|
||||
"version": "4.4.2",
|
||||
"resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz",
|
||||
"integrity": "sha1-LRd/ZS+jHpObRDjVNBSZ36OCXpk="
|
||||
},
|
||||
"lru-cache": {
|
||||
"version": "4.1.2",
|
||||
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.2.tgz",
|
||||
|
@ -2152,8 +2115,7 @@
|
|||
"sprintf-js": {
|
||||
"version": "1.0.3",
|
||||
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
|
||||
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=",
|
||||
"dev": true
|
||||
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw="
|
||||
},
|
||||
"sshpk": {
|
||||
"version": "1.14.1",
|
||||
|
@ -2170,11 +2132,6 @@
|
|||
"tweetnacl": "0.14.5"
|
||||
}
|
||||
},
|
||||
"stream-transform": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-1.0.2.tgz",
|
||||
"integrity": "sha512-LNcZSF01PZ+bM0OqwPY7UHPiKoxSmLGHAcqakvh01DCU98ONEslLORdyBPdmTqjTpZSfCiaYLV4sci9y5M47oA=="
|
||||
},
|
||||
"string-width": {
|
||||
"version": "2.1.1",
|
||||
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
|
||||
|
|
|
@ -32,11 +32,11 @@
|
|||
"bluebird": "^3.5.1",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
"config": "^1.30.0",
|
||||
"csv": "^3.1.0",
|
||||
"date-fns": "^1.29.0",
|
||||
"dist-exiftool": "^10.53.0",
|
||||
"fluent-ffmpeg": "^2.1.2",
|
||||
"fs-extra": "^5.0.0",
|
||||
"js-yaml": "^3.12.0",
|
||||
"mime-types": "^2.1.18",
|
||||
"node-exiftool": "^2.3.0",
|
||||
"node-fetch": "^2.1.2",
|
||||
|
|
75
src/cli.js
75
src/cli.js
|
@ -4,37 +4,50 @@ const config = require('config');
|
|||
const yargs = require('yargs');
|
||||
|
||||
/**
 * Declare the command-line options with yargs and parse the current
 * process arguments.
 *
 * The defaults for `limit`, `sort` and `archives` are read from the loaded
 * configuration (`config.fetch.limit`, `config.fetch.sort` and
 * `config.fetch.archives.search`). The remaining options have no default.
 *
 * @returns {Object} the parsed arguments exposed by yargs' `argv`
 */
function getArgs() {
    return yargs
        .command('npm start -- --user <username>')
        .option('users', {
            alias: 'user',
            describe: 'Reddit usernames to fetch posts from',
            type: 'array',
        })
        .option('posts', {
            alias: 'post',
            describe: 'Reddit post IDs to fetch',
            type: 'array',
        })
        .option('limit', {
            describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
            type: 'number',
            default: config.fetch.limit,
        })
        .option('sort', {
            describe: 'Property to sort posts by',
            choices: ['new', 'top', 'hot', 'controversial'],
            default: config.fetch.sort,
        })
        .option('ignore', {
            describe: 'Ignore posts with any of these properties',
            type: 'array',
            choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
        })
        .option('include', {
            describe: 'Include only these sources',
            type: 'array',
        })
        .option('exclude', {
            describe: 'Do not include these sources',
            type: 'array',
        })
        // No `type` is declared for `after`: besides a date it also accepts the literal value 'index'.
        .option('after', {
            describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.',
        })
        .option('archives', {
            describe: 'Search archives for deleted posts',
            type: 'boolean',
            default: config.fetch.archives.search,
        })
        .argv;
}
|
||||
|
||||
module.exports = getArgs;
|
||||
|
|
|
@ -6,7 +6,7 @@ const omit = require('object.omit');
|
|||
const dissectLink = require('../dissectLink.js');
|
||||
const hashPost = require('./hashPost.js');
|
||||
|
||||
function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
|
||||
function curatePost(acc, post, user, index, processed, args) {
|
||||
const host = dissectLink(post.url);
|
||||
const permalink = `https://reddit.com${post.permalink}`;
|
||||
|
||||
|
@ -25,7 +25,7 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
|
|||
hash: hashPost(post),
|
||||
};
|
||||
|
||||
if (indexedPostIds.includes(post.id)) {
|
||||
if (user.indexed.find(entry => entry.id === post.id)) {
|
||||
return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } };
|
||||
}
|
||||
|
||||
|
@ -72,10 +72,9 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
|
|||
|
||||
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
|
||||
const processed = new Set();
|
||||
const indexedPostIds = user.indexed.map(entry => entry.postId);
|
||||
|
||||
const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) =>
|
||||
curatePost(accUserPosts, post, user, index, indexedPostIds, processed, args), { posts: [], indexed: {} });
|
||||
curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexed: {} });
|
||||
|
||||
const indexedLength = Object.keys(indexed).length;
|
||||
|
||||
|
|
|
@ -5,6 +5,7 @@ const path = require('path');
|
|||
const url = require('url');
|
||||
const dateFns = require('date-fns');
|
||||
const mime = require('mime-types');
|
||||
const dotty = require('dotty');
|
||||
|
||||
function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) {
|
||||
const vars = {
|
||||
|
|
|
@ -2,19 +2,31 @@
|
|||
|
||||
const config = require('config');
|
||||
const fs = require('fs-extra');
|
||||
const Promise = require('bluebird');
|
||||
const csvStringify = Promise.promisify(require('csv').stringify);
|
||||
const yaml = require('js-yaml');
|
||||
|
||||
const interpolate = require('../interpolate.js');
|
||||
|
||||
/**
 * Write the index file for a user as a YAML list.
 *
 * The newly fetched posts are written first, followed by the entries that
 * were already present in `user.indexed`. The target file is written in full
 * on every call.
 *
 * @param {Object[]} posts - curated posts to add to the index; each is read
 *     for `id`, `subreddit`, `permalink`, `url`, `host.id`, `datetime` and `title`
 * @param {Object} user - user whose index is written; `user.indexed` must be
 *     an array of previously indexed entries
 * @returns {Promise} the promise returned by `fs.writeFile`
 */
async function writeToIndex(posts, user) {
    // NOTE(review): interpolate is declared as (pattern, user, post, item, strip, dateFormat),
    // so `false` is passed as `item` here and `strip` keeps its default; confirm this is intended.
    const filename = interpolate(config.library.index.file, user, null, false);

    // A single timestamp is shared by every entry added in this run.
    const now = new Date();

    // Individual posts are wrapped in [] to get a YAML array value for each individual item, allowing them to be joined manually with a newline
    // between each entry to improve human readability of the index while maintaining a valid YAML list
    const oldEntries = user.indexed.map(entry => yaml.safeDump([entry]));
    const newEntries = posts.map(post => yaml.safeDump([{
        id: post.id,
        subreddit: post.subreddit,
        permalink: post.permalink,
        url: post.url,
        hostId: post.host.id,
        date: post.datetime,
        indexed: now,
        title: post.title,
    }]));

    // New entries go on top so the most recently indexed posts appear first in the file.
    const entries = newEntries.concat(oldEntries).join('\n');

    return fs.writeFile(filename, entries);
}
|
||||
|
||||
module.exports = writeToIndex;
|
||||
|
|
|
@ -3,7 +3,7 @@
|
|||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
const fs = require('fs-extra');
|
||||
const csvParse = Promise.promisify(require('csv').parse);
|
||||
const yaml = require('js-yaml');
|
||||
|
||||
const getArchivePostIds = require('../archives/getArchivePostIds.js');
|
||||
const curateUser = require('../curate/user.js');
|
||||
|
@ -51,7 +51,7 @@ async function getIndexedPosts(user) {
|
|||
try {
|
||||
const indexFile = await fs.readFile(indexFilePath, 'utf8');
|
||||
|
||||
return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true });
|
||||
return yaml.safeLoad(indexFile);
|
||||
} catch (error) {
|
||||
console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);
|
||||
|
||||
|
|
Loading…
Reference in New Issue