diff --git a/.eslintrc b/.eslintrc index 95b9fda..780f9cb 100644 --- a/.eslintrc +++ b/.eslintrc @@ -6,6 +6,6 @@ "rules": { "no-console": 0, "indent": ["error", 4], - "max-len": [2, {"code": 125, "tabWidth": 4, "ignoreUrls": true}] + "max-len": [2, {"code": 200, "tabWidth": 4, "ignoreUrls": true}] } } diff --git a/config/default.js b/config/default.js index d905767..917b588 100644 --- a/config/default.js +++ b/config/default.js @@ -19,7 +19,7 @@ module.exports = { index: { file: '$base/index', format: 'tsv', - keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'], + keys: ['postId', 'subreddit', 'postDate', 'url', 'hostId', 'postTitle'], }, booleans: { extracted: 'extracted-', diff --git a/package-lock.json b/package-lock.json index 0a094b4..794a103 100644 --- a/package-lock.json +++ b/package-lock.json @@ -70,7 +70,6 @@ "version": "1.0.10", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", - "dev": true, "requires": { "sprintf-js": "1.0.3" } @@ -461,35 +460,6 @@ "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz", "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0=" }, - "csv": { - "version": "3.1.0", - "resolved": "https://registry.npmjs.org/csv/-/csv-3.1.0.tgz", - "integrity": "sha512-SfnePMkhjljB7ehvubZESGjgrnM7V/gBe5ubZWKxeKwgmTl/HtVCdfSaGRgH/i/vG7qJaSLMpP0krNbAuunRBg==", - "requires": { - "csv-generate": "2.0.2", - "csv-parse": "2.5.0", - "csv-stringify": "3.1.1", - "stream-transform": "1.0.2" - } - }, - "csv-generate": { - "version": "2.0.2", - "resolved": "https://registry.npmjs.org/csv-generate/-/csv-generate-2.0.2.tgz", - "integrity": "sha512-oyidhQ/sQcqKOyt+hRnL9oiqFFWsEkOwBE7tEV3pwku6dSuFUQqTGfhYXH/HZ3rKy8xBtcrwsspmXVo+LPijuA==" - }, - "csv-parse": { - "version": "2.5.0", - "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-2.5.0.tgz", - "integrity": "sha512-4OcjOJQByI0YDU5COYw9HAqjo8/MOLLmT9EKyMCXUzgvh30vS1SlMK+Ho84IH5exN44cSnrYecw/7Zpu2m4lkA==" - }, - "csv-stringify": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/csv-stringify/-/csv-stringify-3.1.1.tgz", - "integrity": "sha512-Ni9r/BdQM2cGnWzwAP09zp12LVOAMHLJ86azNHGC7s4OUo2WidGfcM3QwYEjD8c4ELCL/a4AzfIsVCzroeys+g==", - "requires": { - "lodash.get": "4.4.2" - } - }, "dashdash": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", @@ -831,8 +801,7 @@ "esprima": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.0.tgz", - "integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw==", - "dev": true + "integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw==" }, "esquery": { "version": "1.0.1", @@ -1372,7 +1341,6 @@ "version": "3.12.0", "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.0.tgz", "integrity": "sha512-PIt2cnwmPfL4hKNwqeiuz4bKfnzHTBv6HyVgjahA6mPLwPDzjDWrplJBMjHUFxku/N3FlmrbyPclad+I+4mJ3A==", - "dev": true, "requires": { "argparse": "1.0.10", "esprima": "4.0.0" @@ -1473,11 +1441,6 @@ "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.5.tgz", "integrity": "sha512-svL3uiZf1RwhH+cWrfZn3A4+U58wbP0tGVTLQPbjplZxZ8ROD9VLuNgsRniTlLe7OlSqR79RUehXgpBW/s0IQw==" }, - "lodash.get": { - "version": "4.4.2", - "resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz", - "integrity": "sha1-LRd/ZS+jHpObRDjVNBSZ36OCXpk=" - }, "lru-cache": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.2.tgz", @@ -2152,8 +2115,7 @@ "sprintf-js": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", - "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", - "dev": true + "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=" }, "sshpk": { "version": "1.14.1", @@ -2170,11 +2132,6 @@ "tweetnacl": "0.14.5" } }, - "stream-transform": { - "version": "1.0.2", - "resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-1.0.2.tgz", - "integrity": "sha512-LNcZSF01PZ+bM0OqwPY7UHPiKoxSmLGHAcqakvh01DCU98ONEslLORdyBPdmTqjTpZSfCiaYLV4sci9y5M47oA==" - }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", diff --git a/package.json b/package.json index 8e2046a..a187830 100644 --- a/package.json +++ b/package.json @@ -32,11 +32,11 @@ "bluebird": "^3.5.1", "cheerio": "^1.0.0-rc.2", "config": "^1.30.0", - "csv": "^3.1.0", "date-fns": "^1.29.0", "dist-exiftool": "^10.53.0", "fluent-ffmpeg": "^2.1.2", "fs-extra": "^5.0.0", + "js-yaml": "^3.12.0", "mime-types": "^2.1.18", "node-exiftool": "^2.3.0", "node-fetch": "^2.1.2", diff --git a/src/cli.js b/src/cli.js index fe6b332..bb3cbf3 100644 --- a/src/cli.js +++ b/src/cli.js @@ -4,37 +4,50 @@ const config = require('config'); const yargs = require('yargs'); function getArgs() { - return yargs.command('npm start -- --user ').option('users', { - alias: 'user', - describe: 'Reddit usernames to fetch posts from', - type: 'array', - }).option('posts', { - alias: 'post', - describe: 'Reddit post IDs to fetch', - type: 'array', - }).option('limit', { - describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', - type: 'number', - default: config.fetch.limit, - }).option('sort', { - describe: 'Property to sort posts by', - choices: ['new', 'top', 'hot', 'controversial'], - default: config.fetch.sort, - }).option('ignore', { - describe: 'Ignore posts with any of these properties', - type: 'array', - choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'], - }).option('include', { - describe: 'Include only these sources', - type: 'array', - }).option('exclude', { - describe: 'Do not include these sources', - type: 'array', - }).option('archives', { - describe: 'Search archives for deleted posts', - type: 'boolean', - default: config.fetch.archives.search, - }).argv; + return yargs + .command('npm start -- --user ') + .option('users', { + alias: 'user', + describe: 'Reddit usernames to fetch posts from', + type: 'array', + }) + .option('posts', { + alias: 'post', + describe: 'Reddit post IDs to fetch', + type: 'array', + }) + .option('limit', { + describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', + type: 'number', + default: config.fetch.limit, + }) + .option('sort', { + describe: 'Property to sort posts by', + choices: ['new', 'top', 'hot', 'controversial'], + default: config.fetch.sort, + }) + .option('ignore', { + describe: 'Ignore posts with any of these properties', + type: 'array', + choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'], + }) + .option('include', { + describe: 'Include only these sources', + type: 'array', + }) + .option('exclude', { + describe: 'Do not include these sources', + type: 'array', + }) + .option('after', { + describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.', + }) + .option('archives', { + describe: 'Search archives for deleted posts', + type: 'boolean', + default: config.fetch.archives.search, + }) + .argv; } module.exports = getArgs; diff --git a/src/curate/posts.js b/src/curate/posts.js index bc596ff..65678a1 100644 --- a/src/curate/posts.js +++ b/src/curate/posts.js @@ -6,7 +6,7 @@ const omit = require('object.omit'); const dissectLink = require('../dissectLink.js'); const hashPost = require('./hashPost.js'); -function curatePost(acc, post, user, index, indexedPostIds, processed, args) { +function curatePost(acc, post, user, index, processed, args) { const host = dissectLink(post.url); const permalink = `https://reddit.com${post.permalink}`; @@ -25,7 +25,7 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) { hash: hashPost(post), }; - if (indexedPostIds.includes(post.id)) { + if (user.indexed.find(entry => entry.id === post.id)) { return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } }; } @@ -72,10 +72,9 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) { const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => { const processed = new Set(); - const indexedPostIds = user.indexed.map(entry => entry.postId); const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) => - curatePost(accUserPosts, post, user, index, indexedPostIds, processed, args), { posts: [], indexed: {} }); + curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexed: {} }); const indexedLength = Object.keys(indexed).length; diff --git a/src/interpolate.js b/src/interpolate.js index 6eb7aeb..ad03496 100644 --- a/src/interpolate.js +++ b/src/interpolate.js @@ -5,6 +5,7 @@ const path = require('path'); const url = require('url'); const dateFns = require('date-fns'); const mime = require('mime-types'); +const dotty = require('dotty'); function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) { const vars = { diff --git a/src/save/writeToIndex.js b/src/save/writeToIndex.js index 494bf82..b45f8e4 100644 --- a/src/save/writeToIndex.js +++ b/src/save/writeToIndex.js @@ -2,19 +2,31 @@ const config = require('config'); const fs = require('fs-extra'); -const Promise = require('bluebird'); -const csvStringify = Promise.promisify(require('csv').stringify); +const yaml = require('js-yaml'); const interpolate = require('../interpolate.js'); async function writeToIndex(posts, user) { const filename = interpolate(config.library.index.file, user, null, false); - const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'))); - const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry))); + const now = new Date(); - const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys }); + // Individual posts are wrapped in [] to get a YAML array value for each individual item, allowing them to be joined manually with a newline + // between each entry to improve human readability of the index while maintaining a valid YAML list + const oldEntries = user.indexed.map(entry => yaml.safeDump([entry])); + const newEntries = posts.map(post => yaml.safeDump([{ + id: post.id, + subreddit: post.subreddit, + permalink: post.permalink, + url: post.url, + hostId: post.host.id, + date: post.datetime, + indexed: now, + title: post.title, + }])); - return fs.writeFile(filename, tsvString); + const entries = newEntries.concat(oldEntries).join('\n'); + + return fs.writeFile(filename, entries); } module.exports = writeToIndex; diff --git a/src/sources/getUserPosts.js b/src/sources/getUserPosts.js index 002a953..607be75 100644 --- a/src/sources/getUserPosts.js +++ b/src/sources/getUserPosts.js @@ -3,7 +3,7 @@ const config = require('config'); const Promise = require('bluebird'); const fs = require('fs-extra'); -const csvParse = Promise.promisify(require('csv').parse); +const yaml = require('js-yaml'); const getArchivePostIds = require('../archives/getArchivePostIds.js'); const curateUser = require('../curate/user.js'); @@ -51,7 +51,7 @@ async function getIndexedPosts(user) { try { const indexFile = await fs.readFile(indexFilePath, 'utf8'); - return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true }); + return yaml.safeLoad(indexFile); } catch (error) { console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);