Using YAML rather than TSV for index files. Improves both readability and reindexability.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:56 +02:00
parent f41b788183
commit 029351f228
9 changed files with 73 additions and 91 deletions

View File

@ -6,6 +6,6 @@
"rules": { "rules": {
"no-console": 0, "no-console": 0,
"indent": ["error", 4], "indent": ["error", 4],
"max-len": [2, {"code": 125, "tabWidth": 4, "ignoreUrls": true}] "max-len": [2, {"code": 200, "tabWidth": 4, "ignoreUrls": true}]
} }
} }

View File

@ -19,7 +19,7 @@ module.exports = {
index: { index: {
file: '$base/index', file: '$base/index',
format: 'tsv', format: 'tsv',
keys: ['postId', 'subreddit', 'postDate', 'postTitle', 'url', 'hostId'], keys: ['postId', 'subreddit', 'postDate', 'url', 'hostId', 'postTitle'],
}, },
booleans: { booleans: {
extracted: 'extracted-', extracted: 'extracted-',

47
package-lock.json generated
View File

@ -70,7 +70,6 @@
"version": "1.0.10", "version": "1.0.10",
"resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz",
"integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==",
"dev": true,
"requires": { "requires": {
"sprintf-js": "1.0.3" "sprintf-js": "1.0.3"
} }
@ -461,35 +460,6 @@
"resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz", "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz",
"integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0=" "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0="
}, },
"csv": {
"version": "3.1.0",
"resolved": "https://registry.npmjs.org/csv/-/csv-3.1.0.tgz",
"integrity": "sha512-SfnePMkhjljB7ehvubZESGjgrnM7V/gBe5ubZWKxeKwgmTl/HtVCdfSaGRgH/i/vG7qJaSLMpP0krNbAuunRBg==",
"requires": {
"csv-generate": "2.0.2",
"csv-parse": "2.5.0",
"csv-stringify": "3.1.1",
"stream-transform": "1.0.2"
}
},
"csv-generate": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/csv-generate/-/csv-generate-2.0.2.tgz",
"integrity": "sha512-oyidhQ/sQcqKOyt+hRnL9oiqFFWsEkOwBE7tEV3pwku6dSuFUQqTGfhYXH/HZ3rKy8xBtcrwsspmXVo+LPijuA=="
},
"csv-parse": {
"version": "2.5.0",
"resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-2.5.0.tgz",
"integrity": "sha512-4OcjOJQByI0YDU5COYw9HAqjo8/MOLLmT9EKyMCXUzgvh30vS1SlMK+Ho84IH5exN44cSnrYecw/7Zpu2m4lkA=="
},
"csv-stringify": {
"version": "3.1.1",
"resolved": "https://registry.npmjs.org/csv-stringify/-/csv-stringify-3.1.1.tgz",
"integrity": "sha512-Ni9r/BdQM2cGnWzwAP09zp12LVOAMHLJ86azNHGC7s4OUo2WidGfcM3QwYEjD8c4ELCL/a4AzfIsVCzroeys+g==",
"requires": {
"lodash.get": "4.4.2"
}
},
"dashdash": { "dashdash": {
"version": "1.14.1", "version": "1.14.1",
"resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz",
@ -831,8 +801,7 @@
"esprima": { "esprima": {
"version": "4.0.0", "version": "4.0.0",
"resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.0.tgz", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.0.tgz",
"integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw==", "integrity": "sha512-oftTcaMu/EGrEIu904mWteKIv8vMuOgGYo7EhVJJN00R/EED9DCua/xxHRdYnKtcECzVg7xOWhflvJMnqcFZjw=="
"dev": true
}, },
"esquery": { "esquery": {
"version": "1.0.1", "version": "1.0.1",
@ -1372,7 +1341,6 @@
"version": "3.12.0", "version": "3.12.0",
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.0.tgz", "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-3.12.0.tgz",
"integrity": "sha512-PIt2cnwmPfL4hKNwqeiuz4bKfnzHTBv6HyVgjahA6mPLwPDzjDWrplJBMjHUFxku/N3FlmrbyPclad+I+4mJ3A==", "integrity": "sha512-PIt2cnwmPfL4hKNwqeiuz4bKfnzHTBv6HyVgjahA6mPLwPDzjDWrplJBMjHUFxku/N3FlmrbyPclad+I+4mJ3A==",
"dev": true,
"requires": { "requires": {
"argparse": "1.0.10", "argparse": "1.0.10",
"esprima": "4.0.0" "esprima": "4.0.0"
@ -1473,11 +1441,6 @@
"resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.5.tgz", "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.5.tgz",
"integrity": "sha512-svL3uiZf1RwhH+cWrfZn3A4+U58wbP0tGVTLQPbjplZxZ8ROD9VLuNgsRniTlLe7OlSqR79RUehXgpBW/s0IQw==" "integrity": "sha512-svL3uiZf1RwhH+cWrfZn3A4+U58wbP0tGVTLQPbjplZxZ8ROD9VLuNgsRniTlLe7OlSqR79RUehXgpBW/s0IQw=="
}, },
"lodash.get": {
"version": "4.4.2",
"resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz",
"integrity": "sha1-LRd/ZS+jHpObRDjVNBSZ36OCXpk="
},
"lru-cache": { "lru-cache": {
"version": "4.1.2", "version": "4.1.2",
"resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.2.tgz", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.2.tgz",
@ -2152,8 +2115,7 @@
"sprintf-js": { "sprintf-js": {
"version": "1.0.3", "version": "1.0.3",
"resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz",
"integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw=", "integrity": "sha1-BOaSb2YolTVPPdAVIDYzuFcpfiw="
"dev": true
}, },
"sshpk": { "sshpk": {
"version": "1.14.1", "version": "1.14.1",
@ -2170,11 +2132,6 @@
"tweetnacl": "0.14.5" "tweetnacl": "0.14.5"
} }
}, },
"stream-transform": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-1.0.2.tgz",
"integrity": "sha512-LNcZSF01PZ+bM0OqwPY7UHPiKoxSmLGHAcqakvh01DCU98ONEslLORdyBPdmTqjTpZSfCiaYLV4sci9y5M47oA=="
},
"string-width": { "string-width": {
"version": "2.1.1", "version": "2.1.1",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",

View File

@ -32,11 +32,11 @@
"bluebird": "^3.5.1", "bluebird": "^3.5.1",
"cheerio": "^1.0.0-rc.2", "cheerio": "^1.0.0-rc.2",
"config": "^1.30.0", "config": "^1.30.0",
"csv": "^3.1.0",
"date-fns": "^1.29.0", "date-fns": "^1.29.0",
"dist-exiftool": "^10.53.0", "dist-exiftool": "^10.53.0",
"fluent-ffmpeg": "^2.1.2", "fluent-ffmpeg": "^2.1.2",
"fs-extra": "^5.0.0", "fs-extra": "^5.0.0",
"js-yaml": "^3.12.0",
"mime-types": "^2.1.18", "mime-types": "^2.1.18",
"node-exiftool": "^2.3.0", "node-exiftool": "^2.3.0",
"node-fetch": "^2.1.2", "node-fetch": "^2.1.2",

View File

@ -4,37 +4,50 @@ const config = require('config');
const yargs = require('yargs'); const yargs = require('yargs');
function getArgs() { function getArgs() {
return yargs.command('npm start -- --user <username>').option('users', { return yargs
alias: 'user', .command('npm start -- --user <username>')
describe: 'Reddit usernames to fetch posts from', .option('users', {
type: 'array', alias: 'user',
}).option('posts', { describe: 'Reddit usernames to fetch posts from',
alias: 'post', type: 'array',
describe: 'Reddit post IDs to fetch', })
type: 'array', .option('posts', {
}).option('limit', { alias: 'post',
describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts', describe: 'Reddit post IDs to fetch',
type: 'number', type: 'array',
default: config.fetch.limit, })
}).option('sort', { .option('limit', {
describe: 'Property to sort posts by', describe: 'Maximum amount of posts to fetch per supplied user (!), after filtering out ignored, cross- and reposts',
choices: ['new', 'top', 'hot', 'controversial'], type: 'number',
default: config.fetch.sort, default: config.fetch.limit,
}).option('ignore', { })
describe: 'Ignore posts with any of these properties', .option('sort', {
type: 'array', describe: 'Property to sort posts by',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'], choices: ['new', 'top', 'hot', 'controversial'],
}).option('include', { default: config.fetch.sort,
describe: 'Include only these sources', })
type: 'array', .option('ignore', {
}).option('exclude', { describe: 'Ignore posts with any of these properties',
describe: 'Do not include these sources', type: 'array',
type: 'array', choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'],
}).option('archives', { })
describe: 'Search archives for deleted posts', .option('include', {
type: 'boolean', describe: 'Include only these sources',
default: config.fetch.archives.search, type: 'array',
}).argv; })
.option('exclude', {
describe: 'Do not include these sources',
type: 'array',
})
.option('after', {
describe: 'Do not include posts from before this date (DD-MM-YYYY). When set to \'index\', it will assume the date of the latest indexed post.',
})
.option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean',
default: config.fetch.archives.search,
})
.argv;
} }
module.exports = getArgs; module.exports = getArgs;

View File

@ -6,7 +6,7 @@ const omit = require('object.omit');
const dissectLink = require('../dissectLink.js'); const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js'); const hashPost = require('./hashPost.js');
function curatePost(acc, post, user, index, indexedPostIds, processed, args) { function curatePost(acc, post, user, index, processed, args) {
const host = dissectLink(post.url); const host = dissectLink(post.url);
const permalink = `https://reddit.com${post.permalink}`; const permalink = `https://reddit.com${post.permalink}`;
@ -25,7 +25,7 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
hash: hashPost(post), hash: hashPost(post),
}; };
if (indexedPostIds.includes(post.id)) { if (user.indexed.find(entry => entry.id === post.id)) {
return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } }; return { ...acc, indexed: { ...acc.indexed, [post.id]: curatedPost } };
} }
@ -72,10 +72,9 @@ function curatePost(acc, post, user, index, indexedPostIds, processed, args) {
const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => { const curatePosts = (userPosts, args) => Object.values(userPosts).reduce((accPosts, user) => {
const processed = new Set(); const processed = new Set();
const indexedPostIds = user.indexed.map(entry => entry.postId);
const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) => const { posts, indexed } = user.posts.reduce((accUserPosts, post, index) =>
curatePost(accUserPosts, post, user, index, indexedPostIds, processed, args), { posts: [], indexed: {} }); curatePost(accUserPosts, post, user, index, processed, args), { posts: [], indexed: {} });
const indexedLength = Object.keys(indexed).length; const indexedLength = Object.keys(indexed).length;

View File

@ -5,6 +5,7 @@ const path = require('path');
const url = require('url'); const url = require('url');
const dateFns = require('date-fns'); const dateFns = require('date-fns');
const mime = require('mime-types'); const mime = require('mime-types');
const dotty = require('dotty');
function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) { function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) {
const vars = { const vars = {

View File

@ -2,19 +2,31 @@
const config = require('config'); const config = require('config');
const fs = require('fs-extra'); const fs = require('fs-extra');
const Promise = require('bluebird'); const yaml = require('js-yaml');
const csvStringify = Promise.promisify(require('csv').stringify);
const interpolate = require('../interpolate.js'); const interpolate = require('../interpolate.js');
async function writeToIndex(posts, user) { async function writeToIndex(posts, user) {
const filename = interpolate(config.library.index.file, user, null, false); const filename = interpolate(config.library.index.file, user, null, false);
const newEntries = posts.map(post => config.library.index.keys.map(key => interpolate(`$${key}`, user, post, null, false, 'YYYY-MM-DDTHH:mm:ssZ'))); const now = new Date();
const fullEntries = newEntries.concat(user.indexed.map(entry => Object.values(entry)));
const tsvString = await csvStringify(fullEntries, { delimiter: '\t', header: true, columns: config.library.index.keys }); // Individual posts are wrapped in [] to get a YAML array value for each individual item, allowing them to be joined manually with a newline
// between each entry to improve human readability of the index while maintaining a valid YAML list
const oldEntries = user.indexed.map(entry => yaml.safeDump([entry]));
const newEntries = posts.map(post => yaml.safeDump([{
id: post.id,
subreddit: post.subreddit,
permalink: post.permalink,
url: post.url,
hostId: post.host.id,
date: post.datetime,
indexed: now,
title: post.title,
}]));
return fs.writeFile(filename, tsvString); const entries = newEntries.concat(oldEntries).join('\n');
return fs.writeFile(filename, entries);
} }
module.exports = writeToIndex; module.exports = writeToIndex;

View File

@ -3,7 +3,7 @@
const config = require('config'); const config = require('config');
const Promise = require('bluebird'); const Promise = require('bluebird');
const fs = require('fs-extra'); const fs = require('fs-extra');
const csvParse = Promise.promisify(require('csv').parse); const yaml = require('js-yaml');
const getArchivePostIds = require('../archives/getArchivePostIds.js'); const getArchivePostIds = require('../archives/getArchivePostIds.js');
const curateUser = require('../curate/user.js'); const curateUser = require('../curate/user.js');
@ -51,7 +51,7 @@ async function getIndexedPosts(user) {
try { try {
const indexFile = await fs.readFile(indexFilePath, 'utf8'); const indexFile = await fs.readFile(indexFilePath, 'utf8');
return await csvParse(indexFile, { delimiter: '\t', columns: true, cast: true }); return yaml.safeLoad(indexFile);
} catch (error) { } catch (error) {
console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`); console.log('\x1b[33m%s\x1b[0m', `Could not load index file for '${user.name}' at '${indexFilePath}': ${error}`);