From 5eb2eb651a28a680d13d9658f26499c40cceba44 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 11 Sep 2024 05:16:56 +0200 Subject: [PATCH] Saving index per user as TSV. Refactoring. --- .eslintrc | 3 +++ config/default.js | 30 ++++++++++++----------- package-lock.json | 39 ++++++++++++++++++++++++++++++ package.json | 4 +++- src/app.js | 4 ++-- src/fetch/content.js | 57 +++++++++++++++++++++++++------------------- src/interpolate.js | 31 ++++++++++++------------ 7 files changed, 110 insertions(+), 58 deletions(-) diff --git a/.eslintrc b/.eslintrc index ba88572..95b9fda 100644 --- a/.eslintrc +++ b/.eslintrc @@ -1,5 +1,8 @@ { "extends": "airbnb-base", + "parserOptions": { + "sourceType": "script" + }, "rules": { "no-console": 0, "indent": ["error", 4], diff --git a/config/default.js b/config/default.js index 181834c..8ed9435 100644 --- a/config/default.js +++ b/config/default.js @@ -1,3 +1,5 @@ +'use strict'; + module.exports = { library: { base: 'output/$user/', @@ -7,17 +9,17 @@ module.exports = { album: { image: '$base$postDate - $preview$albumId - $postTitle/$itemIndex - $itemId$ext', video: '$base$postDate - $preview$albumId - $postTitle/$itemIndex - $itemId$ext', - extractSingleItem: true + extractSingleItem: true, }, profile: { image: '$base$userCreated - profile$ext', description: '$base$userCreated - profile ($userVerified$userVerifiedEmail$userGold$profileOver18)', - avoidAvatar: true + avoidAvatar: true, }, index: { file: '$base/index', - entry: '$postId (r/$subreddit) - $hostId ($url) - $postTitle', - unique: true + format: 'tsv', + keys: ['postId', 'postTitle', 'subreddit', 'postDate', 'url'], }, booleans: { extracted: 'extracted-', @@ -25,15 +27,15 @@ module.exports = { verified: '✔', verifiedEmail: '✉', gold: '★', - over18: '♥' + over18: '♥', }, meta: { - comment: '$itemDescription' + comment: '$itemDescription', }, dateFormat: 'YYYYMMDD', truncate: { limit: 250, - truncator: '...' + truncator: '...', }, indexOffset: 1, slashSubstitute: '#', @@ -47,8 +49,8 @@ module.exports = { search: false, preview: true, reddit: ['ip'], - reupload: [] - } + reupload: [], + }, }, reddit: { api: { @@ -58,12 +60,12 @@ module.exports = { token_type: 'bearer', expires_in: 3600, refresh_token: '1234567-A-Bc-defg8912hij-klm345opqr', - scope: 'history identity mysubreddits read subscribe' - } + scope: 'history identity mysubreddits read subscribe', + }, }, methods: { imgur: { - clientId: '1234567abcdefgh' - } - } + clientId: '1234567abcdefgh', + }, + }, }; diff --git a/package-lock.json b/package-lock.json index dc66350..9078510 100644 --- a/package-lock.json +++ b/package-lock.json @@ -461,6 +461,35 @@ "resolved": "https://registry.npmjs.org/css-what/-/css-what-2.1.0.tgz", "integrity": "sha1-lGfQMsOM+u+58teVASUwYvh/ob0=" }, + "csv": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/csv/-/csv-3.1.0.tgz", + "integrity": "sha512-SfnePMkhjljB7ehvubZESGjgrnM7V/gBe5ubZWKxeKwgmTl/HtVCdfSaGRgH/i/vG7qJaSLMpP0krNbAuunRBg==", + "requires": { + "csv-generate": "2.0.2", + "csv-parse": "2.5.0", + "csv-stringify": "3.1.1", + "stream-transform": "1.0.2" + } + }, + "csv-generate": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/csv-generate/-/csv-generate-2.0.2.tgz", + "integrity": "sha512-oyidhQ/sQcqKOyt+hRnL9oiqFFWsEkOwBE7tEV3pwku6dSuFUQqTGfhYXH/HZ3rKy8xBtcrwsspmXVo+LPijuA==" + }, + "csv-parse": { + "version": "2.5.0", + "resolved": "https://registry.npmjs.org/csv-parse/-/csv-parse-2.5.0.tgz", + "integrity": "sha512-4OcjOJQByI0YDU5COYw9HAqjo8/MOLLmT9EKyMCXUzgvh30vS1SlMK+Ho84IH5exN44cSnrYecw/7Zpu2m4lkA==" + }, + "csv-stringify": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/csv-stringify/-/csv-stringify-3.1.1.tgz", + "integrity": "sha512-Ni9r/BdQM2cGnWzwAP09zp12LVOAMHLJ86azNHGC7s4OUo2WidGfcM3QwYEjD8c4ELCL/a4AzfIsVCzroeys+g==", + "requires": { + "lodash.get": "4.4.2" + } + }, "dashdash": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/dashdash/-/dashdash-1.14.1.tgz", @@ -1444,6 +1473,11 @@ "resolved": "https://registry.npmjs.org/lodash/-/lodash-4.17.5.tgz", "integrity": "sha512-svL3uiZf1RwhH+cWrfZn3A4+U58wbP0tGVTLQPbjplZxZ8ROD9VLuNgsRniTlLe7OlSqR79RUehXgpBW/s0IQw==" }, + "lodash.get": { + "version": "4.4.2", + "resolved": "https://registry.npmjs.org/lodash.get/-/lodash.get-4.4.2.tgz", + "integrity": "sha1-LRd/ZS+jHpObRDjVNBSZ36OCXpk=" + }, "lru-cache": { "version": "4.1.2", "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-4.1.2.tgz", @@ -2146,6 +2180,11 @@ "tweetnacl": "0.14.5" } }, + "stream-transform": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/stream-transform/-/stream-transform-1.0.2.tgz", + "integrity": "sha512-LNcZSF01PZ+bM0OqwPY7UHPiKoxSmLGHAcqakvh01DCU98ONEslLORdyBPdmTqjTpZSfCiaYLV4sci9y5M47oA==" + }, "string-width": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz", diff --git a/package.json b/package.json index 57db394..9d79f5e 100644 --- a/package.json +++ b/package.json @@ -4,7 +4,8 @@ "description": "Reddit user post dump tool with versatile saving patterns and fallback searches for deleted users.", "main": "app.js", "scripts": { - "start": "node src/app.js" + "start": "node src/app.js", + "inspect": "node --inspect src/app.js" }, "pkg": { "scripts": "src/app.js", @@ -31,6 +32,7 @@ "bluebird": "^3.5.1", "cheerio": "^1.0.0-rc.2", "config": "^1.30.0", + "csv": "^3.1.0", "date-fns": "^1.29.0", "dist-exiftool": "^10.53.0", "fluent-ffmpeg": "^2.1.2", diff --git a/src/app.js b/src/app.js index ec784e9..c5635ee 100644 --- a/src/app.js +++ b/src/app.js @@ -19,7 +19,7 @@ const curatePosts = require('./curate/posts.js'); const interpolate = require('./interpolate.js'); const attachContentInfo = require('./fetch/info.js'); -const fetchContent = require('./fetch/content.js'); +const fetchSaveContent = require('./fetch/content.js'); const getPosts = require('./sources/getPosts.js')(reddit, args); const getUserPosts = require('./sources/getUserPosts.js')(reddit, args); @@ -42,7 +42,7 @@ Promise.resolve().then(async () => { const infoUserPosts = await attachContentInfo(curatedUserPosts); await ep.open(); - await Promise.all(Object.values(infoUserPosts).map(user => fetchContent(user, ep))); + await Promise.all(Object.values(infoUserPosts).map(user => fetchSaveContent(user, ep))); await ep.close(); }).catch(error => { return console.error(error); diff --git a/src/fetch/content.js b/src/fetch/content.js index 950ee6c..302711d 100644 --- a/src/fetch/content.js +++ b/src/fetch/content.js @@ -8,9 +8,10 @@ const save = require('../save/save.js'); const textToStream = require('../save/textToStream.js'); const saveMeta = require('../save/meta.js'); const mux = require('../save/mux.js'); +const writeToIndex = require('../save/writeToIndex.js'); async function getStreams(item, post) { - if(item.self) { + if (item.self) { return [textToStream(item.text)]; } @@ -24,9 +25,29 @@ async function getStreams(item, post) { return null; } -async function fetchContent(user, ep) { - await Promise.map(user.posts, async (post) => { - const items = await Promise.reduce(post.content.items, async (accItems, originalItem, index) => { +async function addMeta(filepath, ep, item, post, user) { + const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => { + const interpolatedValue = interpolate(value, user, post, item); + + return interpolatedValue ? { ...acc, [key]: interpolatedValue } : acc; + }, {}); + + if (Object.keys(meta).length > 0) { + await saveMeta(filepath, meta, ep); + } +} + +function getFilepath(item, post, user) { + const type = item.type.split('/')[0]; + + return post.content.album + ? interpolate(config.library.album[type], user, post, item) + : interpolate(config.library[type], user, post, item); +} + +async function fetchSaveContent(user, ep) { + const posts = await Promise.map(user.posts, async (post) => { + await Promise.reduce(post.content.items, async (accItems, originalItem, index) => { const item = { ...originalItem, index }; const streams = await getStreams(item, post); @@ -35,36 +56,22 @@ async function fetchContent(user, ep) { return accItems; } - const type = item.type.split('/')[0]; - const filepath = post.content.album - ? interpolate(config.library.album[type], user, post, item) - : interpolate(config.library[type], user, post, item); - + const filepath = getFilepath(item, post, user); const sourcePaths = await save(filepath, streams, item, post); if (item.mux) { await mux(filepath, sourcePaths, item); } - const meta = Object.entries(config.library.meta).reduce((acc, [key, value]) => { - const interpolatedValue = interpolate(value, user, post, item); - - return interpolatedValue ? { ...acc, [key]: interpolatedValue } : acc; - }, {}); - - if (Object.keys(meta).length > 0) { - await saveMeta(filepath, meta, ep); - } + await addMeta(filepath, ep, item, post, user); return sourcePaths; }, []); - console.log(items); - - const filename = interpolate(config.library.index.file, user, post); - const entry = `${interpolate(config.library.index.entry, user, post, null, false)}\n`; - - await fs.appendFile(filename, config.library.index.unique ? `${post.hash} ${entry}` : entry); + return post; }); + + return writeToIndex(posts, user); } -module.exports = fetchContent; + +module.exports = fetchSaveContent; diff --git a/src/interpolate.js b/src/interpolate.js index e52c2a0..6eb7aeb 100644 --- a/src/interpolate.js +++ b/src/interpolate.js @@ -6,14 +6,12 @@ const url = require('url'); const dateFns = require('date-fns'); const mime = require('mime-types'); -function interpolate(pattern, user, post, item, strip = true) { - const dateFormat = config.library.dateFormat || 'YYYYMMDD'; - +function interpolate(pattern, user, post, item, strip = true, dateFormat = config.library.dateFormat) { const vars = { - $base: config.library.base + $base: config.library.base, }; - if(user) { + if (user) { Object.assign(vars, { $user: user.name, $username: user.name, @@ -24,40 +22,41 @@ function interpolate(pattern, user, post, item, strip = true) { $userGold: user.gold ? config.library.booleans.gold : '', }); - if(user.profile) { + if (user.profile) { Object.assign(vars, { $profileId: user.profile.id, $profileTitle: user.profile.title, $profileDescription: user.profile.description, - $profileOver18: user.profile.over18 ? config.library.booleans.over18 : '' + $profileOver18: user.profile.over18 ? config.library.booleans.over18 : '', }); } } - if(post) { + if (post) { Object.assign(vars, { $postId: post.id, $postTitle: (post.title || '').slice(0, config.library.titleLength), $postUser: post.user, $postDate: dateFns.format(post.datetime, dateFormat), $postIndex: post.index + config.library.indexOffset, + $postHash: post.hash, $url: post.url, $subreddit: post.subreddit, $hostLabel: post.host.label, - $hostId: post.host.id + $hostId: post.host.id, }); - if(post.content.album) { + if (post.content.album) { Object.assign(vars, { $albumId: post.content.album.id, $albumTitle: (post.content.album.title || '').slice(0, config.library.titleLength), $albumDescription: post.content.album.description, - $albumDate: dateFns.format(post.content.album.datetime, dateFormat) + $albumDate: dateFns.format(post.content.album.datetime, dateFormat), }); } } - if(item) { + if (item) { Object.assign(vars, { $itemId: item.id, $itemTitle: (item.title || '').slice(0, config.library.titleLength), @@ -66,18 +65,18 @@ function interpolate(pattern, user, post, item, strip = true) { $itemIndex: item.index + config.library.indexOffset, $extracted: item.extracted ? config.library.booleans.extracted : '', $preview: item.preview ? config.library.booleans.preview : '', - $ext: item.type ? `.${mime.extension(item.type)}` : path.extname(url.parse(item.url).pathname) + $ext: item.type ? `.${mime.extension(item.type)}` : path.extname(url.parse(item.url).pathname), }); } - return Object.entries(vars).reduce((acc, [key, value], index) => { + return Object.entries(vars).reduce((acc, [key, value]) => { // substitute slashes for filesystem compatability - if(key !== '$base' && strip) { + if (key !== '$base' && strip) { value = (value || '').toString().replace(/\//g, config.library.slashSubstitute); } return acc.replace(key, value); }, pattern); -}; +} module.exports = interpolate;