From e01979a757939a29bbbf842a3edbec940e79608f Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 11 Sep 2024 05:16:55 +0200 Subject: [PATCH] Added include and exclude source arguments. Improved stream fetch failure handling and improved logging details. --- README.md | 2 ++ config/default.js | 1 + src/app.js | 2 +- src/cli.js | 6 ++++++ src/curate/posts.js | 14 +++++++++++--- src/fetch/content.js | 14 ++++++++++---- src/fetch/item.js | 19 +++++++++++++------ src/save/save.js | 2 +- 8 files changed, 45 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9950dc6..dfd1849 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ reddit-post-dump requires a arbitrarily recent version of Node.js. Before use, d * `--limit `: Maximum amount posts per user to fetch content from. Limit is applied after fltering out ignored, cross- and reposts. * `--sort `: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included. * `--ignore [...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`. +* `--exclude [...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`. +* `--include [...]`: Only include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--exclude`. 
### Examples * `npm start -- --user AWildSketchAppeared` diff --git a/config/default.js b/config/default.js index bb52e77..b1f6458 100644 --- a/config/default.js +++ b/config/default.js @@ -34,6 +34,7 @@ module.exports = { sort: 'new', limit: 1000, avoidDuplicates: true, + retries: 3, archives: { search: false, preview: true, diff --git a/src/app.js b/src/app.js index 8373636..b1a8ac5 100644 --- a/src/app.js +++ b/src/app.js @@ -38,7 +38,7 @@ Promise.resolve().then(() => { return userPosts; }).then(posts => { - return curatePosts(posts, args.ignore).slice(0, args.limit); + return curatePosts(posts, args).slice(0, args.limit); }).then(posts => { return attachContentInfo(posts); }).then(posts => { diff --git a/src/cli.js b/src/cli.js index f6a6f5b..470b15f 100644 --- a/src/cli.js +++ b/src/cli.js @@ -23,6 +23,12 @@ module.exports = yargs.command('npm start -- --user ').option('users', describe: 'Ignore posts with any of these properties', type: 'array', choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'] +}).option('include', { + describe: 'Include only these sources', + type: 'array' +}).option('exclude', { + describe: 'Do not include these sources', + type: 'array' }).option('archives', { describe: 'Search archives for deleted posts', type: 'boolean' diff --git a/src/curate/posts.js b/src/curate/posts.js index 2fabc61..b176379 100644 --- a/src/curate/posts.js +++ b/src/curate/posts.js @@ -3,12 +3,14 @@ const config = require('config'); const dissectLink = require('../dissectLink.js'); -function curatePosts(posts, ignore) { +function curatePosts(posts, args) { const processed = new Set(); return posts.reduce((acc, post, index) => { const host = dissectLink(post.url); - const ignoring = ignore ? ignore.find(prop => { + post.permalink = 'https://reddit.com' + post.permalink; + + const ignoring = args.ignore ? 
args.ignore.find(prop => { return post[prop]; }) : null; @@ -25,6 +27,12 @@ function curatePosts(posts, ignore) { return acc; } + if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) { + console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${post.permalink})`); + + return acc; + } + processed.add(host.id); } @@ -34,7 +42,7 @@ function curatePosts(posts, ignore) { title: post.title, text: post.selftext, user: post.user, - permalink: 'https://reddit.com' + post.permalink, + permalink: post.permalink, url: post.url, datetime: new Date(post.created_utc * 1000), subreddit: post.subreddit.display_name, diff --git a/src/fetch/content.js b/src/fetch/content.js index ccfbfb5..fb4b415 100644 --- a/src/fetch/content.js +++ b/src/fetch/content.js @@ -32,9 +32,15 @@ module.exports = function(posts) { const sources = item.mux ? [item.url].concat(item.mux) : [item.url]; return Promise.all(sources.map(source => { - return fetchItem(source, 0); - })).then(streams => Object.assign({}, item, {streams})); - })).then(items => { + return fetchItem(source, 0, post); + })).then(streams => { + if(streams.filter(stream => stream).length > 0) { + return Object.assign({}, item, {streams}); + } + + return null; + }); + })).then(items => items.filter(item => item)).then(items => { return Promise.all(items.map(item => { const type = item.type.split('/')[0]; const filepath = post.content.album ? 
interpolate(config.library.album[type], post.user, post, item) : interpolate(config.library[type], post.user, post, item); @@ -42,7 +48,7 @@ module.exports = function(posts) { return Promise.resolve().then(() => { return fs.ensureDir(path.dirname(filepath)); }).then(() => { - return save(filepath, item.streams || item.stream, item); + return save(filepath, item.streams || item.stream, item, post); }).then(sourcePaths => { if(item.mux) { return mux(filepath, sourcePaths, item); diff --git a/src/fetch/item.js b/src/fetch/item.js index 7cd0408..ccea093 100644 --- a/src/fetch/item.js +++ b/src/fetch/item.js @@ -1,22 +1,29 @@ 'use strict'; +const config = require('config'); const fetch = require('node-fetch'); -function fetchItem(url, attempt) { +function fetchItem(url, attempt, post) { function retry(error) { - console.log(error); + console.log('\x1b[31m%s\x1b[0m', `Failed to fetch '${url}': ${error.message} (${post.permalink})`); - if(attempt < 3) { + if(attempt < config.fetch.retries) { console.log('Retrying...'); - return fetchItem(url, ++attempt); + return fetchItem(url, ++attempt, post); } + + return null; }; return fetch(url).then(res => { - return res.ok ? res : Promise.reject(`Failed to fetch ${url}`); + if(!res.ok) { + throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`); + } + + return res; }).then(res => { - console.log(`Fetched '${url}'`); + console.log(`Fetched '${url}' (${post.permalink})`); return res.body; }).catch(retry); diff --git a/src/save/save.js b/src/save/save.js index 0598c96..2b26285 100644 --- a/src/save/save.js +++ b/src/save/save.js @@ -4,7 +4,7 @@ const fs = require('fs-extra'); const path = require('path'); const ffmpeg = require('fluent-ffmpeg'); -function save(filepath, streams, item) { +function save(filepath, streams, item, post) { const pathComponents = path.parse(filepath); // allow for single stream argument