Added include and exclude source arguments. Improved stream fetch failure handling and logging details.

DebaucheryLibrarian 2024-09-11 05:16:55 +02:00
parent 0a2fac5f1f
commit e01979a757
8 changed files with 45 additions and 15 deletions

View File

@@ -41,6 +41,8 @@ reddit-post-dump requires an arbitrarily recent version of Node.js. Before use, d
* `--limit <number>`: Maximum number of posts per user to fetch content from. The limit is applied after filtering out ignored posts, crossposts, and reposts.
* `--sort <method>`: How posts are sorted when fetched. This affects the `$postIndex` variable and, in combination with `--limit`, determines which posts are included.
* `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`.
+* `--exclude <source> [<source>...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`.
+* `--include <source> [<source>...]`: Only include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--exclude`.
### Examples
* `npm start -- --user AWildSketchAppeared`
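The new flags follow the same invocation pattern as the existing example. A hypothetical run that skips self posts and Reddit-hosted content, composed from the flags and example sources documented above:
* `npm start -- --user AWildSketchAppeared --exclude self reddit`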

View File

@@ -34,6 +34,7 @@ module.exports = {
sort: 'new',
limit: 1000,
avoidDuplicates: true,
+retries: 3,
archives: {
search: false,
preview: true,
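`fetchItem` below reads the new value as `config.fetch.retries`, which suggests this hunk sits inside a `fetch` block of the config module. A minimal sketch of the assumed shape (the nesting is inferred from that usage, not shown in this diff):

```js
// config: assumed structure; only `retries` is new in this commit
module.exports = {
    fetch: {
        sort: 'new',
        limit: 1000,
        avoidDuplicates: true,
        retries: 3, // max re-attempts per failed fetch (see fetchItem)
        archives: {
            search: false,
            preview: true,
        },
    },
};
```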

View File

@@ -38,7 +38,7 @@ Promise.resolve().then(() => {
return userPosts;
}).then(posts => {
-return curatePosts(posts, args.ignore).slice(0, args.limit);
+return curatePosts(posts, args).slice(0, args.limit);
}).then(posts => {
return attachContentInfo(posts);
}).then(posts => {

View File

@@ -23,6 +23,12 @@ module.exports = yargs.command('npm start -- --user <username>').option('users',
describe: 'Ignore posts with any of these properties',
type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18']
+}).option('include', {
+describe: 'Include only these sources',
+type: 'array'
+}).option('exclude', {
+describe: 'Do not include these sources',
+type: 'array'
}).option('archives', {
describe: 'Search archives for deleted posts',
type: 'boolean'
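Nothing in this hunk stops a user from passing both flags, even though the README warns against combining them. yargs can enforce that mutual exclusion itself; a hypothetical sketch, not part of this commit:

```js
const yargs = require('yargs');

// Hypothetical: .conflicts() makes yargs reject --include combined with --exclude
module.exports = yargs
    .option('include', { describe: 'Include only these sources', type: 'array' })
    .option('exclude', { describe: 'Do not include these sources', type: 'array' })
    .conflicts('include', 'exclude')
    .argv;
```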

View File

@@ -3,12 +3,14 @@
const config = require('config');
const dissectLink = require('../dissectLink.js');
-function curatePosts(posts, ignore) {
+function curatePosts(posts, args) {
const processed = new Set();
return posts.reduce((acc, post, index) => {
const host = dissectLink(post.url);
-const ignoring = ignore ? ignore.find(prop => {
+post.permalink = 'https://reddit.com' + post.permalink;
+const ignoring = args.ignore ? args.ignore.find(prop => {
return post[prop];
}) : null;
@@ -25,6 +27,12 @@ function curatePosts(posts, ignore) {
return acc;
}
+if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
+console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${post.permalink})`);
+return acc;
+}
processed.add(host.id);
}
@@ -34,7 +42,7 @@ function curatePosts(posts, ignore) {
title: post.title,
text: post.selftext,
user: post.user,
-permalink: 'https://reddit.com' + post.permalink,
+permalink: post.permalink,
url: post.url,
datetime: new Date(post.created_utc * 1000),
subreddit: post.subreddit.display_name,
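The new include/exclude condition packs a whitelist check and a blacklist check into one expression. The same logic isolated as a predicate, a sketch with an illustrative helper name that is not in the source:

```js
// True when a post's source label should be skipped.
// `include` acts as a whitelist, `exclude` as a blacklist.
function isExcludedSource(label, { include, exclude }) {
    const notWhitelisted = Boolean(include) && !include.includes(label);
    const blacklisted = Boolean(exclude) && exclude.includes(label);
    return notWhitelisted || blacklisted;
}
```

With a helper like this, the guard in `curatePosts` would reduce to `if (isExcludedSource(host.label, args)) { return acc; }`.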

View File

@@ -32,9 +32,15 @@ module.exports = function(posts) {
const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
return Promise.all(sources.map(source => {
-return fetchItem(source, 0);
-})).then(streams => Object.assign({}, item, {streams}));
-})).then(items => {
+return fetchItem(source, 0, post);
+})).then(streams => {
+if(streams.filter(stream => stream).length > 0) {
+return Object.assign({}, item, {streams});
+}
+return null;
+});
+})).then(items => items.filter(item => item)).then(items => {
return Promise.all(items.map(item => {
const type = item.type.split('/')[0];
const filepath = post.content.album ? interpolate(config.library.album[type], post.user, post, item) : interpolate(config.library[type], post.user, post, item);
@@ -42,7 +48,7 @@ module.exports = function(posts) {
return Promise.resolve().then(() => {
return fs.ensureDir(path.dirname(filepath));
}).then(() => {
-return save(filepath, item.streams || item.stream, item);
+return save(filepath, item.streams || item.stream, item, post);
}).then(sourcePaths => {
if(item.mux) {
return mux(filepath, sourcePaths, item);
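The revised flow follows a common resilience pattern: fetch all sources in parallel, let individual failures resolve to null instead of rejecting the whole batch, then drop the nulls before further processing. A condensed sketch of the pattern (function names are illustrative):

```js
// Fetch every URL; fetchOne is expected to resolve to null on failure.
function fetchAll(urls, fetchOne) {
    return Promise.all(urls.map(url => fetchOne(url)))
        .then(results => results.filter(result => result)); // keep successful fetches only
}
```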

View File

@@ -1,22 +1,29 @@
'use strict';
const config = require('config');
const fetch = require('node-fetch');
-function fetchItem(url, attempt) {
+function fetchItem(url, attempt, post) {
function retry(error) {
-console.log(error);
+console.log('\x1b[31m%s\x1b[0m', `Failed to fetch '${url}': ${error.message} (${post.permalink})`);
-if(attempt < 3) {
+if(attempt < config.fetch.retries) {
console.log('Retrying...');
-return fetchItem(url, ++attempt);
+return fetchItem(url, ++attempt, post);
}
return null;
};
return fetch(url).then(res => {
-return res.ok ? res : Promise.reject(`Failed to fetch ${url}`);
+if(!res.ok) {
+throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`);
+}
+return res;
}).then(res => {
-console.log(`Fetched '${url}'`);
+console.log(`Fetched '${url}' (${post.permalink})`);
return res.body;
}).catch(retry);
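`fetchItem` implements its retry by calling itself with an incremented attempt counter. The same bounded-retry idea as a generic helper, a sketch assuming the operation rejects on failure (names are illustrative):

```js
// Run an async operation, retrying up to `retries` times, resolving to null when exhausted.
function withRetries(operation, retries) {
    return operation().catch(error => {
        if (retries > 0) {
            console.log(`Retrying (${retries} attempts left): ${error.message}`);
            return withRetries(operation, retries - 1);
        }
        return null; // give up; callers filter out null results
    });
}
```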

View File

@@ -4,7 +4,7 @@ const fs = require('fs-extra');
const path = require('path');
const ffmpeg = require('fluent-ffmpeg');
-function save(filepath, streams, item) {
+function save(filepath, streams, item, post) {
const pathComponents = path.parse(filepath);
// allow for single stream argument