Added include and exclude source arguments. Improved stream fetch failure handling and logging details.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:55 +02:00
parent 0a2fac5f1f
commit e01979a757
8 changed files with 45 additions and 15 deletions

View File

@ -41,6 +41,8 @@ reddit-post-dump requires a arbitrarily recent version of Node.js. Before use, d
* `--limit <number>`: Maximum amount of posts per user to fetch content from. Limit is applied after filtering out ignored, cross- and reposts. * `--limit <number>`: Maximum amount of posts per user to fetch content from. Limit is applied after filtering out ignored, cross- and reposts.
* `--sort <method>`: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included. * `--sort <method>`: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included.
* `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`. * `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`.
* `--exclude <source> [<source>...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`.
* `--include <source> [<source>...]`: Only include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--exclude`.
### Examples ### Examples
* `npm start -- --user AWildSketchAppeared` * `npm start -- --user AWildSketchAppeared`

View File

@ -34,6 +34,7 @@ module.exports = {
sort: 'new', sort: 'new',
limit: 1000, limit: 1000,
avoidDuplicates: true, avoidDuplicates: true,
retries: 3,
archives: { archives: {
search: false, search: false,
preview: true, preview: true,

View File

@ -38,7 +38,7 @@ Promise.resolve().then(() => {
return userPosts; return userPosts;
}).then(posts => { }).then(posts => {
return curatePosts(posts, args.ignore).slice(0, args.limit); return curatePosts(posts, args).slice(0, args.limit);
}).then(posts => { }).then(posts => {
return attachContentInfo(posts); return attachContentInfo(posts);
}).then(posts => { }).then(posts => {

View File

@ -23,6 +23,12 @@ module.exports = yargs.command('npm start -- --user <username>').option('users',
describe: 'Ignore posts with any of these properties', describe: 'Ignore posts with any of these properties',
type: 'array', type: 'array',
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18'] choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18']
}).option('include', {
describe: 'Include only these sources',
type: 'array'
}).option('exclude', {
describe: 'Do not include these sources',
type: 'array'
}).option('archives', { }).option('archives', {
describe: 'Search archives for deleted posts', describe: 'Search archives for deleted posts',
type: 'boolean' type: 'boolean'

View File

@ -3,12 +3,14 @@
const config = require('config'); const config = require('config');
const dissectLink = require('../dissectLink.js'); const dissectLink = require('../dissectLink.js');
function curatePosts(posts, ignore) { function curatePosts(posts, args) {
const processed = new Set(); const processed = new Set();
return posts.reduce((acc, post, index) => { return posts.reduce((acc, post, index) => {
const host = dissectLink(post.url); const host = dissectLink(post.url);
const ignoring = ignore ? ignore.find(prop => { post.permalink = 'https://reddit.com' + post.permalink;
const ignoring = args.ignore ? args.ignore.find(prop => {
return post[prop]; return post[prop];
}) : null; }) : null;
@ -25,6 +27,12 @@ function curatePosts(posts, ignore) {
return acc; return acc;
} }
if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${post.permalink})`);
return acc;
}
processed.add(host.id); processed.add(host.id);
} }
@ -34,7 +42,7 @@ function curatePosts(posts, ignore) {
title: post.title, title: post.title,
text: post.selftext, text: post.selftext,
user: post.user, user: post.user,
permalink: 'https://reddit.com' + post.permalink, permalink: post.permalink,
url: post.url, url: post.url,
datetime: new Date(post.created_utc * 1000), datetime: new Date(post.created_utc * 1000),
subreddit: post.subreddit.display_name, subreddit: post.subreddit.display_name,

View File

@ -32,9 +32,15 @@ module.exports = function(posts) {
const sources = item.mux ? [item.url].concat(item.mux) : [item.url]; const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
return Promise.all(sources.map(source => { return Promise.all(sources.map(source => {
return fetchItem(source, 0); return fetchItem(source, 0, post);
})).then(streams => Object.assign({}, item, {streams})); })).then(streams => {
})).then(items => { if(streams.filter(stream => stream).length > 0) {
Object.assign({}, item, {streams})
}
return null;
});
})).then(items => items.filter(item => item)).then(items => {
return Promise.all(items.map(item => { return Promise.all(items.map(item => {
const type = item.type.split('/')[0]; const type = item.type.split('/')[0];
const filepath = post.content.album ? interpolate(config.library.album[type], post.user, post, item) : interpolate(config.library[type], post.user, post, item); const filepath = post.content.album ? interpolate(config.library.album[type], post.user, post, item) : interpolate(config.library[type], post.user, post, item);
@ -42,7 +48,7 @@ module.exports = function(posts) {
return Promise.resolve().then(() => { return Promise.resolve().then(() => {
return fs.ensureDir(path.dirname(filepath)); return fs.ensureDir(path.dirname(filepath));
}).then(() => { }).then(() => {
return save(filepath, item.streams || item.stream, item); return save(filepath, item.streams || item.stream, item, post);
}).then(sourcePaths => { }).then(sourcePaths => {
if(item.mux) { if(item.mux) {
return mux(filepath, sourcePaths, item); return mux(filepath, sourcePaths, item);

View File

@ -1,22 +1,29 @@
'use strict'; 'use strict';
const config = require('config');
const fetch = require('node-fetch'); const fetch = require('node-fetch');
function fetchItem(url, attempt) { function fetchItem(url, attempt, post) {
function retry(error) { function retry(error) {
console.log(error); console.log('\x1b[31m%s\x1b[0m', `Failed to fetch '${url}': ${error.message} (${post.permalink})`);
if(attempt < 3) { if(attempt < config.fetch.retries) {
console.log('Retrying...'); console.log('Retrying...');
return fetchItem(url, ++attempt); return fetchItem(url, ++attempt, post);
} }
return null;
}; };
return fetch(url).then(res => { return fetch(url).then(res => {
return res.ok ? res : Promise.reject(`Failed to fetch ${url}`); if(!res.ok) {
throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`);
}
return res;
}).then(res => { }).then(res => {
console.log(`Fetched '${url}'`); console.log(`Fetched '${url}' (${post.permalink})`);
return res.body; return res.body;
}).catch(retry); }).catch(retry);

View File

@ -4,7 +4,7 @@ const fs = require('fs-extra');
const path = require('path'); const path = require('path');
const ffmpeg = require('fluent-ffmpeg'); const ffmpeg = require('fluent-ffmpeg');
function save(filepath, streams, item) { function save(filepath, streams, item, post) {
const pathComponents = path.parse(filepath); const pathComponents = path.parse(filepath);
// allow for single stream argument // allow for single stream argument