Added include and exclude source arguments. Improved stream fetch failure handling and improved logging details.
This commit is contained in:
parent
0a2fac5f1f
commit
e01979a757
|
@ -41,6 +41,8 @@ reddit-post-dump requires an arbitrarily recent version of Node.js. Before use, d
|
||||||
* `--limit <number>`: Maximum number of posts per user to fetch content from. The limit is applied after filtering out ignored posts, cross-posts and reposts.
|
* `--limit <number>`: Maximum number of posts per user to fetch content from. The limit is applied after filtering out ignored posts, cross-posts and reposts.
|
||||||
* `--sort <method>`: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included.
|
* `--sort <method>`: How posts should be sorted while fetched. This affects the `$postIndex` variable, and in combination with a `--limit` decides what posts will be included.
|
||||||
* `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`.
|
* `--ignore <prop> [<prop>...]`: Ignore posts with any of the following properties: `pinned`, `stickied`, `hidden`, `over_18`, `spoiler`.
|
||||||
|
* `--exclude <source> [<source>...]`: Do not include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--include`.
|
||||||
|
* `--include <source> [<source>...]`: Only include posts from these sources (e.g. `self`, `reddit`, `imgur`, `gfycat`, ...). Should not be used in combination with `--exclude`.
|
||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
* `npm start -- --user AWildSketchAppeared`
|
* `npm start -- --user AWildSketchAppeared`
|
||||||
|
|
|
@ -34,6 +34,7 @@ module.exports = {
|
||||||
sort: 'new',
|
sort: 'new',
|
||||||
limit: 1000,
|
limit: 1000,
|
||||||
avoidDuplicates: true,
|
avoidDuplicates: true,
|
||||||
|
retries: 3,
|
||||||
archives: {
|
archives: {
|
||||||
search: false,
|
search: false,
|
||||||
preview: true,
|
preview: true,
|
||||||
|
|
|
@ -38,7 +38,7 @@ Promise.resolve().then(() => {
|
||||||
|
|
||||||
return userPosts;
|
return userPosts;
|
||||||
}).then(posts => {
|
}).then(posts => {
|
||||||
return curatePosts(posts, args.ignore).slice(0, args.limit);
|
return curatePosts(posts, args).slice(0, args.limit);
|
||||||
}).then(posts => {
|
}).then(posts => {
|
||||||
return attachContentInfo(posts);
|
return attachContentInfo(posts);
|
||||||
}).then(posts => {
|
}).then(posts => {
|
||||||
|
|
|
@ -23,6 +23,12 @@ module.exports = yargs.command('npm start -- --user <username>').option('users',
|
||||||
describe: 'Ignore posts with any of these properties',
|
describe: 'Ignore posts with any of these properties',
|
||||||
type: 'array',
|
type: 'array',
|
||||||
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18']
|
choices: ['pinned', 'stickied', 'hidden', 'spoiler', 'over_18']
|
||||||
|
}).option('include', {
|
||||||
|
describe: 'Include only these sources',
|
||||||
|
type: 'array'
|
||||||
|
}).option('exclude', {
|
||||||
|
describe: 'Do not include these sources',
|
||||||
|
type: 'array'
|
||||||
}).option('archives', {
|
}).option('archives', {
|
||||||
describe: 'Search archives for deleted posts',
|
describe: 'Search archives for deleted posts',
|
||||||
type: 'boolean'
|
type: 'boolean'
|
||||||
|
|
|
@ -3,12 +3,14 @@
|
||||||
const config = require('config');
|
const config = require('config');
|
||||||
const dissectLink = require('../dissectLink.js');
|
const dissectLink = require('../dissectLink.js');
|
||||||
|
|
||||||
function curatePosts(posts, ignore) {
|
function curatePosts(posts, args) {
|
||||||
const processed = new Set();
|
const processed = new Set();
|
||||||
|
|
||||||
return posts.reduce((acc, post, index) => {
|
return posts.reduce((acc, post, index) => {
|
||||||
const host = dissectLink(post.url);
|
const host = dissectLink(post.url);
|
||||||
const ignoring = ignore ? ignore.find(prop => {
|
post.permalink = 'https://reddit.com' + post.permalink;
|
||||||
|
|
||||||
|
const ignoring = args.ignore ? args.ignore.find(prop => {
|
||||||
return post[prop];
|
return post[prop];
|
||||||
}) : null;
|
}) : null;
|
||||||
|
|
||||||
|
@ -25,6 +27,12 @@ function curatePosts(posts, ignore) {
|
||||||
return acc;
|
return acc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if((args.include && !args.include.includes(host.label)) || (args.exclude && args.exclude.includes(host.label))) {
|
||||||
|
console.log('\x1b[33m%s\x1b[0m', `Ignoring source '${host.label}' from post '${post.url}' (${post.permalink})`);
|
||||||
|
|
||||||
|
return acc;
|
||||||
|
}
|
||||||
|
|
||||||
processed.add(host.id);
|
processed.add(host.id);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -34,7 +42,7 @@ function curatePosts(posts, ignore) {
|
||||||
title: post.title,
|
title: post.title,
|
||||||
text: post.selftext,
|
text: post.selftext,
|
||||||
user: post.user,
|
user: post.user,
|
||||||
permalink: 'https://reddit.com' + post.permalink,
|
permalink: post.permalink,
|
||||||
url: post.url,
|
url: post.url,
|
||||||
datetime: new Date(post.created_utc * 1000),
|
datetime: new Date(post.created_utc * 1000),
|
||||||
subreddit: post.subreddit.display_name,
|
subreddit: post.subreddit.display_name,
|
||||||
|
|
|
@ -32,9 +32,15 @@ module.exports = function(posts) {
|
||||||
const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
|
const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
|
||||||
|
|
||||||
return Promise.all(sources.map(source => {
|
return Promise.all(sources.map(source => {
|
||||||
return fetchItem(source, 0);
|
return fetchItem(source, 0, post);
|
||||||
})).then(streams => Object.assign({}, item, {streams}));
|
})).then(streams => {
|
||||||
})).then(items => {
|
if(streams.filter(stream => stream).length > 0) {
|
||||||
|
Object.assign({}, item, {streams})
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
})).then(items => items.filter(item => item)).then(items => {
|
||||||
return Promise.all(items.map(item => {
|
return Promise.all(items.map(item => {
|
||||||
const type = item.type.split('/')[0];
|
const type = item.type.split('/')[0];
|
||||||
const filepath = post.content.album ? interpolate(config.library.album[type], post.user, post, item) : interpolate(config.library[type], post.user, post, item);
|
const filepath = post.content.album ? interpolate(config.library.album[type], post.user, post, item) : interpolate(config.library[type], post.user, post, item);
|
||||||
|
@ -42,7 +48,7 @@ module.exports = function(posts) {
|
||||||
return Promise.resolve().then(() => {
|
return Promise.resolve().then(() => {
|
||||||
return fs.ensureDir(path.dirname(filepath));
|
return fs.ensureDir(path.dirname(filepath));
|
||||||
}).then(() => {
|
}).then(() => {
|
||||||
return save(filepath, item.streams || item.stream, item);
|
return save(filepath, item.streams || item.stream, item, post);
|
||||||
}).then(sourcePaths => {
|
}).then(sourcePaths => {
|
||||||
if(item.mux) {
|
if(item.mux) {
|
||||||
return mux(filepath, sourcePaths, item);
|
return mux(filepath, sourcePaths, item);
|
||||||
|
|
|
@ -1,22 +1,29 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const config = require('config');
|
||||||
const fetch = require('node-fetch');
|
const fetch = require('node-fetch');
|
||||||
|
|
||||||
function fetchItem(url, attempt) {
|
function fetchItem(url, attempt, post) {
|
||||||
function retry(error) {
|
function retry(error) {
|
||||||
console.log(error);
|
console.log('\x1b[31m%s\x1b[0m', `Failed to fetch '${url}': ${error.message} (${post.permalink})`);
|
||||||
|
|
||||||
if(attempt < 3) {
|
if(attempt < config.fetch.retries) {
|
||||||
console.log('Retrying...');
|
console.log('Retrying...');
|
||||||
|
|
||||||
return fetchItem(url, ++attempt);
|
return fetchItem(url, ++attempt, post);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
};
|
};
|
||||||
|
|
||||||
return fetch(url).then(res => {
|
return fetch(url).then(res => {
|
||||||
return res.ok ? res : Promise.reject(`Failed to fetch ${url}`);
|
if(!res.ok) {
|
||||||
|
throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return res;
|
||||||
}).then(res => {
|
}).then(res => {
|
||||||
console.log(`Fetched '${url}'`);
|
console.log(`Fetched '${url}' (${post.permalink})`);
|
||||||
|
|
||||||
return res.body;
|
return res.body;
|
||||||
}).catch(retry);
|
}).catch(retry);
|
||||||
|
|
|
@ -4,7 +4,7 @@ const fs = require('fs-extra');
|
||||||
const path = require('path');
|
const path = require('path');
|
||||||
const ffmpeg = require('fluent-ffmpeg');
|
const ffmpeg = require('fluent-ffmpeg');
|
||||||
|
|
||||||
function save(filepath, streams, item) {
|
function save(filepath, streams, item, post) {
|
||||||
const pathComponents = path.parse(filepath);
|
const pathComponents = path.parse(filepath);
|
||||||
|
|
||||||
// allow for single stream argument
|
// allow for single stream argument
|
||||||
|
|
Loading…
Reference in New Issue