
Added reddit user and subreddit feeds.

Branch: master · ThePendulum, 3 years ago · commit e3163dc9cb
 .editorconfig          |   14
 .eslintrc              |   20
 .gitignore             |    1
 .nvmrc                 |    1
 config/default.js      |    6
 package-lock.json      | 5242
 package.json           |   31
 src/app.js             |   35
 src/args.js            |   77
 src/content/content.js |    7
 src/content/redgifs.js |   21
 src/feeds/feeds.js     |   10
 src/feeds/reddit.js    |   51
 src/logger.js          |   39
 src/utils/http.js      |  107
 15 files changed

.editorconfig (+14)

@@ -0,0 +1,14 @@
# top-most EditorConfig file
root = true

# Unix-style newlines with a newline ending every file
[*]
end_of_line = lf
insert_final_newline = true
indent_style = tab
indent_size = 4

# Set default charset for JavaScript files
[*.js]
charset = utf-8

.eslintrc (+20)

@@ -0,0 +1,20 @@
{
	"extends": "airbnb-base",
	"parserOptions": {
		"parser": "babel-eslint",
		"sourceType": "script",
		"ecmaVersion": 2020
	},
	"rules": {
		"strict": 0,
		"indent": ["error", "tab"],
		"no-tabs": "off",
		"no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
		"no-console": 0,
		"no-underscore-dangle": 0,
		"prefer-destructuring": "off",
		"template-curly-spacing": "off",
		"object-curly-newline": "off",
		"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
	}
}

.gitignore (+1)

@@ -2,3 +2,4 @@ node_modules/
 config/*
 !config/default.js
 output/
+log/

.nvmrc (+1)

@@ -0,0 +1 @@
14.13.0

config/default.js (+6)

@@ -0,0 +1,6 @@
module.exports = {
	limits: {
		requestInterval: 1000,
		requestConcurrency: 1,
	},
};
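
The config package merges these defaults with any override files in config/ (which .gitignore keeps untracked apart from default.js); a minimal sketch of a hypothetical override, assuming node-config's standard local.js filename and illustrative values:

// config/local.js (hypothetical; merged over config/default.js by the config package)
module.exports = {
	limits: {
		requestInterval: 2000, // illustrative: slow down to one request every two seconds
	},
};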

package-lock.json (5242 changes)

Diff suppressed because the file is too large.

package.json (+29, -2)

@@ -4,7 +4,9 @@
 	"description": "Content archiver for reddit users, subreddits, YouTube channels, supporting many popular media hosts.",
 	"main": "src/app.js",
 	"scripts": {
-		"test": "echo \"Error: no test specified\" && exit 1"
+		"start": "node src/app.js",
+		"eslint": "eslint src/",
+		"eslint-watch": "esw --watch src/"
 	},
 	"repository": {
 		"type": "git",
@@ -18,5 +20,30 @@
 		"download"
 	],
 	"author": "pendulum",
-	"license": "ISC"
+	"license": "ISC",
+	"dependencies": {
+		"bhttp": "^1.2.6",
+		"blake2": "^4.0.1",
+		"bluebird": "^3.7.2",
+		"bottleneck": "^2.19.5",
+		"config": "^3.3.2",
+		"js-yaml": "^3.14.0",
+		"mime": "^2.4.6",
+		"moment": "^2.29.0",
+		"promise-task-queue": "^1.2.0",
+		"winston": "^3.3.3",
+		"winston-daily-rotate-file": "^4.5.0",
+		"yargs": "^16.0.3"
+	},
+	"devDependencies": {
+		"@babel/cli": "^7.11.6",
+		"@babel/core": "^7.11.6",
+		"@babel/plugin-proposal-optional-chaining": "^7.11.0",
+		"@babel/preset-env": "^7.11.5",
+		"@babel/register": "^7.11.5",
+		"eslint": "^7.10.0",
+		"eslint-config-airbnb": "^18.2.0",
+		"eslint-plugin-import": "^2.22.1",
+		"eslint-watch": "^7.0.0"
+	}
 }

src/app.js (+35)

@@ -0,0 +1,35 @@
'use strict';

// const util = require('util');

const { argv } = require('./args');
const feeds = require('./feeds/feeds');
const content = require('./content/content');

async function init() {
	const postsPerFeed = (await Promise.all(Object.keys(feeds).map(async (feedType) => {
		if (argv[feedType]) {
			return Promise.all(argv[feedType].map(async (channelName) => {
				const posts = await feeds[feedType](channelName);

				return {
					name: channelName,
					type: feedType,
					posts,
				};
			}));
		}

		return null;
	}))).flat().filter(Boolean);

	if (argv.items) {
		const items = await Promise.all(argv.items.map((url) => content.redgifs.fetchContent(url)));

		console.log(items);
	}

	// console.log(util.inspect(itemsPerChannel, null, null));
	console.log(postsPerFeed);
}

init();
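
For reference, init() fans out over every feed type that has arguments, so postsPerFeed resolves to one entry per feed/channel pair; a sketch of the shape under a hypothetical invocation (username illustrative):

// hypothetical result for a run like: node src/app.js --reddit-user someusername
const examplePostsPerFeed = [
	{
		name: 'someusername', // the channel name passed on the command line
		type: 'redditUser', // the matching key from src/feeds/feeds.js
		posts: [], // curated posts returned by src/feeds/reddit.js
	},
];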

src/args.js (+77)

@@ -0,0 +1,77 @@
const yargs = require('yargs');
const moment = require('moment');

function toDate(dateString) {
	if (!dateString) {
		return null;
	}

	if (/\d{2,4}-\d{2}-\d{2,4}/.test(dateString)) {
		// using date
		return moment
			.utc(dateString, ['YYYY-MM-DD', 'DD-MM-YYYY'])
			.toDate();
	}

	// using timespan (e.g. "1 month")
	return moment
		.utc()
		.subtract(...dateString.split(' '))
		.toDate();
}

const args = yargs
	.command('npm start')
	.option('reddit-users', {
		describe: 'Fetch content from a reddit user by username.',
		type: 'array',
		alias: 'reddit-user',
	})
	.option('reddit-posts', {
		describe: 'Fetch content from a reddit post by post ID.',
		type: 'array',
		alias: 'reddit-post',
	})
	.option('reddit-subs', {
		describe: 'Fetch content from a subreddit by subreddit name.',
		alias: ['reddit-sub', 'subreddit', 'subreddits'],
		type: 'array',
	})
	.option('items', {
		describe: 'Fetch media items directly from host URL.',
		type: 'array',
		alias: 'item',
	})
	.option('limit', {
		describe: 'Maximum number of items to fetch content from.',
		type: 'number',
	})
	.option('after', {
		describe: 'Only include items uploaded after this date or timespan.',
		type: 'string',
		default: toDate(),
	})
	.option('before', {
		describe: 'Only include items uploaded before this date or timespan.',
		type: 'string',
		default: toDate(),
	})
	.option('sort', {
		describe: 'How to sort the items before applying limits.',
		type: 'string',
	})
	.option('redownload', {
		describe: 'Ignore index file and force a redownload of every item in the selection.',
		alias: 'force',
		type: 'boolean',
	})
	.option('url')
	.option('log-level', {
		describe: 'Log level',
		type: 'string',
		default: process.env.NODE_ENV === 'development' ? 'silly' : 'info',
	})
	.coerce('after', toDate)
	.coerce('before', toDate);

module.exports = args;
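
toDate() accepts either an explicit date or a relative timespan, so --after and --before take both forms; illustrative calls:

// both --after and --before are coerced through toDate()
toDate('2020-01-15'); // matches the date pattern: parsed as UTC (YYYY-MM-DD or DD-MM-YYYY)
toDate('1 month'); // no date pattern: split into ['1', 'month'] and subtracted from now
toDate(); // no value: returns null, so no date filter applies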

src/content/content.js (+7)

@@ -0,0 +1,7 @@
'use strict';

const redgifs = require('./redgifs');

module.exports = {
	redgifs,
};

src/content/redgifs.js (+21)

@@ -0,0 +1,21 @@
'use strict';

const http = require('../utils/http');

async function fetchContent(url, _post) {
	const res = await http.get(url);

	if (res.ok) {
		// guard against URLs that don't match the /watch/ pattern
		const id = new URL(url).pathname.match(/\/watch\/(\w+)/)?.[1];

		if (!id) {
			return null;
		}

		return {
			src: `https://thcf8.redgifs.com/${id}.webm`,
		};
	}

	return null;
}

module.exports = {
	fetchContent,
};
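
A minimal usage sketch, assuming an illustrative watch URL; note that the webm source is built against the thcf8 CDN host hardcoded above:

// hypothetical usage; the watch URL and clip ID are illustrative
const redgifs = require('./src/content/redgifs');

redgifs.fetchContent('https://www.redgifs.com/watch/exampleclipid')
	.then((item) => console.log(item)); // { src: 'https://thcf8.redgifs.com/exampleclipid.webm' } when the page resolves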

src/feeds/feeds.js (+10)

@@ -0,0 +1,10 @@
'use strict';

const reddit = require('./reddit');

const feeds = {
	// keys correspond to the camelCased argument aliases defined in src/args.js
	redditUser: reddit.fetchUserPosts,
	redditSub: reddit.fetchSubPosts,
};

module.exports = feeds;
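
These keys lining up with the camelCased aliases is what lets src/app.js dispatch with argv[feedType]; roughly:

// illustrative dispatch chain (username made up):
// --reddit-user someusername  →  argv.redditUser  →  feeds.redditUser('someusername')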

src/feeds/reddit.js (+51)

@@ -0,0 +1,51 @@
'use strict';

const moment = require('moment');

const http = require('../utils/http');

function curatePost(rawPost) {
	const post = {
		id: rawPost.id,
		title: rawPost.title,
		permalink: `https://reddit.com${rawPost.permalink}`,
		subreddit: rawPost.subreddit,
		author: rawPost.author,
		pinned: rawPost.pinned,
		stickied: rawPost.stickied,
		upvotes: rawPost.ups,
		nsfw: rawPost.over_18,
		date: moment.utc(rawPost.created_utc * 1000).toDate(),
		url: rawPost.url,
		raw: rawPost,
	};

	return post;
}

async function fetchUserPosts(username, _options) {
	// const res = await http.get(`https://www.reddit.com/user/${username}/submitted.json?limit=100&sort=new`);
	const res = await http.get(`https://api.pushshift.io/reddit/submission/search?author=${username}&sort_type=created_utc`);

	if (res.ok) {
		return res.body.data.map((post) => curatePost(post));
	}

	return null;
}

async function fetchSubPosts(subreddit, _options) {
	// const res = await http.get(`https://www.reddit.com/r/${subreddit}.json?limit=100&sort=new`);
	const res = await http.get(`https://api.pushshift.io/reddit/submission/search?subreddit=${subreddit}&sort_type=created_utc`);

	if (res.ok) {
		return res.body.data.map((post) => curatePost(post));
	}

	return null;
}

module.exports = {
	fetchUserPosts,
	fetchSubPosts,
};
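
curatePost() reduces a raw pushshift submission to the following shape; a sketch with illustrative values:

// hypothetical curated post (all values illustrative)
const examplePost = {
	id: 'abc123',
	title: 'Example post',
	permalink: 'https://reddit.com/r/example/comments/abc123/example_post/',
	subreddit: 'example',
	author: 'someusername',
	pinned: false,
	stickied: false,
	upvotes: 42,
	nsfw: false,
	date: new Date('2020-09-28T00:00:00.000Z'),
	url: 'https://i.example.org/example.jpg',
	raw: {}, // the untouched submission as returned by pushshift
};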

src/logger.js (+39)

@@ -0,0 +1,39 @@
'use strict';

const util = require('util');
const path = require('path');
const winston = require('winston');

require('winston-daily-rotate-file');

const { argv } = require('./args');

function logger(context) {
	const root = context.match(/src[/\\]|dist[/\\]/);
	const filename = context.slice(root.index + root[0].length)
		.replace(path.extname(context), '');

	return winston.createLogger({
		format: winston.format.combine(
			winston.format.timestamp({ format: 'YYYY-MM-DD HH:mm:ss' }),
			winston.format((info) => (info instanceof Error
				? { ...info, message: info.stack }
				: { ...info, message: typeof info.message === 'string' ? info.message : util.inspect(info.message) }))(),
			winston.format.colorize(),
			winston.format.printf(({ level, timestamp, label, message }) => `${timestamp} ${level} [${label || filename}] ${message}`),
		),
		transports: [
			new winston.transports.Console({
				level: argv.logLevel,
				timestamp: true,
			}),
			new winston.transports.DailyRotateFile({
				datePattern: 'YYYY-MM-DD',
				filename: path.join('log', '%DATE%.log'),
				level: 'silly',
			}),
		],
	});
}

module.exports = logger;
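
Modules obtain a logger by passing their own path; the label falls out of everything after src/ or dist/ with the extension stripped. A hypothetical example:

// hypothetical usage from src/feeds/reddit.js
const logger = require('../logger')(__filename);

logger.info('fetching user posts');
// → 2020-09-28 12:00:00 info [feeds/reddit] fetching user posts (timestamp illustrative)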

src/utils/http.js (+107)

@@ -0,0 +1,107 @@
'use strict';

const config = require('config');
const bhttp = require('bhttp');
const Bottleneck = require('bottleneck');

const logger = require('../logger')(__filename);

const defaultLimiterOptions = {
	minTime: config.limits.requestInterval,
	maxConcurrent: config.limits.requestConcurrency,
};

const defaultLimiter = new Bottleneck(defaultLimiterOptions);
const limiters = {};

const defaultOptions = {
	encodeJSON: true,
	headers: {
		'user-agent': 'ripunzel',
	},
};

function getLimiter(limit) {
	if (!limit) {
		return defaultLimiter;
	}

	const interval = limit.interval || null;
	const concurrency = limit.concurrency || null;

	// lazily create one shared limiter per interval/concurrency combination
	if (!limiters[interval]) {
		limiters[interval] = {};
	}

	if (!limiters[interval][concurrency]) {
		limiters[interval][concurrency] = new Bottleneck({
			...(limit.interval && { minTime: limit.interval }),
			...(limit.concurrency && { maxConcurrent: limit.concurrency }),
		});
	}

	return limiters[interval][concurrency];
}

async function request(method = 'get', url, body, requestOptions, session) {
	const http = session || bhttp;

	const options = {
		...defaultOptions,
		...requestOptions,
		responseTimeout: requestOptions?.responseTimeout || requestOptions?.timeout || 60000,
	};

	logger.silly(`${method.toUpperCase()} (${options.limit?.interval || defaultLimiterOptions.minTime}ms/${options.limit?.concurrency || defaultLimiterOptions.maxConcurrent}p) ${url}`);

	// pass the body along when there is one; bhttp's post/put/patch take (url, body, options)
	const res = body
		? await http[method](url, body, options)
		: await http[method](url, options);

	return {
		...res,
		body: Buffer.isBuffer(res.body) ? res.body.toString() : res.body,
		status: res.statusCode,
		ok: res.statusCode >= 200 && res.statusCode <= 299,
	};
}

async function scheduleRequest(method = 'get', url, body, options, session) {
	return getLimiter(options && options.limit).schedule(() => request(method, url, body, options, session));
}

async function get(url, options, session) {
	return scheduleRequest('get', url, null, options, session);
}

async function post(url, body, options, session) {
	return scheduleRequest('post', url, body, options, session);
}

async function put(url, body, options, session) {
	return scheduleRequest('put', url, body, options, session);
}

async function patch(url, body, options, session) {
	return scheduleRequest('patch', url, body, options, session);
}

async function del(url, options, session) {
	return scheduleRequest('delete', url, null, options, session);
}

function getSession(options) {
	return bhttp.session(options);
}

module.exports = {
	get,
	post,
	delete: del,
	put,
	patch,
	session: getSession,
};
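
All verbs funnel through scheduleRequest(), so callers opt into a dedicated limiter by passing a limit option; a sketch, with illustrative URLs and values:

// hypothetical usage from the repository root
const http = require('./src/utils/http');

async function example() {
	// uses the default limiter from config/default.js (one request per second, one at a time)
	const res = await http.get('https://api.pushshift.io/reddit/submission/search?author=someusername');

	if (res.ok) {
		console.log(res.status, res.body.data.length);
	}

	// gets (or lazily creates) a dedicated Bottleneck limiter for this interval/concurrency pair
	await http.get('https://example.org/', { limit: { interval: 5000, concurrency: 2 } });
}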