Added predata step. Using RedGifs temporary API key.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:58 +02:00
parent 4acbe16fb8
commit bed4fe288f
16 changed files with 9877 additions and 1524 deletions

View File

@ -1,11 +1,14 @@
{
"extends": "airbnb-base",
"parserOptions": {
"sourceType": "script"
"parser": "@babel/eslint-parser",
"sourceType": "script",
"ecmaVersion": 2020
},
"rules": {
"no-console": 0,
"indent": ["error", 4],
"max-len": 0
"max-len": 0,
"strict": 0
}
}

4
.gitignore vendored
View File

@ -2,8 +2,8 @@ node_modules/
config/*.js
!config/default.js
output/
logs/
dist/
users
users_invalid
users*
posts
ignore

1
.nvmrc Normal file
View File

@ -0,0 +1 @@
16.19.1

View File

@ -73,7 +73,7 @@ module.exports = {
level: 'info',
},
limiter: {
concurrency: 100,
concurrency: 10,
interval: 100,
},
reddit: {

11157
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -30,7 +30,7 @@
"dependencies": {
"array.prototype.flatten": "^1.2.1",
"bhttp": "^1.2.4",
"blake2": "^4.0.0",
"blake2": "^4.1.1",
"bluebird": "^3.5.1",
"bottleneck": "^2.19.5",
"cheerio": "^1.0.0-rc.2",
@ -50,14 +50,18 @@
"object.pick": "^1.3.0",
"snoowrap": "^1.20.0",
"template-format": "^1.2.4",
"unprint": "^0.8.1",
"url-pattern": "^1.0.3",
"winston": "^3.3.3",
"winston-daily-rotate-file": "^4.7.1",
"yargs": "^11.0.0",
"youtube-dl": "^2.1.0"
"youtube-dl": "^2.3.0",
"youtube-dl-exec": "^2.2.3"
},
"devDependencies": {
"eslint": "^4.19.1",
"eslint-config-airbnb-base": "^12.1.0",
"eslint-plugin-import": "^2.12.0"
"@babel/eslint-parser": "^7.19.1",
"eslint": "^8.34.0",
"eslint-config-airbnb-base": "^15.0.0",
"eslint-plugin-import": "^2.27.5"
}
}

View File

@ -16,6 +16,7 @@ const logger = require('./logger')(__filename);
const dissectLink = require('./dissectLink');
const curatePosts = require('./curate/posts');
const methods = require('./methods/methods');
const { attachContentInfo, getInfo } = require('./fetch/info');
const { fetchSaveUserContent, fetchSaveDirectContent } = require('./fetch/content');
@ -27,7 +28,7 @@ async function getFileContents(location, label) {
try {
const fileContents = await fs.readFile(location, 'utf8');
return fileContents.split('\n').filter(entry => entry && entry.slice(0, 1) !== '#');
return fileContents.split('\n').filter((entry) => entry && entry.slice(0, 1) !== '#');
} catch (error) {
logger.error(`Could not read ${label} file '${location}': ${error}.`);
@ -35,6 +36,30 @@ async function getFileContents(location, label) {
}
}
/**
 * Lists one host object per distinct host method found in the users' posts.
 *
 * @param {Object} posts - Users keyed by any identifier; each value has a `posts` array.
 * @returns {Array} Hosts, one per method; posts without a host are ignored.
 */
function getPostHosts(posts) {
    const methodHostPairs = [];

    Object.values(posts).forEach((user) => {
        user.posts.forEach((post) => {
            if (post.host) {
                methodHostPairs.push([post.host.method, post.host]);
            }
        });
    });

    // Keying by method collapses duplicates; a later host replaces an earlier one for the same method.
    const hostsByMethod = Object.fromEntries(methodHostPairs);

    return Object.values(hostsByMethod);
}
/**
 * Runs the optional `fetchPredata` step of every host method, one after another,
 * and collects the results keyed by method name.
 *
 * A method whose predata step throws is logged and recorded as `null`, so that a
 * single failing service does not abort processing for all other hosts.
 *
 * @param {Array} hosts - Host objects carrying a `method` name; nullish entries are skipped.
 * @returns {Promise<Object>} Predata keyed by method name.
 */
async function fetchPredata(hosts) {
    return hosts.reduce(async (chain, host) => {
        const acc = await chain;
        const method = methods[host?.method];

        // Only methods exposing a fetchPredata step take part.
        if (!method?.fetchPredata) {
            return acc;
        }

        try {
            return {
                ...acc,
                [host.method]: await method.fetchPredata(),
            };
        } catch (error) {
            logger.warn(`Failed to fetch predata for '${host.method}': ${error.message}`);

            // Same shape as a method that returned no predata.
            return {
                ...acc,
                [host.method]: null,
            };
        }
    }, Promise.resolve({}));
}
async function getCompletePosts() {
let userPosts = {};
let ignoreIds = [];
@ -66,13 +91,24 @@ async function getCompletePosts() {
}
const curatedUserPosts = curatePosts(userPosts, ignoreIds, args);
const predata = await fetchPredata(getPostHosts(curatedUserPosts));
return attachContentInfo(curatedUserPosts, reddit);
return attachContentInfo(curatedUserPosts, { reddit, predata });
}
async function getDirectContent(links, ep) {
return Promise.map(links, async (link) => {
const hosts = links.map((link) => {
const host = dissectLink(link);
return {
link,
host,
};
});
// const predata = await fetchPredata(hosts.map(({ host }) => host));
return Promise.map(hosts, async ({ link, host }) => {
const info = await getInfo(host, reddit, link);
if (info) {

View File

@ -3,8 +3,8 @@
const config = require('config');
const omit = require('object.omit');
const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js');
const dissectLink = require('../dissectLink');
const hashPost = require('./hashPost');
const { isAfter, isBefore, isEqual } = require('date-fns');
const logger = require('../logger')(__filename);

View File

@ -49,18 +49,18 @@ function selfPostToText(item, post) {
return yaml.safeDump(curatedPost);
}
async function getBuffers(item, post, host) {
async function getBuffers(item, context) {
if (item.self) {
return [{
...Buffer.from(selfPostToText(item, post), 'utf8'),
hash: post.hash,
...Buffer.from(selfPostToText(item, context.post), 'utf8'),
hash: context.post.hash,
}];
}
const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
const buffers = await Promise.map(sources, source => fetchItem(source, 0, post, host));
const buffers = await Promise.map(sources, (source) => fetchItem(source, 0, context));
if (buffers.filter(buffer => buffer).length > 0) {
if (buffers.filter((buffer) => buffer).length > 0) {
return buffers;
}
@ -105,7 +105,7 @@ function getFilepath(item, content, host, post, user) {
async function fetchSaveUserContent(user, ep, args) {
const profilePaths = await saveProfileDetails(user, args);
const hashes = new Set(user.indexed.original.map(item => item.hash));
const hashes = new Set(user.indexed.original.map((item) => item.hash));
const posts = await Promise.map(user.posts, async (post) => {
if (!post.content) {
@ -114,7 +114,7 @@ async function fetchSaveUserContent(user, ep, args) {
const hash = await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
const item = { ...originalItem, index };
const buffers = await getBuffers(item, post, post.host);
const buffers = await getBuffers(item, { post, host: post.host, headers: post.content.headers || item.headers });
// no buffers, ignore item
if (!buffers || buffers.length === 0) {
@ -158,7 +158,7 @@ async function fetchSaveDirectContent(content, host, ep) {
logger.info(`Fetching and saving '${host.url}'`);
const item = { ...originalItem, index };
const buffers = await getBuffers(item, null, host);
const buffers = await getBuffers(item, { host, headers: content.headers || item.headers });
// no buffers, ignore item
if (!buffers || buffers.length === 0) {

View File

@ -6,7 +6,7 @@ const Promise = require('bluebird');
const logger = require('../logger')(__filename);
const methods = require('../methods/methods');
const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users), async (accUsers, user) => ({
const attachContentInfo = (users, { reddit, predata }) => Promise.reduce(Object.values(users), async (accUsers, user) => ({
...accUsers,
[user.name]: {
...user,
@ -22,7 +22,10 @@ const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users)
...accPosts,
{
...post,
content: await methods[post.host.method](post.host, post, reddit),
content: await (methods[post.host.method].fetchInfo || methods[post.host.method])(post.host, post, {
predata: predata[post.host.method],
reddit,
}),
},
];
} catch (error) {
@ -36,7 +39,10 @@ const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users)
{
...post,
previewFallback: true,
content: await methods.redditPreview(post.host, post),
content: await methods.redditPreview(post.host, post, {
predata: predata.redditPreview,
reddit,
}),
},
];
}
@ -60,7 +66,7 @@ async function getInfo(host, reddit, url) {
}
}
return methods[host.method](host, null, reddit);
return (methods[host.method].fetchInfo || methods[host.method])(host, null, reddit);
}
module.exports = {

View File

@ -7,26 +7,26 @@ const blake2 = require('blake2');
const logger = require('../logger')(__filename);
const limiter = require('../limiter').items;
async function fetchItem(url, attempt, post, host) {
async function fetchItem(url, attempt, { post, host, headers }) {
async function retry(error) {
logger.warn(`Failed to fetch '${url}', ${attempt < config.fetch.retries ? 'retrying' : 'giving up'}: ${error.message} (${post ? post.permalink : 'no post'})`);
if (attempt < config.fetch.retries) {
return fetchItem(url, attempt + 1, post);
return fetchItem(url, attempt + 1, { post, host, headers });
}
return null;
}
try {
const res = await limiter.schedule(async () => bhttp.get(url));
const res = await limiter.schedule(async () => bhttp.get(url, { headers }));
if (!res.statusCode === 200) {
throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`);
if (res.statusCode !== 200) {
throw new Error(`Response not OK for ${url} (${res.statusCode}): ${res.body.toString()}`);
}
if (!Buffer.isBuffer(res.body)) {
throw new Error(`Unexpected response for '${url}' (${res.status}): ${res.body}`);
throw new Error(`Unexpected response for ${url} (${res.statusCode}): ${res.body}`);
}
logger.debug(`Fetched '${host ? host.url : url}' (${post ? post.permalink : 'no post'})`);

View File

@ -1,11 +1,16 @@
'use strict';
const winston = require('winston');
require('winston-daily-rotate-file');
const args = require('./cli.js')();
const args = require('./cli')();
const logger = winston.createLogger({
level: args.logLevel,
format: winston.format.combine(
winston.format.timestamp(),
winston.format.json(),
),
transports: [
new winston.transports.Console({
level: args.logLevel,
@ -16,6 +21,15 @@ const logger = winston.createLogger({
),
timestamp: true,
}),
new winston.transports.DailyRotateFile({
datePattern: 'YYYY-MM-DD',
filename: 'logs/%DATE%.log',
}),
new winston.transports.DailyRotateFile({
datePattern: 'YYYY-MM-DD',
filename: 'logs/error_%DATE%.log',
level: 'error',
}),
],
});

View File

@ -10,6 +10,8 @@ async function imgurImageApi(host) {
},
});
console.log('imgur headers', res.headers);
if (res.status !== 200) {
throw new Error(`Imgur API returned HTTP ${res.status} for source '${host.url}'`);
}

View File

@ -11,7 +11,7 @@ async function redditPreview(host, post) {
datetime: post.datetime,
original: post,
} : null,
items: post.preview.map(image => ({
items: post.preview.map((image) => ({
id: post.host.id || post.id,
url: image.url,
title: post.title,

View File

@ -2,16 +2,19 @@
const fetch = require('node-fetch');
const mime = require('mime');
// const unprint = require('unprint');
const { version } = require('../../package.json');
function scrapeGallery(data) {
const oldestDate = Math.min(...data.gifs.map(gif => gif.createDate));
const oldestDate = Math.min(...data.gifs.map((gif) => gif.createDate));
return {
album: {
id: data.id,
datetime: new Date(oldestDate * 1000),
},
items: data.gifs.map(gif => ({
items: data.gifs.map((gif) => ({
id: gif.id,
url: gif.urls.hd,
description: gif.tags.join(', '),
@ -33,12 +36,22 @@ async function fetchGallery(galleryId) {
return scrapeGallery(data);
}
async function redgifs(host) {
const res = await fetch(`https://api.redgifs.com/v2/gifs/${host.id.toLowerCase()}`);
async function redgifsApi(host, post, { predata }) {
if (!predata?.token) {
throw new Error('No RedGifs token provided');
}
const res = await fetch(`https://api.redgifs.com/v2/gifs/${host.id.toLowerCase()}`, {
headers: {
authorization: `Bearer ${predata.token}`,
'user-agent': predata.userAgent,
},
});
const data = await res.json();
if (data.errorMessage) {
throw new Error(`RedGifs API returned error for source '${host.url}' (${res.status}): ${data.errorMessage.description}`);
if (data.errorMessage || data.error) {
throw new Error(`RedGifs API returned error for source '${host.url}' (${res.status}): ${data.errorMessage?.description || data.error?.description}`);
}
if (data.id && data.gifs) {
@ -53,7 +66,7 @@ async function redgifs(host) {
return fetchGallery(data.gif.gallery);
}
return {
const curated = {
album: null,
items: [{
id: data.gif.id,
@ -63,7 +76,78 @@ async function redgifs(host) {
datetime: new Date(data.gif.createDate * 1000),
original: data.gif,
}],
headers: {
'user-agent': predata.userAgent,
},
};
return curated;
}
/**
 * Entry point for fetching RedGifs content info; delegates to the token-based API.
 *
 * @param {Object} host - Dissected host passed through to `redgifsApi`.
 * @param {Object|null} post - Originating post, if any; passed through unchanged.
 * @param {Object} [context] - Optional context; `context.predata.token` is required to proceed.
 * @returns {Promise<Object>} Whatever `redgifsApi` resolves to.
 * @throws {Error} When no RedGifs token is available, including when no context is passed.
 */
async function redgifs(host, post, { predata } = {}) {
    // NOTE(review): a token-less HTML scraping fallback was sketched here but never enabled.
    if (!predata?.token) {
        throw new Error('No RedGifs token provided');
    }

    return redgifsApi(host, post, { predata });
}
/**
 * Requests a temporary RedGifs API token.
 *
 * @returns {Promise<Object|null>} `{ address, agent, token, userAgent }` on success,
 * or `null` when the auth endpoint does not respond with an OK status.
 */
async function fetchPredata() {
    const userAgent = `ripunzel/${version}`;

    const res = await fetch('https://api.redgifs.com/v2/auth/temporary', {
        headers: {
            'user-agent': userAgent,
        },
    });

    // Check the status before parsing: an error response is not guaranteed to be JSON.
    if (!res.ok) {
        return null;
    }

    const data = await res.json();

    return {
        address: data.addr,
        agent: data.agent,
        token: data.token,
        // Returned alongside the token so later requests can send the same user agent.
        userAgent,
    };
}
// Exported as an object so callers can look up the optional fetchPredata step
// separately from fetchInfo, the function that resolves content info for a host.
module.exports = {
fetchInfo: redgifs,
fetchPredata,
};

View File

@ -39,7 +39,7 @@ async function getPosts(username, reddit, args) {
return submissions;
} catch (error) {
logger.warn(`Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
logger.error(`Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
return [];
}