Added predata step. Using RedGifs temporary API key.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:58 +02:00
parent 4acbe16fb8
commit bed4fe288f
16 changed files with 9877 additions and 1524 deletions

View File

@ -1,11 +1,14 @@
{ {
"extends": "airbnb-base", "extends": "airbnb-base",
"parserOptions": { "parserOptions": {
"sourceType": "script" "parser": "@babel/eslint-parser",
"sourceType": "script",
"ecmaVersion": 2020
}, },
"rules": { "rules": {
"no-console": 0, "no-console": 0,
"indent": ["error", 4], "indent": ["error", 4],
"max-len": 0 "max-len": 0,
"strict": 0
} }
} }

4
.gitignore vendored
View File

@ -2,8 +2,8 @@ node_modules/
config/*.js config/*.js
!config/default.js !config/default.js
output/ output/
logs/
dist/ dist/
users users*
users_invalid
posts posts
ignore ignore

1
.nvmrc Normal file
View File

@ -0,0 +1 @@
16.19.1

View File

@ -73,7 +73,7 @@ module.exports = {
level: 'info', level: 'info',
}, },
limiter: { limiter: {
concurrency: 100, concurrency: 10,
interval: 100, interval: 100,
}, },
reddit: { reddit: {

11157
package-lock.json generated

File diff suppressed because it is too large Load Diff

View File

@ -30,7 +30,7 @@
"dependencies": { "dependencies": {
"array.prototype.flatten": "^1.2.1", "array.prototype.flatten": "^1.2.1",
"bhttp": "^1.2.4", "bhttp": "^1.2.4",
"blake2": "^4.0.0", "blake2": "^4.1.1",
"bluebird": "^3.5.1", "bluebird": "^3.5.1",
"bottleneck": "^2.19.5", "bottleneck": "^2.19.5",
"cheerio": "^1.0.0-rc.2", "cheerio": "^1.0.0-rc.2",
@ -50,14 +50,18 @@
"object.pick": "^1.3.0", "object.pick": "^1.3.0",
"snoowrap": "^1.20.0", "snoowrap": "^1.20.0",
"template-format": "^1.2.4", "template-format": "^1.2.4",
"unprint": "^0.8.1",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
"winston": "^3.3.3", "winston": "^3.3.3",
"winston-daily-rotate-file": "^4.7.1",
"yargs": "^11.0.0", "yargs": "^11.0.0",
"youtube-dl": "^2.1.0" "youtube-dl": "^2.3.0",
"youtube-dl-exec": "^2.2.3"
}, },
"devDependencies": { "devDependencies": {
"eslint": "^4.19.1", "@babel/eslint-parser": "^7.19.1",
"eslint-config-airbnb-base": "^12.1.0", "eslint": "^8.34.0",
"eslint-plugin-import": "^2.12.0" "eslint-config-airbnb-base": "^15.0.0",
"eslint-plugin-import": "^2.27.5"
} }
} }

View File

@ -16,6 +16,7 @@ const logger = require('./logger')(__filename);
const dissectLink = require('./dissectLink'); const dissectLink = require('./dissectLink');
const curatePosts = require('./curate/posts'); const curatePosts = require('./curate/posts');
const methods = require('./methods/methods');
const { attachContentInfo, getInfo } = require('./fetch/info'); const { attachContentInfo, getInfo } = require('./fetch/info');
const { fetchSaveUserContent, fetchSaveDirectContent } = require('./fetch/content'); const { fetchSaveUserContent, fetchSaveDirectContent } = require('./fetch/content');
@ -27,7 +28,7 @@ async function getFileContents(location, label) {
try { try {
const fileContents = await fs.readFile(location, 'utf8'); const fileContents = await fs.readFile(location, 'utf8');
return fileContents.split('\n').filter(entry => entry && entry.slice(0, 1) !== '#'); return fileContents.split('\n').filter((entry) => entry && entry.slice(0, 1) !== '#');
} catch (error) { } catch (error) {
logger.error(`Could not read ${label} file '${location}': ${error}.`); logger.error(`Could not read ${label} file '${location}': ${error}.`);
@ -35,6 +36,30 @@ async function getFileContents(location, label) {
} }
} }
/**
 * Collect one host descriptor per distinct host method across all users' posts.
 *
 * Posts without a host are skipped. When several posts share a method, the
 * host of the last such post is the one returned for that method.
 *
 * @param {Object} posts - Users keyed by name; every user carries a `posts` array.
 * @returns {Array} One host object per distinct `host.method`.
 */
function getPostHosts(posts) {
    const methodHostPairs = [];

    Object.values(posts).forEach((user) => {
        user.posts.forEach((post) => {
            if (post.host) {
                methodHostPairs.push([post.host.method, post.host]);
            }
        });
    });

    // keying by method collapses duplicates, later pairs overwrite earlier ones
    const hostsByMethod = Object.fromEntries(methodHostPairs);

    return Object.values(hostsByMethod);
}
/**
 * Run the optional `fetchPredata` hook of each host's method, strictly one
 * after another, and gather the results keyed by method name.
 *
 * Hosts whose method is unknown, or whose method exposes no `fetchPredata`
 * hook, contribute nothing to the result.
 *
 * @param {Array} hosts - Host descriptors, each expected to carry a `method` name.
 * @returns {Promise<Object>} Map of method name to whatever its hook resolved to.
 */
async function fetchPredata(hosts) {
    // walk the list recursively so every hook is awaited before the next one starts
    const collectFrom = async (index, collected) => {
        if (index >= hosts.length) {
            return collected;
        }

        const host = hosts[index];
        const method = methods[host?.method];

        if (!method?.fetchPredata) {
            return collectFrom(index + 1, collected);
        }

        const data = await method.fetchPredata();

        return collectFrom(index + 1, {
            ...collected,
            [host.method]: data,
        });
    };

    return collectFrom(0, {});
}
async function getCompletePosts() { async function getCompletePosts() {
let userPosts = {}; let userPosts = {};
let ignoreIds = []; let ignoreIds = [];
@ -66,13 +91,24 @@ async function getCompletePosts() {
} }
const curatedUserPosts = curatePosts(userPosts, ignoreIds, args); const curatedUserPosts = curatePosts(userPosts, ignoreIds, args);
const predata = await fetchPredata(getPostHosts(curatedUserPosts));
return attachContentInfo(curatedUserPosts, reddit); return attachContentInfo(curatedUserPosts, { reddit, predata });
} }
async function getDirectContent(links, ep) { async function getDirectContent(links, ep) {
return Promise.map(links, async (link) => { const hosts = links.map((link) => {
const host = dissectLink(link); const host = dissectLink(link);
return {
link,
host,
};
});
// const predata = await fetchPredata(hosts.map(({ host }) => host));
return Promise.map(hosts, async ({ link, host }) => {
const info = await getInfo(host, reddit, link); const info = await getInfo(host, reddit, link);
if (info) { if (info) {

View File

@ -3,8 +3,8 @@
const config = require('config'); const config = require('config');
const omit = require('object.omit'); const omit = require('object.omit');
const dissectLink = require('../dissectLink.js'); const dissectLink = require('../dissectLink');
const hashPost = require('./hashPost.js'); const hashPost = require('./hashPost');
const { isAfter, isBefore, isEqual } = require('date-fns'); const { isAfter, isBefore, isEqual } = require('date-fns');
const logger = require('../logger')(__filename); const logger = require('../logger')(__filename);

View File

@ -49,18 +49,18 @@ function selfPostToText(item, post) {
return yaml.safeDump(curatedPost); return yaml.safeDump(curatedPost);
} }
async function getBuffers(item, post, host) { async function getBuffers(item, context) {
if (item.self) { if (item.self) {
return [{ return [{
...Buffer.from(selfPostToText(item, post), 'utf8'), ...Buffer.from(selfPostToText(item, context.post), 'utf8'),
hash: post.hash, hash: context.post.hash,
}]; }];
} }
const sources = item.mux ? [item.url].concat(item.mux) : [item.url]; const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
const buffers = await Promise.map(sources, source => fetchItem(source, 0, post, host)); const buffers = await Promise.map(sources, (source) => fetchItem(source, 0, context));
if (buffers.filter(buffer => buffer).length > 0) { if (buffers.filter((buffer) => buffer).length > 0) {
return buffers; return buffers;
} }
@ -105,7 +105,7 @@ function getFilepath(item, content, host, post, user) {
async function fetchSaveUserContent(user, ep, args) { async function fetchSaveUserContent(user, ep, args) {
const profilePaths = await saveProfileDetails(user, args); const profilePaths = await saveProfileDetails(user, args);
const hashes = new Set(user.indexed.original.map(item => item.hash)); const hashes = new Set(user.indexed.original.map((item) => item.hash));
const posts = await Promise.map(user.posts, async (post) => { const posts = await Promise.map(user.posts, async (post) => {
if (!post.content) { if (!post.content) {
@ -114,7 +114,7 @@ async function fetchSaveUserContent(user, ep, args) {
const hash = await Promise.reduce(post.content.items, async (accItems, originalItem, index) => { const hash = await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
const item = { ...originalItem, index }; const item = { ...originalItem, index };
const buffers = await getBuffers(item, post, post.host); const buffers = await getBuffers(item, { post, host: post.host, headers: post.content.headers || item.headers });
// no buffers, ignore item // no buffers, ignore item
if (!buffers || buffers.length === 0) { if (!buffers || buffers.length === 0) {
@ -158,7 +158,7 @@ async function fetchSaveDirectContent(content, host, ep) {
logger.info(`Fetching and saving '${host.url}'`); logger.info(`Fetching and saving '${host.url}'`);
const item = { ...originalItem, index }; const item = { ...originalItem, index };
const buffers = await getBuffers(item, null, host); const buffers = await getBuffers(item, { host, headers: content.headers || item.headers });
// no buffers, ignore item // no buffers, ignore item
if (!buffers || buffers.length === 0) { if (!buffers || buffers.length === 0) {

View File

@ -6,7 +6,7 @@ const Promise = require('bluebird');
const logger = require('../logger')(__filename); const logger = require('../logger')(__filename);
const methods = require('../methods/methods'); const methods = require('../methods/methods');
const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users), async (accUsers, user) => ({ const attachContentInfo = (users, { reddit, predata }) => Promise.reduce(Object.values(users), async (accUsers, user) => ({
...accUsers, ...accUsers,
[user.name]: { [user.name]: {
...user, ...user,
@ -22,7 +22,10 @@ const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users)
...accPosts, ...accPosts,
{ {
...post, ...post,
content: await methods[post.host.method](post.host, post, reddit), content: await (methods[post.host.method].fetchInfo || methods[post.host.method])(post.host, post, {
predata: predata[post.host.method],
reddit,
}),
}, },
]; ];
} catch (error) { } catch (error) {
@ -36,7 +39,10 @@ const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users)
{ {
...post, ...post,
previewFallback: true, previewFallback: true,
content: await methods.redditPreview(post.host, post), content: await methods.redditPreview(post.host, post, {
predata: predata.redditPreview,
reddit,
}),
}, },
]; ];
} }
@ -60,7 +66,7 @@ async function getInfo(host, reddit, url) {
} }
} }
return methods[host.method](host, null, reddit); return (methods[host.method].fetchInfo || methods[host.method])(host, null, reddit);
} }
module.exports = { module.exports = {

View File

@ -7,26 +7,26 @@ const blake2 = require('blake2');
const logger = require('../logger')(__filename); const logger = require('../logger')(__filename);
const limiter = require('../limiter').items; const limiter = require('../limiter').items;
async function fetchItem(url, attempt, post, host) { async function fetchItem(url, attempt, { post, host, headers }) {
async function retry(error) { async function retry(error) {
logger.warn(`Failed to fetch '${url}', ${attempt < config.fetch.retries ? 'retrying' : 'giving up'}: ${error.message} (${post ? post.permalink : 'no post'})`); logger.warn(`Failed to fetch '${url}', ${attempt < config.fetch.retries ? 'retrying' : 'giving up'}: ${error.message} (${post ? post.permalink : 'no post'})`);
if (attempt < config.fetch.retries) { if (attempt < config.fetch.retries) {
return fetchItem(url, attempt + 1, post); return fetchItem(url, attempt + 1, { post, host, headers });
} }
return null; return null;
} }
try { try {
const res = await limiter.schedule(async () => bhttp.get(url)); const res = await limiter.schedule(async () => bhttp.get(url, { headers }));
if (!res.statusCode === 200) { if (res.statusCode !== 200) {
throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`); throw new Error(`Response not OK for ${url} (${res.statusCode}): ${res.body.toString()}`);
} }
if (!Buffer.isBuffer(res.body)) { if (!Buffer.isBuffer(res.body)) {
throw new Error(`Unexpected response for '${url}' (${res.status}): ${res.body}`); throw new Error(`Unexpected response for ${url} (${res.statusCode}): ${res.body}`);
} }
logger.debug(`Fetched '${host ? host.url : url}' (${post ? post.permalink : 'no post'})`); logger.debug(`Fetched '${host ? host.url : url}' (${post ? post.permalink : 'no post'})`);

View File

@ -1,11 +1,16 @@
'use strict'; 'use strict';
const winston = require('winston'); const winston = require('winston');
require('winston-daily-rotate-file');
const args = require('./cli.js')(); const args = require('./cli')();
const logger = winston.createLogger({ const logger = winston.createLogger({
level: args.logLevel, level: args.logLevel,
format: winston.format.combine(
winston.format.timestamp(),
winston.format.json(),
),
transports: [ transports: [
new winston.transports.Console({ new winston.transports.Console({
level: args.logLevel, level: args.logLevel,
@ -16,6 +21,15 @@ const logger = winston.createLogger({
), ),
timestamp: true, timestamp: true,
}), }),
new winston.transports.DailyRotateFile({
datePattern: 'YYYY-MM-DD',
filename: 'logs/%DATE%.log',
}),
new winston.transports.DailyRotateFile({
datePattern: 'YYYY-MM-DD',
filename: 'logs/error_%DATE%.log',
level: 'error',
}),
], ],
}); });

View File

@ -10,6 +10,8 @@ async function imgurImageApi(host) {
}, },
}); });
console.log('imgur headers', res.headers);
if (res.status !== 200) { if (res.status !== 200) {
throw new Error(`Imgur API returned HTTP ${res.status} for source '${host.url}'`); throw new Error(`Imgur API returned HTTP ${res.status} for source '${host.url}'`);
} }

View File

@ -11,7 +11,7 @@ async function redditPreview(host, post) {
datetime: post.datetime, datetime: post.datetime,
original: post, original: post,
} : null, } : null,
items: post.preview.map(image => ({ items: post.preview.map((image) => ({
id: post.host.id || post.id, id: post.host.id || post.id,
url: image.url, url: image.url,
title: post.title, title: post.title,

View File

@ -2,16 +2,19 @@
const fetch = require('node-fetch'); const fetch = require('node-fetch');
const mime = require('mime'); const mime = require('mime');
// const unprint = require('unprint');
const { version } = require('../../package.json');
function scrapeGallery(data) { function scrapeGallery(data) {
const oldestDate = Math.min(...data.gifs.map(gif => gif.createDate)); const oldestDate = Math.min(...data.gifs.map((gif) => gif.createDate));
return { return {
album: { album: {
id: data.id, id: data.id,
datetime: new Date(oldestDate * 1000), datetime: new Date(oldestDate * 1000),
}, },
items: data.gifs.map(gif => ({ items: data.gifs.map((gif) => ({
id: gif.id, id: gif.id,
url: gif.urls.hd, url: gif.urls.hd,
description: gif.tags.join(', '), description: gif.tags.join(', '),
@ -33,12 +36,22 @@ async function fetchGallery(galleryId) {
return scrapeGallery(data); return scrapeGallery(data);
} }
async function redgifs(host) { async function redgifsApi(host, post, { predata }) {
const res = await fetch(`https://api.redgifs.com/v2/gifs/${host.id.toLowerCase()}`); if (!predata?.token) {
throw new Error('No RedGifs token provided');
}
const res = await fetch(`https://api.redgifs.com/v2/gifs/${host.id.toLowerCase()}`, {
headers: {
authorization: `Bearer ${predata.token}`,
'user-agent': predata.userAgent,
},
});
const data = await res.json(); const data = await res.json();
if (data.errorMessage) { if (data.errorMessage || data.error) {
throw new Error(`RedGifs API returned error for source '${host.url}' (${res.status}): ${data.errorMessage.description}`); throw new Error(`RedGifs API returned error for source '${host.url}' (${res.status}): ${data.errorMessage?.description || data.error?.description}`);
} }
if (data.id && data.gifs) { if (data.id && data.gifs) {
@ -53,7 +66,7 @@ async function redgifs(host) {
return fetchGallery(data.gif.gallery); return fetchGallery(data.gif.gallery);
} }
return { const curated = {
album: null, album: null,
items: [{ items: [{
id: data.gif.id, id: data.gif.id,
@ -63,7 +76,78 @@ async function redgifs(host) {
datetime: new Date(data.gif.createDate * 1000), datetime: new Date(data.gif.createDate * 1000),
original: data.gif, original: data.gif,
}], }],
headers: {
'user-agent': predata.userAgent,
},
};
return curated;
}
/**
 * Resolve content info for a RedGifs source through the token-authenticated API.
 *
 * The context argument is optional and may be anything: callers that pass no
 * context, `null`, or an object without a usable `predata.token` all get the
 * same explicit error instead of a destructuring TypeError.
 *
 * @param {Object} host - Dissected link descriptor for the RedGifs source.
 * @param {Object|null} post - Originating post, forwarded untouched to the API fetcher.
 * @param {Object} [context] - Fetch context; `context.predata.token` must be set.
 * @returns {Promise<Object>} Whatever `redgifsApi` resolves to for this host.
 * @throws {Error} 'No RedGifs token provided' when no token is available.
 */
async function redgifs(host, post, context) {
    const predata = context?.predata;

    if (!predata?.token) {
        throw new Error('No RedGifs token provided');
    }

    return redgifsApi(host, post, { predata });
}
/**
 * Request a temporary RedGifs API token to be shared by later API calls.
 *
 * The response status is checked before the body is parsed, so a failed
 * request with a non-JSON body yields `null` rather than a parse error.
 *
 * @returns {Promise<Object|null>} `{ address, agent, token, userAgent }` taken
 *   from the auth response, or `null` when the response is not OK.
 */
async function fetchPredata() {
    // the same user agent is returned so follow-up requests can reuse it
    const userAgent = `ripunzel/${version}`;

    const res = await fetch('https://api.redgifs.com/v2/auth/temporary', {
        headers: {
            'user-agent': userAgent,
        },
    });

    if (!res.ok) {
        return null;
    }

    const data = await res.json();

    return {
        address: data.addr,
        agent: data.agent,
        token: data.token,
        userAgent,
    };
}
module.exports = {
fetchInfo: redgifs,
fetchPredata,
};

View File

@ -39,7 +39,7 @@ async function getPosts(username, reddit, args) {
return submissions; return submissions;
} catch (error) { } catch (error) {
logger.warn(`Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`); logger.error(`Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
return []; return [];
} }