Added predata step. Using RedGifs temporary API key.
This commit is contained in:
parent
4acbe16fb8
commit
bed4fe288f
|
@ -1,11 +1,14 @@
|
|||
{
|
||||
"extends": "airbnb-base",
|
||||
"parserOptions": {
|
||||
"sourceType": "script"
|
||||
"parser": "@babel/eslint-parser",
|
||||
"sourceType": "script",
|
||||
"ecmaVersion": 2020
|
||||
},
|
||||
"rules": {
|
||||
"no-console": 0,
|
||||
"indent": ["error", 4],
|
||||
"max-len": 0
|
||||
"max-len": 0,
|
||||
"strict": 0
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,8 +2,8 @@ node_modules/
|
|||
config/*.js
|
||||
!config/default.js
|
||||
output/
|
||||
logs/
|
||||
dist/
|
||||
users
|
||||
users_invalid
|
||||
users*
|
||||
posts
|
||||
ignore
|
||||
|
|
|
@ -73,7 +73,7 @@ module.exports = {
|
|||
level: 'info',
|
||||
},
|
||||
limiter: {
|
||||
concurrency: 100,
|
||||
concurrency: 10,
|
||||
interval: 100,
|
||||
},
|
||||
reddit: {
|
||||
|
|
File diff suppressed because it is too large
Load Diff
14
package.json
14
package.json
|
@ -30,7 +30,7 @@
|
|||
"dependencies": {
|
||||
"array.prototype.flatten": "^1.2.1",
|
||||
"bhttp": "^1.2.4",
|
||||
"blake2": "^4.0.0",
|
||||
"blake2": "^4.1.1",
|
||||
"bluebird": "^3.5.1",
|
||||
"bottleneck": "^2.19.5",
|
||||
"cheerio": "^1.0.0-rc.2",
|
||||
|
@ -50,14 +50,18 @@
|
|||
"object.pick": "^1.3.0",
|
||||
"snoowrap": "^1.20.0",
|
||||
"template-format": "^1.2.4",
|
||||
"unprint": "^0.8.1",
|
||||
"url-pattern": "^1.0.3",
|
||||
"winston": "^3.3.3",
|
||||
"winston-daily-rotate-file": "^4.7.1",
|
||||
"yargs": "^11.0.0",
|
||||
"youtube-dl": "^2.1.0"
|
||||
"youtube-dl": "^2.3.0",
|
||||
"youtube-dl-exec": "^2.2.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
"eslint": "^4.19.1",
|
||||
"eslint-config-airbnb-base": "^12.1.0",
|
||||
"eslint-plugin-import": "^2.12.0"
|
||||
"@babel/eslint-parser": "^7.19.1",
|
||||
"eslint": "^8.34.0",
|
||||
"eslint-config-airbnb-base": "^15.0.0",
|
||||
"eslint-plugin-import": "^2.27.5"
|
||||
}
|
||||
}
|
||||
|
|
42
src/app.js
42
src/app.js
|
@ -16,6 +16,7 @@ const logger = require('./logger')(__filename);
|
|||
|
||||
const dissectLink = require('./dissectLink');
|
||||
const curatePosts = require('./curate/posts');
|
||||
const methods = require('./methods/methods');
|
||||
|
||||
const { attachContentInfo, getInfo } = require('./fetch/info');
|
||||
const { fetchSaveUserContent, fetchSaveDirectContent } = require('./fetch/content');
|
||||
|
@ -27,7 +28,7 @@ async function getFileContents(location, label) {
|
|||
try {
|
||||
const fileContents = await fs.readFile(location, 'utf8');
|
||||
|
||||
return fileContents.split('\n').filter(entry => entry && entry.slice(0, 1) !== '#');
|
||||
return fileContents.split('\n').filter((entry) => entry && entry.slice(0, 1) !== '#');
|
||||
} catch (error) {
|
||||
logger.error(`Could not read ${label} file '${location}': ${error}.`);
|
||||
|
||||
|
@ -35,6 +36,30 @@ async function getFileContents(location, label) {
|
|||
}
|
||||
}
|
||||
|
||||
/**
 * Collect one host descriptor per distinct host method across all users' posts.
 *
 * @param {Object} posts - Users keyed by name; each user may carry a `posts`
 *   array whose entries may carry a `host` object with a `method` property.
 * @returns {Array<Object>} One host per `host.method`, ordered by the first
 *   appearance of each method. When several posts share a method, the host of
 *   the last such post is kept.
 */
function getPostHosts(posts) {
    // a Map keeps first-insertion order while letting later hosts overwrite earlier ones
    const hostsByMethod = new Map();

    Object.values(posts ?? {}).forEach((user) => {
        (user?.posts ?? []).forEach((post) => {
            // posts without a dissected host have nothing to prefetch for
            if (post.host) {
                hostsByMethod.set(post.host.method, post.host);
            }
        });
    });

    return Array.from(hostsByMethod.values());
}
|
||||
|
||||
/**
 * Fetch the predata required by the methods behind the given hosts.
 *
 * Methods are processed one after another, and each method's `fetchPredata`
 * is called at most once, no matter how many hosts share that method.
 *
 * @param {Array<Object>} hosts - Host descriptors; entries that are nullish or
 *   whose method has no `fetchPredata` function are ignored.
 * @returns {Promise<Object>} Predata keyed by method name.
 */
async function fetchPredata(hosts) {
    return hosts.reduce(async (chain, host) => {
        const acc = await chain;
        const method = host?.method;

        // nothing to prefetch for this host's method
        if (!methods[method]?.fetchPredata) {
            return acc;
        }

        // several hosts may share a method; its predata only needs to be fetched once
        if (Object.prototype.hasOwnProperty.call(acc, method)) {
            return acc;
        }

        const data = await methods[method].fetchPredata();

        return {
            ...acc,
            [method]: data,
        };
    }, Promise.resolve({}));
}
|
||||
|
||||
async function getCompletePosts() {
|
||||
let userPosts = {};
|
||||
let ignoreIds = [];
|
||||
|
@ -66,13 +91,24 @@ async function getCompletePosts() {
|
|||
}
|
||||
|
||||
const curatedUserPosts = curatePosts(userPosts, ignoreIds, args);
|
||||
const predata = await fetchPredata(getPostHosts(curatedUserPosts));
|
||||
|
||||
return attachContentInfo(curatedUserPosts, reddit);
|
||||
return attachContentInfo(curatedUserPosts, { reddit, predata });
|
||||
}
|
||||
|
||||
async function getDirectContent(links, ep) {
|
||||
return Promise.map(links, async (link) => {
|
||||
const hosts = links.map((link) => {
|
||||
const host = dissectLink(link);
|
||||
|
||||
return {
|
||||
link,
|
||||
host,
|
||||
};
|
||||
});
|
||||
|
||||
// const predata = await fetchPredata(hosts.map(({ host }) => host));
|
||||
|
||||
return Promise.map(hosts, async ({ link, host }) => {
|
||||
const info = await getInfo(host, reddit, link);
|
||||
|
||||
if (info) {
|
||||
|
|
|
@ -3,8 +3,8 @@
|
|||
const config = require('config');
|
||||
const omit = require('object.omit');
|
||||
|
||||
const dissectLink = require('../dissectLink.js');
|
||||
const hashPost = require('./hashPost.js');
|
||||
const dissectLink = require('../dissectLink');
|
||||
const hashPost = require('./hashPost');
|
||||
|
||||
const { isAfter, isBefore, isEqual } = require('date-fns');
|
||||
const logger = require('../logger')(__filename);
|
||||
|
|
|
@ -49,18 +49,18 @@ function selfPostToText(item, post) {
|
|||
return yaml.safeDump(curatedPost);
|
||||
}
|
||||
|
||||
async function getBuffers(item, post, host) {
|
||||
async function getBuffers(item, context) {
|
||||
if (item.self) {
|
||||
return [{
|
||||
...Buffer.from(selfPostToText(item, post), 'utf8'),
|
||||
hash: post.hash,
|
||||
...Buffer.from(selfPostToText(item, context.post), 'utf8'),
|
||||
hash: context.post.hash,
|
||||
}];
|
||||
}
|
||||
|
||||
const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
|
||||
const buffers = await Promise.map(sources, source => fetchItem(source, 0, post, host));
|
||||
const buffers = await Promise.map(sources, (source) => fetchItem(source, 0, context));
|
||||
|
||||
if (buffers.filter(buffer => buffer).length > 0) {
|
||||
if (buffers.filter((buffer) => buffer).length > 0) {
|
||||
return buffers;
|
||||
}
|
||||
|
||||
|
@ -105,7 +105,7 @@ function getFilepath(item, content, host, post, user) {
|
|||
|
||||
async function fetchSaveUserContent(user, ep, args) {
|
||||
const profilePaths = await saveProfileDetails(user, args);
|
||||
const hashes = new Set(user.indexed.original.map(item => item.hash));
|
||||
const hashes = new Set(user.indexed.original.map((item) => item.hash));
|
||||
|
||||
const posts = await Promise.map(user.posts, async (post) => {
|
||||
if (!post.content) {
|
||||
|
@ -114,7 +114,7 @@ async function fetchSaveUserContent(user, ep, args) {
|
|||
|
||||
const hash = await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
|
||||
const item = { ...originalItem, index };
|
||||
const buffers = await getBuffers(item, post, post.host);
|
||||
const buffers = await getBuffers(item, { post, host: post.host, headers: post.content.headers || item.headers });
|
||||
|
||||
// no buffers, ignore item
|
||||
if (!buffers || buffers.length === 0) {
|
||||
|
@ -158,7 +158,7 @@ async function fetchSaveDirectContent(content, host, ep) {
|
|||
logger.info(`Fetching and saving '${host.url}'`);
|
||||
|
||||
const item = { ...originalItem, index };
|
||||
const buffers = await getBuffers(item, null, host);
|
||||
const buffers = await getBuffers(item, { host, headers: content.headers || item.headers });
|
||||
|
||||
// no buffers, ignore item
|
||||
if (!buffers || buffers.length === 0) {
|
||||
|
|
|
@ -6,7 +6,7 @@ const Promise = require('bluebird');
|
|||
const logger = require('../logger')(__filename);
|
||||
const methods = require('../methods/methods');
|
||||
|
||||
const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users), async (accUsers, user) => ({
|
||||
const attachContentInfo = (users, { reddit, predata }) => Promise.reduce(Object.values(users), async (accUsers, user) => ({
|
||||
...accUsers,
|
||||
[user.name]: {
|
||||
...user,
|
||||
|
@ -22,7 +22,10 @@ const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users)
|
|||
...accPosts,
|
||||
{
|
||||
...post,
|
||||
content: await methods[post.host.method](post.host, post, reddit),
|
||||
content: await (methods[post.host.method].fetchInfo || methods[post.host.method])(post.host, post, {
|
||||
predata: predata[post.host.method],
|
||||
reddit,
|
||||
}),
|
||||
},
|
||||
];
|
||||
} catch (error) {
|
||||
|
@ -36,7 +39,10 @@ const attachContentInfo = (users, reddit) => Promise.reduce(Object.values(users)
|
|||
{
|
||||
...post,
|
||||
previewFallback: true,
|
||||
content: await methods.redditPreview(post.host, post),
|
||||
content: await methods.redditPreview(post.host, post, {
|
||||
predata: predata.redditPreview,
|
||||
reddit,
|
||||
}),
|
||||
},
|
||||
];
|
||||
}
|
||||
|
@ -60,7 +66,7 @@ async function getInfo(host, reddit, url) {
|
|||
}
|
||||
}
|
||||
|
||||
return methods[host.method](host, null, reddit);
|
||||
return (methods[host.method].fetchInfo || methods[host.method])(host, null, reddit);
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
|
|
@ -7,26 +7,26 @@ const blake2 = require('blake2');
|
|||
const logger = require('../logger')(__filename);
|
||||
const limiter = require('../limiter').items;
|
||||
|
||||
async function fetchItem(url, attempt, post, host) {
|
||||
async function fetchItem(url, attempt, { post, host, headers }) {
|
||||
async function retry(error) {
|
||||
logger.warn(`Failed to fetch '${url}', ${attempt < config.fetch.retries ? 'retrying' : 'giving up'}: ${error.message} (${post ? post.permalink : 'no post'})`);
|
||||
|
||||
if (attempt < config.fetch.retries) {
|
||||
return fetchItem(url, attempt + 1, post);
|
||||
return fetchItem(url, attempt + 1, { post, host, headers });
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
const res = await limiter.schedule(async () => bhttp.get(url));
|
||||
const res = await limiter.schedule(async () => bhttp.get(url, { headers }));
|
||||
|
||||
if (!res.statusCode === 200) {
|
||||
throw new Error(`Response not OK for '${url}', HTTP code '${res.status}'`);
|
||||
if (res.statusCode !== 200) {
|
||||
throw new Error(`Response not OK for ${url} (${res.statusCode}): ${res.body.toString()}`);
|
||||
}
|
||||
|
||||
if (!Buffer.isBuffer(res.body)) {
|
||||
throw new Error(`Unexpected response for '${url}' (${res.status}): ${res.body}`);
|
||||
throw new Error(`Unexpected response for ${url} (${res.statusCode}): ${res.body}`);
|
||||
}
|
||||
|
||||
logger.debug(`Fetched '${host ? host.url : url}' (${post ? post.permalink : 'no post'})`);
|
||||
|
|
|
@ -1,11 +1,16 @@
|
|||
'use strict';
|
||||
|
||||
const winston = require('winston');
|
||||
require('winston-daily-rotate-file');
|
||||
|
||||
const args = require('./cli.js')();
|
||||
const args = require('./cli')();
|
||||
|
||||
const logger = winston.createLogger({
|
||||
level: args.logLevel,
|
||||
format: winston.format.combine(
|
||||
winston.format.timestamp(),
|
||||
winston.format.json(),
|
||||
),
|
||||
transports: [
|
||||
new winston.transports.Console({
|
||||
level: args.logLevel,
|
||||
|
@ -16,6 +21,15 @@ const logger = winston.createLogger({
|
|||
),
|
||||
timestamp: true,
|
||||
}),
|
||||
new winston.transports.DailyRotateFile({
|
||||
datePattern: 'YYYY-MM-DD',
|
||||
filename: 'logs/%DATE%.log',
|
||||
}),
|
||||
new winston.transports.DailyRotateFile({
|
||||
datePattern: 'YYYY-MM-DD',
|
||||
filename: 'logs/error_%DATE%.log',
|
||||
level: 'error',
|
||||
}),
|
||||
],
|
||||
});
|
||||
|
||||
|
|
|
@ -10,6 +10,8 @@ async function imgurImageApi(host) {
|
|||
},
|
||||
});
|
||||
|
||||
console.log('imgur headers', res.headers);
|
||||
|
||||
if (res.status !== 200) {
|
||||
throw new Error(`Imgur API returned HTTP ${res.status} for source '${host.url}'`);
|
||||
}
|
||||
|
|
|
@ -11,7 +11,7 @@ async function redditPreview(host, post) {
|
|||
datetime: post.datetime,
|
||||
original: post,
|
||||
} : null,
|
||||
items: post.preview.map(image => ({
|
||||
items: post.preview.map((image) => ({
|
||||
id: post.host.id || post.id,
|
||||
url: image.url,
|
||||
title: post.title,
|
||||
|
|
|
@ -2,16 +2,19 @@
|
|||
|
||||
const fetch = require('node-fetch');
|
||||
const mime = require('mime');
|
||||
// const unprint = require('unprint');
|
||||
|
||||
const { version } = require('../../package.json');
|
||||
|
||||
function scrapeGallery(data) {
|
||||
const oldestDate = Math.min(...data.gifs.map(gif => gif.createDate));
|
||||
const oldestDate = Math.min(...data.gifs.map((gif) => gif.createDate));
|
||||
|
||||
return {
|
||||
album: {
|
||||
id: data.id,
|
||||
datetime: new Date(oldestDate * 1000),
|
||||
},
|
||||
items: data.gifs.map(gif => ({
|
||||
items: data.gifs.map((gif) => ({
|
||||
id: gif.id,
|
||||
url: gif.urls.hd,
|
||||
description: gif.tags.join(', '),
|
||||
|
@ -33,12 +36,22 @@ async function fetchGallery(galleryId) {
|
|||
return scrapeGallery(data);
|
||||
}
|
||||
|
||||
async function redgifs(host) {
|
||||
const res = await fetch(`https://api.redgifs.com/v2/gifs/${host.id.toLowerCase()}`);
|
||||
async function redgifsApi(host, post, { predata }) {
|
||||
if (!predata?.token) {
|
||||
throw new Error('No RedGifs token provided');
|
||||
}
|
||||
|
||||
const res = await fetch(`https://api.redgifs.com/v2/gifs/${host.id.toLowerCase()}`, {
|
||||
headers: {
|
||||
authorization: `Bearer ${predata.token}`,
|
||||
'user-agent': predata.userAgent,
|
||||
},
|
||||
});
|
||||
|
||||
const data = await res.json();
|
||||
|
||||
if (data.errorMessage) {
|
||||
throw new Error(`RedGifs API returned error for source '${host.url}' (${res.status}): ${data.errorMessage.description}`);
|
||||
if (data.errorMessage || data.error) {
|
||||
throw new Error(`RedGifs API returned error for source '${host.url}' (${res.status}): ${data.errorMessage?.description || data.error?.description}`);
|
||||
}
|
||||
|
||||
if (data.id && data.gifs) {
|
||||
|
@ -53,7 +66,7 @@ async function redgifs(host) {
|
|||
return fetchGallery(data.gif.gallery);
|
||||
}
|
||||
|
||||
return {
|
||||
const curated = {
|
||||
album: null,
|
||||
items: [{
|
||||
id: data.gif.id,
|
||||
|
@ -63,7 +76,78 @@ async function redgifs(host) {
|
|||
datetime: new Date(data.gif.createDate * 1000),
|
||||
original: data.gif,
|
||||
}],
|
||||
headers: {
|
||||
'user-agent': predata.userAgent,
|
||||
},
|
||||
};
|
||||
|
||||
return curated;
|
||||
}
|
||||
|
||||
module.exports = redgifs;
|
||||
/**
 * Fetch content info for a RedGifs host through the RedGifs API.
 *
 * @param {Object} host - Dissected link; passed through to `redgifsApi`.
 * @param {Object|null} post - Post the link belongs to; passed through to `redgifsApi`.
 * @param {Object} [context] - Fetch context; may be omitted or null.
 * @param {Object} [context.predata] - Result of `fetchPredata`; must carry a `token`.
 * @returns {Promise<Object>} Whatever `redgifsApi` resolves with.
 * @throws {Error} 'No RedGifs token provided' when the context carries no token.
 */
async function redgifs(host, post, context) {
    // read the context defensively: callers may pass no context, or one without predata
    const predata = context?.predata;

    if (!predata?.token) {
        throw new Error('No RedGifs token provided');
    }

    // TODO: a token-less fallback that reads the page's
    // script[type="application/ld+json"] metadata was drafted but never enabled.
    return redgifsApi(host, post, { predata });
}
|
||||
|
||||
/**
 * Request temporary RedGifs API credentials.
 *
 * @returns {Promise<Object|null>} `{ address, agent, token, userAgent }` taken
 *   from the auth response, plus the user agent sent with the request; null
 *   when the auth endpoint does not answer with a success status.
 */
async function fetchPredata() {
    const userAgent = `ripunzel/${version}`;

    const res = await fetch('https://api.redgifs.com/v2/auth/temporary', {
        headers: {
            'user-agent': userAgent,
        },
    });

    // check the status before parsing: an error response is not guaranteed to be JSON
    if (!res.ok) {
        return null;
    }

    const data = await res.json();

    return {
        address: data.addr,
        agent: data.agent,
        token: data.token,
        userAgent,
    };
}
|
||||
|
||||
// Interface of the RedGifs method: `fetchInfo` resolves a host to its content
// info, `fetchPredata` obtains the credentials that `fetchInfo` requires.
module.exports = { fetchInfo: redgifs, fetchPredata };
|
||||
|
|
|
@ -39,7 +39,7 @@ async function getPosts(username, reddit, args) {
|
|||
|
||||
return submissions;
|
||||
} catch (error) {
|
||||
logger.warn(`Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
||||
logger.error(`Failed to fetch posts from reddit user '${username}': ${error.message} (https://reddit.com/user/${username})`);
|
||||
|
||||
return [];
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue