// ripunzel/src/curate/posts.js
//
// 185 lines
// 6.5 KiB
// JavaScript

'use strict';
const config = require('config');
const omit = require('object.omit');
const dissectLink = require('../dissectLink.js');
const hashPost = require('./hashPost.js');
const { isAfter, isBefore, isEqual } = require('date-fns');
const logger = require('../logger')(__filename);
/**
 * Logs a per-user summary of how many posts were dropped during curation,
 * one info line per category that actually dropped something.
 *
 * @param {Object} curatedPosts - Accumulator produced by reducing a user's posts with `curatePost`;
 *   reads `indexedUpdated`, `requestedIgnored`, `duplicates` (arrays) and the `*Count` counters.
 * @param {Object} indexed - Index lookup; `indexed[args.afterIndexed]` / `indexed[args.beforeIndexed]`
 *   are read for their `id` and `date` when the matching counter is non-zero.
 * @param {Object} user - The user being reported on; only `name` is read.
 * @param {Object} args - Run arguments; `after`, `before`, `afterIndexed` and `beforeIndexed` are echoed in messages.
 * @returns {void}
 */
function report(curatedPosts, indexed, user, args) {
    const {
        indexedUpdated: updated,
        requestedIgnored: ignoredById,
        duplicates: repeated,
        tooOldCount: tooOld,
        tooRecentCount: tooRecent,
        beforeIndexedCount: beforeIndexed,
        afterIndexedCount: afterIndexed,
    } = curatedPosts;

    // The message is built lazily: some messages dereference index entries or
    // arguments that are only guaranteed to be present when the count is non-zero.
    const note = (count, describe) => {
        if (count > 0) {
            logger.info(describe(count));
        }
    };

    note(updated.length, count => `Ignoring ${count} indexed posts for '${user.name}'`);

    note(ignoredById.length, count => `Ignoring ${count} posts because their IDs are specified to be ignored for '${user.name}'`);

    note(tooOld, count => `Ignoring ${count} older posts for '${user.name}' for specified date limit '${args.after}'`);

    note(tooRecent, count => `Ignoring ${count} newer posts for '${user.name}' for specified date limit '${args.before}'`);

    note(beforeIndexed, (count) => {
        const anchor = indexed[args.afterIndexed];

        return `Ignoring ${count} posts older than the ${args.afterIndexed} indexed post (${anchor.id}, ${anchor.date}) for '${user.name}'`;
    });

    note(afterIndexed, (count) => {
        const anchor = indexed[args.beforeIndexed];

        return `Ignoring ${count} posts newer than the ${args.beforeIndexed} indexed post (${anchor.id}, ${anchor.date}) for '${user.name}'`;
    });

    note(repeated.length, count => `Ignoring ${count} duplicate posts for '${user.name}'`);
}
/**
 * Reducer step for a single fetched post: builds its curated representation and
 * either appends it to one of the accumulator's buckets, bumps one of the
 * "ignored" counters, or returns the accumulator untouched.
 *
 * Checks run in this order and the first match wins: already indexed, outside the
 * indexed-post window, outside the explicit date window, limit reached, flagged by
 * `args.ignore`, host ID in `ignoreIds`, host label not included / excluded,
 * duplicate host content.
 *
 * @param {Object} acc - Accumulator with `posts`, `indexedUpdated`, `requestedIgnored`, `duplicates`
 *   arrays and `tooOldCount`, `tooRecentCount`, `beforeIndexedCount`, `afterIndexedCount` counters.
 * @param {Object} post - Raw post; reads `id`, `title`, `selftext`, `url`, `permalink`, `created_utc`,
 *   `subreddit.display_name`, `score`, `preview`, `direct`, `comments`.
 * @param {Object} user - Owner of the post; stored on the curated post without its `posts` property.
 * @param {number} index - Position of the post in the user's post list.
 * @param {Object} indexed - `entries` (previously indexed posts) plus entries addressable by the
 *   value of `args.afterIndexed` / `args.beforeIndexed`.
 * @param {Set<string>} ignoreIds - Lower-cased IDs compared against the dissected host ID.
 * @param {Set} processed - Host IDs accepted so far; mutated when a post with a host is accepted.
 * @param {Object} args - Run arguments (`redownload`, `afterIndexed`, `beforeIndexed`, `after`,
 *   `before`, `limit`, `ignore`, `include`, `exclude`).
 * @returns {Object} The next accumulator (the same reference when the post is silently skipped).
 */
function curatePost(acc, post, user, index, indexed, ignoreIds, processed, args) {
    const host = dissectLink(post.url);
    const permalink = `https://reddit.com${post.permalink}`;

    const curatedPost = {
        id: post.id,
        index,
        title: post.title,
        text: post.selftext,
        user: omit(user, ['posts']),
        permalink,
        url: post.url,
        datetime: new Date(post.created_utc * 1000),
        subreddit: post.subreddit.display_name,
        score: post.score,
        preview: post.preview ? post.preview.images.map(({ source }) => source) : null,
        host,
        direct: post.direct,
        comments: post.comments,
        hash: hashPost(post),
    };

    const postedAt = curatedPost.datetime;

    // Date limits are inclusive: a post dated exactly on the limit is filtered as well.
    const onOrBefore = moment => isBefore(postedAt, moment) || isEqual(postedAt, moment);
    const onOrAfter = moment => isAfter(postedAt, moment) || isEqual(postedAt, moment);

    // Non-mutating accumulator updates.
    const bump = counter => ({ ...acc, [counter]: acc[counter] + 1 });
    const collect = bucket => ({ ...acc, [bucket]: [...acc[bucket], curatedPost] });

    if (indexed.entries.length > 0) {
        const alreadyIndexed = indexed.entries.find(({ id }) => id === post.id);

        if (alreadyIndexed && !args.redownload) {
            // keep the stored preview around for the updated index entry
            curatedPost.previewFallback = alreadyIndexed.preview;

            return collect('indexedUpdated');
        }

        if (args.afterIndexed && onOrBefore(indexed[args.afterIndexed].date)) {
            return bump('beforeIndexedCount');
        }

        if (args.beforeIndexed && onOrAfter(indexed[args.beforeIndexed].date)) {
            return bump('afterIndexedCount');
        }
    }

    if (args.after && onOrBefore(args.after)) {
        return bump('tooOldCount');
    }

    if (args.before && onOrAfter(args.before)) {
        return bump('tooRecentCount');
    }

    // cut-off at limit, but don't count posts requested directly by ID
    const limitReached = acc.posts.length >= args.limit;

    if (limitReached && !post.direct) {
        return acc;
    }

    let ignoredFlag = null;

    if (args.ignore) {
        ignoredFlag = args.ignore.find(flag => post[flag]);
    }

    if (ignoredFlag) {
        logger.verbose(`Ignoring ${ignoredFlag} post '${post.title}' (${permalink})`);

        return acc;
    }

    if (host) {
        const notIncluded = args.include && !args.include.includes(host.label);
        const excluded = args.exclude && args.exclude.includes(host.label);

        if (ignoreIds.has(String(host.id).toLowerCase())) {
            return collect('requestedIgnored');
        }

        if (notIncluded || excluded) {
            logger.info(`Ignoring source '${host.label}' from post '${post.url}' (${permalink})`);

            return acc;
        }

        if (config.fetch.avoidDuplicates && processed.has(host.id)) {
            logger.verbose(`Ignoring duplicate content '${post.url}' (cross-post, repost or superfluous --post ID) (${permalink})`);

            return collect('duplicates');
        }

        // remember the content so later posts pointing at it count as duplicates
        processed.add(host.id);
    }

    return { ...collect('posts'), processed };
}
/**
 * Curates the fetched posts of every user: each user's posts are reduced through
 * `curatePost`, a summary of what was dropped is logged via `report`, and the user's
 * index bookkeeping is rebuilt.
 *
 * @param {Object} userPosts - Users keyed by an arbitrary key; each value provides `name`,
 *   `posts` (array) and `indexed` with `original` (array of entries with `id` and `date`) and `profile`.
 * @param {Array} ignoreIdsArray - IDs to ignore; compared case-insensitively as strings.
 *   A missing list is treated as empty.
 * @param {Object} args - Run arguments, passed through to `curatePost` and `report`.
 * @returns {Object} Users keyed by `user.name`, each with `posts` replaced by the curated posts and
 *   `indexed` replaced by `{ profile, original, updated, ignored, oldest, latest }`, where `original`
 *   is date-sorted and excludes entries that were matched by a fetched post.
 */
const curatePosts = (userPosts, ignoreIdsArray, args) => {
    // Identical for every user, so build it once instead of once per user.
    const ignoreIds = new Set((ignoreIdsArray || []).map(postId => String(postId).toLowerCase()));

    return Object.values(userPosts).reduce((accPosts, user) => {
        // Duplicate detection is scoped per user.
        const processed = new Set();

        // Sort a copy: Array.prototype.sort works in place and would otherwise
        // reorder the caller's index array as a side effect.
        const indexedByDate = [...user.indexed.original].sort((entryA, entryB) => new Date(entryA.date) - new Date(entryB.date));

        const indexed = {
            entries: indexedByDate,
            oldest: indexedByDate[0],
            latest: indexedByDate[indexedByDate.length - 1],
        };

        const curatedPosts = user.posts.reduce((accUserPosts, post, index) => curatePost(accUserPosts, post, user, index, indexed, ignoreIds, processed, args), {
            posts: [],
            indexedUpdated: [],
            requestedIgnored: [],
            duplicates: [],
            tooOldCount: 0,
            tooRecentCount: 0,
            beforeIndexedCount: 0,
            afterIndexedCount: 0,
        });

        report(curatedPosts, indexed, user, args);

        // Entries matched by a fetched post move to `updated`; look them up through
        // a Set rather than rescanning the updated list for every entry.
        const updatedIds = new Set(curatedPosts.indexedUpdated.map(post => post.id));
        const indexedOriginal = indexedByDate.filter(entry => !updatedIds.has(entry.id));

        return {
            ...accPosts,
            [user.name]: {
                ...user,
                posts: curatedPosts.posts,
                indexed: {
                    profile: user.indexed.profile,
                    original: indexedOriginal,
                    updated: curatedPosts.indexedUpdated,
                    ignored: curatedPosts.requestedIgnored,
                    oldest: indexed.oldest,
                    latest: indexed.latest,
                },
            },
        };
    }, {});
};
module.exports = curatePosts;