Fixed invalid URL breaking the scrape by prefixing reddit URLs with the origin.

This commit is contained in:
2024-09-11 05:16:59 +02:00
parent b1e13e048c
commit e642203d6c
6 changed files with 115 additions and 467 deletions

View File

@@ -3,6 +3,7 @@
const config = require('config');
const { isAfter, isBefore, isEqual } = require('date-fns');
const omit = require('object.omit');
const unprint = require('unprint');
const dissectLink = require('../dissectLink');
const hashPost = require('./hashPost');
@@ -44,7 +45,7 @@ function report(curatedPosts, indexed, user, args) {
}
function curatePost(acc, post, user, index, indexed, ignoreIds, processed, args) {
const host = dissectLink(post.url);
const host = dissectLink(unprint.prefixUrl(post.url, 'https://reddit.com'));
const permalink = `https://reddit.com${post.permalink}`;
const curatedPost = {

View File

@@ -2,6 +2,8 @@
const UrlPattern = require('url-pattern');
const logger = require('./logger')(__filename);
const hosts = [
{
method: 'self',
@@ -118,19 +120,23 @@ module.exports = function dissectLink(url) {
return acc;
}
// const match = host.pattern.match(url.replace(/(https?:\/\/)|(\/)+/g, '$1$2')); // remove double slashes
const { origin, pathname } = new URL(url);
const match = host.pattern.match(`${origin}${pathname}`); // remove double slashes
try {
const { origin, pathname } = new URL(url);
const match = host.pattern.match(`${origin}${pathname}`); // remove double slashes
if (match) {
return Object.assign(match, {
url,
method: host.method,
label: host.label,
});
if (match) {
return Object.assign(match, {
url,
method: host.method,
label: host.label,
});
}
return null;
} catch (error) {
logger.error(`${error.message}: ${url}`);
return url;
}
return null;
}, null);
if (hostMethod) {

View File

@@ -3,6 +3,7 @@
const config = require('config');
const Promise = require('bluebird');
const args = require('../cli')();
const logger = require('../logger')(__filename);
const methods = require('../methods/methods');
@@ -30,7 +31,11 @@ async function attachContentInfo(users, { reddit, predata }) {
},
];
} catch (error) {
logger.warn(`${error.message} (${post.permalink})`);
if (args.debug) {
logger.warn(`${error.stack} (${post.permalink})`);
} else {
logger.warn(`${error.message} (${post.permalink})`);
}
if (config.fetch.archives.preview && post.preview) {
logger.info(`Found preview images for unavailable source '${post.url}' (${post.permalink})`);

View File

@@ -20,8 +20,8 @@ async function redditAlbum(host, post) {
url: host.url,
title: post.title,
},
items: items.map(url => ({
id: new URL(url).pathname.match(/\/(.*).jpg/)[1],
items: items.map((url) => ({
id: new URL(url).pathname.match(/\/(.*).\w+$/)?.[1],
url,
datetime: post.datetime,
type: mime.getType(url) || 'image/jpeg',