Fixed invalid URLs breaking the scrape by prefixing reddit URLs with the origin.
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
const config = require('config');
|
||||
const { isAfter, isBefore, isEqual } = require('date-fns');
|
||||
const omit = require('object.omit');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const dissectLink = require('../dissectLink');
|
||||
const hashPost = require('./hashPost');
|
||||
@@ -44,7 +45,7 @@ function report(curatedPosts, indexed, user, args) {
|
||||
}
|
||||
|
||||
function curatePost(acc, post, user, index, indexed, ignoreIds, processed, args) {
|
||||
const host = dissectLink(post.url);
|
||||
const host = dissectLink(unprint.prefixUrl(post.url, 'https://reddit.com'));
|
||||
const permalink = `https://reddit.com${post.permalink}`;
|
||||
|
||||
const curatedPost = {
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
|
||||
const UrlPattern = require('url-pattern');
|
||||
|
||||
const logger = require('./logger')(__filename);
|
||||
|
||||
const hosts = [
|
||||
{
|
||||
method: 'self',
|
||||
@@ -118,19 +120,23 @@ module.exports = function dissectLink(url) {
|
||||
return acc;
|
||||
}
|
||||
|
||||
// const match = host.pattern.match(url.replace(/(https?:\/\/)|(\/)+/g, '$1$2')); // remove double slashes
|
||||
const { origin, pathname } = new URL(url);
|
||||
const match = host.pattern.match(`${origin}${pathname}`); // remove double slashes
|
||||
try {
|
||||
const { origin, pathname } = new URL(url);
|
||||
const match = host.pattern.match(`${origin}${pathname}`); // remove double slashes
|
||||
|
||||
if (match) {
|
||||
return Object.assign(match, {
|
||||
url,
|
||||
method: host.method,
|
||||
label: host.label,
|
||||
});
|
||||
if (match) {
|
||||
return Object.assign(match, {
|
||||
url,
|
||||
method: host.method,
|
||||
label: host.label,
|
||||
});
|
||||
}
|
||||
|
||||
return null;
|
||||
} catch (error) {
|
||||
logger.error(`${error.message}: ${url}`);
|
||||
return url;
|
||||
}
|
||||
|
||||
return null;
|
||||
}, null);
|
||||
|
||||
if (hostMethod) {
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
|
||||
const args = require('../cli')();
|
||||
const logger = require('../logger')(__filename);
|
||||
const methods = require('../methods/methods');
|
||||
|
||||
@@ -30,7 +31,11 @@ async function attachContentInfo(users, { reddit, predata }) {
|
||||
},
|
||||
];
|
||||
} catch (error) {
|
||||
logger.warn(`${error.message} (${post.permalink})`);
|
||||
if (args.debug) {
|
||||
logger.warn(`${error.stack} (${post.permalink})`);
|
||||
} else {
|
||||
logger.warn(`${error.message} (${post.permalink})`);
|
||||
}
|
||||
|
||||
if (config.fetch.archives.preview && post.preview) {
|
||||
logger.info(`Found preview images for unavailable source '${post.url}' (${post.permalink})`);
|
||||
|
||||
@@ -20,8 +20,8 @@ async function redditAlbum(host, post) {
|
||||
url: host.url,
|
||||
title: post.title,
|
||||
},
|
||||
items: items.map(url => ({
|
||||
id: new URL(url).pathname.match(/\/(.*).jpg/)[1],
|
||||
items: items.map((url) => ({
|
||||
id: new URL(url).pathname.match(/\/(.*).\w+$/)?.[1],
|
||||
url,
|
||||
datetime: post.datetime,
|
||||
type: mime.getType(url) || 'image/jpeg',
|
||||
|
||||
Reference in New Issue
Block a user