Added hash comparison to duplicate avoidance.

This commit is contained in:
DebaucheryLibrarian 2024-09-11 05:16:58 +02:00
parent 7c6c99c9d1
commit 45123bc630
7 changed files with 305 additions and 35 deletions

227
package-lock.json generated
View File

@ -231,6 +231,60 @@
"tweetnacl": "^0.14.3"
}
},
"bhttp": {
"version": "1.2.4",
"resolved": "https://registry.npmjs.org/bhttp/-/bhttp-1.2.4.tgz",
"integrity": "sha1-/tDCT3ZbNa/ElAsIqzIUgT44848=",
"requires": {
"bluebird": "^2.8.2",
"concat-stream": "^1.4.7",
"debug": "^2.1.1",
"dev-null": "^0.1.1",
"errors": "^0.2.0",
"extend": "^2.0.0",
"form-data2": "^1.0.0",
"form-fix-array": "^1.0.0",
"lodash": "^2.4.1",
"stream-length": "^1.0.2",
"string": "^3.0.0",
"through2-sink": "^1.0.0",
"through2-spy": "^1.2.0",
"tough-cookie": "^2.3.1"
},
"dependencies": {
"bluebird": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
"integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE="
},
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"extend": {
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/extend/-/extend-2.0.2.tgz",
"integrity": "sha512-AgFD4VU+lVLP6vjnlNfF7OeInLTyeyckCNPEsuxz1vi786UuK/nk6ynPuhn/h+Ju9++TQyr5EpLRI14fc1QtTQ=="
},
"lodash": {
"version": "2.4.2",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-2.4.2.tgz",
"integrity": "sha1-+t2DS5aDBz2hebPq5tnA0VBT9z4="
}
}
},
"blake2": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/blake2/-/blake2-4.0.0.tgz",
"integrity": "sha512-PIOc6RXAZYBYcdpyMzI6/SCU3BH8EbmA9vr0BAVyQv48CQTXDN6viHOTM+8KQue2IPsyHNpIR3UDisz8rZDPTA==",
"requires": {
"nan": "^2.14.0"
}
},
"bluebird": {
"version": "3.5.1",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.5.1.tgz",
@ -259,8 +313,7 @@
"buffer-from": {
"version": "1.1.0",
"resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.0.tgz",
"integrity": "sha512-c5mRlguI/Pe2dSZmpER62rSCu0ryKmWddzRYsuXc50U2/g8jMOulc31VZMa4mYx31U5xsmSOpDCgH88Vl9cDGQ==",
"dev": true
"integrity": "sha512-c5mRlguI/Pe2dSZmpER62rSCu0ryKmWddzRYsuXc50U2/g8jMOulc31VZMa4mYx31U5xsmSOpDCgH88Vl9cDGQ=="
},
"builtin-modules": {
"version": "1.1.1",
@ -413,6 +466,31 @@
"delayed-stream": "~1.0.0"
}
},
"combined-stream2": {
"version": "1.1.2",
"resolved": "https://registry.npmjs.org/combined-stream2/-/combined-stream2-1.1.2.tgz",
"integrity": "sha1-9uFLegFWZvjHsKH6xQYkAWSsNXA=",
"requires": {
"bluebird": "^2.8.1",
"debug": "^2.1.1",
"stream-length": "^1.0.1"
},
"dependencies": {
"bluebird": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
"integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE="
},
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
}
}
},
"concat-map": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
@ -423,7 +501,6 @@
"version": "1.6.2",
"resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-1.6.2.tgz",
"integrity": "sha512-27HBghJxjiZtIk3Ycvn/4kbJk/1uZuJFfuPEns6LaEvpvG1f0hTea8lilrouyo9mVc2GWdcEZ8OLoGmSADlrCw==",
"dev": true,
"requires": {
"buffer-from": "^1.0.0",
"inherits": "^2.0.3",
@ -568,6 +645,11 @@
"resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
"integrity": "sha1-3zrhmayt+31ECqrgsp4icrJOxhk="
},
"dev-null": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/dev-null/-/dev-null-0.1.1.tgz",
"integrity": "sha1-WiBc48Ky73e2I41roXnrdMag6Bg="
},
"dist-exiftool": {
"version": "10.53.0",
"resolved": "https://registry.npmjs.org/dist-exiftool/-/dist-exiftool-10.53.0.tgz",
@ -664,6 +746,11 @@
"is-arrayish": "^0.2.1"
}
},
"errors": {
"version": "0.2.0",
"resolved": "https://registry.npmjs.org/errors/-/errors-0.2.0.tgz",
"integrity": "sha1-D1Hoidqj4RsZ5xhtEfEEqmbrJAM="
},
"es-abstract": {
"version": "1.11.0",
"resolved": "https://registry.npmjs.org/es-abstract/-/es-abstract-1.11.0.tgz",
@ -1043,6 +1130,49 @@
"mime-types": "^2.1.12"
}
},
"form-data2": {
"version": "1.0.3",
"resolved": "https://registry.npmjs.org/form-data2/-/form-data2-1.0.3.tgz",
"integrity": "sha1-y6XiNgGmlE2Vq31xEf+Tl6XLKk0=",
"requires": {
"bluebird": "^2.8.2",
"combined-stream2": "^1.0.2",
"debug": "^2.1.1",
"lodash": "^2.4.1",
"mime": "^1.2.11",
"uuid": "^2.0.1"
},
"dependencies": {
"bluebird": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
"integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE="
},
"debug": {
"version": "2.6.9",
"resolved": "https://registry.npmjs.org/debug/-/debug-2.6.9.tgz",
"integrity": "sha512-bC7ElrdJaJnPbAP+1EotYvqZsb3ecl5wi6Bfi6BJTUcNowp6cvspg0jXznRTKDjm/E7AdgFBVeAPVMNcKGsHMA==",
"requires": {
"ms": "2.0.0"
}
},
"lodash": {
"version": "2.4.2",
"resolved": "https://registry.npmjs.org/lodash/-/lodash-2.4.2.tgz",
"integrity": "sha1-+t2DS5aDBz2hebPq5tnA0VBT9z4="
},
"uuid": {
"version": "2.0.3",
"resolved": "https://registry.npmjs.org/uuid/-/uuid-2.0.3.tgz",
"integrity": "sha1-Z+LoY3lyFVMN/zGOW/nc6/1Hsho="
}
}
},
"form-fix-array": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/form-fix-array/-/form-fix-array-1.0.0.tgz",
"integrity": "sha1-oTR6R+UxF6t7zb8+Lz7JHGZ2m8g="
},
"fs-extra": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-5.0.0.tgz",
@ -1774,6 +1904,11 @@
"resolved": "https://registry.npmjs.org/merge-stream/-/merge-stream-2.0.0.tgz",
"integrity": "sha512-abv/qOcuPfk3URPfDzmZU1LKmuw8kT+0nIHvKrKgFrwifol/doWcdA4ZqsWQ8ENrFKkd67Mfpo/LovbIUsbt3w=="
},
"mime": {
"version": "1.6.0",
"resolved": "https://registry.npmjs.org/mime/-/mime-1.6.0.tgz",
"integrity": "sha512-x0Vn8spI+wuJ1O6S7gnbaQg8Pxh4NNHb7KSINmEWKiPE4RKOplvijn+NkmYmmRgP68mc70j2EbeTFRsrswaQeg=="
},
"mime-db": {
"version": "1.33.0",
"resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.33.0.tgz",
@ -1817,8 +1952,7 @@
"ms": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.0.0.tgz",
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g=",
"dev": true
"integrity": "sha1-VgiurfwAvmwpAd9fmGF4jeDVl8g="
},
"mute-stream": {
"version": "0.0.7",
@ -1826,6 +1960,11 @@
"integrity": "sha1-MHXOk7whuPq0PhvE2n6BFe0ee6s=",
"dev": true
},
"nan": {
"version": "2.14.0",
"resolved": "https://registry.npmjs.org/nan/-/nan-2.14.0.tgz",
"integrity": "sha512-INOFj37C7k3AfaNTtX8RhsTw7qRy7eLET14cROi9+5HAVbbHuIWUHEauBv5qT4Av2tWasiTY1Jw6puUNqRJXQg=="
},
"natural-compare": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz",
@ -2538,6 +2677,21 @@
"resolved": "https://registry.npmjs.org/stealthy-require/-/stealthy-require-1.1.1.tgz",
"integrity": "sha1-NbCYdbT/SfJqd35QmzCQoyJr8ks="
},
"stream-length": {
"version": "1.0.2",
"resolved": "https://registry.npmjs.org/stream-length/-/stream-length-1.0.2.tgz",
"integrity": "sha1-gnfzy+5JpNqrz9tOL0qbXp8snwA=",
"requires": {
"bluebird": "^2.6.2"
},
"dependencies": {
"bluebird": {
"version": "2.11.0",
"resolved": "https://registry.npmjs.org/bluebird/-/bluebird-2.11.0.tgz",
"integrity": "sha1-U0uQM8AiyVecVro7Plpcqvu2UOE="
}
}
},
"streamify": {
"version": "0.2.9",
"resolved": "https://registry.npmjs.org/streamify/-/streamify-0.2.9.tgz",
@ -2546,6 +2700,11 @@
"hashish": "~0.0.4"
}
},
"string": {
"version": "3.3.3",
"resolved": "https://registry.npmjs.org/string/-/string-3.3.3.tgz",
"integrity": "sha1-XqIRzZLSKOGEKUmQpsyXs2anfLA="
},
"string-width": {
"version": "2.1.1",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-2.1.1.tgz",
@ -2635,6 +2794,56 @@
"integrity": "sha1-DdTJ/6q8NXlgsbckEV1+Doai4fU=",
"dev": true
},
"through2": {
"version": "0.5.1",
"resolved": "https://registry.npmjs.org/through2/-/through2-0.5.1.tgz",
"integrity": "sha1-390BLrnHAOIyP9M084rGIqs3Lac=",
"requires": {
"readable-stream": "~1.0.17",
"xtend": "~3.0.0"
},
"dependencies": {
"isarray": {
"version": "0.0.1",
"resolved": "https://registry.npmjs.org/isarray/-/isarray-0.0.1.tgz",
"integrity": "sha1-ihis/Kmo9Bd+Cav8YDiTmwXR7t8="
},
"readable-stream": {
"version": "1.0.34",
"resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-1.0.34.tgz",
"integrity": "sha1-Elgg40vIQtLyqq+v5MKRbuMsFXw=",
"requires": {
"core-util-is": "~1.0.0",
"inherits": "~2.0.1",
"isarray": "0.0.1",
"string_decoder": "~0.10.x"
}
},
"string_decoder": {
"version": "0.10.31",
"resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-0.10.31.tgz",
"integrity": "sha1-YuIDvEF2bGwoyfyEMB2rHFMQ+pQ="
}
}
},
"through2-sink": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/through2-sink/-/through2-sink-1.0.0.tgz",
"integrity": "sha1-XxBruh1zMNrTy6XAqxhjkjJWw5k=",
"requires": {
"through2": "~0.5.1",
"xtend": "~3.0.0"
}
},
"through2-spy": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/through2-spy/-/through2-spy-1.2.0.tgz",
"integrity": "sha1-nIkcqcpA4eHkzzHhrFf5TMnSSMs=",
"requires": {
"through2": "~0.5.1",
"xtend": "~3.0.0"
}
},
"tmp": {
"version": "0.0.33",
"resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz",
@ -2697,8 +2906,7 @@
"typedarray": {
"version": "0.0.6",
"resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c=",
"dev": true
"integrity": "sha1-hnrHTjhkGHsdPUfZlqeOxciDB3c="
},
"ultron": {
"version": "1.1.1",
@ -2919,6 +3127,11 @@
"resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz",
"integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="
},
"xtend": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/xtend/-/xtend-3.0.0.tgz",
"integrity": "sha1-XM50B7r2Qsunvs2laBEcST9ZZlo="
},
"y18n": {
"version": "3.2.1",
"resolved": "https://registry.npmjs.org/y18n/-/y18n-3.2.1.tgz",

View File

@ -29,6 +29,8 @@
"license": "ISC",
"dependencies": {
"array.prototype.flatten": "^1.2.1",
"bhttp": "^1.2.4",
"blake2": "^4.0.0",
"bluebird": "^3.5.1",
"cheerio": "^1.0.0-rc.2",
"config": "^1.30.0",

View File

@ -2,16 +2,16 @@
const config = require('config');
const Promise = require('bluebird');
const yaml = require('js-yaml');
const saveProfileDetails = require('../save/profileDetails.js');
const fetchItem = require('./item.js');
const interpolate = require('../interpolate.js');
const save = require('../save/save.js');
const textToStream = require('../save/textToStream.js');
// const textToStream = require('../save/textToStream.js');
const saveMeta = require('../save/meta.js');
const mux = require('../save/mux.js');
const writeToIndex = require('../save/writeToIndex.js');
const yaml = require('js-yaml');
function curateComments(comments) {
return comments.map((comment) => {
@ -48,16 +48,19 @@ function selfPostToText(item, post) {
return yaml.safeDump(curatedPost);
}
async function getStreams(item, post) {
async function getBuffers(item, post) {
if (item.self) {
return [textToStream(selfPostToText(item, post))];
return [{
...Buffer.from(selfPostToText(item, post), 'utf8'),
hash: post.hash,
}];
}
const sources = item.mux ? [item.url].concat(item.mux) : [item.url];
const streams = await Promise.map(sources, source => fetchItem(source, 0, post));
const buffers = await Promise.map(sources, source => fetchItem(source, 0, post));
if (streams.filter(stream => stream).length > 0) {
return streams;
if (buffers.filter(buffer => buffer).length > 0) {
return buffers;
}
return null;
@ -101,19 +104,32 @@ function getFilepath(item, content, host, post, user) {
async function fetchSaveUserContent(user, ep, args) {
const profilePaths = await saveProfileDetails(user, args);
const hashes = new Set(user.indexed.original.map(item => item.hash));
const posts = await Promise.map(user.posts, async (post) => {
await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
const hash = await Promise.reduce(post.content.items, async (accItems, originalItem, index) => {
const item = { ...originalItem, index };
const streams = await getStreams(item, post);
const buffers = await getBuffers(item, post);
// no streams, ignore item
if (!streams || streams.length === 0) {
// no buffers, ignore item
if (!buffers || buffers.length === 0) {
return accItems;
}
// prevent duplicates
if (config.fetch.avoidDuplicates && hashes.has(buffers[0].hash)) {
console.log(
'\x1b[33m%s\x1b[0m',
`Ignoring duplicate file '${post.url}' (${post.permalink})`,
);
return buffers[0].hash;
}
const filepath = getFilepath(item, post.content, post.host, post, user);
const sourcePaths = await save(filepath, streams, item, post);
const sourcePaths = await save(filepath, buffers, item, post);
hashes.add(buffers[0].hash);
if (item.mux) {
await mux(filepath, sourcePaths, item);
@ -121,10 +137,13 @@ async function fetchSaveUserContent(user, ep, args) {
await addMeta(filepath, item, post, user, ep);
return sourcePaths;
return buffers[0].hash;
}, []);
return post;
return {
...post,
hash,
};
}, {
concurrency: config.fetch.concurrency,
});
@ -135,15 +154,15 @@ async function fetchSaveUserContent(user, ep, args) {
async function fetchSaveDirectContent(content, host, ep) {
return Promise.reduce(content.items, async (accItems, originalItem, index) => {
const item = { ...originalItem, index };
const streams = await getStreams(item, null);
const buffers = await getBuffers(item, null);
// no streams, ignore item
if (!streams || streams.length === 0) {
// no buffers, ignore item
if (!buffers || buffers.length === 0) {
return accItems;
}
const filepath = getFilepath(item, content, host, null, null);
const sourcePaths = await save(filepath, streams, item, null);
const sourcePaths = await save(filepath, buffers, item, null);
if (item.mux) {
await mux(filepath, sourcePaths, item);

View File

@ -1,7 +1,8 @@
'use strict';
const config = require('config');
const fetch = require('node-fetch');
const bhttp = require('bhttp');
const blake2 = require('blake2');
async function fetchItem(url, attempt, post) {
async function retry(error) {
@ -17,16 +18,21 @@ async function fetchItem(url, attempt, post) {
}
try {
const res = await fetch(url);
const res = await bhttp.get(url);
if (!res.ok) {
// Compare the status code itself. Negating it first (`!res.statusCode === 200`)
// yields a boolean that can never equal 200, so the check would never fire and
// error responses would be hashed and saved as if they were content.
if (res.statusCode !== 200) {
    throw new Error(`Response not OK for '${url}', HTTP code '${res.statusCode}'`);
}
console.log(`Fetched '${url}' (${post ? post.permalink : 'no post'})`);
return res.body;
const hash = blake2.createHash('blake2b', { digestLength: 24 });
hash.update(res.body);
const contentHash = hash.digest('hex');
return Object.assign(res.body, { hash: contentHash });
} catch (error) {
console.log(error);
return retry(error);
}
}

View File

@ -6,7 +6,7 @@ const UrlPattern = require('url-pattern');
const interpolate = require('../interpolate.js');
const fetchItem = require('../fetch/item.js');
const textToStream = require('./textToStream.js');
// const textToStream = require('./textToStream.js');
const save = require('./save.js');
async function saveProfileImage(user, args) {
@ -38,7 +38,8 @@ async function saveProfileImage(user, args) {
);
try {
const stream = await fetchItem(image, 0, { permalink: `https://reddit.com/user/${user.name}` });
const { protocol, hostname, pathname } = new URL(image);
const stream = await fetchItem(`${protocol}//${hostname}${pathname}`, 0, { permalink: `https://reddit.com/user/${user.name}` });
const targets = await save(filepath, stream);
return targets[0];
@ -62,7 +63,7 @@ async function saveProfileDescription(user, args) {
if (config.library.profile.description && !user.fallback && !user.deleted) {
if (user.profile && user.profile.description) {
const filepath = interpolate(config.library.profile.description, null, null, null, null, user);
const stream = textToStream(user.profile.description);
const stream = Buffer.from(user.profile.description, 'utf8');
try {
const targets = await save(filepath, stream);

View File

@ -26,7 +26,20 @@ function getPathElements(requestedFilepath) {
};
}
function pipeStreamToFile(target, stream, item) {
/**
 * Write a buffer to the given path and log the result.
 *
 * @param {string} target - Path of the file to write.
 * @param {Buffer} buffer - Data handed to `fs.writeFile`.
 * @param {Object} [item] - Optional item; a truthy `item.mux` switches the log
 *   message to the "queued for muxing" variant.
 * @returns {Promise<string>} Resolves with `target` once the write has completed.
 */
async function writeBufferToFile(target, buffer, item) {
    await fs.writeFile(target, buffer);

    const awaitsMuxing = Boolean(item && item.mux);

    if (!awaitsMuxing) {
        // Green output for files that are saved in their final form.
        console.log('\x1b[32m%s\x1b[0m', `Saved '${target}'`);

        return target;
    }

    console.log(`Temporarily saved '${target}', queued for muxing`);

    return target;
}
/*
async function pipeStreamToFile(target, stream, item) {
const file = fs.createWriteStream(target);
return new Promise((resolve, reject) => {
@ -57,5 +70,19 @@ async function save(requestedFilepath, streamOrStreams, item) {
return pipeStreamToFile(target, stream, item);
}));
}
*/
/**
 * Write one or more buffers to files derived from the requested path.
 *
 * The path is split by `getPathElements`; the directory is ensured to exist
 * before anything is written. When more than one buffer is given, each file
 * name gets a `-<index>` suffix before the extension.
 *
 * @param {string} requestedFilepath - Path passed to `getPathElements`.
 * @param {Buffer|Buffer[]} bufferOrBuffers - A single buffer or a list of buffers.
 * @param {Object} [item] - Passed through to `writeBufferToFile`.
 * @returns {Promise<string[]>} Resolves with the written target paths, in input order.
 */
async function save(requestedFilepath, bufferOrBuffers, item) {
    const { root, dir, name, ext } = getPathElements(requestedFilepath);

    // Normalise to an array so a single buffer can be passed directly.
    const buffers = [].concat(bufferOrBuffers);
    const needsSuffix = buffers.length > 1;

    await fs.ensureDir(dir);

    const writes = buffers.map((buffer, index) => {
        const suffix = needsSuffix ? `-${index}` : '';

        return writeBufferToFile(path.join(root, dir, `${name}${suffix}${ext}`), buffer, item);
    });

    return Promise.all(writes);
}
module.exports = save;

View File

@ -4,7 +4,7 @@ const config = require('config');
const yaml = require('js-yaml');
const interpolate = require('../interpolate');
const textToStream = require('./textToStream');
// const textToStream = require('./textToStream');
const save = require('./save');
async function writeToIndex(posts, profilePaths, user, args) {
@ -22,6 +22,7 @@ async function writeToIndex(posts, profilePaths, user, args) {
indexed: now,
score: post.score,
title: post.title,
hash: post.hash,
};
if (post.previewFallback) {
@ -43,7 +44,8 @@ async function writeToIndex(posts, profilePaths, user, args) {
return false;
}
return save(filepath, textToStream(yaml.safeDump(data)));
// return save(filepath, textToStream(yaml.safeDump(data)));
return save(filepath, Buffer.from(yaml.safeDump(data), 'utf8'));
}
module.exports = writeToIndex;