Browse Source

Storing file at rudimentary interpolated destination path.

master
ThePendulum 2 years ago
parent
commit
04270f2f04
  1. 41
      config/default.js
  2. 5
      package-lock.json
  3. 1
      package.json
  4. 4
      src/app.js
  5. 2
      src/args.js
  6. 96
      src/content/fetch.js
  7. 5
      src/content/gfycat.js
  8. 119
      src/content/store.js
  9. 4
      src/feeds/feeds.js
  10. 4
      src/utils/http.js
  11. 76
      src/utils/slugify.js

41
config/default.js

@@ -1,6 +1,47 @@
// Default configuration. All `library` path values are templates that are
// interpolated at store time (see src/content/store.js) with template-format;
// {curly} tokens resolve to feed/post/item/host metadata.
module.exports = {
  library: {
    // Scratch directory where downloads land before being renamed into place.
    temp: 'output/temp',
    // Base directories, themselves templates; referenced below as
    // {base.feed} and {base.direct}.
    base: {
      feed: 'output/{feed.name}/',
      direct: 'output/{host.slug}/',
    },
    // Destination templates for items that arrived via a subscribed feed,
    // keyed by the major MIME type ('image', 'video', 'text').
    feed: {
      image: '{base.feed}{post.date}{div}{item.id}{div}{post.title}{ext}',
      video: '{base.feed}{post.date}{div}{item.id}{div}{post.title}{ext}',
      text: '{base.feed}{post.date}{div}{item.id}{div}{post.title}',
      // Album members get one directory per album, one file per item.
      album: {
        image: '{base.feed}{post.date}{div}{album.id}{div}{post.title}/{item.index}{div}{item.id}{ext}',
        video: '{base.feed}{post.date}{div}{album.id}{div}{post.title}/{item.index}{div}{item.id}{ext}',
      },
    },
    // Destination templates for items fetched from direct links (no feed).
    // {divs.item.title} is a divider that is empty when the title is empty,
    // so filenames never end in a dangling separator.
    direct: {
      image: '{base.direct}{item.date}{div}{item.id}{divs.item.title}{item.title}{ext}',
      video: '{base.direct}{item.date}{div}{item.id}{divs.item.title}{item.title}{ext}',
      text: '{base.direct}{item.date}{div}{item.id}{divs.item.title}{item.title}',
      album: {
        image: '{base.direct}{album.date}{div}{album.id}{divs.album.title}{album.title}/{item.index}{div}{item.id}{ext}',
        video: '{base.direct}{album.date}{div}{album.id}{divs.album.title}{album.title}/{item.index}{div}{item.id}{ext}',
      },
    },
    // NOTE(review): presumably stores a one-item album as a plain item rather
    // than creating an album directory — confirm against the consumer.
    extractSingleAlbumItem: true,
    // Templates for a feed's avatar/profile image and bio text.
    profile: {
      image: '{base.feed}{feed.name}{ext}',
      bio: '{base.feed}{feed.name}',
    },
    // Template for the per-feed index file.
    index: {
      file: '{base.feed}index',
    },
    // moment-style format applied to {*.date} tokens.
    dateFormat: 'YYYYMMDD',
    // Separator used for the {div}/{divider} tokens.
    divider: ' - ',
    // NOTE(review): presumably substituted for '/' in titles so they cannot
    // escape into subdirectories — usage not visible here, confirm.
    slashSubstitute: '#',
    // NOTE(review): presumably caps interpolated filename length — confirm.
    truncate: {
      limit: 250,
      truncator: '...',
    },
  },
  limits: {
    // Minimum milliseconds between HTTP requests.
    requestInterval: 1000,
    // Maximum concurrent HTTP requests.
    requestConcurrency: 1,
    // Download retry attempts (used by src/content/fetch.js).
    attempts: 3,
  },
};

5
package-lock.json

@@ -5303,6 +5303,11 @@
}
}
},
"template-format": {
"version": "1.2.5",
"resolved": "https://registry.npmjs.org/template-format/-/template-format-1.2.5.tgz",
"integrity": "sha512-ZZqSfqYBMfPjouADYSRN9iaYlLr2PPVFYgULcV8cGMrJbifNXKvP7qx5PBFQjXg5mh1Gwkk+LTgdsZ8bmSvBdw=="
},
"text-hex": {
"version": "1.0.0",
"resolved": "https://registry.npmjs.org/text-hex/-/text-hex-1.0.0.tgz",

1
package.json

@@ -33,6 +33,7 @@
"moment": "^2.29.0",
"nanoid": "^3.1.16",
"promise-task-queue": "^1.2.0",
"template-format": "^1.2.5",
"winston": "^3.3.3",
"winston-daily-rotate-file": "^4.5.0",
"yargs": "^16.0.3"

4
src/app.js

@@ -3,13 +3,13 @@
const util = require('util');
const { fetchFeeds } = require('./feeds/feeds');
const { fetchFeedsContent } = require('./content/content');
const { fetchFeedsContent } = require('./content/fetch');
async function init() {
const feeds = await fetchFeeds();
const feedsWithContent = await fetchFeedsContent(feeds);
console.log(util.inspect(feedsWithContent, null, null));
// console.log(util.inspect(feedsWithContent, null, null));
}
init();

2
src/args.js

@@ -44,7 +44,7 @@ const args = yargs
})
.option('limit', {
describe: 'Maximum number of items to fetch content from.',
type: 'boolean',
type: 'number',
})
.option('after', {
describe: 'Only include items uploaded after this date or timespan.',

96
src/content/content.js → src/content/fetch.js

@@ -1,30 +1,63 @@
'use strict';
const config = require('config');
const fs = require('fs');
const fsPromises = require('fs').promises;
const path = require('path');
const Promise = require('bluebird');
const { nanoid } = require('nanoid/non-secure');
const mime = require('mime');
const logger = require('../logger')(__filename);
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const gfycat = require('./gfycat');
const { storeContent } = require('./store');
const hosts = {
'www.gfycat.com': gfycat,
'www.redgifs.com': gfycat,
'gfycat.com': gfycat,
'redgifs.com': gfycat,
};
async function fetchSource(source, _item, _post, _feed) {
const tempPath = path.join('media', nanoid());
async function fetchSource(source, item, post, feed, attempt = 1) {
const tempPath = path.join(config.library.temp, nanoid());
await fsPromises.mkdir(config.library.temp, { recursive: true });
try {
const res = await http.get(source, {
destination: fs.createWriteStream(tempPath),
});
if (res.ok) {
const type = source.type || mime.getType(path.extname(source));
const extension = mime.getExtension(type);
return {
source,
tempPath,
extension,
type,
};
}
throw new Error(`Server response ${res.status}`);
} catch (error) {
if (attempt < config.limits.attempts) {
logger.warn(`Failed attempt ${attempt}/3 to fetch source ${source} for '${feed.name}': ${error.message}`);
const res = await http.get(source, {
destination: fs.createWriteStream(tempPath),
});
await Promise.delay(1000);
console.log(res.status);
return fetchSource(source, item, post, feed, attempt + 1);
}
}
return source;
return {
source,
tempPath: null,
};
}
async function fetchItem(item, post, feed) {
@@ -44,7 +77,7 @@ async function fetchItem(item, post, feed) {
return {
...item,
media,
...media,
};
} catch (error) {
logger.warn(error.message);
@@ -54,6 +87,10 @@ async function fetchItem(item, post, feed) {
}
async function fetchContentItems(post, feed) {
if (!post.content) {
return post;
}
const itemsWithFetchedSources = await Promise.map(
post.content.items,
async (item) => fetchItem(item, post, feed),
@@ -69,10 +106,21 @@ async function fetchContentItems(post, feed) {
};
}
// Reduce a host module plus its matched hostname to the plain metadata
// object attached to posts: display name, a slugified name for use in
// filesystem paths, and the hostname the post was matched on.
function curateHost(host, hostname) {
  return {
    name: host.name,
    slug: slugify(host.name, '_'),
    hostname,
  };
}
async function fetchContent(post, feed) {
const { hostname } = new URL(post.link);
const host = hosts[hostname] || hosts[`www.${hostname}`];
const curatedHostname = hostname.replace('www.', '');
const host = hosts[curatedHostname];
if (!host) {
logger.warn(`No method available for content from '${feed.name || '[unknown]'}': ${post.link}`);
@@ -91,6 +139,7 @@ async function fetchContent(post, feed) {
return {
...post,
content: item,
host: curateHost(host, curatedHostname),
};
}
@@ -99,9 +148,10 @@ async function fetchContent(post, feed) {
return {
...post,
content: {
isAlbum: true,
...item,
isAlbum: true,
},
host: curateHost(host, curatedHostname),
};
}
@@ -109,9 +159,10 @@ async function fetchContent(post, feed) {
return {
...post,
content: {
isAlbum: Array.isArray(item),
items: [].concat(item), // methods may return single item object instead of array
isAlbum: Array.isArray(item),
},
host: curateHost(host, curatedHostname),
};
}
@@ -152,7 +203,24 @@ async function fetchFeedsContent(feeds) {
{ concurrency: 2 },
);
return feedsWithFetchedSources;
const feedsWithStoredContent = await Promise.map(
feedsWithFetchedSources,
async (feed) => {
const postsWithStoredContent = await Promise.map(
feed.posts,
async (post) => storeContent(post, feed),
{ concurrency: 20 },
);
return {
...feed,
posts: postsWithStoredContent,
};
},
{ concurrency: 2 },
);
return feedsWithStoredContent;
}
module.exports = {

5
src/content/gfycat.js

@@ -34,10 +34,13 @@ async function fetchContent(url, _post) {
id: item.gfyName || id, // gfyName is the capitalized ID
title: item.title || null,
date: moment(item.createDate * 1000).toDate(),
username: (item.userData?.name || item.userDisplayName || item.username)?.trim(),
user: {
name: (item.userData?.name || item.userDisplayName || item.username)?.trim(),
},
};
}
module.exports = {
name: 'Gfycat',
fetchContent,
};

119
src/content/store.js

@@ -0,0 +1,119 @@
'use strict';
const config = require('config');
const Promise = require('bluebird');
const path = require('path');
const fsPromises = require('fs').promises;
const format = require('template-format');
const moment = require('moment');
const logger = require('../logger')(__filename);
/**
 * Builds the destination path for a fetched media item by interpolating the
 * configured path template (config.library.feed/direct, keyed by the major
 * MIME type) with feed, post, host and item metadata.
 *
 * @param {Object} item - Fetched item; reads id, title, description, date,
 *   type (MIME string) and extension.
 * @param {Object} post - Originating post; reads title and host ({name, slug}).
 * @param {Object} feed - Owning feed; reads name and type ('direct' or a feed type).
 * @returns {string} The interpolated destination path.
 */
function interpolate(item, post, feed) {
  const type = item.type.split('/')[0]; // e.g. 'image/jpeg' -> 'image'

  // NOTE(review): the album sub-patterns (config.library.*.album) are never
  // selected here — albums currently fall through to the flat per-type
  // pattern. Confirm whether album handling is still pending.
  const pattern = feed.type === 'direct'
    ? config.library.direct[type]
    : config.library.feed[type];

  const data = {};

  if (feed) {
    data.feed = {
      name: feed.name,
      type: feed.type,
    };
  }

  if (post) {
    data.post = {
      title: post.title,
    };

    data.host = {
      name: post.host.name,
      slug: post.host.slug,
    };
  }

  if (item) {
    data.item = {
      id: item.id,
      title: item.title,
      description: item.description,
      date: item.date && moment(item.date).format(config.library.dateFormat),
    };

    // Omit the suffix entirely when the MIME type was unknown instead of
    // emitting a literal '.undefined' in the filename.
    data.ext = item.extension ? `.${item.extension}` : '';
  }

  // For every data value, precompute a matching divider that is empty when
  // the value itself is empty, so templates can write '{divs.item.title}'
  // without risking a dangling ' - ' separator.
  const dividers = Object.entries(data).reduce((acc, [key, value]) => {
    if (typeof value === 'string') {
      return {
        ...acc,
        [key]: value ? config.library.divider : '',
      };
    }

    return {
      ...acc,
      [key]: Object.entries(value).reduce((subacc, [subkey, subvalue]) => ({
        ...subacc,
        [subkey]: subvalue ? config.library.divider : '',
      }), {}),
    };
  }, {});

  // {base.feed}/{base.direct} are themselves templates, interpolated first.
  const interpolated = format(pattern, {
    ...data,
    base: {
      feed: format(config.library.base.feed, data),
      direct: format(config.library.base.direct, data),
    },
    dividers,
    divider: config.library.divider,
    div: config.library.divider, // short alias used by the default templates
    divs: dividers,
  });

  return interpolated;
}
/**
 * Moves a downloaded item from its temp location to its interpolated
 * destination path, creating the destination directory as needed.
 *
 * Items without a tempPath (failed downloads) pass through untouched.
 *
 * @param {Object} item - Fetched item with tempPath, title, extension, type.
 * @param {Object} post - Originating post (title used for logging and paths).
 * @param {Object} feed - Owning feed (name used for logging and paths).
 * @returns {Object} The same item object.
 */
async function storeItem(item, post, feed) {
  if (!item.tempPath) {
    return item;
  }

  const destination = interpolate(item, post, feed);

  logger.info(`Storing '${item.title || post.title}' from '${feed.name}' at ${destination}`);

  await fsPromises.mkdir(path.parse(destination).dir, { recursive: true });

  try {
    await fsPromises.rename(item.tempPath, destination);
  } catch (error) {
    // rename() cannot cross devices; since the temp dir is configurable it
    // may live on a different volume than the library — fall back to
    // copy-then-unlink in that case, rethrow anything else.
    if (error.code !== 'EXDEV') {
      throw error;
    }

    await fsPromises.copyFile(item.tempPath, destination);
    await fsPromises.unlink(item.tempPath);
  }

  return item;
}
/**
 * Stores every fetched item of a post at its destination path and returns a
 * copy of the post with the stored items in place.
 *
 * Posts without fetched content pass through untouched.
 */
async function storeContent(post, feed) {
  if (!post.content) {
    return post;
  }

  const storedItems = await Promise.map(
    post.content.items,
    (contentItem) => storeItem(contentItem, post, feed),
    { concurrency: 20 },
  );

  return {
    ...post,
    content: {
      ...post.content,
      items: storedItems,
    },
  };
}
module.exports = {
storeContent,
};

4
src/feeds/feeds.js

@@ -16,6 +16,7 @@ async function fetchPosts(channelName, feedType) {
async function fetchFeeds() {
const directFeed = {
name: 'direct',
type: 'direct',
posts: argv.items?.map((link) => ({
link,
@@ -28,11 +29,12 @@ async function fetchFeeds() {
if (argv[feedType]) {
return Promise.all(argv[feedType].map(async (channelName) => {
const posts = await fetchPosts(channelName, feedType);
const limitedPosts = posts.slice(0, argv.limit);
return {
name: channelName,
type: feedType,
posts,
posts: limitedPosts,
};
}));
}

4
src/utils/http.js

@@ -51,7 +51,7 @@ async function request(method = 'get', url, body, requestOptions, session) {
stream: !!requestOptions?.destination,
};
logger.silly(`GET (${options.limit?.interval || defaultLimiterOptions.minTime}ms/${options.limit?.concurrency || defaultLimiterOptions.maxConcurrent}p) ${url}`);
logger.debug(`GET (${options.limit?.interval || defaultLimiterOptions.minTime}ms/${options.limit?.concurrency || defaultLimiterOptions.maxConcurrent}p) ${url}`);
const res = body
? await http[method](url, body, options)
@@ -60,7 +60,7 @@
const resIsOk = res.statusCode >= 200 && res.statusCode <= 299;
if (options.destination) {
res.on('progress', (bytes, totalBytes) => logger.silly(`Fetched ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
// res.on('progress', (bytes, totalBytes) => logger.silly(`Fetched ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
await pipeline(res, ...(options.transforms || []), options.destination);
}

76
src/utils/slugify.js

@@ -0,0 +1,76 @@
'use strict';
// ASCII replacements for accented characters. Keys destroyed by encoding
// damage restored (ê, ỳ, ŷ); common circumflex vowels added so they are
// substituted rather than silently dropped.
const substitutes = {
  à: 'a',
  á: 'a',
  â: 'a',
  ä: 'a',
  å: 'a',
  ã: 'a',
  æ: 'ae',
  ç: 'c',
  è: 'e',
  é: 'e',
  ê: 'e',
  ë: 'e',
  ì: 'i',
  í: 'i',
  î: 'i',
  ï: 'i',
  ĩ: 'i',
  ǹ: 'n',
  ń: 'n',
  ñ: 'n',
  ò: 'o',
  ó: 'o',
  ô: 'o',
  ö: 'o',
  õ: 'o',
  ø: 'o',
  œ: 'oe',
  ß: 'ss',
  ù: 'u',
  ú: 'u',
  û: 'u',
  ü: 'u',
  ũ: 'u',
  ỳ: 'y',
  ý: 'y',
  ŷ: 'y',
  ÿ: 'y',
};

// Characters accepted as part of a slug component. Extended beyond the
// Latin-1 ranges so every `substitutes` key (ĩ, ń, ǹ, œ, ũ, ŷ, ỳ) is
// matched instead of splitting the word.
const componentRegex = /[A-Za-zÀ-ÖØ-öø-ÿĩńǹœũŷỳ0-9]+/g;

// Characters replaced by their ASCII substitute (or dropped when unmapped).
// Extended beyond à-ÿ so ß and the non-Latin-1 keys above are reachable.
const accentRegex = /[ß-öø-ÿĩńǹœũŷỳ]/g;

const punctuationRegex = /[.,:;'"]/g;

/**
 * Turns an arbitrary string into a filesystem/URL-friendly slug.
 *
 * @param {string} string - Input text; non-strings and empty values are
 *   returned unchanged.
 * @param {string} [delimiter='-'] - Separator placed between components.
 * @param {Object} [options]
 * @param {boolean} [options.encode=false] - URI-encode the result.
 * @param {boolean} [options.removeAccents=true] - Replace accented characters
 *   with ASCII substitutes (unmapped accented characters are dropped).
 * @param {boolean} [options.removePunctuation=false] - Strip punctuation
 *   before splitting, so "don't" becomes "dont" rather than "don-t".
 * @param {number} [options.limit=1000] - Stop appending components once the
 *   slug reaches this length.
 * @returns {string} The slug, or the input unchanged when it is falsy or
 *   not a string.
 */
function slugify(string, delimiter = '-', {
  encode = false,
  removeAccents = true,
  removePunctuation = false,
  limit = 1000,
} = {}) {
  if (!string || typeof string !== 'string') {
    return string;
  }

  // Bug fix: the previous `.replace(removePunctuation && regex, '')` passed
  // the boolean `false` as the pattern, which String#replace coerces to the
  // literal text "false" — deleting "false" from inputs that contained it.
  const prepared = removePunctuation
    ? string.trim().toLowerCase().replace(punctuationRegex, '')
    : string.trim().toLowerCase();

  const slugComponents = prepared.match(componentRegex);

  if (!slugComponents) {
    return '';
  }

  const slug = slugComponents.reduce((acc, component, index) => {
    const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;

    if (accSlug.length < limit) {
      if (removeAccents) {
        return accSlug.replace(accentRegex, (match) => substitutes[match] || '');
      }

      return accSlug;
    }

    // Over the limit: keep what we had and ignore remaining components.
    return acc;
  }, '');

  return encode ? encodeURI(slug) : slug;
}
module.exports = slugify;
Loading…
Cancel
Save