forked from DebaucheryLibrarian/traxxx
Allowing image sources to specify queue method. Using 5s queue for Whale Member to avoid CDN time-outs.
This commit is contained in:
@@ -10,9 +10,10 @@ const { argv } = yargs
|
||||
type: 'boolean',
|
||||
alias: 'web',
|
||||
})
|
||||
.option('scrape', {
|
||||
.option('all', {
|
||||
describe: 'Scrape channels and networks defined in configuration',
|
||||
type: 'boolean',
|
||||
alias: 'scrape',
|
||||
})
|
||||
.option('networks', {
|
||||
describe: 'Network to scrape all channels from (overrides configuration)',
|
||||
|
||||
@@ -89,6 +89,8 @@ function toBaseSource(rawSource) {
|
||||
|
||||
if (rawSource.referer) baseSource.referer = rawSource.referer;
|
||||
if (rawSource.host) baseSource.host = rawSource.host;
|
||||
if (rawSource.attempts) baseSource.attempts = rawSource.attempts;
|
||||
if (rawSource.queueMethod) baseSource.queueMethod = rawSource.queueMethod;
|
||||
|
||||
if (rawSource.copyright) baseSource.copyright = rawSource.copyright;
|
||||
if (rawSource.comment) baseSource.comment = rawSource.comment;
|
||||
@@ -393,6 +395,7 @@ async function fetchSource(source, baseMedia) {
|
||||
stream: true, // sources are fetched in parallel, don't gobble up memory
|
||||
transforms: [hashStream],
|
||||
destination: tempFileTarget,
|
||||
queueMethod: source.queueMethod || null, // use http module's default
|
||||
});
|
||||
|
||||
hasher.end();
|
||||
@@ -422,9 +425,11 @@ async function fetchSource(source, baseMedia) {
|
||||
},
|
||||
};
|
||||
} catch (error) {
|
||||
logger.warn(`Failed attempt ${attempts}/3 to fetch ${source.src}: ${error.message}`);
|
||||
const maxAttempts = source.attempts || 3;
|
||||
|
||||
if (attempts < 3) {
|
||||
logger.warn(`Failed attempt ${attempts}/${maxAttempts} to fetch ${source.src}: ${error.message}`);
|
||||
|
||||
if (attempts < maxAttempts) {
|
||||
await Promise.delay(1000);
|
||||
|
||||
return attempt(attempts + 1);
|
||||
|
||||
@@ -19,13 +19,32 @@ function scrapeLatest(html, site) {
|
||||
release.date = moment.utc(scene.dataset.date, 'MMMM DD, YYYY').toDate();
|
||||
release.actors = Array.from(scene.querySelectorAll('.actors a'), el => el.textContent);
|
||||
|
||||
// slow CDN?
|
||||
const poster = scene.querySelector('.single-image').dataset.src;
|
||||
release.poster = /^http/.test(poster) ? poster : `https:${poster}`;
|
||||
const teaserEl = scene.querySelector('source');
|
||||
|
||||
release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), el => (/^http/.test(el.dataset.src) ? el.dataset.src : `https:${el.dataset.src}`));
|
||||
release.poster = {
|
||||
src: /^http/.test(poster) ? poster : `https:${poster}`,
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
};
|
||||
|
||||
const trailerEl = scene.querySelector('source');
|
||||
if (trailerEl) release.trailer = { src: trailerEl.dataset.src };
|
||||
release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), el => ({
|
||||
src: (/^http/.test(el.dataset.src) ? el.dataset.src : `https:${el.dataset.src}`),
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
}));
|
||||
|
||||
if (teaserEl) {
|
||||
release.teaser = {
|
||||
src: teaserEl.dataset.src,
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
};
|
||||
}
|
||||
|
||||
return release;
|
||||
});
|
||||
@@ -51,16 +70,42 @@ function scrapeScene(html, site, url) {
|
||||
release.duration = Number(durationEls[0].textContent.match(/\d+/)[0]) * 60;
|
||||
}
|
||||
|
||||
release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), el => (/^http/.test(el.src) ? el.src : `https:${el.src}`));
|
||||
// unreliable CDN
|
||||
release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), el => ({
|
||||
src: (/^http/.test(el.src) ? el.src : `https:${el.src}`),
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
}));
|
||||
|
||||
const posterEl = scene.querySelector('#no-player-image');
|
||||
const videoEl = scene.querySelector('video');
|
||||
|
||||
if (posterEl) release.poster = /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`;
|
||||
else if (videoEl) release.poster = /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`;
|
||||
|
||||
const trailerEl = scene.querySelector('#t2019-video source');
|
||||
if (trailerEl) release.trailer = { src: trailerEl.src };
|
||||
|
||||
if (posterEl) {
|
||||
release.poster = {
|
||||
src: /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`,
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
};
|
||||
} else if (videoEl) {
|
||||
release.poster = {
|
||||
src: /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`,
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
};
|
||||
}
|
||||
|
||||
if (trailerEl) {
|
||||
release.trailer = {
|
||||
src: trailerEl.src,
|
||||
referer: site.url,
|
||||
attempts: 5,
|
||||
queueMethod: '5s',
|
||||
};
|
||||
}
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
|
||||
}
|
||||
|
||||
async function attachChannelEntities(releases) {
|
||||
const releasesWithoutEntity = releases.filter(release => release.channel && !release.entity && release.entity.type !== 1);
|
||||
const releasesWithoutEntity = releases.filter(release => release.channel && (!release.entity || release.entity.type === 'network'));
|
||||
|
||||
const channelEntities = await knex('entities')
|
||||
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
|
||||
|
||||
@@ -35,6 +35,7 @@ function useProxy(url) {
|
||||
}
|
||||
|
||||
const queue = taskQueue();
|
||||
const defaultQueueMethod = '20p';
|
||||
|
||||
async function handler({
|
||||
url,
|
||||
@@ -44,9 +45,9 @@ async function handler({
|
||||
options = {},
|
||||
}) {
|
||||
if (body) {
|
||||
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)}`);
|
||||
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
|
||||
} else {
|
||||
logger.silly(`${method.toUpperCase()} ${url}`);
|
||||
logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
|
||||
}
|
||||
|
||||
const reqOptions = {
|
||||
@@ -98,8 +99,12 @@ queue.define('1s', handler, {
|
||||
interval: 1,
|
||||
});
|
||||
|
||||
async function get(url, headers, options, queueMethod = '20p') {
|
||||
return queue.push(queueMethod, {
|
||||
queue.define('5s', handler, {
|
||||
interval: 5,
|
||||
});
|
||||
|
||||
async function get(url, headers, options) {
|
||||
return queue.push(options.queueMethod || defaultQueueMethod, {
|
||||
method: 'GET',
|
||||
url,
|
||||
headers,
|
||||
@@ -107,8 +112,8 @@ async function get(url, headers, options, queueMethod = '20p') {
|
||||
});
|
||||
}
|
||||
|
||||
async function post(url, body, headers, options, queueMethod = '20p') {
|
||||
return queue.push(queueMethod, {
|
||||
async function post(url, body, headers, options) {
|
||||
return queue.push(options.queueMethod || defaultQueueMethod, {
|
||||
method: 'POST',
|
||||
url,
|
||||
body,
|
||||
|
||||
@@ -12,7 +12,7 @@ async function resolvePlace(query) {
|
||||
// https://operations.osmfoundation.org/policies/nominatim/
|
||||
const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, {
|
||||
'User-Agent': 'contact at moonloop.adult@protonmail.com',
|
||||
}, null, '1s');
|
||||
}, { queueMethod: '1s' });
|
||||
|
||||
const [item] = res.body;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user