Allow image sources to specify a queue method. Use the 5s queue for Whale Member to avoid CDN timeouts.

This commit is contained in:
2020-07-01 04:47:05 +02:00
parent 53870fda89
commit 1f444e58ce
12 changed files with 183 additions and 100 deletions

View File

@@ -10,9 +10,10 @@ const { argv } = yargs
type: 'boolean',
alias: 'web',
})
.option('scrape', {
.option('all', {
describe: 'Scrape channels and networks defined in configuration',
type: 'boolean',
alias: 'scrape',
})
.option('networks', {
describe: 'Network to scrape all channels from (overrides configuration)',

View File

@@ -89,6 +89,8 @@ function toBaseSource(rawSource) {
if (rawSource.referer) baseSource.referer = rawSource.referer;
if (rawSource.host) baseSource.host = rawSource.host;
if (rawSource.attempts) baseSource.attempts = rawSource.attempts;
if (rawSource.queueMethod) baseSource.queueMethod = rawSource.queueMethod;
if (rawSource.copyright) baseSource.copyright = rawSource.copyright;
if (rawSource.comment) baseSource.comment = rawSource.comment;
@@ -393,6 +395,7 @@ async function fetchSource(source, baseMedia) {
stream: true, // sources are fetched in parallel, don't gobble up memory
transforms: [hashStream],
destination: tempFileTarget,
queueMethod: source.queueMethod || null, // use http module's default
});
hasher.end();
@@ -422,9 +425,11 @@ async function fetchSource(source, baseMedia) {
},
};
} catch (error) {
logger.warn(`Failed attempt ${attempts}/3 to fetch ${source.src}: ${error.message}`);
const maxAttempts = source.attempts || 3;
if (attempts < 3) {
logger.warn(`Failed attempt ${attempts}/${maxAttempts} to fetch ${source.src}: ${error.message}`);
if (attempts < maxAttempts) {
await Promise.delay(1000);
return attempt(attempts + 1);

View File

@@ -19,13 +19,32 @@ function scrapeLatest(html, site) {
release.date = moment.utc(scene.dataset.date, 'MMMM DD, YYYY').toDate();
release.actors = Array.from(scene.querySelectorAll('.actors a'), el => el.textContent);
// slow CDN?
const poster = scene.querySelector('.single-image').dataset.src;
release.poster = /^http/.test(poster) ? poster : `https:${poster}`;
const teaserEl = scene.querySelector('source');
release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), el => (/^http/.test(el.dataset.src) ? el.dataset.src : `https:${el.dataset.src}`));
release.poster = {
src: /^http/.test(poster) ? poster : `https:${poster}`,
referer: site.url,
attempts: 5,
queueMethod: '5s',
};
const trailerEl = scene.querySelector('source');
if (trailerEl) release.trailer = { src: trailerEl.dataset.src };
release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), el => ({
src: (/^http/.test(el.dataset.src) ? el.dataset.src : `https:${el.dataset.src}`),
referer: site.url,
attempts: 5,
queueMethod: '5s',
}));
if (teaserEl) {
release.teaser = {
src: teaserEl.dataset.src,
referer: site.url,
attempts: 5,
queueMethod: '5s',
};
}
return release;
});
@@ -51,16 +70,42 @@ function scrapeScene(html, site, url) {
release.duration = Number(durationEls[0].textContent.match(/\d+/)[0]) * 60;
}
release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), el => (/^http/.test(el.src) ? el.src : `https:${el.src}`));
// unreliable CDN
release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), el => ({
src: (/^http/.test(el.src) ? el.src : `https:${el.src}`),
referer: site.url,
attempts: 5,
queueMethod: '5s',
}));
const posterEl = scene.querySelector('#no-player-image');
const videoEl = scene.querySelector('video');
if (posterEl) release.poster = /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`;
else if (videoEl) release.poster = /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`;
const trailerEl = scene.querySelector('#t2019-video source');
if (trailerEl) release.trailer = { src: trailerEl.src };
if (posterEl) {
release.poster = {
src: /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`,
referer: site.url,
attempts: 5,
queueMethod: '5s',
};
} else if (videoEl) {
release.poster = {
src: /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`,
referer: site.url,
attempts: 5,
queueMethod: '5s',
};
}
if (trailerEl) {
release.trailer = {
src: trailerEl.src,
referer: site.url,
attempts: 5,
queueMethod: '5s',
};
}
return release;
}

View File

@@ -46,7 +46,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
}
async function attachChannelEntities(releases) {
const releasesWithoutEntity = releases.filter(release => release.channel && !release.entity && release.entity.type !== 1);
const releasesWithoutEntity = releases.filter(release => release.channel && (!release.entity || release.entity.type === 'network'));
const channelEntities = await knex('entities')
.select(knex.raw('entities.*, row_to_json(parents) as parent'))

View File

@@ -35,6 +35,7 @@ function useProxy(url) {
}
const queue = taskQueue();
const defaultQueueMethod = '20p';
async function handler({
url,
@@ -44,9 +45,9 @@ async function handler({
options = {},
}) {
if (body) {
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)}`);
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
} else {
logger.silly(`${method.toUpperCase()} ${url}`);
logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
}
const reqOptions = {
@@ -98,8 +99,12 @@ queue.define('1s', handler, {
interval: 1,
});
async function get(url, headers, options, queueMethod = '20p') {
return queue.push(queueMethod, {
queue.define('5s', handler, {
interval: 5,
});
async function get(url, headers, options) {
return queue.push(options.queueMethod || defaultQueueMethod, {
method: 'GET',
url,
headers,
@@ -107,8 +112,8 @@ async function get(url, headers, options, queueMethod = '20p') {
});
}
async function post(url, body, headers, options, queueMethod = '20p') {
return queue.push(queueMethod, {
async function post(url, body, headers, options) {
return queue.push(options.queueMethod || defaultQueueMethod, {
method: 'POST',
url,
body,

View File

@@ -12,7 +12,7 @@ async function resolvePlace(query) {
// https://operations.osmfoundation.org/policies/nominatim/
const res = await http.get(`https://nominatim.openstreetmap.org/search/${encodeURI(query)}?format=json&accept-language=en&addressdetails=1`, {
'User-Agent': 'contact at moonloop.adult@protonmail.com',
}, null, '1s');
}, { queueMethod: '1s' });
const [item] = res.body;