Queueing and batching media HTTP requests for improved reliability.

This commit is contained in:
ThePendulum 2020-02-22 03:22:30 +01:00
parent b2dfbac9e5
commit 349a5a506e
15 changed files with 251 additions and 78 deletions

View File

@ -16,7 +16,7 @@ import FilterBar from '../header/filter-bar.vue';
import Releases from '../releases/releases.vue'; import Releases from '../releases/releases.vue';
async function fetchReleases() { async function fetchReleases() {
this.releases = await this.$store.dispatch('fetchReleases', { limit: 100 }); this.releases = await this.$store.dispatch('fetchReleases', { limit: 250 });
this.$store.commit('setCache', { this.$store.commit('setCache', {
target: 'home', target: 'home',
releases: this.releases, releases: this.releases,

View File

@ -76,10 +76,13 @@ function initNetworksActions(store, _router) {
{ {
date: { date: {
lessThan: $before, lessThan: $before,
greaterThan: $after, greaterThan: $after
} }
}, },
{ {
date: {
isNull: true
},
createdAt: { createdAt: {
lessThan: $beforeTime, lessThan: $beforeTime,
greaterThan: $afterTime, greaterThan: $afterTime,

23
package-lock.json generated
View File

@ -4955,6 +4955,11 @@
"elliptic": "^6.0.0" "elliptic": "^6.0.0"
} }
}, },
"create-error": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/create-error/-/create-error-0.3.1.tgz",
"integrity": "sha1-aYECRaYp5lRDK/BDdzYAA6U1GiM="
},
"create-hash": { "create-hash": {
"version": "1.2.0", "version": "1.2.0",
"resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz", "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz",
@ -10899,6 +10904,24 @@
"integrity": "sha1-mEcocL8igTL8vdhoEputEsPAKeM=", "integrity": "sha1-mEcocL8igTL8vdhoEputEsPAKeM=",
"dev": true "dev": true
}, },
"promise-task-queue": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/promise-task-queue/-/promise-task-queue-1.2.0.tgz",
"integrity": "sha1-fz2NszEFNRuq/CqsFDRtcXwfHjs=",
"requires": {
"bluebird": "^3.3.3",
"create-error": "^0.3.1",
"debug": "^2.2.0",
"extend": "^3.0.0"
},
"dependencies": {
"extend": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="
}
}
},
"prop-types": { "prop-types": {
"version": "15.7.2", "version": "15.7.2",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz", "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz",

View File

@ -96,6 +96,7 @@
"pg": "^7.18.1", "pg": "^7.18.1",
"postgraphile": "^4.5.5", "postgraphile": "^4.5.5",
"postgraphile-plugin-connection-filter": "^1.1.3", "postgraphile-plugin-connection-filter": "^1.1.3",
"promise-task-queue": "^1.2.0",
"prop-types": "^15.7.2", "prop-types": "^15.7.2",
"react": "^16.12.0", "react": "^16.12.0",
"react-dom": "^16.12.0", "react-dom": "^16.12.0",

View File

@ -748,14 +748,14 @@ const tags = [
priority: 10, priority: 10,
}, },
{ {
name: 'double anal TP', name: 'DA triple penetration',
slug: 'da-tp', slug: 'da-tp',
priority: 7, priority: 7,
description: 'Triple penetration with two cocks in the ass, and one in the pussy. Also see [double vaginal TP](/tag/dv-tp).', description: 'Triple penetration with two cocks in the ass, and one in the pussy. Also see [double vaginal TP](/tag/dv-tp).',
group: 'penetration', group: 'penetration',
}, },
{ {
name: 'double vaginal TP', name: 'DV triple penetration',
slug: 'dv-tp', slug: 'dv-tp',
priority: 7, priority: 7,
description: 'Triple penetration with two cocks in the pussy, and one in the ass. Also see [double anal TP](/tag/da-tp).', description: 'Triple penetration with two cocks in the pussy, and one in the ass. Also see [double anal TP](/tag/da-tp).',

View File

@ -101,6 +101,41 @@ const { argv } = yargs
type: 'boolean', type: 'boolean',
default: true, default: true,
}) })
.option('media', {
describe: 'Include any release media',
type: 'boolean',
default: true,
})
.option('posters', {
describe: 'Include release posters',
type: 'boolean',
default: true,
})
.option('covers', {
describe: 'Include release covers',
type: 'boolean',
default: true,
})
.option('photos', {
describe: 'Include release photos',
type: 'boolean',
default: true,
})
.option('trailers', {
describe: 'Include release trailers',
type: 'boolean',
default: true,
})
.option('teasers', {
describe: 'Include release teasers',
type: 'boolean',
default: true,
})
.option('avatars', {
describe: 'Include actor avatars',
type: 'boolean',
default: true,
})
.option('inspect', { .option('inspect', {
describe: 'Show data in console.', describe: 'Show data in console.',
type: 'boolean', type: 'boolean',

View File

@ -2,7 +2,7 @@
const config = require('config'); const config = require('config');
const Promise = require('bluebird'); const Promise = require('bluebird');
const bhttp = require('bhttp'); // const bhttp = require('bhttp');
const mime = require('mime'); const mime = require('mime');
const fs = require('fs-extra'); const fs = require('fs-extra');
const sharp = require('sharp'); const sharp = require('sharp');
@ -11,7 +11,9 @@ const blake2 = require('blake2');
const logger = require('./logger')(__filename); const logger = require('./logger')(__filename);
const knex = require('./knex'); const knex = require('./knex');
const { get } = require('./utils/http');
const { ex } = require('./utils/q'); const { ex } = require('./utils/q');
const chunk = require('./utils/chunk');
function getHash(buffer) { function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 }); const hash = blake2.createHash('blake2b', { digestLength: 24 });
@ -73,7 +75,8 @@ async function getEntropy(buffer) {
} }
async function extractItem(source) { async function extractItem(source) {
const res = await bhttp.get(source.src); // const res = await bhttp.get(source.src);
const res = await get(source.src);
if (res.statusCode === 200) { if (res.statusCode === 200) {
const { q } = ex(res.body.toString()); const { q } = ex(res.body.toString());
@ -114,7 +117,8 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
logger.verbose(`Fetching media item from ${source.src || source}`); logger.verbose(`Fetching media item from ${source.src || source}`);
const res = await bhttp.get(source.src || source); // const res = await bhttp.get(source.src || source);
const res = await get(source.src || source);
if (res.statusCode === 200) { if (res.statusCode === 200) {
const { pathname } = new URL(source.src || source); const { pathname } = new URL(source.src || source);
@ -150,9 +154,7 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
} }
async function fetchItems(itemSources, existingItemsBySource, domain, role) { async function fetchItems(itemSources, existingItemsBySource, domain, role) {
return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role), { return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role)).filter(Boolean);
concurrency: 10,
}).filter(Boolean);
} }
async function saveItems(items, domain, role) { async function saveItems(items, domain, role) {
@ -182,10 +184,15 @@ async function saveItems(items, domain, role) {
logger.verbose(`Saved ${domain} ${role} with thumbnail to ${filepath}`); logger.verbose(`Saved ${domain} ${role} with thumbnail to ${filepath}`);
return { return {
...item,
thumbnail, thumbnail,
filepath, filepath,
thumbpath, thumbpath,
mimetype: item.mimetype,
extension: item.extension,
hash: item.hash,
entropy: item.entropy,
quality: item.quality,
source: item.source,
}; };
} }
@ -199,8 +206,6 @@ async function saveItems(items, domain, role) {
logger.error(`Failed to store ${domain} ${role} from ${item.source}: ${error.message}`); logger.error(`Failed to store ${domain} ${role} from ${item.source}: ${error.message}`);
return null; return null;
} }
}, {
concurrency: 20,
}); });
} }
@ -227,50 +232,55 @@ function groupItems(items) {
} }
async function storeMedia(sources, domain, role) { async function storeMedia(sources, domain, role) {
try { const presentSources = sources.filter(Boolean);
const presentSources = sources.filter(Boolean);
if (presentSources.length === 0) { if (presentSources.length === 0) {
return {}; return {};
}
// find source duplicates that don't need to be re-downloaded or re-saved
const existingSourceItems = await knex('media').whereIn('source', presentSources.flat().map(source => source.src || source));
const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems);
// download media items from new sources
const fetchedItems = await fetchItems(presentSources, existingSourceItemsBySource, domain, role);
const { hash: fetchedItemsByHash } = groupItems(fetchedItems);
// find hash duplicates that don't need to be re-saved
const uniqueFetchedItems = Object.values(fetchedItemsByHash);
const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
// save new items to disk
const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]);
const savedItems = await saveItems(newItems, domain, role);
// store new items in database
const curatedItemEntries = curateItemEntries(savedItems);
const storedItems = await knex('media').insert(curatedItemEntries).returning('*');
const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []);
// accumulate existing and new items by source to be mapped onto releases
const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
const itemsBySource = {
...existingSourceItemsBySource,
...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
};
logger.info(`Stored ${fetchedItems.length} new ${domain} ${role}s`);
return itemsBySource;
} catch (error) {
logger.error(`Failed to store ${domain} ${role} batch: ${error.message}`);
return null;
} }
// split up source list to prevent excessive RAM usage
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
try {
// find source duplicates that don't need to be re-downloaded or re-saved
const existingSourceItems = await knex('media').whereIn('source', sourceChunk.flat().map(source => source.src || source));
const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems);
// download media items from new sources
const fetchedItems = await fetchItems(sourceChunk, existingSourceItemsBySource, domain, role);
const { hash: fetchedItemsByHash } = groupItems(fetchedItems);
// find hash duplicates that don't need to be re-saved
const uniqueFetchedItems = Object.values(fetchedItemsByHash);
const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
// save new items to disk
const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]);
const savedItems = await saveItems(newItems, domain, role);
// store new items in database
const curatedItemEntries = curateItemEntries(savedItems);
const storedItems = await knex('media').insert(curatedItemEntries).returning('*');
const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []);
// accumulate existing and new items by source to be mapped onto releases
const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
const itemsBySource = {
...existingSourceItemsBySource,
...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
};
logger.info(`Stored batch ${index + 1} with ${fetchedItems.length} of new ${domain} ${role}s`);
return itemsBySource;
} catch (error) {
logger.error(`Failed to store ${domain} ${role} batch ${index + 1}: ${error.message}`);
return null;
}
}));
return itemChunksBySource.reduce((acc, itemChunk) => ({ ...acc, ...itemChunk }), {});
} }
function extractPrimaryItem(associations, targetId, role, primaryRole, primaryItemsByTargetId) { function extractPrimaryItem(associations, targetId, role, primaryRole, primaryItemsByTargetId) {
@ -319,7 +329,7 @@ async function associateMedia(sourcesByTargetId, mediaBySource, domain, role, pr
const primaryAssociations = associationsPerTarget.map(association => association[primaryRole]).filter(Boolean); const primaryAssociations = associationsPerTarget.map(association => association[primaryRole]).filter(Boolean);
logger.info(`Associated ${associations.length} ${role}s to ${domain}s`); logger.info(`Associated ${associations.length} ${role}s to ${domain}s`);
logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`); if (primaryRole) logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
return Promise.all([ return Promise.all([
(associations.length > 0 && knex.raw(`${knex(`${domain}s_${role}s`).insert(associations).toString()} ON CONFLICT DO NOTHING`)), (associations.length > 0 && knex.raw(`${knex(`${domain}s_${role}s`).insert(associations).toString()} ON CONFLICT DO NOTHING`)),

View File

@ -330,6 +330,10 @@ function accumulateMovies(releases) {
} }
async function storeReleaseAssets(releases) { async function storeReleaseAssets(releases) {
if (!argv.media) {
return;
}
const releasePostersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.poster] }), {}); const releasePostersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.poster] }), {});
const releaseCoversById = releases.reduce((acc, release) => ({ ...acc, [release.id]: release.covers }), {}); const releaseCoversById = releases.reduce((acc, release) => ({ ...acc, [release.id]: release.covers }), {});
const releaseTrailersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.trailer] }), {}); const releaseTrailersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.trailer] }), {});
@ -340,28 +344,30 @@ async function storeReleaseAssets(releases) {
}), {}); }), {});
const [posters, covers] = await Promise.all([ const [posters, covers] = await Promise.all([
storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'), argv.posters && storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'), argv.covers && storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
]); ]);
// ensure posters are available before fetching supplementary media // ensure posters are available before fetching supplementary media
await Promise.all([ await Promise.all([
(posters && associateMedia(releasePostersById, posters, 'release', 'poster')), posters && associateMedia(releasePostersById, posters, 'release', 'poster'),
(covers && associateMedia(releaseCoversById, covers, 'release', 'cover')), covers && associateMedia(releaseCoversById, covers, 'release', 'cover'),
]); ]);
const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo'); if (argv.photos) {
if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo'); const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
}
// videos take a long time, fetch last // videos take a long time, fetch last
const [trailers, teasers] = await Promise.all([ const [trailers, teasers] = await Promise.all([
storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'), argv.trailers && storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'), argv.teasers && storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
]); ]);
await Promise.all([ await Promise.all([
(trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer')), trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer'),
(teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser')), teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser'),
]); ]);
} }

View File

@ -52,10 +52,20 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
} }
if (type === 'scene' && !scraper.fetchScene) { if (type === 'scene' && !scraper.fetchScene) {
if (release) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual scenes`);
return null;
}
throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`); throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
} }
if (type === 'movie' && !scraper.fetchMovie) { if (type === 'movie' && !scraper.fetchMovie) {
if (release) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual movies`);
return null;
}
throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`); throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
} }
@ -80,7 +90,7 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) { async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), { const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), {
concurrency: 5, concurrency: 5,
}); }).filter(Boolean);
const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type })); const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));

View File

@ -117,7 +117,7 @@ async function deepFetchReleases(baseReleases, preflight) {
return release; return release;
} catch (error) { } catch (error) {
logger.error(error.stack); logger.error(`Failed to scrape ${release.url}: ${error}`);
return { return {
...release, ...release,

View File

@ -211,8 +211,13 @@ async function scrapeScene(html, url, site) {
release.title = $('.title_bar_hilite').text().trim(); release.title = $('.title_bar_hilite').text().trim();
const setIdIndex = html.indexOf('setid:"'); const entryId = html.match(/showtagform\((\d+)\)/);
release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
if (entryId) release.entryId = entryId[1];
else {
const setIdIndex = html.indexOf('setid:"');
if (setIdIndex) release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
}
const dateElement = $('.update_date').text().trim(); const dateElement = $('.update_date').text().trim();
const dateComment = $('*') const dateComment = $('*')

View File

@ -1,6 +1,6 @@
'use strict'; 'use strict';
const { geta, edate } = require('../utils/q'); const { geta, ed } = require('../utils/q');
function scrapeBlockLatest(scenes) { function scrapeBlockLatest(scenes) {
return scenes.map(({ html, q, qa, qu, qt }) => { return scenes.map(({ html, q, qa, qu, qt }) => {
@ -13,7 +13,7 @@ function scrapeBlockLatest(scenes) {
release.title = q('h4 a', true); release.title = q('h4 a', true);
release.url = qu('h4 a'); release.url = qu('h4 a');
release.date = edate(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/); release.date = ed(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
release.actors = qa('.tour_update_models a', true); release.actors = qa('.tour_update_models a', true);
@ -22,8 +22,6 @@ function scrapeBlockLatest(scenes) {
release.teaser = qt(); release.teaser = qt();
console.log(release);
return release; return release;
}); });
} }
@ -52,8 +50,6 @@ function scrapeClassicLatest(scenes) {
const photoCount = q('.update_thumb', 'cnt'); const photoCount = q('.update_thumb', 'cnt');
[release.poster, ...release.photos] = Array.from({ length: photoCount }).map((value, index) => q('.update_thumb', `src${index}_3x`) || q('.update_thumb', `src${index}_2x`) || q('.update_thumb', `src${index}_1x`)); [release.poster, ...release.photos] = Array.from({ length: photoCount }).map((value, index) => q('.update_thumb', `src${index}_3x`) || q('.update_thumb', `src${index}_2x`) || q('.update_thumb', `src${index}_1x`));
console.log(release);
return release; return release;
}); });
} }

8
src/utils/chunk.js Normal file
View File

@ -0,0 +1,8 @@
'use strict';
// Split `array` into consecutive chunks of at most `chunkSize` elements.
// The final chunk holds whatever remains, so it may be shorter than the rest.
function chunk(array, chunkSize) {
  const chunkCount = Math.ceil(array.length / chunkSize);

  return Array.from({ length: chunkCount }, (value, chunkIndex) => {
    const start = chunkIndex * chunkSize;

    return array.slice(start, start + chunkSize);
  });
}
module.exports = chunk;

76
src/utils/http.js Normal file
View File

@ -0,0 +1,76 @@
'use strict';
const bhttp = require('bhttp');
const taskQueue = require('promise-task-queue');
const logger = require('../logger')(__filename);
// Single shared task queue that throttles all outgoing HTTP requests app-wide.
const queue = taskQueue();

// Emitted by promise-task-queue when every GET slot is busy; further
// requests wait in the queue until a slot frees up.
queue.on('concurrencyReached:httpGet', () => {
  logger.silly('Queueing GET requests');
});

// Same back-pressure notification for POST requests.
queue.on('concurrencyReached:httpPost', () => {
  logger.silly('Queueing POST requests');
});
// Task performing one GET via bhttp, limited to 20 concurrent requests.
// `options` is spread last, so a caller-supplied `responseTimeout` (or any
// other bhttp option) overrides the `timeout` default.
queue.define('httpGet', async ({
  url,
  timeout = 30000, // response timeout in ms
  options = {},
}) => {
  logger.silly(`GET ${url}`);

  const res = await bhttp.get(url, {
    responseTimeout: timeout,
    ...options,
  });

  // Alias bhttp's statusCode as `code` for callers expecting that name.
  res.code = res.statusCode;

  return res;
}, {
  concurrency: 20,
});
// Task performing one POST via bhttp, limited to 20 concurrent requests.
// Mirrors the httpGet task: `options` is spread last so callers can
// override `responseTimeout`.
queue.define('httpPost', async ({
  url,
  body,
  timeout = 30000, // response timeout in ms
  options = {},
}) => {
  logger.silly(`POST ${url} with ${body}`);

  const res = await bhttp.post(url, body, {
    responseTimeout: timeout,
    ...options,
  });

  // Alias bhttp's statusCode as `code` for callers expecting that name.
  res.code = res.statusCode;

  return res;
}, {
  concurrency: 20,
});
/**
 * Queue a rate-limited GET request.
 *
 * @param {string} url - Target URL.
 * @param {Object} [options] - Extra bhttp options; may override responseTimeout.
 * @returns {Promise<Object>} bhttp response (with `code` aliasing statusCode),
 *   resolved once a queue slot frees up and the request completes.
 */
async function get(url, options) {
  // The httpGet task destructures only `url`, `timeout` and `options`; the
  // stray `method: 'get'` property previously sent here was never consumed,
  // and the post() wrapper carries no such field either.
  return queue.push('httpGet', {
    url,
    options,
  });
}
// Queue a rate-limited POST request; resolves with the bhttp response once a
// concurrency slot is available and the request completes.
async function post(url, body, options) {
  const task = { url, body, options };

  return queue.push('httpPost', task);
}
// Public API: queue-throttled drop-in replacements for bhttp.get/bhttp.post.
module.exports = {
  get,
  post,
};

View File

@ -2,7 +2,7 @@
const { JSDOM } = require('jsdom'); const { JSDOM } = require('jsdom');
const moment = require('moment'); const moment = require('moment');
const bhttp = require('bhttp'); const http = require('./http');
function trim(str) { function trim(str) {
return str.trim().replace(/\s+/g, ' '); return str.trim().replace(/\s+/g, ' ');
@ -228,7 +228,7 @@ function extractAll(html, selector) {
} }
async function get(url, selector, headers, all = false) { async function get(url, selector, headers, all = false) {
const res = await bhttp.get(url, { const res = await http.get(url, {
headers, headers,
}); });