Queueing and batching media HTTP requests for improved reliability.

ThePendulum 2020-02-22 03:22:30 +01:00
parent b2dfbac9e5
commit 349a5a506e
15 changed files with 251 additions and 78 deletions

View File

@@ -16,7 +16,7 @@ import FilterBar from '../header/filter-bar.vue';
import Releases from '../releases/releases.vue';
async function fetchReleases() {
this.releases = await this.$store.dispatch('fetchReleases', { limit: 100 });
this.releases = await this.$store.dispatch('fetchReleases', { limit: 250 });
this.$store.commit('setCache', {
target: 'home',
releases: this.releases,

View File

@@ -76,10 +76,13 @@ function initNetworksActions(store, _router) {
{
date: {
lessThan: $before,
greaterThan: $after,
greaterThan: $after
}
},
{
date: {
isNull: true
},
createdAt: {
lessThan: $beforeTime,
greaterThan: $afterTime,

package-lock.json (generated, 23 additions)
View File

@@ -4955,6 +4955,11 @@
"elliptic": "^6.0.0"
}
},
"create-error": {
"version": "0.3.1",
"resolved": "https://registry.npmjs.org/create-error/-/create-error-0.3.1.tgz",
"integrity": "sha1-aYECRaYp5lRDK/BDdzYAA6U1GiM="
},
"create-hash": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz",
@@ -10899,6 +10904,24 @@
"integrity": "sha1-mEcocL8igTL8vdhoEputEsPAKeM=",
"dev": true
},
"promise-task-queue": {
"version": "1.2.0",
"resolved": "https://registry.npmjs.org/promise-task-queue/-/promise-task-queue-1.2.0.tgz",
"integrity": "sha1-fz2NszEFNRuq/CqsFDRtcXwfHjs=",
"requires": {
"bluebird": "^3.3.3",
"create-error": "^0.3.1",
"debug": "^2.2.0",
"extend": "^3.0.0"
},
"dependencies": {
"extend": {
"version": "3.0.2",
"resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
"integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="
}
}
},
"prop-types": {
"version": "15.7.2",
"resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz",

View File

@@ -96,6 +96,7 @@
"pg": "^7.18.1",
"postgraphile": "^4.5.5",
"postgraphile-plugin-connection-filter": "^1.1.3",
"promise-task-queue": "^1.2.0",
"prop-types": "^15.7.2",
"react": "^16.12.0",
"react-dom": "^16.12.0",

View File

@@ -748,14 +748,14 @@ const tags = [
priority: 10,
},
{
name: 'double anal TP',
name: 'DA triple penetration',
slug: 'da-tp',
priority: 7,
description: 'Triple penetration with two cocks in the ass, and one in the pussy. Also see [double vaginal TP](/tag/dv-tp).',
group: 'penetration',
},
{
name: 'double vaginal TP',
name: 'DV triple penetration',
slug: 'dv-tp',
priority: 7,
description: 'Triple penetration with two cocks in the pussy, and one in the ass. Also see [double anal TP](/tag/da-tp).',

View File

@@ -101,6 +101,41 @@ const { argv } = yargs
type: 'boolean',
default: true,
})
.option('media', {
describe: 'Include any release media',
type: 'boolean',
default: true,
})
.option('posters', {
describe: 'Include release posters',
type: 'boolean',
default: true,
})
.option('covers', {
describe: 'Include release covers',
type: 'boolean',
default: true,
})
.option('photos', {
describe: 'Include release photos',
type: 'boolean',
default: true,
})
.option('trailers', {
describe: 'Include release trailers',
type: 'boolean',
default: true,
})
.option('teasers', {
describe: 'Include release teasers',
type: 'boolean',
default: true,
})
.option('avatars', {
describe: 'Include actor avatars',
type: 'boolean',
default: true,
})
.option('inspect', {
describe: 'Show data in console.',
type: 'boolean',

View File

@@ -2,7 +2,7 @@
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
// const bhttp = require('bhttp');
const mime = require('mime');
const fs = require('fs-extra');
const sharp = require('sharp');
@@ -11,7 +11,9 @@ const blake2 = require('blake2');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { get } = require('./utils/http');
const { ex } = require('./utils/q');
const chunk = require('./utils/chunk');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
@@ -73,7 +75,8 @@ async function getEntropy(buffer) {
}
async function extractItem(source) {
const res = await bhttp.get(source.src);
// const res = await bhttp.get(source.src);
const res = await get(source.src);
if (res.statusCode === 200) {
const { q } = ex(res.body.toString());
@@ -114,7 +117,8 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
logger.verbose(`Fetching media item from ${source.src || source}`);
const res = await bhttp.get(source.src || source);
// const res = await bhttp.get(source.src || source);
const res = await get(source.src || source);
if (res.statusCode === 200) {
const { pathname } = new URL(source.src || source);
@@ -150,9 +154,7 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
}
async function fetchItems(itemSources, existingItemsBySource, domain, role) {
return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role), {
concurrency: 10,
}).filter(Boolean);
return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role)).filter(Boolean);
}
async function saveItems(items, domain, role) {
@@ -182,10 +184,15 @@ async function saveItems(items, domain, role) {
logger.verbose(`Saved ${domain} ${role} with thumbnail to ${filepath}`);
return {
...item,
thumbnail,
filepath,
thumbpath,
mimetype: item.mimetype,
extension: item.extension,
hash: item.hash,
entropy: item.entropy,
quality: item.quality,
source: item.source,
};
}
@@ -199,8 +206,6 @@ async function saveItems(items, domain, role) {
logger.error(`Failed to store ${domain} ${role} from ${item.source}: ${error.message}`);
return null;
}
}, {
concurrency: 20,
});
}
@@ -227,50 +232,55 @@ function groupItems(items) {
}
async function storeMedia(sources, domain, role) {
try {
const presentSources = sources.filter(Boolean);
const presentSources = sources.filter(Boolean);
if (presentSources.length === 0) {
return {};
}
// find source duplicates that don't need to be re-downloaded or re-saved
const existingSourceItems = await knex('media').whereIn('source', presentSources.flat().map(source => source.src || source));
const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems);
// download media items from new sources
const fetchedItems = await fetchItems(presentSources, existingSourceItemsBySource, domain, role);
const { hash: fetchedItemsByHash } = groupItems(fetchedItems);
// find hash duplicates that don't need to be re-saved
const uniqueFetchedItems = Object.values(fetchedItemsByHash);
const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
// save new items to disk
const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]);
const savedItems = await saveItems(newItems, domain, role);
// store new items in database
const curatedItemEntries = curateItemEntries(savedItems);
const storedItems = await knex('media').insert(curatedItemEntries).returning('*');
const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []);
// accumulate existing and new items by source to be mapped onto releases
const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
const itemsBySource = {
...existingSourceItemsBySource,
...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
};
logger.info(`Stored ${fetchedItems.length} new ${domain} ${role}s`);
return itemsBySource;
} catch (error) {
logger.error(`Failed to store ${domain} ${role} batch: ${error.message}`);
return null;
if (presentSources.length === 0) {
return {};
}
// split up source list to prevent excessive RAM usage
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
try {
// find source duplicates that don't need to be re-downloaded or re-saved
const existingSourceItems = await knex('media').whereIn('source', sourceChunk.flat().map(source => source.src || source));
const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems);
// download media items from new sources
const fetchedItems = await fetchItems(sourceChunk, existingSourceItemsBySource, domain, role);
const { hash: fetchedItemsByHash } = groupItems(fetchedItems);
// find hash duplicates that don't need to be re-saved
const uniqueFetchedItems = Object.values(fetchedItemsByHash);
const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
// save new items to disk
const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]);
const savedItems = await saveItems(newItems, domain, role);
// store new items in database
const curatedItemEntries = curateItemEntries(savedItems);
const storedItems = await knex('media').insert(curatedItemEntries).returning('*');
const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []);
// accumulate existing and new items by source to be mapped onto releases
const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
const itemsBySource = {
...existingSourceItemsBySource,
...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
};
logger.info(`Stored batch ${index + 1} with ${fetchedItems.length} of new ${domain} ${role}s`);
return itemsBySource;
} catch (error) {
logger.error(`Failed to store ${domain} ${role} batch ${index + 1}: ${error.message}`);
return null;
}
}));
return itemChunksBySource.reduce((acc, itemChunk) => ({ ...acc, ...itemChunk }), {});
}
function extractPrimaryItem(associations, targetId, role, primaryRole, primaryItemsByTargetId) {
@@ -319,7 +329,7 @@ async function associateMedia(sourcesByTargetId, mediaBySource, domain, role, pr
const primaryAssociations = associationsPerTarget.map(association => association[primaryRole]).filter(Boolean);
logger.info(`Associated ${associations.length} ${role}s to ${domain}s`);
logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
if (primaryRole) logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
return Promise.all([
(associations.length > 0 && knex.raw(`${knex(`${domain}s_${role}s`).insert(associations).toString()} ON CONFLICT DO NOTHING`)),
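
The chunked storeMedia above merges per-batch results with a reduce, and a failed batch returns null; since spreading null into an object literal is a no-op, one bad batch cannot poison the rest. A minimal sketch with hypothetical batch results:

'use strict';

// Hypothetical per-chunk results: the middle batch threw and returned null.
const itemChunksBySource = [
  { 'https://example.com/a.jpg': { id: 1 } },
  null,
  { 'https://example.com/b.jpg': { id: 2 } },
];

// Spreading null is a no-op, so failed batches simply drop out of the merge.
const itemsBySource = itemChunksBySource.reduce((acc, itemChunk) => ({ ...acc, ...itemChunk }), {});

console.log(Object.keys(itemsBySource).length); // 2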

View File

@@ -330,6 +330,10 @@ function accumulateMovies(releases) {
}
async function storeReleaseAssets(releases) {
if (!argv.media) {
return;
}
const releasePostersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.poster] }), {});
const releaseCoversById = releases.reduce((acc, release) => ({ ...acc, [release.id]: release.covers }), {});
const releaseTrailersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.trailer] }), {});
@@ -340,28 +344,30 @@ async function storeReleaseAssets(releases) {
}), {});
const [posters, covers] = await Promise.all([
storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
argv.posters && storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
argv.covers && storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
]);
// ensure posters are available before fetching supplementary media
await Promise.all([
(posters && associateMedia(releasePostersById, posters, 'release', 'poster')),
(covers && associateMedia(releaseCoversById, covers, 'release', 'cover')),
posters && associateMedia(releasePostersById, posters, 'release', 'poster'),
covers && associateMedia(releaseCoversById, covers, 'release', 'cover'),
]);
const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
if (argv.photos) {
const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
}
// videos take a long time, fetch last
const [trailers, teasers] = await Promise.all([
storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
argv.trailers && storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
argv.teasers && storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
]);
await Promise.all([
(trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer')),
(teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser')),
trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer'),
teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser'),
]);
}
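
The gating above relies on Promise.all accepting non-promise values: a disabled flag short-circuits to false, which resolves as-is and is then filtered out by the `trailers && associateMedia(...)` checks. A sketch with a stubbed fetcher (not the real storeMedia):

'use strict';

// Stub standing in for storeMedia; not the real implementation.
async function storeMediaStub() {
  return { 'https://example.com/trailer.mp4': { id: 1 } };
}

async function demo(argv) {
  const [trailers] = await Promise.all([
    // With argv.trailers === false, && short-circuits and Promise.all
    // resolves the literal `false` without fetching anything.
    argv.trailers && storeMediaStub(),
  ]);

  if (trailers) {
    // only associate media that was actually fetched
  }
}

demo({ trailers: false });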

View File

@@ -52,10 +52,20 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
}
if (type === 'scene' && !scraper.fetchScene) {
if (release) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual scenes`);
return null;
}
throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
}
if (type === 'movie' && !scraper.fetchMovie) {
if (release) {
logger.warn(`The '${site.name}'-scraper cannot fetch individual movies`);
return null;
}
throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
}
@@ -80,7 +90,7 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), {
concurrency: 5,
});
}).filter(Boolean);
const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));

View File

@@ -117,7 +117,7 @@ async function deepFetchReleases(baseReleases, preflight) {
return release;
} catch (error) {
logger.error(error.stack);
logger.error(`Failed to scrape ${release.url}: ${error}`);
return {
...release,

View File

@@ -211,8 +211,13 @@ async function scrapeScene(html, url, site) {
release.title = $('.title_bar_hilite').text().trim();
const setIdIndex = html.indexOf('setid:"');
release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
const entryId = html.match(/showtagform\((\d+)\)/);
if (entryId) release.entryId = entryId[1];
else {
const setIdIndex = html.indexOf('setid:"');
if (setIdIndex) release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
}
const dateElement = $('.update_date').text().trim();
const dateComment = $('*')

View File

@@ -1,6 +1,6 @@
'use strict';
const { geta, edate } = require('../utils/q');
const { geta, ed } = require('../utils/q');
function scrapeBlockLatest(scenes) {
return scenes.map(({ html, q, qa, qu, qt }) => {
@@ -13,7 +13,7 @@ function scrapeBlockLatest(scenes) {
release.title = q('h4 a', true);
release.url = qu('h4 a');
release.date = edate(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
release.date = ed(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
release.actors = qa('.tour_update_models a', true);
@@ -22,8 +22,6 @@ function scrapeBlockLatest(scenes) {
release.teaser = qt();
console.log(release);
return release;
});
}
@@ -52,8 +50,6 @@ function scrapeClassicLatest(scenes) {
const photoCount = q('.update_thumb', 'cnt');
[release.poster, ...release.photos] = Array.from({ length: photoCount }).map((value, index) => q('.update_thumb', `src${index}_3x`) || q('.update_thumb', `src${index}_2x`) || q('.update_thumb', `src${index}_1x`));
console.log(release);
return release;
});
}

src/utils/chunk.js (new file, 8 lines)
View File

@@ -0,0 +1,8 @@
'use strict';
function chunk(array, chunkSize) {
return Array.from({ length: Math.ceil(array.length / chunkSize) })
.map((value, index) => array.slice(index * chunkSize, (index * chunkSize) + chunkSize));
}
module.exports = chunk;
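
A quick usage sketch for the new helper (the require path assumes a sibling script):

'use strict';

const chunk = require('./chunk');

// Consecutive slices of at most chunkSize items; the last slice holds the remainder.
console.log(chunk([1, 2, 3, 4, 5], 2)); // [ [ 1, 2 ], [ 3, 4 ], [ 5 ] ]
console.log(chunk([], 2)); // []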

src/utils/http.js (new file, 76 lines)
View File

@@ -0,0 +1,76 @@
'use strict';
const bhttp = require('bhttp');
const taskQueue = require('promise-task-queue');
const logger = require('../logger')(__filename);
const queue = taskQueue();
queue.on('concurrencyReached:httpGet', () => {
logger.silly('Queueing GET requests');
});
queue.on('concurrencyReached:httpPost', () => {
logger.silly('Queueing POST requests');
});
queue.define('httpGet', async ({
url,
timeout = 30000,
options = {},
}) => {
logger.silly(`GET ${url}`);
const res = await bhttp.get(url, {
responseTimeout: timeout,
...options,
});
res.code = res.statusCode;
return res;
}, {
concurrency: 20,
});
queue.define('httpPost', async ({
url,
body,
timeout = 30000,
options = {},
}) => {
logger.silly(`POST ${url} with ${body}`);
const res = await bhttp.post(url, body, {
responseTimeout: timeout,
...options,
});
res.code = res.statusCode;
return res;
}, {
concurrency: 20,
});
async function get(url, options) {
return queue.push('httpGet', {
method: 'get',
url,
options,
});
}
async function post(url, body, options) {
return queue.push('httpPost', {
url,
body,
options,
});
}
module.exports = {
get,
post,
};
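
Callers use the new helpers like plain bhttp calls, but at most 20 requests per task type are in flight at once; the rest wait in the queue. A usage sketch (URL and require path are illustrative):

'use strict';

const { get } = require('./http');

async function demo() {
  // Same response shape as bhttp.get, throttled by the shared queue.
  const res = await get('https://example.com/');

  if (res.statusCode === 200) {
    console.log(res.body.toString().length);
  }
}

demo();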

View File

@@ -2,7 +2,7 @@
const { JSDOM } = require('jsdom');
const moment = require('moment');
const bhttp = require('bhttp');
const http = require('./http');
function trim(str) {
return str.trim().replace(/\s+/g, ' ');
@@ -228,7 +228,7 @@ function extractAll(html, selector) {
}
async function get(url, selector, headers, all = false) {
const res = await bhttp.get(url, {
const res = await http.get(url, {
headers,
});