forked from DebaucheryLibrarian/traxxx
Queueing and batching media HTTP requests for improved reliability.
parent b2dfbac9e5
commit 349a5a506e

@@ -16,7 +16,7 @@ import FilterBar from '../header/filter-bar.vue';
 import Releases from '../releases/releases.vue';
 
 async function fetchReleases() {
-  this.releases = await this.$store.dispatch('fetchReleases', { limit: 100 });
+  this.releases = await this.$store.dispatch('fetchReleases', { limit: 250 });
   this.$store.commit('setCache', {
     target: 'home',
     releases: this.releases,

@@ -76,10 +76,13 @@ function initNetworksActions(store, _router) {
       {
         date: {
           lessThan: $before,
-          greaterThan: $after,
+          greaterThan: $after
         }
       },
       {
+        date: {
+          isNull: true
+        },
         createdAt: {
           lessThan: $beforeTime,
           greaterThan: $afterTime,

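
Editor's sketch, not part of the commit: the shape of the connection-filter clause this hunk builds. A release now matches either when its date falls inside the requested window, or, new in this commit, when it has no date at all and its creation time falls inside the window instead. The surrounding `or:` wrapper and the stub values are assumptions for illustration:

    const before = '2020-01-31';
    const after = '2020-01-01';
    const beforeTime = '2020-01-31T00:00:00Z';
    const afterTime = '2020-01-01T00:00:00Z';

    // assumed postgraphile-plugin-connection-filter shape
    const filter = {
      or: [
        { date: { lessThan: before, greaterThan: after } },
        {
          date: { isNull: true },
          createdAt: { lessThan: beforeTime, greaterThan: afterTime },
        },
      ],
    };
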
@@ -4955,6 +4955,11 @@
         "elliptic": "^6.0.0"
       }
     },
+    "create-error": {
+      "version": "0.3.1",
+      "resolved": "https://registry.npmjs.org/create-error/-/create-error-0.3.1.tgz",
+      "integrity": "sha1-aYECRaYp5lRDK/BDdzYAA6U1GiM="
+    },
     "create-hash": {
       "version": "1.2.0",
       "resolved": "https://registry.npmjs.org/create-hash/-/create-hash-1.2.0.tgz",

@@ -10899,6 +10904,24 @@
       "integrity": "sha1-mEcocL8igTL8vdhoEputEsPAKeM=",
       "dev": true
     },
+    "promise-task-queue": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/promise-task-queue/-/promise-task-queue-1.2.0.tgz",
+      "integrity": "sha1-fz2NszEFNRuq/CqsFDRtcXwfHjs=",
+      "requires": {
+        "bluebird": "^3.3.3",
+        "create-error": "^0.3.1",
+        "debug": "^2.2.0",
+        "extend": "^3.0.0"
+      },
+      "dependencies": {
+        "extend": {
+          "version": "3.0.2",
+          "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz",
+          "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="
+        }
+      }
+    },
     "prop-types": {
       "version": "15.7.2",
       "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.7.2.tgz",

@@ -96,6 +96,7 @@
     "pg": "^7.18.1",
     "postgraphile": "^4.5.5",
     "postgraphile-plugin-connection-filter": "^1.1.3",
+    "promise-task-queue": "^1.2.0",
     "prop-types": "^15.7.2",
     "react": "^16.12.0",
     "react-dom": "^16.12.0",

@@ -748,14 +748,14 @@ const tags = [
     priority: 10,
   },
   {
-    name: 'double anal TP',
+    name: 'DA triple penetration',
     slug: 'da-tp',
     priority: 7,
     description: 'Triple penetration with two cocks in the ass, and one in the pussy. Also see [double vaginal TP](/tag/dv-tp).',
     group: 'penetration',
   },
   {
-    name: 'double vaginal TP',
+    name: 'DV triple penetration',
     slug: 'dv-tp',
     priority: 7,
     description: 'Triple penetration with two cocks in the pussy, and one in the ass. Also see [double anal TP](/tag/da-tp).',

src/argv.js

@@ -101,6 +101,41 @@ const { argv } = yargs
     type: 'boolean',
     default: true,
   })
+  .option('media', {
+    describe: 'Include any release media',
+    type: 'boolean',
+    default: true,
+  })
+  .option('posters', {
+    describe: 'Include release posters',
+    type: 'boolean',
+    default: true,
+  })
+  .option('covers', {
+    describe: 'Include release covers',
+    type: 'boolean',
+    default: true,
+  })
+  .option('photos', {
+    describe: 'Include release photos',
+    type: 'boolean',
+    default: true,
+  })
+  .option('trailers', {
+    describe: 'Include release trailers',
+    type: 'boolean',
+    default: true,
+  })
+  .option('teasers', {
+    describe: 'Include release teasers',
+    type: 'boolean',
+    default: true,
+  })
+  .option('avatars', {
+    describe: 'Include actor avatars',
+    type: 'boolean',
+    default: true,
+  })
   .option('inspect', {
     describe: 'Show data in console.',
     type: 'boolean',

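
Editor's note, not part of the commit: yargs enables boolean negation by default, so each flag defined above can be switched off per run with a `--no-` prefix (e.g. `node src/app.js --no-media --no-trailers`; the entry-point name is an assumption). Downstream code then gates on the parsed values, a minimal sketch:

    // assumes src/argv.js exports the parsed yargs argv
    const { argv } = require('./argv');

    if (!argv.media) {
      console.log('Skipping all release media');
    }
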
src/media.js

@@ -2,7 +2,7 @@
 
 const config = require('config');
 const Promise = require('bluebird');
-const bhttp = require('bhttp');
+// const bhttp = require('bhttp');
 const mime = require('mime');
 const fs = require('fs-extra');
 const sharp = require('sharp');

@@ -11,7 +11,9 @@ const blake2 = require('blake2');
 
 const logger = require('./logger')(__filename);
 const knex = require('./knex');
+const { get } = require('./utils/http');
 const { ex } = require('./utils/q');
+const chunk = require('./utils/chunk');
 
 function getHash(buffer) {
   const hash = blake2.createHash('blake2b', { digestLength: 24 });

@@ -73,7 +75,8 @@ async function getEntropy(buffer) {
 }
 
 async function extractItem(source) {
-  const res = await bhttp.get(source.src);
+  // const res = await bhttp.get(source.src);
+  const res = await get(source.src);
 
   if (res.statusCode === 200) {
     const { q } = ex(res.body.toString());

@@ -114,7 +117,8 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
 
   logger.verbose(`Fetching media item from ${source.src || source}`);
 
-  const res = await bhttp.get(source.src || source);
+  // const res = await bhttp.get(source.src || source);
+  const res = await get(source.src || source);
 
   if (res.statusCode === 200) {
     const { pathname } = new URL(source.src || source);

@@ -150,9 +154,7 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
 }
 
 async function fetchItems(itemSources, existingItemsBySource, domain, role) {
-  return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role), {
-    concurrency: 10,
-  }).filter(Boolean);
+  return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource, domain, role)).filter(Boolean);
 }
 
 async function saveItems(items, domain, role) {

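
Editor's note, not part of the commit: the per-call `concurrency: 10` option is dropped because throttling now lives in the shared request queue (the new src/utils/http.js below), which caps all media GETs at 20 in flight across the whole process rather than per batch. A rough sketch of the new shape, using bluebird:

    'use strict';

    const Promise = require('bluebird');

    // Promise.map now starts every task immediately; the queue inside
    // utils/http is what actually limits simultaneous requests
    async function fetchAll(urls, fetchOne) {
      return Promise.map(urls, async url => fetchOne(url)).filter(Boolean);
    }

    module.exports = fetchAll;
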
@@ -182,10 +184,15 @@ async function saveItems(items, domain, role) {
       logger.verbose(`Saved ${domain} ${role} with thumbnail to ${filepath}`);
 
       return {
-        ...item,
         thumbnail,
         filepath,
         thumbpath,
+        mimetype: item.mimetype,
+        extension: item.extension,
+        hash: item.hash,
+        entropy: item.entropy,
+        quality: item.quality,
+        source: item.source,
       };
     }
 

@@ -199,8 +206,6 @@ async function saveItems(items, domain, role) {
       logger.error(`Failed to store ${domain} ${role} from ${item.source}: ${error.message}`);
       return null;
     }
-  }, {
-    concurrency: 20,
   });
 }
 

@@ -227,50 +232,55 @@ function groupItems(items) {
 }
 
 async function storeMedia(sources, domain, role) {
-  try {
-    const presentSources = sources.filter(Boolean);
-
-    if (presentSources.length === 0) {
-      return {};
-    }
-
-    // find source duplicates that don't need to be re-downloaded or re-saved
-    const existingSourceItems = await knex('media').whereIn('source', presentSources.flat().map(source => source.src || source));
-    const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems);
-
-    // download media items from new sources
-    const fetchedItems = await fetchItems(presentSources, existingSourceItemsBySource, domain, role);
-    const { hash: fetchedItemsByHash } = groupItems(fetchedItems);
-
-    // find hash duplicates that don't need to be re-saved
-    const uniqueFetchedItems = Object.values(fetchedItemsByHash);
-    const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
-    const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
-
-    // save new items to disk
-    const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]);
-    const savedItems = await saveItems(newItems, domain, role);
-
-    // store new items in database
-    const curatedItemEntries = curateItemEntries(savedItems);
-    const storedItems = await knex('media').insert(curatedItemEntries).returning('*');
-    const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []);
-
-    // accumulate existing and new items by source to be mapped onto releases
-    const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
-    const itemsBySource = {
-      ...existingSourceItemsBySource,
-      ...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
-    };
-
-    logger.info(`Stored ${fetchedItems.length} new ${domain} ${role}s`);
-
-    return itemsBySource;
-  } catch (error) {
-    logger.error(`Failed to store ${domain} ${role} batch: ${error.message}`);
-
-    return null;
-  }
+  const presentSources = sources.filter(Boolean);
+
+  if (presentSources.length === 0) {
+    return {};
+  }
+
+  // split up source list to prevent excessive RAM usage
+  const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
+    try {
+      // find source duplicates that don't need to be re-downloaded or re-saved
+      const existingSourceItems = await knex('media').whereIn('source', sourceChunk.flat().map(source => source.src || source));
+      const { source: existingSourceItemsBySource, hash: existingSourceItemsByHash } = groupItems(existingSourceItems);
+
+      // download media items from new sources
+      const fetchedItems = await fetchItems(sourceChunk, existingSourceItemsBySource, domain, role);
+      const { hash: fetchedItemsByHash } = groupItems(fetchedItems);
+
+      // find hash duplicates that don't need to be re-saved
+      const uniqueFetchedItems = Object.values(fetchedItemsByHash);
+      const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
+      const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
+
+      // save new items to disk
+      const newItems = uniqueFetchedItems.filter(item => !existingHashItemsByHash[item.hash]);
+      const savedItems = await saveItems(newItems, domain, role);
+
+      // store new items in database
+      const curatedItemEntries = curateItemEntries(savedItems);
+      const storedItems = await knex('media').insert(curatedItemEntries).returning('*');
+      const { hash: storedItemsByHash } = groupItems(Array.isArray(storedItems) ? storedItems : []);
+
+      // accumulate existing and new items by source to be mapped onto releases
+      const itemsByHash = { ...existingSourceItemsByHash, ...existingHashItemsByHash, ...storedItemsByHash };
+      const itemsBySource = {
+        ...existingSourceItemsBySource,
+        ...fetchedItems.reduce((acc, item) => ({ ...acc, [item.source]: itemsByHash[item.hash] }), {}),
+      };
+
+      logger.info(`Stored batch ${index + 1} with ${fetchedItems.length} of new ${domain} ${role}s`);
+
+      return itemsBySource;
+    } catch (error) {
+      logger.error(`Failed to store ${domain} ${role} batch ${index + 1}: ${error.message}`);
+
+      return null;
+    }
+  }));
+
+  return itemChunksBySource.reduce((acc, itemChunk) => ({ ...acc, ...itemChunk }), {});
 }
 
 function extractPrimaryItem(associations, targetId, role, primaryRole, primaryItemsByTargetId) {

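
Editor's sketch, not part of the commit: a failed chunk returns null above, which is harmless in the final merge because object spread ignores null:

    'use strict';

    const itemChunksBySource = [{ 'a.jpg': { id: 1 } }, null, { 'b.jpg': { id: 2 } }];
    const merged = itemChunksBySource.reduce((acc, itemChunk) => ({ ...acc, ...itemChunk }), {});

    console.log(merged); // { 'a.jpg': { id: 1 }, 'b.jpg': { id: 2 } }
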
@@ -319,7 +329,7 @@ async function associateMedia(sourcesByTargetId, mediaBySource, domain, role, pr
   const primaryAssociations = associationsPerTarget.map(association => association[primaryRole]).filter(Boolean);
 
   logger.info(`Associated ${associations.length} ${role}s to ${domain}s`);
-  logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
+  if (primaryRole) logger.info(`Associated ${primaryAssociations.length} extracted ${primaryRole}s to ${domain}s`);
 
   return Promise.all([
     (associations.length > 0 && knex.raw(`${knex(`${domain}s_${role}s`).insert(associations).toString()} ON CONFLICT DO NOTHING`)),

@@ -330,6 +330,10 @@ function accumulateMovies(releases) {
 }
 
 async function storeReleaseAssets(releases) {
+  if (!argv.media) {
+    return;
+  }
+
   const releasePostersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.poster] }), {});
   const releaseCoversById = releases.reduce((acc, release) => ({ ...acc, [release.id]: release.covers }), {});
   const releaseTrailersById = releases.reduce((acc, release) => ({ ...acc, [release.id]: [release.trailer] }), {});

@@ -340,28 +344,30 @@ async function storeReleaseAssets(releases) {
   }), {});
 
   const [posters, covers] = await Promise.all([
-    storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
-    storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
+    argv.posters && storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'),
+    argv.covers && storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'),
   ]);
 
   // ensure posters are available before fetching supplementary media
   await Promise.all([
-    (posters && associateMedia(releasePostersById, posters, 'release', 'poster')),
-    (covers && associateMedia(releaseCoversById, covers, 'release', 'cover')),
+    posters && associateMedia(releasePostersById, posters, 'release', 'poster'),
+    covers && associateMedia(releaseCoversById, covers, 'release', 'cover'),
   ]);
 
-  const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
-  if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
+  if (argv.photos) {
+    const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo');
+    if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo');
+  }
 
   // videos take a long time, fetch last
   const [trailers, teasers] = await Promise.all([
-    storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
-    storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
+    argv.trailers && storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'),
+    argv.teasers && storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'),
   ]);
 
   await Promise.all([
-    (trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer')),
-    (teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser')),
+    trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer'),
+    teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser'),
   ]);
 }
 

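
Editor's sketch, not part of the commit: how the `argv.x && storeMedia(...)` gating behaves. Promise.all passes non-promise values straight through, so a disabled flag resolves to false instead of a media map, and the later `x && associateMedia(...)` guard makes the association a no-op:

    'use strict';

    async function demo(enabled) {
      const storeMedia = async () => ({ 'poster.jpg': { id: 1 } });

      const [posters] = await Promise.all([enabled && storeMedia()]);
      return posters && Object.keys(posters);
    }

    demo(false).then(console.log); // false
    demo(true).then(console.log); // [ 'poster.jpg' ]
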
@@ -52,10 +52,20 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
   }
 
   if (type === 'scene' && !scraper.fetchScene) {
+    if (release) {
+      logger.warn(`The '${site.name}'-scraper cannot fetch individual scenes`);
+      return null;
+    }
+
     throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
   }
 
   if (type === 'movie' && !scraper.fetchMovie) {
+    if (release) {
+      logger.warn(`The '${site.name}'-scraper cannot fetch individual movies`);
+      return null;
+    }
+
     throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
   }
 

@@ -80,7 +90,7 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene', prefli
 async function scrapeReleases(sources, release = null, type = 'scene', preflight = null) {
   const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type, preflight), {
     concurrency: 5,
-  });
+  }).filter(Boolean);
 
   const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type }));
 

@@ -117,7 +117,7 @@ async function deepFetchReleases(baseReleases, preflight) {
 
       return release;
     } catch (error) {
-      logger.error(error.stack);
+      logger.error(`Failed to scrape ${release.url}: ${error}`);
 
       return {
         ...release,

@@ -211,8 +211,13 @@ async function scrapeScene(html, url, site) {
 
   release.title = $('.title_bar_hilite').text().trim();
 
-  const setIdIndex = html.indexOf('setid:"');
-  release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
+  const entryId = html.match(/showtagform\((\d+)\)/);
+
+  if (entryId) release.entryId = entryId[1];
+  else {
+    const setIdIndex = html.indexOf('setid:"');
+    if (setIdIndex) release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0];
+  }
 
   const dateElement = $('.update_date').text().trim();
   const dateComment = $('*')

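
Editor's sketch, not part of the commit: the two entry-ID extraction paths above, run against a fabricated HTML fragment. The ID is taken from a showtagform(...) call when present, with the inline setid:"..." assignment as fallback:

    'use strict';

    const html = 'setid:"12345", onclick="showtagform(12345)"';

    const entryId = html.match(/showtagform\((\d+)\)/);
    console.log(entryId && entryId[1]); // '12345'

    const setIdIndex = html.indexOf('setid:"');
    console.log(html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0]); // '12345'
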
@@ -1,6 +1,6 @@
 'use strict';
 
-const { geta, edate } = require('../utils/q');
+const { geta, ed } = require('../utils/q');
 
 function scrapeBlockLatest(scenes) {
   return scenes.map(({ html, q, qa, qu, qt }) => {

@@ -13,7 +13,7 @@ function scrapeBlockLatest(scenes) {
 
     release.title = q('h4 a', true);
     release.url = qu('h4 a');
-    release.date = edate(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
+    release.date = ed(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/);
 
     release.actors = qa('.tour_update_models a', true);
 

@@ -22,8 +22,6 @@ function scrapeBlockLatest(scenes) {
 
     release.teaser = qt();
 
-    console.log(release);
-
     return release;
   });
 }

@@ -52,8 +50,6 @@ function scrapeClassicLatest(scenes) {
     const photoCount = q('.update_thumb', 'cnt');
     [release.poster, ...release.photos] = Array.from({ length: photoCount }).map((value, index) => q('.update_thumb', `src${index}_3x`) || q('.update_thumb', `src${index}_2x`) || q('.update_thumb', `src${index}_1x`));
 
-    console.log(release);
-
     return release;
   });
 }

@@ -0,0 +1,8 @@
+'use strict';
+
+function chunk(array, chunkSize) {
+  return Array.from({ length: Math.ceil(array.length / chunkSize) })
+    .map((value, index) => array.slice(index * chunkSize, (index * chunkSize) + chunkSize));
+}
+
+module.exports = chunk;

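
Editor's usage sketch, not part of the commit: how src/media.js applies this helper to split its source list into batches of 50 (the file path is inferred from the require('./utils/chunk') call above):

    'use strict';

    const chunk = require('./utils/chunk');

    const sources = Array.from({ length: 120 }, (value, index) => `https://example.com/img/${index}.jpg`);
    const batches = chunk(sources, 50);

    console.log(batches.length); // 3
    console.log(batches[2].length); // 20
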
@@ -0,0 +1,76 @@
+'use strict';
+
+const bhttp = require('bhttp');
+const taskQueue = require('promise-task-queue');
+
+const logger = require('../logger')(__filename);
+
+const queue = taskQueue();
+
+queue.on('concurrencyReached:httpGet', () => {
+  logger.silly('Queueing GET requests');
+});
+
+queue.on('concurrencyReached:httpPost', () => {
+  logger.silly('Queueing POST requests');
+});
+
+queue.define('httpGet', async ({
+  url,
+  timeout = 30000,
+  options = {},
+}) => {
+  logger.silly(`GET ${url}`);
+
+  const res = await bhttp.get(url, {
+    responseTimeout: timeout,
+    ...options,
+  });
+
+  res.code = res.statusCode;
+
+  return res;
+}, {
+  concurrency: 20,
+});
+
+queue.define('httpPost', async ({
+  url,
+  body,
+  timeout = 30000,
+  options = {},
+}) => {
+  logger.silly(`POST ${url} with ${body}`);
+
+  const res = await bhttp.post(url, body, {
+    responseTimeout: timeout,
+    ...options,
+  });
+
+  res.code = res.statusCode;
+
+  return res;
+}, {
+  concurrency: 20,
+});
+
+async function get(url, options) {
+  return queue.push('httpGet', {
+    method: 'get',
+    url,
+    options,
+  });
+}
+
+async function post(url, body, options) {
+  return queue.push('httpPost', {
+    url,
+    body,
+    options,
+  });
+}
+
+module.exports = {
+  get,
+  post,
+};

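
Editor's usage sketch, not part of the commit: callers swap bhttp for this queued client and keep the same response shape, since statusCode and body come straight from bhttp. GETs beyond 20 in flight wait in the queue instead of opening new connections:

    'use strict';

    const http = require('./utils/http'); // path as required from src/media.js

    async function demo() {
      const res = await http.get('https://example.com/image.jpg');

      if (res.statusCode === 200) {
        console.log(`Fetched ${res.body.length} bytes`);
      }
    }

    demo();
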
@@ -2,7 +2,7 @@
 
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
-const bhttp = require('bhttp');
+const http = require('./http');
 
 function trim(str) {
   return str.trim().replace(/\s+/g, ' ');

@@ -228,7 +228,7 @@ function extractAll(html, selector) {
 }
 
 async function get(url, selector, headers, all = false) {
-  const res = await bhttp.get(url, {
+  const res = await http.get(url, {
     headers,
   });
 