Using new HTTP module with a dynamic rate limiter.

This commit is contained in:
DebaucheryLibrarian 2020-11-22 04:07:09 +01:00
parent 5d0fe44130
commit b9b777c621
27 changed files with 358 additions and 175 deletions

View File

@ -197,6 +197,12 @@ module.exports = {
'www.deeper.com',
],
},
limits: {
default: {
interval: 50,
concurrency: 20,
},
},
fetchAfter: [1, 'week'],
missingDateLimit: 3,
media: {

5
package-lock.json generated
View File

@ -2208,6 +2208,11 @@
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
},
"bottleneck": {
"version": "2.19.5",
"resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
"integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
},
"brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",

View File

@ -78,6 +78,7 @@
"blake2": "^4.0.0",
"bluebird": "^3.7.2",
"body-parser": "^1.19.0",
"bottleneck": "^2.19.5",
"canvas": "^2.6.1",
"casual": "^1.6.2",
"cheerio": "^1.0.0-rc.3",

Binary file not shown.

After

Width:  |  Height:  |  Size: 601 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 377 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 7.9 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 32 KiB

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 32 KiB

View File

@ -643,7 +643,7 @@ const tagPosters = [
['mff', 1, 'Anikka Albrite, Kelsi Monroe and Mick Blue for HardX'],
['mfm', 0, 'Vina Sky in "Jules Jordan\'s Three Ways" for Jules Jordan'],
['natural-boobs', 4, 'Miela (Marry Queen) in "Pure" for FemJoy'],
['nurse', 0, 'Sarah Vandella in "Cum For Nurse Sarah" for Brazzers'],
['nurse', 1, 'Mia Malkova in "Always Think Happy Thoughts" for Brazzers'],
['oil', 2, 'Jade Kush for Passion HD'],
['oral-creampie', 1, 'Valentina Nappi for Her Limit'],
['orgy', 1, 'Megan Rain (DP), Morgan Lee (anal), Jessa Rhodes, Melissa Moore and Kimmy Granger in "Orgy Masters 8" for Jules Jordan'],
@ -825,6 +825,7 @@ const tagPhotos = [
['natural-boobs', 3, 'Violet Starr in "Violet Starr 1st Lesbian Anal" for LesbianX'],
['natural-boobs', 0, 'Valentina Nappi in "Hypnotic Curves" for LesbianX'],
['natural-boobs', 2, 'Kylie Page for All Girl Massage'],
['nurse', 0, 'Sarah Vandella in "Cum For Nurse Sarah" for Brazzers'],
['oil', 1, 'Kissa Sins in "Oil Overload 14" for JulesJordan'],
['oil', 3, 'Vina Sky for Lubed'],
['oil', 0, 'Jada Stevens in "Jada Stevens Anal Ass Oiled Up For James Deen\'s Cock" for Jules Jordan'],

View File

@ -420,15 +420,18 @@ async function storeFile(media) {
} catch (error) {
logger.warn(`Failed to store ${media.src}: ${error.message}`);
await fsPromises.unlink(media.file.path);
return null;
}
}
async function fetchHttpSource(source, tempFileTarget, hashStream) {
const res = await http.get(source.src, {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
}, {
headers: {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
},
stream: true, // sources are fetched in parallel, don't gobble up memory
transforms: [hashStream],
destination: tempFileTarget,
@ -642,7 +645,7 @@ async function storeMedias(baseMedias) {
);
}
const newMediaWithEntries = savedMedias.map((media, index) => curateMediaEntry(media, index));
const newMediaWithEntries = savedMedias.filter(Boolean).map((media, index) => curateMediaEntry(media, index));
const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);
await bulkInsert('media', newMediaEntries);

View File

@ -1,8 +1,6 @@
'use strict';
const bhttp = require('@thependulum/bhttp');
const { post } = require('../utils/http');
const http = require('../utils/http');
const { extractDate } = require('../utils/qu');
const { inchesToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
@ -84,7 +82,7 @@ function scrapeAll(scenes) {
}
async function fetchActorReleases(actor) {
const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
query: {
bool: {
@ -179,7 +177,7 @@ async function scrapeProfile(actor, include) {
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
from: (page - 1) * 50,
query: {
@ -269,7 +267,7 @@ async function fetchScene(url) {
const encodedId = new URL(url).pathname.split('/')[2];
const entryId = decodeId(encodedId);
const res = await bhttp.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
const res = await http.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
headers: {
Authorization: `Basic ${authKey}`,
},
@ -279,7 +277,7 @@ async function fetchScene(url) {
}
async function fetchProfile({ name: actorName }, context, include) {
const res = await post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
size: 5,
sort: [{
_score: {
@ -306,8 +304,11 @@ async function fetchProfile({ name: actorName }, context, include) {
},
},
}, {
Authorization: `Basic ${authKey}`,
}, { encodeJSON: true });
headers: {
Authorization: `Basic ${authKey}`,
},
encodeJSON: true,
});
if (res.ok) {
const actor = res.body.hits.hits.find(hit => hit._source.name.toLowerCase() === actorName.toLowerCase());

View File

@ -41,7 +41,9 @@ function scrapeScene({ query }, url, channel) {
}));
release.director = query.cnt('.director')?.split(/\s*:\s*/)[1];
release.poster = query.sourceSet('.player img', 'data-srcset');
const fallbackPoster = query.img('.player img');
release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
release.movie = {
title: query.cnt('.movie a'),

View File

@ -7,7 +7,7 @@ const cheerio = require('cheerio');
const moment = require('moment');
const logger = require('../logger')(__filename);
const { ex, get } = require('../utils/q');
const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
@ -318,7 +318,7 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
const profilePath = `/${pathname.split('/').slice(-2).join('/')}`;
const url = getActorReleasesUrl(profilePath, page);
const res = await get(url);
const res = await qu.get(url);
if (!res.ok) return [];
@ -333,14 +333,14 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
}
async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl, withReleases) {
const { q } = ex(html);
const { query } = qu.extract(html);
const avatar = q('img.actorPicture');
const hair = q('.actorProfile .attribute_hair_color', true);
const height = q('.actorProfile .attribute_height', true);
const weight = q('.actorProfile .attribute_weight', true);
const alias = q('.actorProfile .attribute_alternate_names', true);
const nationality = q('.actorProfile .attribute_home', true);
const avatar = query.el('img.actorPicture');
const hair = query.cnt('.actorProfile .attribute_hair_color');
const height = query.cnt('.actorProfile .attribute_height');
const weight = query.cnt('.actorProfile .attribute_weight');
const alias = query.cnt('.actorProfile .attribute_alternate_names');
const nationality = query.cnt('.actorProfile .attribute_home');
const profile = {
name: actorName,
@ -358,7 +358,7 @@ async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUr
profile.avatar = avatars;
}
profile.description = q('.actorBio p:not(.bioTitle)', true);
profile.description = query.cnt('.actorBio p:not(.bioTitle)');
if (hair) profile.hair = hair.split(':')[1].trim();
if (height) profile.height = Number(height.match(/\d+/)[0]);

View File

@ -129,7 +129,9 @@ async function fetchProfile(baseActor, entity, include) {
const searchRes = await http.post('https://tour.hitzefrei.com/search-preview', {
q: baseActor.name,
}, {
'Accept-Language': 'en-US',
headers: {
'Accept-Language': 'en-US',
},
});
if (searchRes.ok) {

View File

@ -115,7 +115,7 @@ async function scrapeSceneAlt({ query }, url, channel, session) {
release.trailer = query.video();
if (!release.trailer) {
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, null, { useSession: session });
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
if (trailerRes.ok) {
release.trailer = trailerRes.body;
@ -153,7 +153,7 @@ async function fetchLatest(site, page = 1) {
async function fetchScene(url, site) {
const session = http.session();
const res = await qu.get(url, null, null, { useSession: session });
const res = await qu.get(url, null, null, { session });
if (res.ok) {
if (site.parameters?.scraper === 'alt') {

View File

@ -23,7 +23,7 @@ async function fetchTrailerLocation(entryId, channel) {
const url = `${channel.url}/api/download/${entryId}/hd1080/stream`;
try {
const res = await http.get(url, null, {
const res = await http.get(url, {
followRedirects: false,
});

View File

@ -7,7 +7,7 @@ const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const { get, geta, ctxa, parseDate, prefixUrl } = require('../utils/q');
const qu = require('../utils/qu');
const http = require('../utils/http');
const { heightToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
@ -82,7 +82,7 @@ async function getPhotosLegacy(entryId, site, type = 'highres', page = 1) {
async function getPhotos(entryId, site, type = 'highres', page = 1) {
const albumUrl = `${site.parameters?.photos || `${site.url}/gallery.php`}?id=${entryId}&type=${type}&page=${page}`;
const res = await bhttp.get(albumUrl);
const res = await http.get(albumUrl);
const html = res.body.toString();
const sourceLines = html.split(/\n/).filter(line => line.match(/ptx\["\w+"\]/));
@ -135,25 +135,25 @@ function getEntryId(html) {
}
function scrapeAll(scenes, site, entryIdFromTitle) {
return scenes.map(({ el, qu }) => {
return scenes.map(({ el, query }) => {
const release = {};
release.url = qu.url('.update_title a, .dvd_info > a, a ~ a');
release.title = qu.q('.update_title a, .dvd_info > a, a ~ a', true);
release.date = qu.date('.update_date', 'MM/DD/YYYY');
release.url = query.url('.update_title a, .dvd_info > a, a ~ a');
release.title = query.q('.update_title a, .dvd_info > a, a ~ a', true);
release.date = query.date('.update_date', 'MM/DD/YYYY');
release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || qu.q('.rating_box')?.dataset.id;
release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || query.q('.rating_box')?.dataset.id;
release.actors = qu.all('.update_models a', true);
release.actors = query.all('.update_models a', true);
const dvdPhotos = qu.imgs('.dvd_preview_thumb');
const photoCount = Number(qu.q('a img.thumbs', 'cnt')) || 1;
const dvdPhotos = query.imgs('.dvd_preview_thumb');
const photoCount = Number(query.q('a img.thumbs', 'cnt')) || 1;
[release.poster, ...release.photos] = dvdPhotos.length
? dvdPhotos
: Array.from({ length: photoCount }).map((value, index) => {
const src = qu.img('a img.thumbs', `src${index}_1x`) || qu.img('a img.thumbs', `src${index}`) || qu.img('a img.thumbs');
const prefixedSrc = prefixUrl(src, site.url);
const src = query.img('a img.thumbs', `src${index}_1x`) || query.img('a img.thumbs', `src${index}`) || query.img('a img.thumbs');
const prefixedSrc = qu.prefixUrl(src, site.url);
if (src) {
return [
@ -183,7 +183,7 @@ function scrapeAll(scenes, site, entryIdFromTitle) {
return null;
}).filter(Boolean);
const teaserScript = qu.html('script');
const teaserScript = query.html('script');
if (teaserScript) {
const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
if (src) release.teaser = { src };
@ -236,17 +236,17 @@ function scrapeUpcoming(html, site) {
});
}
async function scrapeScene({ html, qu }, url, site, include) {
async function scrapeScene({ html, query }, url, site, include) {
const release = { url, site };
release.entryId = getEntryId(html);
release.title = qu.q('.title_bar_hilite', true);
release.description = qu.q('.update_description', true);
release.title = query.q('.title_bar_hilite', true);
release.description = query.q('.update_description', true);
release.date = qu.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
release.date = query.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
release.actors = qu.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
release.tags = qu.all('.update_tags a', true);
release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
release.tags = query.all('.update_tags a', true);
const posterPath = html.match(/useimage = "(.*)"/)?.[1];
@ -280,14 +280,14 @@ async function scrapeScene({ html, qu }, url, site, include) {
if (include.photos) release.photos = await getPhotos(release.entryId, site);
if (qu.exists('.update_dvds a')) {
if (query.exists('.update_dvds a')) {
release.movie = {
url: qu.url('.update_dvds a'),
title: qu.q('.update_dvds a', true),
url: query.url('.update_dvds a'),
title: query.q('.update_dvds a', true),
};
}
const stars = Number(qu.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
const stars = Number(query.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
if (stars) release.stars = stars;
return release;
@ -302,7 +302,7 @@ function scrapeMovie({ el, query }, url, site) {
movie.channel = slugify(query.q('.update_date a', true), '');
// movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href);
const sceneQus = ctxa(el, '.dvd_details');
const sceneQus = qu.initAll(el, '.dvd_details');
const scenes = scrapeAll(sceneQus, site);
const curatedScenes = scenes
@ -332,7 +332,7 @@ function scrapeProfile(html, url, actorName, entity) {
const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/);
const measurementsString = bio.match(/\w+-\d+-\d+/);
if (birthDateString) profile.birthdate = parseDate(birthDateString[1], 'MMMM D, YYYY');
if (birthDateString) profile.birthdate = qu.parseDate(birthDateString[1], 'MMMM D, YYYY');
if (ageString) profile.age = Number(ageString[1]);
if (heightString) profile.height = heightToCm(heightString[0]);
@ -354,7 +354,7 @@ function scrapeProfile(html, url, actorName, entity) {
avatarEl.getAttribute('src'),
]
.filter(avatar => avatar && !/p\d+.jpe?g/.test(avatar)) // remove non-existing attributes and placeholder images
.map(avatar => prefixUrl(avatar, entity.url));
.map(avatar => qu.prefixUrl(avatar, entity.url));
if (avatarSources.length) profile.avatar = avatarSources;
}
@ -370,7 +370,7 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
: `${site.url}/trial/categories/movies_${page}_d.html`;
// const res = await bhttp.get(url);
const res = await geta(url, '.update_details');
const res = await qu.getAll(url, '.update_details');
return res.ok ? scrapeAll(res.items, site, entryIdFromTitle) : res.status;
}
@ -389,13 +389,13 @@ async function fetchUpcoming(site) {
}
async function fetchScene(url, site, baseRelease, include) {
const res = await get(url);
const res = await qu.get(url);
return res.ok ? scrapeScene(res.item, url, site, include) : res.status;
}
async function fetchMovie(url, site) {
const res = await get(url);
const res = await qu.get(url);
return res.ok ? scrapeMovie(res.item, url, site) : res.status;
}

View File

@ -97,8 +97,10 @@ async function scrapeScene({ query, html }, url, baseRelease) {
const token = query.meta('name=_token');
const trailerInfoUrl = `${origin}/episodes/trailer/sources/${release.entryId}?type=trailer`;
const trailerInfoRes = await http.post(trailerInfoUrl, null, {
'X-CSRF-Token': token,
'X-Requested-With': 'XMLHttpRequest',
headers: {
'X-CSRF-Token': token,
'X-Requested-With': 'XMLHttpRequest',
},
});
if (trailerInfoRes.ok && trailerInfoRes.body.sources.length > 0) {
@ -136,7 +138,9 @@ function scrapeProfile({ query }) {
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/episodes/search?page=${page}`; // TLS issues with teenfidelity.com, same overview on all sites
const res = await http.get(url, {
'X-Requested-With': 'XMLHttpRequest',
headers: {
'X-Requested-With': 'XMLHttpRequest',
},
});
if (res.ok && res.body.status === 'success') {
@ -157,7 +161,9 @@ async function fetchScene(url, channel, baseRelease) {
async function fetchProfile({ name: actorName }) {
const actorSlug = slugify(actorName);
const res = await qu.get(`https://www.kellymadison.com/models/${actorSlug}`, null, {
'X-Requested-With': 'XMLHttpRequest',
headers: {
'X-Requested-With': 'XMLHttpRequest',
},
});
if (res.ok) {

View File

@ -6,7 +6,7 @@ const moment = require('moment');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const { ex, get } = require('../utils/q');
const qu = require('../utils/q');
function titleExtractor(pathname) {
const components = pathname.split('/')[2].split('-');
@ -102,24 +102,24 @@ function scrapeScene(html, url, site) {
}
async function fetchActorReleases(url) {
const res = await get(url);
const res = await qu.get(url);
return res.ok
? res.item.qu.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
? res.item.query.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
: [];
}
async function scrapeProfile(html) {
const { qu } = ex(html);
const { query } = qu.extract(html);
const profile = {};
profile.description = qu.q('.bio_about_text', true);
profile.description = query.q('.bio_about_text', true);
const avatar = qu.q('img.performer-pic', 'src');
const avatar = query.q('img.performer-pic', 'src');
if (avatar) profile.avatar = `https:${avatar}`;
const releases = qu.urls('.scene-item > a:first-child');
const otherPages = qu.urls('.pagination a:not([rel=next]):not([rel=prev])');
const releases = query.urls('.scene-item > a:first-child');
const otherPages = query.urls('.pagination a:not([rel=next]):not([rel=prev])');
const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));
profile.releases = releases.concat(olderReleases.flat());

View File

@ -71,10 +71,10 @@ async function fetchLatest(channel, page = 1) {
const headers = { 'X-Requested-With': 'XMLHttpRequest' };
for (let i = 0; i < page - 1; i += 1) {
await http.get(url, headers, { useSession: session }); // eslint-disable-line no-await-in-loop
await http.get(url, { headers, session }); // eslint-disable-line no-await-in-loop
}
const res = await http.get(url, headers, { useSession: session });
const res = await http.get(url, { headers, session });
if (res.ok) {
const items = qu.extractAll(res.body.snippets?.['snippet--videoItems'] || res.body, '.product-item');

View File

@ -74,9 +74,14 @@ async function scrapeScene({ query }, url) {
release.photos = query.imgs('.detail-grabs img');
const streamData = await http.get(`${origin}/video/source/${entryId}`, {
host,
referer: url,
}, { queueMethod: '5s' });
headers: {
host,
referer: url,
},
}, {
interval: 5000,
concurrency: 1,
});
if (streamData.ok && streamData.body.status === 'success') {
release.trailer = {

View File

@ -4,7 +4,7 @@
const Promise = require('bluebird');
const moment = require('moment');
const { get, post } = require('../utils/http');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const genderMap = {
@ -45,13 +45,15 @@ function getAvatarFallbacks(avatar) {
async function getTrailer(scene, site, url) {
const qualities = [360, 480, 720, 1080, 2160];
const tokenRes = await post(`${site.url}/api/__record_tknreq`, {
const tokenRes = await http.post(`${site.url}/api/__record_tknreq`, {
file: scene.previewVideoUrl1080P,
sizes: qualities.join('+'),
type: 'trailer',
}, {
referer: url,
origin: site.url,
headers: {
referer: url,
origin: site.url,
},
});
if (!tokenRes.ok) {
@ -59,7 +61,7 @@ async function getTrailer(scene, site, url) {
}
const trailerUrl = `${site.url}/api${tokenRes.body.data.url}`;
const trailersRes = await post(trailerUrl, null, { referer: url });
const trailersRes = await http.post(trailerUrl, null, { headers: { referer: url } });
if (trailersRes.ok) {
return qualities.map(quality => (trailersRes.body[quality] ? {
@ -155,7 +157,7 @@ async function scrapeScene(data, url, site, baseRelease) {
async function fetchActorReleases(pages, model, origin) {
const releasesPerPage = await Promise.map(pages, async (page) => {
const url = `${origin}/api${model.targetUrl}?page=${page}`;
const res = await get(url);
const res = await http.get(url);
if (res.code === 200) {
return scrapeAll(res.body.data.videos.videos, null, origin);
@ -203,7 +205,7 @@ async function scrapeProfile(data, origin, withReleases) {
async function fetchLatest(site, page = 1) {
const url = `${site.url}/api/videos?page=${page}`;
const res = await get(url);
const res = await http.get(url);
if (res.code === 200) {
return scrapeAll(res.body.data.videos, site);
@ -214,7 +216,7 @@ async function fetchLatest(site, page = 1) {
async function fetchUpcoming(site) {
const apiUrl = `${site.url}/api`;
const res = await get(apiUrl);
const res = await http.get(apiUrl);
if (res.code === 200) {
return scrapeUpcoming(res.body.data.nextScene, site);
@ -227,7 +229,7 @@ async function fetchScene(url, site, baseRelease) {
const { origin, pathname } = new URL(url);
const apiUrl = `${origin}/api${pathname}`;
const res = await get(apiUrl);
const res = await http.get(apiUrl);
if (res.code === 200) {
return scrapeScene(res.body.data, url, site, baseRelease);
@ -240,7 +242,7 @@ async function fetchProfile({ name: actorName }, { site }, include) {
const origin = site.url;
const actorSlug = slugify(actorName);
const url = `${origin}/api/${actorSlug}`;
const res = await get(url);
const res = await http.get(url);
if (res.code === 200) {
return scrapeProfile(res.body.data, origin, include.scenes);

146
src/utils/http-legacy.js Normal file
View File

@ -0,0 +1,146 @@
'use strict';
const util = require('util');
const stream = require('stream');
const config = require('config');
const tunnel = require('tunnel');
const bhttp = require('@thependulum/bhttp');
const taskQueue = require('promise-task-queue');
const pipeline = util.promisify(stream.pipeline);
const logger = require('../logger')(__filename);
// Headers merged into every request by handler(), unless the caller passes
// options.defaultHeaders === false. Caller-supplied headers are spread after
// these and therefore win on conflicts.
const defaultHeaders = {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
};
// Baseline request options; handler() spreads per-request options over these.
// NOTE(review): responseTimeout is presumably in milliseconds (30s) — confirm against the bhttp docs.
const defaultOptions = {
responseTimeout: 30000,
};
// Agent built from config.proxy.host/port. It is constructed at module load
// regardless of config.proxy.enable; handler() only attaches it to a request
// when useProxy(url) returns true.
// NOTE(review): httpsOverHttp presumably tunnels HTTPS requests through an HTTP proxy — confirm against the tunnel docs.
const proxyAgent = tunnel.httpsOverHttp({
proxy: {
host: config.proxy.host,
port: config.proxy.port,
},
});
/**
 * Decide whether a request to the given URL should go through the proxy.
 *
 * @param {string} url - Absolute request URL; only parsed when the proxy is enabled.
 * @returns {boolean} true when config.proxy.enable is truthy and the URL's
 *   hostname is listed in config.proxy.hostnames, false otherwise.
 */
function useProxy(url) {
  const { enable, hostnames } = config.proxy;

  if (enable) {
    const target = new URL(url);

    return hostnames.includes(target.hostname);
  }

  return false;
}
// Task queue that get(), head() and post() push every request onto.
const queue = taskQueue();
// Queue method used when a request does not set options.queueMethod.
const defaultQueueMethod = '20p';
/**
 * Queue task handler: performs one HTTP request through bhttp (or through the
 * session passed as options.useSession) and returns the response decorated
 * with convenience fields.
 *
 * @param {object} task
 * @param {string} task.url - Request URL.
 * @param {string} [task.method='GET'] - HTTP method, case-insensitive.
 * @param {*} [task.body] - Request body; only sent for POST, PUT and PATCH.
 * @param {object|null} [task.headers] - Extra headers, merged over defaultHeaders.
 * @param {object|null} [task.options] - Request options spread over defaultOptions.
 *   Keys read here: queueMethod (logging only), defaultHeaders (false disables
 *   the default headers), timeout (copied to responseTimeout), useSession,
 *   stream, destination and transforms.
 * @returns {Promise<object>} The response's own enumerable properties plus
 *   originalRes, html (string when the body is a Buffer, else null), json (the
 *   body when it is not a Buffer, else null), pipe, ok (2xx), code and status.
 */
async function handler({
  url,
  method = 'GET',
  body,
  headers = {},
  options = {},
}) {
  // Callers pass null for skipped positional arguments, and the default
  // parameter above only covers undefined; get/head/post already tolerate a
  // null options object, so do the same here instead of throwing.
  const requestOptions = options ?? {};
  const verb = method.toUpperCase();
  const queueMethod = requestOptions.queueMethod || defaultQueueMethod;

  if (body) {
    logger.silly(`${verb} ${url} with ${JSON.stringify(body)} ${queueMethod}`);
  } else {
    logger.silly(`${verb} ${url} ${queueMethod}`);
  }

  const reqOptions = {
    headers: {
      ...(requestOptions.defaultHeaders !== false && defaultHeaders),
      ...headers,
    },
    ...defaultOptions,
    ...requestOptions,
    // `timeout` is this module's alias for bhttp's responseTimeout
    ...(requestOptions.timeout && { responseTimeout: requestOptions.timeout }),
  };

  if (useProxy(url)) {
    reqOptions.agent = proxyAgent;
  }

  // invoke as a method on the client so a session keeps its own `this`
  const client = requestOptions.useSession || bhttp;
  const clientMethod = method.toLowerCase();

  const res = ['POST', 'PUT', 'PATCH'].includes(verb)
    ? await client[clientMethod](url, body, reqOptions)
    : await client[clientMethod](url, reqOptions);

  if (requestOptions.stream && requestOptions.destination) {
    // streamed responses are piped through the optional transforms into the destination
    await pipeline(res, ...(requestOptions.transforms || []), requestOptions.destination);
  }

  const bodyIsBuffer = Buffer.isBuffer(res.body);
  const html = bodyIsBuffer ? res.body.toString() : null;
  const json = bodyIsBuffer ? null : res.body;

  return {
    ...res,
    originalRes: res,
    html,
    json,
    pipe: res.pipe,
    ok: res.statusCode >= 200 && res.statusCode <= 299,
    code: res.statusCode,
    status: res.statusCode,
  };
}
// Queue methods selectable through options.queueMethod, mapped to the options
// their task type is defined with.
// NOTE(review): per the promise-task-queue docs `interval` is expressed in seconds — confirm.
const queueMethods = {
  '20p': { concurrency: 20 },
  '1s': { interval: 1 },
  '5s': { interval: 5 },
};

// promise-task-queue suffixes its event names with the task type. No task type
// named 'http' is ever defined, so a 'concurrencyReached:http' listener can
// never fire; attach the listener to each task type that actually exists.
Object.entries(queueMethods).forEach(([queueMethod, queueOptions]) => {
  queue.on(`concurrencyReached:${queueMethod}`, () => {
    logger.silly('Queueing requests');
  });

  queue.define(queueMethod, handler, queueOptions);
});
/**
 * Queue a GET request.
 *
 * @param {string} url - Request URL.
 * @param {object} [headers] - Extra request headers.
 * @param {object} [options] - Request options; options.queueMethod selects the
 *   queue, falling back to defaultQueueMethod.
 * @returns {Promise<*>} Whatever the queue resolves the pushed task with.
 */
async function get(url, headers, options) {
  const queueMethod = options?.queueMethod || defaultQueueMethod;
  const task = {
    method: 'GET',
    url,
    headers,
    options,
  };

  return queue.push(queueMethod, task);
}
/**
 * Queue a HEAD request.
 *
 * @param {string} url - Request URL.
 * @param {object} [headers] - Extra request headers.
 * @param {object} [options] - Request options; options.queueMethod selects the
 *   queue, falling back to defaultQueueMethod.
 * @returns {Promise<*>} Whatever the queue resolves the pushed task with.
 */
async function head(url, headers, options) {
  const queueMethod = options?.queueMethod || defaultQueueMethod;
  const task = {
    method: 'HEAD',
    url,
    headers,
    options,
  };

  return queue.push(queueMethod, task);
}
/**
 * Queue a POST request.
 *
 * @param {string} url - Request URL.
 * @param {*} body - Request body handed to the queue task as-is.
 * @param {object} [headers] - Extra request headers.
 * @param {object} [options] - Request options; options.queueMethod selects the
 *   queue, falling back to defaultQueueMethod.
 * @returns {Promise<*>} Whatever the queue resolves the pushed task with.
 */
async function post(url, body, headers, options) {
  const queueMethod = options?.queueMethod || defaultQueueMethod;
  const task = {
    method: 'POST',
    url,
    body,
    headers,
    options,
  };

  return queue.push(queueMethod, task);
}
/**
 * Create a bhttp session that can be handed back to the request helpers
 * through options.useSession.
 *
 * @param {object} [headers] - Passed to bhttp.session under the `headers` key.
 * @param {object} [options] - Passed to bhttp.session under the `options` key.
 * @returns {*} Whatever bhttp.session returns.
 */
function session(headers, options) {
  const sessionDefaults = { headers, options };

  return bhttp.session(sessionDefaults);
}
// Public API: the queued request helpers and the bhttp session factory.
module.exports = {
get,
post,
head,
session,
};

View File

@ -1,21 +1,23 @@
'use strict';
const config = require('config');
const bhttp = require('bhttp');
const util = require('util');
const stream = require('stream');
const config = require('config');
const tunnel = require('tunnel');
const bhttp = require('@thependulum/bhttp');
const taskQueue = require('promise-task-queue');
const Bottleneck = require('bottleneck');
const { JSDOM } = require('jsdom');
const pipeline = util.promisify(stream.pipeline);
const logger = require('../logger')(__filename);
const defaultHeaders = {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
};
const pipeline = util.promisify(stream.pipeline);
const limiters = {};
const defaultOptions = {
responseTimeout: 30000,
encodeJSON: true,
headers: {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
},
};
const proxyAgent = tunnel.httpsOverHttp({
@ -34,113 +36,114 @@ function useProxy(url) {
return config.proxy.hostnames.includes(hostname);
}
const queue = taskQueue();
const defaultQueueMethod = '20p';
function getLimiter(limit = {}) {
const interval = limit.interval === undefined ? config.limits.default.interval : limit.interval;
const concurrency = limit.concurrency === undefined ? config.limits.default.concurrency : limit.concurrency;
async function handler({
url,
method = 'GET',
body,
headers = {},
options = {},
}) {
if (body) {
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
} else {
logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
if (!limiters[interval]?.[concurrency]) {
limiters[interval] = limiters[interval] || {};
limiters[interval][concurrency] = new Bottleneck({
minTime: interval,
maxConcurrent: concurrency,
});
}
const reqOptions = {
headers: {
...(options?.defaultHeaders !== false && defaultHeaders),
...headers,
},
return limiters[interval][concurrency];
}
async function request(method = 'get', url, body, requestOptions = {}) {
const http = requestOptions.session || bhttp;
const options = {
...defaultOptions,
...options,
...(options?.timeout && { responseTimeout: options?.timeout }),
...requestOptions,
responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
stream: !!requestOptions.destination,
interval: requestOptions.interval || config.limits.default.interval,
concurrency: requestOptions.concurrency || config.limits.default.concurrency,
session: null,
};
if (useProxy(url)) {
reqOptions.agent = proxyAgent;
options.agent = proxyAgent;
}
const res = ['POST', 'PUT', 'PATCH'].includes(method.toUpperCase())
? await (options.useSession || bhttp)[method.toLowerCase()](url, body, reqOptions)
: await (options.useSession || bhttp)[method.toLowerCase()](url, reqOptions);
logger.debug(`GET (${options.interval}ms/${options.concurrency}p) ${url}`);
if (options?.stream && options?.destination) {
await pipeline(res, ...(options?.transforms || []), options?.destination);
const res = await (body
? http[method](url, body, options)
: http[method](url, options));
const resIsOk = res.statusCode >= 200 && res.statusCode <= 299;
if (options.destination) {
// res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
await pipeline(res, ...(options.transforms || []), options.destination);
}
const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
const json = Buffer.isBuffer(res.body) ? null : res.body;
if (Buffer.isBuffer(res.body)) {
const html = res.body.toString();
const window = new JSDOM(html).window;
return {
...res,
body: html,
html,
status: res.statusCode,
document: window.document,
window,
ok: resIsOk,
};
}
return {
...res,
originalRes: res,
html,
json,
pipe: res.pipe,
ok: res.statusCode >= 200 && res.statusCode <= 299,
code: res.statusCode,
body: res.body,
status: res.statusCode,
ok: res.statusCode >= 200 && res.statusCode <= 299,
};
}
queue.on('concurrencyReached:http', () => {
logger.silly('Queueing requests');
});
queue.define('20p', handler, {
concurrency: 20,
});
queue.define('1s', handler, {
interval: 1,
});
queue.define('5s', handler, {
interval: 5,
});
async function get(url, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'GET',
url,
headers,
options,
});
async function scheduleRequest(method = 'get', url, body, options) {
return getLimiter(options || {}).schedule(() => request(method, url, body, options));
}
async function head(url, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'HEAD',
url,
headers,
options,
});
async function get(url, options) {
return scheduleRequest('get', url, null, options);
}
async function post(url, body, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'POST',
url,
body,
headers,
options,
});
async function post(url, body, options) {
return scheduleRequest('post', url, body, options);
}
function session(headers, options) {
return bhttp.session({
headers,
options,
});
async function put(url, body, options) {
return scheduleRequest('put', url, body, options);
}
async function patch(url, body, options) {
return scheduleRequest('patch', url, body, options);
}
async function del(url, options) {
return scheduleRequest('delete', url, null, options);
}
async function head(url, options) {
return scheduleRequest('head', url, null, options);
}
function getSession(options) {
return bhttp.session(options);
}
module.exports = {
get,
post,
head,
session,
post,
delete: del,
put,
patch,
session: getSession,
};

View File

@ -457,8 +457,8 @@ function extractAll(htmlValue, selector) {
async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) {
const res = await (method === 'post'
? http.post(urlValue, body, headers, options)
: http[method](urlValue, headers, options));
? http.post(urlValue, body, { ...options, headers })
: http[method](urlValue, { ...options, headers }));
if (res.ok) {
const item = queryAll
@ -494,7 +494,7 @@ async function post(urlValue, body, selector, headers, options) {
}
async function getAll(urlValue, selector, headers, options) {
return request('get,', urlValue, selector, headers, options, true);
return request('get', urlValue, null, selector, headers, options, true);
}
async function postAll(urlValue, body, selector, headers, options) {