diff --git a/config/default.js b/config/default.js
index c734acad..27c02549 100644
--- a/config/default.js
+++ b/config/default.js
@@ -197,6 +197,12 @@ module.exports = {
       'www.deeper.com',
     ],
   },
+  limits: {
+    default: {
+      interval: 50,
+      concurrency: 20,
+    },
+  },
   fetchAfter: [1, 'week'],
   missingDateLimit: 3,
   media: {
diff --git a/package-lock.json b/package-lock.json
index 91bd0326..8c3721d1 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -2208,6 +2208,11 @@
       "resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
       "integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
     },
+    "bottleneck": {
+      "version": "2.19.5",
+      "resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
+      "integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
+    },
     "brace-expansion": {
       "version": "1.1.11",
       "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",
diff --git a/package.json b/package.json
index d63932be..fc318691 100644
--- a/package.json
+++ b/package.json
@@ -78,6 +78,7 @@
     "blake2": "^4.0.0",
     "bluebird": "^3.7.2",
     "body-parser": "^1.19.0",
+    "bottleneck": "^2.19.5",
     "canvas": "^2.6.1",
     "casual": "^1.6.2",
     "cheerio": "^1.0.0-rc.3",
diff --git a/public/img/tags/nurse/1.jpeg b/public/img/tags/nurse/1.jpeg
new file mode 100644
index 00000000..fbf1aba1
Binary files /dev/null and b/public/img/tags/nurse/1.jpeg differ
diff --git a/public/img/tags/nurse/1a.jpeg b/public/img/tags/nurse/1a.jpeg
new file mode 100644
index 00000000..c3a25022
Binary files /dev/null and b/public/img/tags/nurse/1a.jpeg differ
diff --git a/public/img/tags/nurse/lazy/1.jpeg b/public/img/tags/nurse/lazy/1.jpeg
new file mode 100644
index 00000000..90bdfa13
Binary files /dev/null and b/public/img/tags/nurse/lazy/1.jpeg differ
diff --git a/public/img/tags/nurse/lazy/1a.jpeg b/public/img/tags/nurse/lazy/1a.jpeg
new file mode 100644
index 00000000..b3af56ab
Binary files /dev/null and b/public/img/tags/nurse/lazy/1a.jpeg differ
diff --git a/public/img/tags/nurse/thumbs/0.jpeg b/public/img/tags/nurse/thumbs/0.jpeg
index aada1926..582361b2 100644
Binary files a/public/img/tags/nurse/thumbs/0.jpeg and b/public/img/tags/nurse/thumbs/0.jpeg differ
diff --git a/public/img/tags/nurse/thumbs/1.jpeg b/public/img/tags/nurse/thumbs/1.jpeg
new file mode 100644
index 00000000..aaeba2a2
Binary files /dev/null and b/public/img/tags/nurse/thumbs/1.jpeg differ
diff --git a/public/img/tags/nurse/thumbs/1a.jpeg b/public/img/tags/nurse/thumbs/1a.jpeg
new file mode 100644
index 00000000..169ae082
Binary files /dev/null and b/public/img/tags/nurse/thumbs/1a.jpeg differ
diff --git a/seeds/04_media.js b/seeds/04_media.js
index 5d90495a..a6143d36 100644
--- a/seeds/04_media.js
+++ b/seeds/04_media.js
@@ -643,7 +643,7 @@ const tagPosters = [
   ['mff', 1, 'Anikka Albrite, Kelsi Monroe and Mick Blue for HardX'],
   ['mfm', 0, 'Vina Sky in "Jules Jordan\'s Three Ways" for Jules Jordan'],
   ['natural-boobs', 4, 'Miela (Marry Queen) in "Pure" for FemJoy'],
-  ['nurse', 0, 'Sarah Vandella in "Cum For Nurse Sarah" for Brazzers'],
+  ['nurse', 1, 'Mia Malkova in "Always Think Happy Thoughts" for Brazzers'],
   ['oil', 2, 'Jade Kush for Passion HD'],
   ['oral-creampie', 1, 'Valentina Nappi for Her Limit'],
   ['orgy', 1, 'Megan Rain (DP), Morgan Lee (anal), Jessa Rhodes, Melissa Moore and Kimmy Granger in "Orgy Masters 8" for Jules Jordan'],
@@ -825,6 +825,7 @@ const tagPhotos = [
   ['natural-boobs', 3, 'Violet Starr in "Violet Starr 1st Lesbian Anal" for LesbianX'],
   ['natural-boobs', 0, 'Valentina Nappi in "Hypnotic Curves" for LesbianX'],
   ['natural-boobs', 2, 'Kylie Page for All Girl Massage'],
+  ['nurse', 0, 'Sarah Vandella in "Cum For Nurse Sarah" for Brazzers'],
   ['oil', 1, 'Kissa Sins in "Oil Overload 14" for JulesJordan'],
   ['oil', 3, 'Vina Sky for Lubed'],
   ['oil', 0, 'Jada Stevens in "Jada Stevens Anal Ass Oiled Up For James Deen\'s Cock" for Jules Jordan'],
diff --git a/src/media.js b/src/media.js
index a6808917..bedd1970 100644
--- a/src/media.js
+++ b/src/media.js
@@ -420,15 +420,18 @@ async function storeFile(media) {
   } catch (error) {
     logger.warn(`Failed to store ${media.src}: ${error.message}`);
+    await fsPromises.unlink(media.file.path);
+    return null;
   }
 }
 
 async function fetchHttpSource(source, tempFileTarget, hashStream) {
   const res = await http.get(source.src, {
-    ...(source.referer && { referer: source.referer }),
-    ...(source.host && { host: source.host }),
-  }, {
+    headers: {
+      ...(source.referer && { referer: source.referer }),
+      ...(source.host && { host: source.host }),
+    },
     stream: true, // sources are fetched in parallel, don't gobble up memory
     transforms: [hashStream],
     destination: tempFileTarget,
@@ -642,7 +645,7 @@ async function storeMedias(baseMedias) {
     );
   }
 
-  const newMediaWithEntries = savedMedias.map((media, index) => curateMediaEntry(media, index));
+  const newMediaWithEntries = savedMedias.filter(Boolean).map((media, index) => curateMediaEntry(media, index));
 
   const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);
   await bulkInsert('media', newMediaEntries);
diff --git a/src/scrapers/bang.js b/src/scrapers/bang.js
index 033cc0e3..df3a9b52 100644
--- a/src/scrapers/bang.js
+++ b/src/scrapers/bang.js
@@ -1,8 +1,6 @@
 'use strict';
 
-const bhttp = require('@thependulum/bhttp');
-
-const { post } = require('../utils/http');
+const http = require('../utils/http');
 const { extractDate } = require('../utils/qu');
 const { inchesToCm } = require('../utils/convert');
 const slugify = require('../utils/slugify');
@@ -84,7 +82,7 @@ function scrapeAll(scenes) {
 }
 
 async function fetchActorReleases(actor) {
-  const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
+  const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
     size: 50,
     query: {
       bool: {
@@ -179,7 +177,7 @@ async function scrapeProfile(actor, include) {
 }
 
 async function fetchLatest(site, page = 1) {
-  const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
+  const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
     size: 50,
     from: (page - 1) * 50,
     query: {
@@ -269,7 +267,7 @@ async function fetchScene(url) {
   const encodedId = new URL(url).pathname.split('/')[2];
   const entryId = decodeId(encodedId);
 
-  const res = await bhttp.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
+  const res = await http.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
     headers: {
       Authorization: `Basic ${authKey}`,
     },
@@ -279,7 +277,7 @@
 }
 
 async function fetchProfile({ name: actorName }, context, include) {
-  const res = await post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
+  const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
     size: 5,
     sort: [{
      _score: {
@@ -306,8 +304,11 @@ async function fetchProfile({ name: actorName }, context, include) {
       },
     },
   }, {
-    Authorization: `Basic ${authKey}`,
-  }, { encodeJSON: true });
+    headers: {
+      Authorization: `Basic ${authKey}`,
+    },
+    encodeJSON: true,
+  });
 
   if (res.ok) {
     const actor = res.body.hits.hits.find(hit => hit._source.name.toLowerCase() === actorName.toLowerCase());
diff --git a/src/scrapers/dorcel.js b/src/scrapers/dorcel.js
index 05048a25..0fe74264 100644
--- a/src/scrapers/dorcel.js
+++ b/src/scrapers/dorcel.js
@@ -41,7 +41,9 @@ function scrapeScene({ query }, url, channel) {
   }));
 
   release.director = query.cnt('.director')?.split(/\s*:\s*/)[1];
-  release.poster = query.sourceSet('.player img', 'data-srcset');
+
+  const fallbackPoster = query.img('.player img');
+  release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
 
   release.movie = {
     title: query.cnt('.movie a'),
diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js
index ee04805a..307f9d0d 100644
--- a/src/scrapers/gamma.js
+++ b/src/scrapers/gamma.js
@@ -7,7 +7,7 @@ const cheerio = require('cheerio');
 const moment = require('moment');
 
 const logger = require('../logger')(__filename);
-const { ex, get } = require('../utils/q');
+const qu = require('../utils/qu');
 const http = require('../utils/http');
 const slugify = require('../utils/slugify');
 
@@ -318,7 +318,7 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
   const profilePath = `/${pathname.split('/').slice(-2).join('/')}`;
   const url = getActorReleasesUrl(profilePath, page);
 
-  const res = await get(url);
+  const res = await qu.get(url);
 
   if (!res.ok) return [];
 
@@ -333,14 +333,14 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
 }
 
 async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl, withReleases) {
-  const { q } = ex(html);
+  const { query } = qu.extract(html);
 
-  const avatar = q('img.actorPicture');
-  const hair = q('.actorProfile .attribute_hair_color', true);
-  const height = q('.actorProfile .attribute_height', true);
-  const weight = q('.actorProfile .attribute_weight', true);
-  const alias = q('.actorProfile .attribute_alternate_names', true);
-  const nationality = q('.actorProfile .attribute_home', true);
+  const avatar = query.el('img.actorPicture');
+  const hair = query.cnt('.actorProfile .attribute_hair_color');
+  const height = query.cnt('.actorProfile .attribute_height');
+  const weight = query.cnt('.actorProfile .attribute_weight');
+  const alias = query.cnt('.actorProfile .attribute_alternate_names');
+  const nationality = query.cnt('.actorProfile .attribute_home');
 
   const profile = {
     name: actorName,
@@ -358,7 +358,7 @@ async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUr
     profile.avatar = avatars;
   }
 
-  profile.description = q('.actorBio p:not(.bioTitle)', true);
+  profile.description = query.cnt('.actorBio p:not(.bioTitle)');
 
   if (hair) profile.hair = hair.split(':')[1].trim();
   if (height) profile.height = Number(height.match(/\d+/)[0]);
diff --git a/src/scrapers/hitzefrei.js b/src/scrapers/hitzefrei.js
index f8f64c5f..e68b4c25 100644
--- a/src/scrapers/hitzefrei.js
+++ b/src/scrapers/hitzefrei.js
@@ -129,7 +129,9 @@ async function fetchProfile(baseActor, entity, include) {
   const searchRes = await http.post('https://tour.hitzefrei.com/search-preview', {
     q: baseActor.name,
   }, {
-    'Accept-Language': 'en-US',
+    headers: {
+      'Accept-Language': 'en-US',
+    },
   });
 
   if (searchRes.ok) {
diff --git a/src/scrapers/insex.js b/src/scrapers/insex.js
index 7000756d..57d7a25b 100644
--- a/src/scrapers/insex.js
+++ b/src/scrapers/insex.js
@@ -115,7 +115,7 @@ async function scrapeSceneAlt({ query }, url, channel, session) {
   release.trailer = query.video();
 
   if (!release.trailer) {
-    const trailerRes = await http.get(`${channel.url}/api/play-api.php`, null, { useSession: session });
+    const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
 
     if (trailerRes.ok) {
       release.trailer = trailerRes.body;
@@ -153,7 +153,7 @@ async function fetchLatest(site, page = 1) {
 
 async function fetchScene(url, site) {
   const session = http.session();
-  const res = await qu.get(url, null, null, { useSession: session });
+  const res = await qu.get(url, null, null, { session });
 
   if (res.ok) {
     if (site.parameters?.scraper === 'alt') {
diff --git a/src/scrapers/jayrock.js b/src/scrapers/jayrock.js
index b01e86dc..6a9cceec 100644
--- a/src/scrapers/jayrock.js
+++ b/src/scrapers/jayrock.js
@@ -23,7 +23,7 @@ async function fetchTrailerLocation(entryId, channel) {
   const url = `${channel.url}/api/download/${entryId}/hd1080/stream`;
 
   try {
-    const res = await http.get(url, null, {
+    const res = await http.get(url, {
       followRedirects: false,
     });
diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js
index 7a90dd8e..f8b9c50e 100644
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@@ -7,7 +7,7 @@ const cheerio = require('cheerio');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-const { get, geta, ctxa, parseDate, prefixUrl } = require('../utils/q');
+const qu = require('../utils/qu');
 const http = require('../utils/http');
 const { heightToCm } = require('../utils/convert');
 const slugify = require('../utils/slugify');
@@ -82,7 +82,7 @@ async function getPhotosLegacy(entryId, site, type = 'highres', page = 1) {
 async function getPhotos(entryId, site, type = 'highres', page = 1) {
   const albumUrl = `${site.parameters?.photos || `${site.url}/gallery.php`}?id=${entryId}&type=${type}&page=${page}`;
 
-  const res = await bhttp.get(albumUrl);
+  const res = await http.get(albumUrl);
   const html = res.body.toString();
 
   const sourceLines = html.split(/\n/).filter(line => line.match(/ptx\["\w+"\]/));
@@ -135,25 +135,25 @@ function getEntryId(html) {
 }
 
 function scrapeAll(scenes, site, entryIdFromTitle) {
-  return scenes.map(({ el, qu }) => {
+  return scenes.map(({ el, query }) => {
     const release = {};
 
-    release.url = qu.url('.update_title a, .dvd_info > a, a ~ a');
-    release.title = qu.q('.update_title a, .dvd_info > a, a ~ a', true);
-    release.date = qu.date('.update_date', 'MM/DD/YYYY');
+    release.url = query.url('.update_title a, .dvd_info > a, a ~ a');
+    release.title = query.q('.update_title a, .dvd_info > a, a ~ a', true);
+    release.date = query.date('.update_date', 'MM/DD/YYYY');
 
-    release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || qu.q('.rating_box')?.dataset.id;
+    release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || query.q('.rating_box')?.dataset.id;
 
-    release.actors = qu.all('.update_models a', true);
+    release.actors = query.all('.update_models a', true);
 
-    const dvdPhotos = qu.imgs('.dvd_preview_thumb');
-    const photoCount = Number(qu.q('a img.thumbs', 'cnt')) || 1;
+    const dvdPhotos = query.imgs('.dvd_preview_thumb');
+    const photoCount = Number(query.q('a img.thumbs', 'cnt')) || 1;
 
     [release.poster, ...release.photos] = dvdPhotos.length
       ? dvdPhotos
       : Array.from({ length: photoCount }).map((value, index) => {
-        const src = qu.img('a img.thumbs', `src${index}_1x`) || qu.img('a img.thumbs', `src${index}`) || qu.img('a img.thumbs');
-        const prefixedSrc = prefixUrl(src, site.url);
+        const src = query.img('a img.thumbs', `src${index}_1x`) || query.img('a img.thumbs', `src${index}`) || query.img('a img.thumbs');
+        const prefixedSrc = qu.prefixUrl(src, site.url);
 
         if (src) {
           return [
@@ -183,7 +183,7 @@ function scrapeAll(scenes, site, entryIdFromTitle) {
       return null;
     }).filter(Boolean);
 
-    const teaserScript = qu.html('script');
+    const teaserScript = query.html('script');
 
     if (teaserScript) {
       const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
       if (src) release.teaser = { src };
@@ -236,17 +236,17 @@ function scrapeUpcoming(html, site) {
   });
 }
 
-async function scrapeScene({ html, qu }, url, site, include) {
+async function scrapeScene({ html, query }, url, site, include) {
   const release = { url, site };
 
   release.entryId = getEntryId(html);
-  release.title = qu.q('.title_bar_hilite', true);
-  release.description = qu.q('.update_description', true);
+  release.title = query.q('.title_bar_hilite', true);
+  release.description = query.q('.update_description', true);
 
-  release.date = qu.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
+  release.date = query.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
 
-  release.actors = qu.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
-  release.tags = qu.all('.update_tags a', true);
+  release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
+  release.tags = query.all('.update_tags a', true);
 
   const posterPath = html.match(/useimage = "(.*)"/)?.[1];
@@ -280,14 +280,14 @@ async function scrapeScene({ html, qu }, url, site, include) {
 
   if (include.photos) release.photos = await getPhotos(release.entryId, site);
 
-  if (qu.exists('.update_dvds a')) {
+  if (query.exists('.update_dvds a')) {
     release.movie = {
-      url: qu.url('.update_dvds a'),
-      title: qu.q('.update_dvds a', true),
+      url: query.url('.update_dvds a'),
+      title: query.q('.update_dvds a', true),
     };
   }
 
-  const stars = Number(qu.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
+  const stars = Number(query.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
   if (stars) release.stars = stars;
 
   return release;
@@ -302,7 +302,7 @@ function scrapeMovie({ el, query }, url, site) {
   movie.channel = slugify(query.q('.update_date a', true), '');
 
   // movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href);
-  const sceneQus = ctxa(el, '.dvd_details');
+  const sceneQus = qu.initAll(el, '.dvd_details');
   const scenes = scrapeAll(sceneQus, site);
 
   const curatedScenes = scenes
@@ -332,7 +332,7 @@ function scrapeProfile(html, url, actorName, entity) {
   const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/);
   const measurementsString = bio.match(/\w+-\d+-\d+/);
 
-  if (birthDateString) profile.birthdate = parseDate(birthDateString[1], 'MMMM D, YYYY');
+  if (birthDateString) profile.birthdate = qu.parseDate(birthDateString[1], 'MMMM D, YYYY');
   if (ageString) profile.age = Number(ageString[1]);
 
   if (heightString) profile.height = heightToCm(heightString[0]);
@@ -354,7 +354,7 @@ function scrapeProfile(html, url, actorName, entity) {
       avatarEl.getAttribute('src'),
     ]
       .filter(avatar => avatar && !/p\d+.jpe?g/.test(avatar)) // remove non-existing attributes and placeholder images
-      .map(avatar => prefixUrl(avatar, entity.url));
+      .map(avatar => qu.prefixUrl(avatar, entity.url));
 
     if (avatarSources.length) profile.avatar = avatarSources;
   }
@@ -370,7 +370,7 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
     : `${site.url}/trial/categories/movies_${page}_d.html`;
 
   // const res = await bhttp.get(url);
-  const res = await geta(url, '.update_details');
+  const res = await qu.getAll(url, '.update_details');
 
   return res.ok ? scrapeAll(res.items, site, entryIdFromTitle) : res.status;
 }
@@ -389,13 +389,13 @@ async function fetchUpcoming(site) {
 }
 
 async function fetchScene(url, site, baseRelease, include) {
-  const res = await get(url);
+  const res = await qu.get(url);
 
   return res.ok ? scrapeScene(res.item, url, site, include) : res.status;
 }
 
 async function fetchMovie(url, site) {
-  const res = await get(url);
+  const res = await qu.get(url);
 
   return res.ok ? scrapeMovie(res.item, url, site) : res.status;
 }
diff --git a/src/scrapers/kellymadison.js b/src/scrapers/kellymadison.js
index a063bdb5..1a6b524f 100644
--- a/src/scrapers/kellymadison.js
+++ b/src/scrapers/kellymadison.js
@@ -97,8 +97,10 @@ async function scrapeScene({ query, html }, url, baseRelease) {
   const token = query.meta('name=_token');
   const trailerInfoUrl = `${origin}/episodes/trailer/sources/${release.entryId}?type=trailer`;
   const trailerInfoRes = await http.post(trailerInfoUrl, null, {
-    'X-CSRF-Token': token,
-    'X-Requested-With': 'XMLHttpRequest',
+    headers: {
+      'X-CSRF-Token': token,
+      'X-Requested-With': 'XMLHttpRequest',
+    },
   });
 
   if (trailerInfoRes.ok && trailerInfoRes.body.sources.length > 0) {
@@ -136,7 +138,9 @@ function scrapeProfile({ query }) {
 async function fetchLatest(channel, page = 1) {
   const url = `${channel.url}/episodes/search?page=${page}`; // TLS issues with teenfidelity.com, same overview on all sites
   const res = await http.get(url, {
-    'X-Requested-With': 'XMLHttpRequest',
+    headers: {
+      'X-Requested-With': 'XMLHttpRequest',
+    },
   });
 
   if (res.ok && res.body.status === 'success') {
@@ -157,7 +161,9 @@ async function fetchScene(url, channel, baseRelease) {
 async function fetchProfile({ name: actorName }) {
   const actorSlug = slugify(actorName);
   const res = await qu.get(`https://www.kellymadison.com/models/${actorSlug}`, null, {
-    'X-Requested-With': 'XMLHttpRequest',
+    headers: {
+      'X-Requested-With': 'XMLHttpRequest',
+    },
   });
 
   if (res.ok) {
diff --git a/src/scrapers/naughtyamerica.js b/src/scrapers/naughtyamerica.js
index 7d5346fb..b2e4a2c6 100644
--- a/src/scrapers/naughtyamerica.js
+++ b/src/scrapers/naughtyamerica.js
@@ -6,7 +6,7 @@ const moment = require('moment');
 
 const http = require('../utils/http');
 const slugify = require('../utils/slugify');
-const { ex, get } = require('../utils/q');
+const qu = require('../utils/q');
 
 function titleExtractor(pathname) {
   const components = pathname.split('/')[2].split('-');
@@ -102,24 +102,24 @@ function scrapeScene(html, url, site) {
 }
 
 async function fetchActorReleases(url) {
-  const res = await get(url);
+  const res = await qu.get(url);
 
   return res.ok
-    ? res.item.qu.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
+    ? res.item.query.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
     : [];
 }
 
 async function scrapeProfile(html) {
-  const { qu } = ex(html);
+  const { query } = qu.extract(html);
   const profile = {};
 
-  profile.description = qu.q('.bio_about_text', true);
+  profile.description = query.q('.bio_about_text', true);
 
-  const avatar = qu.q('img.performer-pic', 'src');
+  const avatar = query.q('img.performer-pic', 'src');
   if (avatar) profile.avatar = `https:${avatar}`;
 
-  const releases = qu.urls('.scene-item > a:first-child');
-  const otherPages = qu.urls('.pagination a:not([rel=next]):not([rel=prev])');
+  const releases = query.urls('.scene-item > a:first-child');
+  const otherPages = query.urls('.pagination a:not([rel=next]):not([rel=prev])');
   const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));
 
   profile.releases = releases.concat(olderReleases.flat());
diff --git a/src/scrapers/porncz.js b/src/scrapers/porncz.js
index 747a20d0..3deea4ee 100644
--- a/src/scrapers/porncz.js
+++ b/src/scrapers/porncz.js
@@ -71,10 +71,10 @@ async function fetchLatest(channel, page = 1) {
   const headers = { 'X-Requested-With': 'XMLHttpRequest' };
 
   for (let i = 0; i < page - 1; i += 1) {
-    await http.get(url, headers, { useSession: session }); // eslint-disable-line no-await-in-loop
+    await http.get(url, { headers, session }); // eslint-disable-line no-await-in-loop
   }
 
-  const res = await http.get(url, headers, { useSession: session });
+  const res = await http.get(url, { headers, session });
 
   if (res.ok) {
     const items = qu.extractAll(res.body.snippets?.['snippet--videoItems'] || res.body, '.product-item');
diff --git a/src/scrapers/teencoreclub.js b/src/scrapers/teencoreclub.js
index 17cc86e5..083fb2bb 100644
--- a/src/scrapers/teencoreclub.js
+++ b/src/scrapers/teencoreclub.js
@@ -74,9 +74,14 @@ async function scrapeScene({ query }, url) {
   release.photos = query.imgs('.detail-grabs img');
 
   const streamData = await http.get(`${origin}/video/source/${entryId}`, {
-    host,
-    referer: url,
-  }, { queueMethod: '5s' });
+    headers: {
+      host,
+      referer: url,
+    },
+  }, {
+    interval: 5000,
+    concurrency: 1,
+  });
 
   if (streamData.ok && streamData.body.status === 'success') {
     release.trailer = {
diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js
index 9ec8f280..47a2c92f 100644
--- a/src/scrapers/vixen.js
+++ b/src/scrapers/vixen.js
@@ -4,7 +4,7 @@
 const Promise = require('bluebird');
 const moment = require('moment');
 
-const { get, post } = require('../utils/http');
+const http = require('../utils/http');
 const slugify = require('../utils/slugify');
 
 const genderMap = {
@@ -45,13 +45,15 @@ function getAvatarFallbacks(avatar) {
 async function getTrailer(scene, site, url) {
   const qualities = [360, 480, 720, 1080, 2160];
 
-  const tokenRes = await post(`${site.url}/api/__record_tknreq`, {
+  const tokenRes = await http.post(`${site.url}/api/__record_tknreq`, {
     file: scene.previewVideoUrl1080P,
     sizes: qualities.join('+'),
     type: 'trailer',
   }, {
-    referer: url,
-    origin: site.url,
+    headers: {
+      referer: url,
+      origin: site.url,
+    },
   });
 
   if (!tokenRes.ok) {
@@ -59,7 +61,7 @@ async function getTrailer(scene, site, url) {
   }
 
   const trailerUrl = `${site.url}/api${tokenRes.body.data.url}`;
-  const trailersRes = await post(trailerUrl, null, { referer: url });
+  const trailersRes = await http.post(trailerUrl, null, { headers: { referer: url } });
 
   if (trailersRes.ok) {
     return qualities.map(quality => (trailersRes.body[quality]
      ? {
@@ -155,7 +157,7 @@ async function scrapeScene(data, url, site, baseRelease) {
 async function fetchActorReleases(pages, model, origin) {
   const releasesPerPage = await Promise.map(pages, async (page) => {
     const url = `${origin}/api${model.targetUrl}?page=${page}`;
-    const res = await get(url);
+    const res = await http.get(url);
 
     if (res.code === 200) {
       return scrapeAll(res.body.data.videos.videos, null, origin);
@@ -203,7 +205,7 @@ async function scrapeProfile(data, origin, withReleases) {
 async function fetchLatest(site, page = 1) {
   const url = `${site.url}/api/videos?page=${page}`;
 
-  const res = await get(url);
+  const res = await http.get(url);
 
   if (res.code === 200) {
     return scrapeAll(res.body.data.videos, site);
@@ -214,7 +216,7 @@ async function fetchUpcoming(site) {
   const apiUrl = `${site.url}/api`;
 
-  const res = await get(apiUrl);
+  const res = await http.get(apiUrl);
 
   if (res.code === 200) {
     return scrapeUpcoming(res.body.data.nextScene, site);
@@ -227,7 +229,7 @@ async function fetchScene(url, site, baseRelease) {
   const { origin, pathname } = new URL(url);
   const apiUrl = `${origin}/api${pathname}`;
 
-  const res = await get(apiUrl);
+  const res = await http.get(apiUrl);
 
   if (res.code === 200) {
     return scrapeScene(res.body.data, url, site, baseRelease);
@@ -240,7 +242,7 @@ async function fetchProfile({ name: actorName }, { site }, include) {
   const origin = site.url;
   const actorSlug = slugify(actorName);
   const url = `${origin}/api/${actorSlug}`;
-  const res = await get(url);
+  const res = await http.get(url);
 
   if (res.code === 200) {
     return scrapeProfile(res.body.data, origin, include.scenes);
diff --git a/src/utils/http-legacy.js b/src/utils/http-legacy.js
new file mode 100644
index 00000000..45925baa
--- /dev/null
+++ b/src/utils/http-legacy.js
@@ -0,0 +1,146 @@
+'use strict';
+
+const util = require('util');
+const stream = require('stream');
+const config = require('config');
+const tunnel = require('tunnel');
+const bhttp = require('@thependulum/bhttp');
+const taskQueue = require('promise-task-queue');
+
+const pipeline = util.promisify(stream.pipeline);
+const logger = require('../logger')(__filename);
+
+const defaultHeaders = {
+  'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
+};
+
+const defaultOptions = {
+  responseTimeout: 30000,
+};
+
+const proxyAgent = tunnel.httpsOverHttp({
+  proxy: {
+    host: config.proxy.host,
+    port: config.proxy.port,
+  },
+});
+
+function useProxy(url) {
+  if (!config.proxy.enable) {
+    return false;
+  }
+
+  const { hostname } = new URL(url);
+
+  return config.proxy.hostnames.includes(hostname);
+}
+
+const queue = taskQueue();
+const defaultQueueMethod = '20p';
+
+async function handler({
+  url,
+  method = 'GET',
+  body,
+  headers = {},
+  options = {},
+}) {
+  if (body) {
+    logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
+  } else {
+    logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
+  }
+
+  const reqOptions = {
+    headers: {
+      ...(options?.defaultHeaders !== false && defaultHeaders),
+      ...headers,
+    },
+    ...defaultOptions,
+    ...options,
+    ...(options?.timeout && { responseTimeout: options?.timeout }),
+  };
+
+  if (useProxy(url)) {
+    reqOptions.agent = proxyAgent;
+  }
+
+  const res = ['POST', 'PUT', 'PATCH'].includes(method.toUpperCase())
+    ? await (options.useSession || bhttp)[method.toLowerCase()](url, body, reqOptions)
+    : await (options.useSession || bhttp)[method.toLowerCase()](url, reqOptions);
+
+  if (options?.stream && options?.destination) {
+    await pipeline(res, ...(options?.transforms || []), options?.destination);
+  }
+
+  const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
+  const json = Buffer.isBuffer(res.body) ? null : res.body;
+
+  return {
+    ...res,
+    originalRes: res,
+    html,
+    json,
+    pipe: res.pipe,
+    ok: res.statusCode >= 200 && res.statusCode <= 299,
+    code: res.statusCode,
+    status: res.statusCode,
+  };
+}
+
+queue.on('concurrencyReached:http', () => {
+  logger.silly('Queueing requests');
+});
+
+queue.define('20p', handler, {
+  concurrency: 20,
+});
+
+queue.define('1s', handler, {
+  interval: 1,
+});
+
+queue.define('5s', handler, {
+  interval: 5,
+});
+
+async function get(url, headers, options) {
+  return queue.push(options?.queueMethod || defaultQueueMethod, {
+    method: 'GET',
+    url,
+    headers,
+    options,
+  });
+}
+
+async function head(url, headers, options) {
+  return queue.push(options?.queueMethod || defaultQueueMethod, {
+    method: 'HEAD',
+    url,
+    headers,
+    options,
+  });
+}
+
+async function post(url, body, headers, options) {
+  return queue.push(options?.queueMethod || defaultQueueMethod, {
+    method: 'POST',
+    url,
+    body,
+    headers,
+    options,
+  });
+}
+
+function session(headers, options) {
+  return bhttp.session({
+    headers,
+    options,
+  });
+}
+
+module.exports = {
+  get,
+  post,
+  head,
+  session,
+};
diff --git a/src/utils/http.js b/src/utils/http.js
index 45925baa..b62a45bf 100644
--- a/src/utils/http.js
+++ b/src/utils/http.js
@@ -1,21 +1,23 @@
 'use strict';
 
+const config = require('config');
+const bhttp = require('bhttp');
 const util = require('util');
 const stream = require('stream');
-const config = require('config');
 const tunnel = require('tunnel');
-const bhttp = require('@thependulum/bhttp');
-const taskQueue = require('promise-task-queue');
+const Bottleneck = require('bottleneck');
+const { JSDOM } = require('jsdom');
 
-const pipeline = util.promisify(stream.pipeline);
 const logger = require('../logger')(__filename);
 
-const defaultHeaders = {
-  'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
-};
+const pipeline = util.promisify(stream.pipeline);
+const limiters = {};
 
 const defaultOptions = {
-  responseTimeout: 30000,
+  encodeJSON: true,
+  headers: {
+    'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
+  },
 };
 
 const proxyAgent = tunnel.httpsOverHttp({
@@ -34,113 +36,114 @@ function useProxy(url) {
   return config.proxy.hostnames.includes(hostname);
 }
 
-const queue = taskQueue();
-const defaultQueueMethod = '20p';
+function getLimiter(limit = {}) {
+  const interval = limit.interval === undefined ? config.limits.default.interval : limit.interval;
+  const concurrency = limit.concurrency === undefined ? config.limits.default.concurrency : limit.concurrency;
 
-async function handler({
-  url,
-  method = 'GET',
-  body,
-  headers = {},
-  options = {},
-}) {
-  if (body) {
-    logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
-  } else {
-    logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
+  if (!limiters[interval]?.[concurrency]) {
+    limiters[interval] = limiters[interval] || {};
+
+    limiters[interval][concurrency] = new Bottleneck({
+      minTime: interval,
+      maxConcurrent: concurrency,
+    });
   }
 
-  const reqOptions = {
-    headers: {
-      ...(options?.defaultHeaders !== false && defaultHeaders),
-      ...headers,
-    },
+  return limiters[interval][concurrency];
+}
+
+async function request(method = 'get', url, body, requestOptions = {}) {
+  const http = requestOptions.session || bhttp;
+
+  const options = {
     ...defaultOptions,
-    ...options,
-    ...(options?.timeout && { responseTimeout: options?.timeout }),
+    ...requestOptions,
+    responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
+    stream: !!requestOptions.destination,
+    interval: requestOptions.interval || config.limits.default.interval,
+    concurrency: requestOptions.concurrency || config.limits.default.concurrency,
+    session: null,
  };
 
   if (useProxy(url)) {
-    reqOptions.agent = proxyAgent;
+    options.agent = proxyAgent;
   }
 
-  const res = ['POST', 'PUT', 'PATCH'].includes(method.toUpperCase())
-    ? await (options.useSession || bhttp)[method.toLowerCase()](url, body, reqOptions)
-    : await (options.useSession || bhttp)[method.toLowerCase()](url, reqOptions);
+  logger.debug(`GET (${options.interval}ms/${options.concurrency}p) ${url}`);
 
-  if (options?.stream && options?.destination) {
-    await pipeline(res, ...(options?.transforms || []), options?.destination);
+  const res = await (body
+    ? http[method](url, body, options)
+    : http[method](url, options));
+
+  const resIsOk = res.statusCode >= 200 && res.statusCode <= 299;
+
+  if (options.destination) {
+    // res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
+
+    await pipeline(res, ...(options.transforms || []), options.destination);
   }
 
-  const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
-  const json = Buffer.isBuffer(res.body) ? null : res.body;
+  if (Buffer.isBuffer(res.body)) {
+    const html = res.body.toString();
+    const window = new JSDOM(html).window;
+
+    return {
+      ...res,
+      body: html,
+      html,
+      status: res.statusCode,
+      document: window.document,
+      window,
+      ok: resIsOk,
+    };
+  }
 
   return {
     ...res,
-    originalRes: res,
-    html,
-    json,
-    pipe: res.pipe,
-    ok: res.statusCode >= 200 && res.statusCode <= 299,
-    code: res.statusCode,
+    body: res.body,
     status: res.statusCode,
+    ok: res.statusCode >= 200 && res.statusCode <= 299,
   };
 }
 
-queue.on('concurrencyReached:http', () => {
-  logger.silly('Queueing requests');
-});
-
-queue.define('20p', handler, {
-  concurrency: 20,
-});
-
-queue.define('1s', handler, {
-  interval: 1,
-});
-
-queue.define('5s', handler, {
-  interval: 5,
-});
-
-async function get(url, headers, options) {
-  return queue.push(options?.queueMethod || defaultQueueMethod, {
-    method: 'GET',
-    url,
-    headers,
-    options,
-  });
+async function scheduleRequest(method = 'get', url, body, options) {
+  return getLimiter(options || {}).schedule(() => request(method, url, body, options));
 }
 
-async function head(url, headers, options) {
-  return queue.push(options?.queueMethod || defaultQueueMethod, {
-    method: 'HEAD',
-    url,
-    headers,
-    options,
-  });
+async function get(url, options) {
+  return scheduleRequest('get', url, null, options);
 }
 
-async function post(url, body, headers, options) {
-  return queue.push(options?.queueMethod || defaultQueueMethod, {
-    method: 'POST',
-    url,
-    body,
-    headers,
-    options,
-  });
+async function post(url, body, options) {
+  return scheduleRequest('post', url, body, options);
 }
 
-function session(headers, options) {
-  return bhttp.session({
-    headers,
-    options,
-  });
+async function put(url, body, options) {
+  return scheduleRequest('put', url, body, options);
+}
+
+async function patch(url, body, options) {
+  return scheduleRequest('patch', url, body, options);
+}
+
+async function del(url, options) {
+  return scheduleRequest('delete', url, null, options);
+}
+
+async function head(url, options) {
+  return scheduleRequest('head', url, null, options);
+}
+
+function getSession(options) {
+  return bhttp.session(options);
 }
 
 module.exports = {
   get,
-  post,
   head,
-  session,
+  post,
+  delete: del,
+  put,
+  patch,
+  session: getSession,
 };
diff --git a/src/utils/qu.js b/src/utils/qu.js
index 6169d947..b90d9c02 100644
--- a/src/utils/qu.js
+++ b/src/utils/qu.js
@@ -457,8 +457,8 @@ function extractAll(htmlValue, selector) {
 async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) {
   const res = await (method === 'post'
-    ? http.post(urlValue, body, headers, options)
-    : http[method](urlValue, headers, options));
+    ? http.post(urlValue, body, { ...options, headers })
+    : http[method](urlValue, { ...options, headers }));
 
   if (res.ok) {
     const item = queryAll
@@ -494,7 +494,7 @@ async function post(urlValue, body, selector, headers, options) {
 }
 
 async function getAll(urlValue, selector, headers, options) {
-  return request('get,', urlValue, selector, headers, options, true);
+  return request('get', urlValue, null, selector, headers, options, true);
 }
 
 async function postAll(urlValue, body, selector, headers, options) {