From 0633197793dfa2c02f6de1b739bd8751da5f46c6 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Mon, 23 Nov 2020 00:05:02 +0100 Subject: [PATCH] Removed direct bhttp usage from scrapers in favor of local http module. Deleted legacy scrapers, as old code is available via git repo history. --- src/scrapers/bangbros.js | 8 +- src/scrapers/blowpass.js | 2 - src/scrapers/boobpedia.js | 5 +- src/scrapers/ddfnetwork.js | 9 +- src/scrapers/dogfart.js | 9 +- src/scrapers/freeones.js | 9 +- src/scrapers/freeones_legacy.js | 140 ------------------------ src/scrapers/julesjordan.js | 5 +- src/scrapers/mikeadriano.js | 7 +- src/scrapers/mindgeek.js | 23 ++-- src/scrapers/perfectgonzo.js | 6 +- src/scrapers/pervcity_legacy.js | 144 ------------------------- src/scrapers/pornhub.js | 9 +- src/scrapers/private.js | 10 +- src/scrapers/realitykings.js | 5 +- src/scrapers/score.js | 13 ++- src/scrapers/teamskeet_legacy.js | 180 ------------------------------- src/scrapers/vivid.js | 7 +- src/scrapers/vogov.js | 8 +- src/scrapers/whalemember.js | 7 +- src/scrapers/xempire.js | 5 +- src/utils/http.js | 3 +- 22 files changed, 77 insertions(+), 537 deletions(-) delete mode 100644 src/scrapers/freeones_legacy.js delete mode 100644 src/scrapers/pervcity_legacy.js delete mode 100644 src/scrapers/teamskeet_legacy.js diff --git a/src/scrapers/bangbros.js b/src/scrapers/bangbros.js index 6a1e186a..dc4069c2 100644 --- a/src/scrapers/bangbros.js +++ b/src/scrapers/bangbros.js @@ -1,12 +1,12 @@ 'use strict'; /* eslint-disable newline-per-chained-call */ -const bhttp = require('@thependulum/bhttp'); const cheerio = require('cheerio'); const moment = require('moment'); const logger = require('../logger')(__filename); const slugify = require('../utils/slugify'); +const http = require('../utils/http'); const { get, getAll, ex } = require('../utils/q'); function scrape(html, site) { @@ -192,7 +192,7 @@ async function fetchLatest(site, page = 1) { /* async function fetchUpcoming(site) { - const res = await bhttp.get('https://www.bangbros.com'); + const res = await http.get('https://www.bangbros.com'); return scrapeUpcoming(res.body.toString(), site); } @@ -224,13 +224,13 @@ async function fetchScene(url, site, release) { async function fetchProfile({ name: actorName }, scope) { const actorSlug = slugify(actorName); const url = `https://bangbros.com/search/${actorSlug}`; - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { const actorUrl = scrapeProfileSearch(res.body.toString(), actorName); if (actorUrl) { - const actorRes = await bhttp.get(actorUrl); + const actorRes = await http.get(actorUrl); if (actorRes.statusCode === 200) { return scrapeProfile(actorRes.body.toString(), scope); diff --git a/src/scrapers/blowpass.js b/src/scrapers/blowpass.js index 7d5c2ca2..ded28901 100644 --- a/src/scrapers/blowpass.js +++ b/src/scrapers/blowpass.js @@ -1,7 +1,5 @@ 'use strict'; -// const bhttp = require('bhttp'); - const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamma'); async function fetchSceneWrapper(url, site, baseRelease) { diff --git a/src/scrapers/boobpedia.js b/src/scrapers/boobpedia.js index f942bae4..32e48cdd 100644 --- a/src/scrapers/boobpedia.js +++ b/src/scrapers/boobpedia.js @@ -1,8 +1,7 @@ 'use strict'; -const bhttp = require('bhttp'); - const { ex } = require('../utils/q'); +const http = require('../utils/http'); function scrapeProfile(html) { const { qu } = ex(html); /* eslint-disable-line object-curly-newline */ @@ -80,7 +79,7 @@ 
function scrapeProfile(html) { async function fetchProfile({ name: actorName }) { const actorSlug = actorName.replace(/\s+/, '_'); - const res = await bhttp.get(`http://www.boobpedia.com/boobs/${actorSlug}`); + const res = await http.get(`http://www.boobpedia.com/boobs/${actorSlug}`); if (res.statusCode === 200) { return scrapeProfile(res.body.toString()); diff --git a/src/scrapers/ddfnetwork.js b/src/scrapers/ddfnetwork.js index c5b07759..54dcd7c5 100644 --- a/src/scrapers/ddfnetwork.js +++ b/src/scrapers/ddfnetwork.js @@ -1,9 +1,8 @@ 'use strict'; -const bhttp = require('bhttp'); - const qu = require('../utils/qu'); const slugify = require('../utils/slugify'); +const http = require('../utils/http'); function scrapeAll(scenes, site, origin) { return scenes.map(({ query }) => { @@ -150,14 +149,14 @@ async function fetchLatest(channel, page = 1) { async function fetchScene(url, site) { // DDF's main site moved to Porn World - // const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`); + // const res = await http.get(`https://ddfnetwork.com${new URL(url).pathname}`); const res = await qu.get(url, '.content, #content, .taspVideoPage'); return res.ok ? scrapeScene(res.item, url, site) : res.status; } async function fetchProfile({ name: actorName }) { - const resSearch = await bhttp.post('https://ddfnetwork.com/search/ajax', + const resSearch = await http.post('https://ddfnetwork.com/search/ajax', { type: 'hints', word: actorName, @@ -180,7 +179,7 @@ async function fetchProfile({ name: actorName }) { const [actor] = resSearch.body.list.pornstarsName; const url = `https://ddfnetwork.com${actor.href}`; - const resActor = await bhttp.get(url); + const resActor = await http.get(url); if (resActor.statusCode !== 200) { return null; diff --git a/src/scrapers/dogfart.js b/src/scrapers/dogfart.js index 0334d104..56d55a3b 100644 --- a/src/scrapers/dogfart.js +++ b/src/scrapers/dogfart.js @@ -2,12 +2,13 @@ /* eslint-disable newline-per-chained-call */ // const Promise = require('bluebird'); -const bhttp = require('@thependulum/bhttp'); const { JSDOM } = require('jsdom'); const moment = require('moment'); +const http = require('../utils/http'); + async function getPhotos(albumUrl) { - const res = await bhttp.get(albumUrl); + const res = await http.get(albumUrl); const html = res.body.toString(); const { document } = new JSDOM(html).window; @@ -125,13 +126,13 @@ async function scrapeScene(html, url, site) { } async function fetchLatest(site, page = 1) { - const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`); + const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`); return scrapeLatest(res.body.toString(), site); } async function fetchScene(url, site) { - const res = await bhttp.get(url); + const res = await http.get(url); return scrapeScene(res.body.toString(), url, site); } diff --git a/src/scrapers/freeones.js b/src/scrapers/freeones.js index c84dfe71..a40f1d67 100644 --- a/src/scrapers/freeones.js +++ b/src/scrapers/freeones.js @@ -1,9 +1,10 @@ 'use strict'; -const bhttp = require('bhttp'); const { JSDOM } = require('jsdom'); const moment = require('moment'); +const http = require('../utils/http'); + function scrapeProfile(html, actorName) { const { document } = new JSDOM(html).window; const profile = { name: actorName }; @@ -68,17 +69,17 @@ function scrapeSearch(html) { async function fetchProfile({ name: actorName }) { const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-'); - const res = await 
bhttp.get(`https://freeones.nl/${actorSlug}/profile`); + const res = await http.get(`https://freeones.nl/${actorSlug}/profile`); if (res.statusCode === 200) { return scrapeProfile(res.body.toString(), actorName); } - const searchRes = await bhttp.get(`https://freeones.nl/babes?q=${actorName}`); + const searchRes = await http.get(`https://freeones.nl/babes?q=${actorName}`); const actorPath = scrapeSearch(searchRes.body.toString()); if (actorPath) { - const actorRes = await bhttp.get(`https://freeones.nl${actorPath}/profile`); + const actorRes = await http.get(`https://freeones.nl${actorPath}/profile`); if (actorRes.statusCode === 200) { return scrapeProfile(actorRes.body.toString(), actorName); diff --git a/src/scrapers/freeones_legacy.js b/src/scrapers/freeones_legacy.js deleted file mode 100644 index f4c4f5f5..00000000 --- a/src/scrapers/freeones_legacy.js +++ /dev/null @@ -1,140 +0,0 @@ -'use strict'; - -/* eslint-disable newline-per-chained-call */ -const bhttp = require('bhttp'); -const { JSDOM } = require('jsdom'); -const moment = require('moment'); - -async function scrapeProfileFrontpage(html, url, name) { - const { document } = new JSDOM(html).window; - const bioEl = document.querySelector('.dashboard-bio-list'); - - const bioUrl = `https:${document.querySelector('.seemore a').href}`; - - const keys = Array.from(bioEl.querySelectorAll('dt'), el => el.textContent.trim()); - const values = Array.from(bioEl.querySelectorAll('dd'), el => el.textContent.trim()); - - const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {}); - - const profile = { - name, - gender: 'female', - }; - - const birthdateString = bio['Date of Birth:']; - const measurementsString = bio['Measurements:']; - - const birthCityString = bio['Place of Birth:']; - const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString; - - const birthCountryString = bio['Country of Origin:']; - const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString; - - const piercingsString = bio['Piercings:']; - const tattoosString = bio['Tattoos:']; - - if (birthdateString && birthdateString !== 'Unknown (add)') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate(); - if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement)); - - if (bio['Fake Boobs:']) profile.naturalBoobs = bio['Fake Boobs:'] === 'No'; - profile.birthPlace = `${birthCity || ''}${birthCity ? 
', ' : ''}${birthCountry || ''}`; - - profile.hair = bio['Hair Color:'].toLowerCase(); - profile.eyes = bio['Eye Color:'].toLowerCase(); - - if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None'); - if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None'); - - if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString; - if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString; - - profile.social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href); - - return { - profile, - url: bioUrl, - }; -} - -async function scrapeProfileBio(html, frontpageProfile, url, name) { - const { document } = new JSDOM(html).window; - const bioEl = document.querySelector('#biographyTable'); - - const keys = Array.from(bioEl.querySelectorAll('td:nth-child(1)'), el => el.textContent.trim()); - const values = Array.from(bioEl.querySelectorAll('td:nth-child(2)'), el => el.textContent.trim()); - - const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {}); - - const profile = { - ...frontpageProfile, - name, - gender: 'female', - }; - - const birthdateString = bio['Date of Birth:']; - const measurementsString = bio['Measurements:']; - - const birthCityString = bio['Place of Birth:']; - const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString; - - const birthCountryString = bio['Country of Origin:']; - const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString; - - const piercingsString = bio['Piercings:']; - const tattoosString = bio['Tattoos:']; - - if (birthdateString && birthdateString !== 'Unknown') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate(); - if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement)); - - if (bio['Fake boobs']) profile.naturalBoobs = bio['Fake boobs:'] === 'No'; - profile.ethnicity = bio['Ethnicity:']; - - profile.birthPlace = `${birthCity || ''}${birthCity ? 
', ' : ''}${birthCountry || ''}`; - - profile.hair = bio['Hair Color:'].toLowerCase(); - profile.eyes = bio['Eye Color:'].toLowerCase(); - profile.height = Number(bio['Height:'].match(/\d+/)[0]); - profile.weight = Number(bio['Weight:'].match(/\d+/)[0]); - - if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None'); - if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None'); - - if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString; - if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString; - - profile.social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href); - - return profile; -} - -async function fetchProfile({ name: actorName }) { - const slug = actorName.replace(' ', '_'); - const frontpageUrl = `https://www.freeones.com/html/v_links/${slug}`; - - const resFrontpage = await bhttp.get(frontpageUrl); - - if (resFrontpage.statusCode === 200) { - const { url, bio } = await scrapeProfileFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName); - const resBio = await bhttp.get(url); - - return scrapeProfileBio(resBio.body.toString(), bio, url, actorName); - } - - // apparently some actors are appended 'Babe' as their surname... - const fallbackSlug = `${slug}_Babe`; - const fallbackUrl = `https://www.freeones.com/html/s_links/${fallbackSlug}`; - const resFallback = await bhttp.get(fallbackUrl); - - if (resFallback.statusCode === 200) { - const { url, profile } = await scrapeProfileFrontpage(resFallback.body.toString(), fallbackUrl, actorName); - const resBio = await bhttp.get(url); - - return scrapeProfileBio(resBio.body.toString(), profile, url, actorName); - } - - return null; -} - -module.exports = { - fetchProfile, -}; diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index f8b9c50e..a16cbe26 100644 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -2,7 +2,6 @@ const util = require('util'); const Promise = require('bluebird'); -const bhttp = require('@thependulum/bhttp'); const cheerio = require('cheerio'); const { JSDOM } = require('jsdom'); const moment = require('moment'); @@ -13,7 +12,7 @@ const { heightToCm } = require('../utils/convert'); const slugify = require('../utils/slugify'); async function fetchPhotos(url) { - const res = await bhttp.get(url); + const res = await http.get(url); return res.body.toString(); } @@ -369,7 +368,7 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle = ? util.format(site.parameters.latest, page) : `${site.url}/trial/categories/movies_${page}_d.html`; - // const res = await bhttp.get(url); + // const res = await http.get(url); const res = await qu.getAll(url, '.update_details'); return res.ok ? 
scrapeAll(res.items, site, entryIdFromTitle) : res.status; diff --git a/src/scrapers/mikeadriano.js b/src/scrapers/mikeadriano.js index a5abe4f6..849b4ef2 100644 --- a/src/scrapers/mikeadriano.js +++ b/src/scrapers/mikeadriano.js @@ -87,12 +87,13 @@ async function fetchScene(url, channel) { /* API protected async function fetchProfile({ name: actorName }, context , site) { - const session = bhttp.session(); + const session = http.session(); - await session.get(`https://tour.${site.slug}.com`); + await http.get(`https://tour.${site.slug}.com`, { session }); const url = `https://tour.${site.slug}.com/search-preview`; - const res = await session.post(url, { q: actorName }, { + const res = await http.post(url, { q: actorName }, { + session, headers: { 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36', origin: `https://tour.${site.slug}.com`, diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js index 10ea4857..6beba0d6 100644 --- a/src/scrapers/mindgeek.js +++ b/src/scrapers/mindgeek.js @@ -2,12 +2,12 @@ /* eslint-disable newline-per-chained-call */ const Promise = require('bluebird'); -const bhttp = require('@thependulum/bhttp'); const { CookieJar } = Promise.promisifyAll(require('tough-cookie')); const moment = require('moment'); const qu = require('../utils/qu'); const slugify = require('../utils/slugify'); +const http = require('../utils/http'); const { inchesToCm, lbsToKg } = require('../utils/convert'); const { cookieToData } = require('../utils/cookies'); @@ -145,14 +145,14 @@ function getUrl(site) { async function getSession(site) { const cookieJar = new CookieJar(); - const session = bhttp.session({ cookieJar }); + const session = http.session({ cookieJar }); // const res = await session.get(url); const sessionUrl = site.parameters?.siteId && !(site.parameters?.childSession || site.parent?.parameters?.childSession) ? site.parent.url : site.url; - const res = await session.get(sessionUrl); + const res = await http.get(sessionUrl, { session }); if (res.statusCode === 200) { const cookieString = await cookieJar.getCookieStringAsync(sessionUrl); @@ -215,7 +215,8 @@ async function fetchLatest(site, page = 1) { ? 
`https://site-api.project1service.com/v2/releases?dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene` : `https://site-api.project1service.com/v2/releases?collectionId=${siteId}&dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`; - const res = await session.get(apiUrl, { + const res = await http.get(apiUrl, { + session, headers: { Instance: instanceToken, Origin: site.url, @@ -236,7 +237,8 @@ async function fetchUpcoming(site) { const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases'; - const res = await session.get(apiUrl, { + const res = await http.get(apiUrl, { + session, headers: { Instance: instanceToken, Origin: site.url, @@ -260,7 +262,8 @@ async function fetchScene(url, site, baseScene) { const entryId = url.match(/\d+/)[0]; const { session, instanceToken } = await getSession(site); - const res = await session.get(`https://site-api.project1service.com/v2/releases/${entryId}`, { + const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, { + session, headers: { Instance: instanceToken, }, @@ -277,7 +280,8 @@ async function fetchProfile({ name: actorName }, networkOrNetworkSlug, actorPath // const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`; const { session, instanceToken } = await getSession(networkOrNetworkSlug); - const res = await session.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, { + const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, { + session, headers: { Instance: instanceToken, }, @@ -291,8 +295,9 @@ async function fetchProfile({ name: actorName }, networkOrNetworkSlug, actorPath const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`; const [actorRes, actorReleasesRes] = await Promise.all([ - bhttp.get(actorUrl), - session.get(actorReleasesUrl, { + http.get(actorUrl), + http.get(actorReleasesUrl, { + session, headers: { Instance: instanceToken, }, diff --git a/src/scrapers/perfectgonzo.js b/src/scrapers/perfectgonzo.js index 7247416e..e77a3bb0 100644 --- a/src/scrapers/perfectgonzo.js +++ b/src/scrapers/perfectgonzo.js @@ -1,10 +1,10 @@ 'use strict'; -const bhttp = require('bhttp'); const blake2 = require('blake2'); const knex = require('../knex'); const { ex, ctxa } = require('../utils/q'); +const http = require('../utils/http'); async function getSiteSlugs() { return knex('sites') @@ -124,7 +124,7 @@ async function scrapeScene(html, site, url, metaSiteSlugs) { async function fetchLatest(site, page = 1) { const url = `${site.url}/movies/page-${page}`; - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeLatest(res.body.toString(), site); @@ -134,7 +134,7 @@ async function fetchLatest(site, page = 1) { } async function fetchScene(url, site, release) { - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeScene(res.body.toString(), site, url, release?.meta.siteSlugs); diff --git a/src/scrapers/pervcity_legacy.js b/src/scrapers/pervcity_legacy.js deleted file mode 100644 index ff39ba70..00000000 --- a/src/scrapers/pervcity_legacy.js +++ /dev/null @@ -1,144 +0,0 @@ -'use strict'; - -const bhttp = require('bhttp'); -const cheerio = require('cheerio'); -const { JSDOM } = 
require('jsdom'); -const moment = require('moment'); - -async function getTrailer(entryId) { - const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', { - setId: entryId, - }); - - if (trailerRes.statusCode === 200) { - return { - poster: trailerRes.body.TrailerImg, - trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback, - }; - } - - return null; -} - -function scrapeLatestScene(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - - const entryId = $('li').attr('id'); - const sceneLinkElement = $('#scene_title_border a'); - const url = `${site.url}/${sceneLinkElement.attr('href')}`; - const title = sceneLinkElement.attr('title').replace(/\u00E2\u0080\u0099/g, '\''); // replace weird apostrophes - - const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas - const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate(); - - const poster = $('a:nth-child(2) > img').attr('src'); - const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray(); - - const stars = $('img[src*="/star.png"]') - .toArray() - .map(element => $(element).attr('src')) - .length || 0; - - return { - url, - entryId, - title, - actors, - date, - poster, - photos, - rating: { - stars, - }, - site, - }; -} - -async function scrapeScene(html, url, site) { - const { document } = new JSDOM(html).window; - - const release = { url, site }; - - release.entryId = document.querySelector('input#set_ID').value; - - release.title = document.querySelector('title').textContent; - release.description = document.querySelector('.player_data').textContent.trim(); - - const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent; - const [minutes, seconds] = durationString.match(/\d+/g); - - release.duration = Number(minutes) * 60 + Number(seconds); - release.tags = document.querySelector('meta[name="keywords"]').content.split(','); - - const { poster, trailer } = await getTrailer(release.entryId); - - release.poster = poster; - release.trailer = { src: trailer }; - - return release; -} - -function scrapeFallbackLanding(html) { - const { document } = new JSDOM(html).window; - - return document.querySelector('input#set_ID').value; -} - -async function scrapeFallbackScene(html, entryId, url, site) { - const { document } = new JSDOM(html).window; - const release = { url, entryId, site }; - - release.title = document.querySelector('.popup_data_set_head label').textContent; - release.description = document.querySelector('.popup_data_set_des p').textContent.trim(); - release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate(); - release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent); - - const { poster, trailer } = await getTrailer(release.entryId); - - release.poster = poster; - release.trailer = { src: trailer }; - - release.channel = document.querySelector('.popup_left_top div img').alt; - - return release; -} - -async function fetchLatest(channel, page = 1) { - const url = `${channel.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`; - const pagedUrl = `${channel.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 
1}&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`; - - const res = page === 1 - ? await bhttp.get(url) - : await bhttp.get(pagedUrl); - - const elements = JSON.parse(res.body.toString()); - - const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, channel)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php - - return latest; -} - -async function fetchScene(url, site) { - const res = await bhttp.get(url); - - if (res.statusCode === 200) { - if (site.isNetwork) { - const entryId = scrapeFallbackLanding(res.body.toString(), url); - - const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', { - setId: entryId, - }); - - return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site); - } - - return scrapeScene(res.body.toString(), url, site); - } - - return null; -} - -module.exports = { - fetchLatest, - fetchScene, -}; diff --git a/src/scrapers/pornhub.js b/src/scrapers/pornhub.js index 1367220c..5f08df43 100644 --- a/src/scrapers/pornhub.js +++ b/src/scrapers/pornhub.js @@ -1,9 +1,10 @@ 'use strict'; -const bhttp = require('@thependulum/bhttp'); const { JSDOM } = require('jsdom'); const moment = require('moment'); +const http = require('../utils/http'); + const ethnicityMap = { White: 'Caucasian', }; @@ -59,8 +60,8 @@ async function fetchProfile({ name: actorName }) { const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`; const [modelRes, pornstarRes] = await Promise.all([ - bhttp.get(modelUrl), - bhttp.get(pornstarUrl), + http.get(modelUrl), + http.get(pornstarUrl), ]); const model = modelRes.statusCode === 200 && await scrapeProfile(modelRes.body.toString(), modelUrl, actorName); @@ -75,7 +76,7 @@ async function fetchProfile({ name: actorName }) { */ const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`; - const pornstarRes = await bhttp.get(pornstarUrl); + const pornstarRes = await http.get(pornstarUrl); return scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName); } diff --git a/src/scrapers/private.js b/src/scrapers/private.js index 9254d22c..6ac5988a 100644 --- a/src/scrapers/private.js +++ b/src/scrapers/private.js @@ -1,17 +1,17 @@ 'use strict'; /* eslint-disable newline-per-chained-call */ -const bhttp = require('@thependulum/bhttp'); const cheerio = require('cheerio'); const moment = require('moment'); const { get, geta } = require('../utils/q'); const slugify = require('../utils/slugify'); +const http = require('../utils/http'); async function getPhotos(entryId, site) { const { hostname } = new URL(site.url); - const res = await bhttp.get(`https://${hostname}/gallery.php?type=highres&id=${entryId}`); + const res = await http.get(`https://${hostname}/gallery.php?type=highres&id=${entryId}`); const html = res.body.toString(); const $ = cheerio.load(html, { normalizeWhitespace: true }); @@ -159,18 +159,18 @@ async function fetchLatest(site, page = 1) { const { hostname } = new URL(site.url); if (hostname.match('private.com')) { - const res = await bhttp.get(`${site.url}/${page}/`); + const res = await http.get(`${site.url}/${page}/`); return scrapeLatest(res.body.toString(), site); } - const res = await bhttp.get(`${site.url}/scenes/${page}/`); + const res = await http.get(`${site.url}/scenes/${page}/`); return scrapeLatest(res.body.toString(), site); } async function fetchScene(url, site) { - const res = await bhttp.get(url); + const res = await http.get(url); return scrapeScene(res.body.toString(), url, site); } diff --git 
a/src/scrapers/realitykings.js b/src/scrapers/realitykings.js index db4bfeed..cf92af08 100644 --- a/src/scrapers/realitykings.js +++ b/src/scrapers/realitykings.js @@ -1,8 +1,9 @@ 'use strict'; -const bhttp = require('@thependulum/bhttp'); const cheerio = require('cheerio'); +const http = require('../utils/http'); + const { scrapeLatestX, fetchLatest, @@ -24,7 +25,7 @@ function scrapeLatestClassic(html, site) { } async function fetchClassic(site, page) { - const res = await bhttp.get(`${site.url}/scenes?page=${page}`); + const res = await http.get(`${site.url}/scenes?page=${page}`); if (res.statusCode === 200) { return scrapeLatestClassic(res.body.toString(), site); diff --git a/src/scrapers/score.js b/src/scrapers/score.js index ba1967bc..95a5469c 100644 --- a/src/scrapers/score.js +++ b/src/scrapers/score.js @@ -1,9 +1,8 @@ 'use strict'; -const bhttp = require('bhttp'); - const { ex, exa, get } = require('../utils/q'); const slugify = require('../utils/slugify'); +const http = require('../utils/http'); const { heightToCm, lbsToKg } = require('../utils/convert'); function scrapePhotos(html) { @@ -19,7 +18,7 @@ function scrapePhotos(html) { } async function fetchPhotos(url) { - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapePhotos(res.body.toString(), url); @@ -198,7 +197,7 @@ async function scrapeProfile(html, actorUrl, withReleases) { async function fetchLatest(site, page = 1) { const latestPath = site.parameters?.path || '/big-boob-videos'; const url = `${site.url}${latestPath}?page=${page}`; - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeAll(res.body.toString(), site); @@ -208,7 +207,7 @@ async function fetchLatest(site, page = 1) { } async function fetchScene(url, site) { - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeScene(res.body.toString(), url, site); @@ -227,7 +226,7 @@ async function fetchProfile({ name: actorName }, context, include, page = 1, sou const url = sources[source]; - const res = await bhttp.get(url, { + const res = await http.get(url, { followRedirects: false, }); @@ -235,7 +234,7 @@ async function fetchProfile({ name: actorName }, context, include, page = 1, sou const actorUrl = scrapeModels(res.body.toString(), actorName); if (actorUrl) { - const actorRes = await bhttp.get(actorUrl); + const actorRes = await http.get(actorUrl); if (actorRes.statusCode === 200) { return scrapeProfile(actorRes.body.toString(), actorUrl, include.scenes); diff --git a/src/scrapers/teamskeet_legacy.js b/src/scrapers/teamskeet_legacy.js deleted file mode 100644 index 3aefc7da..00000000 --- a/src/scrapers/teamskeet_legacy.js +++ /dev/null @@ -1,180 +0,0 @@ -'use strict'; - -const bhttp = require('bhttp'); -const { JSDOM } = require('jsdom'); -const moment = require('moment'); - -function extractTitle(pathname) { - return pathname - .split('/') - .slice(-2)[0] - .split('_') - .map(seg => `${seg.charAt(0).toUpperCase()}${seg.slice(1)}`) - .join(' '); -} - -function extractActors(str) { - return str - .split(/,|\band\b/ig) - .filter(actor => !/\.{3}/.test(actor)) - .map(actor => actor.trim()) - .filter(actor => actor.length > 0); -} - -function scrapeLatest(html, site) { - const { document } = new JSDOM(html).window; - - const scenes = Array.from(document.querySelectorAll('#updatesList li.grey, #updatesList li.white')); - - return scenes.map((scene) => { - const release = { site }; - - const 
link = scene.querySelector('.info a'); - const poster = scene.querySelector('img'); - const { pathname } = new URL(link); - - [release.entryId] = poster.id.match(/\d+/); - - release.url = `https://www.teamskeet.com${pathname}`; - release.title = extractTitle(pathname); - - release.date = moment.utc(scene.querySelector('strong').textContent, 'MM/DD/YYYY').toDate(); - - const photos = Array.from({ length: 5 }, (_value, index) => poster.dataset.original.replace(/\d+.jpg/, `${String(index + 1).padStart(2, '0')}.jpg`)); - [release.poster] = photos; - release.photos = photos.slice(1); - - const actors = scene.querySelector('div span[rel="test"]').textContent; - release.actors = extractActors(actors); - - return release; - }); -} - -function scrapeScene(html, site, url) { - const { document } = new JSDOM(html).window; - const release = { site }; - - release.entryId = document.querySelector('#story-and-tags .scene_rater').attributes.rel.value; - release.description = document.querySelector('#story-and-tags td:nth-child(2) div').textContent; - const [actors, title, channel] = document.querySelector('title').textContent.split('|').map(item => item.trim()); - - release.url = url; - release.title = title; - release.actors = extractActors(actors); - release.channel = channel.toLowerCase(); - release.tags = Array.from(document.querySelectorAll('#story-and-tags tr:nth-child(2) a'), el => el.rel); - - const date = document.querySelector('h3 ~ div:nth-child(4), h3 ~ div div.gray:not(.scene_rater)').textContent.split(':')[1].trim(); - release.date = moment.utc(date, 'MMMM Do, YYYY').toDate(); - - const { poster } = document.querySelector('video'); - if (poster && !/gen/.test(poster)) release.poster = [poster.replace('low', 'hi'), poster]; - - const siteId = document.querySelector('#story-and-tags img').src.match(/\w+.jpg/)[0].replace('.jpg', ''); - const actorsSlug = document.querySelector('h3 a').href.split('/').slice(-2)[0]; - - release.photos = Array.from({ length: 5 }, (value, index) => `https://images.psmcdn.net/teamskeet/${siteId}/${actorsSlug}/shared/scenes/new/${String(index + 1).padStart(2, '0')}.jpg`); - - const trailer = document.querySelector('div.right.gray a').href; - if (trailer) release.trailer = { src: trailer }; - - return release; -} - -function scrapeSceneA(html, site, sceneX, url) { - const scene = sceneX || new JSDOM(html).window.document; - const release = { site }; - - release.description = scene.querySelector('.scene-story').textContent.replace('...read more', '...').trim(); - - release.date = moment.utc(scene.querySelector('.scene-date').textContent, 'MM/DD/YYYY').toDate(); - release.actors = Array.from(scene.querySelectorAll('.starring span'), el => extractActors(el.textContent)).flat(); - - const durationString = scene.querySelector('.time').textContent.trim(); - const duration = ['00'].concat(durationString.split(':')).slice(-3).join(':'); // ensure hh:mm:ss - release.duration = moment.duration(duration).asSeconds(); - - if (sceneX) { - const titleEl = scene.querySelector(':scope > a'); - - release.url = titleEl.href; - release.entryId = titleEl.id; - release.title = titleEl.title; - - const [poster, ...photos] = Array.from(scene.querySelectorAll('.scene img'), el => el.src); - release.poster = [poster.replace('bio_big', 'video'), poster]; - release.photos = photos; - } - - if (!sceneX) { - release.title = scene.querySelector('.title span').textContent; - release.url = url; - - release.poster = scene.querySelector('video').poster; - release.photos = 
[release.poster.replace('video', 'bio_small'), release.poster.replace('video', 'bio_small2')]; - } - - const [, entryIdA, entryIdB] = new URL(release.url).pathname.split('/'); - release.entryId = entryIdA === 'scenes' ? entryIdB : entryIdA; - - return release; -} - -function scrapeLatestA(html, site) { - const { document } = new JSDOM(html).window; - - const scenes = Array.from(document.querySelectorAll('.scenewrapper')); - - return scenes.map(scene => scrapeSceneA(null, site, scene)); -} - -async function fetchLatestTeamSkeet(site, page = 1) { - const url = `https://www.teamskeet.com/t1/updates/load?fltrs[site]=${site.parameters.id}&page=${page}&view=newest&fltrs[time]=ALL&order=DESC`; - const res = await bhttp.get(url); - - if (res.statusCode === 200) { - return scrapeLatest(res.body.toString(), site); - } - - return null; -} - -async function fetchLatestA(site) { - const url = `${site.url}/scenes`; - const res = await bhttp.get(url); - - if (res.statusCode === 200) { - return scrapeLatestA(res.body.toString(), site); - } - - return null; -} - -async function fetchLatest(site, page = 1) { - if (site.parameters.id) { - return fetchLatestTeamSkeet(site, page); - } - - if (site.parameters.scraper === 'A') { - return fetchLatestA(site, page); - } - - return null; -} - -async function fetchScene(url, site) { - const session = bhttp.session(); // resolve redirects - const res = await session.get(url); - - if (site.parameters?.scraper === 'A') { - return scrapeSceneA(res.body.toString(), site, null, url); - } - - return scrapeScene(res.body.toString(), site, url); -} - -module.exports = { - fetchLatest, - fetchScene, -}; diff --git a/src/scrapers/vivid.js b/src/scrapers/vivid.js index 427a5b4f..e49d7a87 100644 --- a/src/scrapers/vivid.js +++ b/src/scrapers/vivid.js @@ -1,10 +1,9 @@ 'use strict'; /* eslint-disable no-unused-vars */ -const bhttp = require('@thependulum/bhttp'); - const { get, ed } = require('../utils/q'); const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma'); +const http = require('../utils/http'); const slugify = require('../utils/slugify'); function scrapeLatestNative(scenes, site) { @@ -72,7 +71,7 @@ async function fetchLatestNative(site, page = 1) { } const apiUrl = `${site.url}/videos/api/?limit=50&offset=${(page - 1) * 50}&sort=datedesc`; - const res = await bhttp.get(apiUrl, { + const res = await http.get(apiUrl, { decodeJSON: true, }); @@ -107,7 +106,7 @@ async function fetchSceneWrapper(url, site, release) { if (scene.date - new Date(site.parameters?.lastNative) <= 0) { // scene is probably still available on Vivid site, use search API to get URL and original date const searchUrl = `${site.url}/videos/api/?limit=10&sort=datedesc&search=${encodeURI(scene.title)}`; - const searchRes = await bhttp.get(searchUrl, { + const searchRes = await http.get(searchUrl, { decodeJSON: true, }); diff --git a/src/scrapers/vogov.js b/src/scrapers/vogov.js index ce8006a0..51e73dd2 100644 --- a/src/scrapers/vogov.js +++ b/src/scrapers/vogov.js @@ -1,8 +1,8 @@ 'use strict'; -const bhttp = require('bhttp'); -const { ex, ctxa } = require('../utils/q'); // const slugify = require('../utils/slugify'); +const { ex, ctxa } = require('../utils/q'); +const http = require('../utils/http'); function getLicenseCode(html) { const licensePrefix = 'license_code: \''; @@ -178,7 +178,7 @@ function scrapeScene(html, url) { async function fetchLatest(site, page = 1) { const url = `https://vogov.com/latest-videos/?sort_by=post_date&from=${page}`; - const res = await 
bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeLatest(res.body.toString(), site); @@ -188,7 +188,7 @@ async function fetchLatest(site, page = 1) { } async function fetchScene(url) { - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeScene(res.body.toString(), url); diff --git a/src/scrapers/whalemember.js b/src/scrapers/whalemember.js index 9d7b8c6c..43f12671 100644 --- a/src/scrapers/whalemember.js +++ b/src/scrapers/whalemember.js @@ -1,9 +1,10 @@ 'use strict'; -const bhttp = require('bhttp'); const { JSDOM } = require('jsdom'); const moment = require('moment'); +const http = require('../utils/http'); + function scrapeLatest(html, site) { const { document } = new JSDOM(html).window; const { origin } = new URL(site.url); @@ -112,7 +113,7 @@ function scrapeScene(html, site, url) { async function fetchLatest(site, page = 1) { const url = `${site.url}?page=${page}`; - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeLatest(res.body.toString(), site); @@ -122,7 +123,7 @@ async function fetchLatest(site, page = 1) { } async function fetchScene(url, site) { - const res = await bhttp.get(url); + const res = await http.get(url); if (res.statusCode === 200) { return scrapeScene(res.body.toString(), site, url); diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js index 75df6e72..d9d4961a 100644 --- a/src/scrapers/xempire.js +++ b/src/scrapers/xempire.js @@ -1,11 +1,10 @@ 'use strict'; -const bhttp = require('@thependulum/bhttp'); - const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma'); +const http = require('../utils/http'); async function fetchScene(url, site) { - const res = await bhttp.get(url); + const res = await http.get(url); const release = await scrapeScene(res.body.toString(), url, site); diff --git a/src/utils/http.js b/src/utils/http.js index 90366d77..aef75a0b 100644 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -1,7 +1,7 @@ 'use strict'; const config = require('config'); -const bhttp = require('bhttp'); +const bhttp = require('@thependulum/bhttp'); const util = require('util'); const stream = require('stream'); const tunnel = require('tunnel'); @@ -168,4 +168,5 @@ module.exports = { put, patch, session: getSession, + getSession, };
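
For reference, the call-site pattern the migrated scrapers now share is visible throughout the hunks above: http.get/http.post return a bhttp-style response (statusCode, body), and a session created via http.session() is passed back in through the options object rather than being used as the request object itself. The sketch below only illustrates that pattern; the wrapper's exact option handling in src/utils/http.js is assumed, and the URL, headers, and function names are illustrative.

'use strict';

const http = require('../utils/http');

// Plain request: drop-in for the old `bhttp.get` call, same response shape.
async function fetchExamplePage(url) {
  const res = await http.get(url);

  if (res.statusCode === 200) {
    return res.body.toString();
  }

  return null;
}

// Session-bound request: the session is created once and handed to each call
// through the options argument, alongside any custom headers.
async function fetchExampleWithSession(url) {
  const session = http.session();

  const res = await http.get(url, {
    session,
    headers: {
      'user-agent': 'Mozilla/5.0',
    },
  });

  return res.statusCode === 200 ? res.body.toString() : null;
}

module.exports = {
  fetchExamplePage,
  fetchExampleWithSession,
};

The final hunk also exports getSession directly alongside the existing session alias, so callers can reach the same helper under either name.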