Removed direct bhttp usage from scrapers in favor of local http module. Deleted legacy scrapers, as old code is available via git repo history.
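At each call site the change is mechanical: the bhttp require is dropped, the shared wrapper is required from ../utils/http, and the response keeps bhttp's shape (statusCode, a body buffer), so the surrounding code is untouched. A minimal sketch of the pattern, with a hypothetical fetchPage helper for illustration (not code from this commit):

const http = require('../utils/http');

// Hypothetical helper; the wrapper mirrors bhttp's interface,
// so res.statusCode and res.body are handled exactly as before.
async function fetchPage(url) {
  const res = await http.get(url); // was: bhttp.get(url)

  if (res.statusCode === 200) {
    return res.body.toString();
  }

  return null;
}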

DebaucheryLibrarian 2020-11-23 00:05:02 +01:00
parent 3d427f7e1d
commit 0633197793
22 changed files with 77 additions and 537 deletions

View File

@@ -1,12 +1,12 @@
 'use strict';

 /* eslint-disable newline-per-chained-call */
-const bhttp = require('@thependulum/bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

 const logger = require('../logger')(__filename);
 const slugify = require('../utils/slugify');
+const http = require('../utils/http');
 const { get, getAll, ex } = require('../utils/q');

 function scrape(html, site) {
@@ -192,7 +192,7 @@ async function fetchLatest(site, page = 1) {

 /*
 async function fetchUpcoming(site) {
-  const res = await bhttp.get('https://www.bangbros.com');
+  const res = await http.get('https://www.bangbros.com');

   return scrapeUpcoming(res.body.toString(), site);
 }
@@ -224,13 +224,13 @@ async function fetchScene(url, site, release) {
 async function fetchProfile({ name: actorName }, scope) {
   const actorSlug = slugify(actorName);
   const url = `https://bangbros.com/search/${actorSlug}`;
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     const actorUrl = scrapeProfileSearch(res.body.toString(), actorName);

     if (actorUrl) {
-      const actorRes = await bhttp.get(actorUrl);
+      const actorRes = await http.get(actorUrl);

       if (actorRes.statusCode === 200) {
         return scrapeProfile(actorRes.body.toString(), scope);

View File

@@ -1,7 +1,5 @@
 'use strict';

-// const bhttp = require('bhttp');
-
 const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamma');

 async function fetchSceneWrapper(url, site, baseRelease) {

View File

@@ -1,8 +1,7 @@
 'use strict';

-const bhttp = require('bhttp');
-
 const { ex } = require('../utils/q');
+const http = require('../utils/http');

 function scrapeProfile(html) {
   const { qu } = ex(html); /* eslint-disable-line object-curly-newline */
@@ -80,7 +79,7 @@ function scrapeProfile(html) {
 async function fetchProfile({ name: actorName }) {
   const actorSlug = actorName.replace(/\s+/, '_');
-  const res = await bhttp.get(`http://www.boobpedia.com/boobs/${actorSlug}`);
+  const res = await http.get(`http://www.boobpedia.com/boobs/${actorSlug}`);

   if (res.statusCode === 200) {
     return scrapeProfile(res.body.toString());

View File

@@ -1,9 +1,8 @@
 'use strict';

-const bhttp = require('bhttp');
-
 const qu = require('../utils/qu');
 const slugify = require('../utils/slugify');
+const http = require('../utils/http');

 function scrapeAll(scenes, site, origin) {
   return scenes.map(({ query }) => {
@@ -150,14 +149,14 @@ async function fetchLatest(channel, page = 1) {
 async function fetchScene(url, site) {
   // DDF's main site moved to Porn World
-  // const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`);
+  // const res = await http.get(`https://ddfnetwork.com${new URL(url).pathname}`);
   const res = await qu.get(url, '.content, #content, .taspVideoPage');

   return res.ok ? scrapeScene(res.item, url, site) : res.status;
 }

 async function fetchProfile({ name: actorName }) {
-  const resSearch = await bhttp.post('https://ddfnetwork.com/search/ajax',
+  const resSearch = await http.post('https://ddfnetwork.com/search/ajax',
     {
       type: 'hints',
       word: actorName,
@@ -180,7 +179,7 @@ async function fetchProfile({ name: actorName }) {
   const [actor] = resSearch.body.list.pornstarsName;
   const url = `https://ddfnetwork.com${actor.href}`;

-  const resActor = await bhttp.get(url);
+  const resActor = await http.get(url);

   if (resActor.statusCode !== 200) {
     return null;

View File

@@ -2,12 +2,13 @@
 /* eslint-disable newline-per-chained-call */

 // const Promise = require('bluebird');
-const bhttp = require('@thependulum/bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');

+const http = require('../utils/http');
+
 async function getPhotos(albumUrl) {
-  const res = await bhttp.get(albumUrl);
+  const res = await http.get(albumUrl);
   const html = res.body.toString();

   const { document } = new JSDOM(html).window;
@@ -125,13 +126,13 @@ async function scrapeScene(html, url, site) {
 }

 async function fetchLatest(site, page = 1) {
-  const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
+  const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);

   return scrapeLatest(res.body.toString(), site);
 }

 async function fetchScene(url, site) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   return scrapeScene(res.body.toString(), url, site);
 }

View File

@@ -1,9 +1,10 @@
 'use strict';

-const bhttp = require('bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');

+const http = require('../utils/http');
+
 function scrapeProfile(html, actorName) {
   const { document } = new JSDOM(html).window;
   const profile = { name: actorName };
@@ -68,17 +69,17 @@ function scrapeSearch(html) {
 async function fetchProfile({ name: actorName }) {
   const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-');
-  const res = await bhttp.get(`https://freeones.nl/${actorSlug}/profile`);
+  const res = await http.get(`https://freeones.nl/${actorSlug}/profile`);

   if (res.statusCode === 200) {
     return scrapeProfile(res.body.toString(), actorName);
   }

-  const searchRes = await bhttp.get(`https://freeones.nl/babes?q=${actorName}`);
+  const searchRes = await http.get(`https://freeones.nl/babes?q=${actorName}`);
   const actorPath = scrapeSearch(searchRes.body.toString());

   if (actorPath) {
-    const actorRes = await bhttp.get(`https://freeones.nl${actorPath}/profile`);
+    const actorRes = await http.get(`https://freeones.nl${actorPath}/profile`);

     if (actorRes.statusCode === 200) {
       return scrapeProfile(actorRes.body.toString(), actorName);

View File

@@ -1,140 +0,0 @@
-'use strict';
-
-/* eslint-disable newline-per-chained-call */
-const bhttp = require('bhttp');
-const { JSDOM } = require('jsdom');
-const moment = require('moment');
-
-async function scrapeProfileFrontpage(html, url, name) {
-  const { document } = new JSDOM(html).window;
-  const bioEl = document.querySelector('.dashboard-bio-list');
-  const bioUrl = `https:${document.querySelector('.seemore a').href}`;
-
-  const keys = Array.from(bioEl.querySelectorAll('dt'), el => el.textContent.trim());
-  const values = Array.from(bioEl.querySelectorAll('dd'), el => el.textContent.trim());
-  const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
-
-  const profile = {
-    name,
-    gender: 'female',
-  };
-
-  const birthdateString = bio['Date of Birth:'];
-  const measurementsString = bio['Measurements:'];
-
-  const birthCityString = bio['Place of Birth:'];
-  const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString;
-
-  const birthCountryString = bio['Country of Origin:'];
-  const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString;
-
-  const piercingsString = bio['Piercings:'];
-  const tattoosString = bio['Tattoos:'];
-
-  if (birthdateString && birthdateString !== 'Unknown (add)') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
-  if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
-  if (bio['Fake Boobs:']) profile.naturalBoobs = bio['Fake Boobs:'] === 'No';
-
-  profile.birthPlace = `${birthCity || ''}${birthCity ? ', ' : ''}${birthCountry || ''}`;
-  profile.hair = bio['Hair Color:'].toLowerCase();
-  profile.eyes = bio['Eye Color:'].toLowerCase();
-
-  if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
-  if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
-  if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString;
-  if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString;
-
-  profile.social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href);
-
-  return {
-    profile,
-    url: bioUrl,
-  };
-}
-
-async function scrapeProfileBio(html, frontpageProfile, url, name) {
-  const { document } = new JSDOM(html).window;
-  const bioEl = document.querySelector('#biographyTable');
-
-  const keys = Array.from(bioEl.querySelectorAll('td:nth-child(1)'), el => el.textContent.trim());
-  const values = Array.from(bioEl.querySelectorAll('td:nth-child(2)'), el => el.textContent.trim());
-  const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
-
-  const profile = {
-    ...frontpageProfile,
-    name,
-    gender: 'female',
-  };
-
-  const birthdateString = bio['Date of Birth:'];
-  const measurementsString = bio['Measurements:'];
-
-  const birthCityString = bio['Place of Birth:'];
-  const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString;
-
-  const birthCountryString = bio['Country of Origin:'];
-  const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString;
-
-  const piercingsString = bio['Piercings:'];
-  const tattoosString = bio['Tattoos:'];
-
-  if (birthdateString && birthdateString !== 'Unknown') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
-  if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
-  if (bio['Fake boobs']) profile.naturalBoobs = bio['Fake boobs:'] === 'No';
-
-  profile.ethnicity = bio['Ethnicity:'];
-  profile.birthPlace = `${birthCity || ''}${birthCity ? ', ' : ''}${birthCountry || ''}`;
-  profile.hair = bio['Hair Color:'].toLowerCase();
-  profile.eyes = bio['Eye Color:'].toLowerCase();
-  profile.height = Number(bio['Height:'].match(/\d+/)[0]);
-  profile.weight = Number(bio['Weight:'].match(/\d+/)[0]);
-
-  if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
-  if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
-  if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString;
-  if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString;
-
-  profile.social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href);
-
-  return profile;
-}
-
-async function fetchProfile({ name: actorName }) {
-  const slug = actorName.replace(' ', '_');
-  const frontpageUrl = `https://www.freeones.com/html/v_links/${slug}`;
-
-  const resFrontpage = await bhttp.get(frontpageUrl);
-
-  if (resFrontpage.statusCode === 200) {
-    const { url, bio } = await scrapeProfileFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName);
-    const resBio = await bhttp.get(url);
-
-    return scrapeProfileBio(resBio.body.toString(), bio, url, actorName);
-  }
-
-  // apparently some actors are appended 'Babe' as their surname...
-  const fallbackSlug = `${slug}_Babe`;
-  const fallbackUrl = `https://www.freeones.com/html/s_links/${fallbackSlug}`;
-  const resFallback = await bhttp.get(fallbackUrl);
-
-  if (resFallback.statusCode === 200) {
-    const { url, profile } = await scrapeProfileFrontpage(resFallback.body.toString(), fallbackUrl, actorName);
-    const resBio = await bhttp.get(url);
-
-    return scrapeProfileBio(resBio.body.toString(), profile, url, actorName);
-  }
-
-  return null;
-}
-
-module.exports = {
-  fetchProfile,
-};

View File

@@ -2,7 +2,6 @@

 const util = require('util');
 const Promise = require('bluebird');
-const bhttp = require('@thependulum/bhttp');
 const cheerio = require('cheerio');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
@@ -13,7 +12,7 @@ const { heightToCm } = require('../utils/convert');
 const slugify = require('../utils/slugify');

 async function fetchPhotos(url) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   return res.body.toString();
 }
@@ -369,7 +368,7 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
     ? util.format(site.parameters.latest, page)
     : `${site.url}/trial/categories/movies_${page}_d.html`;

-  // const res = await bhttp.get(url);
+  // const res = await http.get(url);
   const res = await qu.getAll(url, '.update_details');

   return res.ok ? scrapeAll(res.items, site, entryIdFromTitle) : res.status;

View File

@@ -87,12 +87,13 @@ async function fetchScene(url, channel) {

 /* API protected
 async function fetchProfile({ name: actorName }, context , site) {
-  const session = bhttp.session();
-  await session.get(`https://tour.${site.slug}.com`);
+  const session = http.session();
+  await http.get(`https://tour.${site.slug}.com`, { session });

   const url = `https://tour.${site.slug}.com/search-preview`;

-  const res = await session.post(url, { q: actorName }, {
+  const res = await http.post(url, { q: actorName }, {
+    session,
     headers: {
       'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
       origin: `https://tour.${site.slug}.com`,

View File

@@ -2,12 +2,12 @@
 /* eslint-disable newline-per-chained-call */

 const Promise = require('bluebird');
-const bhttp = require('@thependulum/bhttp');
 const { CookieJar } = Promise.promisifyAll(require('tough-cookie'));
 const moment = require('moment');

 const qu = require('../utils/qu');
 const slugify = require('../utils/slugify');
+const http = require('../utils/http');
 const { inchesToCm, lbsToKg } = require('../utils/convert');
 const { cookieToData } = require('../utils/cookies');
@@ -145,14 +145,14 @@ function getUrl(site) {
 async function getSession(site) {
   const cookieJar = new CookieJar();
-  const session = bhttp.session({ cookieJar });
+  const session = http.session({ cookieJar });

   // const res = await session.get(url);
   const sessionUrl = site.parameters?.siteId && !(site.parameters?.childSession || site.parent?.parameters?.childSession)
     ? site.parent.url
     : site.url;

-  const res = await session.get(sessionUrl);
+  const res = await http.get(sessionUrl, { session });

   if (res.statusCode === 200) {
     const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
@@ -215,7 +215,8 @@ async function fetchLatest(site, page = 1) {
     ? `https://site-api.project1service.com/v2/releases?dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`
     : `https://site-api.project1service.com/v2/releases?collectionId=${siteId}&dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`;

-  const res = await session.get(apiUrl, {
+  const res = await http.get(apiUrl, {
+    session,
     headers: {
       Instance: instanceToken,
       Origin: site.url,
@@ -236,7 +237,8 @@ async function fetchUpcoming(site) {
   const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases';

-  const res = await session.get(apiUrl, {
+  const res = await http.get(apiUrl, {
+    session,
     headers: {
       Instance: instanceToken,
       Origin: site.url,
@@ -260,7 +262,8 @@ async function fetchScene(url, site, baseScene) {
   const entryId = url.match(/\d+/)[0];
   const { session, instanceToken } = await getSession(site);

-  const res = await session.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
+  const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
+    session,
     headers: {
       Instance: instanceToken,
     },
@@ -277,7 +280,8 @@ async function fetchProfile({ name: actorName }, networkOrNetworkSlug, actorPath
   // const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`;
   const { session, instanceToken } = await getSession(networkOrNetworkSlug);

-  const res = await session.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
+  const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
+    session,
     headers: {
       Instance: instanceToken,
     },
@@ -291,8 +295,9 @@ async function fetchProfile({ name: actorName }, networkOrNetworkSlug, actorPath
       const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;

       const [actorRes, actorReleasesRes] = await Promise.all([
-        bhttp.get(actorUrl),
-        session.get(actorReleasesUrl, {
+        http.get(actorUrl),
+        http.get(actorReleasesUrl, {
+          session,
           headers: {
             Instance: instanceToken,
           },
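Session-bound requests change shape here as well: instead of calling verbs on a bhttp session object, the session is created through the wrapper and passed per request as an option. How utils/http honors that option is not shown in this commit; a plausible sketch, assuming the wrapper falls back to plain bhttp when no session is given:

// Sketch only: assumed internals of the get wrapper in utils/http.
const bhttp = require('@thependulum/bhttp');

async function get(url, { session, ...options } = {}) {
  // bhttp session objects expose the same verbs as the module itself,
  // so either can serve as the client here.
  const client = session || bhttp;

  return client.get(url, options);
}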

View File

@@ -1,10 +1,10 @@
 'use strict';

-const bhttp = require('bhttp');
 const blake2 = require('blake2');

 const knex = require('../knex');
 const { ex, ctxa } = require('../utils/q');
+const http = require('../utils/http');

 async function getSiteSlugs() {
   return knex('sites')
@@ -124,7 +124,7 @@ async function scrapeScene(html, site, url, metaSiteSlugs) {
 async function fetchLatest(site, page = 1) {
   const url = `${site.url}/movies/page-${page}`;
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeLatest(res.body.toString(), site);
@@ -134,7 +134,7 @@ async function fetchLatest(site, page = 1) {
 }

 async function fetchScene(url, site, release) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeScene(res.body.toString(), site, url, release?.meta.siteSlugs);

View File

@@ -1,144 +0,0 @@
-'use strict';
-
-const bhttp = require('bhttp');
-const cheerio = require('cheerio');
-const { JSDOM } = require('jsdom');
-const moment = require('moment');
-
-async function getTrailer(entryId) {
-  const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', {
-    setId: entryId,
-  });
-
-  if (trailerRes.statusCode === 200) {
-    return {
-      poster: trailerRes.body.TrailerImg,
-      trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback,
-    };
-  }
-
-  return null;
-}
-
-function scrapeLatestScene(html, site) {
-  const $ = cheerio.load(html, { normalizeWhitespace: true });
-
-  const entryId = $('li').attr('id');
-  const sceneLinkElement = $('#scene_title_border a');
-  const url = `${site.url}/${sceneLinkElement.attr('href')}`;
-  const title = sceneLinkElement.attr('title').replace(/\u00E2\u0080\u0099/g, '\''); // replace weird apostrophes
-
-  const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas
-  const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate();
-
-  const poster = $('a:nth-child(2) > img').attr('src');
-  const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray();
-
-  const stars = $('img[src*="/star.png"]')
-    .toArray()
-    .map(element => $(element).attr('src'))
-    .length || 0;
-
-  return {
-    url,
-    entryId,
-    title,
-    actors,
-    date,
-    poster,
-    photos,
-    rating: {
-      stars,
-    },
-    site,
-  };
-}
-
-async function scrapeScene(html, url, site) {
-  const { document } = new JSDOM(html).window;
-  const release = { url, site };
-
-  release.entryId = document.querySelector('input#set_ID').value;
-  release.title = document.querySelector('title').textContent;
-  release.description = document.querySelector('.player_data').textContent.trim();
-
-  const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent;
-  const [minutes, seconds] = durationString.match(/\d+/g);
-  release.duration = Number(minutes) * 60 + Number(seconds);
-
-  release.tags = document.querySelector('meta[name="keywords"]').content.split(',');
-
-  const { poster, trailer } = await getTrailer(release.entryId);
-  release.poster = poster;
-  release.trailer = { src: trailer };
-
-  return release;
-}
-
-function scrapeFallbackLanding(html) {
-  const { document } = new JSDOM(html).window;
-
-  return document.querySelector('input#set_ID').value;
-}
-
-async function scrapeFallbackScene(html, entryId, url, site) {
-  const { document } = new JSDOM(html).window;
-  const release = { url, entryId, site };
-
-  release.title = document.querySelector('.popup_data_set_head label').textContent;
-  release.description = document.querySelector('.popup_data_set_des p').textContent.trim();
-  release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate();
-  release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent);
-
-  const { poster, trailer } = await getTrailer(release.entryId);
-  release.poster = poster;
-  release.trailer = { src: trailer };
-
-  release.channel = document.querySelector('.popup_left_top div img').alt;
-
-  return release;
-}
-
-async function fetchLatest(channel, page = 1) {
-  const url = `${channel.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`;
-  const pagedUrl = `${channel.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`;
-
-  const res = page === 1
-    ? await bhttp.get(url)
-    : await bhttp.get(pagedUrl);
-
-  const elements = JSON.parse(res.body.toString());
-  const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, channel)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php
-
-  return latest;
-}
-
-async function fetchScene(url, site) {
-  const res = await bhttp.get(url);
-
-  if (res.statusCode === 200) {
-    if (site.isNetwork) {
-      const entryId = scrapeFallbackLanding(res.body.toString(), url);
-      const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {
-        setId: entryId,
-      });
-
-      return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site);
-    }
-
-    return scrapeScene(res.body.toString(), url, site);
-  }
-
-  return null;
-}
-
-module.exports = {
-  fetchLatest,
-  fetchScene,
-};

View File

@@ -1,9 +1,10 @@
 'use strict';

-const bhttp = require('@thependulum/bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');

+const http = require('../utils/http');
+
 const ethnicityMap = {
   White: 'Caucasian',
 };
@@ -59,8 +60,8 @@ async function fetchProfile({ name: actorName }) {
   const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;

   const [modelRes, pornstarRes] = await Promise.all([
-    bhttp.get(modelUrl),
-    bhttp.get(pornstarUrl),
+    http.get(modelUrl),
+    http.get(pornstarUrl),
   ]);

   const model = modelRes.statusCode === 200 && await scrapeProfile(modelRes.body.toString(), modelUrl, actorName);
@@ -75,7 +76,7 @@ async function fetchProfile({ name: actorName }) {
 */

   const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
-  const pornstarRes = await bhttp.get(pornstarUrl);
+  const pornstarRes = await http.get(pornstarUrl);

   return scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName);
 }

View File

@@ -1,17 +1,17 @@
 'use strict';

 /* eslint-disable newline-per-chained-call */
-const bhttp = require('@thependulum/bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

 const { get, geta } = require('../utils/q');
 const slugify = require('../utils/slugify');
+const http = require('../utils/http');

 async function getPhotos(entryId, site) {
   const { hostname } = new URL(site.url);
-  const res = await bhttp.get(`https://${hostname}/gallery.php?type=highres&id=${entryId}`);
+  const res = await http.get(`https://${hostname}/gallery.php?type=highres&id=${entryId}`);
   const html = res.body.toString();

   const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -159,18 +159,18 @@ async function fetchLatest(site, page = 1) {
   const { hostname } = new URL(site.url);

   if (hostname.match('private.com')) {
-    const res = await bhttp.get(`${site.url}/${page}/`);
+    const res = await http.get(`${site.url}/${page}/`);

     return scrapeLatest(res.body.toString(), site);
   }

-  const res = await bhttp.get(`${site.url}/scenes/${page}/`);
+  const res = await http.get(`${site.url}/scenes/${page}/`);

   return scrapeLatest(res.body.toString(), site);
 }

 async function fetchScene(url, site) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   return scrapeScene(res.body.toString(), url, site);
 }

View File

@@ -1,8 +1,9 @@
 'use strict';

-const bhttp = require('@thependulum/bhttp');
 const cheerio = require('cheerio');

+const http = require('../utils/http');
+
 const {
   scrapeLatestX,
   fetchLatest,
@@ -24,7 +25,7 @@ function scrapeLatestClassic(html, site) {
 }

 async function fetchClassic(site, page) {
-  const res = await bhttp.get(`${site.url}/scenes?page=${page}`);
+  const res = await http.get(`${site.url}/scenes?page=${page}`);

   if (res.statusCode === 200) {
     return scrapeLatestClassic(res.body.toString(), site);

View File

@@ -1,9 +1,8 @@
 'use strict';

-const bhttp = require('bhttp');
-
 const { ex, exa, get } = require('../utils/q');
 const slugify = require('../utils/slugify');
+const http = require('../utils/http');
 const { heightToCm, lbsToKg } = require('../utils/convert');

 function scrapePhotos(html) {
@@ -19,7 +18,7 @@ function scrapePhotos(html) {
 }

 async function fetchPhotos(url) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapePhotos(res.body.toString(), url);
@@ -198,7 +197,7 @@ async function scrapeProfile(html, actorUrl, withReleases) {
 async function fetchLatest(site, page = 1) {
   const latestPath = site.parameters?.path || '/big-boob-videos';
   const url = `${site.url}${latestPath}?page=${page}`;
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeAll(res.body.toString(), site);
@@ -208,7 +207,7 @@ async function fetchLatest(site, page = 1) {
 }

 async function fetchScene(url, site) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeScene(res.body.toString(), url, site);
@@ -227,7 +226,7 @@ async function fetchProfile({ name: actorName }, context, include, page = 1, sou
   const url = sources[source];

-  const res = await bhttp.get(url, {
+  const res = await http.get(url, {
     followRedirects: false,
   });
@@ -235,7 +234,7 @@ async function fetchProfile({ name: actorName }, context, include, page = 1, sou
   const actorUrl = scrapeModels(res.body.toString(), actorName);

   if (actorUrl) {
-    const actorRes = await bhttp.get(actorUrl);
+    const actorRes = await http.get(actorUrl);

     if (actorRes.statusCode === 200) {
       return scrapeProfile(actorRes.body.toString(), actorUrl, include.scenes);

View File

@@ -1,180 +0,0 @@
-'use strict';
-
-const bhttp = require('bhttp');
-const { JSDOM } = require('jsdom');
-const moment = require('moment');
-
-function extractTitle(pathname) {
-  return pathname
-    .split('/')
-    .slice(-2)[0]
-    .split('_')
-    .map(seg => `${seg.charAt(0).toUpperCase()}${seg.slice(1)}`)
-    .join(' ');
-}
-
-function extractActors(str) {
-  return str
-    .split(/,|\band\b/ig)
-    .filter(actor => !/\.{3}/.test(actor))
-    .map(actor => actor.trim())
-    .filter(actor => actor.length > 0);
-}
-
-function scrapeLatest(html, site) {
-  const { document } = new JSDOM(html).window;
-  const scenes = Array.from(document.querySelectorAll('#updatesList li.grey, #updatesList li.white'));
-
-  return scenes.map((scene) => {
-    const release = { site };
-
-    const link = scene.querySelector('.info a');
-    const poster = scene.querySelector('img');
-    const { pathname } = new URL(link);
-
-    [release.entryId] = poster.id.match(/\d+/);
-    release.url = `https://www.teamskeet.com${pathname}`;
-    release.title = extractTitle(pathname);
-    release.date = moment.utc(scene.querySelector('strong').textContent, 'MM/DD/YYYY').toDate();
-
-    const photos = Array.from({ length: 5 }, (_value, index) => poster.dataset.original.replace(/\d+.jpg/, `${String(index + 1).padStart(2, '0')}.jpg`));
-    [release.poster] = photos;
-    release.photos = photos.slice(1);
-
-    const actors = scene.querySelector('div span[rel="test"]').textContent;
-    release.actors = extractActors(actors);
-
-    return release;
-  });
-}
-
-function scrapeScene(html, site, url) {
-  const { document } = new JSDOM(html).window;
-  const release = { site };
-
-  release.entryId = document.querySelector('#story-and-tags .scene_rater').attributes.rel.value;
-  release.description = document.querySelector('#story-and-tags td:nth-child(2) div').textContent;
-
-  const [actors, title, channel] = document.querySelector('title').textContent.split('|').map(item => item.trim());
-
-  release.url = url;
-  release.title = title;
-  release.actors = extractActors(actors);
-  release.channel = channel.toLowerCase();
-
-  release.tags = Array.from(document.querySelectorAll('#story-and-tags tr:nth-child(2) a'), el => el.rel);
-
-  const date = document.querySelector('h3 ~ div:nth-child(4), h3 ~ div div.gray:not(.scene_rater)').textContent.split(':')[1].trim();
-  release.date = moment.utc(date, 'MMMM Do, YYYY').toDate();
-
-  const { poster } = document.querySelector('video');
-  if (poster && !/gen/.test(poster)) release.poster = [poster.replace('low', 'hi'), poster];
-
-  const siteId = document.querySelector('#story-and-tags img').src.match(/\w+.jpg/)[0].replace('.jpg', '');
-  const actorsSlug = document.querySelector('h3 a').href.split('/').slice(-2)[0];
-  release.photos = Array.from({ length: 5 }, (value, index) => `https://images.psmcdn.net/teamskeet/${siteId}/${actorsSlug}/shared/scenes/new/${String(index + 1).padStart(2, '0')}.jpg`);
-
-  const trailer = document.querySelector('div.right.gray a').href;
-  if (trailer) release.trailer = { src: trailer };
-
-  return release;
-}
-
-function scrapeSceneA(html, site, sceneX, url) {
-  const scene = sceneX || new JSDOM(html).window.document;
-  const release = { site };
-
-  release.description = scene.querySelector('.scene-story').textContent.replace('...read more', '...').trim();
-  release.date = moment.utc(scene.querySelector('.scene-date').textContent, 'MM/DD/YYYY').toDate();
-  release.actors = Array.from(scene.querySelectorAll('.starring span'), el => extractActors(el.textContent)).flat();
-
-  const durationString = scene.querySelector('.time').textContent.trim();
-  const duration = ['00'].concat(durationString.split(':')).slice(-3).join(':'); // ensure hh:mm:ss
-  release.duration = moment.duration(duration).asSeconds();
-
-  if (sceneX) {
-    const titleEl = scene.querySelector(':scope > a');
-
-    release.url = titleEl.href;
-    release.entryId = titleEl.id;
-    release.title = titleEl.title;
-
-    const [poster, ...photos] = Array.from(scene.querySelectorAll('.scene img'), el => el.src);
-    release.poster = [poster.replace('bio_big', 'video'), poster];
-    release.photos = photos;
-  }
-
-  if (!sceneX) {
-    release.title = scene.querySelector('.title span').textContent;
-    release.url = url;
-
-    release.poster = scene.querySelector('video').poster;
-    release.photos = [release.poster.replace('video', 'bio_small'), release.poster.replace('video', 'bio_small2')];
-  }
-
-  const [, entryIdA, entryIdB] = new URL(release.url).pathname.split('/');
-  release.entryId = entryIdA === 'scenes' ? entryIdB : entryIdA;
-
-  return release;
-}
-
-function scrapeLatestA(html, site) {
-  const { document } = new JSDOM(html).window;
-  const scenes = Array.from(document.querySelectorAll('.scenewrapper'));
-
-  return scenes.map(scene => scrapeSceneA(null, site, scene));
-}
-
-async function fetchLatestTeamSkeet(site, page = 1) {
-  const url = `https://www.teamskeet.com/t1/updates/load?fltrs[site]=${site.parameters.id}&page=${page}&view=newest&fltrs[time]=ALL&order=DESC`;
-  const res = await bhttp.get(url);
-
-  if (res.statusCode === 200) {
-    return scrapeLatest(res.body.toString(), site);
-  }
-
-  return null;
-}
-
-async function fetchLatestA(site) {
-  const url = `${site.url}/scenes`;
-  const res = await bhttp.get(url);
-
-  if (res.statusCode === 200) {
-    return scrapeLatestA(res.body.toString(), site);
-  }
-
-  return null;
-}
-
-async function fetchLatest(site, page = 1) {
-  if (site.parameters.id) {
-    return fetchLatestTeamSkeet(site, page);
-  }
-
-  if (site.parameters.scraper === 'A') {
-    return fetchLatestA(site, page);
-  }
-
-  return null;
-}
-
-async function fetchScene(url, site) {
-  const session = bhttp.session(); // resolve redirects
-  const res = await session.get(url);
-
-  if (site.parameters?.scraper === 'A') {
-    return scrapeSceneA(res.body.toString(), site, null, url);
-  }
-
-  return scrapeScene(res.body.toString(), site, url);
-}
-
-module.exports = {
-  fetchLatest,
-  fetchScene,
-};

View File

@@ -1,10 +1,9 @@
 'use strict';

 /* eslint-disable no-unused-vars */
-const bhttp = require('@thependulum/bhttp');
-
 const { get, ed } = require('../utils/q');
 const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma');
+const http = require('../utils/http');
 const slugify = require('../utils/slugify');

 function scrapeLatestNative(scenes, site) {
@@ -72,7 +71,7 @@ async function fetchLatestNative(site, page = 1) {
   }

   const apiUrl = `${site.url}/videos/api/?limit=50&offset=${(page - 1) * 50}&sort=datedesc`;
-  const res = await bhttp.get(apiUrl, {
+  const res = await http.get(apiUrl, {
     decodeJSON: true,
   });
@@ -107,7 +106,7 @@ async function fetchSceneWrapper(url, site, release) {
     if (scene.date - new Date(site.parameters?.lastNative) <= 0) {
       // scene is probably still available on Vivid site, use search API to get URL and original date
      const searchUrl = `${site.url}/videos/api/?limit=10&sort=datedesc&search=${encodeURI(scene.title)}`;
-      const searchRes = await bhttp.get(searchUrl, {
+      const searchRes = await http.get(searchUrl, {
        decodeJSON: true,
      });

View File

@@ -1,8 +1,8 @@
 'use strict';

-const bhttp = require('bhttp');
-const { ex, ctxa } = require('../utils/q');
 // const slugify = require('../utils/slugify');
+const { ex, ctxa } = require('../utils/q');
+const http = require('../utils/http');

 function getLicenseCode(html) {
   const licensePrefix = 'license_code: \'';
@@ -178,7 +178,7 @@ function scrapeScene(html, url) {

 async function fetchLatest(site, page = 1) {
   const url = `https://vogov.com/latest-videos/?sort_by=post_date&from=${page}`;
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeLatest(res.body.toString(), site);
@@ -188,7 +188,7 @@ async function fetchLatest(site, page = 1) {
 }

 async function fetchScene(url) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeScene(res.body.toString(), url);

View File

@@ -1,9 +1,10 @@
 'use strict';

-const bhttp = require('bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');

+const http = require('../utils/http');
+
 function scrapeLatest(html, site) {
   const { document } = new JSDOM(html).window;
   const { origin } = new URL(site.url);
@@ -112,7 +113,7 @@ function scrapeScene(html, site, url) {

 async function fetchLatest(site, page = 1) {
   const url = `${site.url}?page=${page}`;
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeLatest(res.body.toString(), site);
@@ -122,7 +123,7 @@ async function fetchLatest(site, page = 1) {
 }

 async function fetchScene(url, site) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);

   if (res.statusCode === 200) {
     return scrapeScene(res.body.toString(), site, url);

View File

@@ -1,11 +1,10 @@
 'use strict';

-const bhttp = require('@thependulum/bhttp');
-
 const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
+const http = require('../utils/http');

 async function fetchScene(url, site) {
-  const res = await bhttp.get(url);
+  const res = await http.get(url);
   const release = await scrapeScene(res.body.toString(), url, site);

View File

@@ -1,7 +1,7 @@
 'use strict';

 const config = require('config');
-const bhttp = require('bhttp');
+const bhttp = require('@thependulum/bhttp');
 const util = require('util');
 const stream = require('stream');
 const tunnel = require('tunnel');
@@ -168,4 +168,5 @@ module.exports = {
   put,
   patch,
   session: getSession,
+  getSession,
 };
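With getSession now exported alongside the session alias, consuming code can be expected to look roughly like this (assumed usage based on the call sites above, not code from this commit; example.com URLs are placeholders):

const http = require('../utils/http');

async function example() {
  // Drop-in for bhttp.get: same res.statusCode and res.body.
  const res = await http.get('https://example.com/scenes');

  // Cookie-bound flow: create a session once, pass it per request.
  const session = http.session(); // alias of getSession per the export above
  await http.get('https://example.com/', { session }); // primes cookies
  const searchRes = await http.post('https://example.com/search', { q: 'test' }, { session });

  return res.statusCode === 200 && searchRes.statusCode === 200;
}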