Compare commits

...

6 Commits

Author SHA1 Message Date
DebaucheryLibrarian d4c5da2a76 1.142.0 2020-11-23 00:05:16 +01:00
DebaucheryLibrarian 0633197793 Removed direct bhttp usage from scrapers in favor of local http module. Deleted legacy scrapers, as old code is available via git repo history. 2020-11-23 00:05:02 +01:00
DebaucheryLibrarian 3d427f7e1d Allowing HTTP rate limits to be set by configuration or argument. 2020-11-22 23:50:24 +01:00
DebaucheryLibrarian 6a5063cf32 Fixed PornCZ scene photos attribute. 2020-11-22 04:13:21 +01:00
DebaucheryLibrarian 081a5a1e8c Updated HTTP call in Gamma scraper. 2020-11-22 04:09:44 +01:00
DebaucheryLibrarian b9b777c621 Using new HTTP module with a dynamic rate limiter. 2020-11-22 04:07:09 +01:00
48 changed files with 488 additions and 726 deletions

View File

@ -197,6 +197,17 @@ module.exports = {
'www.deeper.com',
],
},
limits: {
default: {
interval: 50,
concurrency: 20,
},
'www.deeper.com': {
enable: false, // can be omitted to enable
interval: 1000,
concurrency: 1,
},
},
fetchAfter: [1, 'week'],
missingDateLimit: 3,
media: {
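
The new 'limits' block above is what the dynamic rate limiter introduced in these commits consumes. A minimal sketch, assuming the config shape shown here, of how per-host limits might be resolved into bottleneck limiters; the getLimiter helper and the fallback behaviour for enable: false are illustrative assumptions, not code from this diff:

'use strict';

const config = require('config');
const Bottleneck = require('bottleneck');

const limiters = {};

function getLimiter(hostname) {
  // Hypothetical resolution: use the host-specific limits when present and
  // not disabled, otherwise fall back to the defaults.
  const hostLimits = config.limits[hostname];
  const limits = hostLimits && hostLimits.enable !== false
    ? hostLimits
    : config.limits.default;

  if (!limiters[hostname]) {
    limiters[hostname] = new Bottleneck({
      minTime: limits.interval, // minimum gap between requests (assumed ms)
      maxConcurrent: limits.concurrency,
    });
  }

  return limiters[hostname];
}

// usage sketch: getLimiter('www.deeper.com').schedule(() => doRequest(url));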

package-lock.json (generated; 7 lines changed)
View File

@ -1,6 +1,6 @@
{
"name": "traxxx",
"version": "1.141.2",
"version": "1.142.0",
"lockfileVersion": 1,
"requires": true,
"dependencies": {
@ -2208,6 +2208,11 @@
"resolved": "https://registry.npmjs.org/boolbase/-/boolbase-1.0.0.tgz",
"integrity": "sha1-aN/1++YMUes3cl6p4+0xDcwed24="
},
"bottleneck": {
"version": "2.19.5",
"resolved": "https://registry.npmjs.org/bottleneck/-/bottleneck-2.19.5.tgz",
"integrity": "sha512-VHiNCbI1lKdl44tGrhNfU3lup0Tj/ZBMJB5/2ZbNXRCPuRCO7ed2mgcK4r17y+KB2EfuYuRaVlwNbAeaWGSpbw=="
},
"brace-expansion": {
"version": "1.1.11",
"resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.11.tgz",

View File

@ -1,6 +1,6 @@
{
"name": "traxxx",
"version": "1.141.2",
"version": "1.142.0",
"description": "All the latest porn releases in one place",
"main": "src/app.js",
"scripts": {
@ -78,6 +78,7 @@
"blake2": "^4.0.0",
"bluebird": "^3.7.2",
"body-parser": "^1.19.0",
"bottleneck": "^2.19.5",
"canvas": "^2.6.1",
"casual": "^1.6.2",
"cheerio": "^1.0.0-rc.3",

Binary file not shown (new; 601 KiB)

Binary file not shown (new; 377 KiB)

Binary file not shown (new; 8.4 KiB)

Binary file not shown (new; 7.9 KiB)

Binary file not shown (modified; 32 KiB before, 26 KiB after)

Binary file not shown (new; 36 KiB)

Binary file not shown (new; 32 KiB)

View File

@ -643,7 +643,7 @@ const tagPosters = [
['mff', 1, 'Anikka Albrite, Kelsi Monroe and Mick Blue for HardX'],
['mfm', 0, 'Vina Sky in "Jules Jordan\'s Three Ways" for Jules Jordan'],
['natural-boobs', 4, 'Miela (Marry Queen) in "Pure" for FemJoy'],
['nurse', 0, 'Sarah Vandella in "Cum For Nurse Sarah" for Brazzers'],
['nurse', 1, 'Mia Malkova in "Always Think Happy Thoughts" for Brazzers'],
['oil', 2, 'Jade Kush for Passion HD'],
['oral-creampie', 1, 'Valentina Nappi for Her Limit'],
['orgy', 1, 'Megan Rain (DP), Morgan Lee (anal), Jessa Rhodes, Melissa Moore and Kimmy Granger in "Orgy Masters 8" for Jules Jordan'],
@ -825,6 +825,7 @@ const tagPhotos = [
['natural-boobs', 3, 'Violet Starr in "Violet Starr 1st Lesbian Anal" for LesbianX'],
['natural-boobs', 0, 'Valentina Nappi in "Hypnotic Curves" for LesbianX'],
['natural-boobs', 2, 'Kylie Page for All Girl Massage'],
['nurse', 0, 'Sarah Vandella in "Cum For Nurse Sarah" for Brazzers'],
['oil', 1, 'Kissa Sins in "Oil Overload 14" for JulesJordan'],
['oil', 3, 'Vina Sky for Lubed'],
['oil', 0, 'Jada Stevens in "Jada Stevens Anal Ass Oiled Up For James Deen\'s Cock" for Jules Jordan'],

View File

@ -153,6 +153,16 @@ const { argv } = yargs
type: 'number',
default: 1,
})
.option('interval', {
describe: 'Minimum wait time between HTTP requests',
type: 'number',
// no default here: an explicit argument must override the config value, and config must override the built-in default
})
.option('concurrency', {
describe: 'Maximum number of parallel HTTP requests',
type: 'number',
// no default here: an explicit argument must override the config value, and config must override the built-in default
})
.option('save', {
describe: 'Save fetched releases to database',
type: 'boolean',
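
The two comments above pin down the intended precedence: an explicit argument beats the config value, and the config value beats the built-in default. A sketch of that resolution under those assumptions (where the merge actually happens is not shown in this diff):

// argument > config > built-in default; ?? instead of || so an explicit 0 survives
const interval = argv.interval ?? config.limits?.default?.interval ?? 50;
const concurrency = argv.concurrency ?? config.limits?.default?.concurrency ?? 20;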

View File

@ -420,15 +420,18 @@ async function storeFile(media) {
} catch (error) {
logger.warn(`Failed to store ${media.src}: ${error.message}`);
await fsPromises.unlink(media.file.path);
return null;
}
}
async function fetchHttpSource(source, tempFileTarget, hashStream) {
const res = await http.get(source.src, {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
}, {
headers: {
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
},
stream: true, // sources are fetched in parallel; stream to disk to avoid buffering them all in memory
transforms: [hashStream],
destination: tempFileTarget,
@ -642,7 +645,7 @@ async function storeMedias(baseMedias) {
);
}
const newMediaWithEntries = savedMedias.map((media, index) => curateMediaEntry(media, index));
const newMediaWithEntries = savedMedias.filter(Boolean).map((media, index) => curateMediaEntry(media, index));
const newMediaEntries = newMediaWithEntries.filter(media => media.newEntry).map(media => media.entry);
await bulkInsert('media', newMediaEntries);
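
The change above shows the new http module's call shape: request headers nest under a headers key inside a single options object, next to per-call behaviour, instead of being passed as a bare second argument. A sketch restating the hunk (option names are taken from the changed call site; the module internals are not shown in this diff):

const res = await http.get(source.src, {
  headers: {
    referer: source.referer, // request headers now nest under 'headers'
  },
  stream: true, // pipe the body instead of buffering it in memory
  transforms: [hashStream], // transform streams applied to the body, e.g. hashing
  destination: tempFileTarget, // write target for the streamed body
});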

View File

@ -1,8 +1,6 @@
'use strict';
const bhttp = require('@thependulum/bhttp');
const { post } = require('../utils/http');
const http = require('../utils/http');
const { extractDate } = require('../utils/qu');
const { inchesToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
@ -84,7 +82,7 @@ function scrapeAll(scenes) {
}
async function fetchActorReleases(actor) {
const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
query: {
bool: {
@ -179,7 +177,7 @@ async function scrapeProfile(actor, include) {
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
from: (page - 1) * 50,
query: {
@ -269,7 +267,7 @@ async function fetchScene(url) {
const encodedId = new URL(url).pathname.split('/')[2];
const entryId = decodeId(encodedId);
const res = await bhttp.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
const res = await http.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, {
headers: {
Authorization: `Basic ${authKey}`,
},
@ -279,7 +277,7 @@ async function fetchScene(url) {
}
async function fetchProfile({ name: actorName }, context, include) {
const res = await post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, {
size: 5,
sort: [{
_score: {
@ -306,8 +304,11 @@ async function fetchProfile({ name: actorName }, context, include) {
},
},
}, {
Authorization: `Basic ${authKey}`,
}, { encodeJSON: true });
headers: {
Authorization: `Basic ${authKey}`,
},
encodeJSON: true,
});
if (res.ok) {
const actor = res.body.hits.hits.find(hit => hit._source.name.toLowerCase() === actorName.toLowerCase());
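
http.post follows the same shape, with options such as encodeJSON sitting alongside the headers rather than in a trailing argument. A short sketch based on the fetchProfile change above (searchUrl and searchBody are placeholders):

const res = await http.post(searchUrl, searchBody, {
  headers: {
    Authorization: `Basic ${authKey}`, // per-request auth header
  },
  encodeJSON: true, // serialize the request body as JSON
});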

View File

@ -1,12 +1,12 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const logger = require('../logger')(__filename);
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { get, getAll, ex } = require('../utils/q');
function scrape(html, site) {
@ -192,7 +192,7 @@ async function fetchLatest(site, page = 1) {
/*
async function fetchUpcoming(site) {
const res = await bhttp.get('https://www.bangbros.com');
const res = await http.get('https://www.bangbros.com');
return scrapeUpcoming(res.body.toString(), site);
}
@ -224,13 +224,13 @@ async function fetchScene(url, site, release) {
async function fetchProfile({ name: actorName }, scope) {
const actorSlug = slugify(actorName);
const url = `https://bangbros.com/search/${actorSlug}`;
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
const actorUrl = scrapeProfileSearch(res.body.toString(), actorName);
if (actorUrl) {
const actorRes = await bhttp.get(actorUrl);
const actorRes = await http.get(actorUrl);
if (actorRes.statusCode === 200) {
return scrapeProfile(actorRes.body.toString(), scope);

View File

@ -1,7 +1,5 @@
'use strict';
// const bhttp = require('bhttp');
const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamma');
async function fetchSceneWrapper(url, site, baseRelease) {

View File

@ -1,8 +1,7 @@
'use strict';
const bhttp = require('bhttp');
const { ex } = require('../utils/q');
const http = require('../utils/http');
function scrapeProfile(html) {
const { qu } = ex(html); /* eslint-disable-line object-curly-newline */
@ -80,7 +79,7 @@ function scrapeProfile(html) {
async function fetchProfile({ name: actorName }) {
const actorSlug = actorName.replace(/\s+/, '_');
const res = await bhttp.get(`http://www.boobpedia.com/boobs/${actorSlug}`);
const res = await http.get(`http://www.boobpedia.com/boobs/${actorSlug}`);
if (res.statusCode === 200) {
return scrapeProfile(res.body.toString());

View File

@ -1,9 +1,8 @@
'use strict';
const bhttp = require('bhttp');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const http = require('../utils/http');
function scrapeAll(scenes, site, origin) {
return scenes.map(({ query }) => {
@ -150,14 +149,14 @@ async function fetchLatest(channel, page = 1) {
async function fetchScene(url, site) {
// DDF's main site moved to Porn World
// const res = await bhttp.get(`https://ddfnetwork.com${new URL(url).pathname}`);
// const res = await http.get(`https://ddfnetwork.com${new URL(url).pathname}`);
const res = await qu.get(url, '.content, #content, .taspVideoPage');
return res.ok ? scrapeScene(res.item, url, site) : res.status;
}
async function fetchProfile({ name: actorName }) {
const resSearch = await bhttp.post('https://ddfnetwork.com/search/ajax',
const resSearch = await http.post('https://ddfnetwork.com/search/ajax',
{
type: 'hints',
word: actorName,
@ -180,7 +179,7 @@ async function fetchProfile({ name: actorName }) {
const [actor] = resSearch.body.list.pornstarsName;
const url = `https://ddfnetwork.com${actor.href}`;
const resActor = await bhttp.get(url);
const resActor = await http.get(url);
if (resActor.statusCode !== 200) {
return null;

View File

@ -2,12 +2,13 @@
/* eslint-disable newline-per-chained-call */
// const Promise = require('bluebird');
const bhttp = require('@thependulum/bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
async function getPhotos(albumUrl) {
const res = await bhttp.get(albumUrl);
const res = await http.get(albumUrl);
const html = res.body.toString();
const { document } = new JSDOM(html).window;
@ -125,13 +126,13 @@ async function scrapeScene(html, url, site) {
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
return scrapeLatest(res.body.toString(), site);
}
async function fetchScene(url, site) {
const res = await bhttp.get(url);
const res = await http.get(url);
return scrapeScene(res.body.toString(), url, site);
}

View File

@ -41,7 +41,9 @@ function scrapeScene({ query }, url, channel) {
}));
release.director = query.cnt('.director')?.split(/\s*:\s*/)[1];
release.poster = query.sourceSet('.player img', 'data-srcset');
const fallbackPoster = query.img('.player img');
release.poster = query.sourceSet('.player img', 'data-srcset') || [fallbackPoster.replace('_crop', ''), fallbackPoster];
release.movie = {
title: query.cnt('.movie a'),

View File

@ -1,9 +1,10 @@
'use strict';
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
function scrapeProfile(html, actorName) {
const { document } = new JSDOM(html).window;
const profile = { name: actorName };
@ -68,17 +69,17 @@ function scrapeSearch(html) {
async function fetchProfile({ name: actorName }) {
const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-');
const res = await bhttp.get(`https://freeones.nl/${actorSlug}/profile`);
const res = await http.get(`https://freeones.nl/${actorSlug}/profile`);
if (res.statusCode === 200) {
return scrapeProfile(res.body.toString(), actorName);
}
const searchRes = await bhttp.get(`https://freeones.nl/babes?q=${actorName}`);
const searchRes = await http.get(`https://freeones.nl/babes?q=${actorName}`);
const actorPath = scrapeSearch(searchRes.body.toString());
if (actorPath) {
const actorRes = await bhttp.get(`https://freeones.nl${actorPath}/profile`);
const actorRes = await http.get(`https://freeones.nl${actorPath}/profile`);
if (actorRes.statusCode === 200) {
return scrapeProfile(actorRes.body.toString(), actorName);

View File

@ -1,140 +0,0 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
async function scrapeProfileFrontpage(html, url, name) {
const { document } = new JSDOM(html).window;
const bioEl = document.querySelector('.dashboard-bio-list');
const bioUrl = `https:${document.querySelector('.seemore a').href}`;
const keys = Array.from(bioEl.querySelectorAll('dt'), el => el.textContent.trim());
const values = Array.from(bioEl.querySelectorAll('dd'), el => el.textContent.trim());
const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
const profile = {
name,
gender: 'female',
};
const birthdateString = bio['Date of Birth:'];
const measurementsString = bio['Measurements:'];
const birthCityString = bio['Place of Birth:'];
const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString;
const birthCountryString = bio['Country of Origin:'];
const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString;
const piercingsString = bio['Piercings:'];
const tattoosString = bio['Tattoos:'];
if (birthdateString && birthdateString !== 'Unknown (add)') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
if (bio['Fake Boobs:']) profile.naturalBoobs = bio['Fake Boobs:'] === 'No';
profile.birthPlace = `${birthCity || ''}${birthCity ? ', ' : ''}${birthCountry || ''}`;
profile.hair = bio['Hair Color:'].toLowerCase();
profile.eyes = bio['Eye Color:'].toLowerCase();
if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString;
if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString;
profile.social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href);
return {
profile,
url: bioUrl,
};
}
async function scrapeProfileBio(html, frontpageProfile, url, name) {
const { document } = new JSDOM(html).window;
const bioEl = document.querySelector('#biographyTable');
const keys = Array.from(bioEl.querySelectorAll('td:nth-child(1)'), el => el.textContent.trim());
const values = Array.from(bioEl.querySelectorAll('td:nth-child(2)'), el => el.textContent.trim());
const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
const profile = {
...frontpageProfile,
name,
gender: 'female',
};
const birthdateString = bio['Date of Birth:'];
const measurementsString = bio['Measurements:'];
const birthCityString = bio['Place of Birth:'];
const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString;
const birthCountryString = bio['Country of Origin:'];
const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString;
const piercingsString = bio['Piercings:'];
const tattoosString = bio['Tattoos:'];
if (birthdateString && birthdateString !== 'Unknown') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
if (bio['Fake boobs']) profile.naturalBoobs = bio['Fake boobs:'] === 'No';
profile.ethnicity = bio['Ethnicity:'];
profile.birthPlace = `${birthCity || ''}${birthCity ? ', ' : ''}${birthCountry || ''}`;
profile.hair = bio['Hair Color:'].toLowerCase();
profile.eyes = bio['Eye Color:'].toLowerCase();
profile.height = Number(bio['Height:'].match(/\d+/)[0]);
profile.weight = Number(bio['Weight:'].match(/\d+/)[0]);
if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString;
if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString;
profile.social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href);
return profile;
}
async function fetchProfile({ name: actorName }) {
const slug = actorName.replace(' ', '_');
const frontpageUrl = `https://www.freeones.com/html/v_links/${slug}`;
const resFrontpage = await bhttp.get(frontpageUrl);
if (resFrontpage.statusCode === 200) {
const { url, bio } = await scrapeProfileFrontpage(resFrontpage.body.toString(), frontpageUrl, actorName);
const resBio = await bhttp.get(url);
return scrapeProfileBio(resBio.body.toString(), bio, url, actorName);
}
// apparently some actors have 'Babe' appended as their surname...
const fallbackSlug = `${slug}_Babe`;
const fallbackUrl = `https://www.freeones.com/html/s_links/${fallbackSlug}`;
const resFallback = await bhttp.get(fallbackUrl);
if (resFallback.statusCode === 200) {
const { url, profile } = await scrapeProfileFrontpage(resFallback.body.toString(), fallbackUrl, actorName);
const resBio = await bhttp.get(url);
return scrapeProfileBio(resBio.body.toString(), profile, url, actorName);
}
return null;
}
module.exports = {
fetchProfile,
};

View File

@ -7,7 +7,7 @@ const cheerio = require('cheerio');
const moment = require('moment');
const logger = require('../logger')(__filename);
const { ex, get } = require('../utils/q');
const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
@ -318,7 +318,7 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
const profilePath = `/${pathname.split('/').slice(-2).join('/')}`;
const url = getActorReleasesUrl(profilePath, page);
const res = await get(url);
const res = await qu.get(url);
if (!res.ok) return [];
@ -333,14 +333,14 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc
}
async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl, withReleases) {
const { q } = ex(html);
const { query } = qu.extract(html);
const avatar = q('img.actorPicture');
const hair = q('.actorProfile .attribute_hair_color', true);
const height = q('.actorProfile .attribute_height', true);
const weight = q('.actorProfile .attribute_weight', true);
const alias = q('.actorProfile .attribute_alternate_names', true);
const nationality = q('.actorProfile .attribute_home', true);
const avatar = query.el('img.actorPicture');
const hair = query.cnt('.actorProfile .attribute_hair_color');
const height = query.cnt('.actorProfile .attribute_height');
const weight = query.cnt('.actorProfile .attribute_weight');
const alias = query.cnt('.actorProfile .attribute_alternate_names');
const nationality = query.cnt('.actorProfile .attribute_home');
const profile = {
name: actorName,
@ -358,7 +358,7 @@ async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUr
profile.avatar = avatars;
}
profile.description = q('.actorBio p:not(.bioTitle)', true);
profile.description = query.cnt('.actorBio p:not(.bioTitle)');
if (hair) profile.hair = hair.split(':')[1].trim();
if (height) profile.height = Number(height.match(/\d+/)[0]);
@ -541,8 +541,10 @@ async function fetchScene(url, site, baseRelease) {
const [res, mobileRes] = await Promise.all([
http.get(deepUrl),
mobileUrl && http.get(mobileUrl, {
// don't redirect to main site
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36',
headers: {
// don't redirect to main site
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36',
},
}),
]);

View File

@ -129,7 +129,9 @@ async function fetchProfile(baseActor, entity, include) {
const searchRes = await http.post('https://tour.hitzefrei.com/search-preview', {
q: baseActor.name,
}, {
'Accept-Language': 'en-US',
headers: {
'Accept-Language': 'en-US',
},
});
if (searchRes.ok) {

View File

@ -115,7 +115,7 @@ async function scrapeSceneAlt({ query }, url, channel, session) {
release.trailer = query.video();
if (!release.trailer) {
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, null, { useSession: session });
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
if (trailerRes.ok) {
release.trailer = trailerRes.body;
@ -153,7 +153,7 @@ async function fetchLatest(site, page = 1) {
async function fetchScene(url, site) {
const session = http.session();
const res = await qu.get(url, null, null, { useSession: session });
const res = await qu.get(url, null, null, { session });
if (res.ok) {
if (site.parameters?.scraper === 'alt') {
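
Sessions move into the same options object: the old useSession flag becomes a plain session key. A sketch of the pattern as used above (URLs as in the scraper):

const session = http.session(); // cookie-preserving session, reused across calls

const res = await qu.get(url, null, null, { session });
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });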

View File

@ -23,7 +23,7 @@ async function fetchTrailerLocation(entryId, channel) {
const url = `${channel.url}/api/download/${entryId}/hd1080/stream`;
try {
const res = await http.get(url, null, {
const res = await http.get(url, {
followRedirects: false,
});

View File

@ -2,18 +2,17 @@
const util = require('util');
const Promise = require('bluebird');
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const { get, geta, ctxa, parseDate, prefixUrl } = require('../utils/q');
const qu = require('../utils/qu');
const http = require('../utils/http');
const { heightToCm } = require('../utils/convert');
const slugify = require('../utils/slugify');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
const res = await http.get(url);
return res.body.toString();
}
@ -82,7 +81,7 @@ async function getPhotosLegacy(entryId, site, type = 'highres', page = 1) {
async function getPhotos(entryId, site, type = 'highres', page = 1) {
const albumUrl = `${site.parameters?.photos || `${site.url}/gallery.php`}?id=${entryId}&type=${type}&page=${page}`;
const res = await bhttp.get(albumUrl);
const res = await http.get(albumUrl);
const html = res.body.toString();
const sourceLines = html.split(/\n/).filter(line => line.match(/ptx\["\w+"\]/));
@ -135,25 +134,25 @@ function getEntryId(html) {
}
function scrapeAll(scenes, site, entryIdFromTitle) {
return scenes.map(({ el, qu }) => {
return scenes.map(({ el, query }) => {
const release = {};
release.url = qu.url('.update_title a, .dvd_info > a, a ~ a');
release.title = qu.q('.update_title a, .dvd_info > a, a ~ a', true);
release.date = qu.date('.update_date', 'MM/DD/YYYY');
release.url = query.url('.update_title a, .dvd_info > a, a ~ a');
release.title = query.q('.update_title a, .dvd_info > a, a ~ a', true);
release.date = query.date('.update_date', 'MM/DD/YYYY');
release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || qu.q('.rating_box')?.dataset.id;
release.entryId = (entryIdFromTitle && slugify(release.title)) || el.dataset.setid || query.q('.rating_box')?.dataset.id;
release.actors = qu.all('.update_models a', true);
release.actors = query.all('.update_models a', true);
const dvdPhotos = qu.imgs('.dvd_preview_thumb');
const photoCount = Number(qu.q('a img.thumbs', 'cnt')) || 1;
const dvdPhotos = query.imgs('.dvd_preview_thumb');
const photoCount = Number(query.q('a img.thumbs', 'cnt')) || 1;
[release.poster, ...release.photos] = dvdPhotos.length
? dvdPhotos
: Array.from({ length: photoCount }).map((value, index) => {
const src = qu.img('a img.thumbs', `src${index}_1x`) || qu.img('a img.thumbs', `src${index}`) || qu.img('a img.thumbs');
const prefixedSrc = prefixUrl(src, site.url);
const src = query.img('a img.thumbs', `src${index}_1x`) || query.img('a img.thumbs', `src${index}`) || query.img('a img.thumbs');
const prefixedSrc = qu.prefixUrl(src, site.url);
if (src) {
return [
@ -183,7 +182,7 @@ function scrapeAll(scenes, site, entryIdFromTitle) {
return null;
}).filter(Boolean);
const teaserScript = qu.html('script');
const teaserScript = query.html('script');
if (teaserScript) {
const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4);
if (src) release.teaser = { src };
@ -236,17 +235,17 @@ function scrapeUpcoming(html, site) {
});
}
async function scrapeScene({ html, qu }, url, site, include) {
async function scrapeScene({ html, query }, url, site, include) {
const release = { url, site };
release.entryId = getEntryId(html);
release.title = qu.q('.title_bar_hilite', true);
release.description = qu.q('.update_description', true);
release.title = query.q('.title_bar_hilite', true);
release.description = query.q('.update_description', true);
release.date = qu.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
release.date = query.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML');
release.actors = qu.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
release.tags = qu.all('.update_tags a', true);
release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a', true);
release.tags = query.all('.update_tags a', true);
const posterPath = html.match(/useimage = "(.*)"/)?.[1];
@ -280,14 +279,14 @@ async function scrapeScene({ html, qu }, url, site, include) {
if (include.photos) release.photos = await getPhotos(release.entryId, site);
if (qu.exists('.update_dvds a')) {
if (query.exists('.update_dvds a')) {
release.movie = {
url: qu.url('.update_dvds a'),
title: qu.q('.update_dvds a', true),
url: query.url('.update_dvds a'),
title: query.q('.update_dvds a', true),
};
}
const stars = Number(qu.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
const stars = Number(query.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, ''));
if (stars) release.stars = stars;
return release;
@ -302,7 +301,7 @@ function scrapeMovie({ el, query }, url, site) {
movie.channel = slugify(query.q('.update_date a', true), '');
// movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href);
const sceneQus = ctxa(el, '.dvd_details');
const sceneQus = qu.initAll(el, '.dvd_details');
const scenes = scrapeAll(sceneQus, site);
const curatedScenes = scenes
@ -332,7 +331,7 @@ function scrapeProfile(html, url, actorName, entity) {
const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/);
const measurementsString = bio.match(/\w+-\d+-\d+/);
if (birthDateString) profile.birthdate = parseDate(birthDateString[1], 'MMMM D, YYYY');
if (birthDateString) profile.birthdate = qu.parseDate(birthDateString[1], 'MMMM D, YYYY');
if (ageString) profile.age = Number(ageString[1]);
if (heightString) profile.height = heightToCm(heightString[0]);
@ -354,7 +353,7 @@ function scrapeProfile(html, url, actorName, entity) {
avatarEl.getAttribute('src'),
]
.filter(avatar => avatar && !/p\d+.jpe?g/.test(avatar)) // remove non-existing attributes and placeholder images
.map(avatar => prefixUrl(avatar, entity.url));
.map(avatar => qu.prefixUrl(avatar, entity.url));
if (avatarSources.length) profile.avatar = avatarSources;
}
@ -369,8 +368,8 @@ async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle =
? util.format(site.parameters.latest, page)
: `${site.url}/trial/categories/movies_${page}_d.html`;
// const res = await bhttp.get(url);
const res = await geta(url, '.update_details');
// const res = await http.get(url);
const res = await qu.getAll(url, '.update_details');
return res.ok ? scrapeAll(res.items, site, entryIdFromTitle) : res.status;
}
@ -389,13 +388,13 @@ async function fetchUpcoming(site) {
}
async function fetchScene(url, site, baseRelease, include) {
const res = await get(url);
const res = await qu.get(url);
return res.ok ? scrapeScene(res.item, url, site, include) : res.status;
}
async function fetchMovie(url, site) {
const res = await get(url);
const res = await qu.get(url);
return res.ok ? scrapeMovie(res.item, url, site) : res.status;
}

View File

@ -97,8 +97,10 @@ async function scrapeScene({ query, html }, url, baseRelease) {
const token = query.meta('name=_token');
const trailerInfoUrl = `${origin}/episodes/trailer/sources/${release.entryId}?type=trailer`;
const trailerInfoRes = await http.post(trailerInfoUrl, null, {
'X-CSRF-Token': token,
'X-Requested-With': 'XMLHttpRequest',
headers: {
'X-CSRF-Token': token,
'X-Requested-With': 'XMLHttpRequest',
},
});
if (trailerInfoRes.ok && trailerInfoRes.body.sources.length > 0) {
@ -136,7 +138,9 @@ function scrapeProfile({ query }) {
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/episodes/search?page=${page}`; // TLS issues with teenfidelity.com, same overview on all sites
const res = await http.get(url, {
'X-Requested-With': 'XMLHttpRequest',
headers: {
'X-Requested-With': 'XMLHttpRequest',
},
});
if (res.ok && res.body.status === 'success') {
@ -157,7 +161,9 @@ async function fetchScene(url, channel, baseRelease) {
async function fetchProfile({ name: actorName }) {
const actorSlug = slugify(actorName);
const res = await qu.get(`https://www.kellymadison.com/models/${actorSlug}`, null, {
'X-Requested-With': 'XMLHttpRequest',
headers: {
'X-Requested-With': 'XMLHttpRequest',
},
});
if (res.ok) {

View File

@ -87,12 +87,13 @@ async function fetchScene(url, channel) {
/* API protected
async function fetchProfile({ name: actorName }, context , site) {
const session = bhttp.session();
const session = http.session();
await session.get(`https://tour.${site.slug}.com`);
await http.get(`https://tour.${site.slug}.com`, { session });
const url = `https://tour.${site.slug}.com/search-preview`;
const res = await session.post(url, { q: actorName }, {
const res = await http.post(url, { q: actorName }, {
session,
headers: {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
origin: `https://tour.${site.slug}.com`,

View File

@ -2,12 +2,12 @@
/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const bhttp = require('@thependulum/bhttp');
const { CookieJar } = Promise.promisifyAll(require('tough-cookie'));
const moment = require('moment');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { inchesToCm, lbsToKg } = require('../utils/convert');
const { cookieToData } = require('../utils/cookies');
@ -145,14 +145,14 @@ function getUrl(site) {
async function getSession(site) {
const cookieJar = new CookieJar();
const session = bhttp.session({ cookieJar });
const session = http.session({ cookieJar });
// const res = await session.get(url);
const sessionUrl = site.parameters?.siteId && !(site.parameters?.childSession || site.parent?.parameters?.childSession)
? site.parent.url
: site.url;
const res = await session.get(sessionUrl);
const res = await http.get(sessionUrl, { session });
if (res.statusCode === 200) {
const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
@ -215,7 +215,8 @@ async function fetchLatest(site, page = 1) {
? `https://site-api.project1service.com/v2/releases?dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`
: `https://site-api.project1service.com/v2/releases?collectionId=${siteId}&dateReleased=<${beforeDate}&limit=${limit}&offset=${limit * (page - 1)}&orderBy=-dateReleased&type=scene`;
const res = await session.get(apiUrl, {
const res = await http.get(apiUrl, {
session,
headers: {
Instance: instanceToken,
Origin: site.url,
@ -236,7 +237,8 @@ async function fetchUpcoming(site) {
const apiUrl = 'https://site-api.project1service.com/v2/upcoming-releases';
const res = await session.get(apiUrl, {
const res = await http.get(apiUrl, {
session,
headers: {
Instance: instanceToken,
Origin: site.url,
@ -260,7 +262,8 @@ async function fetchScene(url, site, baseScene) {
const entryId = url.match(/\d+/)[0];
const { session, instanceToken } = await getSession(site);
const res = await session.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
session,
headers: {
Instance: instanceToken,
},
@ -277,7 +280,8 @@ async function fetchProfile({ name: actorName }, networkOrNetworkSlug, actorPath
// const url = `https://www.${networkOrNetworkSlug.slug || networkOrNetworkSlug}.com`;
const { session, instanceToken } = await getSession(networkOrNetworkSlug);
const res = await session.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURI(actorName)}`, {
session,
headers: {
Instance: instanceToken,
},
@ -291,8 +295,9 @@ async function fetchProfile({ name: actorName }, networkOrNetworkSlug, actorPath
const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;
const [actorRes, actorReleasesRes] = await Promise.all([
bhttp.get(actorUrl),
session.get(actorReleasesUrl, {
http.get(actorUrl),
http.get(actorReleasesUrl, {
session,
headers: {
Instance: instanceToken,
},

View File

@ -6,7 +6,7 @@ const moment = require('moment');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const { ex, get } = require('../utils/q');
const qu = require('../utils/q');
function titleExtractor(pathname) {
const components = pathname.split('/')[2].split('-');
@ -102,24 +102,24 @@ function scrapeScene(html, url, site) {
}
async function fetchActorReleases(url) {
const res = await get(url);
const res = await qu.get(url);
return res.ok
? res.item.qu.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
? res.item.query.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
: [];
}
async function scrapeProfile(html) {
const { qu } = ex(html);
const { query } = qu.extract(html);
const profile = {};
profile.description = qu.q('.bio_about_text', true);
profile.description = query.q('.bio_about_text', true);
const avatar = qu.q('img.performer-pic', 'src');
const avatar = query.q('img.performer-pic', 'src');
if (avatar) profile.avatar = `https:${avatar}`;
const releases = qu.urls('.scene-item > a:first-child');
const otherPages = qu.urls('.pagination a:not([rel=next]):not([rel=prev])');
const releases = query.urls('.scene-item > a:first-child');
const otherPages = query.urls('.pagination a:not([rel=next]):not([rel=prev])');
const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));
profile.releases = releases.concat(olderReleases.flat());

View File

@ -1,10 +1,10 @@
'use strict';
const bhttp = require('bhttp');
const blake2 = require('blake2');
const knex = require('../knex');
const { ex, ctxa } = require('../utils/q');
const http = require('../utils/http');
async function getSiteSlugs() {
return knex('sites')
@ -124,7 +124,7 @@ async function scrapeScene(html, site, url, metaSiteSlugs) {
async function fetchLatest(site, page = 1) {
const url = `${site.url}/movies/page-${page}`;
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeLatest(res.body.toString(), site);
@ -134,7 +134,7 @@ async function fetchLatest(site, page = 1) {
}
async function fetchScene(url, site, release) {
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeScene(res.body.toString(), site, url, release?.meta.siteSlugs);

View File

@ -1,144 +0,0 @@
'use strict';
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
async function getTrailer(entryId) {
const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', {
setId: entryId,
});
if (trailerRes.statusCode === 200) {
return {
poster: trailerRes.body.TrailerImg,
trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback,
};
}
return null;
}
function scrapeLatestScene(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const entryId = $('li').attr('id');
const sceneLinkElement = $('#scene_title_border a');
const url = `${site.url}/${sceneLinkElement.attr('href')}`;
const title = sceneLinkElement.attr('title').replace(/\u00E2\u0080\u0099/g, '\''); // replace weird apostrophes
const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas
const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate();
const poster = $('a:nth-child(2) > img').attr('src');
const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray();
const stars = $('img[src*="/star.png"]')
.toArray()
.map(element => $(element).attr('src'))
.length || 0;
return {
url,
entryId,
title,
actors,
date,
poster,
photos,
rating: {
stars,
},
site,
};
}
async function scrapeScene(html, url, site) {
const { document } = new JSDOM(html).window;
const release = { url, site };
release.entryId = document.querySelector('input#set_ID').value;
release.title = document.querySelector('title').textContent;
release.description = document.querySelector('.player_data').textContent.trim();
const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent;
const [minutes, seconds] = durationString.match(/\d+/g);
release.duration = Number(minutes) * 60 + Number(seconds);
release.tags = document.querySelector('meta[name="keywords"]').content.split(',');
const { poster, trailer } = await getTrailer(release.entryId);
release.poster = poster;
release.trailer = { src: trailer };
return release;
}
function scrapeFallbackLanding(html) {
const { document } = new JSDOM(html).window;
return document.querySelector('input#set_ID').value;
}
async function scrapeFallbackScene(html, entryId, url, site) {
const { document } = new JSDOM(html).window;
const release = { url, entryId, site };
release.title = document.querySelector('.popup_data_set_head label').textContent;
release.description = document.querySelector('.popup_data_set_des p').textContent.trim();
release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate();
release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent);
const { poster, trailer } = await getTrailer(release.entryId);
release.poster = poster;
release.trailer = { src: trailer };
release.channel = document.querySelector('.popup_left_top div img').alt;
return release;
}
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`;
const pagedUrl = `${channel.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`;
const res = page === 1
? await bhttp.get(url)
: await bhttp.get(pagedUrl);
const elements = JSON.parse(res.body.toString());
const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, channel)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php
return latest;
}
async function fetchScene(url, site) {
const res = await bhttp.get(url);
if (res.statusCode === 200) {
if (site.isNetwork) {
const entryId = scrapeFallbackLanding(res.body.toString(), url);
const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {
setId: entryId,
});
return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site);
}
return scrapeScene(res.body.toString(), url, site);
}
return null;
}
module.exports = {
fetchLatest,
fetchScene,
};

View File

@ -45,7 +45,7 @@ function scrapeScene({ query }, url, channel) {
release.tags = query.cnts(details.genres, 'a');
release.poster = query.img('#video-poster', 'data-poster', { origin: channel.url });
release.photos = query.imgs('#gallery .photo-item img', 'src', { origin: channel.url });
release.photos = query.imgs('#gallery .photo-item img', 'data-src', { origin: channel.url });
release.trailer = query.video('.trailer source');
@ -71,10 +71,10 @@ async function fetchLatest(channel, page = 1) {
const headers = { 'X-Requested-With': 'XMLHttpRequest' };
for (let i = 0; i < page - 1; i += 1) {
await http.get(url, headers, { useSession: session }); // eslint-disable-line no-await-in-loop
await http.get(url, { headers, session }); // eslint-disable-line no-await-in-loop
}
const res = await http.get(url, headers, { useSession: session });
const res = await http.get(url, { headers, session });
if (res.ok) {
const items = qu.extractAll(res.body.snippets?.['snippet--videoItems'] || res.body, '.product-item');

View File

@ -1,9 +1,10 @@
'use strict';
const bhttp = require('@thependulum/bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
const ethnicityMap = {
White: 'Caucasian',
};
@ -59,8 +60,8 @@ async function fetchProfile({ name: actorName }) {
const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
const [modelRes, pornstarRes] = await Promise.all([
bhttp.get(modelUrl),
bhttp.get(pornstarUrl),
http.get(modelUrl),
http.get(pornstarUrl),
]);
const model = modelRes.statusCode === 200 && await scrapeProfile(modelRes.body.toString(), modelUrl, actorName);
@ -75,7 +76,7 @@ async function fetchProfile({ name: actorName }) {
*/
const pornstarUrl = `https://pornhub.com/pornstar/${actorSlug}`;
const pornstarRes = await bhttp.get(pornstarUrl);
const pornstarRes = await http.get(pornstarUrl);
return scrapeProfile(pornstarRes.body.toString(), pornstarUrl, actorName);
}

View File

@ -1,17 +1,17 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const { get, geta } = require('../utils/q');
const slugify = require('../utils/slugify');
const http = require('../utils/http');
async function getPhotos(entryId, site) {
const { hostname } = new URL(site.url);
const res = await bhttp.get(`https://${hostname}/gallery.php?type=highres&id=${entryId}`);
const res = await http.get(`https://${hostname}/gallery.php?type=highres&id=${entryId}`);
const html = res.body.toString();
const $ = cheerio.load(html, { normalizeWhitespace: true });
@ -159,18 +159,18 @@ async function fetchLatest(site, page = 1) {
const { hostname } = new URL(site.url);
if (hostname.match('private.com')) {
const res = await bhttp.get(`${site.url}/${page}/`);
const res = await http.get(`${site.url}/${page}/`);
return scrapeLatest(res.body.toString(), site);
}
const res = await bhttp.get(`${site.url}/scenes/${page}/`);
const res = await http.get(`${site.url}/scenes/${page}/`);
return scrapeLatest(res.body.toString(), site);
}
async function fetchScene(url, site) {
const res = await bhttp.get(url);
const res = await http.get(url);
return scrapeScene(res.body.toString(), url, site);
}

View File

@ -1,8 +1,9 @@
'use strict';
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const http = require('../utils/http');
const {
scrapeLatestX,
fetchLatest,
@ -24,7 +25,7 @@ function scrapeLatestClassic(html, site) {
}
async function fetchClassic(site, page) {
const res = await bhttp.get(`${site.url}/scenes?page=${page}`);
const res = await http.get(`${site.url}/scenes?page=${page}`);
if (res.statusCode === 200) {
return scrapeLatestClassic(res.body.toString(), site);

View File

@ -1,9 +1,8 @@
'use strict';
const bhttp = require('bhttp');
const { ex, exa, get } = require('../utils/q');
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { heightToCm, lbsToKg } = require('../utils/convert');
function scrapePhotos(html) {
@ -19,7 +18,7 @@ function scrapePhotos(html) {
}
async function fetchPhotos(url) {
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapePhotos(res.body.toString(), url);
@ -198,7 +197,7 @@ async function scrapeProfile(html, actorUrl, withReleases) {
async function fetchLatest(site, page = 1) {
const latestPath = site.parameters?.path || '/big-boob-videos';
const url = `${site.url}${latestPath}?page=${page}`;
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeAll(res.body.toString(), site);
@ -208,7 +207,7 @@ async function fetchLatest(site, page = 1) {
}
async function fetchScene(url, site) {
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeScene(res.body.toString(), url, site);
@ -227,7 +226,7 @@ async function fetchProfile({ name: actorName }, context, include, page = 1, sou
const url = sources[source];
const res = await bhttp.get(url, {
const res = await http.get(url, {
followRedirects: false,
});
@ -235,7 +234,7 @@ async function fetchProfile({ name: actorName }, context, include, page = 1, sou
const actorUrl = scrapeModels(res.body.toString(), actorName);
if (actorUrl) {
const actorRes = await bhttp.get(actorUrl);
const actorRes = await http.get(actorUrl);
if (actorRes.statusCode === 200) {
return scrapeProfile(actorRes.body.toString(), actorUrl, include.scenes);

View File

@ -1,180 +0,0 @@
'use strict';
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
function extractTitle(pathname) {
return pathname
.split('/')
.slice(-2)[0]
.split('_')
.map(seg => `${seg.charAt(0).toUpperCase()}${seg.slice(1)}`)
.join(' ');
}
function extractActors(str) {
return str
.split(/,|\band\b/ig)
.filter(actor => !/\.{3}/.test(actor))
.map(actor => actor.trim())
.filter(actor => actor.length > 0);
}
function scrapeLatest(html, site) {
const { document } = new JSDOM(html).window;
const scenes = Array.from(document.querySelectorAll('#updatesList li.grey, #updatesList li.white'));
return scenes.map((scene) => {
const release = { site };
const link = scene.querySelector('.info a');
const poster = scene.querySelector('img');
const { pathname } = new URL(link);
[release.entryId] = poster.id.match(/\d+/);
release.url = `https://www.teamskeet.com${pathname}`;
release.title = extractTitle(pathname);
release.date = moment.utc(scene.querySelector('strong').textContent, 'MM/DD/YYYY').toDate();
const photos = Array.from({ length: 5 }, (_value, index) => poster.dataset.original.replace(/\d+.jpg/, `${String(index + 1).padStart(2, '0')}.jpg`));
[release.poster] = photos;
release.photos = photos.slice(1);
const actors = scene.querySelector('div span[rel="test"]').textContent;
release.actors = extractActors(actors);
return release;
});
}
function scrapeScene(html, site, url) {
const { document } = new JSDOM(html).window;
const release = { site };
release.entryId = document.querySelector('#story-and-tags .scene_rater').attributes.rel.value;
release.description = document.querySelector('#story-and-tags td:nth-child(2) div').textContent;
const [actors, title, channel] = document.querySelector('title').textContent.split('|').map(item => item.trim());
release.url = url;
release.title = title;
release.actors = extractActors(actors);
release.channel = channel.toLowerCase();
release.tags = Array.from(document.querySelectorAll('#story-and-tags tr:nth-child(2) a'), el => el.rel);
const date = document.querySelector('h3 ~ div:nth-child(4), h3 ~ div div.gray:not(.scene_rater)').textContent.split(':')[1].trim();
release.date = moment.utc(date, 'MMMM Do, YYYY').toDate();
const { poster } = document.querySelector('video');
if (poster && !/gen/.test(poster)) release.poster = [poster.replace('low', 'hi'), poster];
const siteId = document.querySelector('#story-and-tags img').src.match(/\w+.jpg/)[0].replace('.jpg', '');
const actorsSlug = document.querySelector('h3 a').href.split('/').slice(-2)[0];
release.photos = Array.from({ length: 5 }, (value, index) => `https://images.psmcdn.net/teamskeet/${siteId}/${actorsSlug}/shared/scenes/new/${String(index + 1).padStart(2, '0')}.jpg`);
const trailer = document.querySelector('div.right.gray a').href;
if (trailer) release.trailer = { src: trailer };
return release;
}
function scrapeSceneA(html, site, sceneX, url) {
const scene = sceneX || new JSDOM(html).window.document;
const release = { site };
release.description = scene.querySelector('.scene-story').textContent.replace('...read more', '...').trim();
release.date = moment.utc(scene.querySelector('.scene-date').textContent, 'MM/DD/YYYY').toDate();
release.actors = Array.from(scene.querySelectorAll('.starring span'), el => extractActors(el.textContent)).flat();
const durationString = scene.querySelector('.time').textContent.trim();
const duration = ['00'].concat(durationString.split(':')).slice(-3).join(':'); // ensure hh:mm:ss
release.duration = moment.duration(duration).asSeconds();
if (sceneX) {
const titleEl = scene.querySelector(':scope > a');
release.url = titleEl.href;
release.entryId = titleEl.id;
release.title = titleEl.title;
const [poster, ...photos] = Array.from(scene.querySelectorAll('.scene img'), el => el.src);
release.poster = [poster.replace('bio_big', 'video'), poster];
release.photos = photos;
}
if (!sceneX) {
release.title = scene.querySelector('.title span').textContent;
release.url = url;
release.poster = scene.querySelector('video').poster;
release.photos = [release.poster.replace('video', 'bio_small'), release.poster.replace('video', 'bio_small2')];
}
const [, entryIdA, entryIdB] = new URL(release.url).pathname.split('/');
release.entryId = entryIdA === 'scenes' ? entryIdB : entryIdA;
return release;
}
function scrapeLatestA(html, site) {
const { document } = new JSDOM(html).window;
const scenes = Array.from(document.querySelectorAll('.scenewrapper'));
return scenes.map(scene => scrapeSceneA(null, site, scene));
}
async function fetchLatestTeamSkeet(site, page = 1) {
const url = `https://www.teamskeet.com/t1/updates/load?fltrs[site]=${site.parameters.id}&page=${page}&view=newest&fltrs[time]=ALL&order=DESC`;
const res = await bhttp.get(url);
if (res.statusCode === 200) {
return scrapeLatest(res.body.toString(), site);
}
return null;
}
async function fetchLatestA(site) {
const url = `${site.url}/scenes`;
const res = await bhttp.get(url);
if (res.statusCode === 200) {
return scrapeLatestA(res.body.toString(), site);
}
return null;
}
async function fetchLatest(site, page = 1) {
if (site.parameters.id) {
return fetchLatestTeamSkeet(site, page);
}
if (site.parameters.scraper === 'A') {
return fetchLatestA(site, page);
}
return null;
}
async function fetchScene(url, site) {
const session = bhttp.session(); // resolve redirects
const res = await session.get(url);
if (site.parameters?.scraper === 'A') {
return scrapeSceneA(res.body.toString(), site, null, url);
}
return scrapeScene(res.body.toString(), site, url);
}
module.exports = {
fetchLatest,
fetchScene,
};

View File

@ -74,9 +74,14 @@ async function scrapeScene({ query }, url) {
release.photos = query.imgs('.detail-grabs img');
const streamData = await http.get(`${origin}/video/source/${entryId}`, {
host,
referer: url,
}, { queueMethod: '5s' });
headers: {
host,
referer: url,
},
}, {
interval: 5000,
concurrency: 1,
});
if (streamData.ok && streamData.body.status === 'success') {
release.trailer = {
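
The hunk above also shows per-call rate-limit overrides: the old queueMethod string ('5s') becomes explicit numbers in a separate options object. Restated with the values from this diff:

// at most one concurrent request, at least 5000 ms apart, for this endpoint only
const streamData = await http.get(`${origin}/video/source/${entryId}`, {
  headers: { host, referer: url },
}, {
  interval: 5000,
  concurrency: 1,
});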

View File

@ -1,10 +1,9 @@
'use strict';
/* eslint-disable no-unused-vars */
const bhttp = require('@thependulum/bhttp');
const { get, ed } = require('../utils/q');
const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
function scrapeLatestNative(scenes, site) {
@ -72,7 +71,7 @@ async function fetchLatestNative(site, page = 1) {
}
const apiUrl = `${site.url}/videos/api/?limit=50&offset=${(page - 1) * 50}&sort=datedesc`;
const res = await bhttp.get(apiUrl, {
const res = await http.get(apiUrl, {
decodeJSON: true,
});
@ -107,7 +106,7 @@ async function fetchSceneWrapper(url, site, release) {
if (scene.date - new Date(site.parameters?.lastNative) <= 0) {
// scene is probably still available on Vivid site, use search API to get URL and original date
const searchUrl = `${site.url}/videos/api/?limit=10&sort=datedesc&search=${encodeURI(scene.title)}`;
const searchRes = await bhttp.get(searchUrl, {
const searchRes = await http.get(searchUrl, {
decodeJSON: true,
});

View File

@ -4,7 +4,7 @@
const Promise = require('bluebird');
const moment = require('moment');
const { get, post } = require('../utils/http');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const genderMap = {
@ -45,13 +45,15 @@ function getAvatarFallbacks(avatar) {
async function getTrailer(scene, site, url) {
const qualities = [360, 480, 720, 1080, 2160];
const tokenRes = await post(`${site.url}/api/__record_tknreq`, {
const tokenRes = await http.post(`${site.url}/api/__record_tknreq`, {
file: scene.previewVideoUrl1080P,
sizes: qualities.join('+'),
type: 'trailer',
}, {
referer: url,
origin: site.url,
headers: {
referer: url,
origin: site.url,
},
});
if (!tokenRes.ok) {
@ -59,7 +61,7 @@ async function getTrailer(scene, site, url) {
}
const trailerUrl = `${site.url}/api${tokenRes.body.data.url}`;
const trailersRes = await post(trailerUrl, null, { referer: url });
const trailersRes = await http.post(trailerUrl, null, { headers: { referer: url } });
if (trailersRes.ok) {
return qualities.map(quality => (trailersRes.body[quality] ? {
@ -155,9 +157,9 @@ async function scrapeScene(data, url, site, baseRelease) {
async function fetchActorReleases(pages, model, origin) {
const releasesPerPage = await Promise.map(pages, async (page) => {
const url = `${origin}/api${model.targetUrl}?page=${page}`;
const res = await get(url);
const res = await http.get(url);
if (res.code === 200) {
if (res.status === 200) {
return scrapeAll(res.body.data.videos.videos, null, origin);
}
@ -203,46 +205,46 @@ async function scrapeProfile(data, origin, withReleases) {
async function fetchLatest(site, page = 1) {
const url = `${site.url}/api/videos?page=${page}`;
const res = await get(url);
const res = await http.get(url);
if (res.code === 200) {
if (res.status === 200) {
return scrapeAll(res.body.data.videos, site);
}
return res.code;
return res.status;
}
async function fetchUpcoming(site) {
const apiUrl = `${site.url}/api`;
const res = await get(apiUrl);
const res = await http.get(apiUrl);
if (res.code === 200) {
if (res.status === 200) {
return scrapeUpcoming(res.body.data.nextScene, site);
}
return res.code;
return res.status;
}
async function fetchScene(url, site, baseRelease) {
const { origin, pathname } = new URL(url);
const apiUrl = `${origin}/api${pathname}`;
const res = await get(apiUrl);
const res = await http.get(apiUrl);
if (res.code === 200) {
if (res.status === 200) {
return scrapeScene(res.body.data, url, site, baseRelease);
}
return res.code;
return res.status;
}
async function fetchProfile({ name: actorName }, { site }, include) {
const origin = site.url;
const actorSlug = slugify(actorName);
const url = `${origin}/api/${actorSlug}`;
const res = await get(url);
const res = await http.get(url);
if (res.code === 200) {
if (res.status === 200) {
return scrapeProfile(res.body.data, origin, include.scenes);
}
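Two calling-convention changes land in this file: request headers move from a positional argument into an options.headers property, and status checks switch from res.code to the normalized res.status (or the res.ok shorthand). A condensed sketch of the token request from the getTrailer hunk above, with the payload and error handling simplified:

const http = require('../utils/http');

// Sketch only; payload fields follow the getTrailer hunk above.
async function requestTrailerToken(site, scene, url) {
    const res = await http.post(`${site.url}/api/__record_tknreq`, {
        file: scene.previewVideoUrl1080P,
        type: 'trailer',
    }, {
        headers: { referer: url, origin: site.url }, // nested, no longer positional
    });

    // res.ok is true for any 2xx status; res.status replaces res.code.
    return res.ok ? res.body : null;
}

Since the new module's defaults include encodeJSON: true, a plain-object body like this is serialized as JSON without further ceremony.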

View File

@ -1,8 +1,8 @@
'use strict';
const bhttp = require('bhttp');
const { ex, ctxa } = require('../utils/q');
// const slugify = require('../utils/slugify');
const { ex, ctxa } = require('../utils/q');
const http = require('../utils/http');
function getLicenseCode(html) {
const licensePrefix = 'license_code: \'';
@ -178,7 +178,7 @@ function scrapeScene(html, url) {
async function fetchLatest(site, page = 1) {
const url = `https://vogov.com/latest-videos/?sort_by=post_date&from=${page}`;
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeLatest(res.body.toString(), site);
@ -188,7 +188,7 @@ async function fetchLatest(site, page = 1) {
}
async function fetchScene(url) {
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeScene(res.body.toString(), url);

View File

@ -1,9 +1,10 @@
'use strict';
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
function scrapeLatest(html, site) {
const { document } = new JSDOM(html).window;
const { origin } = new URL(site.url);
@ -112,7 +113,7 @@ function scrapeScene(html, site, url) {
async function fetchLatest(site, page = 1) {
const url = `${site.url}?page=${page}`;
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeLatest(res.body.toString(), site);
@ -122,7 +123,7 @@ async function fetchLatest(site, page = 1) {
}
async function fetchScene(url, site) {
const res = await bhttp.get(url);
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeScene(res.body.toString(), site, url);

View File

@ -1,11 +1,10 @@
'use strict';
const bhttp = require('@thependulum/bhttp');
const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
const http = require('../utils/http');
async function fetchScene(url, site) {
const res = await bhttp.get(url);
const res = await http.get(url);
const release = await scrapeScene(res.body.toString(), url, site);

146
src/utils/http-legacy.js Normal file
View File

@ -0,0 +1,146 @@
'use strict';
const util = require('util');
const stream = require('stream');
const config = require('config');
const tunnel = require('tunnel');
const bhttp = require('@thependulum/bhttp');
const taskQueue = require('promise-task-queue');
const pipeline = util.promisify(stream.pipeline);
const logger = require('../logger')(__filename);
const defaultHeaders = {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
};
const defaultOptions = {
responseTimeout: 30000,
};
const proxyAgent = tunnel.httpsOverHttp({
proxy: {
host: config.proxy.host,
port: config.proxy.port,
},
});
function useProxy(url) {
if (!config.proxy.enable) {
return false;
}
const { hostname } = new URL(url);
return config.proxy.hostnames.includes(hostname);
}
const queue = taskQueue();
const defaultQueueMethod = '20p';
async function handler({
url,
method = 'GET',
body,
headers = {},
options = {},
}) {
if (body) {
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
} else {
logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
}
const reqOptions = {
headers: {
...(options?.defaultHeaders !== false && defaultHeaders),
...headers,
},
...defaultOptions,
...options,
...(options?.timeout && { responseTimeout: options?.timeout }),
};
if (useProxy(url)) {
reqOptions.agent = proxyAgent;
}
const res = ['POST', 'PUT', 'PATCH'].includes(method.toUpperCase())
? await (options.useSession || bhttp)[method.toLowerCase()](url, body, reqOptions)
: await (options.useSession || bhttp)[method.toLowerCase()](url, reqOptions);
if (options?.stream && options?.destination) {
await pipeline(res, ...(options?.transforms || []), options?.destination);
}
const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
const json = Buffer.isBuffer(res.body) ? null : res.body;
return {
...res,
originalRes: res,
html,
json,
pipe: res.pipe,
ok: res.statusCode >= 200 && res.statusCode <= 299,
code: res.statusCode,
status: res.statusCode,
};
}
queue.on('concurrencyReached:http', () => {
logger.silly('Queueing requests');
});
queue.define('20p', handler, {
concurrency: 20,
});
queue.define('1s', handler, {
interval: 1,
});
queue.define('5s', handler, {
interval: 5,
});
async function get(url, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'GET',
url,
headers,
options,
});
}
async function head(url, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'HEAD',
url,
headers,
options,
});
}
async function post(url, body, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'POST',
url,
body,
headers,
options,
});
}
function session(headers, options) {
return bhttp.session({
headers,
options,
});
}
module.exports = {
get,
post,
head,
session,
};
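The old module is preserved verbatim as http-legacy.js, so its promise-task-queue profiles stay callable while remaining scrapers migrate. A sketch of the legacy convention it keeps, with positional headers and the queue profile selected by name (the URL is a placeholder):

const legacyHttp = require('./http-legacy');

async function legacyExample() {
    // Default profile '20p': up to 20 concurrent requests.
    const fast = await legacyHttp.get('https://www.example.com/api');

    // Named profiles trade concurrency for a minimum interval between tasks.
    const slow = await legacyHttp.get('https://www.example.com/api', null, { queueMethod: '1s' });

    return [fast.status, slow.status];
}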

View File

@ -1,21 +1,24 @@
'use strict';
const config = require('config');
const bhttp = require('@thependulum/bhttp');
const util = require('util');
const stream = require('stream');
const config = require('config');
const tunnel = require('tunnel');
const bhttp = require('@thependulum/bhttp');
const taskQueue = require('promise-task-queue');
const Bottleneck = require('bottleneck');
const { JSDOM } = require('jsdom');
const logger = require('../logger')(__filename);
const argv = require('../argv');
const pipeline = util.promisify(stream.pipeline);
const logger = require('../logger')(__filename);
const defaultHeaders = {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
};
const limiters = {};
const defaultOptions = {
responseTimeout: 30000,
encodeJSON: true,
headers: {
'user-agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:15.0) Gecko/20100101 Firefox/15.0.1',
},
};
const proxyAgent = tunnel.httpsOverHttp({
@ -31,116 +34,139 @@ function useProxy(url) {
}
const { hostname } = new URL(url);
return config.proxy.hostnames.includes(hostname);
}
const queue = taskQueue();
const defaultQueueMethod = '20p';
async function handler({
url,
method = 'GET',
body,
headers = {},
options = {},
}) {
if (body) {
logger.silly(`${method.toUpperCase()} ${url} with ${JSON.stringify(body)} ${options.queueMethod || defaultQueueMethod}`);
} else {
logger.silly(`${method.toUpperCase()} ${url} ${options.queueMethod || defaultQueueMethod}`);
function getLimiterValue(prop, options, hostname) {
if (argv[prop] !== undefined) {
return argv[prop];
}
const reqOptions = {
headers: {
...(options?.defaultHeaders !== false && defaultHeaders),
...headers,
},
if (options[prop] !== undefined) {
return options[prop];
}
if (config.limits[hostname]?.enable !== false && config.limits[hostname]?.[prop] !== undefined) {
return config.limits[hostname][prop];
}
return config.limits.default[prop];
}
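getLimiterValue resolves each limit in a fixed order: a command-line argument wins, then a per-request option, then the per-host entry in config.limits (unless that entry sets enable: false), and finally config.limits.default. A trace with hypothetical values:

// getLimiterValue('interval', {}, hostname), assuming:
//   argv.interval                  -> undefined (no CLI flag), fall through
//   options.interval               -> undefined, fall through
//   config.limits[hostname]        -> { enable: false, interval: 1000 }
//                                     skipped: the host entry is disabled
//   config.limits.default.interval -> returned as the effective value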
function getLimiter(options = {}, url) {
const { hostname } = new URL(url);
const interval = getLimiterValue('interval', options, hostname);
const concurrency = getLimiterValue('concurrency', options, hostname);
if (!limiters[interval]?.[concurrency]) {
limiters[interval] = limiters[interval] || {};
limiters[interval][concurrency] = new Bottleneck({
minTime: interval,
maxConcurrent: concurrency,
});
}
return limiters[interval][concurrency];
}
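Note the memoization: limiters are cached per (interval, concurrency) pair rather than per host, so every hostname that resolves to the same pair shares a single Bottleneck instance and its concurrency pool. A sketch, assuming neither host has CLI, per-request, or config overrides:

const limiterA = getLimiter({}, 'https://www.siteone.example/latest');
const limiterB = getLimiter({}, 'https://www.sitetwo.example/latest');
// Both resolve to the default pair, so limiterA === limiterB and the
// maxConcurrent slots are shared across the two hosts.

Sharing instances keeps the limiter map small, at the cost of the default pool being global rather than per-host; a host needing isolation gets it by specifying a distinct interval or concurrency in config.limits.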
async function request(method = 'get', url, body, requestOptions = {}, limiter) {
const http = requestOptions.session || bhttp;
const options = {
...defaultOptions,
...options,
...(options?.timeout && { responseTimeout: options?.timeout }),
...requestOptions,
responseTimeout: requestOptions.responseTimeout || requestOptions.timeout || 60000,
stream: !!requestOptions.destination,
session: null,
};
if (useProxy(url)) {
reqOptions.agent = proxyAgent;
const withProxy = useProxy(url);
if (withProxy) {
options.agent = proxyAgent;
}
const res = ['POST', 'PUT', 'PATCH'].includes(method.toUpperCase())
? await (options.useSession || bhttp)[method.toLowerCase()](url, body, reqOptions)
: await (options.useSession || bhttp)[method.toLowerCase()](url, reqOptions);
logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}) ${url}`);
if (options?.stream && options?.destination) {
await pipeline(res, ...(options?.transforms || []), options?.destination);
const res = await (body
? http[method](url, body, options)
: http[method](url, options));
const resIsOk = res.statusCode >= 200 && res.statusCode <= 299;
if (options.destination) {
// res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`));
await pipeline(res, ...(options.transforms || []), options.destination);
}
const html = Buffer.isBuffer(res.body) ? res.body.toString() : null;
const json = Buffer.isBuffer(res.body) ? null : res.body;
if (Buffer.isBuffer(res.body)) {
const html = res.body.toString();
const window = new JSDOM(html).window;
return {
...res,
body: html,
html,
status: res.statusCode,
document: window.document,
window,
ok: resIsOk,
};
}
return {
...res,
originalRes: res,
html,
json,
pipe: res.pipe,
ok: res.statusCode >= 200 && res.statusCode <= 299,
code: res.statusCode,
body: res.body,
status: res.statusCode,
ok: res.statusCode >= 200 && res.statusCode <= 299,
};
}
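For buffer (HTML) responses the wrapper now returns a pre-parsed JSDOM window and document alongside the decoded body, so individual scrapers no longer construct JSDOM themselves; JSON responses pass res.body through untouched. A sketch of what a caller sees, with a hypothetical URL and selector:

// Callers normally go through the exported get/post wrappers below.
async function parseExample() {
    const res = await request('get', 'https://www.example.com/latest');

    if (res.ok) {
        // The HTML body is decoded and parsed once, centrally.
        return res.document.querySelector('.scene-title')?.textContent;
    }

    return null;
}

Because the original response is spread into the result, older call sites that still check res.statusCode, like the vogov and JSDOM-based scrapers above, keep working unchanged.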
queue.on('concurrencyReached:http', () => {
logger.silly('Queueing requests');
});
async function scheduleRequest(method = 'get', url, body, options) {
const limiter = getLimiter(options, url);
queue.define('20p', handler, {
concurrency: 20,
});
queue.define('1s', handler, {
interval: 1,
});
queue.define('5s', handler, {
interval: 5,
});
async function get(url, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'GET',
url,
headers,
options,
});
return limiter.schedule(() => request(method, url, body, options, limiter));
}
async function head(url, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'HEAD',
url,
headers,
options,
});
async function get(url, options) {
return scheduleRequest('get', url, null, options);
}
async function post(url, body, headers, options) {
return queue.push(options?.queueMethod || defaultQueueMethod, {
method: 'POST',
url,
body,
headers,
options,
});
async function post(url, body, options) {
return scheduleRequest('post', url, body, options);
}
function session(headers, options) {
return bhttp.session({
headers,
options,
});
async function put(url, body, options) {
return scheduleRequest('put', url, body, options);
}
async function patch(url, body, options) {
return scheduleRequest('patch', url, body, options);
}
async function del(url, options) {
return scheduleRequest('delete', url, null, options);
}
async function head(url, options) {
return scheduleRequest('head', url, null, options);
}
function getSession(options) {
return bhttp.session(options);
}
module.exports = {
get,
post,
head,
session,
post,
delete: del,
put,
patch,
session: getSession,
getSession,
};
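Taken together, the exported surface accepts per-request limiter overrides that getLimiterValue consults ahead of the config file. A usage sketch, with the hostname and values chosen purely for illustration:

const http = require('../utils/http');

async function throttledFetch() {
    // interval/concurrency here override config.limits for this call only;
    // a matching (1000ms, 1 concurrent) Bottleneck instance is created or reused.
    const res = await http.get('https://www.example.com/scenes', {
        interval: 1000,
        concurrency: 1,
    });

    return res.ok ? res.html : null;
}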

View File

@ -457,8 +457,8 @@ function extractAll(htmlValue, selector) {
async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) {
const res = await (method === 'post'
? http.post(urlValue, body, headers, options)
: http[method](urlValue, headers, options));
? http.post(urlValue, body, { ...options, headers })
: http[method](urlValue, { ...options, headers }));
if (res.ok) {
const item = queryAll
@ -494,7 +494,7 @@ async function post(urlValue, body, selector, headers, options) {
}
async function getAll(urlValue, selector, headers, options) {
return request('get,', urlValue, selector, headers, options, true);
return request('get', urlValue, null, selector, headers, options, true);
}
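The getAll fix corrects two defects in one line: the stray comma in 'get,' meant the method lookup on the http module failed, and the missing null body argument shifted selector, headers, and options each one position to the left. Traced against request()'s parameter list:

// request(method, urlValue, body, selector, headers, options, queryAll)
// before: request('get,', urlValue, selector, headers, options, true)
//   -> 'get,' is not a valid method, and selector lands in the body slot
// after:  request('get', urlValue, null, selector, headers, options, true)
//   -> every argument aligns with its parameter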
async function postAll(urlValue, body, selector, headers, options) {