// traxxx/src/scrapers/kellymadison.js
//
// 225 lines
// 6.5 KiB
// JavaScript
// Executable File

'use strict';
const config = require('config');
const unprint = require('unprint');
// const { parse } = require('csv-parse/sync');
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert');
// Matches field names that hold photo thumbnail URLs, e.g. `thumb1_url` or `episode_thumb_image_2_url`.
// The pattern is unanchored, so it matches anywhere inside a key.
const thumbKeyRegex = /(thumb\d+_url)|(episode_thumb_image_\d+_url)/;
// Maps the (lower-cased) resolution labels found on trailer entries to a vertical pixel height.
const qualityMap = {
	'480p': 480,
	mobile: 720, // as of recent, might've been lower in the past
	'720p': 720,
	'1080p': 1080,
	'2k': 1440,
	'4k': 2160,
	'5k': 2880, // 5120x2880; previously mistyped as 2280
	'8k': 4320,
};
/**
 * Convert a single episode record from the affiliate API into a release object.
 *
 * @param {object} data - Episode record; both camelCase and snake_case field variants are read.
 * @param {object} channel - Channel entity; `url` and `parameters.short` are read.
 * @returns {object} Release with entryId, url, shootId, title, description, date, duration,
 *   actors, poster, photos, trailer, tags and photoCount (each only when the source field is present).
 */
function scrapeSceneApi(data, channel) {
	const release = {};

	release.entryId = data.id;

	if (data.url) {
		// provided URL works but always points to 8KMilfs instead of dedicated site
		const { pathname } = new URL(data.url);

		release.url = unprint.prefixUrl(pathname, channel.url);
	}

	if (channel.parameters.short && data.sequence_number) {
		release.shootId = `${channel.parameters.short} #${data.sequence_number}`;
	}

	release.title = data.title;
	release.description = data.short_description;
	release.date = new Date(data.publish_on);

	if (data.fullEpisodeLength) {
		release.duration = data.fullEpisodeLength;
	} else if (data.full_episode_minutes) {
		// full_episode_seconds is always available so far, but no need to count on it.
		// Only the minutes are scaled to seconds; the seconds remainder is added as-is.
		release.duration = (Number(data.full_episode_minutes) * 60) + Number(data.full_episode_seconds || 0);
	}

	// tolerate records without a models list instead of failing the whole page
	release.actors = (data.models || []).map((model) => ({
		name: model.name,
		gender: model.sex?.toLowerCase(),
		url: unprint.prefixUrl(`/models/${model.slug}`, channel.url),
	}));

	release.poster = data.thumb_url || data.thumb_image_url;

	release.photos = [
		data.poster_image_url,
		...Object.entries(data)
			.filter(([key]) => thumbKeyRegex.test(key))
			.map(([, url]) => url),
	].filter(Boolean); // photo thumbs include poster, don't filter here but in client

	const trailers = data.trailerVideos || data.trailer;

	if (trailers) {
		// skip safe-for-work cuts, recognised by `_sfw` in either the entry key or its URL
		release.trailer = Object.entries(trailers)
			.filter(([key, trailer]) => !key.toLowerCase().includes('_sfw') && !trailer.url?.toLowerCase().includes('_sfw'))
			.map(([, trailer]) => ({
				src: trailer.url,
				quality: qualityMap[trailer.resolution?.toLowerCase()] || null,
			}));
	}

	// tolerate records without a categories list
	release.tags = (data.categories || []).map((category) => category.name);
	release.photoCount = data.photosetPhotoCount || data.episode_photoset_photo_count;

	return release;
}
/**
 * Fetch one page of the affiliate updates feed for a channel and scrape every episode on it.
 *
 * @param {object} channel - Channel entity, passed through to the scene scraper.
 * @param {number} [page=1] - Page number sent as the `page` query parameter.
 * @param {object} context - Must carry `parameters` with `apiAddress`, `siteId` and `apiKey`.
 * @returns {Promise<object[]|number>} Scraped releases, or the response status when the request failed.
 */
async function fetchLatestApi(channel, page = 1, { parameters }) {
	// Earlier note: the JSON API lacked poster images and the CSV API lacked pagination.
	// Both were requested and received since, but that is yet to be tested.
	const endpoint = `${parameters.apiAddress}/affiliates?site_id=${parameters.siteId}&page=${page}`;
	const headers = {
		Authorization: `Bearer ${config.apiKeys[parameters.apiKey]}`,
	};

	const res = await http.get(endpoint, { headers });

	if (!res.ok) {
		return res.status;
	}

	return res.body.data.map((episode) => scrapeSceneApi(episode, channel));
}
/* not practical via API, updates endpoint contains all necessary data
async function fetchSceneApi(url, entity, baseRelease, { parameters }) {
// const episodeId = new URL(url).pathname.match(/\/episodes\/\w+\/(\d+)/)?.[1];
const episodeId = new URL(url).pathname.match(/\/episodes\/(\d+)/)?.[1];
if (!episodeId) {
return null;
}
// JSON API doesn't return poster images, CSV API doesn't have pagination. UPDATE: requested and received both, yet to test
const res = await http.get(`${parameters.apiAddress}/affiliates/episodes/${episodeId}`, {
headers: {
Authorization: `Bearer ${config.apiKeys[parameters.apiKey]}`,
},
});
console.log(res.body);
return;
if (res.ok) {
return scrapeSceneApi(res.body.data, entity);
}
return res.status;
}
*/
/**
 * Zip two parallel lists into a bio object.
 *
 * Each label is slugified with `_` as delimiter to form the property name; the value is taken
 * from the same position in `bioValues` (undefined when that list is shorter).
 *
 * @param {string[]} bioKeys - Labels as read from the page.
 * @param {string[]} bioValues - Values, index-aligned with `bioKeys`.
 * @returns {object} Object keyed by slugified label.
 */
function composeBio(bioKeys, bioValues) {
	const pairs = [];

	bioKeys.forEach((label, position) => {
		pairs.push([slugify(label, '_'), bioValues[position]]);
	});

	return Object.fromEntries(pairs);
}
/**
 * Read the stats section of a model page, trying each known page layout in turn.
 *
 * The first layout whose marker exists on the page wins; its labels and values are read
 * and combined through composeBio().
 *
 * @param {object} query - Page query helper exposing `exists`, `contents` and `texts`.
 * @returns {object|null} Bio object, or null when no known layout is present.
 */
function getBio(query) {
	const layouts = [
		{
			// Kelly Madison, Fidelity
			marker: '.profile-stats',
			labels: '.profile-stats li strong',
			values: '.profile-stats li',
			valueReader: 'texts',
		},
		{
			// 8K
			marker: '//h4[contains(text(), "Stats")]',
			labels: '(//h4[contains(text(), "Stats")])[1]//following-sibling::div//strong',
			values: '(//h4[contains(text(), "Stats")])[1]//following-sibling::div//p/text()',
			valueReader: 'contents',
		},
		{
			// 5K
			marker: '.bio-overlay-1',
			labels: '.bio-overlay-1 td:first-child',
			values: '.bio-overlay-1 td:last-child',
			valueReader: 'contents',
		},
	];

	const layout = layouts.find(({ marker }) => query.exists(marker));

	if (!layout) {
		return null;
	}

	const bioKeys = query.contents(layout.labels);
	const bioValues = query[layout.valueReader](layout.values);

	return composeBio(bioKeys, bioValues);
}
/**
 * Build an actor profile from a parsed model page.
 *
 * @param {object} context - Page context; only its `query` helper is used.
 * @returns {object} Profile with description, ethnicity, measurements, birthPlace, foot, height,
 *   age and dateOfBirth where the page provides them, plus avatar and photos.
 */
function scrapeProfile({ query }) {
	const profile = {};
	const bio = getBio(query);

	const questions = query.contents('.model-faq .content-body .accordion-header, .card .card-header button');
	const answers = query.contents('.model-faq .content-body .accordion-body, .card .collapse .card-body');

	// only pair questions with answers when the two lists line up
	if (questions.length > 0 && questions.length === answers.length) {
		profile.description = questions.map((question, index) => `**${question}**\n${answers[index]}`).join('\n');
	}

	if (bio) {
		if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
		if (bio.measurements) profile.measurements = bio.measurements;
		if (bio.birthplace) profile.birthPlace = bio.birthplace;
		if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);

		if (bio.height) {
			// match() returns null when the text holds no digits; treat that as "no height"
			const [feet, inches] = bio.height.match(/\d+/g) || [];

			if (feet) {
				profile.height = feetInchesToCm(feet, inches);
			}
		}

		if (bio.age) {
			const age = Number(bio.age);

			// don't store NaN when the age text is not a plain number
			if (Number.isFinite(age)) {
				profile.age = age;
			}
		}

		if (bio.birthday) {
			const [month, day] = bio.birthday.split('/').map((part) => Number(part));

			// With a known age the birth year is estimated as current year minus age, which can be
			// off by one depending on whether this year's birthday has passed. Without an age,
			// year 0 indicates the birth year is unknown.
			const year = profile.age ? new Date().getUTCFullYear() - profile.age : 0;

			// Set year, month and day in one call. Building the date via Date.UTC(0, ...) first
			// would land in 1900 (not a leap year) and roll a Feb 29 birthday over to Mar 1.
			const birthday = new Date(0);
			birthday.setUTCFullYear(year, month - 1, day);

			if (!Number.isNaN(birthday.getTime())) {
				profile.dateOfBirth = birthday;
			}
		}
	}

	profile.avatar = query.img('img[src*="model"][src*="headshot"]');
	profile.photos = query.imgs('img[src*="model"][src*="thumb_image"], img[src*="model"][src*="bg_image"]');

	return profile;
}
/**
 * Fetch and scrape the model page for an actor.
 *
 * @param {object} actor - Only `name` is read; it is slugified to form the page path.
 * @param {object} context - Must carry `entity` with `slug` and `url`.
 * @returns {Promise<object|number>} Scraped profile, or the response status when the request failed.
 */
async function fetchProfile({ name: actorName }, { entity }) {
	const actorSlug = slugify(actorName);

	// The '5kvids' entity is looked up on 5kporn.com; every other entity uses its own URL.
	// NOTE(review): the previous comment said 8K sites lack avatar and interview on the model page
	// and the 5K site should always be used, but only the '5kvids' slug is redirected — confirm intent.
	const origin = entity.slug === '5kvids' ? 'https://www.5kporn.com' : entity.url;
	const profileUrl = `${origin}/models/${actorSlug}`;

	const res = await unprint.get(profileUrl, {
		headers: {
			'X-Requested-With': 'XMLHttpRequest',
		},
	});

	return res.ok ? scrapeProfile(res.context) : res.status;
}
// Scraper interface exposed by this module: latest releases are fetched through the
// affiliate API (fetchLatestApi), profiles through the model pages (fetchProfile).
module.exports = {
fetchLatest: fetchLatestApi,
fetchProfile,
};