// traxxx/src/scrapers/mindgeek.js
//
// Repository listing metadata: 337 lines, 9.6 KiB, JavaScript

'use strict';
/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const { CookieJar } = Promise.promisifyAll(require('tough-cookie'));
const moment = require('moment');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const http = require('../utils/http');
const { inchesToCm, lbsToKg } = require('../utils/convert');
const { cookieToData } = require('../utils/cookies');
/**
 * Collect the image URLs of a release, with poster images taking precedence.
 *
 * @param {object} scene - Release data; `scene.images` must be present.
 * @returns {string[]} The `xl` URLs of `images.poster` when it is set. Otherwise the `xl` URLs of
 *   `images.card_main_rect` followed by those of `images.card_secondary_rect`, each with its
 *   first `.thumb` occurrence removed. An empty array when neither is available.
 */
function getThumbs(scene) {
	const {
		poster,
		card_main_rect: mainCards,
		card_secondary_rect: secondaryCards,
	} = scene.images;

	if (poster) {
		return poster.map(({ xl }) => xl.url);
	}

	if (!mainCards) {
		return [];
	}

	const cards = mainCards.concat(secondaryCards || []);

	return cards.map(({ xl }) => xl.url.replace('.thumb', ''));
}
/**
 * Extract the teaser and trailer sources of a release.
 *
 * The teaser is read from `data.videos.mediabook.files`, the trailer from the `videos.full.files`
 * of the first entry in `data.children` whose `type` is `'trailer'`. Any missing link in either
 * chain yields `undefined` for that video instead of throwing.
 *
 * @param {object} data - Release data.
 * @returns {{ teaser: (object[]|undefined), trailer: (object[]|undefined) }} Per video a list of
 *   `{ src, quality }` entries, where `quality` is the leading integer of the source's `format`.
 */
function getVideos(data) {
	const toSources = files => files && Object.values(files).map(source => ({
		src: source.urls.view,
		quality: parseInt(source.format, 10),
	}));

	const teaserFiles = data.videos?.mediabook?.files;
	const trailerFiles = data.children?.find(child => child.type === 'trailer')?.videos?.full?.files;

	return {
		teaser: toSources(teaserFiles),
		trailer: toSources(trailerFiles),
	};
}
/**
 * Convert one release object from the API into a release, or reject it when it does not match
 * the site's channel filters.
 *
 * A release without a `collections` list is treated as belonging to no collection, consistently
 * across all three filters.
 *
 * @param {object} data - Release data from the API.
 * @param {object} site - Site being scraped; `parameters.extract`, `parameters.siteId`,
 *   `parameters.scene`, `parameters.native`, `url` and `parent.url` are read.
 * @param {boolean} [filterChannel] - When truthy, only keep releases that have a collection whose
 *   `id` equals `site.parameters.siteId`.
 * @returns {object|null} The release, or `null` when it was filtered out.
 */
function scrapeLatestX(data, site, filterChannel) {
	const parameters = site.parameters || {};
	const collections = data.collections || [];

	if (parameters.extract === true && collections.length > 0) {
		// release should not belong to any channel
		return null;
	}

	if (typeof parameters.extract === 'string' && !collections.some(collection => collection.shortName === parameters.extract)) {
		// release should belong to specific channel
		return null;
	}

	if (filterChannel && !collections.some(collection => collection.id === parameters.siteId)) {
		// used to separate upcoming Brazzers scenes
		return null;
	}

	// an explicit scene path wins, then the site's own URL for native sites, then the parent's URL
	const basepath = parameters.scene
		|| (parameters.native && `${site.url}/scene`)
		|| `${site.parent.url}/scene`;

	const [poster, ...photos] = getThumbs(data);
	const { teaser, trailer } = getVideos(data);

	const release = {
		entryId: data.id,
		title: data.title,
		description: data.description,
		url: `${basepath}/${data.id}/`,
		date: new Date(data.dateReleased),
		actors: data.actors.map(({ name, gender }) => ({ name, gender })),
		tags: data.tags.map(tag => tag.name),
		duration: data.videos?.mediabook?.length,
		poster,
		photos,
	};

	if (teaser) release.teaser = teaser;
	if (trailer) release.trailer = trailer;

	return release;
}
/**
 * Scrape a list of API release objects, dropping those `scrapeLatestX` rejects.
 *
 * @param {object[]} items - Release objects from the API.
 * @param {object} site - Site being scraped, passed through to `scrapeLatestX`.
 * @param {boolean} [filterChannel] - Passed through to `scrapeLatestX`.
 * @returns {Promise<object[]>} The truthy results of `scrapeLatestX`, in input order.
 */
async function scrapeLatest(items, site, filterChannel) {
	const releases = [];

	items.forEach((data) => {
		const release = scrapeLatestX(data, site, filterChannel);

		if (release) {
			releases.push(release);
		}
	});

	return releases;
}
/**
 * Convert a release object from the API into a scene release.
 *
 * Also reports the duration, read from the same `videos.mediabook.length` field that
 * `scrapeLatestX` uses for overview releases.
 *
 * @param {object} data - Release data from the API.
 * @param {string|null} url - Known scene URL. When falsy, one is built from `networkName`
 *   (or `data.brand`) and the release ID.
 * @param {object|null} _site - Unused.
 * @param {string} [networkName] - Network slug used for the generated URL.
 * @returns {object} The release.
 */
function scrapeScene(data, url, _site, networkName) {
	const { id: entryId, title, description } = data;

	const [poster, ...photos] = getThumbs(data);
	const { teaser, trailer } = getVideos(data);

	const release = {
		entryId,
		title,
		description,
		date: new Date(data.dateReleased),
		actors: data.actors.map(({ name, gender }) => ({ name, gender })),
		tags: data.tags.map(tag => tag.name),
		duration: data.videos?.mediabook?.length,
		poster,
		photos,
	};

	if (teaser) release.teaser = teaser;
	if (trailer) release.trailer = trailer;

	// the first collection names the channel; fall back to the brand when there is none
	const siteName = data.collections?.[0]?.name || data.brand;

	release.channel = slugify(siteName, '');
	release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;

	return release;
}
/**
 * Resolve the scene overview URL for a site.
 *
 * @param {object} site - Site with `url`, and optionally `parameters` and `parent`.
 * @returns {string} `site.url` itself when it already has a `site` query parameter;
 *   `<site.url>/scenes` for sites with a truthy `native` or `extract` parameter;
 *   `<site.parent.url>/scenes?site=<siteId>` for sites with a `siteId` parameter.
 * @throws {Error} When none of the above applies.
 */
function getUrl(site) {
	const hasSiteQuery = new URL(site.url).searchParams.has('site');

	if (hasSiteQuery) {
		return site.url;
	}

	const { parameters } = site;

	if (parameters?.native || parameters?.extract) {
		return `${site.url}/scenes`;
	}

	if (parameters?.siteId) {
		return `${site.parent.url}/scenes?site=${parameters.siteId}`;
	}

	throw new Error(`Mind Geek site '${site.name}' (${site.url}) not supported`);
}
/**
 * Open an HTTP session and read the instance token from the cookies it receives.
 *
 * The session page is the parent's URL for sites that have a `siteId` parameter, unless the site
 * is `native`, or the site or its parent has the `childSession` parameter set. In all other cases
 * the site's own URL is requested.
 *
 * @param {object} site - Site with `url`, and optionally `parameters` and `parent`.
 * @returns {Promise<{ session: object, instanceToken: string }>} The session and the value of
 *   the `instance_token` cookie.
 * @throws {Error} When the session page does not respond with status 200.
 */
async function getSession(site) {
	const cookieJar = new CookieJar();
	const session = http.session({ cookieJar });

	let sessionUrl = site.url;

	if (site.parameters?.siteId) {
		const hasOwnSession = site.parameters?.native
			|| site.parameters?.childSession
			|| site.parent?.parameters?.childSession;

		if (!hasOwnSession) {
			sessionUrl = site.parent.url;
		}
	}

	const res = await http.get(sessionUrl, { session });

	if (res.statusCode !== 200) {
		throw new Error(`Failed to acquire MindGeek session (${res.statusCode})`);
	}

	const cookieString = await cookieJar.getCookieStringAsync(sessionUrl);
	const { instance_token: instanceToken } = cookieToData(cookieString);

	return { session, instanceToken };
}
/**
 * Build an actor profile from API actor data, the actor's HTML page and their releases.
 *
 * @param {object} data - Actor data from the API.
 * @param {string} html - Actor page HTML; only used to find the date of birth.
 * @param {object[]|null} [releases=[]] - API release objects of the actor. `null` is treated
 *   like an empty list, because a default parameter value only replaces `undefined`.
 * @param {string} [networkName] - Network slug, passed on to `scrapeScene`.
 * @returns {object} The profile, including the scraped `releases`.
 */
function scrapeProfile(data, html, releases = [], networkName) {
	const { query } = qu.extract(html);

	const profile = {
		description: data.bio,
		aliases: data.aliases,
		gender: data.gender === 'other' ? 'transsexual' : data.gender,
	};

	// measurements are only recorded for female profiles
	if (data.measurements && profile.gender === 'female') {
		const [bust, waist, hip] = data.measurements.split('-');

		if (bust) profile.bust = bust.toUpperCase();
		if (waist) profile.waist = waist;
		if (hip) profile.hip = hip;
	}

	if (data.birthPlace) profile.birthPlace = data.birthPlace;
	if (data.height) profile.height = inchesToCm(data.height);
	if (data.weight) profile.weight = lbsToKg(data.weight);

	const avatar = data.images?.card_main_rect?.[0];

	if (avatar) {
		// prefer the largest available size
		profile.avatar = avatar.xl?.url
			|| avatar.lg?.url
			|| avatar.md?.url
			|| avatar.sm?.url
			|| avatar.xs?.url;
	}

	const birthdate = query.all('li').find(el => /Date of Birth/.test(el.textContent));

	if (birthdate) profile.birthdate = query.date(birthdate, 'span', 'MMMM Do, YYYY');

	const boobTags = (data.tags || []).filter(tag => /boob type/i.test(tag.category));

	if (boobTags.some(tag => /natural tits/i.test(tag.name))) {
		profile.naturalBoobs = true;
	}

	// an 'enhanced' tag overrides a 'natural tits' tag
	if (boobTags.some(tag => /enhanced/i.test(tag.name))) {
		profile.naturalBoobs = false;
	}

	profile.releases = (releases || []).map(release => scrapeScene(release, null, null, networkName));

	return profile;
}
/**
 * Fetch a page of the latest scenes of a site from the releases API.
 *
 * Sites with a truthy `native` or `extract` parameter request all releases; other sites restrict
 * the request to the collection ID found in the `site` query parameter of their overview URL.
 * Only releases dated before tomorrow are requested, 10 per page, newest first.
 *
 * @param {object} site - Site to fetch releases for.
 * @param {number} [page=1] - 1-based page number.
 * @returns {Promise<object[]|null>} Scraped releases, or `null` when the API did not respond
 *   with status 200 and a result.
 */
async function fetchLatest(site, page = 1) {
	const url = getUrl(site);
	const siteId = new URL(url).searchParams.get('site');
	const { session, instanceToken } = await getSession(site);

	const limit = 10;
	const offset = limit * (page - 1);
	const beforeDate = moment().add('1', 'day').format('YYYY-MM-DD');

	const skipCollectionFilter = site.parameters?.native || site.parameters?.extract;
	const collectionFilter = skipCollectionFilter ? '' : `collectionId=${siteId}&`;
	const apiUrl = `https://site-api.project1service.com/v2/releases?${collectionFilter}dateReleased=<${beforeDate}&limit=${limit}&offset=${offset}&orderBy=-dateReleased&type=scene`;

	const headers = {
		Instance: instanceToken,
		Origin: site.url,
		Referer: url,
	};

	const res = await http.get(apiUrl, { session, headers });

	if (res.statusCode !== 200 || !res.body.result) {
		return null;
	}

	return scrapeLatest(res.body.result, site);
}
/**
 * Fetch the upcoming releases from the API and keep those belonging to the site.
 *
 * The result is scraped with the channel filter enabled (`filterChannel` set to `true`).
 *
 * @param {object} site - Site to fetch upcoming releases for.
 * @returns {Promise<object[]|null>} Scraped releases, or `null` when the API did not respond
 *   with status 200 and a result.
 */
async function fetchUpcoming(site) {
	const referer = getUrl(site);
	const { session, instanceToken } = await getSession(site);

	const headers = {
		Instance: instanceToken,
		Origin: site.url,
		Referer: referer,
	};

	const res = await http.get('https://site-api.project1service.com/v2/upcoming-releases', { session, headers });

	if (res.statusCode !== 200 || !res.body.result) {
		return null;
	}

	return scrapeLatest(res.body.result, site, true);
}
/**
 * Extract the numeric release ID from a scene URL.
 *
 * Only the path is searched, so digits in the hostname are not mistaken for the ID. A path
 * segment made up entirely of digits is preferred over the first run of digits.
 *
 * @param {string} url - Scene URL.
 * @returns {string|null} The release ID, or `null` when the URL contains none.
 */
function getEntryIdFromUrl(url) {
	let path = String(url);

	try {
		path = new URL(url).pathname;
	} catch (error) {
		// not an absolute URL; search the raw string instead
	}

	const segment = path.match(/\/(\d+)(?=\/|$)/);

	return segment?.[1] || path.match(/\d+/)?.[0] || null;
}

/**
 * Fetch a single scene from the releases API.
 *
 * @param {string} url - Scene URL containing the release ID in its path.
 * @param {object} site - Site the scene belongs to, used to acquire a session.
 * @param {object} [baseScene] - Release already scraped from an overview; returned as is when it
 *   has an `entryId`.
 * @returns {Promise<object|null>} The scene, or `null` when the URL has no release ID or the API
 *   did not respond with status 200 and a result.
 */
async function fetchScene(url, site, baseScene) {
	if (baseScene?.entryId) {
		// overview and deep data is the same, don't hit server unnecessarily
		return baseScene;
	}

	const entryId = getEntryIdFromUrl(url);

	if (!entryId) {
		return null;
	}

	const { session, instanceToken } = await getSession(site);

	const res = await http.get(`https://site-api.project1service.com/v2/releases/${entryId}`, {
		session,
		headers: {
			Instance: instanceToken,
		},
	});

	if (res.statusCode === 200 && res.body.result) {
		return scrapeScene(res.body.result, url, site);
	}

	return null;
}
/**
 * Look up an actor by name in the API and scrape their profile and releases.
 *
 * @param {{ name: string }} actor - Actor to look up; the name is matched case-insensitively
 *   against the search results.
 * @param {object|string} networkOrNetworkSlug - Network entity (its `slug` and optional
 *   `parameters.actorPath` are read) or a bare network slug.
 * @returns {Promise<object|null>} The profile, or `null` when the search failed, the actor was
 *   not found or the actor page could not be fetched. When only the releases request fails, the
 *   profile is returned with no releases.
 */
async function fetchProfile({ name: actorName }, networkOrNetworkSlug) {
	const networkSlug = networkOrNetworkSlug.slug || networkOrNetworkSlug;
	const networkUrl = `https://www.${networkSlug}.com`;

	// getSession requests site.url, which a bare slug string does not carry
	const sessionSite = typeof networkOrNetworkSlug === 'string'
		? { url: networkUrl }
		: networkOrNetworkSlug;

	const { session, instanceToken } = await getSession(sessionSite);

	// the name is a query value, so reserved characters like & and + must be escaped too
	const res = await http.get(`https://site-api.project1service.com/v1/actors/?search=${encodeURIComponent(actorName)}`, {
		session,
		headers: {
			Instance: instanceToken,
		},
	});

	if (res.statusCode !== 200) {
		return null;
	}

	const actorData = res.body.result?.find(actor => actor.name.toLowerCase() === actorName.toLowerCase());

	if (!actorData) {
		return null;
	}

	const actorUrl = `${networkUrl}/${networkOrNetworkSlug?.parameters?.actorPath || 'model'}/${actorData.id}/`;
	const actorReleasesUrl = `https://site-api.project1service.com/v2/releases?actorId=${actorData.id}&limit=100&offset=0&orderBy=-dateReleased&type=scene`;

	const [actorRes, actorReleasesRes] = await Promise.all([
		http.get(actorUrl),
		http.get(actorReleasesUrl, {
			session,
			headers: {
				Instance: instanceToken,
			},
		}),
	]);

	if (actorRes.statusCode !== 200) {
		return null;
	}

	// fall back to an empty list rather than null, which scrapeProfile's default would not replace
	const releases = (actorReleasesRes.statusCode === 200 && actorReleasesRes.body.result) || [];

	return scrapeProfile(actorData, actorRes.body.toString(), releases, networkSlug);
}
// Exported interface of this module: the fetch* entry points, plus scrapeLatestX, the
// per-release scraper that scrapeLatest applies to each API item.
module.exports = {
scrapeLatestX,
fetchLatest,
fetchUpcoming,
fetchScene,
fetchProfile,
};