|
|
|
|
@ -2,86 +2,81 @@
|
|
|
|
|
|
|
|
|
|
const config = require('config');
|
|
|
|
|
const unprint = require('unprint');
|
|
|
|
|
const { parse } = require('csv-parse/sync');
|
|
|
|
|
// const { parse } = require('csv-parse/sync');
|
|
|
|
|
|
|
|
|
|
const slugify = require('../utils/slugify');
|
|
|
|
|
const qu = require('../utils/qu');
|
|
|
|
|
const http = require('../utils/http');
|
|
|
|
|
const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert');
|
|
|
|
|
|
|
|
|
|
const siteMapByKey = {
|
|
|
|
|
PF: 'pornfidelity',
|
|
|
|
|
TF: 'teenfidelity',
|
|
|
|
|
KM: 'kellymadison',
|
|
|
|
|
'5KP': '5kporn',
|
|
|
|
|
'5KT': '5kteens',
|
|
|
|
|
const thumbKeyRegex = /(thumb\d+_url)|(episode_thumb_image_\d+_url)/;
|
|
|
|
|
|
|
|
|
|
// Translates a trailer's lowercased resolution label into its vertical pixel height.
// Labels missing from this table resolve to undefined, so lookups should supply their own fallback.
const qualityMap = {
	'480p': 480,
	mobile: 720, // as of recent, might've been lower in the past
	'720p': 720,
	'1080p': 1080,
	'2k': 1440,
	'4k': 2160,
	'5k': 2880, // 5K is 5120x2880; the previous value 2280 was a typo
	'8k': 4320,
};
|
|
|
|
|
|
|
|
|
|
// Reverse lookup of siteMapByKey: site slug -> site key.
const siteMapBySlug = Object.fromEntries(
	Object.entries(siteMapByKey).map(([siteKey, siteSlug]) => [siteSlug, siteKey]),
);
|
|
|
|
|
function scrapeSceneApi(data, channel) {
|
|
|
|
|
const release = {};
|
|
|
|
|
|
|
|
|
|
function scrapeLatest(scenes, site) {
|
|
|
|
|
return scenes.map(({ query }) => {
|
|
|
|
|
const release = {};
|
|
|
|
|
release.entryId = data.id;
|
|
|
|
|
|
|
|
|
|
release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
|
|
|
|
|
if (data.url) {
|
|
|
|
|
// provided URL works but always points to 8KMilfs instead of dedicated site
|
|
|
|
|
const { pathname } = new URL(data.url);
|
|
|
|
|
|
|
|
|
|
const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
|
|
|
|
|
[release.entryId] = pathname.match(/\d+$/);
|
|
|
|
|
|
|
|
|
|
release.title = query.cnt('h5 a, .ep-title a, .title a');
|
|
|
|
|
|
|
|
|
|
release.date = query.date('.card-meta .text-left, .row .col-4:first-child, .card-footer-item:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/);
|
|
|
|
|
release.actors = query.cnts('.models a, .ep-models a, a[href*="models/"]');
|
|
|
|
|
|
|
|
|
|
// older scenes do not have a working scene page on their native site, but they (often, not always) do on Porn Fidelity
|
|
|
|
|
// scenes older than year do not show a date; this is not when the URLs stop working, but it's a rough guideline
|
|
|
|
|
release.url = site.parameters.archive && !release.date
|
|
|
|
|
? `${site.parameters.archive}${pathname}`
|
|
|
|
|
: `${site.url}${pathname}`;
|
|
|
|
|
|
|
|
|
|
release.duration = query.dur('.content a');
|
|
|
|
|
|
|
|
|
|
const duration = query.cnt('.content a, .ep-runtime strong, .subtitle:last-child a')?.match(/(\d+) min/)?.[1];
|
|
|
|
|
if (duration) release.duration = Number(duration) * 60;
|
|
|
|
|
|
|
|
|
|
if (query.exists('.episodes-preview')) {
|
|
|
|
|
[release.poster, ...release.photos] = query.imgs('.episodes-preview img');
|
|
|
|
|
} else {
|
|
|
|
|
release.poster = query.img('.card-img-top, .image img');
|
|
|
|
|
release.teaser = {
|
|
|
|
|
src: query.video('video'),
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* using site ID, filter no longer needed
|
|
|
|
|
const siteId = release.shootId.match(/\d?\w{2}/)[0];
|
|
|
|
|
const siteSlug = siteMapByKey[siteId];
|
|
|
|
|
|
|
|
|
|
if (site.slug !== siteSlug) {
|
|
|
|
|
// using generic network overview, scene is not from the site we want
|
|
|
|
|
return { ...acc, unextracted: [...acc.unextracted, release] };
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return { ...acc, scenes: [...acc.scenes, release] };
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
return release;
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function fetchLatest(channel, page = 1) {
|
|
|
|
|
const url = `${channel.url}/episodes/search?page=${page}&site=${channel.parameters.siteId || ''}`; // TLS issues with teenfidelity.com, same overview on all sites
|
|
|
|
|
const res = await http.get(url, {
|
|
|
|
|
headers: {
|
|
|
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
|
|
|
},
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
if (res.ok && res.body.status === 'success') {
|
|
|
|
|
return scrapeLatest(qu.extractAll(res.body.html, '.episode, .ep'), channel);
|
|
|
|
|
release.url = unprint.prefixUrl(pathname, channel.url);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res.status;
|
|
|
|
|
if (channel.parameters.short && data.sequence_number) {
|
|
|
|
|
release.shootId = `${channel.parameters.short} #${data.sequence_number}`;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
release.title = data.title;
|
|
|
|
|
release.description = data.short_description;
|
|
|
|
|
|
|
|
|
|
release.date = new Date(data.publish_on);
|
|
|
|
|
|
|
|
|
|
if (data.fullEpisodeLength) {
|
|
|
|
|
release.duration = data.fullEpisodeLength;
|
|
|
|
|
} else if (data.full_episode_minutes) {
|
|
|
|
|
// full_episode_seconds is always available so far, but no need to count on it
|
|
|
|
|
release.duration = (data.full_episode_minutes + (data.full_episode_seconds || 0)) * 60;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
release.actors = data.models.map((model) => ({
|
|
|
|
|
name: model.name,
|
|
|
|
|
gender: model.sex?.toLowerCase(),
|
|
|
|
|
url: unprint.prefixUrl(`/models/${model.slug}`, channel.url),
|
|
|
|
|
}));
|
|
|
|
|
|
|
|
|
|
release.poster = data.thumb_url || data.thumb_image_url;
|
|
|
|
|
|
|
|
|
|
release.photos = [
|
|
|
|
|
data.poster_image_url,
|
|
|
|
|
...Object.entries(data).filter(([key]) => thumbKeyRegex.test(key)).map(([_key, url]) => url),
|
|
|
|
|
].filter(Boolean); // photo thumbs include poster, don't filter here but in client
|
|
|
|
|
|
|
|
|
|
const trailers = data.trailerVideos || data.trailer;
|
|
|
|
|
|
|
|
|
|
if (trailers) {
|
|
|
|
|
release.trailer = Object.entries(trailers)
|
|
|
|
|
.filter(([key, trailer]) => !key.toLowerCase().includes('_sfw') && !trailer.url?.toLowerCase().includes('_sfw'))
|
|
|
|
|
.map(([_key, trailer]) => ({
|
|
|
|
|
src: trailer.url,
|
|
|
|
|
quality: qualityMap[trailer.resolution?.toLowerCase()] || null,
|
|
|
|
|
}));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
release.tags = data.categories.map((category) => category.name);
|
|
|
|
|
release.photoCount = data.photosetPhotoCount || data.episode_photoset_photo_count;
|
|
|
|
|
|
|
|
|
|
return release;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function fetchLatestApi(channel, page = 1, { parameters }) {
|
|
|
|
|
@ -92,126 +87,112 @@ async function fetchLatestApi(channel, page = 1, { parameters }) {
|
|
|
|
|
},
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
console.log(res.body.data[1]);
|
|
|
|
|
|
|
|
|
|
if (res.ok) {
|
|
|
|
|
const data = parse(res.body, {
|
|
|
|
|
columns: true,
|
|
|
|
|
skip_empty_lines: true,
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
console.log(data);
|
|
|
|
|
|
|
|
|
|
return null;
|
|
|
|
|
return res.body.data.map((data) => scrapeSceneApi(data, channel));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res.status;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function scrapeScene({ query, html }, url, baseRelease, channel, session) {
|
|
|
|
|
const { pathname } = new URL(url);
|
|
|
|
|
const release = {};
|
|
|
|
|
/* not practical via API, updates endpoint contains all necessary data
|
|
|
|
|
async function fetchSceneApi(url, entity, baseRelease, { parameters }) {
|
|
|
|
|
// const episodeId = new URL(url).pathname.match(/\/episodes\/\w+\/(\d+)/)?.[1];
|
|
|
|
|
const episodeId = new URL(url).pathname.match(/\/episodes\/(\d+)/)?.[1];
|
|
|
|
|
|
|
|
|
|
[release.entryId] = pathname.match(/\d+$/);
|
|
|
|
|
|
|
|
|
|
const titleString = query.cnt('.card-header.row h4, .trailer-starring span, .level-left .level-item');
|
|
|
|
|
const episode = titleString?.match(/#\d+$/)?.[0];
|
|
|
|
|
|
|
|
|
|
release.title = query.cnt('.trailer-title') || titleString?.match(/(?:Trailer: )?(.+) -/)?.[1];
|
|
|
|
|
release.channel = slugify(titleString?.match(/([\w\s]+) #\d+$/)?.[1], '');
|
|
|
|
|
|
|
|
|
|
const siteKey = siteMapBySlug[release.channel];
|
|
|
|
|
|
|
|
|
|
release.shootId = `${siteKey} ${episode}`;
|
|
|
|
|
release.description = query.text('p.card-text, h5.heavy + p, .card-content .is-three-fifths');
|
|
|
|
|
|
|
|
|
|
// order not reliable, get keys
|
|
|
|
|
const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5, .columns li').reduce((acc, rowEl) => ({
|
|
|
|
|
...acc,
|
|
|
|
|
[slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl,
|
|
|
|
|
}), {});
|
|
|
|
|
|
|
|
|
|
release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/);
|
|
|
|
|
release.duration = query.dur(detailElsByKey.episode);
|
|
|
|
|
release.actors = query.cnts(detailElsByKey.starring, 'a');
|
|
|
|
|
|
|
|
|
|
const posterPrefix = html.indexOf('poster:');
|
|
|
|
|
const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4);
|
|
|
|
|
|
|
|
|
|
if (poster) {
|
|
|
|
|
if (baseRelease?.poster) {
|
|
|
|
|
release.photos = [poster, ...(baseRelease.photos || [])];
|
|
|
|
|
} else {
|
|
|
|
|
release.poster = poster;
|
|
|
|
|
}
|
|
|
|
|
if (!episodeId) {
|
|
|
|
|
return null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// const token = query.meta('name=_token');
|
|
|
|
|
// const trailerInfoUrl = `${channel.url}/episodes/trailer/sources/${release.entryId}?type=trailer`;
|
|
|
|
|
const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1];
|
|
|
|
|
|
|
|
|
|
if (trailerInfoUrl) {
|
|
|
|
|
const trailerInfoRes = await http.post(trailerInfoUrl, null, { session });
|
|
|
|
|
|
|
|
|
|
if (trailerInfoRes.ok && trailerInfoRes.body.sources?.length > 0) {
|
|
|
|
|
release.trailer = trailerInfoRes.body.sources.map((trailer) => ({
|
|
|
|
|
src: trailer.src,
|
|
|
|
|
type: trailer.type,
|
|
|
|
|
/* unreliable, sometimes actual video is 720p
|
|
|
|
|
quality: trailer.res
|
|
|
|
|
.replace(4000, 2160)
|
|
|
|
|
.replace(5000, 2880),
|
|
|
|
|
*/
|
|
|
|
|
}));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return release;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
async function fetchScene(url, channel, baseRelease) {
|
|
|
|
|
const session = http.session();
|
|
|
|
|
|
|
|
|
|
const res = await qu.get(url, null, {
|
|
|
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
|
|
|
}, {
|
|
|
|
|
session,
|
|
|
|
|
followRedirects: false, // redirects to sign-up page if scene not found
|
|
|
|
|
// JSON API doesn't return poster images, CSV API doesn't have pagination. UPDATE: requested and received both, yet to test
|
|
|
|
|
const res = await http.get(`${parameters.apiAddress}/affiliates/episodes/${episodeId}`, {
|
|
|
|
|
headers: {
|
|
|
|
|
Authorization: `Bearer ${config.apiKeys[parameters.apiKey]}`,
|
|
|
|
|
},
|
|
|
|
|
});
|
|
|
|
|
|
|
|
|
|
return res.ok
|
|
|
|
|
? scrapeScene(res.item, url, baseRelease, channel, session)
|
|
|
|
|
: res.status;
|
|
|
|
|
console.log(res.body);
|
|
|
|
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (res.ok) {
|
|
|
|
|
return scrapeSceneApi(res.body.data, entity);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return res.status;
|
|
|
|
|
}
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
/**
 * Builds a bio object by pairing each label with the value at the same index.
 *
 * @param {Array} bioKeys - labels; each is passed through slugify with '_' as delimiter to form the property name
 * @param {Array} bioValues - values, matched to bioKeys by position
 * @returns {Object} bio keyed by slugified label; when two labels produce the same slug the later one wins
 */
function composeBio(bioKeys, bioValues) {
	const pairs = [];

	bioKeys.forEach((label, position) => {
		pairs.push([slugify(label, '_'), bioValues[position]]);
	});

	return Object.fromEntries(pairs);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Reads the raw bio of a model page, trying each known page layout in turn.
 *
 * @param {Object} query - page query helper; this function uses its exists, contents and texts methods
 * @returns {Object|null} result of composeBio for the first layout found on the page, or null when none is present
 */
function getBio(query) {
	// per layout: selector proving the layout is present, label selector, value selector,
	// and the name of the query method that reads the values
	const layouts = [
		{
			// Kelly Madison, Fidelity
			marker: '.profile-stats',
			labels: '.profile-stats li strong',
			values: '.profile-stats li',
			valueMethod: 'texts',
		},
		{
			// 8K
			marker: '//h4[contains(text(), "Stats")]',
			labels: '(//h4[contains(text(), "Stats")])[1]//following-sibling::div//strong',
			values: '(//h4[contains(text(), "Stats")])[1]//following-sibling::div//p/text()',
			valueMethod: 'contents',
		},
		{
			// 5K
			marker: '.bio-overlay-1',
			labels: '.bio-overlay-1 td:first-child',
			values: '.bio-overlay-1 td:last-child',
			valueMethod: 'contents',
		},
	];

	// probing stops at the first layout whose marker exists
	const layout = layouts.find(({ marker }) => query.exists(marker));

	if (!layout) {
		return null;
	}

	const labels = query.contents(layout.labels);
	const values = query[layout.valueMethod](layout.values);

	return composeBio(labels, values);
}
|
|
|
|
|
|
|
|
|
|
function scrapeProfile({ query }) {
|
|
|
|
|
const profile = {};
|
|
|
|
|
const bio = getBio(query);
|
|
|
|
|
|
|
|
|
|
const bioKeys = query.contents('table.table td:nth-child(1), table.table th');
|
|
|
|
|
const bioValues = query.contents('table.table td:nth-child(2)');
|
|
|
|
|
const questions = query.contents('.model-faq .content-body .accordion-header, .card .card-header button');
|
|
|
|
|
const answers = query.contents('.model-faq .content-body .accordion-body, .card .collapse .card-body');
|
|
|
|
|
|
|
|
|
|
const bio = bioKeys.reduce((acc, key, index) => ({
|
|
|
|
|
...acc,
|
|
|
|
|
[slugify(key, '_')]: bioValues[index],
|
|
|
|
|
}), {});
|
|
|
|
|
|
|
|
|
|
if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
|
|
|
|
|
if (bio.measurements) profile.measurements = bio.measurements;
|
|
|
|
|
if (bio.birthplace) profile.birthPlace = bio.birthplace;
|
|
|
|
|
if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);
|
|
|
|
|
|
|
|
|
|
if (bio.height) {
|
|
|
|
|
const [feet, inches] = bio.height.match(/\d+/g);
|
|
|
|
|
profile.height = feetInchesToCm(feet, inches);
|
|
|
|
|
if (questions.length > 0 && questions.length === answers.length) {
|
|
|
|
|
profile.description = questions.map((question, index) => `**${question}**\n${answers[index]}`).join('\n');
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (bio.birthday) {
|
|
|
|
|
const [month, day] = bio.birthday.split('/');
|
|
|
|
|
const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
|
|
|
|
|
if (bio) {
|
|
|
|
|
if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
|
|
|
|
|
if (bio.measurements) profile.measurements = bio.measurements;
|
|
|
|
|
if (bio.birthplace) profile.birthPlace = bio.birthplace;
|
|
|
|
|
if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);
|
|
|
|
|
|
|
|
|
|
birthday.setUTCFullYear(0); // indicate birth year is unknown
|
|
|
|
|
if (bio.height) {
|
|
|
|
|
const [feet, inches] = bio.height.match(/\d+/g);
|
|
|
|
|
profile.height = feetInchesToCm(feet, inches);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
profile.dateOfBirth = new Date(birthday);
|
|
|
|
|
if (bio.age) profile.age = Number(bio.age);
|
|
|
|
|
|
|
|
|
|
if (bio.birthday) {
|
|
|
|
|
const [month, day] = bio.birthday.split('/');
|
|
|
|
|
const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
|
|
|
|
|
|
|
|
|
|
if (profile.age) {
|
|
|
|
|
birthday.setUTCFullYear(new Date().getFullYear() - profile.age); // indicate birth year is unknown
|
|
|
|
|
} else {
|
|
|
|
|
birthday.setUTCFullYear(0); // indicate birth year is unknown
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
profile.dateOfBirth = new Date(birthday);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
profile.avatar = query.img('img[src*="model"][src*="headshot"]');
|
|
|
|
|
@ -223,7 +204,8 @@ function scrapeProfile({ query }) {
|
|
|
|
|
async function fetchProfile({ name: actorName }, { entity }) {
|
|
|
|
|
const actorSlug = slugify(actorName);
|
|
|
|
|
|
|
|
|
|
const res = await unprint.get(`${entity.url}/models/${actorSlug}`, {
|
|
|
|
|
// 8K sites don't have avatar or interview on model page, always use 5K site
|
|
|
|
|
const res = await unprint.get(`${entity.slug === '5kvids' ? 'https://www.5kporn.com' : entity.url}/models/${actorSlug}`, {
|
|
|
|
|
headers: {
|
|
|
|
|
'X-Requested-With': 'XMLHttpRequest',
|
|
|
|
|
},
|
|
|
|
|
@ -237,11 +219,6 @@ async function fetchProfile({ name: actorName }, { entity }) {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
module.exports = {
|
|
|
|
|
fetchLatest,
|
|
|
|
|
fetchLatest: fetchLatestApi,
|
|
|
|
|
fetchProfile,
|
|
|
|
|
fetchScene,
|
|
|
|
|
api: {
|
|
|
|
|
fetchLatest: fetchLatestApi,
|
|
|
|
|
// fetchScene, fetchSceneApi,
|
|
|
|
|
},
|
|
|
|
|
};
|
|
|
|
|
|