traxxx/src/scrapers/hush.js

411 lines
13 KiB
JavaScript

'use strict';
const util = require('util');
const knex = require('../knex');
const { get, geta, ed, fd, ctxa } = require('../utils/q');
const slugify = require('../utils/slugify');
const { feetInchesToCm } = require('../utils/convert');
/**
 * Build a case-insensitive expression that matches any channel of the site's network.
 *
 * Only the hushpass and interracialpass networks are supported; any other
 * network resolves to null without touching the database. Each channel row
 * contributes its `parameters.match` pattern when set, and its name otherwise.
 *
 * @param {object} site - site entity; `site.network.slug` and `site.network.id` are read
 * @returns {Promise<RegExp|null>} alternation of all channel patterns, or null
 */
async function getChannelRegExp(site) {
  const supportedNetworks = ['hushpass', 'interracialpass'];

  if (!supportedNetworks.includes(site.network.slug)) {
    return null;
  }

  const channels = await knex('sites').where('network_id', site.network.id);
  const patterns = channels.map((channel) => channel.parameters?.match || channel.name);

  return new RegExp(patterns.join('|'), 'i');
}
/**
 * Derive a synthetic entry ID of the form `<slugified date>-<slugified title>`.
 *
 * @param {object} release - release being scraped; `date` and `title` are read
 * @returns {string|null} the derived ID, or null when either the date or the title is missing
 */
function deriveEntryId(release) {
  const { date, title } = release;

  if (!date || !title) {
    return null;
  }

  const datePart = slugify(fd(date, 'YYYY-MM-DD'));
  const titlePart = slugify(title);

  return `${datePart}-${titlePart}`;
}
/**
 * Turn a scene page poster path into poster sources plus fallback photos.
 *
 * @param {string|null|undefined} posterPath - poster path or absolute URL found on the scene page
 * @param {object} site - site entity; `parameters.media` is preferred over `url` as the base for relative paths
 * @param {object} [baseRelease] - release scraped earlier (e.g. from the listing); its poster is reused
 * @returns {Array} tuple `[poster, photos]`:
 *   - usable path: `[[1x, 2x, 3x sources], [baseRelease.poster] or []]`
 *   - missing path, or a path containing `400.jpg`: `[baseRelease.poster or null, []]`
 */
function extractPoster(posterPath, site, baseRelease) {
  // paths containing 400.jpg are rejected in favor of the base release poster;
  // the dot is escaped so only a literal "400.jpg" is rejected
  if (!posterPath || /400\.jpg/.test(posterPath)) {
    return [baseRelease?.poster || null, []];
  }

  // absolute URLs must not be prefixed with the site base, same as in scrapeAllT1
  const poster = /^http/.test(posterPath)
    ? posterPath
    : `${site.parameters?.media || site.url}${posterPath}`;

  const posterSources = [
    poster,
    // upscaled
    poster.replace('-1x', '-2x'),
    poster.replace('-1x', '-3x'),
  ];

  // the poster from the base release is kept as an extra photo
  const photos = baseRelease?.poster ? [baseRelease.poster] : [];

  return [posterSources, photos];
}
/**
 * Collect the available resolutions of an image, highest first, as full URLs.
 *
 * The `src0_3x`, `src0_2x` and `src0_1x` attributes are queried in that order;
 * attributes that yield nothing are skipped.
 *
 * @param {Function} q - query function, called as `q(selector, attribute)` or `q(el, selector, attribute)`
 * @param {string} selector - selector of the image element
 * @param {object} site - site entity; `parameters.media` is preferred over `url` as the URL prefix
 * @param {*} [el] - optional context element passed to `q` as its first argument
 * @returns {string[]} prefixed image URLs in descending resolution
 */
function getImageWithFallbacks(q, selector, site, el) {
  const attributes = ['src0_3x', 'src0_2x', 'src0_1x'];
  const urls = [];

  for (const attribute of attributes) {
    const src = el ? q(el, selector, attribute) : q(selector, attribute);

    if (src) {
      urls.push(`${site.parameters?.media || site.url}${src}`);
    }
  }

  return urls;
}
/**
 * Scrape the scene tiles of a default-layout listing page.
 *
 * @param {Array<{qu: object}>} scenes - query contexts, one per scene tile
 * @param {object} site - site entity, used to prefix poster URLs
 * @returns {object[]} one release per tile with title, url, date, duration, poster and entryId
 */
function scrapeAll(scenes, site) {
  return scenes.map((scene) => {
    const { qu } = scene;

    // prefer the title attribute, fall back to the second query form
    const title = qu.q('h3 a', 'title') || qu.q('h3 a', true);

    const release = {
      title,
      url: qu.url('h3 a'),
      date: qu.date('.modeldata p', 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/),
      duration: qu.dur('.modeldata p'),
    };

    if (/bts|behind the scenes/i.test(title)) {
      release.tags = ['behind the scenes'];
    }

    release.poster = getImageWithFallbacks(qu.q, '.modelimg img', site);

    // the entry ID is derived from date and title instead of the image's set-target ID
    release.entryId = deriveEntryId(release);

    return release;
  });
}
/**
 * Scrape the scene tiles of a T1-layout listing page.
 *
 * When `site.parameters.accFilter` is set, releases whose entry ID is already
 * present in `accSiteReleases` are dropped from the result.
 *
 * @param {Array<{qu: object}>} scenes - query contexts, one per scene tile
 * @param {object} site - site entity; `parameters.media`, `url` and `parameters.accFilter` are read
 * @param {object[]} [accSiteReleases] - releases accumulated so far, each with an `entryId`
 * @returns {object[]} scraped releases, minus the ones filtered out
 */
function scrapeAllT1(scenes, site, accSiteReleases) {
  // build the lookup once instead of re-mapping the accumulated releases for every scene
  const knownEntryIds = site.parameters?.accFilter && accSiteReleases
    ? new Set(accSiteReleases.map((accRelease) => accRelease.entryId))
    : null;

  return scenes.map(({ qu }) => {
    const release = {};

    release.title = qu.q('h4 a', 'title') || qu.q('h4 a', true);
    release.url = qu.url('h4 a');

    release.date = qu.date('.more-info-div', 'MMM D, YYYY');
    release.duration = qu.dur('.more-info-div');

    if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes'];

    const posterPath = qu.q('.img-div img', 'src0_1x') || qu.img('img.video_placeholder');

    if (posterPath) {
      // the path may already be an absolute URL
      const poster = /^http/.test(posterPath) ? posterPath : `${site.parameters?.media || site.url}${posterPath}`;

      // highest resolution first
      release.poster = [
        poster.replace('-1x', '-3x'),
        poster.replace('-1x', '-2x'),
        poster,
      ];
    }

    // the entry ID is derived from date and title instead of the image's set-target ID
    release.entryId = deriveEntryId(release);

    if (knownEntryIds?.has(release.entryId)) {
      // filter out releases that were already scraped from a categorized site
      return null;
    }

    return release;
  }).filter(Boolean);
}
/**
 * Scrape the scene tiles of a tour-layout listing page.
 *
 * @param {Array<{qu: object}>} scenes - query contexts, one per scene tile
 * @returns {object[]} one release per tile with title, url, date, actors, poster and entryId
 */
function scrapeAllTour(scenes) {
  return scenes.map(({ qu }) => {
    const release = {
      title: qu.q('h4 a', true),
      url: qu.url('a'),
      date: qu.date('.tour_update_models + span', 'YYYY-MM-DD'),
      actors: qu.all('.tour_update_models a', true),
      poster: qu.img('a img'),
    };

    // the ID is derived last because it needs the date and title scraped above
    release.entryId = deriveEntryId(release);

    return release;
  });
}
/**
 * Scrape a default-layout scene page.
 *
 * @param {{html: string, qu: object}} item - raw page HTML and its query context
 * @param {object} site - site entity; `parameters.media` is preferred over `url` as the media base
 * @param {string} url - URL of the scene page, stored on the release
 * @param {object} [baseRelease] - release scraped earlier, used as poster fallback
 * @returns {object} release with title, description, date, duration, actors, poster, photos,
 *   entryId and, when found, trailer and stars
 */
function scrapeScene({ html, qu }, site, url, baseRelease) {
  const release = { url };

  release.title = qu.q('.centerwrap h2', true);
  release.description = qu.q('.videocontent p', true);

  release.date = qu.date('.videodetails .date', 'MM/DD/YYYY');
  release.duration = qu.dur('.videodetails .date');

  release.actors = qu.all('.modelname a', true);

  const posterPath = html.match(/poster="([\w-/.]+)"/)?.[1];
  [release.poster, release.photos] = extractPoster(posterPath, site, baseRelease);

  // take the matched string explicitly instead of relying on array-to-string coercion
  const trailerPath = html.match(/\/trailers\/.*\.mp4/)?.[0];
  if (trailerPath) release.trailer = { src: `${site.parameters?.media || site.url}${trailerPath}` };

  // the rating element may be absent, presumably yielding null from the query; don't call match on it
  const stars = qu.q('.modelrates + p', true)?.match(/\d.\d/)?.[0];
  if (stars) release.stars = Number(stars);

  // the entry ID is derived from date and title instead of the set-target ID in the HTML
  release.entryId = deriveEntryId(release);

  return release;
}
/**
 * Scrape a T1-layout scene page.
 *
 * @param {{html: string, qu: object}} item - raw page HTML and its query context
 * @param {object} site - site entity; `parameters.media` is preferred over `url` as the media base
 * @param {string} url - URL of the scene page, stored on the release and used as trailer referer
 * @param {object} [baseRelease] - release scraped earlier, used as poster fallback
 * @param {RegExp|null} [channelRegExp] - expression matching channel names; the first tag it
 *   matches is used to force the release's channel
 * @returns {object} release with title, description, date, duration, actors, tags, poster, photos,
 *   entryId and, when found, trailer, stars and channel
 */
function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) {
  const release = { url };

  release.title = qu.q('.trailer-section-head .section-title', true);
  release.description = qu.text('.row .update-info-block');

  release.date = qu.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/);
  release.duration = qu.dur('.update-info-row:nth-child(2)');

  release.actors = qu.all('.models-list-thumbs a').map((el) => ({
    name: qu.q(el, 'span', true),
    avatar: getImageWithFallbacks(qu.q, 'img', site, el),
  }));

  release.tags = qu.all('.tags a', true);

  const posterPath = qu.q('.player-thumb img', 'src0_1x');
  [release.poster, release.photos] = extractPoster(posterPath, site, baseRelease);

  const trailer = html.match(/<video.*src="(.*\.mp4)/)?.[1];

  if (trailer) {
    // relative trailer paths are prefixed with the media base, absolute URLs are used as-is
    const src = /^http/.test(trailer) ? trailer : `${site.parameters?.media || site.url}${trailer}`;

    release.trailer = { src, referer: url };
  }

  // the rating element may be absent, presumably yielding null from the query; don't call match on it
  const stars = qu.q('.update-rating', true)?.match(/\d.\d/)?.[0];
  if (stars) release.stars = Number(stars);

  if (channelRegExp) {
    const channel = release.tags.find((tag) => channelRegExp.test(tag));

    if (channel) {
      release.channel = {
        force: true,
        slug: slugify(channel, ''),
      };
    }
  }

  // the entry ID is derived from date and title instead of the image's set-target ID
  release.entryId = deriveEntryId(release);

  return release;
}
/**
 * Scrape a tour-layout scene page or update block.
 *
 * @param {{html: string, qu: object}} item - raw HTML and its query context
 * @param {object} site - site entity; `parameters.media` is preferred over `url` as the media base
 * @param {string} [url] - URL of the scene, stored on the release when given
 * @returns {object} release with title, description, actors and tags, plus url, date, entryId,
 *   poster, photos and trailer when available
 */
function scrapeSceneTour({ html, qu }, site, url) {
  const release = {};

  if (url) release.url = url;

  release.title = qu.q('.update_title, .video-title', true);
  release.description = qu.q('.latest_update_description, .video-summary', true);

  const date = qu.date('.availdate, .update_date', 'YYYY-MM-DD');
  if (date) release.date = date;

  release.actors = qu.all('.update_block_info .tour_update_models a, .video-model .tour_update_models a', true);
  release.tags = qu.all('.update_tags a, .tour_update_tags a', true);

  // the second image is the preferred poster, the first one is the fallback
  const [photo, poster, ...photos] = qu.imgs('.update_image img:not(.play_icon_overlay)');

  if (poster || photo) release.poster = poster || photo;

  // Photos only exist when a second image took the poster slot; otherwise the first image
  // is already used as the fallback poster and nothing is left over. The rest array is
  // always truthy, so it cannot be used as the condition here.
  if (poster) release.photos = [photo, ...photos];

  // an entry ID can only be derived when the date is known
  if (release.date) release.entryId = deriveEntryId(release);

  const trailerCode = qu.q('.update_image a', 'onclick');
  const trailerPath = trailerCode?.match(/tload\('(.*)'\)/)?.[1] || html.match(/\/trailer\/.*\.mp4/)?.[0];

  if (trailerPath) {
    // relative trailer paths are prefixed with the media base, absolute URLs are used as-is
    const src = /^http/.test(trailerPath) ? trailerPath : `${site.parameters?.media || site.url}${trailerPath}`;

    release.trailer = { src };
  }

  return release;
}
/**
 * Scrape a default-layout model profile page.
 *
 * @param {{el: object, qu: object}} item - page element and its query context
 * @param {object} site - site entity, used for avatar URLs and release scraping
 * @returns {object} profile with avatar and releases, plus bust, waist, hip, age and height when listed
 */
function scrapeProfile({ el, qu }, site) {
  const profile = {};

  // each stats line is expected to read "Key: value"
  const bio = qu.texts('.stats p').reduce((acc, info) => {
    const [key, value] = info.split(':');

    // skip lines without a value instead of throwing on trim, same as scrapeProfileT1
    if (!value) return acc;

    return {
      ...acc,
      [slugify(key, '_')]: value.trim(),
    };
  }, {});

  if (bio.measurements) {
    const [bust, waist, hip] = bio.measurements.split('-');

    // bust is kept as a string, waist and hip are converted to numbers
    if (bust) profile.bust = bust;
    if (waist) profile.waist = Number(waist);
    if (hip) profile.hip = Number(hip);
  }

  if (bio.age) profile.age = Number(bio.age);
  if (bio.height) profile.height = feetInchesToCm(bio.height);

  profile.avatar = getImageWithFallbacks(qu.q, '.profileimg img', site);

  const qReleases = ctxa(el, '.modelFeatures .modelfeature');
  profile.releases = scrapeAll(qReleases, site);

  return profile;
}
/**
 * Scrape a T1-layout model profile page.
 *
 * @param {{el: object, qu: object}} item - page element and its query context
 * @param {object} site - site entity, used for avatar URLs and release scraping
 * @returns {object} profile with avatar and releases, plus bust, waist, hip, description, age
 *   and height when listed
 */
function scrapeProfileT1({ el, qu }, site) {
  const profile = {};

  // each detail line is expected to read "Key: value"; lines without a value are skipped
  const bio = qu.all('.detail-div + .detail-div p, .detail-div p', true).reduce((acc, info) => {
    const [key, value] = info.split(':');

    if (!value) return acc;

    return {
      ...acc,
      [slugify(key, '_')]: value.trim(),
    };
  }, {});

  if (bio.measurements) {
    const [bust, waist, hip] = bio.measurements.split('-');

    // bust is kept as a string, waist and hip are converted to numbers
    if (bust) profile.bust = bust;
    if (waist) profile.waist = Number(waist);
    if (hip) profile.hip = Number(hip);
  }

  if (bio.fun_fact) profile.description = bio.fun_fact;
  if (bio.age) profile.age = Number(bio.age);

  const heightMetric = bio.height?.match(/(\d{3})(\b|c)/);

  if (heightMetric) {
    // A three-digit value is taken as centimeters and wins over the imperial parse.
    // The imperial expression matches any digit, so it would otherwise overwrite
    // a valid metric height such as "170cm" with feetInchesToCm(1, 7).
    profile.height = Number(heightMetric[1]);
  } else {
    const heightImperial = bio.height?.match(/\d{1}(\.\d)?/g);

    // first match is passed as feet, second as inches
    if (heightImperial) profile.height = feetInchesToCm(Number(heightImperial[0]), Number(heightImperial[1]));
  }

  profile.avatar = getImageWithFallbacks(qu.q, '.img-div img', site);

  const qReleases = ctxa(el, '.item-video');
  profile.releases = scrapeAllT1(qReleases, site);

  return profile;
}
/**
 * Scrape a tour-layout model profile page.
 *
 * @param {{el: object, qu: object}} item - page element and its query context
 * @param {object} site - site entity, used for avatar URLs and attached to every release
 * @returns {object} profile with avatar and releases, plus whichever bio fields are listed
 */
function scrapeProfileTour({ el, qu }, site) {
  const profile = {};

  // each bio line is expected to read "Key: value"
  const bio = qu.texts('.model_bio').reduce((acc, info) => {
    const [key, value] = info.split(':');

    // skip lines without a value instead of throwing on trim, same as scrapeProfileT1
    if (!value) return acc;

    return {
      ...acc,
      [slugify(key, '_')]: value.trim(),
    };
  }, {});

  if (bio.date_of_birth) profile.birthdate = ed(bio.date_of_birth, 'MMMM D, YYYY');
  if (bio.birthplace) profile.birthPlace = bio.birthplace;
  if (bio.fun_fact) profile.description = bio.fun_fact;

  if (bio.ethnicity) profile.ethnicity = bio.ethnicity;

  // only assign height and weight when the value starts with digits, to avoid storing NaN
  const height = bio.height?.match(/^\d{2,3}/)?.[0];
  const weight = bio.weight?.match(/^\d{2,3}/)?.[0];

  if (height) profile.height = Number(height);
  if (weight) profile.weight = Number(weight);

  if (bio.measurements) {
    const [bust, waist, hip] = bio.measurements.split('-');

    // bust is kept as a string, waist and hip are converted to numbers
    if (bust) profile.bust = bust;
    if (waist) profile.waist = Number(waist);
    if (hip) profile.hip = Number(hip);
  }

  if (bio.natural_breasts && /yes/i.test(bio.natural_breasts)) profile.naturalBoobs = true;
  if (bio.natural_breasts && /no/i.test(bio.natural_breasts)) profile.naturalBoobs = false;

  if (bio.tattoos && /yes/i.test(bio.tattoos)) profile.hasTattoos = true;
  if (bio.tattoos && /no/i.test(bio.tattoos)) profile.hasTattoos = false;

  if (bio.piercings && /yes/i.test(bio.piercings)) profile.hasPiercings = true;
  if (bio.piercings && /no/i.test(bio.piercings)) profile.hasPiercings = false;

  if (bio.aliases) profile.aliases = bio.aliases.split(',').map((alias) => alias.trim());

  profile.avatar = getImageWithFallbacks(qu.q, '.model_picture img', site);

  const qReleases = ctxa(el, '.update_block');

  profile.releases = qReleases.map((qRelease) => {
    const url = qRelease.qu.url('.update_image a[href]');
    const release = scrapeSceneTour(qRelease, site);

    // links pointing to a signup or join page are not scene URLs
    if (!/\/(signup|join)/i.test(url)) release.url = url;

    release.entryId = deriveEntryId(release);
    release.site = site;

    return release;
  });

  return profile;
}
/**
 * Fetch one page of the latest releases and scrape it with the layout-specific scraper.
 *
 * URL precedence: the `latest` parameter formatted with the page number, then the
 * T1 listing when the `t1` parameter is set, then the default listing.
 *
 * @param {object} site - site entity; `url` and `parameters` (`latest`, `t1`, `tour`) are read
 * @param {number} [page=1] - listing page number
 * @param {*} _beforeFetchLatest - unused here
 * @param {object[]} [accSiteReleases] - releases accumulated so far, forwarded to the scrapers
 * @returns {Promise<object[]|*>} scraped releases, or the response status when the request failed
 */
async function fetchLatest(site, page = 1, _beforeFetchLatest, accSiteReleases) {
  const customUrl = site.parameters?.latest && util.format(site.parameters.latest, page);
  const t1Url = site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`;
  const defaultUrl = `${site.url}/categories/movies_${page}_d.html`;

  const res = await geta(customUrl || t1Url || defaultUrl, '.modelfeature, .item-video, .updateItem');

  if (res.ok) {
    if (site.parameters?.t1) {
      return scrapeAllT1(res.items, site, accSiteReleases);
    }

    if (site.parameters?.tour) {
      return scrapeAllTour(res.items, site, accSiteReleases);
    }

    return scrapeAll(res.items, site, accSiteReleases);
  }

  return res.status;
}
/**
 * Fetch a scene page and scrape it with the layout-specific scraper.
 *
 * @param {string} url - URL of the scene page
 * @param {object} site - site entity; `parameters.t1` and `parameters.tour` select the scraper
 * @param {object} [baseRelease] - release scraped earlier, forwarded to the scraper
 * @param {RegExp|null} [beforeFetchLatest] - channel expression prepared earlier; looked up
 *   through getChannelRegExp when falsy
 * @returns {Promise<object|*>} scraped release, or the response status when the request failed
 */
async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
  const channelRegExp = beforeFetchLatest || await getChannelRegExp(site);
  const res = await get(url);

  if (res.ok) {
    if (site.parameters?.t1) {
      return scrapeSceneT1(res.item, site, url, baseRelease, channelRegExp);
    }

    if (site.parameters?.tour) {
      return scrapeSceneTour(res.item, site, url, baseRelease);
    }

    return scrapeScene(res.item, site, url, baseRelease);
  }

  return res.status;
}
/**
 * Fetch a model profile page and scrape it with the layout-specific scraper.
 *
 * The profile is first requested with the actor name slugified using an empty
 * delimiter; when that response is not ok, it is requested again with the
 * default slug.
 *
 * @param {string} actorName - name of the actor to look up
 * @param {string} scraperSlug - unused here
 * @param {object} site - site entity; `url` and `parameters` (`profile`, `t1`, `tour`) are read
 * @returns {Promise<object|*>} scraped profile, or the response status when the request failed
 */
async function fetchProfile(actorName, scraperSlug, site) {
  const compactSlug = slugify(actorName, '');
  const defaultSlug = slugify(actorName);

  const pathPrefix = site.parameters?.t1 ? 't1/' : '';

  const firstAttempt = site.parameters?.profile
    ? await get(util.format(site.parameters.profile, compactSlug))
    : await get(`${site.url}/${pathPrefix}models/${compactSlug}.html`);

  let res;

  if (firstAttempt.ok) {
    res = firstAttempt;
  } else {
    res = (site.parameters?.profile && await get(util.format(site.parameters.profile, defaultSlug)))
      || await get(`${site.url}/${pathPrefix}models/${defaultSlug}.html`);
  }

  if (res.ok) {
    if (site.parameters?.t1) {
      return scrapeProfileT1(res.item, site);
    }

    if (site.parameters?.tour) {
      return scrapeProfileTour(res.item, site);
    }

    return scrapeProfile(res.item, site);
  }

  return res.status;
}
// Public scraper interface. getChannelRegExp is exposed as the beforeFetchLatest hook;
// fetchScene accepts its result as the fourth argument and calls it itself when that is falsy.
module.exports = {
beforeFetchLatest: getChannelRegExp,
fetchLatest,
fetchScene,
fetchProfile,
};