// traxxx/src/scrapers/hush.js
'use strict';
const util = require('util');
2020-03-05 01:47:52 +00:00
const qu = require('../utils/q');
2020-03-05 01:47:52 +00:00
const slugify = require('../utils/slugify');
const { feetInchesToCm, inchesToCm } = require('../utils/convert');
2020-03-05 01:47:52 +00:00
/**
 * Builds an entry ID for a release from its date plus either the URL slug or the title.
 *
 * The URL slug (the path segment between `/trailers/` and `.html`) is preferred. When the
 * URL is absent or does not follow that pattern, the slugified title is used instead.
 *
 * @param {{ date?: *, url?: string, title?: string }} release - Partially scraped release;
 *   `date` is whatever `qu.formatDate` accepts.
 * @returns {string|null} `<YYYY-MM-DD>-<slug>`, or null when the date or both slug sources are missing.
 * @throws {TypeError} When `release.url` is set but is not an absolute URL (from `new URL`).
 */
function deriveEntryId(release) {
  if (!release.date) {
    return null;
  }

  const datePart = slugify(qu.formatDate(release.date, 'YYYY-MM-DD'));

  // Not every URL follows /trailers/<slug>.html; a failed match must fall through, not throw.
  const urlSlug = release.url
    ? new URL(release.url).pathname.match(/\/trailers\/(.*)\.html/)?.[1]
    : null;

  if (urlSlug) {
    return `${datePart}-${slugify(urlSlug)}`;
  }

  if (release.title) {
    return `${datePart}-${slugify(release.title)}`;
  }

  return null;
}
/**
 * Resolves the poster sources and extra photos for a scene.
 *
 * @param {string|null} posterPath - Poster path taken from the page, appended to the media origin.
 * @param {{ url: string, parameters?: { media?: string } }} site - Site whose `parameters.media`
 *   (or `url` as fallback) is used as the origin.
 * @param {{ poster?: * }} [baseRelease] - Previously scraped release, if any.
 * @returns {Array} A `[poster, photos]` pair. With a usable path, `poster` is the list of
 *   1x/2x/3x source URLs and `photos` holds the base release's poster when present. Otherwise
 *   `poster` is the base release's poster (or null) and `photos` is empty.
 */
function extractPoster(posterPath, site, baseRelease) {
  const inheritedPoster = baseRelease?.poster;
  const isUsable = Boolean(posterPath) && !/400.jpg/.test(posterPath);

  if (!isUsable) {
    return [inheritedPoster || null, []];
  }

  const origin = site.parameters?.media || site.url;
  const baseUrl = `${origin}${posterPath}`;

  // The page's own 1x source goes first, followed by the upscaled variants.
  const sources = [
    baseUrl,
    ...['-2x', '-3x'].map((suffix) => baseUrl.replace('-1x', suffix)),
  ];

  return [sources, inheritedPoster ? [inheritedPoster] : []];
}
/**
 * Collects the 3x, 2x and 1x sources of an image as absolute URLs, highest resolution first.
 *
 * @param {Function} q - Query function, called as `q(selector, attribute)` or, when `el`
 *   is given, as `q(el, selector, attribute)`.
 * @param {string} selector - Selector of the image element.
 * @param {{ url: string, parameters?: { media?: string } }} site - Site whose `parameters.media`
 *   (or `url` as fallback) is prepended to every source.
 * @param {*} [el] - Optional element to scope the query to.
 * @returns {string[]} Prefixed source URLs; attributes that yield a falsy value are skipped.
 */
function getImageWithFallbacks(q, selector, site, el) {
  const origin = site.parameters?.media || site.url;

  return ['src0_3x', 'src0_2x', 'src0_1x']
    .map((attribute) => (el ? q(el, selector, attribute) : q(selector, attribute)))
    .filter(Boolean)
    .map((src) => `${origin}${src}`);
}
/**
 * Scrapes the scene tiles of an overview page.
 *
 * @param {Array<{ query: Object }>} scenes - Query contexts, one per scene tile.
 * @param {{ url: string }} channel - Channel whose URL is passed as origin for image lookups.
 * @returns {Object[]} One release per tile with title, url, date, duration, poster, photos,
 *   stars and entryId.
 */
function scrapeAll(scenes, channel) {
  return scenes.map(({ query }) => {
    const release = {};

    release.title = query.q('h4 a', true);
    release.url = query.url('a');

    release.date = query.date('.date', 'YYYY-MM-DD');
    release.duration = query.duration('.time');

    // The thumbnail's `cnt` attribute holds the number of numbered sources (src0_*, src1_*, ...).
    const count = query.number('a img', null, 'cnt');

    // Every image is a list of fallbacks, highest resolution first.
    const images = Array.from({ length: count ?? 0 }, (value, index) => ['3x', '2x', '1x'].map(
      (scale) => query.img('a img', `src${index}_${scale}`, { origin: channel.url }),
    ));

    // The first image is the poster, the remainder are photos.
    [release.poster, ...release.photos] = images;

    release.stars = query.count('img[src*="star_full"]') + (query.count('img[src*="star_half"]') * 0.5);
    release.entryId = deriveEntryId(release);

    return release;
  });
}
/**
 * Scrapes the scene tiles of a T1-layout overview page.
 *
 * @param {Array<{ query: Object }>} scenes - Query contexts, one per scene tile.
 * @param {{ url: string, parameters?: { media?: string, accFilter?: boolean } }} site - Site
 *   being scraped; `parameters.media` (or `url`) prefixes relative poster paths.
 * @param {Array<{ entryId: string }>} [accNetworkReleases] - Releases scraped so far; only
 *   consulted when `site.parameters.accFilter` is set.
 * @returns {Object[]} Releases, excluding those filtered out by `accFilter`.
 */
function scrapeAllT1(scenes, site, accNetworkReleases) {
  const mediaOrigin = site.parameters?.media || site.url;

  // Build the lookup once instead of re-mapping the accumulated releases for every scene.
  const knownEntryIds = site.parameters?.accFilter && accNetworkReleases
    ? new Set(accNetworkReleases.map((accRelease) => accRelease.entryId))
    : null;

  return scenes.map(({ query }) => {
    const release = {};

    release.title = query.q('h4 a', 'title') || query.q('h4 a', true);
    release.url = query.url('h4 a');

    release.date = query.date('.more-info-div', 'MMM D, YYYY');
    release.duration = query.dur('.more-info-div');

    if (/bts|behind the scenes/i.test(release.title)) {
      release.tags = ['behind the scenes'];
    }

    const posterPath = query.q('.img-div img', 'src0_1x') || query.img('img.video_placeholder');

    if (posterPath) {
      const poster = /^http/.test(posterPath) ? posterPath : `${mediaOrigin}${posterPath}`;

      // Highest resolution first, the source found on the page last.
      release.poster = [
        poster.replace('-1x', '-3x'),
        poster.replace('-1x', '-2x'),
        poster,
      ];
    }

    // The numeric set ID in the thumbnail's id attribute (set-target-<n>) is an alternative ID source.
    release.entryId = deriveEntryId(release);

    if (knownEntryIds?.has(release.entryId)) {
      // filter out releases that were already scraped from a categorized site, requires sequential site scraping
      return null;
    }

    return release;
  }).filter(Boolean);
}
/**
 * Scrapes a scene page.
 *
 * @param {{ html: string, query: Object }} context - Raw page HTML and its query helpers.
 * @param {{ url: string, parameters?: { media?: string } }} channel - Channel the scene belongs to.
 * @param {string} url - Scene URL; stored on the release and used to derive the entry ID.
 * @returns {Object} The scraped release. `duration` is null when no "<n> min" runtime is found.
 */
function scrapeScene({ html, query }, channel, url) {
  const release = { url }; // url used for entry ID

  release.title = query.cnt('.videoDetails h3');
  release.description = query.cnt('.videoDetails p');

  release.date = query.date('.videoInfo p', ['MM/DD/YYYY', 'YYYY-MM-DD']);

  // Without a runtime match, Number(undefined) * 60 would store NaN.
  const minutes = query.cnt('.videoInfo p:nth-of-type(2)')?.match(/(\d+) min/i)?.[1];
  release.duration = minutes ? Number(minutes) * 60 : null;

  release.actors = query.cnts('.update_models a');

  const posterPath = html.match(/poster="([\w-/.]+)"/)?.[1];
  const poster = qu.prefixUrl(posterPath, channel.url) || query.img('.update_thumb', 'src0_1x', { origin: channel.url }); // latter used when trailer requires signup

  // The first entry is the poster, the rest are photos; each becomes a 3x/2x/1x fallback list.
  [release.poster, ...release.photos] = [poster, ...query.imgs('.item-thumb img', 'src0_1x', { origin: channel.url })]
    .map((src) => src && [
      src.replace('-1x', '-3x'),
      src.replace('-1x', '-2x'),
      src,
    ]);

  // Take the matched string itself; String.prototype.match returns an array-like match object.
  const trailerPath = html.match(/\/trailers?\/.*\.mp4/)?.[0];

  if (trailerPath) {
    release.trailer = qu.prefixUrl(trailerPath, channel.parameters?.media || channel.url);
  }

  release.tags = query.cnts('.featuring a[href*="categories/"]');
  release.stars = query.count('.stars img[src*="star_full"]') + (query.count('.stars img[src*="star_half"]') * 0.5);

  release.entryId = deriveEntryId(release);

  return release;
}
/**
 * Scrapes a T1-layout scene page.
 *
 * @param {{ html: string, query: Object }} context - Raw page HTML and its query helpers.
 * @param {{ url: string, type?: string, children?: Object[], parameters?: { media?: string } }} site -
 *   Site being scraped; for a network, its children are matched against the scene tags.
 * @param {string} url - Scene URL; stored on the release and used as trailer referer.
 * @param {{ poster?: * }} [baseRelease] - Release scraped earlier, passed on to `extractPoster`.
 * @returns {Object} The scraped release.
 */
function scrapeSceneT1({ html, query }, site, url, baseRelease) {
  const mediaOrigin = site.parameters?.media || site.url;
  const release = { url };

  release.title = query.q('.trailer-section-head .section-title', true);
  release.description = query.text('.row .update-info-block');

  release.date = query.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/);
  release.duration = query.dur('.update-info-row:nth-child(2)');

  release.actors = query.all('.models-list-thumbs a').map((el) => ({
    name: query.q(el, 'span', true),
    avatar: getImageWithFallbacks(query.q, 'img', site, el),
  }));

  release.tags = query.all('.tags a', true);

  const posterPath = query.q('.player-thumb img', 'src0_1x');
  [release.poster, release.photos] = extractPoster(posterPath, site, baseRelease);

  const trailer = html.match(/<video.*src="(.*\.mp4)/)?.[1];

  if (trailer) {
    release.trailer = {
      src: /^http/.test(trailer) ? trailer : `${mediaOrigin}${trailer}`,
      referer: url,
    };
  }

  // The rating element is not guaranteed to be present; a missing one must not throw.
  const stars = query.q('.update-rating', true)?.match(/\d.\d/)?.[0];
  if (stars) release.stars = Number(stars);

  // An empty child list would yield an empty pattern, which matches every tag.
  if (site.type === 'network' && site.children?.length > 0) {
    const channelRegExp = new RegExp(site.children.map((channel) => channel.parameters?.match || channel.name).join('|'), 'i');
    const channel = release.tags.find((tag) => channelRegExp.test(tag));

    if (channel) {
      release.channel = slugify(channel, '');
    }
  }

  // The numeric set ID in the player thumbnail's id attribute (set-target-<n>) is an alternative ID source.
  release.entryId = deriveEntryId(release);

  return release;
}
/**
 * Scrapes a T1-layout actor profile page.
 *
 * @param {{ el: *, query: Object }} context - Page element and its query helpers.
 * @param {{ url: string, parameters?: Object }} site - Site being scraped.
 * @returns {Object} Profile with whichever of bust, waist, hip, description, age and height
 *   could be parsed, plus avatar and releases.
 */
function scrapeProfileT1({ el, query }, site) {
  const profile = {};

  // Every detail paragraph reads "<Label>: <value>"; collect them keyed by slugified label.
  const bio = {};

  for (const info of query.all('.detail-div + .detail-div p, .detail-div p', true)) {
    // Split on the first colon only, so values that contain a colon stay intact.
    const separatorIndex = info.indexOf(':');
    const value = separatorIndex === -1 ? '' : info.slice(separatorIndex + 1).trim();

    if (value) {
      bio[slugify(info.slice(0, separatorIndex), '_')] = value;
    }
  }

  if (bio.measurements) {
    const [bust, waist, hip] = bio.measurements.split('-');

    if (bust) profile.bust = bust;
    if (waist) profile.waist = Number(waist);
    if (hip) profile.hip = Number(hip);
  }

  if (bio.fun_fact) profile.description = bio.fun_fact;
  if (bio.age) profile.age = Number(bio.age);

  const heightMetric = bio.height?.match(/(\d{3})(\b|c)/);
  const heightImperial = bio.height?.match(/\d{1}(\.\d)?/g);

  // The imperial pattern matches any digit, so it must only apply when no metric height was found.
  if (heightMetric) {
    profile.height = Number(heightMetric[1]);
  } else if (heightImperial) {
    profile.height = feetInchesToCm(Number(heightImperial[0]), Number(heightImperial[1]));
  }

  profile.avatar = getImageWithFallbacks(query.q, '.img-div img', site);

  const qReleases = qu.initAll(el, '.item-video');
  profile.releases = scrapeAllT1(qReleases, site);

  return profile;
}
/**
 * Scrapes an actor's scenes from the current page and every following page.
 *
 * @param {{ query: Object, el: * }} context - Page element and its query helpers.
 * @param {Object} channel - Channel passed through to `scrapeAll`.
 * @param {Object[]} [accScenes=[]] - Scenes collected from earlier pages.
 * @returns {Promise<Object[]>} All scenes in page order. Pagination stops at the first page
 *   without a next link, or when fetching the next page fails.
 */
async function fetchActorScenes({ query, el }, channel, accScenes = []) {
  // Always append, so the result keeps page order however many pages are followed.
  const scenes = [...accScenes, ...scrapeAll(qu.initAll(el, '.item-video'), channel)];
  const nextPage = query.url('.next a');

  if (nextPage) {
    const res = await qu.get(nextPage);

    if (res.ok) {
      return fetchActorScenes(res.item, channel, scenes);
    }
  }

  return scenes;
}
/**
 * Scrapes an actor profile page.
 *
 * @param {{ query: Object, el: * }} context - Page element and its query helpers.
 * @param {{ url: string }} channel - Channel whose URL is passed as origin for the avatar lookups.
 * @param {{ includeActorScenes?: boolean }} [options] - When `includeActorScenes` is set, the
 *   actor's scenes are fetched and stored as `releases`.
 * @returns {Promise<Object>} The profile; a field is only present when its bio entry exists
 *   (and, for height and weight, starts with a 2-3 digit number).
 */
async function scrapeProfile({ query, el }, channel, options) {
  const profile = {};

  // Every stats item holds a label in <strong> plus either a link or plain text.
  const bio = {};

  for (const bioEl of query.all('.stats li')) {
    const key = query.cnt(bioEl, 'strong');

    bio[slugify(key, '_')] = query.url(bioEl) || query.text(bioEl);
  }

  // Reads "<n> cm" directly and converts "<n> in"; null when neither unit is found.
  const parseLengthCm = (text) => {
    const centimeters = text.match(/(\d+(?:\.\d+)?)\s*cm/i)?.[1];

    if (centimeters) {
      return Number(centimeters) || null;
    }

    const inches = text.match(/(\d+(?:\.\d+)?)\s*in/i)?.[1];

    return inches ? Number(inchesToCm(inches)) || null : null;
  };

  if (bio.date_of_birth) profile.birthdate = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
  if (bio.birthplace) profile.birthPlace = bio.birthplace;
  if (bio.fun_fact) profile.description = bio.fun_fact;

  if (bio.ethnicity) profile.ethnicity = bio.ethnicity;

  // Only store a number when one was actually found, instead of NaN.
  const height = bio.height?.match(/^\d{2,3}/)?.[0];
  const weight = bio.weight?.match(/^\d{2,3}/)?.[0];

  if (height) profile.height = Number(height);
  if (weight) profile.weight = Number(weight);

  if (bio.measurements) {
    const [bust, waist, hip] = bio.measurements.split('-');

    if (bust) profile.bust = bust;
    if (waist) profile.waist = Number(waist);
    if (hip) profile.hip = Number(hip);
  }

  if (bio.penis_length) profile.penisLength = parseLengthCm(bio.penis_length);
  if (bio.penis_girth) profile.penisGirth = parseLengthCm(bio.penis_girth);

  // Yes/no bio entries and the profile property each one maps to.
  const flags = {
    circumcised: 'circumcised',
    natural_breasts: 'naturalBoobs',
    tattoos: 'hasTattoos',
    piercings: 'hasPiercings',
  };

  for (const [bioKey, profileKey] of Object.entries(flags)) {
    const answer = bio[bioKey];

    // "no" is checked last, so it wins when both words occur.
    if (answer && /yes/i.test(answer)) profile[profileKey] = true;
    if (answer && /no/i.test(answer)) profile[profileKey] = false;
  }

  if (bio.aliases) profile.aliases = bio.aliases.split(',').map((alias) => alias.trim());

  profile.social = [bio.onlyfans, bio.twitter, bio.instagram].filter(Boolean);

  profile.avatar = [
    query.img('.profile-pic img', 'src0_3x', { origin: channel.url }),
    query.img('.profile-pic img', 'src0_2x', { origin: channel.url }),
    query.img('.profile-pic img', 'src0_1x', { origin: channel.url }),
  ];

  if (options?.includeActorScenes) {
    profile.releases = await fetchActorScenes({ query, el }, channel);
  }

  return profile;
}
/**
 * Fetches and scrapes one page of a site's latest releases.
 *
 * The page URL is `site.parameters.latest` formatted with the page number when set, otherwise
 * the `/t1/categories/` path for T1 sites, otherwise the `/categories/` path.
 *
 * @param {{ url: string, parameters?: { latest?: string, t1?: boolean } }} site - Site to scrape.
 * @param {number} [page=1] - Page number.
 * @param {*} [include] - Unused here; kept for the shared scraper signature.
 * @param {{ uniqueReleases?: Object[], duplicateReleases?: Object[] }} [acc] - Releases scraped
 *   so far, handed to the T1 scraper for filtering.
 * @returns {Promise<Object[]|number>} Scraped releases, or the response status when the
 *   request fails.
 */
async function fetchLatest(site, page = 1, include, { uniqueReleases = [], duplicateReleases = [] } = {}) {
  const url = (site.parameters?.latest && util.format(site.parameters.latest, page))
    || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
    || `${site.url}/categories/movies_${page}_d.html`;

  const res = await qu.getAll(url, '.modelfeature, .item-video, .updateItem');

  if (!res.ok) {
    return res.status;
  }

  if (site.parameters?.t1) {
    return scrapeAllT1(res.items, site, [...uniqueReleases, ...duplicateReleases]);
  }

  return scrapeAll(res.items, site);
}
/**
 * Fetches a scene page and scrapes it with the scraper matching the site's layout.
 *
 * @param {string} url - Scene URL.
 * @param {{ parameters?: { t1?: boolean } }} site - Site the scene belongs to.
 * @param {Object} [baseRelease] - Release scraped earlier, passed on to the scene scraper.
 * @returns {Promise<Object|number>} The scraped release, or the response status when the
 *   request fails.
 */
async function fetchScene(url, site, baseRelease) {
  const res = await qu.get(url);

  if (!res.ok) {
    return res.status;
  }

  if (site.parameters?.t1) {
    return scrapeSceneT1(res.item, site, url, baseRelease);
  }

  return scrapeScene(res.item, site, url, baseRelease);
}
/**
 * Looks up an actor's profile page and scrapes it.
 *
 * The page is requested with the actor's name slugified without delimiter first; when that
 * response is not ok, the request is repeated with the default slug. URLs come from
 * `site.parameters.profile` when set, otherwise from the site's `models/` path (under `t1/`
 * for T1 sites), requested with redirects disabled.
 *
 * @param {{ name: string }} actor - Actor to look up.
 * @param {{ site: Object }} context - Holds the site to search.
 * @param {Object} [options] - Passed through to `scrapeProfile`.
 * @returns {Promise<Object|number>} The scraped profile, or the status of the last response
 *   when no request succeeds.
 */
async function fetchProfile({ name: actorName }, { site }, options) {
  const compactSlug = slugify(actorName, '');
  const defaultSlug = slugify(actorName);

  const profileTemplate = site.parameters?.profile;
  const isT1 = site.parameters?.t1;

  const getModelPage = (slug) => qu.get(
    `${site.url}/${isT1 ? 't1/' : ''}models/${slug}.html`,
    null,
    null,
    { followRedirects: false },
  );

  let res = profileTemplate
    ? await qu.get(util.format(profileTemplate, compactSlug))
    : await getModelPage(compactSlug);

  if (!res.ok) {
    // Second attempt with the default slug; the models path is the last resort.
    const templated = profileTemplate && await qu.get(util.format(profileTemplate, defaultSlug));

    res = templated || await getModelPage(defaultSlug);
  }

  if (!res.ok) {
    return res.status;
  }

  return isT1
    ? scrapeProfileT1(res.item, site)
    : scrapeProfile(res.item, site, options);
}
2020-03-05 01:47:52 +00:00
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeAllT1,
2020-03-05 01:47:52 +00:00
};