// traxxx/src/scrapers/elevatedx.js
// (300 lines, 8.5 KiB, JavaScript)

'use strict';
const format = require('template-format');
const qu = require('../utils/q');
const slugify = require('../utils/slugify');
const { convert } = require('../utils/convert');
/**
 * Pull the `<slug>` out of a `/trailers/<slug>.html` path.
 *
 * @param {string} [url] - Release URL; may be missing or not absolute.
 * @returns {string|null} The slug, or null when the URL is absent, cannot be
 *   parsed by `new URL`, or has no trailer path.
 */
function extractTrailerSlug(url) {
  if (!url) {
    return null;
  }

  let pathname;

  try {
    ({ pathname } = new URL(url));
  } catch (error) {
    // `new URL` throws a TypeError for relative or malformed input; treat it as "no slug"
    return null;
  }

  return pathname.match(/\/trailers\/(.*)\.html/)?.[1] || null;
}

/**
 * Derive an entry ID for a release from its date plus an identifier.
 *
 * The identifier is the trailer page slug from the release URL when one can be
 * extracted, otherwise the release title. A URL without a trailer slug no longer
 * throws; it falls through to the title.
 *
 * @param {{ date?: Date, url?: string, title?: string }} release
 * @returns {string|null} `<slugified date>-<slugified identifier>`, or null when
 *   the release has no date or no usable identifier.
 */
function deriveEntryId(release) {
  if (!release.date) {
    return null;
  }

  const identifier = extractTrailerSlug(release.url) || release.title;

  if (!identifier) {
    return null;
  }

  return `${slugify(qu.formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(identifier)}`;
}
/**
 * Build the poster and photo lists for a scene.
 *
 * @param {string} [posterPath] - Poster path or URL found on the page.
 * @param {object} channel - Entity whose `parameters.media` (preferred) or `url`
 *   is handed to `qu.prefixUrl` as the base.
 * @param {object} [baseRelease] - Previously scraped release; its `poster` is
 *   used as the fallback poster, or as the only photo when a page poster exists.
 * @returns {Array} `[poster, photos]`. `poster` is a list of the 1x/2x/3x
 *   variants when a usable page poster exists, otherwise the base release
 *   poster or null. `photos` holds the base release poster only when it was
 *   displaced by a page poster.
 */
function extractPoster(posterPath, channel, baseRelease) {
  const fallbackPoster = baseRelease?.poster;
  // paths matching /400.jpg/ are rejected in favour of the base release poster
  const isUsable = Boolean(posterPath) && !/400.jpg/.test(posterPath);

  if (!isUsable) {
    return [fallbackPoster || null, []];
  }

  const mediaRoot = channel.parameters?.media || channel.url;
  const baseUrl = qu.prefixUrl(posterPath, mediaRoot);

  // same order as before: the 1x source first, then the 2x and 3x variants
  const candidates = ['-1x', '-2x', '-3x'].map((density) => baseUrl.replace('-1x', density));

  return [candidates, fallbackPoster ? [fallbackPoster] : []];
}
/**
 * Collect the `src0_3x`, `src0_2x` and `src0_1x` attribute values of an image,
 * in that order, and prepend the site's media root.
 *
 * @param {Function} q - Query function, called as `q(el, selector, attribute)`
 *   when `el` is given and as `q(selector, attribute)` otherwise.
 * @param {string} selector - Selector of the image element.
 * @param {object} site - Entity providing `parameters.media` (preferred) or `url`.
 * @param {*} [el] - Optional context element.
 * @returns {string[]} Prefixed URLs for every attribute that yielded a value.
 */
function getImageWithFallbacks(q, selector, site, el) {
  const readAttribute = el
    ? (attribute) => q(el, selector, attribute)
    : (attribute) => q(selector, attribute);

  const present = [];

  for (const attribute of ['src0_3x', 'src0_2x', 'src0_1x']) {
    const src = readAttribute(attribute);

    if (src) {
      present.push(src);
    }
  }

  return present.map((src) => `${site.parameters?.media || site.url}${src}`);
}
/**
 * Scrape a list of scene tiles using the "classic" layout.
 *
 * @param {Array<{ query: object }>} scenes - One query context per scene tile.
 * @param {object} channel - Entity whose `url` is prepended to the trailer path.
 * @returns {object[]} One release per tile, with `url`, `entryId`, `title`,
 *   `actors`, `date`, `poster` and, when the title link's `onclick` attribute
 *   contains a quoted path, `trailer`.
 */
function scrapeAllClassic(scenes, channel) {
  return scenes.map((scene) => {
    const { query } = scene;

    const release = {
      // skip title links that point at content/ or at the #coming anchor
      url: query.url('.updateInfo h5 a:not([href*="content/"]):not([href*="#coming"])'),
      entryId: query.url('.updateThumb img', 'alt'),
      title: query.cnt('.updateInfo h5 a'),
      actors: query.cnts('.tour_update_models a'),
      date: query.date('.availdate, .updateInfo p span:last-child', 'MM/DD/YYYY'),
      poster: query.img('.updateThumb img'),
    };

    // the trailer path is the single-quoted argument inside the onclick handler
    const onclick = query.q('.updateInfo h5 a', 'onclick');
    const trailerPath = onclick?.match(/'(.+)'/)?.[1];

    if (trailerPath) {
      release.trailer = `${channel.url}${trailerPath}`;
    }

    return release;
  });
}
/**
 * Scrape a list of scene tiles using the "tubular" layout.
 *
 * @param {Array<{ query: object }>} scenes - One query context per scene tile.
 * @param {object} channel - Entity; `parameters.media` (preferred) or `url`
 *   prefixes relative poster paths, `parameters.accFilter` enables the filter.
 * @param {Array<{ entryId: string }>} [accNetworkReleases] - Releases already
 *   accumulated; with `accFilter` set, scenes with a matching entry ID are dropped.
 * @returns {object[]} The scraped releases. Filtered scenes are omitted rather
 *   than returned as `null` entries.
 */
function scrapeAllTubular(scenes, channel, accNetworkReleases) {
  // build the lookup once instead of mapping the accumulated releases for every scene
  const seenEntryIds = new Set(
    channel?.parameters?.accFilter
      ? (accNetworkReleases || []).map((accRelease) => accRelease.entryId)
      : [],
  );

  return scenes
    .map(({ query }) => {
      const release = {};

      release.title = query.q('h4 a', 'title') || query.q('h4 a', true);
      release.url = query.url('h4 a');

      release.date = query.date('.more-info-div', 'MMM D, YYYY');
      release.duration = query.dur('.more-info-div');

      const posterPath = query.q('.img-div img', 'src0_1x') || query.img('img.video_placeholder');

      if (posterPath) {
        const poster = /^http/.test(posterPath)
          ? posterPath
          : `${channel.parameters?.media || channel.url}${posterPath}`;

        // highest density first, the 1x source last
        release.poster = [
          poster.replace('-1x', '-3x'),
          poster.replace('-1x', '-2x'),
          poster,
        ];
      }

      release.teaser = query.video();

      // release.entryId = q('.img-div img', 'id')?.match(/set-target-(\d+)/)[1];
      release.entryId = deriveEntryId(release);

      if (seenEntryIds.has(release.entryId)) {
        // filter out releases that were already scraped from a categorized site, requires sequential site scraping
        return null;
      }

      return release;
    })
    .filter(Boolean);
}
/**
 * Scrape a single scene page using the "classic" layout.
 *
 * @param {{ query: object, html: string }} context - Query context and raw page HTML.
 * @param {string} url - Scene URL (not used by this layout).
 * @param {object} channel - Entity whose `url` is prepended to the trailer path.
 * @returns {object} Release with `title`, `poster`, `entryId` (undefined when the
 *   poster is missing or has no `/content/<id>/` segment) and, when an `.mp4`
 *   `src` attribute is present in the HTML, `trailer`.
 */
function scrapeSceneClassic({ query, html }, url, channel) {
  const release = {};

  release.title = query.q('.updatesBlock h2', true);
  release.poster = query.meta('property="og:image"');

  // the og:image meta may be absent; do not crash on a missing poster
  release.entryId = release.poster?.match(/\/content\/(.*)\//)?.[1];

  // stop at the closing quote so one attribute value cannot run into the next
  const trailer = html.match(/src="([^"]+\.mp4)"/)?.[1];

  if (trailer) {
    release.trailer = {
      src: `${channel.url}${trailer}`,
    };
  }

  return release;
}
/**
 * Scrape a single scene page using the "tubular" layout.
 *
 * @param {{ query: object, html: string }} context - Query context and raw page HTML.
 * @param {object} entity - Channel or network; `parameters.media` (preferred) or
 *   `url` prefixes media paths. For a network, `children` are matched against
 *   the scene tags to find the channel.
 * @param {string} url - Scene URL, used as the trailer referer.
 * @param {object} [baseRelease] - Previously scraped release, passed to `extractPoster`.
 * @returns {object} Release with title, description, date, duration, actors,
 *   tags, poster, photos, entryId and, when available, trailer, stars and channel.
 */
function scrapeSceneTubular({ query, html }, entity, url, baseRelease) {
  const release = {};

  release.title = query.q('.trailer-section-head .section-title, .title-block .section-title', true);
  release.description = query.text('.row .update-info-block');

  release.date = query.date('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/);
  release.duration = query.dur('.update-info-row:nth-child(2)');

  release.actors = query.all('.models-list-thumbs a').map((el) => ({
    name: query.cnt(el, 'span'),
    avatar: getImageWithFallbacks(query.q, 'img', entity, el),
    url: query.url(el, null),
  }));

  release.tags = query.all('.tags a', true);

  const posterPath = query.q('.player-thumb img', 'src0_1x');
  const trailer = html.match(/<video.*src="(.*\.mp4)/)?.[1];

  [release.poster, release.photos] = extractPoster(posterPath, entity, baseRelease);

  if (trailer) {
    release.trailer = { src: qu.prefixUrl(trailer, entity.parameters?.media || entity.url), referer: url };
  }

  // the rating element may be absent; only a decimal such as "4.5" counts as a rating
  const stars = query.q('.update-rating', true)?.match(/\d\.\d/)?.[0];

  if (stars) {
    release.stars = Number(stars);
  }

  if (entity.type === 'network') {
    // `parameters.match` is used as a regular expression source, hence no escaping here
    const channelPatterns = (entity.children || [])
      .map((channel) => channel.parameters?.match || channel.name)
      .filter(Boolean);

    // an empty alternation would match every tag, so skip matching without patterns
    if (channelPatterns.length > 0) {
      const channelRegExp = new RegExp(channelPatterns.join('|'), 'i');
      const channel = release.tags.find((tag) => channelRegExp.test(tag));

      if (channel) {
        release.channel = slugify(channel, '');
      }
    }
  }

  release.entryId = deriveEntryId(release);

  return release;
}
/**
 * Scrape an actor profile page.
 *
 * @param {{ query: object }} context - Query context for the profile page.
 * @param {object} entity - Entity whose `url` prefixes avatar paths and which is
 *   passed on to the scene list scrapers.
 * @param {object} [parameters] - `layout` ('classic' or 'tubular') selects the
 *   scraper for the scenes listed on the profile.
 * @returns {Promise<object>} Profile with `age`, `dateOfBirth`, `height`,
 *   `measurements`, `aliases` and, when found, `avatar` and `scenes`.
 */
async function scrapeProfile({ query }, entity, parameters) {
  const profile = {};

  const bio = query.cnt('.model_bio, .detail-div');
  const avatarEl = query.q('.model_bio_pic img, .model_bio_thumb');

  // the "Age:" label is followed by either a two-digit age or a written-out date
  profile.age = Number(bio?.match(/Age:\s*(\d{2})/)?.[1]) || null;
  // pass only the captured date, not the whole match with its "Age:" label
  profile.dateOfBirth = qu.parseDate(bio?.match(/Age:\s*(\w+ \d{1,2}, \d{4})/)?.[1], 'MMMM D, YYYY');

  profile.height = convert(bio?.match(/\d+\s*(feet|')\s*\d+\s*(inches|"|$)/)?.[0], 'cm');
  profile.measurements = bio?.match(/\w+[-x]\d+[-x]\d+/)?.[0] || null;

  // capture the whole comma-separated list; a quantified capture group keeps only its last repetition
  const aliasList = bio?.match(/also known as:\s*((?:[\w\s]+(?:,\s*)?)+)/i)?.[1];

  profile.aliases = aliasList
    ? aliasList.split(/,\s*/).map((alias) => alias.trim()).filter(Boolean)
    : [];

  if (avatarEl) {
    const avatarSources = [
      avatarEl.getAttribute('src0_3x'),
      avatarEl.getAttribute('src0_2x'),
      avatarEl.getAttribute('src0_1x'),
      avatarEl.getAttribute('src0'),
      avatarEl.getAttribute('src'),
    ]
      .filter((avatar) => avatar && !/p\d+.jpe?g/.test(avatar)) // remove non-existing attributes and placeholder images
      .map((avatar) => qu.prefixUrl(avatar, entity.url));

    if (avatarSources.length) {
      profile.avatar = avatarSources;
    }
  }

  if (parameters?.layout === 'classic') {
    profile.scenes = scrapeAllClassic(qu.initAll(query.all('.bodyArea .updateItem')), entity);
  }

  if (parameters?.layout === 'tubular') {
    profile.scenes = scrapeAllTubular(qu.initAll(query.all('.modelfeature, .item-video')), entity);
  }

  return profile;
}
/**
 * Fetch one page of latest releases and hand the scene tiles to a layout scraper.
 *
 * @param {object} site - Entity; `parameters.latest` is a URL template taking
 *   `page`, otherwise `<site.url>/categories/movies_<page>_d.html` is used.
 * @param {number} [page=1] - Page number.
 * @param {object} options - Unused here.
 * @param {object} [preData] - `uniqueReleases` is forwarded to the scraper.
 * @param {Function} allScraper - Called as `allScraper(items, site, uniqueReleases)`.
 * @returns {Promise<*>} The scraper's result, or the response status when the
 *   request did not succeed.
 */
async function fetchLatest(site, page = 1, options, preData, allScraper) {
  const template = site.parameters?.latest;
  let url = template && format(template, { page });

  if (!url) {
    url = `${site.url}/categories/movies_${page}_d.html`;
  }

  const res = await qu.getAll(url, '.modelfeature, .item-video, .bodyArea .updateItem');

  return res.ok
    ? allScraper(res.items, site, preData?.uniqueReleases)
    : res.status;
}
/**
 * Fetch the upcoming scenes listed on a classic-layout channel's front page.
 *
 * @param {object} channel - Entity whose `url` is fetched.
 * @returns {Promise<*>} Releases scraped by `scrapeAllClassic`, or the response
 *   status when the request did not succeed.
 */
async function fetchUpcomingClassic(channel) {
  const res = await qu.getAll(channel.url, '#owl-upcomingScenes .updateItem');

  if (!res.ok) {
    return res.status;
  }

  return scrapeAllClassic(res.items, channel);
}
/**
 * Fetch a page of latest releases for a classic-layout channel.
 * Delegates to `fetchLatest` with `scrapeAllClassic` as the tile scraper.
 *
 * @returns {Promise<*>} Whatever `fetchLatest` resolves to.
 */
async function fetchLatestClassic(channel, page, options, preData) {
  const latest = await fetchLatest(channel, page, options, preData, scrapeAllClassic);

  return latest;
}
/**
 * Fetch a page of latest releases for a tubular-layout channel.
 * Delegates to `fetchLatest` with `scrapeAllTubular` as the tile scraper.
 *
 * @returns {Promise<*>} Whatever `fetchLatest` resolves to.
 */
async function fetchLatestTubular(channel, page, options, preData) {
  const latest = await fetchLatest(channel, page, options, preData, scrapeAllTubular);

  return latest;
}
/**
 * Fetch and scrape an actor profile, trying candidate URLs one after another.
 *
 * Candidates, in order: the actor's known `url`, then the profile URL built
 * from the actor name slugged without a delimiter, then with '-' as delimiter.
 * Profile URLs come from the `entity.parameters.profile` template when set,
 * otherwise from `<entity.url>/models/<slug>.html`.
 *
 * @param {{ name: string, url?: string }} actor
 * @param {{ entity: object, parameters?: object }} context
 * @returns {Promise<object|null>} The first successfully scraped profile, or
 *   null when there is nothing to try or no candidate could be fetched.
 */
async function fetchProfile({ name: actorName, url }, { entity, parameters }) {
  if (!url && !parameters?.profile && !entity.url) {
    return null;
  }

  const profileTemplate = entity.parameters?.profile;

  const buildProfileUrl = (actorSlug) => {
    if (profileTemplate) {
      return format(profileTemplate, { actorSlug });
    }

    // without a site URL there is no default profile location to try
    return entity.url ? `${entity.url}/models/${actorSlug}.html` : null;
  };

  // Set drops duplicates, e.g. both slugs being identical for a single-word name
  const urls = Array.from(new Set([
    url,
    buildProfileUrl(slugify(actorName, '')),
    buildProfileUrl(slugify(actorName, '-')),
  ].filter(Boolean)));

  // sequential on purpose: each request only happens after the previous candidates failed
  return urls.reduce(async (chain, profileUrl) => {
    const profile = await chain;

    if (profile) {
      return profile;
    }

    const res = await qu.get(profileUrl);

    // prefer the `ok` flag the other fetchers in this file rely on; keep the raw status code as fallback
    const succeeded = res.ok ?? (res.statusCode === 200);

    if (succeeded) {
      return scrapeProfile(res.item, entity, parameters);
    }

    return null;
  }, Promise.resolve(null));
}
module.exports = {
classic: {
fetchLatest: fetchLatestClassic,
fetchUpcoming: fetchUpcomingClassic,
fetchProfile,
scrapeAll: scrapeAllClassic,
scrapeScene: scrapeSceneClassic,
},
tubular: {
fetchLatest: fetchLatestTubular,
fetchProfile,
scrapeAll: scrapeAllTubular,
scrapeScene: scrapeSceneTubular,
},
};