'use strict'; const util = require('util'); const knex = require('../knex'); const { get, geta, ed, fd, ctxa } = require('../utils/q'); const slugify = require('../utils/slugify'); const { feetInchesToCm } = require('../utils/convert'); async function getChannelRegExp(site) { if (!['hushpass', 'interracialpass'].includes(site.network.slug)) return null; const sites = await knex('sites').where('network_id', site.network.id); return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i'); } function deriveEntryId(release) { if (release.date && release.title) { return `${slugify(fd(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`; } return null; } function extractPoster(posterPath, site, baseRelease) { if (posterPath && !/400.jpg/.test(posterPath)) { const poster = `${site.parameters?.media || site.url}${posterPath}`; const posterSources = [ poster, // upscaled poster.replace('-1x', '-2x'), poster.replace('-1x', '-3x'), ]; if (baseRelease?.poster) { return [posterSources, [baseRelease.poster]]; } return [posterSources, []]; } return [baseRelease?.poster || null, []]; } function getImageWithFallbacks(q, selector, site, el) { const sources = el ? [ q(el, selector, 'src0_3x'), q(el, selector, 'src0_2x'), q(el, selector, 'src0_1x'), ] : [ q(selector, 'src0_3x'), q(selector, 'src0_2x'), q(selector, 'src0_1x'), ]; return sources.filter(Boolean).map(src => `${site.parameters?.media || site.url}${src}`); } function scrapeAll(scenes, site) { return scenes.map(({ q, qu, qd, ql }) => { const release = {}; release.title = q('h3 a', 'title') || q('h3 a', true); release.url = qu('h3 a'); release.date = qd('.modeldata p', 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/); release.duration = ql('.modeldata p'); if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes']; release.poster = getImageWithFallbacks(q, '.modelimg img', site); // release.entryId = q('.modelimg img', 'id').match(/set-target-(\d+)/)[1]; release.entryId = deriveEntryId(release); return release; }); } function scrapeAllT1(scenes, site, accSiteReleases) { return scenes.map(({ q, qi, qd, ql, qu }) => { const release = {}; release.title = q('h4 a', 'title') || q('h4 a', true); release.url = qu('h4 a'); release.date = qd('.more-info-div', 'MMM D, YYYY'); release.duration = ql('.more-info-div'); if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes']; const posterPath = q('.img-div img', 'src0_1x') || qi('img.video_placeholder'); if (posterPath) { const poster = /^http/.test(posterPath) ? posterPath : `${site.parameters?.media || site.url}${posterPath}`; release.poster = [ poster.replace('-1x', '-3x'), poster.replace('-1x', '-2x'), poster, ]; } // release.entryId = q('.img-div img', 'id')?.match(/set-target-(\d+)/)[1]; release.entryId = deriveEntryId(release); if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) { // filter out releases that were already scraped from a categorized site return null; } return release; }).filter(Boolean); } function scrapeAllTour(scenes) { return scenes.map(({ q, qa, qu, qd, qi }) => { const release = {}; release.title = q('h4 a', true); release.url = qu('a'); release.date = qd('.tour_update_models + span', 'YYYY-MM-DD'); release.actors = qa('.tour_update_models a', true); release.poster = qi('a img'); release.entryId = deriveEntryId(release); return release; }); } function scrapeScene({ html, q, qa, qd, ql }, site, url, baseRelease) { const release = { url }; release.title = q('.centerwrap h2', true); release.description = q('.videocontent p', true); release.date = qd('.videodetails .date', 'MM/DD/YYYY'); release.duration = ql('.videodetails .date'); release.actors = qa('.modelname a', true); const posterPath = html.match(/poster="([\w-/.]+)"/)?.[1]; [release.poster, release.photos] = extractPoster(posterPath, site, baseRelease); const trailerPath = html.match(/\/trailers\/.*.mp4/); if (trailerPath) release.trailer = { src: `${site.parameters?.media || site.url}${trailerPath}` }; const stars = q('.modelrates + p', true).match(/\d.\d/)?.[0]; if (stars) release.stars = Number(stars); // release.entryId = html.match(/set-target-(\d+)/)[1]; release.entryId = deriveEntryId(release); return release; } function scrapeSceneT1({ html, q, qa, qd, ql, qtx }, site, url, baseRelease, channelRegExp) { const release = { url }; release.title = q('.trailer-section-head .section-title', true); release.description = qtx('.row .update-info-block'); release.date = qd('.update-info-row', 'MMM D, YYYY', /\w+ \d{1,2}, \d{4}/); release.duration = ql('.update-info-row:nth-child(2)'); release.actors = qa('.models-list-thumbs a').map(el => ({ name: q(el, 'span', true), avatar: getImageWithFallbacks(q, 'img', site, el), })); release.tags = qa('.tags a', true); // const posterPath = html.match(/poster="(.*\.jpg)/)?.[1]; const posterPath = q('.player-thumb img', 'src0_1x'); [release.poster, release.photos] = extractPoster(posterPath, site, baseRelease); const trailer = html.match(/ channelRegExp.test(tag)); if (channel) { release.channel = { force: true, slug: slugify(channel, { delimiter: '' }), }; } } // release.entryId = q('.player-thumb img', 'id')?.match(/set-target-(\d+)/)[1]; release.entryId = deriveEntryId(release); return release; } function scrapeSceneTour({ html, q, qd, qa, qis }, site, url) { const release = {}; if (url) release.url = url; release.title = q('.update_title, .video-title', true); release.description = q('.latest_update_description, .video-summary', true); const date = qd('.availdate, .update_date', 'YYYY-MM-DD'); if (date) release.date = date; release.actors = qa('.update_block_info .tour_update_models a, .video-model .tour_update_models a', true); release.tags = qa('.update_tags a, .tour_update_tags a', true); const [photo, poster, ...photos] = qis('.update_image img:not(.play_icon_overlay)'); if (poster || photo) release.poster = poster || photo; if ((photo && poster) || photos) release.photos = poster ? [photo, ...photos] : photos; // don't use first photo when already used as fallback poster if (release.date) release.entryId = deriveEntryId(release); const trailerCode = q('.update_image a', 'onclick'); const trailerPath = trailerCode?.match(/tload\('(.*)'\)/)?.[1] || html.match(/\/trailer\/.*\.mp4/)?.[0]; if (trailerPath && /^http/.test(trailerPath)) release.trailer = { src: trailerPath }; else if (trailerPath) release.trailer = { src: `${site.parameters?.media || site.url}${trailerPath}` }; return release; } function scrapeProfile({ el, q, qtxs }, site) { const profile = {}; const bio = qtxs('.stats p').reduce((acc, info) => { const [key, value] = info.split(':'); return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim(), }; }, {}); if (bio.measurements) { const [bust, waist, hip] = bio.measurements.split('-'); if (bust) profile.bust = bust; if (waist) profile.waist = Number(waist); if (hip) profile.hip = Number(hip); } if (bio.age) profile.age = Number(bio.age); if (bio.height) profile.height = feetInchesToCm(bio.height); profile.avatar = getImageWithFallbacks(q, '.profileimg img', site); const qReleases = ctxa(el, '.modelFeatures .modelfeature'); profile.releases = scrapeAll(qReleases, site); return profile; } function scrapeProfileT1({ el, q, qa }, site) { const profile = {}; const bio = qa('.detail-div + .detail-div p, .detail-div p', true).reduce((acc, info) => { const [key, value] = info.split(':'); if (!value) return acc; return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim(), }; }, {}); if (bio.measurements) { const [bust, waist, hip] = bio.measurements.split('-'); if (bust) profile.bust = bust; if (waist) profile.waist = Number(waist); if (hip) profile.hip = Number(hip); } if (bio.fun_fact) profile.description = bio.fun_fact; if (bio.age) profile.age = Number(bio.age); const heightMetric = bio.height?.match(/(\d{3})(\b|c)/); const heightImperial = bio.height?.match(/\d{1}(\.\d)?/g); if (heightMetric) profile.height = Number(heightMetric[1]); if (heightImperial) profile.height = feetInchesToCm(Number(heightImperial[0]), Number(heightImperial[1])); profile.avatar = getImageWithFallbacks(q, '.img-div img', site); const qReleases = ctxa(el, '.item-video'); profile.releases = scrapeAllT1(qReleases, site); return profile; } function scrapeProfileTour({ el, q, qtxs }, site) { const profile = {}; const bio = qtxs('.model_bio').reduce((acc, info) => { const [key, value] = info.split(':'); return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim(), }; }, {}); if (bio.date_of_birth) profile.birthdate = ed(bio.date_of_birth, 'MMMM D, YYYY'); if (bio.birthplace) profile.birthPlace = bio.birthplace; if (bio.fun_fact) profile.description = bio.fun_fact; if (bio.ethnicity) profile.ethnicity = bio.ethnicity; if (bio.height) profile.height = Number(bio.height.match(/^\d{2,3}/)?.[0]); if (bio.weight) profile.weight = Number(bio.weight.match(/^\d{2,3}/)?.[0]); if (bio.measurements) { const [bust, waist, hip] = bio.measurements.split('-'); if (bust) profile.bust = bust; if (waist) profile.waist = Number(waist); if (hip) profile.hip = Number(hip); } if (bio.natural_breasts && /yes/i.test(bio.natural_breasts)) profile.naturalBoobs = true; if (bio.natural_breasts && /no/i.test(bio.natural_breasts)) profile.naturalBoobs = false; if (bio.tattoos && /yes/i.test(bio.tattoos)) profile.hasTattoos = true; if (bio.tattoos && /no/i.test(bio.tattoos)) profile.hasTattoos = false; if (bio.piercings && /yes/i.test(bio.piercings)) profile.hasPiercings = true; if (bio.piercings && /no/i.test(bio.piercings)) profile.hasPiercings = false; if (bio.aliases) profile.aliases = bio.aliases.split(',').map(alias => alias.trim()); profile.avatar = getImageWithFallbacks(q, '.model_picture img', site); const qReleases = ctxa(el, '.update_block'); profile.releases = qReleases.map((qRelease) => { const url = qRelease.qu('.update_image a[href]'); const release = scrapeSceneTour(qRelease, site); if (!/\/(signup|join)/i.test(url)) release.url = url; release.entryId = deriveEntryId(release); release.site = site; return release; }); return profile; } async function fetchLatest(site, page = 1, _beforeFetchLatest, accSiteReleases) { const url = (site.parameters?.latest && util.format(site.parameters.latest, page)) || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`) || `${site.url}/categories/movies_${page}_d.html`; const qLatest = await geta(url, '.modelfeature, .item-video, .updateItem'); if (!qLatest) return null; if (site.parameters?.t1) return scrapeAllT1(qLatest, site, accSiteReleases); if (site.parameters?.tour) return scrapeAllTour(qLatest, site, accSiteReleases); return scrapeAll(qLatest, site, accSiteReleases); } async function fetchScene(url, site, baseRelease, beforeFetchLatest) { const channelRegExp = beforeFetchLatest || await getChannelRegExp(site); const qScene = await get(url); if (!qScene) return null; if (site.parameters?.t1) return scrapeSceneT1(qScene, site, url, baseRelease, channelRegExp); if (site.parameters?.tour) return scrapeSceneTour(qScene, site, url, baseRelease); return scrapeScene(qScene, site, url, baseRelease); } async function fetchProfile(actorName, scraperSlug, site) { const actorSlugA = slugify(actorName, { delimiter: '' }); const actorSlugB = slugify(actorName); const t1 = site.parameters?.t1 ? 't1/' : ''; const qProfile = site.parameters?.profile ? (await get(util.format(site.parameters.profile, actorSlugA)) || await get(site.parameters.profile, actorSlugB)) : (await get(`${site.url}/${t1}models/${actorSlugA}.html`) || await get(`${site.url}/${t1}models/${actorSlugB}.html`)); if (site.parameters?.t1) return qProfile && scrapeProfileT1(qProfile, site); if (site.parameters?.tour) return qProfile && scrapeProfileTour(qProfile, site); return qProfile && scrapeProfile(qProfile, site); } module.exports = { beforeFetchLatest: getChannelRegExp, fetchLatest, fetchScene, fetchProfile, };