'use strict'; const util = require('util'); const Promise = require('bluebird'); const cheerio = require('cheerio'); const { JSDOM } = require('jsdom'); const moment = require('moment'); const unprint = require('unprint'); const qu = require('../utils/qu'); const http = require('../utils/http'); const { heightToCm } = require('../utils/convert'); const slugify = require('../utils/slugify'); function getEntryId(html) { const entryId = html.match(/showtagform\((\d+)\)/); if (entryId) { return entryId[1]; } const setIdIndex = html.indexOf('setid:"'); if (setIdIndex) { return html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)?.[0]; } return null; } function scrapeAll(scenes, site, entryIdFromTitle) { return scenes.map(({ element, query }) => { const release = {}; const title = query.content('.content_img div, .dvd_info > a, a.update_title, a[title] + a[title], .overlay-text') || query.content('a[title*=" "]'); release.title = title?.slice(0, title.match(/starring:/i)?.index || Infinity).trim(); release.url = query.url('.content_img a, .dvd_info > a, a.update_title, a[title]'); release.date = query.date('.update_date', 'MM/DD/YYYY'); release.entryId = (entryIdFromTitle && slugify(release.title)) || element.dataset.setid || query.element('.rating_box')?.dataset.id || query.attribute('a img', 'id')?.match(/set-target-(\d+)/)?.[1]; release.actors = query.all('.content_img .update_models a, .update_models a').map((actorEl) => ({ name: unprint.query.content(actorEl), url: unprint.query.url(actorEl, null), })); const dvdPhotos = query.imgs('.dvd_preview_thumb'); const photoCount = Number(query.attribute('a img.thumbs', 'cnt')) || 1; [release.poster, ...release.photos] = dvdPhotos.length ? dvdPhotos : Array.from({ length: photoCount }).map((value, index) => { const src = query.img('a img.thumbs', { attribute: `src${index}_1x` }) || query.img('a img.thumbs', { attribute: `src${index}` }) || query.img('a img.thumbs'); const prefixedSrc = qu.prefixUrl(src, site.url); if (src) { return [ { src: prefixedSrc.replace(/.jpg$/, '-full.jpg'), referer: site.url, verifyType: 'image', // sometimes returns 200 OK with text/html instead of 403 }, { src: prefixedSrc.replace(/-1x.jpg$/, '-4x.jpg'), referer: site.url, verifyType: 'image', }, { src: prefixedSrc.replace(/-1x.jpg$/, '-2x.jpg'), referer: site.url, verifyType: 'image', }, { src: prefixedSrc, referer: site.url, verifyType: 'image', }, ]; } return null; }).filter(Boolean); const teaserScript = query.html('script'); if (teaserScript) { release.teaser = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4); } return release; }); } function scrapeUpcoming(html, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const scenesElements = $('#coming_soon_carousel').find('.table').toArray(); return scenesElements.map((element) => { const release = {}; release.entryId = $(element).find('.upcoming_updates_thumb').attr('id').match(/\d+/)[0]; const details = $(element).find('.update_details_comingsoon') .eq(1) .children() .remove(); release.title = details .end() .text() .trim(); release.actors = details .text() .trim() .split(', '); release.date = moment .utc($(element).find('.update_date_comingsoon').text().slice(7), 'MM/DD/YYYY') .toDate(); const photoElement = $(element).find('a img.thumbs'); const posterPath = photoElement.attr('src'); release.poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`; const videoClass = $(element).find('.update_thumbnail div').attr('class'); const videoScript = $(element).find(`script:contains(${videoClass})`).html(); if (videoScript) { release.teaser = videoScript.slice(videoScript.indexOf('https://'), videoScript.indexOf('.mp4') + 4); } return release; }); } function extractLegacyTrailer(html, context) { const trailerLines = html.split('\n').filter((line) => /movie\["trailer\w*"\]\[/i.test(line)); if (trailerLines.length) { return trailerLines.map((trailerLine) => { // const src = trailerLine.match(/path:"([\w-:/.&=?%]+)"/)?.[1]; const src = trailerLine.match(/path:"(.+)"/)?.[1]; const quality = trailerLine.match(/movie_height:'(\d+)/)?.[1]; return src && { src: /^http/.test(src) ? src : `${context.entity.url}${src}`, quality: quality && Number(quality.replace('558', '540')), }; }).filter(Boolean); } return null; } const qualities = [ 'photos', '1600watermarked', '1280watermarked', '1024watermarked', 'thumbs', ]; function getPhotos(query, release, context) { // https://thumbs.julesjordan.com/members/content//upload/dl03/julesjordan/whitney_wright_dredd/1024watermarked/whitney_wright_julesjordan.com-20.jpg // https://thumbs.julesjordan.com/members/content//upload/dl03/julesjordan/bambi_barton_manuel_ferrara/1024watermarked/bambi_barton_julesjordan_com-13.jpg if (!release.actors?.length > 0) { return null; } const photoCount = query.number('//div[contains(@class, "title-heading-content")][contains(text(), "Photos")]'); if (photoCount) { // slug actor order is not always the same as actor list order, prefer trailer slug if available const path = query.dataset('.movieformat_button', 'src')?.match(/:(.*)_trailer/)?.[1] || release.actors.map((actor) => slugify(actor.name || actor, '_')).join('_'); const derivedActorSlug = path.replace(`_${release.actors.slice(1).map(({ name }) => slugify(name, '_'))}`, ''); const actorSlug = derivedActorSlug === path // no replacement took place, so the slug is likely invalid ? slugify(release.actors[0].name || release.actors[0], '_') : derivedActorSlug; return Array.from({ length: photoCount }, (value, index) => qualities .flatMap((quality) => [ `https://thumbs.${context.entity.slug}.com/trial/content//upload/dl03/${context.entity.slug}/${path}/${quality}/${actorSlug}_${context.entity.slug}_com-${index + 1}.jpg`, `https://thumbs.${context.entity.slug}.com/trial/content//upload/dl03/${context.entity.slug}/${path}/${quality}/${actorSlug}_${context.entity.slug}.com-${index + 1}.jpg`, // .com instead of _com ]).map((src) => ({ src, attempts: 1 }))); } return null; } async function scrapeScene({ html, query }, context) { const release = {}; release.title = query.content('.title_bar_hilite, .movie_title'); release.description = query.content('.update_description') || query.text('//div[./span[contains(text(), "Description")]]'); release.entryId = context.entity.parameters?.entryIdFromTitle ? slugify(release.title) : getEntryId(html); release.date = query.date(['.update_date', '//div[./span[contains(text(), "Date")]]'], 'MM/DD/YYYY'); release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a, .player-scene-description .update_models a').map((actorEl) => ({ name: unprint.query.content(actorEl), url: unprint.query.url(actorEl, null), })); release.tags = query.contents('.update_tags a, .player-scene-description a[href*="/categories"]'); release.director = release.tags?.find((tag) => ['mike john', 'van styles'].includes(tag?.trim().toLowerCase())); const posterPath = query.poster('#video-player') || html.match(/useimage = "(.*)"/)?.[1]; if (posterPath) { const poster = /^http/.test(posterPath) ? posterPath : `${context.entity.url}${posterPath}`; if (poster) { release.poster = { src: poster, referer: context.entity.url, }; } } if (query.exists('source[data-bitrate="trailer"]')) { release.trailer = [ query.video('source[data-bitrate="trailer_1080" i]'), query.video('source[data-bitrate="trailer_720" i]'), query.video('source[data-bitrate="trailer" i]'), // also seems to be 720p query.video('source[data-bitrate="trailer_mobile" i]'), // also seems to be 720p ]; } else if (context.include.trailers && context.entity.slug !== 'manuelferrara') { release.trailer = extractLegacyTrailer(html, context); } // release.photos = async () => await getPhotos(release.entryId, context.entity); // probably no longer works on any site // release.photos = query.imgs('#images img'); release.photos = getPhotos(query, release, context); if (query.exists('.update_dvds a')) { release.movie = { url: query.url('.update_dvds a'), title: query.cnt('.update_dvds a'), }; release.movie.entryId = new URL(release.movie.url).pathname.split('/').slice(-1)[0]?.replace('.html', ''); } release.stars = query.number('.avg_rating'); return release; } function scrapeMovie({ el, query }, url, site) { const movie = { url, site }; movie.entryId = new URL(url).pathname.split('/').slice(-1)[0]?.replace('.html', ''); movie.title = query.cnt('.title_bar span'); movie.covers = query.urls('#dvd-cover-flip > a'); movie.channel = slugify(query.q('.update_date a', true), ''); // movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href); const sceneQus = qu.initAll(el, '.dvd_details'); const scenes = scrapeAll(sceneQus, site); const curatedScenes = scenes ?.map((scene) => ({ ...scene, movie })) .sort((sceneA, sceneB) => sceneA.date - sceneB.date); movie.date = curatedScenes?.[0]?.date; return { ...movie, ...(curatedScenes && { scenes: curatedScenes }), }; } function scrapeProfile(html, url, actorName, entity) { const { document } = new JSDOM(html).window; const bio = document.querySelector('.model_bio').textContent; const avatarEl = document.querySelector('.model_bio_pic img, .model_bio_thumb'); const profile = { name: actorName, }; const heightString = bio.match(/\d+ feet \d+ inches/); const ageString = bio.match(/Age:\s*(\d{2})/); const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/); const measurementsString = bio.match(/\w+-\d+-\d+/); if (birthDateString) profile.birthdate = qu.parseDate(birthDateString[1], 'MMMM D, YYYY'); if (ageString) profile.age = Number(ageString[1]); if (heightString) profile.height = heightToCm(heightString[0]); if (measurementsString) { const [bust, waist, hip] = measurementsString[0].split('-'); if (bust) profile.bust = bust; if (waist) profile.waist = Number(waist); if (hip) profile.hip = Number(hip); } if (avatarEl) { const avatarSources = [ avatarEl.getAttribute('src0_3x'), avatarEl.getAttribute('src0_2x'), avatarEl.getAttribute('src0_1x'), avatarEl.getAttribute('src0'), avatarEl.getAttribute('src'), ] .filter((avatar) => avatar && !/p\d+.jpe?g/.test(avatar)) // remove non-existing attributes and placeholder images .map((avatar) => qu.prefixUrl(avatar, entity.url)); if (avatarSources.length) profile.avatar = avatarSources; } profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), (el) => el.href); console.log(profile); return profile; } async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle = false) { const url = site.parameters?.latest ? util.format(site.parameters.latest, page) : `${site.url}/trial/categories/movies_${page}_d.html`; // const res = await http.get(url); const res = await unprint.get(url, { selectAll: '.update_details, .grid-item' }); if (res.ok) { return scrapeAll(res.context, site, typeof site.parameters?.entryIdFromTitle === 'boolean' ? site.parameters.entryIdFromTitle : entryIdFromTitle); } return res.status; } async function fetchUpcoming(site) { if (site.parameters?.upcoming === false) return null; const url = site.parameters?.upcoming ? util.format(site.parameters.upcoming) : `${site.url}/trial/index.php`; const res = await http.get(url); if (res.statusCode === 200) { return scrapeUpcoming(res.body.toString(), site); } return res.statusCode; } async function fetchMovie(url, site) { const res = await qu.get(url); return res.ok ? scrapeMovie(res.item, url, site) : res.status; } async function fetchProfile({ name: actorName, url }, entity) { const actorSlugA = slugify(actorName, ''); const actorSlugB = slugify(actorName, '-'); const urls = [ url, `${entity.parameters?.profile || `${entity.url}/trial/models`}/${actorSlugA}.html`, `${entity.parameters?.profile || `${entity.url}/trial/models`}/${actorSlugB}.html`, ]; return urls.reduce(async (chain, profileUrl) => { const profile = await chain; if (profile) { return profile; } if (!profileUrl) { return null; } const res = await http.get(profileUrl); if (res.statusCode === 200) { return scrapeProfile(res.body.toString(), profileUrl, actorName, entity); } return null; }, Promise.resolve()); } module.exports = { fetchLatest, fetchMovie, fetchProfile, fetchUpcoming, scrapeScene: { scraper: scrapeScene, unprint: true, }, };