'use strict'; const unprint = require('unprint'); const { stripQuery } = require('../utils/url'); const slugify = require('../utils/slugify'); const dateRegex = /\d{4}-\d{2}-\d{2}T/; function scrapeLatest(scenes, fullData, channel, parameters) { return scenes.map(({ query }) => { const release = {}; release.url = query.url('[href*="/video"]', { origin: new URL(parameters.latest || channel.url).origin }); release.title = query.content('a[href*="/video"] strong'); release.entryId = release.url ? new URL(release.url).pathname.split('/').at(-1) : slugify(release.title); // Nuxt data array does not have a predictable structure, don't rely on it more than necessary const dataIndex = fullData?.indexOf(release.entryId); const data = dataIndex > -1 ? fullData?.slice(dataIndex - 5, dataIndex + 35) : []; // older scenes don't have date in html const date = data.find((item) => dateRegex.test(item)); if (date) { release.date = new Date(date); } else { release.date = query.date('a[href*="/video"] + p + p', 'MM/DD/YYYY'); } release.actors = query.all('a[href*="/model"]').map((actorEl) => ({ name: unprint.query.content(actorEl), url: unprint.query.url(actorEl, null, { origin: channel.url }), })); const poster = query.img('img[alt]'); if (poster) { release.poster = [ stripQuery(poster), poster, ]; } // photos and teasers can't be reliably extracted, MP4s include trailers and FULL SCENES return release; }); } async function passAgeCheck(ctx) { const ageButton = await ctx.getByText('Continue', { exact: true }); if (await ageButton.count() > 0) { await ageButton.click(); } } async function fetchLatest(channel, page = 1, { parameters }) { // going to e.g. https://holed.com/sites/holed defined by parameter gets rid of 'top rated' section, simplifying query const url = `${channel.parameters?.latest || channel.url.replace('/series', '/sites')}?page=${page}`; // site uses Nuxt without SSR, easiest to render in browser const res = await unprint.browserRequest(url, { page: { timeout: 120_000, // update pages can be very slow to respond, but they usually do }, async control(ctx) { await passAgeCheck(ctx); }, }); if (res.status === 200) { const scenes = unprint.initAll(res.context.query.all('.card-grid > div')); const data = res.context.query.json('#__NUXT_DATA__'); return scrapeLatest(scenes, data, channel, parameters); } return res.status; } function scrapeScene({ query }, { url, entity }) { const release = {}; const { query: infoQuery } = unprint.init(query.element('//div[./*/span[contains(text(), \'Featuring\')]]')); // release.entryId = query.attribute('div[data-id]', 'data-id'); release.entryId = new URL(url).pathname.split('/').at(-1); release.title = infoQuery.content('h2'); const description = infoQuery.content('h2 + p + p'); if (!description.toLowerCase().includes('n/a')) { release.description = description; } // Nuxt data array does not have a predictable structure, don't rely on it more than necessary const fullData = query.json('#__NUXT_DATA__'); const dataIndex = fullData?.indexOf(release.entryId); const data = dataIndex > -1 ? fullData?.slice(dataIndex - 5, dataIndex + 50) : []; const date = data.find((item) => dateRegex.test(item)); if (date) { release.date = new Date(date); } release.actors = infoQuery.all('a[href*="/models"]').map((actorEl) => ({ name: unprint.query.content(actorEl), url: unprint.query.url(actorEl, null, { origin: entity.url }), })); const poster = query.img('media-poster img') || query.poster('dl8-video'); if (poster) { release.poster = [ stripQuery(poster), poster, ]; } release.photos = Array.from(new Set(query.imgs('a img[src*="content/videos"]'))).map((src) => [ stripQuery(src), src, ]); release.trailer = query.video('media-player video') || query.video('dl8-video source'); release.qualities = query.contents('//table[.//span[contains(text(), \'480p\')]]//tr').map((resolution) => Number(resolution.split('x')[1])).filter(Boolean); return release; } async function fetchScene(url, entity) { const res = await unprint.browserRequest(url, { async control(ctx) { await passAgeCheck(ctx); try { await ctx.locator('media-player video').hover({ trial: true, timeout: 1000 }); // wait for trailer to initialize } catch (__error) { // no trailer, that's fine } }, }); if (res.ok) { return scrapeScene(res.context, { url, entity }); } return res.status; } module.exports = { fetchLatest, fetchScene, };