161 lines
4.5 KiB
JavaScript
Executable File
161 lines
4.5 KiB
JavaScript
Executable File
'use strict';
|
|
|
|
const unprint = require('unprint');
|
|
|
|
const { stripQuery } = require('../utils/url');
|
|
const slugify = require('../utils/slugify');
|
|
|
|
// Matches the `YYYY-MM-DDT` prefix of a timestamp anywhere in a value; used to pick date strings out of the Nuxt data array.
const dateRegex = /\d{4}-\d{2}-\d{2}T/;
|
|
|
|
/**
 * Builds one release object per scene card on a listing page.
 *
 * @param {Array<{ query: object }>} scenes - per-card contexts; `query` is used to read the card's link, title, date, actors and image
 * @param {Array|null|undefined} fullData - parsed `#__NUXT_DATA__` payload, searched for a date near each entry ID
 * @param {{ url: string }} channel - channel whose URL is the fallback origin for scene links and the origin passed for actor links
 * @param {{ latest?: string }} [parameters] - optional overrides; `latest` takes precedence over `channel.url` for the scene link origin
 * @returns {object[]} releases with `url`, `title`, `entryId`, `date`, `actors` and, when an image is found, `poster`
 */
function scrapeLatest(scenes, fullData, channel, parameters) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('[href*="/video"]', { origin: new URL(parameters?.latest || channel.url).origin });
		release.title = query.content('a[href*="/video"] strong');

		// the last path segment of the scene URL is the ID; fall back to the title when the card has no link
		release.entryId = release.url
			? new URL(release.url).pathname.split('/').at(-1)
			: slugify(release.title);

		// Nuxt data array does not have a predictable structure, don't rely on it more than necessary
		const dataIndex = Array.isArray(fullData) ? fullData.indexOf(release.entryId) : -1;

		// clamp the window start: a negative start would make slice() count from the END of the array
		const data = dataIndex > -1
			? fullData.slice(Math.max(dataIndex - 5, 0), dataIndex + 35)
			: [];

		// older scenes don't have date in html
		const date = data.find((item) => dateRegex.test(item));

		if (date) {
			release.date = new Date(date);
		} else {
			release.date = query.date('a[href*="/video"] + p + p', 'MM/DD/YYYY');
		}

		release.actors = query.all('a[href*="/model"]').map((actorEl) => ({
			name: unprint.query.content(actorEl),
			url: unprint.query.url(actorEl, null, { origin: channel.url }),
		}));

		const poster = query.img('img[alt]');

		if (poster) {
			// query-stripped source first, original URL second
			release.poster = [
				stripQuery(poster),
				poster,
			];
		}

		// photos and teasers can't be reliably extracted, MP4s include trailers and FULL SCENES

		return release;
	});
}
|
|
|
|
/**
 * Gets past the age check by clicking the element whose text is exactly 'Continue', if one is present.
 *
 * @param {object} ctx - browser context exposing `getByText()`; the returned locator must provide `count()` and `click()`
 * @returns {Promise<void>} resolves once the button was clicked, or immediately when nothing matched
 */
async function passAgeCheck(ctx) {
	const ageButton = await ctx.getByText('Continue', { exact: true });
	const matchCount = await ageButton.count();

	if (!(matchCount > 0)) {
		// no age prompt on this page
		return;
	}

	await ageButton.click();
}
|
|
|
|
/**
 * Renders a listing page in the browser and scrapes the scene cards on it.
 *
 * @param {{ url: string, parameters?: { latest?: string } }} channel - channel to fetch; `parameters.latest` overrides the listing URL derived from `url`
 * @param {number} [page=1] - page number appended as the `page` query parameter
 * @param {{ parameters?: object }} [context] - `parameters` is forwarded to `scrapeLatest`
 * @returns {Promise<object[]|number>} scraped releases on HTTP 200, otherwise the response status
 */
async function fetchLatest(channel, page = 1, { parameters } = {}) {
	// going to e.g. https://holed.com/sites/holed defined by parameter gets rid of 'top rated' section, simplifying query
	const url = `${channel.parameters?.latest || channel.url.replace('/series', '/sites')}?page=${page}`;

	// site uses Nuxt without SSR, easiest to render in browser
	const res = await unprint.browserRequest(url, {
		page: {
			timeout: 120_000, // update pages can be very slow to respond, but they usually do
		},
		async control(ctx) {
			await passAgeCheck(ctx);
		},
	});

	if (res.status !== 200) {
		return res.status;
	}

	const scenes = unprint.initAll(res.context.query.all('.card-grid > div'));
	const data = res.context.query.json('#__NUXT_DATA__');

	return scrapeLatest(scenes, data, channel, parameters);
}
|
|
|
|
/**
 * Builds a release object from a rendered scene page.
 *
 * @param {{ query: object }} context - page context; `query` is used for the info block, Nuxt payload, poster, photos, trailer and quality table
 * @param {{ url: string, entity: { url: string } }} meta - scene URL (its last path segment becomes the entry ID) and the entity whose URL is passed as origin for actor links
 * @returns {object} release with `entryId`, `title`, `actors`, `photos`, `trailer`, `qualities` and, when available, `description`, `date` and `poster`
 */
function scrapeScene({ query }, { url, entity }) {
	const release = {};

	// title, description and actors are read from the div that contains the 'Featuring' label
	const { query: infoQuery } = unprint.init(query.element('//div[./*/span[contains(text(), \'Featuring\')]]'));

	// release.entryId = query.attribute('div[data-id]', 'data-id');
	release.entryId = new URL(url).pathname.split('/').at(-1);

	release.title = infoQuery.content('h2');

	const description = infoQuery.content('h2 + p + p');

	// skip missing descriptions as well as 'N/A' placeholders
	if (description && !description.toLowerCase().includes('n/a')) {
		release.description = description;
	}

	// Nuxt data array does not have a predictable structure, don't rely on it more than necessary
	const fullData = query.json('#__NUXT_DATA__');
	const dataIndex = Array.isArray(fullData) ? fullData.indexOf(release.entryId) : -1;

	// clamp the window start: a negative start would make slice() count from the END of the array
	const data = dataIndex > -1
		? fullData.slice(Math.max(dataIndex - 5, 0), dataIndex + 50)
		: [];

	const date = data.find((item) => dateRegex.test(item));

	if (date) {
		release.date = new Date(date);
	}

	release.actors = infoQuery.all('a[href*="/models"]').map((actorEl) => ({
		name: unprint.query.content(actorEl),
		url: unprint.query.url(actorEl, null, { origin: entity.url }),
	}));

	const poster = query.img('media-poster img') || query.poster('dl8-video');

	if (poster) {
		// query-stripped source first, original URL second
		release.poster = [
			stripQuery(poster),
			poster,
		];
	}

	// de-duplicate image sources before building the [stripped, original] pairs
	release.photos = [...new Set(query.imgs('a img[src*="content/videos"]'))].map((src) => [
		stripQuery(src),
		src,
	]);

	release.trailer = query.video('media-player video') || query.video('dl8-video source');

	// rows look like WIDTHxHEIGHT; keep the part after the 'x' and drop rows that don't yield a number
	release.qualities = query.contents('//table[.//span[contains(text(), \'480p\')]]//tr')
		.map((resolution) => Number(resolution.split('x')[1]))
		.filter(Boolean);

	return release;
}
|
|
|
|
/**
 * Renders a scene page in the browser and scrapes it.
 *
 * @param {string} url - scene page URL
 * @param {{ url: string }} entity - entity forwarded to `scrapeScene`
 * @returns {Promise<object|number>} scraped release when the response is OK, otherwise the response status
 */
async function fetchScene(url, entity) {
	const res = await unprint.browserRequest(url, {
		async control(ctx) {
			await passAgeCheck(ctx);

			try {
				await ctx.locator('media-player video').hover({ trial: true, timeout: 1000 }); // wait for trailer to initialize
			} catch {
				// no trailer, that's fine
			}
		},
	});

	if (!res.ok) {
		return res.status;
	}

	return scrapeScene(res.context, { url, entity });
}
|
|
|
|
module.exports = {
|
|
fetchLatest,
|
|
fetchScene,
|
|
};
|