From c2afa571bfadb4c6c483a3a86b93d565a05221a1 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 4 Jun 2024 03:30:26 +0200 Subject: [PATCH] Fixed Whale Member (Porn Pros, Holed) scraper. --- src/scrapers/whalemember.js | 162 ++++++++++++------------------------ 1 file changed, 52 insertions(+), 110 deletions(-) diff --git a/src/scrapers/whalemember.js b/src/scrapers/whalemember.js index af5f0fa2..b7e0e5ae 100755 --- a/src/scrapers/whalemember.js +++ b/src/scrapers/whalemember.js @@ -1,145 +1,87 @@ 'use strict'; -const { JSDOM } = require('jsdom'); -const moment = require('moment'); +const unprint = require('unprint'); -const http = require('../utils/http'); +const { stripQuery } = require('../utils/url'); -function scrapeLatest(html, site) { - const { document } = new JSDOM(html).window; - const { origin } = new URL(site.parameters?.latest || site.url); +function scrapeLatest(scenes, channel) { + return scenes.map(({ query, element }) => { + const release = {}; - const videos = Array.from(document.querySelectorAll('.video-releases-list')).slice(-1)[0]; + release.url = query.url('[href*="/video"]'); + release.entryId = unprint.query.attribute(element, null, 'data-vid'); - return Array.from(videos.querySelectorAll('.card'), (scene) => { - const release = { site }; + release.title = query.content('.video-thumbnail-footer a[href*="/video"]'); + release.date = query.date('.actor-list + span', 'MM/DD/YYYY'); - release.url = `${origin}${scene.querySelector(':scope > a').href}`; - release.entryId = scene.dataset.videoId; - release.title = scene.querySelector('.card-title').textContent; - release.date = moment.utc(scene.dataset.date, 'MMMM DD, YYYY').toDate(); - release.actors = Array.from(scene.querySelectorAll('.actors a'), (el) => el.textContent); - - // slow CDN? - const poster = scene.querySelector('.single-image').dataset.src; - const teaserEl = scene.querySelector('source'); - - release.poster = { - src: /^http/.test(poster) ? poster : `https:${poster}`, - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, - }; - - release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), (el) => ({ - src: (/^http/.test(el.dataset.src) ? el.dataset.src : `https:${el.dataset.src}`), - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, + release.actors = query.all('.actor-list a').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null, { origin: channel.url }), })); - if (teaserEl) { - release.teaser = { - src: teaserEl.dataset.src, - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, - }; + const poster = query.poster() || query.img('a img', { attribute: 'data-src' }); + + if (poster) { + release.poster = [ + stripQuery(poster), + poster, + ]; } + release.photos = query.imgs('img[data-index]', { attribute: 'data-src' }).map((src) => [ + stripQuery(src), + src, + ]); + + release.teaser = query.video('source', { attribute: 'data-src' }); + return release; }); } -function scrapeScene(html, site, url) { - const { document } = new JSDOM(html).window; - const release = { site }; +function scrapeScene({ query }, channel) { + const release = {}; - const scene = document.querySelector('#t2019-2col'); + release.entryId = query.attribute('div[data-id]', 'data-id'); - release.url = url; - release.title = scene.querySelector('.t2019-stitle').textContent.trim(); - release.description = scene.querySelector('#t2019-description').textContent.trim(); - release.actors = Array.from(scene.querySelectorAll('#t2019-models a'), (el) => el.textContent); + release.title = query.content('.scene-info h1'); + release.description = query.content('//div[contains(@class, \'scene-info\')]//i[contains(@class, \'fa-quote\')]/following-sibling::span'); - const durationEls = Array.from(scene.querySelectorAll('#t2019-stime span')); + release.duration = (query.number('//div[contains(@class, \'scene-info\')]//span[contains(text(), \'Duration\')]/following-sibling::span[contains(text(), \'minutes\')]') * 60) || null; - if (durationEls.length > 1) { - release.date = moment.utc(durationEls[0].textContent, 'MMMM DD, YYYY').toDate(); - release.duration = Number(durationEls[1].textContent.match(/\d+/)[0]) * 60; - } else { - release.duration = Number(durationEls[0].textContent.match(/\d+/)[0]) * 60; - } - - // unreliable CDN - release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), (el) => ({ - src: (/^http/.test(el.src) ? el.src : `https:${el.src}`), - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, + release.actors = query.all('.scene-info a[href*="/models"]').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null, { origin: channel.url }), })); - const posterEl = scene.querySelector('#no-player-image'); - const videoEl = scene.querySelector('video'); - const trailerEl = scene.querySelector('#t2019-video source'); + release.poster = query.poster('#player-wrapper video'); - if (posterEl) { - release.poster = { - src: /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`, - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, - }; - } else if (videoEl) { - release.poster = { - src: /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`, - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, - }; - } + release.photos = query.imgs('#trailer_player .hidden > a img').map((src) => [ + stripQuery(src), + src, + ]); - if (trailerEl) { - release.trailer = { - src: trailerEl.src, - referer: site.url, - attempts: 5, - interval: 5000, - concurrency: 1, - }; - } + release.teaser = query.video('#player-wrapper source'); + release.qualities = query.contents('#trailer_player .resolution').map((resolution) => Number(resolution.split('x')[1])).filter(Boolean); return release; } -async function fetchLatest(site, page = 1) { - const url = `${site.parameters?.latest || site.url}?page=${page}`; - const res = await http.get(url); +async function fetchLatest(channel, page = 1) { + const url = `${channel.parameters?.latest || channel.url}?page=${page}`; + const res = await unprint.get(url, { selectAll: '//*[(starts-with(text(), \'Latest\') and contains(text(), \'Movies\')) or contains(text(), \'Most Recent\')]/following::div[contains(@class, \'video-thumbnail\') and @data-vid]' }); - if (res.statusCode === 200) { - return scrapeLatest(res.body.toString(), site); + if (res.status === 200) { + return scrapeLatest(res.context, channel); } - return []; -} - -async function fetchScene(url, site) { - const res = await http.get(url); - - if (res.statusCode === 200) { - return scrapeScene(res.body.toString(), site, url); - } - - return null; + return res.status; } module.exports = { fetchLatest, - fetchScene, + scrapeScene: { + scraper: scrapeScene, + unprint: true, + }, };