From ccc6d1c10c95e13ba35c1f147649ccf705d9387c Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Thu, 29 Aug 2024 22:18:56 +0200 Subject: [PATCH] Refactored Hookup Hotshot scraper. --- src/scrapers/hookuphotshot.js | 133 ++++++++++++++++++---------------- 1 file changed, 70 insertions(+), 63 deletions(-) diff --git a/src/scrapers/hookuphotshot.js b/src/scrapers/hookuphotshot.js index 98b88843..f5e37059 100755 --- a/src/scrapers/hookuphotshot.js +++ b/src/scrapers/hookuphotshot.js @@ -1,99 +1,105 @@ 'use strict'; -const qu = require('../utils/q'); -const slugify = require('../utils/slugify'); +const unprint = require('unprint'); -function scrapeAll(scenes) { +function scrapeAll(scenes, _channel) { return scenes.map(({ query }) => { const release = {}; - release.url = query.url('.date-title a'); + release.url = query.url('.item-thumb a, .item-info h4 a'); + release.entryId = new URL(release.url).pathname.match(/\/trailers\/(.*)\.html/)[1].toLowerCase(); - const avatarEl = query.el('.girl-thumb-container img'); - release.actors = query.all('.date-starring a').map((actorEl) => { - const name = query.cnt(actorEl); + release.title = query.content('.item-info h4 a, .item-info a[title]'); + release.date = query.date('.date', 'YYYY-MM-DD'); + release.duration = query.duration('.time'); - return { - name, - gender: 'female', - url: query.url(actorEl, null), - ...(new RegExp(name).test(avatarEl.alt) && { - avatar: [ - avatarEl.src.replace(/-\d+x\d+/, ''), - avatarEl.src, - ].map((src) => ({ src, interval: 1000, concurrency: 1 })), - }), - }; - }).concat({ - name: 'Bryan Gozzling', - gender: 'male', - }); + release.photoCount = query.number('.time'); - release.duration = query.dur('.date-facts'); - release.stars = query.number('[data-rating]', null, 'data-rating'); + const photoCount = query.number('.item-thumb img.mainThumb', { attribute: 'cnt' }); - const photoCount = query.number('input[id*=count]', null, 'value'); - const photoPath = query.url('input[id*=baseurl]', 'value'); - - release.poster = { - src: query.img('.date-img-swap'), - interval: 1000, - concurrency: 1, - }; - - release.photos = [...Array(photoCount)].map((value, index) => ({ - src: `${photoPath}/${String(index + 1).padStart(2, '0')}.jpg`, - interval: 1000, - concurrency: 1, - })); - - // dates appear to be manually curated - const fullTitle = query.cnt('.date-title a'); - const [monthName, date, title] = fullTitle.match(/(\w+)\.? (\d+)\s*-?\s*(.*)/)?.slice(1) || []; - const [year, month] = release.poster.src.match(/uploads\/(\d+)\/(\d+)/)?.slice(1) || []; - - release.title = title.replace(/behind the\.\.\./i, 'Behind the Scenes'); - release.date = qu.extractDate(`${year}-${monthName || month}-${date}`, ['YYYY-MM-DD', 'YYYY-MMM-DD', 'YYYY-MMMM-DD']); - - // release.entryId = new URL(release.url).pathname.split('/')[2]; - release.entryId = `${release.date.getFullYear()}-${release.date.getMonth() + 1}-${release.date.getDate()}-${slugify(release.actors[0].name)}`; - - release.tags = ['rough', ...release.title.match(/behind the scenes|anal/gi) || []]; + if (photoCount) { + [release.poster, ...release.photos] = Array.from({ length: photoCount }, (value, index) => [ + query.img('.item-thumb img.mainThumb', { attribute: `src${index}_2x` }), + query.img('.item-thumb img.mainThumb', { attribute: `src${index}_3x` }), // 3x is too big and usually inflated, try 2x first + query.img('.item-thumb img.mainThumb', { attribute: `src${index}_1x` }), + ]); + } return release; }); } +function scrapeScene({ query, html }, { url, entity, baseRelease }) { + const release = {}; + + release.entryId = new URL(url).pathname.match(/\/trailers\/(.*)\.html/)[1].toLowerCase(); + + release.title = query.content('.videoDetails h3'); + release.description = query.content('.videoDetails p'); + + release.date = query.date('.videoInfo', 'MMMM D, YYYY'); + release.duration = query.duration('.videoInfo'); + + release.actors = query.all('.update_models a').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null), + })); + + release.tags = query.contents('.featuring a[href*="categories/"]'); + release.photoCount = query.number('.videoInfo', { match: /(\d+) photos/i, matchIndex: 1 }); + + release.trailer = unprint.prefixUrl(html.match(/src="(\/trailers\/.*\.mp4)"/)?.[1], entity.url); + + const posterUrl = unprint.prefixUrl(html.match(/poster="(\/content\/.*\.jpg)"/)?.[1], entity.url); + + const posterFallbacks = [ + posterUrl.replace('-1x', '-2x'), + posterUrl.replace('-1x', '-3x'), + posterUrl, + ]; + + // scene page poster does not appear on update page + if (baseRelease?.poster) { + release.photos = [posterFallbacks, ...(baseRelease.photos || [])]; + } else { + release.poster = posterFallbacks; + } + + return release; +} + function scrapeProfile({ query }) { const profile = {}; - profile.gender = 'female'; + profile.description = query.content('.profile-about') || null; - profile.description = query.cnts('.girl-about p:not(.bio-facts)').join(' '); - profile.avatar = query.img('.girl-pic'); - - // no deep scraping available, and not all scene details available here + profile.avatar = [ + query.img('.profile-pic img', { attribute: 'src0_1x' }), + // too big, not desirable unless 1x fails + query.img('.profile-pic img', { attribute: 'src0_2x' }), + query.img('.profile-pic img', { attribute: 'src0_3x' }), + ]; return profile; } async function fetchLatest(channel, page = 1) { - const url = `${channel.url}/the-dates/page/${page}`; - const res = await qu.getAll(url, '#et-projects li'); + const url = `${channel.url}/categories/movies/${page}/latest/`; + const res = await unprint.get(url, { selectAll: '.items .item-video' }); if (res.ok) { - return scrapeAll(res.items, channel); + return scrapeAll(res.context, channel); } return res.status; } -async function fetchProfile({ name: actorName }, entity, include) { - const url = `${entity.url}/girls/${slugify(actorName)}`; - const res = await qu.get(url); +async function fetchProfile(actor, entity) { + const url = actor.url || `${entity.url}/models/${actor.slug}.html`; + const res = await unprint.get(url); if (res.ok) { - return scrapeProfile(res.item, actorName, entity, include); + return scrapeProfile(res.context); } return res.status; @@ -102,4 +108,5 @@ async function fetchProfile({ name: actorName }, entity, include) { module.exports = { fetchLatest, fetchProfile, + scrapeScene, };