From 56163f2b3d0e7da523e1aedea17af514d7ba0b2c Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sat, 6 Jul 2024 06:08:35 +0200 Subject: [PATCH] Refactored and fixed Naughty America scraper. --- package-lock.json | 8 +- package.json | 2 +- seeds/02_sites.js | 12 ++- src/scrapers/naughtyamerica.js | 189 ++++++++++++++------------------- 4 files changed, 92 insertions(+), 119 deletions(-) diff --git a/package-lock.json b/package-lock.json index 1a470bc1..faf9df77 100644 --- a/package-lock.json +++ b/package-lock.json @@ -88,7 +88,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.11.2", + "unprint": "^0.11.5", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -18293,9 +18293,9 @@ } }, "node_modules/unprint": { - "version": "0.11.2", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.11.2.tgz", - "integrity": "sha512-i4WJxmEnd6LKYbcnKAjX8bkaPRdyDlhAAqpxej0qIX0pjK5d17hp51x/RGDMfEe63dlcJtGCn9bhZrGcMY4PXQ==", + "version": "0.11.5", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.11.5.tgz", + "integrity": "sha512-tLhiFGeSU40GN12625+9oqmNGDFSToMPME60pB+DSGT9wd9fJM0L/lyZMQeNFmWMSThwa/id/FHAOnN7cE1aOw==", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", diff --git a/package.json b/package.json index 7514597d..fe0ec255 100755 --- a/package.json +++ b/package.json @@ -147,7 +147,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.11.2", + "unprint": "^0.11.5", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 59b4e062..32f04e3a 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -7120,7 +7120,8 @@ const sites = [ { slug: 'myfriendshotmom', name: 'My Friend\'s Hot Mom', - alias: ['mfhm'], + alias: ['mfhm', 'watchyourmom', 'watch your mom'], + comment: 'Merged with Watch Your Mom.', url: 'https://www.naughtyamerica.com/site/my-friend-s-hot-mom', parent: 'naughtyamerica', }, @@ -7145,6 +7146,7 @@ const sites = [ { slug: 'watchyourmom', name: 'Watch Your Mom', + comment: 'Merged into My Friend\'s Hot Mom.', url: 'https://www.naughtyamerica.com/site/watch-your-mom', parent: 'naughtyamerica', }, @@ -7506,7 +7508,7 @@ const sites = [ { slug: 'afterschool', name: 'After School', - url: 'https://www.naughtyamerica.com/site/after-school', + url: 'https://www.naughtyamerica.com/site/after-schooll', // sic parent: 'naughtyamerica', }, { @@ -7617,6 +7619,12 @@ const sites = [ url: 'https://www.naughtyamerica.com/site/tonight-s-fuck', parent: 'naughtyamerica', }, + { + slug: 'fans', + name: 'Fans', + url: 'https://www.naughtyamerica.com/site/fans', + parent: 'naughtyamerica', + }, // NEBRASKA COEDS { name: 'Nebraska Coeds', diff --git a/src/scrapers/naughtyamerica.js b/src/scrapers/naughtyamerica.js index ae4b5025..10d3891c 100755 --- a/src/scrapers/naughtyamerica.js +++ b/src/scrapers/naughtyamerica.js @@ -1,158 +1,123 @@ 'use strict'; -/* eslint-disable newline-per-chained-call */ -const cheerio = require('cheerio'); -const moment = require('moment'); +const unprint = require('unprint'); -const http = require('../utils/http'); const slugify = require('../utils/slugify'); -const qu = require('../utils/q'); -function titleExtractor(pathname) { - const components = pathname.split('/')[2].split('-'); - const entryId = components.slice(-1)[0]; +function scrapeLatest(scenes) { + return scenes.map(({ query }) => { + const release = {}; + const url = query.url('a'); - const title = components.slice(0, -1).reduce((accTitle, word, index) => `${accTitle}${index > 0 ? ' ' : ''}${word.slice(0, 1).toUpperCase()}${word.slice(1)}`, ''); + release.url = url; + release.entryId = query.attribute('a', 'data-scene-id') || (url && new URL(url).pathname.match(/-(\d+)$/)?.[1]) || null; - return { title, entryId }; -} + release.date = query.date('.entry-date', 'MMM D, YYYY'); + release.duration = query.duration('.scene-runtime'); -function scrapeLatest(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const sceneElements = $('.site-list .scene-item').toArray(); + release.actors = query.all('.contain-actors a').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null), + })); - return sceneElements.map((item) => { - const element = $(item); + release.poster = [ + ...(query.sourceSet('source[data-srcset*="scenes/"][type="image/jpeg"]', 'data-srcset') || []), + query.img('.main-scene-img', { attribute: 'data-srcset' }), + ]; - const sceneLinkElement = element.find('a').first(); - const { protocol, hostname, pathname } = new URL(sceneLinkElement.attr('href')); - const url = `${protocol}//${hostname}${pathname}`; - const { title, entryId } = titleExtractor(pathname); + release.tags = query.contents('.flag-bg'); - const date = moment.utc(element.find('.entry-date').text(), 'MMM D, YYYY').toDate(); - const actors = element.find('.contain-actors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); + release.qualities = [ + query.exists('//a[contains(@class, "label-four-k") and contains(text(), "4K")]') && 2160, // label-four-k is also used for non-4K tags + query.exists('//a[contains(@class, "label-hd") and contains(text(), "HD")]') && 720, + ].filter(Boolean); - const duration = Number(element.find('.scene-runtime').text().slice(0, -4)) * 60; + release.channel = slugify(query.content('.site-title'), ''); - const posterString = sceneLinkElement.find('img[data-srcset]').attr('data-srcset') || sceneLinkElement.find('img[data-src]').attr('data-src'); - const poster = `https:${posterString.match(/[\w/.]+$/)[0]}`; - - return { - url, - entryId, - title, - actors, - date, - duration, - poster, - rating: null, - site, - }; + return release; }); } -function scrapeScene(html, url, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const sceneElement = $('.scene-info'); +function scrapeScene({ query }, { url }) { + const release = {}; + release.entryId = new URL(url).pathname.match(/-(\d+)$/)?.[1]; - const { protocol, hostname, pathname } = new URL(url); - const originalUrl = `${protocol}//${hostname}${pathname}`; + release.title = query.content('.scene-title'); + release.description = query.text('.synopsis'); - const entryId = originalUrl.split('-').slice(-1)[0]; - const title = sceneElement.find('h1.scene-title').text(); - const description = sceneElement.find('.synopsis').contents().slice(2).text().replace(/[\s\n]+/g, ' ').trim(); + release.date = query.date('.entry-date', 'MMM D, YYYY'); + release.duration = query.duration('.duration'); - const date = moment.utc(sceneElement.find('span.entry-date').text()?.match(/\w+ \d{1,2}, \d{4}/), 'MMM D, YYYY').toDate(); - const actors = $('.performer-list a, h1 a.scene-title').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); + release.actors = query.all('.performer-list a').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null), + })); - const duration = Number(sceneElement.find('.duration-ratings .duration').text().slice(10, -4)) * 60; + release.poster = [ + ...(query.sourceSet('source[data-srcset*="scenes/"][type="image/jpeg"]', 'data-srcset') || []), + query.img('.play-trailer img[data-srcset*="scenes/"]', { attribute: 'data-srcset' }), + ]; - const posterPath = $('video, dl8-video').attr('poster') || $('img.start-card').attr('src'); - const poster = posterPath && `https:${posterPath}`; - const photos = $('.contain-scene-images.desktop-only a').map((index, el) => $(el).attr('href')).toArray().filter(Boolean).map((photo) => `https:${photo}`); + release.photos = query.els('.contain-scene-images.desktop-only .scene-image').map((imgEl) => [ + unprint.query.url(imgEl, null), + unprint.query.img(imgEl, 'img', { attribute: 'srcset' }), + ]); - const trailerEl = $('source'); - const trailerSrc = trailerEl.attr('src'); - const trailerType = trailerEl.attr('type'); + const trailer = query.video('video source'); - const siteName = sceneElement.find('a.site-title').text(); - const channel = siteName.replace(/[\s']+/g, '').toLowerCase(); + if (trailer) { + release.trailer = [ + { + source: trailer.replace(/_\d+\.mp4/, '_1080.mp4'), + quality: 1080, + }, + trailer, + ]; + } - const tags = $('.categories a.cat-tag').map((tagIndex, tagElement) => $(tagElement).text()).toArray(); + release.channel = slugify(query.content('.site-title'), ''); - return { - url, - entryId, - title, - description, - actors, - date, - duration, - tags, - photos, - poster, - trailer: trailerSrc ? { - src: trailerSrc, - type: trailerType, - } : null, - rating: null, - site, - channel, - }; + release.tags = query.contents('.categories a'); + + release.qualities = [ + query.exists('//a[contains(@class, "label-four-k") and contains(text(), "4K")]') && 2160, // label-four-k is also used for non-4K tags + query.exists('img.icon-1080') && 1080, + query.exists('//a[contains(@class, "label-hd") and contains(text(), "HD")]') && 720, + ].filter(Boolean); + + return release; } -async function fetchActorReleases(url) { - const res = await qu.get(url); - - return res.ok - ? res.item.query.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages - : []; -} - -async function scrapeProfile(html) { - const { query } = qu.extract(html); +async function scrapeProfile({ query }) { const profile = {}; - profile.description = query.q('.bio_about_text', true); - - const avatar = query.q('img.performer-pic', 'src'); - if (avatar) profile.avatar = `https:${avatar}`; - - const releases = query.urls('.scene-item > a:first-child'); - const otherPages = query.urls('.pagination a:not([rel=next]):not([rel=prev])'); - const olderReleases = await Promise.all(otherPages.map(async (page) => fetchActorReleases(page))); - - profile.releases = releases.concat(olderReleases.flat()); + profile.description = query.content('.bio_about_text'); + profile.avatar = query.img('img.performer-pic'); return profile; } async function fetchLatest(site, page = 1) { - const res = await http.get(`${site.url}?page=${page}`); + const res = await unprint.get(`${site.url}?page=${page}`, { selectAll: '.site-list .scene-item' }); - return scrapeLatest(res.body.toString(), site); + return scrapeLatest(res.context, site); } -async function fetchScene(url, site) { - const res = await http.get(url); +async function fetchProfile({ slug }) { + const res = await unprint.get(`https://www.naughtyamerica.com/pornstar/${slug}`, { select: '.bio-info' }); - return scrapeScene(res.body.toString(), url, site); -} - -async function fetchProfile({ name: actorName }) { - const actorSlug = slugify(actorName); - - const res = await http.get(`https://www.naughtyamerica.com/pornstar/${actorSlug}`); - - if (res.statusCode === 200) { - return scrapeProfile(res.body.toString()); + if (res.ok) { + return scrapeProfile(res.context); } - return null; + return res.status; } module.exports = { fetchLatest, - fetchScene, fetchProfile, + scrapeScene: { + scraper: scrapeScene, + unprint: true, + }, };