From dde3ea3a34a6cc61d4d9db84494e8a7f5e10375a Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Mon, 19 Jan 2026 19:19:24 +0100 Subject: [PATCH] Refactored MariskaX to use Next data. Fixed Naughty America profile scraper breaking on Tonight's Girlfriend. --- src/scrapers/actors.js | 5 +- src/scrapers/mariskax.js | 180 +++++++++++++++++++++++---------- src/scrapers/naughtyamerica.js | 7 +- tests/profiles.js | 17 ++++ 4 files changed, 151 insertions(+), 58 deletions(-) diff --git a/src/scrapers/actors.js b/src/scrapers/actors.js index 7d4ff962..cf141913 100644 --- a/src/scrapers/actors.js +++ b/src/scrapers/actors.js @@ -175,6 +175,9 @@ module.exports = { // woodman pierrewoodman, wakeupnfuck: pierrewoodman, + // naughty america + naughtyamerica, + tonightsgirlfriend: naughtyamerica, // etc '18vr': badoink, theflourishxxx: theflourish, @@ -231,8 +234,6 @@ module.exports = { missax, mylf: teamskeet, mugfucked: fullpornnetwork, - naughtyamerica, - tonightsgirlfriend: naughtyamerica, nebraskacoeds: elevatedx, onlyprince: fullpornnetwork, pascalssubsluts, diff --git a/src/scrapers/mariskax.js b/src/scrapers/mariskax.js index d7447713..2af35083 100644 --- a/src/scrapers/mariskax.js +++ b/src/scrapers/mariskax.js @@ -3,84 +3,161 @@ const unprint = require('unprint'); const slugify = require('../utils/slugify'); +const { convert } = require('../utils/convert'); -function scrapeLatest(scenes) { - return scenes.map(({ query }) => { - const release = {}; +function scrapeScene(data, channel) { + const release = {}; - release.title = query.content('.title a'); - release.url = query.url('.title a') || query.url('.thumb-wrap a'); + release.entryId = data.id; + release.url = `${channel.origin}/scenes/${data.slug}`; - release.entryId = new URL(release.url).pathname.match(/view\/(\d+)\//)[1]; + release.title = data.title; + release.description = data.description; - release.date = query.date('time', 'Do MMM YYYY', { match: /\d+\w+ \w+ \d{4}/ }); - release.duration = 
query.duration('.total-time'); + release.date = unprint.extractDate(data.publish_date, 'YYYY/MM/DD hh:mm:ss'); + release.duration = unprint.extractDuration(data.videos_duration); - release.actors = query.all('.models a').map((el) => ({ - name: unprint.query.content(el), - url: unprint.query.url(el, null), - })); + release.actors = (data.models_thumbs || data.models_slugs)?.map((actor) => ({ + name: actor.name, + url: actor.slug && `${channel.origin}/models/${actor.slug}`, + avatar: actor.thumb, + })) || data.models; - [release.poster, ...release.photos] = query.json('.thumb-wrap a', { attribute: 'data-images' }); + release.tags = data.tags; - release.photoCount = query.number('.total-photos'); + release.poster = data.thumb || data.trailer_screencap; - return release; - }); + const posterPath = release.poster && new URL(release.poster).pathname.replace('//', '/'); + + release.photos = data.extra_thumbnails.filter((src) => !src.includes(posterPath)); + release.caps = data.thumbs; + + release.teaser = data.special_thumbnails; + release.trailer = data.trailer_url; + + release.photoCount = data.photos_duration; + release.channel = data.site?.toLowerCase(); + + release.qualities = data.videos && Array.from(new Set(Object.values(data.videos).map((video) => video.height))).filter(Boolean); + + return release; } async function fetchLatest(channel, page) { const res = await unprint.get(`https://tour.mariskax.com/scenes?page=${page}`, { - selectAll: '.content-item', timeout: 30000, // slow site }); if (res.ok) { - return scrapeLatest(res.context); + const data = res.context.query.json('#__NEXT_DATA__')?.props.pageProps.contents.data; + + if (data) { + return data.map((scene) => scrapeScene(scene, channel)); + } + + return null; } return res.status; } -function scrapeScene({ query }, { url }) { - const release = {}; +async function fetchScene(url, entity, baseRelease) { + if (baseRelease.entryId) { + // same as deep data + return baseRelease; + } - release.title = 
query.content('.content-meta .title'); - release.entryId = new URL(url).pathname.match(/view\/(\d+)\//)[1]; - - release.date = query.date('.post-date', 'Do MMM YYYY', { match: /\d+\w+ \w+ \d{4}/ }); - - release.actors = query.all('.content-meta .models a').map((el) => ({ - name: unprint.query.content(el), - url: unprint.query.url(el, null), - })); - - release.poster = query.poster('.trailer-wrap video'); - release.trailer = query.video('.trailer-wrap source') || query.video('.download-trailer-wrap a', { attribute: 'href' }); - - return release; -} - -async function fetchProfile(actor) { - const res = await unprint.post('https://tour.mariskax.com/search-preview-mrx', `q=${slugify(actor.name, '+')}`, { - headers: { - 'Accept-Language': 'en-US,en', // necessary for some reason - 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', - }, + const res = await unprint.get(url, { + timeout: 30000, // slow site }); if (res.ok) { - const model = res.data.find((result) => result.type === 'model' && slugify(result.title) === actor.slug); + const data = res.context.query.json('#__NEXT_DATA__')?.props.pageProps.content; - if (model) { - const curatedModel = { - entryId: model.url?.match(/model\/(\d+)\//)?.[1], - url: model.url, - avatar: model.thumb, - }; - - return curatedModel; + if (data) { + return scrapeScene(data, entity); } + + return null; + } + + return res.status; +} + +function scrapeProfile(data) { + const profile = {}; + + const bio = Object.fromEntries(Object.entries(data).map(([key, value]) => [slugify(key, '_'), value])); + + profile.url = `https://tour.mariskax.com/models/${data.slug}`; + profile.entryId = data.id; + + profile.gender = bio.gender; + + profile.dateOfBirth = bio.birthdate; + profile.age = bio.age; + profile.placeOfBirth = bio.born; + + profile.measurements = bio.measurements; + profile.height = convert(bio.height, 'cm'); + profile.weight = convert(bio.weight, 'lb', 'kg'); + + profile.hairColor = bio.hair_color; + profile.eyes = 
bio.eye_color; + + profile.avatar = data.thumb; + profile.socials = [bio.x && `https://x.com/${bio.x.replace('@', '')}`].filter(Boolean); + + return profile; +} + +async function getActorUrl(actor, entity) { + if (actor.url) { + return { url: actor.url }; + } + + const res = await unprint.post(`${entity.origin}/api/search/${actor.name}`); + + if (res.ok) { + const model = res.data.models.find((result) => slugify(result.name) === actor.slug); + + if (model?.slug) { + return { + url: `${entity.origin}/models/${model.slug}`, + model, + }; + } + } + + return null; +} + +async function fetchProfile(actor, entity) { + const { url, model } = await getActorUrl(actor, entity); + + if (model) { + // search data already contains everything except for age, but DOB is included + return scrapeProfile(model); + } + + if (url) { + const res = await unprint.get(url, { + parser: { + runScripts: 'dangerously', + }, + }); + + if (res.ok) { + const data = res.context.query.json('#__NEXT_DATA__')?.props.pageProps.model; + + if (data) { + return scrapeProfile(data); + } + + return null; + } + + return res.status; } return null; @@ -89,5 +166,6 @@ async function fetchProfile(actor) { module.exports = { fetchLatest, scrapeScene, + fetchScene, fetchProfile, }; diff --git a/src/scrapers/naughtyamerica.js b/src/scrapers/naughtyamerica.js index 5289011c..c9210b95 100755 --- a/src/scrapers/naughtyamerica.js +++ b/src/scrapers/naughtyamerica.js @@ -141,7 +141,7 @@ async function fetchScene(url, _channel) { async function scrapeProfile({ query }) { const profile = {}; - profile.description = query.content('.bio_about_text'); + profile.description = query.content('.bio_about_text, .performer-description'); profile.avatar = query.img('img.performer-pic, img.performer-img, img.peformer-img'); // sic peformer return profile; @@ -151,12 +151,9 @@ async function fetchProfile({ slug }, { channel }) { const url = unprint.prefixUrl(`/pornstar/${slug}`, channel.url); const res = await 
unprint.browserRequest(url, { - browser: { - headless: false, - }, select: '.bio-info, .performer-details', async control(ctx) { - await ctx.locator('.bio-info').hover({ trial: true, timeout: 30000 }); // wait for bio to initialize + await ctx.locator('.bio-info, .performer-details').hover({ trial: true, timeout: 30000 }); // wait for bio to initialize }, }); diff --git a/tests/profiles.js b/tests/profiles.js index 79adb275..e491989b 100644 --- a/tests/profiles.js +++ b/tests/profiles.js @@ -1,5 +1,6 @@ 'use strict'; +const config = require('config'); const test = require('node:test'); const assert = require('node:assert/strict'); const unprint = require('unprint'); @@ -15,6 +16,19 @@ const knex = require('../src/knex'); unprint.options({ logErrors: false, + timeout: argv.requestTimeout, + userAgent: 'traxxx', + browserUserAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36', + apiUserAgent: 'traxxx', + limits: { + ...config.limits, + default: { + interval: argv.interval || config.limits.default.interval, + concurrency: argv.concurrency || config.limits.default.concurrency, + }, + browser: config.limits.browser, + }, + proxy: config.proxy, }); const actors = [ @@ -188,6 +202,9 @@ const actors = [ { entity: 'pierrewoodman', name: 'Abby Lee Brazil', fields: ['avatar', 'nationality'] }, { entity: 'dorcelclub', name: 'Clea Gaultier', fields: ['avatar'] }, { entity: 'hitzefrei', name: 'Jolee Love', fields: ['avatar', 'dateOfBirth', 'birthPlace', 'measurements', 'height', 'weight', 'eyes', 'hair', 'description'] }, + { entity: 'naughtyamerica', name: 'Nicole Aniston', fields: ['avatar', 'description'] }, + { entity: 'tonightsgirlfriend', name: 'Abella Danger', fields: ['avatar'] }, + { entity: 'mariskax', name: 'Honey Demon', fields: ['avatar', 'gender', 'dateOfBirth', 'placeOfBirth', 'measurements', 'height', 'weight', 'hairColor', 'eyes'] }, ]; const actorScrapers = scrapers.actors;