From 762e605bd1c6b81a0864a63f675881df3c85ce04 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sun, 1 Feb 2026 01:31:45 +0100 Subject: [PATCH] Extracting shoot IDs from title in PornBox scraper. --- package-lock.json | 8 ++-- package.json | 2 +- src/scrapers/analvids.js | 100 +++++++++++++-------------------------- src/scrapers/pornbox.js | 14 +++++- src/utils/slugify.js | 2 +- 5 files changed, 51 insertions(+), 75 deletions(-) diff --git a/package-lock.json b/package-lock.json index a19753d6..f3a5b41b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -94,7 +94,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.18.11", + "unprint": "^0.18.13", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -20380,9 +20380,9 @@ } }, "node_modules/unprint": { - "version": "0.18.11", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.11.tgz", - "integrity": "sha512-mHOfweWWLqhEIRnjhdqCzEpHhIx+m/GwE2eDvJNNbnVEPbV8q8EaN6eGH3vkcAwDVgNIOakZaTZFK+VKy13Lsg==", + "version": "0.18.13", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.13.tgz", + "integrity": "sha512-vjUF7X7/dg2Os/zesJ0+23eVc7NH2oKzspPSyBzcIx6IuEcVm1rdlD9dAxdaRMUNBWEeA5ekyk263CBI3lyaBQ==", "dependencies": { "bottleneck": "^2.19.5", "cookie": "^1.1.1", diff --git a/package.json b/package.json index 3135623e..d1e86289 100755 --- a/package.json +++ b/package.json @@ -153,7 +153,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.18.11", + "unprint": "^0.18.13", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/src/scrapers/analvids.js b/src/scrapers/analvids.js index 4b3836de..dd2670ad 100644 --- a/src/scrapers/analvids.js +++ b/src/scrapers/analvids.js @@ -2,7 +2,6 @@ const unprint = require('unprint'); -const http = require('../utils/http'); const slugify = require('../utils/slugify'); function extractTitle(originalTitle) { @@ -43,6 +42,25 @@ function scrapeAll(scenes, channel) { }); } +async function fetchLatest(channel, page) { + // const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel + // studios as channels + const url = `${channel.url}/latest/${page}`; + + const res = await unprint.get(url, { + selectAll: '.card-scene', + headers: { + Referer: url, + }, + }); + + if (res.ok) { + return scrapeAll(res.context, channel); + } + + return res.status; +} + function scrapeScene({ query }, url) { const release = {}; @@ -76,71 +94,6 @@ function scrapeScene({ query }, url) { return release; } -function scrapeProfile({ query }, url, channel) { - const profile = { url }; - - profile.nationality = query.content('.model__info a[href*="/nationality"]'); - profile.age = query.number('//td[contains(text(), "Age")]/following-sibling::td'); - - profile.avatar = query.img('.model__left img'); - - profile.scenes = scrapeAll(unprint.initAll(query.all('.card-scene')), channel); - - return profile; -} - -async function fetchLatest(channel, page) { - // const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel - // studios as channels - const url = `${channel.url}/latest/${page}`; - - const res = await unprint.get(url, { - selectAll: '.card-scene', - headers: { - Referer: url, - }, - }); - - if (res.ok) { - return scrapeAll(res.context, channel); - } - - return res.status; -} - -/* -async function fetchLatest(channel, page) { - // const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel - // const res = await unprint.get(`${channel.url}/latest/${page}`, { selectAll: '.card-scene' }); // studios as channels - const url = `${channel.url}/latest/${page}`; // studios as channels - - const { tab } = await http.getBrowserSession('analvids', { - bypass: { - headless: false, - }, - }); - - const res = await tab.goto(url); - - const status = res.status(); - - console.log('STATUS', status); - - if (status === 200) { - const html = await tab.content(); - const context = unprint.initAll(html, '.card-scene'); // studios as channels - - const scenes = scrapeAll(context, channel); - - tab.close(); - - return scenes; - } - - return res.status; -} -*/ - async function fetchScene(url) { const res = await unprint.get(url, { headers: { @@ -155,6 +108,19 @@ async function fetchScene(url) { return res.status; } +function scrapeProfile({ query }, url, channel) { + const profile = { url }; + + profile.nationality = query.content('.model__info a[href*="/nationality"]'); + profile.age = query.number('//td[contains(text(), "Age")]/following-sibling::td'); + + profile.avatar = query.img('.model__left img'); + + profile.scenes = scrapeAll(unprint.initAll(query.all('.card-scene')), channel); + + return profile; +} + async function getActorUrl(actor, channel) { if (actor.url) { return actor.url; @@ -162,7 +128,7 @@ async function getActorUrl(actor, channel) { const searchUrl = `${channel.url}/api/autocomplete/search?q=${slugify(actor.name, '+')}`; - const searchRes = await http.get(searchUrl, { + const searchRes = await unprint.get(searchUrl, { headers: { Referer: actor.url, }, diff --git a/src/scrapers/pornbox.js b/src/scrapers/pornbox.js index 87364cdd..35bc0738 100755 --- a/src/scrapers/pornbox.js +++ b/src/scrapers/pornbox.js @@ -25,14 +25,24 @@ async function getTrailer(data) { return null; } +function extractShootId(title) { + if (!title) { + return null; + } + + return title.trim().match(/[A-Z]{2,3}\d{3,4}\w?/)?.[0].toUpperCase(); +} + async function scrapeScene(data, channel, include) { const release = {}; const entityUrl = new URL(channel.url).origin; + release.entryId = data.id; + release.title = data.scene_name || data.custom_name; - release.entryId = data.id; - release.url = `${entityUrl}/watch/${data.id}/${slugify(release.title, '_')}`; + release.url = `${entityUrl}/watch/${data.id}/${slugify(release.title, '_') || ''}`; + release.shootId = extractShootId(release.title); release.date = new Date(data.release_date || data.publish_date); release.duration = unprint.extractDuration(data.runtime); diff --git a/src/utils/slugify.js b/src/utils/slugify.js index 5c7451d2..92fd64c1 100755 --- a/src/utils/slugify.js +++ b/src/utils/slugify.js @@ -56,7 +56,7 @@ function slugify(strings, delimiter = '-', { symbolRegex = defaultSymbolRegex, } = {}) { if (!strings || (typeof strings !== 'string' && !Array.isArray(strings))) { - return strings; + return ''; } const string = [].concat(strings).join(' ');