Extracting shoot IDs from title in PornBox scraper.

This commit is contained in:
DebaucheryLibrarian 2026-02-01 01:31:45 +01:00
parent f76341f0dd
commit 762e605bd1
5 changed files with 51 additions and 75 deletions

8
package-lock.json generated
View File

@ -94,7 +94,7 @@
"tunnel": "0.0.6", "tunnel": "0.0.6",
"ua-parser-js": "^1.0.37", "ua-parser-js": "^1.0.37",
"undici": "^5.28.1", "undici": "^5.28.1",
"unprint": "^0.18.11", "unprint": "^0.18.13",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3", "v-tooltip": "^2.1.3",
"video.js": "^8.6.1", "video.js": "^8.6.1",
@ -20380,9 +20380,9 @@
} }
}, },
"node_modules/unprint": { "node_modules/unprint": {
"version": "0.18.11", "version": "0.18.13",
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.11.tgz", "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.13.tgz",
"integrity": "sha512-mHOfweWWLqhEIRnjhdqCzEpHhIx+m/GwE2eDvJNNbnVEPbV8q8EaN6eGH3vkcAwDVgNIOakZaTZFK+VKy13Lsg==", "integrity": "sha512-vjUF7X7/dg2Os/zesJ0+23eVc7NH2oKzspPSyBzcIx6IuEcVm1rdlD9dAxdaRMUNBWEeA5ekyk263CBI3lyaBQ==",
"dependencies": { "dependencies": {
"bottleneck": "^2.19.5", "bottleneck": "^2.19.5",
"cookie": "^1.1.1", "cookie": "^1.1.1",

View File

@ -153,7 +153,7 @@
"tunnel": "0.0.6", "tunnel": "0.0.6",
"ua-parser-js": "^1.0.37", "ua-parser-js": "^1.0.37",
"undici": "^5.28.1", "undici": "^5.28.1",
"unprint": "^0.18.11", "unprint": "^0.18.13",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3", "v-tooltip": "^2.1.3",
"video.js": "^8.6.1", "video.js": "^8.6.1",

View File

@ -2,7 +2,6 @@
const unprint = require('unprint'); const unprint = require('unprint');
const http = require('../utils/http');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
function extractTitle(originalTitle) { function extractTitle(originalTitle) {
@ -43,6 +42,25 @@ function scrapeAll(scenes, channel) {
}); });
} }
/**
 * Fetch one page of a channel's latest scenes.
 *
 * Requests `${channel.url}/latest/${page}` (studios are treated as channels;
 * the analvids-wide listing would be https://www.analvids.com/new-videos/${page})
 * and sends the requested URL back as the Referer header.
 *
 * @param {object} channel - Channel entity; `channel.url` is used as the base URL.
 * @param {number|string} page - Page number appended to the listing path.
 * @returns {Promise<*>} Result of `scrapeAll` when the response is OK, otherwise the response status.
 */
async function fetchLatest(channel, page) {
	const latestUrl = `${channel.url}/latest/${page}`;

	const requestOptions = {
		selectAll: '.card-scene',
		headers: {
			Referer: latestUrl,
		},
	};

	const response = await unprint.get(latestUrl, requestOptions);

	// a failed request is reported to the caller as the bare status value
	return response.ok
		? scrapeAll(response.context, channel)
		: response.status;
}
function scrapeScene({ query }, url) { function scrapeScene({ query }, url) {
const release = {}; const release = {};
@ -76,71 +94,6 @@ function scrapeScene({ query }, url) {
return release; return release;
} }
/**
 * Build an actor profile from a queried profile page.
 *
 * @param {object} context - Page context; only its `query` helper is used.
 * @param {string} url - Profile URL, stored on the result as-is.
 * @param {object} channel - Channel entity forwarded to `scrapeAll` for the scene cards.
 * @returns {object} Profile with `url`, `nationality`, `age`, `avatar` and `scenes`.
 */
function scrapeProfile({ query }, url, channel) {
	// properties are evaluated top to bottom, so queries run in the listed order
	return {
		url,
		nationality: query.content('.model__info a[href*="/nationality"]'),
		age: query.number('//td[contains(text(), "Age")]/following-sibling::td'),
		avatar: query.img('.model__left img'),
		// scene cards on the profile page go through the shared list scraper
		scenes: scrapeAll(unprint.initAll(query.all('.card-scene')), channel),
	};
}
/**
 * Fetch one page of a channel's latest scenes.
 *
 * Requests `${channel.url}/latest/${page}` (studios are treated as channels;
 * the analvids-wide listing would be https://www.analvids.com/new-videos/${page})
 * and sends the requested URL back as the Referer header.
 *
 * @param {object} channel - Channel entity; `channel.url` is used as the base URL.
 * @param {number|string} page - Page number appended to the listing path.
 * @returns {Promise<*>} Result of `scrapeAll` when the response is OK, otherwise the response status.
 */
async function fetchLatest(channel, page) {
	const latestUrl = `${channel.url}/latest/${page}`;

	const requestOptions = {
		selectAll: '.card-scene',
		headers: {
			Referer: latestUrl,
		},
	};

	const response = await unprint.get(latestUrl, requestOptions);

	// a failed request is reported to the caller as the bare status value
	return response.ok
		? scrapeAll(response.context, channel)
		: response.status;
}
/*
async function fetchLatest(channel, page) {
// const res = await unprint.get(`https://www.analvids.com/new-videos/${page}`, { selectAll: '.card-scene' }); // analvids as channel
// const res = await unprint.get(`${channel.url}/latest/${page}`, { selectAll: '.card-scene' }); // studios as channels
const url = `${channel.url}/latest/${page}`; // studios as channels
const { tab } = await http.getBrowserSession('analvids', {
bypass: {
headless: false,
},
});
const res = await tab.goto(url);
const status = res.status();
console.log('STATUS', status);
if (status === 200) {
const html = await tab.content();
const context = unprint.initAll(html, '.card-scene'); // studios as channels
const scenes = scrapeAll(context, channel);
tab.close();
return scenes;
}
return res.status;
}
*/
async function fetchScene(url) { async function fetchScene(url) {
const res = await unprint.get(url, { const res = await unprint.get(url, {
headers: { headers: {
@ -155,6 +108,19 @@ async function fetchScene(url) {
return res.status; return res.status;
} }
/**
 * Build an actor profile from a queried profile page.
 *
 * @param {object} context - Page context; only its `query` helper is used.
 * @param {string} url - Profile URL, stored on the result as-is.
 * @param {object} channel - Channel entity forwarded to `scrapeAll` for the scene cards.
 * @returns {object} Profile with `url`, `nationality`, `age`, `avatar` and `scenes`.
 */
function scrapeProfile({ query }, url, channel) {
	// properties are evaluated top to bottom, so queries run in the listed order
	return {
		url,
		nationality: query.content('.model__info a[href*="/nationality"]'),
		age: query.number('//td[contains(text(), "Age")]/following-sibling::td'),
		avatar: query.img('.model__left img'),
		// scene cards on the profile page go through the shared list scraper
		scenes: scrapeAll(unprint.initAll(query.all('.card-scene')), channel),
	};
}
async function getActorUrl(actor, channel) { async function getActorUrl(actor, channel) {
if (actor.url) { if (actor.url) {
return actor.url; return actor.url;
@ -162,7 +128,7 @@ async function getActorUrl(actor, channel) {
const searchUrl = `${channel.url}/api/autocomplete/search?q=${slugify(actor.name, '+')}`; const searchUrl = `${channel.url}/api/autocomplete/search?q=${slugify(actor.name, '+')}`;
const searchRes = await http.get(searchUrl, { const searchRes = await unprint.get(searchUrl, {
headers: { headers: {
Referer: actor.url, Referer: actor.url,
}, },

View File

@ -25,14 +25,24 @@ async function getTrailer(data) {
return null; return null;
} }
/**
 * Pull a shoot ID out of a scene title.
 *
 * A shoot ID is the first run of 2-3 uppercase letters followed by 3-4 digits
 * and an optional trailing word character (e.g. a letter suffix).
 *
 * @param {string} [title] - Scene title to search.
 * @returns {string|null} The upper-cased shoot ID, or null when the title is
 *   missing, not a string, or contains no matching ID.
 */
function extractShootId(title) {
	// non-string input (missing title, numeric custom name) has no .trim/.match
	if (!title || typeof title !== 'string') {
		return null;
	}

	const match = title.trim().match(/[A-Z]{2,3}\d{3,4}\w?/);

	// the optional suffix may be lowercase, so normalise the whole ID;
	// fall back to null so "no ID" is reported the same way as "no title"
	return match?.[0].toUpperCase() ?? null;
}
async function scrapeScene(data, channel, include) { async function scrapeScene(data, channel, include) {
const release = {}; const release = {};
const entityUrl = new URL(channel.url).origin; const entityUrl = new URL(channel.url).origin;
release.entryId = data.id;
release.title = data.scene_name || data.custom_name; release.title = data.scene_name || data.custom_name;
release.entryId = data.id; release.url = `${entityUrl}/watch/${data.id}/${slugify(release.title, '_') || ''}`;
release.url = `${entityUrl}/watch/${data.id}/${slugify(release.title, '_')}`; release.shootId = extractShootId(release.title);
release.date = new Date(data.release_date || data.publish_date); release.date = new Date(data.release_date || data.publish_date);
release.duration = unprint.extractDuration(data.runtime); release.duration = unprint.extractDuration(data.runtime);

View File

@ -56,7 +56,7 @@ function slugify(strings, delimiter = '-', {
symbolRegex = defaultSymbolRegex, symbolRegex = defaultSymbolRegex,
} = {}) { } = {}) {
if (!strings || (typeof strings !== 'string' && !Array.isArray(strings))) { if (!strings || (typeof strings !== 'string' && !Array.isArray(strings))) {
return strings; return '';
} }
const string = [].concat(strings).join(' '); const string = [].concat(strings).join(' ');