Fixed comment field not updated. Refactored Nubiles scraper.

This commit is contained in:
DebaucheryLibrarian 2025-10-06 03:26:17 +02:00
parent e13c8ccfe0
commit 19c892ab13
7 changed files with 217 additions and 88 deletions

20
package-lock.json generated
View File

@ -89,7 +89,7 @@
"tunnel": "0.0.6", "tunnel": "0.0.6",
"ua-parser-js": "^1.0.37", "ua-parser-js": "^1.0.37",
"undici": "^5.28.1", "undici": "^5.28.1",
"unprint": "^0.15.7", "unprint": "^0.16.1",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3", "v-tooltip": "^2.1.3",
"video.js": "^8.6.1", "video.js": "^8.6.1",
@ -17137,6 +17137,17 @@
"node": ">= 0.6" "node": ">= 0.6"
} }
}, },
"node_modules/srcset": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/srcset/-/srcset-4.0.0.tgz",
"integrity": "sha512-wvLeHgcVHKO8Sc/H/5lkGreJQVeYMm9rlmt8PuR1xE31rIuXhuzznUUqAt8MqLhB3MqJdFzlNAfpcWnxiFUcPw==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/sshpk": { "node_modules/sshpk": {
"version": "1.18.0", "version": "1.18.0",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz", "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
@ -18359,9 +18370,9 @@
} }
}, },
"node_modules/unprint": { "node_modules/unprint": {
"version": "0.15.7", "version": "0.16.1",
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.7.tgz", "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.1.tgz",
"integrity": "sha512-sR4HhdJbPxkcQlQem/Hl3N67Nhn47wiK71qvl+yCT1N31tknA+mhtD+aWW5MG5F9fnJpCTlr/s4mCLxalj6XEA==", "integrity": "sha512-vOT6kdoZwVae9iHS5H+eBOqTZaVJRJWrBJrfnAEIzqPO8KseFvajd+kLZSL9iCE6Al5S0hi2TuMW89c8YK3Baw==",
"dependencies": { "dependencies": {
"axios": "^0.27.2", "axios": "^0.27.2",
"bottleneck": "^2.19.5", "bottleneck": "^2.19.5",
@ -18371,6 +18382,7 @@
"eslint-config-airbnb-base": "^15.0.0", "eslint-config-airbnb-base": "^15.0.0",
"jsdom": "^17.0.0", "jsdom": "^17.0.0",
"moment-timezone": "^0.5.34", "moment-timezone": "^0.5.34",
"srcset": "^4.0.0",
"tunnel": "^0.0.6" "tunnel": "^0.0.6"
} }
}, },

View File

@ -148,7 +148,7 @@
"tunnel": "0.0.6", "tunnel": "0.0.6",
"ua-parser-js": "^1.0.37", "ua-parser-js": "^1.0.37",
"undici": "^5.28.1", "undici": "^5.28.1",
"unprint": "^0.15.7", "unprint": "^0.16.1",
"url-pattern": "^1.0.3", "url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3", "v-tooltip": "^2.1.3",
"video.js": "^8.6.1", "video.js": "^8.6.1",

View File

@ -8663,7 +8663,7 @@ const sites = [
url: 'https://www.petitehdporn.com', url: 'https://www.petitehdporn.com',
parent: 'nubiles', parent: 'nubiles',
parameters: { parameters: {
upcoming: true, upcoming: false,
}, },
}, },
{ {
@ -11445,7 +11445,7 @@ const sites = [
slug: 'danejones', slug: 'danejones',
name: 'Dane Jones', name: 'Dane Jones',
alias: ['dnj'], alias: ['dnj'],
url: 'https://www.danejones.com/', url: 'https://www.danejones.com',
parameters: { parameters: {
siteId: 290, siteId: 290,
native: true, native: true,

View File

@ -2,6 +2,7 @@
const unprint = require('unprint'); const unprint = require('unprint');
const http = require('../utils/http');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
// Naughty America network // Naughty America network
@ -42,6 +43,40 @@ function scrapeLatest(scenes, channel) {
}); });
} }
/**
 * Fetch one page of a channel's latest scenes through a dedicated browser session.
 *
 * @param {object} channel - entity whose `url` and optional `parameters.scenes` path form the listing URL
 * @param {number} [page=1] - page number sent as the `page` query parameter
 * @returns {Promise<Array|number>} result of scrapeLatest on HTTP 200, otherwise the response status code
 */
async function fetchLatest(channel, page = 1) {
	// NOTE(review): fetchScene opens its session without `headless: false` — confirm a visible browser is really needed here
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true, headless: false });
	const url = `${channel.url}${channel.parameters?.scenes || ''}?page=${page}`;

	try {
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const items = unprint.initAll(html, '.site-list .scene-item, .panel-body');

		return scrapeLatest(items, channel);
	} finally {
		// always release the tab, also when navigation or scraping throws
		await tab.close();
	}
}
/*
async function fetchLatest(site, page = 1) {
const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' });
if (res.ok) {
return scrapeLatest(res.context, site);
}
return res.status;
}
*/
function scrapeScene({ query }, { url }) { function scrapeScene({ query }, { url }) {
const release = {}; const release = {};
@ -98,6 +133,28 @@ function scrapeScene({ query }, { url }) {
return release; return release;
} }
/**
 * Load a scene page in a dedicated browser session and scrape it.
 *
 * @param {string} url - scene page URL to navigate to
 * @param {object} _channel - unused; kept so the signature matches what callers pass
 * @returns {Promise<object|number>} result of scrapeScene on HTTP 200, otherwise the response status code
 */
async function fetchScene(url, _channel) {
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true });

	try {
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const item = unprint.init(html);

		return scrapeScene(item, { url });
	} finally {
		// always release the tab, also when navigation or scraping throws
		await tab.close();
	}
}
async function scrapeProfile({ query }) { async function scrapeProfile({ query }) {
const profile = {}; const profile = {};
@ -107,16 +164,30 @@ async function scrapeProfile({ query }) {
return profile; return profile;
} }
async function fetchLatest(site, page = 1) { async function fetchProfile({ slug }, { channel }) {
const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' }); const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true });
const url = `${channel.url}/pornstar/${slug}`;
const res = await tab.goto(url);
if (res.ok) { const status = res.status();
return scrapeLatest(res.context, site);
if (status === 200) {
const html = await tab.content();
const item = unprint.init(html, '.bio-info, .performer-details');
const profile = scrapeProfile(item, { url });
await tab.close();
return profile;
} }
return res.status; await tab.close();
return status;
} }
/*
async function fetchProfile({ slug }, { channel }) { async function fetchProfile({ slug }, { channel }) {
const res = await unprint.get(`${channel.url}/pornstar/${slug}`, { select: '.bio-info, .performer-details' }); const res = await unprint.get(`${channel.url}/pornstar/${slug}`, { select: '.bio-info, .performer-details' });
@ -126,9 +197,10 @@ async function fetchProfile({ slug }, { channel }) {
return res.status; return res.status;
} }
*/
module.exports = { module.exports = {
fetchLatest, fetchLatest,
fetchScene,
fetchProfile, fetchProfile,
scrapeScene,
}; };

View File

@ -1,6 +1,7 @@
'use strict'; 'use strict';
const qu = require('../utils/qu'); const unprint = require('unprint');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
const { heightToCm } = require('../utils/convert'); const { heightToCm } = require('../utils/convert');
@ -9,44 +10,43 @@ const slugUrlMap = {
nubilesporn: 'https://www.nubiles-porn.com', nubilesporn: 'https://www.nubiles-porn.com',
}; };
/**
 * Reduce an absolute URL to its origin and path, dropping query string and fragment.
 *
 * @param {string} [link] - absolute URL; any falsy value yields null
 * @returns {string|null} origin + pathname, or null when no link was given
 */
function stripQuery(link) {
	if (link) {
		const { origin, pathname } = new URL(link);

		return `${origin}${pathname}`;
	}

	return null;
}
async function getPhotos(albumUrl) { async function getPhotos(albumUrl) {
const res = await qu.getAll(albumUrl, '.photo-thumb'); const res = await unprint.get(albumUrl, { selectAll: '.photo-thumb' });
return res.ok return res.ok
? res.items.map(({ query }) => qu.prefixUrl(query.q('source').srcset)) ? res.context.map(({ query }) => unprint.prefixUrl(query.element('source').srcset))
: []; : [];
} }
function scrapeAll(scenes, site, origin) { function scrapeAll(scenes, entity) {
return scenes.map(({ query }) => { return scenes.map(({ query }) => {
const release = {}; const release = {};
release.title = query.q('.title a', true); release.title = query.content('.title a');
release.url = stripQuery(unprint.prefixUrl(query.url('.title a'), entity.url));
const url = query.url('.title a').split('?')[0]; release.entryId = Number(new URL(release.url).pathname.match(/\/watch\/(\d+)/)[1]);
const channelUrl = query.url('.site-link');
if (/^http/.test(url)) {
const { pathname } = new URL(url);
// release.entryId = pathname.split('/')[3];
if (channelUrl) release.url = `${channelUrl}${pathname}`;
else release.url = url;
} else if (!/\/join/.test(url)) {
// release.entryId = url.split('/')[3];
if (channelUrl) release.url = `${channelUrl}${url}`;
else if (site?.url) release.url = `${site.url}${url}`;
else if (origin) release.url = `${origin}${url}`;
} else {
// release.entryId = qu.q('a img', 'tube_tour_thumb_id');
}
release.date = query.date('.date', 'MMM D, YYYY'); release.date = query.date('.date', 'MMM D, YYYY');
release.actors = query.all('.models a.model', true);
// no reliable entry ID between upcoming and released scenes if (query.exists('.models a.model')) {
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`; release.actors = query.all('.models a.model').map((actorEl) => ({
name: unprint.query.content(actorEl),
url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url),
}));
} else {
// upcoming page has single string of actors, implicitly separated by a lot of whitespace
release.actors = query.content('.models', { trim: false })?.trim().split(/\s{2,}/);
}
const poster = query.sourceSet('img', 'data-srcset')?.[0]; const poster = query.sourceSet('img', 'data-srcset')?.[0];
@ -58,24 +58,56 @@ function scrapeAll(scenes, site, origin) {
release.stars = query.number('.rating'); release.stars = query.number('.rating');
release.likes = query.number('.likes'); release.likes = query.number('.likes');
release.comment = `${unprint.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
return release; return release;
}); });
} }
async function scrapeScene({ query }, url, site) { async function fetchLatest(site, page = 1) {
const url = `${site.url}/video/gallery/${(page - 1) * 12}`;
const res = await unprint.get(url, { selectAll: '.content-grid-item' });
if (res.ok) {
return scrapeAll(res.context, site);
}
return res.status;
}
/**
 * Fetch the upcoming scenes listing for sites that have `parameters.upcoming` enabled.
 *
 * @param {object} site - entity providing `url` and the optional `parameters.upcoming` flag
 * @returns {Promise<Array|number>} empty array when upcoming is not enabled, result of scrapeAll
 *   when the request succeeds, otherwise the response status
 */
async function fetchUpcoming(site) {
	const upcomingEnabled = site.parameters?.upcoming;

	if (!upcomingEnabled) {
		return [];
	}

	const res = await unprint.get(`${site.url}/video/upcoming`, { selectAll: '.content-grid-item' });

	return res.ok
		? scrapeAll(res.context, site)
		: res.status;
}
async function scrapeScene({ query }, { url, entity, include }) {
const release = {}; const release = {};
const { origin, pathname } = new URL(url); const { origin, pathname } = new URL(url);
release.url = `${origin}${pathname}`; release.url = `${origin}${pathname}`;
release.entryId = new URL(url).pathname.split('/')[3]; release.entryId = new URL(url).pathname.split('/')[3];
release.title = query.q('.content-pane-title h2', true); release.title = query.content('.content-pane-title h2');
release.description = query.q('.content-pane-column div', true); release.description = query.content('.content-pane-column div');
release.date = query.date('.date', 'MMM D, YYYY'); release.date = query.date('.date', 'MMM D, YYYY');
release.actors = query.all('.content-pane-performers .model', true); release.actors = query.all('.content-pane-performers .model').map((actorEl) => ({
release.tags = query.all('.categories a', true); name: unprint.query.content(actorEl),
url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url),
}));
release.tags = query.contents('.categories a');
release.poster = query.poster() || query.img('.fake-video-player img'); release.poster = query.poster() || query.img('.fake-video-player img');
release.trailer = query.all('source').map((source) => ({ release.trailer = query.all('source').map((source) => ({
@ -83,76 +115,89 @@ async function scrapeScene({ query }, url, site) {
quality: Number(source.getAttribute('res')), quality: Number(source.getAttribute('res')),
})); }));
release.stars = Number(query.q('.score', true)); release.stars = query.number('.score');
release.likes = Number(query.q('#likecount', true)); release.likes = query.number('#likecount');
const albumLink = query.url('.content-pane-related-links a[href*="gallery"]'); const albumLink = query.url('.content-pane-related-links a[href*="gallery"]');
if (albumLink) { if (albumLink && include.photos) {
release.photos = await getPhotos(`${site.url}${albumLink}`); release.photos = await getPhotos(albumLink);
} }
return release; return release;
} }
function scrapeProfile({ query }, _actorName, origin) { function scrapeProfile({ query }, avatar) {
const profile = {}; const profile = {};
const keys = query.all('.model-profile h5', true); const keys = query.contents('.model-profile .model-profile-subheading');
const values = query.all('.model-profile h5 + p', true); const values = query.contents('.model-profile .model-profile-subheading + p');
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {}); const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
profile.age = Number(bio.age); profile.age = Number(bio.age);
profile.description = query.q('.model-bio', true); profile.description = query.content('.model-bio');
profile.residencePlace = bio.location; profile.residencePlace = bio.location;
profile.height = heightToCm(bio.height); profile.height = heightToCm(bio.height);
[profile.bust, profile.waist, profile.hip] = bio.figure.split('-').map((v) => Number(v) || v); profile.measurements = bio.figure;
profile.avatar = query.img('.model-profile img'); const photo = query.img('.model-profile img');
const releases = query.all('.content-grid-item').filter((el) => /video\//.test(query.url(el, '.img-wrapper a'))); // filter out photos // avatar on profile page is different, index avatar preferred
profile.releases = scrapeAll(query.initAll(releases), null, origin); if (avatar?.length > 0) {
profile.avatar = avatar;
profile.photos = [photo];
} else {
profile.avatar = photo;
}
return profile; return profile;
} }
async function fetchLatest(site, page = 1) { async function findModel(actor, entity) {
const url = `${site.url}/video/gallery/${(page - 1) * 12}`; const firstLetter = actor.name.charAt(0).toLowerCase();
const res = await qu.getAll(url, '.content-grid-item'); const origin = slugUrlMap[entity.slug] || entity.url;
return res.ok ? scrapeAll(res.items, site) : res.status;
}
async function fetchUpcoming(site) {
if (site.parameters?.upcoming) {
const url = `${site.url}/video/upcoming`;
const res = await qu.getAll(url, '.content-grid-item');
return res.ok ? scrapeAll(res.items, site) : res.status;
}
return [];
}
async function fetchProfile({ name: actorName }, { site }) {
const firstLetter = actorName.charAt(0).toLowerCase();
const origin = slugUrlMap[site.slug] || site.url;
const url = `${origin}/model/alpha/${firstLetter}`; const url = `${origin}/model/alpha/${firstLetter}`;
const resModels = await qu.get(url); const resModels = await unprint.get(url);
if (!resModels.ok) return resModels.status; if (!resModels.ok) {
return resModels.status;
}
const modelPath = resModels.item.qu.all('.content-grid-item a.title').find((el) => slugify(el.textContent) === slugify(actorName)); const modelEl = resModels.context.query.all('.content-grid-item').find((el) => slugify(unprint.query.content(el, 'a.title')) === slugify(actor.name));
if (modelPath) { if (modelEl) {
const modelUrl = `${origin}${modelPath}`; const modelUrl = `${origin}${unprint.query.url(modelEl, 'a.title')}`;
const resModel = await qu.get(modelUrl); const modelAvatar = unprint.query.sourceSet(modelEl, 'a picture img', 'data-srcset');
return resModel.ok ? scrapeProfile(resModel.item, actorName, origin) : resModel.status; return {
url: modelUrl,
avatar: modelAvatar,
};
}
// try actor URL last in order to grab avatar
if (actor.url) {
return { url: actor.url };
}
return null;
}
async function fetchProfile(actor, { entity }) {
const model = await findModel(actor, entity);
if (model) {
const resModel = await unprint.get(model.url);
if (resModel.ok) {
return scrapeProfile(resModel.context, model.avatar);
}
return resModel.status;
} }
return null; return null;
@ -163,5 +208,4 @@ module.exports = {
fetchUpcoming, fetchUpcoming,
fetchProfile, fetchProfile,
scrapeScene, scrapeScene,
deprecated: true,
}; };

View File

@ -437,10 +437,11 @@ async function storeScenes(releases, useBatchId) {
title = COALESCE(new.title, releases.title), title = COALESCE(new.title, releases.title),
description = COALESCE(new.description, releases.description), description = COALESCE(new.description, releases.description),
duration = COALESCE(new.duration, releases.duration), duration = COALESCE(new.duration, releases.duration),
comment = COALESCE(new.comment, releases.comment),
deep = new.url IS NOT NULL, deep = new.url IS NOT NULL,
updated_at = NOW() updated_at = NOW()
FROM json_to_recordset(:scenes) FROM json_to_recordset(:scenes)
AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, deep boolean) AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, comment text, deep boolean)
WHERE releases.id = new.id WHERE releases.id = new.id
`, { `, {
scenes: JSON.stringify(curatedDuplicateReleases), scenes: JSON.stringify(curatedDuplicateReleases),

View File

@ -175,7 +175,7 @@ async function getBrowserSession(identifier, options = {}) {
const newBrowser = await puppeteer.launch({ const newBrowser = await puppeteer.launch({
headless: typeof options.headless === 'undefined' ? 'new' : options.headless, headless: typeof options.headless === 'undefined' ? 'new' : options.headless,
args: [ args: [
...(options.useProxy ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []), ...(options.useProxy && config.proxy.enabled ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []),
], ],
// headless: false, // headless: false,
}); });