Fixed comment field not updated. Refactored Nubiles scraper.

This commit is contained in:
DebaucheryLibrarian 2025-10-06 03:26:17 +02:00
parent e13c8ccfe0
commit 19c892ab13
7 changed files with 217 additions and 88 deletions

20
package-lock.json generated
View File

@ -89,7 +89,7 @@
"tunnel": "0.0.6",
"ua-parser-js": "^1.0.37",
"undici": "^5.28.1",
"unprint": "^0.15.7",
"unprint": "^0.16.1",
"url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3",
"video.js": "^8.6.1",
@ -17137,6 +17137,17 @@
"node": ">= 0.6"
}
},
"node_modules/srcset": {
"version": "4.0.0",
"resolved": "https://registry.npmjs.org/srcset/-/srcset-4.0.0.tgz",
"integrity": "sha512-wvLeHgcVHKO8Sc/H/5lkGreJQVeYMm9rlmt8PuR1xE31rIuXhuzznUUqAt8MqLhB3MqJdFzlNAfpcWnxiFUcPw==",
"engines": {
"node": ">=12"
},
"funding": {
"url": "https://github.com/sponsors/sindresorhus"
}
},
"node_modules/sshpk": {
"version": "1.18.0",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
@ -18359,9 +18370,9 @@
}
},
"node_modules/unprint": {
"version": "0.15.7",
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.7.tgz",
"integrity": "sha512-sR4HhdJbPxkcQlQem/Hl3N67Nhn47wiK71qvl+yCT1N31tknA+mhtD+aWW5MG5F9fnJpCTlr/s4mCLxalj6XEA==",
"version": "0.16.1",
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.1.tgz",
"integrity": "sha512-vOT6kdoZwVae9iHS5H+eBOqTZaVJRJWrBJrfnAEIzqPO8KseFvajd+kLZSL9iCE6Al5S0hi2TuMW89c8YK3Baw==",
"dependencies": {
"axios": "^0.27.2",
"bottleneck": "^2.19.5",
@ -18371,6 +18382,7 @@
"eslint-config-airbnb-base": "^15.0.0",
"jsdom": "^17.0.0",
"moment-timezone": "^0.5.34",
"srcset": "^4.0.0",
"tunnel": "^0.0.6"
}
},

View File

@ -148,7 +148,7 @@
"tunnel": "0.0.6",
"ua-parser-js": "^1.0.37",
"undici": "^5.28.1",
"unprint": "^0.15.7",
"unprint": "^0.16.1",
"url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3",
"video.js": "^8.6.1",

View File

@ -8663,7 +8663,7 @@ const sites = [
url: 'https://www.petitehdporn.com',
parent: 'nubiles',
parameters: {
upcoming: true,
upcoming: false,
},
},
{
@ -11445,7 +11445,7 @@ const sites = [
slug: 'danejones',
name: 'Dane Jones',
alias: ['dnj'],
url: 'https://www.danejones.com/',
url: 'https://www.danejones.com',
parameters: {
siteId: 290,
native: true,

View File

@ -2,6 +2,7 @@
const unprint = require('unprint');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
// Naughty America network
@ -42,6 +43,40 @@ function scrapeLatest(scenes, channel) {
});
}
/**
 * Fetch one page of the latest scenes for a channel through a dedicated browser session.
 *
 * @param {object} channel - channel entity; `channel.url` and the optional `channel.parameters.scenes` path build the listing URL
 * @param {number} [page=1] - page number sent as the `page` query parameter
 * @returns {Promise<*|number>} the result of `scrapeLatest` when the page responds with 200, otherwise the HTTP status code
 */
async function fetchLatest(channel, page = 1) {
	// NOTE(review): headless: false is handed to the browser session here; confirm this is intended and not a debugging leftover
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true, headless: false });

	try {
		const url = `${channel.url}${channel.parameters?.scenes || ''}?page=${page}`;
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const items = unprint.initAll(html, '.site-list .scene-item, .panel-body');

		return scrapeLatest(items, channel);
	} finally {
		// release the tab on every path, including when navigation or scraping throws
		await tab.close();
	}
}
/*
async function fetchLatest(site, page = 1) {
const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' });
if (res.ok) {
return scrapeLatest(res.context, site);
}
return res.status;
}
*/
function scrapeScene({ query }, { url }) {
const release = {};
@ -98,6 +133,28 @@ function scrapeScene({ query }, { url }) {
return release;
}
/**
 * Load a scene page in a dedicated browser session and scrape it.
 *
 * @param {string} url - scene page URL to navigate to
 * @param {object} _channel - unused; kept so the signature matches what callers pass
 * @returns {Promise<*|number>} the result of `scrapeScene` when the page responds with 200, otherwise the HTTP status code
 */
async function fetchScene(url, _channel) {
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true });

	try {
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const item = unprint.init(html);

		return scrapeScene(item, { url });
	} finally {
		// release the tab on every path, including when navigation or scraping throws
		await tab.close();
	}
}
async function scrapeProfile({ query }) {
const profile = {};
@ -107,16 +164,30 @@ async function scrapeProfile({ query }) {
return profile;
}
async function fetchLatest(site, page = 1) {
const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' });
async function fetchProfile({ slug }, { channel }) {
const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true });
const url = `${channel.url}/pornstar/${slug}`;
const res = await tab.goto(url);
if (res.ok) {
return scrapeLatest(res.context, site);
const status = res.status();
if (status === 200) {
const html = await tab.content();
const item = unprint.init(html, '.bio-info, .performer-details');
const profile = scrapeProfile(item, { url });
await tab.close();
return profile;
}
return res.status;
await tab.close();
return status;
}
/*
async function fetchProfile({ slug }, { channel }) {
const res = await unprint.get(`${channel.url}/pornstar/${slug}`, { select: '.bio-info, .performer-details' });
@ -126,9 +197,10 @@ async function fetchProfile({ slug }, { channel }) {
return res.status;
}
*/
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};

View File

@ -1,6 +1,7 @@
'use strict';
const qu = require('../utils/qu');
const unprint = require('unprint');
const slugify = require('../utils/slugify');
const { heightToCm } = require('../utils/convert');
@ -9,44 +10,43 @@ const slugUrlMap = {
nubilesporn: 'https://www.nubiles-porn.com',
};
/**
 * Reduce an absolute link to its origin and path, dropping the query string and fragment.
 *
 * @param {string|null|undefined} link - absolute URL; any falsy value yields null
 * @returns {string|null} `origin + pathname` of the link, or null when no link was given
 * @throws {TypeError} when the link is not parseable as an absolute URL
 */
function stripQuery(link) {
	if (link) {
		const { origin, pathname } = new URL(link);

		return `${origin}${pathname}`;
	}

	return null;
}
async function getPhotos(albumUrl) {
const res = await qu.getAll(albumUrl, '.photo-thumb');
const res = await unprint.get(albumUrl, { selectAll: '.photo-thumb' });
return res.ok
? res.items.map(({ query }) => qu.prefixUrl(query.q('source').srcset))
? res.context.map(({ query }) => unprint.prefixUrl(query.element('source').srcset))
: [];
}
function scrapeAll(scenes, site, origin) {
function scrapeAll(scenes, entity) {
return scenes.map(({ query }) => {
const release = {};
release.title = query.q('.title a', true);
const url = query.url('.title a').split('?')[0];
const channelUrl = query.url('.site-link');
if (/^http/.test(url)) {
const { pathname } = new URL(url);
// release.entryId = pathname.split('/')[3];
if (channelUrl) release.url = `${channelUrl}${pathname}`;
else release.url = url;
} else if (!/\/join/.test(url)) {
// release.entryId = url.split('/')[3];
if (channelUrl) release.url = `${channelUrl}${url}`;
else if (site?.url) release.url = `${site.url}${url}`;
else if (origin) release.url = `${origin}${url}`;
} else {
// release.entryId = qu.q('a img', 'tube_tour_thumb_id');
}
release.title = query.content('.title a');
release.url = stripQuery(unprint.prefixUrl(query.url('.title a'), entity.url));
release.entryId = Number(new URL(release.url).pathname.match(/\/watch\/(\d+)/)[1]);
release.date = query.date('.date', 'MMM D, YYYY');
release.actors = query.all('.models a.model', true);
// no reliable entry ID between upcoming and released scenes
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
if (query.exists('.models a.model')) {
release.actors = query.all('.models a.model').map((actorEl) => ({
name: unprint.query.content(actorEl),
url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url),
}));
} else {
// upcoming page has single string of actors, implicitly separated by a lot of whitespace
release.actors = query.content('.models', { trim: false })?.trim().split(/\s{2,}/);
}
const poster = query.sourceSet('img', 'data-srcset')?.[0];
@ -58,24 +58,56 @@ function scrapeAll(scenes, site, origin) {
release.stars = query.number('.rating');
release.likes = query.number('.likes');
release.comment = `${unprint.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
return release;
});
}
async function scrapeScene({ query }, url, site) {
/**
 * Fetch a page of the video gallery and scrape its grid items.
 *
 * @param {object} site - entity whose `url` is the gallery base
 * @param {number} [page=1] - 1-based page number; the gallery path takes an offset that advances by 12 per page
 * @returns {Promise<*|number>} the result of `scrapeAll` on success, otherwise the response status
 */
async function fetchLatest(site, page = 1) {
	const offset = (page - 1) * 12;
	const res = await unprint.get(`${site.url}/video/gallery/${offset}`, { selectAll: '.content-grid-item' });

	return res.ok
		? scrapeAll(res.context, site)
		: res.status;
}
/**
 * Fetch and scrape the upcoming scenes page, for sites that opt in through `parameters.upcoming`.
 *
 * @param {object} site - entity with `url` and optional `parameters.upcoming` flag
 * @returns {Promise<*|number>} an empty array when upcoming is not enabled, the result of `scrapeAll` on success, otherwise the response status
 */
async function fetchUpcoming(site) {
	// no request is made for sites without the upcoming flag
	if (!site.parameters?.upcoming) {
		return [];
	}

	const res = await unprint.get(`${site.url}/video/upcoming`, { selectAll: '.content-grid-item' });

	return res.ok
		? scrapeAll(res.context, site)
		: res.status;
}
async function scrapeScene({ query }, { url, entity, include }) {
const release = {};
const { origin, pathname } = new URL(url);
release.url = `${origin}${pathname}`;
release.entryId = new URL(url).pathname.split('/')[3];
release.title = query.q('.content-pane-title h2', true);
release.description = query.q('.content-pane-column div', true);
release.title = query.content('.content-pane-title h2');
release.description = query.content('.content-pane-column div');
release.date = query.date('.date', 'MMM D, YYYY');
release.actors = query.all('.content-pane-performers .model', true);
release.tags = query.all('.categories a', true);
release.actors = query.all('.content-pane-performers .model').map((actorEl) => ({
name: unprint.query.content(actorEl),
url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url),
}));
release.tags = query.contents('.categories a');
release.poster = query.poster() || query.img('.fake-video-player img');
release.trailer = query.all('source').map((source) => ({
@ -83,76 +115,89 @@ async function scrapeScene({ query }, url, site) {
quality: Number(source.getAttribute('res')),
}));
release.stars = Number(query.q('.score', true));
release.likes = Number(query.q('#likecount', true));
release.stars = query.number('.score');
release.likes = query.number('#likecount');
const albumLink = query.url('.content-pane-related-links a[href*="gallery"]');
if (albumLink) {
release.photos = await getPhotos(`${site.url}${albumLink}`);
if (albumLink && include.photos) {
release.photos = await getPhotos(albumLink);
}
return release;
}
function scrapeProfile({ query }, _actorName, origin) {
function scrapeProfile({ query }, avatar) {
const profile = {};
const keys = query.all('.model-profile h5', true);
const values = query.all('.model-profile h5 + p', true);
const keys = query.contents('.model-profile .model-profile-subheading');
const values = query.contents('.model-profile .model-profile-subheading + p');
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
profile.age = Number(bio.age);
profile.description = query.q('.model-bio', true);
profile.description = query.content('.model-bio');
profile.residencePlace = bio.location;
profile.height = heightToCm(bio.height);
[profile.bust, profile.waist, profile.hip] = bio.figure.split('-').map((v) => Number(v) || v);
profile.measurements = bio.figure;
profile.avatar = query.img('.model-profile img');
const photo = query.img('.model-profile img');
const releases = query.all('.content-grid-item').filter((el) => /video\//.test(query.url(el, '.img-wrapper a'))); // filter out photos
profile.releases = scrapeAll(query.initAll(releases), null, origin);
// avatar on profile page is different, index avatar preferred
if (avatar?.length > 0) {
profile.avatar = avatar;
profile.photos = [photo];
} else {
profile.avatar = photo;
}
return profile;
}
async function fetchLatest(site, page = 1) {
const url = `${site.url}/video/gallery/${(page - 1) * 12}`;
const res = await qu.getAll(url, '.content-grid-item');
return res.ok ? scrapeAll(res.items, site) : res.status;
}
async function fetchUpcoming(site) {
if (site.parameters?.upcoming) {
const url = `${site.url}/video/upcoming`;
const res = await qu.getAll(url, '.content-grid-item');
return res.ok ? scrapeAll(res.items, site) : res.status;
}
return [];
}
async function fetchProfile({ name: actorName }, { site }) {
const firstLetter = actorName.charAt(0).toLowerCase();
const origin = slugUrlMap[site.slug] || site.url;
async function findModel(actor, entity) {
const firstLetter = actor.name.charAt(0).toLowerCase();
const origin = slugUrlMap[entity.slug] || entity.url;
const url = `${origin}/model/alpha/${firstLetter}`;
const resModels = await qu.get(url);
const resModels = await unprint.get(url);
if (!resModels.ok) return resModels.status;
if (!resModels.ok) {
return resModels.status;
}
const modelPath = resModels.item.qu.all('.content-grid-item a.title').find((el) => slugify(el.textContent) === slugify(actorName));
const modelEl = resModels.context.query.all('.content-grid-item').find((el) => slugify(unprint.query.content(el, 'a.title')) === slugify(actor.name));
if (modelPath) {
const modelUrl = `${origin}${modelPath}`;
const resModel = await qu.get(modelUrl);
if (modelEl) {
const modelUrl = `${origin}${unprint.query.url(modelEl, 'a.title')}`;
const modelAvatar = unprint.query.sourceSet(modelEl, 'a picture img', 'data-srcset');
return resModel.ok ? scrapeProfile(resModel.item, actorName, origin) : resModel.status;
return {
url: modelUrl,
avatar: modelAvatar,
};
}
// try actor URL last in order to grab avatar
if (actor.url) {
return { url: actor.url };
}
return null;
}
async function fetchProfile(actor, { entity }) {
const model = await findModel(actor, entity);
if (model) {
const resModel = await unprint.get(model.url);
if (resModel.ok) {
return scrapeProfile(resModel.context, model.avatar);
}
return resModel.status;
}
return null;
@ -163,5 +208,4 @@ module.exports = {
fetchUpcoming,
fetchProfile,
scrapeScene,
deprecated: true,
};

View File

@ -437,10 +437,11 @@ async function storeScenes(releases, useBatchId) {
title = COALESCE(new.title, releases.title),
description = COALESCE(new.description, releases.description),
duration = COALESCE(new.duration, releases.duration),
comment = COALESCE(new.comment, releases.comment),
deep = new.url IS NOT NULL,
updated_at = NOW()
FROM json_to_recordset(:scenes)
AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, deep boolean)
AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, comment text, deep boolean)
WHERE releases.id = new.id
`, {
scenes: JSON.stringify(curatedDuplicateReleases),

View File

@ -175,7 +175,7 @@ async function getBrowserSession(identifier, options = {}) {
const newBrowser = await puppeteer.launch({
headless: typeof options.headless === 'undefined' ? 'new' : options.headless,
args: [
...(options.useProxy ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []),
...(options.useProxy && config.proxy.enabled ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []),
],
// headless: false,
});