Fixed comment field not updated. Refactored Nubiles scraper.
This commit is contained in:
parent
e13c8ccfe0
commit
19c892ab13
|
@ -89,7 +89,7 @@
|
|||
"tunnel": "0.0.6",
|
||||
"ua-parser-js": "^1.0.37",
|
||||
"undici": "^5.28.1",
|
||||
"unprint": "^0.15.7",
|
||||
"unprint": "^0.16.1",
|
||||
"url-pattern": "^1.0.3",
|
||||
"v-tooltip": "^2.1.3",
|
||||
"video.js": "^8.6.1",
|
||||
|
@ -17137,6 +17137,17 @@
|
|||
"node": ">= 0.6"
|
||||
}
|
||||
},
|
||||
"node_modules/srcset": {
|
||||
"version": "4.0.0",
|
||||
"resolved": "https://registry.npmjs.org/srcset/-/srcset-4.0.0.tgz",
|
||||
"integrity": "sha512-wvLeHgcVHKO8Sc/H/5lkGreJQVeYMm9rlmt8PuR1xE31rIuXhuzznUUqAt8MqLhB3MqJdFzlNAfpcWnxiFUcPw==",
|
||||
"engines": {
|
||||
"node": ">=12"
|
||||
},
|
||||
"funding": {
|
||||
"url": "https://github.com/sponsors/sindresorhus"
|
||||
}
|
||||
},
|
||||
"node_modules/sshpk": {
|
||||
"version": "1.18.0",
|
||||
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz",
|
||||
|
@ -18359,9 +18370,9 @@
|
|||
}
|
||||
},
|
||||
"node_modules/unprint": {
|
||||
"version": "0.15.7",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.7.tgz",
|
||||
"integrity": "sha512-sR4HhdJbPxkcQlQem/Hl3N67Nhn47wiK71qvl+yCT1N31tknA+mhtD+aWW5MG5F9fnJpCTlr/s4mCLxalj6XEA==",
|
||||
"version": "0.16.1",
|
||||
"resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.1.tgz",
|
||||
"integrity": "sha512-vOT6kdoZwVae9iHS5H+eBOqTZaVJRJWrBJrfnAEIzqPO8KseFvajd+kLZSL9iCE6Al5S0hi2TuMW89c8YK3Baw==",
|
||||
"dependencies": {
|
||||
"axios": "^0.27.2",
|
||||
"bottleneck": "^2.19.5",
|
||||
|
@ -18371,6 +18382,7 @@
|
|||
"eslint-config-airbnb-base": "^15.0.0",
|
||||
"jsdom": "^17.0.0",
|
||||
"moment-timezone": "^0.5.34",
|
||||
"srcset": "^4.0.0",
|
||||
"tunnel": "^0.0.6"
|
||||
}
|
||||
},
|
||||
|
|
|
@ -148,7 +148,7 @@
|
|||
"tunnel": "0.0.6",
|
||||
"ua-parser-js": "^1.0.37",
|
||||
"undici": "^5.28.1",
|
||||
"unprint": "^0.15.7",
|
||||
"unprint": "^0.16.1",
|
||||
"url-pattern": "^1.0.3",
|
||||
"v-tooltip": "^2.1.3",
|
||||
"video.js": "^8.6.1",
|
||||
|
|
|
@ -8663,7 +8663,7 @@ const sites = [
|
|||
url: 'https://www.petitehdporn.com',
|
||||
parent: 'nubiles',
|
||||
parameters: {
|
||||
upcoming: true,
|
||||
upcoming: false,
|
||||
},
|
||||
},
|
||||
{
|
||||
|
@ -11445,7 +11445,7 @@ const sites = [
|
|||
slug: 'danejones',
|
||||
name: 'Dane Jones',
|
||||
alias: ['dnj'],
|
||||
url: 'https://www.danejones.com/',
|
||||
url: 'https://www.danejones.com',
|
||||
parameters: {
|
||||
siteId: 290,
|
||||
native: true,
|
||||
|
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
const unprint = require('unprint');
|
||||
|
||||
const http = require('../utils/http');
|
||||
const slugify = require('../utils/slugify');
|
||||
|
||||
// Naughty America network
|
||||
|
@ -42,6 +43,40 @@ function scrapeLatest(scenes, channel) {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Fetches one page of a channel's scene overview through a browser session
 * obtained from http.getBrowserSession and scrapes the listed scenes.
 *
 * @param {object} channel - Channel entity; `url` and the optional `parameters.scenes` path build the page URL.
 * @param {number} [page=1] - Page number sent as the `page` query parameter.
 * @returns {Promise<Array|number>} The result of scrapeLatest on HTTP 200, otherwise the HTTP status code.
 */
async function fetchLatest(channel, page = 1) {
	// NOTE(review): headless is disabled explicitly here; this looks like a debugging leftover — confirm it is intentional
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true, headless: false });

	// the tab must be closed on every path, including failed navigation or scraping errors
	try {
		const url = `${channel.url}${channel.parameters?.scenes || ''}?page=${page}`;
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const items = unprint.initAll(html, '.site-list .scene-item, .panel-body');

		return scrapeLatest(items, channel);
	} finally {
		await tab.close();
	}
}
|
||||
|
||||
/*
|
||||
async function fetchLatest(site, page = 1) {
|
||||
const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' });
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeLatest(res.context, site);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
*/
|
||||
|
||||
function scrapeScene({ query }, { url }) {
|
||||
const release = {};
|
||||
|
||||
|
@ -98,6 +133,28 @@ function scrapeScene({ query }, { url }) {
|
|||
return release;
|
||||
}
|
||||
|
||||
/**
 * Loads a scene page in a browser session obtained from http.getBrowserSession
 * and scrapes it with scrapeScene.
 *
 * @param {string} url - Scene page URL to navigate to.
 * @param {object} _channel - Unused; kept so the signature matches the caller's convention.
 * @returns {Promise<object|number>} The scraped release on HTTP 200, otherwise the HTTP status code.
 */
async function fetchScene(url, _channel) {
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true });

	// the tab must be closed on every path, including failed navigation or scraping errors
	try {
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const item = unprint.init(html);

		return scrapeScene(item, { url });
	} finally {
		await tab.close();
	}
}
|
||||
|
||||
async function scrapeProfile({ query }) {
|
||||
const profile = {};
|
||||
|
||||
|
@ -107,16 +164,30 @@ async function scrapeProfile({ query }) {
|
|||
return profile;
|
||||
}
|
||||
|
||||
async function fetchLatest(site, page = 1) {
|
||||
const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' });
|
||||
/**
 * Loads a performer page in a browser session obtained from http.getBrowserSession
 * and scrapes it with scrapeProfile.
 *
 * @param {object} actor - Actor descriptor; only `slug` is read, to build `/pornstar/<slug>`.
 * @param {object} context - Scrape context; only `channel.url` is read, as the URL origin.
 * @returns {Promise<object|number>} The scraped profile on HTTP 200, otherwise the HTTP status code.
 */
async function fetchProfile({ slug }, { channel }) {
	const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true });

	// the tab must be closed on every path, including failed navigation or scraping errors
	try {
		const url = `${channel.url}/pornstar/${slug}`;
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const item = unprint.init(html, '.bio-info, .performer-details');

		// scrapeProfile is async; await it here so a rejection is handled inside this
		// function instead of dangling unhandled while the tab is being closed
		return await scrapeProfile(item, { url });
	} finally {
		await tab.close();
	}
}
|
||||
|
||||
/*
|
||||
async function fetchProfile({ slug }, { channel }) {
|
||||
const res = await unprint.get(`${channel.url}/pornstar/${slug}`, { select: '.bio-info, .performer-details' });
|
||||
|
||||
|
@ -126,9 +197,10 @@ async function fetchProfile({ slug }, { channel }) {
|
|||
|
||||
return res.status;
|
||||
}
|
||||
*/
|
||||
|
||||
module.exports = {
|
||||
fetchLatest,
|
||||
fetchScene,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
};
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
'use strict';
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
const unprint = require('unprint');
|
||||
|
||||
const slugify = require('../utils/slugify');
|
||||
const { heightToCm } = require('../utils/convert');
|
||||
|
||||
|
@ -9,44 +10,43 @@ const slugUrlMap = {
|
|||
nubilesporn: 'https://www.nubiles-porn.com',
|
||||
};
|
||||
|
||||
/**
 * Reduces a URL to its origin and path, dropping the query string and fragment.
 *
 * @param {string|null|undefined} link - Absolute URL to normalize.
 * @returns {string|null} `<origin><pathname>`, or null when the link is missing or cannot be parsed as an absolute URL.
 */
function stripQuery(link) {
	if (!link) {
		return null;
	}

	try {
		const { origin, pathname } = new URL(link);

		return `${origin}${pathname}`;
	} catch (error) {
		// relative or malformed links have no usable origin; treat them like a missing link
		return null;
	}
}
|
||||
|
||||
async function getPhotos(albumUrl) {
|
||||
const res = await qu.getAll(albumUrl, '.photo-thumb');
|
||||
const res = await unprint.get(albumUrl, { selectAll: '.photo-thumb' });
|
||||
|
||||
return res.ok
|
||||
? res.items.map(({ query }) => qu.prefixUrl(query.q('source').srcset))
|
||||
? res.context.map(({ query }) => unprint.prefixUrl(query.element('source').srcset))
|
||||
: [];
|
||||
}
|
||||
|
||||
function scrapeAll(scenes, site, origin) {
|
||||
function scrapeAll(scenes, entity) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
|
||||
release.title = query.q('.title a', true);
|
||||
|
||||
const url = query.url('.title a').split('?')[0];
|
||||
const channelUrl = query.url('.site-link');
|
||||
|
||||
if (/^http/.test(url)) {
|
||||
const { pathname } = new URL(url);
|
||||
// release.entryId = pathname.split('/')[3];
|
||||
|
||||
if (channelUrl) release.url = `${channelUrl}${pathname}`;
|
||||
else release.url = url;
|
||||
} else if (!/\/join/.test(url)) {
|
||||
// release.entryId = url.split('/')[3];
|
||||
|
||||
if (channelUrl) release.url = `${channelUrl}${url}`;
|
||||
else if (site?.url) release.url = `${site.url}${url}`;
|
||||
else if (origin) release.url = `${origin}${url}`;
|
||||
} else {
|
||||
// release.entryId = qu.q('a img', 'tube_tour_thumb_id');
|
||||
}
|
||||
release.title = query.content('.title a');
|
||||
release.url = stripQuery(unprint.prefixUrl(query.url('.title a'), entity.url));
|
||||
release.entryId = Number(new URL(release.url).pathname.match(/\/watch\/(\d+)/)[1]);
|
||||
|
||||
release.date = query.date('.date', 'MMM D, YYYY');
|
||||
release.actors = query.all('.models a.model', true);
|
||||
|
||||
// no reliable entry ID between upcoming and released scenes
|
||||
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
|
||||
if (query.exists('.models a.model')) {
|
||||
release.actors = query.all('.models a.model').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url),
|
||||
}));
|
||||
} else {
|
||||
// upcoming page has single string of actors, implicitly separated by a lot of whitespace
|
||||
release.actors = query.content('.models', { trim: false })?.trim().split(/\s{2,}/);
|
||||
}
|
||||
|
||||
const poster = query.sourceSet('img', 'data-srcset')?.[0];
|
||||
|
||||
|
@ -58,24 +58,56 @@ function scrapeAll(scenes, site, origin) {
|
|||
release.stars = query.number('.rating');
|
||||
release.likes = query.number('.likes');
|
||||
|
||||
release.comment = `${unprint.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
|
||||
|
||||
return release;
|
||||
});
|
||||
}
|
||||
|
||||
async function scrapeScene({ query }, url, site) {
|
||||
/**
 * Fetches a page of the site's video gallery and scrapes its grid items.
 *
 * @param {object} site - Site entity; `url` is used as the origin.
 * @param {number} [page=1] - 1-based page number, converted to a gallery offset.
 * @returns {Promise<Array|number>} The result of scrapeAll when the request succeeds, otherwise the response status.
 */
async function fetchLatest(site, page = 1) {
	// the gallery path takes an item offset; presumably 12 items per page — confirm against the site
	const offset = (page - 1) * 12;
	const res = await unprint.get(`${site.url}/video/gallery/${offset}`, { selectAll: '.content-grid-item' });

	return res.ok
		? scrapeAll(res.context, site)
		: res.status;
}
|
||||
|
||||
/**
 * Fetches and scrapes the site's upcoming videos page, if the site enables it.
 *
 * @param {object} site - Site entity; `parameters.upcoming` toggles the lookup and `url` is the origin.
 * @returns {Promise<Array|number>} An empty array when upcoming is not enabled, the result of scrapeAll
 * when the request succeeds, otherwise the response status.
 */
async function fetchUpcoming(site) {
	const isEnabled = site.parameters?.upcoming;

	if (!isEnabled) {
		return [];
	}

	const res = await unprint.get(`${site.url}/video/upcoming`, { selectAll: '.content-grid-item' });

	return res.ok
		? scrapeAll(res.context, site)
		: res.status;
}
|
||||
|
||||
async function scrapeScene({ query }, { url, entity, include }) {
|
||||
const release = {};
|
||||
|
||||
const { origin, pathname } = new URL(url);
|
||||
release.url = `${origin}${pathname}`;
|
||||
|
||||
release.entryId = new URL(url).pathname.split('/')[3];
|
||||
release.title = query.q('.content-pane-title h2', true);
|
||||
release.description = query.q('.content-pane-column div', true);
|
||||
release.title = query.content('.content-pane-title h2');
|
||||
release.description = query.content('.content-pane-column div');
|
||||
|
||||
release.date = query.date('.date', 'MMM D, YYYY');
|
||||
|
||||
release.actors = query.all('.content-pane-performers .model', true);
|
||||
release.tags = query.all('.categories a', true);
|
||||
release.actors = query.all('.content-pane-performers .model').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url),
|
||||
}));
|
||||
|
||||
release.tags = query.contents('.categories a');
|
||||
|
||||
release.poster = query.poster() || query.img('.fake-video-player img');
|
||||
release.trailer = query.all('source').map((source) => ({
|
||||
|
@ -83,76 +115,89 @@ async function scrapeScene({ query }, url, site) {
|
|||
quality: Number(source.getAttribute('res')),
|
||||
}));
|
||||
|
||||
release.stars = Number(query.q('.score', true));
|
||||
release.likes = Number(query.q('#likecount', true));
|
||||
release.stars = query.number('.score');
|
||||
release.likes = query.number('#likecount');
|
||||
|
||||
const albumLink = query.url('.content-pane-related-links a[href*="gallery"]');
|
||||
|
||||
if (albumLink) {
|
||||
release.photos = await getPhotos(`${site.url}${albumLink}`);
|
||||
if (albumLink && include.photos) {
|
||||
release.photos = await getPhotos(albumLink);
|
||||
}
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
function scrapeProfile({ query }, _actorName, origin) {
|
||||
function scrapeProfile({ query }, avatar) {
|
||||
const profile = {};
|
||||
|
||||
const keys = query.all('.model-profile h5', true);
|
||||
const values = query.all('.model-profile h5 + p', true);
|
||||
const keys = query.contents('.model-profile .model-profile-subheading');
|
||||
const values = query.contents('.model-profile .model-profile-subheading + p');
|
||||
|
||||
const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {});
|
||||
|
||||
profile.age = Number(bio.age);
|
||||
profile.description = query.q('.model-bio', true);
|
||||
profile.description = query.content('.model-bio');
|
||||
|
||||
profile.residencePlace = bio.location;
|
||||
|
||||
profile.height = heightToCm(bio.height);
|
||||
[profile.bust, profile.waist, profile.hip] = bio.figure.split('-').map((v) => Number(v) || v);
|
||||
profile.measurements = bio.figure;
|
||||
|
||||
profile.avatar = query.img('.model-profile img');
|
||||
const photo = query.img('.model-profile img');
|
||||
|
||||
const releases = query.all('.content-grid-item').filter((el) => /video\//.test(query.url(el, '.img-wrapper a'))); // filter out photos
|
||||
profile.releases = scrapeAll(query.initAll(releases), null, origin);
|
||||
// avatar on profile page is different, index avatar preferred
|
||||
if (avatar?.length > 0) {
|
||||
profile.avatar = avatar;
|
||||
profile.photos = [photo];
|
||||
} else {
|
||||
profile.avatar = photo;
|
||||
}
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
async function fetchLatest(site, page = 1) {
|
||||
const url = `${site.url}/video/gallery/${(page - 1) * 12}`;
|
||||
const res = await qu.getAll(url, '.content-grid-item');
|
||||
|
||||
return res.ok ? scrapeAll(res.items, site) : res.status;
|
||||
}
|
||||
|
||||
async function fetchUpcoming(site) {
|
||||
if (site.parameters?.upcoming) {
|
||||
const url = `${site.url}/video/upcoming`;
|
||||
const res = await qu.getAll(url, '.content-grid-item');
|
||||
|
||||
return res.ok ? scrapeAll(res.items, site) : res.status;
|
||||
}
|
||||
|
||||
return [];
|
||||
}
|
||||
|
||||
async function fetchProfile({ name: actorName }, { site }) {
|
||||
const firstLetter = actorName.charAt(0).toLowerCase();
|
||||
const origin = slugUrlMap[site.slug] || site.url;
|
||||
async function findModel(actor, entity) {
|
||||
const firstLetter = actor.name.charAt(0).toLowerCase();
|
||||
const origin = slugUrlMap[entity.slug] || entity.url;
|
||||
|
||||
const url = `${origin}/model/alpha/${firstLetter}`;
|
||||
const resModels = await qu.get(url);
|
||||
const resModels = await unprint.get(url);
|
||||
|
||||
if (!resModels.ok) return resModels.status;
|
||||
if (!resModels.ok) {
|
||||
return resModels.status;
|
||||
}
|
||||
|
||||
const modelPath = resModels.item.qu.all('.content-grid-item a.title').find((el) => slugify(el.textContent) === slugify(actorName));
|
||||
const modelEl = resModels.context.query.all('.content-grid-item').find((el) => slugify(unprint.query.content(el, 'a.title')) === slugify(actor.name));
|
||||
|
||||
if (modelPath) {
|
||||
const modelUrl = `${origin}${modelPath}`;
|
||||
const resModel = await qu.get(modelUrl);
|
||||
if (modelEl) {
|
||||
const modelUrl = `${origin}${unprint.query.url(modelEl, 'a.title')}`;
|
||||
const modelAvatar = unprint.query.sourceSet(modelEl, 'a picture img', 'data-srcset');
|
||||
|
||||
return resModel.ok ? scrapeProfile(resModel.item, actorName, origin) : resModel.status;
|
||||
return {
|
||||
url: modelUrl,
|
||||
avatar: modelAvatar,
|
||||
};
|
||||
}
|
||||
|
||||
// try actor URL last in order to grab avatar
|
||||
if (actor.url) {
|
||||
return { url: actor.url };
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
async function fetchProfile(actor, { entity }) {
|
||||
const model = await findModel(actor, entity);
|
||||
|
||||
if (model) {
|
||||
const resModel = await unprint.get(model.url);
|
||||
|
||||
if (resModel.ok) {
|
||||
return scrapeProfile(resModel.context, model.avatar);
|
||||
}
|
||||
|
||||
return resModel.status;
|
||||
}
|
||||
|
||||
return null;
|
||||
|
@ -163,5 +208,4 @@ module.exports = {
|
|||
fetchUpcoming,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
deprecated: true,
|
||||
};
|
||||
|
|
|
@ -437,10 +437,11 @@ async function storeScenes(releases, useBatchId) {
|
|||
title = COALESCE(new.title, releases.title),
|
||||
description = COALESCE(new.description, releases.description),
|
||||
duration = COALESCE(new.duration, releases.duration),
|
||||
comment = COALESCE(new.comment, releases.comment),
|
||||
deep = new.url IS NOT NULL,
|
||||
updated_at = NOW()
|
||||
FROM json_to_recordset(:scenes)
|
||||
AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, deep boolean)
|
||||
AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, comment text, deep boolean)
|
||||
WHERE releases.id = new.id
|
||||
`, {
|
||||
scenes: JSON.stringify(curatedDuplicateReleases),
|
||||
|
|
|
@ -175,7 +175,7 @@ async function getBrowserSession(identifier, options = {}) {
|
|||
const newBrowser = await puppeteer.launch({
|
||||
headless: typeof options.headless === 'undefined' ? 'new' : options.headless,
|
||||
args: [
|
||||
...(options.useProxy ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []),
|
||||
...(options.useProxy && config.proxy.enabled ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []),
|
||||
],
|
||||
// headless: false,
|
||||
});
|
||||
|
|
Loading…
Reference in New Issue