diff --git a/package-lock.json b/package-lock.json index 0d81d0e3..331e2315 100644 --- a/package-lock.json +++ b/package-lock.json @@ -89,7 +89,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.15.7", + "unprint": "^0.16.1", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -17137,6 +17137,17 @@ "node": ">= 0.6" } }, + "node_modules/srcset": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/srcset/-/srcset-4.0.0.tgz", + "integrity": "sha512-wvLeHgcVHKO8Sc/H/5lkGreJQVeYMm9rlmt8PuR1xE31rIuXhuzznUUqAt8MqLhB3MqJdFzlNAfpcWnxiFUcPw==", + "engines": { + "node": ">=12" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + }, "node_modules/sshpk": { "version": "1.18.0", "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.18.0.tgz", @@ -18359,9 +18370,9 @@ } }, "node_modules/unprint": { - "version": "0.15.7", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.15.7.tgz", - "integrity": "sha512-sR4HhdJbPxkcQlQem/Hl3N67Nhn47wiK71qvl+yCT1N31tknA+mhtD+aWW5MG5F9fnJpCTlr/s4mCLxalj6XEA==", + "version": "0.16.1", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.1.tgz", + "integrity": "sha512-vOT6kdoZwVae9iHS5H+eBOqTZaVJRJWrBJrfnAEIzqPO8KseFvajd+kLZSL9iCE6Al5S0hi2TuMW89c8YK3Baw==", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", @@ -18371,6 +18382,7 @@ "eslint-config-airbnb-base": "^15.0.0", "jsdom": "^17.0.0", "moment-timezone": "^0.5.34", + "srcset": "^4.0.0", "tunnel": "^0.0.6" } }, diff --git a/package.json b/package.json index effcc151..12fa936b 100755 --- a/package.json +++ b/package.json @@ -148,7 +148,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.15.7", + "unprint": "^0.16.1", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/seeds/02_sites.js b/seeds/02_sites.js index cdc89028..edafdb09 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -8663,7 +8663,7 @@ const sites = [ url: 'https://www.petitehdporn.com', parent: 'nubiles', parameters: { - upcoming: true, + upcoming: false, }, }, { @@ -11445,7 +11445,7 @@ const sites = [ slug: 'danejones', name: 'Dane Jones', alias: ['dnj'], - url: 'https://www.danejones.com/', + url: 'https://www.danejones.com', parameters: { siteId: 290, native: true, diff --git a/src/scrapers/naughtyamerica.js b/src/scrapers/naughtyamerica.js index e4a31cf3..1beccc1e 100755 --- a/src/scrapers/naughtyamerica.js +++ b/src/scrapers/naughtyamerica.js @@ -2,6 +2,7 @@ const unprint = require('unprint'); +const http = require('../utils/http'); const slugify = require('../utils/slugify'); // Naughty America network @@ -42,6 +43,40 @@ function scrapeLatest(scenes, channel) { }); } +async function fetchLatest(channel, page = 1) { + const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true, headless: false }); + const url = `${channel.url}${channel.parameters?.scenes || ''}?page=${page}`; + const res = await tab.goto(url); + const status = res.status(); + + if (status === 200) { + const html = await tab.content(); + const items = unprint.initAll(html, '.site-list .scene-item, .panel-body'); + + const scenes = scrapeLatest(items, channel); + + await tab.close(); + + return scenes; + } + + await tab.close(); + + return status; +} + +/* +async function fetchLatest(site, page = 1) { + const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' }); + + if (res.ok) { + return scrapeLatest(res.context, site); + } + + return res.status; +} +*/ + function scrapeScene({ query }, { url }) { const release = {}; @@ -98,6 +133,28 @@ function scrapeScene({ query }, { url }) { return release; } +async function fetchScene(url, _channel) { + const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true }); + const res = await tab.goto(url); + + const status = res.status(); + + if (status === 200) { + const html = await tab.content(); + const item = unprint.init(html); + + const scene = scrapeScene(item, { url }); + + await tab.close(); + + return scene; + } + + await tab.close(); + + return status; +} + async function scrapeProfile({ query }) { const profile = {}; @@ -107,16 +164,30 @@ async function scrapeProfile({ query }) { return profile; } -async function fetchLatest(site, page = 1) { - const res = await unprint.get(`${site.url}${site.parameters?.scenes || ''}?page=${page}`, { selectAll: '.site-list .scene-item, .panel-body' }); +async function fetchProfile({ slug }, { channel }) { + const { tab } = await http.getBrowserSession('naughtyamerica', { useGlobalBrowser: false, useProxy: true }); + const url = `${channel.url}/pornstar/${slug}`; + const res = await tab.goto(url); - if (res.ok) { - return scrapeLatest(res.context, site); + const status = res.status(); + + if (status === 200) { + const html = await tab.content(); + const item = unprint.init(html, '.bio-info, .performer-details'); + + const profile = scrapeProfile(item, { url }); + + await tab.close(); + + return profile; } - return res.status; + await tab.close(); + + return status; } +/* async function fetchProfile({ slug }, { channel }) { const res = await unprint.get(`${channel.url}/pornstar/${slug}`, { select: '.bio-info, .performer-details' }); @@ -126,9 +197,10 @@ async function fetchProfile({ slug }, { channel }) { return res.status; } +*/ module.exports = { fetchLatest, + fetchScene, fetchProfile, - scrapeScene, }; diff --git a/src/scrapers/nubiles.js b/src/scrapers/nubiles.js index 6f9f29fc..affd8f43 100755 --- a/src/scrapers/nubiles.js +++ b/src/scrapers/nubiles.js @@ -1,6 +1,7 @@ 'use strict'; -const qu = require('../utils/qu'); +const unprint = require('unprint'); + const slugify = require('../utils/slugify'); const { heightToCm } = require('../utils/convert'); @@ -9,44 +10,43 @@ const slugUrlMap = { nubilesporn: 'https://www.nubiles-porn.com', }; +function stripQuery(link) { + if (!link) { + return null; + } + + const url = new URL(link); + + return `${url.origin}${url.pathname}`; +} + async function getPhotos(albumUrl) { - const res = await qu.getAll(albumUrl, '.photo-thumb'); + const res = await unprint.get(albumUrl, { selectAll: '.photo-thumb' }); return res.ok - ? res.items.map(({ query }) => qu.prefixUrl(query.q('source').srcset)) + ? res.context.map(({ query }) => unprint.prefixUrl(query.element('source').srcset)) : []; } -function scrapeAll(scenes, site, origin) { +function scrapeAll(scenes, entity) { return scenes.map(({ query }) => { const release = {}; - release.title = query.q('.title a', true); - - const url = query.url('.title a').split('?')[0]; - const channelUrl = query.url('.site-link'); - - if (/^http/.test(url)) { - const { pathname } = new URL(url); - // release.entryId = pathname.split('/')[3]; - - if (channelUrl) release.url = `${channelUrl}${pathname}`; - else release.url = url; - } else if (!/\/join/.test(url)) { - // release.entryId = url.split('/')[3]; - - if (channelUrl) release.url = `${channelUrl}${url}`; - else if (site?.url) release.url = `${site.url}${url}`; - else if (origin) release.url = `${origin}${url}`; - } else { - // release.entryId = qu.q('a img', 'tube_tour_thumb_id'); - } + release.title = query.content('.title a'); + release.url = stripQuery(unprint.prefixUrl(query.url('.title a'), entity.url)); + release.entryId = Number(new URL(release.url).pathname.match(/\/watch\/(\d+)/)[1]); release.date = query.date('.date', 'MMM D, YYYY'); - release.actors = query.all('.models a.model', true); - // no reliable entry ID between upcoming and released scenes - release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`; + if (query.exists('.models a.model')) { + release.actors = query.all('.models a.model').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url), + })); + } else { + // upcoming page has single string of actors, implicitly separated by a lot of whitespace + release.actors = query.content('.models', { trim: false })?.trim().split(/\s{2,}/); + } const poster = query.sourceSet('img', 'data-srcset')?.[0]; @@ -58,24 +58,56 @@ function scrapeAll(scenes, site, origin) { release.stars = query.number('.rating'); release.likes = query.number('.likes'); + release.comment = `${unprint.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`; + return release; }); } -async function scrapeScene({ query }, url, site) { +async function fetchLatest(site, page = 1) { + const url = `${site.url}/video/gallery/${(page - 1) * 12}`; + const res = await unprint.get(url, { selectAll: '.content-grid-item' }); + + if (res.ok) { + return scrapeAll(res.context, site); + } + + return res.status; +} + +async function fetchUpcoming(site) { + if (site.parameters?.upcoming) { + const url = `${site.url}/video/upcoming`; + const res = await unprint.get(url, { selectAll: '.content-grid-item' }); + + if (res.ok) { + return scrapeAll(res.context, site); + } + + return res.status; + } + + return []; +} + +async function scrapeScene({ query }, { url, entity, include }) { const release = {}; const { origin, pathname } = new URL(url); release.url = `${origin}${pathname}`; release.entryId = new URL(url).pathname.split('/')[3]; - release.title = query.q('.content-pane-title h2', true); - release.description = query.q('.content-pane-column div', true); + release.title = query.content('.content-pane-title h2'); + release.description = query.content('.content-pane-column div'); release.date = query.date('.date', 'MMM D, YYYY'); - release.actors = query.all('.content-pane-performers .model', true); - release.tags = query.all('.categories a', true); + release.actors = query.all('.content-pane-performers .model').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.prefixUrl(unprint.query.url(actorEl, null), entity.url), + })); + + release.tags = query.contents('.categories a'); release.poster = query.poster() || query.img('.fake-video-player img'); release.trailer = query.all('source').map((source) => ({ @@ -83,76 +115,89 @@ async function scrapeScene({ query }, url, site) { quality: Number(source.getAttribute('res')), })); - release.stars = Number(query.q('.score', true)); - release.likes = Number(query.q('#likecount', true)); + release.stars = query.number('.score'); + release.likes = query.number('#likecount'); const albumLink = query.url('.content-pane-related-links a[href*="gallery"]'); - if (albumLink) { - release.photos = await getPhotos(`${site.url}${albumLink}`); + if (albumLink && include.photos) { + release.photos = await getPhotos(albumLink); } return release; } -function scrapeProfile({ query }, _actorName, origin) { +function scrapeProfile({ query }, avatar) { const profile = {}; - const keys = query.all('.model-profile h5', true); - const values = query.all('.model-profile h5 + p', true); + const keys = query.contents('.model-profile .model-profile-subheading'); + const values = query.contents('.model-profile .model-profile-subheading + p'); const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {}); profile.age = Number(bio.age); - profile.description = query.q('.model-bio', true); + profile.description = query.content('.model-bio'); profile.residencePlace = bio.location; profile.height = heightToCm(bio.height); - [profile.bust, profile.waist, profile.hip] = bio.figure.split('-').map((v) => Number(v) || v); + profile.measurements = bio.figure; - profile.avatar = query.img('.model-profile img'); + const photo = query.img('.model-profile img'); - const releases = query.all('.content-grid-item').filter((el) => /video\//.test(query.url(el, '.img-wrapper a'))); // filter out photos - profile.releases = scrapeAll(query.initAll(releases), null, origin); + // avatar on profile page is different, index avatar preferred + if (avatar?.length > 0) { + profile.avatar = avatar; + profile.photos = [photo]; + } else { + profile.avatar = photo; + } return profile; } -async function fetchLatest(site, page = 1) { - const url = `${site.url}/video/gallery/${(page - 1) * 12}`; - const res = await qu.getAll(url, '.content-grid-item'); - - return res.ok ? scrapeAll(res.items, site) : res.status; -} - -async function fetchUpcoming(site) { - if (site.parameters?.upcoming) { - const url = `${site.url}/video/upcoming`; - const res = await qu.getAll(url, '.content-grid-item'); - - return res.ok ? scrapeAll(res.items, site) : res.status; - } - - return []; -} - -async function fetchProfile({ name: actorName }, { site }) { - const firstLetter = actorName.charAt(0).toLowerCase(); - const origin = slugUrlMap[site.slug] || site.url; +async function findModel(actor, entity) { + const firstLetter = actor.name.charAt(0).toLowerCase(); + const origin = slugUrlMap[entity.slug] || entity.url; const url = `${origin}/model/alpha/${firstLetter}`; - const resModels = await qu.get(url); + const resModels = await unprint.get(url); - if (!resModels.ok) return resModels.status; + if (!resModels.ok) { + return resModels.status; + } - const modelPath = resModels.item.qu.all('.content-grid-item a.title').find((el) => slugify(el.textContent) === slugify(actorName)); + const modelEl = resModels.context.query.all('.content-grid-item').find((el) => slugify(unprint.query.content(el, 'a.title')) === slugify(actor.name)); - if (modelPath) { - const modelUrl = `${origin}${modelPath}`; - const resModel = await qu.get(modelUrl); + if (modelEl) { + const modelUrl = `${origin}${unprint.query.url(modelEl, 'a.title')}`; + const modelAvatar = unprint.query.sourceSet(modelEl, 'a picture img', 'data-srcset'); - return resModel.ok ? scrapeProfile(resModel.item, actorName, origin) : resModel.status; + return { + url: modelUrl, + avatar: modelAvatar, + }; + } + + // try actor URL last in order to grab avatar + if (actor.url) { + return { url: actor.url }; + } + + return null; +} + +async function fetchProfile(actor, { entity }) { + const model = await findModel(actor, entity); + + if (model) { + const resModel = await unprint.get(model.url); + + if (resModel.ok) { + return scrapeProfile(resModel.context, model.avatar); + } + + return resModel.status; } return null; @@ -163,5 +208,4 @@ module.exports = { fetchUpcoming, fetchProfile, scrapeScene, - deprecated: true, }; diff --git a/src/store-releases.js b/src/store-releases.js index 64654c96..b4626fa5 100755 --- a/src/store-releases.js +++ b/src/store-releases.js @@ -437,10 +437,11 @@ async function storeScenes(releases, useBatchId) { title = COALESCE(new.title, releases.title), description = COALESCE(new.description, releases.description), duration = COALESCE(new.duration, releases.duration), + comment = COALESCE(new.comment, releases.comment), deep = new.url IS NOT NULL, updated_at = NOW() FROM json_to_recordset(:scenes) - AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, deep boolean) + AS new(id int, url text, date timestamptz, entity json, title text, description text, duration integer, comment text, deep boolean) WHERE releases.id = new.id `, { scenes: JSON.stringify(curatedDuplicateReleases), diff --git a/src/utils/http.js b/src/utils/http.js index 4965261b..827315db 100755 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -175,7 +175,7 @@ async function getBrowserSession(identifier, options = {}) { const newBrowser = await puppeteer.launch({ headless: typeof options.headless === 'undefined' ? 'new' : options.headless, args: [ - ...(options.useProxy ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []), + ...(options.useProxy && config.proxy.enabled ? [`--proxy-server=${config.proxy.host}:${config.proxy.port}`] : []), ], // headless: false, });