diff --git a/package-lock.json b/package-lock.json index cc2e6aaf..80b85fd8 100644 --- a/package-lock.json +++ b/package-lock.json @@ -90,7 +90,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.16.1", + "unprint": "^0.16.3", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -18376,9 +18376,10 @@ } }, "node_modules/unprint": { - "version": "0.16.1", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.1.tgz", - "integrity": "sha512-vOT6kdoZwVae9iHS5H+eBOqTZaVJRJWrBJrfnAEIzqPO8KseFvajd+kLZSL9iCE6Al5S0hi2TuMW89c8YK3Baw==", + "version": "0.16.3", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.16.3.tgz", + "integrity": "sha512-PnToOzQhneDFzf4FzOVQciWWtFTk/Xx7ZkngM+S8n8wfeRfOH7YiYa4EhbD6ZJdEcR2xfVRtlMl3w2fI7nRgPw==", + "license": "ISC", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", diff --git a/package.json b/package.json index 3cd712ec..48313b0f 100755 --- a/package.json +++ b/package.json @@ -149,7 +149,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.16.1", + "unprint": "^0.16.3", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/seeds/00_tags.js b/seeds/00_tags.js index 53269495..9aa6ad64 100755 --- a/seeds/00_tags.js +++ b/seeds/00_tags.js @@ -324,6 +324,14 @@ const tags = [ name: 'choking', slug: 'choking', }, + { + name: 'condom', + slug: 'condom', + }, + { + name: 'no condom', + slug: 'no-condom', + }, { name: 'corporal punishment', slug: 'corporal-punishment', @@ -1645,6 +1653,10 @@ const aliases = [ for: 'enhanced-boobs', secondary: true, }, + { + name: 'implants', + for: 'enhanced-boobs', + }, { name: 'boob job', for: 'titty-fucking', diff --git a/seeds/01_networks.js b/seeds/01_networks.js index 3f40614a..33e878dc 100755 --- a/seeds/01_networks.js +++ b/seeds/01_networks.js @@ -114,7 +114,6 @@ const networks = [ name: '5K Vids', url: 'https://www.5kvids.com', parameters: { - // layout: 'api', apiKey: 'fiveKCash', apiAddress: 'https://www.8kmilfs.com/api', }, @@ -464,7 +463,6 @@ const networks = [ url: 'https://www.kellymadison.com', description: 'Home of Kelly Madison and Ryan Madison', parameters: { - // layout: 'api', apiKey: 'kellyCash', apiAddress: 'https://www.pornfidelity.com/api', }, diff --git a/seeds/02_sites.js b/seeds/02_sites.js index ec039937..373f3430 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -5329,6 +5329,7 @@ const sites = [ siteId: 3, // older scene pages are only available on PF, even though they are categorized on TS or KM archive: 'https://www.pornfidelity.com', + short: 'TF', }, }, { @@ -5339,6 +5340,7 @@ const sites = [ parent: 'kellymadison', parameters: { siteId: 2, + short: 'PF', }, }, { @@ -5350,6 +5352,7 @@ const sites = [ parameters: { siteId: 1, archive: 'https://www.pornfidelity.com', + short: 'KM', }, }, { @@ -5360,6 +5363,7 @@ const sites = [ parent: '5kvids', parameters: { siteId: 1, + short: '5KP', }, }, { @@ -5370,6 +5374,7 @@ const sites = [ parent: '5kvids', parameters: { siteId: 2, + short: '5KT', }, }, { @@ -5379,6 +5384,7 @@ const sites = [ parent: '5kvids', parameters: { siteId: 3, + short: '8KM', }, }, { @@ -5388,6 +5394,7 @@ const sites = [ parent: '5kvids', parameters: { siteId: 4, + short: '8KT', }, }, // KILLERGRAM diff --git a/seeds/06_affiliates.js b/seeds/06_affiliates.js index 17be4fe5..613c01c5 100755 --- a/seeds/06_affiliates.js +++ b/seeds/06_affiliates.js @@ -57,6 +57,13 @@ const affiliates = [ parameters: 'nats=OTczLjEuMy4zLjAuMC4wLjAuMA', comment: '50% rev share', }, + { + id: 'evilangel', + network: 'evilangel', + url: 'https://www.g2fame.com/evilangel/go.php?pr=8&su=2&si=128&ad=277470&pa=index&ar=&buffer=', + parameters: 'nats=OTczLjEuMy4zLjAuMC4wLjAuMA', + comment: '50% rev share', + }, { id: '_kellymadison', network: 'kellymadison', diff --git a/src/scrapers/kellymadison.js b/src/scrapers/kellymadison.js index 6f0db962..2c2e03b8 100755 --- a/src/scrapers/kellymadison.js +++ b/src/scrapers/kellymadison.js @@ -2,86 +2,81 @@ const config = require('config'); const unprint = require('unprint'); -const { parse } = require('csv-parse/sync'); +// const { parse } = require('csv-parse/sync'); const slugify = require('../utils/slugify'); -const qu = require('../utils/qu'); const http = require('../utils/http'); const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert'); -const siteMapByKey = { - PF: 'pornfidelity', - TF: 'teenfidelity', - KM: 'kellymadison', - '5KP': '5kporn', - '5KT': '5kteens', +const thumbKeyRegex = /(thumb\d+_url)|(episode_thumb_image_\d+_url)/; + +const qualityMap = { + '480p': 480, + mobile: 720, // as of recent, might've been lower in the past + '720p': 720, + '1080p': 1080, + '2k': 1440, + '4k': 2160, + '5k': 2280, + '8k': 4320, }; -const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {}); +function scrapeSceneApi(data, channel) { + const release = {}; -function scrapeLatest(scenes, site) { - return scenes.map(({ query }) => { - const release = {}; + release.entryId = data.id; - release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true); + if (data.url) { + // provided URL works but always points to 8KMilfs instead of dedicated site + const { pathname } = new URL(data.url); - const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a')); - [release.entryId] = pathname.match(/\d+$/); - - release.title = query.cnt('h5 a, .ep-title a, .title a'); - - release.date = query.date('.card-meta .text-left, .row .col-4:first-child, .card-footer-item:first-child', ['MMM D', 'MMM D, YYYY'], /\w+ \d+(, \w+)?/); - release.actors = query.cnts('.models a, .ep-models a, a[href*="models/"]'); - - // older scenes do not have a working scene page on their native site, but they (often, not always) do on Porn Fidelity - // scenes older than year do not show a date; this is not when the URLs stop working, but it's a rough guideline - release.url = site.parameters.archive && !release.date - ? `${site.parameters.archive}${pathname}` - : `${site.url}${pathname}`; - - release.duration = query.dur('.content a'); - - const duration = query.cnt('.content a, .ep-runtime strong, .subtitle:last-child a')?.match(/(\d+) min/)?.[1]; - if (duration) release.duration = Number(duration) * 60; - - if (query.exists('.episodes-preview')) { - [release.poster, ...release.photos] = query.imgs('.episodes-preview img'); - } else { - release.poster = query.img('.card-img-top, .image img'); - release.teaser = { - src: query.video('video'), - }; - } - - /* using site ID, filter no longer needed - const siteId = release.shootId.match(/\d?\w{2}/)[0]; - const siteSlug = siteMapByKey[siteId]; - - if (site.slug !== siteSlug) { - // using generic network overview, scene is not from the site we want - return { ...acc, unextracted: [...acc.unextracted, release] }; - } - - return { ...acc, scenes: [...acc.scenes, release] }; - */ - - return release; - }); -} - -async function fetchLatest(channel, page = 1) { - const url = `${channel.url}/episodes/search?page=${page}&site=${channel.parameters.siteId || ''}`; // TLS issues with teenfidelity.com, same overview on all sites - const res = await http.get(url, { - headers: { - 'X-Requested-With': 'XMLHttpRequest', - }, - }); - - if (res.ok && res.body.status === 'success') { - return scrapeLatest(qu.extractAll(res.body.html, '.episode, .ep'), channel); + release.url = unprint.prefixUrl(pathname, channel.url); } - return res.status; + if (channel.parameters.short && data.sequence_number) { + release.shootId = `${channel.parameters.short} #${data.sequence_number}`; + } + + release.title = data.title; + release.description = data.short_description; + + release.date = new Date(data.publish_on); + + if (data.fullEpisodeLength) { + release.duration = data.fullEpisodeLength; + } else if (data.full_episode_minutes) { + // full_episode_seconds is always available so far, but no need to count on it + release.duration = (data.full_episode_minutes + (data.full_episode_seconds || 0)) * 60; + } + + release.actors = data.models.map((model) => ({ + name: model.name, + gender: model.sex?.toLowerCase(), + url: unprint.prefixUrl(`/models/${model.slug}`, channel.url), + })); + + release.poster = data.thumb_url || data.thumb_image_url; + + release.photos = [ + data.poster_image_url, + ...Object.entries(data).filter(([key]) => thumbKeyRegex.test(key)).map(([_key, url]) => url), + ].filter(Boolean); // photo thumbs include poster, don't filter here but in client + + const trailers = data.trailerVideos || data.trailer; + + if (trailers) { + release.trailer = Object.entries(trailers) + .filter(([key, trailer]) => !key.toLowerCase().includes('_sfw') && !trailer.url?.toLowerCase().includes('_sfw')) + .map(([_key, trailer]) => ({ + src: trailer.url, + quality: qualityMap[trailer.resolution?.toLowerCase()] || null, + })); + } + + release.tags = data.categories.map((category) => category.name); + release.photoCount = data.photosetPhotoCount || data.episode_photoset_photo_count; + + return release; } async function fetchLatestApi(channel, page = 1, { parameters }) { @@ -92,126 +87,112 @@ async function fetchLatestApi(channel, page = 1, { parameters }) { }, }); - console.log(res.body.data[1]); - if (res.ok) { - const data = parse(res.body, { - columns: true, - skip_empty_lines: true, - }); - - console.log(data); - - return null; + return res.body.data.map((data) => scrapeSceneApi(data, channel)); } return res.status; } -async function scrapeScene({ query, html }, url, baseRelease, channel, session) { - const { pathname } = new URL(url); - const release = {}; +/* not practical via API, updates endpoint contains all necessary data +async function fetchSceneApi(url, entity, baseRelease, { parameters }) { + // const episodeId = new URL(url).pathname.match(/\/episodes\/\w+\/(\d+)/)?.[1]; + const episodeId = new URL(url).pathname.match(/\/episodes\/(\d+)/)?.[1]; - [release.entryId] = pathname.match(/\d+$/); - - const titleString = query.cnt('.card-header.row h4, .trailer-starring span, .level-left .level-item'); - const episode = titleString?.match(/#\d+$/)?.[0]; - - release.title = query.cnt('.trailer-title') || titleString?.match(/(?:Trailer: )?(.+) -/)?.[1]; - release.channel = slugify(titleString?.match(/([\w\s]+) #\d+$/)?.[1], ''); - - const siteKey = siteMapBySlug[release.channel]; - - release.shootId = `${siteKey} ${episode}`; - release.description = query.text('p.card-text, h5.heavy + p, .card-content .is-three-fifths'); - - // order not reliable, get keys - const detailElsByKey = query.all('.card-body h4.card-title, .video-summary h5, .columns li').reduce((acc, rowEl) => ({ - ...acc, - [slugify(rowEl?.textContent.match(/(\w+):/)?.[1])]: rowEl, - }), {}); - - release.date = query.date(detailElsByKey.published, null, 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/); - release.duration = query.dur(detailElsByKey.episode); - release.actors = query.cnts(detailElsByKey.starring, 'a'); - - const posterPrefix = html.indexOf('poster:'); - const poster = query.img('.trailer-poster') || html.slice(html.indexOf('http', posterPrefix), html.indexOf('.jpg', posterPrefix) + 4); - - if (poster) { - if (baseRelease?.poster) { - release.photos = [poster, ...(baseRelease.photos || [])]; - } else { - release.poster = poster; - } + if (!episodeId) { + return null; } - // const token = query.meta('name=_token'); - // const trailerInfoUrl = `${channel.url}/episodes/trailer/sources/${release.entryId}?type=trailer`; - const trailerInfoUrl = html.match(/'(http.*\/trailer\/sources.*)'/)?.[1]; - - if (trailerInfoUrl) { - const trailerInfoRes = await http.post(trailerInfoUrl, null, { session }); - - if (trailerInfoRes.ok && trailerInfoRes.body.sources?.length > 0) { - release.trailer = trailerInfoRes.body.sources.map((trailer) => ({ - src: trailer.src, - type: trailer.type, - /* unreliable, sometimes actual video is 720p - quality: trailer.res - .replace(4000, 2160) - .replace(5000, 2880), - */ - })); - } - } - - return release; -} - -async function fetchScene(url, channel, baseRelease) { - const session = http.session(); - - const res = await qu.get(url, null, { - 'X-Requested-With': 'XMLHttpRequest', - }, { - session, - followRedirects: false, // redirects to sign-up page if scene not found + // JSON API doesn't return poster images, CSV API doesn't have pagination. UPDATE: requested and received both, yet to test + const res = await http.get(`${parameters.apiAddress}/affiliates/episodes/${episodeId}`, { + headers: { + Authorization: `Bearer ${config.apiKeys[parameters.apiKey]}`, + }, }); - return res.ok - ? scrapeScene(res.item, url, baseRelease, channel, session) - : res.status; + console.log(res.body); + + return; + + if (res.ok) { + return scrapeSceneApi(res.body.data, entity); + } + + return res.status; +} +*/ + +function composeBio(bioKeys, bioValues) { + return bioKeys.reduce((acc, key, index) => ({ + ...acc, + [slugify(key, '_')]: bioValues[index], + }), {}); +} + +function getBio(query) { + // Kelly Madison, Fidelity + if (query.exists('.profile-stats')) { + const bioKeys = query.contents('.profile-stats li strong'); + const bioValues = query.texts('.profile-stats li'); + + return composeBio(bioKeys, bioValues); + } + + // 8K + if (query.exists('//h4[contains(text(), "Stats")]')) { + const bioKeys = query.contents('(//h4[contains(text(), "Stats")])[1]//following-sibling::div//strong'); + const bioValues = query.contents('(//h4[contains(text(), "Stats")])[1]//following-sibling::div//p/text()'); + + return composeBio(bioKeys, bioValues); + } + + // 5K + if (query.exists('.bio-overlay-1')) { + const bioKeys = query.contents('.bio-overlay-1 td:first-child'); + const bioValues = query.contents('.bio-overlay-1 td:last-child'); + + return composeBio(bioKeys, bioValues); + } + + return null; } function scrapeProfile({ query }) { const profile = {}; + const bio = getBio(query); - const bioKeys = query.contents('table.table td:nth-child(1), table.table th'); - const bioValues = query.contents('table.table td:nth-child(2)'); + const questions = query.contents('.model-faq .content-body .accordion-header, .card .card-header button'); + const answers = query.contents('.model-faq .content-body .accordion-body, .card .collapse .card-body'); - const bio = bioKeys.reduce((acc, key, index) => ({ - ...acc, - [slugify(key, '_')]: bioValues[index], - }), {}); - - if (bio.ethnicity) profile.ethnicity = bio.ethnicity; - if (bio.measurements) profile.measurements = bio.measurements; - if (bio.birthplace) profile.birthPlace = bio.birthplace; - if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size); - - if (bio.height) { - const [feet, inches] = bio.height.match(/\d+/g); - profile.height = feetInchesToCm(feet, inches); + if (questions.length > 0 && questions.length === answers.length) { + profile.description = questions.map((question, index) => `**${question}**\n${answers[index]}`).join('\n'); } - if (bio.birthday) { - const [month, day] = bio.birthday.split('/'); - const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day))); + if (bio) { + if (bio.ethnicity) profile.ethnicity = bio.ethnicity; + if (bio.measurements) profile.measurements = bio.measurements; + if (bio.birthplace) profile.birthPlace = bio.birthplace; + if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size); - birthday.setUTCFullYear(0); // indicate birth year is unknown + if (bio.height) { + const [feet, inches] = bio.height.match(/\d+/g); + profile.height = feetInchesToCm(feet, inches); + } - profile.dateOfBirth = new Date(birthday); + if (bio.age) profile.age = Number(bio.age); + + if (bio.birthday) { + const [month, day] = bio.birthday.split('/'); + const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day))); + + if (profile.age) { + birthday.setUTCFullYear(new Date().getFullYear() - profile.age); // indicate birth year is unknown + } else { + birthday.setUTCFullYear(0); // indicate birth year is unknown + } + + profile.dateOfBirth = new Date(birthday); + } } profile.avatar = query.img('img[src*="model"][src*="headshot"]'); @@ -223,7 +204,8 @@ function scrapeProfile({ query }) { async function fetchProfile({ name: actorName }, { entity }) { const actorSlug = slugify(actorName); - const res = await unprint.get(`${entity.url}/models/${actorSlug}`, { + // 8K sites don't have avatar or interview on model page, always use 5K site + const res = await unprint.get(`${entity.slug === '5kvids' ? 'https://www.5kporn.com' : entity.url}/models/${actorSlug}`, { headers: { 'X-Requested-With': 'XMLHttpRequest', }, @@ -237,11 +219,6 @@ async function fetchProfile({ name: actorName }, { entity }) { } module.exports = { - fetchLatest, + fetchLatest: fetchLatestApi, fetchProfile, - fetchScene, - api: { - fetchLatest: fetchLatestApi, - // fetchScene, fetchSceneApi, - }, };