From 23f76fd1bef57c8523f49f32347f7e336ce81b42 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Mon, 30 Dec 2024 01:57:26 +0100 Subject: [PATCH] Added sparse date mode. Fixed profile fetch error catching. Updated Kelly Madison scraper, using site IDs and fixed profile scraper. --- config/default.js | 1 + seeds/02_sites.js | 20 ++++++++++-- src/actors.js | 2 +- src/app.js | 2 ++ src/argv.js | 6 ++++ src/redis.js | 2 +- src/scrapers/kellymadison.js | 63 +++++++++++++++++++++++------------- src/scrapers/scrapers.js | 1 + src/updates.js | 6 ++-- 9 files changed, 73 insertions(+), 30 deletions(-) diff --git a/config/default.js b/config/default.js index 1b18379c..2e45b9b8 100755 --- a/config/default.js +++ b/config/default.js @@ -273,6 +273,7 @@ module.exports = { 'topwebmodels', 'pascalssubsluts', 'kellymadison', + '5kporn', 'private', 'bangbros', 'hitzefrei', diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 4ec6f829..42bf306d 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -5281,23 +5281,30 @@ const sites = [ name: 'Teen Fidelity', alias: ['tf'], url: 'https://www.teenfidelity.com', - description: 'Home of Kelly Madison and Ryan Madison', parent: 'kellymadison', + parameters: { + siteId: 3, + }, }, { slug: 'pornfidelity', name: 'Porn Fidelity', alias: ['pf'], url: 'https://www.pornfidelity.com', - description: 'Home of Kelly Madison and Ryan Madison', parent: 'kellymadison', + parameters: { + siteId: 2, + }, }, { slug: 'kellymadison', name: 'Kelly Madison', - url: 'https://www.pornfidelity.com', + url: 'https://www.kellymadison.com', description: 'Home of Kelly Madison and Ryan Madison', parent: 'kellymadison', + parameters: { + siteId: 1, + }, }, { slug: '5kporn', @@ -5305,6 +5312,10 @@ const sites = [ url: 'https://www.5kporn.com', tags: ['5k'], parent: 'kellymadison', + parameters: { + // IDs overlap with Fidelity sites + siteId: 1, + }, }, { slug: '5kteens', @@ -5312,6 +5323,9 @@ const sites = [ url: 'https://www.5kteens.com', tags: ['5k'], parent: 'kellymadison', + parameters: { + siteId: 2, + }, }, // KILLERGRAM { diff --git a/src/actors.js b/src/actors.js index 3507003c..a88a0cf7 100755 --- a/src/actors.js +++ b/src/actors.js @@ -614,7 +614,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy const profiles = Promise.map(validSources, async (source) => { try { // config may group sources to try until success - return [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => { + return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => { try { const entity = entitiesBySlug[scraperSlug] || null; diff --git a/src/app.js b/src/app.js index 882fe078..466a7e85 100755 --- a/src/app.js +++ b/src/app.js @@ -108,6 +108,8 @@ async function startMemorySample(snapshotTriggers = []) { async function init() { try { + await redis.connect(); + if (argv.server) { await initServer(); return; diff --git a/src/argv.js b/src/argv.js index 8a861306..31ac198d 100755 --- a/src/argv.js +++ b/src/argv.js @@ -174,6 +174,12 @@ const { argv } = yargs default: config.upcomingMissingDateLimit, alias: ['upcoming-null-date-limit'], }) + .option('filter-sparse-dates', { + describe: 'If some but not all scenes have dates, filter out scenes without dates, instead of using missing date limit.', + type: 'boolean', + default: false, + alias: ['sparse'], + }) .option('page', { describe: 'Page to start scraping at', type: 'number', diff --git a/src/redis.js b/src/redis.js index ef932be8..f5f290b4 100644 --- a/src/redis.js +++ b/src/redis.js @@ -10,6 +10,6 @@ const redisClient = redis.createClient({ socket: config.redis, }); -redisClient.connect(); +// redisClient.connect(); module.exports = redisClient; diff --git a/src/scrapers/kellymadison.js b/src/scrapers/kellymadison.js index ce762747..2f52bc46 100755 --- a/src/scrapers/kellymadison.js +++ b/src/scrapers/kellymadison.js @@ -1,9 +1,11 @@ 'use strict'; +const unprint = require('unprint'); + const slugify = require('../utils/slugify'); const qu = require('../utils/qu'); const http = require('../utils/http'); -const { feetInchesToCm } = require('../utils/convert'); +const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert'); const siteMapByKey = { PF: 'pornfidelity', @@ -16,14 +18,11 @@ const siteMapByKey = { const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {}); function scrapeLatest(scenes, site) { - return scenes.reduce((acc, { query }) => { + return scenes.map(({ query }) => { const release = {}; release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true); - const siteId = release.shootId.match(/\d?\w{2}/)[0]; - const siteSlug = siteMapByKey[siteId]; - const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a')); [release.entryId] = pathname.match(/\d+$/); release.url = `${site.url}${pathname}`; @@ -47,15 +46,19 @@ function scrapeLatest(scenes, site) { }; } + /* using site ID, filter no longer needed + const siteId = release.shootId.match(/\d?\w{2}/)[0]; + const siteSlug = siteMapByKey[siteId]; + if (site.slug !== siteSlug) { // using generic network overview, scene is not from the site we want return { ...acc, unextracted: [...acc.unextracted, release] }; } return { ...acc, scenes: [...acc.scenes, release] }; - }, { - scenes: [], - unextracted: [], + */ + + return release; }); } @@ -114,34 +117,47 @@ async function scrapeScene({ query, html }, url, baseRelease, channel, session) })); } - console.log(release); - return release; } function scrapeProfile({ query }) { const profile = {}; - const bioKeys = query.all('table.table td:nth-child(1)', true); - const bioValues = query.all('table.table td:nth-child(2)', true); - const bio = bioKeys.reduce((acc, key, index) => ({ ...acc, [key.slice(0, -1)]: bioValues[index] }), {}); + const bioKeys = query.contents('table.table td:nth-child(1), table.table th'); + const bioValues = query.contents('table.table td:nth-child(2)'); - if (bio.Ethnicity) profile.ethnicity = bio.Ethnicity; - if (bio.Measurements) [profile.bust, profile.waist, profile.hip] = bio.Measurements.split('-'); - if (bio.Birthplace) profile.birthPlace = bio.Birthplace; + const bio = bioKeys.reduce((acc, key, index) => ({ + ...acc, + [slugify(key, '_')]: bioValues[index], + }), {}); - if (bio.Height) { - const [feet, inches] = bio.Height.match(/\d+/g); + if (bio.ethnicity) profile.ethnicity = bio.ethnicity; + if (bio.measurements) profile.measurements = bio.measurements; + if (bio.birthplace) profile.birthPlace = bio.birthplace; + if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size); + + if (bio.height) { + const [feet, inches] = bio.height.match(/\d+/g); profile.height = feetInchesToCm(feet, inches); } - profile.avatar = query.img('img[src*="model"]'); + if (bio.birthday) { + const [month, day] = bio.birthday.split('/'); + const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day))); + + birthday.setUTCFullYear(0); // indicate birth year is unknown + + profile.dateOfBirth = new Date(birthday); + } + + profile.avatar = query.img('img[src*="model"][src*="headshot"]'); + profile.photos = query.imgs('img[src*="model"][src*="thumb_image"], img[src*="model"][src*="bg_image"]'); return profile; } async function fetchLatest(channel, page = 1) { - const url = `${channel.url}/episodes/search?page=${page}`; // TLS issues with teenfidelity.com, same overview on all sites + const url = `${channel.url}/episodes/search?page=${page}&site=${channel.parameters.siteId || ''}`; // TLS issues with teenfidelity.com, same overview on all sites const res = await http.get(url, { headers: { 'X-Requested-With': 'XMLHttpRequest', @@ -165,16 +181,17 @@ async function fetchScene(url, channel, baseRelease) { return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status; } -async function fetchProfile({ name: actorName }) { +async function fetchProfile({ name: actorName }, { entity }) { const actorSlug = slugify(actorName); - const res = await qu.get(`https://www.kellymadison.com/models/${actorSlug}`, null, { + + const res = await unprint.get(`${entity.url}/models/${actorSlug}`, { headers: { 'X-Requested-With': 'XMLHttpRequest', }, }); if (res.ok) { - return scrapeProfile(res.item); + return scrapeProfile(res.context); } return res.status; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index d02993a1..c39c7751 100755 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -265,6 +265,7 @@ const scrapers = { julesjordan, karups, kellymadison, + '5kporn': kellymadison, killergram, kink, kinkmen: kink, diff --git a/src/updates.js b/src/updates.js index d5c2b8bc..181db656 100755 --- a/src/updates.js +++ b/src/updates.js @@ -147,10 +147,12 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) { const releases = await scrapeReleasesPage(argv.page || 1, []); - const hasDates = releases.every((release) => !!release.date); + const hasDates = argv.filterSparseDates + ? releases.some((release) => !!release.date) + : releases.every((release) => !!release.date); const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0))) - || (hasDates && releases.filter((release) => moment(release.date).isAfter(argv.after))) + || (hasDates && releases.filter((release) => release.date && moment(release.date).isAfter(argv.after))) || releases.slice(0, Math.max(isUpcoming ? argv.upcomingMissingDateLimit : argv.missingDateLimit, 0)); const { uniqueReleases, duplicateReleases } = argv.force