Added sparse date mode. Fixed profile fetch error catching. Updated the Kelly Madison scraper to use site IDs and fixed its profile scraper.

DebaucheryLibrarian 2024-12-30 01:57:26 +01:00
parent ffea90b5e8
commit 23f76fd1be
9 changed files with 73 additions and 30 deletions


@@ -273,6 +273,7 @@ module.exports = {
       'topwebmodels',
       'pascalssubsluts',
       'kellymadison',
+      '5kporn',
       'private',
       'bangbros',
       'hitzefrei',


@@ -5281,23 +5281,30 @@ const sites = [
     name: 'Teen Fidelity',
     alias: ['tf'],
     url: 'https://www.teenfidelity.com',
-    description: 'Home of Kelly Madison and Ryan Madison',
     parent: 'kellymadison',
+    parameters: {
+      siteId: 3,
+    },
   },
   {
     slug: 'pornfidelity',
     name: 'Porn Fidelity',
     alias: ['pf'],
     url: 'https://www.pornfidelity.com',
-    description: 'Home of Kelly Madison and Ryan Madison',
     parent: 'kellymadison',
+    parameters: {
+      siteId: 2,
+    },
   },
   {
     slug: 'kellymadison',
     name: 'Kelly Madison',
-    url: 'https://www.pornfidelity.com',
+    url: 'https://www.kellymadison.com',
     description: 'Home of Kelly Madison and Ryan Madison',
     parent: 'kellymadison',
+    parameters: {
+      siteId: 1,
+    },
   },
   {
     slug: '5kporn',
@@ -5305,6 +5312,10 @@ const sites = [
     url: 'https://www.5kporn.com',
     tags: ['5k'],
     parent: 'kellymadison',
+    parameters: {
+      // IDs overlap with Fidelity sites
+      siteId: 1,
+    },
   },
   {
     slug: '5kteens',
@@ -5312,6 +5323,9 @@ const sites = [
     url: 'https://www.5kteens.com',
     tags: ['5k'],
     parent: 'kellymadison',
+    parameters: {
+      siteId: 2,
+    },
   },
   // KILLERGRAM
   {


@@ -614,7 +614,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
   const profiles = Promise.map(validSources, async (source) => {
     try {
       // config may group sources to try until success
-      return [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
+      return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
        try {
          const entity = entitiesBySlug[scraperSlug] || null;
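The added await is the "profile fetch error catching" fix from the commit message: returning the chained promise without awaiting it hands its eventual rejection straight to the caller, bypassing the surrounding try/catch, whereas return await lets that catch handle a failed source chain. A minimal standalone sketch of the difference, not traxxx code:

async function withoutAwait() {
  try {
    return Promise.reject(new Error('profile fetch failed')); // rejection is not caught here
  } catch (error) {
    return null; // never reached
  }
}

async function withAwait() {
  try {
    return await Promise.reject(new Error('profile fetch failed'));
  } catch (error) {
    return null; // rejection is caught, a fallback value is returned
  }
}

withoutAwait().catch((error) => console.log('escaped:', error.message)); // escaped: profile fetch failed
withAwait().then((value) => console.log('caught, returned:', value)); // caught, returned: null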


@@ -108,6 +108,8 @@ async function startMemorySample(snapshotTriggers = []) {
 async function init() {
   try {
+    await redis.connect();
     if (argv.server) {
       await initServer();
       return;


@@ -174,6 +174,12 @@ const { argv } = yargs
     default: config.upcomingMissingDateLimit,
     alias: ['upcoming-null-date-limit'],
   })
+  .option('filter-sparse-dates', {
+    describe: 'If some but not all scenes have dates, filter out scenes without dates, instead of using missing date limit.',
+    type: 'boolean',
+    default: false,
+    alias: ['sparse'],
+  })
   .option('page', {
     describe: 'Page to start scraping at',
     type: 'number',
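On the command line the new option is passed as --filter-sparse-dates, or via its alias --sparse, and yargs exposes it to the rest of the code as argv.filterSparseDates. A hypothetical invocation; the entry point and the --after flag are assumptions, only --filter-sparse-dates and --sparse come from this commit:

  node src/app.js --filter-sparse-dates --after 2024-12-01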


@@ -10,6 +10,6 @@ const redisClient = redis.createClient({
   socket: config.redis,
 });
-redisClient.connect();
+// redisClient.connect();
 module.exports = redisClient;


@@ -1,9 +1,11 @@
 'use strict';
+const unprint = require('unprint');
 const slugify = require('../utils/slugify');
 const qu = require('../utils/qu');
 const http = require('../utils/http');
-const { feetInchesToCm } = require('../utils/convert');
+const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert');
 const siteMapByKey = {
   PF: 'pornfidelity',
@@ -16,14 +18,11 @@ const siteMapByKey = {
 const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {});
 function scrapeLatest(scenes, site) {
-  return scenes.reduce((acc, { query }) => {
+  return scenes.map(({ query }) => {
     const release = {};
     release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
-    const siteId = release.shootId.match(/\d?\w{2}/)[0];
-    const siteSlug = siteMapByKey[siteId];
     const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
     [release.entryId] = pathname.match(/\d+$/);
     release.url = `${site.url}${pathname}`;
@@ -47,15 +46,19 @@ function scrapeLatest(scenes, site) {
       };
     }
+    /* using site ID, filter no longer needed
+    const siteId = release.shootId.match(/\d?\w{2}/)[0];
+    const siteSlug = siteMapByKey[siteId];
     if (site.slug !== siteSlug) {
       // using generic network overview, scene is not from the site we want
       return { ...acc, unextracted: [...acc.unextracted, release] };
     }
     return { ...acc, scenes: [...acc.scenes, release] };
-  }, {
-    scenes: [],
-    unextracted: [],
+    */
+    return release;
   });
 }
@@ -114,34 +117,47 @@ async function scrapeScene({ query, html }, url, baseRelease, channel, session)
     }));
   }
+  console.log(release);
   return release;
 }
 function scrapeProfile({ query }) {
   const profile = {};
-  const bioKeys = query.all('table.table td:nth-child(1)', true);
-  const bioValues = query.all('table.table td:nth-child(2)', true);
-  const bio = bioKeys.reduce((acc, key, index) => ({ ...acc, [key.slice(0, -1)]: bioValues[index] }), {});
-  if (bio.Ethnicity) profile.ethnicity = bio.Ethnicity;
-  if (bio.Measurements) [profile.bust, profile.waist, profile.hip] = bio.Measurements.split('-');
-  if (bio.Birthplace) profile.birthPlace = bio.Birthplace;
-  if (bio.Height) {
-    const [feet, inches] = bio.Height.match(/\d+/g);
+  const bioKeys = query.contents('table.table td:nth-child(1), table.table th');
+  const bioValues = query.contents('table.table td:nth-child(2)');
+  const bio = bioKeys.reduce((acc, key, index) => ({
+    ...acc,
+    [slugify(key, '_')]: bioValues[index],
+  }), {});
+  if (bio.ethnicity) profile.ethnicity = bio.ethnicity;
+  if (bio.measurements) profile.measurements = bio.measurements;
+  if (bio.birthplace) profile.birthPlace = bio.birthplace;
+  if (bio.shoe_size) profile.foot = femaleFeetUsToEu(bio.shoe_size);
+  if (bio.height) {
+    const [feet, inches] = bio.height.match(/\d+/g);
     profile.height = feetInchesToCm(feet, inches);
   }
-  profile.avatar = query.img('img[src*="model"]');
+  if (bio.birthday) {
+    const [month, day] = bio.birthday.split('/');
+    const birthday = new Date(Date.UTC(0, Number(month) - 1, Number(day)));
+    birthday.setUTCFullYear(0); // indicate birth year is unknown
+    profile.dateOfBirth = new Date(birthday);
+  }
+  profile.avatar = query.img('img[src*="model"][src*="headshot"]');
+  profile.photos = query.imgs('img[src*="model"][src*="thumb_image"], img[src*="model"][src*="bg_image"]');
   return profile;
 }
 async function fetchLatest(channel, page = 1) {
-  const url = `${channel.url}/episodes/search?page=${page}`; // TLS issues with teenfidelity.com, same overview on all sites
+  const url = `${channel.url}/episodes/search?page=${page}&site=${channel.parameters.siteId || ''}`; // TLS issues with teenfidelity.com, same overview on all sites
   const res = await http.get(url, {
     headers: {
       'X-Requested-With': 'XMLHttpRequest',
@@ -165,16 +181,17 @@ async function fetchScene(url, channel, baseRelease) {
   return res.ok ? scrapeScene(res.item, url, baseRelease, channel, session) : res.status;
 }
-async function fetchProfile({ name: actorName }) {
+async function fetchProfile({ name: actorName }, { entity }) {
   const actorSlug = slugify(actorName);
-  const res = await qu.get(`https://www.kellymadison.com/models/${actorSlug}`, null, {
+  const res = await unprint.get(`${entity.url}/models/${actorSlug}`, {
     headers: {
       'X-Requested-With': 'XMLHttpRequest',
     },
   });
   if (res.ok) {
-    return scrapeProfile(res.item);
+    return scrapeProfile(res.context);
   }
   return res.status;


@@ -265,6 +265,7 @@ const scrapers = {
   julesjordan,
   karups,
   kellymadison,
+  '5kporn': kellymadison,
   killergram,
   kink,
   kinkmen: kink,


@@ -147,10 +147,12 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
   const releases = await scrapeReleasesPage(argv.page || 1, []);
-  const hasDates = releases.every((release) => !!release.date);
+  const hasDates = argv.filterSparseDates
+    ? releases.some((release) => !!release.date)
+    : releases.every((release) => !!release.date);
   const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
-    || (hasDates && releases.filter((release) => moment(release.date).isAfter(argv.after)))
+    || (hasDates && releases.filter((release) => release.date && moment(release.date).isAfter(argv.after)))
     || releases.slice(0, Math.max(isUpcoming ? argv.upcomingMissingDateLimit : argv.missingDateLimit, 0));
   const { uniqueReleases, duplicateReleases } = argv.force
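Taken together with the new --filter-sparse-dates flag, this changes how a listing page with dates on only some scenes is handled: in sparse mode a single dated scene is enough to switch to date filtering, and the added release.date guard then drops the undated scenes instead of passing them to moment(). A standalone sketch of the resulting selection logic; the function and parameter names are illustrative stand-ins for the argv values, the argv.last branch is omitted, and a plain Date comparison stands in for moment:

function limitReleases(releases, { filterSparseDates, after, missingDateLimit }) {
  // sparse mode: any dated scene enables date filtering; default mode requires all scenes to be dated
  const hasDates = filterSparseDates
    ? releases.some((release) => !!release.date)
    : releases.every((release) => !!release.date);

  if (hasDates) {
    // undated scenes are dropped by the release.date guard
    return releases.filter((release) => release.date && new Date(release.date) > new Date(after));
  }

  // no usable dates at all: fall back to the missing-date limit
  return releases.slice(0, Math.max(missingDateLimit, 0));
}

// example: sparse mode keeps the one scene that is both dated and newer than `after`
const releases = [
  { title: 'A', date: '2024-12-01' },
  { title: 'B' },
  { title: 'C', date: '2024-11-01' },
];

console.log(limitReleases(releases, { filterSparseDates: true, after: '2024-11-15', missingDateLimit: 5 }));
// -> [ { title: 'A', date: '2024-12-01' } ]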