Added profile scraper tests (WIP), fixed some profile scrapers. Fixed slugify breaking existing slugs.

This commit is contained in:
DebaucheryLibrarian
2026-01-10 02:58:50 +01:00
parent 5acc2c607b
commit bddc33a734
12 changed files with 293 additions and 111 deletions

View File

@@ -216,7 +216,8 @@ function getUrl(site) {
}
async function getSession(site, parameters, url) {
if (site.slug === 'mindgeek' || site.parameters?.parentSession === false) {
// if (site.slug === 'aylo' || site.parameters?.parentSession === false) {
if (site.slug === 'aylo') {
// most MG sites have a parent network to acquire a session from, don't try to acquire session from mindgeek.com for independent channels
return null;
}
@@ -224,7 +225,7 @@ async function getSession(site, parameters, url) {
const cookieJar = new CookieJar();
const session = http.session({ cookieJar });
const sessionUrl = site.parameters?.siteId && !(site.parameters?.native || site.parameters?.childSession || site.parent?.parameters?.childSession)
const sessionUrl = site.parameters?.siteId && !(site.parameters?.native || site.parameters?.childSession || site.parent?.parameters?.childSession || site.parameters?.parentSession === false)
? site.parent.url
: (url || site.url);
@@ -360,7 +361,12 @@ function scrapeProfile(data, networkName, _releases = []) {
};
profile.gender = data.gender === 'other' ? 'transsexual' : data.gender;
profile.measurements = data.measurements;
if (profile.gender === 'male') {
profile.penisLength = Number(data.measurements);
} else {
profile.measurements = data.measurements;
}
profile.dateOfBirth = qu.parseDate(data.birthday);
profile.birthPlace = data.birthPlace;

View File

@@ -254,7 +254,7 @@ async function scrapeProfile({ query, el }, channel, options) {
};
}, {});
if (bio.date_of_birth) profile.birthdate = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
if (bio.date_of_birth) profile.dateOfBirth = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
if (bio.birthplace) profile.birthPlace = bio.birthplace;
if (bio.fun_fact) profile.description = bio.fun_fact;
@@ -262,6 +262,7 @@ async function scrapeProfile({ query, el }, channel, options) {
if (bio.height) profile.height = Number(bio.height.match(/^\d{2,3}/)?.[0]);
if (bio.weight) profile.weight = Number(bio.weight.match(/^\d{2,3}/)?.[0]);
if (bio.shoe_size) profile.foot = Number(bio.shoe_size);
profile.measurements = bio.measurements;
@@ -280,7 +281,7 @@ async function scrapeProfile({ query, el }, channel, options) {
if (bio.aliases) profile.aliases = bio.aliases.split(',').map((alias) => alias.trim());
profile.social = [bio.onlyfans, bio.twitter, bio.instagram, bio.domain].filter(Boolean);
profile.socials = [bio.onlyfans, bio.twitter, bio.instagram, bio.domain].filter(Boolean);
profile.avatar = [
query.img('.profile-pic img', 'src0_3x', { origin: channel.url }),
@@ -327,29 +328,29 @@ async function fetchScene(url, site, baseRelease) {
return scrapeScene(res.item, site, url, baseRelease);
}
async function fetchProfile({ name: actorName }, { site }, options) {
async function fetchProfile({ name: actorName }, { channel }, options) {
const actorSlugA = slugify(actorName, '');
const actorSlugB = slugify(actorName);
const t1 = site.parameters?.t1 ? 't1/' : '';
const t1 = channel.parameters?.t1 ? 't1/' : '';
const res1 = site.parameters?.profile
? await qu.get(util.format(site.parameters.profile, actorSlugA))
: await qu.get(`${site.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false });
const res1 = channel.parameters?.profile
? await qu.get(util.format(channel.parameters.profile, actorSlugA))
: await qu.get(`${channel.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false });
const res = (res1.ok && res1)
|| (site.parameters?.profile && await qu.get(util.format(site.parameters.profile, actorSlugB)))
|| await qu.get(`${site.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false });
|| (channel.parameters?.profile && await qu.get(util.format(channel.parameters.profile, actorSlugB)))
|| await qu.get(`${channel.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false });
if (!res.ok) {
return res.status;
}
if (site.parameters?.t1) {
return scrapeProfileT1(res.item, site);
if (channel.parameters?.t1) {
return scrapeProfileT1(res.item, channel);
}
return scrapeProfile(res.item, site, options);
return scrapeProfile(res.item, channel, options);
}
module.exports = {

View File

@@ -208,7 +208,7 @@ async function fetchProfile({ name: actorName }, { entity }) {
const actorSlug = slugify(actorName);
// 8K sites don't have avatar or interview on model page, always use 5K site
const res = await unprint.get(`${entity.slug === '5kvids' ? 'https://www.5kporn.com' : entity.url}/models/${actorSlug}`, {
const res = await unprint.get(`${entity.slug === '8kmembers' ? 'https://www.8kmilfs.com' : entity.url}/models/${actorSlug}`, {
headers: {
'X-Requested-With': 'XMLHttpRequest',
},

View File

@@ -3,7 +3,6 @@
const unprint = require('unprint');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const { convert } = require('../utils/convert');
function scrapeAll(scenes, channel) {
@@ -76,41 +75,6 @@ async function scrapeScene({ query }, url, channel) {
return release;
}
/**
 * Build an actor profile from a parsed model page.
 *
 * @param {object} context - page context; only `query` is used
 * @returns {Promise<object>} profile; bio fields are only set when at least one bio entry was found
 */
async function scrapeProfile({ query }) {
const profile = {};
// one entry per list item: key is the slugified content of the item's <span>, value is the item's text
const bio = Object.fromEntries(query.all('.model-info li, .model-desc li').map((el) => [
slugify(unprint.query.content(el, 'span')),
unprint.query.text(el),
]));
const avatar = query.img('.model-photo img, img[alt="model"]');
if (avatar) {
// first candidate has the first `-<digits>x<digits>` segment removed (presumably a thumbnail size suffix — confirm), second is the source as found
profile.avatar = [
avatar.replace(/-\d+x\d+/, ''),
avatar,
];
}
if (bio && Object.keys(bio).length > 0) {
profile.description = bio.bio;
// short-circuits to the falsy birthdate value when none is listed
profile.dateOfBirth = bio.birthdate && unprint.extractDate(bio.birthdate, 'YYYY-MM-DD');
profile.birthPlace = bio.born;
profile.measurements = bio.measurements;
profile.height = convert(bio.height, 'cm');
profile.weight = convert(bio.weight, 'lb', 'kg');
profile.eyes = bio.eyes;
profile.hairColor = bio.hair;
}
return profile;
}
async function fetchLatestContent(url, parameters) {
if (parameters.useBrowser) {
const res = await http.get(url, {
@@ -187,16 +151,54 @@ async function fetchScene(url, channel) {
return res.status;
}
/**
 * Map a model data object onto an actor profile.
 *
 * @param {object} data - model properties; key casing is normalised to lowercase before reading
 * @returns {Promise<object>} profile; `hasTattoos` / `hasPiercings` are only set when the matching tag is present
 */
async function scrapeProfile(data) {
	// unreliable key case, lowercase all
	const lowercasedEntries = Object.entries(data).map((entry) => [entry[0].toLowerCase(), entry[1]]);
	const bio = Object.fromEntries(lowercasedEntries);

	const profile = {
		entryId: bio.id,
		gender: bio.gender,
		description: bio.bio,
		birthPlace: bio.born,
		dateOfBirth: unprint.extractDate(bio.birthdate, 'YYYY-MM-DD'),
		age: bio.age,
		measurements: bio.measurements,
		height: convert(bio.height, 'cm'),
		weight: convert(bio.weight, 'lb', 'kg'),
		eyes: bio.eyes,
		hairColor: bio.hair,
		avatar: bio.thumb,
	};

	// tags arrive as one comma-separated string; entries are compared verbatim
	const tagList = bio.tags?.split(',') || [];

	if (tagList.includes('tattoos')) {
		profile.hasTattoos = true;
	}

	if (tagList.includes('piercing')) {
		profile.hasPiercings = true;
	}

	return profile;
}
async function fetchProfile(actor, context) {
const session = http.session();
await http.get(context.channel.url, { session });
const url = `${context.channel.url}/models/${actor.slug}`;
const res = await unprint.get(url);
const res = await unprint.get(url, {
parser: {
runScripts: 'dangerously',
},
});
if (res.ok) {
return scrapeProfile(res.context, context.channel);
const data = res.context.query.json('#__NEXT_DATA__');
if (data.props.pageProps.model) {
return scrapeProfile(data.props.pageProps.model, context.channel);
}
return null;
}
return res.status;

View File

@@ -220,8 +220,6 @@ const scrapers = {
bang,
bangbros: aylo,
bjraw: radical,
blacked: vixen,
blackedraw: vixen,
bluedonkeymedia,
delphine: modelmedia,
meidenvanholland: bluedonkeymedia,
@@ -233,7 +231,6 @@ const scrapers = {
burningangel: gamma,
cherrypimps,
cumlouder,
deeper: vixen,
deeplush: nubiles,
devilsfilm: famedigital,
digitalplayground: aylo,
@@ -276,6 +273,7 @@ const scrapers = {
kink,
kinkmen: kink,
kinkvr: kink,
letsdoeit: aylo,
loveherfilms,
loveherfeet: loveherfilms,
shelovesblack: loveherfilms,
@@ -287,7 +285,6 @@ const scrapers = {
mariskax,
metrohd: aylo,
milehighmedia: aylo,
milfy: vixen,
milfvr: wankzvr,
missax,
mofos: aylo,
@@ -299,7 +296,6 @@ const scrapers = {
nfbusty: nubiles,
nubilefilms: nubiles,
nubiles,
nubilesporn: nubiles,
nympho: mikeadriano,
onlyprince: fullpornnetwork,
pascalssubsluts,
@@ -353,15 +349,22 @@ const scrapers = {
transbella: porndoe,
tranzvr: wankzvr,
trueanal: mikeadriano,
tushy: vixen,
tushyraw: vixen,
twistys: aylo,
vipsexvault: porndoe,
virtualtaboo,
darkroomvr: virtualtaboo,
onlytarts: virtualtaboo,
oopsfamily: virtualtaboo,
// vixen
vixen,
blacked: vixen,
blackedraw: vixen,
tushy: vixen,
tushyraw: vixen,
deeper: vixen,
milfy: vixen,
slayed: vixen,
wifey: vixen,
vrcosplayx: badoink,
wankzvr,
wicked: gamma,

View File

@@ -31,12 +31,26 @@ function scrapeAll(scenes) {
});
}
/**
 * Fetch one page of a channel's latest releases and scrape every scene tile on it.
 *
 * @param {object} channel - channel with `url` and an optional `parameters.latest` path template
 * @param {number} page - page number substituted into the `{page}` placeholder
 * @returns {Promise<*>} result of `scrapeAll` on success, otherwise the response status
 */
async function fetchLatest(channel, page) {
	// channels may override the listing path, fall back to the default template
	const latestPath = format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page });

	const res = await unprint.get(`${channel.url}${latestPath}`, {
		selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
	});

	if (!res.ok) {
		return res.status;
	}

	return scrapeAll(res.context, channel);
}
function scrapeScene({ query }, { url, entity }) {
const release = {};
release.entryId = getEntryId(url);
release.title = query.content(['#media-holder .title', '.content-holder h1', '#scene h1', 'h2.titular', 'title'])?.replace(/\s+-$/, '');
console.log(release);
release.date = query.date('#sceneInfo .date, #trailer-data .date', 'YYYY-MM-DD');
release.duration = query.duration('#sceneInfo .data-others, #trailer-data', /\d+:\d+/);
@@ -67,6 +81,28 @@ function scrapeScene({ query }, { url, entity }) {
return release;
}
/**
 * Build a list of image source candidates from a URL, preferring a variant with the
 * `imgh`, `imgw` and `imgq` query parameters removed, followed by the source as given.
 *
 * The stripped URL is rebuilt from origin and path, and only carries a `?` when other
 * query parameters remain, so no dangling `?` is produced. When nothing was stripped,
 * the source is returned once instead of twice.
 *
 * @param {string} [source] - absolute image URL
 * @returns {string[]} candidates in order of preference; empty when the source is missing or not a parseable absolute URL
 */
function stripSizeParams(source) {
	if (!source) {
		return [];
	}

	try {
		const url = new URL(source);

		['imgh', 'imgw', 'imgq'].forEach((param) => url.searchParams.delete(param));

		const remainingQuery = url.searchParams.toString();
		const stripped = `${url.origin}${url.pathname}${remainingQuery ? `?${remainingQuery}` : ''}`;

		// avoid listing the exact same URL twice when there was nothing to strip
		return stripped === source
			? [source]
			: [stripped, source];
	} catch (error) {
		// best effort: a source that does not parse as an absolute URL yields no candidates
		return [];
	}
}
function scrapeProfile({ query }) {
const profile = {};
const bioKeys = query.contents('.statsText b');
@@ -77,13 +113,14 @@ function scrapeProfile({ query }) {
[slugify(key, '_')]: bioValues[index],
}), {});
profile.description = query.contents('.descriptionText');
profile.description = query.content('.descriptionText');
profile.avatar = [
...stripSizeParams(query.img('.model-bio-pic img', { attribute: 'src' })), // not available on e.g. Raw Attack
query.img('.model-bio-pic img', { attribute: 'src0_3x' }),
query.img('.model-bio-pic img', { attribute: 'src0_2x' }),
query.img('.model-bio-pic img', { attribute: 'src0_3x' }), // unnecessarily big
query.img('.model-bio-pic img', { attribute: 'src0_1x' }),
];
].filter(Boolean);
profile.height = Number(bio.height?.match(/(\d+)\s?cm/i)?.[1]);
profile.dateOfBirth = unprint.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
@@ -108,18 +145,6 @@ function scrapeProfile({ query }) {
return profile;
}
/**
 * Fetch one page of a channel's latest releases and scrape every scene tile on it.
 *
 * @param {object} channel - channel with `url` and an optional `parameters.latest` path template
 * @param {number} page - page number substituted into the `{page}` placeholder
 * @returns {Promise<*>} result of `scrapeAll` on success, otherwise the response status
 */
async function fetchLatest(channel, page) {
// channel-specific listing path when configured, default template otherwise
const res = await unprint.get(`${channel.url}${format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page })}`, {
selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
});
if (res.ok) {
return scrapeAll(res.context, channel);
}
return res.status;
}
async function fetchProfile(actor, channel) {
if (actor.url) {
const res = await unprint.get(actor.url);

View File

@@ -411,19 +411,16 @@ async function fetchScene(url, channel, baseRelease, options) {
return res.status;
}
async function scrapeProfile(data, channel) {
async function scrapeProfile(data, _channel) {
const model = data.model;
const profile = {};
// most details seemingly unavailable in graphql
if (profile.dateOfBirth) profile.birthdate = new Date(model.dateOfBirth);
profile.gender = genderMap[model.sex];
profile.hair = model.hairColour;
profile.nationality = model.nationality;
if (model.biography.trim().length > 0) profile.description = model.biography;
// most details seemingly unavailable in graphql
if (profile.dateOfBirth) profile.birthdate = new Date(model.dateOfBirth);
if (model.cupSize && model.bustMeasurment) profile.bust = `${model.bustMeasurment}${model.cupSize}`;
if (model.waistMeasurment) profile.waist = model.waistMeasurment;
if (model.hipMeasurment) profile.hip = model.hipMeasurment;
@@ -432,9 +429,11 @@ async function scrapeProfile(data, channel) {
profile.poster = getAvatarFallbacks(model.images.profile);
profile.banner = getAvatarFallbacks(model.images.poster);
/*
if (model.videos) {
profile.scenes = scrapeAll(model.videos.edges.map((edge) => edge.node), channel);
}
*/
return profile;
}
@@ -558,6 +557,7 @@ async function fetchProfile(actor, { channel }) {
) {
model: findOneModel(input: { slug: $slug, site: $site }) {
name
sex
biography
images {
listing {

View File

@@ -1,6 +1,7 @@
'use strict';
const { convert, convertMany } = require('convert');
const { decode } = require('html-entities');
const logger = require('../logger')(__filename);
@@ -60,18 +61,20 @@ function kgToLbs(kgs) {
function convertManyApi(input, to) {
const curatedInput = input
.replace('\'', 'ft')
.replace(/"|''/, 'in')
.replace(/[']\s*/, 'ft ') // ensure 1 space
.replace(/["”]|('')/, 'in') // 54”
.replace(/\d+ft\s*\d+\s*$/, (match) => `${match}in`); // height without any inch symbol
return Math.round(convertMany(curatedInput).to(to)) || null;
}
function convertApi(input, fromOrTo, to) {
if (!input) {
function convertApi(rawInput, fromOrTo, to) {
if (!rawInput) {
return null;
}
const input = decode(rawInput); // remove html entities, e.g. 5' 8" for 5' 8"
try {
if (typeof input === 'string' && to === undefined) {
return convertManyApi(input, fromOrTo);

View File

@@ -42,7 +42,7 @@ const accentMap = {
};
const plainCharRegex = /[a-zA-Z0-9]/;
const defaultPunctuationRegex = /[.,?!:;&'"“”…()[]{}<>\/*—-]/;
// NOTE(review): the first unescaped `]` (directly after `[`) ends the character class, so the
// remaining `{}<>\/*—]` is matched as a literal sequence instead of as further punctuation members.
// `[` and `]` likely need escaping (`\[\]`) — confirm the intended behaviour before changing.
const defaultPunctuationRegex = /[.,?!:;&'"“”…()[]{}<>\/*—]/;
const defaultSymbolRegex = /[@$€£#%^+=\\~]/;
function slugify(strings, delimiter = '-', {
@@ -66,6 +66,7 @@ function slugify(strings, delimiter = '-', {
: string;
const normalized = casedString
.replace(/[_-]/g, ' ')
.split('')
.map((char) => {
if (char === ' ') {