Added profile scraper tests (WIP), fixed some profile scrapers. Fixed slugify not breaking existing slugs.
This commit is contained in:
@@ -216,7 +216,8 @@ function getUrl(site) {
|
||||
}
|
||||
|
||||
async function getSession(site, parameters, url) {
|
||||
if (site.slug === 'mindgeek' || site.parameters?.parentSession === false) {
|
||||
// if (site.slug === 'aylo' || site.parameters?.parentSession === false) {
|
||||
if (site.slug === 'aylo') {
|
||||
// most MG sites have a parent network to acquire a session from, don't try to acquire session from mindgeek.com for independent channels
|
||||
return null;
|
||||
}
|
||||
@@ -224,7 +225,7 @@ async function getSession(site, parameters, url) {
|
||||
const cookieJar = new CookieJar();
|
||||
const session = http.session({ cookieJar });
|
||||
|
||||
const sessionUrl = site.parameters?.siteId && !(site.parameters?.native || site.parameters?.childSession || site.parent?.parameters?.childSession)
|
||||
const sessionUrl = site.parameters?.siteId && !(site.parameters?.native || site.parameters?.childSession || site.parent?.parameters?.childSession || site.parameters?.parentSession === false)
|
||||
? site.parent.url
|
||||
: (url || site.url);
|
||||
|
||||
@@ -360,7 +361,12 @@ function scrapeProfile(data, networkName, _releases = []) {
|
||||
};
|
||||
|
||||
profile.gender = data.gender === 'other' ? 'transsexual' : data.gender;
|
||||
profile.measurements = data.measurements;
|
||||
|
||||
if (profile.gender === 'male') {
|
||||
profile.penisLength = Number(data.measurements);
|
||||
} else {
|
||||
profile.measurements = data.measurements;
|
||||
}
|
||||
|
||||
profile.dateOfBirth = qu.parseDate(data.birthday);
|
||||
profile.birthPlace = data.birthPlace;
|
||||
|
||||
@@ -254,7 +254,7 @@ async function scrapeProfile({ query, el }, channel, options) {
|
||||
};
|
||||
}, {});
|
||||
|
||||
if (bio.date_of_birth) profile.birthdate = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
|
||||
if (bio.date_of_birth) profile.dateOfBirth = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
|
||||
if (bio.birthplace) profile.birthPlace = bio.birthplace;
|
||||
if (bio.fun_fact) profile.description = bio.fun_fact;
|
||||
|
||||
@@ -262,6 +262,7 @@ async function scrapeProfile({ query, el }, channel, options) {
|
||||
|
||||
if (bio.height) profile.height = Number(bio.height.match(/^\d{2,3}/)?.[0]);
|
||||
if (bio.weight) profile.weight = Number(bio.weight.match(/^\d{2,3}/)?.[0]);
|
||||
if (bio.shoe_size) profile.foot = Number(bio.shoe_size);
|
||||
|
||||
profile.measurements = bio.measurements;
|
||||
|
||||
@@ -280,7 +281,7 @@ async function scrapeProfile({ query, el }, channel, options) {
|
||||
|
||||
if (bio.aliases) profile.aliases = bio.aliases.split(',').map((alias) => alias.trim());
|
||||
|
||||
profile.social = [bio.onlyfans, bio.twitter, bio.instagram, bio.domain].filter(Boolean);
|
||||
profile.socials = [bio.onlyfans, bio.twitter, bio.instagram, bio.domain].filter(Boolean);
|
||||
|
||||
profile.avatar = [
|
||||
query.img('.profile-pic img', 'src0_3x', { origin: channel.url }),
|
||||
@@ -327,29 +328,29 @@ async function fetchScene(url, site, baseRelease) {
|
||||
return scrapeScene(res.item, site, url, baseRelease);
|
||||
}
|
||||
|
||||
async function fetchProfile({ name: actorName }, { site }, options) {
|
||||
async function fetchProfile({ name: actorName }, { channel }, options) {
|
||||
const actorSlugA = slugify(actorName, '');
|
||||
const actorSlugB = slugify(actorName);
|
||||
|
||||
const t1 = site.parameters?.t1 ? 't1/' : '';
|
||||
const t1 = channel.parameters?.t1 ? 't1/' : '';
|
||||
|
||||
const res1 = site.parameters?.profile
|
||||
? await qu.get(util.format(site.parameters.profile, actorSlugA))
|
||||
: await qu.get(`${site.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false });
|
||||
const res1 = channel.parameters?.profile
|
||||
? await qu.get(util.format(channel.parameters.profile, actorSlugA))
|
||||
: await qu.get(`${channel.url}/${t1}models/${actorSlugA}.html`, null, null, { followRedirects: false });
|
||||
|
||||
const res = (res1.ok && res1)
|
||||
|| (site.parameters?.profile && await qu.get(util.format(site.parameters.profile, actorSlugB)))
|
||||
|| await qu.get(`${site.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false });
|
||||
|| (channel.parameters?.profile && await qu.get(util.format(channel.parameters.profile, actorSlugB)))
|
||||
|| await qu.get(`${channel.url}/${t1}models/${actorSlugB}.html`, null, null, { followRedirects: false });
|
||||
|
||||
if (!res.ok) {
|
||||
return res.status;
|
||||
}
|
||||
|
||||
if (site.parameters?.t1) {
|
||||
return scrapeProfileT1(res.item, site);
|
||||
if (channel.parameters?.t1) {
|
||||
return scrapeProfileT1(res.item, channel);
|
||||
}
|
||||
|
||||
return scrapeProfile(res.item, site, options);
|
||||
return scrapeProfile(res.item, channel, options);
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
|
||||
@@ -208,7 +208,7 @@ async function fetchProfile({ name: actorName }, { entity }) {
|
||||
const actorSlug = slugify(actorName);
|
||||
|
||||
// 8K sites don't have avatar or interview on model page, always use 5K site
|
||||
const res = await unprint.get(`${entity.slug === '5kvids' ? 'https://www.5kporn.com' : entity.url}/models/${actorSlug}`, {
|
||||
const res = await unprint.get(`${entity.slug === '8kmembers' ? 'https://www.8kmilfs.com' : entity.url}/models/${actorSlug}`, {
|
||||
headers: {
|
||||
'X-Requested-With': 'XMLHttpRequest',
|
||||
},
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
const unprint = require('unprint');
|
||||
|
||||
const http = require('../utils/http');
|
||||
const slugify = require('../utils/slugify');
|
||||
const { convert } = require('../utils/convert');
|
||||
|
||||
function scrapeAll(scenes, channel) {
|
||||
@@ -76,41 +75,6 @@ async function scrapeScene({ query }, url, channel) {
|
||||
return release;
|
||||
}
|
||||
|
||||
async function scrapeProfile({ query }) {
|
||||
const profile = {};
|
||||
|
||||
const bio = Object.fromEntries(query.all('.model-info li, .model-desc li').map((el) => [
|
||||
slugify(unprint.query.content(el, 'span')),
|
||||
unprint.query.text(el),
|
||||
]));
|
||||
|
||||
const avatar = query.img('.model-photo img, img[alt="model"]');
|
||||
|
||||
if (avatar) {
|
||||
profile.avatar = [
|
||||
avatar.replace(/-\d+x\d+/, ''),
|
||||
avatar,
|
||||
];
|
||||
}
|
||||
|
||||
if (bio && Object.keys(bio).length > 0) {
|
||||
profile.description = bio.bio;
|
||||
|
||||
profile.dateOfBirth = bio.birthdate && unprint.extractDate(bio.birthdate, 'YYYY-MM-DD');
|
||||
profile.birthPlace = bio.born;
|
||||
|
||||
profile.measurements = bio.measurements;
|
||||
|
||||
profile.height = convert(bio.height, 'cm');
|
||||
profile.weight = convert(bio.weight, 'lb', 'kg');
|
||||
|
||||
profile.eyes = bio.eyes;
|
||||
profile.hairColor = bio.hair;
|
||||
}
|
||||
|
||||
return profile;
|
||||
}
|
||||
|
||||
async function fetchLatestContent(url, parameters) {
|
||||
if (parameters.useBrowser) {
|
||||
const res = await http.get(url, {
|
||||
@@ -187,16 +151,54 @@ async function fetchScene(url, channel) {
|
||||
return res.status;
|
||||
}
|
||||
|
||||
/**
 * Build a profile object from a model record.
 *
 * Keys of the record are matched case-insensitively, because the source is
 * inconsistent about key casing. Tags are matched case-insensitively and with
 * surrounding whitespace ignored, so that e.g. "Tattoos" or " piercing" in a
 * comma-separated list are still recognised.
 *
 * @param {Object} data - model record; the keys read are id, gender, bio, born,
 *   birthdate, age, measurements, height, weight, eyes, hair, thumb and tags
 * @returns {Promise<Object>} the profile assembled from the record
 */
async function scrapeProfile(data) {
	const profile = {};

	// unreliable key case, lowercase all
	const bio = Object.fromEntries(Object.entries(data).map(([key, value]) => [key.toLowerCase(), value]));

	profile.entryId = bio.id;

	profile.gender = bio.gender;
	profile.description = bio.bio;

	profile.birthPlace = bio.born;
	profile.dateOfBirth = unprint.extractDate(bio.birthdate, 'YYYY-MM-DD');
	profile.age = bio.age;

	profile.measurements = bio.measurements;
	profile.height = convert(bio.height, 'cm');
	profile.weight = convert(bio.weight, 'lb', 'kg');

	profile.eyes = bio.eyes;
	profile.hairColor = bio.hair;

	profile.avatar = bio.thumb;

	// accept either a comma-separated string or a ready-made list; anything else counts as no tags
	let rawTags = [];

	if (Array.isArray(bio.tags)) {
		rawTags = bio.tags;
	} else if (typeof bio.tags === 'string') {
		rawTags = bio.tags.split(',');
	}

	// normalise before comparing, an exact match on the raw pieces misses ' piercing' or 'Tattoos'
	const tags = new Set(rawTags.map((tag) => String(tag).trim().toLowerCase()));

	if (tags.has('tattoos')) profile.hasTattoos = true;
	if (tags.has('piercing')) profile.hasPiercings = true;

	return profile;
}
|
||||
|
||||
async function fetchProfile(actor, context) {
|
||||
const session = http.session();
|
||||
|
||||
await http.get(context.channel.url, { session });
|
||||
|
||||
const url = `${context.channel.url}/models/${actor.slug}`;
|
||||
const res = await unprint.get(url);
|
||||
|
||||
const res = await unprint.get(url, {
|
||||
parser: {
|
||||
runScripts: 'dangerously',
|
||||
},
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeProfile(res.context, context.channel);
|
||||
const data = res.context.query.json('#__NEXT_DATA__');
|
||||
|
||||
if (data.props.pageProps.model) {
|
||||
return scrapeProfile(data.props.pageProps.model, context.channel);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
|
||||
@@ -220,8 +220,6 @@ const scrapers = {
|
||||
bang,
|
||||
bangbros: aylo,
|
||||
bjraw: radical,
|
||||
blacked: vixen,
|
||||
blackedraw: vixen,
|
||||
bluedonkeymedia,
|
||||
delphine: modelmedia,
|
||||
meidenvanholland: bluedonkeymedia,
|
||||
@@ -233,7 +231,6 @@ const scrapers = {
|
||||
burningangel: gamma,
|
||||
cherrypimps,
|
||||
cumlouder,
|
||||
deeper: vixen,
|
||||
deeplush: nubiles,
|
||||
devilsfilm: famedigital,
|
||||
digitalplayground: aylo,
|
||||
@@ -276,6 +273,7 @@ const scrapers = {
|
||||
kink,
|
||||
kinkmen: kink,
|
||||
kinkvr: kink,
|
||||
letsdoeit: aylo,
|
||||
loveherfilms,
|
||||
loveherfeet: loveherfilms,
|
||||
shelovesblack: loveherfilms,
|
||||
@@ -287,7 +285,6 @@ const scrapers = {
|
||||
mariskax,
|
||||
metrohd: aylo,
|
||||
milehighmedia: aylo,
|
||||
milfy: vixen,
|
||||
milfvr: wankzvr,
|
||||
missax,
|
||||
mofos: aylo,
|
||||
@@ -299,7 +296,6 @@ const scrapers = {
|
||||
nfbusty: nubiles,
|
||||
nubilefilms: nubiles,
|
||||
nubiles,
|
||||
nubilesporn: nubiles,
|
||||
nympho: mikeadriano,
|
||||
onlyprince: fullpornnetwork,
|
||||
pascalssubsluts,
|
||||
@@ -353,15 +349,22 @@ const scrapers = {
|
||||
transbella: porndoe,
|
||||
tranzvr: wankzvr,
|
||||
trueanal: mikeadriano,
|
||||
tushy: vixen,
|
||||
tushyraw: vixen,
|
||||
twistys: aylo,
|
||||
vipsexvault: porndoe,
|
||||
virtualtaboo,
|
||||
darkroomvr: virtualtaboo,
|
||||
onlytarts: virtualtaboo,
|
||||
oopsfamily: virtualtaboo,
|
||||
// vixen
|
||||
vixen,
|
||||
blacked: vixen,
|
||||
blackedraw: vixen,
|
||||
tushy: vixen,
|
||||
tushyraw: vixen,
|
||||
deeper: vixen,
|
||||
milfy: vixen,
|
||||
slayed: vixen,
|
||||
wifey: vixen,
|
||||
vrcosplayx: badoink,
|
||||
wankzvr,
|
||||
wicked: gamma,
|
||||
|
||||
@@ -31,12 +31,26 @@ function scrapeAll(scenes) {
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Fetch one page of a channel's latest releases and scrape the scene tiles on it.
 *
 * The page path comes from `channel.parameters.latest` when set, otherwise from
 * the default movies category template; `{page}` is filled in through `format`.
 *
 * @param {Object} channel - channel whose `url` and optional `parameters.latest` are read
 * @param {number} page - page number substituted into the path template
 * @returns {Promise<*>} the result of `scrapeAll` on success, otherwise the response status
 */
async function fetchLatest(channel, page) {
	const pathTemplate = channel.parameters?.latest || '/categories/movies_{page}_d.html';
	const latestUrl = `${channel.url}${format(pathTemplate, { page })}`;

	const response = await unprint.get(latestUrl, {
		selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
	});

	if (!response.ok) {
		return response.status;
	}

	return scrapeAll(response.context, channel);
}
|
||||
|
||||
function scrapeScene({ query }, { url, entity }) {
|
||||
const release = {};
|
||||
|
||||
release.entryId = getEntryId(url);
|
||||
release.title = query.content(['#media-holder .title', '.content-holder h1', '#scene h1', 'h2.titular', 'title'])?.replace(/\s+-$/, '');
|
||||
|
||||
console.log(release);
|
||||
|
||||
release.date = query.date('#sceneInfo .date, #trailer-data .date', 'YYYY-MM-DD');
|
||||
release.duration = query.duration('#sceneInfo .data-others, #trailer-data', /\d+:\d+/);
|
||||
|
||||
@@ -67,6 +81,28 @@ function scrapeScene({ query }, { url, entity }) {
|
||||
return release;
|
||||
}
|
||||
|
||||
/**
 * Derive image source candidates from a URL by dropping its resize parameters.
 *
 * The `imgh`, `imgw` and `imgq` query parameters are removed; any other query
 * parameters are kept, and a fragment is not carried over. The stripped URL is
 * listed first, followed by the source as given. When stripping changes
 * nothing, the source is returned once instead of twice.
 *
 * @param {string} [source] - absolute image URL
 * @returns {string[]} candidate URLs, or an empty list when `source` is empty
 *   or cannot be parsed as an absolute URL
 */
function stripSizeParams(source) {
	if (!source) {
		return [];
	}

	try {
		const url = new URL(source);
		const params = url.searchParams;

		params.delete('imgh');
		params.delete('imgw');
		params.delete('imgq');

		// only append '?' when parameters remain, so the stripped URL has no dangling '?'
		const remaining = params.toString();
		const stripped = `${url.origin}${url.pathname}${remaining ? `?${remaining}` : ''}`;

		return stripped === source
			? [source]
			: [stripped, source];
	} catch (error) {
		// best effort: a source that is not a parseable absolute URL yields no candidates
		return [];
	}
}
|
||||
|
||||
function scrapeProfile({ query }) {
|
||||
const profile = {};
|
||||
const bioKeys = query.contents('.statsText b');
|
||||
@@ -77,13 +113,14 @@ function scrapeProfile({ query }) {
|
||||
[slugify(key, '_')]: bioValues[index],
|
||||
}), {});
|
||||
|
||||
profile.description = query.contents('.descriptionText');
|
||||
profile.description = query.content('.descriptionText');
|
||||
|
||||
profile.avatar = [
|
||||
...stripSizeParams(query.img('.model-bio-pic img', { attribute: 'src' })), // not available on e.g. Raw Attack
|
||||
query.img('.model-bio-pic img', { attribute: 'src0_3x' }),
|
||||
query.img('.model-bio-pic img', { attribute: 'src0_2x' }),
|
||||
query.img('.model-bio-pic img', { attribute: 'src0_3x' }), // unnecessarily big
|
||||
query.img('.model-bio-pic img', { attribute: 'src0_1x' }),
|
||||
];
|
||||
].filter(Boolean);
|
||||
|
||||
profile.height = Number(bio.height?.match(/(\d+)\s?cm/i)?.[1]);
|
||||
profile.dateOfBirth = unprint.extractDate(bio.date_of_birth, 'MMMM D, YYYY');
|
||||
@@ -108,18 +145,6 @@ function scrapeProfile({ query }) {
|
||||
return profile;
|
||||
}
|
||||
|
||||
async function fetchLatest(channel, page) {
|
||||
const res = await unprint.get(`${channel.url}${format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page })}`, {
|
||||
selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail',
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeAll(res.context, channel);
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function fetchProfile(actor, channel) {
|
||||
if (actor.url) {
|
||||
const res = await unprint.get(actor.url);
|
||||
|
||||
@@ -411,19 +411,16 @@ async function fetchScene(url, channel, baseRelease, options) {
|
||||
return res.status;
|
||||
}
|
||||
|
||||
async function scrapeProfile(data, channel) {
|
||||
async function scrapeProfile(data, _channel) {
|
||||
const model = data.model;
|
||||
const profile = {};
|
||||
|
||||
// most details seemingly unavailable in graphql
|
||||
if (profile.dateOfBirth) profile.birthdate = new Date(model.dateOfBirth);
|
||||
profile.gender = genderMap[model.sex];
|
||||
|
||||
profile.hair = model.hairColour;
|
||||
profile.nationality = model.nationality;
|
||||
|
||||
if (model.biography.trim().length > 0) profile.description = model.biography;
|
||||
|
||||
// most details seemingly unavailable in graphql
|
||||
if (profile.dateOfBirth) profile.birthdate = new Date(model.dateOfBirth);
|
||||
if (model.cupSize && model.bustMeasurment) profile.bust = `${model.bustMeasurment}${model.cupSize}`;
|
||||
if (model.waistMeasurment) profile.waist = model.waistMeasurment;
|
||||
if (model.hipMeasurment) profile.hip = model.hipMeasurment;
|
||||
@@ -432,9 +429,11 @@ async function scrapeProfile(data, channel) {
|
||||
profile.poster = getAvatarFallbacks(model.images.profile);
|
||||
profile.banner = getAvatarFallbacks(model.images.poster);
|
||||
|
||||
/*
|
||||
if (model.videos) {
|
||||
profile.scenes = scrapeAll(model.videos.edges.map((edge) => edge.node), channel);
|
||||
}
|
||||
*/
|
||||
|
||||
return profile;
|
||||
}
|
||||
@@ -558,6 +557,7 @@ async function fetchProfile(actor, { channel }) {
|
||||
) {
|
||||
model: findOneModel(input: { slug: $slug, site: $site }) {
|
||||
name
|
||||
sex
|
||||
biography
|
||||
images {
|
||||
listing {
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
'use strict';
|
||||
|
||||
const { convert, convertMany } = require('convert');
|
||||
const { decode } = require('html-entities');
|
||||
|
||||
const logger = require('../logger')(__filename);
|
||||
|
||||
@@ -60,18 +61,20 @@ function kgToLbs(kgs) {
|
||||
|
||||
function convertManyApi(input, to) {
|
||||
const curatedInput = input
|
||||
.replace('\'', 'ft')
|
||||
.replace(/"|''/, 'in')
|
||||
.replace(/['’]\s*/, 'ft ') // ensure 1 space
|
||||
.replace(/["”]|('')/, 'in') // 5’4”
|
||||
.replace(/\d+ft\s*\d+\s*$/, (match) => `${match}in`); // height without any inch symbol
|
||||
|
||||
return Math.round(convertMany(curatedInput).to(to)) || null;
|
||||
}
|
||||
|
||||
function convertApi(input, fromOrTo, to) {
|
||||
if (!input) {
|
||||
function convertApi(rawInput, fromOrTo, to) {
|
||||
if (!rawInput) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const input = decode(rawInput); // remove html entities, e.g. 5' 8" for 5' 8"
|
||||
|
||||
try {
|
||||
if (typeof input === 'string' && to === undefined) {
|
||||
return convertManyApi(input, fromOrTo);
|
||||
|
||||
@@ -42,7 +42,7 @@ const accentMap = {
|
||||
};
|
||||
|
||||
const plainCharRegex = /[a-zA-Z0-9]/;
|
||||
const defaultPunctuationRegex = /[.,?!:;&'‘’"“”…()[]{}<>\/*—-]/;
|
||||
// `]` has to be escaped inside the class: an unescaped `[]` closes the class early and turns the rest of the pattern into a literal sequence, so plain punctuation never matches
const defaultPunctuationRegex = /[.,?!:;&'‘’"“”…()[\]{}<>\/*—]/;
|
||||
const defaultSymbolRegex = /[@$€£#%^+=\\~]/;
|
||||
|
||||
function slugify(strings, delimiter = '-', {
|
||||
@@ -66,6 +66,7 @@ function slugify(strings, delimiter = '-', {
|
||||
: string;
|
||||
|
||||
const normalized = casedString
|
||||
.replace(/[_-]/g, ' ')
|
||||
.split('')
|
||||
.map((char) => {
|
||||
if (char === ' ') {
|
||||
|
||||
Reference in New Issue
Block a user