Refactored MariskaX to use Next data. Fixed Naughty America profile scraper breaking on Tonight's Girlfriend.
This commit is contained in:
parent
157667f771
commit
dde3ea3a34
|
|
@ -175,6 +175,9 @@ module.exports = {
|
|||
// woodman
|
||||
pierrewoodman,
|
||||
wakeupnfuck: pierrewoodman,
|
||||
// naughty america
|
||||
naughtyamerica,
|
||||
tonightsgirlfriend: naughtyamerica,
|
||||
// etc
|
||||
'18vr': badoink,
|
||||
theflourishxxx: theflourish,
|
||||
|
|
@ -231,8 +234,6 @@ module.exports = {
|
|||
missax,
|
||||
mylf: teamskeet,
|
||||
mugfucked: fullpornnetwork,
|
||||
naughtyamerica,
|
||||
tonightsgirlfriend: naughtyamerica,
|
||||
nebraskacoeds: elevatedx,
|
||||
onlyprince: fullpornnetwork,
|
||||
pascalssubsluts,
|
||||
|
|
|
|||
|
|
@ -3,84 +3,161 @@
|
|||
const unprint = require('unprint');
|
||||
|
||||
const slugify = require('../utils/slugify');
|
||||
const { convert } = require('../utils/convert');
|
||||
|
||||
function scrapeLatest(scenes) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
function scrapeScene(data, channel) {
|
||||
const release = {};
|
||||
|
||||
release.title = query.content('.title a');
|
||||
release.url = query.url('.title a') || query.url('.thumb-wrap a');
|
||||
release.entryId = data.id;
|
||||
release.url = `${channel.origin}/scenes/${data.slug}`;
|
||||
|
||||
release.entryId = new URL(release.url).pathname.match(/view\/(\d+)\//)[1];
|
||||
release.title = data.title;
|
||||
release.description = data.description;
|
||||
|
||||
release.date = query.date('time', 'Do MMM YYYY', { match: /\d+\w+ \w+ \d{4}/ });
|
||||
release.duration = query.duration('.total-time');
|
||||
release.date = unprint.extractDate(data.publish_date, 'YYYY/MM/DD hh:mm:ss');
|
||||
release.duration = unprint.extractDuration(data.videos_duration);
|
||||
|
||||
release.actors = query.all('.models a').map((el) => ({
|
||||
name: unprint.query.content(el),
|
||||
url: unprint.query.url(el, null),
|
||||
}));
|
||||
release.actors = (data.models_thumbs || data.models_slugs)?.map((actor) => ({
|
||||
name: actor.name,
|
||||
url: actor.slug && `${channel.origin}/models/${actor.slug}`,
|
||||
avatar: actor.thumb,
|
||||
})) || data.models;
|
||||
|
||||
[release.poster, ...release.photos] = query.json('.thumb-wrap a', { attribute: 'data-images' });
|
||||
release.tags = data.tags;
|
||||
|
||||
release.photoCount = query.number('.total-photos');
|
||||
release.poster = data.thumb || data.trailer_screencap;
|
||||
|
||||
return release;
|
||||
});
|
||||
const posterPath = release.poster && new URL(release.poster).pathname.replace('//', '/');
|
||||
|
||||
release.photos = data.extra_thumbnails.filter((src) => !src.includes(posterPath));
|
||||
release.caps = data.thumbs;
|
||||
|
||||
release.teaser = data.special_thumbnails;
|
||||
release.trailer = data.trailer_url;
|
||||
|
||||
release.photoCount = data.photos_duration;
|
||||
release.channel = data.site?.toLowerCase();
|
||||
|
||||
release.qualities = data.videos && Array.from(new Set(Object.values(data.videos).map((video) => video.height))).filter(Boolean);
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
async function fetchLatest(channel, page) {
|
||||
const res = await unprint.get(`https://tour.mariskax.com/scenes?page=${page}`, {
|
||||
selectAll: '.content-item',
|
||||
timeout: 30000, // slow site
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
return scrapeLatest(res.context);
|
||||
const data = res.context.query.json('#__NEXT_DATA__')?.props.pageProps.contents.data;
|
||||
|
||||
if (data) {
|
||||
return data.map((scene) => scrapeScene(scene, channel));
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
function scrapeScene({ query }, { url }) {
|
||||
const release = {};
|
||||
async function fetchScene(url, entity, baseRelease) {
|
||||
if (baseRelease.entryId) {
|
||||
// base release already holds the same data as the scene page
|
||||
return baseRelease;
|
||||
}
|
||||
|
||||
release.title = query.content('.content-meta .title');
|
||||
release.entryId = new URL(url).pathname.match(/view\/(\d+)\//)[1];
|
||||
|
||||
release.date = query.date('.post-date', 'Do MMM YYYY', { match: /\d+\w+ \w+ \d{4}/ });
|
||||
|
||||
release.actors = query.all('.content-meta .models a').map((el) => ({
|
||||
name: unprint.query.content(el),
|
||||
url: unprint.query.url(el, null),
|
||||
}));
|
||||
|
||||
release.poster = query.poster('.trailer-wrap video');
|
||||
release.trailer = query.video('.trailer-wrap source') || query.video('.download-trailer-wrap a', { attribute: 'href' });
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
async function fetchProfile(actor) {
|
||||
const res = await unprint.post('https://tour.mariskax.com/search-preview-mrx', `q=${slugify(actor.name, '+')}`, {
|
||||
headers: {
|
||||
'Accept-Language': 'en-US,en', // necessary for some reason
|
||||
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
|
||||
},
|
||||
const res = await unprint.get(url, {
|
||||
timeout: 30000, // slow site
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
const model = res.data.find((result) => result.type === 'model' && slugify(result.title) === actor.slug);
|
||||
const data = res.context.query.json('#__NEXT_DATA__')?.props.pageProps.content;
|
||||
|
||||
if (model) {
|
||||
const curatedModel = {
|
||||
entryId: model.url?.match(/model\/(\d+)\//)?.[1],
|
||||
url: model.url,
|
||||
avatar: model.thumb,
|
||||
};
|
||||
|
||||
return curatedModel;
|
||||
if (data) {
|
||||
return scrapeScene(data, entity);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
/**
 * Build a profile object from a MariskaX model record.
 *
 * The record's keys are passed through `slugify(key, '_')` before the bio
 * fields are read (presumably so differently cased or spaced keys such as
 * "Hair Color" end up as `hair_color` — confirm against utils/slugify).
 *
 * @param {object} data - model record; `id`, `slug` and `thumb` are read directly, bio fields via the normalized keys
 * @param {{ origin?: string }} [entity] - optional entity; when it has an `origin`, the profile URL is built on it instead of the default tour site
 * @returns {object} profile with url, entryId, gender, dateOfBirth, age, placeOfBirth, measurements, height, weight, hairColor, eyes, avatar and socials
 */
function scrapeProfile(data, entity) {
	const profile = {};

	const bio = Object.fromEntries(Object.entries(data).map(([key, value]) => [slugify(key, '_'), value]));
	const origin = entity?.origin || 'https://tour.mariskax.com';

	// without a slug there is no valid profile page; avoid emitting '.../models/undefined'
	profile.url = data.slug ? `${origin}/models/${data.slug}` : null;
	profile.entryId = data.id;

	profile.gender = bio.gender;

	profile.dateOfBirth = bio.birthdate;
	profile.age = bio.age;
	profile.placeOfBirth = bio.born;

	profile.measurements = bio.measurements;
	// NOTE(review): unit handling is delegated to utils/convert; source units are not visible here
	profile.height = convert(bio.height, 'cm');
	profile.weight = convert(bio.weight, 'lb', 'kg');

	profile.hairColor = bio.hair_color;
	profile.eyes = bio.eye_color;

	profile.avatar = data.thumb;
	// the handle may be stored with a leading '@'; filter(Boolean) drops the entry when no handle is present
	profile.socials = [bio.x && `https://x.com/${bio.x.replace('@', '')}`].filter(Boolean);

	return profile;
}
|
||||
|
||||
/**
 * Resolve the profile URL for an actor on the given entity's site.
 *
 * An actor that already carries a URL is returned as-is without a request.
 * Otherwise the site's search endpoint is queried, and the result whose
 * slugified name equals `actor.slug` is used; that search result is returned
 * alongside the URL as `model`.
 *
 * @param {{ url?: string, name?: string, slug?: string }} actor
 * @param {{ origin: string }} entity
 * @returns {Promise<{ url: ?string, model?: ?object }>} always an object, so the
 *   result can be destructured; `url` and `model` are null when nothing matched
 */
async function getActorUrl(actor, entity) {
	if (actor.url) {
		return { url: actor.url };
	}

	// names contain spaces and may contain characters that are significant in a URL path
	const res = await unprint.post(`${entity.origin}/api/search/${encodeURIComponent(actor.name)}`);

	if (res.ok) {
		// tolerate a response without a models list instead of throwing
		const model = res.data?.models?.find((result) => slugify(result.name) === actor.slug);

		if (model?.slug) {
			return {
				url: `${entity.origin}/models/${model.slug}`,
				model,
			};
		}
	}

	// not null: the caller destructures this result, which would throw on null
	return { url: null, model: null };
}
|
||||
|
||||
async function fetchProfile(actor, entity) {
|
||||
const { url, model } = await getActorUrl(actor, entity);
|
||||
|
||||
if (model) {
|
||||
// search data already contains everything except for age, but DOB is included
|
||||
return scrapeProfile(model);
|
||||
}
|
||||
|
||||
if (url) {
|
||||
const res = await unprint.get(url, {
|
||||
parser: {
|
||||
runScripts: 'dangerously',
|
||||
},
|
||||
});
|
||||
|
||||
if (res.ok) {
|
||||
const data = res.context.query.json('#__NEXT_DATA__')?.props.pageProps.model;
|
||||
|
||||
if (data) {
|
||||
return scrapeProfile(data);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
}
|
||||
|
||||
return null;
|
||||
|
|
@ -89,5 +166,6 @@ async function fetchProfile(actor) {
|
|||
module.exports = {
|
||||
fetchLatest,
|
||||
scrapeScene,
|
||||
fetchScene,
|
||||
fetchProfile,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -141,7 +141,7 @@ async function fetchScene(url, _channel) {
|
|||
async function scrapeProfile({ query }) {
|
||||
const profile = {};
|
||||
|
||||
profile.description = query.content('.bio_about_text');
|
||||
profile.description = query.content('.bio_about_text, .performer-description');
|
||||
profile.avatar = query.img('img.performer-pic, img.performer-img, img.peformer-img'); // sic peformer
|
||||
|
||||
return profile;
|
||||
|
|
@ -151,12 +151,9 @@ async function fetchProfile({ slug }, { channel }) {
|
|||
const url = unprint.prefixUrl(`/pornstar/${slug}`, channel.url);
|
||||
|
||||
const res = await unprint.browserRequest(url, {
|
||||
browser: {
|
||||
headless: false,
|
||||
},
|
||||
select: '.bio-info, .performer-details',
|
||||
async control(ctx) {
|
||||
await ctx.locator('.bio-info').hover({ trial: true, timeout: 30000 }); // wait for bio to initialize
|
||||
await ctx.locator('.bio-info, .performer-details').hover({ trial: true, timeout: 30000 }); // wait for bio to initialize
|
||||
},
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const test = require('node:test');
|
||||
const assert = require('node:assert/strict');
|
||||
const unprint = require('unprint');
|
||||
|
|
@ -15,6 +16,19 @@ const knex = require('../src/knex');
|
|||
|
||||
unprint.options({
|
||||
logErrors: false,
|
||||
timeout: argv.requestTimeout,
|
||||
userAgent: 'traxxx',
|
||||
browserUserAgent: 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36',
|
||||
apiUserAgent: 'traxxx',
|
||||
limits: {
|
||||
...config.limits,
|
||||
default: {
|
||||
interval: argv.interval || config.limits.default.interval,
|
||||
concurrency: argv.concurrency || config.limits.default.concurrency,
|
||||
},
|
||||
browser: config.limits.browser,
|
||||
},
|
||||
proxy: config.proxy,
|
||||
});
|
||||
|
||||
const actors = [
|
||||
|
|
@ -188,6 +202,9 @@ const actors = [
|
|||
{ entity: 'pierrewoodman', name: 'Abby Lee Brazil', fields: ['avatar', 'nationality'] },
|
||||
{ entity: 'dorcelclub', name: 'Clea Gaultier', fields: ['avatar'] },
|
||||
{ entity: 'hitzefrei', name: 'Jolee Love', fields: ['avatar', 'dateOfBirth', 'birthPlace', 'measurements', 'height', 'weight', 'eyes', 'hair', 'description'] },
|
||||
{ entity: 'naughtyamerica', name: 'Nicole Aniston', fields: ['avatar', 'description'] },
|
||||
{ entity: 'tonightsgirlfriend', name: 'Abella Danger', fields: ['avatar'] },
|
||||
{ entity: 'mariskax', name: 'Honey Demon', fields: ['avatar', 'gender', 'dateOfBirth', 'placeOfBirth', 'measurements', 'height', 'weight', 'hairColor', 'eyes'] },
|
||||
];
|
||||
|
||||
const actorScrapers = scrapers.actors;
|
||||
|
|
|
|||
Loading…
Reference in New Issue