traxxx/src/scrapers/mikeadriano.js

212 lines
5.6 KiB
JavaScript
Executable File

'use strict';
const unprint = require('unprint');
const http = require('../utils/http');
const { convert } = require('../utils/convert');
/**
 * Scrapes the scene tiles of an overview page into release objects.
 *
 * @param {Array<{ query: object }>} scenes - One query context per scene tile.
 * @param {{ url: string }} channel - Channel whose `url` is passed as the origin for scene links.
 * @returns {object[]} One release per tile, in the order the tiles were given.
 */
function scrapeAll(scenes, channel) {
  return scenes.map(({ query }) => {
    const release = {};

    release.title = query.content('h3.title a, .content-title-wrap a');
    release.url = query.url('h3.title a, h1.title a, .content-title-wrap a', { origin: channel.url });

    // a tile without a link has no URL to parse; leave the entry ID unset instead of throwing in `new URL()`
    const pathname = release.url ? new URL(release.url).pathname : null;
    release.entryId = pathname?.match(/\/scenes\/([\w-]+)/)?.[1];

    release.description = query.content('.desc, .content-description');
    release.date = query.date('.date, time, .hide', 'Do MMM YYYY', { match: null });
    release.actors = query.contents('h4.models a, .content-models a');
    release.duration = query.duration('//span[contains(@class, "total-time") and text()[contains(., ":")]]'); // total-time is also used for photo counts on True Anal

    const [poster, ...primaryPhotos] = query.imgs('a img') || [];

    // thumbnails are exposed as CSS background images; drop styles that carry no url(...)
    const secondaryPhotos = (query.styles('.thumb-top, .thumb-bottom, .thumb-mouseover', { styleAttribute: 'background-image' }) || [])
      .map((style) => style.match(/url\((.*)\)/)?.[1])
      .filter(Boolean);

    // try the image without its `-c<width>x<height>` suffix first, then the image as served;
    // a tile without any image gets an empty fallback list rather than crashing the whole page
    release.poster = poster
      ? [poster.replace(/-c\d+x\d+/, ''), poster]
      : [];

    release.photos = [...primaryPhotos, ...secondaryPhotos];

    return release;
  });
}
/**
 * Scrapes a scene page into a release object.
 *
 * Values are read from the `content` entry of the JSON in `#__NEXT_DATA__` when it is
 * present, falling back to the rendered markup for the fields that have a selector.
 *
 * @param {{ query: object }} context - Query context of the scene page.
 * @param {string} url - Absolute scene URL; its path supplies the entry ID when the data has no slug.
 * @param {{ url: string }} channel - Channel whose `url` prefixes actor profile links.
 * @returns {Promise<object>} The scraped release.
 */
async function scrapeScene({ query }, url, channel) {
  const release = {};
  const { pathname } = new URL(url);
  const data = query.json('#__NEXT_DATA__')?.props?.pageProps?.content;

  release.entryId = data?.slug || pathname.match(/\/scenes\/([\w-]+)/)?.[1];

  release.title = data?.title || query.content('.content-page-info .title');
  release.description = data?.description || query.content('.content-page-info .desc');

  release.date = data?.formatted_date
    ? unprint.extractDate(data.formatted_date, 'Do MMM YYYY', { match: null })
    : query.date('.content-page-info .date, .content-page-info .hide, .post-date', 'Do MMM YYYY', { match: null });

  release.actors = data?.models_thumbs?.map((actor) => ({
    name: actor.name,
    url: actor.slug && `${channel.url}/models/${actor.slug}`,
    avatar: actor.thumb,
  }))
    // NOTE(review): assumes unprint exposes its element-level helpers as
    // `unprint.query.content` / `unprint.query.url` — confirm against the installed version
    || query.elements('.content-page-info .models a').map((actorEl) => ({
      name: unprint.query.content(actorEl),
      url: unprint.query.url(actorEl, null),
    }));

  release.duration = data?.seconds_duration || query.duration('.content-page-info .total-time:last-child');

  // poster candidates in order of preference; the first extra thumbnail is the last resort
  release.poster = [data?.trailer_screencap, data?.thumb, data?.extra_thumbnails?.[0]].filter(Boolean);
  release.photos = data?.extra_thumbnails?.slice(1); // first photo is poster

  release.trailer = data?.trailer_url || null;
  release.caps = data?.thumbs;

  release.tags = data?.tags;
  release.qualities = data?.videos && Object.values(data.videos).map((video) => video.height);

  return release;
}
/**
 * Fetches an overview page, either through `unprint.get` or, when
 * `parameters.useBrowser` is set, through `http.get` with the shared browser bypass.
 *
 * @param {string} url - Page to fetch.
 * @param {{ useBrowser?: boolean }} parameters - Scraper parameters.
 * @returns {Promise<object>} The `unprint.get` result, or `{ ok, status[, context] }` for the browser path.
 */
async function fetchLatestContent(url, parameters) {
  if (!parameters.useBrowser) {
    return unprint.get(url);
  }

  const response = await http.get(url, {
    bypassBrowser: 'shared',
    bypass: {
      evaluate: async () => {
        // images lazy loaded by JS, gradually scroll through page
        const items = Array.from(document.querySelectorAll('.content-item '));

        for (const item of items) {
          item.scrollIntoView();

          await new Promise((resolve) => {
            setTimeout(resolve, 20);
          });
        }
      },
    },
  });

  const status = response.statusCode;

  if (status === 200) {
    return {
      ok: true,
      status,
      context: unprint.init(response.body),
    };
  }

  return {
    ok: false,
    status,
  };
}
/**
 * Fetches one page of the channel's scene overview and scrapes its tiles.
 *
 * @param {{ url: string }} channel - Channel to fetch scenes for.
 * @param {number} [page=1] - Overview page number.
 * @param {{ parameters: object }} options - Carries the scraper parameters passed on to the fetcher.
 * @returns {Promise<object[]|number>} Scraped releases, or the response status when the fetch failed.
 * @throws {Error} When the page links to stackpath.com, i.e. the request was blocked.
 */
async function fetchLatest(channel, page = 1, { parameters }) {
  const res = await fetchLatestContent(`${channel.url}/scenes?page=${page}`, parameters);

  if (!res.ok) {
    return res.status;
  }

  const { query } = res.context;

  if (query.exists('a[href*="stackpath.com"]')) {
    throw new Error('URL blocked by StackPath');
  }

  const sceneEls = query.all('.content-item-large, .content-item, .content-border');

  return scrapeAll(unprint.initAll(sceneEls), channel);
}
/**
 * Fetches a scene page within a fresh cookie-backed session and scrapes it.
 *
 * @param {string} url - Scene URL.
 * @param {{ url: string }} channel - Channel the scene belongs to, passed on to the scraper.
 * @returns {Promise<object|number>} The scraped release, or the response status when the fetch failed.
 * @throws {Error} When the page links to stackpath.com, i.e. the request was blocked.
 */
async function fetchScene(url, channel) {
  const session = http.session({ cookieJar: http.cookieJar() });
  const res = await http.get(url, { session });

  if (!res.ok) {
    return res.status;
  }

  const context = unprint.init(res.body);
  const isBlocked = context.query.exists('a[href*="stackpath.com"]');

  if (isBlocked) {
    throw new Error('URL blocked by StackPath');
  }

  return scrapeScene(context, url, channel);
}
/**
 * Maps a model record from the page data onto a profile object.
 *
 * @param {object} data - Model record; key casing is unreliable, so keys are matched case-insensitively.
 * @returns {Promise<object>} The scraped profile. `hasTattoos` / `hasPiercings` are only set when the tags mention them.
 */
async function scrapeProfile(data) {
  const profile = {};

  // unreliable key case, lowercase all
  const bio = Object.fromEntries(Object.entries(data).map(([key, value]) => [key.toLowerCase(), value]));

  profile.entryId = bio.id;

  profile.gender = bio.gender;
  profile.description = bio.bio;

  profile.birthPlace = bio.born;
  profile.dateOfBirth = unprint.extractDate(bio.birthdate, 'YYYY-MM-DD');
  profile.age = bio.age;

  profile.measurements = bio.measurements;
  profile.height = convert(bio.height, 'cm');
  profile.weight = convert(bio.weight, 'lb', 'kg');

  profile.eyes = bio.eyes;
  profile.hairColor = bio.hair;

  profile.avatar = bio.thumb;

  // tags may be a comma-separated string or already a list; normalise spacing and case
  // so that e.g. "Tattoos, piercing" is still recognised
  let rawTags = [];

  if (Array.isArray(bio.tags)) {
    rawTags = bio.tags;
  } else if (typeof bio.tags === 'string') {
    rawTags = bio.tags.split(',');
  }

  const tags = new Set(rawTags.map((tag) => String(tag).trim().toLowerCase()));

  if (tags.has('tattoos')) profile.hasTattoos = true;
  if (tags.has('piercing')) profile.hasPiercings = true;

  return profile;
}
/**
 * Fetches an actor's model page and scrapes the profile from its `#__NEXT_DATA__` JSON.
 *
 * @param {{ slug: string }} actor - Actor whose slug forms the model page URL.
 * @param {{ channel: { url: string } }} context - Scrape context providing the channel URL.
 * @returns {Promise<object|number|null>} The profile, `null` when the page carries no model data,
 *   or the response status when the fetch failed.
 */
async function fetchProfile(actor, context) {
  const url = `${context.channel.url}/models/${actor.slug}`;

  const res = await unprint.get(url, {
    parser: {
      // NOTE(review): presumably makes the parser execute the page's own scripts; the data below
      // is read as JSON from a script element, so confirm this is still required
      runScripts: 'dangerously',
    },
  });

  if (!res.ok) {
    return res.status;
  }

  // the payload may be missing or incomplete; treat that as "no profile" rather than throwing
  const model = res.context.query.json('#__NEXT_DATA__')?.props?.pageProps?.model;

  if (!model) {
    return null;
  }

  return scrapeProfile(model);
}
// Public interface of this scraper: only the fetch* entry points are exported,
// the scrape* helpers and fetchLatestContent stay private to the module.
module.exports = {
fetchLatest,
fetchProfile,
fetchScene,
};