traxxx/src/scrapers/bradmontana.js

159 lines
3.8 KiB
JavaScript
Executable File

'use strict';
const unprint = require('unprint');
const slugify = require('../utils/slugify');
/**
 * Infers a performer's gender from the path of their profile URL.
 *
 * A path containing `atores` maps to 'male' and one containing `atrizes`
 * maps to 'female'; `atores` is checked first.
 *
 * @param {string|null|undefined} url - Absolute profile URL.
 * @returns {'male'|'female'|null} The inferred gender, or null when the path
 * matches neither section or the URL is missing or cannot be parsed.
 */
function genderFromUrl(url) {
	let pathname;

	try {
		({ pathname } = new URL(url));
	} catch (error) {
		// A missing or malformed link simply carries no gender hint; throwing
		// here would abort the scrape of everything else on the page.
		return null;
	}

	if (/atores/.test(pathname)) {
		return 'male';
	}

	if (/atrizes/.test(pathname)) {
		return 'female';
	}

	return null;
}
/**
 * Converts the scene tiles of a listing page into basic release objects.
 *
 * Each release gets the tile's URL, an entry ID taken from the `/videos/<id>`
 * path segment, and a title read from the image's `title` attribute with the
 * `.font-semibold` text as fallback. When the tile has an image whose source
 * contains `/uploads`, it is used as poster (first with the `-<w>x<h>` size
 * suffix removed, then as-is), and a `/uploads/<yyyy>/<mm>` path in it yields
 * a month-precision date on the first of that month in local time.
 *
 * @param {Array<{query: object}>} scenes - Tile contexts exposing a `query` helper.
 * @returns {object[]} One release per tile.
 * @throws {TypeError} If a tile's URL cannot be parsed or has no `/videos/<id>` path.
 */
function scrapeAll(scenes) {
	const entryIdPattern = /\/videos\/([\w-]+)/;
	const uploadPathPattern = /\/uploads\/(\d{4})\/(\d{2})/;

	return scenes.map(({ query }) => {
		const sceneUrl = query.url(null);
		const scenePath = new URL(sceneUrl).pathname;

		const release = {
			url: sceneUrl,
			entryId: scenePath.match(entryIdPattern)[1],
			title: query.attribute('img', 'title') || query.content('.font-semibold'),
		};

		const thumbnail = query.img('img[src*="/uploads"]');

		if (!thumbnail) {
			return release;
		}

		// full-size candidate first, the thumbnail itself as fallback
		release.poster = [thumbnail.replace(/-\d+x\d+/, ''), thumbnail];

		const uploadPath = thumbnail.match(uploadPathPattern);

		if (uploadPath) {
			const [, year, month] = uploadPath;

			// only year and month are known from the upload directory
			release.date = new Date(year, month - 1, 1);
			release.datePrecision = 'month';
		}

		return release;
	});
}
/**
 * Builds a release from a scene page.
 *
 * Data is read from the page markup and from the `@graph` array of the JSON
 * found under `.yoast-schema-graph`.
 *
 * Title sources, in order: the `.w-screen + div .font-semibold` text, the name
 * of the last breadcrumb item, and the WebPage name with everything from its
 * last hyphen onwards removed. A WebPage name without a hyphen is used whole.
 *
 * Poster sources, in order: the ImageObject URL, the og:image meta value, and
 * a `poster: '<url>.jpg'` assignment found in the raw HTML.
 *
 * The date is the WebPage `datePublished`; without it, a `/uploads/<yyyy>/<mm>`
 * path in the poster URL yields a month-precision date in local time.
 *
 * @param {{query: object, html: string}} context - Page query helper and raw HTML.
 * @param {{url: string, entity: {url: string}}} options - Scene URL and the
 * entity whose URL is passed as `origin` when querying the trailer.
 * @returns {object} The scraped release.
 * @throws {TypeError} If `url` cannot be parsed or has no `/videos/<id>` path.
 */
function scrapeScene({ query, html }, { url, entity }) {
	const release = {};

	const data = query.json('.yoast-schema-graph')?.['@graph'];
	const pageData = data?.find((item) => item['@type'] === 'WebPage');
	const imageData = data?.find((item) => item['@type'] === 'ImageObject');
	const breadcrumbData = data?.find((item) => item['@type'] === 'BreadcrumbList');

	release.entryId = new URL(url).pathname.match(/\/videos\/([\w-]+)/)[1];

	let pageTitle;

	if (typeof pageData?.name === 'string') {
		const separatorIndex = pageData.name.lastIndexOf('-');

		// Without a hyphen lastIndexOf is -1, and slice(0, -1) would silently
		// drop the last character of the title, so keep the name whole instead.
		pageTitle = (separatorIndex === -1 ? pageData.name : pageData.name.slice(0, separatorIndex)).trim();
	}

	// Every step of the breadcrumb lookup is optional so that a partial schema
	// falls through to the next title source instead of throwing.
	release.title = query.content('.w-screen + div .font-semibold')
		|| breadcrumbData?.itemListElement?.slice(-1)[0]?.item?.name
		|| pageTitle;

	release.description = query.content('.leading-relaxed');
	release.date = pageData?.datePublished && new Date(pageData.datePublished);

	release.actors = query.elements('.models-slider-single a').map((el) => {
		const actorUrl = unprint.query.url(el, null);
		const avatarUrl = unprint.query.img(el);

		return {
			name: unprint.query.content(el),
			url: actorUrl,
			// full-size candidate first, the linked image itself as fallback
			avatar: [
				avatarUrl?.replace(/-\d+x\d+/, ''),
				avatarUrl,
			],
			// a link without a usable URL gives no gender hint and must not abort the scene
			gender: actorUrl ? genderFromUrl(actorUrl) : null,
		};
	});

	release.poster = imageData?.url
		|| query.meta('property="og:image"')
		|| html.match(/poster: '(http.*\.jpg)'/)?.[1];

	release.photos = query.imgs('.gallery img');
	release.trailer = query.video('source', 'src', { origin: entity.url });

	if (!release.date && release.poster) {
		const match = release.poster.match(/\/uploads\/(\d{4})\/(\d{2})/);

		if (match) {
			// only year and month are known from the upload directory
			release.date = new Date(match[1], match[2] - 1, 1);
			release.datePrecision = 'month';
		}
	}

	return release;
}
/**
 * Builds a performer profile from a profile page.
 *
 * The gender is derived from the profile URL. When the page has JSON under
 * `.yoast-schema-graph`, the avatar is the URL of the first ImageObject in its
 * `@graph` array (undefined if there is none).
 *
 * @param {{query: object}} context - Page query helper.
 * @param {object} entity - Unused here.
 * @param {string} url - URL the profile was fetched from.
 * @returns {{url: string, gender: string|null, avatar?: string}} The profile.
 */
function scrapeProfile({ query }, entity, url) {
	const schema = query.json('.yoast-schema-graph');
	const profile = { url, gender: genderFromUrl(url) };

	if (!schema) {
		return profile;
	}

	const image = schema['@graph']?.find((item) => item['@type'] === 'ImageObject');

	profile.avatar = image?.url;

	return profile;
}
/**
 * Fetches one page of a channel's video listing and scrapes its scene tiles.
 *
 * @param {{url: string}} channel - Channel whose `url` is the site base URL.
 * @param {number} [page=1] - Listing page number.
 * @returns {Promise<object[]|number>} The scraped releases, or the response
 * status when the request was not successful.
 */
async function fetchLatest(channel, page = 1) {
	const listingUrl = `${channel.url}/videos/page/${page}`;

	// every direct child link of the grid that points at a video is one tile
	const res = await unprint.get(listingUrl, { selectAll: '.grid > a[href*="/videos"]' });

	if (!res.ok) {
		return res.status;
	}

	return scrapeAll(res.context, channel);
}
/**
 * Requests a performer's profile page, trying several candidate URLs.
 *
 * An explicit actor URL is tried first. Without one, the URL is guessed as
 * `<entity.url>/<section>/<slugified name>`, where the section is `atores`
 * for male performers and on the second attempt, and `atrizes` otherwise.
 *
 * @param {{name: string, gender?: string, url?: string}} actor - Performer to look up.
 * @param {{url: string}} entity - Entity whose `url` is the site base URL.
 * @param {boolean} secondAttempt - Whether this is the final guessed attempt.
 * @returns {Promise<{res: object, url: string}|number>} The successful response
 * with the URL that produced it, or the status of the last failed request.
 */
async function fetchProfilePage({ name, gender, url: actorUrl }, entity, secondAttempt) {
	let url = actorUrl;

	if (!url) {
		// NOTE: for male performers both guessed attempts resolve to the same `atores` URL
		const section = (gender === 'male' || secondAttempt) ? 'atores' : 'atrizes';

		url = `${entity.url}/${section}/${slugify(name, '-')}`;
	}

	const res = await unprint.get(url);

	if (res.ok) {
		return { res, url };
	}

	const wasGuessed = !actorUrl;

	if (wasGuessed && secondAttempt) {
		// both guesses failed, nothing left to try
		return res.status;
	}

	// A failed explicit URL restarts with a first guess (not counted as second
	// attempt); a failed first guess moves on to the second attempt.
	return fetchProfilePage({ name, gender }, entity, wasGuessed);
}
/**
 * Fetches and scrapes a performer's profile.
 *
 * @param {{name: string, gender?: string, url?: string}} baseActor - Performer to look up.
 * @param {{url: string}} entity - Entity the performer is looked up on.
 * @param {object} [options] - Forwarded to scrapeProfile.
 * @returns {Promise<object|number>} The scraped profile, or the status of the
 * failed request when no profile page could be fetched.
 */
async function fetchProfile(baseActor, entity, options) {
	const result = await fetchProfilePage(baseActor, entity, false);

	// fetchProfilePage resolves to a bare status code rather than a
	// { res, url } pair once every candidate URL has failed; destructuring
	// that would leave `res` undefined and throw on `res.ok`.
	if (!result?.res) {
		return result;
	}

	const { res, url } = result;

	if (res.ok) {
		return scrapeProfile(res.context, entity, url, options);
	}

	return res.status;
}
// Scraper interface exposed by this module: latest-release listing, performer
// profile lookup, and the scene page scraper.
module.exports = { fetchLatest, fetchProfile, scrapeScene };