// traxxx/src/scrapers/bangbros.js
// (file-viewer header: 199 lines, 6.2 KiB, JavaScript)

'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const slugify = require('../utils/slugify');
const { ex } = require('../utils/q');
/**
 * Extract every scene thumbnail (`.echThumb`) from a listing page.
 *
 * @param {string} html - Markup of the listing page.
 * @param {Object} site - Site record; copied onto each release unchanged.
 * @returns {Object[]} One release object per thumbnail found in the markup.
 */
function scrape(html, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  return $('.echThumb').toArray().map((thumbEl) => {
    const $thumb = $(thumbEl);
    const $link = $thumb.find('.thmb_lnk');

    const title = $link.attr('title');
    const url = `https://bangbros.com${$link.attr('href')}`;

    // The shoot ID is the second dash-separated part of the link's id attribute, when present.
    const linkId = $link.attr('id');
    const shootId = linkId && linkId.split('-')[1];

    // First path segment of the URL minus its first five characters
    // (presumably a fixed prefix in front of the numeric ID; confirm against live URLs).
    const entryId = url.split('/')[3].slice(5);

    const dateText = $thumb.find('.thmb_mr_2 span.faTxt').text();
    const date = moment.utc(dateText, 'MMM D, YYYY').toDate();

    const actors = $thumb
      .find('.cast-wrapper a.cast')
      .toArray()
      .map((castEl) => $(castEl).text().trim());

    const $rollover = $thumb.find('.rollover-image');
    const poster = `https:${$rollover.attr('data-original')}`;

    // Rollover photos are numbered big1.jpg .. big<max-index>.jpg under a shared base URL.
    const rolloverBase = $rollover.attr('data-rollover-url');
    const rolloverCount = $rollover.attr('data-rollover-max-index');
    const photos = Array.from(
      { length: rolloverCount },
      (unused, photoIndex) => `https:${rolloverBase}big${photoIndex + 1}.jpg`,
    );

    // The thumbnail's time label is prefixed with `0:` before being parsed as a duration.
    const runtime = $thumb.find('.thmb_pic b.tTm').text();
    const duration = moment.duration(`0:${runtime}`).asSeconds();

    // The channel is the last path segment of the thumbnail's "/websites" link.
    const channelHref = $thumb.find('a[href*="/websites"]').attr('href');
    const channel = channelHref.split('/').pop();

    return {
      url,
      entryId,
      shootId,
      title,
      actors,
      date,
      duration,
      poster,
      photos,
      rating: null,
      site,
      channel,
    };
  });
}
/* disabled: upcoming scenes have no dates available, which breaks the database
function scrapeUpcoming(html, site) {
const { document } = ex(html);
return ctxa(document, 'a[id*="upcoming-videos"]').map(({ element, q }) => {
const release = {};
[release.shootId] = element.id.split('-').slice(-1);
const siteCode = release.shootId.match(/[a-z]+/)[0];
if (siteCode !== site.parameters.code) {
return null;
}
const posterEl = q('img');
[release.entryId] = element.href.split('/')[1].match(/\d+/);
release.url = `https://bangbros.com${element.href}`;
release.title = posterEl.alt;
release.poster = `https:${posterEl.src}`;
release.actors = q('.castName', true).split(/ in/g).slice(0, -1).map(actorName => actorName.trim());
console.log(release);
return release;
}).filter(Boolean);
}
*/
/**
 * Extract a single release from a scene page.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - URL the page was fetched from; also the source of `entryId`.
 * @param {Object} site - Site record; copied onto the release unchanged.
 * @returns {Object} The scraped release.
 */
function scrapeScene(html, url, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const $player = $('.playerSection');

  const releaseText = $player.find('.vdoCast:contains("Release")').text();
  const shootId = releaseText.replace('Release: ', '');

  // First path segment of the URL minus its first five characters.
  const entryId = url.split('/')[3].slice(5);

  const title = $player.find('.ps-vdoHdd h1').text();
  const description = $player.find('.vdoDesc').text().trim();

  // The first cast link names the site; every link after it is an actor.
  const castNames = $player
    .find('.vdoCast a')
    .toArray()
    .map((castEl) => $(castEl).text());
  const siteName = castNames[0];
  const actors = castNames.slice(1);
  const siteSlug = siteName.replace(/[\s']+/g, '').toLowerCase();

  const poster = `https:${$('img#player-overlay-image').attr('src')}`;
  const trailer = `https:${$('source[type="video/mp4"]').attr('src')}`;
  const firstPhotoUrl = `https:${$('img[data-slider-index="1"]').attr('src')}`;

  // Every scene appears to have 12 album photos, even when the page does not list them all,
  // so the remaining URLs are derived from the first photo's URL.
  const photos = Array.from(
    { length: 12 },
    (unused, photoIndex) => firstPhotoUrl.replace(/big\d+/, `big${photoIndex + 1}`),
  );

  const tags = $('.vdoTags a')
    .toArray()
    .map((tagEl) => $(tagEl).text());

  // The "% like" figure divided by 20 (100% would give 5 stars).
  const likeText = $player.find('.bVdPl_it_like .bVdPl_txt').text();
  const stars = Number(likeText.replace('% like', '')) / 20;

  // The page's "bangcasting" name maps to the `bangbroscasting` channel.
  const channel = siteSlug === 'bangcasting' ? 'bangbroscasting' : siteSlug;

  return {
    url,
    shootId,
    entryId,
    title,
    description,
    actors,
    tags,
    poster,
    photos,
    trailer: {
      src: trailer,
    },
    rating: {
      stars,
    },
    site,
    channel,
  };
}
/**
 * Extract an actor profile from a profile page.
 *
 * @param {string} html - Markup of the actor's profile page.
 * @returns {Object} Profile with `releases` (the scenes listed on the page) and,
 *   when a profile picture is present, an absolute `avatar` URL.
 */
function scrapeProfile(html) {
  const { q } = ex(html);

  const avatarPath = q('.profilePic img', 'src');
  // The profile page lists scenes with the same markup as a listing page; no site is passed.
  const releases = scrape(html);

  return avatarPath
    ? { avatar: `https:${avatarPath}`, releases }
    : { releases };
}
/**
 * Find the profile URL for an actor in a search results page.
 *
 * @param {string} html - Markup of the search results page.
 * @param {string} actorName - Actor name, matched exactly against link `title` attributes.
 * @returns {string|null} Absolute profile URL, or `null` when no link carries that title.
 */
function scrapeProfileSearch(html, actorName) {
  const { q } = ex(html);

  // Escape backslashes and double quotes so a name such as `Jane "JD" Doe` cannot
  // terminate the quoted attribute value early and produce a malformed selector.
  const escapedName = String(actorName).replace(/["\\]/g, '\\$&');
  const actorLink = q(`a[title="${escapedName}"]`, 'href');

  return actorLink ? `https://bangbros.com${actorLink}` : null;
}
/**
 * Fetch one page of a site's listing and scrape its scenes.
 *
 * @param {Object} site - Site record; `site.url` is the listing base URL.
 * @param {number} [page=1] - Page number appended to the base URL.
 * @returns {Promise<Object[]>} Releases scraped from that page.
 */
async function fetchLatest(site, page = 1) {
  const pageUrl = `${site.url}/${page}`;
  const { body } = await bhttp.get(pageUrl);

  return scrape(body.toString(), site);
}
/*
async function fetchUpcoming(site) {
const res = await bhttp.get('https://www.bangbros.com');
return scrapeUpcoming(res.body.toString(), site);
}
*/
/**
 * Fetch and scrape a single scene page.
 *
 * Scene pages do not show a release date, so the scene must already be known
 * (with `release.date` set) before it can be fetched here.
 *
 * @param {string} url - Absolute URL of the scene page.
 * @param {Object} site - Site record, passed through to the scraped release.
 * @param {Object} [release] - Previously scraped release; must have a `date`.
 * @returns {Promise<Object>} The release scraped from the scene page.
 * @throws {Error} When `release.date` is missing, or when `url` is not on bangbros.com.
 * @throws {TypeError} When `url` cannot be parsed as a URL.
 */
async function fetchScene(url, site, release) {
  if (!release?.date) {
    throw new Error(`Cannot fetch Bang Bros scenes from argument URL, as scene pages do not have release dates: ${url}`);
  }

  const { origin } = new URL(url);

  // Validate the origin BEFORE issuing the request, so foreign URLs are never fetched.
  // The pattern is anchored and its dots are escaped, so only bangbros.com and
  // www.bangbros.com match (an origin never has a trailing slash).
  if (!/^https?:\/\/(www\.)?bangbros\.com$/.test(origin)) {
    throw new Error('Cannot fetch from this URL. Please find the scene on https://bangbros.com and try again.');
  }

  const res = await bhttp.get(url);

  return scrapeScene(res.body.toString(), url, site);
}
/**
 * Look up an actor through the site search and scrape their profile page.
 *
 * @param {string} actorName - Actor name; slugified for the search URL and matched
 *   against the search results.
 * @returns {Promise<Object|null>} The scraped profile, or `null` when either request
 *   does not return status 200 or the search results contain no matching link.
 */
async function fetchProfile(actorName) {
  const searchUrl = `https://bangbros.com/search/${slugify(actorName)}`;
  const searchRes = await bhttp.get(searchUrl);

  if (searchRes.statusCode !== 200) {
    return null;
  }

  const actorUrl = scrapeProfileSearch(searchRes.body.toString(), actorName);

  if (!actorUrl) {
    return null;
  }

  const actorRes = await bhttp.get(actorUrl);

  if (actorRes.statusCode !== 200) {
    return null;
  }

  return scrapeProfile(actorRes.body.toString());
}
// Functions this scraper module exports: the latest-scenes listing fetcher,
// the single-scene fetcher and the actor-profile fetcher.
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
// fetchUpcoming, no dates available
};