2019-04-12 01:12:53 +00:00
|
|
|
'use strict';
|
|
|
|
|
|
|
|
/* eslint-disable newline-per-chained-call */
|
|
|
|
const cheerio = require('cheerio');
|
|
|
|
const moment = require('moment');
|
|
|
|
|
2020-03-09 23:17:57 +00:00
|
|
|
const logger = require('../logger')(__filename);
|
2020-01-31 20:43:16 +00:00
|
|
|
const slugify = require('../utils/slugify');
|
2020-11-22 23:05:02 +00:00
|
|
|
const http = require('../utils/http');
|
2021-01-17 00:43:55 +00:00
|
|
|
const qu = require('../utils/qu');
|
2020-01-31 20:43:16 +00:00
|
|
|
|
|
|
|
/**
 * Scrape the scene tiles (`.echThumb`) from a listing page.
 *
 * @param {string} html - Markup of a page containing scene tiles.
 * @param {object} site - Entity being scraped; `url`, `parent.url` and `parameters.legacy` are read to build scene URLs.
 * @returns {object[]} One release object per tile.
 */
function scrape(html, site) {
	const $ = cheerio.load(html, { normalizeWhitespace: true });

	return $('.echThumb').toArray().map((element) => {
		const tile = $(element);
		const release = {};

		const sceneLinkElement = tile.find('.thmb_lnk');
		const href = sceneLinkElement.attr('href');
		const linkId = sceneLinkElement.attr('id');

		release.title = sceneLinkElement.attr('title');

		// legacy sites and entities without a parent link from their own URL, everything else from the parent's URL
		release.url = site.parameters?.legacy || !site.parent
			? `${site.url}${href}`
			: `${site.parent.url}${href}`;

		release.shootId = linkId && linkId.split('-')[1];
		release.entryId = new URL(release.url).pathname.match(/video(\d+)/)?.[1];

		release.date = moment.utc(tile.find('.thmb_mr_2 span.faTxt').text(), 'MMM D, YYYY').toDate();
		release.actors = tile.find('.cast-wrapper a.cast').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();

		const photoElement = tile.find('.rollover-image');
		const posterPath = photoElement.attr('data-original');
		const photosUrl = photoElement.attr('data-rollover-url');
		// the attribute is a string (or undefined); anything non-numeric yields no photos
		const photosMaxIndex = Number(photoElement.attr('data-rollover-max-index')) || 0;

		// only build media URLs from attributes that are actually present, to avoid 'https:undefined' sources
		if (posterPath) {
			release.poster = `https:${posterPath}`;
		}

		release.photos = photosUrl
			? Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`)
			: [];

		// the duration text is prefixed with '0:' before being handed to moment
		release.duration = moment.duration(`0:${tile.find('.thmb_pic b.tTm').text()}`).asSeconds();

		// a tile without a website link should not abort the whole listing
		release.channel = tile.find('a[href*="/websites"]').attr('href')?.split('/').slice(-1)[0];

		return release;
	});
}
|
|
|
|
|
2020-05-24 01:54:29 +00:00
|
|
|
/**
 * Scrape the scene entries from a legacy-layout listing page.
 *
 * @param {object[]} scenes - Listing items, each exposing a `query` helper scoped to one scene.
 * @param {object} site - Entity being scraped; `url` is prepended to each scene path.
 * @returns {object[]} One release object per listing item.
 */
function scrapeLegacy(scenes, site) {
	return scenes.map(({ query }) => {
		const release = {};

		const pathname = query.url('.mainplayer a, .palyer a'); // sic

		release.url = `${site.url}${pathname}`;
		release.entryId = pathname.match(/video(\d+)/)?.[1];

		release.title = query.q('h2', true);
		release.date = query.date('div:not(.videoDisc)', 'MMM DD, YYYY', /\w+ \d{1,2}, \d{4}/);
		release.description = query.q('div + .videoDisc p', true);
		release.duration = query.dur('.videoTag .title');

		release.poster = query.img('.mainplayer img, .palyer img'); // sic

		// both lookups go through the scene-scoped query helper; the second reads the 'data-original' attribute
		release.photos = query.imgs('article img')
			.concat(query.imgs('article img', 'data-original'))
			.filter(Boolean);

		return release;
	});
}
|
|
|
|
|
2020-01-31 20:43:16 +00:00
|
|
|
/* no dates available, breaks database
|
|
|
|
function scrapeUpcoming(html, site) {
|
|
|
|
const { document } = ex(html);
|
|
|
|
|
|
|
|
return ctxa(document, 'a[id*="upcoming-videos"]').map(({ element, q }) => {
|
|
|
|
const release = {};
|
|
|
|
[release.shootId] = element.id.split('-').slice(-1);
|
|
|
|
const siteCode = release.shootId.match(/[a-z]+/)[0];
|
|
|
|
|
|
|
|
if (siteCode !== site.parameters.code) {
|
|
|
|
return null;
|
|
|
|
}
|
|
|
|
|
|
|
|
const posterEl = q('img');
|
|
|
|
|
|
|
|
[release.entryId] = element.href.split('/')[1].match(/\d+/);
|
|
|
|
release.url = `https://bangbros.com${element.href}`;
|
|
|
|
release.title = posterEl.alt;
|
|
|
|
release.poster = `https:${posterEl.src}`;
|
|
|
|
|
|
|
|
release.actors = q('.castName', true).split(/ in/g).slice(0, -1).map(actorName => actorName.trim());
|
|
|
|
|
|
|
|
console.log(release);
|
|
|
|
|
|
|
|
return release;
|
|
|
|
}).filter(Boolean);
|
|
|
|
}
|
|
|
|
*/
|
|
|
|
|
2020-02-07 00:06:39 +00:00
|
|
|
/**
 * Scrape a single scene page (current site layout).
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - URL the page was fetched from; the entry ID is taken from its path and its hostname is passed along for the trailer fallback.
 * @param {object} [_site] - Unused; kept so callers can keep passing the entity.
 * @returns {object} The scraped release.
 */
function scrapeScene(html, url, _site) {
	const { query } = qu.ex(html, '.playerSection');
	const release = {};

	const { pathname, hostname } = new URL(url);

	[release.shootId] = query.cnt('.vdoTags + .vdoCast')?.match(/\w+$/) || [];
	release.entryId = pathname.match(/video(\d+)/)?.[1];

	release.title = query.cnt('.ps-vdoHdd h1');
	release.description = query.cnt('.vdoDesc');

	release.actors = query.all('a[href*="/model"]', true);
	release.tags = query.all('.vdoTags a', true);

	// leading number of the like widget divided by 20; skipped when the widget is missing rather than throwing
	const likes = query.q('div[class*="like"]', true)?.match(/^\d+/)?.[0];

	if (likes) {
		release.stars = Number(likes) / 20;
	}

	const poster = query.img('img#player-overlay-image, img.playerPic');

	if (poster) {
		release.poster = [
			poster,
			poster.replace('/big_trailer', '/members/450x340'), // load error fallback
		];
	}

	release.trailer = query.trailer() || qu.prefixUrl(html.match(/'(\/\/trailers.*mp4)'/)?.[1], hostname);

	// all scenes seem to have 12 album photos available, not always included on the page
	const firstPhotoUrl = qu.ex(html).query.img('img[data-slider-index="1"]');

	if (firstPhotoUrl) {
		release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));
	}

	const channel = query.url('a[href*="/websites"]')?.match(/\w+$/)?.[0];

	// map the site's channel slugs onto the differently named channels; a single if/else chain
	// so an alias is not overwritten by the fallthrough assignment
	if (channel === 'bangcasting') {
		release.channel = 'bangbroscasting';
	} else if (channel === 'remaster') {
		release.channel = 'bangbrosremastered';
	} else {
		release.channel = channel;
	}

	return release;
}
|
|
|
|
|
2021-01-17 00:43:55 +00:00
|
|
|
/**
 * Scrape a single scene page of a legacy-layout site.
 *
 * @param {object} item - Page item; only its `query` helper is used.
 * @param {string} url - URL the page was fetched from; the entry ID is taken from its path.
 * @returns {object} The scraped release.
 */
function scrapeSceneLegacy({ query }, url) {
	const release = {};

	// numeric part only, so the ID agrees with the one scrapeLegacy derives from the listing page
	release.entryId = new URL(url).pathname.match(/video(\d+)/)?.[1];

	release.title = query.q('h1', true);
	release.description = query.q('.videoDetail', true);
	release.duration = query.dur('.tags p span');

	release.poster = query.img('#video_container + div img, .videoOverlay img');

	return release;
}
|
|
|
|
|
2020-05-26 02:11:29 +00:00
|
|
|
/**
 * Build an actor profile from a model page.
 *
 * @param {string} html - Markup of the model page.
 * @param {object} scope - Scrape scope; `scope.entity` is handed to `scrape` as the site.
 * @returns {object} Profile with `releases`, plus `avatar` when the page has a profile picture source.
 */
function scrapeProfile(html, scope) {
	const avatarPath = qu.ex(html).query.q('.profilePic img', 'src');

	// the avatar source is prefixed with the protocol; omitted entirely when absent
	const profile = avatarPath ? { avatar: `https:${avatarPath}` } : {};

	// the scene tiles on the page are scraped with the listing scraper
	profile.releases = scrape(html, scope.entity);

	return profile;
}
|
|
|
|
|
|
|
|
/**
 * Find the model page link for an actor in a search results page.
 *
 * @param {string} html - Markup of the search results page.
 * @param {string} actorName - Name to match (case-insensitively) against link titles.
 * @returns {string|null} Absolute model page URL, or null when no link matches.
 */
function scrapeProfileSearch(html, actorName) {
	const { query } = qu.ex(html);

	// escape backslashes and double quotes so the name cannot terminate the quoted attribute value
	const escapedName = String(actorName).replace(/["\\]/g, '\\$&');
	const actorLink = query.url(`a[title="${escapedName}" i][href*="model"]`);

	return actorLink ? `https://bangbros.com${actorLink}` : null;
}
|
|
|
|
|
2019-04-12 01:12:53 +00:00
|
|
|
/**
 * Fetch and scrape one page of latest scenes for a site.
 *
 * @param {object} site - Entity to scrape; `parameters.latest` overrides `url` as the listing base, `parameters.legacy` selects the legacy layout.
 * @param {number} [page=1] - Page number appended to the listing URL.
 * @returns {Promise<object[]|*>} Scraped releases, or the response status when the request was not ok.
 */
async function fetchLatest(site, page = 1) {
	const baseUrl = site.parameters?.latest || site.url;

	if (site.parameters?.legacy) {
		// legacy listings live under /videos and are split into one item per '.videoList' match
		const legacyRes = await qu.getAll(`${baseUrl}/videos/${page}`, '.videoList');

		return legacyRes.ok
			? scrapeLegacy(legacyRes.items, site)
			: legacyRes.status;
	}

	const res = await qu.get(`${baseUrl}/${page}`);

	return res.ok
		? scrape(res.item.html, site)
		: res.status;
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
async function fetchUpcoming(site) {
|
2020-11-22 23:05:02 +00:00
|
|
|
const res = await http.get('https://www.bangbros.com');
|
2020-01-31 20:43:16 +00:00
|
|
|
|
|
|
|
return scrapeUpcoming(res.body.toString(), site);
|
2019-04-12 01:12:53 +00:00
|
|
|
}
|
2020-01-31 20:43:16 +00:00
|
|
|
*/
|
|
|
|
|
|
|
|
/**
 * Fetch and scrape a single scene page.
 *
 * @param {string} url - Scene page URL.
 * @param {object} site - Entity the scene belongs to; `parameters.legacy` selects the legacy scraper.
 * @param {object} [release] - Base release, if any; only used to warn when it has no date.
 * @returns {Promise<object|*>} The scraped release, or the response status when the request was not ok.
 * @throws {Error} When a non-legacy URL is not on bangbros.com or gaywire.com.
 */
async function fetchScene(url, site, release) {
	if (!release?.date) {
		logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`);
	}

	const { origin } = new URL(url);
	const res = await qu.get(url);

	if (!res.ok) {
		return res.status;
	}

	if (site.parameters?.legacy) {
		return scrapeSceneLegacy(res.item, url, site);
	}

	// anchored, with literal dots, so look-alike origins such as 'bangbrosxcom' are rejected
	if (!/^https?:\/\/(www\.)?(bangbros|gaywire)\.com\/?$/.test(origin)) {
		throw new Error('Cannot fetch from this URL. Please find the scene on Bang Bros or Gaywire and try again.');
	}

	return scrapeScene(res.item.html, url, site);
}
|
|
|
|
|
2020-07-20 23:44:51 +00:00
|
|
|
/**
 * Look an actor up through the site search and scrape their model page.
 *
 * @param {object} actor - Actor to look up; only `name` is used.
 * @param {object} scope - Scrape scope, passed through to `scrapeProfile`.
 * @returns {Promise<object|null>} The scraped profile, or null when the search fails, has no matching link, or the model page fails to load.
 */
async function fetchProfile({ name: actorName }, scope) {
	const searchRes = await http.get(`https://bangbros.com/search/${slugify(actorName)}`);

	if (searchRes.statusCode !== 200) {
		return null;
	}

	const actorUrl = scrapeProfileSearch(searchRes.body.toString(), actorName);

	if (!actorUrl) {
		return null;
	}

	const actorRes = await http.get(actorUrl);

	if (actorRes.statusCode !== 200) {
		return null;
	}

	return scrapeProfile(actorRes.body.toString(), scope);
}
|
|
|
|
|
2019-04-12 01:12:53 +00:00
|
|
|
module.exports = {
|
2020-05-14 02:26:05 +00:00
|
|
|
fetchLatest,
|
|
|
|
fetchScene,
|
|
|
|
fetchProfile,
|
|
|
|
// fetchUpcoming, no dates available
|
2019-04-12 01:12:53 +00:00
|
|
|
};
|