From 80e5d7828aef5c247fecb0327839f343937e9a18 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Thu, 22 Aug 2024 02:07:39 +0200 Subject: [PATCH] Retired unused Bang Bros scraper (now part of Aylo). --- src/scrapers/bang-legacy.js | 429 ------------------------------------ src/scrapers/bangbros.js | 355 ----------------------------- src/scrapers/scrapers.js | 3 +- 3 files changed, 1 insertion(+), 786 deletions(-) delete mode 100755 src/scrapers/bang-legacy.js delete mode 100755 src/scrapers/bangbros.js diff --git a/src/scrapers/bang-legacy.js b/src/scrapers/bang-legacy.js deleted file mode 100755 index 47f1dff3..00000000 --- a/src/scrapers/bang-legacy.js +++ /dev/null @@ -1,429 +0,0 @@ -'use strict'; - -const http = require('../utils/http'); -const qu = require('../utils/qu'); -const { extractDate } = require('../utils/qu'); -const { inchesToCm } = require('../utils/convert'); -const slugify = require('../utils/slugify'); -const capitalize = require('../utils/capitalize'); - -const clusterId = '617fb597b659459bafe6472470d9073a'; -const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI='; - -const genderMap = { - M: 'male', - F: 'female', -}; - -function getScreenUrl(item, scene) { - if (!scene.dvd?.id || !item?.screenId) { - return null; - } - - return `https://i.bang.com/screenshots/${scene.dvd.id}/${scene.type}/${scene.order}/${item.screenId}.jpg`; -} - -function encodeId(id) { - return Buffer - .from(id, 'hex') - .toString('base64') - .replace(/\+/g, '-') - .replace(/\//g, '_') - .replace(/=/g, ','); -} - -function decodeId(id) { - const restoredId = id - .replace(/-/g, '+') - .replace(/_/g, '/') - .replace(/,/g, '='); - - return Buffer - .from(restoredId, 'base64') - .toString('hex'); -} - -async function fetchPhotos(scene) { - const photoPaths = Array.from({ length: scene.photos }, (value, index) => `/${scene.dvd.id}/${scene.identifier}/final/${String(index + 1).padStart(6, '0')}.jpg`); - - const res = await http.post('https://www.bang.com/sign-images', { - images: photoPaths, - }, { - encodeJSON: false, - }); - - if (res.ok && res.body.images) { - return res.body.images.map((image) => qu.prefixUrl(image, 'https://photos.bang.com')); - } - - return null; -} - -async function scrapeScene(scene, entity, options) { - const release = { - entryId: scene.id, - title: scene.name || (scene.dvd?.name && scene.type === 'bonus' && capitalize(`${scene.dvd.name} - Bonus Scene ${scene.order || 1}`)) || null, - description: scene.description, - tags: scene.genres.concat(scene.actions).map((genre) => genre.name), - duration: scene.duration, - }; - - const slug = slugify(release.title); - release.url = `https://www.bang.com/video/${encodeId(release.entryId)}/${slug}`; - - const date = new Date(scene.releaseDate); - release.date = new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate())); - - release.actors = scene.actors.map((actor) => ({ name: actor.name, gender: genderMap[actor.gender] })); - - if (scene.is4k) release.tags.push('4k'); - if (scene.gay) release.tags.push('gay'); - - const defaultPoster = scene.screenshots.find((photo) => photo.default === true); - const screens = scene.screenshots.filter((photo) => photo.default === false); - - const remainingScreens = defaultPoster ? screens : screens.slice(1); - const poster = defaultPoster || screens[0]; - - release.poster = getScreenUrl(poster, scene); - release.photos = remainingScreens.map((photo) => getScreenUrl(photo, scene)); - - if (options?.includePhotos) { - const photos = await fetchPhotos(scene); - - if (photos?.length > 0) { - release.photos = photos; - } - } - - release.teaser = `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`; - - release.channel = scene.series.name - .replace(/[! .]/g, '') - .replace('&', 'and'); - - return release; -} - -function scrapeAll(scenes, entity) { - return Promise.all(scenes.map(({ _source: scene }) => scrapeScene(scene, entity))); -} - -async function fetchActorReleases(actor, entity) { - const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, { - size: 50, - query: { - bool: { - must: [ - { - match: { - status: 'ok', - }, - }, - { - nested: { - path: 'actors', - query: { - bool: { - must: [ - { - match: { - 'actors.mongoId': { - operator: 'AND', - query: actor.id, - }, - }, - }, - ], - }, - }, - }, - }, - ], - must_not: [ - { - match: { - type: 'trailer', - }, - }, - ], - }, - }, - sort: [ - { - releaseDate: { - order: 'desc', - }, - }, - ], - }, { - encodeJSON: true, - headers: { - Authorization: `Basic ${authKey}`, - }, - }); - - return scrapeAll(res.body.hits.hits, entity); -} - -async function scrapeProfile(actor, entity, include) { - const profile = {}; - - profile.aliases = actor.aliases; - profile.dateOfBirth = extractDate(actor.birthDate); - profile.gender = ({ F: 'female', M: 'male' })[actor.gender]; - - profile.ethnicity = actor.ethnicity; - profile.nationality = actor.nationality; - profile.birthPlace = `${actor.birthCity}, ${actor.birthCountry || ''}`; - - profile.hair = actor.hairColor; - profile.eyes = actor.eyeColor; - - profile.naturalBoobs = actor.naturalBreasts; - - if (actor.measurements) { - const { cupSize, shoulder, chest, waist, height } = actor.measurements; - - if (height) profile.height = inchesToCm(height); - if (cupSize) profile.cup = cupSize; - - // [SIC] - if (shoulder) profile.bust = shoulder; - if (chest) profile.waist = chest; - if (waist) profile.hip = waist; - } - - if (actor.twitter) profile.social = [`https://www.twitter.com/${actor.twitter}`]; - if (actor.image) profile.avatar = `https://i.bang.com/pornstars/${actor.identifier}.jpg`; - - if (include.releases) { - profile.releases = await fetchActorReleases(actor, entity); - } - - return profile; -} - -async function fetchLatest(site, page = 1) { - const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, { - size: 50, - from: (page - 1) * 50, - query: { - bool: { - must: [ - { - match: { - status: 'ok', - }, - }, - { - range: { - releaseDate: { - lte: 'now', - }, - }, - }, - /* - * global fetch - { - nested: { - path: 'studio', - query: { - bool: { - must: [ - { - match: { - 'studio.name': { - operator: 'AND', - query: 'bang! originals', - }, - }, - }, - ], - }, - }, - }, - }, - */ - { - nested: { - path: 'series', - query: { - bool: { - must: [ - { - match: { - 'series.id': { - operator: 'AND', - query: site.parameters.siteId, - }, - }, - }, - ], - }, - }, - }, - }, - ], - must_not: [ - { - match: { - type: 'trailer', - }, - }, - ], - }, - }, - sort: [ - { - releaseDate: { - order: 'desc', - }, - }, - ], - }, { - encodeJSON: true, - headers: { - Authorization: `Basic ${authKey}`, - }, - }); - - return scrapeAll(res.body.hits.hits, site); -} - -async function fetchUpcoming(site, page = 1) { - const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, { - size: 50, - from: (page - 1) * 50, - query: { - bool: { - must: [ - { - match: { - status: 'ok', - }, - }, - { - range: { - releaseDate: { - lte: 'now+7d', - }, - }, - }, - { - nested: { - path: 'series', - query: { - bool: { - must: [ - { - match: { - 'series.id': { - operator: 'AND', - query: site.parameters.siteId, - }, - }, - }, - ], - }, - }, - }, - }, - ], - must_not: [ - { - match: { - type: 'trailer', - }, - }, - ], - }, - }, - sort: [ - { - releaseDate: { - order: 'desc', - }, - }, - ], - }, { - encodeJSON: true, - headers: { - Authorization: `Basic ${authKey}`, - }, - }); - - return scrapeAll(res.body.hits.hits, site); -} - -async function fetchScene(url, entity, baseRelease, options) { - if (baseRelease?.entryId) { - // overview and deep data is the same, don't hit server unnecessarily - return baseRelease; - } - - const encodedId = new URL(url).pathname.split('/')[2]; - const entryId = decodeId(encodedId); - - const res = await http.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, { - headers: { - Authorization: `Basic ${authKey}`, - }, - }); - - return scrapeScene(res.body._source, entity, options); // eslint-disable-line no-underscore-dangle -} - -async function fetchProfile({ name: actorName }, context, include) { - const res = await http.post(`https://${clusterId}.us-east-1.aws.found.io/actors/actor/_search`, { - size: 5, - sort: [{ - _score: { - order: 'desc', - }, - }], - query: { - bool: { - must: [ - { - match: { - name: { - query: actorName, - operator: 'and', - }, - }, - }, - { - match: { - status: 'ok', - }, - }, - ], - }, - }, - }, { - headers: { - Authorization: `Basic ${authKey}`, - }, - encodeJSON: true, - }); - - if (res.ok) { - const actor = res.body.hits.hits.find((hit) => hit._source.name.toLowerCase() === actorName.toLowerCase()); - - if (actor) { - return scrapeProfile(actor._source, context.entity, include); - } - - return null; - } - - return res.status; -} - -module.exports = { - fetchLatest, - fetchProfile, - fetchScene, - fetchUpcoming, -}; diff --git a/src/scrapers/bangbros.js b/src/scrapers/bangbros.js deleted file mode 100755 index 40973566..00000000 --- a/src/scrapers/bangbros.js +++ /dev/null @@ -1,355 +0,0 @@ -'use strict'; - -/* eslint-disable newline-per-chained-call */ -const cheerio = require('cheerio'); -const moment = require('moment'); - -const logger = require('../logger')(__filename); -const slugify = require('../utils/slugify'); -const http = require('../utils/http'); -const qu = require('../utils/qu'); -const args = require('../argv'); - -function scrape(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const sceneElements = $('.echThumb').toArray(); - - return sceneElements.map((element) => { - const release = {}; - - const sceneLinkElement = $(element).find('.thmb_lnk'); - - release.title = sceneLinkElement.attr('title'); - release.url = site.parameters?.legacy || !site.parent - ? `${site.url}${sceneLinkElement.attr('href')}` - : `${site.parent.url}${sceneLinkElement.attr('href')}`; - - release.shootId = sceneLinkElement.attr('id') && sceneLinkElement.attr('id').split('-')[1]; - release.entryId = new URL(release.url).pathname.match(/video(\d+)/)?.[1]; - - release.date = moment.utc($(element).find('.thmb_mr_2 span.faTxt').text(), 'MMM D, YYYY').toDate(); - release.actors = $(element).find('.cast-wrapper a.cast').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray(); - - const photoElement = $(element).find('.rollover-image'); - const photosUrl = photoElement.attr('data-rollover-url'); - const photosMaxIndex = photoElement.attr('data-rollover-max-index'); - - release.poster = `https:${photoElement.attr('data-original')}`; - release.photos = Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`); - - release.duration = moment.duration(`0:${$(element).find('.thmb_pic b.tTm').text()}`).asSeconds(); - release.channel = $(element).find('a[href*="/websites"]').attr('href').split('/').slice(-1)[0]; - - return release; - }); -} - -function scrapeAllLegacy(scenes, site) { - return scenes.map(({ query }) => { - const release = {}; - - const pathname = query.url('.mainplayer a, .palyer a'); // sic - release.url = `${site.url}${pathname}`; - release.entryId = pathname.match(/video(\d+)/)?.[1]; - - release.title = query.q('h2', true); - release.date = query.date('div:not(.videoDisc)', 'MMM DD, YYYY', /\w+ \d{1,2}, \d{4}/); - release.description = query.q('div + .videoDisc p', true); - release.duration = query.dur('.videoTag .title'); - - release.poster = query.img('.mainplayer img, .palyer img'); // sic - release.photos = query.imgs('article img').concat(qu.imgs('article img', 'data-original')).filter(Boolean); - - return release; - }); -} - -function scrapeAllMembers(scenes, _channel) { - return scenes.map(({ query, el }) => { - const release = {}; - const data = JSON.parse(query.q(el, null, 'data-shoot')); - - release.entryId = data?.id || query.url('a.etLnk')?.match(/\d+$/)?.[0]; - release.shootId = data?.code; - release.url = data.url ? qu.prefixUrl(data.url, 'https://members.bangbros.com') : query.url('a.etLnk'); - - release.title = data?.title || query.cnt('.etl-hdd'); - release.description = data?.description || query.cnt('.etl-desc'); - - release.date = query.date('.etl-dt', 'MMM DD, YYYY', /\w{3} \d{1,2}, \d{4}/); - release.actors = data?.model.map((actor) => ({ - name: actor.name, - url: qu.prefixUrl(actor.url, 'https://members.bangbros.com'), - })); - - const rolloverUrl = query.q('.rollover-image', 'data-rollover-url'); - release.poster = data?.image || query.img('.rollover-image', 'data-initial-image-url'); - - if (rolloverUrl) { - release.photos = Array.from({ length: 15 }, (value, index) => `${rolloverUrl}${index + 1}.jpg`); - } - - release.trailer = data?.trailer; - release.tags = data?.tag.map((tag) => tag.name); - - return release; - }); -} - -/* no dates available, breaks database -function scrapeUpcoming(html, site) { - const { document } = ex(html); - - return ctxa(document, 'a[id*="upcoming-videos"]').map(({ element, q }) => { - const release = {}; - [release.shootId] = element.id.split('-').slice(-1); - const siteCode = release.shootId.match(/[a-z]+/)[0]; - - if (siteCode !== site.parameters.code) { - return null; - } - - const posterEl = q('img'); - - [release.entryId] = element.href.split('/')[1].match(/\d+/); - release.url = `https://bangbros.com${element.href}`; - release.title = posterEl.alt; - release.poster = `https:${posterEl.src}`; - - release.actors = q('.castName', true).split(/ in/g).slice(0, -1).map(actorName => actorName.trim()); - - console.log(release); - - return release; - }).filter(Boolean); -} -*/ - -function scrapeScene(html, url, _site) { - const { query } = qu.ex(html, '.playerSection'); - const release = {}; - - const { pathname } = new URL(url); - - [release.shootId] = query.cnt('.vdoTags + .vdoCast')?.match(/\w+$/) || []; - release.entryId = pathname.match(/video(\d+)/)?.[1]; - - release.title = query.cnt('.ps-vdoHdd h1'); - release.description = query.cnt('.vdoDesc'); - - release.actors = query.all('a[href*="/model"]', true); - release.tags = query.all('.vdoTags a', true); - - release.stars = Number(query.q('div[class*="like"]', true).match(/^\d+/)[0]) / 20; - - const poster = query.img('img#player-overlay-image, img.playerPic'); - - if (poster) { - release.poster = [ - poster.replace('//big_trailer', '/big_trailer'), - poster.replace(/\/?\/big_trailer/, '/members/450x340'), // load error fallback - ]; - } - - release.trailer = [ - query.video('video source[type="video/mp4"]'), - query.video('video source[type="application/x-mpegURL"]'), - ]; - - // all scenes seem to have 12 album photos available, not always included on the page - const firstPhotoUrl = qu.ex(html).query.img('img[data-slider-index="1"]'); - release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`)); - - const [channel] = query.url('a[href*="/websites"]').match(/\w+$/); - - if (channel === 'bangcasting') release.channel = 'bangbroscasting'; - if (channel === 'remaster') release.channel = 'bangbrosremastered'; - else release.channel = channel; - - return release; -} - -function scrapeSceneLegacy({ query }, url) { - const release = {}; - - release.entryId = new URL(url).pathname.match(/video\d+/)?.[0]; - - release.title = query.q('h1', true); - release.description = query.q('.videoDetail', true); - release.duration = query.dur('.tags p span'); - - release.poster = query.img('#video_container + div img, .videoOverlay img'); - - return release; -} - -function scrapeSceneMembers({ query }, url) { - const release = {}; - - release.entryId = new URL(url).pathname.match(/(\d+)\/?$/)[1]; - release.shootId = query.img('.player img')?.match(/\/shoots\/(\w+)\//)?.[1]; - - release.title = query.cnt('.vdo-hdd1'); - release.description = query.cnt('.ndcp'); - - release.actors = query.all('.vdsc a[href*="/model"]').map((actorEl) => ({ - name: query.cnt(actorEl, 'span'), - url: query.url(actorEl, null, 'href', { origin: 'https://members.bangbros.com' }), - avatar: query.img(actorEl, 'img'), - })); - - release.date = query.date('.ran:nth-child(2)', 'MMM DD, YYYY', /\w{3} \d{1,2}, \d{4}/); - release.duration = query.duration('.ran:nth-child(3)'); - - release.tags = query.cnts('.tag a[href*="/tags"]'); - release.channel = slugify(query.cnt('.tag a[href*="/site"]'), ''); - - return release; -} - -function scrapeProfile(html, scope) { - const { query } = qu.ex(html); - const profile = {}; - - const avatar = query.q('.profilePic img', 'src'); - if (avatar) profile.avatar = `https:${avatar}`; - - profile.releases = scrape(html, scope.entity); - - return profile; -} - -function scrapeProfileSearch(html, actorName) { - const { query } = qu.ex(html); - const actorLink = query.url(`a[title="${actorName}" i][href*="model"]`); - - return actorLink ? `https://bangbros.com${actorLink}` : null; -} - -async function fetchLatest(site, page = 1) { - const res = await qu.get(`${site.parameters?.latest || site.url}/${page}`); - - if (res.ok) { - return scrape(res.item.html, site); - } - - return res.status; -} - -async function fetchLatestMembers(channel, page = 1, { parameters }) { - if (!parameters.product) { - throw new Error(`No member area product ID known for '${channel.name}'`); - } - - if (!args.cookie) { - throw new Error(`Please specifiy --cookie "PHPSESSID=xxx" to access the '${channel.name}' members area.`); - } - - const url = `https://members.bangbros.com/product/${parameters.product}/videos/latest/${page}`; - - const res = await qu.getAll(url, '.thumbHolder .echThumb', { - cookie: args.cookie, - }); - - if (res.ok) { - return scrapeAllMembers(res.items, channel); - } - - return res.status; -} - -async function fetchLatestLegacy(site, page = 1) { - const url = `${site.parameters?.latest || site.url}/videos/${page}`; - const res = await qu.getAll(url, '.videoList'); - - if (res.ok) { - return scrapeAllLegacy(res.items, site); - } - - return res.status; -} - -/* -async function fetchUpcoming(site) { - const res = await http.get('https://www.bangbros.com'); - - return scrapeUpcoming(res.body.toString(), site); -} -*/ - -async function fetchScene(url, site, release) { - if (!release?.date) { - logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`); - } - - const { origin } = new URL(url); - const res = await qu.get(url); - - if (!res.ok) { - return res.status; - } - - if (site.parameters?.legacy) { - return scrapeSceneLegacy(res.item, url, site); - } - - if (!/https?:\/\/(www.)?(bangbros|gaywire).com\/?$/.test(origin)) { - throw new Error('Cannot fetch from this URL. Please find the scene on Bang Bros or Gaywire and try again.'); - } - - return scrapeScene(res.item.html, url, site); -} - -async function fetchSceneMembers(url, baseRelease, channel, { parameters }) { - if (!parameters.product) { - throw new Error(`No member area product ID known for '${channel.name}'`); - } - - if (!args.cookie) { - throw new Error(`Please specifiy --cookie "PHPSESSID=xxx" to access the '${channel.name}' members area.`); - } - - const res = await qu.get(url, null, { - cookie: args.cookie, - }); - - if (res.ok) { - return scrapeSceneMembers(res.item, url, channel); - } - - return res.status; -} - -async function fetchProfile({ name: actorName }, scope) { - const actorSlug = slugify(actorName); - const url = `https://bangbros.com/search/${actorSlug}`; - const res = await http.get(url); - - if (res.statusCode === 200) { - const actorUrl = scrapeProfileSearch(res.body.toString(), actorName); - - if (actorUrl) { - const actorRes = await http.get(actorUrl); - - if (actorRes.statusCode === 200) { - return scrapeProfile(actorRes.body.toString(), scope); - } - } - } - - return null; -} - -module.exports = { - fetchLatest, - fetchScene, - fetchProfile, - legacy: { - fetchLatest: fetchLatestLegacy, - }, - members: { - fetchLatest: fetchLatestMembers, - fetchScene: fetchSceneMembers, - }, - // fetchUpcoming, no dates available -}; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index daff2868..df377d53 100755 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -9,7 +9,6 @@ const amnesiac = require('./amnesiac'); const badoink = require('./badoink'); const bamvisions = require('./bamvisions'); const bang = require('./bang'); -const bangbros = require('./bangbros'); const bradmontana = require('./bradmontana'); const cherrypimps = require('./cherrypimps'); const cliffmedia = require('./cliffmedia'); @@ -199,7 +198,7 @@ const scrapers = { badoinkvr: badoink, bamvisions, bang, - bangbros, + bangbros: aylo, bjraw: radical, blacked: vixen, blackedraw: vixen,