From b7f51a8deb6de0511123d26ae30eb6d51a74986a Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Fri, 31 Jan 2020 21:43:16 +0100 Subject: [PATCH] Added avatar and actor releases to Bang Bros scraper. --- seeds/01_sites.js | 338 ++++++++++++++++++------------------- src/scrape-releases.js | 12 +- src/scrapers/bangbros.js | 97 ++++++++++- src/scrapers/scrapers.js | 3 +- src/web/plugins/plugins.js | 2 +- 5 files changed, 273 insertions(+), 179 deletions(-) diff --git a/seeds/01_sites.js b/seeds/01_sites.js index 1e528a5b..c3dc1be5 100644 --- a/seeds/01_sites.js +++ b/seeds/01_sites.js @@ -261,420 +261,420 @@ function getSites(networksMap) { }, // BANGBROS { - slug: 'assparade', - network_id: networksMap.bangbros, name: 'Ass Parade', url: 'https://bangbros.com/websites/assparade', + slug: 'assparade', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ap' }), }, { - slug: 'avaspice', - network_id: networksMap.bangbros, name: 'AvaSpice', url: 'https://bangbros.com/websites/avaspice', + slug: 'avaspice', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'av' }), }, { - slug: 'backroomfacials', - network_id: networksMap.bangbros, name: 'Back Room Facials', url: 'https://bangbros.com/websites/backroomfacials', + slug: 'backroomfacials', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'brf' }), }, { - slug: 'backroommilf', - network_id: networksMap.bangbros, name: 'Backroom MILF', url: 'https://bangbros.com/websites/backroommilf', + slug: 'backroommilf', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'mf' }), }, { - slug: 'ballhoneys', - network_id: networksMap.bangbros, name: 'Ball Honeys', url: 'https://bangbros.com/websites/ballhoneys', + slug: 'ballhoneys', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'es' }), }, { - slug: 'bangbros18', - network_id: networksMap.bangbros, name: 'BangBros 18', url: 'https://bangbros.com/websites/bangbros18', + slug: 'bangbros18', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bbe' }), }, { - slug: 'bangbrosangels', - network_id: networksMap.bangbros, name: 'BangBros Angels', url: 'https://bangbros.com/websites/bangbrosangels', + slug: 'bangbrosangels', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bng' }), }, { - slug: 'bangbrosclips', - network_id: networksMap.bangbros, name: 'Bangbros Clips', url: 'https://bangbros.com/websites/bangbrosclips', + slug: 'bangbrosclips', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bbc' }), }, { - slug: 'bangbrosremastered', - network_id: networksMap.bangbros, name: 'BangBros Remastered', - url: 'https://bangbros.com/websites/bangbrosremastered', + url: 'https://bangbros.com/websites/remaster', + slug: 'bangbrosremastered', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'rm' }), }, { - slug: 'bangbus', - network_id: networksMap.bangbros, name: 'Bang Bus', url: 'https://bangbros.com/websites/bangbus', + slug: 'bangbus', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bb' }), }, { - slug: 'bangbroscasting', - network_id: networksMap.bangbros, name: 'Bang Casting', url: 'https://bangbros.com/websites/bangcasting', + slug: 'bangbroscasting', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'hih' }), }, { - slug: 'bangpov', - network_id: networksMap.bangbros, name: 'Bang POV', url: 'https://bangbros.com/websites/bangpov', + slug: 'bangpov', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bpov' }), }, { - slug: 'bangtryouts', - network_id: networksMap.bangbros, name: 'Bang Tryouts', url: 'https://bangbros.com/websites/bangtryouts', + slug: 'bangtryouts', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bto' }), }, { - slug: 'bigmouthfuls', - network_id: networksMap.bangbros, name: 'Big Mouthfuls', url: 'https://bangbros.com/websites/bigmouthfuls', + slug: 'bigmouthfuls', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bmf' }), }, { - slug: 'bigtitcreampie', - network_id: networksMap.bangbros, name: 'Big Tit Cream Pie', url: 'https://bangbros.com/websites/bigtitcreampie', + slug: 'bigtitcreampie', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'btcp' }), }, { - slug: 'bigtitsroundasses', - network_id: networksMap.bangbros, name: 'Big Tits, Round Asses', url: 'https://bangbros.com/websites/bigtitsroundasses', + slug: 'bigtitsroundasses', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'btra' }), }, { - slug: 'blowjobfridays', - network_id: networksMap.bangbros, name: 'BlowJob Fridays', url: 'https://bangbros.com/websites/blowjobfridays', + slug: 'blowjobfridays', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bj' }), }, { - slug: 'blowjobninjas', - network_id: networksMap.bangbros, name: 'Blowjob Ninjas', url: 'https://bangbros.com/websites/blowjobninjas', + slug: 'blowjobninjas', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'aa' }), }, { - slug: 'boobsquad', - network_id: networksMap.bangbros, name: 'Boob Squad', url: 'https://bangbros.com/websites/boobsquad', + slug: 'boobsquad', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bs' }), }, { - slug: 'brownbunnies', - network_id: networksMap.bangbros, name: 'Brown Bunnies', url: 'https://bangbros.com/websites/brownbunnies', + slug: 'brownbunnies', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bkb' }), }, { - slug: 'canhescore', - network_id: networksMap.bangbros, name: 'Can He Score?', url: 'https://bangbros.com/websites/canhescore', + slug: 'canhescore', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bd' }), }, { - slug: 'casting', - network_id: networksMap.bangbros, name: 'Casting', url: 'https://bangbros.com/websites/casting', + slug: 'casting', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ca' }), }, { - slug: 'chongas', - network_id: networksMap.bangbros, name: 'Chongas', url: 'https://bangbros.com/websites/chongas', + slug: 'chongas', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ch' }), }, { - slug: 'colombiafuckfest', - network_id: networksMap.bangbros, name: 'Colombia Fuck Fest', url: 'https://bangbros.com/websites/colombiafuckfest', + slug: 'colombiafuckfest', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'cff' }), }, { - slug: 'dirtyworldtour', - network_id: networksMap.bangbros, name: 'Dirty World Tour', url: 'https://bangbros.com/websites/dirtyworldtour', + slug: 'dirtyworldtour', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bf' }), }, { - slug: 'dorminvasion', - network_id: networksMap.bangbros, name: 'Dorm Invasion', url: 'https://bangbros.com/websites/dorminvasion', + slug: 'dorminvasion', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'di' }), }, { - slug: 'facialfest', - network_id: networksMap.bangbros, name: 'Facial Fest', url: 'https://bangbros.com/websites/facialfest', + slug: 'facialfest', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ff' }), }, { - slug: 'fuckteamfive', - network_id: networksMap.bangbros, name: 'Fuck Team Five', url: 'https://bangbros.com/websites/fuckteamfive', + slug: 'fuckteamfive', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'bbw' }), }, { - slug: 'gloryholeloads', - network_id: networksMap.bangbros, name: 'Glory Hole Loads', url: 'https://bangbros.com/websites/gloryholeloads', + slug: 'gloryholeloads', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ghl' }), }, { - slug: 'latinarampage', - network_id: networksMap.bangbros, name: 'Latina Rampage', url: 'https://bangbros.com/websites/latinarampage', + slug: 'latinarampage', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'lrp' }), }, { - slug: 'livingwithanna', - network_id: networksMap.bangbros, name: 'Living With Anna', url: 'https://bangbros.com/websites/livingwithanna', + slug: 'livingwithanna', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'lr' }), }, { - slug: 'magicalfeet', - network_id: networksMap.bangbros, name: 'Magical Feet', url: 'https://bangbros.com/websites/magicalfeet', + slug: 'magicalfeet', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'fj' }), }, { - slug: 'milflessons', - network_id: networksMap.bangbros, - name: 'MILF Lessons', - url: 'https://bangbros.com/websites/milflessons', - description: null, - parameters: null, - }, - { - slug: 'milfsoup', - network_id: networksMap.bangbros, name: 'Milf Soup', url: 'https://bangbros.com/websites/milfsoup', + slug: 'milfsoup', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ms' }), }, { - slug: 'momishorny', - network_id: networksMap.bangbros, name: 'MomIsHorny', url: 'https://bangbros.com/websites/momishorny', + slug: 'momishorny', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'mih' }), }, { - slug: 'monstersofcock', - network_id: networksMap.bangbros, name: 'Monsters of Cock', url: 'https://bangbros.com/websites/monstersofcock', + slug: 'monstersofcock', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'mc' }), }, { - slug: 'mranal', - network_id: networksMap.bangbros, - name: 'Mr. Anal', - url: 'https://bangbros.com/websites/mranal', - description: null, - parameters: null, - }, - { - slug: 'mrcameltoe', - network_id: networksMap.bangbros, name: 'Mr CamelToe', url: 'https://bangbros.com/websites/mrcameltoe', + slug: 'mrcameltoe', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ct' }), }, { - slug: 'mydirtymaid', - network_id: networksMap.bangbros, name: 'My Dirty Maid', url: 'https://bangbros.com/websites/mydirtymaid', + slug: 'mydirtymaid', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'mda' }), }, { - slug: 'mylifeinbrazil', - network_id: networksMap.bangbros, name: 'My Life In Brazil', url: 'https://bangbros.com/websites/mylifeinbrazil', + slug: 'mylifeinbrazil', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'mb' }), }, { - slug: 'newbieblack', - network_id: networksMap.bangbros, name: 'Newbie Black', url: 'https://bangbros.com/websites/newbieblack', + slug: 'newbieblack', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'blkg' }), }, { - slug: 'partyofthree', - network_id: networksMap.bangbros, name: 'Party of Three', - url: 'https://bangbros.com/websites/partyof3', + url: 'https://bangbros.com/websites/partyofthree', + slug: 'partyofthree', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ls' }), }, { - slug: 'pawg', - network_id: networksMap.bangbros, name: 'Pawg', url: 'https://bangbros.com/websites/pawg', + slug: 'pawg', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'pwg' }), }, { - slug: 'pennyshow', - network_id: networksMap.bangbros, name: 'Penny Show', url: 'https://bangbros.com/websites/pennyshow', + slug: 'pennyshow', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ps' }), }, { - slug: 'pornstarspa', - network_id: networksMap.bangbros, name: 'Porn Star Spa', url: 'https://bangbros.com/websites/pornstarspa', + slug: 'pornstarspa', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'pos' }), }, { - slug: 'powermunch', - network_id: networksMap.bangbros, name: 'Power Munch', url: 'https://bangbros.com/websites/powermunch', + slug: 'powermunch', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'pm' }), }, { - slug: 'publicbang', - network_id: networksMap.bangbros, name: 'Public Bang', url: 'https://bangbros.com/websites/publicbang', + slug: 'publicbang', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'pb' }), }, { - slug: 'sluttywhitegirls', - network_id: networksMap.bangbros, name: 'Slutty White Girls', url: 'https://bangbros.com/websites/sluttywhitegirls', + slug: 'sluttywhitegirls', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'swg' }), }, { - slug: 'stepmomvideos', - network_id: networksMap.bangbros, name: 'Stepmom Videos', url: 'https://bangbros.com/websites/stepmomvideos', + slug: 'stepmomvideos', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'smv' }), }, { - slug: 'streetranger', - network_id: networksMap.bangbros, name: 'Street Ranger', - url: 'https://bangbros.com/websites/streetranger', + url: 'https://bangbros.com/websites/thewheeler', + slug: 'streetranger', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'sg' }), }, { - slug: 'tugjobs', - network_id: networksMap.bangbros, name: 'Tugjobs', url: 'https://bangbros.com/websites/tugjobs', + slug: 'tugjobs', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'hj' }), }, { - slug: 'workinglatinas', - network_id: networksMap.bangbros, name: 'Working Latinas', url: 'https://bangbros.com/websites/workinglatinas', + slug: 'workinglatinas', description: null, - parameters: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'lw' }), + }, + { + name: 'MILF Lessons', + url: 'https://bangbros.com/websites/milflessons', + slug: 'milflessons', + description: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ml' }), + }, + { + name: 'Mr. Anal', + url: 'https://bangbros.com/websites/mranal', + slug: 'mranal', + description: null, + network_id: networksMap.bangbros, + parameters: JSON.stringify({ code: 'ma' }), }, // BLOWPASS { diff --git a/src/scrape-releases.js b/src/scrape-releases.js index fc3e06b4..f8954d76 100644 --- a/src/scrape-releases.js +++ b/src/scrape-releases.js @@ -30,7 +30,12 @@ async function findSite(url, release) { return null; } -async function scrapeRelease(url, release, type = 'scene') { +async function scrapeRelease(source, basicRelease = null, type = 'scene') { + // profile scraper may return either URLs or pre-scraped scenes + const sourceIsUrl = typeof source === 'string'; + const url = sourceIsUrl ? source : source.url; + const release = sourceIsUrl ? basicRelease : source; + const site = await findSite(url, release); if (!site) { @@ -61,12 +66,13 @@ async function scrapeRelease(url, release, type = 'scene') { return { ...scrapedRelease, + ...release, site, }; } -async function scrapeReleases(urls, release, type = 'scene') { - const scrapedReleases = await Promise.map(urls, async url => scrapeRelease(url, release, type), { +async function scrapeReleases(sources, release = null, type = 'scene') { + const scrapedReleases = await Promise.map(sources, async source => scrapeRelease(source, release, type), { concurrency: 5, }); diff --git a/src/scrapers/bangbros.js b/src/scrapers/bangbros.js index b00fce00..8e63b512 100644 --- a/src/scrapers/bangbros.js +++ b/src/scrapers/bangbros.js @@ -5,7 +5,10 @@ const bhttp = require('bhttp'); const cheerio = require('cheerio'); const moment = require('moment'); -function scrapeLatest(html, site) { +const slugify = require('../utils/slugify'); +const { ex } = require('../utils/q'); + +function scrape(html, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const sceneElements = $('.echThumb').toArray(); @@ -27,6 +30,7 @@ function scrapeLatest(html, site) { const photos = Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`); const duration = moment.duration(`0:${$(element).find('.thmb_pic b.tTm').text()}`).asSeconds(); + const channel = $(element).find('a[href*="/websites"]').attr('href').split('/').slice(-1)[0]; return { url, @@ -40,11 +44,41 @@ function scrapeLatest(html, site) { photos, rating: null, site, + channel, }; }); } -async function scrapeScene(html, url, site) { +/* no dates available, breaks database +function scrapeUpcoming(html, site) { + const { document } = ex(html); + + return ctxa(document, 'a[id*="upcoming-videos"]').map(({ element, q }) => { + const release = {}; + [release.shootId] = element.id.split('-').slice(-1); + const siteCode = release.shootId.match(/[a-z]+/)[0]; + + if (siteCode !== site.parameters.code) { + return null; + } + + const posterEl = q('img'); + + [release.entryId] = element.href.split('/')[1].match(/\d+/); + release.url = `https://bangbros.com${element.href}`; + release.title = posterEl.alt; + release.poster = `https:${posterEl.src}`; + + release.actors = q('.castName', true).split(/ in/g).slice(0, -1).map(actorName => actorName.trim()); + + console.log(release); + + return release; + }).filter(Boolean); +} +*/ + +function scrapeScene(html, url, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const sceneElement = $('.playerSection'); @@ -88,24 +122,77 @@ async function scrapeScene(html, url, site) { }; } +function scrapeProfile(html) { + const { q } = ex(html); + const profile = {}; + + const avatar = q('.profilePic img', 'src'); + if (avatar) profile.avatar = `https:${avatar}`; + + profile.releases = scrape(html); + + return profile; +} + +function scrapeProfileSearch(html, actorName) { + const { q } = ex(html); + const actorLink = q(`a[title="${actorName}"]`, 'href'); + + return actorLink ? `https://bangbros.com${actorLink}` : null; +} + async function fetchLatest(site, page = 1) { const res = await bhttp.get(`${site.url}/${page}`); - return scrapeLatest(res.body.toString(), site); + return scrape(res.body.toString(), site); } -async function fetchScene(url, site) { +/* +async function fetchUpcoming(site) { + const res = await bhttp.get('https://www.bangbros.com'); + + return scrapeUpcoming(res.body.toString(), site); +} +*/ + +async function fetchScene(url, site, release) { + if (!release?.date) { + throw new Error(`Cannot fetch Bang Bros scenes from argument URL, as scene pages do not have release dates: ${url}`); + } + const { origin } = new URL(url); const res = await bhttp.get(url); - if (origin !== 'https://bangbros.com') { + if (!/https?:\/\/(www.)?bangbros.com\/?$/.test(origin)) { throw new Error('Cannot fetch from this URL. Please find the scene on https://bangbros.com and try again.'); } return scrapeScene(res.body.toString(), url, site); } +async function fetchProfile(actorName) { + const actorSlug = slugify(actorName); + const url = `https://bangbros.com/search/${actorSlug}`; + const res = await bhttp.get(url); + + if (res.statusCode === 200) { + const actorUrl = scrapeProfileSearch(res.body.toString(), actorName); + + if (actorUrl) { + const actorRes = await bhttp.get(actorUrl); + + if (actorRes.statusCode === 200) { + return scrapeProfile(actorRes.body.toString()); + } + } + } + + return null; +} + module.exports = { fetchLatest, fetchScene, + fetchProfile, + // fetchUpcoming, no dates available }; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 055ba436..024ccccb 100644 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -3,7 +3,6 @@ // releases const babes = require('./babes'); const bang = require('./bang'); -const bangbros = require('./bangbros'); const dogfart = require('./dogfart'); const digitalplayground = require('./digitalplayground'); const fakehub = require('./fakehub'); @@ -23,6 +22,7 @@ const vixen = require('./vixen'); const vogov = require('./vogov'); // releases and profiles +const bangbros = require('./bangbros'); const blowpass = require('./blowpass'); const brazzers = require('./brazzers'); const ddfnetwork = require('./ddfnetwork'); @@ -104,6 +104,7 @@ module.exports = { boobpedia, legalporno, kellymadison, + bangbros, pornhub, freeones, freeonesLegacy, diff --git a/src/web/plugins/plugins.js b/src/web/plugins/plugins.js index 5f320ca7..6eb88628 100644 --- a/src/web/plugins/plugins.js +++ b/src/web/plugins/plugins.js @@ -7,5 +7,5 @@ const SitePlugins = require('./sites'); module.exports = { ActorPlugins, SitePlugins, - ReleasePlugins: [], + // ReleasePlugins, };