From 2198c7ceb09aa425dc7e6cf042d02e08eb93ec4f Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Thu, 31 Oct 2019 01:53:26 +0100 Subject: [PATCH] Added media support to Bang Bros scraper. Added untracked files. --- .nvmrc | 1 + seeds/04_studios.js | 145 +++++++++++++++++++++++++++++++++++++++ src/fetch-releases.js | 2 - src/scrapers/bangbros.js | 25 ++++++- 4 files changed, 169 insertions(+), 4 deletions(-) create mode 100644 .nvmrc create mode 100644 seeds/04_studios.js diff --git a/.nvmrc b/.nvmrc new file mode 100644 index 00000000..47c0a98a --- /dev/null +++ b/.nvmrc @@ -0,0 +1 @@ +12.13.0 diff --git a/seeds/04_studios.js b/seeds/04_studios.js new file mode 100644 index 00000000..8aed1c5d --- /dev/null +++ b/seeds/04_studios.js @@ -0,0 +1,145 @@ +'use strict'; + +/* eslint-disable max-len */ +exports.seed = knex => Promise.resolve() + .then(async () => { + // find network IDs + const networks = await knex('networks').select('*'); + const networksMap = networks.reduce((acc, { id, slug }) => ({ ...acc, [slug]: id }), {}); + + return knex.raw(`${knex('studios').insert([ + // LegalPorno + { + slug: 'gonzocom', + name: 'Gonzo.com', + url: 'https://www.legalporno.com/studios/gonzo_com', + network_id: networksMap['legalporno'], + }, + { + slug: 'giorgiograndi', + name: 'Giorgio Grandi', + url: 'https://www.legalporno.com/studios/giorgio-grandi', + network_id: networksMap['legalporno'], + }, + { + slug: 'hardpornworld', + name: 'Hard Porn World', + url: 'https://www.legalporno.com/studios/hard-porn-world', + network_id: networksMap['legalporno'], + }, + { + slug: 'interracialvision', + name: 'Interracial Vision', + url: 'https://www.legalporno.com/studios/interracial-vision', + network_id: networksMap['legalporno'], + }, + { + slug: 'giorgioslab', + name: 'Giorgio\'s Lab', + url: 'https://www.legalporno.com/studios/giorgio--s-lab', + network_id: networksMap['legalporno'], + }, + { + slug: 'americananal', + name: 'American Anal', + url: 'https://www.legalporno.com/studios/american-anal', + network_id: networksMap['legalporno'], + }, + { + slug: 'assablanca', + name: 'Assablanca', + url: 'https://www.legalporno.com/studios/assablanca', + network_id: networksMap['legalporno'], + }, + { + slug: 'focus', + name: 'Focus', + url: 'https://www.legalporno.com/studios/focus', + network_id: networksMap['legalporno'], + }, + { + slug: 'analforever', + name: 'Anal Forever', + url: 'https://www.legalporno.com/studios/anal-forever', + network_id: networksMap['legalporno'], + }, + { + slug: 'gonzoinbrazil', + name: 'Gonzo in Brazil', + url: 'https://www.legalporno.com/studios/gonzo-in-brazil', + network_id: networksMap['legalporno'], + }, + { + slug: 'mranal', + name: 'Mr Anal', + url: 'https://www.legalporno.com/studios/mr-anal', + network_id: networksMap['legalporno'], + }, + { + slug: 'tarrawhite', + name: 'Tarra White', + url: 'https://www.legalporno.com/studios/tarra-white', + network_id: networksMap['legalporno'], + }, + { + slug: 'sineplexsos', + name: 'Sineplex SOS', + url: 'https://www.legalporno.com/studios/sineplex-sos', + network_id: networksMap['legalporno'], + }, + { + slug: 'fmodels', + name: 'F Models', + url: 'https://www.legalporno.com/studios/f-models', + network_id: networksMap['legalporno'], + }, + { + slug: 'sineplexcz', + name: 'Sineplex CZ', + url: 'https://www.legalporno.com/studios/sineplex-cz', + network_id: networksMap['legalporno'], + }, + { + slug: 'gg', + name: 'GG', + url: 'https://www.legalporno.com/studios/gg', + network_id: networksMap['legalporno'], + }, + { + slug: 'firstgape', + name: 'First Gape', + url: 'https://www.legalporno.com/studios/first-gape', + network_id: networksMap['legalporno'], + }, + { + slug: 'omargalantiproductions', + name: 'Omar Galanti Productions', + url: 'https://www.legalporno.com/studios/omar-galanti-productions', + network_id: networksMap['legalporno'], + }, + { + slug: 'norestfortheass', + name: 'No Rest For The Ass', + url: 'https://www.legalporno.com/studios/no-rest-for-the-ass', + network_id: networksMap['legalporno'], + }, + { + slug: 'hairygonzo', + name: 'Hairy Gonzo', + url: 'https://www.legalporno.com/studios/hairy-gonzo', + network_id: networksMap['legalporno'], + }, + { + slug: 'sineplexclassic', + name: 'Sineplex Classic', + url: 'https://www.legalporno.com/studios/sineplex-classic', + network_id: networksMap['legalporno'], + }, + { + slug: 'sinemale', + name: 'Sinemale', + url: 'https://www.legalporno.com/studios/sinemale', + network_id: networksMap['legalporno'], + }, + ]).toString()} ON CONFLICT DO NOTHING`); + }); diff --git a/src/fetch-releases.js b/src/fetch-releases.js index 01020d5a..3230a987 100644 --- a/src/fetch-releases.js +++ b/src/fetch-releases.js @@ -90,8 +90,6 @@ async function storeActors(release, releaseEntry) { const actors = await knex('actors').whereIn('name', release.actors); const newActors = release.actors.filter(actorName => !actors.some(actor => actor.name === actorName)); - console.log(release.actors, actors, newActors); - const { rows: insertedActors } = newActors.length ? await knex.raw(`${knex('actors').insert(newActors.map(actorName => ({ name: actorName, diff --git a/src/scrapers/bangbros.js b/src/scrapers/bangbros.js index 9ffe5f74..5d3bfe6a 100644 --- a/src/scrapers/bangbros.js +++ b/src/scrapers/bangbros.js @@ -22,6 +22,13 @@ function scrapeLatest(html, site) { const date = moment.utc($(element).find('.thmb_mr_2 span.faTxt').text(), 'MMM D, YYYY').toDate(); const actors = $(element).find('.cast-wrapper a.cast').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray(); + const photoElement = $(element).find('.rollover-image'); + const poster = `https:${photoElement.attr('data-original')}`; + + const photosUrl = photoElement.attr('data-rollover-url'); + const photosMaxIndex = photoElement.attr('data-rollover-max-index'); + const photos = Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`); + const duration = moment.duration(`0:${$(element).find('.thmb_pic b.tTm').text()}`).asSeconds(); return { @@ -32,6 +39,8 @@ function scrapeLatest(html, site) { actors, date, duration, + poster, + photos, rating: null, site, }; @@ -50,11 +59,18 @@ async function scrapeScene(html, url, site) { const [siteName, ...actors] = sceneElement.find('.vdoCast a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); const siteId = siteName.replace(/[\s']+/g, '').toLowerCase(); + const poster = `https:${$('img#player-overlay-image').attr('src')}`; + const trailer = `https:${$('source[type="video/mp4"]').attr('src')}`; + + const firstPhotoUrl = `https:${$('img[data-slider-index="1"]').attr('src')}`; + // all scenes seem to have 12 album photos available, not always included on the page + const photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`)); + const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray(); const [channelSite, tags] = await Promise.all([ knex('sites') - .where({ id: siteId }) + .where({ slug: siteId }) .orWhere({ name: siteName }) .first(), matchTags(rawTags), @@ -70,6 +86,11 @@ async function scrapeScene(html, url, site) { description, actors, tags, + poster, + photos, + trailer: { + src: trailer, + }, rating: { stars, }, @@ -78,7 +99,7 @@ async function scrapeScene(html, url, site) { } async function fetchLatest(site, page = 1) { - const res = await bhttp.get(`https://bangbros.com/websites/${site.id}/${page}`); + const res = await bhttp.get(`https://bangbros.com/websites/${site.slug}/${page}`); return scrapeLatest(res.body.toString(), site); }