Added Bang! deep scrape. Improved network page layout. Added Bang Bros logos.

This commit is contained in:
2020-01-07 04:23:28 +01:00
parent 89064e9e0c
commit 0a19f2e624
71 changed files with 194 additions and 116 deletions

View File

@@ -2,6 +2,11 @@
const bhttp = require('bhttp');
const slugify = require('../utils/slugify');
// Identifier of the found.io cluster; interpolated as the host prefix of the video API URL.
const clusterId = '617fb597b659459bafe6472470d9073a';
// Pre-encoded credentials, sent verbatim after `Basic ` in the Authorization header.
// NOTE(review): this secret is hard-coded in source — consider loading it from configuration.
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
function encodeId(id) {
return Buffer
.from(id, 'hex')
@@ -11,52 +16,62 @@ function encodeId(id) {
.replace(/=/g, ',');
}
/**
 * Turn a URL-safe scene ID back into its hex form.
 *
 * The substitute characters `-`, `_` and `,` are mapped back to the base64
 * characters `+`, `/` and `=`, after which the base64 payload is decoded and
 * its bytes are rendered as a hex string.
 *
 * @param {string} id - URL-safe ID, e.g. the ID segment of a video URL.
 * @returns {string} Hex representation of the decoded bytes.
 */
function decodeId(id) {
  const base64Chars = { '-': '+', _: '/', ',': '=' };
  const base64Id = id.replace(/[-_,]/g, char => base64Chars[char]);

  return Buffer.from(base64Id, 'base64').toString('hex');
}
/**
 * Build a release object from a single scene document returned by the video API.
 *
 * @param {Object} scene - Scene document (the `_source` of an API result).
 * @param {Object} site - Site the release is attributed to; passed through untouched.
 * @returns {Object} Release with site, entryId, title, description, actors, tags,
 *   duration, url, date, photos, trailer and channel, plus poster whenever a
 *   usable screenshot exists.
 */
function scrapeScene(scene, site) {
  const release = {
    site,
    entryId: scene.id,
    title: scene.name,
    description: scene.description,
    actors: scene.actors.map(actor => actor.name),
    tags: scene.genres.concat(scene.actions).map(genre => genre.name),
    duration: scene.duration,
  };

  const slug = slugify(release.title);
  release.url = `https://www.bang.com/video/${encodeId(release.entryId)}/${slug}`;

  // Keep only the calendar day of the release, pinned to midnight UTC.
  const date = new Date(scene.releaseDate);
  release.date = new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate()));

  if (scene.is4k) release.tags.push('4k');
  if (scene.gay) release.tags.push('gay');

  // A scene without screenshots must not abort the whole scrape.
  const screenshots = scene.screenshots || [];
  const toScreenshotUrl = photo => `https://i.bang.com/screenshots/${scene.dvd.id}/movie/${scene.order}/${photo.screenId}.jpg`;

  const defaultPoster = screenshots.find(photo => photo.default === true);
  const photoset = screenshots.filter(photo => photo.default === false);

  // Without a flagged default, the first regular screenshot doubles as the
  // poster and is therefore left out of the photo list.
  const poster = defaultPoster || photoset[0];
  const photos = defaultPoster ? photoset : photoset.slice(1);

  if (poster) release.poster = toScreenshotUrl(poster);
  release.photos = photos.map(photo => toScreenshotUrl(photo));

  release.trailer = {
    src: `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`,
  };

  // Channel identifier: series name with `!`, spaces and dots stripped and the first `&` spelled out.
  release.channel = scene.series.name
    .replace(/[! .]/g, '')
    .replace('&', 'and');

  return release;
}
/**
 * Convert the hits of a search response into releases.
 *
 * Every hit's `_source` is handed to scrapeScene, so releases from the listing
 * and releases from an individually fetched scene are built by the same code
 * and agree on entry ID, URL and field names.
 *
 * @param {Array<Object>} scenes - Search hits, each carrying the scene under `_source`.
 * @param {Object} site - Site the releases are attributed to.
 * @returns {Array<Object>} One release per hit, in the order received.
 */
function scrapeLatest(scenes, site) {
  return scenes.map(({ _source: scene }) => scrapeScene(scene, site));
}
async function fetchLatest(site, page = 1) {
const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
from: (page - 1) * 50,
@@ -75,6 +90,8 @@ async function fetchLatest(site, page = 1) {
},
},
},
/*
* global fetch
{
nested: {
path: 'studio',
@@ -94,6 +111,26 @@ async function fetchLatest(site, page = 1) {
},
},
},
*/
{
nested: {
path: 'series',
query: {
bool: {
must: [
{
match: {
'series.id': {
operator: 'AND',
query: site.parameters.siteId,
},
},
},
],
},
},
},
},
],
must_not: [
{
@@ -121,7 +158,20 @@ async function fetchLatest(site, page = 1) {
return scrapeLatest(res.body.hits.hits, site);
}
/**
 * Fetch a single scene by its public URL and scrape it into a release.
 *
 * The third path segment of the URL is taken as the encoded scene ID, decoded
 * with decodeId, and used to request the scene document from the video API.
 *
 * @param {string} url - Scene URL whose path looks like `/video/<encodedId>/<slug>`.
 * @param {Object} site - Site the release is attributed to.
 * @returns {Promise<Object>} The release produced by scrapeScene.
 */
async function fetchScene(url, site) {
  const { pathname } = new URL(url);
  const [, , encodedId] = pathname.split('/');
  const entryId = decodeId(encodedId);

  const endpoint = `https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`;
  const headers = { Authorization: `Basic ${authKey}` };

  const { body } = await bhttp.get(endpoint, { headers });

  return scrapeScene(body._source, site); // eslint-disable-line no-underscore-dangle
}
// Scraper entry points exposed by this module: the paginated listing fetch
// and the single-scene fetch.
module.exports = {
fetchLatest,
fetchScene,
};

View File

@@ -5,9 +5,6 @@ const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
function scrapeLatest(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const sceneElements = $('.echThumb').toArray();
@@ -57,7 +54,7 @@ async function scrapeScene(html, url, site) {
const description = sceneElement.find('.vdoDesc').text().trim();
const [siteName, ...actors] = sceneElement.find('.vdoCast a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const siteId = siteName.replace(/[\s']+/g, '').toLowerCase();
const siteSlug = siteName.replace(/[\s']+/g, '').toLowerCase();
const poster = `https:${$('img#player-overlay-image').attr('src')}`;
const trailer = `https:${$('source[type="video/mp4"]').attr('src')}`;
@@ -66,17 +63,7 @@ async function scrapeScene(html, url, site) {
// all scenes seem to have 12 album photos available, not always included on the page
const photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));
const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const [channelSite, tags] = await Promise.all([
site.isFallback
? knex('sites')
.where({ slug: siteId })
.orWhere({ name: siteName })
.first()
: site,
matchTags(rawTags),
]);
const tags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const stars = Number(sceneElement.find('.bVdPl_it_like .bVdPl_txt').text().replace('% like', '')) / 20;
@@ -96,12 +83,13 @@ async function scrapeScene(html, url, site) {
rating: {
stars,
},
site: channelSite || site,
site,
channel: siteSlug === 'bangcasting' ? 'bangbroscasting' : siteSlug,
};
}
/**
 * Fetch one page of a site's listing and scrape the releases on it.
 *
 * @param {Object} site - Site record; its `url` is the base of the paginated listing.
 * @param {number} [page=1] - Page number appended to the site URL.
 * @returns {Promise<*>} Whatever scrapeLatest produces for the page's HTML.
 */
async function fetchLatest(site, page = 1) {
  // The listing URL comes from the site record instead of a hard-coded bangbros.com path.
  const res = await bhttp.get(`${site.url}/${page}`);

  return scrapeLatest(res.body.toString(), site);
}