Added Bang! deep scrape. Improved network page layout. Added Bang Bros logos.

This commit is contained in:
ThePendulum 2020-01-07 04:23:28 +01:00
parent 89064e9e0c
commit 0a19f2e624
71 changed files with 194 additions and 116 deletions

View File

@ -120,16 +120,6 @@ export default {
};
</script>
<style lang="scss">
@import 'theme';
@media(max-width: $breakpoint3) {
.releases .tiles {
grid-template-columns: repeat(auto-fit, minmax(20rem, 1fr));
}
}
</style>
<style lang="scss" scoped>
@import 'theme';
@ -216,7 +206,7 @@ export default {
font-weight: bold;
}
@media(max-width: $breakpoint) {
@media(max-width: $breakpoint3) {
.header,
.header.hideable {
display: flex;

View File

@ -103,6 +103,7 @@ export default {
.trailer-video {
max-width: 100%;
object-fit: cover;
}
.item {

View File

@ -180,7 +180,6 @@ export default {
.details {
width: 100%;
display: flex;
align-items: center;
justify-content: space-between;
position: absolute;
font-size: 0;

View File

@ -163,7 +163,7 @@ function initActorActions(store, _router) {
const { actors } = await graphql(`
query Actors($limit:Int) {
actors(first:$limit) {
actors(first:$limit, orderBy: NAME_ASC) {
id
name
slug

View File

@ -1,4 +1,6 @@
function curateActor(actor) {
import dayjs from 'dayjs';
function curateActor(actor, release) {
const curatedActor = {
...actor,
origin: actor.originCountry && {
@ -8,13 +10,17 @@ function curateActor(actor) {
if (actor.avatar) curatedActor.avatar = actor.avatar.media;
if (release && release.date && curatedActor.birthdate) {
curatedActor.ageThen = dayjs(release.date).diff(actor.birthdate, 'year');
}
return curatedActor;
}
function curateRelease(release) {
const curatedRelease = {
...release,
actors: release.actors ? release.actors.map(({ actor }) => curateActor(actor)) : [],
actors: [],
poster: release.poster && release.poster.media,
tags: release.tags ? release.tags.map(({ tag }) => tag) : [],
network: release.site.network,
@ -22,6 +28,7 @@ function curateRelease(release) {
if (release.photos) curatedRelease.photos = release.photos.map(({ media }) => media);
if (release.trailer) curatedRelease.trailer = release.trailer.media;
if (release.actors) curatedRelease.actors = release.actors.map(({ actor }) => curateActor(actor, curatedRelease));
return curatedRelease;
}

View File

@ -145,8 +145,6 @@
width: 100%;
display: -webkit-box;
display: flex;
-webkit-box-align: center;
align-items: center;
-webkit-box-pack: justify;
justify-content: space-between;
position: absolute;
@ -309,6 +307,8 @@
}
.trailer-video[data-v-42bb19c4] {
max-width: 100%;
-o-object-fit: cover;
object-fit: cover;
}
.item[data-v-42bb19c4] {
height: 18rem;
@ -662,14 +662,6 @@
width: 15rem;
}
/* $primary: #ff886c; */
/* $logo-highlight: drop-shadow(1px 0 0 $highlight-weak) drop-shadow(-1px 0 0 $highlight-weak) drop-shadow(0 1px 0 $highlight-weak) drop-shadow(0 -1px 0 $highlight-weak); */
@media (max-width: 1200px) {
.releases .tiles {
grid-template-columns: repeat(auto-fit, minmax(20rem, 1fr));
}
}
/* $primary: #ff886c; */
/* $logo-highlight: drop-shadow(1px 0 0 $highlight-weak) drop-shadow(-1px 0 0 $highlight-weak) drop-shadow(0 1px 0 $highlight-weak) drop-shadow(0 -1px 0 $highlight-weak); */
.network[data-v-e2e12602] {
@ -759,7 +751,7 @@
font-size: .9rem;
font-weight: bold;
}
@media (max-width: 720px) {
@media (max-width: 1200px) {
.header[data-v-e2e12602],
.header.hideable[data-v-e2e12602] {
display: -webkit-box;

View File

Before

Width:  |  Height:  |  Size: 54 KiB

After

Width:  |  Height:  |  Size: 54 KiB

View File

Before

Width:  |  Height:  |  Size: 19 KiB

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 10 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 55 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 28 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 16 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 34 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.0 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 61 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 72 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 13 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 5.3 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.6 KiB

After

Width:  |  Height:  |  Size: 1018 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 8.9 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 48 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 41 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 65 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 42 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 33 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 40 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 20 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 18 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 50 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.8 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.4 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 19 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 45 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 12 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 26 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 37 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.1 KiB

View File

@ -124,6 +124,7 @@ function getSites(networksMap) {
name: 'Trickery',
slug: 'bangtrickery',
url: 'https://www.bang.com/original/4800/bang-trickery',
parameters: JSON.stringify({ siteId: 4800 }),
network_id: networksMap.bang,
},
{
@ -131,8 +132,8 @@ function getSites(networksMap) {
slug: 'yngrcom',
// url: 'https://www.bang.com/original/5010/bang-yngr',
url: 'https://yngr.com',
network_id: networksMap.bang,
parameters: JSON.stringify({ siteId: 5010 }),
network_id: networksMap.bang,
},
{
name: 'Roadside XXX',
@ -146,17 +147,19 @@ function getSites(networksMap) {
name: 'Surprise',
slug: 'bangsurprise',
url: 'https://www.bang.com/original/5000/bang-surprise',
parameters: JSON.stringify({ siteId: 5000 }),
network_id: networksMap.bang,
},
{
name: 'Real Teens',
slug: 'bangrealteens',
url: 'https://www.bang.com/original/3366/bang-real-teens',
parameters: JSON.stringify({ siteId: 3366 }),
network_id: networksMap.bang,
},
{
name: 'FCK.news',
slug: 'bangfcknews',
slug: 'bangfakenews',
// url: 'https://www.bang.com/original/4998/bang-fckNews',
url: 'https://fck.news',
parameters: JSON.stringify({ siteId: 4998 }),
@ -167,49 +170,56 @@ function getSites(networksMap) {
slug: 'prettyandraw',
// url: 'https://www.bang.com/original/4792/bang-pretty-and-raw',
url: 'https://prettyandraw.com',
parameters: JSON.stringify({ siteId: 4782 }),
parameters: JSON.stringify({ siteId: 4792 }),
network_id: networksMap.bang,
},
{
name: 'Japan',
slug: 'bangjapan',
url: 'https://www.bang.com/original/3079/bang-japan',
parameters: JSON.stringify({ siteId: 3079 }),
network_id: networksMap.bang,
},
{
name: 'Rammed',
slug: 'bangrammed',
url: 'https://www.bang.com/original/4836/bang-rammed',
parameters: JSON.stringify({ siteId: 4836 }),
network_id: networksMap.bang,
},
{
name: 'Glamkore',
slug: 'bangglamkore',
url: 'https://www.bang.com/original/4586/bang-glamkore',
parameters: JSON.stringify({ siteId: 4586 }),
network_id: networksMap.bang,
},
{
name: 'Screw The Cops',
slug: 'bangscrewthecops',
slug: 'screwthecops',
url: 'https://www.bang.com/original/4710/bang-screw-cops',
parameters: JSON.stringify({ siteId: 4710 }),
network_id: networksMap.bang,
},
{
name: 'Real MILFs',
slug: 'bangrealmilfs',
url: 'https://www.bang.com/original/4448/bang-real-milfs',
parameters: JSON.stringify({ siteId: 4448 }),
network_id: networksMap.bang,
},
{
name: 'Confessions',
slug: 'bangconfessions',
url: 'https://www.bang.com/original/4308/bang-confessions',
parameters: JSON.stringify({ siteId: 4308 }),
network_id: networksMap.bang,
},
{
name: 'Casting',
slug: 'bangcasting',
url: 'https://www.bang.com/original/3261/bang-casting',
parameters: JSON.stringify({ siteId: 3261 }),
network_id: networksMap.bang,
},
// BANGBROS
@ -294,7 +304,7 @@ function getSites(networksMap) {
parameters: null,
},
{
slug: 'bangcasting',
slug: 'bangbroscasting',
network_id: networksMap.bangbros,
name: 'Bang Casting',
url: 'https://bangbros.com/websites/bangcasting',
@ -542,9 +552,9 @@ function getSites(networksMap) {
parameters: null,
},
{
slug: 'partyof3',
slug: 'partyofthree',
network_id: networksMap.bangbros,
name: 'Party of 3',
name: 'Party of Three',
url: 'https://bangbros.com/websites/partyof3',
description: null,
parameters: null,

View File

@ -9,6 +9,7 @@ const argv = require('./argv');
const scrapers = require('./scrapers/scrapers');
const whereOr = require('./utils/where-or');
const resolvePlace = require('./utils/resolve-place');
const slugify = require('./utils/slugify');
const { createMediaDirectory, storePhotos } = require('./media');
async function curateActor(actor) {
@ -89,7 +90,7 @@ function curateActorEntry(actor, scraped, scrapeSuccess) {
.split(' ')
.map(segment => `${segment.charAt(0).toUpperCase()}${segment.slice(1)}`)
.join(' '),
slug: actor.name.toLowerCase().replace(/\s+/g, '-'),
slug: slugify(actor.name),
birthdate: actor.birthdate,
description: actor.description,
gender: actor.gender,
@ -320,7 +321,7 @@ async function mergeProfiles(profiles, actor) {
async function scrapeActors(actorNames) {
await Promise.map(actorNames || argv.actors, async (actorName) => {
try {
const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-');
const actorSlug = slugify(actorName);
const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
const sources = argv.sources ? argv.sources.map(source => [source, scrapers.actors[source]]) : Object.entries(scrapers.actors);
@ -393,12 +394,20 @@ async function scrapeBasicActors() {
}
async function associateActors(mappedActors, releases) {
const actorNames = Object.keys(mappedActors);
const actorSlugs = actorNames.map(name => slugify(name));
const [existingActorEntries, existingAssociationEntries] = await Promise.all([
knex('actors').whereIn('name', Object.keys(mappedActors)),
knex('actors')
.whereIn('name', actorNames)
.orWhereIn('slug', actorSlugs),
knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
]);
console.log(actorNames, actorSlugs, existingActorEntries.map(actor => actor.name));
const associations = await Promise.map(Object.entries(mappedActors), async ([actorName, releaseIds]) => {
try {
const actorEntry = existingActorEntries.find(actor => actor.name === actorName)
|| await storeActor({ name: actorName });
@ -411,10 +420,14 @@ async function associateActors(mappedActors, releases) {
// remove associations already in database
.some(associationEntry => associationEntry.actor_id === association.actor_id
&& associationEntry.release_id === association.release_id));
} catch (error) {
console.error(actorName, error);
return null;
}
});
await Promise.all([
knex('releases_actors').insert(associations.flat()),
knex('releases_actors').insert(associations.filter(association => association).flat()),
scrapeBasicActors(),
]);
}

View File

@ -172,12 +172,16 @@ async function attachChannelSite(release) {
};
}
try {
const urlSite = await findSiteByUrl(release.channel);
return {
...release,
site: urlSite,
};
} catch (error) {
throw new Error(`Unable to derive channel site from generic URL: ${release.url}.`);
}
}
async function attachStudio(release) {
@ -384,7 +388,7 @@ async function storeReleases(releases) {
const storedReleases = await Promise.map(releases, async (release) => {
try {
const releaseWithChannelSite = await attachChannelSite(release);
const releaseWithStudio = await attachStudio(release);
const releaseWithStudio = await attachStudio(releaseWithChannelSite);
const releaseId = await storeRelease(releaseWithStudio);
return {
@ -403,6 +407,8 @@ async function storeReleases(releases) {
const actors = accumulateActors(storedReleases);
const movies = accumulateMovies(storedReleases);
console.log(actors);
await Promise.all([
associateActors(actors, storedReleases),
Promise.map(storedReleases, async release => storeReleaseAssets(release, release.id), {

View File

@ -2,6 +2,11 @@
const bhttp = require('bhttp');
const slugify = require('../utils/slugify');
const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
function encodeId(id) {
return Buffer
.from(id, 'hex')
@ -11,11 +16,21 @@ function encodeId(id) {
.replace(/=/g, ',');
}
function scrapeLatest(scenes, site) {
return scenes.map(({ _source: scene }) => {
/**
 * Reverse the URL-safe ID encoding: map the substituted characters back to
 * standard base64 ('-' to '+', '_' to '/', ',' to '=') and return the decoded
 * bytes as a hex string.
 *
 * @param {string} id - URL-safe encoded ID.
 * @returns {string} Hex representation of the decoded bytes.
 */
function decodeId(id) {
    // The substitutes and their base64 counterparts never overlap, so a
    // single pass yields the same result as replacing them one after another.
    const base64Characters = { '-': '+', _: '/', ',': '=' };
    const base64Id = id.replace(/[-_,]/g, character => base64Characters[character]);

    return Buffer.from(base64Id, 'base64').toString('hex');
}
function scrapeScene(scene, site) {
const release = {
site,
entryId: encodeId(scene.id),
entryId: scene.id,
title: scene.name,
description: scene.description,
actors: scene.actors.map(actor => actor.name),
@ -23,8 +38,8 @@ function scrapeLatest(scenes, site) {
duration: scene.duration,
};
const slug = release.title.toLowerCase().trim().replace(/\s+/g, '-');
release.url = `https://www.bang.com/video/${release.entryId}/${slug}`;
const slug = slugify(release.title);
release.url = `https://www.bang.com/video/${encodeId(release.entryId)}/${slug}`;
const date = new Date(scene.releaseDate);
release.date = new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate()));
@ -38,25 +53,25 @@ function scrapeLatest(scenes, site) {
const photos = defaultPoster ? photoset : photoset.slice(1);
const poster = defaultPoster || photoset[0];
release.poster = `https://i.bang.com/screenshots/${scene.dvd.id}/movie/1/${poster.screenId}.jpg`;
release.photos = photos.map(photo => `https://i.bang.com/screenshots/${scene.dvd.id}/movie/1/${photo.screenId}.jpg`);
release.poster = `https://i.bang.com/screenshots/${scene.dvd.id}/movie/${scene.order}/${poster.screenId}.jpg`;
release.photos = photos.map(photo => `https://i.bang.com/screenshots/${scene.dvd.id}/movie/${scene.order}/${photo.screenId}.jpg`);
release.trailer = {
src: `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`,
};
release.studio = scene.series.name
release.channel = scene.series.name
.replace(/[! .]/g, '')
.replace('&', 'and');
return release;
});
}
/**
 * Turn a list of search hits into releases by passing the `_source` document
 * of every hit to scrapeScene.
 *
 * @param {Array<{_source: Object}>} scenes - Search hits wrapping the scene documents.
 * @param {Object} site - Site passed through to scrapeScene for every scene.
 * @returns {Array} One scrapeScene result per hit, in the same order.
 */
function scrapeLatest(scenes, site) {
    const toRelease = (hit) => {
        const { _source: scene } = hit;

        return scrapeScene(scene, site);
    };

    return scenes.map(toRelease);
}
async function fetchLatest(site, page = 1) {
const clusterId = '617fb597b659459bafe6472470d9073a';
const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
size: 50,
from: (page - 1) * 50,
@ -75,6 +90,8 @@ async function fetchLatest(site, page = 1) {
},
},
},
/*
* global fetch
{
nested: {
path: 'studio',
@ -94,6 +111,26 @@ async function fetchLatest(site, page = 1) {
},
},
},
*/
{
nested: {
path: 'series',
query: {
bool: {
must: [
{
match: {
'series.id': {
operator: 'AND',
query: site.parameters.siteId,
},
},
},
],
},
},
},
},
],
must_not: [
{
@ -121,7 +158,20 @@ async function fetchLatest(site, page = 1) {
return scrapeLatest(res.body.hits.hits, site);
}
/**
 * Fetch a single scene document by its page URL and scrape it into a release.
 *
 * The encoded ID is read from the third segment of the URL path, decoded with
 * decodeId, and used to request the video document from the search cluster.
 *
 * @param {string} url - Scene URL; the encoded ID is expected at path index 2.
 * @param {Object} site - Site passed through to scrapeScene.
 * @returns {Promise<Object>} The release produced by scrapeScene.
 */
async function fetchScene(url, site) {
    const [, , encodedId] = new URL(url).pathname.split('/');
    const documentUrl = `https://${clusterId}.us-east-1.aws.found.io/videos/video/${decodeId(encodedId)}`;
    const headers = { Authorization: `Basic ${authKey}` };

    const res = await bhttp.get(documentUrl, { headers });
    const { _source: scene } = res.body;

    return scrapeScene(scene, site);
}
module.exports = {
fetchLatest,
// fetchScene,
fetchScene,
};

View File

@ -5,9 +5,6 @@ const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
function scrapeLatest(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const sceneElements = $('.echThumb').toArray();
@ -57,7 +54,7 @@ async function scrapeScene(html, url, site) {
const description = sceneElement.find('.vdoDesc').text().trim();
const [siteName, ...actors] = sceneElement.find('.vdoCast a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const siteId = siteName.replace(/[\s']+/g, '').toLowerCase();
const siteSlug = siteName.replace(/[\s']+/g, '').toLowerCase();
const poster = `https:${$('img#player-overlay-image').attr('src')}`;
const trailer = `https:${$('source[type="video/mp4"]').attr('src')}`;
@ -66,17 +63,7 @@ async function scrapeScene(html, url, site) {
// all scenes seem to have 12 album photos available, not always included on the page
const photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));
const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const [channelSite, tags] = await Promise.all([
site.isFallback
? knex('sites')
.where({ slug: siteId })
.orWhere({ name: siteName })
.first()
: site,
matchTags(rawTags),
]);
const tags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const stars = Number(sceneElement.find('.bVdPl_it_like .bVdPl_txt').text().replace('% like', '')) / 20;
@ -96,12 +83,13 @@ async function scrapeScene(html, url, site) {
rating: {
stars,
},
site: channelSite || site,
site,
channel: siteSlug === 'bangcasting' ? 'bangbroscasting' : siteSlug,
};
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.get(`https://bangbros.com/websites/${site.slug}/${page}`);
const res = await bhttp.get(`${site.url}/${page}`);
return scrapeLatest(res.body.toString(), site);
}

7
src/utils/slugify.js Normal file
View File

@ -0,0 +1,7 @@
'use strict';
/**
 * Convert a string into a lowercase slug: every run of word characters
 * (letters, digits, underscore) is kept and the runs are joined with hyphens.
 *
 * @param {string} string - Text to slugify.
 * @returns {string} The slug, or an empty string when the input contains no
 *   word characters (e.g. '' or '!!!').
 */
function slugify(string) {
    // match() with a global regex returns null rather than an empty array when
    // nothing matches, so guard before joining. Surrounding whitespace never
    // matches \w and therefore needs no separate trim.
    const words = string.toLowerCase().match(/\w+/g);

    return words ? words.join('-') : '';
}
module.exports = slugify;

View File

@ -2,8 +2,10 @@
const ActorPlugins = require('./actors');
const SitePlugins = require('./sites');
// const ReleasePlugins = require('./releases');
module.exports = {
ActorPlugins,
SitePlugins,
ReleasePlugins: [],
};

View File

@ -0,0 +1,12 @@
'use strict';
const { makeExtendSchemaPlugin, gql } = require('graphile-utils');
// Schema extension plugin built with makeExtendSchemaPlugin. It is currently an
// empty skeleton: the type definitions template and the resolver map contain
// nothing yet, and the `_build` argument is unused.
// NOTE(review): confirm that graphile-utils accepts an empty `gql` document
// before registering this plugin.
const schemaExtender = makeExtendSchemaPlugin(_build => ({
typeDefs: gql`
`,
resolvers: {
},
}));
module.exports = [schemaExtender];

View File

@ -11,7 +11,7 @@ const PgConnectionFilterPlugin = require('postgraphile-plugin-connection-filter'
const PgSimplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector');
const PgOrderByRelatedPlugin = require('@graphile-contrib/pg-order-by-related');
const { ActorPlugins, SitePlugins } = require('./plugins/plugins');
const { ActorPlugins, SitePlugins, ReleasePlugins } = require('./plugins/plugins');
const {
fetchReleases,
@ -57,6 +57,7 @@ function initServer() {
PgOrderByRelatedPlugin,
...ActorPlugins,
...SitePlugins,
...ReleasePlugins,
],
},
));