Added Bang! deep scrape. Improved network page layout. Added Bang Bros logos.

2020-01-07 04:23:28 +01:00
parent 89064e9e0c
commit 0a19f2e624
71 changed files with 194 additions and 116 deletions
--- a/src/actors.js
+++ b/src/actors.js
@@ -9,6 +9,7 @@ const argv = require('./argv');
 const scrapers = require('./scrapers/scrapers');
 const whereOr = require('./utils/where-or');
 const resolvePlace = require('./utils/resolve-place');
+const slugify = require('./utils/slugify');
 const { createMediaDirectory, storePhotos } = require('./media');

 async function curateActor(actor) {
@@ -89,7 +90,7 @@ function curateActorEntry(actor, scraped, scrapeSuccess) {
            .split(' ')
            .map(segment => `${segment.charAt(0).toUpperCase()}${segment.slice(1)}`)
            .join(' '),
-        slug: actor.name.toLowerCase().replace(/\s+/g, '-'),
+        slug: slugify(actor.name),
        birthdate: actor.birthdate,
        description: actor.description,
        gender: actor.gender,
@@ -320,7 +321,7 @@ async function mergeProfiles(profiles, actor) {
 async function scrapeActors(actorNames) {
    await Promise.map(actorNames || argv.actors, async (actorName) => {
        try {
-            const actorSlug = actorName.toLowerCase().replace(/\s+/g, '-');
+            const actorSlug = slugify(actorName);
            const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
            const sources = argv.sources ? argv.sources.map(source => [source, scrapers.actors[source]]) : Object.entries(scrapers.actors);

@@ -393,28 +394,40 @@ async function scrapeBasicActors() {
 }

 async function associateActors(mappedActors, releases) {
+    const actorNames = Object.keys(mappedActors);
+    const actorSlugs = actorNames.map(name => slugify(name));
+
    const [existingActorEntries, existingAssociationEntries] = await Promise.all([
-        knex('actors').whereIn('name', Object.keys(mappedActors)),
+        knex('actors')
+            .whereIn('name', actorNames)
+            .orWhereIn('slug', actorSlugs),
        knex('releases_actors').whereIn('release_id', releases.map(release => release.id)),
    ]);

-    const associations = await Promise.map(Object.entries(mappedActors), async ([actorName, releaseIds]) => {
-        const actorEntry = existingActorEntries.find(actor => actor.name === actorName)
-            || await storeActor({ name: actorName });
+    console.log(actorNames, actorSlugs, existingActorEntries.map(actor => actor.name));

-        return releaseIds
-            .map(releaseId => ({
-                release_id: releaseId,
-                actor_id: actorEntry.id,
-            }))
-            .filter(association => !existingAssociationEntries
-                // remove associations already in database
-                .some(associationEntry => associationEntry.actor_id === association.actor_id
-                    && associationEntry.release_id === association.release_id));
+    const associations = await Promise.map(Object.entries(mappedActors), async ([actorName, releaseIds]) => {
+        try {
+            // Existing actors are selected by name OR slug above, so an entry may
+            // match on slug only (e.g. different capitalisation or punctuation).
+            // Matching on name alone would try to store such an actor a second time.
+            const actorSlug = slugify(actorName);
+            const actorEntry = existingActorEntries.find(actor => actor.name === actorName || actor.slug === actorSlug)
+                || await storeActor({ name: actorName });
+
+            return releaseIds
+                .map(releaseId => ({
+                    release_id: releaseId,
+                    actor_id: actorEntry.id,
+                }))
+                .filter(association => !existingAssociationEntries
+                    // remove associations already in database
+                    .some(associationEntry => associationEntry.actor_id === association.actor_id
+                        && associationEntry.release_id === association.release_id));
+        } catch (error) {
+            // a single failing actor must not abort the whole batch; null results are filtered out before insertion
+            console.error(actorName, error);
+            return null;
+        }
+    });

    await Promise.all([
-        knex('releases_actors').insert(associations.flat()),
+        knex('releases_actors').insert(associations.filter(association => association).flat()),
        scrapeBasicActors(),
    ]);
 }
--- a/src/releases.js
+++ b/src/releases.js
@@ -172,12 +172,16 @@ async function attachChannelSite(release) {
        };
    }

-    const urlSite = await findSiteByUrl(release.channel);
+    try {
+        const urlSite = await findSiteByUrl(release.channel);

-    return {
-        ...release,
-        site: urlSite,
-    };
+        return {
+            ...release,
+            site: urlSite,
+        };
+    } catch (error) {
+        throw new Error(`Unable to derive channel site from generic URL: ${release.url}.`);
+    }
 }

 async function attachStudio(release) {
@@ -384,7 +388,7 @@ async function storeReleases(releases) {
    const storedReleases = await Promise.map(releases, async (release) => {
        try {
            const releaseWithChannelSite = await attachChannelSite(release);
-            const releaseWithStudio = await attachStudio(release);
+            const releaseWithStudio = await attachStudio(releaseWithChannelSite);
            const releaseId = await storeRelease(releaseWithStudio);

            return {
@@ -403,6 +407,8 @@ async function storeReleases(releases) {
    const actors = accumulateActors(storedReleases);
    const movies = accumulateMovies(storedReleases);

+    console.log(actors);
+
    await Promise.all([
        associateActors(actors, storedReleases),
        Promise.map(storedReleases, async release => storeReleaseAssets(release, release.id), {
--- a/src/scrapers/bang.js
+++ b/src/scrapers/bang.js
@@ -2,6 +2,11 @@

 const bhttp = require('bhttp');

+const slugify = require('../utils/slugify');
+
+const clusterId = '617fb597b659459bafe6472470d9073a';
+const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
+
 function encodeId(id) {
    return Buffer
        .from(id, 'hex')
@@ -11,52 +16,62 @@ function encodeId(id) {
        .replace(/=/g, ',');
 }

+/**
+ * Turn a URL-safe scene ID back into its hexadecimal form.
+ *
+ * The characters `-`, `_` and `,` are mapped to `+`, `/` and `=` to restore
+ * regular base64, which is then decoded and re-encoded as hex.
+ *
+ * @param {string} id - URL-safe base64 ID, e.g. taken from a video URL
+ * @returns {string} hexadecimal representation of the decoded bytes
+ */
+function decodeId(id) {
+    const base64Characters = { '-': '+', _: '/', ',': '=' };
+    const base64Id = id.replace(/[-_,]/g, character => base64Characters[character]);
+
+    return Buffer.from(base64Id, 'base64').toString('hex');
+}
+
+/**
+ * Convert one scene document from the videos endpoint into a release.
+ *
+ * NOTE(review): assumes the scene has at least one usable screenshot; with
+ * none, the poster lookup below throws. Confirm the API guarantees this.
+ *
+ * @param {Object} scene - scene document (the `_source` of a search hit or document lookup)
+ * @param {Object} site - site the release is attributed to, passed through untouched
+ * @returns {Object} release with metadata, URL, date, media and channel
+ */
+function scrapeScene(scene, site) {
+    const entryId = scene.id;
+    const title = scene.name;
+    const actors = scene.actors.map(actor => actor.name);
+    const tags = scene.genres.concat(scene.actions).map(tag => tag.name);
+
+    const titleSlug = slugify(title);
+    const url = `https://www.bang.com/video/${encodeId(entryId)}/${titleSlug}`;
+
+    // drop the time of day, keeping midnight UTC of the release date
+    const released = new Date(scene.releaseDate);
+    const date = new Date(Date.UTC(released.getUTCFullYear(), released.getUTCMonth(), released.getUTCDate()));
+
+    if (scene.is4k) tags.push('4k');
+    if (scene.gay) tags.push('gay');
+
+    const toScreenshotUrl = screenshot => `https://i.bang.com/screenshots/${scene.dvd.id}/movie/${scene.order}/${screenshot.screenId}.jpg`;
+
+    const flaggedPoster = scene.screenshots.find(screenshot => screenshot.default === true);
+    const remaining = scene.screenshots.filter(screenshot => screenshot.default === false);
+
+    // without a flagged default, the first remaining screenshot serves as poster
+    const [posterShot, photoShots] = flaggedPoster
+        ? [flaggedPoster, remaining]
+        : [remaining[0], remaining.slice(1)];
+
+    const poster = toScreenshotUrl(posterShot);
+    const photos = photoShots.map(screenshot => toScreenshotUrl(screenshot));
+
+    const trailer = {
+        src: `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`,
+    };
+
+    // series name with spaces, dots and exclamation marks stripped, first ampersand spelled out
+    const channel = scene.series.name
+        .replace(/[! .]/g, '')
+        .replace('&', 'and');
+
+    return {
+        site,
+        entryId,
+        title,
+        description: scene.description,
+        actors,
+        tags,
+        duration: scene.duration,
+        url,
+        date,
+        poster,
+        photos,
+        trailer,
+        channel,
+    };
+}
+
 function scrapeLatest(scenes, site) {
-    return scenes.map(({ _source: scene }) => {
-        const release = {
-            site,
-            entryId: encodeId(scene.id),
-            title: scene.name,
-            description: scene.description,
-            actors: scene.actors.map(actor => actor.name),
-            tags: scene.genres.concat(scene.actions).map(genre => genre.name),
-            duration: scene.duration,
-        };
-
-        const slug = release.title.toLowerCase().trim().replace(/\s+/g, '-');
-        release.url = `https://www.bang.com/video/${release.entryId}/${slug}`;
-
-        const date = new Date(scene.releaseDate);
-        release.date = new Date(Date.UTC(date.getUTCFullYear(), date.getUTCMonth(), date.getUTCDate()));
-
-        if (scene.is4k) release.tags.push('4k');
-        if (scene.gay) release.tags.push('gay');
-
-        const defaultPoster = scene.screenshots.find(photo => photo.default === true);
-        const photoset = scene.screenshots.filter(photo => photo.default === false);
-
-        const photos = defaultPoster ? photoset : photoset.slice(1);
-        const poster = defaultPoster || photoset[0];
-
-        release.poster = `https://i.bang.com/screenshots/${scene.dvd.id}/movie/1/${poster.screenId}.jpg`;
-        release.photos = photos.map(photo => `https://i.bang.com/screenshots/${scene.dvd.id}/movie/1/${photo.screenId}.jpg`);
-
-        release.trailer = {
-            src: `https://i.bang.com/v/${scene.dvd.id}/${scene.identifier}/preview.mp4`,
-        };
-
-        release.studio = scene.series.name
-            .replace(/[! .]/g, '')
-            .replace('&', 'and');
-
-        return release;
-    });
+    return scenes.map(({ _source: scene }) => scrapeScene(scene, site));
 }

 async function fetchLatest(site, page = 1) {
-    const clusterId = '617fb597b659459bafe6472470d9073a';
-    const authKey = 'YmFuZy1yZWFkOktqVDN0RzJacmQ1TFNRazI=';
-
    const res = await bhttp.post(`https://${clusterId}.us-east-1.aws.found.io/videos/video/_search`, {
        size: 50,
        from: (page - 1) * 50,
@@ -75,6 +90,8 @@ async function fetchLatest(site, page = 1) {
                            },
                        },
                    },
+                    /*
+                     * global fetch
                    {
                        nested: {
                            path: 'studio',
@@ -94,6 +111,26 @@ async function fetchLatest(site, page = 1) {
                            },
                        },
                    },
+                    */
+                    {
+                        nested: {
+                            path: 'series',
+                            query: {
+                                bool: {
+                                    must: [
+                                        {
+                                            match: {
+                                                'series.id': {
+                                                    operator: 'AND',
+                                                    query: site.parameters.siteId,
+                                                },
+                                            },
+                                        },
+                                    ],
+                                },
+                            },
+                        },
+                    },
                ],
                must_not: [
                    {
@@ -121,7 +158,20 @@ async function fetchLatest(site, page = 1) {
    return scrapeLatest(res.body.hits.hits, site);
 }

+/**
+ * Fetch a single scene by its public URL and scrape it into a release.
+ *
+ * The encoded scene ID is read from the URL path, decoded, and used to
+ * request the scene document directly.
+ *
+ * @param {string} url - scene URL; must be absolute, as it is parsed with `new URL`
+ * @param {Object} site - site passed through to scrapeScene
+ * @returns {Promise<Object>} the scraped release
+ */
+async function fetchScene(url, site) {
+    const pathSegments = new URL(url).pathname.split('/');
+    // pathname starts with a slash, so index 0 is empty and index 2 holds the encoded ID
+    const entryId = decodeId(pathSegments[2]);
+
+    const requestOptions = {
+        headers: {
+            Authorization: `Basic ${authKey}`,
+        },
+    };
+    const { body } = await bhttp.get(`https://${clusterId}.us-east-1.aws.found.io/videos/video/${entryId}`, requestOptions);
+
+    return scrapeScene(body._source, site); // eslint-disable-line no-underscore-dangle
+}
+
 module.exports = {
    fetchLatest,
-    // fetchScene,
+    fetchScene,
 };
--- a/src/scrapers/bangbros.js
+++ b/src/scrapers/bangbros.js
@@ -5,9 +5,6 @@ const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

-const knex = require('../knex');
-const { matchTags } = require('../tags');
-
 function scrapeLatest(html, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const sceneElements = $('.echThumb').toArray();
@@ -57,7 +54,7 @@ async function scrapeScene(html, url, site) {
    const description = sceneElement.find('.vdoDesc').text().trim();

    const [siteName, ...actors] = sceneElement.find('.vdoCast a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
-    const siteId = siteName.replace(/[\s']+/g, '').toLowerCase();
+    const siteSlug = siteName.replace(/[\s']+/g, '').toLowerCase();

    const poster = `https:${$('img#player-overlay-image').attr('src')}`;
    const trailer = `https:${$('source[type="video/mp4"]').attr('src')}`;
@@ -66,17 +63,7 @@ async function scrapeScene(html, url, site) {
    // all scenes seem to have 12 album photos available, not always included on the page
    const photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));

-    const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-
-    const [channelSite, tags] = await Promise.all([
-        site.isFallback
-            ? knex('sites')
-                .where({ slug: siteId })
-                .orWhere({ name: siteName })
-                .first()
-            : site,
-        matchTags(rawTags),
-    ]);
+    const tags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();

    const stars = Number(sceneElement.find('.bVdPl_it_like .bVdPl_txt').text().replace('% like', '')) / 20;

@@ -96,12 +83,13 @@ async function scrapeScene(html, url, site) {
        rating: {
            stars,
        },
-        site: channelSite || site,
+        site,
+        channel: siteSlug === 'bangcasting' ? 'bangbroscasting' : siteSlug,
    };
 }

 async function fetchLatest(site, page = 1) {
-    const res = await bhttp.get(`https://bangbros.com/websites/${site.slug}/${page}`);
+    const res = await bhttp.get(`${site.url}/${page}`);

    return scrapeLatest(res.body.toString(), site);
 }
--- a/src/utils/slugify.js
+++ b/src/utils/slugify.js
@@ -0,0 +1,7 @@
+'use strict';
+
+/**
+ * Build a URL-friendly slug: lowercased runs of word characters
+ * (`[A-Za-z0-9_]`) joined by dashes.
+ *
+ * @param {string} string - text to convert, e.g. an actor name or release title
+ * @returns {string} the slug, or an empty string when the input contains no word characters
+ */
+function slugify(string) {
+    // match() with a global regex returns null instead of an empty array when nothing matches
+    const words = string.trim().toLowerCase().match(/\w+/g);
+
+    return words ? words.join('-') : '';
+}
+
+module.exports = slugify;
--- a/src/web/plugins/plugins.js
+++ b/src/web/plugins/plugins.js
@@ -2,8 +2,10 @@

 const ActorPlugins = require('./actors');
 const SitePlugins = require('./sites');
+// const ReleasePlugins = require('./releases');

 module.exports = {
    ActorPlugins,
    SitePlugins,
+    ReleasePlugins: [],
 };
--- a/src/web/plugins/releases.js
+++ b/src/web/plugins/releases.js
@@ -0,0 +1,12 @@
+'use strict';
+
+const { makeExtendSchemaPlugin, gql } = require('graphile-utils');
+
+// Placeholder schema extension for releases: typeDefs and resolvers are both
+// still empty. plugins.js keeps the require for this module commented out and
+// exports an empty ReleasePlugins array instead, so this is not loaded yet.
+// NOTE(review): an empty gql document may fail to parse; confirm before enabling.
+const schemaExtender = makeExtendSchemaPlugin(_build => ({
+    typeDefs: gql`
+    `,
+    resolvers: {
+    },
+}));
+
+module.exports = [schemaExtender];
--- a/src/web/server.js
+++ b/src/web/server.js
@@ -11,7 +11,7 @@ const PgConnectionFilterPlugin = require('postgraphile-plugin-connection-filter'
 const PgSimplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector');
 const PgOrderByRelatedPlugin = require('@graphile-contrib/pg-order-by-related');

-const { ActorPlugins, SitePlugins } = require('./plugins/plugins');
+const { ActorPlugins, SitePlugins, ReleasePlugins } = require('./plugins/plugins');

 const {
    fetchReleases,
@@ -57,6 +57,7 @@ function initServer() {
                PgOrderByRelatedPlugin,
                ...ActorPlugins,
                ...SitePlugins,
+                ...ReleasePlugins,
            ],
        },
    ));