Added generic Gamma photo and actor scraper for XEmpire, 21Sextury, Blowpass and Evil Angel.

This commit is contained in:
2020-01-22 22:25:58 +01:00
parent 4e4323704a
commit f8175f6054
17 changed files with 347 additions and 290 deletions

View File

@@ -301,7 +301,7 @@ async function mergeProfiles(profiles, actor) {
piercings: prevProfile.piercings || profile.piercings,
tattoos: prevProfile.tattoos || profile.tattoos,
social: prevProfile.social.concat(profile.social || []),
avatars: prevProfile.avatars.concat(profile.avatar || []),
avatars: prevProfile.avatars.concat([profile.avatar] || []),
};
}, {
social: [],

View File

@@ -175,7 +175,7 @@ async function storePhotos(photos, {
return;
}
const pluckedPhotos = pluckPhotos(photos);
const pluckedPhotos = pluckPhotos(Array.from(new Set(photos))); // pre-filter link duplicates, limit total per configuration
const [sourceDuplicates, sourceOriginals] = await findDuplicates(pluckedPhotos, 'source', null, label);
const metaFiles = await Promise.map(sourceOriginals, async (photoUrl, index) => fetchPhoto(photoUrl, index, label), {

View File

@@ -159,7 +159,7 @@ async function attachChannelSite(release) {
}
if (!release.channel) {
throw new Error(`Unable to derive channel site from generic URL: ${release.url}.`);
throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
}
const [site] = await fetchSites({
@@ -182,7 +182,7 @@ async function attachChannelSite(release) {
site: urlSite,
};
} catch (error) {
throw new Error(`Unable to derive channel site from generic URL: ${release.url}.`);
throw new Error(`Unable to derive channel site from generic URL: ${release.url}`);
}
}

View File

@@ -1,51 +1,10 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
async function fetchPhotos(photoPath) {
const res = await bhttp.get(`https://21sextury.com${photoPath}`);
return res.body.toString();
}
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
.map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
.map((photoIndex, photoElement) => $(photoElement)
.attr('src'))
// .replace('_tb.jpg', '.jpg')) does not always work
.toArray();
return unlockedPhotos.concat(lockedThumbnails);
}
async function getPhotos(photoPath) {
if (!photoPath || photoPath.match('join')) {
return [];
}
const html = await fetchPhotos(photoPath);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const otherPhotos = await Promise.map(pages, async (pagePath) => {
const pageHtml = await fetchPhotos(pagePath);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
}
const { getPhotos, fetchProfile } = require('./gamma');
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -131,8 +90,7 @@ async function scrapeScene(html, url, site) {
const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photoPath = $('.picturesItem a').attr('href');
const photos = await getPhotos(photoPath, site);
const photos = await getPhotos($('.picturesItem a').attr('href'), '21sextury.com', site);
const tags = data.keywords.split(', ');
const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');
@@ -181,8 +139,13 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
// Profile lookup for 21sextury; the network needs the alternate Gamma
// search URL layout, hence the trailing `true`.
async function networkFetchProfile(actorName) {
  const profile = await fetchProfile(actorName, '21sextury', true);
  return profile;
}
module.exports = {
fetchLatest,
fetchProfile: networkFetchProfile,
fetchUpcoming,
fetchScene,
};

View File

@@ -1,71 +1,11 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
return res.body.toString();
}
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
return $('.preview .imgLink').toArray().map((linkEl) => {
const url = $(linkEl).attr('href');
if (url.match('/join')) {
// URL links to join page instead of full photo, extract thumbnail
const src = $(linkEl).find('img').attr('src');
if (src.match('previews/')) {
// resource often serves full photo at a modified URL anyway, add as primary source
const highRes = src
.replace('previews/', '')
.replace('_tb.jpg', '.jpg');
// keep original thumbnail as fallback in case full photo is not available
return [highRes, src];
}
return src;
}
// URL links to full photo
return url;
});
}
async function getPhotos(albumPath, siteDomain) {
const albumUrl = `https://www.blowpass.com${albumPath}`;
try {
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const otherPhotos = await Promise.map(pages, async (page) => {
const pageUrl = `https://${siteDomain}${page}`;
const pageHtml = await fetchPhotos(pageUrl);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
} catch (error) {
console.error(`Failed to fetch Blowpass photos from ${albumPath}: ${error.message}`);
return [];
}
}
const { getPhotos, fetchProfile } = require('./gamma');
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -132,7 +72,7 @@ async function scrapeScene(html, url, site) {
const poster = playerData.picPreview;
const trailer = `${playerData.playerOptions.host}${playerData.url}`;
const photos = await getPhotos($('.picturesItem a').attr('href'), channel, site);
const photos = await getPhotos($('.picturesItem a').attr('href'), 'blowpass.com', site);
const duration = moment.duration(data.duration.slice(2)).asSeconds();
const tags = data.keywords.split(', ');
@@ -180,8 +120,13 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
// Profile lookup for Blowpass via the shared Gamma scraper.
async function blowpassFetchProfile(actorName) {
  const profile = await fetchProfile(actorName, 'blowpass');
  return profile;
}
module.exports = {
fetchLatest,
fetchUpcoming,
fetchProfile: blowpassFetchProfile,
fetchScene,
fetchUpcoming,
};

View File

@@ -4,6 +4,8 @@ const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const { getPhotos } = require('./gamma');
async function scrape(json, site) {
return Promise.all(json.map(async (scene) => {
const {
@@ -75,6 +77,8 @@ async function scrapeScene(html, url, site) {
const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photos = await getPhotos($('.picturesItem a').attr('href'), 'evilangel.com', site);
return {
url,
entryId,
@@ -86,6 +90,7 @@ async function scrapeScene(html, url, site) {
duration,
tags,
poster,
photos,
trailer: {
src: trailer,
quality: parseInt(videoData.sizeOnLoad, 10),
@@ -97,7 +102,26 @@ async function scrapeScene(html, url, site) {
};
}
async function fetchLatest(site, page = 1, upcoming = false) {
/**
 * Map a raw Algolia actor record to an internal actor profile.
 *
 * @param {Object} data - Actor hit from the `all_actors` Algolia index.
 * @returns {Object} Profile with gender, description, physical attributes
 *   and avatar URL (avatar omitted when the record has no pictures).
 */
function scrapeActor(data) {
  const actor = {};

  if (data.male === 1) actor.gender = 'male';
  if (data.female === 1) actor.gender = 'female';
  if (data.shemale === 1 || data.trans === 1) actor.gender = 'transsexual';

  if (data.description) actor.description = data.description.trim();

  if (data.attributes.ethnicity) actor.ethnicity = data.attributes.ethnicity;
  if (data.attributes.eye_color) actor.eyes = data.attributes.eye_color;
  if (data.attributes.hair_color) actor.hair = data.attributes.hair_color;

  // Pictures are keyed by size; the last entry is presumably the largest
  // rendition — TODO confirm against the API. Guard against records
  // without any pictures, which previously crashed Object.values().
  const picturePaths = Object.values(data.pictures || {});

  if (picturePaths.length > 0) {
    const avatarPath = picturePaths[picturePaths.length - 1];
    actor.avatar = `https://images01-evilangel.gammacdn.com/actors${avatarPath}`;
  }

  return actor;
}
async function fetchApiCredentials() {
const res = await bhttp.get('https://evilangel.com/en/videos');
const body = res.body.toString();
@@ -108,7 +132,20 @@ async function fetchLatest(site, page = 1, upcoming = false) {
const { applicationID: appId, apiKey } = apiData.api.algolia;
const userAgent = 'Algolia for vanilla JavaScript (lite) 3.27.0;instantsearch.js 2.7.4;JS Helper 2.26.0';
const apiRes = await bhttp.post(`https://${appId.toLowerCase()}-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=${userAgent}&x-algolia-application-id=${appId}&x-algolia-api-key=${apiKey}`, {
const apiUrl = `https://${appId.toLowerCase()}-dsn.algolia.net/1/indexes/*/queries?x-algolia-agent=${userAgent}&x-algolia-application-id=${appId}&x-algolia-api-key=${apiKey}`;
return {
appId,
apiKey,
userAgent,
apiUrl,
};
}
async function fetchLatest(site, page = 1, upcoming = false) {
const { apiUrl } = await fetchApiCredentials();
const res = await bhttp.post(apiUrl, {
requests: [
{
indexName: 'all_scenes',
@@ -122,7 +159,7 @@ async function fetchLatest(site, page = 1, upcoming = false) {
encodeJSON: true,
});
return scrape(apiRes.body.results[0].hits, site);
return scrape(res.body.results[0].hits, site);
}
async function fetchUpcoming(site) {
@@ -135,8 +172,38 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
/**
 * Look up an actor on Evil Angel's Algolia search index and scrape
 * their profile.
 *
 * @param {string} actorName - Exact actor display name to match.
 * @returns {Promise<Object|null>} Scraped profile, or null when not found.
 */
async function fetchProfile(actorName) {
  const { apiUrl } = await fetchApiCredentials();
  const actorSlug = encodeURI(actorName);

  const searchPayload = {
    requests: [
      {
        indexName: 'all_actors',
        params: `query=${actorSlug}`,
      },
    ],
  };

  const res = await bhttp.post(apiUrl, searchPayload, {
    headers: {
      Referer: `https://www.evilangel.com/en/search?query=${actorSlug}&tab=actors`,
    },
    encodeJSON: true,
  });

  if (res.statusCode !== 200 || res.body.results[0].hits.length === 0) {
    return null;
  }

  // Only accept an exact name match among the search hits.
  const actorData = res.body.results[0].hits.find(hit => hit.name === actorName);

  return actorData ? scrapeActor(actorData) : null;
}
module.exports = {
fetchLatest,
fetchUpcoming,
fetchProfile,
fetchScene,
fetchUpcoming,
};

136
src/scrapers/gamma.js Normal file
View File

@@ -0,0 +1,136 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const cheerio = require('cheerio');
// Download an album (or pagination) page and return its raw HTML.
async function fetchPhotos(url) {
  const response = await bhttp.get(url);
  return response.body.toString();
}
/**
 * Extract photo URLs from one album page.
 *
 * Entries are either a single URL string, or a [primary, fallback] pair
 * when a higher-resolution variant of a locked thumbnail is guessed.
 */
function scrapePhotos(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  return $('.preview .imgLink').toArray().map((anchorEl) => {
    const href = $(anchorEl).attr('href');

    if (!href.match('/join')) {
      // Anchor links straight to the full photo.
      return href;
    }

    // Anchor links to the join page instead; fall back to the thumbnail.
    const thumbnail = $(anchorEl).find('img').attr('src');

    if (thumbnail.match('previews/')) {
      // The full photo is often served at a modified URL anyway; offer it
      // first and keep the original thumbnail as a fallback in case the
      // full photo is not available.
      const fullSize = thumbnail
        .replace('previews/', '')
        .replace('_tb.jpg', '.jpg');

      return [fullSize, thumbnail];
    }

    return thumbnail;
  });
}
/**
 * Fetch every photo URL for an album, following pagination.
 *
 * @param {string} albumPath - Site-relative album path.
 * @param {string} siteDomain - Domain to resolve paths against, e.g. 'blowpass.com'.
 * @returns {Promise<Array>} Photo URLs; empty array when anything fails.
 */
async function getPhotos(albumPath, siteDomain) {
  const albumUrl = `https://${siteDomain}${albumPath}`;

  try {
    const firstPageHtml = await fetchPhotos(albumUrl);
    const $ = cheerio.load(firstPageHtml, { normalizeWhitespace: true });

    const firstPagePhotos = scrapePhotos(firstPageHtml);
    const pagePaths = $('.paginatorPages a')
      .map((pageIndex, pageEl) => $(pageEl).attr('href'))
      .toArray();

    // Fetch the remaining pages with bounded concurrency.
    const remainingPhotos = await Promise.map(pagePaths, async (pagePath) => {
      const pageHtml = await fetchPhotos(`https://${siteDomain}${pagePath}`);
      return scrapePhotos(pageHtml);
    }, {
      concurrency: 2,
    });

    return firstPagePhotos.concat(remainingPhotos.flat());
  } catch (error) {
    // Photos are optional; log and continue without them (best effort).
    console.error(`Failed to fetch ${siteDomain} photos from ${albumPath}: ${error.message}`);
    return [];
  }
}
// Find the actor's profile link on a search results page; the `i` flag
// makes the title-attribute match case-insensitive. Returns null when
// no matching link exists.
function scrapeActorSearch(html, url, actorName) {
  const { document } = new JSDOM(html).window;
  const match = document.querySelector(`a[title="${actorName}" i]`);

  return match === null ? null : match.href;
}
/**
 * Scrape an actor profile page into a profile object.
 *
 * @param {string} html - Profile page markup.
 * @param {string} url - Page URL (unused here, kept for interface parity).
 * @param {string} actorName - Actor display name.
 * @param {string} siteSlug - Network slug used to absolutize release links.
 * @returns {Object} Profile with name, avatar candidates, description and releases.
 */
function scrapeProfile(html, url, actorName, siteSlug) {
  const { document } = new JSDOM(html).window;

  const profile = {
    name: actorName,
  };

  const avatarEl = document.querySelector('img.actorPicture');

  if (avatarEl) {
    // Larger sizes are usually available at predictable URLs; list them
    // best-first, with the original source as the final fallback.
    profile.avatar = ['500x750', '240x360', '200x300']
      .map(size => avatarEl.src.replace(/\d+x\d+/, size))
      .concat(avatarEl.src);
  }

  const descriptionEl = document.querySelector('.actorBio p:not(.bioTitle)');

  if (descriptionEl) {
    profile.description = descriptionEl.textContent.trim();
  }

  profile.releases = Array.from(
    document.querySelectorAll('.sceneList .scene a.imgLink'),
    el => `https://${siteSlug}.com${el.href}`,
  );

  return profile;
}
/**
 * Fetch and scrape an actor profile from a Gamma-platform network site.
 *
 * @param {string} actorName - Actor display name, e.g. 'Jane Doe'.
 * @param {string} siteSlug - Network slug used in the domain, e.g. 'xempire'.
 * @param {boolean} [altSearchUrl] - Use the alternate search URL layout some networks require.
 * @returns {Promise<Object|null>} Scraped profile, or null when the actor is not found
 *   or either request fails.
 */
async function fetchProfile(actorName, siteSlug, altSearchUrl) {
  // Replace EVERY whitespace run (global flag), not just the first, so
  // names with more than two words build a valid search URL.
  const actorSlug = actorName.toLowerCase().replace(/\s+/g, '+');
  const searchUrl = altSearchUrl
    ? `https://www.${siteSlug}.com/en/search/${actorSlug}/1/actor`
    : `https://www.${siteSlug}.com/en/search/${siteSlug}/actor/${actorSlug}`;

  const searchRes = await bhttp.get(searchUrl);

  if (searchRes.statusCode !== 200) {
    return null;
  }

  const actorUrl = scrapeActorSearch(searchRes.body.toString(), searchUrl, actorName);

  if (actorUrl) {
    const url = `https://${siteSlug}.com${actorUrl}`;
    const actorRes = await bhttp.get(url);

    if (actorRes.statusCode !== 200) {
      return null;
    }

    return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug);
  }

  return null;
}
// Shared Gamma-platform helpers: photo album scraping and actor profile
// scraping, reused by the 21sextury, Blowpass, Evil Angel and XEmpire
// scrapers; scrapeProfile is exposed for site-specific reuse.
module.exports = {
getPhotos,
fetchProfile,
scrapeProfile,
};

View File

@@ -1,14 +1,11 @@
'use strict';
// releases
const twentyonesextury = require('./21sextury');
const babes = require('./babes');
const bang = require('./bang');
const bangbros = require('./bangbros');
const blowpass = require('./blowpass');
const dogfart = require('./dogfart');
const digitalplayground = require('./digitalplayground');
const evilangel = require('./evilangel');
const fakehub = require('./fakehub');
const jayrock = require('./jayrock');
const kink = require('./kink');
@@ -25,11 +22,14 @@ const teamskeet = require('./teamskeet');
const vixen = require('./vixen');
// releases and profiles
const ddfnetwork = require('./ddfnetwork');
const blowpass = require('./blowpass');
const brazzers = require('./brazzers');
const ddfnetwork = require('./ddfnetwork');
const evilangel = require('./evilangel');
const julesjordan = require('./julesjordan');
const kellymadison = require('./kellymadison');
const legalporno = require('./legalporno');
const twentyonesextury = require('./21sextury');
const xempire = require('./xempire');
// profiles
@@ -71,7 +71,10 @@ module.exports = {
},
actors: {
// ordered by data priority
'21sextury': twentyonesextury,
evilangel,
xempire,
blowpass,
julesjordan,
brazzers,
legalporno,

View File

@@ -1,79 +1,10 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const defaultTags = {
hardx: [],
darkx: ['interracial'],
eroticax: [],
lesbianx: ['lesbian'],
allblackx: ['ebony', 'bbc'],
};
async function fetchPhotos(url) {
const res = await bhttp.get(url);
return res.body.toString();
}
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
return $('.preview .imgLink').toArray().map((linkEl) => {
const url = $(linkEl).attr('href');
if (url.match('/join')) {
// URL links to join page instead of full photo, extract thumbnail
const src = $(linkEl).find('img').attr('src');
if (src.match('previews/')) {
// resource often serves full photo at a modified URL anyway, add as primary source
const highRes = src
.replace('previews/', '')
.replace('_tb.jpg', '.jpg');
// keep original thumbnail as fallback in case full photo is not available
return [highRes, src];
}
return src;
}
// URL links to full photo
return url;
});
}
async function getPhotos(albumPath, siteDomain) {
const albumUrl = `https://${siteDomain}${albumPath}`;
try {
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const otherPhotos = await Promise.map(pages, async (page) => {
const pageUrl = `https://${siteDomain}${page}`;
const pageHtml = await fetchPhotos(pageUrl);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
} catch (error) {
console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);
return [];
}
}
const { getPhotos, fetchProfile } = require('./gamma');
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -154,8 +85,7 @@ async function scrapeScene(html, url, site) {
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);
const rawTags = data.keywords.split(', ');
const tags = [...defaultTags[siteSlug], ...rawTags];
const tags = data.keywords.split(', ');
return {
url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
@@ -181,31 +111,6 @@ async function scrapeScene(html, url, site) {
};
}
function scrapeActorSearch(html, url, actorName) {
const { document } = new JSDOM(html).window;
const actorLink = document.querySelector(`a[title="${actorName}" i]`);
return actorLink ? actorLink.href : null;
}
function scrapeProfile(html, url, actorName) {
const { document } = new JSDOM(html).window;
const avatarEl = document.querySelector('img.actorPicture');
const descriptionEl = document.querySelector('.actorBio p:not(.bioTitle)');
const profile = {
name: actorName,
};
if (avatarEl) profile.avatar = avatarEl.src;
if (descriptionEl) profile.description = descriptionEl.textContent.trim();
profile.releases = Array.from(document.querySelectorAll('.sceneList .scene a.imgLink'), el => `https://xempire.com${el.href}`);
return profile;
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`);
@@ -224,34 +129,13 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
async function fetchProfile(actorName) {
const actorSlug = actorName.toLowerCase().replace(/\s+/, '+');
const searchUrl = `https://www.xempire.com/en/search/xempire/actor/${actorSlug}`;
const searchRes = await bhttp.get(searchUrl);
if (searchRes.statusCode !== 200) {
return null;
}
const actorUrl = scrapeActorSearch(searchRes.body.toString(), searchUrl, actorName);
if (actorUrl) {
const url = `https://xempire.com${actorUrl}`;
const actorRes = await bhttp.get(url);
if (actorRes.statusCode !== 200) {
return null;
}
return scrapeProfile(actorRes.body.toString(), url, actorName);
}
return null;
// Profile lookup for XEmpire via the shared Gamma scraper.
async function xEmpireFetchProfile(actorName) {
  const profile = await fetchProfile(actorName, 'xempire');
  return profile;
}
module.exports = {
fetchLatest,
fetchProfile,
fetchProfile: xEmpireFetchProfile,
fetchUpcoming,
fetchScene,
};

View File

@@ -60,7 +60,7 @@ function destructConfigNetworks(networks) {
}
async function findSiteByUrl(url) {
const { hostname, origin } = new URL(url);
const { hostname } = new URL(url);
const domain = hostname.replace(/www.|tour./, '');
const sites = await knex('sites')
@@ -69,8 +69,8 @@ async function findSiteByUrl(url) {
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
)
.where('sites.url', 'like', `${domain}`)
.orWhere('sites.url', 'like', `${origin}`)
.where('sites.url', 'like', `%${domain}`)
.orWhere('sites.url', 'like', url)
.orWhere('sites.url', url);
if (sites.length > 0) {