Added generic Gamma photo and actor scraper for XEmpire, 21Sextury, Blowpass and Evil Angel.
src/scrapers/gamma.js (normal file, 136 lines added)
@@ -0,0 +1,136 @@
'use strict';

const Promise = require('bluebird');
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const cheerio = require('cheerio');

// fetch the raw HTML of an album (or album page) URL
async function fetchPhotos(url) {
    const res = await bhttp.get(url);

    return res.body.toString();
}

// extract photo URLs from a single album page
function scrapePhotos(html) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

    return $('.preview .imgLink').toArray().map((linkEl) => {
        const url = $(linkEl).attr('href');

        if (url.match('/join')) {
            // URL links to the join page instead of the full photo, extract the thumbnail
            const src = $(linkEl).find('img').attr('src');

            if (src.match('previews/')) {
                // the resource often serves the full photo at the modified URL anyway, add it as the primary source
                const highRes = src
                    .replace('previews/', '')
                    .replace('_tb.jpg', '.jpg');

                // keep the original thumbnail as a fallback in case the full photo is not available
                return [highRes, src];
            }

            return src;
        }

        // URL links to the full photo
        return url;
    });
}

// fetch and scrape every photo in an album, following pagination
async function getPhotos(albumPath, siteDomain) {
    const albumUrl = `https://${siteDomain}${albumPath}`;

    try {
        const html = await fetchPhotos(albumUrl);
        const $ = cheerio.load(html, { normalizeWhitespace: true });
        const photos = scrapePhotos(html);

        const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();

        const otherPhotos = await Promise.map(pages, async (page) => {
            const pageUrl = `https://${siteDomain}${page}`;
            const pageHtml = await fetchPhotos(pageUrl);

            return scrapePhotos(pageHtml);
        }, {
            concurrency: 2,
        });

        return photos.concat(otherPhotos.flat());
    } catch (error) {
        console.error(`Failed to fetch ${siteDomain} photos from ${albumPath}: ${error.message}`);

        return [];
    }
}

// find the actor's profile link in the search results
function scrapeActorSearch(html, url, actorName) {
    const { document } = new JSDOM(html).window;
    const actorLink = document.querySelector(`a[title="${actorName}" i]`);

    return actorLink ? actorLink.href : null;
}

// extract name, avatar, bio and releases from an actor profile page
function scrapeProfile(html, url, actorName, siteSlug) {
    const { document } = new JSDOM(html).window;

    const avatarEl = document.querySelector('img.actorPicture');
    const descriptionEl = document.querySelector('.actorBio p:not(.bioTitle)');

    const profile = {
        name: actorName,
    };

    if (avatarEl) {
        // larger sizes are usually available, provide fallbacks
        const avatars = [
            avatarEl.src.replace(/\d+x\d+/, '500x750'),
            avatarEl.src.replace(/\d+x\d+/, '240x360'),
            avatarEl.src.replace(/\d+x\d+/, '200x300'),
            avatarEl.src,
        ];

        profile.avatar = avatars;
    }

    if (descriptionEl) profile.description = descriptionEl.textContent.trim();

    profile.releases = Array.from(document.querySelectorAll('.sceneList .scene a.imgLink'), el => `https://${siteSlug}.com${el.href}`);

    return profile;
}

// search a site for the actor by name and scrape their profile page
async function fetchProfile(actorName, siteSlug, altSearchUrl) {
    // global flag so multi-word names are fully slugified
    const actorSlug = actorName.toLowerCase().replace(/\s+/g, '+');
    const searchUrl = altSearchUrl
        ? `https://www.${siteSlug}.com/en/search/${actorSlug}/1/actor`
        : `https://www.${siteSlug}.com/en/search/${siteSlug}/actor/${actorSlug}`;
    const searchRes = await bhttp.get(searchUrl);

    if (searchRes.statusCode !== 200) {
        return null;
    }

    const actorUrl = scrapeActorSearch(searchRes.body.toString(), searchUrl, actorName);

    if (actorUrl) {
        const url = `https://${siteSlug}.com${actorUrl}`;
        const actorRes = await bhttp.get(url);

        if (actorRes.statusCode !== 200) {
            return null;
        }

        return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug);
    }

    return null;
}

module.exports = {
    getPhotos,
    fetchProfile,
    scrapeProfile,
};
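A minimal usage sketch, not part of the commit itself; the album path, site domain, actor name and site slug below are hypothetical placeholders:

const { getPhotos, fetchProfile } = require('./src/scrapers/gamma');

(async () => {
    // hypothetical album path and site domain
    const photos = await getPhotos('/en/photo/example-album/12345', 'www.xempire.com');
    console.log(photos);

    // hypothetical actor name and site slug; altSearchUrl selects the alternate search URL format
    const profile = await fetchProfile('Jane Doe', '21sextury', false);
    console.log(profile);
})();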