Added photo album support to Blowpass scraper.

2019-12-13 05:04:04 +01:00
parent c9e0c29d51
commit c998fcf933
8 changed files with 606 additions and 6 deletions

View File

@@ -65,6 +65,7 @@ async function curateRelease(release) {
gender: actor.gender,
birthdate: actor.birthdate,
age: moment().diff(actor.birthdate, 'years'),
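// the actor's age at the time of this release, alongside their current age above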
ageThen: moment(release.date).diff(actor.birthdate, 'years'),
avatar: actor.avatar,
origin: actor.birth_country_alpha2
? {

View File

@@ -44,6 +44,8 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a
console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);
console.log(oldestReleaseOnPage, afterDate, moment(oldestReleaseOnPage).isAfter(afterDate));
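// scrape another page only if this one still produced new releases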
if (
uniqueReleases.length > 0
&& (oldestReleaseOnPage || page < argv.pages)

View File

@@ -1,11 +1,71 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const { matchTags } = require('../tags');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
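// bhttp returns the body as a buffer; convert it to an HTML string for cheerio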
return res.body.toString();
}
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
return $('.preview .imgLink').toArray().map((linkEl) => {
const url = $(linkEl).attr('href');
if (url.match('/join')) {
// URL links to join page instead of full photo, extract thumbnail
const src = $(linkEl).find('img').attr('src');
if (src.match('previews/')) {
// the resource often serves the full photo at a modified URL anyway, add that as the primary source
const highRes = src
.replace('previews/', '')
.replace('_tb.jpg', '.jpg');
// keep the original thumbnail as a fallback in case the full photo is not available
return [highRes, src];
}
return src;
}
// URL links to full photo
return url;
});
}
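// For illustration, with a hypothetical file name: a guarded thumbnail such as
// '/content/previews/abc123_tb.jpg' is rewritten to '/content/abc123.jpg' and
// returned as ['/content/abc123.jpg', '/content/previews/abc123_tb.jpg'], so the
// full-size guess is tried first with the thumbnail kept as a fallback.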
async function getPhotos(albumPath, siteDomain) {
const albumUrl = `https://www.blowpass.com${albumPath}`;
try {
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
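// fetch the remaining album pages, at most two at a time, and scrape each one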
const otherPhotos = await Promise.map(pages, async (page) => {
const pageUrl = `https://${siteDomain}${page}`;
const pageHtml = await fetchPhotos(pageUrl);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
} catch (error) {
console.error(`Failed to fetch Blowpass photos from ${albumPath}: ${error.message}`);
return [];
}
}
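// errors are logged and swallowed above, so a failed album fetch never blocks the scene scrape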
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -68,13 +128,14 @@ async function scrapeScene(html, url, site) {
const likes = Number(sceneElement.find('.rating .state_1 .value').text());
const dislikes = Number(sceneElement.find('.rating .state_2 .value').text());
const channel = $('.siteNameSpan').text().trim().toLowerCase();
const poster = playerData.picPreview;
const trailer = `${playerData.playerOptions.host}${playerData.url}`;
const photos = await getPhotos($('.picturesItem a').attr('href'), channel);
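// data.duration is presumably an ISO 8601-style duration; slice(2) drops the first two characters (the 'PT' prefix) before parsing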
const duration = moment.duration(data.duration.slice(2)).asSeconds();
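// matchTags (imported above) presumably maps the raw keywords onto the known tag list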
const rawTags = data.keywords.split(', ');
const tags = await matchTags(rawTags);
return {
url,
@@ -86,6 +147,7 @@ async function scrapeScene(html, url, site) {
date,
duration,
poster,
photos,
trailer: {
src: trailer,
quality: playerData.sizeOnLoad.slice(0, -1),
@@ -96,6 +158,7 @@ async function scrapeScene(html, url, site) {
dislikes,
},
site,
channel,
};
}