Added mobile album scraping to Blowpass, improved wrapper.

2020-03-07 02:35:13 +01:00 · 2020-03-07 02:35:13 +01:00 · ff3e956fc7
parent 4773a388ac
commit ff3e956fc7
5 changed files with 56 additions and 38 deletions
--- a/seeds/01_networks.js
+++ b/seeds/01_networks.js
@ -94,6 +94,9 @@ const networks = [
        name: 'Blowpass',
        url: 'https://www.blowpass.com',
        description: 'Welcome to Blowpass.com, your ultimate source for deepthroat porn, MILF and teen blowjob videos, big cumshots and any and everything oral!',
+        parameters: {
+            mobile: 'https://m.blowpass.com/en/video/v/%d', // the 'v' path segment can be any string; %d is replaced with the scene ID
+        },
        parent: 'gamma',
    },
    {
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@ -1011,8 +1011,8 @@ const sites = [
        description: 'Welcome to 1000Facials.com, your source for the best facial porn with huge cumshots on your favorite teen and MILF pornstars. Watch all the blowjob action inside!',
        network: 'blowpass',
        parameters: {
-            latest: '/en/videos/latest/All-Categories/0/All-Pornstars/0/',
-            upcoming: '/en/videos/upcoming',
+            latest: '/en/scenes/updates/%d/Category/0/Pornstar/0',
+            upcoming: '/en/scenes/upcoming',
        },
    },
    {
@ -1022,8 +1022,8 @@ const sites = [
        description: 'Watch live sex shows and videos on ImmoralLive.com, featuring wild and crazy sex orgies, group sex, blowjob competitions and toy play from the famous Porno Dan. The hottest pornstars and amateur girls cum hard inside',
        network: 'blowpass',
        parameters: {
-            latest: '/en/videos/latest/All-Categories/0/All-Pornstars/0/',
-            upcoming: '/en/videos/upcoming',
+            latest: '/en/videos/All-Categories/0/All-Pornstars/0/All/0/',
+            upcoming: '/en/videos/All-Categories/0/All-Pornstars/0/All/0/1/upcoming',
        },
    },
    {
@ -1033,8 +1033,8 @@ const sites = [
        description: 'Welcome to MommyBlowsBest.com. Home to thousands of MILF blowjobs and hot mom porn! Come see why experience counts, right here at MommyBlowsBest.com!',
        network: 'blowpass',
        parameters: {
-            latest: '/en/videos/latest/All-Categories/0/All-Pornstars/0/',
-            upcoming: '/en/videos/upcoming',
+            latest: '/en/scenes/updates/0/Category/0/Actor/',
+            upcoming: '/en/scenes/upcoming',
        },
    },
    {
@ -1044,8 +1044,8 @@ const sites = [
        description: 'OnlyTeenBlowjobs.com brings you the best teen blowjob porn featuring today\'s hottest young pornstars and amateurs. Watch as teens use their little mouths to suck and deepthroat the biggest of cocks!',
        network: 'blowpass',
        parameters: {
-            latest: '/en/videos/latest/All-Categories/0/All-Pornstars/0/',
-            upcoming: '/en/videos/upcoming',
+            latest: '/en/scenes/updates/0/Category/0/Actor/',
+            upcoming: '/en/scenes/upcoming',
        },
    },
    {
--- a/src/scrapers/blowpass.js
+++ b/src/scrapers/blowpass.js
@ -1,22 +1,22 @@
 'use strict';

-const bhttp = require('bhttp');
+// const bhttp = require('bhttp');

-const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
+const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamma');

-async function fetchScene(url, site) {
-    // const res = await bhttp.get(url);
-    const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`);
+async function fetchSceneWrapper(url, site, baseRelease) {
+    const release = await fetchScene(url, site, baseRelease);

-    const release = await scrapeScene(res.body.toString(), url, site);
-    release.channel = release.$('.siteNameSpan')
-        .text()
-        .trim()
-        .toLowerCase()
-        .replace('.com', '');
+    if (site.isFallback && release.channel) {
+        const channelUrl = url.replace('blowpass.com', `${release.channel}.com`);

-    if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/');
-    else release.url = url.replace(/video\/\w+\//, 'video/');
+        if (['onlyteenblowjobs', 'mommyblowsbest'].includes(release.channel)) {
+            release.url = channelUrl.replace(/video\/\w+\//, 'scene/');
+            return release;
+        }
+
+        release.url = channelUrl.replace(/video\/\w+\//, 'video/');
+    }

    return release;
 }
@ -33,5 +33,5 @@ module.exports = {
    fetchLatest,
    fetchProfile: networkFetchProfile,
    fetchUpcoming,
-    fetchScene,
+    fetchScene: fetchSceneWrapper,
 };
--- a/src/scrapers/gamma.js
+++ b/src/scrapers/gamma.js
@ -1,6 +1,7 @@
 'use strict';

 const Promise = require('bluebird');
+const util = require('util');
 const bhttp = require('bhttp');
 const { JSDOM } = require('jsdom');
 const cheerio = require('cheerio');
@ -38,8 +39,6 @@ function scrapePhotos(html, includeThumbnails = true) {
        const url = $(linkEl).attr('href');

        if (/\/join|\/createaccount/.test(url)) {
-            if (!includeThumbnails) return null;
-
            // URL links to join page instead of full photo, extract thumbnail
            // /createaccount is used by e.g. Tricky Spa native site
            const src = $(linkEl).find('img').attr('src');
@ -54,6 +53,8 @@ function scrapePhotos(html, includeThumbnails = true) {
                return [highRes, src];
            }

+            if (!includeThumbnails) return null;
+
            return src;
        }

@ -232,7 +233,7 @@ async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {
    const rawTags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
    release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;

-    const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim();
+    const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
    if (channel) release.channel = slugify(channel, { delimiter: '' });

    if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
@ -242,7 +243,9 @@ async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {

    if (photoLink) {
        const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available
-        release.photos = [...photos, ...mobilePhotos];
+
+        if (photos.length < 7) release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
+        else release.photos = photos;
    } else {
        release.photos = mobilePhotos;
    }
@ -435,9 +438,15 @@ async function fetchApiUpcoming(site) {

 function getLatestUrl(site, page) {
    if (site.parameters?.latest) {
-        return /^http/.test(site.parameters.latest)
-            ? `${site.parameters.latest}${page}`
-            : `${site.url}${site.parameters.latest}${page}`;
+        if (/^http/.test(site.parameters.latest)) {
+            return /%d/.test(site.parameters.latest)
+                ? util.format(site.parameters.latest, page)
+                : `${site.parameters.latest}${page}`;
+        }
+
+    return /%d/.test(site.parameters.latest)
+        ? util.format(`${site.url}${site.parameters.latest}`, page)
+        : `${site.url}${site.parameters.latest}${page}`;
    }

    return `${site.url}/en/videos/AllCategories/0/${page}`;
@ -467,14 +476,20 @@ async function fetchUpcoming(site) {
    return scrapeAll(res.body.toString(), site, null, false);
 }

-function getDeepUrl(url, site, release, mobile) {
+function getDeepUrl(url, site, baseRelease, mobile) {
    const filter = new Set(['en', 'video', 'scene', site.slug, site.network.slug]);
-    const pathname = release?.path || new URL(url).pathname
+    const pathname = baseRelease?.path || new URL(url).pathname
        .split('/')
        .filter(component => !filter.has(component))
        .join('/'); // reduce to scene ID and title slug

-    if (mobile) {
+    const sceneId = baseRelease?.entryId || pathname.match(/\/(\d+)\//)?.[1];
+
+    if (mobile && /%d/.test(mobile)) {
+        return util.format(mobile, sceneId);
+    }
+
+    if (mobile && sceneId) {
        return `${mobile}${pathname}`;
    }

@ -485,13 +500,13 @@ function getDeepUrl(url, site, release, mobile) {
    return url;
 }

-async function fetchScene(url, site, release) {
+async function fetchScene(url, site, baseRelease) {
    if (site.parameters?.deep === false) {
-        return release;
+        return baseRelease;
    }

-    const deepUrl = getDeepUrl(url, site, release);
-    const mobileUrl = getDeepUrl(url, site, release, site.parameters?.mobile || site.network.parameters?.mobile);
+    const deepUrl = getDeepUrl(url, site, baseRelease);
+    const mobileUrl = getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.network.parameters?.mobile);

    if (deepUrl) {
        const [res, mobileRes] = await Promise.all([
@ -506,7 +521,7 @@ async function fetchScene(url, site, release) {

        if (res.statusCode === 200) {
            const mobileBody = mobileRes?.statusCode === 200 ? mobileRes.body.toString() : null;
-            const scene = await scrapeScene(res.body.toString(), url, site, release, mobileBody);
+            const scene = await scrapeScene(res.body.toString(), url, site, baseRelease, mobileBody);
            return { ...scene, deepUrl };
        }
    }
--- a/src/tags.js
+++ b/src/tags.js
@ -62,7 +62,7 @@ async function matchTags(rawTags) {
 async function associateTags(release, releaseId) {
    const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || [];

-    const rawReleaseTags = release.tags.filter(Boolean) || [];
+    const rawReleaseTags = release.tags?.filter(Boolean) || [];
    const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string')
        ? await matchTags(release.tags) // scraper returned raw tags
        : rawReleaseTags; // tags already matched by (outdated) scraper