Added mobile album scraping to Blowpass, improved wrapper.

This commit is contained in:
2020-03-07 02:35:13 +01:00
parent 4773a388ac
commit ff3e956fc7
5 changed files with 56 additions and 38 deletions

View File

@@ -1,22 +1,22 @@
'use strict';
const bhttp = require('bhttp');
// const bhttp = require('bhttp');
const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamma');
// NOTE(review): this span looks like a rendered diff hunk with removed and added lines
// interleaved and the +/- markers stripped. Two function headers and two declarations of
// release appear below; confirm which lines belong to which revision before running it.
//
// fetchScene(url, site): requests a blowpass.com video URL built from site.id and the last
// two path segments of url, then passes the response body to scrapeScene.
async function fetchScene(url, site) {
// const res = await bhttp.get(url);
const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`);
// fetchSceneWrapper(url, site, baseRelease): calls the fetchScene imported from ./gamma with
// the same three arguments, then may rewrite release.url (see the isFallback branch below).
async function fetchSceneWrapper(url, site, baseRelease) {
const release = await fetchScene(url, site, baseRelease);
const release = await scrapeScene(res.body.toString(), url, site);
// Channel is taken from the .siteNameSpan text: trimmed, lowercased, first .com occurrence removed.
release.channel = release.$('.siteNameSpan')
.text()
.trim()
.toLowerCase()
.replace('.com', '');
// Only rewrite the URL when the site is flagged as a fallback and a channel value is present.
if (site.isFallback && release.channel) {
// Swap the blowpass.com host text for the channel name followed by .com.
const channelUrl = url.replace('blowpass.com', `${release.channel}.com`);
// NOTE(review): the list on the next line still carries the .com suffix, while the channel
// assignment above strips .com, so this comparison presumably never matches; verify.
if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/');
else release.url = url.replace(/video\/\w+\//, 'video/');
// For these two channels the video/<word>/ path part becomes scene/ and the function returns early.
if (['onlyteenblowjobs', 'mommyblowsbest'].includes(release.channel)) {
release.url = channelUrl.replace(/video\/\w+\//, 'scene/');
return release;
}
// Every other channel gets the video/<word>/ path part collapsed to video/.
release.url = channelUrl.replace(/video\/\w+\//, 'video/');
}
return release;
}
@@ -33,5 +33,5 @@ module.exports = {
fetchLatest,
fetchProfile: networkFetchProfile,
fetchUpcoming,
fetchScene,
fetchScene: fetchSceneWrapper,
};

View File

@@ -1,6 +1,7 @@
'use strict';
const Promise = require('bluebird');
const util = require('util');
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const cheerio = require('cheerio');
@@ -38,8 +39,6 @@ function scrapePhotos(html, includeThumbnails = true) {
const url = $(linkEl).attr('href');
if (/\/join|\/createaccount/.test(url)) {
if (!includeThumbnails) return null;
// URL links to join page instead of full photo, extract thumbnail
// /createaccount is used by e.g. Tricky Spa native site
const src = $(linkEl).find('img').attr('src');
@@ -54,6 +53,8 @@ function scrapePhotos(html, includeThumbnails = true) {
return [highRes, src];
}
if (!includeThumbnails) return null;
return src;
}
@@ -232,7 +233,7 @@ async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {
const rawTags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim();
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
if (channel) release.channel = slugify(channel, { delimiter: '' });
if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
@@ -242,7 +243,9 @@ async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {
if (photoLink) {
const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available
release.photos = [...photos, ...mobilePhotos];
if (photos.length < 7) release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
else release.photos = photos;
} else {
release.photos = mobilePhotos;
}
@@ -435,9 +438,15 @@ async function fetchApiUpcoming(site) {
function getLatestUrl(site, page) {
if (site.parameters?.latest) {
return /^http/.test(site.parameters.latest)
? `${site.parameters.latest}${page}`
: `${site.url}${site.parameters.latest}${page}`;
if (/^http/.test(site.parameters.latest)) {
return /%d/.test(site.parameters.latest)
? util.format(site.parameters.latest, page)
: `${site.parameters.latest}${page}`;
}
return /%d/.test(site.parameters.latest)
? util.format(`${site.url}${site.parameters.latest}`, page)
: `${site.url}${site.parameters.latest}${page}`;
}
return `${site.url}/en/videos/AllCategories/0/${page}`;
@@ -467,14 +476,20 @@ async function fetchUpcoming(site) {
return scrapeAll(res.body.toString(), site, null, false);
}
function getDeepUrl(url, site, release, mobile) {
function getDeepUrl(url, site, baseRelease, mobile) {
const filter = new Set(['en', 'video', 'scene', site.slug, site.network.slug]);
const pathname = release?.path || new URL(url).pathname
const pathname = baseRelease?.path || new URL(url).pathname
.split('/')
.filter(component => !filter.has(component))
.join('/'); // reduce to scene ID and title slug
if (mobile) {
const sceneId = baseRelease?.entryId || pathname.match(/\/(\d+)\//)?.[1];
if (mobile && /%d/.test(mobile)) {
return util.format(mobile, sceneId);
}
if (mobile && sceneId) {
return `${mobile}${pathname}`;
}
@@ -485,13 +500,13 @@ function getDeepUrl(url, site, release, mobile) {
return url;
}
async function fetchScene(url, site, release) {
async function fetchScene(url, site, baseRelease) {
if (site.parameters?.deep === false) {
return release;
return baseRelease;
}
const deepUrl = getDeepUrl(url, site, release);
const mobileUrl = getDeepUrl(url, site, release, site.parameters?.mobile || site.network.parameters?.mobile);
const deepUrl = getDeepUrl(url, site, baseRelease);
const mobileUrl = getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.network.parameters?.mobile);
if (deepUrl) {
const [res, mobileRes] = await Promise.all([
@@ -506,7 +521,7 @@ async function fetchScene(url, site, release) {
if (res.statusCode === 200) {
const mobileBody = mobileRes?.statusCode === 200 ? mobileRes.body.toString() : null;
const scene = await scrapeScene(res.body.toString(), url, site, release, mobileBody);
const scene = await scrapeScene(res.body.toString(), url, site, baseRelease, mobileBody);
return { ...scene, deepUrl };
}
}