Added mobile album scraping to Blowpass, improved wrapper.
This commit is contained in:
@@ -1,22 +1,22 @@
|
||||
'use strict';
|
||||
|
||||
const bhttp = require('bhttp');
|
||||
// const bhttp = require('bhttp');
|
||||
|
||||
const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');
|
||||
const { fetchScene, fetchLatest, fetchUpcoming, fetchProfile } = require('./gamma');
|
||||
|
||||
async function fetchScene(url, site) {
|
||||
// const res = await bhttp.get(url);
|
||||
const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`);
|
||||
async function fetchSceneWrapper(url, site, baseRelease) {
|
||||
const release = await fetchScene(url, site, baseRelease);
|
||||
|
||||
const release = await scrapeScene(res.body.toString(), url, site);
|
||||
release.channel = release.$('.siteNameSpan')
|
||||
.text()
|
||||
.trim()
|
||||
.toLowerCase()
|
||||
.replace('.com', '');
|
||||
if (site.isFallback && release.channel) {
|
||||
const channelUrl = url.replace('blowpass.com', `${release.channel}.com`);
|
||||
|
||||
if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/');
|
||||
else release.url = url.replace(/video\/\w+\//, 'video/');
|
||||
if (['onlyteenblowjobs', 'mommyblowsbest'].includes(release.channel)) {
|
||||
release.url = channelUrl.replace(/video\/\w+\//, 'scene/');
|
||||
return release;
|
||||
}
|
||||
|
||||
release.url = channelUrl.replace(/video\/\w+\//, 'video/');
|
||||
}
|
||||
|
||||
return release;
|
||||
}
|
||||
@@ -33,5 +33,5 @@ module.exports = {
|
||||
fetchLatest,
|
||||
fetchProfile: networkFetchProfile,
|
||||
fetchUpcoming,
|
||||
fetchScene,
|
||||
fetchScene: fetchSceneWrapper,
|
||||
};
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
'use strict';
|
||||
|
||||
const Promise = require('bluebird');
|
||||
const util = require('util');
|
||||
const bhttp = require('bhttp');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const cheerio = require('cheerio');
|
||||
@@ -38,8 +39,6 @@ function scrapePhotos(html, includeThumbnails = true) {
|
||||
const url = $(linkEl).attr('href');
|
||||
|
||||
if (/\/join|\/createaccount/.test(url)) {
|
||||
if (!includeThumbnails) return null;
|
||||
|
||||
// URL links to join page instead of full photo, extract thumbnail
|
||||
// /createaccount is used by e.g. Tricky Spa native site
|
||||
const src = $(linkEl).find('img').attr('src');
|
||||
@@ -54,6 +53,8 @@ function scrapePhotos(html, includeThumbnails = true) {
|
||||
return [highRes, src];
|
||||
}
|
||||
|
||||
if (!includeThumbnails) return null;
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
@@ -232,7 +233,7 @@ async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {
|
||||
const rawTags = data?.keywords?.split(', ') || data2?.keywords?.split(', ') || [];
|
||||
release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags;
|
||||
|
||||
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim();
|
||||
const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', '');
|
||||
if (channel) release.channel = slugify(channel, { delimiter: '' });
|
||||
|
||||
if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
|
||||
@@ -242,7 +243,9 @@ async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {
|
||||
|
||||
if (photoLink) {
|
||||
const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available
|
||||
release.photos = [...photos, ...mobilePhotos];
|
||||
|
||||
if (photos.length < 7) release.photos = [...photos, ...mobilePhotos]; // probably only teaser photos available, supplement with mobile album
|
||||
else release.photos = photos;
|
||||
} else {
|
||||
release.photos = mobilePhotos;
|
||||
}
|
||||
@@ -435,9 +438,15 @@ async function fetchApiUpcoming(site) {
|
||||
|
||||
function getLatestUrl(site, page) {
|
||||
if (site.parameters?.latest) {
|
||||
return /^http/.test(site.parameters.latest)
|
||||
? `${site.parameters.latest}${page}`
|
||||
: `${site.url}${site.parameters.latest}${page}`;
|
||||
if (/^http/.test(site.parameters.latest)) {
|
||||
return /%d/.test(site.parameters.latest)
|
||||
? util.format(site.parameters.latest, page)
|
||||
: `${site.parameters.latest}${page}`;
|
||||
}
|
||||
|
||||
return /%d/.test(site.parameters.latest)
|
||||
? util.format(`${site.url}${site.parameters.latest}`, page)
|
||||
: `${site.url}${site.parameters.latest}${page}`;
|
||||
}
|
||||
|
||||
return `${site.url}/en/videos/AllCategories/0/${page}`;
|
||||
@@ -467,14 +476,20 @@ async function fetchUpcoming(site) {
|
||||
return scrapeAll(res.body.toString(), site, null, false);
|
||||
}
|
||||
|
||||
function getDeepUrl(url, site, release, mobile) {
|
||||
function getDeepUrl(url, site, baseRelease, mobile) {
|
||||
const filter = new Set(['en', 'video', 'scene', site.slug, site.network.slug]);
|
||||
const pathname = release?.path || new URL(url).pathname
|
||||
const pathname = baseRelease?.path || new URL(url).pathname
|
||||
.split('/')
|
||||
.filter(component => !filter.has(component))
|
||||
.join('/'); // reduce to scene ID and title slug
|
||||
|
||||
if (mobile) {
|
||||
const sceneId = baseRelease?.entryId || pathname.match(/\/(\d+)\//)?.[1];
|
||||
|
||||
if (mobile && /%d/.test(mobile)) {
|
||||
return util.format(mobile, sceneId);
|
||||
}
|
||||
|
||||
if (mobile && sceneId) {
|
||||
return `${mobile}${pathname}`;
|
||||
}
|
||||
|
||||
@@ -485,13 +500,13 @@ function getDeepUrl(url, site, release, mobile) {
|
||||
return url;
|
||||
}
|
||||
|
||||
async function fetchScene(url, site, release) {
|
||||
async function fetchScene(url, site, baseRelease) {
|
||||
if (site.parameters?.deep === false) {
|
||||
return release;
|
||||
return baseRelease;
|
||||
}
|
||||
|
||||
const deepUrl = getDeepUrl(url, site, release);
|
||||
const mobileUrl = getDeepUrl(url, site, release, site.parameters?.mobile || site.network.parameters?.mobile);
|
||||
const deepUrl = getDeepUrl(url, site, baseRelease);
|
||||
const mobileUrl = getDeepUrl(url, site, baseRelease, site.parameters?.mobile || site.network.parameters?.mobile);
|
||||
|
||||
if (deepUrl) {
|
||||
const [res, mobileRes] = await Promise.all([
|
||||
@@ -506,7 +521,7 @@ async function fetchScene(url, site, release) {
|
||||
|
||||
if (res.statusCode === 200) {
|
||||
const mobileBody = mobileRes?.statusCode === 200 ? mobileRes.body.toString() : null;
|
||||
const scene = await scrapeScene(res.body.toString(), url, site, release, mobileBody);
|
||||
const scene = await scrapeScene(res.body.toString(), url, site, baseRelease, mobileBody);
|
||||
return { ...scene, deepUrl };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,7 +62,7 @@ async function matchTags(rawTags) {
|
||||
async function associateTags(release, releaseId) {
|
||||
const siteTags = release.site?.tags?.filter(tag => tag.inherit === true).map(tag => tag.id) || [];
|
||||
|
||||
const rawReleaseTags = release.tags.filter(Boolean) || [];
|
||||
const rawReleaseTags = release.tags?.filter(Boolean) || [];
|
||||
const releaseTags = rawReleaseTags.some(tag => typeof tag === 'string')
|
||||
? await matchTags(release.tags) // scraper returned raw tags
|
||||
: rawReleaseTags; // tags already matched by (outdated) scraper
|
||||
|
||||
Reference in New Issue
Block a user