Added mobile album support to Gamma scraper.

This commit is contained in:
2020-03-06 04:28:01 +01:00
parent 3c14bb26c2
commit 90172ea19a
2 changed files with 45 additions and 13 deletions

View File

@@ -31,13 +31,15 @@ async function fetchPhotos(url) {
return res.body.toString();
}
function scrapePhotos(html) {
function scrapePhotos(html, includeThumbnails = true) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
return $('.preview .imgLink, .pgFooterThumb a').toArray().map((linkEl) => {
const url = $(linkEl).attr('href');
if (/\/join|\/createaccount/.test(url)) {
if (!includeThumbnails) return null;
// URL links to join page instead of full photo, extract thumbnail
// /createaccount is used by e.g. Tricky Spa native site
const src = $(linkEl).find('img').attr('src');
@@ -57,10 +59,10 @@ function scrapePhotos(html) {
// URL links to full photo
return url;
});
}).filter(Boolean);
}
async function getPhotos(albumPath, site) {
async function getPhotos(albumPath, site, includeThumbnails = true) {
const albumUrl = getAlbumUrl(albumPath, site);
if (!albumUrl) {
@@ -70,7 +72,7 @@ async function getPhotos(albumPath, site) {
try {
const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html, site);
const photos = scrapePhotos(html, includeThumbnails);
const lastPage = $('.Gamma_Paginator a.last').attr('href')?.match(/\d+$/)[0];
@@ -81,7 +83,7 @@ async function getPhotos(albumPath, site) {
const pageUrl = `${albumUrl}/${page}`;
const pageHtml = await fetchPhotos(pageUrl);
return scrapePhotos(pageHtml, site);
return scrapePhotos(pageHtml, includeThumbnails);
}, {
concurrency: 2,
});
@@ -184,9 +186,10 @@ function scrapeAll(html, site, networkUrl, hasTeaser = true) {
});
}
async function scrapeScene(html, url, site, scrapedRelease) {
async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const release = { $ };
const m$ = mobileHtml && cheerio.load(mobileHtml, { normalizeWhitespace: true });
const release = { $, url };
const json = $('script[type="application/ld+json"]').html();
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
@@ -235,7 +238,14 @@ async function scrapeScene(html, url, site, scrapedRelease) {
if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/
const photoLink = $('.picturesItem a').attr('href');
if (photoLink) release.photos = await getPhotos(photoLink, site);
const mobilePhotos = m$ ? m$('.preview-displayer a img').map((photoIndex, photoEl) => $(photoEl).attr('src')).toArray() : [];
if (photoLink) {
const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available
release.photos = [...photos, ...mobilePhotos];
} else {
release.photos = mobilePhotos;
}
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
release.trailer = [
@@ -457,9 +467,13 @@ async function fetchUpcoming(site) {
return scrapeAll(res.body.toString(), site, null, false);
}
function getDeepUrl(url, site, release) {
function getDeepUrl(url, site, release, mobile = false) {
const pathname = release?.path || new URL(url).pathname;
if (mobile) {
return `${site.parameters.mobile}${pathname}`;
}
if (site.parameters?.deep === 'network') {
return `${site.network.url}${pathname}`;
}
@@ -477,13 +491,24 @@ async function fetchScene(url, site, release) {
}
const deepUrl = getDeepUrl(url, site, release);
const mobileUrl = site.parameters?.mobile && getDeepUrl(url, site, release, true);
console.log(mobileUrl);
if (deepUrl) {
const res = await bhttp.get(deepUrl);
const [res, mobileRes] = await Promise.all([
bhttp.get(deepUrl),
mobileUrl && bhttp.get(mobileUrl, {
headers: {
// don't redirect to main site
'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36',
},
}),
]);
if (res.statusCode === 200) {
const scene = await scrapeScene(res.body.toString(), url, site, release);
const mobileBody = mobileRes.statusCode === 200 ? mobileRes.body.toString() : null;
const scene = await scrapeScene(res.body.toString(), url, site, release, mobileBody);
return { ...scene, deepUrl };
}
}