From 90172ea19a101f24857e59428a1361544e254f83 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Fri, 6 Mar 2020 04:28:01 +0100 Subject: [PATCH] Added mobile album support to Gamma scraper. --- seeds/02_sites.js | 9 +++++++- src/scrapers/gamma.js | 49 ++++++++++++++++++++++++++++++++----------- 2 files changed, 45 insertions(+), 13 deletions(-) diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 4d834ff3..15a1e97b 100644 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -156,6 +156,7 @@ const sites = [ network: '21sextury', parameters: { referer: 'https://www.21sextury.com', + mobile: 'https://m.analteenangels.com/', }, }, { @@ -164,7 +165,10 @@ const sites = [ url: 'https://www.assholefever.com', description: 'Welcome to AssholeFever, the most hardcore anal site on the net. Watch your favorite pornstars and anal sluts from all over the world in big booty hardcore porn, anal gape, beads, anal creampie and more! Look inside if you dare!', network: '21sextury', - parameters: { networkReferer: true }, + parameters: { + networkReferer: true, + mobile: 'https://m.assholefever.com/', + }, }, { slug: 'buttplays', @@ -193,6 +197,7 @@ const sites = [ parameters: { referer: 'https://www.21sextury.com', scene: 'https://www.21sextury.com/en/video', + mobile: 'https://m.dpfanatics.com', photos: 'https://www.21sextury.com/en/photo', }, }, @@ -204,6 +209,7 @@ const sites = [ network: '21sextury', parameters: { referer: 'https://www.21sextury.com', + mobile: 'https://m.dpfanatics.com/', }, }, { @@ -215,6 +221,7 @@ const sites = [ parameters: { referer: 'https://www.21sextury.com', scene: 'https://www.21sextury.com/en/video', + mobile: 'https://m.footsiebabes.com', photos: 'https://www.21sextury.com/en/photo', }, }, diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js index 9caa9ce7..6e6d6cc1 100644 --- a/src/scrapers/gamma.js +++ b/src/scrapers/gamma.js @@ -31,13 +31,15 @@ async function fetchPhotos(url) { return res.body.toString(); } -function scrapePhotos(html) { +function scrapePhotos(html, includeThumbnails = true) { const $ = cheerio.load(html, { normalizeWhitespace: true }); return $('.preview .imgLink, .pgFooterThumb a').toArray().map((linkEl) => { const url = $(linkEl).attr('href'); if (/\/join|\/createaccount/.test(url)) { + if (!includeThumbnails) return null; + // URL links to join page instead of full photo, extract thumbnail // /createaccount is used by e.g. Tricky Spa native site const src = $(linkEl).find('img').attr('src'); @@ -57,10 +59,10 @@ function scrapePhotos(html) { // URL links to full photo return url; - }); + }).filter(Boolean); } -async function getPhotos(albumPath, site) { +async function getPhotos(albumPath, site, includeThumbnails = true) { const albumUrl = getAlbumUrl(albumPath, site); if (!albumUrl) { @@ -70,7 +72,7 @@ async function getPhotos(albumPath, site) { try { const html = await fetchPhotos(albumUrl); const $ = cheerio.load(html, { normalizeWhitespace: true }); - const photos = scrapePhotos(html, site); + const photos = scrapePhotos(html, includeThumbnails); const lastPage = $('.Gamma_Paginator a.last').attr('href')?.match(/\d+$/)[0]; @@ -81,7 +83,7 @@ async function getPhotos(albumPath, site) { const pageUrl = `${albumUrl}/${page}`; const pageHtml = await fetchPhotos(pageUrl); - return scrapePhotos(pageHtml, site); + return scrapePhotos(pageHtml, includeThumbnails); }, { concurrency: 2, }); @@ -184,9 +186,10 @@ function scrapeAll(html, site, networkUrl, hasTeaser = true) { }); } -async function scrapeScene(html, url, site, scrapedRelease) { +async function scrapeScene(html, url, site, scrapedRelease, mobileHtml) { const $ = cheerio.load(html, { normalizeWhitespace: true }); - const release = { $ }; + const m$ = mobileHtml && cheerio.load(mobileHtml, { normalizeWhitespace: true }); + const release = { $, url }; const json = $('script[type="application/ld+json"]').html(); const videoJson = $('script:contains("window.ScenePlayerOptions")').html(); @@ -235,7 +238,14 @@ async function scrapeScene(html, url, site, scrapedRelease) { if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/ const photoLink = $('.picturesItem a').attr('href'); - if (photoLink) release.photos = await getPhotos(photoLink, site); + const mobilePhotos = m$ ? m$('.preview-displayer a img').map((photoIndex, photoEl) => $(photoEl).attr('src')).toArray() : []; + + if (photoLink) { + const photos = await getPhotos(photoLink, site, mobilePhotos.length < 3); // only get thumbnails when less than 3 mobile photos are available + release.photos = [...photos, ...mobilePhotos]; + } else { + release.photos = mobilePhotos; + } const trailer = `${videoData.playerOptions.host}${videoData.url}`; release.trailer = [ @@ -457,9 +467,13 @@ async function fetchUpcoming(site) { return scrapeAll(res.body.toString(), site, null, false); } -function getDeepUrl(url, site, release) { +function getDeepUrl(url, site, release, mobile = false) { const pathname = release?.path || new URL(url).pathname; + if (mobile) { + return `${site.parameters.mobile}${pathname}`; + } + if (site.parameters?.deep === 'network') { return `${site.network.url}${pathname}`; } @@ -477,13 +491,24 @@ async function fetchScene(url, site, release) { } const deepUrl = getDeepUrl(url, site, release); + const mobileUrl = site.parameters?.mobile && getDeepUrl(url, site, release, true); + + console.log(mobileUrl); if (deepUrl) { - const res = await bhttp.get(deepUrl); + const [res, mobileRes] = await Promise.all([ + bhttp.get(deepUrl), + mobileUrl && bhttp.get(mobileUrl, { + headers: { + // don't redirect to main site + 'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Mobile Safari/537.36', + }, + }), + ]); if (res.statusCode === 200) { - const scene = await scrapeScene(res.body.toString(), url, site, release); - + const mobileBody = mobileRes.statusCode === 200 ? mobileRes.body.toString() : null; + const scene = await scrapeScene(res.body.toString(), url, site, release, mobileBody); return { ...scene, deepUrl }; } }