From ea9c2dfe677297bb760880ded98f130e6caad3e5 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Fri, 7 Feb 2020 01:48:21 +0100 Subject: [PATCH] Scraping all actor release pages for Gamma. Improved actor matching for Gamma API. --- src/scrapers/blowpass.js | 10 +++++++++- src/scrapers/gamma.js | 43 +++++++++++++++++++++++++++------------- src/scrapers/xempire.js | 10 +++++++++- 3 files changed, 47 insertions(+), 16 deletions(-) diff --git a/src/scrapers/blowpass.js b/src/scrapers/blowpass.js index d89fd9de..c2f835a3 100644 --- a/src/scrapers/blowpass.js +++ b/src/scrapers/blowpass.js @@ -21,9 +21,17 @@ async function fetchScene(url, site) { return release; } +function getActorReleasesUrl(actorPath, page = 1) { + return `https://www.blowpass.com/en/videos/blowpass/latest/All-Categories/0${actorPath}/${page}`; +} + +async function networkFetchProfile(actorName, siteSlug) { + return fetchProfile(actorName, siteSlug, null, getActorReleasesUrl); +} + module.exports = { fetchLatest, - fetchProfile, + fetchProfile: networkFetchProfile, fetchUpcoming, fetchScene, }; diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js index 1a35e0c4..7131215a 100644 --- a/src/scrapers/gamma.js +++ b/src/scrapers/gamma.js @@ -6,7 +6,8 @@ const { JSDOM } = require('jsdom'); const cheerio = require('cheerio'); const moment = require('moment'); -const { ex } = require('../utils/q'); +const argv = require('../argv'); +const { ex, get } = require('../utils/q'); const slugify = require('../utils/slugify'); async function fetchPhotos(url) { @@ -109,7 +110,7 @@ async function scrapeApiReleases(json, site) { }); } -function scrapeAll(html, site, useNetworkUrl) { +function scrapeAll(html, site, networkUrl) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const scenesElements = $('li[data-itemtype=scene]').toArray(); @@ -118,7 +119,9 @@ function scrapeAll(html, site, useNetworkUrl) { const sceneLinkElement = $(element).find('.sceneTitle a'); - release.url = `${useNetworkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`; + if (site) release.url = `${networkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`; + else release.url = `${networkUrl}${sceneLinkElement.attr('href')}`; + release.title = sceneLinkElement.attr('title'); release.entryId = $(element).attr('data-itemid'); @@ -241,7 +244,24 @@ function scrapeActorSearch(html, url, actorName) { return actorLink ? actorLink.href : null; } -function scrapeProfile(html, url, actorName, _siteSlug) { +async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, accReleases = []) { + const { origin, pathname } = new URL(profileUrl); + const profilePath = `/${pathname.split('/').slice(-2).join('/')}`; + + const url = getActorReleasesUrl(profilePath, page); + const { html, qu } = await get(url); + + const releases = scrapeAll(html, null, origin); + const nextPage = qu('.Gamma_Paginator a.next'); + + if (nextPage) { + return fetchActorReleases(profileUrl, getActorReleasesUrl, page + 1, accReleases.concat(releases)); + } + + return accReleases.concat(releases); +} + +async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl) { const { q } = ex(html); const avatar = q('img.actorPicture'); @@ -275,12 +295,9 @@ function scrapeProfile(html, url, actorName, _siteSlug) { if (alias) profile.aliases = alias.split(':')[1].trim().split(', '); if (nationality) profile.nationality = nationality.split(':')[1].trim(); - /* not fetching all releases - profile.releases = Array.from(document.querySelectorAll('.sceneList .scene a.imgLink'), el => `https://${siteSlug}.com${el.href}`); - const moreReleases = qu('.seeAllTop a'); - - console.log(moreReleases); - */ + if (getActorReleasesUrl && argv.withReleases) { + profile.releases = await fetchActorReleases(url, getActorReleasesUrl); + } return profile; } @@ -365,8 +382,6 @@ async function fetchLatest(site, page = 1) { const url = `${site.url}${site.parameters?.latest || '/en/videos/AllCategories/0/'}${page}`; const res = await bhttp.get(url); - console.log(url); - return scrapeAll(res.body.toString(), site); } @@ -405,7 +420,7 @@ async function fetchActorScenes(actorName, apiUrl, siteSlug) { return []; } -async function fetchProfile(actorName, siteSlug, altSearchUrl) { +async function fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesUrl) { const actorSlug = actorName.toLowerCase().replace(/\s+/, '+'); const searchUrl = altSearchUrl ? `https://www.${siteSlug}.com/en/search/${actorSlug}/1/actor` @@ -426,7 +441,7 @@ async function fetchProfile(actorName, siteSlug, altSearchUrl) { return null; } - return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug); + return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug, getActorReleasesUrl); } return null; diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js index 99237194..0a8e89c2 100644 --- a/src/scrapers/xempire.js +++ b/src/scrapers/xempire.js @@ -19,9 +19,17 @@ async function fetchScene(url, site) { return release; } +function getActorReleasesUrl(actorPath, page = 1) { + return `https://www.xempire.com/en/videos/xempire/latest/${page}/All-Categories/0${actorPath}`; +} + +async function networkFetchProfile(actorName, siteSlug) { + return fetchProfile(actorName, siteSlug, null, getActorReleasesUrl); +} + module.exports = { fetchLatest, - fetchProfile, + fetchProfile: networkFetchProfile, fetchUpcoming, fetchScene, };