Scraping all actor release pages for Gamma. Improved actor matching for Gamma API.
This commit is contained in:
parent
5fc56308d2
commit
ea9c2dfe67
|
@ -21,9 +21,17 @@ async function fetchScene(url, site) {
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the URL for one page of an actor's release listing on Blowpass.
 *
 * @param {string} actorPath - Appended verbatim right after the `0` segment,
 *   so it needs to carry its own leading slash.
 * @param {number} [page=1] - Page number; forms the final path segment.
 * @returns {string} The listing URL for the requested page.
 */
function getActorReleasesUrl(actorPath, page = 1) {
  const listingBase = 'https://www.blowpass.com/en/videos/blowpass/latest/All-Categories/0';

  return `${listingBase}${actorPath}/${page}`;
}
|
||||||
|
|
||||||
|
/**
 * Profile fetcher for this network: forwards to fetchProfile, passing null
 * as the third argument and this module's getActorReleasesUrl as the
 * release-listing URL builder.
 *
 * @param {string} actorName - Forwarded unchanged to fetchProfile.
 * @param {string} siteSlug - Forwarded unchanged to fetchProfile.
 * @returns {Promise<*>} Whatever fetchProfile resolves to.
 */
async function networkFetchProfile(actorName, siteSlug) {
  // No alternative search URL is supplied for this network.
  const altSearchUrl = null;

  return fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesUrl);
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
fetchLatest,
|
fetchLatest,
|
||||||
fetchProfile,
|
fetchProfile: networkFetchProfile,
|
||||||
fetchUpcoming,
|
fetchUpcoming,
|
||||||
fetchScene,
|
fetchScene,
|
||||||
};
|
};
|
||||||
|
|
|
@ -6,7 +6,8 @@ const { JSDOM } = require('jsdom');
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const moment = require('moment');
|
const moment = require('moment');
|
||||||
|
|
||||||
const { ex } = require('../utils/q');
|
const argv = require('../argv');
|
||||||
|
const { ex, get } = require('../utils/q');
|
||||||
const slugify = require('../utils/slugify');
|
const slugify = require('../utils/slugify');
|
||||||
|
|
||||||
async function fetchPhotos(url) {
|
async function fetchPhotos(url) {
|
||||||
|
@ -109,7 +110,7 @@ async function scrapeApiReleases(json, site) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeAll(html, site, useNetworkUrl) {
|
function scrapeAll(html, site, networkUrl) {
|
||||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||||
const scenesElements = $('li[data-itemtype=scene]').toArray();
|
const scenesElements = $('li[data-itemtype=scene]').toArray();
|
||||||
|
|
||||||
|
@ -118,7 +119,9 @@ function scrapeAll(html, site, useNetworkUrl) {
|
||||||
|
|
||||||
const sceneLinkElement = $(element).find('.sceneTitle a');
|
const sceneLinkElement = $(element).find('.sceneTitle a');
|
||||||
|
|
||||||
release.url = `${useNetworkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`;
|
if (site) release.url = `${networkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`;
|
||||||
|
else release.url = `${networkUrl}${sceneLinkElement.attr('href')}`;
|
||||||
|
|
||||||
release.title = sceneLinkElement.attr('title');
|
release.title = sceneLinkElement.attr('title');
|
||||||
|
|
||||||
release.entryId = $(element).attr('data-itemid');
|
release.entryId = $(element).attr('data-itemid');
|
||||||
|
@ -241,7 +244,24 @@ function scrapeActorSearch(html, url, actorName) {
|
||||||
return actorLink ? actorLink.href : null;
|
return actorLink ? actorLink.href : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function scrapeProfile(html, url, actorName, _siteSlug) {
|
/**
 * Collect the releases listed on every page of an actor's release listing.
 *
 * Starting at `page`, each listing page is fetched and scraped with
 * scrapeAll, and the next page is requested for as long as the current page
 * contains a `.Gamma_Paginator a.next` element.
 *
 * @param {string} profileUrl - Absolute URL of the actor profile. Its last
 *   two path segments (prefixed with `/`) are passed to getActorReleasesUrl,
 *   and its origin is passed to scrapeAll.
 * @param {Function} getActorReleasesUrl - Called as (actorPath, page); must
 *   return the URL of that listing page.
 * @param {number} [page=1] - First page to fetch.
 * @param {Array} [accReleases=[]] - Releases gathered so far; not mutated.
 * @returns {Promise<Array>} accReleases followed by the releases scraped from
 *   each visited page, in page order.
 */
async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, accReleases = []) {
  const profileLocation = new URL(profileUrl);
  const actorPath = `/${profileLocation.pathname.split('/').slice(-2).join('/')}`;

  let collected = accReleases;
  let currentPage = page;

  for (;;) {
    const listingUrl = getActorReleasesUrl(actorPath, currentPage);
    const { html, qu } = await get(listingUrl);

    // No site object is available here, so the profile's origin is handed to
    // scrapeAll in its place.
    const pageReleases = scrapeAll(html, null, profileLocation.origin);
    const hasNextPage = qu('.Gamma_Paginator a.next');

    collected = collected.concat(pageReleases);

    if (!hasNextPage) {
      return collected;
    }

    currentPage += 1;
  }
}
|
||||||
|
|
||||||
|
async function scrapeProfile(html, url, actorName, _siteSlug, getActorReleasesUrl) {
|
||||||
const { q } = ex(html);
|
const { q } = ex(html);
|
||||||
|
|
||||||
const avatar = q('img.actorPicture');
|
const avatar = q('img.actorPicture');
|
||||||
|
@ -275,12 +295,9 @@ function scrapeProfile(html, url, actorName, _siteSlug) {
|
||||||
if (alias) profile.aliases = alias.split(':')[1].trim().split(', ');
|
if (alias) profile.aliases = alias.split(':')[1].trim().split(', ');
|
||||||
if (nationality) profile.nationality = nationality.split(':')[1].trim();
|
if (nationality) profile.nationality = nationality.split(':')[1].trim();
|
||||||
|
|
||||||
/* not fetching all releases
|
if (getActorReleasesUrl && argv.withReleases) {
|
||||||
profile.releases = Array.from(document.querySelectorAll('.sceneList .scene a.imgLink'), el => `https://${siteSlug}.com${el.href}`);
|
profile.releases = await fetchActorReleases(url, getActorReleasesUrl);
|
||||||
const moreReleases = qu('.seeAllTop a');
|
}
|
||||||
|
|
||||||
console.log(moreReleases);
|
|
||||||
*/
|
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
@ -365,8 +382,6 @@ async function fetchLatest(site, page = 1) {
|
||||||
const url = `${site.url}${site.parameters?.latest || '/en/videos/AllCategories/0/'}${page}`;
|
const url = `${site.url}${site.parameters?.latest || '/en/videos/AllCategories/0/'}${page}`;
|
||||||
const res = await bhttp.get(url);
|
const res = await bhttp.get(url);
|
||||||
|
|
||||||
console.log(url);
|
|
||||||
|
|
||||||
return scrapeAll(res.body.toString(), site);
|
return scrapeAll(res.body.toString(), site);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -405,7 +420,7 @@ async function fetchActorScenes(actorName, apiUrl, siteSlug) {
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchProfile(actorName, siteSlug, altSearchUrl) {
|
async function fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesUrl) {
|
||||||
const actorSlug = actorName.toLowerCase().replace(/\s+/, '+');
|
const actorSlug = actorName.toLowerCase().replace(/\s+/, '+');
|
||||||
const searchUrl = altSearchUrl
|
const searchUrl = altSearchUrl
|
||||||
? `https://www.${siteSlug}.com/en/search/${actorSlug}/1/actor`
|
? `https://www.${siteSlug}.com/en/search/${actorSlug}/1/actor`
|
||||||
|
@ -426,7 +441,7 @@ async function fetchProfile(actorName, siteSlug, altSearchUrl) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug);
|
return scrapeProfile(actorRes.body.toString(), url, actorName, siteSlug, getActorReleasesUrl);
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
|
|
@ -19,9 +19,17 @@ async function fetchScene(url, site) {
|
||||||
return release;
|
return release;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Build the URL for one page of an actor's release listing on XEmpire.
 *
 * @param {string} actorPath - Appended verbatim right after the trailing `0`
 *   segment, so it needs to carry its own leading slash.
 * @param {number} [page=1] - Page number; placed directly after `latest/`.
 * @returns {string} The listing URL for the requested page.
 */
function getActorReleasesUrl(actorPath, page = 1) {
  const listingBase = 'https://www.xempire.com/en/videos/xempire/latest';

  return `${listingBase}/${page}/All-Categories/0${actorPath}`;
}
|
||||||
|
|
||||||
|
/**
 * Profile fetcher for this network: forwards to fetchProfile, passing null
 * as the third argument and this module's getActorReleasesUrl as the
 * release-listing URL builder.
 *
 * @param {string} actorName - Forwarded unchanged to fetchProfile.
 * @param {string} siteSlug - Forwarded unchanged to fetchProfile.
 * @returns {Promise<*>} Whatever fetchProfile resolves to.
 */
async function networkFetchProfile(actorName, siteSlug) {
  // No alternative search URL is supplied for this network.
  const altSearchUrl = null;

  return fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesUrl);
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
fetchLatest,
|
fetchLatest,
|
||||||
fetchProfile,
|
fetchProfile: networkFetchProfile,
|
||||||
fetchUpcoming,
|
fetchUpcoming,
|
||||||
fetchScene,
|
fetchScene,
|
||||||
};
|
};
|
||||||
|
|
Loading…
Reference in New Issue