Added support for release photo fallbacks. Limited photo fetching concurrency. Modified XEmpire scraper for AllBlackX support and improved photo scraping. Added movie property to Evil Angel scraper.

This commit is contained in:
2019-12-12 03:12:05 +01:00
parent c26d5b8655
commit a310f9bb1d
9 changed files with 113 additions and 70 deletions

View File

@@ -7,7 +7,6 @@ const { JSDOM } = require('jsdom');
const moment = require('moment');
const { heightToCm } = require('../utils/convert');
const { matchTags } = require('../tags');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
@@ -22,13 +21,8 @@ function scrapePhotos(html) {
.map((photoIndex, photoElement) => {
const src = $(photoElement).attr('src');
if (src.match(/dl\d+/)) {
// thumbnail URLs containing dl02/ or dl03/ don't appear to have
// a full photo available, fall back to thumbnail
return src;
}
return src.replace('thumbs/', 'photos/');
// high res often available in photos/ directory, but not always, provide original as fallback
return [src.replace('thumbs/', 'photos/'), src];
})
.toArray();
@@ -172,8 +166,8 @@ async function scrapeScene(html, url, site) {
const photos = await getPhotos(entryId, site);
const rawTags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const tags = await matchTags(rawTags);
const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const movie = $('.update_dvds a').href;
return {
url,
@@ -184,6 +178,7 @@ async function scrapeScene(html, url, site) {
description,
poster,
photos,
movie,
trailer: {
src: trailer,
quality: 720,

View File

@@ -6,14 +6,12 @@ const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const { fetchSites } = require('../sites');
const { matchTags } = require('../tags');
// Tags implied by the channel itself, keyed by site slug. scrapeScene spreads
// the entry for the scene's slug in front of the tags parsed from the page
// keywords, so every slug it can derive needs an entry here (an empty array
// when the channel implies no tags); spreading a missing entry would throw.
const defaultTags = {
hardx: [],
darkx: ['interracial'],
eroticax: [],
lesbianx: ['lesbian'],
allblackx: ['ebony', 'bbc'],
};
async function fetchPhotos(url) {
@@ -25,37 +23,56 @@ async function fetchPhotos(url) {
/**
 * Extract one photo entry per link found on an album page.
 *
 * Each `.preview .imgLink` element yields either:
 *  - the link's href, when it points at the full photo;
 *  - a `[highRes, thumbnail]` pair, when the link points at the join page and
 *    the thumbnail lives under `previews/` (the first item is a guessed
 *    full-size URL, the second the thumbnail to fall back on);
 *  - the thumbnail src alone, when no full-size URL can be derived.
 *
 * @param {string} html - markup of a single album page
 * @returns {Array<string|string[]>} photo sources in page order
 */
function scrapePhotos(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  return $('.preview .imgLink').toArray().map((linkEl) => {
    const url = $(linkEl).attr('href');

    if (url.match('/join')) {
      // URL links to join page instead of full photo, extract thumbnail
      const src = $(linkEl).find('img').attr('src');

      if (src.match('previews/')) {
        // resource often serves full photo at a modified URL anyway, add as primary source
        const highRes = src
          .replace('previews/', '')
          .replace('_tb.jpg', '.jpg');

        // keep original thumbnail as fallback in case full photo is not available
        return [highRes, src];
      }

      return src;
    }

    // URL links to full photo
    return url;
  });
}
/**
 * Collect the photos of an album, including the pages linked from its paginator.
 *
 * The first page is fetched and scraped, then every `.paginatorPages a` link
 * is fetched (at most two requests at a time) and scraped the same way.
 * Any error while fetching or scraping is logged and results in an empty list,
 * so a broken album does not abort the caller.
 *
 * @param {string} albumPath - site-relative path of the album's first page
 * @param {string} siteDomain - domain the album is served from
 * @returns {Promise<Array<string|string[]>>} entries as produced by scrapePhotos
 *   for all pages, or `[]` on failure
 */
async function getPhotos(albumPath, siteDomain) {
  const albumUrl = `https://${siteDomain}${albumPath}`;

  try {
    const html = await fetchPhotos(albumUrl);
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const photos = scrapePhotos(html);

    const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();

    // NOTE(review): Promise.map with a concurrency option is not a native
    // Promise method; presumably Promise is Bluebird here — confirm against
    // the file's requires, which are not fully visible.
    const otherPhotos = await Promise.map(pages, async (page) => {
      const pageUrl = `https://${siteDomain}${page}`;
      const pageHtml = await fetchPhotos(pageUrl);

      return scrapePhotos(pageHtml);
    }, {
      concurrency: 2,
    });

    // flat() only removes the per-page nesting; [highRes, fallback] pairs
    // returned by scrapePhotos stay intact as single entries
    return photos.concat(otherPhotos.flat());
  } catch (error) {
    console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);

    return [];
  }
}
function scrape(html, site) {
@@ -109,32 +126,26 @@ function scrape(html, site) {
async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const json = $('script[type="application/ld+json"]').html();
const json2 = $('script:contains("dataLayer = ")').html();
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
const data = JSON.parse(json)[0];
const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
const entryId = data2.sceneDetails.sceneId || new URL(url).pathname.split('/').slice(-1)[0];
const title = $('meta[name="twitter:title"]').attr('content');
const description = data.description || $('meta[name="twitter:description"]').attr('content');
const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
const description = data2.sceneDetails.sceneDescription || data.description || $('meta[name="twitter:description"]').attr('content');
// date in data object is not the release date of the scene, but the date the entry was added
const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();
const actors = data.actor
.sort(({ gender: genderA }, { gender: genderB }) => {
if (genderA === 'female' && genderB === 'male') return -1;
if (genderA === 'male' && genderB === 'female') return 1;
return 0;
})
.map(actor => actor.name);
const actors = (data2.sceneDetails.sceneActors || data.actor).map(actor => actor.actorName || actor.name);
const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
const siteDomain = $('meta[name="twitter:domain"]').attr('content');
const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
const siteUrl = siteDomain && `https://www.${siteDomain}`;
@@ -144,19 +155,10 @@ async function scrapeScene(html, url, site) {
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);
const rawTags = data.keywords.split(', ');
const [[channelSite], tags] = await Promise.all([
site.isFallback
? fetchSites({
url: siteUrl,
slug: siteSlug,
})
: [site],
matchTags([...defaultTags[siteSlug], ...rawTags]),
]);
const tags = [...defaultTags[siteSlug], ...rawTags];
return {
url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
entryId,
title,
date,
@@ -174,7 +176,8 @@ async function scrapeScene(html, url, site) {
rating: {
stars,
},
site: channelSite || site,
site,
channel: siteSlug,
};
}