forked from DebaucheryLibrarian/traxxx
Added support for release photo fallbacks. Limited photo fetching concurrency. Modified XEmpire scraper for AllBlackX support and improved photo scraping. Added movie property to Evil Angel scraper.
This commit is contained in:
@@ -7,7 +7,6 @@ const { JSDOM } = require('jsdom');
|
||||
const moment = require('moment');
|
||||
|
||||
const { heightToCm } = require('../utils/convert');
|
||||
const { matchTags } = require('../tags');
|
||||
|
||||
async function fetchPhotos(url) {
|
||||
const res = await bhttp.get(url);
|
||||
@@ -22,13 +21,8 @@ function scrapePhotos(html) {
|
||||
.map((photoIndex, photoElement) => {
|
||||
const src = $(photoElement).attr('src');
|
||||
|
||||
if (src.match(/dl\d+/)) {
|
||||
// thumbnail URLs containing dl02/ or dl03/ don't appear to have
|
||||
// a full photo available, fall back to thumbnail
|
||||
return src;
|
||||
}
|
||||
|
||||
return src.replace('thumbs/', 'photos/');
|
||||
// high res often available in photos/ directory, but not always, provide original as fallback
|
||||
return [src.replace('thumbs/', 'photos/'), src];
|
||||
})
|
||||
.toArray();
|
||||
|
||||
@@ -172,8 +166,8 @@ async function scrapeScene(html, url, site) {
|
||||
|
||||
const photos = await getPhotos(entryId, site);
|
||||
|
||||
const rawTags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||
const tags = await matchTags(rawTags);
|
||||
const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||
const movie = $('.update_dvds a').href;
|
||||
|
||||
return {
|
||||
url,
|
||||
@@ -184,6 +178,7 @@ async function scrapeScene(html, url, site) {
|
||||
description,
|
||||
poster,
|
||||
photos,
|
||||
movie,
|
||||
trailer: {
|
||||
src: trailer,
|
||||
quality: 720,
|
||||
|
||||
@@ -6,14 +6,12 @@ const cheerio = require('cheerio');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const moment = require('moment');
|
||||
|
||||
const { fetchSites } = require('../sites');
|
||||
const { matchTags } = require('../tags');
|
||||
|
||||
const defaultTags = {
|
||||
hardx: [],
|
||||
darkx: ['interracial'],
|
||||
eroticax: [],
|
||||
lesbianx: ['lesbian'],
|
||||
allblackx: ['ebony', 'bbc'],
|
||||
};
|
||||
|
||||
async function fetchPhotos(url) {
|
||||
@@ -25,37 +23,56 @@ async function fetchPhotos(url) {
|
||||
function scrapePhotos(html) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
|
||||
const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
|
||||
.map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
|
||||
return $('.preview .imgLink').toArray().map((linkEl) => {
|
||||
const url = $(linkEl).attr('href');
|
||||
|
||||
const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
|
||||
.map((photoIndex, photoElement) => $(photoElement)
|
||||
.attr('src'))
|
||||
// .replace('_tb.jpg', '.jpg')) does not always work
|
||||
.toArray();
|
||||
if (url.match('/join')) {
|
||||
// URL links to join page instead of full photo, extract thumbnail
|
||||
const src = $(linkEl).find('img').attr('src');
|
||||
|
||||
return unlockedPhotos.concat(lockedThumbnails);
|
||||
if (src.match('previews/')) {
|
||||
// resource often serves full photo at a modified URL anyway, add as primary source
|
||||
const highRes = src
|
||||
.replace('previews/', '')
|
||||
.replace('_tb.jpg', '.jpg');
|
||||
|
||||
// keep original thumbnail as fallback in case full photo is not available
|
||||
return [highRes, src];
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
// URL links to full photo
|
||||
return url;
|
||||
});
|
||||
}
|
||||
|
||||
async function getPhotos(albumPath, siteDomain) {
|
||||
const albumUrl = `https://${siteDomain}${albumPath}`;
|
||||
|
||||
const html = await fetchPhotos(albumUrl);
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
const photos = scrapePhotos(html);
|
||||
try {
|
||||
const html = await fetchPhotos(albumUrl);
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
const photos = scrapePhotos(html);
|
||||
|
||||
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
|
||||
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
|
||||
|
||||
const otherPhotos = await Promise.map(pages, async (page) => {
|
||||
const pageUrl = `https://${siteDomain}${page}`;
|
||||
const pageHtml = await fetchPhotos(pageUrl);
|
||||
const otherPhotos = await Promise.map(pages, async (page) => {
|
||||
const pageUrl = `https://${siteDomain}${page}`;
|
||||
const pageHtml = await fetchPhotos(pageUrl);
|
||||
|
||||
return scrapePhotos(pageHtml);
|
||||
}, {
|
||||
concurrency: 2,
|
||||
});
|
||||
return scrapePhotos(pageHtml);
|
||||
}, {
|
||||
concurrency: 2,
|
||||
});
|
||||
|
||||
return photos.concat(otherPhotos.flat());
|
||||
return photos.concat(otherPhotos.flat());
|
||||
} catch (error) {
|
||||
console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);
|
||||
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
function scrape(html, site) {
|
||||
@@ -109,32 +126,26 @@ function scrape(html, site) {
|
||||
async function scrapeScene(html, url, site) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
const json = $('script[type="application/ld+json"]').html();
|
||||
const json2 = $('script:contains("dataLayer = ")').html();
|
||||
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
|
||||
|
||||
const data = JSON.parse(json)[0];
|
||||
const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
|
||||
const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
|
||||
|
||||
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
|
||||
const entryId = data2.sceneDetails.sceneId || new URL(url).pathname.split('/').slice(-1)[0];
|
||||
|
||||
const title = $('meta[name="twitter:title"]').attr('content');
|
||||
const description = data.description || $('meta[name="twitter:description"]').attr('content');
|
||||
const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
|
||||
const description = data2.sceneDetails.sceneDescription || data.description || $('meta[name="twitter:description"]').attr('content');
|
||||
// date in data object is not the release date of the scene, but the date the entry was added
|
||||
const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();
|
||||
|
||||
const actors = data.actor
|
||||
.sort(({ gender: genderA }, { gender: genderB }) => {
|
||||
if (genderA === 'female' && genderB === 'male') return -1;
|
||||
if (genderA === 'male' && genderB === 'female') return 1;
|
||||
|
||||
return 0;
|
||||
})
|
||||
.map(actor => actor.name);
|
||||
|
||||
const actors = (data2.sceneDetails.sceneActors || data.actor).map(actor => actor.actorName || actor.name);
|
||||
const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
|
||||
|
||||
const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
|
||||
|
||||
const siteDomain = $('meta[name="twitter:domain"]').attr('content');
|
||||
const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
|
||||
const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
|
||||
const siteUrl = siteDomain && `https://www.${siteDomain}`;
|
||||
|
||||
@@ -144,19 +155,10 @@ async function scrapeScene(html, url, site) {
|
||||
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);
|
||||
|
||||
const rawTags = data.keywords.split(', ');
|
||||
|
||||
const [[channelSite], tags] = await Promise.all([
|
||||
site.isFallback
|
||||
? fetchSites({
|
||||
url: siteUrl,
|
||||
slug: siteSlug,
|
||||
})
|
||||
: [site],
|
||||
matchTags([...defaultTags[siteSlug], ...rawTags]),
|
||||
]);
|
||||
const tags = [...defaultTags[siteSlug], ...rawTags];
|
||||
|
||||
return {
|
||||
url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
|
||||
url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
|
||||
entryId,
|
||||
title,
|
||||
date,
|
||||
@@ -174,7 +176,8 @@ async function scrapeScene(html, url, site) {
|
||||
rating: {
|
||||
stars,
|
||||
},
|
||||
site: channelSite || site,
|
||||
site,
|
||||
channel: siteSlug,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user