forked from DebaucheryLibrarian/traxxx
When a poster is not available during scraping, the first photo is stored as the poster; removed the client-side fallback. Added a screencap album fallback to the Jules Jordan scraper. Simplified JJ page traversal.
This commit is contained in:
@@ -14,7 +14,7 @@ async function fetchPhotos(url) {
|
||||
return res.body.toString();
|
||||
}
|
||||
|
||||
function scrapePhotos(html) {
|
||||
function scrapePhotos(html, type) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
|
||||
const photos = $('.photo_gallery_thumbnail_wrapper .thumbs')
|
||||
@@ -23,6 +23,13 @@ function scrapePhotos(html) {
|
||||
const src = $(photoElement).attr('src');
|
||||
|
||||
// high res often available in alternative directories, but not always, provide original as fallback
|
||||
if (type === 'caps') {
|
||||
return [
|
||||
src.replace('capthumbs/', 'caps/'),
|
||||
src,
|
||||
];
|
||||
}
|
||||
|
||||
return [
|
||||
src.replace('thumbs/', 'photos/'),
|
||||
src.replace('thumbs/', '1024watermarked/'),
|
||||
@@ -33,28 +40,35 @@ function scrapePhotos(html) {
|
||||
return photos;
|
||||
}
|
||||
|
||||
async function getPhotos(entryId, site, page = 1) {
|
||||
const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=highres&page=${page}`;
|
||||
async function getPhotos(entryId, site, page = 1, type = 'highres') {
|
||||
const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;
|
||||
|
||||
const html = await fetchPhotos(albumUrl);
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
|
||||
const photos = scrapePhotos(html);
|
||||
const pagesString = $('.page_totals').text().trim();
|
||||
const pages = pagesString.length > 0 ? Number($('.page_totals').text().trim().match(/\d+$/)[0]) : null;
|
||||
// don't add first URL to pages to prevent unnecessary duplicate request
|
||||
const photos = scrapePhotos(html, type);
|
||||
const pages = Array.from(new Set($('.page_numbers a').toArray().map(el => $(el).attr('href'))));
|
||||
|
||||
const otherPhotos = pages
|
||||
? await Promise.map(Array.from({ length: pages - 1 }), async (val, index) => {
|
||||
const pageUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=highres&page=${index + 2}`;
|
||||
? await Promise.map(pages, async (pageX) => {
|
||||
const pageUrl = `https://www.julesjordan.com/trial/${pageX}`;
|
||||
const pageHtml = await fetchPhotos(pageUrl);
|
||||
|
||||
return scrapePhotos(pageHtml);
|
||||
return scrapePhotos(pageHtml, type);
|
||||
}, {
|
||||
concurrency: 2,
|
||||
})
|
||||
: [];
|
||||
|
||||
return photos.concat(otherPhotos.flat());
|
||||
const allPhotos = photos.concat(otherPhotos.flat());
|
||||
|
||||
if (allPhotos.length === 0 && type === 'highres') {
|
||||
// photos not available, try for screencaps instead
|
||||
return getPhotos(entryId, site, 1, 'caps');
|
||||
}
|
||||
|
||||
return allPhotos;
|
||||
}
|
||||
|
||||
function scrapeLatest(html, site) {
|
||||
@@ -153,55 +167,42 @@ function scrapeUpcoming(html, site) {
|
||||
async function scrapeScene(html, url, site) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
|
||||
const title = $('.title_bar_hilite').text().trim();
|
||||
const entryId = $('.suggest_tags a').attr('href').match(/\d+/)[0];
|
||||
const date = moment
|
||||
const release = { url, site };
|
||||
|
||||
release.title = $('.title_bar_hilite').text().trim();
|
||||
[release.entryId] = $('.suggest_tags a').attr('href').match(/\d+/);
|
||||
release.date = moment
|
||||
.utc($('.update_date').text(), 'MM/DD/YYYY')
|
||||
.toDate();
|
||||
|
||||
const actors = $('.backgroundcolor_info > .update_models a')
|
||||
release.description = $('.update_description').text().trim();
|
||||
|
||||
release.actors = $('.backgroundcolor_info > .update_models a')
|
||||
.map((_actorIndex, actorElement) => $(actorElement).text())
|
||||
.toArray();
|
||||
|
||||
const description = $('.update_description').text().trim();
|
||||
|
||||
const stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, ''));
|
||||
|
||||
const infoLines = $('script:contains("useimage")')
|
||||
.html()
|
||||
.split('\n');
|
||||
|
||||
const posterPath = infoLines.find(line => line.match('useimage')).replace('useimage = "', '').slice(0, -2);
|
||||
const poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`;
|
||||
if (posterPath) release.poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`;
|
||||
|
||||
const trailerLine = infoLines.find(line => line.match('movie["Trailer_720"]'));
|
||||
const trailer = trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie'));
|
||||
|
||||
const photos = await getPhotos(entryId, site);
|
||||
|
||||
const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||
const movie = $('.update_dvds a').attr('href');
|
||||
|
||||
return {
|
||||
url,
|
||||
entryId,
|
||||
title,
|
||||
date,
|
||||
actors,
|
||||
description,
|
||||
poster,
|
||||
photos,
|
||||
movie,
|
||||
trailer: {
|
||||
src: trailer,
|
||||
quality: 720,
|
||||
},
|
||||
rating: {
|
||||
stars,
|
||||
},
|
||||
tags,
|
||||
site,
|
||||
release.trailer = {
|
||||
src: trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie')),
|
||||
quality: 720,
|
||||
};
|
||||
|
||||
release.photos = await getPhotos(release.entryId, site);
|
||||
|
||||
release.tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||
release.movie = $('.update_dvds a').attr('href');
|
||||
|
||||
release.stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, ''));
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
function scrapeMovie(html, url, site) {
|
||||
|
||||
Reference in New Issue
Block a user