When no poster is available during scraping, the first photo is stored as the poster; removed the client-side fallback. Added a screencap album fallback to the Jules Jordan scraper. Simplified Jules Jordan gallery page traversal.

This commit is contained in:
ThePendulum 2019-12-13 16:59:04 +01:00
parent 5b5d383363
commit fed2b0be8a
6 changed files with 58 additions and 69 deletions

View File

@ -22,7 +22,7 @@
<video <video
v-if="release.trailer" v-if="release.trailer"
:src="`/media/${release.trailer.path}`" :src="`/media/${release.trailer.path}`"
:poster="`/media/${(release.poster && release.poster.thumbnail) || (release.photos.length && release.photos[Math.floor(Math.random() * release.photos.length)].path)}`" :poster="`/media/${(release.poster && release.poster.thumbnail)}`"
:alt="release.title" :alt="release.title"
class="item trailer-video" class="item trailer-video"
controls controls
@ -47,21 +47,16 @@
<script> <script>
function photos() { function photos() {
if (this.release.photos.length) {
const set = this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
if (this.release.trailer) { if (this.release.trailer) {
return set; // poster will be on trailer video
return this.release.photos;
} }
return [this.release.poster].concat(set); if (this.release.poster) {
return [this.release.poster].concat(this.release.photos);
} }
if (this.release.poster && !this.release.trailer) { return this.release.photos;
return [this.release.poster];
}
return [];
} }
function scrollBanner(event) { function scrollBanner(event) {

View File

@ -56,13 +56,6 @@
class="thumbnail" class="thumbnail"
> >
<img
v-else-if="release.photos.length > 0"
:src="`/media/${release.photos[0].thumbnail}`"
:alt="release.title"
class="thumbnail"
>
<div <div
v-else v-else
:title="release.title" :title="release.title"

View File

@ -101,7 +101,6 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
async function fetchPhoto(photoUrl, index, identifier, attempt = 1) { async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
if (Array.isArray(photoUrl)) { if (Array.isArray(photoUrl)) {
// return fetchPhoto(photoUrl[0], index, identifier);
return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => { return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => {
const photo = await fetchPhoto(url, index, identifier); const photo = await fetchPhoto(url, index, identifier);
@ -156,7 +155,7 @@ async function savePhotos(files, {
const thumbnail = await createThumbnail(file.photo); const thumbnail = await createThumbnail(file.photo);
const filename = naming === 'index' const filename = naming === 'index'
? `${file.role || role}-${index}` ? `${file.role || role}-${index + 1}`
: `${timestamp + index}`; : `${timestamp + index}`;
const filepath = path.join(domain, subpath, `${filename}.${file.extension}`); const filepath = path.join(domain, subpath, `${filename}.${file.extension}`);
@ -214,6 +213,8 @@ async function storePhotos(photos, {
]); ]);
if (primaryRole && !primaryPhoto) { if (primaryRole && !primaryPhoto) {
console.log(`Setting first photo as ${primaryRole} for ${identifier}`);
uniquePhotos[0].role = primaryRole; uniquePhotos[0].role = primaryRole;
} }

View File

@ -46,7 +46,7 @@ async function curateRelease(release) {
target_id: release.id, target_id: release.id,
domain: 'releases', domain: 'releases',
}) })
.orderBy('role'), .orderBy(['role', 'index']),
]); ]);
return { return {
@ -255,6 +255,7 @@ async function storeReleaseAssets(release, releaseId) {
storePhotos(release.photos, { storePhotos(release.photos, {
targetId: releaseId, targetId: releaseId,
subpath, subpath,
primaryRole: release.poster ? null : 'poster',
}, identifier), }, identifier),
release.poster && storePhotos([release.poster], { release.poster && storePhotos([release.poster], {
role: 'poster', role: 'poster',

View File

@ -32,7 +32,7 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a
const latestReleases = await scraper.fetchLatest(site, page); const latestReleases = await scraper.fetchLatest(site, page);
if (latestReleases.length === 0) { if (latestReleases.length === 0) {
return []; return accReleases;
} }
const oldestReleaseOnPage = latestReleases.slice(-1)[0].date; const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;
@ -44,8 +44,6 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a
console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`); console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);
console.log(oldestReleaseOnPage, afterDate, moment(oldestReleaseOnPage).isAfter(afterDate));
if ( if (
uniqueReleases.length > 0 uniqueReleases.length > 0
&& (oldestReleaseOnPage || page < argv.pages) && (oldestReleaseOnPage || page < argv.pages)

View File

@ -14,7 +14,7 @@ async function fetchPhotos(url) {
return res.body.toString(); return res.body.toString();
} }
function scrapePhotos(html) { function scrapePhotos(html, type) {
const $ = cheerio.load(html, { normalizeWhitespace: true }); const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = $('.photo_gallery_thumbnail_wrapper .thumbs') const photos = $('.photo_gallery_thumbnail_wrapper .thumbs')
@ -23,6 +23,13 @@ function scrapePhotos(html) {
const src = $(photoElement).attr('src'); const src = $(photoElement).attr('src');
// high res often available in alternative directories, but not always, provide original as fallback // high res often available in alternative directories, but not always, provide original as fallback
if (type === 'caps') {
return [
src.replace('capthumbs/', 'caps/'),
src,
];
}
return [ return [
src.replace('thumbs/', 'photos/'), src.replace('thumbs/', 'photos/'),
src.replace('thumbs/', '1024watermarked/'), src.replace('thumbs/', '1024watermarked/'),
@ -33,28 +40,35 @@ function scrapePhotos(html) {
return photos; return photos;
} }
async function getPhotos(entryId, site, page = 1) { async function getPhotos(entryId, site, page = 1, type = 'highres') {
const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=highres&page=${page}`; const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;
const html = await fetchPhotos(albumUrl); const html = await fetchPhotos(albumUrl);
const $ = cheerio.load(html, { normalizeWhitespace: true }); const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html); // don't add first URL to pages to prevent unnecessary duplicate request
const pagesString = $('.page_totals').text().trim(); const photos = scrapePhotos(html, type);
const pages = pagesString.length > 0 ? Number($('.page_totals').text().trim().match(/\d+$/)[0]) : null; const pages = Array.from(new Set($('.page_numbers a').toArray().map(el => $(el).attr('href'))));
const otherPhotos = pages const otherPhotos = pages
? await Promise.map(Array.from({ length: pages - 1 }), async (val, index) => { ? await Promise.map(pages, async (pageX) => {
const pageUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=highres&page=${index + 2}`; const pageUrl = `https://www.julesjordan.com/trial/${pageX}`;
const pageHtml = await fetchPhotos(pageUrl); const pageHtml = await fetchPhotos(pageUrl);
return scrapePhotos(pageHtml); return scrapePhotos(pageHtml, type);
}, { }, {
concurrency: 2, concurrency: 2,
}) })
: []; : [];
return photos.concat(otherPhotos.flat()); const allPhotos = photos.concat(otherPhotos.flat());
if (allPhotos.length === 0 && type === 'highres') {
// photos not available, try for screencaps instead
return getPhotos(entryId, site, 1, 'caps');
}
return allPhotos;
} }
function scrapeLatest(html, site) { function scrapeLatest(html, site) {
@ -153,55 +167,42 @@ function scrapeUpcoming(html, site) {
async function scrapeScene(html, url, site) { async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true }); const $ = cheerio.load(html, { normalizeWhitespace: true });
const title = $('.title_bar_hilite').text().trim(); const release = { url, site };
const entryId = $('.suggest_tags a').attr('href').match(/\d+/)[0];
const date = moment release.title = $('.title_bar_hilite').text().trim();
[release.entryId] = $('.suggest_tags a').attr('href').match(/\d+/);
release.date = moment
.utc($('.update_date').text(), 'MM/DD/YYYY') .utc($('.update_date').text(), 'MM/DD/YYYY')
.toDate(); .toDate();
const actors = $('.backgroundcolor_info > .update_models a') release.description = $('.update_description').text().trim();
release.actors = $('.backgroundcolor_info > .update_models a')
.map((_actorIndex, actorElement) => $(actorElement).text()) .map((_actorIndex, actorElement) => $(actorElement).text())
.toArray(); .toArray();
const description = $('.update_description').text().trim();
const stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, ''));
const infoLines = $('script:contains("useimage")') const infoLines = $('script:contains("useimage")')
.html() .html()
.split('\n'); .split('\n');
const posterPath = infoLines.find(line => line.match('useimage')).replace('useimage = "', '').slice(0, -2); const posterPath = infoLines.find(line => line.match('useimage')).replace('useimage = "', '').slice(0, -2);
const poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`; if (posterPath) release.poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`;
const trailerLine = infoLines.find(line => line.match('movie["Trailer_720"]')); const trailerLine = infoLines.find(line => line.match('movie["Trailer_720"]'));
const trailer = trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie'));
const photos = await getPhotos(entryId, site); release.trailer = {
src: trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie')),
const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const movie = $('.update_dvds a').attr('href');
return {
url,
entryId,
title,
date,
actors,
description,
poster,
photos,
movie,
trailer: {
src: trailer,
quality: 720, quality: 720,
},
rating: {
stars,
},
tags,
site,
}; };
release.photos = await getPhotos(release.entryId, site);
release.tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
release.movie = $('.update_dvds a').attr('href');
release.stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, ''));
return release;
} }
function scrapeMovie(html, url, site) { function scrapeMovie(html, url, site) {