When no poster is available during scraping, the first photo is now stored as the poster; the client-side fallback has been removed. Added a screencap album fallback to the Jules Jordan scraper. Simplified Jules Jordan (JJ) page traversal.

2019-12-13 16:59:04 +01:00
parent 5b5d383363
commit fed2b0be8a
6 changed files with 58 additions and 69 deletions
--- a/assets/components/releases/banner.vue
+++ b/assets/components/releases/banner.vue
@@ -22,7 +22,7 @@
            <video
                v-if="release.trailer"
                :src="`/media/${release.trailer.path}`"
-                :poster="`/media/${(release.poster && release.poster.thumbnail) || (release.photos.length && release.photos[Math.floor(Math.random() * release.photos.length)].path)}`"
+                :poster="`/media/${(release.poster && release.poster.thumbnail)}`"
                :alt="release.title"
                class="item trailer-video"
                controls
@@ -47,21 +47,16 @@

 <script>
 function photos() {
-    if (this.release.photos.length) {
-        const set = this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
-
    if (this.release.trailer) {
-            return set;
+        // poster will be on trailer video
+        return this.release.photos;
    }

-        return [this.release.poster].concat(set);
+    if (this.release.poster) {
+        return [this.release.poster].concat(this.release.photos);
    }

-    if (this.release.poster && !this.release.trailer) {
-        return [this.release.poster];
-    }
-
-    return [];
+    return this.release.photos;
 }

 function scrollBanner(event) {
--- a/assets/components/tile/release.vue
+++ b/assets/components/tile/release.vue
@@ -56,13 +56,6 @@
                    class="thumbnail"
                >

-                <img
-                    v-else-if="release.photos.length > 0"
-                    :src="`/media/${release.photos[0].thumbnail}`"
-                    :alt="release.title"
-                    class="thumbnail"
-                >
-
                <div
                    v-else
                    :title="release.title"
--- a/src/media.js
+++ b/src/media.js
@@ -101,7 +101,6 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho

 async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
    if (Array.isArray(photoUrl)) {
-        // return fetchPhoto(photoUrl[0], index, identifier);
        return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => {
            const photo = await fetchPhoto(url, index, identifier);

@@ -156,7 +155,7 @@ async function savePhotos(files, {
        const thumbnail = await createThumbnail(file.photo);

        const filename = naming === 'index'
-            ? `${file.role || role}-${index}`
+            ? `${file.role || role}-${index + 1}`
            : `${timestamp + index}`;

        const filepath = path.join(domain, subpath, `${filename}.${file.extension}`);
@@ -214,6 +213,8 @@ async function storePhotos(photos, {
    ]);

    if (primaryRole && !primaryPhoto) {
+        console.log(`Setting first photo as ${primaryRole} for ${identifier}`);
+
        uniquePhotos[0].role = primaryRole;
    }

--- a/src/releases.js
+++ b/src/releases.js
@@ -46,7 +46,7 @@ async function curateRelease(release) {
                target_id: release.id,
                domain: 'releases',
            })
-            .orderBy('role'),
+            .orderBy(['role', 'index']),
    ]);

    return {
@@ -255,6 +255,7 @@ async function storeReleaseAssets(release, releaseId) {
            storePhotos(release.photos, {
                targetId: releaseId,
                subpath,
+                primaryRole: release.poster ? null : 'poster',
            }, identifier),
            release.poster && storePhotos([release.poster], {
                role: 'poster',
--- a/src/scrape-sites.js
+++ b/src/scrape-sites.js
@@ -32,7 +32,7 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a
    const latestReleases = await scraper.fetchLatest(site, page);

    if (latestReleases.length === 0) {
-        return [];
+        return accReleases;
    }

    const oldestReleaseOnPage = latestReleases.slice(-1)[0].date;
@@ -44,8 +44,6 @@ async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), a

    console.log(`\x1b[90m${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique recent releases\x1b[0m`);

-    console.log(oldestReleaseOnPage, afterDate, moment(oldestReleaseOnPage).isAfter(afterDate));
-
    if (
        uniqueReleases.length > 0
            && (oldestReleaseOnPage || page < argv.pages)
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@@ -14,7 +14,7 @@ async function fetchPhotos(url) {
    return res.body.toString();
 }

-function scrapePhotos(html) {
+function scrapePhotos(html, type) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

    const photos = $('.photo_gallery_thumbnail_wrapper .thumbs')
@@ -23,6 +23,13 @@ function scrapePhotos(html) {
            const src = $(photoElement).attr('src');

            // high res often available in alternative directories, but not always, provide original as fallback
+            if (type === 'caps') {
+                return [
+                    src.replace('capthumbs/', 'caps/'),
+                    src,
+                ];
+            }
+
            return [
                src.replace('thumbs/', 'photos/'),
                src.replace('thumbs/', '1024watermarked/'),
@@ -33,28 +40,35 @@ function scrapePhotos(html) {
    return photos;
 }

-async function getPhotos(entryId, site, page = 1) {
-    const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=highres&page=${page}`;
+async function getPhotos(entryId, site, page = 1, type = 'highres') {
+    const albumUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=${type}&page=${page}`;

    const html = await fetchPhotos(albumUrl);
    const $ = cheerio.load(html, { normalizeWhitespace: true });

-    const photos = scrapePhotos(html);
-    const pagesString = $('.page_totals').text().trim();
-    const pages = pagesString.length > 0 ? Number($('.page_totals').text().trim().match(/\d+$/)[0]) : null;
+    // don't add first URL to pages to prevent unnecessary duplicate request
+    const photos = scrapePhotos(html, type);
+    const pages = Array.from(new Set($('.page_numbers a').toArray().map(el => $(el).attr('href'))));

    const otherPhotos = pages
-        ? await Promise.map(Array.from({ length: pages - 1 }), async (val, index) => {
-            const pageUrl = `https://www.julesjordan.com/trial/gallery.php?id=${entryId}&type=highres&page=${index + 2}`;
+        ? await Promise.map(pages, async (pageX) => {
+            const pageUrl = `https://www.julesjordan.com/trial/${pageX}`;
            const pageHtml = await fetchPhotos(pageUrl);

-            return scrapePhotos(pageHtml);
+            return scrapePhotos(pageHtml, type);
        }, {
            concurrency: 2,
        })
        : [];

-    return photos.concat(otherPhotos.flat());
+    const allPhotos = photos.concat(otherPhotos.flat());
+
+    if (allPhotos.length === 0 && type === 'highres') {
+        // photos not available, try for screencaps instead
+        return getPhotos(entryId, site, 1, 'caps');
+    }
+
+    return allPhotos;
 }

 function scrapeLatest(html, site) {
@@ -153,55 +167,42 @@ function scrapeUpcoming(html, site) {
 async function scrapeScene(html, url, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

-    const title = $('.title_bar_hilite').text().trim();
-    const entryId = $('.suggest_tags a').attr('href').match(/\d+/)[0];
-    const date = moment
+    const release = { url, site };
+
+    release.title = $('.title_bar_hilite').text().trim();
+    [release.entryId] = $('.suggest_tags a').attr('href').match(/\d+/);
+    release.date = moment
        .utc($('.update_date').text(), 'MM/DD/YYYY')
        .toDate();

-    const actors = $('.backgroundcolor_info > .update_models a')
+    release.description = $('.update_description').text().trim();
+
+    release.actors = $('.backgroundcolor_info > .update_models a')
        .map((_actorIndex, actorElement) => $(actorElement).text())
        .toArray();

-    const description = $('.update_description').text().trim();
-
-    const stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, ''));
-
    const infoLines = $('script:contains("useimage")')
        .html()
        .split('\n');

    const posterPath = infoLines.find(line => line.match('useimage')).replace('useimage = "', '').slice(0, -2);
-    const poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`;
+    if (posterPath) release.poster = posterPath.match(/^http/) ? posterPath : `${site.url}${posterPath}`;

    const trailerLine = infoLines.find(line => line.match('movie["Trailer_720"]'));
-    const trailer = trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie'));

-    const photos = await getPhotos(entryId, site);
-
-    const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-    const movie = $('.update_dvds a').attr('href');
-
-    return {
-        url,
-        entryId,
-        title,
-        date,
-        actors,
-        description,
-        poster,
-        photos,
-        movie,
-        trailer: {
-            src: trailer,
+    release.trailer = {
+        src: trailerLine.slice(trailerLine.indexOf('path:"') + 6, trailerLine.indexOf('",movie')),
        quality: 720,
-        },
-        rating: {
-            stars,
-        },
-        tags,
-        site,
    };
+
+    release.photos = await getPhotos(release.entryId, site);
+
+    release.tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
+    release.movie = $('.update_dvds a').attr('href');
+
+    release.stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, ''));
+
+    return release;
 }

 function scrapeMovie(html, url, site) {