From b1b7cd6d50f447d82ffd76d9e711cf8bcabf6372 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Wed, 20 May 2020 02:23:45 +0200 Subject: [PATCH] Fixed Whale Member posters and photos. --- src/app.js | 7 +++++++ src/deep.js | 14 ++++++++++---- src/releases.js | 4 ++-- src/scrapers/whalemember.js | 12 +++++++----- 4 files changed, 26 insertions(+), 11 deletions(-) diff --git a/src/app.js b/src/app.js index 741e5344..152844c0 100644 --- a/src/app.js +++ b/src/app.js @@ -1,5 +1,7 @@ 'use strict'; +const util = require('util'); + const argv = require('./argv'); const initServer = require('./web/server'); @@ -39,6 +41,11 @@ async function init() { const sceneMovies = deepScenes && argv.sceneMovies && deepScenes.map(scene => scene.movie).filter(Boolean); const deepMovies = await fetchMovies([...(argv.movies || []), ...(sceneMovies || [])]); + if (argv.inspect) { + console.log(util.inspect(deepScenes)); + console.log(util.inspect(deepMovies)); + } + if (argv.save) { await storeReleases([ ...(deepScenes || []), diff --git a/src/deep.js b/src/deep.js index 13f30345..ae732274 100644 --- a/src/deep.js +++ b/src/deep.js @@ -14,7 +14,8 @@ function urlToSiteSlug(url) { try { const slug = new URL(url) .hostname - .match(/([\w-]+)\.\w+$/)?.[1]; + .match(/([\w-]+)\.\w+$/)?.[1] + .replace(/[-_]+/g, ''); return slug; } catch (error) { @@ -90,7 +91,6 @@ function toBaseReleases(baseReleasesOrUrls) { async function scrapeRelease(baseRelease, sites, type = 'scene') { const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)]; - const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback if (!site) { logger.warn(`No site available for ${baseRelease.url}`); @@ -104,6 +104,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { }; } + const siteWithFallbackNetwork = site.isNetwork ? { ...site, network: site } : site; // make site.network available, even when site is network fallback const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; if (!scraper) { @@ -131,7 +132,7 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { }; if (!mergedRelease.entryId) { - throw new Error('No entry ID supplied'); + throw Object.assign(new Error('No entry ID supplied'), { code: 'NO_ENTRY_ID' }); } if (scrapedRelease && baseRelease?.tags) { @@ -142,6 +143,11 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { return mergedRelease; } catch (error) { logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`); + + if (error.code === 'NO_ENTRY_ID') { + return null; + } + return baseRelease; } } @@ -160,7 +166,7 @@ async function fetchReleases(baseReleasesOrUrls, type = 'scene') { const deepReleases = await scrapeReleases(baseReleases, sites, type); - return deepReleases; + return deepReleases.filter(Boolean); } async function fetchScenes(baseReleasesOrUrls) { diff --git a/src/releases.js b/src/releases.js index 43a7c563..87070696 100644 --- a/src/releases.js +++ b/src/releases.js @@ -67,7 +67,7 @@ function withRelations(queryBuilder, withMedia = false, type = 'scene') { row_to_json(sites) as site, row_to_json(networks) as network, row_to_json(site_networks) as site_network, - json_agg(DISTINCT actors) as actors + COALESCE(json_agg(DISTINCT actors) FILTER (WHERE actors.id IS NOT NULL), '[]') as actors `)) .where('type', type) .leftJoin('sites', 'sites.id', 'releases.site_id') @@ -84,7 +84,7 @@ function withRelations(queryBuilder, withMedia = false, type = 'scene') { queryBuilder .select(knex.raw(` row_to_json(posters) as poster, - json_agg(DISTINCT photos) as photos + COALESCE(json_agg(DISTINCT photos) FILTER (WHERE photos.id IS NOT NULL), '[]') as photos `)) .leftJoin('releases_posters', 'releases_posters.release_id', 'releases.id') .leftJoin('media as posters', 'posters.id', 'releases_posters.media_id') diff --git a/src/scrapers/whalemember.js b/src/scrapers/whalemember.js index 05ff936a..14f40d37 100644 --- a/src/scrapers/whalemember.js +++ b/src/scrapers/whalemember.js @@ -19,8 +19,10 @@ function scrapeLatest(html, site) { release.date = moment.utc(scene.dataset.date, 'MMMM DD, YYYY').toDate(); release.actors = Array.from(scene.querySelectorAll('.actors a'), el => el.textContent); - release.poster = `https:${scene.querySelector('.single-image').src}`; - release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), el => `https:${el.dataset.src}`); + const poster = scene.querySelector('.single-image').dataset.src; + release.poster = /^http/.test(poster) ? poster : `https:${poster}`; + + release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), el => (/^http/.test(el.dataset.src) ? el.dataset.src : `https:${el.dataset.src}`)); const trailerEl = scene.querySelector('source'); if (trailerEl) release.trailer = { src: trailerEl.dataset.src }; @@ -49,13 +51,13 @@ function scrapeScene(html, site, url) { release.duration = Number(durationEls[0].textContent.match(/\d+/)[0]) * 60; } - release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), el => `https:${el.src}`); + release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), el => (/^http/.test(el.src) ? el.src : `https:${el.src}`)); const posterEl = scene.querySelector('#no-player-image'); const videoEl = scene.querySelector('video'); - if (posterEl) release.poster = `https:${posterEl.src}`; - else if (videoEl) release.poster = `https:${videoEl.poster}`; + if (posterEl) release.poster = /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`; + else if (videoEl) release.poster = /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`; const trailerEl = scene.querySelector('#t2019-video source'); if (trailerEl) release.trailer = { src: trailerEl.src };