From db63be8f92da07c9c028b3cc41697337df8d5d4e Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Tue, 10 Mar 2020 04:42:15 +0100 Subject: [PATCH] Photo plucker will use discarded photos as fallback. Returning high res photo sources from LegalPorno. --- src/media.js | 78 +++++++++++++++++++++++++++++--------- src/scrapers/legalporno.js | 70 +++++++++++++++++++--------------- 2 files changed, 100 insertions(+), 48 deletions(-) diff --git a/src/media.js b/src/media.js index 1f6091da..b8185917 100644 --- a/src/media.js +++ b/src/media.js @@ -42,11 +42,50 @@ async function createThumbnail(buffer) { return null; } -function pluckItems(items, specifiedLimit) { +function groupFallbacksByPriority(chunks) { + /* + Chunks naturally give priority to all of the first item's fallbacks, generally lower quality images. + This function ensures every item's first source is tried, before trying every item's second source, etc., example: + IN: [[1, 2, 3,], 10, [1, 2, 3, 4, 5], [1, 2, 3]] + OUT [[1, 1, 1], [2, 2, 2], [3, 3, 3], [4], [5]] + */ + return chunks.map(group => group.reduce((acc, item) => { + if (Array.isArray(item)) { + // place provided fallbacks at same index (priority) in parent array + item.forEach((fallback, fallbackIndex) => { + if (!acc[fallbackIndex]) { + acc[fallbackIndex] = []; + } + + acc[fallbackIndex].push(fallback); + }); + + return acc; + } + + // no fallbacks provided, first priority + if (!acc[0]) { + acc[0] = []; + } + + acc[0].push(item); + + return acc; + }, []).flat()); +} + +function pluckItems(items, specifiedLimit, asFallbacks = true) { const limit = specifiedLimit || config.media.limit; if (!items || items.length <= limit) return items; + if (asFallbacks) { + const chunks = chunk(items, Math.ceil(items.length / limit)); + const fallbacks = groupFallbacksByPriority(chunks); + + return fallbacks; + } + const plucked = [1] .concat( Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))), @@ -93,8 +132,8 @@ async function extractItem(source) { return null; } -async function fetchSource(source, domain, role, originalSource) { - logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`); +async function fetchSource(source, domain, role) { + logger.silly(`Fetching ${domain} ${role} from ${source.src || source}`); // const res = await bhttp.get(source.src || source); const res = await get(source.src || source, { @@ -111,7 +150,7 @@ async function fetchSource(source, domain, role, originalSource) { const hash = getHash(res.body); const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {}; - logger.verbose(`Fetched media item from ${source.src || source}`); + logger.silly(`Fetched media item from ${source.src || source}`); return { file: res.body, @@ -123,7 +162,7 @@ async function fetchSource(source, domain, role, originalSource) { width: width || null, height: height || null, quality: source.quality || null, - source: originalSource?.src || originalSource || source.src || source, + source: source.src || source, scraper: source.scraper, copyright: source.copyright, }; @@ -133,9 +172,11 @@ async function fetchSource(source, domain, role, originalSource) { } async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) { - if (!source) return null; - try { + if (!source) { + throw new Error(`Empty ${domain} ${role} source in ${originalSource}`); + } + if (Array.isArray(source)) { if (source.every(sourceX => sourceX.quality)) { // various video qualities provided @@ -160,19 +201,18 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att return null; } - return fetchSource(source, domain, role, originalSource); + return await fetchSource(source, domain, role, originalSource); } catch (error) { logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`); - /* - if (attempt < 3) { + if (source && attempt < 3) { + // only retry if source is provided at all await Promise.delay(5000); return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex); } - */ - if (originalSource && sourceIndex < originalSource.length) { - throw error; + if (originalSource && sourceIndex < originalSource.length - 1) { + throw error; // gets caught to try next source } return null; @@ -285,6 +325,8 @@ async function storeMedia(sources, domain, role, { entropyFilter = 2.5 } = {}) { return {}; } + console.log(presentSources, presentSources.length); + // split up source list to prevent excessive RAM usage const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => { try { @@ -354,12 +396,12 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr .map((source) => { if (!source) return null; - const mediaItem = Array.isArray(source) - ? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()] - : mediaBySource[source.src || source]; + if (Array.isArray(source)) { + const availableSource = source.find(fallbackSource => mediaBySource[fallbackSource.src || fallbackSource]); + return mediaBySource[availableSource]; + } - // return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id }; - return mediaItem; + return mediaBySource[source.src || source]; }) .filter(Boolean) // .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item diff --git a/src/scrapers/legalporno.js b/src/scrapers/legalporno.js index aa53d22a..0d64312f 100644 --- a/src/scrapers/legalporno.js +++ b/src/scrapers/legalporno.js @@ -76,53 +76,63 @@ async function scrapeScene(html, url, site, useGallery) { const playerObject = $('script:contains("new VideoPlayer")').html(); const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1)); + const release = { url }; + const originalTitle = $('h1.watchpage-title').text().trim(); const { shootId, title } = extractTitle(originalTitle); - const entryId = new URL(url).pathname.split('/')[2]; - const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate(); + release.shootId = shootId; + release.entryId = new URL(url).pathname.split('/')[2]; + + release.title = title; + release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate(); const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray(); - const actors = $(actorsElement) + + release.description = $('meta[name="description"]')?.attr('content')?.trim() + || (descriptionElement && $(descriptionElement).find('dd').text().trim()); + + release.actors = $(actorsElement) .find('a[href*="com/model"]') .map((actorIndex, actorElement) => $(actorElement).text()).toArray(); - const description = $('meta[name="description"]')?.attr('content')?.trim() || (descriptionElement && $(descriptionElement).find('dd').text().trim()); - const duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds(); - - const posterStyle = $('#player').attr('style'); - const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1); + release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds(); + release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray(); const photos = useGallery ? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray() : $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray(); + release.photos = photos.map((source) => { + // source without parameters sometimes serves larger preview photo + const { origin, pathname } = new URL(source); + + return `${origin}${pathname}`; + + /* disable thumbnail as fallback, usually enough high res photos available + return [ + `${origin}${pathname}`, + source, + ]; + */ + }); + + const posterStyle = $('#player').attr('style'); + const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1); + + release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1); // poster unavailable, try last 1/3rd of high res photos as fallback + const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd'); + release.trailer = { + src: trailer.src, + type: trailer.type, + quality: trailer.quality === 'vga' ? 480 : 720, + }; const studioName = $('.watchpage-studioname').first().text().trim(); - const studio = studioName.replace(/[\s.']+/g, '').toLowerCase(); - const tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray(); + release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase(); - return { - url, - shootId, - entryId, - title, - description, - date, - actors, - duration, - poster, - photos, - trailer: { - src: trailer.src, - type: trailer.type, - quality: trailer.quality === 'vga' ? 480 : 720, - }, - tags, - site, - studio, - }; + return release; } async function scrapeProfile(html, _url, actorName) {