Photo plucker will use discarded photos as fallback. Returning high res photo sources from LegalPorno.

This commit is contained in:
ThePendulum 2020-03-10 04:42:15 +01:00
parent 6bfc5e4378
commit db63be8f92
2 changed files with 100 additions and 48 deletions

View File

@ -42,11 +42,50 @@ async function createThumbnail(buffer) {
return null; return null;
} }
/**
 * Reorder every chunk of sources so that higher-priority sources come first.
 *
 * Each entry of a chunk is either a single source or an array of fallback
 * sources ordered by preference. Chunks naturally give priority to all of the
 * first entry's fallbacks, which are generally lower quality images. This
 * function ensures every entry's first source is listed before any entry's
 * second source, and so on. A standalone (non-array) source counts as a first
 * choice. Each chunk is returned as one flat list.
 *
 * Example for a single chunk:
 *   IN:  [['a1', 'a2', 'a3'], 'b', ['c1', 'c2', 'c3', 'c4', 'c5'], ['d1', 'd2', 'd3']]
 *   OUT: ['a1', 'b', 'c1', 'd1', 'a2', 'c2', 'd2', 'a3', 'c3', 'd3', 'c4', 'c5']
 *
 * @param {Array<Array>} chunks - list of chunks; each chunk lists sources and/or fallback arrays
 * @returns {Array<Array>} one flat, priority-ordered list of sources per chunk
 */
function groupFallbacksByPriority(chunks) {
  return chunks.map((group) => {
    // buckets[n] collects the n-th choice of every entry in this chunk
    const buckets = [];

    group.forEach((item) => {
      // a source without fallbacks is treated as a one-element fallback list (first priority)
      const fallbacks = Array.isArray(item) ? item : [item];

      fallbacks.forEach((fallback, priority) => {
        if (!buckets[priority]) {
          buckets[priority] = [];
        }

        buckets[priority].push(fallback);
      });
    });

    // flatten one level only: merges the priority buckets, leaves the sources themselves untouched
    return buckets.flat();
  });
}
function pluckItems(items, specifiedLimit, asFallbacks = true) {
const limit = specifiedLimit || config.media.limit; const limit = specifiedLimit || config.media.limit;
if (!items || items.length <= limit) return items; if (!items || items.length <= limit) return items;
if (asFallbacks) {
const chunks = chunk(items, Math.ceil(items.length / limit));
const fallbacks = groupFallbacksByPriority(chunks);
return fallbacks;
}
const plucked = [1] const plucked = [1]
.concat( .concat(
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))), Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))),
@ -93,8 +132,8 @@ async function extractItem(source) {
return null; return null;
} }
async function fetchSource(source, domain, role, originalSource) { async function fetchSource(source, domain, role) {
logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`); logger.silly(`Fetching ${domain} ${role} from ${source.src || source}`);
// const res = await bhttp.get(source.src || source); // const res = await bhttp.get(source.src || source);
const res = await get(source.src || source, { const res = await get(source.src || source, {
@ -111,7 +150,7 @@ async function fetchSource(source, domain, role, originalSource) {
const hash = getHash(res.body); const hash = getHash(res.body);
const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {}; const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
logger.verbose(`Fetched media item from ${source.src || source}`); logger.silly(`Fetched media item from ${source.src || source}`);
return { return {
file: res.body, file: res.body,
@ -123,7 +162,7 @@ async function fetchSource(source, domain, role, originalSource) {
width: width || null, width: width || null,
height: height || null, height: height || null,
quality: source.quality || null, quality: source.quality || null,
source: originalSource?.src || originalSource || source.src || source, source: source.src || source,
scraper: source.scraper, scraper: source.scraper,
copyright: source.copyright, copyright: source.copyright,
}; };
@ -133,9 +172,11 @@ async function fetchSource(source, domain, role, originalSource) {
} }
async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) { async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
if (!source) return null;
try { try {
if (!source) {
throw new Error(`Empty ${domain} ${role} source in ${originalSource}`);
}
if (Array.isArray(source)) { if (Array.isArray(source)) {
if (source.every(sourceX => sourceX.quality)) { if (source.every(sourceX => sourceX.quality)) {
// various video qualities provided // various video qualities provided
@ -160,19 +201,18 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
return null; return null;
} }
return fetchSource(source, domain, role, originalSource); return await fetchSource(source, domain, role, originalSource);
} catch (error) { } catch (error) {
logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`); logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);
/* if (source && attempt < 3) {
if (attempt < 3) { // only retry if source is provided at all
await Promise.delay(5000); await Promise.delay(5000);
return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex); return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
} }
*/
if (originalSource && sourceIndex < originalSource.length) { if (originalSource && sourceIndex < originalSource.length - 1) {
throw error; throw error; // gets caught to try next source
} }
return null; return null;
@ -285,6 +325,8 @@ async function storeMedia(sources, domain, role, { entropyFilter = 2.5 } = {}) {
return {}; return {};
} }
console.log(presentSources, presentSources.length);
// split up source list to prevent excessive RAM usage // split up source list to prevent excessive RAM usage
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => { const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
try { try {
@ -354,12 +396,12 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr
.map((source) => { .map((source) => {
if (!source) return null; if (!source) return null;
const mediaItem = Array.isArray(source) if (Array.isArray(source)) {
? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()] const availableSource = source.find(fallbackSource => mediaBySource[fallbackSource.src || fallbackSource]);
: mediaBySource[source.src || source]; return mediaBySource[availableSource];
}
// return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id }; return mediaBySource[source.src || source];
return mediaItem;
}) })
.filter(Boolean) .filter(Boolean)
// .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item // .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item

View File

@ -76,53 +76,63 @@ async function scrapeScene(html, url, site, useGallery) {
const playerObject = $('script:contains("new VideoPlayer")').html(); const playerObject = $('script:contains("new VideoPlayer")').html();
const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1)); const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1));
const release = { url };
const originalTitle = $('h1.watchpage-title').text().trim(); const originalTitle = $('h1.watchpage-title').text().trim();
const { shootId, title } = extractTitle(originalTitle); const { shootId, title } = extractTitle(originalTitle);
const entryId = new URL(url).pathname.split('/')[2];
const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate(); release.shootId = shootId;
release.entryId = new URL(url).pathname.split('/')[2];
release.title = title;
release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray(); const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();
const actors = $(actorsElement)
release.description = $('meta[name="description"]')?.attr('content')?.trim()
|| (descriptionElement && $(descriptionElement).find('dd').text().trim());
release.actors = $(actorsElement)
.find('a[href*="com/model"]') .find('a[href*="com/model"]')
.map((actorIndex, actorElement) => $(actorElement).text()).toArray(); .map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const description = $('meta[name="description"]')?.attr('content')?.trim() || (descriptionElement && $(descriptionElement).find('dd').text().trim()); release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
const duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds(); release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const posterStyle = $('#player').attr('style');
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
const photos = useGallery const photos = useGallery
? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray() ? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray()
: $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray(); : $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray();
release.photos = photos.map((source) => {
// source without parameters sometimes serves larger preview photo
const { origin, pathname } = new URL(source);
return `${origin}${pathname}`;
/* disable thumbnail as fallback, usually enough high res photos available
return [
`${origin}${pathname}`,
source,
];
*/
});
const posterStyle = $('#player').attr('style');
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1); // poster unavailable, try last 1/3rd of high res photos as fallback
const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd'); const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd');
release.trailer = {
const studioName = $('.watchpage-studioname').first().text().trim();
const studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
const tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
return {
url,
shootId,
entryId,
title,
description,
date,
actors,
duration,
poster,
photos,
trailer: {
src: trailer.src, src: trailer.src,
type: trailer.type, type: trailer.type,
quality: trailer.quality === 'vga' ? 480 : 720, quality: trailer.quality === 'vga' ? 480 : 720,
},
tags,
site,
studio,
}; };
const studioName = $('.watchpage-studioname').first().text().trim();
release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
return release;
} }
async function scrapeProfile(html, _url, actorName) { async function scrapeProfile(html, _url, actorName) {