Photo plucker will use discarded photos as fallback. Returning high res photo sources from LegalPorno.

This commit is contained in:
ThePendulum 2020-03-10 04:42:15 +01:00
parent 6bfc5e4378
commit db63be8f92
2 changed files with 100 additions and 48 deletions

View File

@ -42,11 +42,50 @@ async function createThumbnail(buffer) {
return null;
}
function pluckItems(items, specifiedLimit) {
/**
 * Reorder the sources inside each chunk by fallback priority.
 *
 * Every entry of a chunk is either a single source or an array of fallback
 * sources for one item. Left as-is, a chunk would exhaust all fallbacks of its
 * first entry before moving on to the second entry. This function interleaves
 * them instead, so every entry's first source comes first, followed by every
 * entry's second source, and so on. A single (non-array) source counts as a
 * first-priority source.
 *
 * Example for one chunk:
 *   IN:  [[1, 2, 3], 10, [1, 2, 3, 4, 5], [1, 2, 3]]
 *   OUT: [1, 10, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5]
 *
 * @param {Array<Array>} chunks - list of chunks, each a list of sources or fallback arrays
 * @returns {Array<Array>} one flat, priority-ordered list of sources per chunk
 */
function groupFallbacksByPriority(chunks) {
  return chunks.map((group) => {
    // tiers[n] collects the n-th fallback of every entry in this chunk
    const tiers = [];

    group.forEach((entry) => {
      // treat a bare source as an item with exactly one (first priority) fallback
      const fallbacks = Array.isArray(entry) ? entry : [entry];

      fallbacks.forEach((fallback, priority) => {
        if (!tiers[priority]) {
          tiers[priority] = [];
        }

        tiers[priority].push(fallback);
      });
    });

    // concatenate the tiers, highest priority first
    return tiers.flat();
  });
}
function pluckItems(items, specifiedLimit, asFallbacks = true) {
const limit = specifiedLimit || config.media.limit;
if (!items || items.length <= limit) return items;
if (asFallbacks) {
const chunks = chunk(items, Math.ceil(items.length / limit));
const fallbacks = groupFallbacksByPriority(chunks);
return fallbacks;
}
const plucked = [1]
.concat(
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))),
@ -93,8 +132,8 @@ async function extractItem(source) {
return null;
}
async function fetchSource(source, domain, role, originalSource) {
logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
async function fetchSource(source, domain, role) {
logger.silly(`Fetching ${domain} ${role} from ${source.src || source}`);
// const res = await bhttp.get(source.src || source);
const res = await get(source.src || source, {
@ -111,7 +150,7 @@ async function fetchSource(source, domain, role, originalSource) {
const hash = getHash(res.body);
const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
logger.verbose(`Fetched media item from ${source.src || source}`);
logger.silly(`Fetched media item from ${source.src || source}`);
return {
file: res.body,
@ -123,7 +162,7 @@ async function fetchSource(source, domain, role, originalSource) {
width: width || null,
height: height || null,
quality: source.quality || null,
source: originalSource?.src || originalSource || source.src || source,
source: source.src || source,
scraper: source.scraper,
copyright: source.copyright,
};
@ -133,9 +172,11 @@ async function fetchSource(source, domain, role, originalSource) {
}
async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
if (!source) return null;
try {
if (!source) {
throw new Error(`Empty ${domain} ${role} source in ${originalSource}`);
}
if (Array.isArray(source)) {
if (source.every(sourceX => sourceX.quality)) {
// various video qualities provided
@ -160,19 +201,18 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
return null;
}
return fetchSource(source, domain, role, originalSource);
return await fetchSource(source, domain, role, originalSource);
} catch (error) {
logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);
/*
if (attempt < 3) {
if (source && attempt < 3) {
// only retry if source is provided at all
await Promise.delay(5000);
return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
}
*/
if (originalSource && sourceIndex < originalSource.length) {
throw error;
if (originalSource && sourceIndex < originalSource.length - 1) {
throw error; // gets caught to try next source
}
return null;
@ -285,6 +325,8 @@ async function storeMedia(sources, domain, role, { entropyFilter = 2.5 } = {}) {
return {};
}
console.log(presentSources, presentSources.length);
// split up source list to prevent excessive RAM usage
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
try {
@ -354,12 +396,12 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr
.map((source) => {
if (!source) return null;
const mediaItem = Array.isArray(source)
? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()]
: mediaBySource[source.src || source];
if (Array.isArray(source)) {
const availableSource = source.find(fallbackSource => mediaBySource[fallbackSource.src || fallbackSource]);
return mediaBySource[availableSource];
}
// return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
return mediaItem;
return mediaBySource[source.src || source];
})
.filter(Boolean)
// .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item

View File

@ -76,53 +76,63 @@ async function scrapeScene(html, url, site, useGallery) {
const playerObject = $('script:contains("new VideoPlayer")').html();
const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1));
const release = { url };
const originalTitle = $('h1.watchpage-title').text().trim();
const { shootId, title } = extractTitle(originalTitle);
const entryId = new URL(url).pathname.split('/')[2];
const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
release.shootId = shootId;
release.entryId = new URL(url).pathname.split('/')[2];
release.title = title;
release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();
const actors = $(actorsElement)
release.description = $('meta[name="description"]')?.attr('content')?.trim()
|| (descriptionElement && $(descriptionElement).find('dd').text().trim());
release.actors = $(actorsElement)
.find('a[href*="com/model"]')
.map((actorIndex, actorElement) => $(actorElement).text()).toArray();
const description = $('meta[name="description"]')?.attr('content')?.trim() || (descriptionElement && $(descriptionElement).find('dd').text().trim());
const duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
const posterStyle = $('#player').attr('style');
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const photos = useGallery
? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray()
: $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray();
release.photos = photos.map((source) => {
// source without parameters sometimes serves larger preview photo
const { origin, pathname } = new URL(source);
return `${origin}${pathname}`;
/* disable thumbnail as fallback, usually enough high res photos available
return [
`${origin}${pathname}`,
source,
];
*/
});
const posterStyle = $('#player').attr('style');
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1); // poster unavailable, try last 1/3rd of high res photos as fallback
const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd');
const studioName = $('.watchpage-studioname').first().text().trim();
const studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
const tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
return {
url,
shootId,
entryId,
title,
description,
date,
actors,
duration,
poster,
photos,
trailer: {
release.trailer = {
src: trailer.src,
type: trailer.type,
quality: trailer.quality === 'vga' ? 480 : 720,
},
tags,
site,
studio,
};
const studioName = $('.watchpage-studioname').first().text().trim();
release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
return release;
}
async function scrapeProfile(html, _url, actorName) {