The photo plucker now uses discarded photos as fallbacks. The LegalPorno scraper now returns high-resolution photo sources.
This commit is contained in:
parent
6bfc5e4378
commit
db63be8f92
78
src/media.js
78
src/media.js
|
@ -42,11 +42,50 @@ async function createThumbnail(buffer) {
|
|||
return null;
|
||||
}
|
||||
|
||||
function pluckItems(items, specifiedLimit) {
|
||||
/**
 * Reorder the sources inside every chunk so that first-choice sources are
 * tried before any fallback.
 *
 * A chunk is a list whose entries are either a single source or an array of
 * fallback sources for one item, ordered by priority. Left as-is, a chunk
 * gives priority to all of the first item's fallbacks (generally lower
 * quality images) before the next item's first source is ever tried. This
 * function instead emits every entry's first source, then every entry's
 * second source, and so on. An entry that is not an array has no fallbacks
 * and is treated as first priority.
 *
 * The priority groups of a chunk are flattened into one list, so the result
 * holds exactly one flat array per input chunk.
 *
 * Example for a single chunk:
 *   IN:  [[1, 2, 3], 10, [1, 2, 3, 4, 5], [1, 2, 3]]
 *   OUT: [1, 10, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5]
 *
 * @param {Array<Array>} chunks - list of chunks as described above
 * @returns {Array<Array>} one flat, priority-ordered list of sources per chunk
 */
function groupFallbacksByPriority(chunks) {
	return chunks.map((group) => {
		// byPriority[n] collects the n-th choice source of every entry in this chunk
		const byPriority = [];

		group.forEach((item) => {
			// an entry without fallbacks behaves like a one-element fallback list
			const fallbacks = Array.isArray(item) ? item : [item];

			fallbacks.forEach((fallback, priority) => {
				if (!byPriority[priority]) {
					byPriority[priority] = [];
				}

				byPriority[priority].push(fallback);
			});
		});

		// collapse the priority groups into a single ordered list for this chunk
		return byPriority.flat();
	});
}
|
||||
|
||||
function pluckItems(items, specifiedLimit, asFallbacks = true) {
|
||||
const limit = specifiedLimit || config.media.limit;
|
||||
|
||||
if (!items || items.length <= limit) return items;
|
||||
|
||||
if (asFallbacks) {
|
||||
const chunks = chunk(items, Math.ceil(items.length / limit));
|
||||
const fallbacks = groupFallbacksByPriority(chunks);
|
||||
|
||||
return fallbacks;
|
||||
}
|
||||
|
||||
const plucked = [1]
|
||||
.concat(
|
||||
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))),
|
||||
|
@ -93,8 +132,8 @@ async function extractItem(source) {
|
|||
return null;
|
||||
}
|
||||
|
||||
async function fetchSource(source, domain, role, originalSource) {
|
||||
logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
|
||||
async function fetchSource(source, domain, role) {
|
||||
logger.silly(`Fetching ${domain} ${role} from ${source.src || source}`);
|
||||
|
||||
// const res = await bhttp.get(source.src || source);
|
||||
const res = await get(source.src || source, {
|
||||
|
@ -111,7 +150,7 @@ async function fetchSource(source, domain, role, originalSource) {
|
|||
const hash = getHash(res.body);
|
||||
const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
|
||||
|
||||
logger.verbose(`Fetched media item from ${source.src || source}`);
|
||||
logger.silly(`Fetched media item from ${source.src || source}`);
|
||||
|
||||
return {
|
||||
file: res.body,
|
||||
|
@ -123,7 +162,7 @@ async function fetchSource(source, domain, role, originalSource) {
|
|||
width: width || null,
|
||||
height: height || null,
|
||||
quality: source.quality || null,
|
||||
source: originalSource?.src || originalSource || source.src || source,
|
||||
source: source.src || source,
|
||||
scraper: source.scraper,
|
||||
copyright: source.copyright,
|
||||
};
|
||||
|
@ -133,9 +172,11 @@ async function fetchSource(source, domain, role, originalSource) {
|
|||
}
|
||||
|
||||
async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
|
||||
if (!source) return null;
|
||||
|
||||
try {
|
||||
if (!source) {
|
||||
throw new Error(`Empty ${domain} ${role} source in ${originalSource}`);
|
||||
}
|
||||
|
||||
if (Array.isArray(source)) {
|
||||
if (source.every(sourceX => sourceX.quality)) {
|
||||
// various video qualities provided
|
||||
|
@ -160,19 +201,18 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
|
|||
return null;
|
||||
}
|
||||
|
||||
return fetchSource(source, domain, role, originalSource);
|
||||
return await fetchSource(source, domain, role, originalSource);
|
||||
} catch (error) {
|
||||
logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);
|
||||
|
||||
/*
|
||||
if (attempt < 3) {
|
||||
if (source && attempt < 3) {
|
||||
// only retry if source is provided at all
|
||||
await Promise.delay(5000);
|
||||
return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
|
||||
}
|
||||
*/
|
||||
|
||||
if (originalSource && sourceIndex < originalSource.length) {
|
||||
throw error;
|
||||
if (originalSource && sourceIndex < originalSource.length - 1) {
|
||||
throw error; // gets caught to try next source
|
||||
}
|
||||
|
||||
return null;
|
||||
|
@ -285,6 +325,8 @@ async function storeMedia(sources, domain, role, { entropyFilter = 2.5 } = {}) {
|
|||
return {};
|
||||
}
|
||||
|
||||
console.log(presentSources, presentSources.length);
|
||||
|
||||
// split up source list to prevent excessive RAM usage
|
||||
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
|
||||
try {
|
||||
|
@ -354,12 +396,12 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr
|
|||
.map((source) => {
|
||||
if (!source) return null;
|
||||
|
||||
const mediaItem = Array.isArray(source)
|
||||
? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()]
|
||||
: mediaBySource[source.src || source];
|
||||
if (Array.isArray(source)) {
|
||||
const availableSource = source.find(fallbackSource => mediaBySource[fallbackSource.src || fallbackSource]);
|
||||
return mediaBySource[availableSource];
|
||||
}
|
||||
|
||||
// return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
|
||||
return mediaItem;
|
||||
return mediaBySource[source.src || source];
|
||||
})
|
||||
.filter(Boolean)
|
||||
// .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item
|
||||
|
|
|
@ -76,53 +76,63 @@ async function scrapeScene(html, url, site, useGallery) {
|
|||
const playerObject = $('script:contains("new VideoPlayer")').html();
|
||||
const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1));
|
||||
|
||||
const release = { url };
|
||||
|
||||
const originalTitle = $('h1.watchpage-title').text().trim();
|
||||
const { shootId, title } = extractTitle(originalTitle);
|
||||
const entryId = new URL(url).pathname.split('/')[2];
|
||||
|
||||
const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
|
||||
release.shootId = shootId;
|
||||
release.entryId = new URL(url).pathname.split('/')[2];
|
||||
|
||||
release.title = title;
|
||||
release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
|
||||
|
||||
const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();
|
||||
const actors = $(actorsElement)
|
||||
|
||||
release.description = $('meta[name="description"]')?.attr('content')?.trim()
|
||||
|| (descriptionElement && $(descriptionElement).find('dd').text().trim());
|
||||
|
||||
release.actors = $(actorsElement)
|
||||
.find('a[href*="com/model"]')
|
||||
.map((actorIndex, actorElement) => $(actorElement).text()).toArray();
|
||||
|
||||
const description = $('meta[name="description"]')?.attr('content')?.trim() || (descriptionElement && $(descriptionElement).find('dd').text().trim());
|
||||
const duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
|
||||
|
||||
const posterStyle = $('#player').attr('style');
|
||||
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
|
||||
release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
|
||||
release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||
|
||||
const photos = useGallery
|
||||
? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray()
|
||||
: $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray();
|
||||
|
||||
release.photos = photos.map((source) => {
|
||||
// source without parameters sometimes serves larger preview photo
|
||||
const { origin, pathname } = new URL(source);
|
||||
|
||||
return `${origin}${pathname}`;
|
||||
|
||||
/* disable thumbnail as fallback, usually enough high res photos available
|
||||
return [
|
||||
`${origin}${pathname}`,
|
||||
source,
|
||||
];
|
||||
*/
|
||||
});
|
||||
|
||||
const posterStyle = $('#player').attr('style');
|
||||
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
|
||||
|
||||
release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1); // poster unavailable, try last 1/3rd of high res photos as fallback
|
||||
|
||||
const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd');
|
||||
|
||||
const studioName = $('.watchpage-studioname').first().text().trim();
|
||||
const studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
|
||||
const tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||
|
||||
return {
|
||||
url,
|
||||
shootId,
|
||||
entryId,
|
||||
title,
|
||||
description,
|
||||
date,
|
||||
actors,
|
||||
duration,
|
||||
poster,
|
||||
photos,
|
||||
trailer: {
|
||||
release.trailer = {
|
||||
src: trailer.src,
|
||||
type: trailer.type,
|
||||
quality: trailer.quality === 'vga' ? 480 : 720,
|
||||
},
|
||||
tags,
|
||||
site,
|
||||
studio,
|
||||
};
|
||||
|
||||
const studioName = $('.watchpage-studioname').first().text().trim();
|
||||
release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
async function scrapeProfile(html, _url, actorName) {
|
||||
|
|
Loading…
Reference in New Issue