Photo plucker will use discarded photos as fallback. Returning high res photo sources from LegalPorno.
This commit is contained in:
parent
6bfc5e4378
commit
db63be8f92
78
src/media.js
78
src/media.js
|
@ -42,11 +42,50 @@ async function createThumbnail(buffer) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
function pluckItems(items, specifiedLimit) {
|
/**
 * Reorder the sources inside every chunk so that each item's first source
 * comes before any item's second source, and so on.
 *
 * A chunk lists items in their natural order, where an item is either a single
 * source or an array of fallback sources ordered by preference. Left as-is, a
 * consumer walking the chunk would exhaust all fallbacks of the first item
 * before ever looking at the second item. This function regroups the sources
 * by their fallback index (priority) and flattens the result one level, so the
 * returned chunk is a flat list ordered by priority first and item order second.
 *
 * Example for a single chunk:
 *   IN:  [[1, 2, 3], 10, [1, 2, 3, 4, 5], [1, 2, 3]]
 *   OUT: [1, 10, 1, 1, 2, 2, 2, 3, 3, 3, 4, 5]
 *
 * @param {Array<Array>} chunks - list of chunks; each chunk is an array of
 *   items, where an item is a source or an array of fallback sources
 * @returns {Array<Array>} one flat, priority-ordered array of sources per chunk
 */
function groupFallbacksByPriority(chunks) {
	return chunks.map((group) => {
		const sourcesByPriority = group.reduce((acc, item) => {
			// an item without fallbacks is treated as a single first-priority source
			const fallbacks = Array.isArray(item) ? item : [item];

			// place every fallback at its own index (priority) in the accumulator
			fallbacks.forEach((fallback, fallbackIndex) => {
				if (!acc[fallbackIndex]) {
					acc[fallbackIndex] = [];
				}

				acc[fallbackIndex].push(fallback);
			});

			return acc;
		}, []);

		// collapse the per-priority buckets into one ordered list for this chunk
		return sourcesByPriority.flat();
	});
}
|
||||||
|
|
||||||
|
function pluckItems(items, specifiedLimit, asFallbacks = true) {
|
||||||
const limit = specifiedLimit || config.media.limit;
|
const limit = specifiedLimit || config.media.limit;
|
||||||
|
|
||||||
if (!items || items.length <= limit) return items;
|
if (!items || items.length <= limit) return items;
|
||||||
|
|
||||||
|
if (asFallbacks) {
|
||||||
|
const chunks = chunk(items, Math.ceil(items.length / limit));
|
||||||
|
const fallbacks = groupFallbacksByPriority(chunks);
|
||||||
|
|
||||||
|
return fallbacks;
|
||||||
|
}
|
||||||
|
|
||||||
const plucked = [1]
|
const plucked = [1]
|
||||||
.concat(
|
.concat(
|
||||||
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))),
|
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))),
|
||||||
|
@ -93,8 +132,8 @@ async function extractItem(source) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchSource(source, domain, role, originalSource) {
|
async function fetchSource(source, domain, role) {
|
||||||
logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
|
logger.silly(`Fetching ${domain} ${role} from ${source.src || source}`);
|
||||||
|
|
||||||
// const res = await bhttp.get(source.src || source);
|
// const res = await bhttp.get(source.src || source);
|
||||||
const res = await get(source.src || source, {
|
const res = await get(source.src || source, {
|
||||||
|
@ -111,7 +150,7 @@ async function fetchSource(source, domain, role, originalSource) {
|
||||||
const hash = getHash(res.body);
|
const hash = getHash(res.body);
|
||||||
const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
|
const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
|
||||||
|
|
||||||
logger.verbose(`Fetched media item from ${source.src || source}`);
|
logger.silly(`Fetched media item from ${source.src || source}`);
|
||||||
|
|
||||||
return {
|
return {
|
||||||
file: res.body,
|
file: res.body,
|
||||||
|
@ -123,7 +162,7 @@ async function fetchSource(source, domain, role, originalSource) {
|
||||||
width: width || null,
|
width: width || null,
|
||||||
height: height || null,
|
height: height || null,
|
||||||
quality: source.quality || null,
|
quality: source.quality || null,
|
||||||
source: originalSource?.src || originalSource || source.src || source,
|
source: source.src || source,
|
||||||
scraper: source.scraper,
|
scraper: source.scraper,
|
||||||
copyright: source.copyright,
|
copyright: source.copyright,
|
||||||
};
|
};
|
||||||
|
@ -133,9 +172,11 @@ async function fetchSource(source, domain, role, originalSource) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
|
async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
|
||||||
if (!source) return null;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
if (!source) {
|
||||||
|
throw new Error(`Empty ${domain} ${role} source in ${originalSource}`);
|
||||||
|
}
|
||||||
|
|
||||||
if (Array.isArray(source)) {
|
if (Array.isArray(source)) {
|
||||||
if (source.every(sourceX => sourceX.quality)) {
|
if (source.every(sourceX => sourceX.quality)) {
|
||||||
// various video qualities provided
|
// various video qualities provided
|
||||||
|
@ -160,19 +201,18 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
return fetchSource(source, domain, role, originalSource);
|
return await fetchSource(source, domain, role, originalSource);
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);
|
logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);
|
||||||
|
|
||||||
/*
|
if (source && attempt < 3) {
|
||||||
if (attempt < 3) {
|
// only retry if source is provided at all
|
||||||
await Promise.delay(5000);
|
await Promise.delay(5000);
|
||||||
return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
|
return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
|
||||||
}
|
}
|
||||||
*/
|
|
||||||
|
|
||||||
if (originalSource && sourceIndex < originalSource.length) {
|
if (originalSource && sourceIndex < originalSource.length - 1) {
|
||||||
throw error;
|
throw error; // gets caught to try next source
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
@ -285,6 +325,8 @@ async function storeMedia(sources, domain, role, { entropyFilter = 2.5 } = {}) {
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
console.log(presentSources, presentSources.length);
|
||||||
|
|
||||||
// split up source list to prevent excessive RAM usage
|
// split up source list to prevent excessive RAM usage
|
||||||
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
|
const itemChunksBySource = await Promise.all(chunk(presentSources, 50).map(async (sourceChunk, index) => {
|
||||||
try {
|
try {
|
||||||
|
@ -354,12 +396,12 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr
|
||||||
.map((source) => {
|
.map((source) => {
|
||||||
if (!source) return null;
|
if (!source) return null;
|
||||||
|
|
||||||
const mediaItem = Array.isArray(source)
|
if (Array.isArray(source)) {
|
||||||
? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()]
|
const availableSource = source.find(fallbackSource => mediaBySource[fallbackSource.src || fallbackSource]);
|
||||||
: mediaBySource[source.src || source];
|
return mediaBySource[availableSource];
|
||||||
|
}
|
||||||
|
|
||||||
// return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
|
return mediaBySource[source.src || source];
|
||||||
return mediaItem;
|
|
||||||
})
|
})
|
||||||
.filter(Boolean)
|
.filter(Boolean)
|
||||||
// .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item
|
// .sort((mediaItemA, mediaItemB) => mediaItemB.height - mediaItemA.height) // prefer high res images for primary item
|
||||||
|
|
|
@ -76,53 +76,63 @@ async function scrapeScene(html, url, site, useGallery) {
|
||||||
const playerObject = $('script:contains("new VideoPlayer")').html();
|
const playerObject = $('script:contains("new VideoPlayer")').html();
|
||||||
const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1));
|
const data = JSON.parse(playerObject.slice(playerObject.indexOf('{"swf":'), playerObject.indexOf('} );') + 1));
|
||||||
|
|
||||||
|
const release = { url };
|
||||||
|
|
||||||
const originalTitle = $('h1.watchpage-title').text().trim();
|
const originalTitle = $('h1.watchpage-title').text().trim();
|
||||||
const { shootId, title } = extractTitle(originalTitle);
|
const { shootId, title } = extractTitle(originalTitle);
|
||||||
const entryId = new URL(url).pathname.split('/')[2];
|
|
||||||
|
|
||||||
const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
|
release.shootId = shootId;
|
||||||
|
release.entryId = new URL(url).pathname.split('/')[2];
|
||||||
|
|
||||||
|
release.title = title;
|
||||||
|
release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
|
||||||
|
|
||||||
const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();
|
const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();
|
||||||
const actors = $(actorsElement)
|
|
||||||
|
release.description = $('meta[name="description"]')?.attr('content')?.trim()
|
||||||
|
|| (descriptionElement && $(descriptionElement).find('dd').text().trim());
|
||||||
|
|
||||||
|
release.actors = $(actorsElement)
|
||||||
.find('a[href*="com/model"]')
|
.find('a[href*="com/model"]')
|
||||||
.map((actorIndex, actorElement) => $(actorElement).text()).toArray();
|
.map((actorIndex, actorElement) => $(actorElement).text()).toArray();
|
||||||
|
|
||||||
const description = $('meta[name="description"]')?.attr('content')?.trim() || (descriptionElement && $(descriptionElement).find('dd').text().trim());
|
release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
|
||||||
const duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
|
release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
||||||
|
|
||||||
const posterStyle = $('#player').attr('style');
|
|
||||||
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
|
|
||||||
|
|
||||||
const photos = useGallery
|
const photos = useGallery
|
||||||
? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray()
|
? $('.gallery a img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray()
|
||||||
: $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray();
|
: $('.screenshots img').map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray();
|
||||||
|
|
||||||
|
release.photos = photos.map((source) => {
|
||||||
|
// source without parameters sometimes serves larger preview photo
|
||||||
|
const { origin, pathname } = new URL(source);
|
||||||
|
|
||||||
|
return `${origin}${pathname}`;
|
||||||
|
|
||||||
|
/* disable thumbnail as fallback, usually enough high res photos available
|
||||||
|
return [
|
||||||
|
`${origin}${pathname}`,
|
||||||
|
source,
|
||||||
|
];
|
||||||
|
*/
|
||||||
|
});
|
||||||
|
|
||||||
|
const posterStyle = $('#player').attr('style');
|
||||||
|
const poster = posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
|
||||||
|
|
||||||
|
release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1); // poster unavailable, try last 1/3rd of high res photos as fallback
|
||||||
|
|
||||||
const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd');
|
const trailer = data.clip.qualities.find(clip => clip.quality === 'vga' || clip.quality === 'hd');
|
||||||
|
release.trailer = {
|
||||||
|
src: trailer.src,
|
||||||
|
type: trailer.type,
|
||||||
|
quality: trailer.quality === 'vga' ? 480 : 720,
|
||||||
|
};
|
||||||
|
|
||||||
const studioName = $('.watchpage-studioname').first().text().trim();
|
const studioName = $('.watchpage-studioname').first().text().trim();
|
||||||
const studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
|
release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
|
||||||
const tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
|
|
||||||
|
|
||||||
return {
|
return release;
|
||||||
url,
|
|
||||||
shootId,
|
|
||||||
entryId,
|
|
||||||
title,
|
|
||||||
description,
|
|
||||||
date,
|
|
||||||
actors,
|
|
||||||
duration,
|
|
||||||
poster,
|
|
||||||
photos,
|
|
||||||
trailer: {
|
|
||||||
src: trailer.src,
|
|
||||||
type: trailer.type,
|
|
||||||
quality: trailer.quality === 'vga' ? 480 : 720,
|
|
||||||
},
|
|
||||||
tags,
|
|
||||||
site,
|
|
||||||
studio,
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeProfile(html, _url, actorName) {
|
async function scrapeProfile(html, _url, actorName) {
|
||||||
|
|
Loading…
Reference in New Issue