Added media limit sampling.

This commit is contained in:
2020-04-11 22:49:37 +02:00
parent cb68319ac0
commit fc58850e56
6 changed files with 81 additions and 10 deletions

View File

@@ -1,13 +1,12 @@
'use strict';
const config = require('config');
// const util = require('util');
const Promise = require('bluebird');
const fs = require('fs').promises;
const path = require('path');
const nanoid = require('nanoid/non-secure');
const mime = require('mime');
const fileType = require('file-type');
// const fileType = require('file-type');
const sharp = require('sharp');
const blake2 = require('blake2');
@@ -72,6 +71,60 @@ async function getThumbnail(buffer, height = config.media.thumbnailSize) {
return null;
}
/**
 * Reduce a list of medias to at most `limit` entries while keeping every source.
 *
 * When there are more medias than `limit`, the list is split into `limit`
 * contiguous chunks of near-equal size and each chunk is merged into a single
 * media; the sources of the surplus medias become fallbacks for that media.
 *
 * @param {Array<Object>} medias - medias exposing `id`, `role` and a `sources` array
 * @param {number} [limit=config.media.limit] - maximum number of medias to return
 * @param {boolean} [preferLast=true] - try the very last media of the list first
 *   within the final chunk
 * @returns {Array<Object>} the input array itself when it already fits the limit,
 *   otherwise new `{ id, role, sources }` objects, one per chunk
 */
function sampleMedias(medias, limit = config.media.limit, preferLast = true) {
  // limit media sets, use extras as fallbacks
  if (medias.length <= limit) {
    return medias;
  }

  const chunkSize = Math.floor(medias.length / limit);
  const rest = medias.length - (limit * chunkSize);

  const chunks = Array.from(
    { length: limit },
    (value, index) => {
      // the first `rest` chunks each take one extra media, so nothing is left over
      const start = (chunkSize * index) + Math.min(index, rest);

      return medias.slice(
        start,
        start + chunkSize + (index < rest ? 1 : 0),
      );
    },
  );

  // flip the contents of the last chunk so the very last media is tried first;
  // reversing the list of chunks itself would leave a single-element list unchanged
  const lastIndex = chunks.length - 1;
  const lastPreferredChunks = preferLast
    ? chunks.map((chunk, index) => (index === lastIndex ? [...chunk].reverse() : chunk))
    : chunks;

  const groupedMedias = lastPreferredChunks.map((chunk) => {
    // merge chunked medias into single media with grouped fallback priorities,
    // so the first sources of each media are preferred over all second sources, etc.
    const sources = chunk
      .reduce((accSources, media) => {
        media.sources.forEach((source, index) => {
          if (!accSources[index]) {
            accSources.push([source]);
            return;
          }

          accSources[index].push(source);
        });

        return accSources;
      }, [])
      .flat();

    // the merged media takes its identity from the chunk's preferred (first) media
    return {
      id: chunk[0].id,
      role: chunk[0].role,
      sources,
    };
  });

  return groupedMedias;
}
/**
 * Index a list of items by the value of one of their properties.
 *
 * @param {Array<Object>} items - items to index
 * @param {string} key - name of the property whose value becomes the lookup key
 * @returns {Object} plain object mapping each `item[key]` to its item; when
 *   several items share a key value, the last one wins
 */
function itemsByKey(items, key) {
  // build the lookup in one pass instead of re-spreading the accumulator per item
  return Object.fromEntries(items.map((item) => [item[key], item]));
}
@@ -143,7 +196,7 @@ function toBaseMedias(rawMedias, role) {
return [];
}
return rawMedias.map((rawMedia) => {
const baseMedias = rawMedias.map((rawMedia) => {
if (!rawMedia) {
return null;
}
@@ -157,6 +210,10 @@ function toBaseMedias(rawMedias, role) {
return baseSourceToBaseMedia(baseSource, role);
}).filter(Boolean);
const sampledBaseMedias = sampleMedias(baseMedias);
return sampledBaseMedias;
}
async function findSourceDuplicates(baseMedias) {
@@ -465,7 +522,6 @@ async function associateReleaseMedia(releases) {
return;
}
// TODO: media count limits
// TODO: catch errors
// TODO: stage by role

View File

@@ -42,7 +42,7 @@ function scrapePhotos(html, includeThumbnails = true) {
// /createaccount is used by e.g. Tricky Spa native site
const src = $(linkEl).find('img').attr('src');
if (src.match('previews/')) {
if (/previews\//.test(src)) {
// resource often serves full photo at a modifier URL anyway, add as primary source
const highRes = src
.replace('previews/', '')

View File

@@ -5,6 +5,8 @@ const { JSDOM } = require('jsdom');
const cheerio = require('cheerio');
const moment = require('moment');
const slugify = require('../utils/slugify');
function extractTitle(originalTitle) {
const titleComponents = originalTitle.split(' ');
const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS|KS|OT)\d+/); // detect studio prefixes
@@ -139,7 +141,7 @@ async function scrapeScene(html, url, site, useGallery) {
}
const studioName = $('.watchpage-studioname').first().text().trim();
release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase();
release.studio = slugify(studioName, '');
return release;
}
@@ -175,6 +177,7 @@ async function fetchLatest(site, page = 1) {
async function fetchScene(url, site) {
const useGallery = true;
// TODO: fall back on screenshots when gallery is not available
const res = useGallery
? await bhttp.get(`${url}/gallery#gallery`)
: await bhttp.get(`${url}/screenshots#screenshots`);

View File

@@ -87,7 +87,7 @@ async function attachStudios(releases) {
if (release.studio && studioBySlug[release.studio]) {
return {
...release,
studio: release.studio,
studio: studioBySlug[release.studio],
};
}