Added media limit sampling.

This commit is contained in:
ThePendulum 2020-04-11 22:49:37 +02:00
parent cb68319ac0
commit fc58850e56
6 changed files with 81 additions and 10 deletions

View File

@ -23,8 +23,7 @@ Do not modify `config/default.js`, but instead create a copy at `config/local.js
### Options ### Options
`npm start -- --option value` `npm start -- --option value`
Running `npm start` without any arguments will run the web server. * `--server`: Run the web server
* `--fetch`: Fetch updates instead of running the webserver. Without further arguments, it will use the networks and sites defined in the configuration file. * `--fetch`: Fetch updates instead of running the webserver. Without further arguments, it will use the networks and sites defined in the configuration file.
* `--site [site ID]`: Fetch updates from a specific site. The site ID is typically the site name in lowercase and without spaces or special characters. For example, Teens Like It Big is teenslikeitbig. * `--site [site ID]`: Fetch updates from a specific site. The site ID is typically the site name in lowercase and without spaces or special characters. For example, Teens Like It Big is teenslikeitbig.
* `--network [network ID]`: Fetch updates from all sites of a specific network. The network ID is composed similarly to the site ID. * `--network [network ID]`: Fetch updates from all sites of a specific network. The network ID is composed similarly to the site ID.
@ -33,10 +32,23 @@ Running `npm start` without any arguments will run the web server.
* `--deep`: Follow each release link found running `--site` or `--network` and scrape it for more details. Enabled by default at the moment of writing; use `--no-deep` to only save information found on the overview pages. * `--deep`: Follow each release link found running `--site` or `--network` and scrape it for more details. Enabled by default at the moment of writing; use `--no-deep` to only save information found on the overview pages.
* `--copy`: Try to copy relevant results to the clipboard. When used with `--scene`, it will copy the filename as defined in the config with all the details filled in. * `--copy`: Try to copy relevant results to the clipboard. When used with `--scene`, it will copy the filename as defined in the config with all the details filled in.
#### Developer options ## Developers
### Options
* `--no-save`: Do not store retrieved information in local database, forcing re-fetch. * `--no-save`: Do not store retrieved information in local database, forcing re-fetch.
* `--debug`: Show full error stack trace. * `--debug`: Show full error stack trace.
### Generating thumbnails
Ensure each tag or sfw category directory has a `thumbs` and a `lazy` directory: `for dir in \*; do mkdir "$dir/thumbs" "$dir/lazy"; done`
Use ImageMagick's bulk tool `mogrify` to generate 240px thumbnails and 90px lazy pre-loading images:
* Generate thumbnails within tag or sfw directory: `mogrify -path thumbs -resize x240 -quality 90% \*.jpeg`
* Generate lazy loading images within tag or sfw directory: `mogrify -path lazy -resize x90 -quality 90% \*.jpeg`
* Generate thumbnails for all tags or categories in `tags` or `sfw` directory: `for dir in \*; do mogrify -path "$dir/thumbs" -resize x240 -quality 90% "$dir/\*.jpeg"; done`
* Generate lazy loading images for all tags or categories in `tags` or `sfw` directory: `for dir in \*; do mogrify -path "$dir/lazy" -resize x90 -quality 90% "$dir/\*.jpeg"; done`
## Supported networks & sites ## Supported networks & sites
768 sites on 62 networks, continuously expanding! 768 sites on 62 networks, continuously expanding!

Binary file not shown.

Before

Width:  |  Height:  |  Size: 882 B

After

Width:  |  Height:  |  Size: 1.6 KiB

View File

@ -1,13 +1,12 @@
'use strict'; 'use strict';
const config = require('config'); const config = require('config');
// const util = require('util');
const Promise = require('bluebird'); const Promise = require('bluebird');
const fs = require('fs').promises; const fs = require('fs').promises;
const path = require('path'); const path = require('path');
const nanoid = require('nanoid/non-secure'); const nanoid = require('nanoid/non-secure');
const mime = require('mime'); const mime = require('mime');
const fileType = require('file-type'); // const fileType = require('file-type');
const sharp = require('sharp'); const sharp = require('sharp');
const blake2 = require('blake2'); const blake2 = require('blake2');
@ -72,6 +71,60 @@ async function getThumbnail(buffer, height = config.media.thumbnailSize) {
return null; return null;
} }
/**
 * Limit a media set to at most `limit` entries, keeping the surplus medias
 * as fallback sources instead of discarding them.
 *
 * The medias are split into `limit` contiguous chunks of near-equal size
 * (earlier chunks receive the remainder). Each chunk is merged into a single
 * media that takes its `id` and `role` from the chunk's first media and
 * carries the sources of every media in the chunk.
 *
 * @param {Array<{id: *, role: *, sources: Array}>} medias - medias to sample
 * @param {number} [limit=config.media.limit] - maximum number of medias to return
 * @param {boolean} [preferLast=true] - reverse the last chunk so the very last
 *   media of the set is tried before the other medias in that chunk
 * @returns {Array<{id: *, role: *, sources: Array}>} the original array when it
 *   already fits within the limit, otherwise one merged media per chunk
 */
function sampleMedias(medias, limit = config.media.limit, preferLast = true) {
	// limit media sets, use extras as fallbacks
	if (medias.length <= limit) {
		return medias;
	}

	const chunkSize = Math.floor(medias.length / limit);
	const rest = medias.length - (limit * chunkSize);

	const chunks = Array.from(
		{ length: limit },
		(value, index) => {
			// the first `rest` chunks each take one extra media, shifting later chunks along
			const start = (chunkSize * index) + Math.min(index, rest);

			return medias.slice(
				start,
				start + chunkSize + (index < rest ? 1 : 0),
			);
		},
	);

	// flip the contents of the last chunk so the very last image is tried first;
	// reverse a copy of the chunk itself, as reversing the one-element list
	// that holds the chunk would leave the media order untouched
	const lastPreferredChunks = preferLast && chunks.length > 0
		? [...chunks.slice(0, -1), [...chunks[chunks.length - 1]].reverse()]
		: chunks;

	const groupedMedias = lastPreferredChunks.map((chunk) => {
		// merge chunked medias into single media with grouped fallback priorities,
		// so the first sources of each media are preferred over all second sources, etc.
		const sourcesByPriority = chunk.reduce((accSources, media) => {
			media.sources.forEach((source, index) => {
				if (!accSources[index]) {
					accSources.push([source]);
					return;
				}

				accSources[index].push(source);
			});

			return accSources;
		}, []);

		return {
			id: chunk[0].id,
			role: chunk[0].role,
			sources: sourcesByPriority.flat(),
		};
	});

	return groupedMedias;
}
/**
 * Index a list of objects by the value each holds under `key`.
 *
 * @param {Array<Object>} items - objects to index
 * @param {string} key - property whose value becomes the lookup key
 * @returns {Object} map from `item[key]` to the item; when several items
 *   share a value, the last one wins
 */
function itemsByKey(items, key) {
	// build the lookup in one pass rather than re-spreading the accumulator per item
	return Object.fromEntries(items.map((item) => [item[key], item]));
}
@ -143,7 +196,7 @@ function toBaseMedias(rawMedias, role) {
return []; return [];
} }
return rawMedias.map((rawMedia) => { const baseMedias = rawMedias.map((rawMedia) => {
if (!rawMedia) { if (!rawMedia) {
return null; return null;
} }
@ -157,6 +210,10 @@ function toBaseMedias(rawMedias, role) {
return baseSourceToBaseMedia(baseSource, role); return baseSourceToBaseMedia(baseSource, role);
}).filter(Boolean); }).filter(Boolean);
const sampledBaseMedias = sampleMedias(baseMedias);
return sampledBaseMedias;
} }
async function findSourceDuplicates(baseMedias) { async function findSourceDuplicates(baseMedias) {
@ -465,7 +522,6 @@ async function associateReleaseMedia(releases) {
return; return;
} }
// TODO: media count limits
// TODO: catch errors // TODO: catch errors
// TODO: stage by role // TODO: stage by role

View File

@ -42,7 +42,7 @@ function scrapePhotos(html, includeThumbnails = true) {
// /createaccount is used by e.g. Tricky Spa native site // /createaccount is used by e.g. Tricky Spa native site
const src = $(linkEl).find('img').attr('src'); const src = $(linkEl).find('img').attr('src');
if (src.match('previews/')) { if (/previews\//.test(src)) {
// resource often serves full photo at a modifier URL anyway, add as primary source // resource often serves full photo at a modifier URL anyway, add as primary source
const highRes = src const highRes = src
.replace('previews/', '') .replace('previews/', '')

View File

@ -5,6 +5,8 @@ const { JSDOM } = require('jsdom');
const cheerio = require('cheerio'); const cheerio = require('cheerio');
const moment = require('moment'); const moment = require('moment');
const slugify = require('../utils/slugify');
function extractTitle(originalTitle) { function extractTitle(originalTitle) {
const titleComponents = originalTitle.split(' '); const titleComponents = originalTitle.split(' ');
const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS|KS|OT)\d+/); // detect studio prefixes const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS|KS|OT)\d+/); // detect studio prefixes
@ -139,7 +141,7 @@ async function scrapeScene(html, url, site, useGallery) {
} }
const studioName = $('.watchpage-studioname').first().text().trim(); const studioName = $('.watchpage-studioname').first().text().trim();
release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase(); release.studio = slugify(studioName, '');
return release; return release;
} }
@ -175,6 +177,7 @@ async function fetchLatest(site, page = 1) {
async function fetchScene(url, site) { async function fetchScene(url, site) {
const useGallery = true; const useGallery = true;
// TODO: fall back on screenshots when gallery is not available
const res = useGallery const res = useGallery
? await bhttp.get(`${url}/gallery#gallery`) ? await bhttp.get(`${url}/gallery#gallery`)
: await bhttp.get(`${url}/screenshots#screenshots`); : await bhttp.get(`${url}/screenshots#screenshots`);

View File

@ -87,7 +87,7 @@ async function attachStudios(releases) {
if (release.studio && studioBySlug[release.studio]) { if (release.studio && studioBySlug[release.studio]) {
return { return {
...release, ...release,
studio: release.studio, studio: studioBySlug[release.studio],
}; };
} }