Fixed Porn Pros scraper. Added various Score site logos.

ThePendulum, 2020-02-18 16:00:36 +01:00
commit 40bf476ea6 (parent cabae4989e)
99 changed files with 483 additions and 316 deletions


@@ -25,10 +25,11 @@ exports.up = knex => Promise.resolve()
       table.integer('index');
       table.string('mime');
-      table.string('hash');
       table.string('type');
       table.string('quality', 6);
+      table.float('entropy');
+      table.string('hash');
       table.text('comment');
       table.string('source', 1000);
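The new entropy column backs the entropyFilter check in the media pipeline (default 2.5, used to drop fallback avatars and other generic clipart). The value comes from sharp's stats(), as in getEntropy() further down; a minimal standalone sketch (imageEntropy is an illustrative name, not from this commit):

const sharp = require('sharp'); // same library the pipeline uses

// Returns sharp's greyscale entropy statistic for an image buffer;
// flat placeholder images score low, busy photographs score high.
async function imageEntropy(buffer) {
  const { entropy } = await sharp(buffer).stats();
  return entropy;
}

// e.g. keep only images above the pipeline's default threshold:
// const keep = (await imageEntropy(buffer)) > 2.5;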

(Binary image diffs not shown: dozens of Score site logo PNGs added and several existing logos updated, ranging from roughly 1 KiB to 130 KiB.)


@@ -4481,6 +4481,8 @@ const sites = [
     slug: 'milfbundle',
     url: 'https://www.milfbundle.com',
     network: 'score',
+    show: false,
+    scrape: false,
   },
   {
     name: 'Teaming Cock',
@@ -4543,10 +4545,12 @@ const sites = [
     network: 'score',
   },
   {
-    name: 'Definition Porn Videos',
+    name: 'Porn Mega Load',
     slug: 'pornmegaload',
     url: 'https://www.pornmegaload.com',
     network: 'score',
+    show: false,
+    scrape: false,
   },
   {
     name: 'SaRennas World',
@@ -4630,13 +4634,13 @@ const sites = [
   },
   {
     name: 'Tawny Peaks',
-    slug: 'tawny',
+    slug: 'tawnypeaks',
     url: 'https://www.bigboobbundle.com/tawny-peaks',
     network: 'score',
   },
   {
     name: 'Tiffany Towers',
-    slug: 'tiffany',
+    slug: 'tiffanytowers',
     url: 'https://www.bigboobbundle.com/tiffany-towers',
     network: 'score',
   },


@@ -12,7 +12,7 @@ const scrapers = require('./scrapers/scrapers');
 const whereOr = require('./utils/where-or');
 const resolvePlace = require('./utils/resolve-place');
 const slugify = require('./utils/slugify');
-const { createMediaDirectory, storePhotos } = require('./media');
+const { createMediaDirectory, storePhotos } = require('./media_legacy');
 
 async function curateActor(actor) {
   const [aliases, avatar, photos, social] = await Promise.all([


@@ -2,39 +2,32 @@
 
 const config = require('config');
 const Promise = require('bluebird');
-const path = require('path');
-const fs = require('fs-extra');
 const bhttp = require('bhttp');
 const mime = require('mime');
 const sharp = require('sharp');
 const blake2 = require('blake2');
 
-const logger = require('./logger')(__filename);
+const logger = require('./logger');
 const knex = require('./knex');
-const upsert = require('./utils/upsert');
-const { ex } = require('./utils/q');
 
 function getHash(buffer) {
   const hash = blake2.createHash('blake2b', { digestLength: 24 });
   hash.update(buffer);
   return hash.digest('hex');
 }
 
-function pluckPhotos(photos, specifiedLimit) {
+function pluckItems(items, specifiedLimit) {
   const limit = specifiedLimit || config.media.limit;
 
-  if (photos.length <= limit) {
-    return photos;
-  }
+  if (items.length <= limit) return items;
 
   const plucked = [1]
     .concat(
-      Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (photos.length / (limit - 1)))),
+      Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (items.length / (limit - 1)))),
     );
 
-  return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
+  return Array.from(new Set(plucked)).map(itemIndex => items[itemIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
 }
 
 async function getEntropy(buffer) {
@@ -49,337 +42,74 @@ async function getEntropy(buffer)
   }
 }
 
[The roughly 330 removed lines of this hunk, createThumbnail, createMediaDirectory, curatePhotoEntries, findDuplicates, extractPhoto, fetchPhoto, savePhotos, storePhotos, the commented-out storeReleasePhotos, and storeTrailer, moved verbatim into the new src/media_legacy.js shown in full below; only the additions and the module.exports change are reproduced here.]
 
+async function fetchItem(source, index, existingItemsBySource, attempt = 1) {
+  try {
+    if (Array.isArray(source)) {
+      // fallbacks provided
+      return source.reduce((outcome, sourceX) => outcome.catch(async () => {
+        const item = await fetchItem(sourceX, index, existingItemsBySource);
+
+        if (item) {
+          return item;
+        }
+
+        throw new Error(`Item not available: ${source}`);
+      }), Promise.reject(new Error()));
+    }
+
+    if (existingItemsBySource[source]) {
+      return existingItemsBySource[source];
+    }
+
+    const res = await bhttp.get(source);
+
+    if (res.statusCode === 200) {
+      const { pathname } = new URL(source);
+      const mimetype = mime.getType(pathname);
+      const extension = mime.getExtension(mimetype);
+      const hash = getHash(res.body);
+      const entropy = await getEntropy(res.body);
+
+      return {
+        file: res.body,
+        mimetype,
+        extension,
+        hash,
+        entropy,
+        source,
+      };
+    }
+
+    throw new Error(`Response ${res.statusCode} not OK`);
+  } catch (error) {
+    if (attempt <= 3) {
+      return fetchItem(source, index, existingItemsBySource, attempt + 1);
+    }
+
+    throw new Error(`Failed to fetch media from ${source}: ${error}`);
+  }
+}
+
+async function fetchItems(itemSources, existingItemsBySource) {
+  return Promise.map(itemSources, async (source, index) => fetchItem(source, index, existingItemsBySource));
+}
+
+async function storeReleaseMedia(releases, {
+  type = 'poster',
+} = {}) {
+  const pluckedSources = releases.map(release => pluckItems(release[type]));
+  const existingSourceItems = await knex('media').whereIn('source', pluckedSources.flat());
+  const existingItemsBySource = existingSourceItems.reduce((acc, item) => ({ ...acc, [item.source]: item }), {});
+
+  const fetchedItems = await fetchItems(pluckedSources, existingItemsBySource);
+  const existingHashItems = await knex('media').whereIn('hash', fetchedItems.map(item => item.hash));
+  const existingItemsByHash = existingHashItems.reduce((acc, item) => ({ ...acc, [item.hash]: item }), {});
+
+  const newItems = fetchedItems.filter(item => !existingItemsByHash[item.hash]);
+
+  console.log(fetchedItems, existingHashItems, existingItemsByHash, newItems);
+}
 
 module.exports = {
-  createMediaDirectory,
-  storePhotos,
-  // storeReleasePhotos,
-  storeTrailer,
+  storeReleaseMedia,
 };
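storeReleaseMedia is clearly still work in progress here: it ends in a debug console.log, never writes files or inserts rows, and nothing calls it yet (see the commented-out call in storeReleaseAssets further down). A hypothetical invocation once it is wired up:

const { storeReleaseMedia } = require('./media');

// `releases` as produced by the scrapers, each carrying source URLs under
// the given type key. Both calls are hypothetical until the pipeline is enabled.
await storeReleaseMedia(releases); // type defaults to 'poster'
await storeReleaseMedia(releases, { type: 'covers' }); // any other media key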

src/media_legacy.js (new file, 385 lines)

@@ -0,0 +1,385 @@
'use strict';
const config = require('config');
const Promise = require('bluebird');
const path = require('path');
const fs = require('fs-extra');
const bhttp = require('bhttp');
const mime = require('mime');
const sharp = require('sharp');
const blake2 = require('blake2');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const upsert = require('./utils/upsert');
const { ex } = require('./utils/q');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
hash.update(buffer);
return hash.digest('hex');
}
function pluckPhotos(photos, specifiedLimit) {
const limit = specifiedLimit || config.media.limit;
if (photos.length <= limit) {
return photos;
}
const plucked = [1]
.concat(
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (photos.length / (limit - 1)))),
);
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
}
async function getEntropy(buffer) {
try {
const { entropy } = await sharp(buffer).stats();
return entropy;
} catch (error) {
logger.warn(`Failed to retrieve image entropy, using 7.5: ${error.message}`);
return 7.5;
}
}
async function createThumbnail(buffer) {
try {
const thumbnail = sharp(buffer)
.resize({
height: config.media.thumbnailSize,
withoutEnlargement: true,
})
.jpeg({
quality: config.media.thumbnailQuality,
})
.toBuffer();
return thumbnail;
} catch (error) {
logger.error(`Failed to create thumbnail: ${error.message}`);
}
return null;
}
async function createMediaDirectory(domain, subpath) {
const filepath = path.join(config.media.path, domain, subpath);
await fs.mkdir(filepath, { recursive: true });
return filepath;
}
function curatePhotoEntries(files) {
return files.map((file, index) => ({
path: file.filepath,
thumbnail: file.thumbpath,
mime: file.mimetype,
hash: file.hash,
source: file.source,
index,
}));
}
async function findDuplicates(photos, identifier, prop = null) {
const duplicates = await knex('media')
.whereIn(identifier, photos.flat().map((photo) => {
if (prop) return photo[prop];
if (photo.src) return photo.src;
return photo;
}));
const duplicateLookup = new Set(duplicates.map(photo => photo[prop || identifier]));
const originals = photos.filter((source) => {
if (Array.isArray(source)) {
return !source.some(sourceX => duplicateLookup.has((prop && sourceX[prop]) || (sourceX.src && sourceX)));
}
return !duplicateLookup.has((prop && source[prop]) || (source.src && source));
});
return [duplicates, originals];
}
async function extractPhoto(source) {
const res = await bhttp.get(source.src);
if (res.statusCode === 200) {
const { q } = ex(res.body.toString());
return source.extract(q);
}
return null;
}
async function fetchPhoto(photoUrl, index, label, attempt = 1) {
if (photoUrl.src && photoUrl.extract) {
// source links to page containing a (presumably) tokenized photo
const photo = await extractPhoto(photoUrl);
return fetchPhoto(photo, index, label);
}
if (Array.isArray(photoUrl)) {
return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => {
const photo = await fetchPhoto(url, index, label);
if (photo) {
return photo;
}
throw new Error('Photo not available');
}), Promise.reject(new Error()));
}
try {
const { pathname } = new URL(photoUrl);
const res = await bhttp.get(photoUrl);
if (res.statusCode === 200) {
const mimetype = mime.getType(pathname);
const extension = mime.getExtension(mimetype);
const hash = getHash(res.body);
const entropy = await getEntropy(res.body);
return {
photo: res.body,
mimetype,
extension,
hash,
entropy,
source: photoUrl,
};
}
throw new Error(`Response ${res.statusCode} not OK`);
} catch (error) {
logger.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} for ${label} (${photoUrl}): ${error}`);
if (attempt < 3) {
await Promise.delay(5000);
return fetchPhoto(photoUrl, index, label, attempt + 1);
}
return null;
}
}
async function savePhotos(files, {
domain = 'release',
subpath,
role = 'photo',
naming = 'index',
}) {
return Promise.map(files, async (file, index) => {
try {
const timestamp = new Date().getTime();
const thumbnail = await createThumbnail(file.photo);
const filename = naming === 'index'
? `${file.role || role}${index + 1}`
: `${timestamp + index}`;
const filepath = path.join(`${domain}s`, subpath, `${filename}.${file.extension}`);
const thumbpath = path.join(`${domain}s`, subpath, `${filename}_thumb.${file.extension}`);
await Promise.all([
fs.writeFile(path.join(config.media.path, filepath), file.photo),
fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
]);
return {
...file,
thumbnail,
filepath,
thumbpath,
};
} catch (error) {
logger.error(`Failed to store ${domain} ${role} to ${subpath}: ${error.message}`);
return null;
}
});
}
async function storePhotos(photos, {
domain = 'release',
role = 'photo',
naming = 'index',
targetId,
subpath,
primaryRole, // role to assign to first photo if not already in database, used mainly for avatars
entropyFilter = 2.5, // filter out fallback avatars and other generic clipart
}, label) {
if (!photos || photos.length === 0) {
logger.info(`No ${role}s available for ${label}`);
return;
}
const pluckedPhotos = pluckPhotos(Array.from(new Set(photos))); // pre-filter link duplicates, limit total per configuration
const [sourceDuplicates, sourceOriginals] = await findDuplicates(pluckedPhotos, 'source', null, label);
logger.info(`Fetching ${sourceOriginals.length} new ${role}s, ${sourceDuplicates.length} already present by source for ${label}`);
const metaFiles = await Promise.map(sourceOriginals, async (photoUrl, index) => fetchPhoto(photoUrl, index, label), {
concurrency: 10,
}).filter(photo => photo && photo.entropy > entropyFilter);
const metaFilesByHash = metaFiles.reduce((acc, photo) => ({ ...acc, [photo.hash]: photo }), {}); // pre-filter hash duplicates within set; may occur through fallbacks
const [hashDuplicates, hashOriginals] = await findDuplicates(Object.values(metaFilesByHash), 'hash', 'hash', label);
logger.info(`Saving ${hashOriginals.length} new ${role}s, ${hashDuplicates.length} already present by hash for ${label}`);
const savedPhotos = await savePhotos(hashOriginals, {
domain,
role,
targetId,
subpath,
naming,
});
const curatedPhotoEntries = curatePhotoEntries(savedPhotos.filter(Boolean), domain, role, targetId);
const newPhotos = await knex('media').insert(curatedPhotoEntries).returning('*');
const photoEntries = Array.isArray(newPhotos)
? [...sourceDuplicates, ...hashDuplicates, ...newPhotos]
: [...sourceDuplicates, ...hashDuplicates];
const photoAssociations = photoEntries
.map(photoEntry => ({
[`${domain}_id`]: targetId,
media_id: photoEntry.id,
}));
if (primaryRole) {
// store one photo as a 'primary' photo, such as an avatar or cover
const primaryPhoto = await knex(`${domain}s_${primaryRole}s`)
.where(`${domain}_id`, targetId)
.first();
if (primaryPhoto) {
const remainingAssociations = photoAssociations.filter(association => association.media_id !== primaryPhoto.media_id);
await upsert(`${domain}s_${role}s`, remainingAssociations, [`${domain}_id`, 'media_id']);
return;
}
await Promise.all([
upsert(`${domain}s_${primaryRole}s`, photoAssociations.slice(0, 1), [`${domain}_id`, 'media_id']),
upsert(`${domain}s_${role}s`, photoAssociations.slice(1), [`${domain}_id`, 'media_id']),
]);
return;
}
await upsert(`${domain}s_${role}s`, photoAssociations, [`${domain}_id`, 'media_id']);
}
/*
async function storeReleasePhotos(releases, label) {
const sources = releases.map(release => pluckPhotos(release.photos)).flat();
const uniqueSources = Array.from(new Set(sources));
const [sourceDuplicates, sourceOriginals] = await findDuplicates(uniqueSources, 'source', null, label);
const metaFiles = await Promise.map(
sourceOriginals,
async (photoUrl, index) => fetchPhoto(photoUrl, index, label),
{ concurrency: 10 },
)
.filter(photo => photo);
const hashUniques = Object.values(metaFiles.reduce((acc, file) => {
if (!acc[file.hash]) acc[file.hash] = file;
return acc;
}, {}));
const [hashDuplicates, hashOriginals] = await findDuplicates(hashUniques, 'hash', 'hash', label);
const sourceHashes = metaFiles.concat(sourceDuplicates).reduce((acc, file) => {
acc[file.source] = file.hash;
return acc;
}, {});
const associations = releases.map(release => release.photos.map(source => [release.id, sourceHashes[source]])).flat();
console.log(associations);
}
*/
async function storeTrailer(trailers, {
domain = 'releases',
role = 'trailer',
targetId,
subpath,
}, label) {
// support scrapers supplying multiple qualities
const trailer = Array.isArray(trailers)
? trailers.find(trailerX => config.media.trailerQuality.includes(trailerX.quality)) || trailers[0]
: trailers;
if (!trailer || !trailer.src) {
logger.info(`No ${role} available for ${label}`);
return;
}
const [sourceDuplicates, sourceOriginals] = await findDuplicates([trailer], 'source', 'src', label);
const metaFiles = await Promise.map(sourceOriginals, async (trailerX) => {
const { pathname } = new URL(trailerX.src);
const mimetype = trailerX.type || mime.getType(pathname);
const res = await bhttp.get(trailerX.src);
const hash = getHash(res.body);
const filepath = path.join(domain, subpath, `${role}${trailerX.quality ? `_${trailerX.quality}` : ''}.${mime.getExtension(mimetype)}`);
return {
trailer: res.body,
path: filepath,
mime: mimetype,
source: trailerX.src,
quality: trailerX.quality || null,
hash,
};
});
const [hashDuplicates, hashOriginals] = await findDuplicates(metaFiles, 'hash', 'hash', label);
const newTrailers = await knex('media')
.insert(hashOriginals.map(trailerX => ({
path: trailerX.path,
mime: trailerX.mime,
source: trailerX.source,
quality: trailerX.quality,
hash: trailerX.hash,
type: role,
})))
.returning('*');
await Promise.all(hashOriginals.map(trailerX => fs.writeFile(path.join(config.media.path, trailerX.path), trailerX.trailer)));
const trailerEntries = Array.isArray(newTrailers)
? [...sourceDuplicates, ...hashDuplicates, ...newTrailers]
: [...sourceDuplicates, ...hashDuplicates];
await upsert(`releases_${role}s`, trailerEntries.map(trailerEntry => ({
release_id: targetId,
media_id: trailerEntry.id,
})), ['release_id', 'media_id']);
}
module.exports = {
createMediaDirectory,
storePhotos,
// storeReleasePhotos,
storeTrailer,
};


@@ -10,12 +10,20 @@ const argv = require('./argv');
 const whereOr = require('./utils/where-or');
 const { associateTags } = require('./tags');
 const { associateActors, scrapeBasicActors } = require('./actors');
+/*
 const {
   createMediaDirectory,
   storePhotos,
   // storeReleasePhotos,
   storeTrailer,
+  storeReleaseMedia,
 } = require('./media');
+*/
+const {
+  createMediaDirectory,
+  storePhotos,
+  storeTrailer,
+} = require('./media_legacy');
 
 const { fetchSites, findSiteByUrl } = require('./sites');
 const slugify = require('./utils/slugify');
@@ -331,6 +339,8 @@ function accumulateMovies(releases) {
 
 async function storeReleaseAssets(releases) {
   // await storeReleasePhotos(releases);
+  // return storeReleaseMedia(releases);
+
   await Promise.map(releases, async (release) => {
     const subpath = `${release.site.network.slug}/${release.site.slug}/${release.id}/`;
     const identifier = `"${release.title}" (${release.id})`;
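In short: the legacy photo/trailer pipeline keeps running from src/media_legacy.js while the leaner storeReleaseMedia pipeline in src/media.js is built out; the commented-out call marks where it will eventually take over.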


@@ -8,7 +8,7 @@ function scrapeLatest(html, site) {
   const { document } = new JSDOM(html).window;
   const { origin } = new URL(site.url);
 
-  const videos = document.querySelectorAll('.video-releases-list').slice(-1)[0];
+  const videos = Array.from(document.querySelectorAll('.video-releases-list')).slice(-1)[0];
 
   return Array.from(videos.querySelectorAll('.card'), (scene) => {
     const release = { site };
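This one line is the Porn Pros scraper fix from the commit message: document.querySelectorAll returns a NodeList, which has no .slice method, so the old code threw a TypeError before it could read any release cards. Array.from converts the NodeList into a real array first. A minimal repro sketch with jsdom, as used by the scraper (the markup is invented for illustration):

const { JSDOM } = require('jsdom');

// Two release lists, mimicking the page structure the scraper expects.
const html = '<div class="video-releases-list"></div><div class="video-releases-list"></div>';
const { document } = new JSDOM(html).window;

const lists = document.querySelectorAll('.video-releases-list');
// lists.slice(-1)[0]; // TypeError: lists.slice is not a function (NodeList, not Array)

const lastList = Array.from(lists).slice(-1)[0]; // the fix: convert, then slice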

src/utils/scorelogos.js (new file, 37 lines)

@@ -0,0 +1,37 @@
'use strict';

const Promise = require('bluebird');
const bhttp = require('bhttp');
const fs = require('fs-extra');

const knex = require('../knex');

async function init() {
  const sites = await knex('sites')
    .select('networks.name', 'sites.slug')
    .join('networks', 'networks.id', 'sites.network_id')
    .where('networks.slug', 'score');

  await Promise.map(sites, async (site) => {
    const url = `https://cdn77.scoreuniverse.com/${site.slug}/images/logo.png`;
    console.log(url);

    const res = await bhttp.get(url, {
      responseTimeout: 5000,
    });

    if (res.statusCode === 200) {
      console.log(`Saving logo for ${site.slug}`);
      await fs.writeFile(`./score/${site.slug}.png`, res.body);
      return; // without this, the miss message below also fired on success
    }

    console.log(`No logo found for ${site.slug}`);
  }, {
    concurrency: 10,
  });

  knex.destroy();
}

init();
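A one-off helper, presumably run by hand (node src/utils/scorelogos.js) to fetch the logos committed above. Note it writes into ./score, which has to exist beforehand: fs.writeFile does not create directories, though fs-extra's outputFile would.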