Refactored media module to generalize avatar and poster storage into photo storage.

This commit is contained in:
ThePendulum 2019-12-13 03:28:52 +01:00
parent 20011a74e8
commit 77307d2d13
7 changed files with 183 additions and 142 deletions
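The gist of the refactor, condensed from the hunks below: the specialised storeAvatars and storePoster helpers give way to a single options-driven storePhotos, so callers describe where and how media is stored instead of relying on one helper per media kind.

// Before: one helper per media kind, coupled to its caller's data shape.
await storeAvatars(profile, actorEntry);
await storePoster(release, releaseId);

// After: one generalized helper, configured per call (shape taken from this diff).
await storePhotos(profile.avatars, {
    domain: 'actors',
    role: 'photo',
    primaryRole: 'avatar', // promotes the first photo when no avatar exists yet
    targetId: actorEntry.id,
    subpath: `${actorEntry.slug}/`,
    naming: 'timestamp',
}, actorEntry.name);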

@@ -9,7 +9,7 @@ const argv = require('./argv');
const scrapers = require('./scrapers/scrapers');
const whereOr = require('./utils/where-or');
const resolvePlace = require('./utils/resolve-place');
const { createActorMediaDirectory, storeAvatars } = require('./media');
const { createMediaDirectory, storePhotos } = require('./media');
async function curateActor(actor) {
const [aliases, photos, social] = await Promise.all([
@@ -352,11 +352,19 @@ async function scrapeActors(actorNames) {
if (argv.save) {
if (actorEntry && profile) {
await createActorMediaDirectory(profile, actorEntry);
await createMediaDirectory('actors', `${actorEntry.slug}/`);
await Promise.all([
updateActor(profile, true, true),
storeAvatars(profile, actorEntry),
// storeAvatars(profile, actorEntry),
storePhotos(profile.avatars, {
domain: 'actors',
role: 'photo',
primaryRole: 'avatar',
targetId: actorEntry.id,
subpath: `${actorEntry.slug}/`,
naming: 'timestamp',
}, actorEntry.name),
]);
return;
@@ -364,8 +372,15 @@ async function scrapeActors(actorNames) {
const newActorEntry = await storeActor(profile, true, true);
await createActorMediaDirectory(profile, newActorEntry);
await storeAvatars(profile, newActorEntry);
await createMediaDirectory('actors', `${newActorEntry.slug}/`);
await storePhotos(profile.avatars, {
domain: 'actors',
role: 'photo',
primaryRole: 'avatar',
targetId: newActorEntry.id,
subpath: `${newActorEntry.slug}/`,
naming: 'timestamp',
}, newActorEntry.name);
}
} catch (error) {
console.warn(actorName, error);

@@ -11,39 +11,36 @@ const scrapeRelease = require('./scrape-release');
const { scrapeActors, scrapeBasicActors } = require('./actors');
async function init() {
if (argv.url) {
await Promise.map(argv.url, async url => scrapeRelease(url), {
if (argv.scene) {
await Promise.map(argv.scene, async url => scrapeRelease(url, null, false, false), {
concurrency: 5,
});
knex.destroy();
return;
}
if (argv.movie) {
await Promise.map(argv.movie, async url => scrapeRelease(url, null, false, true), {
concurrency: 5,
});
}
if (argv.scrape || argv.networks || argv.sites) {
await scrapeSites();
knex.destroy();
return;
}
if (argv.actors && argv.actors.length > 0) {
await scrapeActors();
knex.destroy();
return;
}
if (argv.actors) {
await scrapeBasicActors();
knex.destroy();
}
if (argv.server) {
await initServer();
return;
}
await initServer();
knex.destroy();
}
module.exports = init;

@@ -5,6 +5,11 @@ const yargs = require('yargs');
const { argv } = yargs
.command('npm start')
.option('server', {
describe: 'Start web server',
type: 'boolean',
alias: 'web',
})
.option('scrape', {
describe: 'Scrape sites and networks defined in configuration',
type: 'boolean',
@@ -24,6 +29,16 @@ const { argv } = yargs
type: 'array',
alias: 'actor',
})
.option('scene', {
describe: 'Scrape scene info from URL',
type: 'array',
alias: 'release',
})
.option('movie', {
describe: 'Scrape movie info from URL',
type: 'array',
alias: 'dvd',
})
.option('sources', {
describe: 'Use these scrapers for actor data',
type: 'array',
@@ -39,11 +54,6 @@ const { argv } = yargs
type: 'boolean',
alias: 'force',
})
.option('url', {
describe: 'Scrape scene info from URL',
type: 'array',
alias: 'fetch',
})
.option('after', {
describe: 'Don\'t fetch scenes older than',
type: 'string',
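With the scene and movie options replacing the old url/fetch flag, typical invocations look like this (URLs are placeholders; assuming flags are forwarded through npm's -- separator, as the .command('npm start') above suggests):

npm start -- --scene https://example.com/scene/1234
npm start -- --movie https://example.com/movies/567 --save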

@@ -19,7 +19,7 @@ function getHash(buffer) {
return hash.digest('hex');
}
function pluckPhotos(photos, release, specifiedLimit) {
function pluckPhotos(photos, specifiedLimit) {
const limit = specifiedLimit || config.media.limit;
if (photos.length <= limit) {
@@ -34,7 +34,7 @@ function pluckPhotos(photos, release, specifiedLimit) {
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
}
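The hunk hides the code that builds plucked; given the 1-based indexing and the duplicate note above, it plausibly spreads limit picks evenly across the photo set. A hypothetical reconstruction, with the exact rounding scheme being an assumption:

// Hypothetical sketch of the elided spread: limit evenly spaced 1-based indices.
const plucked = Array.from(
    { length: limit },
    (value, index) => Math.round(((index + 1) / limit) * photos.length),
);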
async function getThumbnail(buffer) {
async function createThumbnail(buffer) {
return sharp(buffer)
.resize({
height: config.media.thumbnailSize,
@@ -43,25 +43,14 @@ async function getThumbnail(buffer) {
.toBuffer();
}
async function createReleaseMediaDirectory(release, releaseId) {
if (release.poster || (release.photos && release.photos.length) || release.trailer) {
await fs.mkdir(
path.join(config.media.path, 'releases', release.site.network.slug, release.site.slug, releaseId.toString()),
{ recursive: true },
);
}
async function createMediaDirectory(domain, subpath) {
const filepath = path.join(config.media.path, domain, subpath);
await fs.mkdir(filepath, { recursive: true });
return filepath;
}
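Unlike the conditional per-domain helpers it replaces, createMediaDirectory always creates the directory and returns the absolute path:

// With config.media.path set to '/media' (illustrative value):
// await createMediaDirectory('actors', 'jane-doe/') creates and returns
// '/media/actors/jane-doe/'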
async function createActorMediaDirectory(profile, actor) {
if (profile.avatars && profile.avatars.length) {
await fs.mkdir(
path.join(config.media.path, 'actors', actor.slug),
{ recursive: true },
);
}
}
function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId, setAvatar = false) {
function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId) {
return files.map((file, index) => ({
path: file.filepath,
thumbnail: file.thumbpath,
@@ -71,7 +60,7 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId
index,
domain,
target_id: targetId,
role: setAvatar && index === 0 ? 'avatar' : role,
role: file.role || role,
}));
}
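For illustration, one entry produced under the new file.role || role fallback might look as follows; the values are hypothetical and fields elided by the hunk are omitted:

// {
//     path: 'actors/jane-doe/1576202932000.jpeg',
//     thumbnail: 'actors/jane-doe/1576202932000_thumb.jpeg',
//     index: 0,
//     domain: 'actors',
//     target_id: 123,
//     role: 'avatar', // set on the file by storePhotos when no avatar row exists yet
// }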
@@ -79,8 +68,8 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId
async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) {
const photoSourceEntries = await knex('media')
.whereIn('source', photos.flat())
.whereIn('domain', [].concat(domains))
.whereIn('role', [].concat(roles)); // accept string argument
.whereIn('domain', domains)
.whereIn('role', roles); // domains and roles are now always passed as arrays
const photoSources = new Set(photoSourceEntries.map(photo => photo.source));
const newPhotos = photos.filter(source => (Array.isArray(source) // fallbacks provided?
@@ -156,18 +145,22 @@ async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
}
}
async function savePhotos(files, release, releaseId, actorSlug, isPoster = false) {
async function savePhotos(files, {
domain = 'releases',
subpath,
role = 'photo',
naming = 'index',
}) {
return Promise.map(files, async (file, index) => {
const timestamp = new Date().getTime();
const thumbnail = await getThumbnail(file.photo);
const thumbnail = await createThumbnail(file.photo);
const filepath = actorSlug
? path.join('actors', actorSlug, `${timestamp + index}.${file.extension}`)
: path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}.${file.extension}`);
const filename = naming === 'index'
? `${file.role || role}-${index}`
: `${timestamp + index}`;
const thumbpath = actorSlug
? path.join('actors', actorSlug, `${timestamp + index}_thumb.${file.extension}`)
: path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}_thumb.${file.extension}`);
const filepath = path.join(domain, subpath, `${filename}.${file.extension}`);
const thumbpath = path.join(domain, subpath, `${filename}_thumb.${file.extension}`);
await Promise.all([
fs.writeFile(path.join(config.media.path, filepath), file.photo),
@@ -183,75 +176,85 @@ async function savePhotos(files, release, releaseId, actorSlug, isPoster = false
});
}
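The two naming schemes yield paths like these (extension and slugs illustrative):

// naming: 'index'     -> releases/<network>/<site>/<id>/photo-0.jpeg plus photo-0_thumb.jpeg
// naming: 'timestamp' -> actors/jane-doe/1576202932000.jpeg plus 1576202932000_thumb.jpeg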
async function storePoster(release, releaseId) {
if (!release.poster) {
console.warn(`No poster available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
return;
}
const [newPoster] = await filterSourceDuplicates([release.poster], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`);
if (!newPoster) return;
console.log(`Fetching poster for (${release.site.name}, ${releaseId}) "${release.title}"`);
const metaFile = await fetchPhoto(release.poster, null, `(${release.site.name}, ${releaseId}) "${release.title}"`);
const [uniquePoster] = await filterHashDuplicates([metaFile], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`);
if (!uniquePoster) return;
const savedPosters = await savePhotos([uniquePoster], release, releaseId, null, true);
await knex('media').insert(curatePhotoEntries(savedPosters, 'releases', 'poster', releaseId));
}
async function storePhotos(release, releaseId) {
if (!release.photos || release.photos.length === 0) {
console.warn(`No photos available for (${release.site.name}, ${releaseId}) "${release.title}"`);
async function storePhotos(photos, {
domain = 'releases',
role = 'photo',
naming = 'index',
targetId,
subpath,
primaryRole, // role to assign to first photo if not already in database, used mainly for avatars
}, identifier) {
if (!photos || photos.length === 0) {
console.warn(`No ${role}s available for ${identifier}`);
return;
}
const pluckedPhotos = pluckPhotos(release.photos, release);
const newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
const pluckedPhotos = pluckPhotos(photos);
const roles = primaryRole ? [role, primaryRole] : [role];
const newPhotos = await filterSourceDuplicates(pluckedPhotos, [domain], roles, identifier);
if (newPhotos.length === 0) return;
console.log(`Fetching ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`);
console.log(`Fetching ${newPhotos.length} ${role}s for ${identifier}`);
const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, `(${release.site.name}, ${releaseId}) "${release.title}"`), {
const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, identifier), {
concurrency: 10,
}).filter(photo => photo);
const [uniquePhotos, primaryPhoto] = await Promise.all([
filterHashDuplicates(metaFiles, [domain], roles, identifier),
primaryRole
? await knex('media')
.where('domain', domain)
.where('target_id', targetId)
.where('role', primaryRole)
.first()
: null,
]);
const uniquePhotos = await filterHashDuplicates(metaFiles, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
const savedPhotos = await savePhotos(uniquePhotos, release, releaseId);
if (primaryRole && !primaryPhoto && uniquePhotos.length > 0) {
uniquePhotos[0].role = primaryRole;
}
const curatedPhotoEntries = curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId);
const savedPhotos = await savePhotos(uniquePhotos, {
domain,
role,
targetId,
subpath,
naming,
});
const curatedPhotoEntries = curatePhotoEntries(savedPhotos, domain, role, targetId);
await knex('media').insert(curatedPhotoEntries);
console.log(`Stored ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`);
console.log(`Stored ${newPhotos.length} ${role}s for ${identifier}`);
}
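The primaryRole lookup makes avatar promotion idempotent:

// Run 1 for a new actor: no 'avatar' row exists for the target, so the first
// unique photo is inserted with role 'avatar' and the rest with role 'photo'.
// Run 2 onward: the existing avatar row is found, so every new photo stays 'photo'.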
async function storeTrailer(release, releaseId) {
async function storeTrailer(trailers, {
domain = 'releases',
role = 'trailer',
targetId,
subpath,
}, identifier) {
// support scrapers supplying multiple qualities
const trailer = Array.isArray(release.trailer)
? (release.trailer.find(trailerX => [1080, 720].includes(trailerX.quality) || release.trailer[0]))
: release.trailer;
const trailer = Array.isArray(trailers)
? trailers.find(trailerX => [1080, 720].includes(trailerX.quality)) || trailers[0]
: trailers;
if (!trailer || !trailer.src) {
console.warn(`No trailer available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
console.warn(`No trailer available for ${identifier}`);
return;
}
console.log(`Storing trailer for (${release.site.name}, ${releaseId}) "${release.title}"`);
console.log(`Storing trailer for ${identifier}`);
const { pathname } = new URL(trailer.src);
const mimetype = trailer.type || mime.getType(pathname);
const res = await bhttp.get(trailer.src);
const filepath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `trailer${trailer.quality ? `_${trailer.quality}` : ''}.${mime.getExtension(mimetype)}`);
const filepath = path.join('releases', subpath, `trailer${trailer.quality ? `_${trailer.quality}` : ''}.${mime.getExtension(mimetype)}`);
await Promise.all([
fs.writeFile(path.join(config.media.path, filepath), res.body),
@@ -259,49 +262,24 @@ async function storeTrailer(release, releaseId) {
path: filepath,
mime: mimetype,
source: trailer.src,
domain: 'releases',
target_id: releaseId,
role: 'trailer',
domain,
target_id: targetId,
role,
quality: trailer.quality || null,
}),
]);
}
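The quality selection above is the actual fix here: the old predicate carried || release.trailer[0] inside find, which made every element match and always returned the first trailer. With the fallback moved outside the predicate:

// [{ src: '...', quality: 480 }, { src: '...', quality: 720 }]
//   -> the 720p entry is picked; only when neither 1080 nor 720 is present
//      does the trailers[0] fallback apply.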
async function storeAvatars(profile, actor) {
if (!profile.avatars || profile.avatars.length === 0) {
console.warn(`No avatars available for '${profile.name}'`);
return;
}
const newPhotos = await filterSourceDuplicates(profile.avatars, 'actors', ['avatar', 'photo'], actor.name);
if (newPhotos.length === 0) return;
console.log(`Fetching ${newPhotos.length} avatars for '${actor.name}'`);
const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, actor.name), {
concurrency: 10,
}).filter(photo => photo);
const uniquePhotos = await filterHashDuplicates(metaFiles, 'actors', ['avatar', 'photo'], actor.name);
const [savedPhotos, avatarEntry] = await Promise.all([
savePhotos(uniquePhotos, null, null, actor.slug),
knex('media').where({
target_id: actor.id,
domain: 'actors',
role: 'avatar',
}).first(),
]);
// if no avatar entry is present, curatePhotoEntries will store the first photo as avatar
await knex('media').insert(curatePhotoEntries(savedPhotos, 'actors', 'photo', actor.id, !avatarEntry));
async function findAvatar(actorId, domain = 'actors') {
return knex('media')
.where('domain', domain)
.where('target_id', actorId)
.where('role', 'avatar');
}
module.exports = {
createActorMediaDirectory,
createReleaseMediaDirectory,
storeAvatars,
storePoster,
createMediaDirectory,
findAvatar,
storePhotos,
storeTrailer,
};

@@ -9,8 +9,7 @@ const whereOr = require('./utils/where-or');
const { associateTags } = require('./tags');
const { associateActors } = require('./actors');
const {
createReleaseMediaDirectory,
storePoster,
createMediaDirectory,
storePhotos,
storeTrailer,
} = require('./media');
@@ -244,13 +243,27 @@ async function fetchTagReleases(queryObject, options = {}) {
}
async function storeReleaseAssets(release, releaseId) {
await createReleaseMediaDirectory(release, releaseId);
const subpath = `${release.site.network.slug}/${release.site.slug}/${releaseId}/`;
await createMediaDirectory('releases', subpath);
try {
await Promise.all([
storePhotos(release, releaseId),
storePoster(release, releaseId),
storeTrailer(release, releaseId),
storePhotos(release.photos, {
targetId: releaseId,
subpath,
}),
storePhotos([release.poster], {
role: 'poster',
targetId: releaseId,
subpath,
}),
storeTrailer(release.trailer, {
targetId: releaseId,
subpath,
}),
]);
} catch (error) {
console.log(release.url, error);

@@ -28,7 +28,7 @@ async function findSite(url, release) {
return null;
}
async function scrapeRelease(url, release, deep = false) {
async function scrapeRelease(url, release, deep = false, isMovie = false) {
const site = await findSite(url, release);
if (!site) {
@@ -41,22 +41,28 @@ async function scrapeRelease(url, release, deep = false) {
throw new Error('Could not find scraper for URL');
}
if (!scraper.fetchScene) {
throw new Error(`The '${site.name}'-scraper cannot fetch individual releases`);
if (!isMovie && !scraper.fetchScene) {
throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
}
const scene = await scraper.fetchScene(url, site, release);
if (isMovie && !scraper.fetchMovie) {
throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
}
const scrapedRelease = isMovie
? await scraper.fetchMovie(url, site, release)
: await scraper.fetchScene(url, site, release);
if (!deep && argv.save) {
// don't store release when called by site scraper
const [storedRelease] = await storeReleases([scene]);
const [storedRelease] = await storeReleases([scrapedRelease]);
if (storedRelease) {
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
}
}
return scene;
return scrapedRelease;
}
module.exports = scrapeRelease;

@@ -204,6 +204,21 @@ async function scrapeScene(html, url, site) {
};
}
function scrapeMovie(html, url, site) {
const { document } = new JSDOM(html).window;
const movie = { url, site };
movie.entryId = document.querySelector('.dvd_details_overview .rating_box').dataset.id;
movie.title = document.querySelector('.title_bar span').textContent;
movie.covers = Array.from(document.querySelectorAll('#dvd-cover-flip > a'), el => el.href);
movie.channel = document.querySelector('.update_date a').textContent;
movie.date = new Date();
return movie;
}
function scrapeProfile(html, url, actorName) {
const { document } = new JSDOM(html).window;
@@ -257,6 +272,12 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
async function fetchMovie(url, site) {
const res = await bhttp.get(url);
return scrapeMovie(res.body.toString(), url, site);
}
async function fetchProfile(actorName) {
const actorSlugA = actorName.toLowerCase().replace(/\s+/g, '-');
const actorSlugB = actorName.toLowerCase().replace(/\s+/g, '');
@@ -285,6 +306,7 @@ async function fetchProfile(actorName) {
module.exports = {
fetchLatest,
fetchMovie,
fetchProfile,
fetchUpcoming,
fetchScene,