Refactored media module to generalize avatar and poster storage into photo storage.

This commit is contained in:
ThePendulum 2019-12-13 03:28:52 +01:00
parent 20011a74e8
commit 77307d2d13
7 changed files with 183 additions and 142 deletions

View File

@ -9,7 +9,7 @@ const argv = require('./argv');
const scrapers = require('./scrapers/scrapers'); const scrapers = require('./scrapers/scrapers');
const whereOr = require('./utils/where-or'); const whereOr = require('./utils/where-or');
const resolvePlace = require('./utils/resolve-place'); const resolvePlace = require('./utils/resolve-place');
const { createActorMediaDirectory, storeAvatars } = require('./media'); const { createMediaDirectory, storePhotos } = require('./media');
async function curateActor(actor) { async function curateActor(actor) {
const [aliases, photos, social] = await Promise.all([ const [aliases, photos, social] = await Promise.all([
@ -352,11 +352,19 @@ async function scrapeActors(actorNames) {
if (argv.save) { if (argv.save) {
if (actorEntry && profile) { if (actorEntry && profile) {
await createActorMediaDirectory(profile, actorEntry); await createMediaDirectory('actors', `${actorEntry.slug}/`);
await Promise.all([ await Promise.all([
updateActor(profile, true, true), updateActor(profile, true, true),
storeAvatars(profile, actorEntry), // storeAvatars(profile, actorEntry),
storePhotos(profile.avatars, {
domain: 'actors',
role: 'photo',
primaryRole: 'avatar',
targetId: actorEntry.id,
subpath: `${actorEntry.slug}/`,
naming: 'timestamp',
}, actorEntry.name),
]); ]);
return; return;
@ -364,8 +372,15 @@ async function scrapeActors(actorNames) {
const newActorEntry = await storeActor(profile, true, true); const newActorEntry = await storeActor(profile, true, true);
await createActorMediaDirectory(profile, newActorEntry); await createMediaDirectory('actors', `${newActorEntry.slug}/`);
await storeAvatars(profile, newActorEntry); await storePhotos(profile.avatars, {
domain: 'actors',
role: 'photo',
primaryRole: 'avatar',
targetId: newActorEntry.id,
subpath: `${newActorEntry.slug}/`,
naming: 'timestamp',
}, newActorEntry.name);
} }
} catch (error) { } catch (error) {
console.warn(actorName, error); console.warn(actorName, error);

View File

@ -11,39 +11,36 @@ const scrapeRelease = require('./scrape-release');
const { scrapeActors, scrapeBasicActors } = require('./actors'); const { scrapeActors, scrapeBasicActors } = require('./actors');
async function init() { async function init() {
if (argv.url) { if (argv.scene) {
await Promise.map(argv.url, async url => scrapeRelease(url), { await Promise.map(argv.scene, async url => scrapeRelease(url, null, false, false), {
concurrency: 5, concurrency: 5,
}); });
knex.destroy();
return;
} }
if (argv.movie) {
await Promise.map(argv.movie, async url => scrapeRelease(url, null, false, true), {
concurrency: 5,
});
}
if (argv.scrape || argv.networks || argv.sites) { if (argv.scrape || argv.networks || argv.sites) {
await scrapeSites(); await scrapeSites();
knex.destroy();
return;
} }
if (argv.actors && argv.actors.length > 0) { if (argv.actors && argv.actors.length > 0) {
await scrapeActors(); await scrapeActors();
knex.destroy();
return;
} }
if (argv.actors) { if (argv.actors) {
await scrapeBasicActors(); await scrapeBasicActors();
knex.destroy(); }
if (argv.server) {
await initServer();
return; return;
} }
await initServer(); knex.destroy();
} }
module.exports = init; module.exports = init;

View File

@ -5,6 +5,11 @@ const yargs = require('yargs');
const { argv } = yargs const { argv } = yargs
.command('npm start') .command('npm start')
.option('server', {
describe: 'Start web server',
type: 'boolean',
alias: 'web',
})
.option('scrape', { .option('scrape', {
describe: 'Scrape sites and networks defined in configuration', describe: 'Scrape sites and networks defined in configuration',
type: 'boolean', type: 'boolean',
@ -24,6 +29,16 @@ const { argv } = yargs
type: 'array', type: 'array',
alias: 'actor', alias: 'actor',
}) })
.option('scene', {
describe: 'Scrape scene info from URL',
type: 'array',
alias: 'release',
})
.option('movie', {
describe: 'Scrape movie info from URL',
type: 'array',
alias: 'dvd',
})
.option('sources', { .option('sources', {
describe: 'Use these scrapers for actor data', describe: 'Use these scrapers for actor data',
type: 'array', type: 'array',
@ -39,11 +54,6 @@ const { argv } = yargs
type: 'boolean', type: 'boolean',
alias: 'force', alias: 'force',
}) })
.option('url', {
describe: 'Scrape scene info from URL',
type: 'array',
alias: 'fetch',
})
.option('after', { .option('after', {
describe: 'Don\'t fetch scenes older than', describe: 'Don\'t fetch scenes older than',
type: 'string', type: 'string',

View File

@ -19,7 +19,7 @@ function getHash(buffer) {
return hash.digest('hex'); return hash.digest('hex');
} }
function pluckPhotos(photos, release, specifiedLimit) { function pluckPhotos(photos, specifiedLimit) {
const limit = specifiedLimit || config.media.limit; const limit = specifiedLimit || config.media.limit;
if (photos.length <= limit) { if (photos.length <= limit) {
@ -34,7 +34,7 @@ function pluckPhotos(photos, release, specifiedLimit) {
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
} }
async function getThumbnail(buffer) { async function createThumbnail(buffer) {
return sharp(buffer) return sharp(buffer)
.resize({ .resize({
height: config.media.thumbnailSize, height: config.media.thumbnailSize,
@ -43,25 +43,14 @@ async function getThumbnail(buffer) {
.toBuffer(); .toBuffer();
} }
async function createReleaseMediaDirectory(release, releaseId) { async function createMediaDirectory(domain, subpath) {
if (release.poster || (release.photos && release.photos.length) || release.trailer) { const filepath = path.join(config.media.path, domain, subpath);
await fs.mkdir(
path.join(config.media.path, 'releases', release.site.network.slug, release.site.slug, releaseId.toString()), await fs.mkdir(filepath, { recursive: true });
{ recursive: true }, return filepath;
);
}
} }
async function createActorMediaDirectory(profile, actor) { function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId) {
if (profile.avatars && profile.avatars.length) {
await fs.mkdir(
path.join(config.media.path, 'actors', actor.slug),
{ recursive: true },
);
}
}
function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId, setAvatar = false) {
return files.map((file, index) => ({ return files.map((file, index) => ({
path: file.filepath, path: file.filepath,
thumbnail: file.thumbpath, thumbnail: file.thumbpath,
@ -71,7 +60,7 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId
index, index,
domain, domain,
target_id: targetId, target_id: targetId,
role: setAvatar && index === 0 ? 'avatar' : role, role: file.role || role,
})); }));
} }
@ -79,8 +68,8 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId
async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) { async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) {
const photoSourceEntries = await knex('media') const photoSourceEntries = await knex('media')
.whereIn('source', photos.flat()) .whereIn('source', photos.flat())
.whereIn('domain', [].concat(domains)) .whereIn('domain', domains)
.whereIn('role', [].concat(roles)); // accept string argument .whereIn('role', roles); // accept string argument
const photoSources = new Set(photoSourceEntries.map(photo => photo.source)); const photoSources = new Set(photoSourceEntries.map(photo => photo.source));
const newPhotos = photos.filter(source => (Array.isArray(source) // fallbacks provided? const newPhotos = photos.filter(source => (Array.isArray(source) // fallbacks provided?
@ -156,18 +145,22 @@ async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
} }
} }
async function savePhotos(files, release, releaseId, actorSlug, isPoster = false) { async function savePhotos(files, {
domain = 'releases',
subpath,
role = 'photo',
naming = 'index',
}) {
return Promise.map(files, async (file, index) => { return Promise.map(files, async (file, index) => {
const timestamp = new Date().getTime(); const timestamp = new Date().getTime();
const thumbnail = await getThumbnail(file.photo); const thumbnail = await createThumbnail(file.photo);
const filepath = actorSlug const filename = naming === 'index'
? path.join('actors', actorSlug, `${timestamp + index}.${file.extension}`) ? `${file.role || role}-${index}`
: path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}.${file.extension}`); : `${timestamp + index}`;
const thumbpath = actorSlug const filepath = path.join(domain, subpath, `${filename}.${file.extension}`);
? path.join('actors', actorSlug, `${timestamp + index}_thumb.${file.extension}`) const thumbpath = path.join(domain, subpath, `${filename}_thumb.${file.extension}`);
: path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}_thumb.${file.extension}`);
await Promise.all([ await Promise.all([
fs.writeFile(path.join(config.media.path, filepath), file.photo), fs.writeFile(path.join(config.media.path, filepath), file.photo),
@ -183,75 +176,85 @@ async function savePhotos(files, release, releaseId, actorSlug, isPoster = false
}); });
} }
async function storePoster(release, releaseId) { async function storePhotos(photos, {
if (!release.poster) { domain = 'releases',
console.warn(`No poster available for (${release.site.name}, ${releaseId}}) "${release.title}"`); role = 'photo',
return; naming = 'index',
} targetId,
const [newPoster] = await filterSourceDuplicates([release.poster], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`); subpath,
primaryRole, // role to assign to first photo if not already in database, used mainly for avatars
if (!newPoster) return; }, identifier) {
if (!photos || photos.length === 0) {
console.log(`Fetching poster for (${release.site.name}, ${releaseId}) "${release.title}"`); console.warn(`No ${role}s available for ${identifier}`);
const metaFile = await fetchPhoto(release.poster, null, `(${release.site.name}, ${releaseId}) "${release.title}"`);
const [uniquePoster] = await filterHashDuplicates([metaFile], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`);
if (!uniquePoster) return;
const savedPosters = await savePhotos([uniquePoster], release, releaseId, null, true);
await knex('media').insert(curatePhotoEntries(savedPosters, 'releases', 'poster', releaseId));
}
async function storePhotos(release, releaseId) {
if (!release.photos || release.photos.length === 0) {
console.warn(`No photos available for (${release.site.name}, ${releaseId}) "${release.title}"`);
return; return;
} }
const pluckedPhotos = pluckPhotos(release.photos, release); const pluckedPhotos = pluckPhotos(photos);
const newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`); const roles = primaryRole ? [role, primaryRole] : [role];
const newPhotos = await filterSourceDuplicates(pluckedPhotos, [domain], roles, identifier);
if (newPhotos.length === 0) return; if (newPhotos.length === 0) return;
console.log(`Fetching ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`); console.log(`Fetching ${newPhotos.length} ${role}s for ${identifier}`);
const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, `(${release.site.name}, ${releaseId}) "${release.title}"`), { const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, identifier), {
concurrency: 10, concurrency: 10,
}).filter(photo => photo); }).filter(photo => photo);
console.log(metaFiles); const [uniquePhotos, primaryPhoto] = await Promise.all([
filterHashDuplicates(metaFiles, [domain], roles, identifier),
primaryRole
? await knex('media')
.where('domain', domain)
.where('target_id', targetId)
.where('role', primaryRole)
.first()
: null,
]);
const uniquePhotos = await filterHashDuplicates(metaFiles, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`); if (primaryRole && !primaryPhoto) {
const savedPhotos = await savePhotos(uniquePhotos, release, releaseId); uniquePhotos[0].role = primaryRole;
}
const curatedPhotoEntries = curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId); const savedPhotos = await savePhotos(uniquePhotos, {
domain,
role,
targetId,
subpath,
naming,
});
const curatedPhotoEntries = curatePhotoEntries(savedPhotos, domain, role, targetId);
await knex('media').insert(curatedPhotoEntries); await knex('media').insert(curatedPhotoEntries);
console.log(`Stored ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`); console.log(`Stored ${newPhotos.length} ${role}s for ${identifier}`);
} }
async function storeTrailer(release, releaseId) { async function storeTrailer(trailers, {
domain = 'releases',
role = 'trailer',
targetId,
subpath,
}, identifier) {
// support scrapers supplying multiple qualities // support scrapers supplying multiple qualities
const trailer = Array.isArray(release.trailer) const trailer = Array.isArray(trailers)
? (release.trailer.find(trailerX => [1080, 720].includes(trailerX.quality) || release.trailer[0])) ? trailers.find(trailerX => [1080, 720].includes(trailerX.quality)) || trailers[0]
: release.trailer; : trailers;
if (!trailer || !trailer.src) { if (!trailer || !trailer.src) {
console.warn(`No trailer available for (${release.site.name}, ${releaseId}}) "${release.title}"`); console.warn(`No trailer available for ${identifier}`);
return; return;
} }
console.log(`Storing trailer for (${release.site.name}, ${releaseId}) "${release.title}"`); console.log(`Storing trailer for ${identifier}`);
const { pathname } = new URL(trailer.src); const { pathname } = new URL(trailer.src);
const mimetype = trailer.type || mime.getType(pathname); const mimetype = trailer.type || mime.getType(pathname);
const res = await bhttp.get(trailer.src); const res = await bhttp.get(trailer.src);
const filepath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `trailer${trailer.quality ? `_${trailer.quality}` : ''}.${mime.getExtension(mimetype)}`); const filepath = path.join('releases', subpath, `trailer${trailer.quality ? `_${trailer.quality}` : ''}.${mime.getExtension(mimetype)}`);
await Promise.all([ await Promise.all([
fs.writeFile(path.join(config.media.path, filepath), res.body), fs.writeFile(path.join(config.media.path, filepath), res.body),
@ -259,49 +262,24 @@ async function storeTrailer(release, releaseId) {
path: filepath, path: filepath,
mime: mimetype, mime: mimetype,
source: trailer.src, source: trailer.src,
domain: 'releases', domain,
target_id: releaseId, target_id: targetId,
role: 'trailer', role,
quality: trailer.quality || null, quality: trailer.quality || null,
}), }),
]); ]);
} }
async function storeAvatars(profile, actor) { async function findAvatar(actorId, domain = 'actors') {
if (!profile.avatars || profile.avatars.length === 0) { return knex('media')
console.warn(`No avatars available for '${profile.name}'`); .where('domain', domain)
return; .where('target_id', actorId)
} .where('role', 'avatar');
const newPhotos = await filterSourceDuplicates(profile.avatars, 'actors', ['avatar', 'photo'], actor.name);
if (newPhotos.length === 0) return;
console.log(`Fetching ${newPhotos.length} avatars for '${actor.name}'`);
const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, actor.name), {
concurrency: 10,
}).filter(photo => photo);
const uniquePhotos = await filterHashDuplicates(metaFiles, 'actors', ['avatar', 'photo'], actor.name);
const [savedPhotos, avatarEntry] = await Promise.all([
savePhotos(uniquePhotos, null, null, actor.slug),
knex('media').where({
target_id: actor.id,
domain: 'actors',
role: 'avatar',
}).first(),
]);
// if no avatar entry is present, curatePhotoEntries will store the first photo as avatar
await knex('media').insert(curatePhotoEntries(savedPhotos, 'actors', 'photo', actor.id, !avatarEntry));
} }
module.exports = { module.exports = {
createActorMediaDirectory, createMediaDirectory,
createReleaseMediaDirectory, findAvatar,
storeAvatars,
storePoster,
storePhotos, storePhotos,
storeTrailer, storeTrailer,
}; };

View File

@ -9,8 +9,7 @@ const whereOr = require('./utils/where-or');
const { associateTags } = require('./tags'); const { associateTags } = require('./tags');
const { associateActors } = require('./actors'); const { associateActors } = require('./actors');
const { const {
createReleaseMediaDirectory, createMediaDirectory,
storePoster,
storePhotos, storePhotos,
storeTrailer, storeTrailer,
} = require('./media'); } = require('./media');
@ -244,13 +243,27 @@ async function fetchTagReleases(queryObject, options = {}) {
} }
async function storeReleaseAssets(release, releaseId) { async function storeReleaseAssets(release, releaseId) {
await createReleaseMediaDirectory(release, releaseId); const subpath = `${release.site.network.slug}/${release.site.slug}/${release.id}/`;
await createMediaDirectory('releases', subpath);
console.log(release.poster);
try { try {
await Promise.all([ await Promise.all([
storePhotos(release, releaseId), storePhotos(release.photos, {
storePoster(release, releaseId), targetId: releaseId,
storeTrailer(release, releaseId), subpath,
}),
storePhotos([release.poster], {
role: 'poster',
targetId: releaseId,
subpath,
}),
storeTrailer(release.trailer, {
targetId: releaseId,
subpath,
}),
]); ]);
} catch (error) { } catch (error) {
console.log(release.url, error); console.log(release.url, error);

View File

@ -28,7 +28,7 @@ async function findSite(url, release) {
return null; return null;
} }
async function scrapeRelease(url, release, deep = false) { async function scrapeRelease(url, release, deep = false, isMovie = false) {
const site = await findSite(url, release); const site = await findSite(url, release);
if (!site) { if (!site) {
@ -41,22 +41,28 @@ async function scrapeRelease(url, release, deep = false) {
throw new Error('Could not find scraper for URL'); throw new Error('Could not find scraper for URL');
} }
if (!scraper.fetchScene) { if (!isMovie && !scraper.fetchScene) {
throw new Error(`The '${site.name}'-scraper cannot fetch individual releases`); throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`);
} }
const scene = await scraper.fetchScene(url, site, release); if (isMovie && !scraper.fetchMovie) {
throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`);
}
const scrapedRelease = isMovie
? await scraper.fetchMovie(url, site, release)
: await scraper.fetchScene(url, site, release);
if (!deep && argv.save) { if (!deep && argv.save) {
// don't store release when called by site scraper // don't store release when called by site scraper
const [storedRelease] = await storeReleases([scene]); const [storedRelease] = await storeReleases([scrapedRelease]);
if (storedRelease) { if (storedRelease) {
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`); console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
} }
} }
return scene; return scrapedRelease;
} }
module.exports = scrapeRelease; module.exports = scrapeRelease;

View File

@ -204,6 +204,21 @@ async function scrapeScene(html, url, site) {
}; };
} }
function scrapeMovie(html, url, site) {
const { document } = new JSDOM(html).window;
const movie = { url, site };
console.log(url);
movie.entryId = document.querySelector('.dvd_details_overview .rating_box').dataset.id;
movie.title = document.querySelector('.title_bar span').textContent;
movie.covers = Array.from(document.querySelectorAll('#dvd-cover-flip > a'), el => el.href);
movie.channel = document.querySelector('.update_date a').textContent;
movie.date = new Date();
return movie;
}
function scrapeProfile(html, url, actorName) { function scrapeProfile(html, url, actorName) {
const { document } = new JSDOM(html).window; const { document } = new JSDOM(html).window;
@ -257,6 +272,12 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site); return scrapeScene(res.body.toString(), url, site);
} }
async function fetchMovie(url, site) {
const res = await bhttp.get(url);
return scrapeMovie(res.body.toString(), url, site);
}
async function fetchProfile(actorName) { async function fetchProfile(actorName) {
const actorSlugA = actorName.toLowerCase().replace(/\s+/g, '-'); const actorSlugA = actorName.toLowerCase().replace(/\s+/g, '-');
const actorSlugB = actorName.toLowerCase().replace(/\s+/g, ''); const actorSlugB = actorName.toLowerCase().replace(/\s+/g, '');
@ -285,6 +306,7 @@ async function fetchProfile(actorName) {
module.exports = { module.exports = {
fetchLatest, fetchLatest,
fetchMovie,
fetchProfile, fetchProfile,
fetchUpcoming, fetchUpcoming,
fetchScene, fetchScene,