diff --git a/src/actors.js b/src/actors.js index 617e62de..de609893 100644 --- a/src/actors.js +++ b/src/actors.js @@ -9,7 +9,7 @@ const argv = require('./argv'); const scrapers = require('./scrapers/scrapers'); const whereOr = require('./utils/where-or'); const resolvePlace = require('./utils/resolve-place'); -const { createActorMediaDirectory, storeAvatars } = require('./media'); +const { createMediaDirectory, storePhotos } = require('./media'); async function curateActor(actor) { const [aliases, photos, social] = await Promise.all([ @@ -352,11 +352,19 @@ async function scrapeActors(actorNames) { if (argv.save) { if (actorEntry && profile) { - await createActorMediaDirectory(profile, actorEntry); + await createMediaDirectory('actors', `${actorEntry.slug}/`); await Promise.all([ updateActor(profile, true, true), - storeAvatars(profile, actorEntry), + // storeAvatars(profile, actorEntry), + storePhotos(profile.avatars, { + domain: 'actors', + role: 'photo', + primaryRole: 'avatar', + targetId: actorEntry.id, + subpath: `${actorEntry.slug}/`, + naming: 'timestamp', + }, actorEntry.name), ]); return; @@ -364,8 +372,15 @@ async function scrapeActors(actorNames) { const newActorEntry = await storeActor(profile, true, true); - await createActorMediaDirectory(profile, newActorEntry); - await storeAvatars(profile, newActorEntry); + await createMediaDirectory('actors', `${newActorEntry.slug}/`); + await storePhotos(profile.avatars, { + domain: 'actors', + role: 'photo', + primaryRole: 'avatar', + targetId: newActorEntry.id, + subpath: `${newActorEntry.slug}/`, + naming: 'timestamp', + }, newActorEntry.name); } } catch (error) { console.warn(actorName, error); diff --git a/src/app.js b/src/app.js index fcf113d4..bb72d5a6 100644 --- a/src/app.js +++ b/src/app.js @@ -11,39 +11,36 @@ const scrapeRelease = require('./scrape-release'); const { scrapeActors, scrapeBasicActors } = require('./actors'); async function init() { - if (argv.url) { - await Promise.map(argv.url, async 
url => scrapeRelease(url), { + if (argv.scene) { + await Promise.map(argv.scene, async url => scrapeRelease(url, null, false, false), { concurrency: 5, }); - - knex.destroy(); - - return; } + if (argv.movie) { + await Promise.map(argv.movie, async url => scrapeRelease(url, null, false, true), { + concurrency: 5, + }); + } if (argv.scrape || argv.networks || argv.sites) { await scrapeSites(); - knex.destroy(); - - return; } if (argv.actors && argv.actors.length > 0) { await scrapeActors(); - knex.destroy(); - - return; } if (argv.actors) { await scrapeBasicActors(); - knex.destroy(); + } + if (argv.server) { + await initServer(); return; } - await initServer(); + knex.destroy(); } module.exports = init; diff --git a/src/argv.js b/src/argv.js index 239bc2a2..ca46ca5e 100644 --- a/src/argv.js +++ b/src/argv.js @@ -5,6 +5,11 @@ const yargs = require('yargs'); const { argv } = yargs .command('npm start') + .option('server', { + describe: 'Start web server', + type: 'boolean', + alias: 'web', + }) .option('scrape', { describe: 'Scrape sites and networks defined in configuration', type: 'boolean', @@ -24,6 +29,16 @@ const { argv } = yargs type: 'array', alias: 'actor', }) + .option('scene', { + describe: 'Scrape scene info from URL', + type: 'array', + alias: 'release', + }) + .option('movie', { + describe: 'Scrape movie info from URL', + type: 'array', + alias: 'dvd', + }) .option('sources', { describe: 'Use these scrapers for actor data', type: 'array', @@ -39,11 +54,6 @@ const { argv } = yargs type: 'boolean', alias: 'force', }) - .option('url', { - describe: 'Scrape scene info from URL', - type: 'array', - alias: 'fetch', - }) .option('after', { describe: 'Don\'t fetch scenes older than', type: 'string', diff --git a/src/media.js b/src/media.js index 45a79aa3..ed3174b6 100644 --- a/src/media.js +++ b/src/media.js @@ -19,7 +19,7 @@ function getHash(buffer) { return hash.digest('hex'); } -function pluckPhotos(photos, release, specifiedLimit) { +function 
pluckPhotos(photos, specifiedLimit) { const limit = specifiedLimit || config.media.limit; if (photos.length <= limit) { @@ -34,7 +34,7 @@ function pluckPhotos(photos, release, specifiedLimit) { return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close } -async function getThumbnail(buffer) { +async function createThumbnail(buffer) { return sharp(buffer) .resize({ height: config.media.thumbnailSize, @@ -43,25 +43,14 @@ async function getThumbnail(buffer) { .toBuffer(); } -async function createReleaseMediaDirectory(release, releaseId) { - if (release.poster || (release.photos && release.photos.length) || release.trailer) { - await fs.mkdir( - path.join(config.media.path, 'releases', release.site.network.slug, release.site.slug, releaseId.toString()), - { recursive: true }, - ); - } +async function createMediaDirectory(domain, subpath) { + const filepath = path.join(config.media.path, domain, subpath); + + await fs.mkdir(filepath, { recursive: true }); + return filepath; } -async function createActorMediaDirectory(profile, actor) { - if (profile.avatars && profile.avatars.length) { - await fs.mkdir( - path.join(config.media.path, 'actors', actor.slug), - { recursive: true }, - ); - } -} - -function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId, setAvatar = false) { +function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId) { return files.map((file, index) => ({ path: file.filepath, thumbnail: file.thumbpath, @@ -71,7 +60,7 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId index, domain, target_id: targetId, - role: setAvatar && index === 0 ? 
'avatar' : role, + role: file.role || role, })); } @@ -79,8 +68,8 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) { const photoSourceEntries = await knex('media') .whereIn('source', photos.flat()) - .whereIn('domain', [].concat(domains)) - .whereIn('role', [].concat(roles)); // accept string argument + .whereIn('domain', domains) + .whereIn('role', roles); // accept string argument const photoSources = new Set(photoSourceEntries.map(photo => photo.source)); const newPhotos = photos.filter(source => (Array.isArray(source) // fallbacks provided? @@ -156,18 +145,22 @@ async function fetchPhoto(photoUrl, index, identifier, attempt = 1) { } } -async function savePhotos(files, release, releaseId, actorSlug, isPoster = false) { +async function savePhotos(files, { + domain = 'releases', + subpath, + role = 'photo', + naming = 'index', +}) { return Promise.map(files, async (file, index) => { const timestamp = new Date().getTime(); - const thumbnail = await getThumbnail(file.photo); + const thumbnail = await createThumbnail(file.photo); - const filepath = actorSlug - ? path.join('actors', actorSlug, `${timestamp + index}.${file.extension}`) - : path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}.${file.extension}`); + const filename = naming === 'index' + ? `${file.role || role}-${index}` + : `${timestamp + index}`; - const thumbpath = actorSlug - ? path.join('actors', actorSlug, `${timestamp + index}_thumb.${file.extension}`) - : path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 
'poster' : index + 1}_thumb.${file.extension}`); +    const filepath = path.join(domain, subpath, `${filename}.${file.extension}`); +    const thumbpath = path.join(domain, subpath, `${filename}_thumb.${file.extension}`); await Promise.all([ fs.writeFile(path.join(config.media.path, filepath), file.photo), @@ -183,75 +176,85 @@ async function savePhotos(files, release, releaseId, actorSlug, isPoster = false }); } -async function storePoster(release, releaseId) { - if (!release.poster) { - console.warn(`No poster available for (${release.site.name}, ${releaseId}}) "${release.title}"`); - return; - } - const [newPoster] = await filterSourceDuplicates([release.poster], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`); - - if (!newPoster) return; - - console.log(`Fetching poster for (${release.site.name}, ${releaseId}) "${release.title}"`); - - const metaFile = await fetchPhoto(release.poster, null, `(${release.site.name}, ${releaseId}) "${release.title}"`); - const [uniquePoster] = await filterHashDuplicates([metaFile], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`); - - if (!uniquePoster) return; - - const savedPosters = await savePhotos([uniquePoster], release, releaseId, null, true); - - await knex('media').insert(curatePhotoEntries(savedPosters, 'releases', 'poster', releaseId)); -} - - -async function storePhotos(release, releaseId) { - if (!release.photos || release.photos.length === 0) { - console.warn(`No photos available for (${release.site.name}, ${releaseId}) "${release.title}"`); +async function storePhotos(photos, { + domain = 'releases', + role = 'photo', + naming = 'index', + targetId, + subpath, + primaryRole, // role to assign to first photo if not already in database, used mainly for avatars +}, identifier) { + if (!photos || photos.length === 0) { + console.warn(`No ${role}s available for ${identifier}`); return; } - const pluckedPhotos = pluckPhotos(release.photos, release); - const
newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`); + const pluckedPhotos = pluckPhotos(photos); + const roles = primaryRole ? [role, primaryRole] : [role]; + + const newPhotos = await filterSourceDuplicates(pluckedPhotos, [domain], roles, identifier); if (newPhotos.length === 0) return; - console.log(`Fetching ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`); + console.log(`Fetching ${newPhotos.length} ${role}s for ${identifier}`); - const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, `(${release.site.name}, ${releaseId}) "${release.title}"`), { + const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, identifier), { concurrency: 10, }).filter(photo => photo); - console.log(metaFiles); + const [uniquePhotos, primaryPhoto] = await Promise.all([ + filterHashDuplicates(metaFiles, [domain], roles, identifier), + primaryRole + ? 
await knex('media') + .where('domain', domain) + .where('target_id', targetId) + .where('role', primaryRole) + .first() + : null, + ]); - const uniquePhotos = await filterHashDuplicates(metaFiles, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`); - const savedPhotos = await savePhotos(uniquePhotos, release, releaseId); + if (primaryRole && !primaryPhoto) { + uniquePhotos[0].role = primaryRole; + } - const curatedPhotoEntries = curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId); + const savedPhotos = await savePhotos(uniquePhotos, { + domain, + role, + targetId, + subpath, + naming, + }); + + const curatedPhotoEntries = curatePhotoEntries(savedPhotos, domain, role, targetId); await knex('media').insert(curatedPhotoEntries); - console.log(`Stored ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`); + console.log(`Stored ${newPhotos.length} ${role}s for ${identifier}`); } -async function storeTrailer(release, releaseId) { +async function storeTrailer(trailers, { + domain = 'releases', + role = 'trailer', + targetId, + subpath, +}, identifier) { // support scrapers supplying multiple qualities - const trailer = Array.isArray(release.trailer) - ? (release.trailer.find(trailerX => [1080, 720].includes(trailerX.quality) || release.trailer[0])) - : release.trailer; + const trailer = Array.isArray(trailers) + ? 
trailers.find(trailerX => [1080, 720].includes(trailerX.quality)) || trailers[0] + : trailers; if (!trailer || !trailer.src) { - console.warn(`No trailer available for (${release.site.name}, ${releaseId}}) "${release.title}"`); + console.warn(`No trailer available for ${identifier}`); return; } - console.log(`Storing trailer for (${release.site.name}, ${releaseId}) "${release.title}"`); + console.log(`Storing trailer for ${identifier}`); const { pathname } = new URL(trailer.src); const mimetype = trailer.type || mime.getType(pathname); const res = await bhttp.get(trailer.src); - const filepath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `trailer${trailer.quality ? `_${trailer.quality}` : ''}.${mime.getExtension(mimetype)}`); + const filepath = path.join('releases', subpath, `trailer${trailer.quality ? `_${trailer.quality}` : ''}.${mime.getExtension(mimetype)}`); await Promise.all([ fs.writeFile(path.join(config.media.path, filepath), res.body), @@ -259,49 +262,24 @@ async function storeTrailer(release, releaseId) { path: filepath, mime: mimetype, source: trailer.src, - domain: 'releases', - target_id: releaseId, - role: 'trailer', + domain, + target_id: targetId, + role, quality: trailer.quality || null, }), ]); } -async function storeAvatars(profile, actor) { - if (!profile.avatars || profile.avatars.length === 0) { - console.warn(`No avatars available for '${profile.name}'`); - return; - } - - const newPhotos = await filterSourceDuplicates(profile.avatars, 'actors', ['avatar', 'photo'], actor.name); - - if (newPhotos.length === 0) return; - - console.log(`Fetching ${newPhotos.length} avatars for '${actor.name}'`); - - const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, actor.name), { - concurrency: 10, - }).filter(photo => photo); - - const uniquePhotos = await filterHashDuplicates(metaFiles, 'actors', ['avatar', 'photo'], actor.name); - const [savedPhotos, 
avatarEntry] = await Promise.all([ - savePhotos(uniquePhotos, null, null, actor.slug), - knex('media').where({ - target_id: actor.id, - domain: 'actors', - role: 'avatar', - }).first(), - ]); - - // if no avatar entry is present, curatePhotoEntries will store the first photo as avatar - await knex('media').insert(curatePhotoEntries(savedPhotos, 'actors', 'photo', actor.id, !avatarEntry)); +async function findAvatar(actorId, domain = 'actors') { + return knex('media') + .where('domain', domain) + .where('target_id', actorId) + .where('role', 'avatar'); } module.exports = { - createActorMediaDirectory, - createReleaseMediaDirectory, - storeAvatars, - storePoster, + createMediaDirectory, + findAvatar, storePhotos, storeTrailer, }; diff --git a/src/releases.js b/src/releases.js index 25d108ed..620b7343 100644 --- a/src/releases.js +++ b/src/releases.js @@ -9,8 +9,7 @@ const whereOr = require('./utils/where-or'); const { associateTags } = require('./tags'); const { associateActors } = require('./actors'); const { - createReleaseMediaDirectory, - storePoster, + createMediaDirectory, storePhotos, storeTrailer, } = require('./media'); @@ -244,13 +243,27 @@ async function fetchTagReleases(queryObject, options = {}) { } async function storeReleaseAssets(release, releaseId) { - await createReleaseMediaDirectory(release, releaseId); + const subpath = `${release.site.network.slug}/${release.site.slug}/${release.id}/`; + + await createMediaDirectory('releases', subpath); + + console.log(release.poster); try { await Promise.all([ - storePhotos(release, releaseId), - storePoster(release, releaseId), - storeTrailer(release, releaseId), + storePhotos(release.photos, { + targetId: releaseId, + subpath, + }), + storePhotos([release.poster], { + role: 'poster', + targetId: releaseId, + subpath, + }), + storeTrailer(release.trailer, { + targetId: releaseId, + subpath, + }), ]); } catch (error) { console.log(release.url, error); diff --git a/src/scrape-release.js b/src/scrape-release.js 
index c39641a0..fecd4d55 100644 --- a/src/scrape-release.js +++ b/src/scrape-release.js @@ -28,7 +28,7 @@ async function findSite(url, release) { return null; } -async function scrapeRelease(url, release, deep = false) { +async function scrapeRelease(url, release, deep = false, isMovie = false) { const site = await findSite(url, release); if (!site) { @@ -41,22 +41,28 @@ async function scrapeRelease(url, release, deep = false) { throw new Error('Could not find scraper for URL'); } - if (!scraper.fetchScene) { - throw new Error(`The '${site.name}'-scraper cannot fetch individual releases`); + if (!isMovie && !scraper.fetchScene) { + throw new Error(`The '${site.name}'-scraper cannot fetch individual scenes`); } - const scene = await scraper.fetchScene(url, site, release); + if (isMovie && !scraper.fetchMovie) { + throw new Error(`The '${site.name}'-scraper cannot fetch individual movies`); + } + + const scrapedRelease = isMovie + ? await scraper.fetchMovie(url, site, release) + : await scraper.fetchScene(url, site, release); if (!deep && argv.save) { // don't store release when called by site scraper - const [storedRelease] = await storeReleases([scene]); + const [storedRelease] = await storeReleases([scrapedRelease]); if (storedRelease) { console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`); } } - return scene; + return scrapedRelease; } module.exports = scrapeRelease; diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index 76b2dfa0..b38e4bef 100644 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -204,6 +204,21 @@ async function scrapeScene(html, url, site) { }; } +function scrapeMovie(html, url, site) { + const { document } = new JSDOM(html).window; + const movie = { url, site }; + + console.log(url); + + movie.entryId = document.querySelector('.dvd_details_overview .rating_box').dataset.id; + movie.title = document.querySelector('.title_bar span').textContent; + movie.covers = 
Array.from(document.querySelectorAll('#dvd-cover-flip > a'), el => el.href); + movie.channel = document.querySelector('.update_date a').textContent; + movie.date = new Date(); + + return movie; +} + function scrapeProfile(html, url, actorName) { const { document } = new JSDOM(html).window; @@ -257,6 +272,12 @@ async function fetchScene(url, site) { return scrapeScene(res.body.toString(), url, site); } +async function fetchMovie(url, site) { + const res = await bhttp.get(url); + + return scrapeMovie(res.body.toString(), url, site); +} + async function fetchProfile(actorName) { const actorSlugA = actorName.toLowerCase().replace(/\s+/g, '-'); const actorSlugB = actorName.toLowerCase().replace(/\s+/g, ''); @@ -285,6 +306,7 @@ async function fetchProfile(actorName) { module.exports = { fetchLatest, + fetchMovie, fetchProfile, fetchUpcoming, fetchScene,