diff --git a/src/media.js b/src/media.js
index becf7540..a5f98ba8 100644
--- a/src/media.js
+++ b/src/media.js
@@ -46,109 +46,152 @@ async function createActorMediaDirectory(profile, actor) {
   }
 }
 
+function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId, setAvatar = false) {
+  return files.map((file, index) => ({
+    path: file.filepath,
+    thumbnail: file.thumbpath,
+    mime: file.mimetype,
+    hash: file.hash,
+    source: file.source,
+    index,
+    domain,
+    target_id: targetId,
+    role: setAvatar && index === 0 ? 'avatar' : role,
+  }));
+}
+
+// before fetching
+async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) {
+  const photoSourceEntries = await knex('media')
+    .whereIn('source', photos)
+    .whereIn('domain', [].concat(domains))
+    .whereIn('role', [].concat(roles)); // accept string argument
+
+  const photoSources = new Set(photoSourceEntries.map(photo => photo.source));
+  const newPhotos = photos.filter(source => !photoSources.has(source));
+
+  if (photoSourceEntries.length > 0) {
+    console.log(`Ignoring ${photoSourceEntries.length} ${roles} items already present by source for ${identifier}`);
+  }
+
+  return newPhotos;
+}
+
+// after fetching
+async function filterHashDuplicates(files, domains = ['releases'], roles = ['photo'], identifier) {
+  const photoHashEntries = await knex('media')
+    .whereIn('hash', files.map(file => file.hash))
+    .whereIn('domain', [].concat(domains))
+    .whereIn('role', [].concat(roles)); // accept string argument
+
+  const photoHashes = new Set(photoHashEntries.map(entry => entry.hash));
+
+  if (photoHashEntries.length > 0) {
+    console.log(`Ignoring ${photoHashEntries.length} ${roles} items already present by hash for ${identifier}`);
+  }
+
+  return files.filter(file => file && !photoHashes.has(file.hash));
+}
+
+async function fetchPhoto(photoUrl, index, identifier) {
+  const { pathname } = new URL(photoUrl);
+  const mimetype = mime.getType(pathname);
+
+  try {
+    const res = await bhttp.get(photoUrl);
+
+    if (res.statusCode === 200) {
+      const extension = mime.getExtension(mimetype);
+      const hash = getHash(res.body);
+
+      return {
+        photo: res.body,
+        mimetype,
+        extension,
+        hash,
+        source: photoUrl,
+      };
+    }
+
+    throw new Error(`Response ${res.statusCode} not OK`);
+  } catch (error) {
+    console.warn(`Failed to store photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+
+    return null;
+  }
+}
+
+async function savePhotos(files, release, releaseId, actorSlug, isPoster = false) {
+  return Promise.map(files, async (file, index) => {
+    const timestamp = new Date().getTime();
+    const thumbnail = await getThumbnail(file.photo);
+
+    const filepath = actorSlug
+      ? path.join('actors', actorSlug, `${timestamp + index}.${file.extension}`)
+      : path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}.${file.extension}`);
+
+    const thumbpath = actorSlug
+      ? path.join('actors', actorSlug, `${timestamp + index}_thumb.${file.extension}`)
+      : path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${isPoster ? 'poster' : index + 1}_thumb.${file.extension}`);
+
+    await Promise.all([
+      fs.writeFile(path.join(config.media.path, filepath), file.photo),
+      fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
+    ]);
+
+    return {
+      ...file,
+      thumbnail,
+      filepath,
+      thumbpath,
+    };
+  });
+}
+
 async function storePoster(release, releaseId) {
   if (!release.poster) {
     console.warn(`No poster available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
     return;
   }
 
+  const [newPoster] = await filterSourceDuplicates([release.poster], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`);
-  console.log(`Storing poster for (${release.site.name}, ${releaseId}) "${release.title}"`);
+  if (!newPoster) return;
 
-  const res = await bhttp.get(release.poster);
+  console.log(`Fetching poster for (${release.site.name}, ${releaseId}) "${release.title}"`);
 
-  if (res.statusCode === 200) {
-    const thumbnail = await getThumbnail(res.body);
+  const metaFile = await fetchPhoto(release.poster, null, `(${release.site.name}, ${releaseId}) "${release.title}"`);
+  const [uniquePoster] = await filterHashDuplicates([metaFile], 'releases', 'poster', `(${release.site.name}, ${releaseId}) "${release.title}"`);
 
-    const { pathname } = new URL(release.poster);
-    const mimetype = res.headers['content-type'] || mime.getType(pathname) || 'image/jpeg';
-    const extension = mime.getExtension(mimetype);
+  if (!uniquePoster) return;
 
-    const filepath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `poster.${extension}`);
-    const thumbpath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `poster_thumb.${extension}`);
-    const hash = getHash(res.body);
+  const savedPosters = await savePhotos([uniquePoster], release, releaseId, null, true);
 
-    await Promise.all([
-      fs.writeFile(path.join(config.media.path, filepath), res.body),
-      fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
-    ]);
-
-    await knex('media').insert({
-      path: filepath,
-      thumbnail: thumbpath,
-      mime: mimetype,
-      hash,
-      source: release.poster,
-      domain: 'releases',
-      target_id: releaseId,
-      role: 'poster',
-    });
-
-    return;
-  }
-
-  console.warn(`Failed to store poster for (${release.site.name}, ${releaseId}) "${release.title}": ${res.statusCode}`);
+  await knex('media').insert(curatePhotoEntries(savedPosters, 'releases', 'poster', releaseId));
 }
 
+
 async function storePhotos(release, releaseId) {
   if (!release.photos || release.photos.length === 0) {
-    console.warn(`No photos available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
+    console.warn(`No photos available for (${release.site.name}, ${releaseId}) "${release.title}"`);
     return;
   }
 
-  console.log(`Storing ${release.photos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`);
+  const newPhotos = await filterSourceDuplicates(release.photos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
 
-  const files = await Promise.map(release.photos, async (photoUrl, index) => {
-    const { pathname } = new URL(photoUrl);
-    const mimetype = mime.getType(pathname);
+  if (newPhotos.length === 0) return;
 
-    try {
-      const res = await bhttp.get(photoUrl);
+  console.log(`Fetching ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`);
 
-      if (res.statusCode === 200) {
-        const thumbnail = await getThumbnail(res.body);
-        const extension = mime.getExtension(mimetype);
-
-        const filepath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${index + 1}.${extension}`);
-        const thumbpath = path.join('releases', release.site.network.slug, release.site.slug, releaseId.toString(), `${index + 1}_thumb.${extension}`);
-        const hash = getHash(res.body);
-
-        await Promise.all([
-          fs.writeFile(path.join(config.media.path, filepath), res.body),
-          fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
-        ]);
-
-        return {
-          filepath,
-          thumbpath,
-          mimetype,
-          hash,
-          source: photoUrl,
-        };
-      }
-
-      throw new Error(`Response ${res.statusCode} not OK`);
-    } catch (error) {
-      console.warn(`Failed to store photo ${index + 1} for "${release.title}" (${photoUrl}, ${release.url}, ${release.site.name}, ${releaseId}): ${error}`);
-
-      return null;
-    }
-  }, {
+  const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, `(${release.site.name}, ${releaseId}) "${release.title}"`), {
     concurrency: 10,
   });
 
-  await knex('media')
-    .insert(files.filter(file => file)
-      .map((file, index) => ({
-        path: file.filepath,
-        thumbnail: file.thumbpath,
-        mime: file.mimetype,
-        hash: file.hash,
-        source: file.source,
-        index,
-        domain: 'releases',
-        target_id: releaseId,
-        role: 'photo',
-      })));
+  const uniquePhotos = await filterHashDuplicates(metaFiles, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
+  const savedPhotos = await savePhotos(uniquePhotos, release, releaseId);
+
+  await knex('media').insert(curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId));
+
+  console.log(`Stored ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`);
 }
 
 async function storeTrailer(release, releaseId) {
@@ -185,69 +228,28 @@ async function storeAvatars(profile, actor) {
     return;
   }
 
-  console.log(`Storing ${profile.avatars.length} avatars for '${profile.name}'`);
+  const newPhotos = await filterSourceDuplicates(profile.avatars, 'actors', ['avatar', 'photo'], actor.name);
 
-  const files = await Promise.map(profile.avatars, async (avatarUrl, index) => {
-    try {
-      const { pathname } = new URL(avatarUrl);
-      const mimetype = mime.getType(pathname);
+  if (newPhotos.length === 0) return;
 
-      const res = await bhttp.get(avatarUrl);
+  console.log(`Fetching ${newPhotos.length} avatars for '${actor.name}'`);
 
-      if (res.statusCode === 200) {
-        const thumbnail = await getThumbnail(res.body);
-        const extension = mime.getExtension(mimetype);
-
-        const timestamp = new Date().getTime();
-
-        const filepath = path.join('actors', actor.slug, `${timestamp + index}.${extension}`);
-        const thumbpath = path.join('actors', actor.slug, `${timestamp + index}_thumb.${extension}`);
-        const hash = getHash(res.body);
-
-        await Promise.all([
-          fs.writeFile(path.join(config.media.path, filepath), res.body),
-          fs.writeFile(path.join(config.media.path, thumbpath), thumbnail),
-        ]);
-
-        return {
-          filepath,
-          thumbpath,
-          mimetype,
-          hash,
-          source: avatarUrl,
-        };
-      }
-
-      throw new Error(`Response ${res.statusCode} not OK`);
-    } catch (error) {
-      console.warn(`Failed to store avatar ${index + 1} for '${profile.name}': ${avatarUrl}`);
-
-      return null;
-    }
-  }, {
+  const metaFiles = await Promise.map(newPhotos, async (photoUrl, index) => fetchPhoto(photoUrl, index, actor.name), {
     concurrency: 10,
   });
 
-  const avatars = files.filter(file => file);
-
-  const existingAvatars = await knex('media')
-    .whereIn('hash', avatars.map(file => file.hash));
-
-  const newAvatars = avatars.filter(file => !existingAvatars.some(avatar => file.hash === avatar.hash));
-  const hasAvatar = existingAvatars.some(avatar => avatar.role === 'avatar');
-
-  await knex('media')
-    .insert(newAvatars.map((file, index) => ({
-      path: file.filepath,
-      thumbnail: file.thumbpath,
-      mime: file.mimetype,
-      hash: file.hash,
-      source: file.source,
-      index,
-      domain: 'actors',
+  const uniquePhotos = await filterHashDuplicates(metaFiles, 'actors', ['avatar', 'photo'], actor.name);
+  const [savedPhotos, avatarEntry] = await Promise.all([
+    savePhotos(uniquePhotos, null, null, actor.slug),
+    knex('media').where({
       target_id: actor.id,
-      role: index === 0 && !hasAvatar ? 'avatar' : 'photo',
-    })));
+      domain: 'actors',
+      role: 'avatar',
+    }).first(),
+  ]);
+
+  // if no avatar entry is present, curatePhotoEntries will store the first photo as avatar
+  await knex('media').insert(curatePhotoEntries(savedPhotos, 'actors', 'photo', actor.id, !avatarEntry));
 }
 
 module.exports = {
diff --git a/src/releases.js b/src/releases.js
index be802527..e5b17fe5 100644
--- a/src/releases.js
+++ b/src/releases.js
@@ -225,13 +225,13 @@ async function storeReleaseAssets(release, releaseId) {
 
   try {
     await Promise.all([
-      associateTags(release, releaseId),
+      // associateTags(release, releaseId),
      storePhotos(release, releaseId),
      storePoster(release, releaseId),
      storeTrailer(release, releaseId),
    ]);
  } catch (error) {
-    console.log(release, error);
+    console.log(release.url, error);
  }
 }
 
diff --git a/src/scrape-release.js b/src/scrape-release.js
index 5537358d..74747dfb 100644
--- a/src/scrape-release.js
+++ b/src/scrape-release.js
@@ -31,8 +31,6 @@ async function findSite(url, release) {
 async function scrapeRelease(url, release, deep = false) {
   const site = await findSite(url, release);
 
-  console.log(url, site);
-
   if (!site) {
     throw new Error('Could not find site in database');
   }
diff --git a/src/tags.js b/src/tags.js
index da8cfa4a..9f0cd0a7 100644
--- a/src/tags.js
+++ b/src/tags.js
@@ -66,14 +66,10 @@ async function associateTags(release, releaseId) {
     ? await matchTags(release.tags) // scraper returned raw tags
     : release.tags; // tags already matched by scraper
 
-  try {
-    await knex('tags_associated').insert(tags.map(tagId => ({
-      tag_id: tagId,
-      release_id: releaseId,
-    })));
-  } catch (error) {
-    console.log(release, error);
-  }
+  await knex('tags_associated').insert(tags.map(tagId => ({
+    tag_id: tagId,
+    release_id: releaseId,
+  })));
 }
 
 async function fetchTags(queryObject, groupsQueryObject, limit = 100) {