From 0b819713b5af2f864ee0ebdde29f09cfa73eb1fc Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Thu, 12 Dec 2019 04:04:35 +0100 Subject: [PATCH] Fixed off-by-one in photo plucker. Fixed source duplicate photo function not handling fallback sources. --- package-lock.json | 22 +++++++++++++++++----- package.json | 2 +- src/media.js | 15 +++++++++------ src/scrapers/julesjordan.js | 6 +++--- src/utils/pluck-photos.js | 21 --------------------- 5 files changed, 30 insertions(+), 36 deletions(-) delete mode 100644 src/utils/pluck-photos.js diff --git a/package-lock.json b/package-lock.json index 2ee375e0..ef3747bf 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1992,6 +1992,17 @@ "resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz", "integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg=" }, + "array.prototype.flat": { + "version": "1.2.2", + "resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.2.2.tgz", + "integrity": "sha512-VXjh7lAL4KXKF2hY4FnEW9eRW6IhdvFW1sN/JwLbmECbCgACCnBHNyP3lFiYuttr0jxRN9Bsc5+G27dMseSWqQ==", + "dev": true, + "requires": { + "define-properties": "^1.1.3", + "es-abstract": "^1.15.0", + "function-bind": "^1.1.1" + } + }, "asn1": { "version": "0.2.4", "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz", @@ -4534,22 +4545,23 @@ "dev": true }, "eslint-plugin-import": { - "version": "2.18.2", - "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.18.2.tgz", - "integrity": "sha512-5ohpsHAiUBRNaBWAF08izwUGlbrJoJJ+W9/TBwsGoR1MnlgfwMIKrFeSjWbt6moabiXW9xNvtFz+97KHRfI4HQ==", + "version": "2.19.1", + "resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.19.1.tgz", + "integrity": "sha512-x68131aKoCZlCae7rDXKSAQmbT5DQuManyXo2sK6fJJ0aK5CWAkv6A6HJZGgqC8IhjQxYPgo6/IY4Oz8AFsbBw==", "dev": true, "requires": { "array-includes": "^3.0.3", + "array.prototype.flat": "^1.2.1", "contains-path": "^0.1.0", "debug": "^2.6.9", "doctrine": "1.5.0", "eslint-import-resolver-node": "^0.3.2", - "eslint-module-utils": "^2.4.0", + "eslint-module-utils": "^2.4.1", "has": "^1.0.3", "minimatch": "^3.0.4", "object.values": "^1.1.0", "read-pkg-up": "^2.0.0", - "resolve": "^1.11.0" + "resolve": "^1.12.0" }, "dependencies": { "doctrine": { diff --git a/package.json b/package.json index 58169e82..12d276d1 100644 --- a/package.json +++ b/package.json @@ -50,7 +50,7 @@ "eslint-config-airbnb": "^17.1.1", "eslint-config-airbnb-base": "^13.2.0", "eslint-loader": "^2.2.1", - "eslint-plugin-import": "^2.18.2", + "eslint-plugin-import": "^2.19.1", "eslint-plugin-jsx-a11y": "^6.2.3", "eslint-plugin-react": "^7.17.0", "eslint-plugin-vue": "^6.0.1", diff --git a/src/media.js b/src/media.js index f555711a..aae974c0 100644 --- a/src/media.js +++ b/src/media.js @@ -28,7 +28,7 @@ function pluckPhotos(photos, release, specifiedLimit) { const plucked = [1] .concat( - Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))), + Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (photos.length / (limit - 1)))), ); return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close @@ -78,12 +78,14 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId // before fetching async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) { const photoSourceEntries = await knex('media') - .whereIn('source', photos) + .whereIn('source', photos.flat()) .whereIn('domain', [].concat(domains)) .whereIn('role', [].concat(roles)); // accept string argument const photoSources = new Set(photoSourceEntries.map(photo => photo.source)); - const newPhotos = photos.filter(source => !photoSources.has(source)); + const newPhotos = photos.filter(source => (Array.isArray(source) // fallbacks provided? + ? !source.some(sourceX => photoSources.has(sourceX)) // ensure none of the sources match + : !photoSources.has(source))); if (photoSourceEntries.length > 0) { console.log(`Ignoring ${photoSourceEntries.length} ${roles} items already present by source for ${identifier}`); @@ -135,7 +137,7 @@ async function fetchPhoto(photoUrl, index, identifier, attempt = 1) { throw new Error(`Response ${res.statusCode} not OK`); } catch (error) { - console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`); + console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} for ${identifier} (${photoUrl}): ${error}`); if (attempt < 3) { await Promise.delay(1000); @@ -202,7 +204,6 @@ async function storePhotos(release, releaseId) { } const pluckedPhotos = pluckPhotos(release.photos, release); - const newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`); if (newPhotos.length === 0) return; @@ -216,7 +217,9 @@ async function storePhotos(release, releaseId) { const uniquePhotos = await filterHashDuplicates(metaFiles, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`); const savedPhotos = await savePhotos(uniquePhotos, release, releaseId); - await knex('media').insert(curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId)); + const curatedPhotoEntries = curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId); + + await knex('media').insert(curatedPhotoEntries); console.log(`Stored ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`); } diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index a7fd5a85..37047d34 100644 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -18,13 +18,13 @@ function scrapePhotos(html) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const photos = $('.photo_gallery_thumbnail_wrapper .thumbs') - .map((photoIndex, photoElement) => { + .toArray() + .map((photoElement) => { const src = $(photoElement).attr('src'); // high res often available in photos/ directory, but not always, provide original as fallback return [src.replace('thumbs/', 'photos/'), src]; - }) - .toArray(); + }); return photos; } diff --git a/src/utils/pluck-photos.js b/src/utils/pluck-photos.js deleted file mode 100644 index e6cb1a1c..00000000 --- a/src/utils/pluck-photos.js +++ /dev/null @@ -1,21 +0,0 @@ -'use strict'; - -const config = require('config'); - -// pick {photoLimit} photos evenly distributed photos from a set with {photoTotal} photos, return array of indexes starting at 1 -function pluckPhotos(photos, release, specifiedLimit) { - const limit = specifiedLimit || config.media.limit; - - if (photos.length <= limit) { - return photos; - } - - const plucked = [1] - .concat( - Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))), - ); - - return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close -} - -module.exports = pluckPhotos;