Fixed off-by-one in photo plucker. Fixed source duplicate photo function not handling fallback sources.

This commit is contained in:
ThePendulum 2019-12-12 04:04:35 +01:00
parent dbaf1a9a9c
commit 0b819713b5
5 changed files with 30 additions and 36 deletions

22
package-lock.json generated
View File

@ -1992,6 +1992,17 @@
"resolved": "https://registry.npmjs.org/array-unique/-/array-unique-0.3.2.tgz",
"integrity": "sha1-qJS3XUvE9s1nnvMkSp/Y9Gri1Cg="
},
"array.prototype.flat": {
"version": "1.2.2",
"resolved": "https://registry.npmjs.org/array.prototype.flat/-/array.prototype.flat-1.2.2.tgz",
"integrity": "sha512-VXjh7lAL4KXKF2hY4FnEW9eRW6IhdvFW1sN/JwLbmECbCgACCnBHNyP3lFiYuttr0jxRN9Bsc5+G27dMseSWqQ==",
"dev": true,
"requires": {
"define-properties": "^1.1.3",
"es-abstract": "^1.15.0",
"function-bind": "^1.1.1"
}
},
"asn1": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz",
@ -4534,22 +4545,23 @@
"dev": true
},
"eslint-plugin-import": {
"version": "2.18.2",
"resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.18.2.tgz",
"integrity": "sha512-5ohpsHAiUBRNaBWAF08izwUGlbrJoJJ+W9/TBwsGoR1MnlgfwMIKrFeSjWbt6moabiXW9xNvtFz+97KHRfI4HQ==",
"version": "2.19.1",
"resolved": "https://registry.npmjs.org/eslint-plugin-import/-/eslint-plugin-import-2.19.1.tgz",
"integrity": "sha512-x68131aKoCZlCae7rDXKSAQmbT5DQuManyXo2sK6fJJ0aK5CWAkv6A6HJZGgqC8IhjQxYPgo6/IY4Oz8AFsbBw==",
"dev": true,
"requires": {
"array-includes": "^3.0.3",
"array.prototype.flat": "^1.2.1",
"contains-path": "^0.1.0",
"debug": "^2.6.9",
"doctrine": "1.5.0",
"eslint-import-resolver-node": "^0.3.2",
"eslint-module-utils": "^2.4.0",
"eslint-module-utils": "^2.4.1",
"has": "^1.0.3",
"minimatch": "^3.0.4",
"object.values": "^1.1.0",
"read-pkg-up": "^2.0.0",
"resolve": "^1.11.0"
"resolve": "^1.12.0"
},
"dependencies": {
"doctrine": {

View File

@ -50,7 +50,7 @@
"eslint-config-airbnb": "^17.1.1",
"eslint-config-airbnb-base": "^13.2.0",
"eslint-loader": "^2.2.1",
"eslint-plugin-import": "^2.18.2",
"eslint-plugin-import": "^2.19.1",
"eslint-plugin-jsx-a11y": "^6.2.3",
"eslint-plugin-react": "^7.17.0",
"eslint-plugin-vue": "^6.0.1",

View File

@ -28,7 +28,7 @@ function pluckPhotos(photos, release, specifiedLimit) {
const plucked = [1]
.concat(
Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (photos.length / (limit - 1)))),
);
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
@ -78,12 +78,14 @@ function curatePhotoEntries(files, domain = 'releases', role = 'photo', targetId
// before fetching
async function filterSourceDuplicates(photos, domains = ['releases'], roles = ['photo'], identifier) {
const photoSourceEntries = await knex('media')
.whereIn('source', photos)
.whereIn('source', photos.flat())
.whereIn('domain', [].concat(domains))
.whereIn('role', [].concat(roles)); // accept string argument
const photoSources = new Set(photoSourceEntries.map(photo => photo.source));
const newPhotos = photos.filter(source => !photoSources.has(source));
const newPhotos = photos.filter(source => (Array.isArray(source) // fallbacks provided?
? !source.some(sourceX => photoSources.has(sourceX)) // ensure none of the sources match
: !photoSources.has(source)));
if (photoSourceEntries.length > 0) {
console.log(`Ignoring ${photoSourceEntries.length} ${roles} items already present by source for ${identifier}`);
@ -135,7 +137,7 @@ async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
throw new Error(`Response ${res.statusCode} not OK`);
} catch (error) {
console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} for ${identifier} (${photoUrl}): ${error}`);
if (attempt < 3) {
await Promise.delay(1000);
@ -202,7 +204,6 @@ async function storePhotos(release, releaseId) {
}
const pluckedPhotos = pluckPhotos(release.photos, release);
const newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
if (newPhotos.length === 0) return;
@ -216,7 +217,9 @@ async function storePhotos(release, releaseId) {
const uniquePhotos = await filterHashDuplicates(metaFiles, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
const savedPhotos = await savePhotos(uniquePhotos, release, releaseId);
await knex('media').insert(curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId));
const curatedPhotoEntries = curatePhotoEntries(savedPhotos, 'releases', 'photo', releaseId);
await knex('media').insert(curatedPhotoEntries);
console.log(`Stored ${newPhotos.length} photos for (${release.site.name}, ${releaseId}) "${release.title}"`);
}

View File

@ -18,13 +18,13 @@ function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = $('.photo_gallery_thumbnail_wrapper .thumbs')
.map((photoIndex, photoElement) => {
.toArray()
.map((photoElement) => {
const src = $(photoElement).attr('src');
// high res often available in photos/ directory, but not always, provide original as fallback
return [src.replace('thumbs/', 'photos/'), src];
})
.toArray();
});
return photos;
}

View File

@ -1,21 +0,0 @@
'use strict';
const config = require('config');
// pick {photoLimit} photos evenly distributed photos from a set with {photoTotal} photos, return array of indexes starting at 1
function pluckPhotos(photos, release, specifiedLimit) {
const limit = specifiedLimit || config.media.limit;
if (photos.length <= limit) {
return photos;
}
const plucked = [1]
.concat(
Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
);
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
}
module.exports = pluckPhotos;