From c37d4ad01f9b2b9bcf73bcb2a1cc41c14701b336 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Thu, 29 Oct 2020 16:06:20 +0100 Subject: [PATCH] Filtering invalid actors from releases before storing. --- src/actors.js | 45 +++++++++++++++++++++++++----------------- src/argv.js | 8 ++++---- src/entities.js | 4 ++-- src/scrapers/traxxx.js | 3 ++- src/store-releases.js | 5 +++-- 5 files changed, 38 insertions(+), 27 deletions(-) diff --git a/src/actors.js b/src/actors.js index 12479ffa..27e469f5 100644 --- a/src/actors.js +++ b/src/actors.js @@ -114,29 +114,37 @@ function getAverage(items) { } function toBaseActors(actorsOrNames, release) { - return actorsOrNames.map((actorOrName) => { - const [baseName, entryId] = (actorOrName.name || actorOrName).split(':'); + if (!actorsOrNames) { + return []; + } - const name = capitalize(baseName); - const slug = slugify(name); + const baseActors = actorsOrNames + .filter(actorOrName => actorOrName && (typeof actorOrName === 'string' || actorOrName.name)) + .map((actorOrName) => { + const [baseName, entryId] = (actorOrName.name || actorOrName).split(':'); - const baseActor = { - name, - slug, - entryId: entryId || actorOrName.entryId || null, - entity: release?.entity?.parent || release?.entity || null, - hasProfile: !!actorOrName.name, // actor contains profile information - }; + const name = capitalize(baseName); + const slug = slugify(name); - if (actorOrName.name) { - return { - ...actorOrName, - ...baseActor, + const baseActor = { + name, + slug, + entryId: entryId || actorOrName.entryId || null, + entity: release?.entity?.parent || release?.entity || null, + hasProfile: !!actorOrName.name, // actor contains profile information }; - } - return baseActor; - }); + if (actorOrName.name) { + return { + ...actorOrName, + ...baseActor, + }; + } + + return baseActor; + }); + + return baseActors; } function curateActor(actor, withDetails = false, isProfile = false) { @@ -832,4 +840,5 @@ module.exports = { fetchActor, scrapeActors, searchActors, + toBaseActors, }; diff --git a/src/argv.js b/src/argv.js index fc37fdc0..707948be 100644 --- a/src/argv.js +++ b/src/argv.js @@ -30,20 +30,20 @@ const { argv } = yargs type: 'boolean', alias: 'web', }) - .option('networks', { + .option('include-networks', { describe: 'Network to scrape all channels from (overrides configuration)', type: 'array', - alias: 'network', + alias: ['include-network', 'networks', 'network'], }) .option('exclude-networks', { describe: 'Network not to scrape any channels from (overrides configuration)', type: 'array', alias: 'exclude-network', }) - .option('channels', { + .option('include-channels', { describe: 'Channel to scrape (overrides configuration)', type: 'array', - alias: 'channel', + alias: ['include-channel', 'channels', 'channel'], }) .option('exclude-channels', { describe: 'Channel not to scrape (overrides configuration)', diff --git a/src/entities.js b/src/entities.js index 397e38ad..40379771 100644 --- a/src/entities.js +++ b/src/entities.js @@ -64,8 +64,8 @@ async function fetchIncludedEntities() { includeAll: !argv.networks && !argv.channels && !config.include?.networks && !config.include?.channels, includedNetworks: argv.networks || (!argv.channels && config.include?.networks) || [], includedChannels: argv.channels || (!argv.networks && config.include?.channels) || [], - excludedNetworks: argv.excludeNetworks || config.exclude?.networks || [], - excludedChannels: argv.excludeChannels || config.exclude?.channels || [], + excludedNetworks: argv.excludeNetworks || config.exclude?.networks.filter(network => !argv.networks?.includes(network)) || [], // ignore explicitly included networks + excludedChannels: argv.excludeChannels || config.exclude?.channels.filter(channel => !argv.channels?.includes(channel)) || [], // ignore explicitly included channels }; const rawNetworks = await knex.raw(` diff --git a/src/scrapers/traxxx.js b/src/scrapers/traxxx.js index da0b481b..41b824cc 100644 --- a/src/scrapers/traxxx.js +++ b/src/scrapers/traxxx.js @@ -258,7 +258,8 @@ async function fetchLatest(entity, page, options) { .limit(faker.random.number({ min: 2, max: 15 })) .pluck('name'); - release.actors = actors(release); + // release.actors = actors(release); + release.actors = [null, 'Charles Darwin']; release.title = title(release); return release; diff --git a/src/store-releases.js b/src/store-releases.js index 34a97c69..f51a3f35 100644 --- a/src/store-releases.js +++ b/src/store-releases.js @@ -9,7 +9,7 @@ const slugify = require('./utils/slugify'); const bulkInsert = require('./utils/bulk-insert'); const resolvePlace = require('./utils/resolve-place'); const { formatDate } = require('./utils/qu'); -const { associateActors, scrapeActors } = require('./actors'); +const { associateActors, scrapeActors, toBaseActors } = require('./actors'); const { associateReleaseTags } = require('./tags'); const { curateEntity } = require('./entities'); const { associateReleaseMedia } = require('./media'); @@ -291,7 +291,8 @@ async function storeScenes(releases) { const [batchId] = await knex('batches').insert({ comment: null }).returning('id'); const releasesWithChannels = await attachChannelEntities(releases); - const releasesWithStudios = await attachStudios(releasesWithChannels); + const releasesWithBaseActors = releasesWithChannels.map(release => ({ ...release, actors: toBaseActors(release.actors) })); + const releasesWithStudios = await attachStudios(releasesWithBaseActors); // uniqueness is entity ID + entry ID, filter uniques after adding entities const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);