Filtering invalid actors from releases before storing.

This commit is contained in:
DebaucheryLibrarian 2020-10-29 16:06:20 +01:00
parent 2801732f57
commit c37d4ad01f
5 changed files with 38 additions and 27 deletions

View File

@ -114,7 +114,13 @@ function getAverage(items) {
} }
function toBaseActors(actorsOrNames, release) { function toBaseActors(actorsOrNames, release) {
return actorsOrNames.map((actorOrName) => { if (!actorsOrNames) {
return [];
}
const baseActors = actorsOrNames
.filter(actorOrName => actorOrName && (typeof actorOrName === 'string' || actorOrName.name))
.map((actorOrName) => {
const [baseName, entryId] = (actorOrName.name || actorOrName).split(':'); const [baseName, entryId] = (actorOrName.name || actorOrName).split(':');
const name = capitalize(baseName); const name = capitalize(baseName);
@ -137,6 +143,8 @@ function toBaseActors(actorsOrNames, release) {
return baseActor; return baseActor;
}); });
return baseActors;
} }
function curateActor(actor, withDetails = false, isProfile = false) { function curateActor(actor, withDetails = false, isProfile = false) {
@ -832,4 +840,5 @@ module.exports = {
fetchActor, fetchActor,
scrapeActors, scrapeActors,
searchActors, searchActors,
toBaseActors,
}; };

View File

@ -30,20 +30,20 @@ const { argv } = yargs
type: 'boolean', type: 'boolean',
alias: 'web', alias: 'web',
}) })
.option('networks', { .option('include-networks', {
describe: 'Network to scrape all channels from (overrides configuration)', describe: 'Network to scrape all channels from (overrides configuration)',
type: 'array', type: 'array',
alias: 'network', alias: ['include-network', 'networks', 'network'],
}) })
.option('exclude-networks', { .option('exclude-networks', {
describe: 'Network not to scrape any channels from (overrides configuration)', describe: 'Network not to scrape any channels from (overrides configuration)',
type: 'array', type: 'array',
alias: 'exclude-network', alias: 'exclude-network',
}) })
.option('channels', { .option('include-channels', {
describe: 'Channel to scrape (overrides configuration)', describe: 'Channel to scrape (overrides configuration)',
type: 'array', type: 'array',
alias: 'channel', alias: ['include-channel', 'channels', 'channel'],
}) })
.option('exclude-channels', { .option('exclude-channels', {
describe: 'Channel not to scrape (overrides configuration)', describe: 'Channel not to scrape (overrides configuration)',

View File

@ -64,8 +64,8 @@ async function fetchIncludedEntities() {
includeAll: !argv.networks && !argv.channels && !config.include?.networks && !config.include?.channels, includeAll: !argv.networks && !argv.channels && !config.include?.networks && !config.include?.channels,
includedNetworks: argv.networks || (!argv.channels && config.include?.networks) || [], includedNetworks: argv.networks || (!argv.channels && config.include?.networks) || [],
includedChannels: argv.channels || (!argv.networks && config.include?.channels) || [], includedChannels: argv.channels || (!argv.networks && config.include?.channels) || [],
excludedNetworks: argv.excludeNetworks || config.exclude?.networks || [], excludedNetworks: argv.excludeNetworks || config.exclude?.networks.filter(network => !argv.networks?.includes(network)) || [], // ignore explicitly included networks
excludedChannels: argv.excludeChannels || config.exclude?.channels || [], excludedChannels: argv.excludeChannels || config.exclude?.channels.filter(channel => !argv.channels?.includes(channel)) || [], // ignore explicitly included channels
}; };
const rawNetworks = await knex.raw(` const rawNetworks = await knex.raw(`

View File

@ -258,7 +258,8 @@ async function fetchLatest(entity, page, options) {
.limit(faker.random.number({ min: 2, max: 15 })) .limit(faker.random.number({ min: 2, max: 15 }))
.pluck('name'); .pluck('name');
release.actors = actors(release); // release.actors = actors(release);
release.actors = [null, 'Charles Darwin'];
release.title = title(release); release.title = title(release);
return release; return release;

View File

@ -9,7 +9,7 @@ const slugify = require('./utils/slugify');
const bulkInsert = require('./utils/bulk-insert'); const bulkInsert = require('./utils/bulk-insert');
const resolvePlace = require('./utils/resolve-place'); const resolvePlace = require('./utils/resolve-place');
const { formatDate } = require('./utils/qu'); const { formatDate } = require('./utils/qu');
const { associateActors, scrapeActors } = require('./actors'); const { associateActors, scrapeActors, toBaseActors } = require('./actors');
const { associateReleaseTags } = require('./tags'); const { associateReleaseTags } = require('./tags');
const { curateEntity } = require('./entities'); const { curateEntity } = require('./entities');
const { associateReleaseMedia } = require('./media'); const { associateReleaseMedia } = require('./media');
@ -291,7 +291,8 @@ async function storeScenes(releases) {
const [batchId] = await knex('batches').insert({ comment: null }).returning('id'); const [batchId] = await knex('batches').insert({ comment: null }).returning('id');
const releasesWithChannels = await attachChannelEntities(releases); const releasesWithChannels = await attachChannelEntities(releases);
const releasesWithStudios = await attachStudios(releasesWithChannels); const releasesWithBaseActors = releasesWithChannels.map(release => ({ ...release, actors: toBaseActors(release.actors) }));
const releasesWithStudios = await attachStudios(releasesWithBaseActors);
// uniqueness is entity ID + entry ID, filter uniques after adding entities // uniqueness is entity ID + entry ID, filter uniques after adding entities
const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios); const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);