From 67055bf9201cac2aae0be5206e1272f5ae6e79d4 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 16 Feb 2021 03:37:52 +0100 Subject: [PATCH] Improved actor entity and entry ID storage. --- assets/components/actors/tile.vue | 8 ++-- assets/js/fragments.js | 2 +- src/actors.js | 61 ++++++++++++++++++++++--------- src/scrapers/resolve.js | 2 +- src/scrapers/traxxx.js | 3 +- 5 files changed, 52 insertions(+), 24 deletions(-) diff --git a/assets/components/actors/tile.vue b/assets/components/actors/tile.vue index 0c7f3af5..5c0a45d7 100644 --- a/assets/components/actors/tile.vue +++ b/assets/components/actors/tile.vue @@ -16,13 +16,13 @@ >{{ actor.name }} diff --git a/assets/js/fragments.js b/assets/js/fragments.js index 197a555d..bc95747a 100644 --- a/assets/js/fragments.js +++ b/assets/js/fragments.js @@ -61,7 +61,7 @@ const actorFields = ` lazy } } - network: entity { + entity { id name slug diff --git a/src/actors.js b/src/actors.js index 474e1e56..c38394e1 100644 --- a/src/actors.js +++ b/src/actors.js @@ -180,6 +180,11 @@ function toBaseActors(actorsOrNames, release) { return baseActors; } +function getCollisionLikely(actor) { + // actor with single name + return actor.name.match(/\w+/g).length === 1; +} + function curateActor(actor, withDetails = false, isProfile = false) { if (!actor) { return null; @@ -260,11 +265,13 @@ function curateActor(actor, withDetails = false, isProfile = false) { } function curateActorEntry(baseActor, batchId) { + const collisionLikely = getCollisionLikely(baseActor); + return { name: baseActor.name, slug: baseActor.slug, - entity_id: null, - entry_id: baseActor.entryId, + entity_id: collisionLikely ? baseActor.entity.id : null, + entry_id: collisionLikely ? baseActor.entryId : null, batch_id: batchId, }; } @@ -641,6 +648,11 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy const scraper = scrapers[scraperSlug]; const layoutScraper = resolveLayoutScraper(entity, scraper); + if (!layoutScraper?.fetchProfile) { + logger.warn(`No profile profile scraper available for ${scraperSlug}`); + throw new Error(`No profile profile scraper available for ${scraperSlug}`); + } + const context = { ...entity, // legacy @@ -653,11 +665,6 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy const label = context.entity?.name; - if (!layoutScraper?.fetchProfile) { - logger.warn(`No profile profile scraper available for ${scraperSlug}`); - throw new Error(`No profile profile scraper available for ${scraperSlug}`); - } - if (!context.entity) { logger.warn(`No entity found for ${scraperSlug}`); throw new Error(`No entity found for ${scraperSlug}`); @@ -813,33 +820,53 @@ async function scrapeActors(argNames) { async function getOrCreateActors(baseActors, batchId) { // WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available - const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId)', { slug: actor.slug, entityId: actor.entity.id })).join(', '); + const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { + slug: actor.slug, + entityId: actor.entity.id, + entryId: actor.entryId, + collisionLikely: getCollisionLikely(actor), + })).join(', '); const existingActors = await knex .select('actors.*') - .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id)`)) - .whereRaw('actors.slug = base_actors.slug AND actors.entity_id IS NULL') - .orWhereRaw('actors.slug = base_actors.slug AND actors.entity_id = base_actors.entity_id'); + .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) + .whereRaw(` + actors.slug = base_actors.slug + AND actors.entity_id IS NULL + AND NOT base_actors.collision_likely + `) + .orWhereRaw(` + actors.slug = base_actors.slug + AND actors.entity_id = base_actors.entity_id + AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL) + OR actors.entry_id = base_actors.entry_id) + `); // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); const existingActorSlugs = existingActors.reduce((acc, actor) => ({ ...acc, [actor.entity_id]: { ...acc[actor.entity_id], - [actor.slug]: true, + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: true, + }, }, }), {}); - const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]); - + const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); + const newActors = await bulkInsert('actors', curatedActorEntries); - const newActorIdsByEntityIdAndSlug = newActors.reduce((acc, actor) => ({ + const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ ...acc, [actor.entity_id]: { ...acc[actor.entity_id], - [actor.slug]: actor.id, + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: actor.id, + }, }, }), {}); @@ -847,7 +874,7 @@ async function getOrCreateActors(baseActors, batchId) { .filter(actor => actor.hasProfile) .map(actor => ({ ...actor, - id: newActorIdsByEntityIdAndSlug[actor.entity?.id]?.[actor.slug] || newActorIdsByEntityIdAndSlug.null?.[actor.slug], + id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], })) .filter(actor => !!actor.id) .map(actor => curateProfile(actor))); diff --git a/src/scrapers/resolve.js b/src/scrapers/resolve.js index 89726b56..db72f51a 100644 --- a/src/scrapers/resolve.js +++ b/src/scrapers/resolve.js @@ -19,7 +19,7 @@ function resolveLayoutScraper(entity, scraper) { return scraper[entity.parameters.layout]; } - if (entity.parent) { + if (entity?.parent) { return resolveLayoutScraper(entity.parent, scraper); } diff --git a/src/scrapers/traxxx.js b/src/scrapers/traxxx.js index 76d51060..60270ed6 100644 --- a/src/scrapers/traxxx.js +++ b/src/scrapers/traxxx.js @@ -258,7 +258,8 @@ async function fetchLatest(entity, page, options) { .limit(faker.random.number({ min: 2, max: 15 })) .pluck('name'); - release.actors = [...actors(release), null]; // include empty actor to ensure proper handling + // release.actors = [...actors(release), null]; // include empty actor to ensure proper handling + release.actors = ['Amber']; release.title = title(release); return release;