|
|
|
@@ -180,6 +180,11 @@ function toBaseActors(actorsOrNames, release) {
|
|
|
|
return baseActors;
|
|
|
|
return baseActors;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Tell whether an actor's name is likely to collide with another actor's name.
 *
 * A name counts as collision-prone when it consists of exactly one name token,
 * i.e. the actor goes by a single name. Tokens are runs of letters, combining
 * marks, digits and underscores; anything else (spaces, hyphens, dots) separates
 * tokens, so hyphenated names still count as two tokens.
 *
 * @param {{ name?: string | null }} actor - Actor whose `name` is inspected.
 * @returns {boolean} `true` when the name has exactly one token; `false` for
 *   multi-part names and for missing, empty or token-less names.
 */
function getCollisionLikely(actor) {
	// actor with single name
	// Unicode-aware token class: ASCII-only \w would split accented names such as
	// "Renée" into two tokens and find no token at all in non-Latin names.
	const nameParts = String(actor?.name ?? '').match(/[\p{L}\p{M}\p{N}_]+/gu);

	// match() yields null when nothing matched; treat that as "not a single name" instead of throwing
	return nameParts?.length === 1;
}
|
|
|
|
|
|
|
|
|
|
|
|
function curateActor(actor, withDetails = false, isProfile = false) {
|
|
|
|
function curateActor(actor, withDetails = false, isProfile = false) {
|
|
|
|
if (!actor) {
|
|
|
|
if (!actor) {
|
|
|
|
return null;
|
|
|
|
return null;
|
|
|
|
@@ -260,11 +265,13 @@ function curateActor(actor, withDetails = false, isProfile = false) {
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
function curateActorEntry(baseActor, batchId) {
|
|
|
|
function curateActorEntry(baseActor, batchId) {
|
|
|
|
|
|
|
|
const collisionLikely = getCollisionLikely(baseActor);
|
|
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
return {
|
|
|
|
name: baseActor.name,
|
|
|
|
name: baseActor.name,
|
|
|
|
slug: baseActor.slug,
|
|
|
|
slug: baseActor.slug,
|
|
|
|
entity_id: null,
|
|
|
|
entity_id: collisionLikely ? baseActor.entity.id : null,
|
|
|
|
entry_id: baseActor.entryId,
|
|
|
|
entry_id: collisionLikely ? baseActor.entryId : null,
|
|
|
|
batch_id: batchId,
|
|
|
|
batch_id: batchId,
|
|
|
|
};
|
|
|
|
};
|
|
|
|
}
|
|
|
|
}
|
|
|
|
@@ -641,6 +648,11 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
|
|
|
|
const scraper = scrapers[scraperSlug];
|
|
|
|
const scraper = scrapers[scraperSlug];
|
|
|
|
const layoutScraper = resolveLayoutScraper(entity, scraper);
|
|
|
|
const layoutScraper = resolveLayoutScraper(entity, scraper);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!layoutScraper?.fetchProfile) {
|
|
|
|
|
|
|
|
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
|
|
|
|
|
|
|
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const context = {
|
|
|
|
const context = {
|
|
|
|
...entity,
|
|
|
|
...entity,
|
|
|
|
// legacy
|
|
|
|
// legacy
|
|
|
|
@@ -653,11 +665,6 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
|
|
|
|
|
|
|
|
|
|
|
|
const label = context.entity?.name;
|
|
|
|
const label = context.entity?.name;
|
|
|
|
|
|
|
|
|
|
|
|
if (!layoutScraper?.fetchProfile) {
|
|
|
|
|
|
|
|
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
|
|
|
|
|
|
|
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if (!context.entity) {
|
|
|
|
if (!context.entity) {
|
|
|
|
logger.warn(`No entity found for ${scraperSlug}`);
|
|
|
|
logger.warn(`No entity found for ${scraperSlug}`);
|
|
|
|
throw new Error(`No entity found for ${scraperSlug}`);
|
|
|
|
throw new Error(`No entity found for ${scraperSlug}`);
|
|
|
|
@@ -813,33 +820,53 @@ async function scrapeActors(argNames) {
|
|
|
|
|
|
|
|
|
|
|
|
async function getOrCreateActors(baseActors, batchId) {
|
|
|
|
async function getOrCreateActors(baseActors, batchId) {
|
|
|
|
// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
|
|
|
|
// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
|
|
|
|
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId)', { slug: actor.slug, entityId: actor.entity.id })).join(', ');
|
|
|
|
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
|
|
|
|
|
|
|
|
slug: actor.slug,
|
|
|
|
|
|
|
|
entityId: actor.entity.id,
|
|
|
|
|
|
|
|
entryId: actor.entryId,
|
|
|
|
|
|
|
|
collisionLikely: getCollisionLikely(actor),
|
|
|
|
|
|
|
|
})).join(', ');
|
|
|
|
|
|
|
|
|
|
|
|
const existingActors = await knex
|
|
|
|
const existingActors = await knex
|
|
|
|
.select('actors.*')
|
|
|
|
.select('actors.*')
|
|
|
|
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id)`))
|
|
|
|
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
|
|
|
|
.whereRaw('actors.slug = base_actors.slug AND actors.entity_id IS NULL')
|
|
|
|
.whereRaw(`
|
|
|
|
.orWhereRaw('actors.slug = base_actors.slug AND actors.entity_id = base_actors.entity_id');
|
|
|
|
actors.slug = base_actors.slug
|
|
|
|
|
|
|
|
AND actors.entity_id IS NULL
|
|
|
|
|
|
|
|
AND NOT base_actors.collision_likely
|
|
|
|
|
|
|
|
`)
|
|
|
|
|
|
|
|
.orWhereRaw(`
|
|
|
|
|
|
|
|
actors.slug = base_actors.slug
|
|
|
|
|
|
|
|
AND actors.entity_id = base_actors.entity_id
|
|
|
|
|
|
|
|
AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
|
|
|
|
|
|
|
|
OR actors.entry_id = base_actors.entry_id)
|
|
|
|
|
|
|
|
`);
|
|
|
|
|
|
|
|
|
|
|
|
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
|
|
|
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
|
|
|
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
|
|
|
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
|
|
|
...acc,
|
|
|
|
...acc,
|
|
|
|
[actor.entity_id]: {
|
|
|
|
[actor.entity_id]: {
|
|
|
|
...acc[actor.entity_id],
|
|
|
|
...acc[actor.entity_id],
|
|
|
|
[actor.slug]: true,
|
|
|
|
[actor.entry_id]: {
|
|
|
|
|
|
|
|
...acc[actor.entity_id]?.[actor.entry_id],
|
|
|
|
|
|
|
|
[actor.slug]: true,
|
|
|
|
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}), {});
|
|
|
|
}), {});
|
|
|
|
|
|
|
|
|
|
|
|
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
|
|
|
|
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
|
|
|
|
|
|
|
|
|
|
|
|
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
|
|
|
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
|
|
|
|
|
|
|
|
|
|
|
const newActors = await bulkInsert('actors', curatedActorEntries);
|
|
|
|
const newActors = await bulkInsert('actors', curatedActorEntries);
|
|
|
|
|
|
|
|
|
|
|
|
const newActorIdsByEntityIdAndSlug = newActors.reduce((acc, actor) => ({
|
|
|
|
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
|
|
|
|
...acc,
|
|
|
|
...acc,
|
|
|
|
[actor.entity_id]: {
|
|
|
|
[actor.entity_id]: {
|
|
|
|
...acc[actor.entity_id],
|
|
|
|
...acc[actor.entity_id],
|
|
|
|
[actor.slug]: actor.id,
|
|
|
|
[actor.entry_id]: {
|
|
|
|
|
|
|
|
...acc[actor.entity_id]?.[actor.entry_id],
|
|
|
|
|
|
|
|
[actor.slug]: actor.id,
|
|
|
|
|
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}), {});
|
|
|
|
}), {});
|
|
|
|
|
|
|
|
|
|
|
|
@@ -847,7 +874,7 @@ async function getOrCreateActors(baseActors, batchId) {
|
|
|
|
.filter(actor => actor.hasProfile)
|
|
|
|
.filter(actor => actor.hasProfile)
|
|
|
|
.map(actor => ({
|
|
|
|
.map(actor => ({
|
|
|
|
...actor,
|
|
|
|
...actor,
|
|
|
|
id: newActorIdsByEntityIdAndSlug[actor.entity?.id]?.[actor.slug] || newActorIdsByEntityIdAndSlug.null?.[actor.slug],
|
|
|
|
id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
|
|
|
|
}))
|
|
|
|
}))
|
|
|
|
.filter(actor => !!actor.id)
|
|
|
|
.filter(actor => !!actor.id)
|
|
|
|
.map(actor => curateProfile(actor)));
|
|
|
|
.map(actor => curateProfile(actor)));
|
|
|
|
|