Refactored deep and store modules to use entities.
This commit is contained in:
@@ -20,7 +20,6 @@ const logger = require('./logger')(__filename);
|
||||
|
||||
const { toBaseReleases } = require('./deep');
|
||||
const { associateAvatars } = require('./media');
|
||||
const { curateSite } = require('./sites');
|
||||
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
@@ -120,7 +119,7 @@ function toBaseActors(actorsOrNames, release) {
|
||||
const baseActor = {
|
||||
name,
|
||||
slug,
|
||||
network: release?.site.network,
|
||||
entity: release?.site?.network || release?.entity?.parent || null,
|
||||
};
|
||||
|
||||
if (actorOrName.name) {
|
||||
@@ -144,7 +143,7 @@ function curateActor(actor, withDetails = false) {
|
||||
name: actor.name,
|
||||
slug: actor.slug,
|
||||
gender: actor.gender,
|
||||
networkId: actor.entity_id,
|
||||
entityId: actor.entity_id,
|
||||
aliasFor: actor.alias_for,
|
||||
dateOfBirth: actor.date_of_birth,
|
||||
birthCountry: actor.birth_country_alpha2,
|
||||
@@ -155,10 +154,10 @@ function curateActor(actor, withDetails = false) {
|
||||
slug: actor.slug,
|
||||
gender: actor.alias.gender,
|
||||
},
|
||||
network: actor.network && {
|
||||
id: actor.network.id,
|
||||
name: actor.network.name,
|
||||
slug: actor.network.slug,
|
||||
entity: actor.entity && {
|
||||
id: actor.entity.id,
|
||||
name: actor.entity.name,
|
||||
slug: actor.entity.slug,
|
||||
},
|
||||
dateOfDeath: actor.date_of_death,
|
||||
cup: actor.cup,
|
||||
@@ -224,8 +223,7 @@ function curateProfileEntry(profile) {
|
||||
const curatedProfileEntry = {
|
||||
...(profile.update !== false && { id: profile.update }),
|
||||
actor_id: profile.id,
|
||||
site_id: profile.site?.id || null,
|
||||
entity_id: profile.network?.id || null,
|
||||
entity_id: profile.entity?.id || null,
|
||||
date_of_birth: profile.dateOfBirth,
|
||||
date_of_death: profile.dateOfDeath,
|
||||
gender: profile.gender,
|
||||
@@ -268,8 +266,7 @@ async function curateProfile(profile) {
|
||||
name: profile.name,
|
||||
avatar: profile.avatar,
|
||||
scraper: profile.scraper,
|
||||
site: profile.site,
|
||||
network: profile.network,
|
||||
entity: profile.entity,
|
||||
update: profile.update,
|
||||
};
|
||||
|
||||
@@ -343,7 +340,7 @@ async function curateProfile(profile) {
|
||||
const { href } = new URL(social);
|
||||
return href;
|
||||
} catch (error) {
|
||||
logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
|
||||
logger.warn(`Profile scraper for '${profile.entity.name}' returned invalid social link: ${social}`);
|
||||
return null;
|
||||
}
|
||||
}).filter(Boolean)
|
||||
@@ -351,9 +348,9 @@ async function curateProfile(profile) {
|
||||
|
||||
curatedProfile.releases = toBaseReleases(profile.releases);
|
||||
|
||||
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.ethnicity}`);
|
||||
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.hairColor || profile.hair}`);
|
||||
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.eyes}`);
|
||||
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
|
||||
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
|
||||
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
|
||||
|
||||
return curatedProfile;
|
||||
} catch (error) {
|
||||
@@ -499,7 +496,7 @@ async function upsertProfiles(profiles) {
|
||||
}
|
||||
}
|
||||
|
||||
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
|
||||
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
|
||||
const profiles = Promise.map(sources, async (source) => {
|
||||
try {
|
||||
// config may group sources to try until success
|
||||
@@ -507,24 +504,25 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, exist
|
||||
try {
|
||||
const scraper = scrapers[scraperSlug];
|
||||
const context = {
|
||||
site: sitesBySlug[scraperSlug] || null,
|
||||
network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
|
||||
site: entitiesBySlug[scraperSlug] || null,
|
||||
network: entitiesBySlug[scraperSlug] || null,
|
||||
entity: entitiesBySlug[scraperSlug] || null,
|
||||
scraper: scraperSlug,
|
||||
};
|
||||
|
||||
const label = context.site?.name || context.network?.name;
|
||||
const label = context.entity?.name;
|
||||
|
||||
if (!scraper?.fetchProfile) {
|
||||
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
||||
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
||||
}
|
||||
|
||||
if (!context.site && !context.network) {
|
||||
logger.warn(`No site or network found for ${scraperSlug}`);
|
||||
throw new Error(`No site or network found for ${scraperSlug}`);
|
||||
if (!context.entity) {
|
||||
logger.warn(`No entity found for ${scraperSlug}`);
|
||||
throw new Error(`No entity found for ${scraperSlug}`);
|
||||
}
|
||||
|
||||
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
|
||||
const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];
|
||||
|
||||
if (existingProfile && !argv.force) {
|
||||
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
|
||||
@@ -574,20 +572,14 @@ async function scrapeActors(actorNames) {
|
||||
const baseActors = toBaseActors(actorNames);
|
||||
|
||||
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
|
||||
const siteSlugs = sources.flat();
|
||||
const entitySlugs = sources.flat();
|
||||
|
||||
const [networks, sites, existingActorEntries] = await Promise.all([
|
||||
const [entities, existingActorEntries] = await Promise.all([
|
||||
knex('entities')
|
||||
.where('type', 2)
|
||||
.whereIn('slug', siteSlugs),
|
||||
knex('entities')
|
||||
.select(
|
||||
'entities.*',
|
||||
'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.description as network_description', 'parents.parameters as network_parameters',
|
||||
)
|
||||
.where('type', 2)
|
||||
.whereIn('entities.slug', siteSlugs)
|
||||
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id'),
|
||||
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
|
||||
.whereIn('entities.slug', entitySlugs)
|
||||
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
|
||||
.orderBy('entities.type'),
|
||||
knex('actors')
|
||||
.select(['id', 'name', 'slug'])
|
||||
.modify((queryBuilder) => {
|
||||
@@ -598,8 +590,7 @@ async function scrapeActors(actorNames) {
|
||||
.whereNull('alias_for'),
|
||||
]);
|
||||
|
||||
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
|
||||
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
|
||||
const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
|
||||
|
||||
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
||||
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
|
||||
@@ -611,20 +602,17 @@ async function scrapeActors(actorNames) {
|
||||
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
||||
|
||||
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
|
||||
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
|
||||
const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
|
||||
...acc,
|
||||
[profile.actor_id]: {
|
||||
...acc[profile.actor_id],
|
||||
[profile.entity_id]: {
|
||||
...acc[profile.entity_id],
|
||||
[profile.site_id]: profile,
|
||||
},
|
||||
[profile.entity_id]: profile,
|
||||
},
|
||||
}), {});
|
||||
|
||||
const profilesPerActor = await Promise.map(
|
||||
actors,
|
||||
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
|
||||
async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
|
||||
{ concurrency: 10 },
|
||||
);
|
||||
|
||||
@@ -647,13 +635,11 @@ async function scrapeActors(actorNames) {
|
||||
}
|
||||
|
||||
async function getOrCreateActors(baseActors, batchId) {
|
||||
console.log(baseActors);
|
||||
|
||||
const existingActors = await knex('actors')
|
||||
.select('id', 'alias_for', 'name', 'slug', 'entity_id')
|
||||
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
|
||||
.whereNull('entity_id')
|
||||
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));
|
||||
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.entity.id]));
|
||||
|
||||
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
||||
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
||||
@@ -664,7 +650,7 @@ async function getOrCreateActors(baseActors, batchId) {
|
||||
},
|
||||
}), {});
|
||||
|
||||
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
|
||||
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
|
||||
|
||||
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
||||
const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);
|
||||
@@ -722,7 +708,7 @@ async function fetchActor(actorId) {
|
||||
const actor = await knex('actors')
|
||||
.select(knex.raw(`
|
||||
actors.*,
|
||||
row_to_json(networks) as network,
|
||||
row_to_json(entities) as entity,
|
||||
row_to_json(actor_alias) as alias,
|
||||
row_to_json(birth_country) as birth_country,
|
||||
row_to_json(residence_country) as residence_country,
|
||||
@@ -737,7 +723,7 @@ async function fetchActor(actorId) {
|
||||
queryBuilder.where('actors.id', actorId);
|
||||
})
|
||||
.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
|
||||
.leftJoin('networks', 'networks.id', 'actors.entity_id')
|
||||
.leftJoin('entities', 'entities.id', 'actors.entity_id')
|
||||
.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
|
||||
.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
|
||||
.leftJoin('media', 'media.id', 'actors.avatar_media_id')
|
||||
|
||||
Reference in New Issue
Block a user