Refactored deep and store modules to use entities.

This commit is contained in:
2020-06-25 02:26:25 +02:00
parent f0a89df6ab
commit 4959dfd14f
14 changed files with 132 additions and 164 deletions

View File

@@ -20,7 +20,6 @@ const logger = require('./logger')(__filename);
const { toBaseReleases } = require('./deep');
const { associateAvatars } = require('./media');
const { curateSite } = require('./sites');
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
@@ -120,7 +119,7 @@ function toBaseActors(actorsOrNames, release) {
const baseActor = {
name,
slug,
network: release?.site.network,
entity: release?.site?.network || release?.entity?.parent || null,
};
if (actorOrName.name) {
@@ -144,7 +143,7 @@ function curateActor(actor, withDetails = false) {
name: actor.name,
slug: actor.slug,
gender: actor.gender,
networkId: actor.entity_id,
entityId: actor.entity_id,
aliasFor: actor.alias_for,
dateOfBirth: actor.date_of_birth,
birthCountry: actor.birth_country_alpha2,
@@ -155,10 +154,10 @@ function curateActor(actor, withDetails = false) {
slug: actor.slug,
gender: actor.alias.gender,
},
network: actor.network && {
id: actor.network.id,
name: actor.network.name,
slug: actor.network.slug,
entity: actor.entity && {
id: actor.entity.id,
name: actor.entity.name,
slug: actor.entity.slug,
},
dateOfDeath: actor.date_of_death,
cup: actor.cup,
@@ -224,8 +223,7 @@ function curateProfileEntry(profile) {
const curatedProfileEntry = {
...(profile.update !== false && { id: profile.update }),
actor_id: profile.id,
site_id: profile.site?.id || null,
entity_id: profile.network?.id || null,
entity_id: profile.entity?.id || null,
date_of_birth: profile.dateOfBirth,
date_of_death: profile.dateOfDeath,
gender: profile.gender,
@@ -268,8 +266,7 @@ async function curateProfile(profile) {
name: profile.name,
avatar: profile.avatar,
scraper: profile.scraper,
site: profile.site,
network: profile.network,
entity: profile.entity,
update: profile.update,
};
@@ -343,7 +340,7 @@ async function curateProfile(profile) {
const { href } = new URL(social);
return href;
} catch (error) {
logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
logger.warn(`Profile scraper for '${profile.entity.name}' returned invalid social link: ${social}`);
return null;
}
}).filter(Boolean)
@@ -351,9 +348,9 @@ async function curateProfile(profile) {
curatedProfile.releases = toBaseReleases(profile.releases);
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.ethnicity}`);
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.hairColor || profile.hair}`);
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.eyes}`);
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
return curatedProfile;
} catch (error) {
@@ -499,7 +496,7 @@ async function upsertProfiles(profiles) {
}
}
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
const profiles = Promise.map(sources, async (source) => {
try {
// config may group sources to try until success
@@ -507,24 +504,25 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, exist
try {
const scraper = scrapers[scraperSlug];
const context = {
site: sitesBySlug[scraperSlug] || null,
network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
site: entitiesBySlug[scraperSlug] || null,
network: entitiesBySlug[scraperSlug] || null,
entity: entitiesBySlug[scraperSlug] || null,
scraper: scraperSlug,
};
const label = context.site?.name || context.network?.name;
const label = context.entity?.name;
if (!scraper?.fetchProfile) {
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
}
if (!context.site && !context.network) {
logger.warn(`No site or network found for ${scraperSlug}`);
throw new Error(`No site or network found for ${scraperSlug}`);
if (!context.entity) {
logger.warn(`No entity found for ${scraperSlug}`);
throw new Error(`No entity found for ${scraperSlug}`);
}
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];
if (existingProfile && !argv.force) {
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
@@ -574,20 +572,14 @@ async function scrapeActors(actorNames) {
const baseActors = toBaseActors(actorNames);
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const siteSlugs = sources.flat();
const entitySlugs = sources.flat();
const [networks, sites, existingActorEntries] = await Promise.all([
const [entities, existingActorEntries] = await Promise.all([
knex('entities')
.where('type', 2)
.whereIn('slug', siteSlugs),
knex('entities')
.select(
'entities.*',
'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.description as network_description', 'parents.parameters as network_parameters',
)
.where('type', 2)
.whereIn('entities.slug', siteSlugs)
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id'),
.select(knex.raw('entities.*, row_to_json(parents) as parent'))
.whereIn('entities.slug', entitySlugs)
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
.orderBy('entities.type'),
knex('actors')
.select(['id', 'name', 'slug'])
.modify((queryBuilder) => {
@@ -598,8 +590,7 @@ async function scrapeActors(actorNames) {
.whereNull('alias_for'),
]);
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
@@ -611,20 +602,17 @@ async function scrapeActors(actorNames) {
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
...acc,
[profile.actor_id]: {
...acc[profile.actor_id],
[profile.entity_id]: {
...acc[profile.entity_id],
[profile.site_id]: profile,
},
[profile.entity_id]: profile,
},
}), {});
const profilesPerActor = await Promise.map(
actors,
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
{ concurrency: 10 },
);
@@ -647,13 +635,11 @@ async function scrapeActors(actorNames) {
}
async function getOrCreateActors(baseActors, batchId) {
console.log(baseActors);
const existingActors = await knex('actors')
.select('id', 'alias_for', 'name', 'slug', 'entity_id')
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
.whereNull('entity_id')
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));
.orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.entity.id]));
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
@@ -664,7 +650,7 @@ async function getOrCreateActors(baseActors, batchId) {
},
}), {});
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);
@@ -722,7 +708,7 @@ async function fetchActor(actorId) {
const actor = await knex('actors')
.select(knex.raw(`
actors.*,
row_to_json(networks) as network,
row_to_json(entities) as entity,
row_to_json(actor_alias) as alias,
row_to_json(birth_country) as birth_country,
row_to_json(residence_country) as residence_country,
@@ -737,7 +723,7 @@ async function fetchActor(actorId) {
queryBuilder.where('actors.id', actorId);
})
.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
.leftJoin('networks', 'networks.id', 'actors.entity_id')
.leftJoin('entities', 'entities.id', 'actors.entity_id')
.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
.leftJoin('media', 'media.id', 'actors.avatar_media_id')