diff --git a/src/actors.js b/src/actors.js index 508dac36..50129a50 100644 --- a/src/actors.js +++ b/src/actors.js @@ -139,6 +139,7 @@ function curateActorEntries(baseActors, batchId) { function curateProfileEntry(profile) { const curatedProfileEntry = { + ...(profile.update !== false && { id: profile.update }), actor_id: profile.id, site_id: profile.site?.id || null, network_id: profile.network?.id || null, @@ -185,6 +186,7 @@ async function curateProfile(profile) { scraper: profile.scraper, site: profile.site, network: profile.network, + update: profile.update, }; curatedProfile.description = profile.description?.trim() || null; @@ -381,47 +383,13 @@ async function interpolateProfiles(actors) { } async function upsertProfiles(profiles) { - const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile)); - - const existingProfiles = await knex('actors_profiles') - .whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id])) - .orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id])); - - const existingProfilesByActorNetworkSiteIds = existingProfiles.reduce((acc, profile) => ({ - ...acc, - [profile.actor_id]: { - ...acc[profile.actor_id], - [profile.network_id]: { - ...acc[profile.actor_id]?.[profile.network_id], - [profile.site_id]: profile, - }, - }, - }), {}); - - const { updatingProfileEntries, newProfileEntries } = curatedProfileEntries.reduce((acc, profile) => { - const existingProfile = existingProfilesByActorNetworkSiteIds[profile.actor_id]?.[profile.network_id]?.[profile.site_id]; - - if (existingProfile) { - return { - ...acc, - updatingProfileEntries: [...acc.updatingProfileEntries, { - ...profile, - id: existingProfile.id, - }], - }; - } - - return { - ...acc, - newProfileEntries: [...acc.newProfileEntries, profile], - }; - }, { - updatingProfileEntries: [], - newProfileEntries: [], - }); + const newProfileEntries = profiles.filter(profile 
=> !profile.update).map(profile => curateProfileEntry(profile)); const updatingProfileEntries = profiles.filter(profile => profile.update).map(profile => curateProfileEntry(profile)); if (newProfileEntries.length > 0) { await knex('actors_profiles').insert(newProfileEntries); + + logger.info(`Saved ${newProfileEntries.length} new actor profiles`); } if (argv.force && updatingProfileEntries.length > 0) { @@ -435,10 +403,12 @@ async function upsertProfiles(profiles) { await Promise.all(queries) .then(transaction.commit) .catch(transaction.rollback); + + logger.info(`Updated ${updatingProfileEntries.length} actor profiles`); } } -async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) { +async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) { const profiles = Promise.map(sources, async (source) => { try { // config may group sources to try until success @@ -451,6 +421,8 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) { scraper: scraperSlug, }; + const label = context.site?.name || context.network?.name; + if (!scraper?.fetchProfile) { logger.warn(`No profile profile scraper available for ${scraperSlug}`); throw new Error(`No profile profile scraper available for ${scraperSlug}`); } @@ -461,25 +433,37 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) { throw new Error(`No site or network found for ${scraperSlug}`); } - logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`); + const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null]; + + if (existingProfile && !argv.force) { + logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`); + + return null; + } + + logger.verbose(`Searching profile for '${actor.name}' on '${label}'`); const profile = await scraper.fetchProfile(actor.name, context, 
include); if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure - logger.verbose(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}, scraper returned ${profile}`); - throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}`), { code: 'PROFILE_NOT_AVAILABLE' }); + logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`); + throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' }); } - return { + logger.verbose(`Found profile for '${actor.name}' on '${label}'`); + + return await curateProfile({ ...actor, ...profile, ...context, - }; + update: existingProfile?.id || false, + }); } catch (error) { if (error.code !== 'PROFILE_NOT_AVAILABLE') { logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`); } + // throw error to try next source throw error; } }), Promise.reject(new Error())); @@ -516,11 +500,10 @@ async function scrapeActors(actorNames) { .whereNull('network_id'), ]); - const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {}); - const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {}); const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {}); + const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {}); const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]); const [batchId] = newBaseActors.length > 0 ? 
await knex('batches').insert({ comment: null }).returning('id') : [null]; @@ -529,16 +512,27 @@ async function scrapeActors(actorNames) { const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []); - // TODO: don't fetch existing profiles unless --force is used + const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id)); + const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({ + ...acc, + [profile.actor_id]: { + ...acc[profile.actor_id], + [profile.network_id]: { + ...acc[profile.actor_id]?.[profile.network_id], + [profile.site_id]: profile, + }, + }, + }), {}); const profilesPerActor = await Promise.map( actors, - async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug), + async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId), { concurrency: 10 }, ); - const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile))); - const profiles = curatedProfiles.filter(Boolean); + const profiles = profilesPerActor.flat().filter(Boolean); + + logger.info(`Scraped ${profiles.length} profiles`); if (argv.inspect) { console.log(profiles);