Not scraping existing actor profiles unless --force is used.
This commit is contained in:
parent
0c4628677f
commit
9883c3d9c2
|
@ -139,6 +139,7 @@ function curateActorEntries(baseActors, batchId) {
|
|||
|
||||
function curateProfileEntry(profile) {
|
||||
const curatedProfileEntry = {
|
||||
...(profile.update !== false && { id: profile.update }),
|
||||
actor_id: profile.id,
|
||||
site_id: profile.site?.id || null,
|
||||
network_id: profile.network?.id || null,
|
||||
|
@ -185,6 +186,7 @@ async function curateProfile(profile) {
|
|||
scraper: profile.scraper,
|
||||
site: profile.site,
|
||||
network: profile.network,
|
||||
update: profile.update,
|
||||
};
|
||||
|
||||
curatedProfile.description = profile.description?.trim() || null;
|
||||
|
@ -381,47 +383,13 @@ async function interpolateProfiles(actors) {
|
|||
}
|
||||
|
||||
async function upsertProfiles(profiles) {
|
||||
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
|
||||
|
||||
const existingProfiles = await knex('actors_profiles')
|
||||
.whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id]))
|
||||
.orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id]));
|
||||
|
||||
const existingProfilesByActorNetworkSiteIds = existingProfiles.reduce((acc, profile) => ({
|
||||
...acc,
|
||||
[profile.actor_id]: {
|
||||
...acc[profile.actor_id],
|
||||
[profile.network_id]: {
|
||||
...acc[profile.actor_id]?.[profile.network_id],
|
||||
[profile.site_id]: profile,
|
||||
},
|
||||
},
|
||||
}), {});
|
||||
|
||||
const { updatingProfileEntries, newProfileEntries } = curatedProfileEntries.reduce((acc, profile) => {
|
||||
const existingProfile = existingProfilesByActorNetworkSiteIds[profile.actor_id]?.[profile.network_id]?.[profile.site_id];
|
||||
|
||||
if (existingProfile) {
|
||||
return {
|
||||
...acc,
|
||||
updatingProfileEntries: [...acc.updatingProfileEntries, {
|
||||
...profile,
|
||||
id: existingProfile.id,
|
||||
}],
|
||||
};
|
||||
}
|
||||
|
||||
return {
|
||||
...acc,
|
||||
newProfileEntries: [...acc.newProfileEntries, profile],
|
||||
};
|
||||
}, {
|
||||
updatingProfileEntries: [],
|
||||
newProfileEntries: [],
|
||||
});
|
||||
const newProfileEntries = profiles.filter(profile => !profile.update).map(profile => curateProfileEntry(profile));
|
||||
const updatingProfileEntries = profiles.filter(profile => profile.update).map(profile => curateProfileEntry(profile));
|
||||
|
||||
if (newProfileEntries.length > 0) {
|
||||
await knex('actors_profiles').insert(newProfileEntries);
|
||||
|
||||
logger.info(`Saved ${newProfileEntries.length} new actor profiles`);
|
||||
}
|
||||
|
||||
if (argv.force && updatingProfileEntries.length > 0) {
|
||||
|
@ -435,10 +403,12 @@ async function upsertProfiles(profiles) {
|
|||
await Promise.all(queries)
|
||||
.then(transaction.commit)
|
||||
.catch(transaction.rollback);
|
||||
|
||||
logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`);
|
||||
}
|
||||
}
|
||||
|
||||
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
|
||||
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
|
||||
const profiles = Promise.map(sources, async (source) => {
|
||||
try {
|
||||
// config may group sources to try until success
|
||||
|
@ -451,6 +421,8 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
|
|||
scraper: scraperSlug,
|
||||
};
|
||||
|
||||
const label = context.site?.name || context.network?.name;
|
||||
|
||||
if (!scraper?.fetchProfile) {
|
||||
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
||||
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
||||
|
@ -461,25 +433,37 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
|
|||
throw new Error(`No site or network found for ${scraperSlug}`);
|
||||
}
|
||||
|
||||
logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
|
||||
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
|
||||
|
||||
if (existingProfile && !argv.force) {
|
||||
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);
|
||||
|
||||
const profile = await scraper.fetchProfile(actor.name, context, include);
|
||||
|
||||
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
|
||||
logger.verbose(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}, scraper returned ${profile}`);
|
||||
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}`), { code: 'PROFILE_NOT_AVAILABLE' });
|
||||
logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
|
||||
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
|
||||
}
|
||||
|
||||
return {
|
||||
logger.verbose(`Found profile for '${actor.name}' on '${label}'`);
|
||||
|
||||
return await curateProfile({
|
||||
...actor,
|
||||
...profile,
|
||||
...context,
|
||||
};
|
||||
update: existingProfile?.id || false,
|
||||
});
|
||||
} catch (error) {
|
||||
if (error.code !== 'PROFILE_NOT_AVAILABLE') {
|
||||
logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
|
||||
}
|
||||
|
||||
// throw error to try next source
|
||||
throw error;
|
||||
}
|
||||
}), Promise.reject(new Error()));
|
||||
|
@ -516,11 +500,10 @@ async function scrapeActors(actorNames) {
|
|||
.whereNull('network_id'),
|
||||
]);
|
||||
|
||||
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
||||
|
||||
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
|
||||
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
|
||||
|
||||
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
||||
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
|
||||
|
||||
const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
|
||||
|
@ -529,16 +512,27 @@ async function scrapeActors(actorNames) {
|
|||
|
||||
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
||||
|
||||
// TODO: don't fetch existing profiles unless --force is used
|
||||
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
|
||||
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
|
||||
...acc,
|
||||
[profile.actor_id]: {
|
||||
...acc[profile.actor_id],
|
||||
[profile.network_id]: {
|
||||
...acc[profile.network_id],
|
||||
[profile.site_id]: profile,
|
||||
},
|
||||
},
|
||||
}), {});
|
||||
|
||||
const profilesPerActor = await Promise.map(
|
||||
actors,
|
||||
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug),
|
||||
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
|
||||
{ concurrency: 10 },
|
||||
);
|
||||
|
||||
const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
|
||||
const profiles = curatedProfiles.filter(Boolean);
|
||||
const profiles = profilesPerActor.flat().filter(Boolean);
|
||||
|
||||
logger.info(`Scraped ${profiles.length} profiles`);
|
||||
|
||||
if (argv.inspect) {
|
||||
console.log(profiles);
|
||||
|
|
Loading…
Reference in New Issue