Not scraping existing actor profiles unless --force is used.

This commit is contained in:
ThePendulum 2020-05-19 02:02:48 +02:00
parent 0c4628677f
commit 9883c3d9c2
1 changed files with 44 additions and 50 deletions

View File

@ -139,6 +139,7 @@ function curateActorEntries(baseActors, batchId) {
function curateProfileEntry(profile) {
const curatedProfileEntry = {
...(profile.update !== false && { id: profile.update }),
actor_id: profile.id,
site_id: profile.site?.id || null,
network_id: profile.network?.id || null,
@ -185,6 +186,7 @@ async function curateProfile(profile) {
scraper: profile.scraper,
site: profile.site,
network: profile.network,
update: profile.update,
};
curatedProfile.description = profile.description?.trim() || null;
@ -381,47 +383,13 @@ async function interpolateProfiles(actors) {
}
async function upsertProfiles(profiles) {
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
const existingProfiles = await knex('actors_profiles')
.whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id]))
.orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id]));
const existingProfilesByActorNetworkSiteIds = existingProfiles.reduce((acc, profile) => ({
...acc,
[profile.actor_id]: {
...acc[profile.actor_id],
[profile.network_id]: {
...acc[profile.actor_id]?.[profile.network_id],
[profile.site_id]: profile,
},
},
}), {});
const { updatingProfileEntries, newProfileEntries } = curatedProfileEntries.reduce((acc, profile) => {
const existingProfile = existingProfilesByActorNetworkSiteIds[profile.actor_id]?.[profile.network_id]?.[profile.site_id];
if (existingProfile) {
return {
...acc,
updatingProfileEntries: [...acc.updatingProfileEntries, {
...profile,
id: existingProfile.id,
}],
};
}
return {
...acc,
newProfileEntries: [...acc.newProfileEntries, profile],
};
}, {
updatingProfileEntries: [],
newProfileEntries: [],
});
const newProfileEntries = profiles.filter(profile => !profile.update).map(profile => curateProfileEntry(profile));
const updatingProfileEntries = profiles.filter(profile => profile.update).map(profile => curateProfileEntry(profile));
if (newProfileEntries.length > 0) {
await knex('actors_profiles').insert(newProfileEntries);
logger.info(`Saved ${newProfileEntries.length} new actor profiles`);
}
if (argv.force && updatingProfileEntries.length > 0) {
@ -435,10 +403,12 @@ async function upsertProfiles(profiles) {
await Promise.all(queries)
.then(transaction.commit)
.catch(transaction.rollback);
logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`);
}
}
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
const profiles = Promise.map(sources, async (source) => {
try {
// config may group sources to try until success
@ -451,6 +421,8 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
scraper: scraperSlug,
};
const label = context.site?.name || context.network?.name;
if (!scraper?.fetchProfile) {
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
@ -461,25 +433,37 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
throw new Error(`No site or network found for ${scraperSlug}`);
}
logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
if (existingProfile && !argv.force) {
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
return null;
}
logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);
const profile = await scraper.fetchProfile(actor.name, context, include);
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
logger.verbose(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}, scraper returned ${profile}`);
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}`), { code: 'PROFILE_NOT_AVAILABLE' });
logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
}
return {
logger.verbose(`Found profile for '${actor.name}' on '${label}'`);
return await curateProfile({
...actor,
...profile,
...context,
};
update: existingProfile?.id || false,
});
} catch (error) {
if (error.code !== 'PROFILE_NOT_AVAILABLE') {
logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
}
// throw error to try next source
throw error;
}
}), Promise.reject(new Error()));
@ -516,11 +500,10 @@ async function scrapeActors(actorNames) {
.whereNull('network_id'),
]);
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
@ -529,16 +512,27 @@ async function scrapeActors(actorNames) {
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
// TODO: don't fetch existing profiles unless --force is used
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
...acc,
[profile.actor_id]: {
...acc[profile.actor_id],
[profile.network_id]: {
...acc[profile.network_id],
[profile.site_id]: profile,
},
},
}), {});
const profilesPerActor = await Promise.map(
actors,
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug),
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
{ concurrency: 10 },
);
const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
const profiles = curatedProfiles.filter(Boolean);
const profiles = profilesPerActor.flat().filter(Boolean);
logger.info(`Scraped ${profiles.length} profiles`);
if (argv.inspect) {
console.log(profiles);