Not scraping existing actor profiles unless --force is used.
This commit is contained in:
parent
0c4628677f
commit
9883c3d9c2
|
@ -139,6 +139,7 @@ function curateActorEntries(baseActors, batchId) {
|
||||||
|
|
||||||
function curateProfileEntry(profile) {
|
function curateProfileEntry(profile) {
|
||||||
const curatedProfileEntry = {
|
const curatedProfileEntry = {
|
||||||
|
...(profile.update !== false && { id: profile.update }),
|
||||||
actor_id: profile.id,
|
actor_id: profile.id,
|
||||||
site_id: profile.site?.id || null,
|
site_id: profile.site?.id || null,
|
||||||
network_id: profile.network?.id || null,
|
network_id: profile.network?.id || null,
|
||||||
|
@ -185,6 +186,7 @@ async function curateProfile(profile) {
|
||||||
scraper: profile.scraper,
|
scraper: profile.scraper,
|
||||||
site: profile.site,
|
site: profile.site,
|
||||||
network: profile.network,
|
network: profile.network,
|
||||||
|
update: profile.update,
|
||||||
};
|
};
|
||||||
|
|
||||||
curatedProfile.description = profile.description?.trim() || null;
|
curatedProfile.description = profile.description?.trim() || null;
|
||||||
|
@ -381,47 +383,13 @@ async function interpolateProfiles(actors) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function upsertProfiles(profiles) {
|
async function upsertProfiles(profiles) {
|
||||||
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
|
const newProfileEntries = profiles.filter(profile => !profile.update).map(profile => curateProfileEntry(profile));
|
||||||
|
const updatingProfileEntries = profiles.filter(profile => profile.update).map(profile => curateProfileEntry(profile));
|
||||||
const existingProfiles = await knex('actors_profiles')
|
|
||||||
.whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id]))
|
|
||||||
.orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id]));
|
|
||||||
|
|
||||||
const existingProfilesByActorNetworkSiteIds = existingProfiles.reduce((acc, profile) => ({
|
|
||||||
...acc,
|
|
||||||
[profile.actor_id]: {
|
|
||||||
...acc[profile.actor_id],
|
|
||||||
[profile.network_id]: {
|
|
||||||
...acc[profile.actor_id]?.[profile.network_id],
|
|
||||||
[profile.site_id]: profile,
|
|
||||||
},
|
|
||||||
},
|
|
||||||
}), {});
|
|
||||||
|
|
||||||
const { updatingProfileEntries, newProfileEntries } = curatedProfileEntries.reduce((acc, profile) => {
|
|
||||||
const existingProfile = existingProfilesByActorNetworkSiteIds[profile.actor_id]?.[profile.network_id]?.[profile.site_id];
|
|
||||||
|
|
||||||
if (existingProfile) {
|
|
||||||
return {
|
|
||||||
...acc,
|
|
||||||
updatingProfileEntries: [...acc.updatingProfileEntries, {
|
|
||||||
...profile,
|
|
||||||
id: existingProfile.id,
|
|
||||||
}],
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
...acc,
|
|
||||||
newProfileEntries: [...acc.newProfileEntries, profile],
|
|
||||||
};
|
|
||||||
}, {
|
|
||||||
updatingProfileEntries: [],
|
|
||||||
newProfileEntries: [],
|
|
||||||
});
|
|
||||||
|
|
||||||
if (newProfileEntries.length > 0) {
|
if (newProfileEntries.length > 0) {
|
||||||
await knex('actors_profiles').insert(newProfileEntries);
|
await knex('actors_profiles').insert(newProfileEntries);
|
||||||
|
|
||||||
|
logger.info(`Saved ${newProfileEntries.length} new actor profiles`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (argv.force && updatingProfileEntries.length > 0) {
|
if (argv.force && updatingProfileEntries.length > 0) {
|
||||||
|
@ -435,10 +403,12 @@ async function upsertProfiles(profiles) {
|
||||||
await Promise.all(queries)
|
await Promise.all(queries)
|
||||||
.then(transaction.commit)
|
.then(transaction.commit)
|
||||||
.catch(transaction.rollback);
|
.catch(transaction.rollback);
|
||||||
|
|
||||||
|
// these entries already existed and were overwritten, so they are not "new"
logger.info(`Updated ${updatingProfileEntries.length} actor profiles`);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
|
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId) {
|
||||||
const profiles = Promise.map(sources, async (source) => {
|
const profiles = Promise.map(sources, async (source) => {
|
||||||
try {
|
try {
|
||||||
// config may group sources to try until success
|
// config may group sources to try until success
|
||||||
|
@ -451,6 +421,8 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
|
||||||
scraper: scraperSlug,
|
scraper: scraperSlug,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
const label = context.site?.name || context.network?.name;
|
||||||
|
|
||||||
if (!scraper?.fetchProfile) {
|
if (!scraper?.fetchProfile) {
|
||||||
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
||||||
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
|
||||||
|
@ -461,25 +433,37 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
|
||||||
throw new Error(`No site or network found for ${scraperSlug}`);
|
throw new Error(`No site or network found for ${scraperSlug}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
|
const existingProfile = existingProfilesByActorNetworkSiteId[actor.id]?.[context.network?.id || null]?.[context.site?.id || null];
|
||||||
|
|
||||||
|
if (existingProfile && !argv.force) {
|
||||||
|
logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);
|
||||||
|
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);
|
||||||
|
|
||||||
const profile = await scraper.fetchProfile(actor.name, context, include);
|
const profile = await scraper.fetchProfile(actor.name, context, include);
|
||||||
|
|
||||||
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
|
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
|
||||||
logger.verbose(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}, scraper returned ${profile}`);
|
logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
|
||||||
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}`), { code: 'PROFILE_NOT_AVAILABLE' });
|
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
|
||||||
}
|
}
|
||||||
|
|
||||||
return {
|
logger.verbose(`Found profile for '${actor.name}' on '${label}'`);
|
||||||
|
|
||||||
|
return await curateProfile({
|
||||||
...actor,
|
...actor,
|
||||||
...profile,
|
...profile,
|
||||||
...context,
|
...context,
|
||||||
};
|
update: existingProfile?.id || false,
|
||||||
|
});
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
if (error.code !== 'PROFILE_NOT_AVAILABLE') {
|
if (error.code !== 'PROFILE_NOT_AVAILABLE') {
|
||||||
logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
|
logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// throw error to try next source
|
||||||
throw error;
|
throw error;
|
||||||
}
|
}
|
||||||
}), Promise.reject(new Error()));
|
}), Promise.reject(new Error()));
|
||||||
|
@ -516,11 +500,10 @@ async function scrapeActors(actorNames) {
|
||||||
.whereNull('network_id'),
|
.whereNull('network_id'),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
|
||||||
|
|
||||||
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
|
const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: network }), {});
|
||||||
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
|
const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: curateSite(site) }), {});
|
||||||
|
|
||||||
|
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
|
||||||
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
|
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
|
||||||
|
|
||||||
const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
|
const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
|
||||||
|
@ -529,16 +512,27 @@ async function scrapeActors(actorNames) {
|
||||||
|
|
||||||
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);
|
||||||
|
|
||||||
// TODO: don't fetch existing profiles unless --force is used
|
const existingProfiles = await knex('actors_profiles').whereIn('actor_id', actors.map(actor => actor.id));
|
||||||
|
// Index the stored profile rows as actor id -> network id -> site id -> profile,
// the shape scrapeProfiles looks up to decide whether a source was already scraped.
const existingProfilesByActorNetworkSiteId = existingProfiles.reduce((acc, profile) => ({
	...acc,
	[profile.actor_id]: {
		...acc[profile.actor_id],
		[profile.network_id]: {
			// keep the sites already indexed for this actor on this network;
			// the accumulator is keyed by actor id first, so it must not be
			// indexed directly by network id
			...acc[profile.actor_id]?.[profile.network_id],
			[profile.site_id]: profile,
		},
	},
}), {});
|
||||||
|
|
||||||
const profilesPerActor = await Promise.map(
|
const profilesPerActor = await Promise.map(
|
||||||
actors,
|
actors,
|
||||||
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug),
|
async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug, existingProfilesByActorNetworkSiteId),
|
||||||
{ concurrency: 10 },
|
{ concurrency: 10 },
|
||||||
);
|
);
|
||||||
|
|
||||||
const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
|
const profiles = profilesPerActor.flat().filter(Boolean);
|
||||||
const profiles = curatedProfiles.filter(Boolean);
|
|
||||||
|
logger.info(`Scraped ${profiles.length} profiles`);
|
||||||
|
|
||||||
if (argv.inspect) {
|
if (argv.inspect) {
|
||||||
console.log(profiles);
|
console.log(profiles);
|
||||||
|
|
Loading…
Reference in New Issue