Improved actor scraping and display.

2020-05-18 01:22:56 +02:00
parent af5543190a
commit 8733fdc657
28 changed files with 1033 additions and 793 deletions

View File

@@ -127,6 +127,10 @@ function curateProfileEntry(profile) {
}
async function curateProfile(profile) {
+ if (!profile) {
+ return null;
+ }
try {
const curatedProfile = {
id: profile.id,
@@ -161,7 +165,7 @@ async function curateProfile(profile) {
curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;
- curatedProfile.cup = profile.cup || profile.bust?.match(/[a-zA-Z]+/)?.[0] || null;
+ curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match(/[a-zA-Z]+/)?.[0]) || null;
curatedProfile.bust = Number(profile.bust) || profile.bust?.match(/\d+/)?.[0] || null;
curatedProfile.waist = Number(profile.waist) || profile.waist?.match(/\d+/)?.[0] || null;
curatedProfile.hip = Number(profile.hip) || profile.hip?.match(/\d+/)?.[0] || null;
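
For context, and not part of the commit: optional chaining only short-circuits on null and undefined, so a numeric bust value would still reach .match and throw. The typeof guard added above avoids that. A minimal standalone sketch with a made-up value:

// Illustration only, not code from the commit.
const bust = 90; // hypothetical numeric measurement

// bust?.match(/[a-zA-Z]+/) would throw here: numbers have no .match method,
// and ?. only guards against null/undefined.
const cup = (typeof bust === 'string' && bust.match(/[a-zA-Z]+/)?.[0]) || null; // -> null
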
@@ -257,6 +261,7 @@ async function interpolateProfiles(actors) {
profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth);
profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death);
+ // TODO: fix city, state and country not matching
profile.birth_city = getMostFrequent(valuesByProperty.birth_city);
profile.birth_state = getMostFrequent(valuesByProperty.birth_state);
profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.birth_country_alpha2);
@@ -300,51 +305,6 @@ async function interpolateProfiles(actors) {
.catch(transaction.rollback);
}
- async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
- const profiles = Promise.map(sources, async (source) => {
- try {
- return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
- const scraper = scrapers[scraperSlug];
- const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];
- if (!scraper?.fetchProfile) {
- logger.warn(`No profile profile scraper available for ${scraperSlug}`);
- throw new Error(`No profile profile scraper available for ${scraperSlug}`);
- }
- if (!siteOrNetwork) {
- logger.warn(`No site or network found for ${scraperSlug}`);
- throw new Error(`No site or network found for ${scraperSlug}`);
- }
- logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
- const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);
- if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
- logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
- throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
- }
- return {
- ...actor,
- ...profile,
- scraper: scraperSlug,
- site: siteOrNetwork,
- };
- }), Promise.reject(new Error()));
- } catch (error) {
- if (error.code !== 'PROFILE_NOT_AVAILABLE') {
- logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
- }
- }
- return null;
- });
- return profiles.filter(Boolean);
- }
async function upsertProfiles(profiles) {
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
@@ -403,6 +363,51 @@ async function upsertProfiles(profiles) {
}
}
+ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
+ const profiles = Promise.map(sources, async (source) => {
+ try {
+ return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
+ const scraper = scrapers[scraperSlug];
+ const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];
+ if (!scraper?.fetchProfile) {
+ logger.warn(`No profile profile scraper available for ${scraperSlug}`);
+ throw new Error(`No profile profile scraper available for ${scraperSlug}`);
+ }
+ if (!siteOrNetwork) {
+ logger.warn(`No site or network found for ${scraperSlug}`);
+ throw new Error(`No site or network found for ${scraperSlug}`);
+ }
+ logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
+ const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);
+ if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
+ logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
+ throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
+ }
+ return {
+ ...actor,
+ ...profile,
+ scraper: scraperSlug,
+ site: siteOrNetwork,
+ };
+ }), Promise.reject(new Error()));
+ } catch (error) {
+ if (error.code !== 'PROFILE_NOT_AVAILABLE') {
+ logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
+ }
+ }
+ return null;
+ });
+ return profiles.filter(Boolean);
+ }
async function scrapeActors(actorNames) {
const baseActors = toBaseActors(actorNames);
@@ -438,7 +443,8 @@ async function scrapeActors(actorNames) {
{ concurrency: 10 },
);
- const profiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
+ const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
+ const profiles = curatedProfiles.filter(Boolean);
if (argv.inspect) {
console.log(profiles);
@@ -495,31 +501,25 @@ async function associateActors(releases, batchId) {
return null;
}
- const baseActorsBySlugAndNetworkId = baseActors.reduce((acc, baseActor) => ({
+ const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
...acc,
- [baseActor.slug]: {
- ...acc[baseActor.slug],
- [baseActor.network.id]: baseActor,
- },
+ [baseActor.slug]: baseActor,
}), {});
- const uniqueBaseActors = Object.values(baseActorsBySlugAndNetworkId).map(baseActorsByNetworkId => Object.values(baseActorsByNetworkId)).flat();
+ const uniqueBaseActors = Object.values(baseActorsBySlug);
const actors = await getOrCreateActors(uniqueBaseActors, batchId);
- const actorIdsBySlugAndNetworkId = actors.reduce((acc, actor) => ({
+ const actorIdsBySlug = actors.reduce((acc, actor) => ({
...acc,
- [actor.network_id]: {
- ...acc[actor.network_id],
- [actor.slug]: actor.alias_for || actor.id,
- },
+ [actor.slug]: actor.alias_for || actor.id,
}), {});
const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
.map(([releaseId, releaseActors]) => releaseActors
.map(releaseActor => ({
release_id: releaseId,
- actor_id: actorIdsBySlugAndNetworkId[releaseActor.network.id]?.[releaseActor.slug] || actorIdsBySlugAndNetworkId.null[releaseActor.slug],
+ actor_id: actorIdsBySlug[releaseActor.slug],
})))
.flat();
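
As an aside, the reduce-over-a-rejected-promise pattern that the relocated scrapeProfiles uses to try one scraper after another can be sketched in isolation. The names below are hypothetical and this is only an approximation of the idea, not code from the commit:

// Standalone sketch of the sequential-fallback pattern.
// Each candidate only runs if the previous attempt rejected; the first
// successful result wins.
async function firstAvailableProfile(actorName, candidates, fetchBySlug) {
  return candidates.reduce(
    async (outcome, slug) => outcome.catch(async () => {
      const profile = await fetchBySlug(slug, actorName); // hypothetical fetcher

      if (!profile) {
        throw new Error(`No profile for '${actorName}' on ${slug}`);
      }

      return profile;
    }),
    // Seed with a rejection so the first candidate always runs.
    Promise.reject(new Error()),
  );
}

If every candidate fails, the returned promise rejects with the last error, which the commit's version catches in the surrounding try/catch and maps to a null result.
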

View File

@@ -252,9 +252,13 @@ async function fetchProfile(actorName) {
}, { encodeJSON: true });
if (res.ok) {
- const actor = res.body.hits.hits.find(hit => hit._source.name === actorName);
+ const actor = res.body.hits.hits.find(hit => hit._source.name.toLowerCase() === actorName.toLowerCase());
- return scrapeProfile(actor._source);
+ if (actor) {
+ return scrapeProfile(actor._source);
+ }
+ return null;
}
return res.status;
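
As a rough illustration of the return convention this scraper follows (a profile object on success, null when no matching actor is found, and a numeric HTTP status when the request itself fails), a hypothetical caller might normalize the result like this; the sketch is not part of the commit:

// Hypothetical caller: treat both "not found" (null) and request failures
// (numeric HTTP status codes) as "no profile available".
async function getProfileOrNull(actorName) {
  const result = await fetchProfile(actorName);

  if (!result || typeof result === 'number') {
    return null;
  }

  return result;
}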