Improved actor scraping and display.
This commit is contained in:
118
src/actors.js
118
src/actors.js
@@ -127,6 +127,10 @@ function curateProfileEntry(profile) {
|
||||
}
|
||||
|
||||
async function curateProfile(profile) {
|
||||
if (!profile) {
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
const curatedProfile = {
|
||||
id: profile.id,
|
||||
@@ -161,7 +165,7 @@ async function curateProfile(profile) {
|
||||
|
||||
curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;
|
||||
|
||||
curatedProfile.cup = profile.cup || profile.bust?.match(/[a-zA-Z]+/)?.[0] || null;
|
||||
curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match(/[a-zA-Z]+/)?.[0]) || null;
|
||||
curatedProfile.bust = Number(profile.bust) || profile.bust?.match(/\d+/)?.[0] || null;
|
||||
curatedProfile.waist = Number(profile.waist) || profile.waist?.match(/\d+/)?.[0] || null;
|
||||
curatedProfile.hip = Number(profile.hip) || profile.hip?.match(/\d+/)?.[0] || null;
|
||||
@@ -257,6 +261,7 @@ async function interpolateProfiles(actors) {
|
||||
profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth);
|
||||
profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death);
|
||||
|
||||
// TODO: fix city, state and country not matching
|
||||
profile.birth_city = getMostFrequent(valuesByProperty.birth_city);
|
||||
profile.birth_state = getMostFrequent(valuesByProperty.birth_state);
|
||||
profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.birth_country_alpha2);
|
||||
@@ -300,51 +305,6 @@ async function interpolateProfiles(actors) {
|
||||
.catch(transaction.rollback);
|
||||
}
|
||||
|
||||
/**
 * Fetch profiles for a single actor from every configured source.
 *
 * Each entry of `sources` is either one scraper slug or an array of slugs. The slugs of an
 * entry are tried in order, each one only running when the previous one failed, so an entry
 * contributes at most one profile.
 *
 * @param {Object} actor - actor whose `name` is searched for; its properties are spread into every result
 * @param {Array<string|string[]>} sources - scraper slugs, or arrays of slugs to try in order
 * @param {Object} networksBySlug - looked up first by scraper slug
 * @param {Object} sitesBySlug - looked up by scraper slug when no network matches
 * @returns {Promise<Object[]>} the actor merged with each profile found, plus `scraper` and `site`
 */
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
  // NOTE(review): Promise.map is not a native Promise method; presumably Bluebird's Promise is
  // imported at the top of the file - confirm.
  const profiles = await Promise.map(sources, async (source) => {
    try {
      // The chain is seeded with a rejection so that the first slug runs inside the first
      // .catch(); once a slug resolves, the remaining .catch() handlers are skipped.
      return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
        const scraper = scrapers[scraperSlug];
        const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];

        if (!scraper?.fetchProfile) {
          logger.warn(`No profile scraper available for ${scraperSlug}`);
          throw new Error(`No profile scraper available for ${scraperSlug}`);
        }

        if (!siteOrNetwork) {
          logger.warn(`No site or network found for ${scraperSlug}`);
          throw new Error(`No site or network found for ${scraperSlug}`);
        }

        logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);

        // NOTE(review): `include` is not defined in this function; presumably a module-level
        // value - confirm.
        const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);

        if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
          logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
          throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
        }

        return {
          ...actor,
          ...profile,
          scraper: scraperSlug,
          site: siteOrNetwork,
        };
      }), Promise.reject(new Error()));
    } catch (error) {
      // an unavailable profile is an expected outcome, everything else is worth reporting
      if (error.code !== 'PROFILE_NOT_AVAILABLE') {
        logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
      }
    }

    // every failed source collapses to null and is dropped below
    return null;
  });

  return profiles.filter(Boolean);
}
|
||||
|
||||
async function upsertProfiles(profiles) {
|
||||
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
|
||||
|
||||
@@ -403,6 +363,51 @@ async function upsertProfiles(profiles) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Fetch profiles for a single actor from every configured source.
 *
 * Each entry of `sources` is either one scraper slug or an array of slugs. The slugs of an
 * entry are tried in order, each one only running when the previous one failed, so an entry
 * contributes at most one profile.
 *
 * @param {Object} actor - actor whose `name` is searched for; its properties are spread into every result
 * @param {Array<string|string[]>} sources - scraper slugs, or arrays of slugs to try in order
 * @param {Object} networksBySlug - looked up first by scraper slug
 * @param {Object} sitesBySlug - looked up by scraper slug when no network matches
 * @returns {Promise<Object[]>} the actor merged with each profile found, plus `scraper` and `site`
 */
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
  // NOTE(review): Promise.map is not a native Promise method; presumably Bluebird's Promise is
  // imported at the top of the file - confirm.
  const profiles = await Promise.map(sources, async (source) => {
    try {
      // The chain is seeded with a rejection so that the first slug runs inside the first
      // .catch(); once a slug resolves, the remaining .catch() handlers are skipped.
      return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
        const scraper = scrapers[scraperSlug];
        const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];

        if (!scraper?.fetchProfile) {
          logger.warn(`No profile scraper available for ${scraperSlug}`);
          throw new Error(`No profile scraper available for ${scraperSlug}`);
        }

        if (!siteOrNetwork) {
          logger.warn(`No site or network found for ${scraperSlug}`);
          throw new Error(`No site or network found for ${scraperSlug}`);
        }

        logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);

        // NOTE(review): `include` is not defined in this function; presumably a module-level
        // value - confirm.
        const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);

        if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
          logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
          throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
        }

        return {
          ...actor,
          ...profile,
          scraper: scraperSlug,
          site: siteOrNetwork,
        };
      }), Promise.reject(new Error()));
    } catch (error) {
      // an unavailable profile is an expected outcome, everything else is worth reporting
      if (error.code !== 'PROFILE_NOT_AVAILABLE') {
        logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
      }
    }

    // every failed source collapses to null and is dropped below
    return null;
  });

  return profiles.filter(Boolean);
}
|
||||
|
||||
async function scrapeActors(actorNames) {
|
||||
const baseActors = toBaseActors(actorNames);
|
||||
|
||||
@@ -438,7 +443,8 @@ async function scrapeActors(actorNames) {
|
||||
{ concurrency: 10 },
|
||||
);
|
||||
|
||||
const profiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
|
||||
const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
|
||||
const profiles = curatedProfiles.filter(Boolean);
|
||||
|
||||
if (argv.inspect) {
|
||||
console.log(profiles);
|
||||
@@ -495,31 +501,25 @@ async function associateActors(releases, batchId) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const baseActorsBySlugAndNetworkId = baseActors.reduce((acc, baseActor) => ({
|
||||
const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
|
||||
...acc,
|
||||
[baseActor.slug]: {
|
||||
...acc[baseActor.slug],
|
||||
[baseActor.network.id]: baseActor,
|
||||
},
|
||||
[baseActor.slug]: baseActor,
|
||||
}), {});
|
||||
|
||||
const uniqueBaseActors = Object.values(baseActorsBySlugAndNetworkId).map(baseActorsByNetworkId => Object.values(baseActorsByNetworkId)).flat();
|
||||
const uniqueBaseActors = Object.values(baseActorsBySlug);
|
||||
|
||||
const actors = await getOrCreateActors(uniqueBaseActors, batchId);
|
||||
|
||||
const actorIdsBySlugAndNetworkId = actors.reduce((acc, actor) => ({
|
||||
const actorIdsBySlug = actors.reduce((acc, actor) => ({
|
||||
...acc,
|
||||
[actor.network_id]: {
|
||||
...acc[actor.network_id],
|
||||
[actor.slug]: actor.alias_for || actor.id,
|
||||
},
|
||||
[actor.slug]: actor.alias_for || actor.id,
|
||||
}), {});
|
||||
|
||||
const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
|
||||
.map(([releaseId, releaseActors]) => releaseActors
|
||||
.map(releaseActor => ({
|
||||
release_id: releaseId,
|
||||
actor_id: actorIdsBySlugAndNetworkId[releaseActor.network.id]?.[releaseActor.slug] || actorIdsBySlugAndNetworkId.null[releaseActor.slug],
|
||||
actor_id: actorIdsBySlug[releaseActor.slug],
|
||||
})))
|
||||
.flat();
|
||||
|
||||
|
||||
@@ -252,9 +252,13 @@ async function fetchProfile(actorName) {
|
||||
}, { encodeJSON: true });
|
||||
|
||||
if (res.ok) {
|
||||
const actor = res.body.hits.hits.find(hit => hit._source.name === actorName);
|
||||
const actor = res.body.hits.hits.find(hit => hit._source.name.toLowerCase() === actorName.toLowerCase());
|
||||
|
||||
return scrapeProfile(actor._source);
|
||||
if (actor) {
|
||||
return scrapeProfile(actor._source);
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
return res.status;
|
||||
|
||||
Reference in New Issue
Block a user