Changed qu's HTML element detection. Passing base actor instead of actorName to profile scrapers.

This commit is contained in:
DebaucheryLibrarian
2020-07-21 01:16:26 +02:00
parent 0e4c0d8fff
commit 939eba8e61
9 changed files with 84 additions and 16 deletions

View File

@@ -114,12 +114,15 @@ function getAverage(items) {
function toBaseActors(actorsOrNames, release) {
return actorsOrNames.map((actorOrName) => {
const name = capitalize(actorOrName.name || actorOrName);
const [baseName, entryId] = (actorOrName.name || actorOrName).split(':');
const name = capitalize(baseName);
const slug = slugify(name);
const baseActor = {
name,
slug,
entryId: entryId || null,
entity: release?.site?.network || release?.entity?.parent || release?.entity || null,
};
@@ -213,6 +216,7 @@ function curateActorEntry(baseActor, batchId) {
name: baseActor.name,
slug: baseActor.slug,
entity_id: null,
entry_id: baseActor.entry_id,
batch_id: batchId,
};
}
@@ -538,7 +542,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);
const profile = await scraper.fetchProfile(actor.name, context, include);
const profile = await scraper.fetchProfile(actor, context, include);
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
@@ -587,7 +591,7 @@ async function scrapeActors(actorNames) {
.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
.orderBy('entities.type'),
knex('actors')
.select(['id', 'name', 'slug'])
.select(['id', 'name', 'slug', 'entry_id'])
.modify((queryBuilder) => {
if (actorNames.length > 0) {
queryBuilder.whereIn('slug', baseActors.map(baseActor => baseActor.slug));
@@ -598,12 +602,22 @@ async function scrapeActors(actorNames) {
const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: entity }), {});
const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);
// Index the existing actor rows by slug and then by entry ID, so the same slug can
// exist once per entry ID. Rows without an entry ID end up under the 'null' key.
const existingActorEntriesBySlugAndEntryId = existingActorEntries.reduce((acc, actorEntry) => ({
	...acc,
	[actorEntry.slug]: {
		...acc[actorEntry.slug],
		// rows are selected with the snake_case `entry_id` column; `entryId` is kept as a
		// fallback in case the rows are camel-cased before they get here
		[actorEntry.entry_id || actorEntry.entryId || null]: actorEntry,
	},
}), {});
// A base actor is new when no existing row shares both its slug and its entry ID.
// The optional chaining covers slugs without any existing row, for which the nested
// lookup would otherwise throw a TypeError.
const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlugAndEntryId[baseActor.slug]?.[baseActor.entryId]);
const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);
const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']);
const newActorEntries = batchId && await knex('actors')
.insert(curatedActorEntries)
.returning(['id', 'name', 'slug', 'entry_id']);
const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);