Refactored Aziani scraper. Improved actor profile update logic.

This commit is contained in:
DebaucheryLibrarian
2024-11-24 06:10:21 +01:00
parent 909dc36569
commit fbfd52e831
7 changed files with 354 additions and 163 deletions

View File

@@ -168,7 +168,7 @@ function toBaseActors(actorsOrNames, release) {
// using top level parent widens the scope too much, e.g. different Gamma sites may not use the same actor database
// const entity = getRecursiveParent(release?.entity);
const entity = (release?.entity?.indepdendent && release?.entity)
const entity = (release?.entity?.independent && release?.entity)
|| release?.entity?.parent
|| release?.entity
|| null;
@@ -308,7 +308,7 @@ function curateProfileEntry(profile) {
}
const curatedProfileEntry = {
...(profile.update !== false && { id: profile.update }),
...(typeof profile.update === 'number' && { id: profile.update }),
actor_id: profile.actorId,
entity_id: profile.entity?.id || null,
date_of_birth: profile.dateOfBirth,
@@ -552,21 +552,34 @@ async function upsertProfiles(profiles) {
const newProfileEntries = profiles.filter((profile) => !profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean);
const updatingProfileEntries = profiles.filter((profile) => profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean);
const newProfiles = await insertProfiles(newProfileEntries);
const newProfiles = newProfileEntries.length > 0
? await insertProfiles(newProfileEntries)
: [];
if (argv.force && updatingProfileEntries.length > 0) {
const transaction = await knex.transaction();
const queries = updatingProfileEntries.map((profileEntry) => knex('actors_profiles')
.where('id', profileEntry.id)
const queries = updatingProfileEntries.map(async (profileEntry) => knex('actors_profiles')
.modify((builder) => {
if (profileEntry.id) {
builder.where('id', profileEntry.id);
} else {
builder
.where('actor_id', profileEntry.actor_id)
.where('entity_id', profileEntry.entity_id);
}
})
.update(profileEntry)
.returning(['id', 'actor_id'])
.transacting(transaction));
await Promise.all(queries)
.then(transaction.commit)
.catch(transaction.rollback);
.catch((error) => {
logger.error(error.message);
return transaction.rollback();
});
logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`);
logger.info(`Updated ${updatingProfileEntries.length} actor profiles`);
}
if (profiles.length > 0) {
@@ -586,10 +599,12 @@ async function upsertProfiles(profiles) {
media_id: profile.avatarMediaId,
}));
await knex('actors_avatars')
.insert(avatars)
.onConflict()
.ignore();
if (avatars.length > 0) {
await knex('actors_avatars')
.insert(avatars)
.onConflict()
.ignore();
}
}
}
@@ -599,7 +614,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
const profiles = Promise.map(validSources, async (source) => {
try {
// config may group sources to try until success
return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
return [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
try {
const entity = entitiesBySlug[scraperSlug] || null;
@@ -846,6 +861,13 @@ async function getOrCreateActors(baseActors, batchId) {
OR actors.entry_id = base_actors.entry_id)
`);
const actorIds = existingActors.map((actor) => actor.id);
const entityIds = Array.from(new Set(baseActors.map((actor) => actor.entity?.id).filter(Boolean)));
const existingProfiles = await knex('actors_profiles')
.whereIn('actor_id', actorIds)
.whereIn('entity_id', entityIds);
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
...acc,
@@ -863,7 +885,7 @@ async function getOrCreateActors(baseActors, batchId) {
const newActors = await bulkInsert('actors', curatedActorEntries);
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
const actorIdsByEntityIdEntryIdAndSlug = [...existingActors, ...newActors].reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
@@ -874,13 +896,26 @@ async function getOrCreateActors(baseActors, batchId) {
},
}), {});
const profileIdsByActorIdAndEntityId = existingProfiles.reduce((acc, profile) => ({
...acc,
[profile.actor_id]: {
...acc[profile.actor_id],
[profile.entity_id]: profile.id,
},
}), {});
const newActorProfiles = await Promise.all(baseActors
.filter((actor) => actor.hasProfile)
.map((actor) => ({
...actor,
actorId: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
}))
.filter((actor) => !!actor.id)
.map((actor) => {
const actorId = actorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || actorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug];
return {
...actor,
actorId,
update: profileIdsByActorIdAndEntityId[actorId]?.[actor.entity?.id],
};
})
.filter((actor) => !!actor.actorId)
.map((actor) => curateProfile(actor)));
await storeProfiles(newActorProfiles);