batch update actors

This commit is contained in:
SamPulsar1 2021-02-17 11:11:32 +10:30
parent 4b30398983
commit 86bc376b41
2 changed files with 147 additions and 114 deletions

View File

@ -283,7 +283,7 @@ function curateActorEntries(baseActors, batchId) {
function curateProfileEntry(profile) {
if (!profile.id) {
return null;
}
}
const curatedProfileEntry = {
...(profile.update !== false && { id: profile.update }),
@ -322,6 +322,8 @@ function curateProfileEntry(profile) {
avatar_media_id: profile.avatarMediaId || null,
};
if (profile.update) curatedProfileEntry.updated_at = new Date().toDateString();
return curatedProfileEntry;
}
@ -453,7 +455,7 @@ async function curateProfile(profile, actor) {
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
return curatedProfile;
} catch (error) {
logger.error(`Failed to curate '${profile.name}': ${error.message}`);
@ -691,7 +693,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
}
logger.verbose(`Found profile for '${actor.name}' on '${label}'`);
return await curateProfile({
...actor,
...profile,
@ -727,14 +729,15 @@ async function getActorNames(actorNames) {
}
const actorsWithoutProfiles = await knex.raw(`
SELECT actors.name
FROM actors
WHERE NOT EXISTS (
SELECT *
FROM actors_profiles
WHERE actors_profiles.actor_id = actors.id
AND actors_profiles.updated_at <= (?)
)
SELECT actors.name
FROM actors
WHERE NOT EXISTS (
SELECT *
FROM actors_profiles
WHERE actors_profiles.actor_id = actors.id
AND actors_profiles.updated_at >= (?)
) AND alias_for IS NULL
ORDER BY actors.name
`, [argv.actorsUpdate || new Date()]);
return actorsWithoutProfiles.rows.map(actor => actor.name);
@ -750,9 +753,27 @@ async function storeProfiles(profiles) {
/**
 * Scrape actor profiles, optionally in batches to limit how much work is
 * in flight at once.
 * @param {string[]} argNames - Explicit actor names from the CLI; when empty,
 *   getActorNames falls back to actors with stale or missing profiles.
 * @returns {Promise<Object[]>} Combined array of scraped profiles.
 */
async function scrapeActors(argNames) {
	const actorNames = await getActorNames(argNames);
	const profiles = [];
	const batchSize = argv.actorsBatch;

	logger.info(`Scraping profiles for ${actorNames.length} actors`);

	if (batchSize > 0) {
		for (let i = 0; i < actorNames.length; i += batchSize) {
			// Progress is reported as the batch starts, based on names consumed so far
			logger.info(`Scraping profiles ${((i / actorNames.length) * 100).toFixed(2)}%`);
			profiles.push(...await scrapeActorsBatch(actorNames.slice(i, i + batchSize)));
		}
	} else {
		// No batch size configured: scrape all actors in a single pass
		profiles.push(...await scrapeActorsBatch(actorNames));
	}

	return profiles;
}
async function scrapeActorsBatch(actorNames) {
const baseActors = toBaseActors(actorNames);
logger.info(`Scraping profiles for ${actorNames.length} actors`);
logger.info(`Actors: ${actorNames.join(', ')}`);
const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors);
const entitySlugs = sources.flat();
@ -760,7 +781,7 @@ async function scrapeActors(argNames) {
const [entitiesBySlug, existingActorEntries] = await Promise.all([
fetchEntitiesBySlug(entitySlugs, 'desc'),
knex('actors')
.select(['id', 'name', 'slug', 'entry_id'])
.select(['id', 'name', 'slug', 'entry_id', 'gender'])
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
.whereNull('alias_for'),
]);
@ -820,132 +841,132 @@ async function scrapeActors(argNames) {
async function getOrCreateActors(baseActors, batchId) {
// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
slug: actor.slug,
entityId: actor.entity.id,
entryId: actor.entryId,
collisionLikely: getCollisionLikely(actor),
})).join(', ');
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
slug: actor.slug,
entityId: actor.entity.id,
entryId: actor.entryId,
collisionLikely: getCollisionLikely(actor),
})).join(', ');
const existingActors = await knex
.select('actors.*')
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
.whereRaw(`
actors.slug = base_actors.slug
AND actors.entity_id IS NULL
AND NOT base_actors.collision_likely
`)
.orWhereRaw(`
actors.slug = base_actors.slug
AND actors.entity_id = base_actors.entity_id
AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
OR actors.entry_id = base_actors.entry_id)
`);
const existingActors = await knex
.select('actors.*')
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
.whereRaw(`
actors.slug = base_actors.slug
AND actors.entity_id IS NULL
AND NOT base_actors.collision_likely
`)
.orWhereRaw(`
actors.slug = base_actors.slug
AND actors.entity_id = base_actors.entity_id
AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
OR actors.entry_id = base_actors.entry_id)
`);
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
[actor.entry_id]: {
...acc[actor.entity_id]?.[actor.entry_id],
[actor.slug]: true,
},
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
[actor.entry_id]: {
...acc[actor.entity_id]?.[actor.entry_id],
[actor.slug]: true,
},
}), {});
},
}), {});
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
const newActors = await bulkInsert('actors', curatedActorEntries);
const newActors = await bulkInsert('actors', curatedActorEntries);
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
[actor.entry_id]: {
...acc[actor.entity_id]?.[actor.entry_id],
[actor.slug]: actor.id,
},
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
[actor.entry_id]: {
...acc[actor.entity_id]?.[actor.entry_id],
[actor.slug]: actor.id,
},
}), {});
},
}), {});
const newActorProfiles = await Promise.all(baseActors
.filter(actor => actor.hasProfile)
.map(actor => ({
...actor,
id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
}))
.filter(actor => !!actor.id)
.map(actor => curateProfile(actor)));
const newActorProfiles = await Promise.all(baseActors
.filter(actor => actor.hasProfile)
.map(actor => ({
...actor,
id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
}))
.filter(actor => !!actor.id)
.map(actor => curateProfile(actor)));
await storeProfiles(newActorProfiles);
await storeProfiles(newActorProfiles);
if (Array.isArray(newActors)) {
return newActors.concat(existingActors);
}
if (Array.isArray(newActors)) {
return newActors.concat(existingActors);
}
return existingActors;
return existingActors;
}
async function associateActors(releases, batchId) {
const baseActorsByReleaseId = releases.reduce((acc, release) => {
if (release.actors) {
acc[release.id] = toBaseActors(release.actors, release);
}
return acc;
}, {});
const baseActors = Object.values(baseActorsByReleaseId).flat();
if (baseActors.length === 0) {
return [];
const baseActorsByReleaseId = releases.reduce((acc, release) => {
if (release.actors) {
acc[release.id] = toBaseActors(release.actors, release);
}
const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
...acc,
[baseActor.slug]: baseActor,
}), {});
return acc;
}, {});
const uniqueBaseActors = Object.values(baseActorsBySlug);
const baseActors = Object.values(baseActorsByReleaseId).flat();
const actors = await getOrCreateActors(uniqueBaseActors, batchId);
if (baseActors.length === 0) {
return [];
}
/*
const actorIdsBySlug = actors.reduce((acc, actor) => ({
...acc,
[actor.slug]: actor.alias_for || actor.id,
}), {});
*/
const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
...acc,
[baseActor.slug]: baseActor,
}), {});
const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
[actor.entry_id]: {
...acc[actor.entity_id]?.[actor.entry_id],
[actor.slug]: {
actor_id: actor.alias_for || actor.id,
alias_id: actor.alias_for ? actor.id : null,
},
const uniqueBaseActors = Object.values(baseActorsBySlug);
const actors = await getOrCreateActors(uniqueBaseActors, batchId);
/*
const actorIdsBySlug = actors.reduce((acc, actor) => ({
...acc,
[actor.slug]: actor.alias_for || actor.id,
}), {});
*/
const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
...acc,
[actor.entity_id]: {
...acc[actor.entity_id],
[actor.entry_id]: {
...acc[actor.entity_id]?.[actor.entry_id],
[actor.slug]: {
actor_id: actor.alias_for || actor.id,
alias_id: actor.alias_for ? actor.id : null,
},
},
}), {});
},
}), {});
const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
.map(([releaseId, releaseActors]) => releaseActors
.map(releaseActor => ({
release_id: releaseId,
...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]),
})))
.flat();
const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
.map(([releaseId, releaseActors]) => releaseActors
.map(releaseActor => ({
release_id: releaseId,
...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]),
})))
.flat();
await bulkInsert('releases_actors', releaseActorAssociations, false);
await bulkInsert('releases_actors', releaseActorAssociations, false);
logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);
logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);
return actors;
return actors;
}
async function fetchActor(actorId) {

View File

@ -23,6 +23,13 @@ function interpretAfter(after) {
.toDate();
}
/**
 * Variant of interpretAfter for actor updates: with no value supplied it
 * yields the current time, so the actors-update cutoff defaults to "now"
 * rather than a fixed horizon.
 * @param {*} after - Raw CLI value; any falsy value triggers the default.
 * @returns {Date}
 */
function interpretActorAfter(after) {
	return after ? interpretAfter(after) : new Date();
}
const { argv } = yargs
.command('npm start')
.option('server', {
@ -69,6 +76,11 @@ const { argv } = yargs
default: false,
alias: 'actor-scenes',
})
.option('actors-batch', {
describe: 'Batch size to scrape actors; if not set then all are scraped in one pass',
type: 'number',
default: config?.actors?.batchSize === null ? 0 : config?.actors?.batchSize,
})
.option('actor-sources', {
describe: 'Use these scrapers for actor data',
type: 'array',
@ -307,6 +319,6 @@ const { argv } = yargs
alias: ['delete-movie', 'remove-movies', 'remove-movies'],
})
.coerce('after', interpretAfter)
.coerce('actors-update', interpretAfter);
.coerce('actors-update', interpretActorAfter);
module.exports = argv;