From 86bc376b412b45f43cac9167f0e008249e9f6ade Mon Sep 17 00:00:00 2001 From: SamPulsar1 Date: Wed, 17 Feb 2021 11:11:32 +1030 Subject: [PATCH] batch update actors --- src/actors.js | 247 +++++++++++++++++++++++++++----------------------- src/argv.js | 14 ++- 2 files changed, 147 insertions(+), 114 deletions(-) diff --git a/src/actors.js b/src/actors.js index 3f5146cc..56e4cb12 100644 --- a/src/actors.js +++ b/src/actors.js @@ -283,7 +283,7 @@ function curateActorEntries(baseActors, batchId) { function curateProfileEntry(profile) { if (!profile.id) { return null; - } + } const curatedProfileEntry = { ...(profile.update !== false && { id: profile.update }), @@ -322,6 +322,8 @@ function curateProfileEntry(profile) { avatar_media_id: profile.avatarMediaId || null, }; + if (profile.update) curatedProfileEntry.updated_at = new Date().toDateString(); + return curatedProfileEntry; } @@ -453,7 +455,7 @@ async function curateProfile(profile, actor) { if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`); if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`); if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`); - + return curatedProfile; } catch (error) { logger.error(`Failed to curate '${profile.name}': ${error.message}`); @@ -691,7 +693,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy } logger.verbose(`Found profile for '${actor.name}' on '${label}'`); - + return await curateProfile({ ...actor, ...profile, @@ -727,14 +729,15 @@ async function getActorNames(actorNames) { } const actorsWithoutProfiles = await knex.raw(` - SELECT actors.name - FROM actors - WHERE NOT EXISTS ( - SELECT * - FROM actors_profiles - 
WHERE actors_profiles.actor_id = actors.id - AND actors_profiles.updated_at <= (?) - ) + SELECT actors.name + FROM actors + WHERE NOT EXISTS ( + SELECT * + FROM actors_profiles + WHERE actors_profiles.actor_id = actors.id + AND actors_profiles.updated_at >= (?) + ) AND alias_for IS NULL + ORDER BY actors.name `, [argv.actorsUpdate || new Date()]); return actorsWithoutProfiles.rows.map(actor => actor.name); @@ -750,9 +753,27 @@ async function storeProfiles(profiles) { async function scrapeActors(argNames) { const actorNames = await getActorNames(argNames); + const profiles = []; + + const batchSize = argv.actorsBatch; + logger.info(`Scraping profiles for ${actorNames.length} actors`); + + if (batchSize > 0) { + for (let i=0; i < actorNames.length; i=i+batchSize) { + logger.info(`Scraping profiles ${((i/actorNames.length)*100).toFixed(2)}%`); + profiles.push.apply(profiles, await scrapeActorsBatch(actorNames.slice(i, i + batchSize))); + } + } else { + profiles.push.apply(profiles, await scrapeActorsBatch(actorNames)); + } + + return profiles; +} + +async function scrapeActorsBatch(actorNames) { const baseActors = toBaseActors(actorNames); - logger.info(`Scraping profiles for ${actorNames.length} actors`); + logger.info(`Actors: ${actorNames.join(', ')}`); const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors); const entitySlugs = sources.flat(); @@ -760,7 +781,7 @@ async function scrapeActors(argNames) { const [entitiesBySlug, existingActorEntries] = await Promise.all([ fetchEntitiesBySlug(entitySlugs, 'desc'), knex('actors') - .select(['id', 'name', 'slug', 'entry_id']) + .select(['id', 'name', 'slug', 'entry_id', 'gender']) .whereIn('slug', baseActors.map(baseActor => baseActor.slug)) .whereNull('alias_for'), ]); @@ -820,132 +841,132 @@ async function scrapeActors(argNames) { async function getOrCreateActors(baseActors, batchId) { // WHERE IN causes stack depth error and performance issues with a large amount of values, no knex 
VALUES helper available - const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { - slug: actor.slug, - entityId: actor.entity.id, - entryId: actor.entryId, - collisionLikely: getCollisionLikely(actor), - })).join(', '); +const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { + slug: actor.slug, + entityId: actor.entity.id, + entryId: actor.entryId, + collisionLikely: getCollisionLikely(actor), +})).join(', '); - const existingActors = await knex - .select('actors.*') - .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) - .whereRaw(` - actors.slug = base_actors.slug - AND actors.entity_id IS NULL - AND NOT base_actors.collision_likely - `) - .orWhereRaw(` - actors.slug = base_actors.slug - AND actors.entity_id = base_actors.entity_id - AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL) - OR actors.entry_id = base_actors.entry_id) - `); +const existingActors = await knex + .select('actors.*') + .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) + .whereRaw(` + actors.slug = base_actors.slug + AND actors.entity_id IS NULL + AND NOT base_actors.collision_likely + `) + .orWhereRaw(` + actors.slug = base_actors.slug + AND actors.entity_id = base_actors.entity_id + AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL) + OR actors.entry_id = base_actors.entry_id) + `); - // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); - const existingActorSlugs = existingActors.reduce((acc, actor) => ({ - ...acc, - [actor.entity_id]: { - ...acc[actor.entity_id], - [actor.entry_id]: { - ...acc[actor.entity_id]?.[actor.entry_id], - [actor.slug]: true, - }, +// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); +const existingActorSlugs = existingActors.reduce((acc, actor) => ({ + ...acc, + 
[actor.entity_id]: { + ...acc[actor.entity_id], + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: true, }, - }), {}); + }, +}), {}); - const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); - const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); +const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); +const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); - const newActors = await bulkInsert('actors', curatedActorEntries); +const newActors = await bulkInsert('actors', curatedActorEntries); - const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ - ...acc, - [actor.entity_id]: { - ...acc[actor.entity_id], - [actor.entry_id]: { - ...acc[actor.entity_id]?.[actor.entry_id], - [actor.slug]: actor.id, - }, +const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ + ...acc, + [actor.entity_id]: { + ...acc[actor.entity_id], + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: actor.id, }, - }), {}); + }, +}), {}); - const newActorProfiles = await Promise.all(baseActors - .filter(actor => actor.hasProfile) - .map(actor => ({ - ...actor, - id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], - })) - .filter(actor => !!actor.id) - .map(actor => curateProfile(actor))); +const newActorProfiles = await Promise.all(baseActors + .filter(actor => actor.hasProfile) + .map(actor => ({ + ...actor, + id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], + })) + .filter(actor 
=> !!actor.id) + .map(actor => curateProfile(actor))); - await storeProfiles(newActorProfiles); +await storeProfiles(newActorProfiles); - if (Array.isArray(newActors)) { - return newActors.concat(existingActors); - } +if (Array.isArray(newActors)) { + return newActors.concat(existingActors); +} - return existingActors; +return existingActors; } async function associateActors(releases, batchId) { - const baseActorsByReleaseId = releases.reduce((acc, release) => { - if (release.actors) { - acc[release.id] = toBaseActors(release.actors, release); - } - - return acc; - }, {}); - - const baseActors = Object.values(baseActorsByReleaseId).flat(); - - if (baseActors.length === 0) { - return []; +const baseActorsByReleaseId = releases.reduce((acc, release) => { + if (release.actors) { + acc[release.id] = toBaseActors(release.actors, release); } - const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({ - ...acc, - [baseActor.slug]: baseActor, - }), {}); + return acc; +}, {}); - const uniqueBaseActors = Object.values(baseActorsBySlug); +const baseActors = Object.values(baseActorsByReleaseId).flat(); - const actors = await getOrCreateActors(uniqueBaseActors, batchId); +if (baseActors.length === 0) { + return []; +} - /* - const actorIdsBySlug = actors.reduce((acc, actor) => ({ - ...acc, - [actor.slug]: actor.alias_for || actor.id, - }), {}); - */ +const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({ + ...acc, + [baseActor.slug]: baseActor, +}), {}); - const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({ - ...acc, - [actor.entity_id]: { - ...acc[actor.entity_id], - [actor.entry_id]: { - ...acc[actor.entity_id]?.[actor.entry_id], - [actor.slug]: { - actor_id: actor.alias_for || actor.id, - alias_id: actor.alias_for ? 
actor.id : null, - }, +const uniqueBaseActors = Object.values(baseActorsBySlug); + +const actors = await getOrCreateActors(uniqueBaseActors, batchId); + +/* +const actorIdsBySlug = actors.reduce((acc, actor) => ({ + ...acc, + [actor.slug]: actor.alias_for || actor.id, +}), {}); +*/ + +const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({ + ...acc, + [actor.entity_id]: { + ...acc[actor.entity_id], + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: { + actor_id: actor.alias_for || actor.id, + alias_id: actor.alias_for ? actor.id : null, }, }, - }), {}); + }, +}), {}); - const releaseActorAssociations = Object.entries(baseActorsByReleaseId) - .map(([releaseId, releaseActors]) => releaseActors - .map(releaseActor => ({ - release_id: releaseId, - ...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]), - }))) - .flat(); +const releaseActorAssociations = Object.entries(baseActorsByReleaseId) + .map(([releaseId, releaseActors]) => releaseActors + .map(releaseActor => ({ + release_id: releaseId, + ...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]), + }))) + .flat(); - await bulkInsert('releases_actors', releaseActorAssociations, false); +await bulkInsert('releases_actors', releaseActorAssociations, false); - logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`); +logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`); - return actors; +return actors; } async function fetchActor(actorId) { diff --git a/src/argv.js b/src/argv.js index 1dda020f..bf2c9205 100644 --- a/src/argv.js +++ b/src/argv.js @@ -23,6 +23,13 @@ function interpretAfter(after) { .toDate(); } +function interpretActorAfter(after) { + 
if (!after) { + return new Date(); + } + return interpretAfter(after); +} + const { argv } = yargs .command('npm start') .option('server', { @@ -69,6 +76,11 @@ const { argv } = yargs default: false, alias: 'actor-scenes', }) + .option('actors-batch', { + describe: 'Batch size to scrape actors, if not set then all are scraped in one pass', + type: 'number', + default: config?.actors?.batchSize === null ? 0 : config?.actors?.batchSize, + }) .option('actor-sources', { describe: 'Use these scrapers for actor data', type: 'array', @@ -307,6 +319,6 @@ const { argv } = yargs alias: ['delete-movie', 'remove-movies', 'remove-movies'], }) .coerce('after', interpretAfter) - .coerce('actors-update', interpretAfter); + .coerce('actors-update', interpretActorAfter); module.exports = argv;