batch update actors

This commit is contained in:
SamPulsar1 2021-02-17 11:11:32 +10:30
parent 4b30398983
commit 86bc376b41
2 changed files with 147 additions and 114 deletions

View File

@ -322,6 +322,8 @@ function curateProfileEntry(profile) {
avatar_media_id: profile.avatarMediaId || null, avatar_media_id: profile.avatarMediaId || null,
}; };
if (profile.update) curatedProfileEntry.updated_at = new Date().toDateString();
return curatedProfileEntry; return curatedProfileEntry;
} }
@ -733,8 +735,9 @@ async function getActorNames(actorNames) {
SELECT * SELECT *
FROM actors_profiles FROM actors_profiles
WHERE actors_profiles.actor_id = actors.id WHERE actors_profiles.actor_id = actors.id
AND actors_profiles.updated_at <= (?) AND actors_profiles.updated_at >= (?)
) ) AND alias_for IS NULL
ORDER BY actors.name
`, [argv.actorsUpdate || new Date()]); `, [argv.actorsUpdate || new Date()]);
return actorsWithoutProfiles.rows.map(actor => actor.name); return actorsWithoutProfiles.rows.map(actor => actor.name);
@ -750,9 +753,27 @@ async function storeProfiles(profiles) {
/**
 * Scrape profiles for a list of actors, optionally in fixed-size batches.
 *
 * The names returned by getActorNames are handed to scrapeActorsBatch either
 * in consecutive slices of `argv.actorsBatch` names, or all at once when no
 * positive batch size is configured.
 *
 * @param {string[]} argNames - actor names forwarded to getActorNames
 * @returns {Promise<Array>} the profiles of every batch, in batch order
 */
async function scrapeActors(argNames) {
  const actorNames = await getActorNames(argNames);
  const profiles = [];

  // Anything that is not a positive number (undefined, NaN, 0, negatives)
  // disables batching; fractions are rounded down so no empty slices are scraped.
  const requestedBatchSize = Math.floor(Number(argv.actorsBatch));
  const batchSize = requestedBatchSize > 0 ? requestedBatchSize : 0;

  // Append one by one instead of push.apply(): spreading a very large result
  // into call arguments can exceed the engine's argument limit (RangeError).
  // NOTE(review): assumes scrapeActorsBatch resolves to an array of profiles — confirm.
  const collect = (batchProfiles) => {
    for (const profile of batchProfiles ?? []) {
      profiles.push(profile);
    }
  };

  logger.info(`Scraping profiles for ${actorNames.length} actors`);

  if (batchSize === 0) {
    collect(await scrapeActorsBatch(actorNames));
    return profiles;
  }

  // Batches run sequentially on purpose: each one is awaited before the next starts.
  for (let offset = 0; offset < actorNames.length; offset += batchSize) {
    logger.info(`Scraping profiles ${((offset / actorNames.length) * 100).toFixed(2)}%`);
    collect(await scrapeActorsBatch(actorNames.slice(offset, offset + batchSize)));
  }

  return profiles;
}
async function scrapeActorsBatch(actorNames) {
const baseActors = toBaseActors(actorNames); const baseActors = toBaseActors(actorNames);
logger.info(`Scraping profiles for ${actorNames.length} actors`); logger.info(`Actors: ${actorNames.join(', ')}`);
const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors); const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors);
const entitySlugs = sources.flat(); const entitySlugs = sources.flat();
@ -760,7 +781,7 @@ async function scrapeActors(argNames) {
const [entitiesBySlug, existingActorEntries] = await Promise.all([ const [entitiesBySlug, existingActorEntries] = await Promise.all([
fetchEntitiesBySlug(entitySlugs, 'desc'), fetchEntitiesBySlug(entitySlugs, 'desc'),
knex('actors') knex('actors')
.select(['id', 'name', 'slug', 'entry_id']) .select(['id', 'name', 'slug', 'entry_id', 'gender'])
.whereIn('slug', baseActors.map(baseActor => baseActor.slug)) .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
.whereNull('alias_for'), .whereNull('alias_for'),
]); ]);
@ -820,14 +841,14 @@ async function scrapeActors(argNames) {
async function getOrCreateActors(baseActors, batchId) { async function getOrCreateActors(baseActors, batchId) {
// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available // WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
slug: actor.slug, slug: actor.slug,
entityId: actor.entity.id, entityId: actor.entity.id,
entryId: actor.entryId, entryId: actor.entryId,
collisionLikely: getCollisionLikely(actor), collisionLikely: getCollisionLikely(actor),
})).join(', '); })).join(', ');
const existingActors = await knex const existingActors = await knex
.select('actors.*') .select('actors.*')
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
.whereRaw(` .whereRaw(`
@ -842,8 +863,8 @@ async function getOrCreateActors(baseActors, batchId) {
OR actors.entry_id = base_actors.entry_id) OR actors.entry_id = base_actors.entry_id)
`); `);
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
const existingActorSlugs = existingActors.reduce((acc, actor) => ({ const existingActorSlugs = existingActors.reduce((acc, actor) => ({
...acc, ...acc,
[actor.entity_id]: { [actor.entity_id]: {
...acc[actor.entity_id], ...acc[actor.entity_id],
@ -852,14 +873,14 @@ async function getOrCreateActors(baseActors, batchId) {
[actor.slug]: true, [actor.slug]: true,
}, },
}, },
}), {}); }), {});
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
const newActors = await bulkInsert('actors', curatedActorEntries); const newActors = await bulkInsert('actors', curatedActorEntries);
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
...acc, ...acc,
[actor.entity_id]: { [actor.entity_id]: {
...acc[actor.entity_id], ...acc[actor.entity_id],
@ -868,9 +889,9 @@ async function getOrCreateActors(baseActors, batchId) {
[actor.slug]: actor.id, [actor.slug]: actor.id,
}, },
}, },
}), {}); }), {});
const newActorProfiles = await Promise.all(baseActors const newActorProfiles = await Promise.all(baseActors
.filter(actor => actor.hasProfile) .filter(actor => actor.hasProfile)
.map(actor => ({ .map(actor => ({
...actor, ...actor,
@ -879,47 +900,47 @@ async function getOrCreateActors(baseActors, batchId) {
.filter(actor => !!actor.id) .filter(actor => !!actor.id)
.map(actor => curateProfile(actor))); .map(actor => curateProfile(actor)));
await storeProfiles(newActorProfiles); await storeProfiles(newActorProfiles);
if (Array.isArray(newActors)) { if (Array.isArray(newActors)) {
return newActors.concat(existingActors); return newActors.concat(existingActors);
} }
return existingActors; return existingActors;
} }
async function associateActors(releases, batchId) { async function associateActors(releases, batchId) {
const baseActorsByReleaseId = releases.reduce((acc, release) => { const baseActorsByReleaseId = releases.reduce((acc, release) => {
if (release.actors) { if (release.actors) {
acc[release.id] = toBaseActors(release.actors, release); acc[release.id] = toBaseActors(release.actors, release);
} }
return acc; return acc;
}, {}); }, {});
const baseActors = Object.values(baseActorsByReleaseId).flat(); const baseActors = Object.values(baseActorsByReleaseId).flat();
if (baseActors.length === 0) { if (baseActors.length === 0) {
return []; return [];
} }
const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({ const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
...acc, ...acc,
[baseActor.slug]: baseActor, [baseActor.slug]: baseActor,
}), {}); }), {});
const uniqueBaseActors = Object.values(baseActorsBySlug); const uniqueBaseActors = Object.values(baseActorsBySlug);
const actors = await getOrCreateActors(uniqueBaseActors, batchId); const actors = await getOrCreateActors(uniqueBaseActors, batchId);
/* /*
const actorIdsBySlug = actors.reduce((acc, actor) => ({ const actorIdsBySlug = actors.reduce((acc, actor) => ({
...acc, ...acc,
[actor.slug]: actor.alias_for || actor.id, [actor.slug]: actor.alias_for || actor.id,
}), {}); }), {});
*/ */
const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({ const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
...acc, ...acc,
[actor.entity_id]: { [actor.entity_id]: {
...acc[actor.entity_id], ...acc[actor.entity_id],
@ -931,9 +952,9 @@ async function associateActors(releases, batchId) {
}, },
}, },
}, },
}), {}); }), {});
const releaseActorAssociations = Object.entries(baseActorsByReleaseId) const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
.map(([releaseId, releaseActors]) => releaseActors .map(([releaseId, releaseActors]) => releaseActors
.map(releaseActor => ({ .map(releaseActor => ({
release_id: releaseId, release_id: releaseId,
@ -941,11 +962,11 @@ async function associateActors(releases, batchId) {
}))) })))
.flat(); .flat();
await bulkInsert('releases_actors', releaseActorAssociations, false); await bulkInsert('releases_actors', releaseActorAssociations, false);
logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`); logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);
return actors; return actors;
} }
async function fetchActor(actorId) { async function fetchActor(actorId) {

View File

@ -23,6 +23,13 @@ function interpretAfter(after) {
.toDate(); .toDate();
} }
/**
 * Coercion helper for the `actors-update` option.
 *
 * @param {*} after - raw option value; any falsy value means "not supplied"
 * @returns {Date|*} the current date when `after` is falsy, otherwise
 *   whatever interpretAfter returns for the value
 */
function interpretActorAfter(after) {
  return after ? interpretAfter(after) : new Date();
}
const { argv } = yargs const { argv } = yargs
.command('npm start') .command('npm start')
.option('server', { .option('server', {
@ -69,6 +76,11 @@ const { argv } = yargs
default: false, default: false,
alias: 'actor-scenes', alias: 'actor-scenes',
}) })
.option('actors-batch', {
describe: 'Bath size to scrape actors, if not seet then all are scraped in on pass',
type: 'number',
default: config?.actors?.batchSize === null ? 0 : config?.actors?.batchSize,
})
.option('actor-sources', { .option('actor-sources', {
describe: 'Use these scrapers for actor data', describe: 'Use these scrapers for actor data',
type: 'array', type: 'array',
@ -307,6 +319,6 @@ const { argv } = yargs
alias: ['delete-movie', 'remove-movies', 'remove-movies'], alias: ['delete-movie', 'remove-movies', 'remove-movies'],
}) })
.coerce('after', interpretAfter) .coerce('after', interpretAfter)
.coerce('actors-update', interpretAfter); .coerce('actors-update', interpretActorAfter);
module.exports = argv; module.exports = argv;