batch update actors
This commit is contained in:
parent
4b30398983
commit
86bc376b41
247
src/actors.js
247
src/actors.js
|
@ -283,7 +283,7 @@ function curateActorEntries(baseActors, batchId) {
|
|||
function curateProfileEntry(profile) {
|
||||
if (!profile.id) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
const curatedProfileEntry = {
|
||||
...(profile.update !== false && { id: profile.update }),
|
||||
|
@ -322,6 +322,8 @@ function curateProfileEntry(profile) {
|
|||
avatar_media_id: profile.avatarMediaId || null,
|
||||
};
|
||||
|
||||
if (profile.update) curatedProfileEntry.updated_at = new Date().toDateString();
|
||||
|
||||
return curatedProfileEntry;
|
||||
}
|
||||
|
||||
|
@ -453,7 +455,7 @@ async function curateProfile(profile, actor) {
|
|||
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`);
|
||||
if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`);
|
||||
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`);
|
||||
|
||||
|
||||
return curatedProfile;
|
||||
} catch (error) {
|
||||
logger.error(`Failed to curate '${profile.name}': ${error.message}`);
|
||||
|
@ -691,7 +693,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
|
|||
}
|
||||
|
||||
logger.verbose(`Found profile for '${actor.name}' on '${label}'`);
|
||||
|
||||
|
||||
return await curateProfile({
|
||||
...actor,
|
||||
...profile,
|
||||
|
@ -727,14 +729,15 @@ async function getActorNames(actorNames) {
|
|||
}
|
||||
|
||||
const actorsWithoutProfiles = await knex.raw(`
|
||||
SELECT actors.name
|
||||
FROM actors
|
||||
WHERE NOT EXISTS (
|
||||
SELECT *
|
||||
FROM actors_profiles
|
||||
WHERE actors_profiles.actor_id = actors.id
|
||||
AND actors_profiles.updated_at <= (?)
|
||||
)
|
||||
SELECT actors.name
|
||||
FROM actors
|
||||
WHERE NOT EXISTS (
|
||||
SELECT *
|
||||
FROM actors_profiles
|
||||
WHERE actors_profiles.actor_id = actors.id
|
||||
AND actors_profiles.updated_at >= (?)
|
||||
) AND alias_for IS NULL
|
||||
ORDER BY actors.name
|
||||
`, [argv.actorsUpdate || new Date()]);
|
||||
|
||||
return actorsWithoutProfiles.rows.map(actor => actor.name);
|
||||
|
@ -750,9 +753,27 @@ async function storeProfiles(profiles) {
|
|||
|
||||
/**
 * Scrape profiles for the requested actors, optionally in fixed-size batches.
 *
 * The names are resolved through getActorNames(). When argv.actorsBatch is a
 * positive number the names are handed to scrapeActorsBatch() one slice at a
 * time, otherwise all of them are passed in a single call.
 *
 * @param {Array} argNames - handed to getActorNames() unchanged
 * @returns {Promise<Array>} everything returned by the scrapeActorsBatch() calls, concatenated in batch order
 */
async function scrapeActors(argNames) {
	const actorNames = await getActorNames(argNames);
	const profiles = [];

	// Normalise the batch size: a numeric string would turn the offset arithmetic into string
	// concatenation, and a fractional size below 1 would produce empty batches.
	// Anything that is not a positive number means "single pass".
	const requestedBatchSize = Math.floor(Number(argv.actorsBatch));
	const batchSize = requestedBatchSize > 0 ? requestedBatchSize : 0;

	// Append one element at a time: push.apply() and spread pass every element as a call
	// argument, which throws a RangeError once a result set grows large enough.
	const collect = (batchProfiles) => {
		for (const profile of batchProfiles ?? []) {
			profiles.push(profile);
		}
	};

	logger.info(`Scraping profiles for ${actorNames.length} actors`);

	if (batchSize === 0) {
		collect(await scrapeActorsBatch(actorNames));

		return profiles;
	}

	// Batches are awaited one after another on purpose; running them concurrently would defeat batching.
	for (let offset = 0; offset < actorNames.length; offset += batchSize) {
		logger.info(`Scraping profiles ${((offset / actorNames.length) * 100).toFixed(2)}%`);

		collect(await scrapeActorsBatch(actorNames.slice(offset, offset + batchSize)));
	}

	return profiles;
}
|
||||
|
||||
async function scrapeActorsBatch(actorNames) {
|
||||
const baseActors = toBaseActors(actorNames);
|
||||
|
||||
logger.info(`Scraping profiles for ${actorNames.length} actors`);
|
||||
logger.info(`Actors: ${actorNames.join(', ')}`);
|
||||
|
||||
const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors);
|
||||
const entitySlugs = sources.flat();
|
||||
|
@ -760,7 +781,7 @@ async function scrapeActors(argNames) {
|
|||
const [entitiesBySlug, existingActorEntries] = await Promise.all([
|
||||
fetchEntitiesBySlug(entitySlugs, 'desc'),
|
||||
knex('actors')
|
||||
.select(['id', 'name', 'slug', 'entry_id'])
|
||||
.select(['id', 'name', 'slug', 'entry_id', 'gender'])
|
||||
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
|
||||
.whereNull('alias_for'),
|
||||
]);
|
||||
|
@ -820,132 +841,132 @@ async function scrapeActors(argNames) {
|
|||
|
||||
/**
 * Look up the actors that already exist for the given base actors and insert the missing ones.
 *
 * An existing actor matches a base actor when the slugs are equal and either the stored actor
 * has no entity and the base actor is not flagged by getCollisionLikely(), or entity and entry
 * ID are both equal (two NULL entry IDs count as equal). Base actors without a match are passed
 * through curateActorEntries() and bulk-inserted. Newly inserted actors whose base actor has
 * `hasProfile` set are additionally run through curateProfile() and storeProfiles().
 *
 * @param {Array<Object>} baseActors - base actors; reads slug, entity.id, entryId and hasProfile
 * @param {*} batchId - forwarded to curateActorEntries()
 * @returns {Promise<Array<Object>>} newly inserted actor rows followed by the existing ones
 */
async function getOrCreateActors(baseActors, batchId) {
	// An empty VALUES list is not valid SQL, and there is nothing to look up or create anyway.
	if (baseActors.length === 0) {
		return [];
	}

	// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
	const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
		slug: actor.slug,
		entityId: actor.entity.id,
		entryId: actor.entryId,
		collisionLikely: getCollisionLikely(actor),
	})).join(', ');

	const existingActors = await knex
		.select('actors.*')
		.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
		.whereRaw(`
			actors.slug = base_actors.slug
			AND actors.entity_id IS NULL
			AND NOT base_actors.collision_likely
		`)
		.orWhereRaw(`
			actors.slug = base_actors.slug
			AND actors.entity_id = base_actors.entity_id
			AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
			OR actors.entry_id = base_actors.entry_id)
		`);

	// Nested lookup entity ID -> entry ID -> slug; NULL IDs end up under the string key 'null'.
	const existingActorSlugs = existingActors.reduce((acc, actor) => ({
		...acc,
		[actor.entity_id]: {
			...acc[actor.entity_id],
			[actor.entry_id]: {
				...acc[actor.entity_id]?.[actor.entry_id],
				[actor.slug]: true,
			},
		},
	}), {});

	// A base actor is new when neither an entity-specific nor an entity-less actor carries its slug.
	const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
	const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);

	const newActors = await bulkInsert('actors', curatedActorEntries);

	// The Array.isArray() check before returning shows bulkInsert() is not trusted to return an
	// array, so normalise the result before reducing over it.
	const insertedActors = Array.isArray(newActors) ? newActors : [];

	const newActorIdsByEntityIdEntryIdAndSlug = insertedActors.reduce((acc, actor) => ({
		...acc,
		[actor.entity_id]: {
			...acc[actor.entity_id],
			[actor.entry_id]: {
				...acc[actor.entity_id]?.[actor.entry_id],
				[actor.slug]: actor.id,
			},
		},
	}), {});

	// Only base actors that carry profile data and were inserted just now get a profile stored.
	const newActorProfiles = await Promise.all(baseActors
		.filter(actor => actor.hasProfile)
		.map(actor => ({
			...actor,
			id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
		}))
		.filter(actor => !!actor.id)
		.map(actor => curateProfile(actor)));

	await storeProfiles(newActorProfiles);

	if (Array.isArray(newActors)) {
		return newActors.concat(existingActors);
	}

	return existingActors;
}
|
||||
|
||||
/**
 * Link the actors listed on the given releases to those releases.
 *
 * Base actors are derived per release with toBaseActors(), de-duplicated by slug (the last
 * occurrence wins) and resolved to actor rows through getOrCreateActors(). Every release actor
 * that can be resolved yields one `releases_actors` row. An aliased actor is stored under the
 * actor it is an alias for, with its own ID kept as `alias_id`.
 *
 * @param {Array<Object>} releases - releases; reads id and actors
 * @param {*} batchId - forwarded to getOrCreateActors()
 * @returns {Promise<Array<Object>>} the actor rows returned by getOrCreateActors(), or [] when no release lists any actor
 */
async function associateActors(releases, batchId) {
	const baseActorsByReleaseId = releases.reduce((acc, release) => {
		if (release.actors) {
			acc[release.id] = toBaseActors(release.actors, release);
		}

		return acc;
	}, {});

	const baseActors = Object.values(baseActorsByReleaseId).flat();

	if (baseActors.length === 0) {
		return [];
	}

	const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
		...acc,
		[baseActor.slug]: baseActor,
	}), {});

	const uniqueBaseActors = Object.values(baseActorsBySlug);

	const actors = await getOrCreateActors(uniqueBaseActors, batchId);

	// Nested lookup entity ID -> entry ID -> slug; NULL IDs end up under the string key 'null'.
	const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
		...acc,
		[actor.entity_id]: {
			...acc[actor.entity_id],
			[actor.entry_id]: {
				...acc[actor.entity_id]?.[actor.entry_id],
				[actor.slug]: {
					actor_id: actor.alias_for || actor.id,
					alias_id: actor.alias_for ? actor.id : null,
				},
			},
		},
	}), {});

	const resolvedAssociations = Object.entries(baseActorsByReleaseId)
		.map(([releaseId, releaseActors]) => releaseActors
			.map(releaseActor => ({
				release_id: releaseId,
				// Prefer the entity-specific actor and fall back to the entity-less one. The fallback
				// is optionally chained because the 'null' branch only exists when at least one
				// entity-less actor was returned.
				...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null?.null?.[releaseActor.slug]),
			})))
		.flat();

	// A row without an actor links nothing, so it is dropped rather than inserted.
	// NOTE(review): unresolved actors can occur because base actors are de-duplicated by slug
	// across entities — confirm skipping is preferred over failing the whole batch.
	const releaseActorAssociations = resolvedAssociations.filter(association => association.actor_id != null);

	if (releaseActorAssociations.length < resolvedAssociations.length) {
		logger.warn(`Could not resolve ${resolvedAssociations.length - releaseActorAssociations.length} release actors, skipping their associations`);
	}

	await bulkInsert('releases_actors', releaseActorAssociations, false);

	logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);

	return actors;
}
|
||||
|
||||
async function fetchActor(actorId) {
|
||||
|
|
14
src/argv.js
14
src/argv.js
|
@ -23,6 +23,13 @@ function interpretAfter(after) {
|
|||
.toDate();
|
||||
}
|
||||
|
||||
/**
 * Coerce the `actors-update` argument.
 *
 * @param {*} after - raw argument value
 * @returns {*} the current date when no value was given, otherwise whatever interpretAfter() returns for it
 */
function interpretActorAfter(after) {
	return after ? interpretAfter(after) : new Date();
}
|
||||
|
||||
const { argv } = yargs
|
||||
.command('npm start')
|
||||
.option('server', {
|
||||
|
@ -69,6 +76,11 @@ const { argv } = yargs
|
|||
default: false,
|
||||
alias: 'actor-scenes',
|
||||
})
|
||||
.option('actors-batch', {
|
||||
describe: 'Bath size to scrape actors, if not seet then all are scraped in on pass',
|
||||
type: 'number',
|
||||
default: config?.actors?.batchSize === null ? 0 : config?.actors?.batchSize,
|
||||
})
|
||||
.option('actor-sources', {
|
||||
describe: 'Use these scrapers for actor data',
|
||||
type: 'array',
|
||||
|
@ -307,6 +319,6 @@ const { argv } = yargs
|
|||
alias: ['delete-movie', 'remove-movies', 'remove-movies'],
|
||||
})
|
||||
.coerce('after', interpretAfter)
|
||||
.coerce('actors-update', interpretAfter);
|
||||
.coerce('actors-update', interpretActorAfter);
|
||||
|
||||
module.exports = argv;
|
||||
|
|
Loading…
Reference in New Issue