batch update actors
This commit is contained in:
parent
4b30398983
commit
86bc376b41
243
src/actors.js
243
src/actors.js
|
@ -283,7 +283,7 @@ function curateActorEntries(baseActors, batchId) {
|
||||||
function curateProfileEntry(profile) {
|
function curateProfileEntry(profile) {
|
||||||
if (!profile.id) {
|
if (!profile.id) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const curatedProfileEntry = {
|
const curatedProfileEntry = {
|
||||||
...(profile.update !== false && { id: profile.update }),
|
...(profile.update !== false && { id: profile.update }),
|
||||||
|
@ -322,6 +322,8 @@ function curateProfileEntry(profile) {
|
||||||
avatar_media_id: profile.avatarMediaId || null,
|
avatar_media_id: profile.avatarMediaId || null,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (profile.update) curatedProfileEntry.updated_at = new Date().toDateString();
|
||||||
|
|
||||||
return curatedProfileEntry;
|
return curatedProfileEntry;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -727,14 +729,15 @@ async function getActorNames(actorNames) {
|
||||||
}
|
}
|
||||||
|
|
||||||
const actorsWithoutProfiles = await knex.raw(`
|
const actorsWithoutProfiles = await knex.raw(`
|
||||||
SELECT actors.name
|
SELECT actors.name
|
||||||
FROM actors
|
FROM actors
|
||||||
WHERE NOT EXISTS (
|
WHERE NOT EXISTS (
|
||||||
SELECT *
|
SELECT *
|
||||||
FROM actors_profiles
|
FROM actors_profiles
|
||||||
WHERE actors_profiles.actor_id = actors.id
|
WHERE actors_profiles.actor_id = actors.id
|
||||||
AND actors_profiles.updated_at <= (?)
|
AND actors_profiles.updated_at >= (?)
|
||||||
)
|
) AND alias_for IS NULL
|
||||||
|
ORDER BY actors.name
|
||||||
`, [argv.actorsUpdate || new Date()]);
|
`, [argv.actorsUpdate || new Date()]);
|
||||||
|
|
||||||
return actorsWithoutProfiles.rows.map(actor => actor.name);
|
return actorsWithoutProfiles.rows.map(actor => actor.name);
|
||||||
|
@ -750,9 +753,27 @@ async function storeProfiles(profiles) {
|
||||||
|
|
||||||
async function scrapeActors(argNames) {
|
async function scrapeActors(argNames) {
|
||||||
const actorNames = await getActorNames(argNames);
|
const actorNames = await getActorNames(argNames);
|
||||||
|
const profiles = [];
|
||||||
|
|
||||||
|
const batchSize = argv.actorsBatch;
|
||||||
|
logger.info(`Scraping profiles for ${actorNames.length} actors`);
|
||||||
|
|
||||||
|
if (batchSize > 0) {
|
||||||
|
for (let i=0; i < actorNames.length; i=i+batchSize) {
|
||||||
|
logger.info(`Scraping profiles ${((i/actorNames.length)*100).toFixed(2)}%`);
|
||||||
|
profiles.push.apply(profiles, await scrapeActorsBatch(actorNames.slice(i, i + batchSize)));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
profiles.push.apply(profiles, await scrapeActorsBatch(actorNames));
|
||||||
|
}
|
||||||
|
|
||||||
|
return profiles;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function scrapeActorsBatch(actorNames) {
|
||||||
const baseActors = toBaseActors(actorNames);
|
const baseActors = toBaseActors(actorNames);
|
||||||
|
|
||||||
logger.info(`Scraping profiles for ${actorNames.length} actors`);
|
logger.info(`Actors: ${actorNames.join(', ')}`);
|
||||||
|
|
||||||
const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors);
|
const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors);
|
||||||
const entitySlugs = sources.flat();
|
const entitySlugs = sources.flat();
|
||||||
|
@ -760,7 +781,7 @@ async function scrapeActors(argNames) {
|
||||||
const [entitiesBySlug, existingActorEntries] = await Promise.all([
|
const [entitiesBySlug, existingActorEntries] = await Promise.all([
|
||||||
fetchEntitiesBySlug(entitySlugs, 'desc'),
|
fetchEntitiesBySlug(entitySlugs, 'desc'),
|
||||||
knex('actors')
|
knex('actors')
|
||||||
.select(['id', 'name', 'slug', 'entry_id'])
|
.select(['id', 'name', 'slug', 'entry_id', 'gender'])
|
||||||
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
|
.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
|
||||||
.whereNull('alias_for'),
|
.whereNull('alias_for'),
|
||||||
]);
|
]);
|
||||||
|
@ -820,132 +841,132 @@ async function scrapeActors(argNames) {
|
||||||
|
|
||||||
async function getOrCreateActors(baseActors, batchId) {
|
async function getOrCreateActors(baseActors, batchId) {
|
||||||
// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
|
// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
|
||||||
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
|
const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
|
||||||
slug: actor.slug,
|
slug: actor.slug,
|
||||||
entityId: actor.entity.id,
|
entityId: actor.entity.id,
|
||||||
entryId: actor.entryId,
|
entryId: actor.entryId,
|
||||||
collisionLikely: getCollisionLikely(actor),
|
collisionLikely: getCollisionLikely(actor),
|
||||||
})).join(', ');
|
})).join(', ');
|
||||||
|
|
||||||
const existingActors = await knex
|
const existingActors = await knex
|
||||||
.select('actors.*')
|
.select('actors.*')
|
||||||
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
|
.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
|
||||||
.whereRaw(`
|
.whereRaw(`
|
||||||
actors.slug = base_actors.slug
|
actors.slug = base_actors.slug
|
||||||
AND actors.entity_id IS NULL
|
AND actors.entity_id IS NULL
|
||||||
AND NOT base_actors.collision_likely
|
AND NOT base_actors.collision_likely
|
||||||
`)
|
`)
|
||||||
.orWhereRaw(`
|
.orWhereRaw(`
|
||||||
actors.slug = base_actors.slug
|
actors.slug = base_actors.slug
|
||||||
AND actors.entity_id = base_actors.entity_id
|
AND actors.entity_id = base_actors.entity_id
|
||||||
AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
|
AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
|
||||||
OR actors.entry_id = base_actors.entry_id)
|
OR actors.entry_id = base_actors.entry_id)
|
||||||
`);
|
`);
|
||||||
|
|
||||||
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
|
||||||
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
const existingActorSlugs = existingActors.reduce((acc, actor) => ({
|
||||||
...acc,
|
...acc,
|
||||||
[actor.entity_id]: {
|
[actor.entity_id]: {
|
||||||
...acc[actor.entity_id],
|
...acc[actor.entity_id],
|
||||||
[actor.entry_id]: {
|
[actor.entry_id]: {
|
||||||
...acc[actor.entity_id]?.[actor.entry_id],
|
...acc[actor.entity_id]?.[actor.entry_id],
|
||||||
[actor.slug]: true,
|
[actor.slug]: true,
|
||||||
},
|
|
||||||
},
|
},
|
||||||
}), {});
|
},
|
||||||
|
}), {});
|
||||||
|
|
||||||
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
|
const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]);
|
||||||
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
|
||||||
|
|
||||||
const newActors = await bulkInsert('actors', curatedActorEntries);
|
const newActors = await bulkInsert('actors', curatedActorEntries);
|
||||||
|
|
||||||
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
|
const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({
|
||||||
...acc,
|
...acc,
|
||||||
[actor.entity_id]: {
|
[actor.entity_id]: {
|
||||||
...acc[actor.entity_id],
|
...acc[actor.entity_id],
|
||||||
[actor.entry_id]: {
|
[actor.entry_id]: {
|
||||||
...acc[actor.entity_id]?.[actor.entry_id],
|
...acc[actor.entity_id]?.[actor.entry_id],
|
||||||
[actor.slug]: actor.id,
|
[actor.slug]: actor.id,
|
||||||
},
|
|
||||||
},
|
},
|
||||||
}), {});
|
},
|
||||||
|
}), {});
|
||||||
|
|
||||||
const newActorProfiles = await Promise.all(baseActors
|
const newActorProfiles = await Promise.all(baseActors
|
||||||
.filter(actor => actor.hasProfile)
|
.filter(actor => actor.hasProfile)
|
||||||
.map(actor => ({
|
.map(actor => ({
|
||||||
...actor,
|
...actor,
|
||||||
id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
|
id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
|
||||||
}))
|
}))
|
||||||
.filter(actor => !!actor.id)
|
.filter(actor => !!actor.id)
|
||||||
.map(actor => curateProfile(actor)));
|
.map(actor => curateProfile(actor)));
|
||||||
|
|
||||||
await storeProfiles(newActorProfiles);
|
await storeProfiles(newActorProfiles);
|
||||||
|
|
||||||
if (Array.isArray(newActors)) {
|
if (Array.isArray(newActors)) {
|
||||||
return newActors.concat(existingActors);
|
return newActors.concat(existingActors);
|
||||||
}
|
}
|
||||||
|
|
||||||
return existingActors;
|
return existingActors;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function associateActors(releases, batchId) {
|
async function associateActors(releases, batchId) {
|
||||||
const baseActorsByReleaseId = releases.reduce((acc, release) => {
|
const baseActorsByReleaseId = releases.reduce((acc, release) => {
|
||||||
if (release.actors) {
|
if (release.actors) {
|
||||||
acc[release.id] = toBaseActors(release.actors, release);
|
acc[release.id] = toBaseActors(release.actors, release);
|
||||||
}
|
|
||||||
|
|
||||||
return acc;
|
|
||||||
}, {});
|
|
||||||
|
|
||||||
const baseActors = Object.values(baseActorsByReleaseId).flat();
|
|
||||||
|
|
||||||
if (baseActors.length === 0) {
|
|
||||||
return [];
|
|
||||||
}
|
}
|
||||||
|
|
||||||
const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
|
return acc;
|
||||||
...acc,
|
}, {});
|
||||||
[baseActor.slug]: baseActor,
|
|
||||||
}), {});
|
|
||||||
|
|
||||||
const uniqueBaseActors = Object.values(baseActorsBySlug);
|
const baseActors = Object.values(baseActorsByReleaseId).flat();
|
||||||
|
|
||||||
const actors = await getOrCreateActors(uniqueBaseActors, batchId);
|
if (baseActors.length === 0) {
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({
|
||||||
const actorIdsBySlug = actors.reduce((acc, actor) => ({
|
...acc,
|
||||||
...acc,
|
[baseActor.slug]: baseActor,
|
||||||
[actor.slug]: actor.alias_for || actor.id,
|
}), {});
|
||||||
}), {});
|
|
||||||
*/
|
|
||||||
|
|
||||||
const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
|
const uniqueBaseActors = Object.values(baseActorsBySlug);
|
||||||
...acc,
|
|
||||||
[actor.entity_id]: {
|
const actors = await getOrCreateActors(uniqueBaseActors, batchId);
|
||||||
...acc[actor.entity_id],
|
|
||||||
[actor.entry_id]: {
|
/*
|
||||||
...acc[actor.entity_id]?.[actor.entry_id],
|
const actorIdsBySlug = actors.reduce((acc, actor) => ({
|
||||||
[actor.slug]: {
|
...acc,
|
||||||
actor_id: actor.alias_for || actor.id,
|
[actor.slug]: actor.alias_for || actor.id,
|
||||||
alias_id: actor.alias_for ? actor.id : null,
|
}), {});
|
||||||
},
|
*/
|
||||||
|
|
||||||
|
const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({
|
||||||
|
...acc,
|
||||||
|
[actor.entity_id]: {
|
||||||
|
...acc[actor.entity_id],
|
||||||
|
[actor.entry_id]: {
|
||||||
|
...acc[actor.entity_id]?.[actor.entry_id],
|
||||||
|
[actor.slug]: {
|
||||||
|
actor_id: actor.alias_for || actor.id,
|
||||||
|
alias_id: actor.alias_for ? actor.id : null,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
}), {});
|
},
|
||||||
|
}), {});
|
||||||
|
|
||||||
const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
|
const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
|
||||||
.map(([releaseId, releaseActors]) => releaseActors
|
.map(([releaseId, releaseActors]) => releaseActors
|
||||||
.map(releaseActor => ({
|
.map(releaseActor => ({
|
||||||
release_id: releaseId,
|
release_id: releaseId,
|
||||||
...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]),
|
...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]),
|
||||||
})))
|
})))
|
||||||
.flat();
|
.flat();
|
||||||
|
|
||||||
await bulkInsert('releases_actors', releaseActorAssociations, false);
|
await bulkInsert('releases_actors', releaseActorAssociations, false);
|
||||||
|
|
||||||
logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);
|
logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);
|
||||||
|
|
||||||
return actors;
|
return actors;
|
||||||
}
|
}
|
||||||
|
|
||||||
async function fetchActor(actorId) {
|
async function fetchActor(actorId) {
|
||||||
|
|
14
src/argv.js
14
src/argv.js
|
@ -23,6 +23,13 @@ function interpretAfter(after) {
|
||||||
.toDate();
|
.toDate();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function interpretActorAfter(after) {
|
||||||
|
if (!after) {
|
||||||
|
return new Date();
|
||||||
|
}
|
||||||
|
return interpretAfter(after);
|
||||||
|
}
|
||||||
|
|
||||||
const { argv } = yargs
|
const { argv } = yargs
|
||||||
.command('npm start')
|
.command('npm start')
|
||||||
.option('server', {
|
.option('server', {
|
||||||
|
@ -69,6 +76,11 @@ const { argv } = yargs
|
||||||
default: false,
|
default: false,
|
||||||
alias: 'actor-scenes',
|
alias: 'actor-scenes',
|
||||||
})
|
})
|
||||||
|
.option('actors-batch', {
|
||||||
|
describe: 'Bath size to scrape actors, if not seet then all are scraped in on pass',
|
||||||
|
type: 'number',
|
||||||
|
default: config?.actors?.batchSize === null ? 0 : config?.actors?.batchSize,
|
||||||
|
})
|
||||||
.option('actor-sources', {
|
.option('actor-sources', {
|
||||||
describe: 'Use these scrapers for actor data',
|
describe: 'Use these scrapers for actor data',
|
||||||
type: 'array',
|
type: 'array',
|
||||||
|
@ -307,6 +319,6 @@ const { argv } = yargs
|
||||||
alias: ['delete-movie', 'remove-movies', 'remove-movies'],
|
alias: ['delete-movie', 'remove-movies', 'remove-movies'],
|
||||||
})
|
})
|
||||||
.coerce('after', interpretAfter)
|
.coerce('after', interpretAfter)
|
||||||
.coerce('actors-update', interpretAfter);
|
.coerce('actors-update', interpretActorAfter);
|
||||||
|
|
||||||
module.exports = argv;
|
module.exports = argv;
|
||||||
|
|
Loading…
Reference in New Issue