Switched to tabs. Adding missing actor entries when scraping actors, with batch ID.
This commit is contained in:
193
src/actors.js
193
src/actors.js
@@ -1,125 +1,156 @@
|
||||
'use strict';
|
||||
|
||||
const config = require('config');
|
||||
const Promise = require('bluebird');
|
||||
|
||||
// const logger = require('./logger')(__filename);
|
||||
const knex = require('./knex');
|
||||
const scrapers = require('./scrapers/scrapers');
|
||||
|
||||
const argv = require('./argv');
|
||||
const slugify = require('./utils/slugify');
|
||||
const capitalize = require('./utils/capitalize');
|
||||
|
||||
/**
 * Normalizes scraped actors into base actor objects.
 *
 * @param {Array<string|Object>} actorsOrNames - Actor names, or actor objects
 *   carrying a `name` property.
 * @param {Object} [release] - Release the actors were found on. When given,
 *   its `site.network` is attached to every base actor; when omitted, `network`
 *   is `undefined`.
 * @returns {Array<Object>} One object per input with `name` (passed through
 *   `capitalize`), `slug` (from `slugify`) and `network`. Object inputs keep
 *   their remaining properties.
 */
function toBaseActors(actorsOrNames, release) {
	return actorsOrNames.map((actorOrName) => {
		const name = capitalize(actorOrName.name || actorOrName);
		const slug = slugify(name);

		const baseActor = {
			name,
			slug,
			// Only `release` itself is optional; a release without a site still throws.
			network: release?.site.network,
		};

		if (actorOrName.name) {
			// The normalized name and slug override whatever the scraper supplied.
			return {
				...actorOrName,
				...baseActor,
			};
		}

		return baseActor;
	});
}
|
||||
|
||||
/**
 * Builds the row inserted into the `actors` table for one base actor.
 *
 * @param {Object} baseActor - Base actor providing `name` and `slug`; any
 *   other properties are ignored.
 * @param {*} batchId - Identifier stored in the row's `batch_id` column.
 * @returns {Object} Row with `name`, `slug`, `network_id` and `batch_id`.
 */
function curateActorEntry(baseActor, batchId) {
	return {
		name: baseActor.name,
		slug: baseActor.slug,
		// Always null here, regardless of any `network` on the base actor.
		network_id: null,
		batch_id: batchId,
	};
}
|
||||
|
||||
/**
 * Builds `actors` table rows for a list of base actors.
 *
 * @param {Array<Object>} baseActors - Base actors to convert.
 * @param {*} batchId - Identifier passed along to every row.
 * @returns {Array<Object>} One `curateActorEntry` result per base actor, in
 *   input order.
 */
function curateActorEntries(baseActors, batchId) {
	return baseActors.map(baseActor => curateActorEntry(baseActor, batchId));
}
|
||||
|
||||
/**
 * Ensures every named actor has a network-less entry in the `actors` table,
 * creating the missing ones under a freshly inserted batch.
 *
 * @param {Array<string|Object>} actorNames - Actor names (or actor objects
 *   with a `name`) as accepted by `toBaseActors`.
 * @returns {Promise<Array<Object>>} Existing entries followed by newly
 *   inserted ones, each selected/returned with `id`, `name` and `slug`.
 */
async function scrapeActors(actorNames) {
	const baseActors = toBaseActors(actorNames);

	// Source precedence: command line, then configuration, then every known actor scraper.
	const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
	const siteSlugs = sources.flat();

	const [networks, sites, existingActorEntries] = await Promise.all([
		knex('networks').whereIn('slug', siteSlugs),
		knex('sites').whereIn('slug', siteSlugs),
		knex('actors')
			.select(['id', 'name', 'slug'])
			.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
			.whereNull('network_id'),
	]);

	const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});

	// NOTE(review): the two lookups below are not read anywhere in this function yet;
	// presumably they are groundwork for the actual profile scraping - confirm or remove.
	const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: { ...network, isNetwork: true } }), {});
	const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

	const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);

	// Only open a batch when there is something to insert.
	const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
	const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);
	const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']);

	// `newActorEntries` is not an array when the insert was skipped.
	const actorEntries = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

	return actorEntries;
}
|
||||
|
||||
/**
 * Looks up the given base actors in the `actors` table and inserts entries
 * for those that are not there yet.
 *
 * An actor counts as existing when a row with its slug has a null
 * `network_id`, or when a row matches its slug and `network.id`.
 *
 * @param {Array<Object>} baseActors - Base actors with `slug` and
 *   `network.id`.
 * @param {*} batchId - Identifier handed to `curateActorEntries` for the new
 *   rows.
 * @returns {Promise<Array<Object>>} Newly inserted rows followed by the
 *   existing rows, each with `id`, `alias_for`, `name`, `slug` and
 *   `network_id`.
 */
async function getOrCreateActors(baseActors, batchId) {
	const existingActors = await knex('actors')
		.select('id', 'alias_for', 'name', 'slug', 'network_id')
		.whereIn('slug', baseActors.map(baseActor => baseActor.slug))
		.whereNull('network_id')
		.orWhereIn(['slug', 'network_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));

	// Nested lookup: network ID -> slug -> true. Rows without a network end up under the 'null' key.
	const existingActorSlugs = existingActors.reduce((acc, actor) => ({
		...acc,
		[actor.network_id]: {
			...acc[actor.network_id],
			[actor.slug]: true,
		},
	}), {});

	const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);

	if (uniqueBaseActors.length === 0) {
		// Nothing to create; don't issue an insert without rows.
		return existingActors;
	}

	const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
	const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'network_id']);

	if (Array.isArray(newActors)) {
		return newActors.concat(existingActors);
	}

	return existingActors;
}
|
||||
|
||||
/**
 * Links releases to their actors in the `releases_actors` table, creating
 * missing actor entries along the way.
 *
 * @param {Array<Object>} releases - Releases with an `id`; those with an
 *   `actors` list are passed to `toBaseActors` together with the release.
 * @param {*} batchId - Identifier handed to `getOrCreateActors` for newly
 *   created actors.
 * @returns {Promise<void>} Resolves once the associations are inserted, or
 *   immediately when no release has any actors.
 */
async function associateActors(releases, batchId) {
	const baseActorsByReleaseId = releases.reduce((acc, release) => {
		if (release.actors) {
			acc[release.id] = toBaseActors(release.actors, release);
		}

		return acc;
	}, {});

	const baseActors = Object.values(baseActorsByReleaseId).flat();

	if (baseActors.length === 0) {
		return;
	}

	// Deduplicate on slug + network ID; the last occurrence of a pair wins.
	const baseActorsBySlugAndNetworkId = baseActors.reduce((acc, baseActor) => ({
		...acc,
		[baseActor.slug]: {
			...acc[baseActor.slug],
			[baseActor.network.id]: baseActor,
		},
	}), {});

	const uniqueBaseActors = Object.values(baseActorsBySlugAndNetworkId).map(baseActorsByNetworkId => Object.values(baseActorsByNetworkId)).flat();

	const actors = await getOrCreateActors(uniqueBaseActors, batchId);

	// Nested lookup: network ID -> slug -> actor ID. Aliases resolve to the actor they point at,
	// and actors without a network end up under the 'null' key.
	const actorIdsBySlugAndNetworkId = actors.reduce((acc, actor) => ({
		...acc,
		[actor.network_id]: {
			...acc[actor.network_id],
			[actor.slug]: actor.alias_for || actor.id,
		},
	}), {});

	const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
		.map(([releaseId, releaseActors]) => releaseActors
			.map(releaseActor => ({
				// Object keys are strings, so the release ID is passed on in string form.
				release_id: releaseId,
				// Prefer the network-specific actor and fall back to the network-less one. The 'null'
				// bucket is absent when every returned actor belongs to a network, hence the `?.`.
				actor_id: actorIdsBySlugAndNetworkId[releaseActor.network.id]?.[releaseActor.slug] || actorIdsBySlugAndNetworkId.null?.[releaseActor.slug],
			})))
		.flat();

	// The rendered insert is wrapped in a raw query so ON CONFLICT DO NOTHING can be appended.
	await knex.raw(`${knex('releases_actors').insert(releaseActorAssociations).toString()} ON CONFLICT DO NOTHING;`);
}
|
||||
|
||||
module.exports = {
|
||||
associateActors,
|
||||
associateActors,
|
||||
scrapeActors,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user