traxxx/src/store-releases.js

259 lines
8.9 KiB
JavaScript
Raw Normal View History

'use strict';
const config = require('config');
const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const slugify = require('./utils/slugify');
const { formatDate } = require('./utils/qu');
const { associateActors, scrapeActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
const { curateEntity } = require('./entities');
const { associateReleaseMedia } = require('./media');
/**
 * Map a scraped release onto a column-keyed entry for the `releases` table.
 *
 * @param {Object} release - Release with an `entity` (`id`, `slug`) attached.
 * @param {*} batchId - Batch identifier, always written to `updated_batch_id`.
 * @param {Object} [existingRelease] - Stored counterpart, if any.
 * @returns {Object} Entry ready for insertion; `created_batch_id` is only set
 *   when there is neither an existing release nor a `release.id`.
 */
function curateReleaseEntry(release, batchId, existingRelease) {
	// slug source, in order of preference: title, entity slug + actor names, entity slug + date
	const slugBase = release.title
		|| (release.actors?.length && `${release.entity.slug} ${release.actors.map(actor => actor.name).join(' ')}`)
		|| (release.date && `${release.entity.slug} ${formatDate(release.date, 'YYYY MM DD')}`)
		|| null;

	const slug = slugify(slugBase, '-', {
		encode: true,
		limit: config.titleSlugLength,
	});

	const curatedRelease = {
		title: release.title,
		entry_id: release.entryId || null,
		entity_id: release.entity.id,
		studio_id: release.studio?.id || null,
		shoot_id: release.shootId || null,
		url: release.url,
		// anything that does not convert to a non-zero number (e.g. an invalid Date) is stored as null
		date: Number(release.date) ? release.date : null,
		date_precision: release.datePrecision,
		slug,
		description: release.description,
		duration: release.duration,
		type: release.type,
		comment: release.comment,
		// director: release.director,
		// likes: release.rating && release.rating.likes,
		// dislikes: release.rating && release.rating.dislikes,
		// rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
		deep: typeof release.deep === 'boolean' ? release.deep : false,
		deep_url: release.deepUrl,
		updated_batch_id: batchId,
	};

	// only brand new releases are stamped with the batch that created them
	if (!existingRelease && !release.id) {
		curatedRelease.created_batch_id = batchId;
	}

	return curatedRelease;
}
/**
 * Resolve `release.channel` slugs to channel entities and attach them as `release.entity`.
 *
 * Channel slugs are collected from releases that have a `channel` but no entity,
 * or only a network-type entity. Any release whose `channel` matches one of the
 * fetched channels gets that (curated) channel as its entity. Releases that end up
 * without any entity are logged and dropped.
 *
 * @param {Object[]} releases - Scraped releases.
 * @returns {Promise<Object[]>} Releases that have an entity, in their original order.
 */
async function attachChannelEntities(releases) {
	const releasesWithoutEntity = releases.filter(release => release.channel && (!release.entity || release.entity.type === 'network'));
	const channelSlugs = [...new Set(releasesWithoutEntity.map(release => release.channel))];

	// nothing to look up, skip the database round trip
	const channelEntities = channelSlugs.length > 0
		? await knex('entities')
			.select(knex.raw('entities.*, row_to_json(parents) as parent'))
			.whereIn('entities.slug', channelSlugs)
			.where('entities.type', 'channel')
			.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id')
		: [];

	// a Map keeps the lookup linear and cannot be fooled by slugs like 'constructor'
	const channelEntitiesBySlug = new Map(channelEntities.map(entity => [entity.slug, entity]));

	const releasesWithChannelEntity = await Promise.all(releases.map(async (release) => {
		const channelEntity = release.channel && channelEntitiesBySlug.get(release.channel);

		if (channelEntity) {
			const curatedEntity = await curateEntity(channelEntity);

			return {
				...release,
				entity: curatedEntity,
			};
		}

		if (release.entity) {
			return release;
		}

		logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);

		return null;
	}));

	return releasesWithChannelEntity.filter(Boolean);
}
/**
 * Replace each release's studio slug with the matching studio entity.
 *
 * Releases whose studio cannot be matched keep their original `studio` value
 * and a warning is logged; releases without a studio pass through untouched.
 *
 * @param {Object[]} releases - Releases, optionally carrying a `studio` slug.
 * @returns {Promise<Object[]>} New array of releases, in the same order.
 */
async function attachStudios(releases) {
	const studioSlugs = [...new Set(releases.map(release => release.studio).filter(Boolean))];

	// no release names a studio, skip the database round trip
	if (studioSlugs.length === 0) {
		return [...releases];
	}

	const studios = await knex('entities')
		.whereIn('slug', studioSlugs)
		.where('type', 'studio');

	// a Map keeps the lookup linear and cannot be fooled by slugs like 'constructor'
	const studioBySlug = new Map(studios.map(studio => [studio.slug, studio]));

	return releases.map((release) => {
		if (!release.studio) {
			return release;
		}

		const studio = studioBySlug.get(release.studio);

		if (studio) {
			return {
				...release,
				studio,
			};
		}

		logger.warn(`Unable to match studio '${release.studio}' for ${release.url}`);

		return release;
	});
}
/**
 * Copy database IDs onto scraped releases by matching entity ID + entry ID.
 *
 * @param {Object[]} releases - Releases with `entity.id` and `entryId`.
 * @param {Object[]} storedReleases - Rows with `id`, `entity_id` and `entry_id`.
 * @returns {Object[]} Copies of the releases with an `id` property; `id` is
 *   undefined when no stored row matches.
 */
function attachReleaseIds(releases, storedReleases) {
	// prototype-less dictionaries, so entry IDs like 'constructor' never resolve to inherited members
	const storedReleaseIdsByEntityIdAndEntryId = Object.create(null);

	for (const storedRelease of storedReleases) {
		if (!storedReleaseIdsByEntityIdAndEntryId[storedRelease.entity_id]) {
			storedReleaseIdsByEntityIdAndEntryId[storedRelease.entity_id] = Object.create(null);
		}

		storedReleaseIdsByEntityIdAndEntryId[storedRelease.entity_id][storedRelease.entry_id] = storedRelease.id;
	}

	return releases.map(release => ({
		...release,
		// an entity without any stored rows must not throw, the release simply stays without ID
		id: storedReleaseIdsByEntityIdAndEntryId[release.entity.id]?.[release.entryId],
	}));
}
/**
 * Remove duplicates within a batch of scraped releases.
 *
 * Releases are unique by entity ID + entry ID; when several share a key, the
 * last one wins. Releases without an entity are dropped. The result is grouped
 * per entity, following object key enumeration order.
 *
 * @param {Object[]} releases - Releases with `entity.id` and `entryId`.
 * @returns {Object[]} De-duplicated releases.
 */
function filterInternalDuplicateReleases(releases) {
	// prototype-less dictionaries, so an entry ID like '__proto__' is stored as a regular key
	const releasesByEntityIdAndEntryId = Object.create(null);

	for (const release of releases) {
		if (!release.entity) {
			continue;
		}

		if (!releasesByEntityIdAndEntryId[release.entity.id]) {
			releasesByEntityIdAndEntryId[release.entity.id] = Object.create(null);
		}

		releasesByEntityIdAndEntryId[release.entity.id][release.entryId] = release;
	}

	return Object.values(releasesByEntityIdAndEntryId)
		.flatMap(entityReleases => Object.values(entityReleases));
}
/**
 * Split releases into those already present in the database and those that are new.
 *
 * Duplicates within the batch are removed first; the remainder is matched
 * against the `releases` table on entry ID + entity ID.
 *
 * @param {Object[]} releases - Releases with `entity.id` and `entryId`.
 * @returns {Promise<{uniqueReleases: Object[], duplicateReleases: Object[], duplicateReleaseEntries: Object[]}>}
 *   New releases, releases that are already stored, and the stored rows they matched.
 */
async function filterDuplicateReleases(releases) {
	const candidates = filterInternalDuplicateReleases(releases);
	const lookupPairs = candidates.map(candidate => [candidate.entryId, candidate.entity.id]);

	const duplicateReleaseEntries = await knex('releases')
		.whereIn(['entry_id', 'entity_id'], lookupPairs);

	// entity ID -> entry ID -> true, for every row found in the database
	const storedKeys = {};

	for (const entry of duplicateReleaseEntries) {
		if (!storedKeys[entry.entity_id]) {
			storedKeys[entry.entity_id] = {};
		}

		storedKeys[entry.entity_id][entry.entry_id] = true;
	}

	const uniqueReleases = [];
	const duplicateReleases = [];

	for (const candidate of candidates) {
		if (storedKeys[candidate.entity.id]?.[candidate.entryId]) {
			duplicateReleases.push(candidate);
		} else {
			uniqueReleases.push(candidate);
		}
	}

	return {
		uniqueReleases,
		duplicateReleases,
		duplicateReleaseEntries,
	};
}
/**
 * Rebuild the full-text search documents in `releases_search`.
 *
 * A document combines the release title, entity and parent entity names, slugs
 * and aliases, shoot ID, several date notations, actor names, and tag names and aliases.
 * Existing documents are overwritten through an upsert on `release_id`.
 *
 * @param {Array} [releaseIds] - Releases to update; all releases when omitted.
 * @returns {Promise<void>}
 */
async function updateReleasesSearch(releaseIds) {
	const scope = releaseIds ? releaseIds.length : 'all';

	logger.info(`Updating search documents for ${scope} releases`);

	const idFilter = releaseIds ? 'WHERE releases.id = ANY(?)' : '';
	const bindings = releaseIds && [releaseIds];

	const documents = await knex.raw(`
SELECT
releases.id AS release_id,
TO_TSVECTOR(
'traxxx',
COALESCE(releases.title, '') || ' ' ||
entities.name || ' ' ||
entities.slug || ' ' ||
COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
COALESCE(parents.name, '') || ' ' ||
COALESCE(parents.slug, '') || ' ' ||
COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
COALESCE(releases.shoot_id, '') || ' ' ||
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
) as document
FROM releases
LEFT JOIN entities ON releases.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
LEFT JOIN actors ON local_actors.actor_id = actors.id
LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
${idFilter}
GROUP BY releases.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
`, bindings);

	const hasDocuments = documents.rows?.length > 0;

	if (!hasDocuments) {
		return;
	}

	// the insert is rendered to SQL so the upsert clause can be appended
	const insertQuery = knex('releases_search').insert(documents.rows).toString();

	await knex.raw(`${insertQuery} ON CONFLICT (release_id) DO UPDATE SET document = EXCLUDED.document`);
}
/**
 * Store scraped releases along with their actors, tags, search documents and media.
 *
 * Creates a batch, attaches channel and studio entities, inserts releases that are
 * not stored yet, and then associates actors, tags and media with both the new and
 * the already stored releases.
 *
 * @param {Object[]} releases - Scraped releases.
 * @returns {Promise<Object[]>} Releases with an entity and a database `id` attached;
 *   empty when no releases were passed.
 */
async function storeReleases(releases) {
	if (releases.length === 0) {
		return [];
	}

	const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

	const releasesWithChannels = await attachChannelEntities(releases);
	const releasesWithStudios = await attachStudios(releasesWithChannels);

	// uniqueness is entity ID + entry ID, filter uniques after adding entities
	const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);

	const curatedNewReleaseEntries = uniqueReleases.map(release => curateReleaseEntry(release, batchId));
	const storedReleases = await knex.batchInsert('releases', curatedNewReleaseEntries).returning('*');

	// TODO: update duplicate releases

	// anything other than an array of inserted rows is treated as nothing stored
	const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];

	// both new and previously stored releases get their database ID, so associations are refreshed for all
	const releasesWithId = attachReleaseIds(
		[...uniqueReleases, ...duplicateReleases],
		[...storedReleaseEntries, ...duplicateReleaseEntries],
	);

	const [actors] = await Promise.all([
		associateActors(releasesWithId, batchId),
		associateReleaseTags(releasesWithId),
	]);

	await updateReleasesSearch(releasesWithId.map(release => release.id));

	// media is more error-prone, associate separately
	await associateReleaseMedia(releasesWithId);

	if (argv.sceneActors) {
		await scrapeActors(actors.map(actor => actor.name));
	}

	logger.info(`Stored ${storedReleaseEntries.length} releases`);

	return releasesWithId;
}
// Public interface: storing scraped releases, and rebuilding their search documents.
module.exports = {
storeReleases,
updateReleasesSearch,
};