381 lines
12 KiB
JavaScript
381 lines
12 KiB
JavaScript
'use strict';
|
|
|
|
const config = require('config');
|
|
|
|
const argv = require('./argv');
|
|
const logger = require('./logger')(__filename);
|
|
const knex = require('./knex');
|
|
const slugify = require('./utils/slugify');
|
|
const bulkInsert = require('./utils/bulk-insert');
|
|
const resolvePlace = require('./utils/resolve-place');
|
|
const { formatDate } = require('./utils/qu');
|
|
const { associateActors, scrapeActors } = require('./actors');
|
|
const { associateReleaseTags } = require('./tags');
|
|
const { curateEntity } = require('./entities');
|
|
const { associateReleaseMedia } = require('./media');
|
|
|
|
/**
 * Build the database row for a scraped release.
 *
 * The slug is derived from the title; without a title it falls back to the
 * entity slug combined with the actor names, then to the entity slug combined
 * with the formatted release date, and finally to `null`.
 *
 * @param {Object} release - scraped release; must carry an `entity` with `id` and `slug`
 * @param {*} batchId - batch the release is stored under
 * @param {Object} [existingRelease] - previously stored release, if any
 * @param {string} [type='scene'] - 'scene' adds the scene-only columns
 * @returns {Promise<Object>} row keyed by column name
 */
async function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
	// anything that does not convert to a non-zero number is stored as NULL
	const dateOrNull = value => (Number(value) ? value : null);

	let slugSource = null;

	if (release.title) {
		slugSource = release.title;
	} else if (release.actors?.length) {
		slugSource = `${release.entity.slug} ${release.actors.map(actor => actor.name).join(' ')}`;
	} else if (release.date) {
		slugSource = `${release.entity.slug} ${formatDate(release.date, 'YYYY MM DD')}`;
	}

	const slug = slugify(slugSource, '-', { encode: true, limit: config.titleSlugLength });

	const entry = {
		title: release.title,
		entry_id: release.entryId || null,
		entity_id: release.entity.id,
		studio_id: release.studio?.id || null,
		url: release.url,
		date: dateOrNull(release.date),
		date_precision: release.datePrecision,
		slug,
		description: release.description,
		comment: release.comment,
		// director: release.director,
		// likes: release.rating && release.rating.likes,
		// dislikes: release.rating && release.rating.dislikes,
		// rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
		deep: release.deep === true,
		deep_url: release.deepUrl,
		updated_batch_id: batchId,
	};

	if (type === 'scene') {
		Object.assign(entry, {
			shoot_id: release.shootId || null,
			production_date: dateOrNull(release.productionDate),
			duration: release.duration,
		});
	}

	if (release.productionLocation) {
		entry.production_location = release.productionLocation;

		// the place is only looked up when resolving was requested through argv
		const place = argv.resolvePlace ? await resolvePlace(release.productionLocation) : null;

		if (place) {
			entry.production_city = place.city;
			entry.production_state = place.state;
			entry.production_country_alpha2 = place.country;
		}
	}

	// only releases that are neither known up front nor already carrying an ID get a creation batch
	const isNewRelease = !(existingRelease || release.id);

	if (isNewRelease) {
		entry.created_batch_id = batchId;
	}

	return entry;
}
|
|
|
|
/**
 * Replace the entity of releases that name a `channel` slug with the matching
 * channel entity from the database.
 *
 * Channel slugs are collected from releases that have no entity yet, or whose
 * entity is of type 'network'. A release whose channel slug is found gets the
 * curated channel entity; a release without a match keeps its existing entity;
 * a release left with no entity at all is logged and dropped.
 *
 * @param {Object[]} releases - scraped releases
 * @returns {Promise<Object[]>} releases that ended up with an entity
 */
async function attachChannelEntities(releases) {
	const releasesWithoutEntity = releases.filter(release => release.channel && (!release.entity || release.entity.type === 'network'));

	const channelEntities = await knex('entities')
		.select(knex.raw('entities.*, row_to_json(parents) as parent'))
		.whereIn('entities.slug', releasesWithoutEntity.map(release => release.channel))
		.where('entities.type', 'channel')
		.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id');

	// prototype-less dictionary, so a slug like 'constructor' cannot match an inherited
	// Object.prototype member; built in a single pass rather than re-spreading per row
	const channelEntitiesBySlug = Object.create(null);

	channelEntities.forEach((entity) => {
		channelEntitiesBySlug[entity.slug] = entity;
	});

	const releasesWithChannelEntity = await Promise.all(releases.map(async (release) => {
		const channelEntity = release.channel && channelEntitiesBySlug[release.channel];

		if (channelEntity) {
			const curatedEntity = await curateEntity(channelEntity);

			return {
				...release,
				entity: curatedEntity,
			};
		}

		if (release.entity) {
			return release;
		}

		logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);

		return null;
	}));

	return releasesWithChannelEntity.filter(Boolean);
}
|
|
|
|
/**
 * Swap each release's studio slug for the matching studio entity row.
 *
 * Releases whose studio slug has no match are returned unchanged after a
 * warning; releases without a studio pass through untouched.
 *
 * @param {Object[]} releases - releases carrying an optional `studio` slug
 * @returns {Promise<Object[]>} releases, in the same order and of the same length
 */
async function attachStudios(releases) {
	const studioSlugs = releases.map(release => release.studio).filter(Boolean);

	const studios = await knex('entities')
		.whereIn('slug', studioSlugs)
		.where('type', 'studio');

	// prototype-less dictionary, so a slug like 'constructor' cannot match an inherited
	// Object.prototype member; built in a single pass rather than re-spreading per row
	const studioBySlug = Object.create(null);

	studios.forEach((studio) => {
		studioBySlug[studio.slug] = studio;
	});

	const releasesWithStudio = releases.map((release) => {
		const studio = release.studio && studioBySlug[release.studio];

		if (studio) {
			return {
				...release,
				studio,
			};
		}

		if (release.studio) {
			logger.warn(`Unable to match studio '${release.studio}' for ${release.url}`);
		}

		return release;
	});

	return releasesWithStudio;
}
|
|
|
|
/**
 * Attach database IDs to releases by looking them up on entity ID + entry ID.
 *
 * Releases without an entity are logged and dropped; releases for which no
 * stored ID is found are dropped silently.
 *
 * @param {Object[]} releases - releases carrying `entity.id` and `entryId`
 * @param {Object[]} storedReleases - rows carrying `id`, `entity_id` and `entry_id`
 * @returns {Object[]} copies of the matched releases with `id` set
 */
function attachReleaseIds(releases, storedReleases) {
	const idsByEntityAndEntry = {};

	storedReleases.forEach((stored) => {
		if (!idsByEntityAndEntry[stored.entity_id]) {
			idsByEntityAndEntry[stored.entity_id] = {};
		}

		idsByEntityAndEntry[stored.entity_id][stored.entry_id] = stored.id;
	});

	return releases.reduce((matched, release) => {
		if (!release.entity) {
			logger.error(`No entitity available for ${release.url}`);
			return matched;
		}

		const id = idsByEntityAndEntry[release.entity.id]?.[release.entryId];

		if (id) {
			matched.push({ ...release, id });
		}

		return matched;
	}, []);
}
|
|
|
|
/**
 * Drop duplicates within a single batch of releases, where uniqueness is
 * entity ID + entry ID. The last release seen for a key wins; releases
 * without an entity are left out.
 *
 * @param {Object[]} releases - releases carrying `entity.id` and `entryId`
 * @returns {Object[]} one release per entity ID + entry ID
 */
function filterInternalDuplicateReleases(releases) {
	const byEntityAndEntry = {};

	releases.forEach((release) => {
		if (!release.entity) {
			return;
		}

		const entityId = release.entity.id;

		if (!byEntityAndEntry[entityId]) {
			byEntityAndEntry[entityId] = {};
		}

		byEntityAndEntry[entityId][release.entryId] = release;
	});

	return Object.values(byEntityAndEntry).flatMap(entityReleases => Object.values(entityReleases));
}
|
|
|
|
/**
 * Split releases into those already present in the `releases` table and those
 * that are not, after removing duplicates within the batch itself.
 *
 * @param {Object[]} releases - releases carrying `entity.id` and `entryId`
 * @returns {Promise<{uniqueReleases: Object[], duplicateReleases: Object[], duplicateReleaseEntries: Object[]}>}
 *   releases not stored yet, releases that are stored, and the stored rows themselves
 */
async function filterDuplicateReleases(releases) {
	const internalUniqueReleases = filterInternalDuplicateReleases(releases);
	const entryKeys = internalUniqueReleases.map(release => [release.entryId, release.entity.id]);

	const duplicateReleaseEntries = await knex('releases').whereIn(['entry_id', 'entity_id'], entryKeys);

	const storedByEntityAndEntry = {};

	duplicateReleaseEntries.forEach((entry) => {
		if (!storedByEntityAndEntry[entry.entity_id]) {
			storedByEntityAndEntry[entry.entity_id] = {};
		}

		storedByEntityAndEntry[entry.entity_id][entry.entry_id] = true;
	});

	const uniqueReleases = [];
	const duplicateReleases = [];

	// single pass; both lists keep the order of the de-duplicated input
	internalUniqueReleases.forEach((release) => {
		if (storedByEntityAndEntry[release.entity.id]?.[release.entryId]) {
			duplicateReleases.push(release);
		} else {
			uniqueReleases.push(release);
		}
	});

	return {
		uniqueReleases,
		duplicateReleases,
		duplicateReleaseEntries,
	};
}
|
|
|
|
/**
 * Rebuild the full-text search documents in `releases_search`.
 *
 * Each document is a tsvector made from the release title, its entity and
 * parent entity names, slugs and aliases, the shoot ID, several renderings of
 * the release date, actor names, tag names and tag alias names.
 *
 * @param {Array} [releaseIds] - release IDs to limit the update to; all releases when omitted
 * @returns {Promise<void>}
 */
async function updateReleasesSearch(releaseIds) {
	const scope = releaseIds ? releaseIds.length : 'all';

	logger.info(`Updating search documents for ${scope} releases`);

	const whereClause = releaseIds ? 'WHERE releases.id = ANY(?)' : '';
	// the ID list is bound as one array parameter for ANY(?)
	const bindings = releaseIds && [releaseIds];

	const documents = await knex.raw(`
		SELECT
			releases.id AS release_id,
			TO_TSVECTOR(
				'traxxx',
				COALESCE(releases.title, '') || ' ' ||
				entities.name || ' ' ||
				entities.slug || ' ' ||
				COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
				COALESCE(parents.name, '') || ' ' ||
				COALESCE(parents.slug, '') || ' ' ||
				COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
				COALESCE(releases.shoot_id, '') || ' ' ||
				COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
				STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
			) as document
		FROM releases
		LEFT JOIN entities ON releases.entity_id = entities.id
		LEFT JOIN entities AS parents ON parents.id = entities.parent_id
		LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
		LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
		LEFT JOIN actors ON local_actors.actor_id = actors.id
		LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
		LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
		${whereClause}
		GROUP BY releases.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
	`, bindings);

	const searchDocuments = documents.rows;

	if (!(searchDocuments?.length > 0)) {
		return;
	}

	await bulkInsert('releases_search', searchDocuments, ['release_id']);
}
|
|
|
|
/**
 * Store the clips attached to releases and associate their tags and media.
 *
 * Clips are numbered per release starting at 1, in the order they appear in
 * `release.clips`. When none of the releases has clips, nothing is inserted
 * and no association calls are made.
 *
 * @param {Object[]} releases - stored releases carrying `id` and optional `clips`
 * @returns {Promise<void>}
 */
async function storeClips(releases) {
	const clips = releases.flatMap(release => (release.clips || []).map((clip, index) => ({
		title: clip.title,
		description: clip.description,
		releaseId: release.id,
		clip: index + 1,
		duration: clip.duration,
		poster: clip.poster,
		photos: clip.photos,
		tags: clip.tags,
	})));

	if (clips.length === 0) {
		// nothing to store; skip the insert and the association round trips
		return;
	}

	const curatedClipEntries = clips.map(clip => ({
		title: clip.title,
		description: clip.description,
		duration: clip.duration,
		release_id: clip.releaseId,
		clip: clip.clip,
	}));

	const storedClips = await bulkInsert('clips', curatedClipEntries, ['release_id', 'clip']);

	// index stored clip IDs on release ID + clip number in one pass
	const clipIdsByReleaseIdAndClip = {};

	storedClips.forEach((clip) => {
		if (!clipIdsByReleaseIdAndClip[clip.release_id]) {
			clipIdsByReleaseIdAndClip[clip.release_id] = {};
		}

		clipIdsByReleaseIdAndClip[clip.release_id][clip.clip] = clip.id;
	});

	const clipsWithId = clips.map(clip => ({
		...clip,
		id: clipIdsByReleaseIdAndClip[clip.releaseId][clip.clip],
	}));

	await associateReleaseTags(clipsWithId, 'clip');

	// media is more error-prone, associate separately
	await associateReleaseMedia(clipsWithId, 'clip');
}
|
|
|
|
/**
 * Store scraped scenes together with their actors, tags, clips, search
 * documents and media.
 *
 * A batch row is created first. Releases not stored yet are inserted; the
 * associations are then run for both the new and the already stored releases.
 *
 * @param {Object[]} releases - scraped scenes
 * @returns {Promise<Object[]>} the releases with their database `id` attached
 */
async function storeScenes(releases) {
	if (releases.length === 0) {
		return [];
	}

	const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

	const releasesWithStudios = await attachStudios(await attachChannelEntities(releases));

	// uniqueness is entity ID + entry ID, filter uniques after adding entities
	const {
		uniqueReleases,
		duplicateReleases,
		duplicateReleaseEntries,
	} = await filterDuplicateReleases(releasesWithStudios);

	const newEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId)));
	const insertResult = await bulkInsert('releases', newEntries);

	// anything other than an array of rows counts as nothing stored
	const storedReleaseEntries = Array.isArray(insertResult) ? insertResult : [];

	const releasesWithId = attachReleaseIds(
		[...uniqueReleases, ...duplicateReleases],
		[...storedReleaseEntries, ...duplicateReleaseEntries],
	);

	const associationResults = await Promise.all([
		associateActors(releasesWithId, batchId),
		associateReleaseTags(releasesWithId),
		storeClips(releasesWithId),
	]);

	const actors = associationResults[0];

	await updateReleasesSearch(releasesWithId.map(release => release.id));

	// media is more error-prone, associate separately
	await associateReleaseMedia(releasesWithId);

	if (argv.sceneActors) {
		await scrapeActors(actors.map(actor => actor.name));
	}

	logger.info(`Stored ${storedReleaseEntries.length} releases`);

	return releasesWithId;
}
|
|
|
|
/**
 * Link movies to their scenes in `movies_scenes`.
 *
 * Each entry in `movie.scenes` is matched to a stored scene on the movie's
 * entity ID and the scene's entry ID; unmatched scenes and movies without
 * scenes are skipped.
 *
 * @param {Object[]} movies - stored movies carrying `id`, `entity.id` and optional `scenes`
 * @param {Object[]} movieScenes - stored scenes carrying `id`, `entity.id` and `entryId`
 * @returns {Promise<void>}
 */
async function associateMovieScenes(movies, movieScenes) {
	const scenesByEntityAndEntry = {};

	movieScenes.forEach((scene) => {
		scenesByEntityAndEntry[scene.entity.id] = {
			...scenesByEntityAndEntry[scene.entity.id],
			[scene.entryId]: scene,
		};
	});

	const associations = [];

	movies.forEach((movie) => {
		if (!movie.scenes) {
			return;
		}

		movie.scenes.forEach((scene) => {
			const storedScene = scenesByEntityAndEntry[movie.entity.id]?.[scene.entryId];

			if (storedScene) {
				associations.push({
					movie_id: movie.id,
					scene_id: storedScene.id,
				});
			}
		});
	});

	await bulkInsert('movies_scenes', associations, false);
}
|
|
|
|
/**
 * Store scraped movies, link them to their scenes and associate their media.
 *
 * Movies already present in the database are filtered out before insertion.
 *
 * @param {Object[]} movies - scraped movies
 * @param {Object[]} movieScenes - stored scenes to link the movies to
 * @returns {Promise<Object[]>} the stored movie rows; empty when nothing was stored
 */
async function storeMovies(movies, movieScenes) {
	if (movies.length === 0) {
		return [];
	}

	const { uniqueReleases } = await filterDuplicateReleases(movies);
	const [batchId] = await knex('batches').insert({ comment: null }).returning('id');

	const curatedMovieEntries = await Promise.all(uniqueReleases.map(release => curateReleaseEntry(release, batchId, null, 'movie')));

	const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);

	// storeScenes applies the same guard to its bulkInsert result; presumably bulkInsert can
	// resolve to a non-array when there is nothing to insert — TODO confirm against bulk-insert
	const storedMovieEntries = Array.isArray(storedMovies) ? storedMovies : [];

	const moviesWithId = attachReleaseIds(movies, storedMovieEntries);

	await associateMovieScenes(moviesWithId, movieScenes);
	await associateReleaseMedia(moviesWithId, 'movie');

	return storedMovieEntries;
}
|
|
|
|
// public API: scene and movie storage, plus the search document refresh
module.exports = { storeScenes, storeMovies, updateReleasesSearch };
|