// traxxx/src/store-releases.js
//
// 461 lines
// 15 KiB
// JavaScript
// Executable File

'use strict';
const config = require('config');
const Promise = require('bluebird');
const { decode } = require('html-entities');
const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const slugify = require('./utils/slugify');
const bulkInsert = require('./utils/bulk-insert');
const resolvePlace = require('./utils/resolve-place');
const chunk = require('./utils/chunk');
const { formatDate } = require('./utils/qu');
const { associateActors, associateDirectors, scrapeActors, toBaseActors } = require('./actors');
const { associateReleaseTags } = require('./tags');
const { curateEntity } = require('./entities');
const { associateReleaseMedia } = require('./media');
const { updateSceneSearch, updateMovieSearch } = require('./update-search');
const { notify } = require('./alerts');
/**
 * Convert a scraped release into the row shape written to the database.
 *
 * The slug is derived from the first available of: the title, the entity slug
 * plus actor names, or the entity slug plus the formatted release date.
 * Scene-only columns are added when `type` is 'scene', and the production
 * location is resolved into city/state/country when `argv.resolvePlace` is set.
 *
 * @param {Object} release - scraped release; `release.entity` must be set
 * @param {number} batchId - batch recorded as `updated_batch_id` (and as `created_batch_id` for new rows)
 * @param {Object} [existingRelease] - truthy when a stored row already exists
 * @param {string} [type='scene'] - release kind; only 'scene' adds the scene columns
 * @returns {Promise<Object>} the curated row
 */
async function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
  let slugSource = null;

  if (release.title) {
    slugSource = release.title;
  } else if (release.actors?.length) {
    slugSource = `${release.entity.slug} ${release.actors.map((actor) => actor.name).join(' ')}`;
  } else if (release.date) {
    slugSource = `${release.entity.slug} ${formatDate(release.date, 'YYYY MM DD')}`;
  }

  const slugOptions = { encode: true, limit: config.titleSlugLength };
  const slug = slugify(slugSource, '-', slugOptions);

  const entry = {
    title: decode(release.title),
    alt_titles: release.altTitles?.map((altTitle) => decode(altTitle)),
    entry_id: release.entryId || null,
    entity_id: release.entity.id,
    studio_id: release.studio?.id || null,
    url: release.url,
    // anything that does not convert to a non-zero number is stored as null
    date: Number(release.date) ? release.date : null,
    date_precision: release.datePrecision,
    slug,
    description: decode(release.description),
    comment: release.comment,
    photo_count: Number(release.photoCount) || null,
    // only a literal boolean true marks the release as deep
    deep: release.deep === true,
    deep_url: release.deepUrl,
    updated_batch_id: batchId,
  };

  // an ID on the scraped release means an existing row is being updated
  if (release.id) {
    entry.id = release.id;
  }

  if (type === 'scene') {
    const distinctQualities = new Set(release.qualities?.map(Number).filter(Boolean));

    entry.shoot_id = release.shootId || null;
    entry.production_date = Number(release.productionDate) ? release.productionDate : null;
    entry.duration = Number(release.duration) || null;
    // highest quality first
    entry.qualities = [...distinctQualities].sort((qualityA, qualityB) => qualityB - qualityA);
  }

  if (release.productionLocation) {
    const decodedLocation = decode(release.productionLocation);

    entry.production_location = decodedLocation;

    if (argv.resolvePlace) {
      const place = await resolvePlace(decodedLocation);

      if (place) {
        entry.production_city = place.city;
        entry.production_state = place.state;
        entry.production_country_alpha2 = place.country;
      }
    }
  }

  const isNewRelease = !existingRelease && !release.id;

  if (isNewRelease) {
    entry.created_batch_id = batchId;
  }

  return entry;
}
/**
 * Replace the entity of releases that name a channel slug with the matching
 * channel entity from the database.
 *
 * Only the slugs of releases that lack an entity, or whose entity is a network,
 * are queried; any release whose channel slug appears in the query result gets
 * the curated channel entity. Releases that end up without any entity are
 * logged as an error and dropped.
 *
 * @param {Object[]} releases - scraped releases, optionally with a `channel` slug
 * @returns {Promise<Object[]>} releases that have an entity
 */
async function attachChannelEntities(releases) {
  const requestedSlugs = releases
    .filter((release) => release.channel && (!release.entity || release.entity.type === 'network'))
    .map((release) => release.channel);

  const channelRows = await knex('entities')
    .select(knex.raw('entities.*, row_to_json(parents) as parent'))
    .whereIn('entities.slug', requestedSlugs)
    .where('entities.type', 'channel')
    .leftJoin('entities AS parents', 'parents.id', 'entities.parent_id');

  const channelRowsBySlug = Object.fromEntries(channelRows.map((row) => [row.slug, row]));

  const resolveEntity = async (release) => {
    const channelRow = release.channel ? channelRowsBySlug[release.channel] : undefined;

    if (channelRow) {
      const entity = await curateEntity(channelRow);

      return { ...release, entity };
    }

    if (!release.entity) {
      logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);
      return null;
    }

    return release;
  };

  const resolvedReleases = await Promise.all(releases.map((release) => resolveEntity(release)));

  return resolvedReleases.filter(Boolean);
}
/**
 * Swap each release's studio slug for the matching studio entity row.
 *
 * Releases whose slug has no matching row keep the slug and a warning is
 * logged; releases without a studio pass through untouched.
 *
 * @param {Object[]} releases - scraped releases, optionally with a `studio` slug
 * @returns {Promise<Object[]>} releases, with `studio` replaced by the entity row where found
 */
async function attachStudios(releases) {
  const requestedSlugs = releases
    .map((release) => release.studio)
    .filter(Boolean);

  const studioRows = await knex('entities')
    .whereIn('slug', requestedSlugs)
    .where('type', 'studio');

  const studioRowsBySlug = Object.fromEntries(studioRows.map((row) => [row.slug, row]));

  return releases.map((release) => {
    if (!release.studio) {
      return release;
    }

    const studio = studioRowsBySlug[release.studio];

    if (studio) {
      return { ...release, studio };
    }

    logger.warn(`Unable to match studio '${release.studio}' for ${release.url}`);

    return release;
  });
}
/**
 * Pair scraped releases with the IDs of their stored database rows.
 *
 * A release matches a stored row when the entry IDs are equal and the row
 * belongs to the release's entity or, failing that, to that entity's parent.
 * Releases without an entity (logged as an error) and releases without a
 * matching row are dropped.
 *
 * @param {Object[]} releases - scraped releases carrying `entity` and `entryId`
 * @param {Object[]} storedReleases - rows with `id`, `entity_id` and `entry_id`
 * @param {number} [batchId] - copied onto every returned release as `batchId`
 * @returns {Object[]} copies of the matched releases extended with `id` and `batchId`
 */
function attachReleaseIds(releases, storedReleases, batchId) {
  // Prototype-less dictionaries: entry IDs are scraped strings, and with a plain {}
  // an ID such as 'constructor' or 'toString' would resolve to an inherited
  // Object.prototype member and be mistaken for a stored release ID.
  // Keys are still coerced to strings, so numeric and string IDs keep matching.
  const storedIdsByEntityIdAndEntryId = Object.create(null);

  for (const storedRelease of storedReleases) {
    storedIdsByEntityIdAndEntryId[storedRelease.entity_id] ??= Object.create(null);
    storedIdsByEntityIdAndEntryId[storedRelease.entity_id][storedRelease.entry_id] = storedRelease.id;
  }

  return releases
    .map((release) => {
      if (!release.entity) {
        logger.error(`No entity available for ${release.url}`);
        return null;
      }

      // prefer the row stored under the release's own entity, fall back to the parent's
      const id = storedIdsByEntityIdAndEntryId[release.entity.id]?.[release.entryId]
        || storedIdsByEntityIdAndEntryId[release.entity.parent?.id]?.[release.entryId];

      if (!id) {
        return null;
      }

      return {
        ...release,
        id,
        batchId,
      };
    })
    .filter(Boolean);
}
/**
 * Remove duplicates within a single list of scraped releases.
 *
 * Releases are keyed by entity ID and entry ID; when a key occurs more than
 * once, the last occurrence wins. Releases without an entity are dropped
 * silently, releases without an entry ID are dropped with a warning.
 *
 * @param {Object[]} releases - scraped releases
 * @returns {Object[]} one release per entity ID and entry ID combination
 */
function filterInternalDuplicateReleases(releases) {
  // Plain objects on purpose: the order of the returned list follows their key order.
  const releasesByEntityId = {};

  for (const release of releases) {
    if (!release.entity) {
      continue;
    }

    if (!release.entryId) {
      logger.warn(`No entry ID supplied for "${release.title}" from '${release.entity.name}'`);
      continue;
    }

    releasesByEntityId[release.entity.id] ||= {};
    releasesByEntityId[release.entity.id][release.entryId] = release;
  }

  return Object.values(releasesByEntityId).flatMap((entityReleases) => Object.values(entityReleases));
}
/**
 * Split scraped releases into those already stored and those that are new.
 *
 * Releases are first deduplicated among themselves, then looked up in the
 * `domain` table by entry ID and entity ID. For entities whose parent has the
 * `networkEntryIds` parameter, rows stored under the parent are fetched as well.
 *
 * @param {Object[]} releases - scraped releases with `entity` and `entryId`
 * @param {string} [domain='releases'] - table to look for existing rows in
 * @returns {Promise<{uniqueReleases: Object[], duplicateReleases: Object[], duplicateReleaseEntries: Object[]}>}
 *   releases without a stored row, releases with a stored row, and the stored rows that were found
 */
async function filterDuplicateReleases(releases, domain = 'releases') {
  const internalUniqueReleases = filterInternalDuplicateReleases(releases);
  const internalUniqueReleaseChunks = chunk(internalUniqueReleases);

  const duplicateReleaseEntryChunks = await Promise.map(internalUniqueReleaseChunks, async (internalUniqueReleasesChunk) => knex(domain)
    .whereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk.map((release) => [release.entryId, release.entity.id]))
    .orWhereIn(['entry_id', 'entity_id'], internalUniqueReleasesChunk
      // scene IDs shared across network, mark as duplicate so scene can be updated with channel if only available on release day (i.e. Perv City)
      .filter((release) => release.entity.parent?.parameters?.networkEntryIds)
      .map((release) => [release.entryId, release.entity.parent.id])), { concurrency: 10 });

  const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();

  // Prototype-less dictionaries: with a plain {} a scraped entry ID such as
  // 'constructor' resolves to an inherited Object.prototype member, which would
  // wrongly flag a new release as a duplicate so it is never inserted.
  const storedEntryIdsByEntityId = Object.create(null);

  for (const entry of duplicateReleaseEntries) {
    storedEntryIdsByEntityId[entry.entity_id] ??= Object.create(null);
    storedEntryIdsByEntityId[entry.entity_id][entry.entry_id] = true;
  }

  // a release is stored if a row exists under its own entity or under the entity's parent
  const isStored = (release) => Boolean(storedEntryIdsByEntityId[release.entity.id]?.[release.entryId]
    || storedEntryIdsByEntityId[release.entity.parent?.id]?.[release.entryId]);

  const uniqueReleases = [];
  const duplicateReleases = [];

  for (const release of internalUniqueReleases) {
    if (isStored(release)) {
      duplicateReleases.push(release);
    } else {
      uniqueReleases.push(release);
    }
  }

  return {
    uniqueReleases,
    duplicateReleases,
    duplicateReleaseEntries,
  };
}
/**
 * Store the chapters of the given releases and associate their tags and media.
 *
 * Chapters are numbered from 1 within their release, sorted by time across all
 * releases, inserted into `chapters`, and then passed on with their stored IDs
 * for tag and media association.
 *
 * @param {Object[]} releases - stored releases (with `id`), optionally with `chapters`
 * @returns {Promise<void>}
 */
async function storeChapters(releases) {
  const toChapter = (release) => (chapter, chapterIndex) => ({
    releaseId: release.id,
    index: chapterIndex + 1,
    time: chapter.time,
    duration: chapter.duration,
    title: chapter.title,
    description: chapter.description,
    poster: chapter.poster,
    photos: chapter.photos,
    tags: chapter.tags,
  });

  const chapters = releases
    .flatMap((release) => release.chapters?.map(toChapter(release)) ?? [])
    .sort((chapterA, chapterB) => chapterA.time - chapterB.time);

  const chapterEntries = chapters.map(({ index, time, duration, title, description, releaseId }) => ({
    index,
    time,
    duration,
    title,
    description,
    release_id: releaseId,
  }));

  const storedChapters = await bulkInsert('chapters', chapterEntries, ['release_id', 'index']);

  // release ID -> chapter index -> stored chapter ID
  const chapterIds = {};

  for (const storedChapter of storedChapters) {
    chapterIds[storedChapter.release_id] = {
      ...chapterIds[storedChapter.release_id],
      [storedChapter.index]: storedChapter.id,
    };
  }

  const chaptersWithId = chapters.map((chapter) => ({
    ...chapter,
    id: chapterIds[chapter.releaseId][chapter.index],
  }));

  await associateReleaseTags(chaptersWithId, 'chapter');

  // media is more error-prone, associate separately
  await associateReleaseMedia(chaptersWithId, 'chapter');
}
/**
 * Link stored scenes to the stored movies they belong to.
 *
 * A scene's movie is looked up by the movie's entry ID under the scene's entity,
 * falling back to the entity's parent. Scenes are processed in order of
 * `sceneIndex` (missing or zero indices count as 1), and scenes without a
 * movie, or whose movie has no ID, are skipped.
 *
 * @param {Object[]} movies - stored movies with `id`, `entryId` and `entity`
 * @param {Object[]} movieScenes - stored scenes, optionally with `movie` and `sceneIndex`
 * @returns {Promise<void>}
 */
async function associateMovieScenes(movies, movieScenes) {
  // entity ID -> movie entry ID -> movie
  const moviesByEntity = {};

  for (const movie of movies) {
    moviesByEntity[movie.entity.id] = {
      ...moviesByEntity[movie.entity.id],
      [movie.entryId]: movie,
    };
  }

  // sort a copy, the caller's array keeps its order
  const orderedScenes = [...movieScenes].sort((sceneA, sceneB) => (sceneA.sceneIndex || 1) - (sceneB.sceneIndex || 1));
  const associations = [];

  for (const scene of orderedScenes) {
    if (!scene.movie) {
      continue;
    }

    const movie = moviesByEntity[scene.entity.id]?.[scene.movie.entryId]
      || moviesByEntity[scene.entity.parent?.id]?.[scene.movie.entryId];

    if (movie?.id) {
      associations.push({
        movie_id: movie.id,
        scene_id: scene.id,
      });
    }
  }

  await bulkInsert('movies_scenes', associations, false);
}
/**
 * Link stored scenes to the stored series they belong to.
 *
 * A scene's serie is looked up by the serie's entry ID under the scene's
 * entity, falling back to the entity's parent. Scenes without a serie, or whose
 * serie has no ID, are skipped.
 *
 * @param {Object[]} series - stored series with `id`, `entryId` and `entity`
 * @param {Object[]} serieScenes - stored scenes, optionally with `serie`
 * @returns {Promise<void>}
 */
async function associateSerieScenes(series, serieScenes) {
  // entity ID -> serie entry ID -> serie
  const seriesByEntity = {};

  for (const serie of series) {
    seriesByEntity[serie.entity.id] = {
      ...seriesByEntity[serie.entity.id],
      [serie.entryId]: serie,
    };
  }

  const associations = [];

  for (const scene of serieScenes) {
    if (!scene.serie) {
      continue;
    }

    const serie = seriesByEntity[scene.entity.id]?.[scene.serie.entryId]
      || seriesByEntity[scene.entity.parent?.id]?.[scene.serie.entryId];

    if (serie?.id) {
      associations.push({
        serie_id: serie.id,
        scene_id: scene.id,
      });
    }
  }

  await bulkInsert('series_scenes', associations, false);
}
/**
 * Store scraped movies, then associate their tags, search entries and media.
 *
 * Only movies without a stored row are curated and inserted; IDs are then
 * attached to every movie in the input that matches an inserted row.
 *
 * @param {Object[]} movies - scraped movies with `entity` and `entryId`
 * @param {number} [useBatchId] - existing batch to use; a new batch row is inserted when omitted
 * @returns {Promise<Object[]>} movies with their stored `id`, or an empty array for empty input
 */
async function storeMovies(movies, useBatchId) {
  const hasMovies = Boolean(movies) && movies.length !== 0;

  if (!hasMovies) {
    return [];
  }

  const { uniqueReleases: newMovies } = await filterDuplicateReleases(movies, 'movies');

  let batchId = useBatchId;

  if (!batchId) {
    const [batch] = await knex('batches')
      .insert({ showcased: argv.showcased, comment: null })
      .returning('id');

    batchId = batch.id;
  }

  const movieEntries = await Promise.all(newMovies.map((movie) => curateReleaseEntry(movie, batchId, null, 'movie')));
  const storedMovies = await bulkInsert('movies', movieEntries, ['entity_id', 'entry_id'], true);
  const moviesWithId = attachReleaseIds(movies, storedMovies);
  const movieIds = moviesWithId.map((movie) => movie.id);

  await associateReleaseTags(moviesWithId, 'movie');
  await updateMovieSearch(movieIds);
  await associateReleaseMedia(moviesWithId, 'movie');

  return moviesWithId;
}
/**
 * Store scraped series, then update their search entries and associate media.
 *
 * Only series without a stored row are curated and inserted; IDs are then
 * attached to every serie in the input that matches an inserted row.
 *
 * @param {Object[]} series - scraped series with `entity` and `entryId`
 * @param {number} [useBatchId] - existing batch to use; a new batch row is inserted when omitted
 * @returns {Promise<Object[]>} series with their stored `id`, or an empty array for empty input
 */
async function storeSeries(series, useBatchId) {
  const hasSeries = Boolean(series) && series.length !== 0;

  if (!hasSeries) {
    return [];
  }

  const { uniqueReleases: newSeries } = await filterDuplicateReleases(series, 'series');

  let batchId = useBatchId;

  if (!batchId) {
    const [batch] = await knex('batches')
      .insert({ showcased: argv.showcased, comment: null })
      .returning('id');

    batchId = batch.id;
  }

  const serieEntries = await Promise.all(newSeries.map((serie) => curateReleaseEntry(serie, batchId, null, 'serie')));
  const storedSeries = await bulkInsert('series', serieEntries, ['entity_id', 'entry_id'], true);
  const seriesWithId = attachReleaseIds(series, storedSeries);
  const serieIds = seriesWithId.map((serie) => serie.id);

  await updateMovieSearch(serieIds, 'serie');
  await associateReleaseMedia(seriesWithId, 'serie');

  return seriesWithId;
}
/**
 * Store scraped scenes: insert the new ones, update the ones already stored,
 * and associate actors, series, tags, chapters, directors, search entries and media.
 *
 * @param {Object[]} releases - scraped scenes
 * @param {number} [useBatchId] - existing batch to use; a new batch row is inserted when omitted
 * @returns {Promise<Object[]>} the new and updated scenes with their stored `id` and `batchId`,
 *   or an empty array for empty input
 */
async function storeScenes(releases, useBatchId) {
  if (!releases || releases.length === 0) {
    return [];
  }

  const [{ id: batchId }] = useBatchId
    ? [{ id: useBatchId }]
    : await knex('batches').insert({ showcased: argv.showcased, comment: null }).returning('id');

  const releasesWithChannels = await attachChannelEntities(releases);
  const releasesWithBaseActors = releasesWithChannels.map((release) => ({ ...release, actors: toBaseActors(release.actors) }));
  const releasesWithStudios = await attachStudios(releasesWithBaseActors);

  // uniqueness is entity ID + entry ID, filter uniques after adding entities
  const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios, 'releases');

  const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId)));
  const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);
  // bulkInsert's result is only used when it is an array of rows
  const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];

  const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries, batchId);
  const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries, batchId);
  const curatedDuplicateReleases = await Promise.all(duplicateReleasesWithId.map((release) => curateReleaseEntry(release, batchId)));
  const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);

  // The recordset is built from curated entries, which carry `entity_id` and no
  // `entity` object. Reading `entity_id` directly is what lets a scene stored
  // under its network be reassigned to its channel; the former
  // `new.entity->>'id'` was always NULL for this input.
  // NULLIF keeps an empty title or description from erasing a stored value
  // (NOTE(review): decode() appears to yield '' for a missing value - confirm).
  const updated = await knex.raw(`
    UPDATE releases
    SET url = COALESCE(new.url, releases.url),
      date = COALESCE(new.date, releases.date),
      entity_id = COALESCE(new.entity_id, releases.entity_id),
      title = COALESCE(NULLIF(new.title, ''), releases.title),
      description = COALESCE(NULLIF(new.description, ''), releases.description),
      duration = COALESCE(new.duration, releases.duration),
      deep = new.url IS NOT NULL,
      updated_at = NOW()
    FROM json_to_recordset(:scenes)
      AS new(id int, url text, date timestamptz, entity_id int, title text, description text, duration integer, deep boolean)
    WHERE releases.id = new.id
  `, {
    scenes: JSON.stringify(curatedDuplicateReleases),
  });

  const [actors, storedSeries] = await Promise.all([
    associateActors(releasesWithId, batchId),
    storeSeries(releasesWithId.map((release) => release.serie && { ...release.serie, entity: release.entity }).filter(Boolean), batchId),
    associateReleaseTags(releasesWithId),
    storeChapters(releasesWithId),
  ]);

  await associateSerieScenes(storedSeries, releasesWithId);
  await associateDirectors(releasesWithId, batchId); // some directors may also be actors, don't associate at the same time
  await updateSceneSearch(releasesWithId.map((release) => release.id));

  // media is more error-prone, associate separately
  await associateReleaseMedia(releasesWithId);

  if (argv.sceneActors && actors) {
    await scrapeActors(actors.map((actor) => actor.name));
  }

  logger.info(`Stored ${storedReleaseEntries.length}, updated ${updated.rowCount} releases`);

  await notify(releasesWithId);

  return releasesWithId;
}
// Exported entry points of this module.
// updateSceneSearch and updateMovieSearch are not defined in this file; they are
// the functions required from './update-search' and are exported unchanged.
module.exports = {
associateMovieScenes,
storeScenes,
storeMovies,
updateSceneSearch,
updateMovieSearch,
};