// Forked from DebaucheryLibrarian/traxxx (JavaScript, executable file, 532 lines, 19 KiB).
'use strict';
|
|
|
|
const config = require('config');
|
|
const Promise = require('bluebird');
|
|
const { decode } = require('html-entities');
|
|
|
|
const argv = require('./argv');
|
|
const logger = require('./logger')(__filename);
|
|
const knex = require('./knex');
|
|
const slugify = require('./utils/slugify');
|
|
const bulkInsert = require('./utils/bulk-insert');
|
|
const resolvePlace = require('./utils/resolve-place');
|
|
const chunk = require('./utils/chunk');
|
|
const { formatDate } = require('./utils/qu');
|
|
const { associateActors, associateDirectors, scrapeActors, toBaseActors } = require('./actors');
|
|
const { associateReleaseTags } = require('./tags');
|
|
const { curateEntity } = require('./entities');
|
|
const { associateReleaseMedia } = require('./media');
|
|
const { notify } = require('./alerts');
|
|
|
|
/**
 * Translate a scraped release into the row shape used for storage.
 *
 * @param {Object} release - Scraped release; `release.entity` is required.
 * @param {number} batchId - Written to `updated_batch_id`, and to `created_batch_id` for new entries.
 * @param {Object} [existingRelease] - Stored counterpart; when given, the entry is not marked as created in this batch.
 * @param {string} [type='scene'] - Scene-only columns are added when this is `'scene'`.
 * @returns {Promise<Object>} Entry keyed by column name.
 */
async function curateReleaseEntry(release, batchId, existingRelease, type = 'scene') {
	// slug source in order of preference: title, entity + actor names, entity + date
	const getSlugBase = () => {
		if (release.title) {
			return release.title;
		}

		if (release.actors?.length) {
			return `${release.entity.slug} ${release.actors.map((actor) => actor.name).join(' ')}`;
		}

		if (release.date) {
			return `${release.entity.slug} ${formatDate(release.date, 'YYYY MM DD')}`;
		}

		return null;
	};

	const slug = slugify(getSlugBase(), '-', { encode: true, limit: config.titleSlugLength });

	const entry = {
		title: decode(release.title),
		entry_id: release.entryId || null,
		entity_id: release.entity.id,
		studio_id: release.studio?.id || null,
		url: release.url,
		date: Number(release.date) ? release.date : null,
		date_precision: release.datePrecision,
		slug,
		description: decode(release.description),
		comment: release.comment,
		deep: release.deep === true, // anything but an explicit boolean true counts as shallow
		deep_url: release.deepUrl,
		updated_batch_id: batchId,
	};

	if (release.id) {
		// a known ID means an existing row is being updated
		entry.id = release.id;
	}

	if (type === 'scene') {
		// numeric, deduplicated, highest quality first
		const qualities = [...new Set(release.qualities?.map(Number).filter(Boolean))];

		Object.assign(entry, {
			shoot_id: release.shootId || null,
			production_date: Number(release.productionDate) ? release.productionDate : null,
			duration: Number(release.duration) || null,
			qualities: qualities.sort((qualityA, qualityB) => qualityB - qualityA),
		});
	}

	if (release.productionLocation) {
		const location = decode(release.productionLocation);
		const place = argv.resolvePlace ? await resolvePlace(location) : null;

		entry.production_location = location;

		if (place) {
			entry.production_city = place.city;
			entry.production_state = place.state;
			entry.production_country_alpha2 = place.country;
		}
	}

	const isNewEntry = !(existingRelease || release.id);

	if (isNewEntry) {
		entry.created_batch_id = batchId;
	}

	return entry;
}
|
|
|
|
/**
 * Resolve `release.channel` slugs to channel entities and attach them as `release.entity`.
 *
 * Only releases without an entity, or with a network entity, contribute slugs to the lookup.
 * Releases that end up with no entity at all are logged and dropped.
 *
 * @param {Object[]} releases - Scraped releases.
 * @returns {Promise<Object[]>} Releases that have an entity.
 */
async function attachChannelEntities(releases) {
	const channelSlugs = releases
		.filter((release) => release.channel && (!release.entity || release.entity.type === 'network'))
		.map((release) => release.channel);

	const channelEntities = await knex('entities')
		.select(knex.raw('entities.*, row_to_json(parents) as parent'))
		.whereIn('entities.slug', channelSlugs)
		.where('entities.type', 'channel')
		.leftJoin('entities AS parents', 'parents.id', 'entities.parent_id');

	const channelEntitiesBySlug = Object.fromEntries(channelEntities.map((entity) => [entity.slug, entity]));

	const resolvedReleases = await Promise.all(releases.map(async (release) => {
		const channelEntity = release.channel && channelEntitiesBySlug[release.channel];

		if (channelEntity) {
			const entity = await curateEntity(channelEntity);

			return { ...release, entity };
		}

		if (!release.entity) {
			logger.error(`Unable to match channel '${release.channel?.slug || release.channel}' from generic URL ${release.url}`);

			return null;
		}

		// no matching channel, keep the entity the release already had
		return release;
	}));

	return resolvedReleases.filter(Boolean);
}
|
|
|
|
/**
 * Replace `release.studio` slugs with the matching studio entity rows.
 *
 * Releases whose studio slug cannot be matched are returned untouched after a warning.
 *
 * @param {Object[]} releases - Releases that may carry a studio slug.
 * @returns {Promise<Object[]>} Releases, in the same order, with resolved studios where available.
 */
async function attachStudios(releases) {
	const studioSlugs = releases
		.map((release) => release.studio)
		.filter(Boolean);

	const studios = await knex('entities')
		.whereIn('slug', studioSlugs)
		.where('type', 'studio');

	const studioBySlug = Object.fromEntries(studios.map((studio) => [studio.slug, studio]));

	return releases.map((release) => {
		if (!release.studio) {
			return release;
		}

		const studio = studioBySlug[release.studio];

		if (!studio) {
			logger.warn(`Unable to match studio '${release.studio}' for ${release.url}`);

			return release;
		}

		return { ...release, studio };
	});
}
|
|
|
|
/**
 * Attach stored row IDs to releases by matching entity ID + entry ID.
 *
 * The release's own entity is tried first, then its parent entity.
 * Releases without an entity are logged; releases without a match are dropped.
 *
 * @param {Object[]} releases - Releases with `entity` and `entryId`.
 * @param {Object[]} storedReleases - Stored rows with `id`, `entity_id` and `entry_id`.
 * @param {number} [batchId] - Copied onto every returned release.
 * @returns {Object[]} Copies of the matched releases with `id` and `batchId` set.
 */
function attachReleaseIds(releases, storedReleases, batchId) {
	const storedIds = {};

	storedReleases.forEach((stored) => {
		if (!storedIds[stored.entity_id]) {
			storedIds[stored.entity_id] = {};
		}

		storedIds[stored.entity_id][stored.entry_id] = stored.id;
	});

	return releases.flatMap((release) => {
		if (!release.entity) {
			logger.error(`No entity available for ${release.url}`);

			return [];
		}

		const ownId = storedIds[release.entity.id]?.[release.entryId];
		const id = ownId || storedIds[release.entity.parent?.id]?.[release.entryId];

		return id ? [{ ...release, id, batchId }] : [];
	});
}
|
|
|
|
/**
 * Deduplicate releases within a single batch by entity ID + entry ID.
 *
 * When the same pair occurs more than once, the last occurrence wins.
 * Releases without an entity are skipped; releases without an entry ID are skipped with a warning.
 *
 * @param {Object[]} releases - Releases to deduplicate.
 * @returns {Object[]} One release per entity ID + entry ID pair.
 */
function filterInternalDuplicateReleases(releases) {
	// plain objects on purpose: their key ordering determines the order of the result
	const releasesByEntityIdAndEntryId = {};

	releases.forEach((release) => {
		if (!release.entity) {
			return;
		}

		if (!release.entryId) {
			logger.warn(`No entry ID supplied for "${release.title}" from '${release.entity.name}'`);

			return;
		}

		const entityId = release.entity.id;

		if (!releasesByEntityIdAndEntryId[entityId]) {
			releasesByEntityIdAndEntryId[entityId] = {};
		}

		releasesByEntityIdAndEntryId[entityId][release.entryId] = release;
	});

	return Object.values(releasesByEntityIdAndEntryId)
		.flatMap((entityReleases) => Object.values(entityReleases));
}
|
|
|
|
/**
 * Split releases into those already stored and those that are new.
 *
 * Releases are first deduplicated within the batch, then looked up in `releases`
 * by entry ID + entity ID, in chunks.
 *
 * @param {Object[]} releases - Releases with `entity` and `entryId`.
 * @returns {Promise<{uniqueReleases: Object[], duplicateReleases: Object[], duplicateReleaseEntries: Object[]}>}
 *   New releases, releases that match a stored row, and the matching stored rows.
 */
async function filterDuplicateReleases(releases) {
	const internalUniqueReleases = filterInternalDuplicateReleases(releases);

	const duplicateReleaseEntryChunks = await Promise.map(chunk(internalUniqueReleases), async (releasesChunk) => {
		const ownPairs = releasesChunk.map((release) => [release.entryId, release.entity.id]);

		// scene IDs shared across network, mark as duplicate so scene can be updated with channel if only available on release day (i.e. Perv City)
		const networkPairs = releasesChunk
			.filter((release) => release.entity.parent?.parameters?.networkEntryIds)
			.map((release) => [release.entryId, release.entity.parent.id]);

		return knex('releases')
			.whereIn(['entry_id', 'entity_id'], ownPairs)
			.orWhereIn(['entry_id', 'entity_id'], networkPairs);
	}, { concurrency: 10 });

	const duplicateReleaseEntries = duplicateReleaseEntryChunks.flat();
	const storedByEntityIdAndEntryId = {};

	duplicateReleaseEntries.forEach((entry) => {
		if (!storedByEntityIdAndEntryId[entry.entity_id]) {
			storedByEntityIdAndEntryId[entry.entity_id] = {};
		}

		storedByEntityIdAndEntryId[entry.entity_id][entry.entry_id] = true;
	});

	// stored under the release's own entity, or under its parent
	const isStored = (release) => Boolean(storedByEntityIdAndEntryId[release.entity.id]?.[release.entryId]
		|| storedByEntityIdAndEntryId[release.entity.parent?.id]?.[release.entryId]);

	return {
		uniqueReleases: internalUniqueReleases.filter((release) => !isStored(release)),
		duplicateReleases: internalUniqueReleases.filter((release) => isStored(release)),
		duplicateReleaseEntries,
	};
}
|
|
|
|
/**
 * Rebuild the full-text search documents in `releases_search`, then refresh
 * the `releases_not_showcased` materialized view.
 *
 * @param {number[]} [releaseIds] - Releases to reindex; all releases when omitted.
 * @returns {Promise<void>}
 */
async function updateSceneSearch(releaseIds) {
	logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);

	const whereClause = releaseIds ? 'WHERE releases.id = ANY(?)' : '';
	const bindings = releaseIds && [releaseIds];

	const { rows } = await knex.raw(`
		SELECT
			releases.id AS release_id,
			TO_TSVECTOR(
				'english',
				COALESCE(releases.title, '') || ' ' ||
				releases.entry_id || ' ' ||
				entities.name || ' ' ||
				entities.slug || ' ' ||
				COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
				COALESCE(parents.name, '') || ' ' ||
				COALESCE(parents.slug, '') || ' ' ||
				COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
				COALESCE(releases.shoot_id, '') || ' ' ||
				COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
				STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(directors.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
			) as document
		FROM releases
		LEFT JOIN entities ON releases.entity_id = entities.id
		LEFT JOIN entities AS parents ON parents.id = entities.parent_id
		LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
		LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = releases.id
		LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
		LEFT JOIN actors ON local_actors.actor_id = actors.id
		LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
		LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 6
		LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
		${whereClause}
		GROUP BY releases.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
	`, bindings);

	if (rows?.length > 0) {
		await bulkInsert('releases_search', rows, ['release_id']);
	}

	await knex.raw('REFRESH MATERIALIZED VIEW CONCURRENTLY releases_not_showcased;');
}
|
|
|
|
/**
 * Store the chapters of the given releases, then associate their tags and media.
 *
 * Chapters are numbered by their position within the release (starting at 1)
 * and inserted ordered by time.
 *
 * @param {Object[]} releases - Stored releases (with `id`) that may carry a `chapters` array.
 * @returns {Promise<void>}
 */
async function storeChapters(releases) {
	const chapters = releases
		.flatMap((release) => release.chapters?.map((chapter, index) => ({
			releaseId: release.id,
			index: index + 1,
			time: chapter.time,
			duration: chapter.duration,
			title: chapter.title,
			description: chapter.description,
			poster: chapter.poster,
			photos: chapter.photos,
			tags: chapter.tags,
		})) || [])
		.sort((chapterA, chapterB) => chapterA.time - chapterB.time);

	const curatedChapterEntries = chapters.map((chapter) => ({
		index: chapter.index,
		time: chapter.time,
		duration: chapter.duration,
		title: chapter.title,
		description: chapter.description,
		release_id: chapter.releaseId,
	}));

	const storedChapters = await bulkInsert('chapters', curatedChapterEntries, ['release_id', 'index']);

	// same guard storeScenes applies to its bulkInsert result
	const storedChapterEntries = Array.isArray(storedChapters) ? storedChapters : [];
	const chapterIdsByReleaseIdAndIndex = {};

	storedChapterEntries.forEach((chapter) => {
		if (!chapterIdsByReleaseIdAndIndex[chapter.release_id]) {
			chapterIdsByReleaseIdAndIndex[chapter.release_id] = {};
		}

		chapterIdsByReleaseIdAndIndex[chapter.release_id][chapter.index] = chapter.id;
	});

	// a chapter without a returned row has no ID to associate with; skip it instead of throwing
	const chaptersWithId = chapters
		.map((chapter) => ({
			...chapter,
			id: chapterIdsByReleaseIdAndIndex[chapter.releaseId]?.[chapter.index],
		}))
		.filter((chapter) => chapter.id !== undefined && chapter.id !== null);

	if (chaptersWithId.length < chapters.length) {
		logger.warn(`Unable to resolve stored IDs for ${chapters.length - chaptersWithId.length} chapters, skipping their tags and media`);
	}

	await associateReleaseTags(chaptersWithId, 'chapter');

	// media is more error-prone, associate separately
	await associateReleaseMedia(chaptersWithId, 'chapter');
}
|
|
|
|
/**
 * Link scenes to the movie they belong to in `movies_scenes`.
 *
 * A scene's movie is looked up by entry ID under the scene's own entity, then under its parent.
 *
 * @param {Object[]} movies - Stored movies with `id`, `entryId` and `entity`.
 * @param {Object[]} movieScenes - Stored scenes; those without a `movie` are ignored.
 * @returns {Promise<void>}
 */
async function associateMovieScenes(movies, movieScenes) {
	const moviesByEntityIdAndEntryId = {};

	movies.forEach((movie) => {
		moviesByEntityIdAndEntryId[movie.entity.id] = {
			...moviesByEntityIdAndEntryId[movie.entity.id],
			[movie.entryId]: movie,
		};
	});

	const associations = movieScenes.flatMap((scene) => {
		if (!scene.movie) {
			return [];
		}

		const sceneMovie = moviesByEntityIdAndEntryId[scene.entity.id]?.[scene.movie.entryId]
			|| moviesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.movie.entryId];

		return sceneMovie?.id
			? [{ movie_id: sceneMovie.id, scene_id: scene.id }]
			: [];
	});

	await bulkInsert('movies_scenes', associations, false);
}
|
|
|
|
/**
 * Link scenes to the serie they belong to in `series_scenes`.
 *
 * A scene's serie is looked up by entry ID under the scene's own entity, then under its parent.
 *
 * @param {Object[]} series - Stored series with `id`, `entryId` and `entity`.
 * @param {Object[]} serieScenes - Stored scenes; those without a `serie` are ignored.
 * @returns {Promise<void>}
 */
async function associateSerieScenes(series, serieScenes) {
	const seriesByEntityIdAndEntryId = {};

	series.forEach((serie) => {
		seriesByEntityIdAndEntryId[serie.entity.id] = {
			...seriesByEntityIdAndEntryId[serie.entity.id],
			[serie.entryId]: serie,
		};
	});

	const associations = serieScenes.flatMap((scene) => {
		if (!scene.serie) {
			return [];
		}

		const sceneSerie = seriesByEntityIdAndEntryId[scene.entity.id]?.[scene.serie.entryId]
			|| seriesByEntityIdAndEntryId[scene.entity.parent?.id]?.[scene.serie.entryId];

		return sceneSerie?.id
			? [{ serie_id: sceneSerie.id, scene_id: scene.id }]
			: [];
	});

	await bulkInsert('series_scenes', associations, false);
}
|
|
|
|
/**
 * Rebuild the full-text search documents for movies or series.
 *
 * Table and column names are derived from `target` (`<target>s`, `<target>s_scenes`,
 * `<target>s_search`, `<target>_id`) and interpolated directly into the SQL,
 * so `target` must never come from untrusted input.
 *
 * @param {number[]} [movieIds] - Rows to reindex; all rows when omitted.
 * @param {string} [target='movie'] - Singular table stem; `'serie'` is the other value used in this file.
 * @returns {Promise<void>}
 */
async function updateMovieSearch(movieIds, target = 'movie') {
	logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } ${target}s`);

	const table = `${target}s`;
	const scenesTable = `${target}s_scenes`;
	const idColumn = `${target}_id`;

	const whereClause = movieIds ? `WHERE ${table}.id = ANY(?)` : '';
	const bindings = movieIds && [movieIds];

	const { rows } = await knex.raw(`
		SELECT
			${table}.id AS ${idColumn},
			TO_TSVECTOR(
				'english',
				COALESCE(${table}.title, '') || ' ' ||
				entities.name || ' ' ||
				entities.slug || ' ' ||
				COALESCE(array_to_string(entities.alias, ' '), '') || ' ' ||
				COALESCE(parents.name, '') || ' ' ||
				COALESCE(parents.slug, '') || ' ' ||
				COALESCE(array_to_string(parents.alias, ' '), '') || ' ' ||
				COALESCE(TO_CHAR(${table}.date, 'YYYY YY MM FMMM FMMonth mon DD FMDD'), '') || ' ' ||
				STRING_AGG(COALESCE(releases.title, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
				STRING_AGG(COALESCE(tags.name, ''), ' ')
			) as document
		FROM ${table}
		LEFT JOIN entities ON ${table}.entity_id = entities.id
		LEFT JOIN entities AS parents ON parents.id = entities.parent_id
		LEFT JOIN ${scenesTable} ON ${scenesTable}.${idColumn} = ${table}.id
		LEFT JOIN releases ON releases.id = ${scenesTable}.scene_id
		LEFT JOIN releases_actors ON releases_actors.release_id = ${scenesTable}.scene_id
		LEFT JOIN releases_tags ON releases_tags.release_id = releases.id
		LEFT JOIN actors ON actors.id = releases_actors.actor_id
		LEFT JOIN tags ON tags.id = releases_tags.tag_id
		${whereClause}
		GROUP BY ${table}.id, entities.name, entities.slug, entities.alias, parents.name, parents.slug, parents.alias;
	`, bindings);

	if (rows?.length > 0) {
		await bulkInsert(`${table}_search`, rows, [idColumn]);
	}
}
|
|
|
|
/**
 * Store movies that are not in the database yet, then index them for search
 * and associate their media.
 *
 * @param {Object[]} movies - Scraped movies with `entity` and `entryId`.
 * @param {number} [useBatchId] - Existing batch to record; a new batch row is inserted when omitted.
 * @returns {Promise<Object[]>} The movies that could be matched to a row stored by this call, with `id` attached.
 */
async function storeMovies(movies, useBatchId) {
	if (!movies || movies.length === 0) {
		return [];
	}

	const { uniqueReleases } = await filterDuplicateReleases(movies);
	const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');

	const curatedMovieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'movie')));
	const storedMovies = await bulkInsert('movies', curatedMovieEntries, ['entity_id', 'entry_id'], true);

	// same guard storeScenes applies: attachReleaseIds needs an array to reduce over
	const storedMovieEntries = Array.isArray(storedMovies) ? storedMovies : [];
	const moviesWithId = attachReleaseIds(movies, storedMovieEntries);

	await updateMovieSearch(moviesWithId.map((movie) => movie.id));
	await associateReleaseMedia(moviesWithId, 'movie');

	return moviesWithId;
}
|
|
|
|
/**
 * Store series that are not in the database yet, then index them for search
 * and associate their media.
 *
 * @param {Object[]} series - Scraped series with `entity` and `entryId`.
 * @param {number} [useBatchId] - Existing batch to record; a new batch row is inserted when omitted.
 * @returns {Promise<Object[]>} The series that could be matched to a row stored by this call, with `id` attached.
 */
async function storeSeries(series, useBatchId) {
	if (!series || series.length === 0) {
		return [];
	}

	const { uniqueReleases } = await filterDuplicateReleases(series);
	const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');

	const curatedSerieEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId, null, 'serie')));
	const storedSeries = await bulkInsert('series', curatedSerieEntries, ['entity_id', 'entry_id'], true);

	// same guard storeScenes applies: attachReleaseIds needs an array to reduce over
	const storedSerieEntries = Array.isArray(storedSeries) ? storedSeries : [];
	const seriesWithId = attachReleaseIds(series, storedSerieEntries);

	await updateMovieSearch(seriesWithId.map((serie) => serie.id), 'serie');
	await associateReleaseMedia(seriesWithId, 'serie');

	return seriesWithId;
}
|
|
|
|
/**
 * Store scraped scenes: insert new ones, update the ones already stored, then
 * associate actors, series, tags, chapters, directors and media, refresh the
 * search documents and send notifications.
 *
 * @param {Object[]} releases - Scraped scenes.
 * @param {number} [useBatchId] - Existing batch to record; a new batch row is inserted when omitted.
 * @returns {Promise<Object[]>} New and updated scenes with their stored `id` and `batchId` attached.
 */
async function storeScenes(releases, useBatchId) {
	if (!releases || releases.length === 0) {
		return [];
	}

	const [batchId] = useBatchId ? [useBatchId] : await knex('batches').insert({ comment: null }).returning('id');

	const releasesWithChannels = await attachChannelEntities(releases);
	const releasesWithBaseActors = releasesWithChannels.map((release) => ({ ...release, actors: toBaseActors(release.actors) }));
	const releasesWithStudios = await attachStudios(releasesWithBaseActors);

	// uniqueness is entity ID + entry ID, filter uniques after adding entities
	const { uniqueReleases, duplicateReleases, duplicateReleaseEntries } = await filterDuplicateReleases(releasesWithStudios);

	const curatedNewReleaseEntries = await Promise.all(uniqueReleases.map((release) => curateReleaseEntry(release, batchId)));
	const storedReleases = await bulkInsert('releases', curatedNewReleaseEntries);

	const storedReleaseEntries = Array.isArray(storedReleases) ? storedReleases : [];

	const uniqueReleasesWithId = attachReleaseIds(uniqueReleases, storedReleaseEntries, batchId);
	const duplicateReleasesWithId = attachReleaseIds(duplicateReleases, duplicateReleaseEntries, batchId);
	const curatedDuplicateReleases = await Promise.all(duplicateReleasesWithId.map((release) => curateReleaseEntry(release, batchId)));
	const releasesWithId = uniqueReleasesWithId.concat(duplicateReleasesWithId);

	// curated entries expose the entity as entity_id, so that is the column the update has to read;
	// the entity->>'id' fallback keeps rows that still carry a full entity object working
	const updated = await knex.raw(`
		UPDATE releases
		SET url = COALESCE(new.url, releases.url),
			date = COALESCE(new.date, releases.date),
			entity_id = COALESCE(new.entity_id, (new.entity->>'id')::integer, releases.entity_id),
			title = COALESCE(new.title, releases.title),
			description = COALESCE(new.description, releases.description),
			duration = COALESCE(new.duration, releases.duration),
			deep = new.url IS NOT NULL,
			updated_at = NOW()
		FROM json_to_recordset(:scenes)
		AS new(id int, url text, date timestamptz, entity_id int, entity json, title text, description text, duration integer, deep boolean)
		WHERE releases.id = new.id
	`, {
		scenes: JSON.stringify(curatedDuplicateReleases),
	});

	const [actors, storedSeries] = await Promise.all([
		associateActors(releasesWithId, batchId),
		storeSeries(releasesWithId.map((release) => release.serie && { ...release.serie, entity: release.entity }).filter(Boolean), batchId),
		associateReleaseTags(releasesWithId),
		storeChapters(releasesWithId),
	]);

	await associateSerieScenes(storedSeries, releasesWithId);
	await associateDirectors(releasesWithId, batchId); // some directors may also be actors, don't associate at the same time
	await updateSceneSearch(releasesWithId.map((release) => release.id));

	// media is more error-prone, associate separately
	await associateReleaseMedia(releasesWithId);

	if (argv.sceneActors && actors) {
		await scrapeActors(actors.map((actor) => actor.name));
	}

	logger.info(`Stored ${storedReleaseEntries.length}, updated ${updated.rowCount} releases`);

	await notify(releasesWithId);

	return releasesWithId;
}
|
|
|
|
module.exports = {
|
|
associateMovieScenes,
|
|
storeScenes,
|
|
storeMovies,
|
|
updateSceneSearch,
|
|
updateMovieSearch,
|
|
};
|