Added stashes to Manticore search update.

This commit is contained in:
DebaucheryLibrarian
2024-03-15 00:57:28 +01:00
parent b96d996947
commit f83ea2436d
2417 changed files with 565 additions and 111 deletions

View File

@@ -231,7 +231,7 @@ async function scrapeScene({ html, query }, context) {
function scrapeMovie({ el, query }, url, site) {
const movie = { url, site };
movie.entryId = new URL(url).pathname.split('/').slice(-1)[0]?.replace('.html', '');
movie.entryId = new URL(url).pathname.split('/').slice(-1)[0]?.replace('.html', '').toLowerCase();
movie.title = query.cnt('.title_bar span');
movie.covers = query.urls('#dvd-cover-flip > a');
movie.channel = slugify(query.q('.update_date a', true), '');

View File

@@ -1,6 +1,7 @@
'use strict';
const { JSDOM } = require('jsdom');
const unprint = require('unprint');
const moment = require('moment');
const http = require('../utils/http');
@@ -50,14 +51,16 @@ function scrapeLatest(html, site) {
};
}
console.log(release);
return release;
});
}
function scrapeScene(html, site, url) {
const { document } = new JSDOM(html).window;
const release = { site };
function scrapeScene({ query }, { url, entity }) {
const release = {};
<<<<<<< Updated upstream
const scene = document.querySelector('#t2019-2col');
release.url = url;
@@ -78,43 +81,47 @@ function scrapeScene(html, site, url) {
release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), (el) => ({
src: (/^http/.test(el.src) ? el.src : `https:${el.src}`),
referer: site.url,
=======
release.entryId = new URL(url).pathname.match('video/(.*)')?.[1];
release.title = query.content('.scene-info .text-2xl');
release.description = query.content('.scene-info .space-x-4 span');
release.actors = query.all('.scene-info .link-list-with-commas a').map((el) => ({
name: unprint.query.content(el),
url: unprint.query.url(el, null, { origin: entity.url }),
}));
// unreliable CDN
release.photos = query.imgs('#trailer_player .flex-row img').filter((src) => src?.includes('cdn.')).map((src) => ({
src,
referer: entity.url,
>>>>>>> Stashed changes
attempts: 5,
interval: 5000,
concurrency: 1,
}));
const posterEl = scene.querySelector('#no-player-image');
const videoEl = scene.querySelector('video');
const trailerEl = scene.querySelector('#t2019-video source');
release.poster = {
src: query.img('#no-player-image') || query.poster('#player'),
referer: entity.url,
attempts: 5,
interval: 5000,
concurrency: 1,
};
if (posterEl) {
release.poster = {
src: /^http/.test(posterEl.src) ? posterEl.src : `https:${posterEl.src}`,
referer: site.url,
attempts: 5,
interval: 5000,
concurrency: 1,
};
} else if (videoEl) {
release.poster = {
src: /^http/.test(videoEl.poster) ? videoEl.poster : `https:${videoEl.poster}`,
referer: site.url,
attempts: 5,
interval: 5000,
concurrency: 1,
};
}
if (trailerEl) {
if (query.exists('#player source')) {
release.trailer = {
src: trailerEl.src,
referer: site.url,
src: query.video('#player source'),
referer: entity.url,
attempts: 5,
interval: 5000,
concurrency: 1,
};
}
console.log(release);
return release;
}
@@ -129,17 +136,10 @@ async function fetchLatest(site, page = 1) {
return [];
}
async function fetchScene(url, site) {
const res = await http.get(url);
if (res.statusCode === 200) {
return scrapeScene(res.body.toString(), site, url);
}
return null;
}
module.exports = {
fetchLatest,
fetchScene,
scrapeScene: {
scraper: scrapeScene,
unprint: true,
},
};

View File

@@ -21,9 +21,11 @@ async function fetchActors() {
// manually select date of birth, otherwise it is retrieved in local timezone but interpreted as UTC...
const actors = await knex.raw(`
SELECT
actors.*,
actors_meta.*,
date_of_birth AT TIME ZONE 'Europe/Amsterdam' AT TIME ZONE 'UTC' as dob
FROM actors_meta;
FROM actors
LEFT JOIN actors_meta ON actors_meta.actor_id = actors.id
`);
return actors.rows;
@@ -31,7 +33,7 @@ async function fetchActors() {
async function init() {
if (update) {
await utilsApi.sql('drop table actors');
await utilsApi.sql('drop table if exists actors');
await utilsApi.sql(`create table actors(
id int,
name text,

View File

@@ -0,0 +1,147 @@
'use strict';
const config = require('config');
const manticore = require('manticoresearch');
const args = require('yargs').argv;
const { format } = require('date-fns');
const knex = require('../knex');
const mantiClient = new manticore.ApiClient();
mantiClient.basePath = `http://${config.database.manticore.host}:${config.database.manticore.httpPort}`;
// const searchApi = new manticore.SearchApi(mantiClient);
const utilsApi = new manticore.UtilsApi(mantiClient);
const indexApi = new manticore.IndexApi(mantiClient);
const update = args.update;
/**
 * Fetch every movie together with the data needed to build its Manticore document:
 * channel and parent network, cover presence, stash counters, and the actors and tags
 * of the scenes linked to the movie through movies_scenes.
 *
 * Actors and tags are aggregated with JSON_AGG over anonymous records, so their
 * fields arrive as f1, f2, ... (e.g. actor.f1 = id, actor.f2 = name).
 *
 * @returns {Promise<Object[]>} the raw result rows
 */
async function fetchMovies() {
	// stashed_scenes and stashed_total must be selected explicitly; they were only listed
	// in GROUP BY before, so the indexer always read them as undefined and stored 0.
	const movies = await knex.raw(`
		SELECT
			movies.id AS id,
			movies.title,
			movies.created_at,
			movies.date,
			movies_meta.stashed,
			movies_meta.stashed_scenes,
			movies_meta.stashed_total,
			entities.id as channel_id,
			entities.slug as channel_slug,
			entities.name as channel_name,
			parents.id as network_id,
			parents.slug as network_slug,
			parents.name as network_name,
			movies_covers IS NOT NULL as has_cover,
			COALESCE(JSON_AGG(DISTINCT (actors.id, actors.name)) FILTER (WHERE actors.id IS NOT NULL), '[]') as actors,
			COALESCE(JSON_AGG(DISTINCT (tags.id, tags.name, tags.priority, tags_aliases.name)) FILTER (WHERE tags.id IS NOT NULL), '[]') as tags
		FROM movies
		LEFT JOIN movies_meta ON movies_meta.movie_id = movies.id
		LEFT JOIN movies_scenes ON movies_scenes.movie_id = movies.id
		LEFT JOIN entities ON movies.entity_id = entities.id
		LEFT JOIN entities AS parents ON parents.id = entities.parent_id
		LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = movies_scenes.scene_id
		LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = movies_scenes.scene_id
		LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = movies_scenes.scene_id
		LEFT JOIN actors ON local_actors.actor_id = actors.id
		LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
		LEFT JOIN tags ON local_tags.tag_id = tags.id
		LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
		LEFT JOIN movies_covers ON movies_covers.movie_id = movies.id
		GROUP BY
			movies.id,
			movies.title,
			movies.created_at,
			movies.date,
			movies_meta.stashed,
			movies_meta.stashed_scenes,
			movies_meta.stashed_total,
			entities.id,
			entities.name,
			entities.slug,
			entities.alias,
			parents.id,
			parents.name,
			parents.slug,
			parents.alias,
			movies_covers.*
	`);

	return movies.rows;
}
/**
 * Rebuild the Manticore `movies` table from the database.
 *
 * Only acts when the `update` argument is set: drops and recreates the table, builds
 * one `replace` document per fetched movie and submits them in a single bulk request.
 * The knex connection is always released afterwards, including when a step fails.
 *
 * @returns {Promise<void>}
 */
async function init() {
	try {
		if (update) {
			await utilsApi.sql('drop table if exists movies');
			await utilsApi.sql(`create table movies (
				id int,
				title text,
				title_filtered text,
				channel_id int,
				channel_name text,
				channel_slug text,
				network_id int,
				network_name text,
				network_slug text,
				actor_ids multi,
				actors text,
				tag_ids multi,
				tags text,
				meta text,
				date timestamp,
				has_cover bool,
				created_at timestamp,
				effective_date timestamp,
				stashed int,
				stashed_scenes int,
				stashed_total int
			)`);

			const movies = await fetchMovies();

			const docs = movies.map((movie) => {
				// match word characters to filter out brackets etc.; match() yields null when a
				// name has no word characters at all, so fall back to an empty list
				const flatActors = movie.actors.flatMap((actor) => actor.f2.match(/[\w']+/g) || []);
				// only make top tags searchable to minimize cluttered results
				const flatTags = movie.tags
					.filter((tag) => tag.f3 > 6)
					.flatMap((tag) => (tag.f4 ? `${tag.f2} ${tag.f4}` : tag.f2).match(/[\w']+/g) || []);

				// strip regex-significant characters from each term before building a pattern from it,
				// and skip terms that end up empty
				const searchTerms = [...flatActors, ...flatTags]
					.map((term) => term.replace(/[^\w\s]+/g, ''))
					.filter(Boolean);

				// title with actor and tag words removed
				const filteredTitle = movie.title && searchTerms
					.reduce((accTitle, term) => accTitle.replace(new RegExp(term, 'gi'), ''), movie.title)
					.trim()
					.replace(/\s{2,}/g, ' ');

				return {
					replace: {
						index: 'movies',
						id: movie.id,
						doc: {
							title: movie.title || undefined,
							title_filtered: filteredTitle || undefined,
							date: movie.date ? Math.round(movie.date.getTime() / 1000) : undefined,
							created_at: Math.round(movie.created_at.getTime() / 1000),
							effective_date: Math.round((movie.date || movie.created_at).getTime() / 1000),
							channel_id: movie.channel_id,
							channel_slug: movie.channel_slug,
							channel_name: movie.channel_name,
							network_id: movie.network_id || undefined,
							network_slug: movie.network_slug || undefined,
							network_name: movie.network_name || undefined,
							actor_ids: movie.actors.map((actor) => actor.f1),
							actors: movie.actors.map((actor) => actor.f2).join(),
							tag_ids: movie.tags.map((tag) => tag.f1),
							tags: flatTags.join(' '),
							has_cover: movie.has_cover,
							meta: movie.date ? format(movie.date, 'y yy M MMM MMMM d') : undefined,
							stashed: movie.stashed || 0,
							stashed_scenes: movie.stashed_scenes || 0,
							stashed_total: movie.stashed_total || 0,
						},
					},
				};
			});

			console.log(docs.map((doc) => doc.replace));

			// an empty bulk payload has nothing to index, so skip the request
			if (docs.length > 0) {
				const data = await indexApi.bulk(docs.map((doc) => JSON.stringify(doc)).join('\n'));

				console.log('data', data);
			}
		}
	} finally {
		// release the connection even when a step above throws
		await knex.destroy();
	}
}
init();

View File

@@ -21,11 +21,11 @@ const update = args.update;
async function fetchScenes() {
const scenes = await knex.raw(`
SELECT
scenes_meta.id AS id,
scenes_meta.title,
scenes_meta.created_at,
scenes_meta.date,
scenes_meta.shoot_id,
releases.id AS id,
releases.title,
releases.created_at,
releases.date,
releases.shoot_id,
scenes_meta.stashed,
entities.id as channel_id,
entities.slug as channel_slug,
@@ -35,22 +35,23 @@ async function fetchScenes() {
parents.name as network_name,
COALESCE(JSON_AGG(DISTINCT (actors.id, actors.name)) FILTER (WHERE actors.id IS NOT NULL), '[]') as actors,
COALESCE(JSON_AGG(DISTINCT (tags.id, tags.name, tags.priority, tags_aliases.name)) FILTER (WHERE tags.id IS NOT NULL), '[]') as tags
FROM scenes_meta
LEFT JOIN entities ON scenes_meta.entity_id = entities.id
FROM releases
LEFT JOIN scenes_meta ON scenes_meta.scene_id = releases.id
LEFT JOIN entities ON releases.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = scenes_meta.id
LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = scenes_meta.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = scenes_meta.id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = releases.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
LEFT JOIN actors ON local_actors.actor_id = actors.id
LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
LEFT JOIN tags ON local_tags.tag_id = tags.id
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
GROUP BY
scenes_meta.id,
scenes_meta.title,
scenes_meta.created_at,
scenes_meta.date,
scenes_meta.shoot_id,
releases.id,
releases.title,
releases.created_at,
releases.date,
releases.shoot_id,
scenes_meta.stashed,
entities.id,
entities.name,
@@ -67,7 +68,7 @@ async function fetchScenes() {
async function init() {
if (update) {
await utilsApi.sql('drop table scenes');
await utilsApi.sql('drop table if exists scenes');
await utilsApi.sql(`create table scenes (
id int,
title text,
@@ -95,7 +96,7 @@ async function init() {
const docs = scenes.map((scene) => {
const flatActors = scene.actors.flatMap((actor) => actor.f2.match(/[\w']+/g)); // match word characters to filter out brackets etc.
const flatTags = scene.tags.filter((tag) => tag.f3 > 6).flatMap((tag) => (tag.f4 ? `${tag.f2} ${tag.f4}` : tag.f2).match(/[\w']+/g)); // only make top tags searchable to minimize cluttered results
const filteredTitle = scene.title && [...flatActors, ...flatTags].reduce((accTitle, tag) => accTitle.replace(new RegExp(tag, 'i'), ''), scene.title).trim().replace(/\s{2,}/, ' ');
const filteredTitle = scene.title && [...flatActors, ...flatTags].reduce((accTitle, tag) => accTitle.replace(new RegExp(tag.replace(/[^\w\s]+/g, ''), 'i'), ''), scene.title).trim().replace(/\s{2,}/, ' ');
return {
replace: {

View File

@@ -6,18 +6,61 @@ const { format } = require('date-fns');
const knex = require('./knex');
const logger = require('./logger')(__filename);
const bulkInsert = require('./utils/bulk-insert');
const chunk = require('./utils/chunk');
const mantiClient = new manticore.ApiClient();
const indexApi = new manticore.IndexApi(mantiClient);
async function updateManticoreSearch(releaseIds) {
/**
 * Write the stash entries of the given scene documents to the Manticore
 * `scenes_stashed` index.
 *
 * The documents are processed in chunks of 1000, one chunk after another. For each
 * chunk the matching stashes_scenes rows are loaded, and one `replace` document per
 * stash entry is submitted in a single bulk request.
 *
 * @param {Object[]} docs - scene bulk documents; only `doc.replace.id` is read
 * @returns {Promise<void>}
 */
async function updateManticoreStashedScenes(docs) {
	for (const docsChunk of chunk(docs, 1000)) {
		const chunkSceneIds = docsChunk.map((sceneDoc) => sceneDoc.replace.id);

		const stashRows = await knex('stashes_scenes')
			.select('stashes_scenes.id as stashed_id', 'stashes_scenes.scene_id', 'stashes.id as stash_id', 'stashes.user_id as user_id')
			.leftJoin('stashes', 'stashes.id', 'stashes_scenes.stash_id')
			.whereIn('scene_id', chunkSceneIds);

		// one entry per (scene, stash) pair, in scene document order
		const stashDocs = docsChunk.flatMap((sceneDoc) => {
			const sceneId = sceneDoc.replace.id;

			return stashRows
				.filter((row) => row.scene_id === sceneId)
				.map((row) => ({
					replace: {
						index: 'scenes_stashed',
						id: row.stashed_id,
						doc: {
							scene_id: sceneId,
							user_id: row.user_id,
							stash_id: row.stash_id,
						},
					},
				}));
		});

		if (stashDocs.length > 0) {
			const payload = stashDocs.map((stashDoc) => JSON.stringify(stashDoc)).join('\n');

			await indexApi.bulk(payload);
		}
	}
}
async function updateManticoreSceneSearch(releaseIds) {
logger.info(`Updating Manticore search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
const scenes = await knex.raw(`
SELECT
scenes_meta.id AS id,
scenes_meta.title,
scenes_meta.created_at,
scenes_meta.date,
scenes_meta.shoot_id,
releases.id AS id,
releases.title,
releases.created_at,
releases.date,
releases.shoot_id,
scenes_meta.stashed,
entities.id as channel_id,
entities.slug as channel_slug,
@@ -27,23 +70,24 @@ async function updateManticoreSearch(releaseIds) {
parents.name as network_name,
COALESCE(JSON_AGG(DISTINCT (actors.id, actors.name)) FILTER (WHERE actors.id IS NOT NULL), '[]') as actors,
COALESCE(JSON_AGG(DISTINCT (tags.id, tags.name, tags.priority)) FILTER (WHERE tags.id IS NOT NULL), '[]') as tags
FROM scenes_meta
LEFT JOIN entities ON scenes_meta.entity_id = entities.id
FROM releases
LEFT JOIN scenes_meta ON scenes_meta.scene_id = releases.id
LEFT JOIN entities ON releases.entity_id = entities.id
LEFT JOIN entities AS parents ON parents.id = entities.parent_id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = scenes_meta.id
LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = scenes_meta.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = scenes_meta.id
LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = releases.id
LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
LEFT JOIN actors ON local_actors.actor_id = actors.id
LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 6
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
${releaseIds ? 'WHERE scenes_meta.id = ANY(?)' : ''}
${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
GROUP BY
scenes_meta.id,
scenes_meta.title,
scenes_meta.created_at,
scenes_meta.date,
scenes_meta.shoot_id,
releases.id,
releases.title,
releases.created_at,
releases.date,
releases.shoot_id,
scenes_meta.stashed,
entities.id,
entities.name,
@@ -58,7 +102,7 @@ async function updateManticoreSearch(releaseIds) {
const docs = scenes.rows.map((scene) => {
const flatActors = scene.actors.flatMap((actor) => actor.f2.split(' '));
const flatTags = scene.tags.filter((tag) => tag.f3 > 6).flatMap((tag) => tag.f2.split(' ')); // only make top tags searchable to minimize cluttered results
const filteredTitle = scene.title && [...flatActors, ...flatTags].reduce((accTitle, tag) => accTitle.replace(new RegExp(tag, 'i'), ''), scene.title).trim().replace(/\s{2,}/, ' ');
const filteredTitle = scene.title && [...flatActors, ...flatTags].reduce((accTitle, tag) => accTitle.replace(new RegExp(tag.replace(/[^\w\s]+/g, ''), 'i'), ''), scene.title).trim().replace(/\s{2,}/, ' ');
return {
replace: {
@@ -92,11 +136,14 @@ async function updateManticoreSearch(releaseIds) {
return;
}
await indexApi.bulk(docs.map((doc) => JSON.stringify(doc)).join('\n'));
await Promise.all([
indexApi.bulk(docs.map((doc) => JSON.stringify(doc)).join('\n')),
updateManticoreStashedScenes(docs),
]);
}
async function updateSqlSearch(releaseIds) {
logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
async function updateSqlSceneSearch(releaseIds) {
logger.info(`Updating SQL search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
const documents = await knex.raw(`
SELECT
@@ -142,11 +189,103 @@ async function updateSqlSearch(releaseIds) {
async function updateSceneSearch(releaseIds) {
await knex.raw('REFRESH MATERIALIZED VIEW scenes_meta;');
await updateSqlSearch(releaseIds);
await updateManticoreSearch(releaseIds);
await updateSqlSceneSearch(releaseIds);
await updateManticoreSceneSearch(releaseIds);
}
async function updateMovieSearch(movieIds, target = 'movie') {
/**
 * Rebuild the Manticore `movies` documents for the given movies, or for all movies
 * when no IDs are passed.
 *
 * Actors and tags are aggregated from the scenes linked to each movie through
 * movies_scenes. They are aggregated with JSON_AGG over anonymous records, so their
 * fields arrive as f1, f2, ... (e.g. actor.f1 = id, actor.f2 = name).
 *
 * @param {number[]} [movieIds] - movie IDs to update; falsy updates every movie
 * @returns {Promise<void>}
 */
async function updateManticoreMovieSearch(movieIds) {
	// stashed_scenes and stashed_total must be selected explicitly; they were only listed
	// in GROUP BY before, so the documents below always stored them as 0.
	const movies = await knex.raw(`
		SELECT
			movies.id AS id,
			movies.title,
			movies.created_at,
			movies.date,
			movies_meta.stashed,
			movies_meta.stashed_scenes,
			movies_meta.stashed_total,
			entities.id as channel_id,
			entities.slug as channel_slug,
			entities.name as channel_name,
			parents.id as network_id,
			parents.slug as network_slug,
			parents.name as network_name,
			movies_covers IS NOT NULL as has_cover,
			COALESCE(JSON_AGG(DISTINCT (actors.id, actors.name)) FILTER (WHERE actors.id IS NOT NULL), '[]') as actors,
			COALESCE(JSON_AGG(DISTINCT (tags.id, tags.name, tags.priority, tags_aliases.name)) FILTER (WHERE tags.id IS NOT NULL), '[]') as tags
		FROM movies
		LEFT JOIN movies_meta ON movies_meta.movie_id = movies.id
		LEFT JOIN movies_scenes ON movies_scenes.movie_id = movies.id
		LEFT JOIN entities ON movies.entity_id = entities.id
		LEFT JOIN entities AS parents ON parents.id = entities.parent_id
		LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = movies_scenes.scene_id
		LEFT JOIN releases_directors AS local_directors ON local_directors.release_id = movies_scenes.scene_id
		LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = movies_scenes.scene_id
		LEFT JOIN actors ON local_actors.actor_id = actors.id
		LEFT JOIN actors AS directors ON local_directors.director_id = directors.id
		LEFT JOIN tags ON local_tags.tag_id = tags.id
		LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
		LEFT JOIN movies_covers ON movies_covers.movie_id = movies.id
		${movieIds ? 'WHERE movies.id = ANY(?)' : ''}
		GROUP BY
			movies.id,
			movies.title,
			movies.created_at,
			movies.date,
			movies_meta.stashed,
			movies_meta.stashed_scenes,
			movies_meta.stashed_total,
			entities.id,
			entities.name,
			entities.slug,
			entities.alias,
			parents.id,
			parents.name,
			parents.slug,
			parents.alias,
			movies_covers.*
	`, movieIds && [movieIds]);

	const docs = movies.rows.map((movie) => {
		// match word characters to filter out brackets etc.; match() yields null when a
		// name has no word characters at all, so fall back to an empty list
		const flatActors = movie.actors.flatMap((actor) => actor.f2.match(/[\w']+/g) || []);
		// only make top tags searchable to minimize cluttered results
		const flatTags = movie.tags
			.filter((tag) => tag.f3 > 6)
			.flatMap((tag) => (tag.f4 ? `${tag.f2} ${tag.f4}` : tag.f2).match(/[\w']+/g) || []);

		// strip regex-significant characters from each term before building a pattern from it,
		// and skip terms that end up empty
		const searchTerms = [...flatActors, ...flatTags]
			.map((term) => term.replace(/[^\w\s]+/g, ''))
			.filter(Boolean);

		// title with actor and tag words removed
		const filteredTitle = movie.title && searchTerms
			.reduce((accTitle, term) => accTitle.replace(new RegExp(term, 'gi'), ''), movie.title)
			.trim()
			.replace(/\s{2,}/g, ' ');

		return {
			replace: {
				index: 'movies',
				id: movie.id,
				doc: {
					title: movie.title || undefined,
					title_filtered: filteredTitle || undefined,
					date: movie.date ? Math.round(movie.date.getTime() / 1000) : undefined,
					created_at: Math.round(movie.created_at.getTime() / 1000),
					effective_date: Math.round((movie.date || movie.created_at).getTime() / 1000),
					channel_id: movie.channel_id,
					channel_slug: movie.channel_slug,
					channel_name: movie.channel_name,
					network_id: movie.network_id || undefined,
					network_slug: movie.network_slug || undefined,
					network_name: movie.network_name || undefined,
					actor_ids: movie.actors.map((actor) => actor.f1),
					actors: movie.actors.map((actor) => actor.f2).join(),
					tag_ids: movie.tags.map((tag) => tag.f1),
					tags: flatTags.join(' '),
					has_cover: movie.has_cover,
					meta: movie.date ? format(movie.date, 'y yy M MMM MMMM d') : undefined,
					stashed: movie.stashed || 0,
					stashed_scenes: movie.stashed_scenes || 0,
					stashed_total: movie.stashed_total || 0,
				},
			},
		};
	});

	if (docs.length === 0) {
		return;
	}

	await indexApi.bulk(docs.map((doc) => JSON.stringify(doc)).join('\n'));
}
async function updateSqlMovieSearch(movieIds, target = 'movie') {
logger.info(`Updating search documents for ${movieIds ? movieIds.length : 'all' } ${target}s`);
const documents = await knex.raw(`
@@ -184,6 +323,13 @@ async function updateMovieSearch(movieIds, target = 'movie') {
}
}
/**
 * Refresh the movie search data for the given movies in both search backends.
 *
 * The steps run strictly in sequence: the movies_meta materialized view is refreshed
 * first, then the SQL search documents, then the Manticore documents.
 *
 * @param {number[]} [releaseIds] - IDs forwarded unchanged to both updaters; despite
 *   the name, updateManticoreMovieSearch matches them against movies.id
 * @returns {Promise<void>}
 */
async function updateMovieSearch(releaseIds) {
// refresh first: updateManticoreMovieSearch joins movies_meta for its stash counters
await knex.raw('REFRESH MATERIALIZED VIEW movies_meta;');
await updateSqlMovieSearch(releaseIds);
await updateManticoreMovieSearch(releaseIds);
}
module.exports = {
updateSceneSearch,
updateMovieSearch,