diff --git a/migrations/20190325001339_releases.js b/migrations/20190325001339_releases.js index 3a3280ee..e7c313df 100644 --- a/migrations/20190325001339_releases.js +++ b/migrations/20190325001339_releases.js @@ -476,6 +476,15 @@ exports.up = knex => Promise.resolve() ALTER TABLE releases_search ADD COLUMN document tsvector; + CREATE TEXT SEARCH DICTIONARY traxxx ( + TEMPLATE = pg_catalog.simple, + stopwords = traxxx + ); + + CREATE TEXT SEARCH CONFIGURATION traxxx ( + COPY = english + ); + CREATE UNIQUE INDEX releases_search_unique ON releases_search (release_id); CREATE INDEX releases_search_index ON releases_search USING GIN (document); @@ -492,8 +501,8 @@ exports.up = knex => Promise.resolve() CREATE FUNCTION search_releases(query text) RETURNS SETOF releases_search AS $$ SELECT * FROM releases_search AS search - WHERE search.document @@ plainto_tsquery(replace(query, '.', ' ')) - ORDER BY ts_rank(search.document, plainto_tsquery(replace(query, '.', ' '))) DESC; + WHERE search.document @@ plainto_tsquery('traxxx', replace(query, '.', ' ')) + ORDER BY ts_rank(search.document, plainto_tsquery('traxxx', replace(query, '.', ' '))) DESC; $$ LANGUAGE SQL STABLE; /* @@ -557,4 +566,7 @@ exports.down = knex => knex.raw(` DROP TABLE IF EXISTS media CASCADE; DROP TABLE IF EXISTS countries CASCADE; DROP TABLE IF EXISTS networks CASCADE; + + DROP TEXT SEARCH CONFIGURATION IF EXISTS traxxx; + DROP TEXT SEARCH DICTIONARY IF EXISTS traxxx; `); diff --git a/src/releases.js b/src/releases.js index b0ae915c..3b3ed71d 100644 --- a/src/releases.js +++ b/src/releases.js @@ -374,21 +374,30 @@ async function updateReleasesSearch(releaseIds) { SELECT releases.id as release_id, to_tsvector( + 'traxxx', releases.title || ' ' || sites.name || ' ' || sites.slug || ' ' || - replace(CAST(releases.date AS VARCHAR), '-', ' ') || ' ' || + networks.name || ' ' || + networks.slug || ' ' || + EXTRACT(YEAR FROM releases.date) || ' ' || + CAST(EXTRACT(MONTH FROM releases.date) AS VARCHAR) || ' ' || + CAST(EXTRACT(DAY FROM releases.date) AS VARCHAR) || ' ' || + SUBSTRING(CAST(EXTRACT(YEAR FROM releases.date) AS VARCHAR) FROM 3 for 2) || ' ' || + LPAD(CAST(EXTRACT(MONTH FROM releases.date) AS VARCHAR), 2, '0') || ' ' || + LPAD(CAST(EXTRACT(DAY FROM releases.date) AS VARCHAR), 2, '0') || ' ' || string_agg(coalesce(actors.name, ''), ' ') || ' ' || string_agg(coalesce(tags.name, ''), ' ') ) as document FROM releases - JOIN sites ON releases.site_id = sites.id + LEFT JOIN sites ON releases.site_id = sites.id + LEFT JOIN networks ON sites.network_id = networks.id LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id LEFT JOIN actors ON local_actors.actor_id = actors.id LEFT JOIN tags ON local_tags.tag_id = tags.id WHERE releases.id = ANY(?) - GROUP BY releases.id, sites.name, sites.slug; + GROUP BY releases.id, sites.name, sites.slug, networks.name, networks.slug; `, [releaseIds]); if (documents.rows?.length > 0) { @@ -467,13 +476,14 @@ async function storeReleases(releases) { const actors = accumulateActors(storedReleases); const movies = accumulateMovies(storedReleases); + await associateActors(actors, storedReleases); + await Promise.all([ - associateActors(actors, storedReleases), + // actors need to be stored before generating search + updateReleasesSearch(storedReleases.map(release => release.id)), storeReleaseAssets(storedReleases), ]); - await updateReleasesSearch(storedReleases.map(release => release.id)); - if (argv.withProfiles && Object.keys(actors).length > 0) { await scrapeBasicActors(); } diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js index c2f45c2a..bc515f2f 100644 --- a/src/scrapers/mindgeek.js +++ b/src/scrapers/mindgeek.js @@ -8,6 +8,7 @@ const { CookieJar } = Promise.promisifyAll(require('tough-cookie')); const moment = require('moment'); const { ex } = require('../utils/q'); +const slugify = require('../utils/slugify'); const { inchesToCm, lbsToKg } = require('../utils/convert'); const { cookieToData } = require('../utils/cookies'); @@ -97,7 +98,7 @@ function scrapeScene(data, url, _site, networkName) { } const siteName = data.collections[0]?.name || data.brand; - release.channel = siteName.replace(/\s+/g, '').toLowerCase(); + release.channel = slugify(siteName, { delimiter: '' }); release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`;