diff --git a/assets/js/fragments.js b/assets/js/fragments.js
index 83432446..666adcf8 100644
--- a/assets/js/fragments.js
+++ b/assets/js/fragments.js
@@ -1,11 +1,10 @@
 const siteFragment = `
-  site {
+  site: entity {
     id
     name
     slug
     url
-    independent
-    network {
+    network: parent {
      id
      name
      slug
@@ -20,7 +19,6 @@ const sitesFragment = `
     name
     slug
     url
-    independent
     network {
      id
      name
@@ -49,7 +47,7 @@ const actorFields = `
      lazy
    }
  }
- network {
+ network: entity {
   id
   name
   slug
diff --git a/migrations/20190325001339_releases.js b/migrations/20190325001339_releases.js
index affae9a6..b7cfa5ae 100644
--- a/migrations/20190325001339_releases.js
+++ b/migrations/20190325001339_releases.js
@@ -326,9 +326,9 @@ exports.up = knex => Promise.resolve()

     table.text('real_name');

-    table.integer('network_id', 12)
+    table.integer('entity_id', 12)
       .references('id')
-      .inTable('networks');
+      .inTable('entities');

     table.integer('alias_for', 12)
       .references('id')
@@ -393,15 +393,11 @@ exports.up = knex => Promise.resolve()
       .references('id')
       .inTable('actors');

-    table.integer('network_id', 12)
+    table.integer('entity_id', 12)
       .references('id')
-      .inTable('networks');
+      .inTable('entities');

-    table.integer('site_id', 12)
-      .references('id')
-      .inTable('sites');
-
-    table.unique(['actor_id', 'network_id', 'site_id']);
+    table.unique(['actor_id', 'entity_id']);

     table.integer('priority', 4)
       .defaultTo(1);
@@ -680,13 +676,10 @@ exports.up = knex => Promise.resolve()
   .then(() => knex.schema.createTable('releases', (table) => {
     table.increments('id', 16);

-    table.integer('site_id', 12)
+    table.integer('entity_id', 12)
       .references('id')
-      .inTable('sites');
-
-    table.integer('network_id', 12)
-      .references('id')
-      .inTable('networks');
+      .inTable('entities')
+      .notNullable();

     table.integer('studio_id', 12)
       .references('id')
@@ -697,7 +690,7 @@ exports.up = knex => Promise.resolve()
     table.text('shoot_id');
     table.text('entry_id');

-    table.unique(['site_id', 'network_id', 'entry_id', 'type']);
+    table.unique(['entity_id', 'entry_id', 'type']);

     table.text('url', 1000);
     table.text('title');
@@ -856,15 +849,6 @@ exports.up = knex => Promise.resolve()
   .then(() => { // eslint-disable-line arrow-body-style
     // allow vim fold
     return knex.raw(`
-      ALTER TABLE releases
-        ADD CONSTRAINT ensure_site_or_network CHECK (site_id IS NOT NULL OR network_id IS NOT NULL);
-
-      ALTER TABLE releases_search
-        ADD COLUMN document tsvector;
-
-      CREATE UNIQUE INDEX unique_actor_slugs_network ON actors (slug, network_id);
-      CREATE UNIQUE INDEX unique_actor_slugs ON actors (slug, (network_id IS NULL));
-
      CREATE TEXT SEARCH DICTIONARY traxxx_dict (
        TEMPLATE = pg_catalog.simple,
        stopwords = traxxx
@@ -874,6 +858,12 @@ exports.up = knex => Promise.resolve()
        COPY = english
      );

+      ALTER TABLE releases_search
+        ADD COLUMN document tsvector;
+
+      CREATE UNIQUE INDEX unique_actor_slugs_network ON actors (slug, entity_id);
+      CREATE UNIQUE INDEX unique_actor_slugs ON actors (slug, (entity_id IS NULL));
+
      ALTER TEXT SEARCH CONFIGURATION traxxx
        ALTER MAPPING FOR word, numword, hword, numhword, hword_part, hword_numpart, asciiword, asciihword, hword_asciipart
        WITH traxxx_dict, simple, english_stem;
diff --git a/src/actors.js b/src/actors.js
index e80f5e49..4019aade 100644
--- a/src/actors.js
+++ b/src/actors.js
@@ -144,7 +144,7 @@ function curateActor(actor, withDetails = false) {
     name: actor.name,
     slug: actor.slug,
     gender: actor.gender,
-    networkId: actor.network_id,
+    networkId: actor.entity_id,
     aliasFor: actor.alias_for,
     dateOfBirth: actor.date_of_birth,
     birthCountry: actor.birth_country_alpha2,
@@ -211,7 +211,7 @@ function curateActorEntry(baseActor, batchId) {
   return {
     name: baseActor.name,
     slug: baseActor.slug,
-    network_id: null,
+    entity_id: null,
     batch_id: batchId,
   };
 }
@@ -225,7 +225,7 @@ function curateProfileEntry(profile) {
     ...(profile.update !== false && { id: profile.update }),
     actor_id: profile.id,
     site_id: profile.site?.id || null,
-    network_id: profile.network?.id || null,
+    entity_id: profile.network?.id || null,
     date_of_birth: profile.dateOfBirth,
     date_of_death: profile.dateOfDeath,
     gender: profile.gender,
@@ -577,14 +577,17 @@ async function scrapeActors(actorNames) {
   const siteSlugs = sources.flat();

   const [networks, sites, existingActorEntries] = await Promise.all([
-    knex('networks').whereIn('slug', siteSlugs),
-    knex('sites')
+    knex('entities')
+      .where('type', 2)
+      .whereIn('slug', siteSlugs),
+    knex('entities')
       .select(
-        'sites.*',
-        'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
+        'entities.*',
+        'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.description as network_description', 'parents.parameters as network_parameters',
       )
-      .whereIn('sites.slug', siteSlugs)
-      .leftJoin('networks', 'sites.network_id', 'networks.id'),
+      .where('type', 2)
+      .whereIn('entities.slug', siteSlugs)
+      .leftJoin('entities as parents', 'parents.id', 'entities.parent_id'),
     knex('actors')
       .select(['id', 'name', 'slug'])
       .modify((queryBuilder) => {
@@ -612,8 +615,8 @@ async function scrapeActors(actorNames) {
     ...acc,
     [profile.actor_id]: {
       ...acc[profile.actor_id],
-      [profile.network_id]: {
-        ...acc[profile.network_id],
+      [profile.entity_id]: {
+        ...acc[profile.entity_id],
         [profile.site_id]: profile,
       },
     },
@@ -644,17 +647,19 @@ async function scrapeActors(actorNames) {
 }

 async function getOrCreateActors(baseActors, batchId) {
+  console.log(baseActors);
+
   const existingActors = await knex('actors')
-    .select('id', 'alias_for', 'name', 'slug', 'network_id')
+    .select('id', 'alias_for', 'name', 'slug', 'entity_id')
     .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
-    .whereNull('network_id')
-    .orWhereIn(['slug', 'network_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));
+    .whereNull('entity_id')
+    .orWhereIn(['slug', 'entity_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id]));

   // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
   const existingActorSlugs = existingActors.reduce((acc, actor) => ({
     ...acc,
-    [actor.network_id]: {
-      ...acc[actor.network_id],
+    [actor.entity_id]: {
+      ...acc[actor.entity_id],
       [actor.slug]: true,
     },
   }), {});
@@ -662,7 +667,7 @@ async function getOrCreateActors(baseActors, batchId) {
   const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);
   const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);

-  const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'network_id']);
+  const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'entity_id']);

   if (Array.isArray(newActors)) {
     return newActors.concat(existingActors);
@@ -732,7 +737,7 @@ async function fetchActor(actorId) {
       queryBuilder.where('actors.id', actorId);
     })
     .leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
-    .leftJoin('networks', 'networks.id', 'actors.network_id')
+    .leftJoin('networks', 'networks.id', 'actors.entity_id')
     .leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
     .leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
     .leftJoin('media', 'media.id', 'actors.avatar_media_id')
diff --git a/src/deep.js b/src/deep.js
index ae732274..3bcd6d4e 100644
--- a/src/deep.js
+++ b/src/deep.js
@@ -34,12 +34,15 @@ async function findSites(baseReleases) {
       .filter(Boolean),
   ));

-  const siteEntries = await knex('sites')
-    .leftJoin('networks', 'networks.id', 'sites.network_id')
-    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
-    .whereIn('sites.slug', siteSlugs);
+  const siteEntries = await knex('entities')
+    .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
+    .select('entities.*', 'parents.id as network_id', 'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.parameters as network_parameters', 'parents.description as network_description')
+    .where('entities.type', 2)
+    .whereIn('entities.slug', siteSlugs);

-  const networkEntries = await knex('networks').whereIn('slug', siteSlugs);
+  const networkEntries = await knex('entities')
+    .where('type', 1)
+    .whereIn('slug', siteSlugs);

   const sites = await curateSites(siteEntries, true, false);
   const networks = await curateNetworks(networkEntries, true, false, false);
diff --git a/src/entities.js b/src/entities.js
index 29c93689..1984319c 100644
--- a/src/entities.js
+++ b/src/entities.js
@@ -17,7 +17,10 @@ function curateEntity(entity, includeParameters = false) {
     type: entity.type,
     parameters: includeParameters ? entity.parameters : null,
     parent: entity.parent,
-    children: (entity.children || []).map(child => curateEntity(child)),
+    children: (entity.children || []).map(child => curateEntity({
+      ...child,
+      parent: entity,
+    })),
   };

   return curatedEntity;
@@ -28,7 +31,8 @@ async function curateEntities(entities, includeParameters) {
 }

 async function fetchSitesFromArgv() {
-  const rawEntities = await knex.raw(`
+  const rawNetworks = await knex.raw(`
+    /* networks from argument with sites as children */
     WITH RECURSIVE temp AS (
       SELECT
         id, parent_id, name, slug, type, url, description, parameters
@@ -57,8 +61,10 @@ async function fetchSitesFromArgv() {
       GROUP BY
        temp.parent_id, entities.id, entities.name, parents.id
      UNION ALL
+
+      /* sites from argument as the child of network with parent */
      SELECT
-        entities.*, row_to_json(parents) as parent, json_build_array(row_to_json(children))
+        entities.*, row_to_json(parents) as parent, json_agg(row_to_json(children))
      FROM
        entities AS children
      LEFT JOIN
@@ -68,15 +74,13 @@ async function fetchSitesFromArgv() {
      WHERE children.slug = ANY(?)
        AND children.type = 2
      GROUP BY
-       entities.id, parents.id, children.id;
+       entities.id, parents.id;
   `, [argv.networks || [], argv.sites || []]);

-  const curatedEntities = await curateEntities(rawEntities.rows, true);
-  logger.info(`Found ${curatedEntities.length} entities in database`);
+  const curatedNetworks = await curateEntities(rawNetworks.rows, true);
+  logger.info(`Found ${curatedNetworks.length} networks in database`);

-  console.log(rawEntities.rows);
-
-  return curatedEntities;
+  return curatedNetworks;
 }

 async function fetchSitesFromConfig() {
diff --git a/src/store-releases.js b/src/store-releases.js
index b2440fe0..63cb3795 100644
--- a/src/store-releases.js
+++ b/src/store-releases.js
@@ -20,8 +20,7 @@ function curateReleaseEntry(release, batchId, existingRelease) {
   const curatedRelease = {
     title: release.title,
     entry_id: release.entryId || null,
-    site_id: release.site?.id,
-    network_id: release.site ? null : release.network?.id, // prefer site ID if available
+    entity_id: release.site?.id,
     shoot_id: release.shootId || null,
     studio_id: release.studio?.id || null,
     url: release.url,
@@ -49,10 +48,10 @@ function curateReleaseEntry(release, batchId, existingRelease) {
 async function attachChannelSites(releases) {
   const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isNetwork || release.site.slug !== release.channel));

-  const channelSites = await knex('sites')
-    .leftJoin('networks', 'networks.id', 'sites.network_id')
-    .select('sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.parameters as network_parameters', 'networks.description as network_description')
-    .whereIn('sites.slug', releasesWithoutSite.map(release => release.channel));
+  const channelSites = await knex('entities')
+    .leftJoin('entities AS parents', 'parents.id', 'entities.parent_id')
+    .select('entities.*', 'parents.name as network_name', 'parents.slug as network_slug', 'parents.url as network_url', 'parents.parameters as network_parameters', 'parents.description as network_description')
+    .whereIn('entities.slug', releasesWithoutSite.map(release => release.channel));

   const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

@@ -71,7 +70,6 @@ async function attachChannelSites(releases) {
       return release;
     }

-
     if (release.site && release.site.isNetwork) {
       return {
         ...release,
@@ -114,8 +112,8 @@ async function attachStudios(releases) {

 function attachReleaseIds(releases, storedReleases) {
   const storedReleaseIdsBySiteIdAndEntryId = storedReleases.reduce((acc, release) => {
-    if (!acc[release.site_id]) acc[release.site_id] = {};
-    acc[release.site_id][release.entry_id] = release.id;
+    if (!acc[release.entity_id]) acc[release.entity_id] = {};
+    acc[release.entity_id][release.entry_id] = release.id;

     return acc;
   }, {});
@@ -152,11 +150,11 @@ async function filterDuplicateReleases(releases) {
   const internalUniqueReleases = filterInternalDuplicateReleases(releases);

   const duplicateReleaseEntries = await knex('releases')
-    .whereIn(['entry_id', 'site_id'], internalUniqueReleases.map(release => [release.entryId, release.site.id]));
+    .whereIn(['entry_id', 'entity_id'], internalUniqueReleases.map(release => [release.entryId, release.site.id]));

   const duplicateReleasesBySiteIdAndEntryId = duplicateReleaseEntries.reduce((acc, release) => {
-    if (!acc[release.site_id]) acc[release.site_id] = {};
-    acc[release.site_id][release.entry_id] = true;
+    if (!acc[release.entity_id]) acc[release.entity_id] = {};
+    acc[release.entity_id][release.entry_id] = true;

     return acc;
   }, {});
@@ -180,13 +178,13 @@ async function updateReleasesSearch(releaseIds) {
        TO_TSVECTOR(
          'traxxx',
          COALESCE(releases.title, '') || ' ' ||
-         networks.name || ' ' ||
-         networks.slug || ' ' ||
-         networks.url || ' ' ||
-         sites.name || ' ' ||
-         sites.slug || ' ' ||
-         COALESCE(sites.url, '') || ' ' ||
-         COALESCE(sites.alias, '') || ' ' ||
+         parents.name || ' ' ||
+         parents.slug || ' ' ||
+         parents.url || ' ' ||
+         entities.name || ' ' ||
+         entities.slug || ' ' ||
+         COALESCE(entities.url, '') || ' ' ||
+         COALESCE(entities.alias, '') || ' ' ||
          COALESCE(releases.shoot_id, '') || ' ' ||
          COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
          STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
@@ -194,15 +192,15 @@ async function updateReleasesSearch(releaseIds) {
          STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
        ) as document
        FROM releases
-       LEFT JOIN sites ON releases.site_id = sites.id
-       LEFT JOIN networks ON sites.network_id = networks.id
+       LEFT JOIN entities ON releases.entity_id = entities.id
+       LEFT JOIN entities AS parents ON parents.id = entities.parent_id
        LEFT JOIN releases_actors AS local_actors ON local_actors.release_id = releases.id
        LEFT JOIN releases_tags AS local_tags ON local_tags.release_id = releases.id
        LEFT JOIN actors ON local_actors.actor_id = actors.id
        LEFT JOIN tags ON local_tags.tag_id = tags.id AND tags.priority >= 7
        LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for AND tags_aliases.secondary = true
        ${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
-       GROUP BY releases.id, sites.name, sites.slug, sites.alias, sites.url, networks.name, networks.slug, networks.url;
+       GROUP BY releases.id, entities.name, entities.slug, entities.alias, entities.url, parents.name, parents.slug, parents.url;
   `, releaseIds && [releaseIds]);

   if (documents.rows?.length > 0) {
diff --git a/src/updates.js b/src/updates.js
index 83b66781..283aaee7 100644
--- a/src/updates.js
+++ b/src/updates.js
@@ -30,14 +30,14 @@ async function filterUniqueReleases(latestReleases, accReleases) {
     .map(release => [release.site.id, release.entryId]);

   const duplicateReleases = await knex('releases')
-    .whereIn(['site_id', 'entry_id'], latestReleaseIdentifiers);
+    .whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);

   // add entry IDs of accumulated releases to prevent an infinite scrape loop
   // when one page contains the same release as the previous
   const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
     .concat(accReleases)
     .reduce((acc, release) => {
-      const siteId = release.site_id || release.site.id;
+      const siteId = release.entity_id || release.site.id;
       const entryId = release.entry_id || release.entryId;

       if (!acc[siteId]) acc[siteId] = {};
@@ -85,7 +85,7 @@ async function scrapeReleases(scraper, site, preData, upcoming = false) {

     if (!Array.isArray(latestReleases)) {
       // scraper is unable to fetch the releases and returned a HTTP code or null
-      logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.network.name})`);
+      logger.warn(`Scraper returned ${latestReleases} when fetching latest from '${site.name}' (${site.parent?.name})`);
       return accReleases;
     }

@@ -102,7 +102,7 @@ async function scrapeReleases(scraper, site, preData, upcoming = false) {
     const pageAccReleases = accReleases.concat(uniqueReleases);

-    logger.verbose(`Scraped '${site.name}' (${site.network.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);
+    logger.verbose(`Scraped '${site.name}' (${site.parent?.name}) ${upcoming ? 'upcoming' : 'latest'} page ${page}, found ${uniqueReleases.length} unique updates`);

     if (needNextPage(uniqueReleases, pageAccReleases)) {
       return scrapePage(page + 1, pageAccReleases);
     }
@@ -135,7 +135,7 @@ async function scrapeLatestReleases(scraper, site, preData) {
   try {
     return await scrapeReleases(scraper, site, preData, false);
   } catch (error) {
-    logger.warn(`Failed to scrape latest updates for '${site.slug}' (${site.network.slug}): ${error.message}`);
+    logger.warn(`Failed to scrape latest updates for '${site.slug}' (${site.parent?.slug}): ${error.message}`);
   }

   return [];
@@ -149,7 +149,7 @@ async function scrapeUpcomingReleases(scraper, site, preData) {
   try {
     return await scrapeReleases(scraper, site, preData, true);
   } catch (error) {
-    logger.warn(`Failed to scrape upcoming updates for '${site.slug}' (${site.network.slug}): ${error.message}`);
+    logger.warn(`Failed to scrape upcoming updates for '${site.slug}' (${site.parent?.slug}): ${error.message}`);
   }

   return [];
@@ -165,18 +165,18 @@ async function scrapeSiteReleases(scraper, site, preData) {
       : [],
   ]);

-  logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${site.name}' (${site.network.name})`);
+  logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${site.name}' (${site.parent.name})`);

   return [...latestReleases, ...upcomingReleases];
 }

 async function scrapeSite(site, accSiteReleases) {
   const scraper = scrapers.releases[site.slug]
-    || scrapers.releases[site.network.slug]
-    || scrapers.releases[site.network.parent?.slug];
+    || scrapers.releases[site.parent?.slug]
+    || scrapers.releases[site.parent?.parent?.slug];

   if (!scraper) {
-    logger.warn(`No scraper found for '${site.name}' (${site.network.name})`);
+    logger.warn(`No scraper found for '${site.name}' (${site.parent.name})`);
     return [];
   }

@@ -211,7 +211,7 @@ async function scrapeNetworkSequential(network) {

 async function scrapeNetworkParallel(network) {
   return Promise.map(
-    network.sites,
+    network.children,
     async site => scrapeSite(site, network),
     { concurrency: 3 },
   );
@@ -222,8 +222,6 @@ async function fetchUpdates() {
     ? await fetchSitesFromArgv()
     : await fetchSitesFromConfig();

-  // console.log('included', includedNetworks);
-
   const scrapedNetworks = await Promise.map(
     includedNetworks,
     async network => (network.parameters?.sequential