From d5cdfb36a909225c5fbec9822cd7367a1b240da6 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 2 Feb 2021 01:31:12 +0100 Subject: [PATCH] Selecting included networks with infinite parent depth to facilitate scraper resolve. --- src/deep.js | 30 ++--------- src/entities.js | 108 +++++++++++++++++++++++----------------- src/scrapers/resolve.js | 32 ++++++++++++ src/updates.js | 11 ++-- 4 files changed, 101 insertions(+), 80 deletions(-) create mode 100644 src/scrapers/resolve.js diff --git a/src/deep.js b/src/deep.js index ce25f9ed..78ccb797 100644 --- a/src/deep.js +++ b/src/deep.js @@ -5,10 +5,10 @@ const merge = require('object-merge-advanced'); const argv = require('./argv'); const include = require('./utils/argv-include')(argv); +const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); const { fetchReleaseEntities, urlToSiteSlug } = require('./entities'); const logger = require('./logger')(__filename); const qu = require('./utils/qu'); -const scrapers = require('./scrapers/scrapers'); function toBaseReleases(baseReleasesOrUrls, entity = null) { if (!baseReleasesOrUrls) { @@ -68,30 +68,6 @@ async function fetchScene(scraper, url, entity, baseRelease, options) { return null; } -function findScraper(entity) { - if (scrapers.releases[entity.slug]) { - return scrapers.releases[entity.slug]; - } - - if (entity.parent) { - return findScraper(entity.parent); - } - - return null; -} - -function findLayoutScraper(entity, scraper) { - if (scraper?.[entity.parameters?.layout]) { - return scraper[entity.parameters.layout]; - } - - if (entity.parent) { - return findLayoutScraper(entity.parent, scraper); - } - - return scraper; -} - async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)]; @@ -107,8 +83,8 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') { }; } - const scraper = findScraper(entity); - const layoutScraper = findLayoutScraper(entity, scraper); + const scraper = resolveScraper(entity); + const layoutScraper = resolveLayoutScraper(entity, scraper); if (!layoutScraper) { logger.warn(`Could not find scraper for ${baseRelease.url}`); diff --git a/src/entities.js b/src/entities.js index a92b2d84..405d12f7 100644 --- a/src/entities.js +++ b/src/entities.js @@ -94,62 +94,80 @@ async function fetchIncludedEntities() { WITH RECURSIVE included_entities AS ( /* select configured channels and networks */ SELECT - entities.* + entities.* FROM - entities + entities WHERE - CASE WHEN :includeAll - THEN - /* select all top level networks and independent channels */ - entities.parent_id IS NULL - ELSE - ((entities.slug = ANY(:includedNetworks) - AND entities.type = 'network') - OR (entities.slug = ANY(:includedChannels) - AND entities.type = 'channel')) - END - AND NOT ( - (entities.slug = ANY(:excludedNetworks) - AND entities.type = 'network') - OR (entities.slug = ANY(:excludedChannels) - AND entities.type = 'channel')) + CASE WHEN :includeAll + THEN + /* select all top level networks and independent channels */ + entities.parent_id IS NULL + ELSE + ((entities.slug = ANY(:includedNetworks) + AND entities.type = 'network') + OR (entities.slug = ANY(:includedChannels) + AND entities.type = 'channel')) + END + AND NOT ( + (entities.slug = ANY(:excludedNetworks) + AND entities.type = 'network') + OR (entities.slug = ANY(:excludedChannels) + AND entities.type = 'channel')) UNION ALL /* select recursive children of configured networks */ SELECT - entities.* + entities.* FROM - entities + entities INNER JOIN - included_entities ON included_entities.id = entities.parent_id + included_entities ON included_entities.id = entities.parent_id WHERE NOT ((entities.slug = ANY(:excludedNetworks) AND entities.type = 'network') OR (entities.slug = ANY(:excludedChannels) AND entities.type = 'channel')) + ), included_per_network AS ( + /* select recursive channels as children of networks */ + SELECT + parents.*, + json_agg(included_entities ORDER BY included_entities.id) included_children, + row_to_json(grandparents) AS parent, + (SELECT json_agg(children) + FROM entities AS children + WHERE children.parent_id = parents.id) children + FROM + included_entities + LEFT JOIN + entities AS parents ON parents.id = included_entities.parent_id + LEFT JOIN + entities AS grandparents ON grandparents.id = parents.parent_id + WHERE + included_entities.type = 'channel' + GROUP BY + parents.id, grandparents.id + ), entity_tree as ( + /* get recursive parents of networks (necessary for scraper resolve) */ + SELECT to_jsonb(included_per_network) as entity, + parent_id, + array['parent'] as parent_path, + 0 as depth + FROM included_per_network + + UNION ALL + + SELECT jsonb_set(entity_tree.entity, entity_tree.parent_path, to_jsonb(entities)), + entities.parent_id, + entity_tree.parent_path || array['parent'], + depth + 1 + FROM entity_tree + JOIN entities ON entity_tree.parent_id = entities.id ) - /* select recursive channels as children of networks */ - SELECT - parents.*, - json_agg(included_entities ORDER BY included_entities.id) included_children, - row_to_json(grandparents) AS parent, - (SELECT json_agg(children) - FROM entities AS children - WHERE children.parent_id = parents.id) children - FROM - included_entities - LEFT JOIN - entities AS parents ON parents.id = included_entities.parent_id - LEFT JOIN - entities AS grandparents ON grandparents.id = parents.parent_id - WHERE - included_entities.type = 'channel' - GROUP BY - parents.id, grandparents.id; + SELECT entity FROM entity_tree WHERE parent_id is null; `, include); - const curatedNetworks = rawNetworks.rows.map(entity => curateEntity(entity, true)); + const curatedNetworks = rawNetworks.rows.map(({ entity }) => curateEntity(entity, true)); return curatedNetworks; } @@ -164,7 +182,7 @@ async function fetchReleaseEntities(baseReleases) { )); const entities = await knex.raw(` - WITH RECURSIVE tree as ( + WITH RECURSIVE entity_tree as ( SELECT to_jsonb(entities) as entity, parent_id, array['parent'] as parent_path, @@ -174,14 +192,14 @@ async function fetchReleaseEntities(baseReleases) { UNION ALL - SELECT jsonb_set(tree.entity, tree.parent_path, to_jsonb(entities)), + SELECT jsonb_set(entity_tree.entity, entity_tree.parent_path, to_jsonb(entities)), entities.parent_id, - tree.parent_path || array['parent'], + entity_tree.parent_path || array['parent'], depth + 1 - FROM tree - JOIN entities ON tree.parent_id = entities.id + FROM entity_tree + JOIN entities ON entity_tree.parent_id = entities.id ) - SELECT entity FROM tree WHERE parent_id is null + SELECT entity FROM entity_tree WHERE parent_id is null ORDER BY entity->'type' ASC; `, { entitySlugs }); diff --git a/src/scrapers/resolve.js b/src/scrapers/resolve.js new file mode 100644 index 00000000..89726b56 --- /dev/null +++ b/src/scrapers/resolve.js @@ -0,0 +1,32 @@ +'use strict'; + +const scrapers = require('./scrapers'); + +function resolveScraper(entity) { + if (scrapers.releases[entity.slug]) { + return scrapers.releases[entity.slug]; + } + + if (entity.parent) { + return resolveScraper(entity.parent); + } + + return null; +} + +function resolveLayoutScraper(entity, scraper) { + if (scraper?.[entity.parameters?.layout]) { + return scraper[entity.parameters.layout]; + } + + if (entity.parent) { + return resolveLayoutScraper(entity.parent, scraper); + } + + return scraper; +} + +module.exports = { + resolveScraper, + resolveLayoutScraper, +}; diff --git a/src/updates.js b/src/updates.js index 93d09a16..3c0002f8 100644 --- a/src/updates.js +++ b/src/updates.js @@ -9,7 +9,7 @@ const logger = require('./logger')(__filename); const knex = require('./knex'); const { curateRelease } = require('./releases'); const include = require('./utils/argv-include')(argv); -const scrapers = require('./scrapers/scrapers'); +const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve'); const { fetchIncludedEntities } = require('./entities'); const emptyReleases = { uniqueReleases: [], duplicateReleases: [] }; @@ -205,13 +205,8 @@ async function scrapeChannelReleases(scraper, channelEntity, preData) { } async function scrapeChannel(channelEntity, accNetworkReleases) { - console.log(channelEntity); - - const scraper = scrapers.releases[channelEntity.slug] - || scrapers.releases[channelEntity.parent?.slug] - || scrapers.releases[channelEntity.parent?.parent?.slug]; - - const layoutScraper = scraper?.[channelEntity.parameters?.layout] || scraper?.[channelEntity.parent?.parameters?.layout] || scraper?.[channelEntity.parent?.parent?.parameters?.layout] || scraper; + const scraper = resolveScraper(channelEntity); + const layoutScraper = resolveLayoutScraper(channelEntity, scraper); if (!layoutScraper) { logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);