Select included networks with unbounded parent depth so that scrapers can be resolved through any ancestor.
This commit is contained in:
parent
46c0b269c3
commit
d5cdfb36a9
30
src/deep.js
30
src/deep.js
|
@ -5,10 +5,10 @@ const merge = require('object-merge-advanced');
|
||||||
|
|
||||||
const argv = require('./argv');
|
const argv = require('./argv');
|
||||||
const include = require('./utils/argv-include')(argv);
|
const include = require('./utils/argv-include')(argv);
|
||||||
|
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
|
||||||
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
|
||||||
const logger = require('./logger')(__filename);
|
const logger = require('./logger')(__filename);
|
||||||
const qu = require('./utils/qu');
|
const qu = require('./utils/qu');
|
||||||
const scrapers = require('./scrapers/scrapers');
|
|
||||||
|
|
||||||
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
function toBaseReleases(baseReleasesOrUrls, entity = null) {
|
||||||
if (!baseReleasesOrUrls) {
|
if (!baseReleasesOrUrls) {
|
||||||
|
@ -68,30 +68,6 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
 * Find the release scraper for an entity, falling back to its ancestors.
 *
 * Looks up `scrapers.releases` by the entity's own slug first and then by the
 * slug of each entity reachable through `.parent`, nearest ancestor first.
 *
 * @param {object|null|undefined} entity - Entity carrying a `slug` and an
 *   optional nested `parent` entity.
 * @returns {*} The first truthy entry found in `scrapers.releases`, or `null`
 *   when nothing matches or no entity was given.
 */
function findScraper(entity) {
	let current = entity;

	// Walk up the parent chain without a fixed depth limit.
	while (current) {
		const scraper = scrapers.releases[current.slug];

		if (scraper) {
			return scraper;
		}

		current = current.parent;
	}

	return null;
}
|
|
||||||
|
|
||||||
/**
 * Pick the layout-specific part of a scraper for an entity.
 *
 * Reads `parameters.layout` from the entity and then from each ancestor
 * reachable through `.parent`, nearest first, and returns the first truthy
 * `scraper[layout]` it finds.
 *
 * @param {object|null|undefined} entity - Entity with optional `parameters`
 *   and an optional nested `parent` entity.
 * @param {object|null|undefined} scraper - Scraper that may hold per-layout
 *   entries keyed by layout name.
 * @returns {*} The matching layout entry, or `scraper` itself (possibly
 *   null/undefined) when no layout along the chain matches.
 */
function findLayoutScraper(entity, scraper) {
	let current = entity;

	while (current) {
		const layoutScraper = scraper?.[current.parameters?.layout];

		if (layoutScraper) {
			return layoutScraper;
		}

		current = current.parent;
	}

	// No layout matched anywhere in the chain; use the scraper as-is.
	return scraper;
}
|
|
||||||
|
|
||||||
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||||
const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
|
const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
|
||||||
|
|
||||||
|
@ -107,8 +83,8 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
const scraper = findScraper(entity);
|
const scraper = resolveScraper(entity);
|
||||||
const layoutScraper = findLayoutScraper(entity, scraper);
|
const layoutScraper = resolveLayoutScraper(entity, scraper);
|
||||||
|
|
||||||
if (!layoutScraper) {
|
if (!layoutScraper) {
|
||||||
logger.warn(`Could not find scraper for ${baseRelease.url}`);
|
logger.warn(`Could not find scraper for ${baseRelease.url}`);
|
||||||
|
|
108
src/entities.js
108
src/entities.js
|
@ -94,62 +94,80 @@ async function fetchIncludedEntities() {
|
||||||
WITH RECURSIVE included_entities AS (
|
WITH RECURSIVE included_entities AS (
|
||||||
/* select configured channels and networks */
|
/* select configured channels and networks */
|
||||||
SELECT
|
SELECT
|
||||||
entities.*
|
entities.*
|
||||||
FROM
|
FROM
|
||||||
entities
|
entities
|
||||||
WHERE
|
WHERE
|
||||||
CASE WHEN :includeAll
|
CASE WHEN :includeAll
|
||||||
THEN
|
THEN
|
||||||
/* select all top level networks and independent channels */
|
/* select all top level networks and independent channels */
|
||||||
entities.parent_id IS NULL
|
entities.parent_id IS NULL
|
||||||
ELSE
|
ELSE
|
||||||
((entities.slug = ANY(:includedNetworks)
|
((entities.slug = ANY(:includedNetworks)
|
||||||
AND entities.type = 'network')
|
AND entities.type = 'network')
|
||||||
OR (entities.slug = ANY(:includedChannels)
|
OR (entities.slug = ANY(:includedChannels)
|
||||||
AND entities.type = 'channel'))
|
AND entities.type = 'channel'))
|
||||||
END
|
END
|
||||||
AND NOT (
|
AND NOT (
|
||||||
(entities.slug = ANY(:excludedNetworks)
|
(entities.slug = ANY(:excludedNetworks)
|
||||||
AND entities.type = 'network')
|
AND entities.type = 'network')
|
||||||
OR (entities.slug = ANY(:excludedChannels)
|
OR (entities.slug = ANY(:excludedChannels)
|
||||||
AND entities.type = 'channel'))
|
AND entities.type = 'channel'))
|
||||||
|
|
||||||
UNION ALL
|
UNION ALL
|
||||||
|
|
||||||
/* select recursive children of configured networks */
|
/* select recursive children of configured networks */
|
||||||
SELECT
|
SELECT
|
||||||
entities.*
|
entities.*
|
||||||
FROM
|
FROM
|
||||||
entities
|
entities
|
||||||
INNER JOIN
|
INNER JOIN
|
||||||
included_entities ON included_entities.id = entities.parent_id
|
included_entities ON included_entities.id = entities.parent_id
|
||||||
WHERE
|
WHERE
|
||||||
NOT ((entities.slug = ANY(:excludedNetworks)
|
NOT ((entities.slug = ANY(:excludedNetworks)
|
||||||
AND entities.type = 'network')
|
AND entities.type = 'network')
|
||||||
OR (entities.slug = ANY(:excludedChannels)
|
OR (entities.slug = ANY(:excludedChannels)
|
||||||
AND entities.type = 'channel'))
|
AND entities.type = 'channel'))
|
||||||
|
), included_per_network AS (
|
||||||
|
/* select recursive channels as children of networks */
|
||||||
|
SELECT
|
||||||
|
parents.*,
|
||||||
|
json_agg(included_entities ORDER BY included_entities.id) included_children,
|
||||||
|
row_to_json(grandparents) AS parent,
|
||||||
|
(SELECT json_agg(children)
|
||||||
|
FROM entities AS children
|
||||||
|
WHERE children.parent_id = parents.id) children
|
||||||
|
FROM
|
||||||
|
included_entities
|
||||||
|
LEFT JOIN
|
||||||
|
entities AS parents ON parents.id = included_entities.parent_id
|
||||||
|
LEFT JOIN
|
||||||
|
entities AS grandparents ON grandparents.id = parents.parent_id
|
||||||
|
WHERE
|
||||||
|
included_entities.type = 'channel'
|
||||||
|
GROUP BY
|
||||||
|
parents.id, grandparents.id
|
||||||
|
), entity_tree as (
|
||||||
|
/* get recursive parents of networks (necessary for scraper resolve) */
|
||||||
|
SELECT to_jsonb(included_per_network) as entity,
|
||||||
|
parent_id,
|
||||||
|
array['parent'] as parent_path,
|
||||||
|
0 as depth
|
||||||
|
FROM included_per_network
|
||||||
|
|
||||||
|
UNION ALL
|
||||||
|
|
||||||
|
SELECT jsonb_set(entity_tree.entity, entity_tree.parent_path, to_jsonb(entities)),
|
||||||
|
entities.parent_id,
|
||||||
|
entity_tree.parent_path || array['parent'],
|
||||||
|
depth + 1
|
||||||
|
FROM entity_tree
|
||||||
|
JOIN entities ON entity_tree.parent_id = entities.id
|
||||||
)
|
)
|
||||||
/* select recursive channels as children of networks */
|
SELECT entity FROM entity_tree WHERE parent_id is null;
|
||||||
SELECT
|
|
||||||
parents.*,
|
|
||||||
json_agg(included_entities ORDER BY included_entities.id) included_children,
|
|
||||||
row_to_json(grandparents) AS parent,
|
|
||||||
(SELECT json_agg(children)
|
|
||||||
FROM entities AS children
|
|
||||||
WHERE children.parent_id = parents.id) children
|
|
||||||
FROM
|
|
||||||
included_entities
|
|
||||||
LEFT JOIN
|
|
||||||
entities AS parents ON parents.id = included_entities.parent_id
|
|
||||||
LEFT JOIN
|
|
||||||
entities AS grandparents ON grandparents.id = parents.parent_id
|
|
||||||
WHERE
|
|
||||||
included_entities.type = 'channel'
|
|
||||||
GROUP BY
|
|
||||||
parents.id, grandparents.id;
|
|
||||||
`, include);
|
`, include);
|
||||||
|
|
||||||
const curatedNetworks = rawNetworks.rows.map(entity => curateEntity(entity, true));
|
const curatedNetworks = rawNetworks.rows.map(({ entity }) => curateEntity(entity, true));
|
||||||
|
|
||||||
return curatedNetworks;
|
return curatedNetworks;
|
||||||
}
|
}
|
||||||
|
@ -164,7 +182,7 @@ async function fetchReleaseEntities(baseReleases) {
|
||||||
));
|
));
|
||||||
|
|
||||||
const entities = await knex.raw(`
|
const entities = await knex.raw(`
|
||||||
WITH RECURSIVE tree as (
|
WITH RECURSIVE entity_tree as (
|
||||||
SELECT to_jsonb(entities) as entity,
|
SELECT to_jsonb(entities) as entity,
|
||||||
parent_id,
|
parent_id,
|
||||||
array['parent'] as parent_path,
|
array['parent'] as parent_path,
|
||||||
|
@ -174,14 +192,14 @@ async function fetchReleaseEntities(baseReleases) {
|
||||||
|
|
||||||
UNION ALL
|
UNION ALL
|
||||||
|
|
||||||
SELECT jsonb_set(tree.entity, tree.parent_path, to_jsonb(entities)),
|
SELECT jsonb_set(entity_tree.entity, entity_tree.parent_path, to_jsonb(entities)),
|
||||||
entities.parent_id,
|
entities.parent_id,
|
||||||
tree.parent_path || array['parent'],
|
entity_tree.parent_path || array['parent'],
|
||||||
depth + 1
|
depth + 1
|
||||||
FROM tree
|
FROM entity_tree
|
||||||
JOIN entities ON tree.parent_id = entities.id
|
JOIN entities ON entity_tree.parent_id = entities.id
|
||||||
)
|
)
|
||||||
SELECT entity FROM tree WHERE parent_id is null
|
SELECT entity FROM entity_tree WHERE parent_id is null
|
||||||
ORDER BY entity->'type' ASC;
|
ORDER BY entity->'type' ASC;
|
||||||
`, { entitySlugs });
|
`, { entitySlugs });
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
'use strict';
|
||||||
|
|
||||||
|
const scrapers = require('./scrapers');
|
||||||
|
|
||||||
|
/**
 * Resolve the release scraper for an entity by walking up its parent chain.
 *
 * Checks `scrapers.releases` for the entity's own slug, then for the slug of
 * every ancestor reachable through `.parent`, nearest first, with no limit on
 * depth.
 *
 * @param {object|null|undefined} entity - Entity carrying a `slug` and an
 *   optional nested `parent` entity.
 * @returns {*} The first truthy entry found in `scrapers.releases`, or `null`
 *   when nothing matches or no entity was given.
 */
function resolveScraper(entity) {
	for (let current = entity; current; current = current.parent) {
		const scraper = scrapers.releases[current.slug];

		if (scraper) {
			return scraper;
		}
	}

	return null;
}
|
||||||
|
|
||||||
|
/**
 * Resolve the layout-specific part of a scraper for an entity.
 *
 * Reads `parameters.layout` from the entity and then from each ancestor
 * reachable through `.parent`, nearest first, and returns the first truthy
 * `scraper[layout]` it finds.
 *
 * @param {object|null|undefined} entity - Entity with optional `parameters`
 *   and an optional nested `parent` entity.
 * @param {object|null|undefined} scraper - Scraper that may hold per-layout
 *   entries keyed by layout name.
 * @returns {*} The matching layout entry, or `scraper` itself (possibly
 *   null/undefined) when no layout along the chain matches.
 */
function resolveLayoutScraper(entity, scraper) {
	for (let current = entity; current; current = current.parent) {
		const layoutScraper = scraper?.[current.parameters?.layout];

		if (layoutScraper) {
			return layoutScraper;
		}
	}

	// No layout matched anywhere in the chain; use the scraper as-is.
	return scraper;
}
|
||||||
|
|
||||||
|
module.exports = {
|
||||||
|
resolveScraper,
|
||||||
|
resolveLayoutScraper,
|
||||||
|
};
|
|
@ -9,7 +9,7 @@ const logger = require('./logger')(__filename);
|
||||||
const knex = require('./knex');
|
const knex = require('./knex');
|
||||||
const { curateRelease } = require('./releases');
|
const { curateRelease } = require('./releases');
|
||||||
const include = require('./utils/argv-include')(argv);
|
const include = require('./utils/argv-include')(argv);
|
||||||
const scrapers = require('./scrapers/scrapers');
|
const { resolveScraper, resolveLayoutScraper } = require('./scrapers/resolve');
|
||||||
const { fetchIncludedEntities } = require('./entities');
|
const { fetchIncludedEntities } = require('./entities');
|
||||||
|
|
||||||
const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
|
const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };
|
||||||
|
@ -205,13 +205,8 @@ async function scrapeChannelReleases(scraper, channelEntity, preData) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function scrapeChannel(channelEntity, accNetworkReleases) {
|
async function scrapeChannel(channelEntity, accNetworkReleases) {
|
||||||
console.log(channelEntity);
|
const scraper = resolveScraper(channelEntity);
|
||||||
|
const layoutScraper = resolveLayoutScraper(channelEntity, scraper);
|
||||||
const scraper = scrapers.releases[channelEntity.slug]
|
|
||||||
|| scrapers.releases[channelEntity.parent?.slug]
|
|
||||||
|| scrapers.releases[channelEntity.parent?.parent?.slug];
|
|
||||||
|
|
||||||
const layoutScraper = scraper?.[channelEntity.parameters?.layout] || scraper?.[channelEntity.parent?.parameters?.layout] || scraper?.[channelEntity.parent?.parent?.parameters?.layout] || scraper;
|
|
||||||
|
|
||||||
if (!layoutScraper) {
|
if (!layoutScraper) {
|
||||||
logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);
|
logger.warn(`No scraper found for '${channelEntity.name}' (${channelEntity.parent?.name})`);
|
||||||
|
|
Loading…
Reference in New Issue