Querying infinite parent depth for deep release entities.

This commit is contained in:
DebaucheryLibrarian
2021-02-01 01:45:30 +01:00
parent 97c088cfb4
commit aade7490f8
12 changed files with 95 additions and 51 deletions

View File

@@ -5,49 +5,11 @@ const merge = require('object-merge-advanced');
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const { fetchReleaseEntities, urlToSiteSlug } = require('./entities');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const qu = require('./utils/qu');
const scrapers = require('./scrapers/scrapers');
// Derive an entity slug from a release URL: take the second-level domain
// label (e.g. 'evil-angel' from members.evil-angel.com) and strip any
// hyphen/underscore separators. Yields undefined when the hostname has no
// matching domain part; returns null (after logging) when the URL itself
// cannot be parsed.
function urlToSiteSlug(url) {
  try {
    const { hostname } = new URL(url);

    // ?. short-circuits the whole chain when the regex does not match.
    return hostname.match(/([\w-]+)\.\w+$/)?.[1].replace(/[-_]+/g, '');
  } catch (error) {
    logger.warn(`Failed to derive entity slug from '${url}': ${error.message}`);
    return null;
  }
}
// Look up entities for releases that don't yet carry one, keyed by the site
// slug derived from each release URL. Each row also gets its direct parent
// (as a JSON object) and children (as a JSON array) via self-joins.
// Returns a slug → entity map; the first row per slug wins, and rows are
// ordered by type ascending so a channel takes precedence over its network.
async function findEntities(baseReleases) {
  const releasesNeedingEntity = baseReleases.filter(release => release.url && !release.site && !release.entity);

  // Collect unique, non-null slugs in first-seen order.
  const slugSet = new Set();

  releasesNeedingEntity.forEach((baseRelease) => {
    const slug = urlToSiteSlug(baseRelease.url);

    if (slug) {
      slugSet.add(slug);
    }
  });

  const entities = await knex('entities')
    .select(knex.raw('entities.*, row_to_json(parents) as parent, json_agg(children) as children'))
    .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
    .leftJoin('entities as children', 'children.parent_id', 'entities.id')
    .whereIn('entities.slug', Array.from(slugSet))
    .groupBy('entities.id', 'parents.id')
    .orderBy('entities.type', 'asc');

  // channel entity will overwrite network entity
  const entitiesBySlug = {};

  entities.forEach((entity) => {
    if (!entitiesBySlug[entity.slug]) {
      entitiesBySlug[entity.slug] = entity;
    }
  });

  return entitiesBySlug;
}
function toBaseReleases(baseReleasesOrUrls, entity = null) {
if (!baseReleasesOrUrls) {
return [];
@@ -106,8 +68,32 @@ async function fetchScene(scraper, url, entity, baseRelease, options) {
return null;
}
async function scrapeRelease(baseRelease, entities, type = 'scene') {
const entity = baseRelease.entity || entities[urlToSiteSlug(baseRelease.url)];
// Resolve the release scraper for an entity by walking up its parent chain:
// the entity's own slug is tried first, then each ancestor's in turn.
// Returns null when no entity in the chain has a registered scraper.
function findScraper(entity) {
  let current = entity;

  while (!scrapers.releases[current.slug]) {
    if (!current.parent) {
      return null;
    }

    current = current.parent;
  }

  return scrapers.releases[current.slug];
}
// Pick the layout-specific variant of a scraper, if any: walk up the entity's
// parent chain and return the first scraper property named by an entity's
// parameters.layout. Falls back to the scraper itself when no layout matches
// (or when scraper is null/undefined).
function findLayoutScraper(entity, scraper) {
  let current = entity;

  while (!scraper?.[current.parameters?.layout]) {
    if (!current.parent) {
      return scraper;
    }

    current = current.parent;
  }

  return scraper[current.parameters.layout];
}
async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
const entity = baseRelease.entity || entitiesBySlug[urlToSiteSlug(baseRelease.url)];
if (!entity) {
logger.warn(`No entity available for ${baseRelease.url}`);
@@ -121,8 +107,8 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
};
}
const scraper = scrapers.releases[entity.slug] || scrapers.releases[entity.parent?.slug] || scrapers.releases[entity.parent?.parent?.slug];
const layoutScraper = scraper?.[entity.parameters?.layout] || scraper?.[entity.parent?.parameters?.layout] || scraper?.[entity.parent?.parent?.parameters?.layout] || scraper;
const scraper = findScraper(entity);
const layoutScraper = findLayoutScraper(entity, scraper);
if (!layoutScraper) {
logger.warn(`Could not find scraper for ${baseRelease.url}`);
@@ -184,19 +170,19 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
}
}
async function scrapeReleases(baseReleases, entities, type) {
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
return Promise.map(
baseReleases,
async baseRelease => scrapeRelease(baseRelease, entities, type),
async baseRelease => scrapeRelease(baseRelease, entitiesBySlug, type),
{ concurrency: 10 },
);
}
async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
const baseReleases = toBaseReleases(baseReleasesOrUrls);
const entities = await findEntities(baseReleases);
const entitiesBySlug = await fetchReleaseEntities(baseReleases);
const deepReleases = await scrapeReleases(baseReleases, entities, type);
const deepReleases = await scrapeReleases(baseReleases, entitiesBySlug, type);
return deepReleases.filter(Boolean);
}