From e4b269956e8673008d7d6c34452094cf787fbb89 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Tue, 17 Mar 2020 00:58:03 +0100 Subject: [PATCH] Attaching channel site and studio to stored releases. --- src/deep.js | 10 +++++-- src/store-releases.js | 66 +++++++++++++++++++++++++++++++++++++++---- 2 files changed, 67 insertions(+), 9 deletions(-) diff --git a/src/deep.js b/src/deep.js index 7d3044ce..a67e8b11 100644 --- a/src/deep.js +++ b/src/deep.js @@ -1,5 +1,7 @@ 'use strict'; +const Promise = require('bluebird'); + const argv = require('./argv'); const logger = require('./logger')(__filename); const knex = require('./knex'); @@ -120,8 +122,6 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags); } - console.log(mergedRelease); - return mergedRelease; } catch (error) { logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`); @@ -130,7 +130,11 @@ async function scrapeRelease(baseRelease, sites, type = 'scene') { } async function scrapeReleases(baseReleases, sites) { - return Promise.all(baseReleases.map(baseRelease => scrapeRelease(baseRelease, sites))); + return Promise.map( + baseReleases, + async baseRelease => scrapeRelease(baseRelease, sites), + { concurrency: 10 }, + ); } async function fetchReleases(baseReleasesOrUrls) { diff --git a/src/store-releases.js b/src/store-releases.js index e3f526f5..951c79ff 100644 --- a/src/store-releases.js +++ b/src/store-releases.js @@ -2,6 +2,8 @@ const config = require('config'); +const argv = require('./argv'); +const logger = require('./logger'); const knex = require('./knex'); const slugify = require('./utils/slugify'); @@ -39,10 +41,56 @@ function curateReleaseEntry(release, batchId, existingRelease) { return curatedRelease; } -async function attachSite(releases) { - const releasesWithoutSite = releases.filter(release => !release.site || release.site.isFallback); +async function attachChannelSites(releases) { + const releasesWithoutSite = releases.filter(release => release.channel && (!release.site || release.site.isFallback)); - // console.log(releases, releasesWithoutSite); + const channelSites = await knex('sites').whereIn('slug', releasesWithoutSite.map(release => release.channel)); + const channelSitesBySlug = channelSites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {}); + + const releasesWithChannelSite = releases + .map((release) => { + if (release.site && !release.site.isFallback) { + return release; + } + + if (release.channel && channelSitesBySlug[release.channel]) { + return { + ...release, + site: channelSitesBySlug[release.channel], + }; + } + + logger.error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL ${release.url}`); + + return null; + }) + .filter(Boolean); + + return releasesWithChannelSite; +} + +async function attachStudios(releases) { + const studioSlugs = releases.map(release => release.studio).filter(Boolean); + + const studios = await knex('studios').whereIn('slug', studioSlugs); + const studioBySlug = studios.reduce((acc, studio) => ({ ...acc, [studio.slug]: studio }), {}); + + const releasesWithStudio = releases.map((release) => { + if (release.studio && studioBySlug[release.studio]) { + return { + ...release, + studio: release.studio, + }; + } + + if (release.studio) { + logger.warn(`Unable to match studio '${release.studio}' for ${release.url}`); + } + + return release; + }); + + return releasesWithStudio; } async function extractUniqueReleases(releases) { @@ -58,12 +106,18 @@ async function extractUniqueReleases(releases) { async function storeReleases(releases) { const [batchId] = await knex('batches').insert({ comment: null }).returning('id'); - const uniqueReleases = await extractUniqueReleases(releases); - const releasesWithSites = await attachSite(releases); + const releasesWithSites = await attachChannelSites(releases); + const releasesWithStudios = await attachStudios(releasesWithSites); + + // uniqueness is site ID + entry ID, filter uniques after adding sites + const uniqueReleases = argv.redownload + ? releasesWithStudios + : await extractUniqueReleases(releasesWithStudios); const curatedReleaseEntries = uniqueReleases.slice(0, 2).map(release => curateReleaseEntry(release, batchId)); + const storedReleases = await knex('releases').insert(curatedReleaseEntries).returning('*'); - await knex('releases').insert(curatedReleaseEntries); + console.log(storedReleases); } module.exports = {