Split up profile scrape runner. Fixed wrong search document date key. Added search update CLI.

This commit is contained in:
ThePendulum 2020-03-10 23:46:55 +01:00
parent 791a6c1c72
commit d97b1ab894
5 changed files with 58 additions and 47 deletions

View File

@@ -376,30 +376,8 @@ async function mergeProfiles(profiles, actor) {
return mergedProfile; return mergedProfile;
} }
async function scrapeActors(actorNames) { async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
return Promise.map(actorNames || argv.actors, async (actorName) => { return Promise.map(sources, async (source) => {
try {
const actorSlug = slugify(actorName);
const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
const [siteEntries, networks] = await Promise.all([
knex('sites')
.leftJoin('networks', 'sites.network_id', 'networks.id')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
)
.whereIn('sites.slug', finalSources.flat()),
knex('networks').select('*').whereIn('slug', finalSources.flat()),
]);
const sites = await curateSites(siteEntries, true);
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const profiles = await Promise.map(finalSources, async (source) => {
// const [scraperSlug, scraper] = source; // const [scraperSlug, scraper] = source;
const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] })); const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));
@@ -438,7 +416,32 @@ async function scrapeActors(actorNames) {
return null; return null;
}); });
}
async function scrapeActors(actorNames) {
return Promise.map(actorNames || argv.actors, async (actorName) => {
try {
const actorSlug = slugify(actorName);
const actorEntry = await knex('actors').where({ slug: actorSlug }).first();
const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
const [siteEntries, networks] = await Promise.all([
knex('sites')
.leftJoin('networks', 'sites.network_id', 'networks.id')
.select(
'sites.*',
'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
)
.whereIn('sites.slug', finalSources.flat()),
knex('networks').select('*').whereIn('slug', finalSources.flat()),
]);
const sites = await curateSites(siteEntries, true);
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
const profiles = await scrapeProfiles(sources, sitesBySlug, actorName, actorEntry);
const profile = await mergeProfiles(profiles, actorEntry); const profile = await mergeProfiles(profiles, actorEntry);
if (profile === null) { if (profile === null) {

View File

@@ -6,7 +6,7 @@ const initServer = require('./web/server');
const scrapeSites = require('./scrape-sites'); const scrapeSites = require('./scrape-sites');
const { scrapeScenes, scrapeMovies, deepFetchReleases } = require('./scrape-releases'); const { scrapeScenes, scrapeMovies, deepFetchReleases } = require('./scrape-releases');
const { storeReleases } = require('./releases'); const { storeReleases, updateReleasesSearch } = require('./releases');
const { scrapeActors, scrapeBasicActors } = require('./actors'); const { scrapeActors, scrapeBasicActors } = require('./actors');
if (process.env.NODE_ENV === 'development') { if (process.env.NODE_ENV === 'development') {
@@ -41,6 +41,10 @@ async function init() {
await scrapeBasicActors(); await scrapeBasicActors();
} }
if (argv.updateSearch) {
await updateReleasesSearch();
}
if (argv.server) { if (argv.server) {
await initServer(); await initServer();
return; return;

View File

@@ -171,8 +171,8 @@ const { argv } = yargs
type: 'boolean', type: 'boolean',
default: process.env.NODE_ENV === 'development', default: process.env.NODE_ENV === 'development',
}) })
.option('dummy', { .option('update-search', {
describe: 'Generate dummy data during seed', describe: 'Update search documents for all releases.',
type: 'boolean', type: 'boolean',
default: false, default: false,
}); });

View File

@@ -367,6 +367,8 @@ async function storeReleaseAssets(releases) {
} }
async function updateReleasesSearch(releaseIds) { async function updateReleasesSearch(releaseIds) {
logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
const documents = await knex.raw(` const documents = await knex.raw(`
SELECT SELECT
releases.id AS release_id, releases.id AS release_id,
@@ -378,7 +380,7 @@ async function updateReleasesSearch(releaseIds) {
networks.name || ' ' || networks.name || ' ' ||
networks.slug || ' ' || networks.slug || ' ' ||
COALESCE(releases.shoot_id, '') || ' ' || COALESCE(releases.shoot_id, '') || ' ' ||
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMD'), '') || ' ' || COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' || STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' || STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ') STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
@@ -391,9 +393,9 @@ async function updateReleasesSearch(releaseIds) {
LEFT JOIN actors ON local_actors.actor_id = actors.id LEFT JOIN actors ON local_actors.actor_id = actors.id
LEFT JOIN tags ON local_tags.tag_id = tags.id LEFT JOIN tags ON local_tags.tag_id = tags.id
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for
WHERE releases.id = ANY(?) ${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
GROUP BY releases.id, sites.name, sites.slug, networks.name, networks.slug; GROUP BY releases.id, sites.name, sites.slug, networks.name, networks.slug;
`, [releaseIds]); `, releaseIds && [releaseIds]);
if (documents.rows?.length > 0) { if (documents.rows?.length > 0) {
const query = knex('releases_search').insert(documents.rows).toString(); const query = knex('releases_search').insert(documents.rows).toString();
@@ -507,4 +509,5 @@ module.exports = {
fetchTagReleases, fetchTagReleases,
storeRelease, storeRelease,
storeReleases, storeReleases,
updateReleasesSearch,
}; };

View File

@@ -11,6 +11,7 @@ const PgConnectionFilterPlugin = require('postgraphile-plugin-connection-filter');
const PgSimplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector'); const PgSimplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector');
const PgOrderByRelatedPlugin = require('@graphile-contrib/pg-order-by-related'); const PgOrderByRelatedPlugin = require('@graphile-contrib/pg-order-by-related');
const logger = require('../logger');
const { ActorPlugins, SitePlugins, ReleasePlugins } = require('./plugins/plugins'); const { ActorPlugins, SitePlugins, ReleasePlugins } = require('./plugins/plugins');
const { const {
@@ -100,7 +101,7 @@ function initServer() {
const server = app.listen(config.web.port, config.web.host, () => { const server = app.listen(config.web.port, config.web.host, () => {
const { address, port } = server.address(); const { address, port } = server.address();
console.log(`Web server listening on ${address}:${port}`); logger.info(`Web server listening on ${address}:${port}`);
}); });
} }