Split up profile scrape runner. Fixed wrong search document date key. Added search update CLI.
This commit is contained in:
parent
791a6c1c72
commit
d97b1ab894
|
@ -376,6 +376,48 @@ async function mergeProfiles(profiles, actor) {
|
||||||
return mergedProfile;
|
return mergedProfile;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Scrape an actor's profile from every given source.
 *
 * Each entry in `sources` is either a single scraper slug or an array of slugs.
 * For an array, the slugs are tried in order and the first scraper that yields
 * a profile wins; the remaining slugs only act as fallbacks.
 *
 * @param {Array<string|string[]>} sources - scraper slugs, or ordered groups of fallback slugs
 * @param {string} actorName - requested name; used in log messages and as `name` of the returned profile
 * @param {?Object} actorEntry - actor record, if any; when present its `name` is passed to the scraper instead of `actorName`
 * @param {Object} sitesBySlug - site/network objects keyed by slug; the match for the scraper slug is handed to the scraper
 * @returns {Promise<Array<?Object>>} one item per source: the scraped profile extended with
 *   `name`, `scraper` and `site`, or `null` when no scraper in that source produced a profile
 */
async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
	return Promise.map(sources, async (source) => {
		const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));

		try {
			// The chain is seeded with a rejection so that every scraper, including the first,
			// runs as the fallback of its predecessor: a scraper is only tried when all earlier ones failed.
			return await profileScrapers.reduce(
				(outcome, { scraper, scraperSlug }) => outcome.catch(async () => {
					if (!scraper) {
						logger.warn(`No profile scraper available for ${scraperSlug}`);

						// Already reported above; flag the error so the outer handler does not warn a second time.
						throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`), { warn: false });
					}

					logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);

					const site = sitesBySlug[scraperSlug] || null;
					const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, argv.withReleases);

					if (profile) {
						logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);

						return {
							...profile,
							name: actorName,
							scraper: scraperSlug,
							site,
						};
					}

					logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);

					// An absent profile is an expected outcome, not worth a warning.
					throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
				}),
				// Only surfaces when the source contains no slugs at all.
				Promise.reject(new Error('No profile scrapers specified')),
			);
		} catch (error) {
			// Only the failure of the last scraper in the chain ends up here.
			if (error.warn !== false) {
				logger.warn(`Error in scraper ${source}: ${error.message}`);
			}
		}

		return null;
	});
}
|
||||||
|
|
||||||
async function scrapeActors(actorNames) {
|
async function scrapeActors(actorNames) {
|
||||||
return Promise.map(actorNames || argv.actors, async (actorName) => {
|
return Promise.map(actorNames || argv.actors, async (actorName) => {
|
||||||
try {
|
try {
|
||||||
|
@ -399,46 +441,7 @@ async function scrapeActors(actorNames) {
|
||||||
const sites = await curateSites(siteEntries, true);
|
const sites = await curateSites(siteEntries, true);
|
||||||
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
|
const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
|
||||||
|
|
||||||
const profiles = await Promise.map(finalSources, async (source) => {
|
// Argument order must match scrapeProfiles(sources, actorName, actorEntry, sitesBySlug).
const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug);
|
||||||
// const [scraperSlug, scraper] = source;
|
|
||||||
const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] }));
|
|
||||||
|
|
||||||
try {
|
|
||||||
return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => {
|
|
||||||
if (!scraper) {
|
|
||||||
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
|
|
||||||
throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`));
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.verbose(`Searching '${actorName}' on ${scraperSlug}`);
|
|
||||||
|
|
||||||
const site = sitesBySlug[scraperSlug] || null;
|
|
||||||
const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, argv.withReleases);
|
|
||||||
|
|
||||||
if (profile) {
|
|
||||||
logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`);
|
|
||||||
|
|
||||||
return {
|
|
||||||
...profile,
|
|
||||||
name: actorName,
|
|
||||||
scraper: scraperSlug,
|
|
||||||
site,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`);
|
|
||||||
throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false });
|
|
||||||
}), Promise.reject(new Error()));
|
|
||||||
} catch (error) {
|
|
||||||
if (error.warn !== false) {
|
|
||||||
logger.warn(`Error in scraper ${source}: ${error.message}`);
|
|
||||||
// logger.error(error.stack);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return null;
|
|
||||||
});
|
|
||||||
|
|
||||||
const profile = await mergeProfiles(profiles, actorEntry);
|
const profile = await mergeProfiles(profiles, actorEntry);
|
||||||
|
|
||||||
if (profile === null) {
|
if (profile === null) {
|
||||||
|
|
|
@ -6,7 +6,7 @@ const initServer = require('./web/server');
|
||||||
|
|
||||||
const scrapeSites = require('./scrape-sites');
|
const scrapeSites = require('./scrape-sites');
|
||||||
const { scrapeScenes, scrapeMovies, deepFetchReleases } = require('./scrape-releases');
|
const { scrapeScenes, scrapeMovies, deepFetchReleases } = require('./scrape-releases');
|
||||||
const { storeReleases } = require('./releases');
|
const { storeReleases, updateReleasesSearch } = require('./releases');
|
||||||
const { scrapeActors, scrapeBasicActors } = require('./actors');
|
const { scrapeActors, scrapeBasicActors } = require('./actors');
|
||||||
|
|
||||||
if (process.env.NODE_ENV === 'development') {
|
if (process.env.NODE_ENV === 'development') {
|
||||||
|
@ -41,6 +41,10 @@ async function init() {
|
||||||
await scrapeBasicActors();
|
await scrapeBasicActors();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (argv.updateSearch) {
|
||||||
|
await updateReleasesSearch();
|
||||||
|
}
|
||||||
|
|
||||||
if (argv.server) {
|
if (argv.server) {
|
||||||
await initServer();
|
await initServer();
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -171,8 +171,8 @@ const { argv } = yargs
|
||||||
type: 'boolean',
|
type: 'boolean',
|
||||||
default: process.env.NODE_ENV === 'development',
|
default: process.env.NODE_ENV === 'development',
|
||||||
})
|
})
|
||||||
.option('dummy', {
|
.option('update-search', {
|
||||||
describe: 'Generate dummy data during seed',
|
describe: 'Update search documents for all releases.',
|
||||||
type: 'boolean',
|
type: 'boolean',
|
||||||
default: false,
|
default: false,
|
||||||
});
|
});
|
||||||
|
|
|
@ -367,6 +367,8 @@ async function storeReleaseAssets(releases) {
|
||||||
}
|
}
|
||||||
|
|
||||||
async function updateReleasesSearch(releaseIds) {
|
async function updateReleasesSearch(releaseIds) {
|
||||||
|
logger.info(`Updating search documents for ${releaseIds ? releaseIds.length : 'all' } releases`);
|
||||||
|
|
||||||
const documents = await knex.raw(`
|
const documents = await knex.raw(`
|
||||||
SELECT
|
SELECT
|
||||||
releases.id AS release_id,
|
releases.id AS release_id,
|
||||||
|
@ -378,7 +380,7 @@ async function updateReleasesSearch(releaseIds) {
|
||||||
networks.name || ' ' ||
|
networks.name || ' ' ||
|
||||||
networks.slug || ' ' ||
|
networks.slug || ' ' ||
|
||||||
COALESCE(releases.shoot_id, '') || ' ' ||
|
COALESCE(releases.shoot_id, '') || ' ' ||
|
||||||
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMD'), '') || ' ' ||
|
COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
|
||||||
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
|
STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
|
||||||
STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
|
STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
|
||||||
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
|
STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
|
||||||
|
@ -391,9 +393,9 @@ async function updateReleasesSearch(releaseIds) {
|
||||||
LEFT JOIN actors ON local_actors.actor_id = actors.id
|
LEFT JOIN actors ON local_actors.actor_id = actors.id
|
||||||
LEFT JOIN tags ON local_tags.tag_id = tags.id
|
LEFT JOIN tags ON local_tags.tag_id = tags.id
|
||||||
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for
|
LEFT JOIN tags as tags_aliases ON local_tags.tag_id = tags_aliases.alias_for
|
||||||
WHERE releases.id = ANY(?)
|
${releaseIds ? 'WHERE releases.id = ANY(?)' : ''}
|
||||||
GROUP BY releases.id, sites.name, sites.slug, networks.name, networks.slug;
|
GROUP BY releases.id, sites.name, sites.slug, networks.name, networks.slug;
|
||||||
`, [releaseIds]);
|
`, releaseIds && [releaseIds]);
|
||||||
|
|
||||||
if (documents.rows?.length > 0) {
|
if (documents.rows?.length > 0) {
|
||||||
const query = knex('releases_search').insert(documents.rows).toString();
|
const query = knex('releases_search').insert(documents.rows).toString();
|
||||||
|
@ -507,4 +509,5 @@ module.exports = {
|
||||||
fetchTagReleases,
|
fetchTagReleases,
|
||||||
storeRelease,
|
storeRelease,
|
||||||
storeReleases,
|
storeReleases,
|
||||||
|
updateReleasesSearch,
|
||||||
};
|
};
|
||||||
|
|
|
@ -11,6 +11,7 @@ const PgConnectionFilterPlugin = require('postgraphile-plugin-connection-filter'
|
||||||
const PgSimplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector');
|
const PgSimplifyInflectorPlugin = require('@graphile-contrib/pg-simplify-inflector');
|
||||||
const PgOrderByRelatedPlugin = require('@graphile-contrib/pg-order-by-related');
|
const PgOrderByRelatedPlugin = require('@graphile-contrib/pg-order-by-related');
|
||||||
|
|
||||||
|
const logger = require('../logger');
|
||||||
const { ActorPlugins, SitePlugins, ReleasePlugins } = require('./plugins/plugins');
|
const { ActorPlugins, SitePlugins, ReleasePlugins } = require('./plugins/plugins');
|
||||||
|
|
||||||
const {
|
const {
|
||||||
|
@ -100,7 +101,7 @@ function initServer() {
|
||||||
const server = app.listen(config.web.port, config.web.host, () => {
|
const server = app.listen(config.web.port, config.web.host, () => {
|
||||||
const { address, port } = server.address();
|
const { address, port } = server.address();
|
||||||
|
|
||||||
console.log(`Web server listening on ${address}:${port}`);
|
logger.info(`Web server listening on ${address}:${port}`);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue