From 7413d7db2583444c4fbfec3c35763d189c610719 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 12 Aug 2020 20:51:08 +0200 Subject: [PATCH] Improved and documented actor profile scraping. --- README.md | 7 +++++++ src/actors.js | 30 ++++++++++++++++++++++++------ src/app.js | 2 +- src/argv.js | 38 ++++++++++++++++++++++++++++++-------- src/updates.js | 19 ++----------------- 5 files changed, 64 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index f3bcd570f..72f252f70 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,13 @@ To generate the thumbnails for logos and tag photos, run: * `--scene [URL]`: Try to retrieve scene details from its official channel or network URL. * `--deep`: Follow each release link found running `--channel` or `--network` and scrape it for more details. Enabled by default ; use `--no-deep` to only save information found on the overview pages. +#### Actors +* `--actors "[name]" "[name]"`: Fetch actor profiles. When no names are specified, actors without existing profiles are scraped. +* `--actors-file [filepath]`: Fetch actor profiles for the actors specified in a file using a newline delimiter. +* `--actors-sources [slug] [slug]`: Scrapers to use for actor profiles. Defaults to config. +* `--actors-scenes`: Fetch all scenes for scraped actors. Use with caution, as an actor may have many scenes. +* `--scene-actors`: Fetch profiles for actors associated with scraped scenes. Use with caution, as scenes may have many actors, each with many profiles. + #### Developers * `--no-save`: Do not store retrieved information in local database, forcing re-fetch. * `--level`: Change log level to `silly`, `verbose`, `info`, `warn` or `error`. 
diff --git a/src/actors.js b/src/actors.js index 09547f34a..9d8ddcd1a 100644 --- a/src/actors.js +++ b/src/actors.js @@ -582,9 +582,31 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy return profiles.filter(Boolean); } -async function scrapeActors(actorNames) { +async function getActorNames(actorNames) { + if (actorNames.length > 0) { + return actorNames; + } + + const actorsWithoutProfiles = await knex.raw(` + SELECT actors.name + FROM actors + WHERE NOT EXISTS ( + SELECT * + FROM actors_profiles + WHERE actors_profiles.actor_id = actors.id + AND actors_profiles.updated_at <= (?) + ) + `, [argv.actorsUpdate || new Date()]); + + return actorsWithoutProfiles.rows.map(actor => actor.name); +} + +async function scrapeActors(argNames) { + const actorNames = await getActorNames(argNames); const baseActors = toBaseActors(actorNames); + logger.info(`Scraping profiles for ${actorNames.length} actors`); + const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); const entitySlugs = sources.flat(); @@ -596,11 +618,7 @@ async function scrapeActors(actorNames) { .orderBy('entities.type'), knex('actors') .select(['id', 'name', 'slug', 'entry_id']) - .modify((queryBuilder) => { - if (actorNames.length > 0) { - queryBuilder.whereIn('slug', baseActors.map(baseActor => baseActor.slug)); - } - }) + .whereIn('slug', baseActors.map(baseActor => baseActor.slug)) .whereNull('alias_for'), ]); diff --git a/src/app.js b/src/app.js index 2d0745be8..010f026e5 100644 --- a/src/app.js +++ b/src/app.js @@ -25,7 +25,7 @@ async function init() { const actorsFromFile = argv.actorsFile && await getFileEntries(argv.actorsFile); const actorNames = (argv.actors || []).concat(actorsFromFile || []); - const actors = actorNames.length > 0 && await scrapeActors(actorNames); + const actors = (argv.actors || argv.actorsFile) && await scrapeActors(actorNames); const actorBaseScenes = argv.actors && argv.actorScenes && actors.map(actor => 
actor.releases).flat().filter(Boolean); const updateBaseScenes = (argv.all || argv.channels || argv.networks || argv.movies) && await fetchUpdates(); diff --git a/src/argv.js b/src/argv.js index d01eb0098..0faff2f1c 100644 --- a/src/argv.js +++ b/src/argv.js @@ -2,6 +2,22 @@ const config = require('config'); const yargs = require('yargs'); +const moment = require('moment'); + +function interpretAfter(after) { + if (/\d{2,4}-\d{2}-\d{2,4}/.test(after)) { + // using date + return moment + .utc(after, ['YYYY-MM-DD', 'DD-MM-YYYY']) + .toDate(); + } + + // using time distance (e.g. "1 month") + return moment + .utc() + .subtract(...after.split(' ')) + .toDate(); +} const { argv } = yargs .command('npm start') @@ -30,16 +46,25 @@ const { argv } = yargs type: 'array', alias: 'actor', }) + .option('actors-update', { + describe: 'Rescrape actors last updated before this period', + type: 'string', + }) .option('actors-file', { describe: 'Scrape actors names from file', type: 'string', }) - .option('actor-scenes', { + .option('actors-scenes', { describe: 'Fetch all scenes for an actor', type: 'boolean', - alias: 'with-scenes', + alias: 'actor-scenes', default: false, }) + .option('actors-sources', { + describe: 'Use these scrapers for actor data', + type: 'array', + alias: 'actor-source', + }) .option('movie-scenes', { describe: 'Fetch all scenes for a movie', type: 'boolean', @@ -70,11 +95,6 @@ const { argv } = yargs describe: 'Scrape movie info from URL', type: 'array', }) - .option('sources', { - describe: 'Use these scrapers for actor data', - type: 'array', - alias: 'source', - }) .option('deep', { describe: 'Fetch details for all releases', type: 'boolean', @@ -204,6 +224,8 @@ const { argv } = yargs describe: 'Update search documents for all releases.', type: 'boolean', default: false, - }); + }) + .coerce('after', interpretAfter) + .coerce('actors-update', interpretAfter); module.exports = argv; diff --git a/src/updates.js b/src/updates.js index 8e3dcea1a..0933c3f85 
100644 --- a/src/updates.js +++ b/src/updates.js @@ -10,21 +10,6 @@ const include = require('./utils/argv-include')(argv); const scrapers = require('./scrapers/scrapers'); const { fetchChannelsFromArgv, fetchChannelsFromConfig } = require('./entities'); -const afterDate = (() => { - if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) { - // using date - return moment - .utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY']) - .toDate(); - } - - // using time distance (e.g. "1 month") - return moment - .utc() - .subtract(...argv.after.split(' ')) - .toDate(); -})(); - async function filterUniqueReleases(latestReleases, accReleases) { const latestReleaseIdentifiers = latestReleases .map(release => [release.entity.id, release.entryId]); @@ -67,7 +52,7 @@ function needNextPage(uniqueReleases, pageAccReleases) { .sort((releaseA, releaseB) => releaseB.date - releaseA.date) .slice(-1)[0]; - if (moment(oldestReleaseOnPage.date).isAfter(afterDate)) { + if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) { // oldest release on page is newer than the specified date cut-off return true; } @@ -126,7 +111,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false) { } if (releases.every(release => release.date)) { - return releases.filter(release => moment(release.date).isAfter(afterDate)); + return releases.filter(release => moment(release.date).isAfter(argv.after)); } return releases.slice(0, argv.nullDateLimit);