forked from DebaucheryLibrarian/traxxx
				
			Improved and documented actor profile scraping.
This commit is contained in:
		
							parent
							
								
									5cabeed19d
								
							
						
					
					
						commit
						7413d7db25
					
				|  | @ -40,6 +40,13 @@ To generate the thumbnails for logos and tag photos, run: | |||
| * `--scene [URL]`: Try to retrieve scene details from its official channel or network URL. | ||||
| * `--deep`: Follow each release link found running `--channel` or `--network` and scrape it for more details. Enabled by default; use `--no-deep` to only save information found on the overview pages. | ||||
| 
 | ||||
| #### Actors | ||||
| * `--actors "[name]" "[name]"`: Fetch actor profiles. When no names are specified, actors without existing profiles are scraped. | ||||
| * `--actors-file [filepath]`: Fetch profiles for the actor names listed in a file, one name per line. | ||||
| * `--actors-sources [slug] [slug]`: Scrapers to use for actor profiles. Defaults to config. | ||||
| * `--actors-scenes`: Fetch all scenes for scraped actors. Use with caution, as an actor may have many scenes. | ||||
| * `--scene-actors`: Fetch profiles for actors associated with scraped scenes. Use with caution, as scenes may have many actors, each with many profiles. | ||||
| 
 | ||||
| #### Developers | ||||
| * `--no-save`: Do not store retrieved information in local database, forcing re-fetch. | ||||
| * `--level`: Change log level to `silly`, `verbose`, `info`, `warn` or `error`. | ||||
|  |  | |||
|  | @ -582,9 +582,31 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy | |||
| 	return profiles.filter(Boolean); | ||||
| } | ||||
| 
 | ||||
| async function scrapeActors(actorNames) { | ||||
// Resolve the list of actor names to scrape profiles for. Explicitly
// requested names take precedence; otherwise, query the database for actors
// lacking a profile updated on or before the cut-off date. With the default
// cut-off of "now", that means actors with no existing profiles at all.
async function getActorNames(actorNames) {
	if (actorNames.length > 0) {
		return actorNames;
	}

	// NOTE(review): with a past `--actors-update` cut-off this selects actors
	// whose profiles were all updated *after* the cut-off (or who have none) —
	// confirm this matches the intended "rescrape actors last updated before
	// this period" semantics.
	const cutoff = argv.actorsUpdate || new Date();

	const { rows } = await knex.raw(`
		SELECT actors.name
		FROM actors
		WHERE NOT EXISTS (
			SELECT *
			FROM actors_profiles
			WHERE actors_profiles.actor_id = actors.id
			AND actors_profiles.updated_at <= (?)
		)
	`, [cutoff]);

	return rows.map(({ name }) => name);
}
| 
 | ||||
| async function scrapeActors(argNames) { | ||||
| 	const actorNames = await getActorNames(argNames); | ||||
| 	const baseActors = toBaseActors(actorNames); | ||||
| 
 | ||||
| 	logger.info(`Scraping profiles for ${actorNames.length} actors`); | ||||
| 
 | ||||
| 	const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); | ||||
| 	const entitySlugs = sources.flat(); | ||||
| 
 | ||||
|  | @ -596,11 +618,7 @@ async function scrapeActors(actorNames) { | |||
| 			.orderBy('entities.type'), | ||||
| 		knex('actors') | ||||
| 			.select(['id', 'name', 'slug', 'entry_id']) | ||||
| 			.modify((queryBuilder) => { | ||||
| 				if (actorNames.length > 0) { | ||||
| 					queryBuilder.whereIn('slug', baseActors.map(baseActor => baseActor.slug)); | ||||
| 				} | ||||
| 			}) | ||||
| 			.whereIn('slug', baseActors.map(baseActor => baseActor.slug)) | ||||
| 			.whereNull('alias_for'), | ||||
| 	]); | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,7 +25,7 @@ async function init() { | |||
| 	const actorsFromFile = argv.actorsFile && await getFileEntries(argv.actorsFile); | ||||
| 	const actorNames = (argv.actors || []).concat(actorsFromFile || []); | ||||
| 
 | ||||
| 	const actors = actorNames.length > 0 && await scrapeActors(actorNames); | ||||
| 	const actors = (argv.actors || argv.actorsFile) && await scrapeActors(actorNames); | ||||
| 	const actorBaseScenes = argv.actors && argv.actorScenes && actors.map(actor => actor.releases).flat().filter(Boolean); | ||||
| 
 | ||||
| 	const updateBaseScenes = (argv.all || argv.channels || argv.networks || argv.movies) && await fetchUpdates(); | ||||
|  |  | |||
							
								
								
									
										38
									
								
								src/argv.js
								
								
								
								
							
							
						
						
									
										38
									
								
								src/argv.js
								
								
								
								
							|  | @ -2,6 +2,22 @@ | |||
| 
 | ||||
| const config = require('config'); | ||||
| const yargs = require('yargs'); | ||||
| const moment = require('moment'); | ||||
| 
 | ||||
/**
 * Coerce an `--after`/`--actors-update` CLI argument into a Date.
 * Accepts either an explicit date ('YYYY-MM-DD' or 'DD-MM-YYYY') or a
 * relative time distance such as "1 month", measured backwards from now.
 */
function interpretAfter(after) {
	const isExplicitDate = /\d{2,4}-\d{2}-\d{2,4}/.test(after);

	if (isExplicitDate) {
		return moment.utc(after, ['YYYY-MM-DD', 'DD-MM-YYYY']).toDate();
	}

	// relative distance, e.g. "1 month" => subtract(1, 'month') from now
	const [amount, unit] = after.split(' ');

	return moment.utc().subtract(amount, unit).toDate();
}
| 
 | ||||
| const { argv } = yargs | ||||
| 	.command('npm start') | ||||
|  | @ -30,16 +46,25 @@ const { argv } = yargs | |||
| 		type: 'array', | ||||
| 		alias: 'actor', | ||||
| 	}) | ||||
| 	.option('actors-update', { | ||||
| 		describe: 'Rescrape actors last updated before this period', | ||||
| 		type: 'string', | ||||
| 	}) | ||||
| 	.option('actors-file', { | ||||
| 		describe: 'Scrape actors names from file', | ||||
| 		type: 'string', | ||||
| 	}) | ||||
| 	.option('actor-scenes', { | ||||
| 	.option('actors-scenes', { | ||||
| 		describe: 'Fetch all scenes for an actor', | ||||
| 		type: 'boolean', | ||||
| 		alias: 'with-scenes', | ||||
| 		alias: 'actor-scenes', | ||||
| 		default: false, | ||||
| 	}) | ||||
| 	.option('actors-sources', { | ||||
| 		describe: 'Use these scrapers for actor data', | ||||
| 		type: 'array', | ||||
| 		alias: 'actor-source', | ||||
| 	}) | ||||
| 	.option('movie-scenes', { | ||||
| 		describe: 'Fetch all scenes for a movie', | ||||
| 		type: 'boolean', | ||||
|  | @ -70,11 +95,6 @@ const { argv } = yargs | |||
| 		describe: 'Scrape movie info from URL', | ||||
| 		type: 'array', | ||||
| 	}) | ||||
| 	.option('sources', { | ||||
| 		describe: 'Use these scrapers for actor data', | ||||
| 		type: 'array', | ||||
| 		alias: 'source', | ||||
| 	}) | ||||
| 	.option('deep', { | ||||
| 		describe: 'Fetch details for all releases', | ||||
| 		type: 'boolean', | ||||
|  | @ -204,6 +224,8 @@ const { argv } = yargs | |||
| 		describe: 'Update search documents for all releases.', | ||||
| 		type: 'boolean', | ||||
| 		default: false, | ||||
| 	}); | ||||
| 	}) | ||||
| 	.coerce('after', interpretAfter) | ||||
| 	.coerce('actors-update', interpretAfter); | ||||
| 
 | ||||
| module.exports = argv; | ||||
|  |  | |||
|  | @ -10,21 +10,6 @@ const include = require('./utils/argv-include')(argv); | |||
| const scrapers = require('./scrapers/scrapers'); | ||||
| const { fetchChannelsFromArgv, fetchChannelsFromConfig } = require('./entities'); | ||||
| 
 | ||||
// Cut-off date derived from `--after`: either an explicit date
// ('YYYY-MM-DD' / 'DD-MM-YYYY') or a relative distance such as "1 month",
// measured backwards from now.
const afterDate = (() => {
	const isExplicitDate = /\d{2,4}-\d{2}-\d{2,4}/.test(argv.after);

	if (isExplicitDate) {
		return moment.utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY']).toDate();
	}

	// relative time distance (e.g. "1 month")
	const [amount, unit] = argv.after.split(' ');

	return moment.utc().subtract(amount, unit).toDate();
})();
| 
 | ||||
| async function filterUniqueReleases(latestReleases, accReleases) { | ||||
| 	const latestReleaseIdentifiers = latestReleases | ||||
| 		.map(release => [release.entity.id, release.entryId]); | ||||
|  | @ -67,7 +52,7 @@ function needNextPage(uniqueReleases, pageAccReleases) { | |||
| 			.sort((releaseA, releaseB) => releaseB.date - releaseA.date) | ||||
| 			.slice(-1)[0]; | ||||
| 
 | ||||
| 		if (moment(oldestReleaseOnPage.date).isAfter(afterDate)) { | ||||
| 		if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) { | ||||
| 			// oldest release on page is newer than the specified date cut-off
 | ||||
| 			return true; | ||||
| 		} | ||||
|  | @ -126,7 +111,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false) { | |||
| 	} | ||||
| 
 | ||||
| 	if (releases.every(release => release.date)) { | ||||
| 		return releases.filter(release => moment(release.date).isAfter(afterDate)); | ||||
| 		return releases.filter(release => moment(release.date).isAfter(argv.after)); | ||||
| 	} | ||||
| 
 | ||||
| 	return releases.slice(0, argv.nullDateLimit); | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue