forked from DebaucheryLibrarian/traxxx
				
			Improved and documented actor profile scraping.
This commit is contained in:
		
							parent
							
								
									5cabeed19d
								
							
						
					
					
						commit
						7413d7db25
					
				|  | @ -40,6 +40,13 @@ To generate the thumbnails for logos and tag photos, run: | |||
| * `--scene [URL]`: Try to retrieve scene details from its official channel or network URL. | ||||
| * `--deep`: Follow each release link found running `--channel` or `--network` and scrape it for more details. Enabled by default; use `--no-deep` to only save information found on the overview pages. | ||||
| 
 | ||||
| #### Actors | ||||
| * `--actors "[name]" "[name]"`: Fetch actor profiles. When no names are specified, actors without existing profiles are scraped. | ||||
| * `--actors-file [filepath]`: Fetch profiles for the actor names listed in a file, one name per line. | ||||
| * `--actors-sources [slug] [slug]`: Scrapers to use for actor profiles. Defaults to config. | ||||
| * `--actors-scenes`: Fetch all scenes for scraped actors. Use with caution, as an actor may have many scenes. | ||||
| * `--scene-actors`: Fetch profiles for actors associated with scraped scenes. Use with caution, as scenes may have many actors, each with many profiles. | ||||
| 
 | ||||
| #### Developers | ||||
| * `--no-save`: Do not store retrieved information in local database, forcing re-fetch. | ||||
| * `--level`: Change log level to `silly`, `verbose`, `info`, `warn` or `error`. | ||||
|  |  | |||
|  | @ -582,9 +582,31 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy | |||
| 	return profiles.filter(Boolean); | ||||
| } | ||||
| 
 | ||||
| async function scrapeActors(actorNames) { | ||||
// Resolve the list of actor names to scrape profiles for. Explicitly
// requested names take precedence; otherwise, query the database for actors
// lacking a profile updated on or before the cut-off date. With the default
// cut-off of "now", that means actors with no existing profiles at all.
async function getActorNames(actorNames) {
	if (actorNames.length > 0) {
		return actorNames;
	}

	// NOTE(review): with a past `--actors-update` cut-off this selects actors
	// whose profiles were all updated *after* the cut-off (or who have none) —
	// confirm this matches the intended "rescrape actors last updated before
	// this period" semantics.
	const cutoff = argv.actorsUpdate || new Date();

	const { rows } = await knex.raw(`
		SELECT actors.name
		FROM actors
		WHERE NOT EXISTS (
			SELECT *
			FROM actors_profiles
			WHERE actors_profiles.actor_id = actors.id
			AND actors_profiles.updated_at <= (?)
		)
	`, [cutoff]);

	return rows.map(({ name }) => name);
}
| 
 | ||||
| async function scrapeActors(argNames) { | ||||
| 	const actorNames = await getActorNames(argNames); | ||||
| 	const baseActors = toBaseActors(actorNames); | ||||
| 
 | ||||
| 	logger.info(`Scraping profiles for ${actorNames.length} actors`); | ||||
| 
 | ||||
| 	const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); | ||||
| 	const entitySlugs = sources.flat(); | ||||
| 
 | ||||
|  | @ -596,11 +618,7 @@ async function scrapeActors(actorNames) { | |||
| 			.orderBy('entities.type'), | ||||
| 		knex('actors') | ||||
| 			.select(['id', 'name', 'slug', 'entry_id']) | ||||
| 			.modify((queryBuilder) => { | ||||
| 				if (actorNames.length > 0) { | ||||
| 					queryBuilder.whereIn('slug', baseActors.map(baseActor => baseActor.slug)); | ||||
| 				} | ||||
| 			}) | ||||
| 			.whereIn('slug', baseActors.map(baseActor => baseActor.slug)) | ||||
| 			.whereNull('alias_for'), | ||||
| 	]); | ||||
| 
 | ||||
|  |  | |||
|  | @ -25,7 +25,7 @@ async function init() { | |||
| 	const actorsFromFile = argv.actorsFile && await getFileEntries(argv.actorsFile); | ||||
| 	const actorNames = (argv.actors || []).concat(actorsFromFile || []); | ||||
| 
 | ||||
| 	const actors = actorNames.length > 0 && await scrapeActors(actorNames); | ||||
| 	const actors = (argv.actors || argv.actorsFile) && await scrapeActors(actorNames); | ||||
| 	const actorBaseScenes = argv.actors && argv.actorScenes && actors.map(actor => actor.releases).flat().filter(Boolean); | ||||
| 
 | ||||
| 	const updateBaseScenes = (argv.all || argv.channels || argv.networks || argv.movies) && await fetchUpdates(); | ||||
|  |  | |||
							
								
								
									
										38
									
								
								src/argv.js
								
								
								
								
							
							
						
						
									
										38
									
								
								src/argv.js
								
								
								
								
							|  | @ -2,6 +2,22 @@ | |||
| 
 | ||||
| const config = require('config'); | ||||
| const yargs = require('yargs'); | ||||
| const moment = require('moment'); | ||||
| 
 | ||||
/**
 * Coerce an `--after`/`--actors-update` CLI argument into a Date.
 * Accepts either an explicit date ('YYYY-MM-DD' or 'DD-MM-YYYY') or a
 * relative time distance such as "1 month", measured backwards from now.
 */
function interpretAfter(after) {
	const isExplicitDate = /\d{2,4}-\d{2}-\d{2,4}/.test(after);

	if (isExplicitDate) {
		return moment.utc(after, ['YYYY-MM-DD', 'DD-MM-YYYY']).toDate();
	}

	// relative distance, e.g. "1 month" => subtract(1, 'month') from now
	const [amount, unit] = after.split(' ');

	return moment.utc().subtract(amount, unit).toDate();
}
| 
 | ||||
| const { argv } = yargs | ||||
| 	.command('npm start') | ||||
|  | @ -30,16 +46,25 @@ const { argv } = yargs | |||
| 		type: 'array', | ||||
| 		alias: 'actor', | ||||
| 	}) | ||||
| 	.option('actors-update', { | ||||
| 		describe: 'Rescrape actors last updated before this period', | ||||
| 		type: 'string', | ||||
| 	}) | ||||
| 	.option('actors-file', { | ||||
| 		describe: 'Scrape actors names from file', | ||||
| 		type: 'string', | ||||
| 	}) | ||||
| 	.option('actor-scenes', { | ||||
| 	.option('actors-scenes', { | ||||
| 		describe: 'Fetch all scenes for an actor', | ||||
| 		type: 'boolean', | ||||
| 		alias: 'with-scenes', | ||||
| 		alias: 'actor-scenes', | ||||
| 		default: false, | ||||
| 	}) | ||||
| 	.option('actors-sources', { | ||||
| 		describe: 'Use these scrapers for actor data', | ||||
| 		type: 'array', | ||||
| 		alias: 'actor-source', | ||||
| 	}) | ||||
| 	.option('movie-scenes', { | ||||
| 		describe: 'Fetch all scenes for a movie', | ||||
| 		type: 'boolean', | ||||
|  | @ -70,11 +95,6 @@ const { argv } = yargs | |||
| 		describe: 'Scrape movie info from URL', | ||||
| 		type: 'array', | ||||
| 	}) | ||||
| 	.option('sources', { | ||||
| 		describe: 'Use these scrapers for actor data', | ||||
| 		type: 'array', | ||||
| 		alias: 'source', | ||||
| 	}) | ||||
| 	.option('deep', { | ||||
| 		describe: 'Fetch details for all releases', | ||||
| 		type: 'boolean', | ||||
|  | @ -204,6 +224,8 @@ const { argv } = yargs | |||
| 		describe: 'Update search documents for all releases.', | ||||
| 		type: 'boolean', | ||||
| 		default: false, | ||||
| 	}); | ||||
| 	}) | ||||
| 	.coerce('after', interpretAfter) | ||||
| 	.coerce('actors-update', interpretAfter); | ||||
| 
 | ||||
| module.exports = argv; | ||||
|  |  | |||
|  | @ -10,21 +10,6 @@ const include = require('./utils/argv-include')(argv); | |||
| const scrapers = require('./scrapers/scrapers'); | ||||
| const { fetchChannelsFromArgv, fetchChannelsFromConfig } = require('./entities'); | ||||
| 
 | ||||
// Cut-off date derived from `--after`: either an explicit date
// ('YYYY-MM-DD' / 'DD-MM-YYYY') or a relative distance such as "1 month",
// measured backwards from now.
const afterDate = (() => {
	const isExplicitDate = /\d{2,4}-\d{2}-\d{2,4}/.test(argv.after);

	if (isExplicitDate) {
		return moment.utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY']).toDate();
	}

	// relative time distance (e.g. "1 month")
	const [amount, unit] = argv.after.split(' ');

	return moment.utc().subtract(amount, unit).toDate();
})();
| 
 | ||||
| async function filterUniqueReleases(latestReleases, accReleases) { | ||||
| 	const latestReleaseIdentifiers = latestReleases | ||||
| 		.map(release => [release.entity.id, release.entryId]); | ||||
|  | @ -67,7 +52,7 @@ function needNextPage(uniqueReleases, pageAccReleases) { | |||
| 			.sort((releaseA, releaseB) => releaseB.date - releaseA.date) | ||||
| 			.slice(-1)[0]; | ||||
| 
 | ||||
| 		if (moment(oldestReleaseOnPage.date).isAfter(afterDate)) { | ||||
| 		if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) { | ||||
| 			// oldest release on page is newer than the specified date cut-off
 | ||||
| 			return true; | ||||
| 		} | ||||
|  | @ -126,7 +111,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false) { | |||
| 	} | ||||
| 
 | ||||
| 	if (releases.every(release => release.date)) { | ||||
| 		return releases.filter(release => moment(release.date).isAfter(afterDate)); | ||||
| 		return releases.filter(release => moment(release.date).isAfter(argv.after)); | ||||
| 	} | ||||
| 
 | ||||
| 	return releases.slice(0, argv.nullDateLimit); | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue