Improved and documented actor profile scraping.
parent 5cabeed19d
commit 7413d7db25
@@ -40,6 +40,13 @@ To generate the thumbnails for logos and tag photos, run:
 * `--scene [URL]`: Try to retrieve scene details from its official channel or network URL.
 * `--deep`: Follow each release link found running `--channel` or `--network` and scrape it for more details. Enabled by default; use `--no-deep` to only save information found on the overview pages.
 
+#### Actors
+* `--actors "[name]" "[name]"`: Fetch actor profiles. When no names are specified, actors without existing profiles are scraped.
+* `--actors-file [filepath]`: Fetch actor profiles for the actors specified in a file, using a newline delimiter.
+* `--actors-sources [slug] [slug]`: Scrapers to use for actor profiles. Defaults to config.
+* `--actors-scenes`: Fetch all scenes for scraped actors. Use with caution, as an actor may have many scenes.
+* `--scene-actors`: Fetch profiles for actors associated with scraped scenes. Use with caution, as scenes may have many actors, each with many profiles.
+
 #### Developers
 * `--no-save`: Do not store retrieved information in local database, forcing re-fetch.
 * `--level`: Change log level to `silly`, `verbose`, `info`, `warn` or `error`.
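For example, a hypothetical invocation combining the new flags, fetching profiles for two actors plus all of their scenes: `npm start -- --actors "Jane Doe" "John Roe" --actors-scenes` (the actor names are placeholders).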
@@ -582,9 +582,31 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
   return profiles.filter(Boolean);
 }
 
-async function scrapeActors(actorNames) {
+async function getActorNames(actorNames) {
+  if (actorNames.length > 0) {
+    return actorNames;
+  }
+
+  const actorsWithoutProfiles = await knex.raw(`
+    SELECT actors.name
+    FROM actors
+    WHERE NOT EXISTS (
+      SELECT *
+      FROM actors_profiles
+      WHERE actors_profiles.actor_id = actors.id
+      AND actors_profiles.updated_at <= (?)
+    )
+  `, [argv.actorsUpdate || new Date()]);
+
+  return actorsWithoutProfiles.rows.map(actor => actor.name);
+}
+
+async function scrapeActors(argNames) {
+  const actorNames = await getActorNames(argNames);
   const baseActors = toBaseActors(actorNames);
+
+  logger.info(`Scraping profiles for ${actorNames.length} actors`);
 
   const sources = argv.sources || config.profiles || Object.keys(scrapers.actors);
   const entitySlugs = sources.flat();
@@ -596,11 +618,7 @@ async function scrapeActors(actorNames) {
     .orderBy('entities.type'),
   knex('actors')
     .select(['id', 'name', 'slug', 'entry_id'])
-    .modify((queryBuilder) => {
-      if (actorNames.length > 0) {
-        queryBuilder.whereIn('slug', baseActors.map(baseActor => baseActor.slug));
-      }
-    })
+    .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
     .whereNull('alias_for'),
 ]);
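A quick sketch of how the new name fallback behaves (the actor name is a placeholder; `knex` and `argv` are the modules this file already uses):

```js
// Explicit names skip the database lookup entirely.
await getActorNames(['Jane Doe']); // → ['Jane Doe']

// An empty list falls through to the NOT EXISTS query. Without
// --actors-update the cut-off binding defaults to new Date(); every
// existing profile row matches updated_at <= now, so only actors with
// no profile rows at all are returned for scraping.
await getActorNames([]); // → names straight from the actors table
```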
@@ -25,7 +25,7 @@ async function init() {
   const actorsFromFile = argv.actorsFile && await getFileEntries(argv.actorsFile);
   const actorNames = (argv.actors || []).concat(actorsFromFile || []);
 
-  const actors = actorNames.length > 0 && await scrapeActors(actorNames);
+  const actors = (argv.actors || argv.actorsFile) && await scrapeActors(actorNames);
   const actorBaseScenes = argv.actors && argv.actorScenes && actors.map(actor => actor.releases).flat().filter(Boolean);
 
   const updateBaseScenes = (argv.all || argv.channels || argv.networks || argv.movies) && await fetchUpdates();
src/argv.js
@@ -2,6 +2,22 @@
 const config = require('config');
 const yargs = require('yargs');
+const moment = require('moment');
+
+function interpretAfter(after) {
+  if (/\d{2,4}-\d{2}-\d{2,4}/.test(after)) {
+    // using date
+    return moment
+      .utc(after, ['YYYY-MM-DD', 'DD-MM-YYYY'])
+      .toDate();
+  }
+
+  // using time distance (e.g. "1 month")
+  return moment
+    .utc()
+    .subtract(...after.split(' '))
+    .toDate();
+}
 
 const { argv } = yargs
   .command('npm start')
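A sketch of the two input forms `interpretAfter` accepts (return values shown as comments, illustrative):

```js
// Explicit dates match the regular expression and are parsed with
// either format, year-first or day-first:
interpretAfter('2020-01-15'); // → Date for 2020-01-15T00:00:00Z
interpretAfter('15-01-2020'); // → the same Date, via DD-MM-YYYY

// Anything else is treated as a relative period: '1 month'.split(' ')
// spreads into moment.utc().subtract('1', 'month'):
interpretAfter('1 month'); // → Date one month before now
```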
@@ -30,16 +46,25 @@ const { argv } = yargs
     type: 'array',
     alias: 'actor',
   })
+  .option('actors-update', {
+    describe: 'Rescrape actors last updated before this period',
+    type: 'string',
+  })
   .option('actors-file', {
     describe: 'Scrape actors names from file',
     type: 'string',
   })
-  .option('actor-scenes', {
+  .option('actors-scenes', {
     describe: 'Fetch all scenes for an actor',
     type: 'boolean',
-    alias: 'with-scenes',
+    alias: 'actor-scenes',
     default: false,
   })
+  .option('actors-sources', {
+    describe: 'Use these scrapers for actor data',
+    type: 'array',
+    alias: 'actor-source',
+  })
   .option('movie-scenes', {
     describe: 'Fetch all scenes for a movie',
     type: 'boolean',
@@ -70,11 +95,6 @@ const { argv } = yargs
     describe: 'Scrape movie info from URL',
     type: 'array',
   })
-  .option('sources', {
-    describe: 'Use these scrapers for actor data',
-    type: 'array',
-    alias: 'source',
-  })
   .option('deep', {
     describe: 'Fetch details for all releases',
     type: 'boolean',
@@ -204,6 +224,8 @@ const { argv } = yargs
     describe: 'Update search documents for all releases.',
     type: 'boolean',
     default: false,
-  });
+  })
+  .coerce('after', interpretAfter)
+  .coerce('actors-update', interpretAfter);
 
 module.exports = argv;
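Routing both flags through `.coerce` means every consumer receives a ready-made Date instead of a raw string, which is what lets the local `afterDate` helper in the next file be deleted. Illustrative values:

```js
// npm start -- --after "1 month" --actors-update 2020-01-01
argv.after;        // → Date one month before now
argv.actorsUpdate; // → Date for 2020-01-01 (yargs camel-cases the flag name)
```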
@@ -10,21 +10,6 @@ const include = require('./utils/argv-include')(argv);
 const scrapers = require('./scrapers/scrapers');
 const { fetchChannelsFromArgv, fetchChannelsFromConfig } = require('./entities');
 
-const afterDate = (() => {
-  if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) {
-    // using date
-    return moment
-      .utc(argv.after, ['YYYY-MM-DD', 'DD-MM-YYYY'])
-      .toDate();
-  }
-
-  // using time distance (e.g. "1 month")
-  return moment
-    .utc()
-    .subtract(...argv.after.split(' '))
-    .toDate();
-})();
-
 async function filterUniqueReleases(latestReleases, accReleases) {
   const latestReleaseIdentifiers = latestReleases
     .map(release => [release.entity.id, release.entryId]);
@@ -67,7 +52,7 @@ function needNextPage(uniqueReleases, pageAccReleases) {
     .sort((releaseA, releaseB) => releaseB.date - releaseA.date)
     .slice(-1)[0];
 
-  if (moment(oldestReleaseOnPage.date).isAfter(afterDate)) {
+  if (moment(oldestReleaseOnPage.date).isAfter(argv.after)) {
     // oldest release on page is newer than the specified date cut-off
     return true;
   }
@@ -126,7 +111,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false) {
   }
 
   if (releases.every(release => release.date)) {
-    return releases.filter(release => moment(release.date).isAfter(afterDate));
+    return releases.filter(release => moment(release.date).isAfter(argv.after));
   }
 
   return releases.slice(0, argv.nullDateLimit);