1115 lines
34 KiB
JavaScript
Executable File
1115 lines
34 KiB
JavaScript
Executable File
'use strict';
|
|
|
|
const config = require('config');
|
|
const util = require('util');
|
|
const Promise = require('bluebird');
|
|
const moment = require('moment');
|
|
const blake2 = require('blake2');
|
|
const DOMPurify = require('dompurify');
|
|
const { JSDOM } = require('jsdom');
|
|
const omit = require('object.omit');
|
|
const inquirer = require('inquirer');
|
|
const unprint = require('unprint');
|
|
|
|
const { window } = new JSDOM('');
|
|
const domPurify = DOMPurify(window);
|
|
|
|
// const logger = require('./logger')(__filename);
|
|
const knex = require('./knex');
|
|
const redis = require('./redis');
|
|
const scrapers = require('./scrapers/scrapers').actors;
|
|
|
|
const argv = require('./argv');
|
|
const include = require('./utils/argv-include')(argv);
|
|
const bulkInsert = require('./utils/bulk-insert');
|
|
const chunk = require('./utils/chunk');
|
|
const logger = require('./logger')(__filename);
|
|
|
|
const { toBaseReleases } = require('./deep');
|
|
const { associateAvatars, flushOrphanedMedia } = require('./media');
|
|
const { fetchEntitiesBySlug } = require('./entities');
|
|
const { deleteScenes } = require('./releases');
|
|
|
|
const actorsCommon = import('../common/actors.mjs'); // eslint-disable-line import/extensions, import/no-relative-packages
|
|
const geoCommon = import('../common/geo.mjs'); // eslint-disable-line import/extensions, import/no-relative-packages
|
|
|
|
const slugify = require('./utils/slugify');
|
|
const capitalize = require('./utils/capitalize');
|
|
const { resolveLayoutScraper } = require('./scrapers/resolve');
|
|
const getRecursiveParameters = require('./utils/get-recursive-parameters');
|
|
|
|
// Dependencies handed to the helpers loaded from ../common (see interpolateProfiles and curateProfile below).
const commonContext = {
	// storage
	knex,
	redis,
	// diagnostics
	logger,
	// utilities
	moment,
	slugify,
	omit,
	unprint,
};
|
|
|
|
// Hair color labels as supplied by scrapers (looked up lowercased, with 'hair' stripped) mapped to stored values.
const hairColors = {
	// black
	black: 'black',
	'jet-black': 'black',
	'soft-black': 'black',
	raven: 'black',
	zwart: 'black',

	// blond
	blond: 'blond',
	blonde: 'blonde',
	blondie: 'blonde',
	fair: 'blonde',

	// brown
	brown: 'brown',
	bruin: 'brown',
	brunette: 'brown',
	'brunette/raven': 'brown',

	// gray
	gray: 'gray',
	grey: 'gray',

	// red
	red: 'red',
	redhead: 'red',
	'red head': 'red',
	'red-head': 'red',
	rood: 'red',

	// other colors
	blue: 'blue',
	green: 'green',
	purple: 'purple',
	pink: 'pink',
};
|
|
|
|
// Eye color labels as supplied by scrapers (looked up lowercased) mapped to stored values.
const eyeColors = {
	// blue
	blue: 'blue',
	blauw: 'blue',

	// brown
	brown: 'brown',
	bruin: 'brown',
	dark: 'brown',

	// gray
	gray: 'gray',
	grey: 'gray',

	// green
	green: 'green',
	groen: 'green',

	// hazel
	hazel: 'hazel',
};
|
|
|
|
// Sexual orientation labels as supplied by scrapers mapped to stored values.
const orientations = {
	// bisexual
	bi: 'bisexual',
	bisexual: 'bisexual',
	biseksueel: 'bisexual',

	// gay
	gay: 'gay',
	homosexual: 'gay',
	homoseksueel: 'gay',

	// straight
	straight: 'straight',
	hetero: 'straight',
	heterosexual: 'straight',
	heteroseksueel: 'straight',
};
|
|
|
|
// Ethnicity labels as supplied by scrapers (looked up trimmed and lowercased) mapped to stored values.
const ethnicities = {
	'african american': 'black',
	'african-american': 'black',
	'native american': 'native american',
	african: 'black',
	arab: 'arabic',
	arabic: 'arabic',
	aravic: 'arabic', // misspelt key kept for backward compatibility
	asian: 'asian',
	black: 'black',
	caucasian: 'white',
	european: 'white',
	hispanic: 'latin',
	indian: 'indian',
	japanese: 'japanese',
	latin: 'latin',
	latina: 'latina',
	latino: 'latino',
	white: 'white',
};
|
|
|
|
// Recognized blood types, keyed by themselves: each ABO group bare, rhesus positive and rhesus negative.
const bloodTypes = Object.fromEntries(['A', 'B', 'AB', 'O']
	.flatMap((group) => ['', '+', '-'].map((rhesus) => `${group}${rhesus}`))
	.map((bloodType) => [bloodType, bloodType]));
|
|
|
|
/**
 * Interprets a scraped yes/no value as a boolean.
 *
 * @param {*} value - A boolean, or a string containing 'yes' or 'no' (case-insensitive).
 * @returns {boolean|null} The boolean, or null when the value is neither a boolean nor a recognized string.
 */
function getBoolean(value) {
	if (typeof value === 'boolean') {
		return value;
	}

	if (typeof value === 'string') {
		if (/yes/i.test(value)) {
			return true;
		}

		if (/no/i.test(value)) {
			return false;
		}
	}

	return null;
}
|
|
|
|
/**
 * Normalizes scraped actors into base actors with a capitalized name, slug and owning entity.
 *
 * @param {Array<string|Object>} actorsOrNames - Actor names, or actor objects with at least a `name`.
 *   A name may carry an entry ID after a colon, e.g. 'Jane Doe:1234'. Other values are dropped.
 * @param {Object} [release] - Release the actors appear in; its entity determines the actor's scope.
 * @returns {Object[]} Base actors; actor objects keep their own properties, overridden by the base properties.
 */
function toBaseActors(actorsOrNames, release) {
	if (!actorsOrNames) {
		return [];
	}

	// using top level parent widens the scope too much, e.g. different Gamma sites may not use the same actor database
	// an independent entity scopes its own actors, any other entity defers to its direct parent when it has one
	const entity = (release?.entity?.independent && release?.entity)
		|| release?.entity?.parent
		|| release?.entity
		|| null;

	return actorsOrNames
		.filter((actorOrName) => actorOrName && (typeof actorOrName === 'string' || actorOrName.name))
		.map((actorOrName) => {
			const [baseName, entryId] = (actorOrName.name || actorOrName).split(':');

			const name = capitalize(baseName);
			const slug = slugify(name);

			const baseActor = {
				name,
				slug,
				// an entry ID is only meaningful within the scope of an entity
				entryId: (entity && (entryId || actorOrName.entryId)) || null,
				suppliedEntryId: entryId,
				entity,
				hasProfile: !!actorOrName.name, // actor contains profile information
			};

			if (actorOrName.name) {
				return {
					...actorOrName,
					...baseActor,
				};
			}

			return baseActor;
		});
}
|
|
|
|
/**
 * Tells whether an actor's name is likely to collide with another actor's, i.e. consists of a single word.
 *
 * @param {Object} actor - Actor with a string `name`.
 * @returns {boolean} True when the name contains exactly one run of word characters (`\w+`).
 *   Names without any word characters yield false instead of throwing.
 */
function getCollisionLikely(actor) {
	// match() returns null rather than an empty array when nothing matches
	const words = actor.name.match(/\w+/g);

	return words?.length === 1;
}
|
|
|
|
/**
 * Converts an actor row (snake_case columns, optionally with joined relations) to the camelCase actor shape.
 *
 * @param {Object} actor - Actor or actor profile row.
 * @param {boolean} [withDetails=false] - Also include alias, entity, physical details, places, avatar and profiles.
 * @param {boolean} [isProfile=false] - When details are included, also include the description.
 * @returns {Object|null} The curated actor, or null when no actor was supplied.
 */
function curateActor(actor, withDetails = false, isProfile = false) {
	if (!actor) {
		return null;
	}

	const curatedActor = {
		id: actor.id,
		name: actor.name,
		slug: actor.slug,
		url: actor.url,
		gender: actor.gender,
		orientation: actor.orientation,
		entityId: actor.entity_id,
		aliasFor: actor.alias_for,
		dateOfBirth: actor.date_of_birth,
		age: actor.age,
		birthCountry: actor.birth_country_alpha2,
	};

	if (!withDetails) {
		return curatedActor;
	}

	return {
		...curatedActor,
		alias: actor.alias && {
			id: actor.alias.id,
			name: actor.alias.name,
			slug: actor.alias.slug, // the aliased actor's own slug, not the slug of the actor pointing at it
			gender: actor.alias.gender,
		},
		entity: actor.entity && {
			id: actor.entity.id,
			name: actor.entity.name,
			slug: actor.entity.slug,
		},
		dateOfDeath: actor.date_of_death,
		cup: actor.cup,
		bust: actor.bust,
		waist: actor.waist,
		hip: actor.hip,
		foot: actor.foot,
		leg: actor.leg,
		thigh: actor.thigh,
		naturalBoobs: actor.natural_boobs,
		penisLength: actor.penis_length,
		penisGirth: actor.penis_girth,
		circumcised: actor.circumcised,
		height: actor.height,
		weight: actor.weight,
		shoeSize: actor.shoe_size,
		eyes: actor.eyes,
		hairColor: actor.hair_color,
		hairType: actor.hair_type,
		hasTattoos: actor.has_tattoos,
		hasPiercings: actor.has_piercings,
		tattoos: actor.tattoos,
		piercings: actor.piercings,
		bloodType: actor.blood_type,
		...(isProfile && { description: actor.description }),
		placeOfBirth: actor.birth_country && {
			country: {
				alpha2: actor.birth_country.alpha2,
				name: actor.birth_country.name,
				alias: actor.birth_country.alias,
			},
			state: actor.birth_state,
			city: actor.birth_city,
		},
		placeOfResidence: actor.residence_country && {
			country: {
				alpha2: actor.residence_country.alpha2,
				name: actor.residence_country.name,
				alias: actor.residence_country.alias,
			},
			state: actor.residence_state,
			city: actor.residence_city,
		},
		avatar: actor.avatar && {
			id: actor.avatar.id,
			path: actor.avatar.path,
			width: actor.avatar.width,
			height: actor.avatar.height,
			size: actor.avatar.size,
			source: actor.avatar.source,
		},
		// profiles are curated recursively as detailed profiles; empty entries become null
		...(actor.profiles && { profiles: actor.profiles.map((profile) => curateActor(profile, true, true)) }),
	};
}
|
|
|
|
/**
 * Builds the `actors` row for a base actor.
 *
 * The entity and entry ID are only stored for names that are likely to collide (see getCollisionLikely);
 * all other actors are stored without them.
 *
 * @param {Object} baseActor - Base actor with name, slug, and optionally entity and entryId.
 * @param {number} batchId - Batch the actor is created in.
 * @returns {Object} Row with name, slug, entity_id, entry_id and batch_id.
 */
function curateActorEntry(baseActor, batchId) {
	const isScoped = getCollisionLikely(baseActor);

	const entityId = isScoped ? (baseActor.entity?.id || null) : null;
	const entryId = isScoped ? (baseActor.entryId || null) : null;

	return {
		name: baseActor.name,
		slug: baseActor.slug,
		entity_id: entityId,
		entry_id: entryId,
		batch_id: batchId,
	};
}
|
|
|
|
/**
 * Builds the `actors` rows for a list of base actors, all assigned to the same batch.
 *
 * @param {Object[]} baseActors - Base actors to convert.
 * @param {number} batchId - Batch the actors are created in.
 * @returns {Object[]} One row per base actor, in the same order.
 */
function curateActorEntries(baseActors, batchId) {
	const toEntry = (baseActor) => curateActorEntry(baseActor, batchId);

	return baseActors.map((baseActor) => toEntry(baseActor));
}
|
|
|
|
/**
 * Builds the `actors_profiles` row for a curated profile.
 *
 * @param {Object} profile - Curated profile in camelCase. `update` holds the ID of the profile to update,
 *   or false for a new profile, in which case no `id` is included.
 * @returns {Object|null} Row in snake_case, or null when the profile is not tied to an actor.
 */
function curateProfileEntry(profile) {
	if (!profile.actorId) {
		return null;
	}

	// copies profile[property] to each column as-is, including undefined values
	const toColumns = (pairs) => Object.fromEntries(pairs.map(([column, property]) => [column, profile[property]]));

	const { placeOfBirth: birth, placeOfResidence: residence } = profile;

	const identityColumns = [
		['date_of_birth', 'dateOfBirth'],
		['date_of_death', 'dateOfDeath'],
		['age', 'age'],
		['url', 'url'],
		['gender', 'gender'],
		['orientation', 'orientation'],
		['ethnicity', 'ethnicity'],
		['description', 'description'],
		['description_hash', 'descriptionHash'],
	];

	const physicalColumns = [
		['cup', 'cup'],
		['bust', 'bust'],
		['waist', 'waist'],
		['leg', 'leg'],
		['thigh', 'thigh'],
		['foot', 'foot'],
		['hip', 'hip'],
		['penis_length', 'penisLength'],
		['penis_girth', 'penisGirth'],
		['circumcised', 'circumcised'],
		['natural_boobs', 'naturalBoobs'],
		['height', 'height'],
		['weight', 'weight'],
		['shoe_size', 'shoeSize'],
		['hair_color', 'hairColor'],
		['hair_type', 'hairType'],
		['eyes', 'eyes'],
		['has_tattoos', 'hasTattoos'],
		['has_piercings', 'hasPiercings'],
		['piercings', 'piercings'],
		['tattoos', 'tattoos'],
		['blood_type', 'bloodType'],
	];

	return {
		...(profile.update !== false && { id: profile.update }),
		actor_id: profile.actorId,
		entity_id: profile.entity?.id || null,
		...toColumns(identityColumns),
		birth_city: birth?.city || null,
		birth_state: birth?.state || null,
		birth_country_alpha2: birth?.country || null,
		residence_city: residence?.city || null,
		residence_state: residence?.state || null,
		residence_country_alpha2: residence?.country || null,
		...toColumns(physicalColumns),
		avatar_media_id: profile.avatarMediaId || null,
	};
}
|
|
|
|
/**
 * Fetches actor profiles together with the actor's name and the avatar media row.
 *
 * @param {Array<number|string>} [actorIdsOrNames] - Actor IDs (numbers) and/or actor names (strings).
 *   Names only match actors without an entity. When omitted, all profiles are fetched.
 * @returns {Promise<Object[]>} Profile rows extended with `name` and `avatar`.
 */
async function fetchProfiles(actorIdsOrNames) {
	const profilesQuery = knex('actors_profiles')
		.select(knex.raw('actors_profiles.*, actors.name, row_to_json(media) as avatar'))
		.leftJoin('actors', 'actors.id', 'actors_profiles.actor_id');

	if (actorIdsOrNames) {
		const actorIds = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number');

		profilesQuery
			.whereIn('actor_id', actorIds)
			.orWhere((builder) => {
				const actorNames = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string');

				builder
					.whereIn('actors.name', actorNames)
					.whereNull('actors.entity_id');
			});
	}

	return profilesQuery.leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id');
}
|
|
|
|
/**
 * Runs the shared profile interpolation from ../common/actors.mjs for the given actors.
 *
 * Interpolation is best-effort: failures are logged through the module logger and not rethrown.
 *
 * @param {Array<number|string>} actorIdsOrNames - Actors to interpolate, passed through to the shared utility.
 * @param {boolean} [refreshView] - Passed through to the shared utility as the `refreshView` option.
 * @returns {Promise<void>}
 */
async function interpolateProfiles(actorIdsOrNames, refreshView) {
	const { interpolateProfiles: interpolateProfilesUtil } = await actorsCommon;

	try {
		await interpolateProfilesUtil(actorIdsOrNames, commonContext, { refreshView });
	} catch (error) {
		logger.error(`Failed to interpolate actor profiles: ${error.message} ${error.stack}`);
	}
}
|
|
|
|
/**
 * Normalizes a scraped actor profile: sanitizes the description, maps free-form labels to known values,
 * extracts numeric measurements, resolves places and attaches base scenes.
 *
 * @param {Object} profile - Scraped profile merged with actor, entity and update information.
 * @param {Object} [actor] - Actor the profile belongs to; added to scenes that do not list it.
 * @returns {Promise<Object|null>} The curated profile, or null when no profile was supplied or curation failed.
 */
async function curateProfile(profile, actor) {
	if (!profile) {
		return null;
	}

	const { resolvePlace } = await geoCommon;

	// a number as-is, otherwise the first run of digits in a string (returned as a string), otherwise null
	const toNumeric = (value) => Number(value) || value?.match?.(/\d+/)?.[0] || null;

	// profiles curated from release actors may not have an entity
	const scraperName = profile.entity?.name;

	try {
		const curatedProfile = {
			// id: profile.id,
			update: profile.update,
			actorId: profile.actorId,
			profileId: profile.profileId,
			name: profile.name,
			url: profile.url,
			avatar: profile.avatar,
			scraper: profile.scraper,
			entity: profile.entity,
		};

		// strip all markup and collapse whitespace
		curatedProfile.description = domPurify.sanitize(profile.description?.replace(/\s+/g, ' '), { ALLOWED_TAGS: [] }).trim() || null;

		const hasher = curatedProfile.description && blake2
			.createHash('blake2b', { digestLength: 24 })
			.update(Buffer.from(slugify(curatedProfile.description)));

		curatedProfile.descriptionHash = curatedProfile.description && hasher.digest('hex');

		curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available

		curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null;
		curatedProfile.hairType = profile.hairType?.trim() || null;
		curatedProfile.hairColor = hairColors[(profile.hairColor || profile.hair)?.toLowerCase().replace('hair', '').trim()] || null;
		// replace() needs an explicit empty replacement, otherwise the match is replaced by the text 'undefined'
		curatedProfile.eyes = eyeColors[profile.eyes?.replace(/eyes?/i, '').trim().toLowerCase()] || null;

		curatedProfile.tattoos = profile.tattoos?.trim() || null;
		curatedProfile.piercings = profile.piercings?.trim() || null;

		// test for 'female' before 'male', as the latter also matches the former
		curatedProfile.gender = (/female/i.test(profile.gender) && 'female')
			|| (/shemale|trans/i.test(profile.gender) && 'transsexual')
			|| (/male/i.test(profile.gender) && 'male')
			|| null;

		// lowercased like the other label lookups, the orientation keys are all lowercase
		curatedProfile.orientation = orientations[profile.orientation?.trim().toLowerCase()] || null;

		const dateOfBirth = profile.dateOfBirth || profile.birthdate;

		curatedProfile.dateOfBirth = (!Number.isNaN(Number(dateOfBirth)) // possibly valid date
			&& new Date() - dateOfBirth > 567648000000 // over 18
			&& dateOfBirth)
			|| null;

		curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;
		curatedProfile.age = Number(profile.age) || null;

		curatedProfile.height = toNumeric(profile.height);
		curatedProfile.weight = toNumeric(profile.weight);
		curatedProfile.shoeSize = toNumeric(profile.shoeSize);

		// separate measurement values
		curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null;
		curatedProfile.bust = toNumeric(profile.bust);
		curatedProfile.waist = toNumeric(profile.waist);
		curatedProfile.hip = toNumeric(profile.hip);

		curatedProfile.leg = toNumeric(profile.leg);
		curatedProfile.thigh = toNumeric(profile.thigh);
		curatedProfile.foot = toNumeric(profile.foot);

		// combined measurement value
		// ExCoGi uses x, Jules Jordan has spaces between the dashes, SpermMenia/Cum Buffet sometimes misses cup
		const measurements = profile.measurements?.match(/(\d+)([a-z]+)?(?:\s*[-x]\s*(\d+)\s*[-x]\s*(\d+))?/i);

		if (measurements) {
			// the combined value takes precedence over the separate values
			curatedProfile.bust = Number(measurements[1]) || null;
			curatedProfile.cup = measurements[2] || null;
			curatedProfile.waist = Number(measurements[3]) || null;
			curatedProfile.hip = Number(measurements[4]) || null;
		}

		curatedProfile.penisLength = toNumeric(profile.penisLength);
		curatedProfile.penisGirth = toNumeric(profile.penisGirth);

		curatedProfile.circumcised = getBoolean(profile.circumcised);
		curatedProfile.naturalBoobs = getBoolean(profile.naturalBoobs);
		curatedProfile.hasTattoos = getBoolean(profile.hasTattoos);
		curatedProfile.hasPiercings = getBoolean(profile.hasPiercings);
		curatedProfile.bloodType = bloodTypes[profile.bloodType?.trim().toUpperCase()] || null;

		if (argv.resolvePlace) {
			const [placeOfBirth, placeOfResidence] = await Promise.all([
				resolvePlace(profile.birthPlace, commonContext, { useCache: argv.placeCache, userAgent: config.location.userAgent }),
				resolvePlace(profile.residencePlace, commonContext, { useCache: argv.placeCache, userAgent: config.location.userAgent }),
			]);

			curatedProfile.placeOfBirth = placeOfBirth;
			curatedProfile.placeOfResidence = placeOfResidence;
		}

		if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
			// fall back to the country matching the nationality
			const country = await knex('countries')
				.where('nationality', 'ilike', `%${curatedProfile.nationality}%`)
				.orWhere('alpha3', 'ilike', `%${curatedProfile.nationality}%`)
				.orWhere('alpha2', 'ilike', `%${curatedProfile.nationality}%`)
				.orderBy('priority', 'desc')
				.first();

			if (country) {
				curatedProfile.placeOfBirth = {
					country: country.alpha2,
				};
			}
		}

		// keep origin and path of each valid social URL, dropping the query string and hash
		curatedProfile.social = [].concat(profile.social).map((social) => {
			if (!social) {
				return null;
			}

			try {
				const { origin, pathname } = new URL(social);

				return `${origin}${pathname}`;
			} catch (error) {
				logger.warn(`Profile scraper for '${scraperName}' returned invalid social link: ${social}`);
				return null;
			}
		}).filter(Boolean);

		curatedProfile.scenes = toBaseReleases(profile.scenes || profile.releases, profile.entity, actor)
			// attach actor to base scene, in case it was not scraped
			.map((scene) => {
				if (actor && !scene.actors?.find((sceneActor) => slugify(sceneActor) === actor.slug || slugify(sceneActor.name) === actor.slug)) {
					return {
						...scene,
						actors: [actor, ...(scene.actors || [])],
					};
				}

				return scene;
			});

		if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${scraperName}' scraper: ${profile.ethnicity}`);
		if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${scraperName}' scraper: ${profile.hairColor || profile.hair}`);
		if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${scraperName}' scraper: ${profile.eyes}`);

		return curatedProfile;
	} catch (error) {
		logger.error(`Failed to curate '${profile.name}': ${error.message}`);

		return null;
	}
}
|
|
|
|
/**
 * Inserts new profile rows into `actors_profiles`.
 *
 * @param {Object[]} newProfiles - Profile rows to insert.
 * @returns {Promise<Object[]>} The result of the bulk insert, or an empty array when there was nothing to insert.
 */
async function insertProfiles(newProfiles) {
	const hasProfiles = newProfiles.length > 0;

	if (!hasProfiles) {
		return [];
	}

	const insertedEntries = await bulkInsert('actors_profiles', newProfiles);

	logger.info(`Saved ${newProfiles.length} actor profiles`);

	return insertedEntries;
}
|
|
|
|
/**
 * Inserts new profiles, updates existing profiles when --force is set, and registers profile avatars.
 *
 * @param {Object[]} profiles - Curated profiles; `update` holds the ID of the existing profile, if any.
 * @returns {Promise<void>}
 */
async function upsertProfiles(profiles) {
	const newProfileEntries = profiles.filter((profile) => !profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean);
	const updatingProfileEntries = profiles.filter((profile) => profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean);

	const newProfiles = await insertProfiles(newProfileEntries);

	if (argv.force && updatingProfileEntries.length > 0) {
		const transaction = await knex.transaction();

		try {
			await Promise.all(updatingProfileEntries.map((profileEntry) => knex('actors_profiles')
				.where('id', profileEntry.id)
				.update(profileEntry)
				.returning(['id', 'actor_id'])
				.transacting(transaction)));

			await transaction.commit();

			logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`);
		} catch (error) {
			// updates remain best-effort, but a rollback must not be reported as a success
			await transaction.rollback(error);

			logger.error(`Failed to update ${updatingProfileEntries.length} actor profiles: ${error.message}`);
		}
	}

	if (profiles.length > 0) {
		// new profile IDs by actor ID and entity ID, to link avatars to profiles that were just inserted
		const newProfileIdMap = newProfiles.reduce((acc, profile) => {
			if (!acc[profile.actor_id]) {
				acc[profile.actor_id] = {};
			}

			acc[profile.actor_id][profile.entity_id] = profile.id;

			return acc;
		}, {});

		const avatars = profiles.filter((profile) => !!profile.avatarMediaId).map((profile) => ({
			actor_id: profile.actorId,
			profile_id: profile.profileId || newProfileIdMap[profile.actorId]?.[profile.entity?.id],
			media_id: profile.avatarMediaId,
		}));

		if (avatars.length > 0) {
			await knex('actors_avatars')
				.insert(avatars)
				.onConflict()
				.ignore();
		}
	}
}
|
|
|
|
/**
 * Scrapes the profiles of one actor from every applicable source.
 *
 * A source is either a scraper slug, or a group of slugs that are tried in order until one yields an outcome.
 * Actors tied to an entity are only scraped from the source equal to that entity's slug.
 *
 * @param {Object} actor - Actor row, optionally with its `entity`.
 * @param {Array<string|string[]>} sources - Scraper slugs or groups of scraper slugs.
 * @param {Object} entitiesBySlug - Entities keyed by scraper slug.
 * @param {Object} existingProfilesByActorEntityId - Existing profile rows keyed by actor ID, then entity ID.
 * @returns {Promise<Object[]>} Curated profiles; sources that failed or were skipped are left out.
 */
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
	const validSources = actor.entity ? sources.filter((source) => source === actor.entity.slug) : sources;

	const profiles = Promise.map(validSources, async (source) => {
		try {
			// config may group sources to try until success
			// the chain starts out rejected, each scraper only runs when everything before it failed
			return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
				try {
					const entity = entitiesBySlug[scraperSlug] || null;

					const scraper = scrapers[scraperSlug];
					const layoutScraper = resolveLayoutScraper(entity, scraper);

					if (!layoutScraper?.fetchProfile) {
						logger.warn(`No profile profile scraper available for ${scraperSlug}`);
						throw new Error(`No profile profile scraper available for ${scraperSlug}`);
					}

					const context = {
						...entity,
						// legacy
						site: entity,
						channel: entity,
						network: entity?.parent,
						entity,
						include,
						scraper: scraperSlug,
						parameters: getRecursiveParameters(entity),
					};

					const label = context.entity?.name;

					if (!context.entity) {
						logger.warn(`No entity found for ${scraperSlug}`);
						throw new Error(`No entity found for ${scraperSlug}`);
					}

					const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];

					if (existingProfile && !argv.force) {
						logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);

						// resolving with null ends the chain without trying the remaining scrapers in the group
						return null;
					}

					logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);

					const profile = await layoutScraper.fetchProfile(curateActor({
						...existingProfile,
						...actor,
					}), context, include);

					if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
						logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
						throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
					}

					logger.verbose(`Found profile for '${actor.name}' on '${label}'`);

					return await curateProfile({
						...actor,
						...profile,
						entity,
						actorId: actor.id,
						profileId: existingProfile?.id,
						update: existingProfile?.id || false,
					}, actor);
				} catch (error) {
					if (error.code !== 'PROFILE_NOT_AVAILABLE') {
						logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
					}

					// throw error to try next source
					throw error;
				}
			}), Promise.reject(new Error()));
		} catch (error) {
			// an unavailable profile is an expected outcome and was already reported at verbose level
			if (error.code !== 'PROFILE_NOT_AVAILABLE') {
				logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
			}
		}

		return null;
	});

	return profiles.filter(Boolean);
}
|
|
|
|
/**
 * Stores the social links of profiles that already have a row in `actors_profiles`.
 *
 * Profiles without social links, or without a stored profile row, are skipped.
 * Inserts run one profile at a time and have all completed when the returned promise resolves.
 *
 * @param {Object[]} profiles - Curated profiles with `actorId`, `entity` and a `social` array of URLs.
 * @returns {Promise<void>}
 */
async function associateSocials(profiles) {
	if (profiles.length === 0) {
		return;
	}

	const profileEntries = await knex('actors_profiles')
		.whereIn(['actor_id', 'entity_id'], profiles.map((profile) => [profile.actorId, profile.entity?.id ?? null]));

	const profileEntriesByActorIdAndEntityId = profileEntries.reduce((acc, profileEntry) => {
		if (!acc[profileEntry.actor_id]) {
			acc[profileEntry.actor_id] = {};
		}

		acc[profileEntry.actor_id][profileEntry.entity_id] = profileEntry.id;

		return acc;
	}, {});

	// the chain must be awaited, otherwise callers continue before the inserts finish and failures go unhandled
	await profiles.reduce(async (chain, profile) => {
		await chain;

		if (!Array.isArray(profile.social) || profile.social.length === 0) {
			return;
		}

		const profileId = profileEntriesByActorIdAndEntityId[profile.actorId]?.[profile.entity?.id];

		if (!profileId) {
			return;
		}

		await knex('actors_socials')
			.insert(profile.social.map((url) => ({
				url,
				// the label directly before the top-level domain, e.g. 'twitter' for www.twitter.com
				platform: new URL(url).hostname.match(/([\w-]+)?\.(\w+)$/)?.[1],
				actor_id: profile.actorId,
				profile_id: profileId,
			})))
			.onConflict()
			.ignore();
	}, Promise.resolve());
}
|
|
|
|
/**
 * Determines which actors to scrape profiles for.
 *
 * @param {string[]} actorNames - Explicitly requested actor names.
 * @returns {Promise<string[]>} The requested names when any were given. Otherwise the names of all actors
 *   that have no profile updated at or before `argv.actorsUpdate`, or the current time when that is not set.
 */
async function getActorNames(actorNames) {
	const hasRequestedNames = actorNames.length > 0;

	if (hasRequestedNames) {
		return actorNames;
	}

	const updatedBefore = argv.actorsUpdate || new Date();

	const { rows } = await knex.raw(`
		SELECT actors.name
		FROM actors
		WHERE NOT EXISTS (
			SELECT *
			FROM actors_profiles
			WHERE actors_profiles.actor_id = actors.id
			AND actors_profiles.updated_at <= (?)
		)
	`, [updatedBefore]);

	return rows.map(({ name }) => name);
}
|
|
|
|
/**
 * Stores curated profiles: associates avatars, upserts the profiles, stores their social links
 * and interpolates the profiles of the affected actors.
 *
 * @param {Array<Object|null>} profiles - Curated profiles; empty entries (failed curation) are ignored.
 * @returns {Promise<void>}
 */
async function storeProfiles(profiles) {
	const validProfiles = profiles.filter(Boolean);

	const profilesWithAvatarIds = await associateAvatars(validProfiles);
	const actorIds = Array.from(new Set(validProfiles.map((profile) => profile.actorId)));

	await upsertProfiles(profilesWithAvatarIds);

	// socials are linked to stored profile rows, so new profiles must be inserted first
	await associateSocials(validProfiles);
	await interpolateProfiles(actorIds);
}
|
|
|
|
/**
 * Scrapes profiles for the given actors, creating actors that do not exist yet.
 *
 * Sources are taken from `argv.profileSources`, then `config.profiles`, then all available actor scrapers.
 * Profiles are printed when `argv.report` is set and stored when `argv.save` is set.
 *
 * @param {string[]} argNames - Actor names; when empty, the names are determined by getActorNames.
 * @returns {Promise<Object[]>} The curated profiles that were scraped.
 */
async function scrapeActors(argNames) {
	const actorNames = await getActorNames(argNames);
	const baseActors = toBaseActors(actorNames);

	logger.info(`Scraping profiles for ${actorNames.length} actors`);

	// `scrapers` already is the map of actor scrapers by slug
	const sources = argv.profileSources || config.profiles || Object.keys(scrapers);
	const entitySlugs = sources.flat();

	const [entitiesBySlug, existingActorEntries] = await Promise.all([
		fetchEntitiesBySlug(entitySlugs, { types: ['channel', 'network', 'info'] }),
		knex('actors')
			.select(knex.raw('actors.id, actors.name, actors.slug, actors.entry_id, actors.entity_id, row_to_json(entities) as entity'))
			.whereIn('actors.slug', baseActors.map((baseActor) => baseActor.slug))
			.whereNull('actors.alias_for')
			.leftJoin('entities', 'entities.id', 'actors.entity_id')
			.groupBy('actors.id', 'entities.id'),
	]);

	// NOTE(review): the rows select entry_id, so entryId looks to be undefined here and every actor is keyed under 'null' - confirm before changing
	const existingActorEntriesBySlugAndEntryId = existingActorEntries.reduce((acc, actorEntry) => ({
		...acc,
		[actorEntry.slug]: {
			...acc[actorEntry.slug],
			[actorEntry.entryId || null]: actorEntry,
		},
	}), {});

	const newBaseActors = baseActors.filter((baseActor) => !existingActorEntriesBySlugAndEntryId[baseActor.slug]?.[baseActor.entryId]);

	// a batch is only created when there are actors to insert
	const [{ id: batchId }] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [{ id: null }];
	const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);

	// TODO: associate entity when entry ID is provided

	const newActorEntries = batchId && await bulkInsert('actors', curatedActorEntries);

	const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

	const existingProfiles = await knex('actors_profiles')
		.select(knex.raw('actors_profiles.*, row_to_json(avatars) as avatar'))
		.whereIn('actor_id', actors.map((actor) => actor.id))
		.leftJoin('media as avatars', 'avatars.id', 'actors_profiles.avatar_media_id');

	const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
		...acc,
		[profile.actor_id]: {
			...acc[profile.actor_id],
			[profile.entity_id]: profile,
		},
	}), {});

	const profilesPerActor = await Promise.map(
		actors,
		async (actor) => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
		{ concurrency: 10 },
	);

	const profiles = profilesPerActor.flat().filter(Boolean);

	logger.info(`Scraped ${profiles.length} profiles`);

	if (argv.report) {
		console.log(util.inspect(profiles, { depth: Infinity, colors: true }));
	}

	if (argv.save) {
		await storeProfiles(profiles);
	}

	return profiles;
}
|
|
|
|
/**
 * Finds the actors matching the given base actors, creates the ones that do not exist yet,
 * and stores the profile information supplied with newly created actors.
 *
 * A base actor matches an existing actor with the same slug when that actor has no entity and the name
 * is not collision-prone, or when entity and entry ID both match.
 *
 * @param {Object[]} baseActors - Base actors as produced by toBaseActors, each with an entity.
 * @param {number} batchId - Batch new actors are created in.
 * @returns {Promise<Object[]>} Newly created actor rows followed by the existing actor rows.
 */
async function getOrCreateActors(baseActors, batchId) {
	if (baseActors.length === 0) {
		// an empty VALUES list is not valid SQL
		return [];
	}

	// nests a value per actor row as index[entity_id][entry_id][slug]; null IDs end up under the key 'null'
	const indexActors = (actors, getValue) => actors.reduce((acc, actor) => {
		if (!acc[actor.entity_id]) {
			acc[actor.entity_id] = {};
		}

		if (!acc[actor.entity_id][actor.entry_id]) {
			acc[actor.entity_id][actor.entry_id] = {};
		}

		acc[actor.entity_id][actor.entry_id][actor.slug] = getValue(actor);

		return acc;
	}, {});

	// WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
	const actorValues = baseActors.map((actor) => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', {
		slug: actor.slug,
		entityId: actor.entity.id,
		entryId: actor.entryId,
		collisionLikely: getCollisionLikely(actor),
	})).join(', ');

	const existingActors = await knex
		.select('actors.*')
		.from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`))
		.whereRaw(`
			actors.slug = base_actors.slug
			AND actors.entity_id IS NULL
			AND NOT base_actors.collision_likely
		`)
		.orWhereRaw(`
			actors.slug = base_actors.slug
			AND actors.entity_id = base_actors.entity_id
			AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL)
				OR actors.entry_id = base_actors.entry_id)
		`);

	const existingActorSlugs = indexActors(existingActors, () => true);

	const uniqueBaseActors = baseActors.filter((baseActor) => !existingActorSlugs[baseActor.entity?.id]?.[baseActor.entryId]?.[baseActor.slug]
		&& !existingActorSlugs.null?.null?.[baseActor.slug]);

	const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
	const newActors = await bulkInsert('actors', curatedActorEntries);

	const newActorIdsByEntityIdEntryIdAndSlug = indexActors(newActors, (actor) => actor.id);

	const newActorProfiles = await Promise.all(baseActors
		.filter((actor) => actor.hasProfile)
		.map((actor) => ({
			...actor,
			actorId: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug],
		}))
		// only actors that were just created get a profile; the ID lives under actorId, not id
		.filter((actor) => !!actor.actorId)
		.map((actor) => curateProfile(actor)));

	// curateProfile resolves with null for profiles it could not curate
	await storeProfiles(newActorProfiles.filter(Boolean));

	if (Array.isArray(newActors)) {
		return newActors.concat(existingActors);
	}

	return existingActors;
}
|
|
|
|
/**
 * Links the actors or directors of releases to actor rows, creating actors where needed.
 *
 * Associations are written to `releases_actors` or `releases_directors`. Failures are logged and
 * result in an empty array rather than a rejection.
 *
 * @param {Object[]} releases - Stored releases with `id`, and `actors` or `director`/`directors`.
 * @param {number} batchId - Batch new actors are created in.
 * @param {string} [type='actor'] - 'actors' or 'directors'.
 *   NOTE(review): the default matches neither value, so omitting it associates nothing; the wrappers in this file always pass it.
 * @returns {Promise<Object[]>} The actor rows involved, or an empty array when there was nothing to associate.
 */
async function associatePeople(releases, batchId, type = 'actor') {
	try {
		const baseActorsByReleaseId = releases.reduce((acc, release) => {
			if (type === 'actors' && release.actors) {
				acc[release.id] = toBaseActors(release.actors, release);
			}

			if (type === 'directors' && (release.director || release.directors)) {
				acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release);
			}

			return acc;
		}, {});

		const baseActors = Object.values(baseActorsByReleaseId).flat();

		if (baseActors.length === 0) {
			return [];
		}

		// one base actor per slug, the last occurrence wins
		const baseActorsBySlug = Object.fromEntries(baseActors.map((baseActor) => [baseActor.slug, baseActor]));

		const uniqueBaseActors = Object.values(baseActorsBySlug);
		const actors = await getOrCreateActors(uniqueBaseActors, batchId);

		const personKey = ({
			actors: 'actor_id',
			directors: 'director_id',
		})[type];

		// association columns as index[entity_id][entry_id][slug]; null IDs end up under the key 'null'
		const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => {
			if (!acc[actor.entity_id]) {
				acc[actor.entity_id] = {};
			}

			if (!acc[actor.entity_id][actor.entry_id]) {
				acc[actor.entity_id][actor.entry_id] = {};
			}

			// an alias is associated through the actor it is an alias for
			acc[actor.entity_id][actor.entry_id][actor.slug] = {
				[personKey]: actor.alias_for || actor.id,
				alias_id: actor.alias_for ? actor.id : null,
			};

			return acc;
		}, {});

		const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
			.flatMap(([releaseId, releaseActors]) => releaseActors.map((releaseActor) => ({
				release_id: releaseId,
				// fall back to the entity-less actor; an unmatched actor yields an association without person ID, filtered out below
				...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug]
					|| actorIdsByEntityIdEntryIdAndSlug.null?.null?.[releaseActor.slug]),
			})));

		const validReleaseActorAssociations = releaseActorAssociations.filter((association) => association.release_id && association[personKey]);

		if (releaseActorAssociations.length > validReleaseActorAssociations.length) {
			const invalidReleaseActorAssociations = releaseActorAssociations.filter((association) => !association.release_id || !association[personKey]);

			logger.error(invalidReleaseActorAssociations);
		}

		await bulkInsert(`releases_${type}`, validReleaseActorAssociations, false);
		await knex.schema.refreshMaterializedView('actors_meta');

		logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);

		return actors;
	} catch (error) {
		logger.error(`Failed to associate actors: ${error.message} ${error.stack}`);

		return [];
	}
}
|
|
|
|
/**
 * Associate the actors of the given releases.
 * Thin wrapper that delegates to associatePeople with the 'actors' type,
 * which makes it write to the `releases_actors` table.
 *
 * @param {Array} releases - releases to associate, forwarded unchanged
 * @param {*} batchId - batch identifier, forwarded unchanged
 * @returns {Promise<*>} whatever associatePeople resolves to
 */
async function associateActors(releases, batchId) {
	const associatedActors = await associatePeople(releases, batchId, 'actors');

	return associatedActors;
}
|
|
|
|
/**
 * Associate the directors of the given releases.
 * Thin wrapper that delegates to associatePeople with the 'directors' type,
 * which makes it write to the `releases_directors` table.
 *
 * @param {Array} releases - releases to associate, forwarded unchanged
 * @param {*} batchId - batch identifier, forwarded unchanged
 * @returns {Promise<*>} whatever associatePeople resolves to
 */
async function associateDirectors(releases, batchId) {
	const associatedDirectors = await associatePeople(releases, batchId, 'directors');

	return associatedDirectors;
}
|
|
|
|
/**
 * Fetch a single actor together with its entity, alias target, birth and
 * residence countries, avatar media and aggregated profiles.
 *
 * @param {number|string} actorId - values that do not convert to a number
 *   are matched against `actors.slug`, everything else against `actors.id`
 * @returns {Promise<*>} the result of curateActor(row, true) for the first
 *   matching row
 */
async function fetchActor(actorId) {
	// anything that does not parse as a number is treated as a slug
	const lookupColumn = Number.isNaN(Number(actorId))
		? 'actors.slug'
		: 'actors.id';

	const selection = knex.raw(`
		actors.*,
		row_to_json(entities) as entity,
		row_to_json(actor_alias) as alias,
		row_to_json(birth_country) as birth_country,
		row_to_json(residence_country) as residence_country,
		row_to_json(media) as avatar,
		json_agg(actors_profiles) as profiles
	`);

	const actorRow = await knex('actors')
		.select(selection)
		.where(lookupColumn, actorId)
		.leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
		.leftJoin('actors_profiles', 'actors.id', 'actors_profiles.actor_id')
		.leftJoin('entities', 'entities.id', 'actors.entity_id')
		.leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
		.leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
		.leftJoin('media', 'media.id', 'actors.avatar_media_id')
		// profiles are aggregated with json_agg, so every other selected relation is grouped
		.groupBy('actors.id', 'entities.id', 'actor_alias.id', 'birth_country.alpha2', 'residence_country.alpha2', 'media.id')
		.first();

	return curateActor(actorRow, true);
}
|
|
|
|
/**
 * Search actors through the `search_actors` database function.
 *
 * @param {string} query - search term, passed as a bound parameter
 * @returns {Promise<Array>} at most 100 rows, each passed through curateActor
 */
async function searchActors(query) {
	const searchSource = knex.raw('search_actors(?) as actors', [query]);

	const rows = await knex
		.select('*')
		.from(searchSource)
		.limit(100);

	return rows.map((row) => curateActor(row));
}
|
|
|
|
/**
 * Delete the profiles fetched for the given actors, re-interpolate those
 * actors, flush orphaned media and log a summary of what was removed.
 *
 * @param {Array<number|string>} actorIdsOrNames - forwarded to fetchProfiles
 *   and interpolateProfiles
 * @returns {Promise<void>}
 */
async function flushProfiles(actorIdsOrNames) {
	const profiles = await fetchProfiles(actorIdsOrNames);

	const profileIds = profiles.map((profile) => profile.id);
	const actorNames = [...new Set(profiles.map((profile) => profile.name))];

	const deleteCount = await knex('actors_profiles')
		.whereIn('id', profileIds)
		.delete();

	await interpolateProfiles(actorIdsOrNames);
	await flushOrphanedMedia(); // don't flush until main avatar is detached by re-interpolating

	// pick the log message: long name lists are summarized by count, short ones are spelled out
	let summary = `Removed ${deleteCount} profiles`;

	if (actorNames.length > 20) {
		summary = `Removed ${deleteCount} profiles for ${actorNames.length} actors`;
	} else if (deleteCount > 0) {
		summary = `Removed ${deleteCount} profiles for ${actorNames.join(', ')}`;
	}

	logger.info(summary);
}
|
|
|
|
/**
 * Delete actors by ID or name, along with every scene they are linked to
 * through `releases_actors`, then flush orphaned media.
 *
 * Numeric entries are matched against `actors.id`; string entries are matched
 * against `actors.name`, restricted to actors without an `entity_id`.
 * The input is split with chunk() and up to 10 chunks are processed at once
 * (Promise is bluebird in this file).
 *
 * @param {Array<number|string>} allActorIdsOrNames - actor IDs and/or names
 * @returns {Promise<number>} sum of the per-chunk actor delete counts
 */
async function deleteActors(allActorIdsOrNames) {
	const deleteChunk = async (actorIdsOrNames) => {
		const ids = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number');
		const names = actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string');

		const matchedActors = await knex('actors')
			.whereIn('id', ids)
			.orWhere((builder) => {
				builder
					.whereIn('name', names)
					.whereNull('entity_id');
			});

		const actorIds = matchedActors.map((actor) => actor.id);

		const sceneIds = await knex('releases_actors')
			.select('releases.id')
			.whereIn('actor_id', actorIds)
			.leftJoin('releases', 'releases.id', 'releases_actors.release_id')
			.pluck('id');

		// scenes and actors are removed concurrently
		const [deletedScenesCount, deletedActorsCount] = await Promise.all([
			deleteScenes(sceneIds),
			knex('actors')
				.whereIn('id', actorIds)
				.delete(),
		]);

		return { deletedScenesCount, deletedActorsCount };
	};

	const deleteCounts = await Promise.map(chunk(allActorIdsOrNames), deleteChunk, { concurrency: 10 });

	let deletedActorsCount = 0;
	let deletedScenesCount = 0;

	for (const count of deleteCounts) {
		deletedActorsCount += count.deletedActorsCount;
		deletedScenesCount += count.deletedScenesCount;
	}

	await flushOrphanedMedia();

	logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`);

	return deletedActorsCount;
}
|
|
|
|
/**
 * Interactively remove every actor in the database.
 *
 * Loads all actor IDs, asks for confirmation on the terminal (defaulting to
 * "no") and, when confirmed, hands the IDs to deleteActors.
 *
 * @returns {Promise<void>}
 */
async function flushActors() {
	const actorIds = await knex('actors').select('id').pluck('id');

	const { flushActors: isConfirmed } = await inquirer.prompt([{
		type: 'confirm',
		name: 'flushActors',
		message: `You are about to remove ${actorIds.length} actors. Are you sure?`,
		default: false,
	}]);

	if (!isConfirmed) {
		logger.warn('Confirmation rejected, not flushing actors');
		return;
	}

	// deleteActors flushes orphaned media itself once all chunks are removed,
	// so no additional flush is performed here
	const deleteCount = await deleteActors(actorIds);

	logger.info(`Removed ${deleteCount}/${actorIds.length} actors`);
}
|
|
|
|
module.exports = {
|
|
associateActors,
|
|
associateDirectors,
|
|
deleteActors,
|
|
fetchActor,
|
|
flushActors,
|
|
flushProfiles,
|
|
interpolateProfiles,
|
|
scrapeActors,
|
|
searchActors,
|
|
toBaseActors,
|
|
};
|