// traxxx/src/actors.js
'use strict';
const config = require('config');
const util = require('util');
const Promise = require('bluebird');
2020-05-17 01:00:44 +00:00
const moment = require('moment');
const blake2 = require('blake2');
const DOMPurify = require('dompurify');
const { JSDOM } = require('jsdom');
2020-12-30 01:23:43 +00:00
const omit = require('object.omit');
const { window } = new JSDOM('');
const domPurify = DOMPurify(window);
2020-05-13 00:56:20 +00:00
// const logger = require('./logger')(__filename);
const knex = require('./knex');
2020-05-15 02:40:59 +00:00
const scrapers = require('./scrapers/scrapers').actors;
const argv = require('./argv');
2020-05-15 02:40:59 +00:00
const include = require('./utils/argv-include')(argv);
const bulkInsert = require('./utils/bulk-insert');
2020-05-15 02:40:59 +00:00
const logger = require('./logger')(__filename);
2020-05-17 01:00:44 +00:00
const { toBaseReleases } = require('./deep');
2020-12-30 01:23:43 +00:00
const { associateAvatars, flushOrphanedMedia } = require('./media');
2020-12-30 02:19:09 +00:00
const { deleteScenes } = require('./releases');
2020-05-17 01:00:44 +00:00
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
2020-05-15 02:40:59 +00:00
const resolvePlace = require('./utils/resolve-place');
// Lookup from a scraped hair color description to the canonical color.
// Declared per canonical color, then inverted into a flat { description: color } table.
const hairColors = Object.fromEntries(Object.entries({
  black: ['jet-black', 'soft-black', 'black', 'raven'],
  red: ['red-head', 'red', 'redhead'],
  blonde: ['blonde', 'blondie', 'fair'],
  brown: ['brown', 'brunette'],
  blue: ['blue'],
  green: ['green'],
  purple: ['purple'],
  pink: ['pink'],
}).flatMap(([color, descriptions]) => descriptions.map(description => [description, color])));
// Lookup from a scraped eye color description to the canonical color.
// Declared per canonical color, then inverted into a flat { description: color } table.
const eyeColors = Object.fromEntries(Object.entries({
  blue: ['blue'],
  brown: ['brown', 'dark'],
  gray: ['gray', 'grey'],
  green: ['green'],
  hazel: ['hazel'],
}).flatMap(([color, descriptions]) => descriptions.map(description => [description, color])));
// Lookup from a scraped ethnicity description (lowercased) to the canonical value.
const ethnicities = {
  'african american': 'black',
  'african-american': 'black',
  'native american': 'native american',
  african: 'black',
  arabic: 'arabic',
  aravic: 'arabic', // misspelled key kept so input that matched before still matches
  asian: 'asian',
  black: 'black',
  caucasian: 'white',
  european: 'white',
  hispanic: 'latin',
  indian: 'indian',
  japanese: 'japanese',
  latin: 'latin',
  // NOTE(review): 'latina' and 'latino' map to themselves rather than 'latin' — confirm this is intended
  latina: 'latina',
  latino: 'latino',
  white: 'white',
};
/**
 * Interpret a scraped yes/no value.
 *
 * @param {*} value - a boolean, or a string containing 'yes' or 'no' (case-insensitive)
 * @returns {boolean|null} the boolean itself, true for 'yes', false for 'no', null for anything else
 */
function getBoolean(value) {
  if (typeof value === 'boolean') {
    return value;
  }

  if (typeof value === 'string') {
    // 'yes' is tested first, so a string containing both resolves to true
    if (/yes/i.test(value)) {
      return true;
    }

    if (/no/i.test(value)) {
      return false;
    }
  }

  return null;
}
2020-05-17 01:00:44 +00:00
/**
 * Find the value that occurs most often, counting occurrences by their slug.
 * null and undefined items are ignored. On a tie the value seen first wins,
 * and the returned value is the first-seen variant of the winning slug.
 *
 * @param {Array} items - values to count
 * @returns {*} the most frequent item, or null when there is none
 */
function getMostFrequent(items) {
  const { mostFrequent } = items.reduce((acc, item) => {
    if (item === undefined || item === null) {
      return acc;
    }

    const slug = slugify(item);

    acc.counts[slug] = (acc.counts[slug] || 0) + 1;

    // compare against null explicitly: falsy candidates such as 0 (January) or false are valid winners
    if (acc.mostFrequent === null || acc.counts[slug] > acc.counts[slugify(acc.mostFrequent)]) {
      acc.mostFrequent = item;
    }

    return acc;
  }, {
    counts: {},
    mostFrequent: null,
  });

  return mostFrequent;
}
/**
 * Compose a date from the most frequent year, month and day-of-month found in the given dates.
 * Each component is picked independently of the others.
 *
 * @param {Date[]} dates - dates to assess
 * @returns {Date|null} the composed date, or null when any component has no most frequent value
 */
function getMostFrequentDate(dates) {
  const year = getMostFrequent(dates.map(dateX => dateX.getFullYear()));
  const month = getMostFrequent(dates.map(dateX => dateX.getMonth()));
  const date = getMostFrequent(dates.map(dateX => dateX.getDate()));

  if (year === null || month === null || date === null) {
    return null;
  }

  return moment({ year, month, date }).toDate();
}
/**
 * Pick the highest item. The running maximum starts at null, so only items that
 * compare greater than null are ever selected.
 *
 * @param {Array} items - values to compare
 * @returns {*} the highest item, or null when no item compares greater than null
 */
function getHighest(items) {
  let highest = null;

  for (const candidate of items) {
    if (candidate > highest) {
      highest = candidate;
    }
  }

  return highest;
}
2020-05-17 01:00:44 +00:00
/**
 * Pick the item with the greatest length.
 *
 * @param {Array<string|Array>} items - values exposing a length
 * @returns {*} the longest item, or null when the list is empty or the longest item is falsy
 */
function getLongest(items) {
  // sort a copy, Array.prototype.sort would otherwise reorder the caller's array
  return [...items].sort((itemA, itemB) => itemB.length - itemA.length)[0] || null;
}
/**
 * Calculate the rounded mean of a list of numbers.
 *
 * @param {number[]} items - values to average
 * @returns {number|null} the rounded mean, or null when the list is empty or the rounded mean is 0
 */
function getAverage(items) {
  return Math.round(items.reduce((acc, item) => acc + item, 0) / items.length) || null;
}
/**
 * Turn scraped actor names or actor objects into base actors.
 * A name may carry an entry ID after a colon ('Name:entryId'). Entries that are
 * neither a string nor an object with a name are dropped.
 *
 * @param {Array<string|Object>} actorsOrNames - actor names or actor objects with a name
 * @param {Object} [release] - release the actors belong to; its entity's parent, or else its entity, is attached
 * @returns {Object[]} base actors; actor objects keep their own properties next to the base ones
 */
function toBaseActors(actorsOrNames, release) {
  if (!actorsOrNames) {
    return [];
  }

  const entity = release?.entity?.parent || release?.entity || null;
  const baseActors = [];

  for (const actorOrName of actorsOrNames) {
    const isUsable = actorOrName && (typeof actorOrName === 'string' || actorOrName.name);

    if (isUsable) {
      const [baseName, entryId] = (actorOrName.name || actorOrName).split(':');
      const name = capitalize(baseName);
      const hasProfile = !!actorOrName.name; // actor contains profile information

      const baseActor = {
        name,
        slug: slugify(name),
        entryId: entryId || actorOrName.entryId || null,
        entity,
        hasProfile,
      };

      baseActors.push(hasProfile ? { ...actorOrName, ...baseActor } : baseActor);
    }
  }

  return baseActors;
}
/**
 * Convert an actor database row (snake_case) into the camelCase actor exposed by this module.
 *
 * @param {Object} actor - actor row, optionally with joined alias, entity, countries, avatar and profiles
 * @param {boolean} [withDetails=false] - also include physical details, places, avatar and profiles
 * @param {boolean} [isProfile=false] - the row is a profile, include its description (only with details)
 * @returns {Object|null} the curated actor, or null when no actor was given
 */
function curateActor(actor, withDetails = false, isProfile = false) {
  if (!actor) {
    return null;
  }

  const curatedActor = {
    id: actor.id,
    name: actor.name,
    slug: actor.slug,
    url: actor.url,
    gender: actor.gender,
    entityId: actor.entity_id,
    aliasFor: actor.alias_for,
    dateOfBirth: actor.date_of_birth,
    age: actor.age,
    birthCountry: actor.birth_country_alpha2,
    ...(withDetails && {
      alias: actor.alias && {
        id: actor.alias.id,
        name: actor.alias.name,
        slug: actor.alias.slug, // the alias' own slug, not the slug of the actor pointing at it
        gender: actor.alias.gender,
      },
      entity: actor.entity && {
        id: actor.entity.id,
        name: actor.entity.name,
        slug: actor.entity.slug,
      },
      dateOfDeath: actor.date_of_death,
      cup: actor.cup,
      bust: actor.bust,
      waist: actor.waist,
      hip: actor.hip,
      naturalBoobs: actor.natural_boobs,
      penisLength: actor.penis_length,
      penisGirth: actor.penis_girth,
      circumcised: actor.circumcised,
      height: actor.height,
      weight: actor.weight,
      eyes: actor.eyes,
      hairColor: actor.hair_color,
      hasTattoos: actor.has_tattoos,
      hasPiercings: actor.has_piercings,
      tattoos: actor.tattoos,
      piercings: actor.piercings,
      ...(isProfile && { description: actor.description }),
      placeOfBirth: actor.birth_country && {
        country: {
          alpha2: actor.birth_country.alpha2,
          name: actor.birth_country.name,
          alias: actor.birth_country.alias,
        },
        state: actor.birth_state,
        city: actor.birth_city,
      },
      placeOfResidence: actor.residence_country && {
        country: {
          alpha2: actor.residence_country.alpha2,
          name: actor.residence_country.name,
          alias: actor.residence_country.alias,
        },
        state: actor.residence_state,
        city: actor.residence_city,
      },
      avatar: actor.avatar && {
        id: actor.avatar.id,
        path: actor.avatar.path,
        width: actor.avatar.width,
        height: actor.avatar.height,
        size: actor.avatar.size,
        source: actor.avatar.source,
      },
      // profiles are curated recursively as detailed profiles, so they keep their description
      ...(actor.profiles && { profiles: actor.profiles?.map(profile => curateActor(profile, true, true)) }),
    }),
  };

  return curatedActor;
}
2020-05-13 00:56:20 +00:00
/**
 * Build the row to insert into the actors table for a base actor.
 *
 * @param {Object} baseActor - base actor with name, slug and entryId
 * @param {number} batchId - ID of the batch the actor is added in
 * @returns {Object} actor row; entity_id is always null
 */
function curateActorEntry(baseActor, batchId) {
  return {
    name: baseActor.name,
    slug: baseActor.slug,
    entity_id: null,
    entry_id: baseActor.entryId,
    batch_id: batchId,
  };
}
2020-05-13 00:56:20 +00:00
/**
 * Build actor rows for a list of base actors, all assigned to the same batch.
 *
 * @param {Object[]} baseActors - base actors to convert
 * @param {number} batchId - ID of the batch the actors are added in
 * @returns {Object[]} one actor row per base actor, in the same order
 */
function curateActorEntries(baseActors, batchId) {
  const toEntry = baseActor => curateActorEntry(baseActor, batchId);

  return baseActors.map(toEntry);
}
2020-05-15 02:40:59 +00:00
/**
 * Build the row to store in the actors_profiles table for a curated profile.
 *
 * @param {Object} profile - curated profile; id is the actor ID, update the ID of an existing profile row to overwrite
 * @returns {Object|null} profile row, or null when the profile has no actor ID
 */
function curateProfileEntry(profile) {
  if (!profile.id) {
    return null;
  }

  const curatedProfileEntry = {
    // only include a row ID when an existing profile is updated; false and undefined both mean a new profile
    ...(profile.update && { id: profile.update }),
    actor_id: profile.id,
    entity_id: profile.entity?.id || null,
    date_of_birth: profile.dateOfBirth,
    date_of_death: profile.dateOfDeath,
    age: profile.age,
    url: profile.url,
    gender: profile.gender,
    ethnicity: profile.ethnicity,
    description: profile.description,
    description_hash: profile.descriptionHash,
    birth_city: profile.placeOfBirth?.city || null,
    birth_state: profile.placeOfBirth?.state || null,
    birth_country_alpha2: profile.placeOfBirth?.country || null,
    residence_city: profile.placeOfResidence?.city || null,
    residence_state: profile.placeOfResidence?.state || null,
    residence_country_alpha2: profile.placeOfResidence?.country || null,
    cup: profile.cup,
    bust: profile.bust,
    waist: profile.waist,
    hip: profile.hip,
    penis_length: profile.penisLength,
    penis_girth: profile.penisGirth,
    circumcised: profile.circumcised,
    natural_boobs: profile.naturalBoobs,
    height: profile.height,
    weight: profile.weight,
    hair_color: profile.hairColor,
    eyes: profile.eyes,
    has_tattoos: profile.hasTattoos,
    has_piercings: profile.hasPiercings,
    piercings: profile.piercings,
    tattoos: profile.tattoos,
    avatar_media_id: profile.avatarMediaId || null,
  };

  return curatedProfileEntry;
}
/**
 * Normalize a scraped profile: sanitize and hash the description, map free-text
 * attributes onto canonical values, coerce measurements, resolve places and
 * convert scraped scenes into base releases.
 *
 * @param {Object} profile - scraped profile merged with the actor it belongs to
 * @param {Object} [actor] - actor to attach to scraped scenes that do not list them
 * @returns {Promise<Object|null>} the curated profile; null when no profile was given or curation failed
 */
async function curateProfile(profile, actor) {
  if (!profile) {
    return null;
  }

  try {
    // the entity may be absent, e.g. for profiles bundled with a release; never let a log message discard the profile
    const entityName = profile.entity?.name;

    const curatedProfile = {
      id: profile.id,
      name: profile.name,
      avatar: profile.avatar,
      scraper: profile.scraper,
      entity: profile.entity,
      update: profile.update,
    };

    // collapse whitespace and strip all markup from the description
    curatedProfile.description = domPurify.sanitize(profile.description?.replace(/\s+/g, ' '), { ALLOWED_TAGS: [] }).trim() || null;

    const hasher = curatedProfile.description && blake2
      .createHash('blake2b', { digestLength: 24 })
      .update(Buffer.from(slugify(curatedProfile.description)));

    curatedProfile.descriptionHash = curatedProfile.description && hasher.digest('hex');

    curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available

    curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null;
    curatedProfile.hairColor = hairColors[(profile.hairColor || profile.hair)?.toLowerCase().replace('hair', '').trim()] || null;
    curatedProfile.eyes = eyeColors[profile.eyes?.trim().toLowerCase()] || null;

    curatedProfile.tattoos = profile.tattoos?.trim() || null;
    curatedProfile.piercings = profile.piercings?.trim() || null;

    // 'female' must be tested before 'male', which it contains
    curatedProfile.gender = (/female/i.test(profile.gender) && 'female')
      || (/shemale|trans/i.test(profile.gender) && 'transsexual')
      || (/male/i.test(profile.gender) && 'male')
      || null;

    const dateOfBirth = profile.dateOfBirth || profile.birthdate;

    curatedProfile.dateOfBirth = (!Number.isNaN(Number(dateOfBirth)) // possibly valid date
      && new Date() - dateOfBirth > 567648000000 // over 18
      && dateOfBirth)
      || null;

    curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;
    curatedProfile.age = Number(profile.age) || null;

    // measurements are taken as a number, or else the first run of digits found in a string
    curatedProfile.height = Number(profile.height) || profile.height?.match?.(/\d+/)?.[0] || null;
    curatedProfile.weight = Number(profile.weight) || profile.weight?.match?.(/\d+/)?.[0] || null;

    curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null;
    curatedProfile.bust = Number(profile.bust) || profile.bust?.match?.(/\d+/)?.[0] || null;
    curatedProfile.waist = Number(profile.waist) || profile.waist?.match?.(/\d+/)?.[0] || null;
    curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null;

    curatedProfile.penisLength = Number(profile.penisLength) || profile.penisLength?.match?.(/\d+/)?.[0] || null;
    curatedProfile.penisGirth = Number(profile.penisGirth) || profile.penisGirth?.match?.(/\d+/)?.[0] || null;

    curatedProfile.circumcised = getBoolean(profile.circumcised);
    curatedProfile.naturalBoobs = getBoolean(profile.naturalBoobs);
    curatedProfile.hasTattoos = getBoolean(profile.hasTattoos);
    curatedProfile.hasPiercings = getBoolean(profile.hasPiercings);

    if (argv.resolvePlace) {
      const [placeOfBirth, placeOfResidence] = await Promise.all([
        resolvePlace(profile.birthPlace),
        resolvePlace(profile.residencePlace),
      ]);

      curatedProfile.placeOfBirth = placeOfBirth;
      curatedProfile.placeOfResidence = placeOfResidence;
    }

    if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
      const country = await knex('countries')
        .where('nationality', 'ilike', `%${curatedProfile.nationality}%`)
        .orWhere('alpha3', 'ilike', `%${curatedProfile.nationality}%`)
        .orWhere('alpha2', 'ilike', `%${curatedProfile.nationality}%`)
        .orderBy('priority', 'desc')
        .first();

      if (country) {
        curatedProfile.placeOfBirth = {
          country: country.alpha2,
        };
      }
    }

    curatedProfile.social = Array.isArray(profile.social)
      ? profile.social.map((social) => {
        try {
          const { href } = new URL(social);

          return href;
        } catch (error) {
          logger.warn(`Profile scraper for '${entityName}' returned invalid social link: ${social}`);

          return null;
        }
      }).filter(Boolean)
      : [];

    curatedProfile.scenes = toBaseReleases(profile.scenes || profile.releases, profile.entity, actor)
      // attach actor to base scene, in case it was not scraped
      .map((scene) => {
        if (actor && !scene.actors?.find(sceneActor => slugify(sceneActor) === actor.slug || slugify(sceneActor.name) === actor.slug)) {
          return {
            ...scene,
            actors: [actor, ...(scene.actors || [])],
          };
        }

        return scene;
      });

    if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${entityName}' scraper: ${profile.ethnicity}`);
    if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${entityName}' scraper: ${profile.hairColor || profile.hair}`);
    if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${entityName}' scraper: ${profile.eyes}`);

    return curatedProfile;
  } catch (error) {
    logger.error(`Failed to curate '${profile.name}': ${error.message}`);

    return null;
  }
}
2020-12-30 01:23:43 +00:00
/**
 * Fetch stored profiles, each with its actor and avatar media attached as JSON.
 *
 * @param {Array<number|string>} [actorIdsOrNames] - actor IDs (numbers) and/or actor names (strings);
 *   names only match actors without an entity. All profiles are returned when omitted.
 * @returns {Promise<Object[]>} actors_profiles rows with additional actor and avatar properties
 */
async function fetchProfiles(actorIdsOrNames) {
  return knex('actors_profiles')
    .select(knex.raw('actors_profiles.*, row_to_json(actors) as actor, row_to_json(media) as avatar'))
    .leftJoin('actors', 'actors.id', 'actors_profiles.actor_id')
    .modify((query) => {
      if (actorIdsOrNames) {
        query
          .whereIn('actor_id', actorIdsOrNames.filter(idOrName => typeof idOrName === 'number'))
          .orWhere((builder) => {
            builder
              .whereIn('actors.name', actorIdsOrNames.filter(idOrName => typeof idOrName === 'string'))
              .whereNull('actors.entity_id');
          });
      }
    })
    .leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id');
}
/**
 * Derive each actor's main data from all of their stored profiles and write it to the actors table.
 * The affected actors are first reset to empty values, then updated with the interpolated values,
 * both inside a single transaction.
 *
 * @param {Array<number|string>} [actorIdsOrNames] - actor IDs and/or names to limit the interpolation to;
 *   all actors are interpolated when omitted
 * @returns {Promise<void>}
 * @throws when clearing the existing data fails; failures while writing the interpolated data are
 *   logged and rolled back instead
 */
async function interpolateProfiles(actorIdsOrNames) {
  const profiles = await fetchProfiles(actorIdsOrNames);

  const profilesByActorId = profiles.reduce((acc, profile) => ({
    ...acc,
    [profile.actor_id]: [
      ...(acc[profile.actor_id] || []),
      profile,
    ],
  }), {});

  logger.info(`Interpolating ${profiles.length} profiles from ${Object.keys(profilesByActorId).length} actors`);

  const interpolatedProfiles = Object.entries(profilesByActorId).map(([actorId, actorProfiles]) => {
    // group values from each profile
    const valuesByProperty = actorProfiles.reduce((acc, profile) => Object
      .entries(profile)
      .reduce((profileAcc, [property, value]) => ({
        ...profileAcc,
        [property]: [
          ...(acc[property] || []),
          ...(value === null ? [] : Array.from({ length: profile.priority }, () => value)), // multiply by priority, increasing the odds of being the most frequent value
        ],
      }), {
        // bundle location values so they can be assessed together, to ensure the most frequent city is in the most frequent state is in most frequent country
        origin: [...acc.origin || [], {
          ...(profile.birth_country_alpha2 && { country: profile.birth_country_alpha2 }),
          ...(profile.birth_state && { state: profile.birth_state }),
          ...(profile.birth_city && { city: profile.birth_city }),
        }].filter(location => Object.keys(location).length > 0),
        residence: [...acc.residence || [], {
          ...(profile.residence_country_alpha2 && { country: profile.residence_country_alpha2 }),
          ...(profile.residence_state && { state: profile.residence_state }),
          ...(profile.residence_city && { city: profile.residence_city }),
        }].filter(location => Object.keys(location).length > 0),
      }), {});

    const mostFrequentValues = [
      'gender',
      'ethnicity',
      'cup',
      'bust',
      'waist',
      'hip',
      'penis_length',
      'penis_girth',
      'circumcised',
      'natural_boobs',
      'hair_color',
      'eyes',
      'has_tattoos',
      'has_piercings',
    ].reduce((acc, property) => ({
      ...acc,
      [property]: getMostFrequent(valuesByProperty[property]),
    }), {});

    const profile = {
      id: actorId,
      ...mostFrequentValues,
    };

    profile.height = getMostFrequent(valuesByProperty.height.filter(height => height > 50 && height < 300)); // remove unlikely values

    profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth);
    profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death);
    profile.age = getHighest(valuesByProperty.age);

    // ensure most frequent country, city and state match up
    profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.origin.map(location => location.country));
    const remainingOriginCountries = valuesByProperty.origin.filter(location => location.country === profile.birth_country_alpha2);

    profile.birth_state = getMostFrequent(remainingOriginCountries.map(location => location.state));
    const remainingOriginStates = remainingOriginCountries.filter(location => !profile.birth_state || location.state === profile.birth_state);

    profile.birth_city = getMostFrequent(remainingOriginStates.map(location => location.city));

    profile.residence_country_alpha2 = getMostFrequent(valuesByProperty.residence.map(location => location.country));
    const remainingResidenceCountries = valuesByProperty.residence.filter(location => location.country === profile.residence_country_alpha2);

    profile.residence_state = getMostFrequent(remainingResidenceCountries.map(location => location.state));
    const remainingResidenceStates = remainingResidenceCountries.filter(location => !profile.residence_state || location.state === profile.residence_state);

    profile.residence_city = getMostFrequent(remainingResidenceStates.map(location => location.city));

    profile.weight = getAverage(valuesByProperty.weight);

    profile.tattoos = getLongest(valuesByProperty.tattoos);
    profile.piercings = getLongest(valuesByProperty.piercings);

    // prefer the tallest avatar, skipping avatars with a known entropy of 6 or lower
    profile.avatar_media_id = actorProfiles
      .map(actorProfile => actorProfile.avatar)
      .filter(avatar => avatar && (avatar.entropy === null || avatar.entropy > 6))
      .sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;

    return profile;
  });

  const transaction = await knex.transaction();

  // clear existing interpolated data
  const emptyProfile = Object
    .keys(omit(curateProfileEntry({ id: 1 }), ['id', 'actor_id', 'entity_id', 'url', 'description_hash']))
    .reduce((acc, key) => ({ ...acc, [key]: null }), {});

  try {
    await knex('actors')
      .modify((modifyBuilder) => {
        if (actorIdsOrNames) {
          modifyBuilder
            .whereIn('id', actorIdsOrNames.filter(idOrName => typeof idOrName === 'number'))
            .orWhere((whereBuilder) => {
              whereBuilder
                .whereIn('name', actorIdsOrNames.filter(idOrName => typeof idOrName === 'string'))
                .whereNull('entity_id');
            });
        }
      })
      .update(emptyProfile)
      .transacting(transaction);
  } catch (error) {
    // do not leave the transaction open when clearing fails
    await transaction.rollback(error);

    throw error;
  }

  // insert new interpolated data
  const queries = interpolatedProfiles.map(profile => knex('actors')
    .where('id', profile.id)
    .update(profile)
    .transacting(transaction));

  try {
    await Promise.all(queries);
    await transaction.commit();
  } catch (error) {
    // failed writes are rolled back without rethrowing, but no longer silently
    logger.error(`Failed to store interpolated profiles, rolling back: ${error.message}`);

    await transaction.rollback(error);
  }
}
/**
 * Store curated profiles: profiles without an existing row are inserted in bulk,
 * profiles that already have a row are only overwritten when argv.force is set.
 *
 * @param {Object[]} profiles - curated profiles; a truthy update property holds the ID of the existing row
 * @returns {Promise<void>}
 */
async function upsertProfiles(profiles) {
  const newProfileEntries = profiles.filter(profile => !profile.update).map(profile => curateProfileEntry(profile)).filter(Boolean);
  const updatingProfileEntries = profiles.filter(profile => profile.update).map(profile => curateProfileEntry(profile)).filter(Boolean);

  if (newProfileEntries.length > 0) {
    await bulkInsert('actors_profiles', newProfileEntries);

    logger.info(`Saved ${newProfileEntries.length} actor profiles`);
  }

  if (argv.force && updatingProfileEntries.length > 0) {
    const transaction = await knex.transaction();

    const queries = updatingProfileEntries.map(profileEntry => knex('actors_profiles')
      .where('id', profileEntry.id)
      .update(profileEntry)
      .returning(['id', 'actor_id'])
      .transacting(transaction));

    try {
      await Promise.all(queries);
      await transaction.commit();

      // only report success once the updates are committed
      logger.info(`Updated ${updatingProfileEntries.length} actor profiles`);
    } catch (error) {
      logger.error(`Failed to update ${updatingProfileEntries.length} actor profiles, rolling back: ${error.message}`);

      await transaction.rollback(error);
    }
  }
}
/**
 * Scrape and curate an actor's profile from every configured source.
 * A source may be a single scraper slug or a group of slugs; within a group the
 * slugs are tried in order until one does not fail.
 *
 * @param {Object} actor - actor row to scrape profiles for
 * @param {Array<string|string[]>} sources - scraper slugs, or groups of scraper slugs
 * @param {Object} entitiesBySlug - entities keyed by slug
 * @param {Object} existingProfilesByActorEntityId - stored profiles keyed by actor ID, then entity ID
 * @returns {Promise<Object[]>} curated profiles; sources that failed or were skipped are left out
 */
async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) {
  const profiles = Promise.map(sources, async (source) => {
    try {
      // config may group sources to try until success
      return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
        try {
          const entity = entitiesBySlug[scraperSlug] || null;

          const scraper = scrapers[scraperSlug];
          // the entity may be missing; let the explicit check below report that instead of a TypeError here
          const layoutScraper = scraper?.[entity?.parameters?.layout] || scraper;

          const context = {
            ...entity,
            // legacy
            site: entity,
            network: entity?.parent,
            entity,
            scraper: scraperSlug,
          };

          const label = context.entity?.name;

          if (!layoutScraper?.fetchProfile) {
            logger.warn(`No profile profile scraper available for ${scraperSlug}`);
            throw new Error(`No profile profile scraper available for ${scraperSlug}`);
          }

          if (!context.entity) {
            logger.warn(`No entity found for ${scraperSlug}`);
            throw new Error(`No entity found for ${scraperSlug}`);
          }

          const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null];

          if (existingProfile && !argv.force) {
            logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`);

            return null;
          }

          logger.verbose(`Searching profile for '${actor.name}' on '${label}'`);

          const profile = await layoutScraper.fetchProfile(curateActor({
            ...existingProfile,
            ...actor,
          }), context, include);

          if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
            logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`);
            throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' });
          }

          logger.verbose(`Found profile for '${actor.name}' on '${label}'`);

          return await curateProfile({
            ...actor,
            ...profile,
            entity,
            update: existingProfile?.id || false,
          }, actor);
        } catch (error) {
          if (error.code !== 'PROFILE_NOT_AVAILABLE') {
            logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
          }

          // throw error to try next source
          throw error;
        }
      }), Promise.reject(new Error())); // start out rejected, so the first source in the group is attempted
    } catch (error) {
      if (error.code !== 'PROFILE_NOT_AVAILABLE') {
        logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`);
      }
    }

    return null;
  });

  return profiles.filter(Boolean);
}
/**
 * Decide which actors to scrape: the names that were passed in, or, when none
 * were passed, the names selected from the database using the argv.actorsUpdate
 * date (the current date when unset).
 *
 * @param {string[]} actorNames - explicitly requested actor names, may be empty
 * @returns {Promise<string[]>} actor names to scrape
 */
async function getActorNames(actorNames) {
  const hasRequestedNames = actorNames.length > 0;

  if (!hasRequestedNames) {
    const { rows } = await knex.raw(`
SELECT actors.name
FROM actors
WHERE NOT EXISTS (
SELECT *
FROM actors_profiles
WHERE actors_profiles.actor_id = actors.id
AND actors_profiles.updated_at <= (?)
)
`, [argv.actorsUpdate || new Date()]);

    return rows.map(({ name }) => name);
  }

  return actorNames;
}
/**
 * Persist curated profiles: associate their avatars, upsert the profile rows and
 * re-interpolate the actors they belong to.
 *
 * @param {Array<Object|null>} profiles - curated profiles; empty slots left by failed curation are skipped
 * @returns {Promise<void>}
 */
async function storeProfiles(profiles) {
  // curateProfile resolves to null when curation fails, don't let one failure break the batch
  const curatedProfiles = profiles.filter(Boolean);

  const profilesWithAvatarIds = await associateAvatars(curatedProfiles);
  const actorIds = Array.from(new Set(curatedProfiles.map(profile => profile.id)));

  await upsertProfiles(profilesWithAvatarIds);
  await interpolateProfiles(actorIds);
}
/**
 * Scrape profiles for the given actors, adding actors that are not in the database yet.
 * Profiles are printed when argv.report is set and stored when argv.save is set.
 *
 * @param {string[]} argNames - actor names to scrape; see getActorNames for the selection when empty
 * @returns {Promise<Object[]>} the curated profiles that were scraped
 */
async function scrapeActors(argNames) {
  const actorNames = await getActorNames(argNames);
  const baseActors = toBaseActors(actorNames);

  logger.info(`Scraping profiles for ${actorNames.length} actors`);

  // scrapers is already the map of actor scrapers, its keys are the scraper slugs
  const sources = argv.profileSources || config.profiles || Object.keys(scrapers);
  const entitySlugs = sources.flat();

  const [entities, existingActorEntries] = await Promise.all([
    knex('entities')
      .select(knex.raw('entities.*, row_to_json(parents) as parent, json_agg(children) as children'))
      .whereIn('entities.slug', entitySlugs)
      .whereIn('entities.type', ['network', 'channel'])
      .leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
      .leftJoin('entities as children', 'children.parent_id', 'entities.id')
      .orderBy('entities.type')
      .groupBy('entities.id', 'parents.id'),
    knex('actors')
      .select(['id', 'name', 'slug', 'entry_id'])
      .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
      .whereNull('alias_for'),
  ]);

  // the first entity returned for a slug wins
  const entitiesBySlug = entities.reduce((acc, entity) => ({ ...acc, [entity.slug]: acc[entity.slug] || entity }), {});

  // NOTE(review): the rows are selected with an entry_id column, so actorEntry.entryId looks like it is
  // always undefined and every entry ends up under the 'null' key — confirm before changing, as
  // keying on entry_id alters which actors are considered new
  const existingActorEntriesBySlugAndEntryId = existingActorEntries.reduce((acc, actorEntry) => ({
    ...acc,
    [actorEntry.slug]: {
      ...acc[actorEntry.slug],
      [actorEntry.entryId || null]: actorEntry,
    },
  }), {});

  const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlugAndEntryId[baseActor.slug]?.[baseActor.entryId]);

  const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
  const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);

  // TODO: associate entity when entry ID is provided

  const newActorEntries = batchId && await bulkInsert('actors', curatedActorEntries);

  const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

  const existingProfiles = await knex('actors_profiles')
    .select(knex.raw('actors_profiles.*, row_to_json(avatars) as avatar'))
    .whereIn('actor_id', actors.map(actor => actor.id))
    .leftJoin('media as avatars', 'avatars.id', 'actors_profiles.avatar_media_id');

  const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({
    ...acc,
    [profile.actor_id]: {
      ...acc[profile.actor_id],
      [profile.entity_id]: profile,
    },
  }), {});

  const profilesPerActor = await Promise.map(
    actors,
    async actor => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId),
    { concurrency: 10 },
  );

  const profiles = profilesPerActor.flat().filter(Boolean);

  logger.info(`Scraped ${profiles.length} profiles`);

  if (argv.report) {
    console.log(util.inspect(profiles, { depth: Infinity, colors: true }));
  }

  if (argv.save) {
    await storeProfiles(profiles);
  }

  return profiles;
}
2020-05-13 00:56:20 +00:00
/**
 * Look up the actors matching the given base actors and insert the ones that do not exist yet.
 * Profile data bundled with the base actors is curated and stored for newly inserted actors.
 *
 * @param {Object[]} baseActors - base actors, each with a slug and an entity
 * @param {number} batchId - ID of the batch new actors are added in
 * @returns {Promise<Object[]>} newly inserted actor rows followed by the existing ones
 */
async function getOrCreateActors(baseActors, batchId) {
  // an empty list would produce an invalid, empty VALUES clause below
  if (!baseActors || baseActors.length === 0) {
    return [];
  }

  // WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available
  // NOTE(review): actor.entity is dereferenced unguarded here, base actors without an entity will throw — confirm callers always provide one
  const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId)', { slug: actor.slug, entityId: actor.entity.id })).join(', ');

  const existingActors = await knex
    .select('actors.*')
    .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id)`))
    .whereRaw('actors.slug = base_actors.slug AND actors.entity_id IS NULL')
    .orWhereRaw('actors.slug = base_actors.slug AND actors.entity_id = base_actors.entity_id');

  // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug));
  const existingActorSlugs = existingActors.reduce((acc, actor) => ({
    ...acc,
    [actor.entity_id]: {
      ...acc[actor.entity_id],
      [actor.slug]: true,
    },
  }), {});

  // an actor exists when its slug is known for the same entity, or without any entity
  const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]);

  const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
  const insertedActors = await bulkInsert('actors', curatedActorEntries);
  const newActors = Array.isArray(insertedActors) ? insertedActors : [];

  const newActorIdsByEntityIdAndSlug = newActors.reduce((acc, actor) => ({
    ...acc,
    [actor.entity_id]: {
      ...acc[actor.entity_id],
      [actor.slug]: actor.id,
    },
  }), {});

  const newActorProfiles = await Promise.all(baseActors
    .filter(actor => actor.hasProfile)
    .map(actor => ({
      ...actor,
      id: newActorIdsByEntityIdAndSlug[actor.entity?.id]?.[actor.slug] || newActorIdsByEntityIdAndSlug.null?.[actor.slug],
    }))
    .filter(actor => !!actor.id)
    .map(actor => curateProfile(actor)));

  // curateProfile resolves to null on failure, only store the profiles that were curated
  await storeProfiles(newActorProfiles.filter(Boolean));

  return newActors.concat(existingActors);
}
2020-05-13 00:56:20 +00:00
/**
 * Link releases to their actors, creating actors that do not exist yet.
 *
 * @param {Object[]} releases - stored releases with an id and, optionally, scraped actors
 * @param {number} batchId - ID of the batch new actors are added in
 * @returns {Promise<Object[]>} the actor rows involved, or an empty list when no release has actors
 */
async function associateActors(releases, batchId) {
  const baseActorsByReleaseId = releases.reduce((acc, release) => {
    if (release.actors) {
      acc[release.id] = toBaseActors(release.actors, release);
    }

    return acc;
  }, {});

  const baseActors = Object.values(baseActorsByReleaseId).flat();

  if (baseActors.length === 0) {
    return [];
  }

  // deduplicate by slug, the last base actor with a given slug wins
  const baseActorsBySlug = Object.fromEntries(baseActors.map(baseActor => [baseActor.slug, baseActor]));
  const uniqueBaseActors = Object.values(baseActorsBySlug);

  const actors = await getOrCreateActors(uniqueBaseActors, batchId);

  // associate aliases with the actor they are an alias for
  const actorIdsBySlug = Object.fromEntries(actors.map(actor => [actor.slug, actor.alias_for || actor.id]));

  const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
    .map(([releaseId, releaseActors]) => releaseActors
      .map(releaseActor => ({
        release_id: releaseId,
        actor_id: actorIdsBySlug[releaseActor.slug],
      })))
    .flat();

  await bulkInsert('releases_actors', releaseActorAssociations, false);

  logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);

  return actors;
}
2020-05-19 23:11:32 +00:00
/**
 * Fetch a single actor with its entity, alias, countries, avatar and profiles.
 *
 * @param {number|string} actorId - actor ID, or the actor's slug when the value is not numeric
 * @returns {Promise<Object|null>} the curated actor with details, or null when not found
 */
async function fetchActor(actorId) {
  const actor = await knex('actors')
    .select(knex.raw(`
      actors.*,
      row_to_json(entities) as entity,
      row_to_json(actor_alias) as alias,
      row_to_json(birth_country) as birth_country,
      row_to_json(residence_country) as residence_country,
      row_to_json(media) as avatar,
      json_agg(actors_profiles) as profiles
    `))
    .modify((queryBuilder) => {
      // anything that is not numeric is treated as a slug
      if (Number.isNaN(Number(actorId))) {
        queryBuilder.where('actors.slug', actorId);
        return;
      }

      queryBuilder.where('actors.id', actorId);
    })
    .leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for')
    .leftJoin('actors_profiles', 'actors.id', 'actors_profiles.actor_id')
    .leftJoin('entities', 'entities.id', 'actors.entity_id')
    .leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2')
    .leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2')
    .leftJoin('media', 'media.id', 'actors.avatar_media_id')
    .groupBy('actors.id', 'entities.id', 'actor_alias.id', 'birth_country.alpha2', 'residence_country.alpha2', 'media.id')
    .first();

  return curateActor(actor, true);
}
/**
 * Search actors through the search_actors database function.
 *
 * @param {string} query - search query
 * @returns {Promise<Object[]>} up to 100 curated actors, without details
 */
async function searchActors(query) {
  const actors = await knex
    .select('*')
    .from(knex.raw('search_actors(?) as actors', [query]))
    .limit(100);

  return actors.map(actor => curateActor(actor));
}
2020-12-30 02:19:09 +00:00
/**
 * Delete stored profiles, re-interpolate the affected actors and flush media that became orphaned.
 *
 * @param {Array<number|string>} [actorIdsOrNames] - actor IDs and/or names to delete the profiles of
 * @returns {Promise<void>}
 */
async function flushProfiles(actorIdsOrNames) {
  const profiles = await fetchProfiles(actorIdsOrNames);

  const profileIds = profiles.map(({ id }) => id);
  const actorNames = [...new Set(profiles.map(({ actor }) => actor.name))];

  const deleteCount = await knex('actors_profiles')
    .whereIn('id', profileIds)
    .delete();

  await interpolateProfiles(actorIdsOrNames);
  await flushOrphanedMedia(); // don't flush until main avatar is detached by re-interpolating

  // summarize by actor count for long lists, by name for short ones
  let summary;

  if (actorNames.length > 20) {
    summary = `Removed ${deleteCount} profiles for ${actorNames.length} actors`;
  } else if (deleteCount > 0) {
    summary = `Removed ${deleteCount} profiles for ${actorNames.join(', ')}`;
  } else {
    summary = `Removed ${deleteCount} profiles`;
  }

  logger.info(summary);
}
/**
 * Delete actors together with the scenes they are associated with, then flush orphaned media.
 *
 * @param {Array<number|string>} actorIdsOrNames - actor IDs (numbers) and/or actor names (strings);
 *   names only match actors without an entity
 * @returns {Promise<void>}
 */
async function flushActors(actorIdsOrNames) {
  const actors = await knex('actors')
    .whereIn('id', actorIdsOrNames.filter(idOrName => typeof idOrName === 'number'))
    .orWhere((builder) => {
      builder
        .whereIn('name', actorIdsOrNames.filter(idOrName => typeof idOrName === 'string'))
        .whereNull('entity_id');
    });

  const actorIds = actors.map(actor => actor.id);

  const sceneIds = await knex('releases_actors')
    .select('releases.id')
    .whereIn('actor_id', actorIds)
    .leftJoin('releases', 'releases.id', 'releases_actors.release_id')
    .pluck('id');

  const [deletedScenesCount, deletedActorsCount] = await Promise.all([
    deleteScenes(sceneIds),
    knex('actors')
      .whereIn('id', actorIds)
      .delete(),
  ]);

  await flushOrphanedMedia();

  logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`);
}
module.exports = {
associateActors,
2020-05-19 23:11:32 +00:00
fetchActor,
2020-12-30 02:19:09 +00:00
flushActors,
2020-12-30 01:23:43 +00:00
flushProfiles,
interpolateProfiles,
scrapeActors,
searchActors,
toBaseActors,
};