'use strict'; const config = require('config'); const util = require('util'); const Promise = require('bluebird'); const moment = require('moment'); const blake2 = require('blake2'); const DOMPurify = require('dompurify'); const { JSDOM } = require('jsdom'); const omit = require('object.omit'); const inquirer = require('inquirer'); const { window } = new JSDOM(''); const domPurify = DOMPurify(window); // const logger = require('./logger')(__filename); const knex = require('./knex'); const scrapers = require('./scrapers/scrapers').actors; const argv = require('./argv'); const include = require('./utils/argv-include')(argv); const bulkInsert = require('./utils/bulk-insert'); const chunk = require('./utils/chunk'); const logger = require('./logger')(__filename); const { toBaseReleases } = require('./deep'); const { associateAvatars, flushOrphanedMedia } = require('./media'); const { fetchEntitiesBySlug } = require('./entities'); const { deleteScenes } = require('./releases'); const slugify = require('./utils/slugify'); const capitalize = require('./utils/capitalize'); const resolvePlace = require('./utils/resolve-place'); const { resolveLayoutScraper } = require('./scrapers/resolve'); const getRecursiveParameters = require('./utils/get-recursive-parameters'); const hairColors = { 'jet-black': 'black', 'red-head': 'red', 'soft-black': 'black', 'brunette/raven': 'brown', black: 'black', blond: 'blond', blonde: 'blonde', blondie: 'blonde', brown: 'brown', bruin: 'brown', brunette: 'brown', fair: 'blonde', grey: 'gray', gray: 'gray', raven: 'black', red: 'red', redhead: 'red', 'red head': 'red', rood: 'red', blue: 'blue', green: 'green', purple: 'purple', pink: 'pink', zwart: 'black', }; const eyeColors = { blauw: 'blue', blue: 'blue', brown: 'brown', bruin: 'bruin', dark: 'brown', gray: 'gray', green: 'green', groen: 'green', grey: 'gray', hazel: 'hazel', }; const orientations = { bi: 'bisexual', biseksueel: 'bisexual', bisexual: 'bisexual', gay: 'gay', hetero: 'straight', heteroseksueel: 'straight', heterosexual: 'straight', homoseksueel: 'gay', homosexual: 'gay', straight: 'straight', }; const ethnicities = { 'african american': 'black', 'african-american': 'black', 'native american': 'native american', african: 'black', aravic: 'arabic', asian: 'asian', black: 'black', caucasian: 'white', european: 'white', hispanic: 'latin', indian: 'indian', japanese: 'japanese', latin: 'latin', latina: 'latina', latino: 'latino', white: 'white', }; function getBoolean(value) { if (typeof value === 'boolean') { return value; } if (typeof value === 'string') { if (/yes/i.test(value)) { return true; } if (/no/i.test(value)) { return true; } } return null; } function getMostFrequent(items) { const { mostFrequent } = items.reduce((acc, item) => { if (item === undefined || item === null) { return acc; } const slug = slugify(item); acc.counts[slug] = (acc.counts[slug] || 0) + 1; if (!acc.mostFrequent || acc.counts[slug] > acc.counts[slugify(acc.mostFrequent)]) { acc.mostFrequent = item; } return acc; }, { counts: {}, mostFrequent: null, }); return mostFrequent; } function getMostFrequentDate(dates) { const year = getMostFrequent(dates.map((dateX) => dateX.getFullYear())); const month = getMostFrequent(dates.map((dateX) => dateX.getMonth())); const date = getMostFrequent(dates.map((dateX) => dateX.getDate())); if (year === null || month === null || date === null) { return null; } return moment({ year, month, date }).toDate(); } function getHighest(items) { return items.reduce((prevItem, item) => (item > prevItem ? item : prevItem), null); } function getLongest(items) { return items.sort((itemA, itemB) => itemB.length - itemA.length)[0] || null; } function getAverage(items) { return Math.round(items.reduce((acc, item) => acc + item, 0) / items.length) || null; } function toBaseActors(actorsOrNames, release) { if (!actorsOrNames) { return []; } const baseActors = actorsOrNames .filter((actorOrName) => actorOrName && (typeof actorOrName === 'string' || actorOrName.name)) .map((actorOrName) => { const [baseName, entryId] = (actorOrName.name || actorOrName).split(':'); const name = capitalize(baseName); const slug = slugify(name); // using top level parent widens the scope too much, e.g. different Gamma sites may not use the same actor database // const entity = getRecursiveParent(release?.entity); const entity = (release?.entity?.indepdendent && release?.entity) || release?.entity?.parent || release?.entity || null; const baseActor = { name, slug, entryId: (entity && (entryId || actorOrName.entryId)) || null, entity, hasProfile: !!actorOrName.name, // actor contains profile information }; if (actorOrName.name) { return { ...actorOrName, ...baseActor, }; } return baseActor; }); return baseActors; } function getCollisionLikely(actor) { // actor with single name return actor.name.match(/\w+/g).length === 1; } function curateActor(actor, withDetails = false, isProfile = false) { if (!actor) { return null; } const curatedActor = { id: actor.id, name: actor.name, slug: actor.slug, url: actor.url, gender: actor.gender, orientation: actor.orientation, entityId: actor.entity_id, aliasFor: actor.alias_for, dateOfBirth: actor.date_of_birth, age: actor.age, birthCountry: actor.birth_country_alpha2, ...(withDetails && { alias: actor.alias && { id: actor.alias.id, name: actor.alias.name, slug: actor.slug, gender: actor.alias.gender, }, entity: actor.entity && { id: actor.entity.id, name: actor.entity.name, slug: actor.entity.slug, }, dateOfDeath: actor.date_of_death, cup: actor.cup, bust: actor.bust, waist: actor.waist, hip: actor.hip, naturalBoobs: actor.natural_boobs, penisLength: actor.penis_length, penisGirth: actor.penis_girth, circumcised: actor.circumcised, height: actor.height, weight: actor.weight, eyes: actor.eyes, hairColor: actor.hair_color, hasTattoos: actor.has_tattoos, hasPiercings: actor.has_piercings, tattoos: actor.tattoos, piercings: actor.piercings, ...(isProfile && { description: actor.description }), placeOfBirth: actor.birth_country && { country: { alpha2: actor.birth_country.alpha2, name: actor.birth_country.name, alias: actor.birth_country.alias, }, state: actor.birth_state, city: actor.birth_city, }, placeOfResidence: actor.residence_country && { country: { alpha2: actor.residence_country.alpha2, name: actor.residence_country.name, alias: actor.residence_country.alias, }, state: actor.residence_state, city: actor.residence_city, }, avatar: actor.avatar && { id: actor.avatar.id, path: actor.avatar.path, width: actor.avatar.width, height: actor.avatar.height, size: actor.avatar.size, source: actor.avatar.source, }, ...(actor.profiles && { profiles: actor.profiles?.map((profile) => curateActor(profile, true, true)) }), }), }; return curatedActor; } function curateActorEntry(baseActor, batchId) { const collisionLikely = getCollisionLikely(baseActor); return { name: baseActor.name, slug: baseActor.slug, entity_id: (collisionLikely && baseActor.entity?.id) || null, entry_id: (collisionLikely && baseActor.entryId) || null, batch_id: batchId, }; } function curateActorEntries(baseActors, batchId) { return baseActors.map((baseActor) => curateActorEntry(baseActor, batchId)); } function curateProfileEntry(profile) { if (!profile.id) { return null; } const curatedProfileEntry = { ...(profile.update !== false && { id: profile.update }), actor_id: profile.id, entity_id: profile.entity?.id || null, date_of_birth: profile.dateOfBirth, date_of_death: profile.dateOfDeath, age: profile.age, url: profile.url, gender: profile.gender, orientation: profile.orientation, ethnicity: profile.ethnicity, description: profile.description, description_hash: profile.descriptionHash, birth_city: profile.placeOfBirth?.city || null, birth_state: profile.placeOfBirth?.state || null, birth_country_alpha2: profile.placeOfBirth?.country || null, residence_city: profile.placeOfResidence?.city || null, residence_state: profile.placeOfResidence?.state || null, residence_country_alpha2: profile.placeOfResidence?.country || null, cup: profile.cup, bust: profile.bust, waist: profile.waist, hip: profile.hip, penis_length: profile.penisLength, penis_girth: profile.penisGirth, circumcised: profile.circumcised, natural_boobs: profile.naturalBoobs, height: profile.height, weight: profile.weight, hair_color: profile.hairColor, eyes: profile.eyes, has_tattoos: profile.hasTattoos, has_piercings: profile.hasPiercings, piercings: profile.piercings, tattoos: profile.tattoos, avatar_media_id: profile.avatarMediaId || null, }; return curatedProfileEntry; } async function curateProfile(profile, actor) { if (!profile) { return null; } try { const curatedProfile = { id: profile.id, name: profile.name, url: profile.url, avatar: profile.avatar, scraper: profile.scraper, entity: profile.entity, update: profile.update, }; curatedProfile.description = domPurify.sanitize(profile.description?.replace(/\s+/g, ' '), { ALLOWED_TAGS: [] }).trim() || null; const hasher = curatedProfile.description && blake2 .createHash('blake2b', { digestLength: 24 }) .update(Buffer.from(slugify(curatedProfile.description))); curatedProfile.descriptionHash = curatedProfile.description && hasher.digest('hex'); curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null; curatedProfile.hairColor = hairColors[(profile.hairColor || profile.hair)?.toLowerCase().replace('hair', '').trim()] || null; curatedProfile.eyes = eyeColors[profile.eyes?.trim().toLowerCase()] || null; curatedProfile.tattoos = profile.tattoos?.trim() || null; curatedProfile.piercings = profile.piercings?.trim() || null; curatedProfile.gender = (/female/i.test(profile.gender) && 'female') || (/shemale|trans/i.test(profile.gender) && 'transsexual') || (/male/i.test(profile.gender) && 'male') || null; curatedProfile.orientation = orientations[profile.orientation?.trim()] || null; const dateOfBirth = profile.dateOfBirth || profile.birthdate; curatedProfile.dateOfBirth = (!Number.isNaN(Number(dateOfBirth)) // possibly valid date && new Date() - dateOfBirth > 567648000000 // over 18 && dateOfBirth) || null; curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath; curatedProfile.age = Number(profile.age) || null; curatedProfile.height = Number(profile.height) || profile.height?.match?.(/\d+/)?.[0] || null; curatedProfile.weight = Number(profile.weight) || profile.weight?.match?.(/\d+/)?.[0] || null; // separate measurement values curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null; curatedProfile.bust = Number(profile.bust) || profile.bust?.match?.(/\d+/)?.[0] || null; curatedProfile.waist = Number(profile.waist) || profile.waist?.match?.(/\d+/)?.[0] || null; curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null; // combined measurement value const measurements = profile.measurements?.match(/(\d+)(\w+)\s*[-x]\s*(\d+)\s*[-x]\s*(\d+)/); // ExCoGi uses x, Jules Jordan has spaces between the dashes if (measurements) { curatedProfile.bust = Number(measurements[1]); curatedProfile.cup = measurements[2]; curatedProfile.waist = Number(measurements[3]); curatedProfile.hip = Number(measurements[4]); } curatedProfile.penisLength = Number(profile.penisLength) || profile.penisLength?.match?.(/\d+/)?.[0] || null; curatedProfile.penisGirth = Number(profile.penisGirth) || profile.penisGirth?.match?.(/\d+/)?.[0] || null; curatedProfile.circumcised = getBoolean(profile.circumcised); curatedProfile.naturalBoobs = getBoolean(profile.naturalBoobs); curatedProfile.hasTattoos = getBoolean(profile.hasTattoos); curatedProfile.hasPiercings = getBoolean(profile.hasPiercings); if (argv.resolvePlace) { const [placeOfBirth, placeOfResidence] = await Promise.all([ resolvePlace(profile.birthPlace), resolvePlace(profile.residencePlace), ]); curatedProfile.placeOfBirth = placeOfBirth; curatedProfile.placeOfResidence = placeOfResidence; } if (!curatedProfile.placeOfBirth && curatedProfile.nationality) { const country = await knex('countries') .where('nationality', 'ilike', `%${curatedProfile.nationality}%`) .orWhere('alpha3', 'ilike', `%${curatedProfile.nationality}%`) .orWhere('alpha2', 'ilike', `%${curatedProfile.nationality}%`) .orderBy('priority', 'desc') .first(); if (country) { curatedProfile.placeOfBirth = { country: country.alpha2, }; } } curatedProfile.social = [].concat(profile.social).map((social) => { if (!social) { return null; } try { const { origin, pathname } = new URL(social); return `${origin}${pathname}`; } catch (error) { logger.warn(`Profile scraper for '${profile.entity.name}' returned invalid social link: ${social}`); return null; } }).filter(Boolean); curatedProfile.scenes = toBaseReleases(profile.scenes || profile.releases, profile.entity, actor) // attach actor to base scene, in case it was not scraped .map((scene) => { if (actor && !scene.actors?.find((sceneActor) => slugify(sceneActor) === actor.slug || slugify(sceneActor.name) === actor.slug)) { return { ...scene, actors: [actor, ...(scene.actors || [])], }; } return scene; }); if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`); if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`); if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`); return curatedProfile; } catch (error) { logger.error(`Failed to curate '${profile.name}': ${error.message}`); return null; } } async function fetchProfiles(actorIdsOrNames) { return knex('actors_profiles') .select(knex.raw('actors_profiles.*, row_to_json(actors) as actor, row_to_json(media) as avatar')) .leftJoin('actors', 'actors.id', 'actors_profiles.actor_id') .modify((query) => { if (actorIdsOrNames) { query .whereIn('actor_id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number')) .orWhere((builder) => { builder .whereIn('actors.name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string')) .whereNull('actors.entity_id'); }); } }) .leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id'); } async function interpolateProfiles(actorIdsOrNames) { const profiles = await fetchProfiles(actorIdsOrNames); const profilesByActorId = profiles.reduce((acc, profile) => ({ ...acc, [profile.actor_id]: [ ...(acc[profile.actor_id] || []), profile, ], }), {}); logger.info(`Interpolating ${profiles.length} profiles from ${Object.keys(profilesByActorId).length} actors`); const interpolatedProfiles = Object.entries(profilesByActorId).map(([actorId, actorProfiles]) => { // group values from each profile const valuesByProperty = actorProfiles.reduce((acc, profile) => Object .entries(profile) .reduce((profileAcc, [property, value]) => ({ ...profileAcc, [property]: [ ...(acc[property] || []), ...(value === null ? [] : Array.from({ length: profile.priority }, () => value)), // multiply by priority, increasing the odds of being the most frequent value ], }), { // bundle location values so they can be assessed together, to ensure the most frequent city is in the most frequent state is in most frequent country origin: [...acc.origin || [], { ...(profile.birth_country_alpha2 && { country: profile.birth_country_alpha2 }), ...(profile.birth_state && { state: profile.birth_state }), ...(profile.birth_city && { city: profile.birth_city }), }].filter((location) => Object.keys(location).length > 0), residence: [...acc.residence || [], { ...(profile.residence_country_alpha2 && { country: profile.residence_country_alpha2 }), ...(profile.residence_state && { state: profile.residence_state }), ...(profile.residence_city && { city: profile.residence_city }), }].filter((location) => Object.keys(location).length > 0), }), {}); const mostFrequentValues = [ 'gender', 'orientation', 'ethnicity', 'cup', 'bust', 'waist', 'hip', 'penis_length', 'penis_girth', 'circumcised', 'hair_color', 'eyes', 'has_tattoos', 'has_piercings', ].reduce((acc, property) => ({ ...acc, [property]: getMostFrequent(valuesByProperty[property]), }), {}); const profile = { id: actorId, ...mostFrequentValues, }; profile.height = getMostFrequent(valuesByProperty.height.filter((height) => height > 50 && height < 300)); // remove unlikely values profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth); profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death); profile.age = getHighest(valuesByProperty.age); profile.natural_boobs = profile.gender === 'male' ? null : getMostFrequent(valuesByProperty.natural_boobs); // ensure most frequent country, city and state match up profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.origin.map((location) => location.country)); const remainingOriginCountries = valuesByProperty.origin.filter((location) => location.country === profile.birth_country_alpha2); profile.birth_state = getMostFrequent(remainingOriginCountries.map((location) => location.state)); const remainingOriginStates = remainingOriginCountries.filter((location) => !profile.birth_state || location.state === profile.birth_state); profile.birth_city = getMostFrequent(remainingOriginStates.map((location) => location.city)); profile.residence_country_alpha2 = getMostFrequent(valuesByProperty.residence.map((location) => location.country)); const remainingResidenceCountries = valuesByProperty.residence.filter((location) => location.country === profile.residence_country_alpha2); profile.residence_state = getMostFrequent(remainingResidenceCountries.map((location) => location.state)); const remainingResidenceStates = remainingResidenceCountries.filter((location) => !profile.residence_state || location.state === profile.residence_state); profile.residence_city = getMostFrequent(remainingResidenceStates.map((location) => location.city)); profile.weight = getAverage(valuesByProperty.weight); profile.tattoos = getLongest(valuesByProperty.tattoos); profile.piercings = getLongest(valuesByProperty.piercings); profile.avatar_media_id = actorProfiles .map((actorProfile) => actorProfile.avatar) .filter((avatar) => avatar && (avatar.entropy === null || avatar.entropy > 5.5)) .sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null; if (!profile.avatar_media_id) { // try to settle for low quality avatar profile.avatar_media_id = actorProfiles .map((actorProfile) => actorProfile.avatar) .filter((avatar) => avatar) .sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null; } return profile; }); const transaction = await knex.transaction(); // clear existing interpolated data const emptyProfile = Object .keys(omit(curateProfileEntry({ id: 1 }), ['id', 'actor_id', 'entity_id', 'url', 'description_hash'])) .reduce((acc, key) => ({ ...acc, [key]: null }), {}); await knex('actors') .modify((modifyBuilder) => { if (actorIdsOrNames) { modifyBuilder .whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number')) .orWhere((whereBuilder) => { whereBuilder .whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string')) .whereNull('entity_id'); }); } }) .update(emptyProfile) .transacting(transaction); // insert new interpolated data const queries = interpolatedProfiles.map((profile) => knex('actors') .where('id', profile.id) .update(profile) .transacting(transaction)); await Promise.all(queries) .then(transaction.commit) .catch(transaction.rollback); } async function upsertProfiles(profiles) { const newProfileEntries = profiles.filter((profile) => !profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean); const updatingProfileEntries = profiles.filter((profile) => profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean); if (newProfileEntries.length > 0) { await bulkInsert('actors_profiles', newProfileEntries); logger.info(`Saved ${newProfileEntries.length} actor profiles`); } if (argv.force && updatingProfileEntries.length > 0) { const transaction = await knex.transaction(); const queries = updatingProfileEntries.map((profileEntry) => knex('actors_profiles') .where('id', profileEntry.id) .update(profileEntry) .returning(['id', 'actor_id']) .transacting(transaction)); await Promise.all(queries) .then(transaction.commit) .catch(transaction.rollback); logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`); } } async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId) { const validSources = actor.entity ? sources.filter((source) => source === actor.entity.slug) : sources; const profiles = Promise.map(validSources, async (source) => { try { // config may group sources to try until success return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => { try { const entity = entitiesBySlug[scraperSlug] || null; const scraper = scrapers[scraperSlug]; const layoutScraper = resolveLayoutScraper(entity, scraper); if (!layoutScraper?.fetchProfile) { logger.warn(`No profile profile scraper available for ${scraperSlug}`); throw new Error(`No profile profile scraper available for ${scraperSlug}`); } const context = { ...entity, // legacy site: entity, channel: entity, network: entity?.parent, entity, include, scraper: scraperSlug, parameters: getRecursiveParameters(entity), }; const label = context.entity?.name; if (!context.entity) { logger.warn(`No entity found for ${scraperSlug}`); throw new Error(`No entity found for ${scraperSlug}`); } const existingProfile = existingProfilesByActorEntityId[actor.id]?.[context.entity?.id || null]; if (existingProfile && !argv.force) { logger.verbose(`Found existing profile for '${actor.name}' on '${label}', use --force to scrape again`); return null; } logger.verbose(`Searching profile for '${actor.name}' on '${label}'`); const profile = await layoutScraper.fetchProfile(curateActor({ ...existingProfile, ...actor, }), context, include); if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure logger.verbose(`Profile for '${actor.name}' not available on ${label}, scraper returned ${profile}`); throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${label}`), { code: 'PROFILE_NOT_AVAILABLE' }); } logger.verbose(`Found profile for '${actor.name}' on '${label}'`); return await curateProfile({ ...actor, ...profile, entity, update: existingProfile?.id || false, }, actor); } catch (error) { if (error.code !== 'PROFILE_NOT_AVAILABLE') { logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`); } // throw error to try next source throw error; } }), Promise.reject(new Error())); } catch (error) { console.log(error); if (error.code !== 'PROFILE_NOT_AVAILABLE') { logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`); } } return null; }); return profiles.filter(Boolean); } async function associateSocials(profiles) { const profileEntries = await knex('actors_profiles').whereIn(['actor_id', 'entity_id'], profiles.map((profile) => [profile.id, profile.entity.id])); const profileEntriesByActorIdAndEntityId = profileEntries.reduce((acc, profileEntry) => { if (!acc[profileEntry.actor_id]) { acc[profileEntry.actor_id] = {}; } acc[profileEntry.actor_id][profileEntry.entity_id] = profileEntry.id; return acc; }, {}); profiles.reduce(async (chain, profile) => { await chain; if (!Array.isArray(profile.social) || profile.social.length === 0) { return; } const profileId = profileEntriesByActorIdAndEntityId[profile.id]?.[profile.entity.id]; if (!profileId) { return; } await knex('actors_social') .insert(profile.social.map((url) => ({ url, platform: new URL(url).hostname.match(/([\w-]+)?\.(\w+)$/)?.[1], actor_id: profile.id, profile_id: profileId, }))) .onConflict() .ignore(); }, Promise.resolve()); } async function getActorNames(actorNames) { if (actorNames.length > 0) { return actorNames; } const actorsWithoutProfiles = await knex.raw(` SELECT actors.name FROM actors WHERE NOT EXISTS ( SELECT * FROM actors_profiles WHERE actors_profiles.actor_id = actors.id AND actors_profiles.updated_at <= (?) ) `, [argv.actorsUpdate || new Date()]); return actorsWithoutProfiles.rows.map((actor) => actor.name); } async function storeProfiles(profiles) { const profilesWithAvatarIds = await associateAvatars(profiles); const actorIds = Array.from(new Set(profiles.map((profile) => profile.id))); await associateSocials(profiles); await upsertProfiles(profilesWithAvatarIds); await interpolateProfiles(actorIds); } async function scrapeActors(argNames) { const actorNames = await getActorNames(argNames); const baseActors = toBaseActors(actorNames); logger.info(`Scraping profiles for ${actorNames.length} actors`); const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors); const entitySlugs = sources.flat(); const [entitiesBySlug, existingActorEntries] = await Promise.all([ fetchEntitiesBySlug(entitySlugs, 'desc'), knex('actors') .select(knex.raw('actors.id, actors.name, actors.slug, actors.entry_id, actors.entity_id, row_to_json(entities) as entity')) .whereIn('actors.slug', baseActors.map((baseActor) => baseActor.slug)) .whereNull('actors.alias_for') .leftJoin('entities', 'entities.id', 'actors.entity_id') .groupBy('actors.id', 'entities.id'), ]); const existingActorEntriesBySlugAndEntryId = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: { ...acc[actorEntry.slug], [actorEntry.entryId || null]: actorEntry, }, }), {}); const newBaseActors = baseActors.filter((baseActor) => !existingActorEntriesBySlugAndEntryId[baseActor.slug]?.[baseActor.entryId]); const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null]; const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId); // TODO: associate entity when entry ID is provided const newActorEntries = batchId && await bulkInsert('actors', curatedActorEntries); const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []); const existingProfiles = await knex('actors_profiles') .select(knex.raw('actors_profiles.*, row_to_json(avatars) as avatar')) .whereIn('actor_id', actors.map((actor) => actor.id)) .leftJoin('media as avatars', 'avatars.id', 'actors_profiles.avatar_media_id'); const existingProfilesByActorEntityId = existingProfiles.reduce((acc, profile) => ({ ...acc, [profile.actor_id]: { ...acc[profile.actor_id], [profile.entity_id]: profile, }, }), {}); const profilesPerActor = await Promise.map( actors, async (actor) => scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesByActorEntityId), { concurrency: 10 }, ); const profiles = profilesPerActor.flat().filter(Boolean); logger.info(`Scraped ${profiles.length} profiles`); if (argv.report) { console.log(util.inspect(profiles, { depth: Infinity, colors: true })); } if (argv.save) { await storeProfiles(profiles); } return profiles; } async function getOrCreateActors(baseActors, batchId) { // WHERE IN causes stack depth error and performance issues with a large amount of values, no knex VALUES helper available const actorValues = baseActors.map((actor) => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { slug: actor.slug, entityId: actor.entity.id, entryId: actor.entryId, collisionLikely: getCollisionLikely(actor), })).join(', '); const existingActors = await knex .select('actors.*') .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) .whereRaw(` actors.slug = base_actors.slug AND actors.entity_id IS NULL AND NOT base_actors.collision_likely `) .orWhereRaw(` actors.slug = base_actors.slug AND actors.entity_id = base_actors.entity_id AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL) OR actors.entry_id = base_actors.entry_id) `); // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); const existingActorSlugs = existingActors.reduce((acc, actor) => ({ ...acc, [actor.entity_id]: { ...acc[actor.entity_id], [actor.entry_id]: { ...acc[actor.entity_id]?.[actor.entry_id], [actor.slug]: true, }, }, }), {}); const uniqueBaseActors = baseActors.filter((baseActor) => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); const newActors = await bulkInsert('actors', curatedActorEntries); const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ ...acc, [actor.entity_id]: { ...acc[actor.entity_id], [actor.entry_id]: { ...acc[actor.entity_id]?.[actor.entry_id], [actor.slug]: actor.id, }, }, }), {}); const newActorProfiles = await Promise.all(baseActors .filter((actor) => actor.hasProfile) .map((actor) => ({ ...actor, id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], })) .filter((actor) => !!actor.id) .map((actor) => curateProfile(actor))); await storeProfiles(newActorProfiles); if (Array.isArray(newActors)) { return newActors.concat(existingActors); } return existingActors; } async function associatePeople(releases, batchId, type = 'actor') { try { const baseActorsByReleaseId = releases.reduce((acc, release) => { if (type === 'actors' && release.actors) { acc[release.id] = toBaseActors(release.actors, release); } if (type === 'directors' && (release.director || release.directors)) { acc[release.id] = toBaseActors([].concat(release.director || release.directors).filter(Boolean), release); } return acc; }, {}); const baseActors = Object.values(baseActorsByReleaseId).flat(); if (baseActors.length === 0) { return []; } const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({ ...acc, [baseActor.slug]: baseActor, }), {}); const uniqueBaseActors = Object.values(baseActorsBySlug); const actors = await getOrCreateActors(uniqueBaseActors, batchId); const personKey = ({ actors: 'actor_id', directors: 'director_id', })[type]; const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({ ...acc, [actor.entity_id]: { ...acc[actor.entity_id], [actor.entry_id]: { ...acc[actor.entity_id]?.[actor.entry_id], [actor.slug]: { [personKey]: actor.alias_for || actor.id, alias_id: actor.alias_for ? actor.id : null, }, }, }, }), {}); const releaseActorAssociations = Object.entries(baseActorsByReleaseId) .map(([releaseId, releaseActors]) => releaseActors .map((releaseActor) => ({ release_id: releaseId, ...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]), }))) .flat(); const validReleaseActorAssociations = releaseActorAssociations.filter((association) => association.release_id && association[personKey]); if (releaseActorAssociations.length > validReleaseActorAssociations.length) { const invalidReleaseActorAssociations = releaseActorAssociations.filter((association) => !association.release_id || !association[personKey]); logger.error(invalidReleaseActorAssociations); } await bulkInsert(`releases_${type}`, validReleaseActorAssociations, false); logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`); return actors; } catch (error) { logger.error(`Failed to associate actors: ${error.message}`); return []; } } async function associateActors(releases, batchId) { return associatePeople(releases, batchId, 'actors'); } async function associateDirectors(releases, batchId) { return associatePeople(releases, batchId, 'directors'); } async function fetchActor(actorId) { const actor = await knex('actors') .select(knex.raw(` actors.*, row_to_json(entities) as entity, row_to_json(actor_alias) as alias, row_to_json(birth_country) as birth_country, row_to_json(residence_country) as residence_country, row_to_json(media) as avatar, json_agg(actors_profiles) as profiles `)) .modify((queryBuilder) => { if (Number.isNaN(Number(actorId))) { queryBuilder.where('actors.slug', actorId); return; } queryBuilder.where('actors.id', actorId); }) .leftJoin('actors as actor_alias', 'actor_alias.id', 'actors.alias_for') .leftJoin('actors_profiles', 'actors.id', 'actors_profiles.actor_id') .leftJoin('entities', 'entities.id', 'actors.entity_id') .leftJoin('countries as birth_country', 'birth_country.alpha2', 'actors.birth_country_alpha2') .leftJoin('countries as residence_country', 'residence_country.alpha2', 'actors.residence_country_alpha2') .leftJoin('media', 'media.id', 'actors.avatar_media_id') .groupBy('actors.id', 'entities.id', 'actor_alias.id', 'birth_country.alpha2', 'residence_country.alpha2', 'media.id') .first(); return curateActor(actor, true); } async function searchActors(query) { const actors = await knex .select('*') .from(knex.raw('search_actors(?) as actors', [query])) .limit(100); return actors.map((actor) => curateActor(actor)); } async function flushProfiles(actorIdsOrNames) { const profiles = await fetchProfiles(actorIdsOrNames); const actorNames = Array.from(new Set(profiles.map((profile) => profile.actor.name))); const deleteCount = await knex('actors_profiles') .whereIn('id', profiles.map((profile) => profile.id)) .delete(); await interpolateProfiles(actorIdsOrNames); await flushOrphanedMedia(); // don't flush until main avatar is detached by re-interpolating if (actorNames.length > 20) { logger.info(`Removed ${deleteCount} profiles for ${actorNames.length} actors`); return; } if (deleteCount > 0) { logger.info(`Removed ${deleteCount} profiles for ${actorNames.join(', ')}`); return; } logger.info(`Removed ${deleteCount} profiles`); } async function deleteActors(allActorIdsOrNames) { const deleteCounts = await Promise.map(chunk(allActorIdsOrNames), async (actorIdsOrNames) => { const actors = await knex('actors') .whereIn('id', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'number')) .orWhere((builder) => { builder .whereIn('name', actorIdsOrNames.filter((idOrName) => typeof idOrName === 'string')) .whereNull('entity_id'); }); const actorIds = actors.map((actor) => actor.id); const sceneIds = await knex('releases_actors') .select('releases.id') .whereIn('actor_id', actorIds) .leftJoin('releases', 'releases.id', 'releases_actors.release_id') .pluck('id'); const [deletedScenesCount, deletedActorsCount] = await Promise.all([ deleteScenes(sceneIds), knex('actors') .whereIn('id', actorIds) .delete(), ]); return { deletedScenesCount, deletedActorsCount }; }, { concurrency: 10 }); const deletedActorsCount = deleteCounts.reduce((acc, count) => acc + count.deletedActorsCount, 0); const deletedScenesCount = deleteCounts.reduce((acc, count) => acc + count.deletedScenesCount, 0); await flushOrphanedMedia(); logger.info(`Removed ${deletedActorsCount} actors with ${deletedScenesCount} scenes`); return deletedActorsCount; } async function flushActors() { const actorIds = await knex('actors').select('id').pluck('id'); const confirmed = await inquirer.prompt([{ type: 'confirm', name: 'flushActors', message: `You are about to remove ${actorIds.length} actors. Are you sure?`, default: false, }]); if (!confirmed.flushActors) { logger.warn('Confirmation rejected, not flushing actors'); return; } const deleteCount = await deleteActors(actorIds); await flushOrphanedMedia(); logger.info(`Removed ${deleteCount}/${actorIds.length} actors`); } module.exports = { associateActors, associateDirectors, deleteActors, fetchActor, flushActors, flushProfiles, interpolateProfiles, scrapeActors, searchActors, toBaseActors, };