'use strict'; const config = require('config'); const Promise = require('bluebird'); const moment = require('moment'); // const logger = require('./logger')(__filename); const knex = require('./knex'); const scrapers = require('./scrapers/scrapers').actors; const argv = require('./argv'); const include = require('./utils/argv-include')(argv); const logger = require('./logger')(__filename); const { toBaseReleases } = require('./deep'); const { associateAvatars } = require('./media'); const slugify = require('./utils/slugify'); const capitalize = require('./utils/capitalize'); const resolvePlace = require('./utils/resolve-place'); function getMostFrequent(items) { const { mostFrequent } = items.reduce((acc, item) => { acc.counts[item] = (acc.counts[item] || 0) + 1; if (!acc.mostFrequent || acc.counts[item] > acc.counts[acc.mostFrequent]) { acc.mostFrequent = item; } return acc; }, { counts: {}, mostFrequent: null, }); return mostFrequent; } function getMostFrequentDate(dates) { const year = getMostFrequent(dates.map(dateX => dateX.getFullYear())); const month = getMostFrequent(dates.map(dateX => dateX.getMonth())); const date = getMostFrequent(dates.map(dateX => dateX.getDate())); return moment({ year, month, date }).toDate(); } function getLongest(items) { return items.sort((itemA, itemB) => itemB.length - itemA.length)[0] || null; } function getAverage(items) { return Math.round(items.reduce((acc, item) => acc + item, 0) / items.length) || null; } function toBaseActors(actorsOrNames, release) { return actorsOrNames.map((actorOrName) => { const name = capitalize(actorOrName.name || actorOrName); const slug = slugify(name); const baseActor = { name, slug, network: release?.site.network, }; if (actorOrName.name) { return { ...actorOrName, ...baseActor, }; } return baseActor; }); } function curateActorEntry(baseActor, batchId) { return { name: baseActor.name, slug: baseActor.slug, network_id: null, batch_id: batchId, }; } function curateActorEntries(baseActors, batchId) { return baseActors.map(baseActor => curateActorEntry(baseActor, batchId)); } function curateProfileEntry(profile) { const curatedProfileEntry = { actor_id: profile.id, site_id: profile.site?.id || null, network_id: profile.network?.id || null, date_of_birth: profile.dateOfBirth, date_of_death: profile.dateOfDeath, gender: profile.gender, ethnicity: profile.ethnicity, description: profile.description, birth_city: profile.placeOfBirth?.city || null, birth_state: profile.placeOfBirth?.state || null, birth_country_alpha2: profile.placeOfBirth?.country || null, residence_city: profile.placeOfResidence?.city || null, residence_state: profile.placeOfResidence?.state || null, residence_country_alpha2: profile.placeOfResidence?.country || null, cup: profile.cup, bust: profile.bust, waist: profile.waist, hip: profile.hip, natural_boobs: profile.naturalBoobs, height: profile.height, weight: profile.weight, hair: profile.hair, eyes: profile.eyes, has_tattoos: profile.hasTattoos, has_piercings: profile.hasPiercings, piercings: profile.piercings, tattoos: profile.tattoos, avatar_media_id: profile.avatarMediaId || null, }; return curatedProfileEntry; } async function curateProfile(profile) { try { const curatedProfile = { id: profile.id, name: profile.name, avatar: profile.avatar, scraper: profile.scraper, }; curatedProfile.site = profile.site.isNetwork ? null : profile.site; curatedProfile.network = profile.site.isNetwork ? profile.site : null; curatedProfile.description = profile.description?.trim() || null; curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available curatedProfile.ethnicity = profile.ethnicity?.trim() || null; curatedProfile.hair = profile.hair?.trim() || null; curatedProfile.eyes = profile.eyes?.trim() || null; curatedProfile.tattoos = profile.tattoos?.trim() || null; curatedProfile.piercings = profile.piercings?.trim() || null; curatedProfile.gender = (/female/i.test(profile.gender) && 'female') || (/shemale/i.test(profile.gender) && 'transsexual') || (/male/i.test(profile.gender) && 'male') || (/trans/i.test(profile.gender) && 'transsexual') || null; curatedProfile.dateOfBirth = (!Number.isNaN(Number(profile.dateOfBirth || profile.birthdate)) // possibly valid date && new Date() - profile.birthdate > 567648000000 // over 18 && profile.birthdate) || null; curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath; curatedProfile.cup = profile.cup || profile.bust?.match(/[a-zA-Z]+/)?.[0] || null; curatedProfile.bust = Number(profile.bust) || profile.bust?.match(/\d+/)?.[0] || null; curatedProfile.waist = Number(profile.waist) || profile.waist?.match(/\d+/)?.[0] || null; curatedProfile.hip = Number(profile.hip) || profile.hip?.match(/\d+/)?.[0] || null; curatedProfile.height = Number(profile.height) || profile.height?.match(/\d+/)?.[0] || null; curatedProfile.weight = Number(profile.weight) || profile.weight?.match(/\d+/)?.[0] || null; curatedProfile.naturalBoobs = typeof profile.naturalBoobs === 'boolean' ? profile.naturalBoobs : null; curatedProfile.hasTattoos = typeof profile.hasTattoos === 'boolean' ? profile.hasTattoos : null; curatedProfile.hasPiercings = typeof profile.hasPiercings === 'boolean' ? profile.hasPiercings : null; if (argv.resolvePlace) { const [placeOfBirth, placeOfResidence] = await Promise.all([ resolvePlace(profile.birthPlace), resolvePlace(profile.residencePlace), ]); curatedProfile.placeOfBirth = placeOfBirth; curatedProfile.placeOfResidence = placeOfResidence; } if (!curatedProfile.placeOfBirth && curatedProfile.nationality) { const country = await knex('countries') .where('nationality', 'ilike', `%${curatedProfile.nationality}%`) .orderBy('priority', 'desc') .first(); curatedProfile.placeOfBirth = { country: country.alpha2, }; } curatedProfile.social = Array.isArray(profile.social) ? profile.social.map((social) => { try { const { href } = new URL(); return href; } catch (error) { logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`); return null; } }).filter(Boolean) : []; curatedProfile.releases = toBaseReleases(profile.releases); if (argv.inspect) { console.log(curatedProfile); } return curatedProfile; } catch (error) { logger.error(`Failed to curate '${profile.name}': ${error.message}`); return null; } } async function interpolateProfiles(actors) { const profiles = await knex('actors_profiles') .select(['actors_profiles.*', 'media.width as avatar_width', 'media.height as avatar_height', 'media.size as avatar_size']) .whereIn('actor_id', actors.map(actor => actor.id)) .leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id'); const profilesByActorId = profiles.reduce((acc, profile) => ({ ...acc, [profile.actor_id]: [ ...(acc[profile.actor_id] || []), profile, ], }), {}); const interpolatedProfiles = Object.entries(profilesByActorId).map(([actorId, actorProfiles]) => { const valuesByProperty = actorProfiles.reduce((acc, profile) => Object .entries(profile) .reduce((profileAcc, [property, value]) => ({ ...profileAcc, [property]: [ ...(acc[property] || []), ...(value === null ? [] : [value]), ], }), {}), {}); const avatars = actorProfiles.map(profile => profile.avatar_media_id && ({ id: profile.avatar_media_id, width: profile.avatar_width, height: profile.avatar_height, size: profile.avatar_size, })).filter(Boolean); const profile = { id: actorId, }; profile.gender = getMostFrequent(valuesByProperty.gender); profile.ethnicity = getMostFrequent(valuesByProperty.ethnicity.map(ethnicity => ethnicity.toLowerCase())); profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth); profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death); profile.birth_city = getMostFrequent(valuesByProperty.birth_city); profile.birth_state = getMostFrequent(valuesByProperty.birth_state); profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.birth_country_alpha2); profile.residence_city = getMostFrequent(valuesByProperty.residence_city); profile.residence_state = getMostFrequent(valuesByProperty.residence_state); profile.residence_country_alpha2 = getMostFrequent(valuesByProperty.residence_country_alpha2); profile.cup = getMostFrequent(valuesByProperty.cup); profile.bust = getMostFrequent(valuesByProperty.bust); profile.waist = getMostFrequent(valuesByProperty.waist); profile.hip = getMostFrequent(valuesByProperty.hip); profile.natural_boobs = getMostFrequent(valuesByProperty.natural_boobs); profile.hair = getMostFrequent(valuesByProperty.hair.map(hair => hair.toLowerCase())); profile.eyes = getMostFrequent(valuesByProperty.eyes.map(eyes => eyes.toLowerCase())); profile.weight = getAverage(valuesByProperty.weight); profile.height = getMostFrequent(valuesByProperty.height); profile.has_tattoos = getMostFrequent(valuesByProperty.has_tattoos); profile.has_piercings = getMostFrequent(valuesByProperty.has_piercings); profile.tattoos = getLongest(valuesByProperty.tattoos); profile.piercings = getLongest(valuesByProperty.piercings); profile.avatar_media_id = avatars.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0].id; return profile; }); const transaction = await knex.transaction(); const queries = interpolatedProfiles.map(profile => knex('actors') .where('id', profile.id) .update(profile) .transacting(transaction)); await Promise.all(queries) .then(transaction.commit) .catch(transaction.rollback); } async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) { const profiles = Promise.map(sources, async (source) => { try { return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => { const scraper = scrapers[scraperSlug]; const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]; if (!scraper?.fetchProfile) { logger.warn(`No profile profile scraper available for ${scraperSlug}`); throw new Error(`No profile profile scraper available for ${scraperSlug}`); } if (!siteOrNetwork) { logger.warn(`No site or network found for ${scraperSlug}`); throw new Error(`No site or network found for ${scraperSlug}`); } logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`); const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include); if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`); throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' }); } return { ...actor, ...profile, scraper: scraperSlug, site: siteOrNetwork, }; }), Promise.reject(new Error())); } catch (error) { if (error.code !== 'PROFILE_NOT_AVAILABLE') { logger.error(`Failed to fetch profile for '${actor.name}': ${error.message}`); } } return null; }); return profiles.filter(Boolean); } async function upsertProfiles(profiles) { const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile)); const existingProfiles = await knex('actors_profiles') .whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id])) .orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id])); const existingProfilesByActorNetworkSiteIds = existingProfiles.reduce((acc, profile) => ({ ...acc, [profile.actor_id]: { ...acc[profile.actor_id], [profile.network_id]: { ...acc[profile.actor_id]?.[profile.network_id], [profile.site_id]: profile, }, }, }), {}); const { updatingProfileEntries, newProfileEntries } = curatedProfileEntries.reduce((acc, profile) => { const existingProfile = existingProfilesByActorNetworkSiteIds[profile.actor_id]?.[profile.network_id]?.[profile.site_id]; if (existingProfile) { return { ...acc, updatingProfileEntries: [...acc.updatingProfileEntries, { ...profile, id: existingProfile.id, }], }; } return { ...acc, newProfileEntries: [...acc.newProfileEntries, profile], }; }, { updatingProfileEntries: [], newProfileEntries: [], }); if (newProfileEntries.length > 0) { await knex('actors_profiles').insert(newProfileEntries); } if (argv.force && updatingProfileEntries.length > 0) { const transaction = await knex.transaction(); const queries = updatingProfileEntries.map(profileEntry => knex('actors_profiles') .where('id', profileEntry.id) .update(profileEntry) .returning(['id', 'actor_id']) .transacting(transaction)); await Promise.all(queries) .then(transaction.commit) .catch(transaction.rollback); } } async function scrapeActors(actorNames) { const baseActors = toBaseActors(actorNames); const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); const siteSlugs = sources.flat(); const [networks, sites, existingActorEntries] = await Promise.all([ knex('networks').whereIn('slug', siteSlugs), knex('sites').whereIn('slug', siteSlugs), knex('actors') .select(['id', 'name', 'slug']) .whereIn('slug', baseActors.map(baseActor => baseActor.slug)) .whereNull('network_id'), ]); const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {}); const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: { ...network, isNetwork: true } }), {}); const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {}); const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]); const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null]; const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId); const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']); const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []); // TODO: don't fetch existing profiles unless --force is used const profilesPerActor = await Promise.map( actors, async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug), { concurrency: 10 }, ); const profiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile))); const profilesWithAvatarIds = await associateAvatars(profiles); await upsertProfiles(profilesWithAvatarIds); await interpolateProfiles(actors); } async function getOrCreateActors(baseActors, batchId) { const existingActors = await knex('actors') .select('id', 'alias_for', 'name', 'slug', 'network_id') .whereIn('slug', baseActors.map(baseActor => baseActor.slug)) .whereNull('network_id') .orWhereIn(['slug', 'network_id'], baseActors.map(baseActor => [baseActor.slug, baseActor.network.id])); // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); const existingActorSlugs = existingActors.reduce((acc, actor) => ({ ...acc, [actor.network_id]: { ...acc[actor.network_id], [actor.slug]: true, }, }), {}); const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.network.id]?.[baseActor.slug] && !existingActorSlugs.null?.[baseActor.slug]); const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'network_id']); if (Array.isArray(newActors)) { return newActors.concat(existingActors); } return existingActors; } async function associateActors(releases, batchId) { const baseActorsByReleaseId = releases.reduce((acc, release) => { if (release.actors) { acc[release.id] = toBaseActors(release.actors, release); } return acc; }, {}); const baseActors = Object.values(baseActorsByReleaseId).flat(); if (baseActors.length === 0) { return null; } const baseActorsBySlugAndNetworkId = baseActors.reduce((acc, baseActor) => ({ ...acc, [baseActor.slug]: { ...acc[baseActor.slug], [baseActor.network.id]: baseActor, }, }), {}); const uniqueBaseActors = Object.values(baseActorsBySlugAndNetworkId).map(baseActorsByNetworkId => Object.values(baseActorsByNetworkId)).flat(); const actors = await getOrCreateActors(uniqueBaseActors, batchId); const actorIdsBySlugAndNetworkId = actors.reduce((acc, actor) => ({ ...acc, [actor.network_id]: { ...acc[actor.network_id], [actor.slug]: actor.alias_for || actor.id, }, }), {}); const releaseActorAssociations = Object.entries(baseActorsByReleaseId) .map(([releaseId, releaseActors]) => releaseActors .map(releaseActor => ({ release_id: releaseId, actor_id: actorIdsBySlugAndNetworkId[releaseActor.network.id]?.[releaseActor.slug] || actorIdsBySlugAndNetworkId.null[releaseActor.slug], }))) .flat(); await knex.raw(`${knex('releases_actors').insert(releaseActorAssociations).toString()} ON CONFLICT DO NOTHING;`); return actors; } module.exports = { associateActors, scrapeActors, };