'use strict'; const config = require('config'); const Promise = require('bluebird'); const UrlPattern = require('url-pattern'); const moment = require('moment'); const logger = require('./logger')(__filename); const knex = require('./knex'); const argv = require('./argv'); const include = require('./utils/argv-include')(argv); const scrapers = require('./scrapers/scrapers'); const whereOr = require('./utils/where-or'); const resolvePlace = require('./utils/resolve-place'); const slugify = require('./utils/slugify'); const capitalize = require('./utils/capitalize'); const { curateSites } = require('./sites'); const { storeMedia, associateMedia } = require('./media'); async function curateActor(actor) { const [aliases, avatar, photos, social] = await Promise.all([ knex('actors').where({ alias_for: actor.id }), knex('actors_avatars') .where('actor_id', actor.id) .join('media', 'media.id', 'actors_avatars.media_id') .first(), knex('actors_photos') .where('actor_id', actor.id) .join('media', 'media.id', 'actors_photos.media_id') .orderBy('index'), knex('actors_social') .where('actor_id', actor.id) .orderBy('platform', 'desc'), ]); const curatedActor = { id: actor.id, gender: actor.gender, name: actor.name, description: actor.description, birthdate: actor.birthdate && new Date(actor.birthdate), country: actor.country_alpha2, origin: (actor.birth_city || actor.birth_state || actor.birth_country_alpha2) ? {} : null, residence: (actor.residence_city || actor.residence_state || actor.residence_country_alpha2) ? {} : null, ethnicity: actor.ethnicity, height: actor.height, weight: actor.weight, bust: actor.bust, waist: actor.waist, hip: actor.hip, naturalBoobs: actor.natural_boobs, aliases: aliases.map(({ name }) => name), slug: actor.slug, avatar, photos, hasTattoos: actor.has_tattoos, hasPiercings: actor.has_piercings, tattoos: actor.tattoos, piercings: actor.piercings, social, scrapedAt: actor.scraped_at, }; if (curatedActor.birthdate) { curatedActor.age = moment().diff(curatedActor.birthdate, 'years'); } if (actor.birth_city) curatedActor.origin.city = actor.birth_city; if (actor.birth_state) curatedActor.origin.state = actor.birth_state; if (actor.birth_country_alpha2) { curatedActor.origin.country = { alpha2: actor.birth_country_alpha2, name: actor.birth_country_name, alias: actor.birth_country_alias, }; } if (actor.residence_city) curatedActor.residence.city = actor.residence_city; if (actor.residence_state) curatedActor.residence.state = actor.residence_state; if (actor.residence_country_alpha2) { curatedActor.residence.country = { alpha2: actor.residence_country_alpha2, name: actor.residence_country_name, alias: actor.residence_country_alias, }; } return curatedActor; } function curateActors(releases) { return Promise.all(releases.map(async release => curateActor(release))); } function curateActorEntry(actor, scraped, scrapeSuccess) { const curatedActor = { name: capitalize(actor.name), slug: slugify(actor.name), birthdate: actor.birthdate, description: actor.description, gender: actor.gender, ethnicity: actor.ethnicity, bust: actor.bust, waist: actor.waist, hip: actor.hip, natural_boobs: actor.naturalBoobs, height: actor.height, weight: actor.weight, hair: actor.hair, eyes: actor.eyes, has_tattoos: actor.hasTattoos, has_piercings: actor.hasPiercings, tattoos: actor.tattoos, piercings: actor.piercings, }; if (actor.id) { curatedActor.id = actor.id; } if (actor.birthPlace) { curatedActor.birth_city = actor.birthPlace.city; curatedActor.birth_state = actor.birthPlace.state; curatedActor.birth_country_alpha2 = actor.birthPlace.country; } if (actor.residencePlace) { curatedActor.residence_city = actor.residencePlace.city; curatedActor.residence_state = actor.residencePlace.state; curatedActor.residence_country_alpha2 = actor.residencePlace.country; } if (scraped) { curatedActor.scraped_at = new Date(); curatedActor.scrape_success = scrapeSuccess; } return curatedActor; } function curateSocialEntry(url, actorId) { const platforms = [ // links supplied by PH often look like domain.com/domain.com/username { label: 'twitter', pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)', format: username => `https://www.twitter.com/${username}`, }, { label: 'youtube', pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)', format: username => `https://www.youtube.com/channel/${username}`, }, { label: 'instagram', pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)', format: username => `https://www.instagram.com/${username}`, }, { label: 'snapchat', pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)', format: username => `https://www.snapchat.com/add/${username}`, }, { label: 'tumblr', pattern: 'http(s)\\://:username.tumblr.com(*)', format: username => `https://${username}.tumblr.com`, }, { label: 'onlyfans', pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)', format: username => `https://www.onlyfans.com/${username}`, }, { label: 'fancentro', pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)', format: username => `https://www.fancentro.com/${username}`, }, { label: 'modelhub', pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)', format: username => `https://www.modelhub.com/${username}`, }, ]; const match = platforms.reduce((acc, platform) => { if (acc) return acc; const patternMatch = new UrlPattern(platform.pattern).match(url); if (patternMatch) { return { platform: platform.label, original: url, username: patternMatch.username, url: platform.format ? platform.format(patternMatch.username) : url, }; } return null; }, null) || { url }; return { url: match.url, platform: match.platform, actor_id: actorId, }; } async function curateSocialEntries(urls, actorId) { if (!urls) { return []; } const existingSocialLinks = await knex('actors_social').where('actor_id', actorId); return urls.reduce((acc, url) => { const socialEntry = curateSocialEntry(url, actorId); if (acc.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase()) || existingSocialLinks.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase())) { // prevent duplicates return acc; } return [...acc, socialEntry]; }, []); } async function fetchActors(queryObject, limit = 100) { const releases = await knex('actors') .select( 'actors.*', 'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias', 'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name', 'residence_countries.alias as residence_country_alias', ) .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2') .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2') .orderBy(['actors.name', 'actors.gender']) .where(builder => whereOr(queryObject, 'actors', builder)) .limit(limit); return curateActors(releases); } async function storeSocialLinks(urls, actorId) { const curatedSocialEntries = await curateSocialEntries(urls, actorId); await knex('actors_social').insert(curatedSocialEntries); } async function storeAvatars(avatars, actorId) { if (!avatars || avatars.length === 0) { return []; } const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar'); await associateMedia({ [actorId]: avatars }, avatarsBySource, 'actor', 'photo', 'avatar'); return avatarsBySource; } async function storeActor(actor, scraped = false, scrapeSuccess = false) { const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); const [actorEntry] = await knex('actors') .insert(curatedActor) .returning('*'); await storeSocialLinks(actor.social, actorEntry.id); if (actor.avatars) { await storeAvatars(actor.avatars, actorEntry.id); } logger.info(`Added new entry for actor '${actor.name}'`); return actorEntry; } async function updateActor(actor, scraped = false, scrapeSuccess = false) { const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); const [actorEntry] = await knex('actors') .where({ id: actor.id }) .update(curatedActor) .returning('*'); await storeSocialLinks(actor.social, actor.id); logger.info(`Updated entry for actor '${actor.name}'`); return actorEntry; } async function mergeProfiles(profiles, actor) { if (profiles.filter(Boolean).length === 0) { return null; } const mergedProfile = profiles.reduce((prevProfile, profile) => { if (profile === null) { return prevProfile; } const accProfile = { id: actor ? actor.id : null, name: actor ? actor.name : (prevProfile.name || profile.name), description: prevProfile.description || profile.description, gender: prevProfile.gender || profile.gender, birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate, birthPlace: prevProfile.birthPlace || profile.birthPlace, residencePlace: prevProfile.residencePlace || profile.residencePlace, nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available ethnicity: prevProfile.ethnicity || profile.ethnicity, bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null), waist: prevProfile.waist || profile.waist, hip: prevProfile.hip || profile.hip, naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs, height: prevProfile.height || profile.height, weight: prevProfile.weight || profile.weight, hair: prevProfile.hair || profile.hair, eyes: prevProfile.eyes || profile.eyes, hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings, hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos, piercings: prevProfile.piercings || profile.piercings, tattoos: prevProfile.tattoos || profile.tattoos, social: prevProfile.social.concat(profile.social || []), releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks }; if (profile.avatar) { const avatar = Array.isArray(profile.avatar) ? profile.avatar.map(avatarX => ({ src: avatarX.src || avatarX, scraper: profile.scraper, copyright: avatarX.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright, })) : { src: profile.avatar.src || profile.avatar, scraper: profile.scraper, copyright: profile.avatar.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright, }; accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks } else { accProfile.avatars = prevProfile.avatars; } return accProfile; }, { social: [], avatars: [], releases: [], }); const [birthPlace, residencePlace] = await Promise.all([ resolvePlace(mergedProfile.birthPlace), resolvePlace(mergedProfile.residencePlace), ]); mergedProfile.birthPlace = birthPlace; mergedProfile.residencePlace = residencePlace; if (!mergedProfile.birthPlace && mergedProfile.nationality) { const country = await knex('countries') .where('nationality', 'ilike', `%${mergedProfile.nationality}%`) .orderBy('priority', 'desc') .first(); mergedProfile.birthPlace = { country: country.alpha2, }; } return mergedProfile; } async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) { return Promise.map(sources, async (source) => { // const [scraperSlug, scraper] = source; const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] })); try { return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => { if (!scraper) { logger.warn(`No profile profile scraper available for ${scraperSlug}`); throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`)); } logger.verbose(`Searching '${actorName}' on ${scraperSlug}`); const site = sitesBySlug[scraperSlug] || null; const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include); if (profile && typeof profile !== 'number') { logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`); return { ...profile, name: actorName, scraper: scraperSlug, site, releases: profile.releases?.map(release => (typeof release === 'string' ? { url: release, site } : { ...release, site: release.site || site } )), }; } logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}: ${profile}`); throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false }); }), Promise.reject(new Error())); } catch (error) { if (error.warn !== false) { logger.warn(`Error in scraper ${source}: ${error.message}`); // logger.error(error.stack); } } return null; }); } async function scrapeActors(actorNames) { return Promise.map(actorNames || argv.actors, async (actorName) => { try { const actorSlug = slugify(actorName); const actorEntry = await knex('actors').where({ slug: actorSlug }).first(); const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested const [siteEntries, networkEntries] = await Promise.all([ knex('sites') .leftJoin('networks', 'sites.network_id', 'networks.id') .select( 'sites.*', 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', ) .whereIn('sites.slug', finalSources.flat()), knex('networks').select('*').whereIn('slug', finalSources.flat()), ]); const sites = await curateSites(siteEntries, true); const networks = networkEntries.map(network => ({ ...network, isFallback: true })); const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {}); const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug); const profile = await mergeProfiles(profiles, actorEntry); if (profile === null) { logger.warn(`Could not find profile for actor '${actorName}'`); if (argv.save && !actorEntry) { await storeActor({ name: actorName }, false, false); } return null; } if (argv.inspect) { console.log(profile); logger.info(`Found ${profile.releases.length} releases for ${actorName}`); } if (argv.save) { if (actorEntry && profile) { await Promise.all([ updateActor(profile, true, true), storeAvatars(profile.avatars, actorEntry.id), ]); return profile; } await storeActor(profile, true, true); } return profile; } catch (error) { console.log(error); logger.warn(`${actorName}: ${error}`); return null; } }, { concurrency: 3, }); } async function scrapeBasicActors() { const basicActors = await knex('actors').where('scraped_at', null); return scrapeActors(basicActors.map(actor => actor.name)); } async function associateActors(mappedActors, releases) { const [existingActorEntries, existingAssociationEntries] = await Promise.all([ knex('actors') .whereIn('name', Object.values(mappedActors).map(actor => actor.name)) .orWhereIn('slug', Object.keys(mappedActors)), knex('releases_actors').whereIn('release_id', releases.map(release => release.id)), ]); const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => { try { const actorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug) || await storeActor(actor); // if a scene return Array.from(actor.releaseIds) .map(releaseId => ({ release_id: releaseId, actor_id: actorEntry.id, })) .filter(association => !existingAssociationEntries // remove associations already in database .some(associationEntry => associationEntry.actor_id === association.actor_id && associationEntry.release_id === association.release_id)); } catch (error) { logger.error(actor.name, error); return null; } }); await knex('releases_actors').insert(associations.filter(association => association).flat()); // basic actor scraping is failure prone, don't run together with actor association // await scrapebasicactors(), } module.exports = { associateActors, fetchActors, scrapeActors, scrapeBasicActors, };