2019-11-10 03:20:22 +00:00
|
|
|
'use strict';
|
|
|
|
|
2020-05-14 02:26:05 +00:00
|
|
|
const config = require('config');
|
|
|
|
const Promise = require('bluebird');
|
|
|
|
|
2020-05-13 00:56:20 +00:00
|
|
|
// const logger = require('./logger')(__filename);
|
2020-03-26 02:32:07 +00:00
|
|
|
const knex = require('./knex');
|
2020-05-15 02:40:59 +00:00
|
|
|
const scrapers = require('./scrapers/scrapers').actors;
|
2020-05-14 02:26:05 +00:00
|
|
|
|
|
|
|
const argv = require('./argv');
|
2020-05-15 02:40:59 +00:00
|
|
|
const include = require('./utils/argv-include')(argv);
|
|
|
|
const logger = require('./logger')(__filename);
|
2020-01-07 03:23:28 +00:00
|
|
|
const slugify = require('./utils/slugify');
|
2020-03-26 02:32:07 +00:00
|
|
|
const capitalize = require('./utils/capitalize');
|
2020-05-15 02:40:59 +00:00
|
|
|
const resolvePlace = require('./utils/resolve-place');
|
|
|
|
|
|
|
|
const { toBaseReleases } = require('./deep');
|
2019-11-10 03:20:22 +00:00
|
|
|
|
2020-03-26 02:32:07 +00:00
|
|
|
/**
 * Normalize a list of scraped actors into base actor objects.
 *
 * Each item may be a plain name or an object with a `name` property. The name
 * is capitalized and slugified, and the network of the release's site is
 * attached when a release is given. For object items, the remaining scraped
 * properties are kept, with the normalized fields taking precedence.
 *
 * @param {Array<string|Object>} actorsOrNames - actor names or actor objects
 * @param {Object} [release] - release the actors were found in
 * @returns {Object[]} base actors with `name`, `slug` and `network`
 */
function toBaseActors(actorsOrNames, release) {
    return actorsOrNames.map((entry) => {
        const isActorObject = Boolean(entry.name);
        const name = capitalize(isActorObject ? entry.name : entry);

        const normalized = {
            name,
            slug: slugify(name),
            network: release?.site.network,
        };

        // normalized fields are spread last so they override the scraped ones
        return isActorObject
            ? { ...entry, ...normalized }
            : normalized;
    });
}
|
|
|
|
|
2020-05-13 00:56:20 +00:00
|
|
|
/**
 * Build the database row for a new actor.
 *
 * Only the name and slug are taken from the base actor; `network_id` is
 * always stored as null here.
 *
 * @param {Object} baseActor - base actor with `name` and `slug`
 * @param {*} batchId - value stored in `batch_id`
 * @returns {Object} row with `name`, `slug`, `network_id` and `batch_id`
 */
function curateActorEntry(baseActor, batchId) {
    const { name, slug } = baseActor;

    return {
        name,
        slug,
        network_id: null,
        batch_id: batchId,
    };
}
|
|
|
|
|
2020-05-13 00:56:20 +00:00
|
|
|
/**
 * Build database rows for a list of base actors, all tagged with the same batch.
 *
 * @param {Object[]} baseActors - base actors to convert
 * @param {*} batchId - value stored in `batch_id` of every row
 * @returns {Object[]} one row per base actor, in input order
 */
function curateActorEntries(baseActors, batchId) {
    const toEntry = (baseActor) => curateActorEntry(baseActor, batchId);

    return baseActors.map(toEntry);
}
|
|
|
|
|
2020-05-15 02:40:59 +00:00
|
|
|
/**
 * Map a curated profile onto the column names used for the `actors_profiles` rows.
 *
 * Site, network and place columns fall back to null when the source value is
 * missing; all other properties are copied over as-is.
 *
 * @param {Object} profile - curated profile, as produced by `curateProfile`
 * @returns {Object} row using snake_case column names
 */
function curateProfileEntry(profile) {
    // curateProfile's nationality fallback stores the country as a plain alpha-2
    // string, so accept both that and an object carrying an `alpha2` property
    const countryAlpha2 = (place) => {
        const country = place?.country;

        if (typeof country === 'string') {
            return country || null;
        }

        return country?.alpha2 || null;
    };

    const curatedProfileEntry = {
        actor_id: profile.id,
        site_id: profile.site?.id || null,
        network_id: profile.network?.id || null,
        date_of_birth: profile.dateOfBirth,
        date_of_death: profile.dateOfDeath,
        gender: profile.gender,
        ethnicity: profile.ethnicity,
        description: profile.description,
        birth_city: profile.placeOfBirth?.city || null,
        birth_state: profile.placeOfBirth?.state || null,
        birth_country_alpha2: countryAlpha2(profile.placeOfBirth),
        residence_city: profile.placeOfResidence?.city || null,
        residence_state: profile.placeOfResidence?.state || null,
        residence_country_alpha2: countryAlpha2(profile.placeOfResidence),
        cup: profile.cup,
        bust: profile.bust,
        waist: profile.waist,
        hip: profile.hip,
        natural_boobs: profile.naturalBoobs,
        height: profile.height,
        weight: profile.weight,
        hair: profile.hair,
        eyes: profile.eyes,
        has_tattoos: profile.hasTattoos,
        has_piercings: profile.hasPiercings,
        piercings: profile.piercings,
        tattoos: profile.tattoos,
    };

    return curatedProfileEntry;
}
|
|
|
|
|
|
|
|
// 18 years of 365 days in milliseconds; a younger date of birth is discarded
const MINIMUM_AGE_MS = 567648000000;

// Normalize a free-form gender string; 'female' and 'shemale' are tested before
// 'male' because both contain it as a substring.
function curateGender(gender) {
    if (/female/i.test(gender)) return 'female';
    if (/shemale/i.test(gender)) return 'transsexual';
    if (/male/i.test(gender)) return 'male';
    if (/trans/i.test(gender)) return 'transsexual';

    return null;
}

// Turn a measurement into a number, taking the first run of digits from strings
// such as '34C' or '170cm'; returns null when no usable number is found.
function curateMeasurement(value) {
    const direct = Number(value);

    if (direct) {
        return direct;
    }

    const digits = typeof value === 'string' ? value.match(/\d+/)?.[0] : null;

    return Number(digits) || null;
}

// Keep only the social links that parse as URLs, in normalized form.
function curateSocialLinks(profile) {
    if (!Array.isArray(profile.social)) {
        return [];
    }

    return profile.social.map((social) => {
        try {
            return new URL(social).href;
        } catch {
            logger.warn(`Profile scraper for '${profile.site.name}' returned invalid social link: ${social}`);
            return null;
        }
    }).filter(Boolean);
}

/**
 * Clean up and normalize a scraped profile.
 *
 * Text fields are trimmed, gender and measurements are normalized, places are
 * resolved through `resolvePlace`, and the nationality is used to look up a
 * country of birth when no place of birth could be resolved.
 *
 * @param {Object} profile - scraped profile, including the `site` it came from
 * @returns {Promise<Object|null>} curated profile, or null when curation failed
 *   (the failure is logged)
 */
async function curateProfile(profile) {
    try {
        const curatedProfile = {
            id: profile.id,
            name: profile.name,
            avatar: profile.avatar,
        };

        // the scraped `site` is either a site or a network flagged with isNetwork
        curatedProfile.site = profile.site.isNetwork ? null : profile.site;
        curatedProfile.network = profile.site.isNetwork ? profile.site : null;

        curatedProfile.description = profile.description?.trim() || null;
        curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available
        curatedProfile.ethnicity = profile.ethnicity?.trim() || null;
        curatedProfile.hair = profile.hair?.trim() || null;
        curatedProfile.eyes = profile.eyes?.trim() || null;
        curatedProfile.tattoos = profile.tattoos?.trim() || null;
        curatedProfile.piercings = profile.piercings?.trim() || null;

        curatedProfile.gender = curateGender(profile.gender);

        // validate and return the same value, whichever property the scraper used
        const dateOfBirth = profile.dateOfBirth || profile.birthdate;

        curatedProfile.dateOfBirth = (!Number.isNaN(Number(dateOfBirth)) // possibly valid date
            && new Date() - dateOfBirth > MINIMUM_AGE_MS // over 18
            && dateOfBirth)
            || null;

        curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath;

        // the cup size may be embedded in the bust, e.g. '34C'; a numeric bust has none
        curatedProfile.cup = profile.cup
            || (typeof profile.bust === 'string' ? profile.bust.match(/[a-zA-Z]+/)?.[0] : null)
            || null;

        curatedProfile.bust = curateMeasurement(profile.bust);
        curatedProfile.waist = curateMeasurement(profile.waist);
        curatedProfile.hip = curateMeasurement(profile.hip);
        curatedProfile.height = curateMeasurement(profile.height);
        curatedProfile.weight = curateMeasurement(profile.weight);

        curatedProfile.naturalBoobs = typeof profile.naturalBoobs === 'boolean' ? profile.naturalBoobs : null;
        curatedProfile.hasTattoos = typeof profile.hasTattoos === 'boolean' ? profile.hasTattoos : null;
        curatedProfile.hasPiercings = typeof profile.hasPiercings === 'boolean' ? profile.hasPiercings : null;

        const [placeOfBirth, placeOfResidence] = await Promise.all([
            resolvePlace(profile.birthPlace),
            resolvePlace(profile.residencePlace),
        ]);

        curatedProfile.placeOfBirth = placeOfBirth;
        curatedProfile.placeOfResidence = placeOfResidence;

        if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
            const country = await knex('countries')
                .where('nationality', 'ilike', `%${curatedProfile.nationality}%`)
                .orderBy('priority', 'desc')
                .first();

            // an unknown nationality should not fail the whole profile
            if (country) {
                curatedProfile.placeOfBirth = {
                    country: country.alpha2,
                };
            }
        }

        curatedProfile.social = curateSocialLinks(profile);
        curatedProfile.releases = toBaseReleases(profile.releases);

        return curatedProfile;
    } catch (error) {
        logger.error(`Failed to curate '${profile.name}': ${error.message}`);

        return null;
    }
}
|
|
|
|
|
|
|
|
// Fetch an actor's profile from a single scraper. Throws when the scraper or
// its site/network is unknown, or when the scraper returned no profile.
async function scrapeProfileFromSource(actor, scraperSlug, networksBySlug, sitesBySlug) {
    const scraper = scrapers[scraperSlug];
    const siteOrNetwork = networksBySlug[scraperSlug] || sitesBySlug[scraperSlug];

    if (!scraper?.fetchProfile) {
        logger.warn(`No profile scraper available for ${scraperSlug}`);
        throw new Error(`No profile scraper available for ${scraperSlug}`);
    }

    if (!siteOrNetwork) {
        logger.warn(`No site or network found for ${scraperSlug}`);
        throw new Error(`No site or network found for ${scraperSlug}`);
    }

    logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);

    const profile = await scraper.fetchProfile(actor.name, scraperSlug, siteOrNetwork, include);

    if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
        logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
        throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
    }

    return {
        ...actor,
        ...profile,
        site: siteOrNetwork,
    };
}

/**
 * Scrape an actor's profile from every configured source.
 *
 * A source is either a single scraper slug or an array of slugs; for an array,
 * the scrapers are tried in order and the first profile found is used. When all
 * scrapers of a source fail, the last failure is logged unless it only signals
 * that the profile is not available.
 *
 * @param {Object} actor - actor entry; its `name` is passed to the scrapers
 * @param {Array<string|string[]>} sources - scraper slugs, optionally grouped as fallbacks
 * @param {Object} networksBySlug - networks keyed by slug
 * @param {Object} sitesBySlug - sites keyed by slug
 * @returns {Promise<Object[]>} scraped profiles merged with the actor, one per successful source
 */
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
    const profiles = await Promise.map(sources, async (source) => {
        let lastError = new Error('No scrapers listed for source');

        // scrapers within one source are fallbacks, so they are tried one at a time
        for (const scraperSlug of [].concat(source)) {
            try {
                return await scrapeProfileFromSource(actor, scraperSlug, networksBySlug, sitesBySlug);
            } catch (error) {
                lastError = error;
            }
        }

        if (lastError.code !== 'PROFILE_NOT_AVAILABLE') {
            logger.error(`Failed to fetch profile for '${actor.name}': ${lastError.message}`);
        }

        return null;
    });

    return profiles.filter(Boolean);
}
|
|
|
|
|
|
|
|
/**
 * Store curated profile rows in `actors_profiles`.
 *
 * Rows without a stored profile for the same actor, network and site are
 * inserted. Rows that already have one are only updated when `argv.force` is
 * set; the updates run in a single transaction that is awaited, so a failed
 * update rejects this function instead of going unnoticed.
 *
 * @param {Object[]} curatedProfileEntries - rows as produced by `curateProfileEntry`
 * @returns {Promise<void>}
 */
async function upsertProfiles(curatedProfileEntries) {
    if (curatedProfileEntries.length === 0) {
        return;
    }

    const existingProfiles = await knex('actors_profiles')
        .whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id]))
        .orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id]));

    // a stored profile is identified by the combination of actor, network and site
    const toProfileKey = entry => `${entry.actor_id}:${entry.network_id}:${entry.site_id}`;
    const existingProfilesByKey = new Map(existingProfiles.map(profile => [toProfileKey(profile), profile]));

    const updatingProfileEntries = [];
    const newProfileEntries = [];

    curatedProfileEntries.forEach((profileEntry) => {
        const existingProfile = existingProfilesByKey.get(toProfileKey(profileEntry));

        if (existingProfile) {
            updatingProfileEntries.push({ ...profileEntry, id: existingProfile.id });
            return;
        }

        newProfileEntries.push(profileEntry);
    });

    if (newProfileEntries.length > 0) {
        await knex('actors_profiles').insert(newProfileEntries);
    }

    if (argv.force && updatingProfileEntries.length > 0) {
        // the handler returns a promise, leaving commit and rollback to knex
        await knex.transaction(async (transaction) => {
            const queries = updatingProfileEntries.map(profileEntry => knex('actors_profiles')
                .where('id', profileEntry.id)
                .update(profileEntry)
                .transacting(transaction));

            return Promise.all(queries);
        });
    }
}
|
|
|
|
|
2020-05-14 02:26:05 +00:00
|
|
|
/**
 * Scrape and store profiles for the given actors.
 *
 * Actors not yet stored without a network are inserted as part of a new batch.
 * Profiles are then scraped for all matching actors from the configured
 * sources, curated, and upserted.
 *
 * @param {Array<string|Object>} actorNames - actor names or actor objects
 * @returns {Promise<void>}
 */
async function scrapeActors(actorNames) {
    const baseActors = toBaseActors(actorNames);

    // `scrapers` already holds the actor scrapers keyed by slug
    const sources = argv.sources || config.profiles || Object.keys(scrapers);
    const siteSlugs = sources.flat();

    const [networks, sites, existingActorEntries] = await Promise.all([
        knex('networks').whereIn('slug', siteSlugs),
        knex('sites').whereIn('slug', siteSlugs),
        knex('actors')
            .select(['id', 'name', 'slug'])
            .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
            .whereNull('network_id'),
    ]);

    const existingActorEntriesBySlug = existingActorEntries.reduce((acc, actorEntry) => ({ ...acc, [actorEntry.slug]: actorEntry }), {});
    const networksBySlug = networks.reduce((acc, network) => ({ ...acc, [network.slug]: { ...network, isNetwork: true } }), {});
    const sitesBySlug = sites.reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});

    const newBaseActors = baseActors.filter(baseActor => !existingActorEntriesBySlug[baseActor.slug]);

    // a batch is only created when there are new actors to insert
    const [batchId] = newBaseActors.length > 0 ? await knex('batches').insert({ comment: null }).returning('id') : [null];
    const curatedActorEntries = batchId && curateActorEntries(newBaseActors, batchId);
    const newActorEntries = batchId && await knex('actors').insert(curatedActorEntries).returning(['id', 'name', 'slug']);

    const actors = existingActorEntries.concat(Array.isArray(newActorEntries) ? newActorEntries : []);

    // TODO: don't fetch existing profiles unless --force is used

    const profilesPerActor = await Promise.map(
        actors,
        async actor => scrapeProfiles(actor, sources, networksBySlug, sitesBySlug),
        { concurrency: 10 },
    );

    const curatedProfiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));

    // curateProfile returns null for profiles it failed to curate
    const curatedProfileEntries = curatedProfiles
        .filter(Boolean)
        .map(profile => curateProfileEntry(profile));

    await upsertProfiles(curatedProfileEntries);
}
|
|
|
|
|
2020-05-13 00:56:20 +00:00
|
|
|
/**
 * Fetch the stored actors matching the base actors, inserting the missing ones.
 *
 * A base actor counts as stored when an actor with the same slug exists either
 * without a network or for the base actor's own network.
 *
 * @param {Object[]} baseActors - base actors with `slug` and, optionally, `network`
 * @param {*} batchId - value stored in `batch_id` of newly inserted actors
 * @returns {Promise<Object[]>} newly inserted actors followed by the existing ones
 */
async function getOrCreateActors(baseActors, batchId) {
    // base actors created without a release have no network
    const getNetworkId = baseActor => baseActor.network?.id ?? null;

    const existingActors = await knex('actors')
        .select('id', 'alias_for', 'name', 'slug', 'network_id')
        .whereIn('slug', baseActors.map(baseActor => baseActor.slug))
        .whereNull('network_id')
        .orWhereIn(['slug', 'network_id'], baseActors.map(baseActor => [baseActor.slug, getNetworkId(baseActor)]));

    const existingActorKeys = new Set(existingActors.map(actor => `${actor.network_id}:${actor.slug}`));

    const uniqueBaseActors = baseActors.filter(baseActor => !existingActorKeys.has(`${getNetworkId(baseActor)}:${baseActor.slug}`)
        && !existingActorKeys.has(`null:${baseActor.slug}`));

    if (uniqueBaseActors.length === 0) {
        // nothing to insert
        return existingActors;
    }

    const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId);
    const newActors = await knex('actors').insert(curatedActorEntries, ['id', 'alias_for', 'name', 'slug', 'network_id']);

    if (Array.isArray(newActors)) {
        return newActors.concat(existingActors);
    }

    return existingActors;
}
|
|
|
|
|
2020-05-13 00:56:20 +00:00
|
|
|
/**
 * Link releases to their actors in `releases_actors`, creating actors as needed.
 *
 * Actors are matched by slug, preferring an actor of the release's network and
 * falling back to one without a network. An alias resolves to the actor it is
 * an alias for. Associations that already exist are left alone.
 *
 * @param {Object[]} releases - releases with an `id` and, optionally, `actors`
 * @param {*} batchId - value stored in `batch_id` of newly created actors
 * @returns {Promise<void>}
 */
async function associateActors(releases, batchId) {
    const getNetworkId = baseActor => baseActor.network?.id ?? null;

    const baseActorsByReleaseId = releases.reduce((acc, release) => {
        if (release.actors) {
            acc[release.id] = toBaseActors(release.actors, release);
        }

        return acc;
    }, {});

    const baseActors = Object.values(baseActorsByReleaseId).flat();

    if (baseActors.length === 0) {
        return;
    }

    // the same actor may appear in many releases; keep one per slug and network
    const uniqueBaseActors = Array.from(new Map(baseActors.map(baseActor => [
        `${getNetworkId(baseActor)}:${baseActor.slug}`,
        baseActor,
    ])).values());

    const actors = await getOrCreateActors(uniqueBaseActors, batchId);

    const actorIdsByNetworkIdAndSlug = new Map(actors.map(actor => [
        `${actor.network_id}:${actor.slug}`,
        actor.alias_for || actor.id,
    ]));

    const releaseActorAssociations = Object.entries(baseActorsByReleaseId)
        .map(([releaseId, releaseActors]) => releaseActors
            .map(releaseActor => ({
                release_id: releaseId,
                actor_id: actorIdsByNetworkIdAndSlug.get(`${getNetworkId(releaseActor)}:${releaseActor.slug}`)
                    || actorIdsByNetworkIdAndSlug.get(`null:${releaseActor.slug}`),
            })))
        .flat()
        .filter((association) => {
            if (!association.actor_id) {
                // a row without an actor cannot be stored; skip it rather than fail the whole insert
                logger.warn(`No actor found to associate with release ${association.release_id}`);
                return false;
            }

            return true;
        });

    if (releaseActorAssociations.length === 0) {
        return;
    }

    await knex.raw(`${knex('releases_actors').insert(releaseActorAssociations).toString()} ON CONFLICT DO NOTHING;`);
}
|
|
|
|
|
2019-11-10 03:20:22 +00:00
|
|
|
// functions exposed to the rest of the application
module.exports = { associateActors, scrapeActors };
|