Fixed profile location interpolation. Generalized ethnicity, hair color and eye color.

This commit is contained in:
2020-05-19 01:10:32 +02:00
parent 4826ae8571
commit 0c4628677f
16 changed files with 1976 additions and 1862 deletions

View File

@@ -20,11 +20,56 @@ const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
const resolvePlace = require('./utils/resolve-place');
// Lookup table from a scraper-reported hair description to a canonical hair color.
// Keys are matched against the trimmed, lowercased profile value; a description
// without an entry here resolves to no hair color at the lookup site.
const hairColors = {
  'jet-black': 'black',
  'red-head': 'red',
  'soft-black': 'black',
  'black': 'black',
  'blonde': 'blonde',
  'blondie': 'blonde',
  'brown': 'brown',
  'brunette': 'brown',
  'fair': 'blonde',
  'raven': 'black',
  'red': 'red',
  'redhead': 'red',
};
// Lookup table from a scraper-reported eye description to a canonical eye color.
// Keys are matched against the trimmed, lowercased profile value; both spellings
// of gray/grey collapse to 'gray'.
const eyeColors = {
  'blue': 'blue',
  'brown': 'brown',
  'dark': 'brown',
  'gray': 'gray',
  'green': 'green',
  'grey': 'gray',
  'hazel': 'hazel',
};
// Lookup table from a scraper-reported ethnicity to a canonical ethnicity value.
// Keys are matched against the trimmed, lowercased profile value; a value without
// an entry here resolves to no ethnicity at the lookup site.
const ethnicities = {
  'african american': 'black',
  'african-american': 'black',
  'native american': 'native american',
  african: 'black',
  arabic: 'arabic', // correct spelling was missing, so "Arabic" went unrecognized
  aravic: 'arabic', // misspelled key kept so existing inputs keep resolving
  asian: 'asian',
  black: 'black',
  caucasian: 'white',
  european: 'white',
  hispanic: 'latina',
  indian: 'indian',
  japanese: 'japanese',
  latina: 'latina',
  latino: 'latino',
  white: 'white',
};
function getMostFrequent(items) {
const { mostFrequent } = items.reduce((acc, item) => {
acc.counts[item] = (acc.counts[item] || 0) + 1;
const slug = slugify(item);
if (!acc.mostFrequent || acc.counts[item] > acc.counts[acc.mostFrequent]) {
acc.counts[slug] = (acc.counts[slug] || 0) + 1;
if (!acc.mostFrequent || acc.counts[slug] > acc.counts[slugify(acc.mostFrequent)]) {
acc.mostFrequent = item;
}
@@ -144,9 +189,11 @@ async function curateProfile(profile) {
curatedProfile.description = profile.description?.trim() || null;
curatedProfile.nationality = profile.nationality?.trim() || null; // used to derive country when country not available
curatedProfile.ethnicity = profile.ethnicity?.trim() || null;
curatedProfile.hair = profile.hair?.trim() || null;
curatedProfile.eyes = profile.eyes?.trim() || null;
curatedProfile.ethnicity = ethnicities[profile.ethnicity?.trim().toLowerCase()] || null;
curatedProfile.hair = hairColors[profile.hair?.trim().toLowerCase()] || null;
curatedProfile.eyes = eyeColors[profile.eyes?.trim().toLowerCase()] || null;
curatedProfile.tattoos = profile.tattoos?.trim() || null;
curatedProfile.piercings = profile.piercings?.trim() || null;
@@ -211,6 +258,10 @@ async function curateProfile(profile) {
curatedProfile.releases = toBaseReleases(profile.releases);
if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.ethnicity}`);
if (profile.hair && !curatedProfile.hair) logger.warn(`Unrecognized hair color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.hair}`);
if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.site?.name || profile.network?.slug}' scraper: ${profile.eyes}`);
return curatedProfile;
} catch (error) {
logger.error(`Failed to curate '${profile.name}': ${error.message}`);
@@ -234,15 +285,28 @@ async function interpolateProfiles(actors) {
}), {});
const interpolatedProfiles = Object.entries(profilesByActorId).map(([actorId, actorProfiles]) => {
// group values from each profile
const valuesByProperty = actorProfiles.reduce((acc, profile) => Object
.entries(profile)
.reduce((profileAcc, [property, value]) => ({
...profileAcc,
[property]: [
...(acc[property] || []),
...(value === null ? [] : [value]),
...(value === null ? [] : Array.from({ length: profile.priority }, () => value)), // multiply by priority, increasing the odds of being the most frequent value
],
}), {}), {});
}), {
// bundle location values so they can be assessed together, to ensure the most frequent city is in the most frequent state is in most frequent country
origin: [...acc.origin || [], {
...(profile.birth_country_alpha2 && { country: profile.birth_country_alpha2 }),
...(profile.birth_state && { state: profile.birth_state }),
...(profile.birth_city && { city: profile.birth_city }),
}].filter(location => Object.keys(location).length > 0),
residence: [...acc.residence || [], {
...(profile.residence_country_alpha2 && { country: profile.residence_country_alpha2 }),
...(profile.residence_state && { state: profile.residence_state }),
...(profile.residence_city && { city: profile.residence_city }),
}].filter(location => Object.keys(location).length > 0),
}), {});
const avatars = actorProfiles.map(profile => profile.avatar_media_id && ({
id: profile.avatar_media_id,
@@ -251,39 +315,50 @@ async function interpolateProfiles(actors) {
size: profile.avatar_size,
})).filter(Boolean);
const mostFrequentValues = [
'gender',
'ethnicity',
'cup',
'bust',
'waist',
'hip',
'natural_boobs',
'height',
'hair',
'eyes',
'has_tattoos',
'has_piercings',
].reduce((acc, property) => ({
...acc,
[property]: getMostFrequent(valuesByProperty[property]),
}), {});
const profile = {
id: actorId,
...mostFrequentValues,
};
profile.gender = getMostFrequent(valuesByProperty.gender);
profile.ethnicity = getMostFrequent(valuesByProperty.ethnicity.map(ethnicity => ethnicity.toLowerCase()));
profile.date_of_birth = getMostFrequentDate(valuesByProperty.date_of_birth);
profile.date_of_death = getMostFrequentDate(valuesByProperty.date_of_death);
// TODO: fix city, state and country not matching
profile.birth_city = getMostFrequent(valuesByProperty.birth_city);
profile.birth_state = getMostFrequent(valuesByProperty.birth_state);
profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.birth_country_alpha2);
// ensure most frequent country, city and state match up
profile.birth_country_alpha2 = getMostFrequent(valuesByProperty.origin.map(location => location.country));
const remainingOriginCountries = valuesByProperty.origin.filter(location => location.country === profile.birth_country_alpha2);
profile.residence_city = getMostFrequent(valuesByProperty.residence_city);
profile.residence_state = getMostFrequent(valuesByProperty.residence_state);
profile.residence_country_alpha2 = getMostFrequent(valuesByProperty.residence_country_alpha2);
profile.birth_state = getMostFrequent(remainingOriginCountries.map(location => location.state));
const remainingOriginStates = remainingOriginCountries.filter(location => !profile.birth_state || location.state === profile.birth_state);
profile.cup = getMostFrequent(valuesByProperty.cup);
profile.bust = getMostFrequent(valuesByProperty.bust);
profile.waist = getMostFrequent(valuesByProperty.waist);
profile.hip = getMostFrequent(valuesByProperty.hip);
profile.natural_boobs = getMostFrequent(valuesByProperty.natural_boobs);
profile.birth_city = getMostFrequent(remainingOriginStates.map(location => location.city));
profile.hair = getMostFrequent(valuesByProperty.hair.map(hair => hair.toLowerCase()));
profile.eyes = getMostFrequent(valuesByProperty.eyes.map(eyes => eyes.toLowerCase()));
profile.residence_country_alpha2 = getMostFrequent(valuesByProperty.residence.map(location => location.country));
const remainingResidenceCountries = valuesByProperty.residence.filter(location => location.country === profile.residence_country_alpha2);
profile.residence_state = getMostFrequent(remainingResidenceCountries.map(location => location.state));
const remainingResidenceStates = remainingResidenceCountries.filter(location => !profile.residence_state || location.state === profile.residence_state);
profile.residence_city = getMostFrequent(remainingResidenceStates.map(location => location.city));
profile.weight = getAverage(valuesByProperty.weight);
profile.height = getMostFrequent(valuesByProperty.height);
profile.has_tattoos = getMostFrequent(valuesByProperty.has_tattoos);
profile.has_piercings = getMostFrequent(valuesByProperty.has_piercings);
profile.tattoos = getLongest(valuesByProperty.tattoos);
profile.piercings = getLongest(valuesByProperty.piercings);
@@ -366,38 +441,47 @@ async function upsertProfiles(profiles) {
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
const profiles = Promise.map(sources, async (source) => {
try {
// config may group sources to try until success
return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => {
const scraper = scrapers[scraperSlug];
const context = {
site: sitesBySlug[scraperSlug] || null,
network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
scraper: scraperSlug,
};
try {
const scraper = scrapers[scraperSlug];
const context = {
site: sitesBySlug[scraperSlug] || null,
network: networksBySlug[scraperSlug] || sitesBySlug[scraperSlug]?.network || null,
scraper: scraperSlug,
};
if (!scraper?.fetchProfile) {
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
if (!scraper?.fetchProfile) {
logger.warn(`No profile profile scraper available for ${scraperSlug}`);
throw new Error(`No profile profile scraper available for ${scraperSlug}`);
}
if (!context.site && !context.network) {
logger.warn(`No site or network found for ${scraperSlug}`);
throw new Error(`No site or network found for ${scraperSlug}`);
}
logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
const profile = await scraper.fetchProfile(actor.name, context, include);
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
logger.verbose(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}, scraper returned ${profile}`);
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${context.site?.name || context.network?.name || context.scraper}`), { code: 'PROFILE_NOT_AVAILABLE' });
}
return {
...actor,
...profile,
...context,
};
} catch (error) {
if (error.code !== 'PROFILE_NOT_AVAILABLE') {
logger.error(`Failed to fetch profile for '${actor.name}' from '${scraperSlug}': ${error.message}`);
}
throw error;
}
if (!context.site && !context.network) {
logger.warn(`No site or network found for ${scraperSlug}`);
throw new Error(`No site or network found for ${scraperSlug}`);
}
logger.verbose(`Searching profile for '${actor.name}' on '${scraperSlug}'`);
const profile = await scraper.fetchProfile(actor.name, context, include);
if (!profile || typeof profile === 'number') { // scraper returns HTTP code on request failure
logger.verbose(`Profile for '${actor.name}' not available on ${scraperSlug}, scraper returned ${profile}`);
throw Object.assign(new Error(`Profile for '${actor.name}' not available on ${scraperSlug}`), { code: 'PROFILE_NOT_AVAILABLE' });
}
return {
...actor,
...profile,
...context,
};
}), Promise.reject(new Error()));
} catch (error) {
if (error.code !== 'PROFILE_NOT_AVAILABLE') {

View File

@@ -119,7 +119,6 @@ async function getPhotos(entryId, site, type = 'highres', page = 1) {
}
function getEntryId(html) {
// TODO: not working for https://www.julesjordan.com/members/scenes/jada-stevens-anal-ass-gets-oiled-up-for-james-deens-cock_vids.html
const entryId = html.match(/showtagform\((\d+)\)/);
if (entryId) {

View File

@@ -9,7 +9,7 @@ const slugify = require('../utils/slugify');
function extractTitle(originalTitle) {
const titleComponents = originalTitle.split(' ');
const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS|KS|OT)\d+/); // detect studio prefixes
const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS|KS|OTS)\d+/); // detect studio prefixes
const shootId = sceneIdMatch ? sceneIdMatch[0] : null;
const title = sceneIdMatch ? titleComponents.slice(0, -1).join(' ') : originalTitle;

View File

@@ -4,7 +4,7 @@ function slugify(string, delimiter = '-', {
encode = false,
limit = 1000,
} = {}) {
if (!string) {
if (!string || typeof string !== 'string') {
return string;
}