Added profile interpolation.

This commit is contained in:
2020-05-17 03:00:44 +02:00
parent 05ee57378a
commit 985ab9d2dc
16 changed files with 252 additions and 35 deletions

View File

@@ -2,6 +2,7 @@
const config = require('config');
const Promise = require('bluebird');
const moment = require('moment');
// const logger = require('./logger')(__filename);
const knex = require('./knex');
@@ -10,12 +11,46 @@ const scrapers = require('./scrapers/scrapers').actors;
const argv = require('./argv');
const include = require('./utils/argv-include')(argv);
const logger = require('./logger')(__filename);
const { toBaseReleases } = require('./deep');
const { associateAvatars } = require('./media');
const slugify = require('./utils/slugify');
const capitalize = require('./utils/capitalize');
const resolvePlace = require('./utils/resolve-place');
const { associateAvatars } = require('./media');
const { toBaseReleases } = require('./deep');
/**
 * Return the value that occurs most often in `items`.
 *
 * Occurrences are counted in a Map, so values are compared by identity
 * (SameValueZero) rather than by their string representation; `1` and `'1'`
 * are counted separately. When several values share the highest count, the
 * one that reached that count first wins.
 *
 * @param {Array} items - Values to count; may be empty or nullish.
 * @returns {*} The most frequent value, or null when there are no items.
 */
function getMostFrequent(items) {
  if (!items || items.length === 0) {
    return null;
  }

  const counts = new Map();
  let mostFrequent = null;
  let highestCount = 0;

  items.forEach((item) => {
    const count = (counts.get(item) || 0) + 1;

    counts.set(item, count);

    // Compare counts explicitly instead of testing the current winner for truthiness,
    // so that falsy winners such as false, 0 or '' are not displaced by a rarer value.
    if (count > highestCount) {
      mostFrequent = item;
      highestCount = count;
    }
  });

  return mostFrequent;
}
/**
 * Build a date from the most frequent year, month and day of month found in `dates`.
 *
 * Each component is voted on independently using the Date objects' local-time getters.
 *
 * @param {Date[]} dates - Dates to combine; may be empty or nullish.
 * @returns {Date|null} The combined date, or null when there are no dates or when
 *   the voted components do not form an existing calendar date.
 */
function getMostFrequentDate(dates) {
  // Without this guard every component would be null, and moment fills missing
  // components from the current date, i.e. an empty input would yield today.
  if (!dates || dates.length === 0) {
    return null;
  }

  const year = getMostFrequent(dates.map(dateX => dateX.getFullYear()));
  const month = getMostFrequent(dates.map(dateX => dateX.getMonth()));
  const date = getMostFrequent(dates.map(dateX => dateX.getDate()));

  const mostFrequentDate = moment({ year, month, date });

  // Independently voted components may combine into a non-existent date (e.g. 31 February).
  return mostFrequentDate.isValid() ? mostFrequentDate.toDate() : null;
}
/**
 * Return the item with the greatest `length`.
 *
 * The input array is left untouched. When several items share the greatest
 * length, the first of them is returned.
 *
 * @param {Array<string|Array>} items - Items to compare; may be empty or nullish.
 * @returns {string|Array|null} The longest item, or null when there are no items
 *   or the longest item is an empty string.
 */
function getLongest(items) {
  if (!items || items.length === 0) {
    return null;
  }

  // Single pass instead of an in-place sort, which would reorder the caller's array.
  const longest = items.reduce((acc, item) => (item.length > acc.length ? item : acc), items[0]);

  return longest || null;
}
/**
 * Return the arithmetic mean of `items`, rounded to the nearest integer.
 *
 * @param {Array<number|string>} items - Numeric values; numeric strings are converted
 *   to numbers before summing. May be empty or nullish.
 * @returns {number|null} The rounded mean, or null when there are no items.
 */
function getAverage(items) {
  // Dividing by a zero length would produce NaN.
  if (!items || items.length === 0) {
    return null;
  }

  const sum = items.reduce((acc, item) => acc + Number(item), 0);

  return Math.round(sum / items.length);
}
function toBaseActors(actorsOrNames, release) {
return actorsOrNames.map((actorOrName) => {
@@ -64,10 +99,10 @@ function curateProfileEntry(profile) {
description: profile.description,
birth_city: profile.placeOfBirth?.city || null,
birth_state: profile.placeOfBirth?.state || null,
birth_country_alpha2: profile.placeOfBirth?.country?.alpha2 || null,
birth_country_alpha2: profile.placeOfBirth?.country || null,
residence_city: profile.placeOfResidence?.city || null,
residence_state: profile.placeOfResidence?.state || null,
residence_country_alpha2: profile.placeOfResidence?.country?.alpha2 || null,
residence_country_alpha2: profile.placeOfResidence?.country || null,
cup: profile.cup,
bust: profile.bust,
waist: profile.waist,
@@ -131,13 +166,15 @@ async function curateProfile(profile) {
curatedProfile.hasTattoos = typeof profile.hasTattoos === 'boolean' ? profile.hasTattoos : null;
curatedProfile.hasPiercings = typeof profile.hasPiercings === 'boolean' ? profile.hasPiercings : null;
const [placeOfBirth, placeOfResidence] = await Promise.all([
resolvePlace(profile.birthPlace),
resolvePlace(profile.residencePlace),
]);
if (argv.resolvePlace) {
const [placeOfBirth, placeOfResidence] = await Promise.all([
resolvePlace(profile.birthPlace),
resolvePlace(profile.residencePlace),
]);
curatedProfile.placeOfBirth = placeOfBirth;
curatedProfile.placeOfResidence = placeOfResidence;
curatedProfile.placeOfBirth = placeOfBirth;
curatedProfile.placeOfResidence = placeOfResidence;
}
if (!curatedProfile.placeOfBirth && curatedProfile.nationality) {
const country = await knex('countries')
@@ -164,6 +201,10 @@ async function curateProfile(profile) {
curatedProfile.releases = toBaseReleases(profile.releases);
if (argv.inspect) {
console.log(curatedProfile);
}
return curatedProfile;
} catch (error) {
logger.error(`Failed to curate '${profile.name}': ${error.message}`);
@@ -172,6 +213,91 @@ async function curateProfile(profile) {
}
}
/**
 * Collect, per column, the non-null values found across an actor's profile rows.
 *
 * @param {Object[]} actorProfiles - Rows from `actors_profiles` belonging to one actor.
 * @returns {Object<string, Array>} Column name mapped to its non-null values.
 */
function collectValuesByProperty(actorProfiles) {
  const valuesByProperty = {};

  actorProfiles.forEach((profile) => {
    Object.entries(profile).forEach(([property, value]) => {
      if (!valuesByProperty[property]) {
        valuesByProperty[property] = [];
      }

      if (value !== null && value !== undefined) {
        valuesByProperty[property].push(value);
      }
    });
  });

  return valuesByProperty;
}

/**
 * Merge all profile rows of a single actor into one set of column values for `actors`.
 *
 * @param {Object[]} actorProfiles - Rows from `actors_profiles` (joined with avatar media
 *   dimensions) belonging to one actor.
 * @returns {Object} Column values to write to the actor's row; fields without data are null.
 */
function interpolateActorProfile(actorProfiles) {
  const valuesByProperty = collectValuesByProperty(actorProfiles);

  // Fall back to an empty list so a column missing from the rows does not throw.
  const valuesOf = property => valuesByProperty[property] || [];
  const lowerCasedValuesOf = property => valuesOf(property).map(value => value.toLowerCase());

  // Guard the empty case here, so a field nobody reported stays null.
  const mostFrequentDateOf = property => (valuesOf(property).length > 0 ? getMostFrequentDate(valuesOf(property)) : null);
  const averageOf = property => (valuesOf(property).length > 0 ? getAverage(valuesOf(property)) : null);

  const avatars = actorProfiles
    .filter(profile => profile.avatar_media_id)
    .map(profile => ({
      id: profile.avatar_media_id,
      width: profile.avatar_width,
      height: profile.avatar_height,
      size: profile.avatar_size,
    }));

  // The tallest avatar wins; there may be none at all.
  const [tallestAvatar] = avatars.sort((avatarA, avatarB) => avatarB.height - avatarA.height);

  return {
    gender: getMostFrequent(valuesOf('gender')),
    ethnicity: getMostFrequent(lowerCasedValuesOf('ethnicity')),
    date_of_birth: mostFrequentDateOf('date_of_birth'),
    date_of_death: mostFrequentDateOf('date_of_death'),
    birth_city: getMostFrequent(valuesOf('birth_city')),
    birth_state: getMostFrequent(valuesOf('birth_state')),
    birth_country_alpha2: getMostFrequent(valuesOf('birth_country_alpha2')),
    residence_city: getMostFrequent(valuesOf('residence_city')),
    residence_state: getMostFrequent(valuesOf('residence_state')),
    residence_country_alpha2: getMostFrequent(valuesOf('residence_country_alpha2')),
    cup: getMostFrequent(valuesOf('cup')),
    bust: getMostFrequent(valuesOf('bust')),
    waist: getMostFrequent(valuesOf('waist')),
    hip: getMostFrequent(valuesOf('hip')),
    natural_boobs: getMostFrequent(valuesOf('natural_boobs')),
    hair: getMostFrequent(lowerCasedValuesOf('hair')),
    eyes: getMostFrequent(lowerCasedValuesOf('eyes')),
    weight: averageOf('weight'),
    height: getMostFrequent(valuesOf('height')),
    has_tattoos: getMostFrequent(valuesOf('has_tattoos')),
    has_piercings: getMostFrequent(valuesOf('has_piercings')),
    tattoos: getLongest(valuesOf('tattoos')),
    piercings: getLongest(valuesOf('piercings')),
    avatar_media_id: tallestAvatar ? tallestAvatar.id : null,
  };
}

/**
 * Derive a single profile per actor from all of the actor's stored profiles and
 * write it to the `actors` table.
 *
 * All updates run in one transaction. When any update fails the error is logged
 * and the transaction is rolled back.
 *
 * @param {Object[]} actors - Actors to interpolate; only their `id` is read.
 * @returns {Promise<void>}
 */
async function interpolateProfiles(actors) {
  if (!Array.isArray(actors) || actors.length === 0) {
    return;
  }

  const profiles = await knex('actors_profiles')
    .select(['actors_profiles.*', 'media.width as avatar_width', 'media.height as avatar_height', 'media.size as avatar_size'])
    .whereIn('actor_id', actors.map(actor => actor.id))
    .leftJoin('media', 'actors_profiles.avatar_media_id', 'media.id');

  const profilesByActorId = new Map();

  profiles.forEach((profile) => {
    if (!profilesByActorId.has(profile.actor_id)) {
      profilesByActorId.set(profile.actor_id, []);
    }

    profilesByActorId.get(profile.actor_id).push(profile);
  });

  // Interpolate before opening the transaction, so a failure here leaves none dangling.
  const interpolatedProfiles = Array.from(profilesByActorId, ([actorId, actorProfiles]) => ({
    actorId,
    values: interpolateActorProfile(actorProfiles),
  }));

  if (interpolatedProfiles.length === 0) {
    return;
  }

  const transaction = await knex.transaction();

  try {
    await Promise.all(interpolatedProfiles.map(({ actorId, values }) => knex('actors')
      .where('id', actorId)
      .update(values)
      .transacting(transaction)));

    await transaction.commit();
  } catch (error) {
    logger.error(`Failed to interpolate profiles: ${error.message}`);

    await transaction.rollback(error);
  }
}
async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
const profiles = Promise.map(sources, async (source) => {
try {
@@ -217,7 +343,9 @@ async function scrapeProfiles(actor, sources, networksBySlug, sitesBySlug) {
return profiles.filter(Boolean);
}
async function upsertProfiles(curatedProfileEntries) {
async function upsertProfiles(profiles) {
const curatedProfileEntries = profiles.map(profile => curateProfileEntry(profile));
const existingProfiles = await knex('actors_profiles')
.whereIn(['actor_id', 'network_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.network_id]))
.orWhereIn(['actor_id', 'site_id'], curatedProfileEntries.map(entry => [entry.actor_id, entry.site_id]));
@@ -311,9 +439,8 @@ async function scrapeActors(actorNames) {
const profiles = await Promise.all(profilesPerActor.flat().map(profile => curateProfile(profile)));
const profilesWithAvatarIds = await associateAvatars(profiles);
const curatedProfileEntries = profilesWithAvatarIds.map(profile => curateProfileEntry(profile));
await upsertProfiles(curatedProfileEntries);
await upsertProfiles(profilesWithAvatarIds);
await interpolateProfiles(actors);
}
async function getOrCreateActors(baseActors, batchId) {

View File

@@ -177,6 +177,11 @@ const { argv } = yargs
type: 'string',
default: process.env.NODE_ENV === 'development' ? 'silly' : 'info',
})
.option('resolve-place', {
describe: 'Call OSM Nominatim API for actor place of birth and residence. Raw value discarded if disabled.',
type: 'boolean',
default: true,
})
.option('debug', {
describe: 'Show error stack traces',
type: 'boolean',

View File

@@ -52,6 +52,10 @@ async function findSites(baseReleases) {
}
function toBaseReleases(baseReleasesOrUrls) {
if (!baseReleasesOrUrls) {
return [];
}
return baseReleasesOrUrls
.map((baseReleaseOrUrl) => {
if (baseReleaseOrUrl.url) {

View File

@@ -141,7 +141,7 @@ async function fetchActorReleases({ qu, html }, accReleases = []) {
return accReleases.concat(releases);
}
async function scrapeProfile(html, url, actorName) {
async function scrapeProfile(html, url, actorName, include) {
const qProfile = ex(html);
const { q, qa } = qProfile;
@@ -175,7 +175,9 @@ async function scrapeProfile(html, url, actorName) {
const avatarEl = q('.big-pic-model-container img');
if (avatarEl) profile.avatar = `https:${avatarEl.src}`;
profile.releases = await fetchActorReleases(qProfile);
if (include.releases) {
profile.releases = await fetchActorReleases(qProfile);
}
return profile;
}
@@ -198,7 +200,7 @@ async function fetchScene(url, site) {
return scrapeScene(res.body.toString(), url, site);
}
async function fetchProfile(actorName) {
async function fetchProfile(actorName, scraperSlug, siteOrNetwork, include) {
const searchUrl = 'https://brazzers.com/pornstars-search/';
const searchRes = await bhttp.get(searchUrl, {
headers: {
@@ -212,7 +214,7 @@ async function fetchProfile(actorName) {
const url = `https://brazzers.com${actorLink}`;
const res = await bhttp.get(url);
return scrapeProfile(res.body.toString(), url, actorName);
return scrapeProfile(res.body.toString(), url, actorName, include);
}
return null;

View File

@@ -368,7 +368,7 @@ function scrapeApiProfile(data, releases, siteSlug) {
const avatarPaths = Object.values(data.pictures).reverse();
if (avatarPaths.length > 0) profile.avatar = avatarPaths.map(avatarPath => `https://images01-evilangel.gammacdn.com/actors${avatarPath}`);
profile.releases = releases.map(release => `https://${siteSlug}.com/en/video/${release.url_title}/${release.clip_id}`);
if (releases) profile.releases = releases.map(release => `https://${siteSlug}.com/en/video/${release.url_title}/${release.clip_id}`);
return profile;
}
@@ -579,7 +579,7 @@ async function fetchProfile(actorName, siteSlug, altSearchUrl, getActorReleasesU
return null;
}
async function fetchApiProfile(actorName, siteSlug) {
async function fetchApiProfile(actorName, siteSlug, site, include) {
const actorSlug = encodeURI(actorName);
const referer = `https://www.${siteSlug}.com/en/search`;
@@ -603,7 +603,7 @@ async function fetchApiProfile(actorName, siteSlug) {
const actorData = res.body.results[0].hits.find(actor => slugify(actor.name) === slugify(actorName));
if (actorData) {
const actorScenes = await fetchActorScenes(actorData.name, apiUrl, siteSlug);
const actorScenes = include.releases && await fetchActorScenes(actorData.name, apiUrl, siteSlug);
return scrapeApiProfile(actorData, actorScenes, siteSlug);
}

View File

@@ -12,7 +12,7 @@ const schemaExtender = makeExtendSchemaPlugin(_build => ({
}
extend type Actor {
age: Int @requires(columns: ["date_of_birth"])
age: Int @requires(columns: ["dateOfBirth"])
height(units:Units): String @requires(columns: ["height"])
weight(units:Units): String @requires(columns: ["weight"])
}
@@ -20,9 +20,9 @@ const schemaExtender = makeExtendSchemaPlugin(_build => ({
resolvers: {
Actor: {
age(parent, _args, _context, _info) {
if (!parent.birthdate) return null;
if (!parent.dateOfBirth) return null;
return moment().diff(parent.birthdate, 'years');
return moment().diff(parent.dateOfBirth, 'years');
},
height(parent, args, _context, _info) {
if (!parent.height) return null;