Resolving actor birth and residence place before storage. Layout improvements.

This commit is contained in:
2019-11-29 05:46:06 +01:00
parent 4be508b388
commit 0dbe853f39
17 changed files with 377 additions and 308 deletions

View File

@@ -157,10 +157,8 @@ function scrapeProfile(html, url, actorName) {
if (bio.Weight) profile.weight = lbsToKg(bio.Weight.match(/\d+/)[0]);
if (bio['Hair Color']) profile.hair = hairMap[bio['Hair Color']] || bio['Hair Color'].toLowerCase();
if (bio['Body Art']) {
profile.hasTattoo = !!bio['Body Art'].match('Tattoo');
profile.hasPiercing = !!bio['Body Art'].match('Piercing');
}
if (bio['Body Art'] && bio['Body Art'].match('Tattoo')) profile.hasTattoos = true;
if (bio['Body Art'] && bio['Body Art'].match('Piercing')) profile.hasPiercings = true;
if (descriptionEl) profile.description = descriptionEl.textContent.trim();
if (avatarEl) profile.avatar = `https:${avatarEl.src}`;

View File

@@ -5,8 +5,6 @@ const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');
async function scrapeProfileFrontpage(html, url, name) {
const { document } = new JSDOM(html).window;
const bioEl = document.querySelector('.dashboard-bio-list');
@@ -18,55 +16,47 @@ async function scrapeProfileFrontpage(html, url, name) {
const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
const profile = {
name,
gender: 'female',
};
const birthdateString = bio['Date of Birth:'];
const birthdate = birthdateString && birthdateString !== 'Unknown (Add)'
? moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate()
: null;
const measurementsString = bio['Measurements:'];
const [bust, waist, hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
const naturalBoobs = bio['Fake Boobs:'] === 'No';
const residenceCountryName = bio['Country of Origin:'];
const countryEntry = await knex('countries').where({ name: residenceCountryName }).first();
const residenceCountry = countryEntry ? countryEntry.alpha2 : null;
const birthPlace = bio['Place of Birth:'];
const birthCityString = bio['Place of Birth:'];
const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString;
const hair = bio['Hair Color:'].toLowerCase();
const eyes = bio['Eye Color:'].toLowerCase();
const birthCountryString = bio['Country of Origin:'];
const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString;
const piercingsString = bio['Piercings:'];
const hasPiercings = !!(piercingsString !== undefined && piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
const piercings = hasPiercings && piercingsString;
const tattoosString = bio['Tattoos:'];
const hasTattoos = !!(tattoosString !== undefined && tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
const tattoos = hasTattoos && tattoosString;
const social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href);
if (birthdateString && birthdateString !== 'Unknown (add)') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
if (bio['Fake Boobs:']) profile.naturalBoobs = bio['Fake Boobs:'] === 'No';
profile.birthPlace = `${birthCity || ''}${birthCity ? ', ' : ''}${birthCountry || ''}`;
profile.hair = bio['Hair Color:'].toLowerCase();
profile.eyes = bio['Eye Color:'].toLowerCase();
if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString;
if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString;
profile.social = Array.from(bioEl.querySelectorAll('.dashboard-socialmedia a'), el => el.href);
return {
bio: {
name,
gender: 'female',
birthdate,
residenceCountry,
birthPlace,
naturalBoobs,
bust,
waist,
hip,
hair,
eyes,
piercings,
tattoos,
social,
},
profile,
url: bioUrl,
};
}
async function scrapeProfileBio(html, frontpageBio, url, name) {
async function scrapeProfileBio(html, frontpageProfile, url, name) {
const { document } = new JSDOM(html).window;
const bioEl = document.querySelector('#biographyTable');
@@ -75,58 +65,46 @@ async function scrapeProfileBio(html, frontpageBio, url, name) {
const bio = keys.reduce((acc, key, index) => ({ ...acc, [key]: values[index] }), {});
const birthdateString = bio['Date of Birth:'];
const birthdate = birthdateString && birthdateString !== 'Unknown'
? moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate()
: null;
const measurementsString = bio['Measurements:'];
const [bust, waist, hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
const boobsNatural = bio['Fake boobs:'] === 'No';
const ethnicity = bio['Ethnicity:'];
const residenceCountryName = bio['Country of Origin:'];
const countryEntry = await knex('countries').where({ name: residenceCountryName }).first();
const residenceCountry = countryEntry ? countryEntry.alpha2 : null;
const birthPlace = bio['Place of Birth:'];
const hair = bio['Hair Color:'].toLowerCase();
const eyes = bio['Eye Color:'].toLowerCase();
const height = Number(bio['Height:'].match(/\d+/)[0]);
const weight = Number(bio['Weight:'].match(/\d+/)[0]);
const piercingsString = bio['Piercings:'];
const hasPiercings = !!(piercingsString !== undefined && piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
const piercings = hasPiercings && piercingsString;
const tattoosString = bio['Tattoos:'];
const hasTattoos = !!(tattoosString !== undefined && tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
const tattoos = hasTattoos && tattoosString;
const social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href);
return {
...frontpageBio,
const profile = {
...frontpageProfile,
name,
gender: 'female',
birthdate,
residenceCountry,
birthPlace,
ethnicity,
naturalBoobs: boobsNatural,
bust,
waist,
hip,
height,
weight,
hair,
eyes,
hasPiercings,
hasTattoos,
piercings,
tattoos,
social,
};
const birthdateString = bio['Date of Birth:'];
const measurementsString = bio['Measurements:'];
const birthCityString = bio['Place of Birth:'];
const birthCity = birthCityString !== undefined && birthCityString !== 'Unknown' && birthCityString !== 'Unknown (add)' && birthCityString;
const birthCountryString = bio['Country of Origin:'];
const birthCountry = birthCountryString !== undefined && birthCountryString !== 'Unknown' && birthCountryString !== 'Unknown (add)' && birthCountryString;
const piercingsString = bio['Piercings:'];
const tattoosString = bio['Tattoos:'];
if (birthdateString && birthdateString !== 'Unknown') profile.birthdate = moment.utc(birthdateString.slice(0, birthdateString.indexOf(' (')), 'MMMM D, YYYY').toDate();
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString.split('-').map(measurement => (measurement === '??' ? null : measurement));
if (bio['Fake boobs']) profile.naturalBoobs = bio['Fake boobs:'] === 'No';
profile.ethnicity = bio['Ethnicity:'];
profile.birthPlace = `${birthCity || ''}${birthCity ? ', ' : ''}${birthCountry || ''}`;
profile.hair = bio['Hair Color:'].toLowerCase();
profile.eyes = bio['Eye Color:'].toLowerCase();
profile.height = Number(bio['Height:'].match(/\d+/)[0]);
profile.weight = Number(bio['Weight:'].match(/\d+/)[0]);
if (piercingsString) profile.hasPiercings = !!(piercingsString !== 'Unknown (add)' && piercingsString !== 'None');
if (tattoosString) profile.hasTattoos = !!(tattoosString !== 'Unknown (add)' && tattoosString !== 'None');
if (profile.hasPiercings && piercingsString !== 'various') profile.piercings = piercingsString;
if (profile.hasTattoos && tattoosString !== 'various') profile.tattoos = tattoosString;
profile.social = Array.from(bioEl.querySelectorAll('#socialmedia a'), el => el.href);
return profile;
}
async function fetchProfile(actorName) {
@@ -148,10 +126,10 @@ async function fetchProfile(actorName) {
const resFallback = await bhttp.get(fallbackUrl);
if (resFallback.statusCode === 200) {
const { url, bio } = await scrapeProfileFrontpage(resFallback.body.toString(), fallbackUrl, actorName);
const { url, profile } = await scrapeProfileFrontpage(resFallback.body.toString(), fallbackUrl, actorName);
const resBio = await bhttp.get(url);
return scrapeProfileBio(resBio.body.toString(), bio, url, actorName);
return scrapeProfileBio(resBio.body.toString(), profile, url, actorName);
}
return null;

View File

@@ -224,13 +224,15 @@ function scrapeProfile(html, url, actorName) {
if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString[0].split('-');
if (avatarEl) {
const src = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src') + 5, avatarEl.innerHTML.indexOf('set.jpg') + 7);
const src0 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0') + 6, avatarEl.innerHTML.indexOf('set.jpg') + 7);
const src1 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_1x') + 9, avatarEl.innerHTML.indexOf('1x.jpg') + 6);
const src2 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_2x') + 9, avatarEl.innerHTML.indexOf('2x.jpg') + 6);
const src3 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_3x') + 9, avatarEl.innerHTML.indexOf('3x.jpg') + 6);
const src = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src') + 5, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim();
const src0 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0') + 6, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim();
const src1 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_1x') + 9, avatarEl.innerHTML.indexOf('1x.jpg') + 6).trim();
const src2 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_2x') + 9, avatarEl.innerHTML.indexOf('2x.jpg') + 6).trim();
const src3 = avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_3x') + 9, avatarEl.innerHTML.indexOf('3x.jpg') + 6).trim();
profile.avatar = src3 || src2 || src1 || src0 || src;
const avatar = src3 || src2 || src1 || src0 || src;
if (avatar) profile.avatar = avatar;
}
profile.releases = Array.from(document.querySelectorAll('.category_listing_block .update_details > a:first-child'), el => el.href);

View File

@@ -70,34 +70,6 @@ function scrapeLatest(html, site) {
});
}
async function scrapeProfile(html, _url, actorName) {
const { document } = new JSDOM(html).window;
const profile = {
name: actorName,
};
const avatarEl = document.querySelector('.model--avatar img[src^="http"]');
const entries = Array.from(document.querySelectorAll('.model--description tr'), el => el.textContent.replace(/\n/g, '').split(':'));
const bio = entries
.filter(entry => entry.length === 2) // ignore entries without ':' (About section, see Blanche Bradburry)
.reduce((acc, [key, value]) => ({ ...acc, [key.trim()]: value.trim() }), {});
const birthCountryName = bio.Nationality;
if (birthCountryName) {
const countryEntry = await knex('countries').where({ name: birthCountryName }).first();
if (countryEntry) profile.birthCountry = countryEntry.alpha2;
}
if (bio.Age) profile.age = bio.Age;
if (avatarEl) profile.avatar = avatarEl.src;
return profile;
}
async function scrapeScene(html, url, site, useGallery) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const playerObject = $('script:contains("new VideoPlayer")').html();
@@ -158,6 +130,28 @@ async function scrapeScene(html, url, site, useGallery) {
};
}
async function scrapeProfile(html, _url, actorName) {
const { document } = new JSDOM(html).window;
const profile = {
name: actorName,
};
const avatarEl = document.querySelector('.model--avatar img[src^="http"]');
const entries = Array.from(document.querySelectorAll('.model--description tr'), el => el.textContent.replace(/\n/g, '').split(':'));
const bio = entries
.filter(entry => entry.length === 2) // ignore entries without ':' (About section, see Blanche Bradburry)
.reduce((acc, [key, value]) => ({ ...acc, [key.trim()]: value.trim() }), {});
profile.birthPlace = bio.Nationality;
if (bio.Age) profile.age = bio.Age;
if (avatarEl) profile.avatar = avatarEl.src;
return profile;
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.get(`${site.url}/new-videos/${page}`);

View File

@@ -4,8 +4,6 @@ const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');
const ethnicityMap = {
White: 'Caucasian',
};
@@ -14,10 +12,6 @@ const hairMap = {
Brunette: 'brown',
};
const countryMap = {
'United States of America': 'United States',
};
async function scrapeProfile(html, _url, actorName) {
const { document } = new JSDOM(html).window;
@@ -28,9 +22,7 @@ async function scrapeProfile(html, _url, actorName) {
name: actorName,
};
const descriptionString = document.querySelector('div[itemprop="description"]');
const birthPlaceString = bio['Birth Place'] || bio.Birthplace;
const residencePlaceString = bio['City and Country'];
const descriptionString = document.querySelector('div[itemprop="description"]') || document.querySelector('.longBio');
const avatarEl = document.querySelector('#getAvatar') || document.querySelector('.thumbImage img');
if (bio.Gender) profile.gender = bio.Gender.toLowerCase();
@@ -38,35 +30,20 @@ async function scrapeProfile(html, _url, actorName) {
if (descriptionString) profile.description = descriptionString.textContent;
if (bio.Birthday) bio.birthdate = moment.utc(bio.Birthday, 'MMM D, YYYY').toDate();
if (bio.Born) bio.birthdate = moment.utc(bio.Born, 'YYYY-MM-DD').toDate();
if (bio.Birthday) profile.birthdate = moment.utc(bio.Birthday, 'MMM D, YYYY').toDate();
if (bio.Born) profile.birthdate = moment.utc(bio.Born, 'YYYY-MM-DD').toDate();
if (birthPlaceString) {
const birthPlaceSegments = birthPlaceString.split(',');
const birthCountryName = birthPlaceSegments.slice(-1)[0].trim();
const birthCountryEntry = await knex('countries').where('name', countryMap[birthCountryName] || birthCountryName).first();
profile.birthPlace = bio['Birth Place'] || bio.Birthplace;
profile.residencePlace = bio['City and Country'];
profile.birthPlace = birthPlaceSegments.slice(0, -1).join(',').trim();
profile.birthCountry = birthCountryEntry ? birthCountryEntry.alpha2 : null;
}
if (residencePlaceString) {
const residencePlaceSegments = residencePlaceString.split(',');
const residenceCountryAlpha2 = residencePlaceSegments.slice(-1)[0].trim();
const residenceCountryEntry = await knex('countries').where('alpha2', residenceCountryAlpha2).first();
profile.residencePlace = residencePlaceSegments.slice(0, -1).join(',').trim();
profile.residenceCountry = residenceCountryEntry ? residenceCountryEntry.alpha2 : null;
}
if (bio.Measurements && bio.Measurements !== '--') [profile.bust, profile.waist, profile.hip] = bio.Measurements.split('-').map(measurement => parseInt(measurement, 10) || null);
if (bio.Measurements && bio.Measurements !== '--') [profile.bust, profile.waist, profile.hip] = bio.Measurements.split('-');
if (bio['Fake Boobs']) profile.naturalBoobs = bio['Fake Boobs'] === 'No';
if (bio.Height) profile.height = Number(bio.Height.match(/\(\d+/)[0].slice(1));
if (bio.Weight) profile.weight = Number(bio.Weight.match(/\(\d+/)[0].slice(1));
if (bio['Hair Color']) profile.hair = hairMap[bio['Hair Color']] || bio['Hair Color'].toLowerCase();
if (bio.Piercings) profile.hasPiercings = bio.Piercings === 'Yes';
if (bio.Tattoos) profile.hasTattoos = bio.hasTattoos === 'Yes';
if (bio.Tattoos) profile.hasTattoos = bio.Tattoos === 'Yes';
if (avatarEl) profile.avatar = avatarEl.src;
profile.social = Array.from(document.querySelectorAll('.socialList a'), el => el.href).filter(link => link !== 'https://www.twitter.com/'); // PH links to Twitter itself for some reason