Updated Jules Jordan profile scraper.
This commit is contained in:
parent
9331c0af52
commit
51e04e7331
|
@ -416,7 +416,7 @@ async function curateProfile(profile, actor) {
|
||||||
curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null;
|
curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null;
|
||||||
|
|
||||||
// combined measurement value
|
// combined measurement value
|
||||||
const measurements = profile.measurements?.match(/(\d+)(\w+)[-x](\d+)[-x](\d+)/); // ExCoGi uses x
|
const measurements = profile.measurements?.match(/(\d+)(\w+)\s*[-x]\s*(\d+)\s*[-x]\s*(\d+)/); // ExCoGi uses x, Jules Jordan has spaces between the dashes
|
||||||
|
|
||||||
if (measurements) {
|
if (measurements) {
|
||||||
curatedProfile.bust = Number(measurements[1]);
|
curatedProfile.bust = Number(measurements[1]);
|
||||||
|
@ -610,6 +610,14 @@ async function interpolateProfiles(actorIdsOrNames) {
|
||||||
.filter((avatar) => avatar && (avatar.entropy === null || avatar.entropy > 5.5))
|
.filter((avatar) => avatar && (avatar.entropy === null || avatar.entropy > 5.5))
|
||||||
.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;
|
.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;
|
||||||
|
|
||||||
|
if (!profile.avatar_media_id) {
|
||||||
|
// try to settle for low quality avatar
|
||||||
|
profile.avatar_media_id = actorProfiles
|
||||||
|
.map((actorProfile) => actorProfile.avatar)
|
||||||
|
.filter((avatar) => avatar)
|
||||||
|
.sort((avatarA, avatarB) => avatarB.height - avatarA.height)[0]?.id || null;
|
||||||
|
}
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
@ -353,8 +353,6 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
|
||||||
if (typeof baseSource.defer === 'function') {
|
if (typeof baseSource.defer === 'function') {
|
||||||
const src = await baseSource.defer();
|
const src = await baseSource.defer();
|
||||||
|
|
||||||
console.log(baseSource, src);
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
...baseSource,
|
...baseSource,
|
||||||
...toBaseSource(src),
|
...toBaseSource(src),
|
||||||
|
|
|
@ -3,7 +3,6 @@
|
||||||
const util = require('util');
|
const util = require('util');
|
||||||
const Promise = require('bluebird');
|
const Promise = require('bluebird');
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const { JSDOM } = require('jsdom');
|
|
||||||
const moment = require('moment');
|
const moment = require('moment');
|
||||||
const unprint = require('unprint');
|
const unprint = require('unprint');
|
||||||
|
|
||||||
|
@ -272,9 +271,8 @@ function scrapeMovie({ el, query }, url, site) {
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
function scrapeProfile(html, url, actorName, entity) {
|
function scrapeProfile(html, url, actorName, entity) {
|
||||||
const { document } = new JSDOM(html).window;
|
|
||||||
|
|
||||||
const bio = document.querySelector('.model_bio').textContent;
|
const bio = document.querySelector('.model_bio').textContent;
|
||||||
const avatarEl = document.querySelector('.model_bio_pic img, .model_bio_thumb');
|
const avatarEl = document.querySelector('.model_bio_pic img, .model_bio_thumb');
|
||||||
|
|
||||||
|
@ -320,6 +318,36 @@ function scrapeProfile(html, url, actorName, entity) {
|
||||||
|
|
||||||
return profile;
|
return profile;
|
||||||
}
|
}
|
||||||
|
*/
|
||||||
|
|
||||||
|
function scrapeProfile({ query }, url, name, entity) {
|
||||||
|
const profile = { url };
|
||||||
|
|
||||||
|
profile.description = query.content('//comment()[contains(., " Bio Extra Field ")]/following-sibling::span'); // the spaces are important to avoid selecting a similar comment
|
||||||
|
|
||||||
|
profile.height = heightToCm(query.content('//span[contains(text(), "Height")]/following-sibling::span'));
|
||||||
|
profile.measurements = query.content('//span[contains(text(), "Measurements")]/following-sibling::span');
|
||||||
|
|
||||||
|
const age = query.content('//span[contains(text(), "Age")]/following-sibling::span')?.trim();
|
||||||
|
|
||||||
|
if (age && /\w+ \d+, \d{4}/.test(age)) {
|
||||||
|
profile.dateOfBirth = unprint.extractDate(age, 'MMMM D, YYYY');
|
||||||
|
} else {
|
||||||
|
profile.age = Number(age) || null;
|
||||||
|
}
|
||||||
|
|
||||||
|
profile.avatar = [
|
||||||
|
query.img('.model_bio_pic img, .model_bio_thumb', { attribute: 'src0_3x' }),
|
||||||
|
query.img('.model_bio_pic img, .model_bio_thumb', { attribute: 'src0_2x' }),
|
||||||
|
query.img('.model_bio_pic img, .model_bio_thumb', { attribute: 'src0_1x' }),
|
||||||
|
query.img('.model_bio_pic img, .model_bio_thumb', { attribute: 'src0' }),
|
||||||
|
query.img('.model_bio_pic img, .model_bio_thumb', { attribute: 'src' }),
|
||||||
|
].filter(Boolean);
|
||||||
|
|
||||||
|
profile.scenes = scrapeAll(unprint.initAll(query.all('.grid-item')), entity, true);
|
||||||
|
|
||||||
|
return profile;
|
||||||
|
}
|
||||||
|
|
||||||
async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle = false) {
|
async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle = false) {
|
||||||
const url = site.parameters?.latest
|
const url = site.parameters?.latest
|
||||||
|
@ -376,10 +404,10 @@ async function fetchProfile({ name: actorName, url }, entity) {
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
|
|
||||||
const res = await http.get(profileUrl);
|
const res = await unprint.get(profileUrl);
|
||||||
|
|
||||||
if (res.statusCode === 200) {
|
if (res.ok) {
|
||||||
return scrapeProfile(res.body.toString(), profileUrl, actorName, entity);
|
return scrapeProfile(res.context, profileUrl, actorName, entity);
|
||||||
}
|
}
|
||||||
|
|
||||||
return null;
|
return null;
|
||||||
|
|
Loading…
Reference in New Issue