Updated Jules Jordan profile scraper.

This commit is contained in:
DebaucheryLibrarian
2023-07-06 05:09:05 +02:00
parent 9331c0af52
commit 51e04e7331
3 changed files with 43 additions and 9 deletions

View File

@@ -3,7 +3,6 @@
const util = require('util');
const Promise = require('bluebird');
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const unprint = require('unprint');
@@ -272,9 +271,8 @@ function scrapeMovie({ el, query }, url, site) {
};
}
/*
function scrapeProfile(html, url, actorName, entity) {
const { document } = new JSDOM(html).window;
const bio = document.querySelector('.model_bio').textContent;
const avatarEl = document.querySelector('.model_bio_pic img, .model_bio_thumb');
@@ -320,6 +318,36 @@ function scrapeProfile(html, url, actorName, entity) {
return profile;
}
*/
/**
 * Assembles an actor profile object from a parsed profile page.
 *
 * Reads the description, height, measurements, age or date of birth and
 * avatar image candidates through `query`, and hands every `.grid-item`
 * element to `scrapeAll` to populate the profile's scenes.
 *
 * @param {object} context - parsed page context; only its `query` helper is used
 * @param {string} url - profile URL, stored on the result as `url`
 * @param {string} name - actor name; not used by this function
 * @param {object} entity - forwarded unchanged to `scrapeAll`
 * @returns {object} profile with `url`, `description`, `height`, `measurements`,
 *   `avatar`, `scenes`, and either `dateOfBirth` or `age`
 */
function scrapeProfile({ query }, url, name, entity) {
	const avatarSelector = '.model_bio_pic img, .model_bio_thumb';
	// candidate attributes in the order they are tried — presumably highest resolution first, TODO confirm
	const avatarAttributes = ['src0_3x', 'src0_2x', 'src0_1x', 'src0', 'src'];

	const profile = {
		url,
		// the spaces are important to avoid selecting a similar comment
		description: query.content('//comment()[contains(., " Bio Extra Field ")]/following-sibling::span'),
		height: heightToCm(query.content('//span[contains(text(), "Height")]/following-sibling::span')),
		measurements: query.content('//span[contains(text(), "Measurements")]/following-sibling::span'),
	};

	// the "Age" field holds either a written-out date or something numeric
	const ageText = query.content('//span[contains(text(), "Age")]/following-sibling::span')?.trim();
	const looksLikeDate = Boolean(ageText) && /\w+ \d+, \d{4}/.test(ageText);

	if (looksLikeDate) {
		profile.dateOfBirth = unprint.extractDate(ageText, 'MMMM D, YYYY');
	} else {
		// anything that does not convert to a non-zero number is stored as null
		profile.age = Number(ageText) || null;
	}

	// keep only the attributes that actually produced a value
	profile.avatar = avatarAttributes
		.map((attribute) => query.img(avatarSelector, { attribute }))
		.filter(Boolean);

	const sceneContexts = unprint.initAll(query.all('.grid-item'));

	profile.scenes = scrapeAll(sceneContexts, entity, true);

	return profile;
}
async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle = false) {
const url = site.parameters?.latest
@@ -376,10 +404,10 @@ async function fetchProfile({ name: actorName, url }, entity) {
return null;
}
const res = await http.get(profileUrl);
const res = await unprint.get(profileUrl);
if (res.statusCode === 200) {
return scrapeProfile(res.body.toString(), profileUrl, actorName, entity);
if (res.ok) {
return scrapeProfile(res.context, profileUrl, actorName, entity);
}
return null;