From 0d0acb6f3ca782ac3d5acf9162e9661655b9de15 Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Wed, 11 Mar 2020 03:01:37 +0100
Subject: [PATCH] Fixed release sites for profile scraping.

---
 src/actors.js               |  9 +++++++--
 src/app.js                  |  2 +-
 src/scrapers/julesjordan.js | 30 ++++++++++++++++++++++----------
 src/utils/capitalize.js     |  4 ++++
 src/utils/posters.js        | 10 ++++++----
 src/utils/qu.js             |  3 +--
 6 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/src/actors.js b/src/actors.js
index e13c805b..aad1fdb2 100644
--- a/src/actors.js
+++ b/src/actors.js
@@ -401,6 +401,10 @@ async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) {
         name: actorName,
         scraper: scraperSlug,
         site,
+        releases: profile.releases?.map(release => (typeof release === 'string'
+          ? { url: release, site }
+          : { ...release, site: release.site || site }
+        )),
       };
     }
 
@@ -427,7 +431,7 @@ async function scrapeActors(actorNames) {
 
   const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested
 
-  const [siteEntries, networks] = await Promise.all([
+  const [siteEntries, networkEntries] = await Promise.all([
     knex('sites')
       .leftJoin('networks', 'sites.network_id', 'networks.id')
       .select(
@@ -439,9 +443,10 @@ async function scrapeActors(actorNames) {
   ]);
 
   const sites = await curateSites(siteEntries, true);
+  const networks = networkEntries.map(network => ({ ...network, isFallback: true }));
 
   const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {});
 
-  const profiles = await scrapeProfiles(sources, sitesBySlug, actorName, actorEntry);
+  const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug);
   const profile = await mergeProfiles(profiles, actorEntry);
 
   if (profile === null) {
diff --git a/src/app.js b/src/app.js
index 95824c24..645d6f4c 100644
--- a/src/app.js
+++ b/src/app.js
@@ -31,7 +31,7 @@ async function init() {
   if (argv.withReleases) {
     const baseReleases = actors.map(actor => actor?.releases || []).flat();
 
-    const releases = await deepFetchReleases(baseReleases, null, 'scene');
+    const releases = await deepFetchReleases(baseReleases, null);
 
     await storeReleases(releases);
   }
diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js
index 1e092dc3..2c7dee78 100644
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@@ -8,7 +8,7 @@
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
 const logger = require('../logger')(__filename);
-const { get, geta, ctxa } = require('../utils/q');
+const { get, geta, ctxa, parseDate } = require('../utils/q');
 const { heightToCm } = require('../utils/convert');
 const slugify = require('../utils/slugify');
@@ -304,27 +304,37 @@ function scrapeProfile(html, url, actorName) {
   const { document } = new JSDOM(html).window;
   const bio = document.querySelector('.model_bio').textContent;
 
-  const avatarEl = document.querySelector('.model_bio_pic');
+  const avatarEl = document.querySelector('.model_bio_pic img');
 
   const profile = {
     name: actorName,
   };
 
   const heightString = bio.match(/\d+ feet \d+ inches/);
-  const ageString = bio.match(/Age:\s*\d{2}/);
+  const ageString = bio.match(/Age:\s*(\d{2})/);
+  const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/);
   const measurementsString = bio.match(/\w+-\d+-\d+/);
 
+  if (birthDateString) profile.birthdate = parseDate(birthDateString[1], 'MMMM D, YYYY');
+  if (ageString) profile.age = Number(ageString[1]);
+
   if (heightString) profile.height = heightToCm(heightString[0]);
-  if (ageString) profile.age = Number(ageString[0].match(/\d{2}/)[0]);
-  if (measurementsString) [profile.bust, profile.waist, profile.hip] = measurementsString[0].split('-');
+
+  if (measurementsString) {
+    const [bust, waist, hip] = measurementsString[0].split('-');
+
+    if (bust) profile.bust = bust;
+    if (waist) profile.waist = Number(waist);
+    if (hip) profile.hip = Number(hip);
+  }
 
   if (avatarEl) {
     const avatarSources = [
-      avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_3x') + 9, avatarEl.innerHTML.indexOf('3x.jpg') + 6).trim(),
-      avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_2x') + 9, avatarEl.innerHTML.indexOf('2x.jpg') + 6).trim(),
-      avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0_1x') + 9, avatarEl.innerHTML.indexOf('1x.jpg') + 6).trim(),
-      avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src0') + 6, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim(),
-      avatarEl.innerHTML.slice(avatarEl.innerHTML.indexOf('src') + 5, avatarEl.innerHTML.indexOf('set.jpg') + 7).trim(),
+      avatarEl.getAttribute('src0_3x'),
+      avatarEl.getAttribute('src0_2x'),
+      avatarEl.getAttribute('src0_1x'),
+      avatarEl.getAttribute('src0'),
+      avatarEl.getAttribute('src'),
     ].filter(Boolean);
 
     if (avatarSources.length) profile.avatar = avatarSources;
diff --git a/src/utils/capitalize.js b/src/utils/capitalize.js
index 5fa0c976..5263c34b 100644
--- a/src/utils/capitalize.js
+++ b/src/utils/capitalize.js
@@ -1,6 +1,10 @@
 'use strict';
 
 function capitalize(string, trim = true) {
+  if (!string) {
+    return '';
+  }
+
   const capitalized = string
     .split(/\s/)
     .map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`)
diff --git a/src/utils/posters.js b/src/utils/posters.js
index 2b8c5fa3..d30d6d95 100644
--- a/src/utils/posters.js
+++ b/src/utils/posters.js
@@ -10,20 +10,22 @@
 const knex = require('../knex');
 
 async function init() {
   const posters = await knex('actors')
-    .select('actors.name as actor_name', 'releases.title', 'releases.date', 'media.path', 'sites.name as site_name', 'networks.name as network_name')
-    .whereIn('actors.name', argv.actors)
+    .select('actors.name as actor_name', 'releases.title', 'releases.date', 'media.path', 'media.index', 'sites.name as site_name', 'networks.name as network_name')
+    .whereIn('actors.name', (argv.actors || []).concat(argv._))
     .join('releases_actors', 'releases_actors.actor_id', 'actors.id')
     .join('releases', 'releases_actors.release_id', 'releases.id')
-    .join('releases_posters', 'releases_posters.release_id', 'releases.id')
     .join('sites', 'sites.id', 'releases.site_id')
     .join('networks', 'networks.id', 'sites.network_id')
+    .join('releases_posters', 'releases_posters.release_id', 'releases.id')
     .join('media', 'releases_posters.media_id', 'media.id');
+  // .join('releases_photos', 'releases_photos.release_id', 'releases.id')
+  // .join('media', 'releases_photos.media_id', 'media.id');
 
   await Promise.all(posters.map(async (poster) => {
     const source = path.join(config.media.path, poster.path);
     const directory = path.join(config.media.path, 'extracted', poster.actor_name);
-    const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')}).jpeg`);
+    const target = path.join(directory, `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')})-${poster.index}.jpeg`);
 
     await fs.mkdir(path.join(directory), { recursive: true });
     const file = await fs.readFile(source);
diff --git a/src/utils/qu.js b/src/utils/qu.js
index afb75f09..1909a387 100644
--- a/src/utils/qu.js
+++ b/src/utils/qu.js
@@ -334,11 +334,10 @@ module.exports = {
   ex: extract,
   exa: extractAll,
   fd: formatDate,
+  parseDate: extractDate,
   ctx: init,
   ctxa: initAll,
   geta: getAll,
-  edate: extractDate,
-  fdate: formatDate,
   qu: quFuncs,
   ...legacyFuncs,
 };
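
Note on the scrapeProfiles() release mapping (src/actors.js): scrapers may
return profile.releases either as plain URL strings or as release objects.
The added map() tags both shapes with the site the profile was scraped from,
so the deep fetch in src/app.js can route every entry to the right scraper.
A minimal sketch of that mapping in isolation; the sample site object and
URLs are invented for illustration:

  const site = { slug: 'julesjordan', isFallback: false };

  const profile = {
    releases: [
      'https://www.julesjordan.com/trial/scenes/example-scene.html',
      { url: 'https://www.julesjordan.com/trial/scenes/other-scene.html' },
    ],
  };

  // Same expression as the actors.js hunk: strings become { url, site }
  // objects, and objects keep their own site when they already carry one.
  const releases = profile.releases?.map(release => (typeof release === 'string'
    ? { url: release, site }
    : { ...release, site: release.site || site }
  ));

  console.log(releases.map(release => release.site.slug)); // [ 'julesjordan', 'julesjordan' ]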
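
Note on the julesjordan.js bio parsing: the new regexes suggest the model
bio lists a full birth date after "Age:", so profile.birthdate is preferred
and a bare two-digit age is only used when that is all that matches. A
standalone sketch of the parsing; the bio text is invented, and the
parseDate stand-in only assumes that qu.js's extractDate wraps moment
roughly like this:

  const moment = require('moment');

  // Hypothetical stand-in for parseDate (qu.js's extractDate).
  const parseDate = (string, format) => moment.utc(string, format).toDate();

  const bio = 'Age: March 3, 1992 Height: 5 feet 7 inches Measurements: 34C-24-36';

  const ageString = bio.match(/Age:\s*(\d{2})/); // null here: a date, not a bare age, follows "Age:"
  const birthDateString = bio.match(/Age:\s*(\w+ \d{1,2}, \d{4})/); // captures 'March 3, 1992'
  const measurementsString = bio.match(/\w+-\d+-\d+/); // '34C-24-36'

  const profile = {};

  if (birthDateString) profile.birthdate = parseDate(birthDateString[1], 'MMMM D, YYYY');
  if (ageString) profile.age = Number(ageString[1]);

  if (measurementsString) {
    const [bust, waist, hip] = measurementsString[0].split('-');

    if (bust) profile.bust = bust; // bust keeps its cup letter, e.g. '34C'
    if (waist) profile.waist = Number(waist);
    if (hip) profile.hip = Number(hip);
  }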