From 9d9eda29be547ad294d8ba99546fc43d5a800610 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sun, 9 Feb 2020 03:09:06 +0100 Subject: [PATCH] Added scene count to actor inspect. Preferring network slug over data brand for scene URLs in MindGeek scraper, since milehighmedia.com's brand is milehigh, resulting in milehigh.com. --- src/actors.js | 1 + src/scrape-releases.js | 2 +- src/scrapers/mindgeek.js | 12 ++++++------ src/utils/posters.js | 5 +++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/actors.js b/src/actors.js index c5024b5a..8fc00bcf 100644 --- a/src/actors.js +++ b/src/actors.js @@ -387,6 +387,7 @@ async function scrapeActors(actorNames) { if (argv.inspect) { console.log(profile); + logger.info(`Found ${profile && profile.releases ? profile.releases.length : 0} releases for ${actorName}`); /* profile may still be null here; the null check only follows below */ } if (profile === null) { diff --git a/src/scrape-releases.js b/src/scrape-releases.js index cc19a8b8..4f0712b0 100644 --- a/src/scrape-releases.js +++ b/src/scrape-releases.js @@ -39,7 +39,7 @@ async function scrapeRelease(source, basicRelease = null, type = 'scene') { const site = await findSite(url, release); if (!site) { - throw new Error('Could not find site in database'); + throw new Error(`Could not find site ${url} in database`); } const scraper = scrapers.releases[site.slug] || scrapers.releases[site.network.slug]; diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js index 4c40a11d..5cfa4e77 100644 --- a/src/scrapers/mindgeek.js +++ b/src/scrapers/mindgeek.js @@ -73,7 +73,7 @@ async function scrapeLatest(items, site) { return latestReleases.filter(Boolean); } -function scrapeScene(data, url, _site) { +function scrapeScene(data, url, _site, networkName) { const release = {}; const { id: entryId, title, description } = data; @@ -100,7 +100,7 @@ function scrapeScene(data, url, _site) { const siteName = data.collections[0]?.name || data.brand; release.channel = siteName.replace(/\s+/g, '').toLowerCase(); - release.url = url || 
`https://www.${data.brand}.com/scene/${entryId}/`; + release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`; return release; } @@ -139,7 +139,7 @@ async function getSession(url) { return { session, instanceToken }; } -function scrapeProfile(data, html, releases = []) { +function scrapeProfile(data, html, releases = [], networkName) { const { qa, qd } = ex(html); const profile = { @@ -170,7 +170,7 @@ function scrapeProfile(data, html, releases = []) { const birthdate = qa('li').find(el => /Date of Birth/.test(el.textContent)); if (birthdate) profile.birthdate = qd(birthdate, 'span', 'MMMM Do, YYYY'); - profile.releases = releases.map(release => scrapeScene(release)); + profile.releases = (releases || []).map(release => scrapeScene(release, null, null, networkName)); /* an explicit null bypasses the [] default parameter, so guard before mapping */ return profile; } @@ -247,11 +247,11 @@ async function fetchProfile(actorName, networkName, actorPath = 'model') { ]); if (actorRes.statusCode === 200 && actorReleasesRes.statusCode === 200 && actorReleasesRes.body.result) { - return scrapeProfile(actorData, actorRes.body.toString(), actorReleasesRes.body.result); + return scrapeProfile(actorData, actorRes.body.toString(), actorReleasesRes.body.result, networkName); } if (actorRes.statusCode === 200) { - return scrapeProfile(actorData, actorRes.body.toString()); + return scrapeProfile(actorData, actorRes.body.toString(), undefined, networkName); /* undefined (not null) lets the releases default apply */ } } } diff --git a/src/utils/posters.js b/src/utils/posters.js index eefe9508..8f44284c 100644 --- a/src/utils/posters.js +++ b/src/utils/posters.js @@ -3,13 +3,14 @@ const config = require('config'); const path = require('path'); const fs = require('fs-extra'); +const moment = require('moment'); const argv = require('../argv'); const knex = require('../knex'); async function init() { const posters = await knex('actors') - .select('actors.name as actor_name', 
'releases.title', 'releases.date', 'media.path', 'sites.name as site_name', 'networks.name as network_name') .whereIn('actors.name', argv.actors) .join('releases_actors', 'releases_actors.actor_id', 'actors.id') .join('releases', 'releases_actors.release_id', 'releases.id') @@ -20,7 +21,7 @@ async function init() { await Promise.all(posters.map(async (poster) => { const source = path.join(config.media.path, poster.path); - const target = path.join(config.media.path, 'posters', `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')}.jpeg`); + const target = path.join(config.media.path, 'posters', `${poster.actor_name} - ${poster.network_name}: ${poster.site_name} - ${poster.title.replace(/[/.]/g, '_')} (${moment.utc(poster.date).format('YYYY-MM-DD')}).jpeg`); const file = await fs.readFile(source); await fs.writeFile(target, file);