From bd041c528daaf249df4c7ee6cab39f5ed815ebf4 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sun, 17 Nov 2019 00:45:31 +0100 Subject: [PATCH] Fixed ID number/string incompatibility in duplicate detection. Expanded Reality Kings scraper to handle older scenes and fix URLs. --- src/releases.js | 4 +++- src/scrape-sites.js | 8 ++++---- src/scrapers/realitykings.js | 39 +++++++++++++++++++++--------------- traxxx | 2 ++ 4 files changed, 32 insertions(+), 21 deletions(-) create mode 100755 traxxx diff --git a/src/releases.js b/src/releases.js index 8e2b2ced..00f72b28 100644 --- a/src/releases.js +++ b/src/releases.js @@ -133,7 +133,9 @@ async function storeRelease(release) { async function storeReleases(releases) { return Promise.map(releases, async (release) => { try { - return storeRelease(release); + const releaseId = await storeRelease(release); + + return releaseId; } catch (error) { console.error(error); diff --git a/src/scrape-sites.js b/src/scrape-sites.js index 103cd49f..99157346 100644 --- a/src/scrape-sites.js +++ b/src/scrape-sites.js @@ -24,8 +24,8 @@ async function findDuplicateReleaseIds(latestReleases, accReleases) { // include accumulated releases as duplicates to prevent an infinite // loop when the next page contains the same releases as the previous return new Set(duplicateReleases - .map(release => release.entry_id) - .concat(accReleases.map(release => release.entryId))); + .map(release => String(release.entry_id)) + .concat(accReleases.map(release => String(release.entryId)))); } async function scrapeUniqueReleases(scraper, site, afterDate = getAfterDate(), accReleases = [], page = 1) { @@ -122,11 +122,11 @@ async function scrapeReleases() { } } catch (error) { if (argv.debug) { - console.error(`${site.id}: Failed to fetch releases`, error); + console.error(`${site.id}: Failed to scrape releases`, error); return; } - console.warn(`${site.id}: Failed to fetch releases`); + console.warn(`${site.id}: Failed to scrape releases`); } }, { 
concurrency: 2, diff --git a/src/scrapers/realitykings.js b/src/scrapers/realitykings.js index 747c320e..e5bf3215 100644 --- a/src/scrapers/realitykings.js +++ b/src/scrapers/realitykings.js @@ -8,6 +8,20 @@ const { JSDOM } = require('jsdom'); const { matchTags } = require('../tags'); +function getThumbs(scene) { + if (scene.images.poster) { + return scene.images.poster.map(image => image.xl.url); + } + + if (scene.images.card_main_rect) { + return scene.images.card_main_rect + .concat(scene.images.card_secondary_rect || []) + .map(image => image.xl.url.replace('.thumb', '')); + } + + return []; +} + async function scrapeLatest(html, site) { const { document } = new JSDOM(html).window; @@ -29,24 +43,17 @@ async function scrapeLatest(html, site) { description, } = scene; - const url = `https://www.realitykings.com/scene/${entryId}`; + const url = `https://www.realitykings.com/scene/${entryId}/`; const date = new Date(scene.dateReleased); const actors = scene.actors.map(actorId => actorsMap[actorId].name); + const duration = scene.videos.mediabook && scene.videos.mediabook.length; const rawTags = scene.tags.map(tagId => tagsMap[tagId].name); const tags = await matchTags(rawTags); - if (!scene.images.poster) { - console.log(site.name, site.id); - console.log(scene); - console.log(title, url, scene.images); - } - - const [poster, ...photos] = scene.images.poster.map(image => image.xl.url); - - const duration = scene.videos.mediabook.length; - const trailer720p = scene.videos.mediabook.files['720p'] && scene.videos.mediabook.files['720p'].urls.view; - const trailer360p = scene.videos.mediabook.files['360p'] && scene.videos.mediabook.files['360p'].urls.view; + const [poster, ...photos] = getThumbs(scene); + const trailer720p = scene.videos.mediabook && scene.videos.mediabook.files['720p'] && scene.videos.mediabook.files['720p'].urls.view; + const trailer360p = scene.videos.mediabook && scene.videos.mediabook.files['360p'] && 
scene.videos.mediabook.files['360p'].urls.view; const { likes, dislikes } = scene.stats; @@ -86,11 +93,11 @@ async function scrapeScene(data, url, site) { const rawTags = data.tags.map(tag => tag.name); const tags = await matchTags(rawTags); - const [poster, ...photos] = data.images.poster.map(image => image.xl.url); + const [poster, ...photos] = getThumbs(data); - const duration = data.videos.mediabook.length; - const trailer720p = data.videos.mediabook.files['720p'] && data.videos.mediabook.files['720p'].urls.view; - const trailer360p = data.videos.mediabook.files['360p'] && data.videos.mediabook.files['360p'].urls.view; + const duration = data.videos.mediabook && data.videos.mediabook.length; + const trailer720p = data.videos.mediabook && data.videos.mediabook.files['720p'] && data.videos.mediabook.files['720p'].urls.view; + const trailer360p = data.videos.mediabook && data.videos.mediabook.files['360p'] && data.videos.mediabook.files['360p'].urls.view; return { url, diff --git a/traxxx b/traxxx new file mode 100755 index 00000000..13765190 --- /dev/null +++ b/traxxx @@ -0,0 +1,2 @@ +#!/usr/bin/bash +node ./src/app.js "$@";