From d4801bb2406f66977910f5fbf72942a1a002917c Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Wed, 5 Feb 2020 23:57:55 +0100 Subject: [PATCH] Returning window.document instead of element as document from q. Fixed actor collisions when scrapers return same scene multiple times. Scraping all Score actor release pages. Fixed 21Sextury and PureTaboo photo scraping. --- src/actors.js | 3 ++- src/releases.js | 9 +++++---- src/scrapers/21sextury.js | 2 +- src/scrapers/puretaboo.js | 2 +- src/scrapers/score.js | 27 +++++++++++++++++++++++---- src/utils/q.js | 6 ++++-- 6 files changed, 36 insertions(+), 13 deletions(-) diff --git a/src/actors.js b/src/actors.js index 12364f76..dbdf78f5 100644 --- a/src/actors.js +++ b/src/actors.js @@ -456,7 +456,8 @@ async function associateActors(mappedActors, releases) { const actorEntry = existingActorEntries.find(actor => actor.slug === actorMap[actorName]) || await storeActor({ name: actorName }); - return releaseIds + // release IDs are accumulated in a Set, so a scene returned multiple times by a scraper is only associated once; convert to an array before mapping + return Array.from(releaseIds) .map(releaseId => ({ release_id: releaseId, actor_id: actorEntry.id, diff --git a/src/releases.js b/src/releases.js index aa01de48..8f2c1b22 100644 --- a/src/releases.js +++ b/src/releases.js @@ -292,8 +292,8 @@ function accumulateActors(releases) { release.actors.forEach((actor) => { const actorName = actor.name ? 
actor.name.trim() : actor.trim(); - if (!acc[actorName]) acc[actorName] = []; - acc[actorName].push(release.id); + if (!acc[actorName]) acc[actorName] = new Set(); + acc[actorName].add(release.id); }); return acc; @@ -372,10 +372,11 @@ async function storeRelease(release) { site_id: release.site.id, }) .first(); + const curatedRelease = await curateReleaseEntry(release); if (existingRelease && !argv.redownload) { - return existingRelease.id; + return existingRelease; } if (existingRelease && argv.redownload) { @@ -394,7 +395,7 @@ async function storeRelease(release) { await associateTags(release, existingRelease.id); - return existingRelease.id; + return existingRelease; } const [releaseEntry] = await knex('releases') diff --git a/src/scrapers/21sextury.js b/src/scrapers/21sextury.js index 68879c0b..a6a08f95 100644 --- a/src/scrapers/21sextury.js +++ b/src/scrapers/21sextury.js @@ -90,7 +90,7 @@ async function scrapeScene(html, url, site) { const poster = videoData.picPreview; const trailer = `${videoData.playerOptions.host}${videoData.url}`; - const photos = await getPhotos($('.picturesItem a').attr('href'), '21sextury.com', site); + const photos = await getPhotos($('.picturesItem a').attr('href'), site); const tags = data.keywords.split(', '); const siteName = data.productionCompany ? 
data.productionCompany.name : $('#logoLink a').attr('title'); diff --git a/src/scrapers/puretaboo.js b/src/scrapers/puretaboo.js index b5cd19b4..dc95bc5a 100644 --- a/src/scrapers/puretaboo.js +++ b/src/scrapers/puretaboo.js @@ -69,7 +69,7 @@ async function scrapeScene(html, url, site) { src: `${videoData.playerOptions.host}${videoData.url}`, }; - release.photos = await getPhotos(q('.picturesItem a').href, 'puretaboo.com', site); + release.photos = await getPhotos(q('.picturesItem a').href, site); return release; } diff --git a/src/scrapers/score.js b/src/scrapers/score.js index a983f8ba..499da36f 100644 --- a/src/scrapers/score.js +++ b/src/scrapers/score.js @@ -2,7 +2,7 @@ const bhttp = require('bhttp'); -const { ex, exa } = require('../utils/q'); +const { ex, exa, get } = require('../utils/q'); const slugify = require('../utils/slugify'); const { heightToCm, lbsToKg } = require('../utils/convert'); @@ -132,7 +132,24 @@ function scrapeModels(html, actorName) { return model?.href || null; } -function scrapeProfile(html) { +async function fetchActorReleases(url, accReleases = []) { + const { document, qu } = await get(url); + + if (document) { + const releases = accReleases.concat(scrapeAll(document.body.outerHTML)); + const nextPage = qu('.next-pg'); + + if (nextPage && new URL(nextPage).searchParams.has('page')) { + return fetchActorReleases(nextPage, releases); + } + + return releases; + } + + return null; +} + +async function scrapeProfile(html) { const { q, qa, qi } = ex(html, '#model-page'); const profile = { gender: 'female' }; @@ -170,8 +187,10 @@ function scrapeProfile(html) { const avatar = qi('img'); if (avatar) profile.avatar = avatar; - const releases = ex(html, '#model-page + .container, #model-page + .container-fluid'); - if (releases) profile.releases = scrapeAll(releases.document.outerHTML); + const { qu } = ex(html, '#model-page + .container, #model-page + .container-fluid'); + const releasesPage = qu('.next-pg'); + + if (releasesPage) 
profile.releases = await fetchActorReleases(releasesPage); return profile; } diff --git a/src/utils/q.js b/src/utils/q.js index c7c073f7..47ecbda6 100644 --- a/src/utils/q.js +++ b/src/utils/q.js @@ -161,8 +161,10 @@ function init(element, window) { return { element, - document: element, - ...(window && { window }), + ...(window && { + window, + document: window.document, + }), ...contextFuncs, }; }