Returning window.document instead of the element as document from q. Fixed actor collisions when scrapers return the same scene multiple times. Scraping all Score actor release pages. Fixed 21Sextury and PureTaboo photo scraping.

This commit is contained in:
ThePendulum 2020-02-05 23:57:55 +01:00
parent 75dbe2548a
commit d4801bb240
6 changed files with 36 additions and 13 deletions

View File

@ -456,7 +456,8 @@ async function associateActors(mappedActors, releases) {
const actorEntry = existingActorEntries.find(actor => actor.slug === actorMap[actorName]) const actorEntry = existingActorEntries.find(actor => actor.slug === actorMap[actorName])
|| await storeActor({ name: actorName }); || await storeActor({ name: actorName });
return releaseIds // if a scene
return Array.from(releaseIds)
.map(releaseId => ({ .map(releaseId => ({
release_id: releaseId, release_id: releaseId,
actor_id: actorEntry.id, actor_id: actorEntry.id,

View File

@ -292,8 +292,8 @@ function accumulateActors(releases) {
release.actors.forEach((actor) => { release.actors.forEach((actor) => {
const actorName = actor.name ? actor.name.trim() : actor.trim(); const actorName = actor.name ? actor.name.trim() : actor.trim();
if (!acc[actorName]) acc[actorName] = []; if (!acc[actorName]) acc[actorName] = new Set();
acc[actorName].push(release.id); acc[actorName].add(release.id);
}); });
return acc; return acc;
@ -372,10 +372,11 @@ async function storeRelease(release) {
site_id: release.site.id, site_id: release.site.id,
}) })
.first(); .first();
const curatedRelease = await curateReleaseEntry(release); const curatedRelease = await curateReleaseEntry(release);
if (existingRelease && !argv.redownload) { if (existingRelease && !argv.redownload) {
return existingRelease.id; return existingRelease;
} }
if (existingRelease && argv.redownload) { if (existingRelease && argv.redownload) {
@ -394,7 +395,7 @@ async function storeRelease(release) {
await associateTags(release, existingRelease.id); await associateTags(release, existingRelease.id);
return existingRelease.id; return existingRelease;
} }
const [releaseEntry] = await knex('releases') const [releaseEntry] = await knex('releases')

View File

@ -90,7 +90,7 @@ async function scrapeScene(html, url, site) {
const poster = videoData.picPreview; const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`; const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photos = await getPhotos($('.picturesItem a').attr('href'), '21sextury.com', site); const photos = await getPhotos($('.picturesItem a').attr('href'), site);
const tags = data.keywords.split(', '); const tags = data.keywords.split(', ');
const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title'); const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');

View File

@ -69,7 +69,7 @@ async function scrapeScene(html, url, site) {
src: `${videoData.playerOptions.host}${videoData.url}`, src: `${videoData.playerOptions.host}${videoData.url}`,
}; };
release.photos = await getPhotos(q('.picturesItem a').href, 'puretaboo.com', site); release.photos = await getPhotos(q('.picturesItem a').href, site);
return release; return release;
} }

View File

@ -2,7 +2,7 @@
const bhttp = require('bhttp'); const bhttp = require('bhttp');
const { ex, exa } = require('../utils/q'); const { ex, exa, get } = require('../utils/q');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
const { heightToCm, lbsToKg } = require('../utils/convert'); const { heightToCm, lbsToKg } = require('../utils/convert');
@ -132,7 +132,24 @@ function scrapeModels(html, actorName) {
return model?.href || null; return model?.href || null;
} }
function scrapeProfile(html) { async function fetchActorReleases(url, accReleases = []) {
const { document, qu } = await get(url);
if (document) {
const releases = accReleases.concat(scrapeAll(document.body.outerHTML));
const nextPage = qu('.next-pg');
if (nextPage && new URL(nextPage).searchParams.has('page')) {
return fetchActorReleases(nextPage, releases);
}
return releases;
}
return null;
}
async function scrapeProfile(html) {
const { q, qa, qi } = ex(html, '#model-page'); const { q, qa, qi } = ex(html, '#model-page');
const profile = { gender: 'female' }; const profile = { gender: 'female' };
@ -170,8 +187,10 @@ function scrapeProfile(html) {
const avatar = qi('img'); const avatar = qi('img');
if (avatar) profile.avatar = avatar; if (avatar) profile.avatar = avatar;
const releases = ex(html, '#model-page + .container, #model-page + .container-fluid'); const { qu } = ex(html, '#model-page + .container, #model-page + .container-fluid');
if (releases) profile.releases = scrapeAll(releases.document.outerHTML); const releasesPage = qu('.next-pg');
if (releasesPage) profile.releases = await fetchActorReleases(releasesPage);
return profile; return profile;
} }

View File

@ -161,8 +161,10 @@ function init(element, window) {
return { return {
element, element,
document: element, ...(window && {
...(window && { window }), window,
document: window.document,
}),
...contextFuncs, ...contextFuncs,
}; };
} }