Returning window.document instead of element as document from q. Fixed actor collisions when scrapers return same scene multiple times. Scraping all Score actor release pages. Fixed 21Sextury and PureTaboo photo scraping.

This commit is contained in:
ThePendulum 2020-02-05 23:57:55 +01:00
parent 75dbe2548a
commit d4801bb240
6 changed files with 36 additions and 13 deletions

View File

@ -456,7 +456,8 @@ async function associateActors(mappedActors, releases) {
const actorEntry = existingActorEntries.find(actor => actor.slug === actorMap[actorName])
|| await storeActor({ name: actorName });
return releaseIds
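// if a scene is returned multiple times by a scraper, its release ID is only kept once because releaseIds is a Set; convert it to an array before mapping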
return Array.from(releaseIds)
.map(releaseId => ({
release_id: releaseId,
actor_id: actorEntry.id,

View File

@ -292,8 +292,8 @@ function accumulateActors(releases) {
release.actors.forEach((actor) => {
const actorName = actor.name ? actor.name.trim() : actor.trim();
if (!acc[actorName]) acc[actorName] = [];
acc[actorName].push(release.id);
if (!acc[actorName]) acc[actorName] = new Set();
acc[actorName].add(release.id);
});
return acc;
@ -372,10 +372,11 @@ async function storeRelease(release) {
site_id: release.site.id,
})
.first();
const curatedRelease = await curateReleaseEntry(release);
if (existingRelease && !argv.redownload) {
return existingRelease.id;
return existingRelease;
}
if (existingRelease && argv.redownload) {
@ -394,7 +395,7 @@ async function storeRelease(release) {
await associateTags(release, existingRelease.id);
return existingRelease.id;
return existingRelease;
}
const [releaseEntry] = await knex('releases')

View File

@ -90,7 +90,7 @@ async function scrapeScene(html, url, site) {
const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photos = await getPhotos($('.picturesItem a').attr('href'), '21sextury.com', site);
const photos = await getPhotos($('.picturesItem a').attr('href'), site);
const tags = data.keywords.split(', ');
const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');

View File

@ -69,7 +69,7 @@ async function scrapeScene(html, url, site) {
src: `${videoData.playerOptions.host}${videoData.url}`,
};
release.photos = await getPhotos(q('.picturesItem a').href, 'puretaboo.com', site);
release.photos = await getPhotos(q('.picturesItem a').href, site);
return release;
}

View File

@ -2,7 +2,7 @@
const bhttp = require('bhttp');
const { ex, exa } = require('../utils/q');
const { ex, exa, get } = require('../utils/q');
const slugify = require('../utils/slugify');
const { heightToCm, lbsToKg } = require('../utils/convert');
@ -132,7 +132,24 @@ function scrapeModels(html, actorName) {
return model?.href || null;
}
function scrapeProfile(html) {
/**
 * Collect an actor's releases from a paginated release listing, following the
 * '.next-pg' link from page to page.
 *
 * @param {string} url - Absolute URL of the release page to fetch.
 * @param {Array} [accReleases=[]] - Releases gathered from earlier pages.
 * @returns {Promise<Array|null>} All releases gathered up to the last page, or
 *   null when a page yields no document (releases accumulated so far are not returned).
 */
async function fetchActorReleases(url, accReleases = []) {
  const { document, qu } = await get(url);

  if (!document) {
    return null;
  }

  const releases = accReleases.concat(scrapeAll(document.body.outerHTML));
  const nextPage = qu('.next-pg');

  if (nextPage) {
    // Resolve against the current page: a relative href passed to URL() without
    // a base throws a TypeError, which would abort the whole profile scrape.
    const nextPageUrl = new URL(nextPage, url);

    // Only follow links that actually paginate; anything else ends the crawl.
    if (nextPageUrl.searchParams.has('page')) {
      return fetchActorReleases(nextPageUrl.href, releases);
    }
  }

  return releases;
}
async function scrapeProfile(html) {
const { q, qa, qi } = ex(html, '#model-page');
const profile = { gender: 'female' };
@ -170,8 +187,10 @@ function scrapeProfile(html) {
const avatar = qi('img');
if (avatar) profile.avatar = avatar;
const releases = ex(html, '#model-page + .container, #model-page + .container-fluid');
if (releases) profile.releases = scrapeAll(releases.document.outerHTML);
const { qu } = ex(html, '#model-page + .container, #model-page + .container-fluid');
const releasesPage = qu('.next-pg');
if (releasesPage) profile.releases = await fetchActorReleases(releasesPage);
return profile;
}

View File

@ -161,8 +161,10 @@ function init(element, window) {
return {
element,
document: element,
...(window && { window }),
...(window && {
window,
document: window.document,
}),
...contextFuncs,
};
}