From ab46e8558d6c1fbeebe22b87384da3bcea325c3a Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 4 Jun 2024 05:12:41 +0200 Subject: [PATCH] Fixed Kink scraper. --- package-lock.json | 8 +-- package.json | 2 +- src/scrapers/kink.js | 139 ++++++++++++++++++++++--------------------- 3 files changed, 77 insertions(+), 72 deletions(-) diff --git a/package-lock.json b/package-lock.json index c62a2fd2..d4b23066 100644 --- a/package-lock.json +++ b/package-lock.json @@ -88,7 +88,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.10.11", + "unprint": "^0.10.12", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -18293,9 +18293,9 @@ } }, "node_modules/unprint": { - "version": "0.10.11", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.10.11.tgz", - "integrity": "sha512-+OL+8BFF9SYvayp57l8ifq77I6ok2ilPCidBVka7VbMALJgqHxkHqrqkCupw2RKX2tNfPT/TGa+NJsYGboFnRQ==", + "version": "0.10.12", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.10.12.tgz", + "integrity": "sha512-EbRGhkoOcmnMmQBaKZA6Tky6gpEwrhy4tDB1KeajSGhqli7zhlNe3WqsTQPtLBNKa/4M2PJZS8l0GOOjvTLndQ==", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", diff --git a/package.json b/package.json index 26e72856..8ca1280c 100755 --- a/package.json +++ b/package.json @@ -147,7 +147,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.10.11", + "unprint": "^0.10.12", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/src/scrapers/kink.js b/src/scrapers/kink.js index 598ca800..692ab4a0 100755 --- a/src/scrapers/kink.js +++ b/src/scrapers/kink.js @@ -4,40 +4,49 @@ const unprint = require('unprint'); const http = require('../utils/http'); const slugify = require('../utils/slugify'); +const { stripQuery } = require('../utils/url'); function scrapeAll(scenes, entity) { return scenes.map(({ query }) => { const release = {}; const networkUrl = entity.type === 'channel' ? entity.parent.url : entity.url; - const href = query.url('.shoot-link'); + const href = query.url('a[href*="/shoot"]'); release.url = `${networkUrl}${href}`; release.shootId = href.split('/').slice(-1)[0]; release.entryId = release.shootId; - release.title = query.content('.shoot-thumb-title a', true); - release.date = query.date('.date', 'MMM DD, YYYY'); + release.title = query.content('.card-body a[href*="/shoot"]').trim(); + release.date = query.date('small > span', 'MMM D, YYYY'); - release.actors = query.all('.shoot-thumb-models a').map((actorEl) => ({ + release.actors = query.all('a[href*="/model"]').map((actorEl) => ({ name: unprint.query.content(actorEl), url: unprint.query.url(actorEl, null, { origin: networkUrl }), })); - release.rating = query.number('.thumb-ratings') / 10; + const poster = query.img('.ratio-thumbnail img'); - release.poster = query.img('.adimage'); - release.photos = query.imgs('.rollover .roll-image', { attribute: 'data-imagesrc' }).map((photo) => [ - photo - .replace('410/', '830/') - .replace('_thumb', '_full'), - photo, - ]); + release.poster = [ + stripQuery(poster).replace('_thumb', '_full'), + stripQuery(poster), + poster, + ]; + + try { + release.photos = JSON.parse(query.attribute('.ratio-thumbnail img', 'data-cycle')).map((src) => [ + stripQuery(src).replace('_thumb', '_full'), + stripQuery(src), + src, + ]); + } catch (error) { + // no photos + } release.trailer = `https://cdnp.kink.com/imagedb/${release.entryId}/trailer/${release.entryId}_trailer_high.mp4`; - release.duration = query.dur('.video span'); + release.rating = query.number('.thumb-up') / 10; return release; }); @@ -45,65 +54,63 @@ function scrapeAll(scenes, entity) { function scrapeScene({ query }, url, entity) { const release = { url }; + const data = query.json('div[data-setup]', { attribute: 'data-setup' }); - release.shootId = new URL(url).pathname.split('/')[2]; - release.entryId = release.shootId; + release.shootId = data?.id || new URL(url).pathname.split('/')[2]; + release.entryId = data?.id || release.shootId; - release.title = query.attribute('.shoot-title .favorite-button', 'data-title') || query.content('.shoot-title'); - release.description = query.content('.description-text'); + release.title = data?.title || query.attribute('#shootPage #favoriteShootButton', 'data-title') || query.content('#shootPage h1'); + release.description = query.content('//h4[contains(text(), \'Description\')]/following-sibling::span/p'); - release.date = query.date('.shoot-date', 'MMMM DD, YYYY'); + release.date = query.date('.shoot-detail-legend', 'MM/DD/YY'); + release.duration = data?.duration + ? data.duration / 1000 + : query.duration('#shootPage .clock'); - release.actors = query.elements('.names a').map((actorEl) => ({ + release.actors = query.elements('#shootPage h1 + span a[href*="/model"]').map((actorEl) => ({ name: unprint.query.content(actorEl).replace(/,\s*/, ''), url: unprint.query.url(actorEl, null, { origin: entity.type === 'channel' ? entity.parent.url : entity.url }), })); - release.director = query.content('.director-name'); + release.director = query.content('.director-name')?.trim(); - release.photos = query.imgs('.gallery .thumb img, #gallerySlider .gallery-img', { attribute: 'data-image-file' }); - release.poster = query.poster(); - release.trailer = query.dataset('.player span[data-type="trailer-src"]', 'url') || `https://cdnp.kink.com/imagedb/${release.entryId}/trailer/${release.entryId}_trailer_high.mp4`; + const poster = data?.posterUrl || query.poster(); - release.tags = query.contents('.tag-list a[href*="/tag"]').map((tag) => tag.replace(/,\s*/, '')); + release.poster = [ + stripQuery(poster), + poster, + ]; - release.channel = slugify(query.url('.shoot-logo a')?.split('/').slice(-1)[0], ''); + release.photos = query.json('#galleryImagesContainer', { attribute: 'data-images' })?.map((src) => [ + src.fullPath, + src.thumbFullPath, + ]); + + release.trailer = [ + ...(data?.trailer?.sources?.map((source) => ({ + src: source.url, + quality: source.resolution, + })) || []), + `https://cdnp.kink.com/imagedb/${release.entryId}/trailer/${release.entryId}_trailer_high.mp4`, + ]; + + release.tags = query.contents('#shootPage a[href*="/tag"]').map((tag) => tag.replace(/,\s*/, '')); + release.channel = data?.channelName?.name || slugify(query.url('.shoot-detail-legend a[href*="/channel"]')?.split('/').slice(-1)[0], ''); + + release.qualities = data?.resolutions + ? Object.entries(data.resolutions).filter(([, enabled]) => enabled).map(([res]) => parseInt(res, 10)) + : null; return release; } -async function fetchActorReleases(actorId, entity, page = 1, accReleases = []) { - const networkUrl = entity.type === 'channel' ? entity.parent.url : entity.url; - const { tab } = await http.getBrowserSession('kink'); - const res = await tab.goto(`${networkUrl}/search?type=shoots&performerIds=${actorId}&sort=published&page=${page}`); - - if (res.status() === 200) { - const html = await tab.content(); - const item = unprint.init(html); - const releases = scrapeAll(unprint.initAll(html, '.results .shoot-card'), entity); - const hasNextPage = item.query.exists('.paginated-nav li:last-child:not(.disabled)'); - - await tab.close(); - - if (hasNextPage) { - return fetchActorReleases(actorId, entity, page + 1, accReleases.concat(releases)); - } - - return accReleases.concat(releases); - } - - await tab.close(); - - return accReleases; -} - -async function scrapeProfile({ query }, actorUrl, entity, include) { +async function scrapeProfile({ query }, actorUrl) { const profile = {}; - profile.entryId = actorUrl.match(/\/model\/(\d+)\//)?.[1] || query.attribute('.favorite-button.bio-favorite', 'data-id'); - profile.description = query.content('.bio-outer #expand-text'); + profile.entryId = actorUrl.match(/\/model\/(\d+)\//)?.[1] || query.attribute('h1 + button[data-id]', 'data-id'); + profile.description = query.content('.content-container #expand-text')?.trim(); - const tags = query.contents('.bio-tags a').map((tag) => tag.toLowerCase()); + const tags = query.contents('.content-container a[href*="/tag"]').map((tag) => tag.toLowerCase().trim()); if (tags.includes('brunette') || tags.includes('brunet')) profile.hairColor = 'brown'; if (tags.includes('blonde') || tags.includes('blond')) profile.hairColor = 'blonde'; @@ -125,24 +132,21 @@ async function scrapeProfile({ query }, actorUrl, entity, include) { if ((tags.includes('big dick') || tags.includes('foreskin')) && (tags.includes('fake boobs') || tags.includes('big tits'))) profile.gender = 'transsexual'; - profile.avatar = query.img('.bio-slider-img, .bio-img:not([src*="Missing"])'); - profile.social = query.urls('a.social-link'); - - if (include.releases && profile.entryId) { - profile.releases = await fetchActorReleases(profile.entryId, entity); - } + [profile.avatar, ...profile.photos] = query.imgs('.kink-slider-img:not([data-src*="Missing"])', { attribute: 'data-src' }); + profile.social = query.urls('.content-container a[href*="twitter.com"], .content-container a[href*="x.com"]'); return profile; } async function fetchLatest(channel, page = 1) { const { tab } = await http.getBrowserSession('kink'); - const res = await tab.goto(`${channel.parent.url}/search?type=shoots&channelIds=${channel.parameters?.slug || channel.slug}&sort=published&page=${page}`); + const url = `${channel.parent.url}/search?type=shoots&channelIds=${channel.parameters?.slug || channel.slug}&sort=published&page=${page}`; + const res = await tab.goto(url); const status = res.status(); if (status === 200) { const html = await tab.content(); - const items = unprint.initAll(html, '.results .shoot-card'); + const items = unprint.initAll(html, '.container .card'); const scenes = scrapeAll(items, channel); @@ -178,7 +182,7 @@ async function fetchScene(url, channel) { return status; } -async function fetchProfile({ name: actorName }, entity, options) { +async function fetchProfile({ name: actorName }, entity) { const networkUrl = entity.type === 'channel' ? entity.parent.url : entity.url; const { tab } = await http.getBrowserSession('kink'); @@ -188,12 +192,13 @@ async function fetchProfile({ name: actorName }, entity, options) { if (searchStatus === 200) { const searchHtml = await tab.content(); - const searchResItems = unprint.initAll(searchHtml, '.model'); - const actorItem = searchResItems.find((item) => item.query.exists(`.model-link img[alt="${actorName}"]`)); + const searchResItems = unprint.initAll(searchHtml, '.ratio-model'); + const actorItem = searchResItems.find((item) => item.query.exists(`//span[contains(text(), '${actorName}')]`)); if (actorItem) { - const actorPath = actorItem.query.url('.model-link'); + const actorPath = actorItem.query.url(null); const actorUrl = `${networkUrl}${actorPath}`; + const actorRes = await tab.goto(actorUrl); const actorStatus = actorRes.status(); @@ -203,7 +208,7 @@ async function fetchProfile({ name: actorName }, entity, options) { await tab.close(); - return scrapeProfile(item, actorUrl, entity, options); + return scrapeProfile(item, actorUrl); } await tab.close();