Refactored Kink scraper to use unprint browser. Improved socials handling in actors module.

This commit is contained in:
DebaucheryLibrarian 2025-12-28 05:48:24 +01:00
parent f5d6574cc6
commit 5c585d5d45
4 changed files with 129 additions and 68 deletions

2
common

@ -1 +1 @@
Subproject commit dc00c3d58af2c23530b8b3cb6704f3860fdd7d0f Subproject commit 4b90a5feeccc0c6325469dcb45a8d7cceabb386a

View File

@ -403,4 +403,23 @@ module.exports = {
flushWindow: 1000, flushWindow: 1000,
}, },
titleSlugLength: 50, titleSlugLength: 50,
socials: {
urls: {
cashapp: 'https://cash.app/${handle}', // eslint-disable-line no-template-curly-in-string
fansly: 'https://fansly.com/{handle}',
instagram: 'https://www.instagram.com/{handle}',
linktree: 'https://linktr.ee/{handle}',
loyalfans: 'https://www.loyalfans.com/{handle}',
manyvids: 'https://{handle}.manyvids.com',
onlyfans: 'https://onlyfans.com/{handle}',
pornhub: 'https://www.pornhub.com/model/{handle}',
reddit: 'https://www.reddit.com/u/{handle}',
twitter: 'https://x.com/{handle}',
},
prefix: {
default: '@',
cashapp: '$',
reddit: 'u/',
},
},
}; };

View File

@ -698,7 +698,61 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy
return profiles.filter(Boolean); return profiles.filter(Boolean);
} }
/**
 * Normalize a profile's raw social entries into uniform records.
 *
 * Accepted entry shapes:
 * - a string, treated as a URL;
 * - an object with a `url`;
 * - an object with both `handle` and `platform`.
 * Anything else (including null/undefined) is dropped.
 *
 * Each surviving entry becomes either `{ platform, handle }` or, when no
 * platform/handle can be determined, `{ url }`.
 *
 * @param {Array<string|Object>} socials - raw social entries from a scraper
 * @param {Object} platformsByHostname - lookup keyed by URL hostname; each value
 *   is read for `platform` and a `pathname` template containing `{handle}`
 *   (shape inferred from usage here — confirm against the actors common module)
 * @returns {Array<Object>} records shaped `{ platform, handle }` or `{ url }`
 * @throws {Error} 'Invalid social' when an entry has a handle and platform that
 *   fail validation and no URL to fall back on
 * @throws {TypeError} when a URL cannot be parsed by `new URL()`
 */
function curateSocials(socials, platformsByHostname) {
	return socials
		.map((social) => {
			if (typeof social === 'string') {
				return { url: social };
			}

			// Keep the whole object, not just the URL string: the next pass reads
			// .url, .handle and .platform off of it. Returning the bare string made
			// every { url } entry fail as 'Invalid social'.
			if (social?.url || (social?.handle && social?.platform)) {
				return social;
			}

			return null;
		})
		.filter(Boolean)
		.map((social) => {
			if (social.handle && social.platform && /[\w-]+/.test(social.handle) && /[a-z]+/i.test(social.platform)) {
				return {
					platform: social.platform.toLowerCase(),
					handle: social.handle,
				};
			}

			if (social.url) {
				const { hostname, pathname } = new URL(social.url);
				const platform = platformsByHostname[hostname];

				// Only build the matcher for a usable template, so an unexpected
				// lookup hit (e.g. an inherited object property) cannot crash on .replace.
				if (typeof platform?.pathname === 'string') {
					const handle = pathname.match(new RegExp(platform.pathname.replace('{handle}', '([\\w-]+)')))?.[1];

					if (handle) {
						return {
							platform: platform.platform,
							handle,
						};
					}
				}

				// Unknown platform or unrecognized path: keep the raw URL.
				return {
					url: social.url,
				};
			}

			throw new Error('Invalid social');
		});
}
async function associateSocials(profiles) { async function associateSocials(profiles) {
const { platformsByHostname } = await actorsCommon;
const profileEntries = await knex('actors_profiles').whereIn(['actor_id', 'entity_id'], profiles.map((profile) => [profile.actorId, profile.entity.id])); const profileEntries = await knex('actors_profiles').whereIn(['actor_id', 'entity_id'], profiles.map((profile) => [profile.actorId, profile.entity.id]));
const profileEntriesByActorIdAndEntityId = profileEntries.reduce((acc, profileEntry) => { const profileEntriesByActorIdAndEntityId = profileEntries.reduce((acc, profileEntry) => {
@ -725,11 +779,12 @@ async function associateSocials(profiles) {
} }
await knex('actors_socials') await knex('actors_socials')
.insert(profile.social.map((url) => ({ .insert(curateSocials(profile.social, platformsByHostname).map((social) => ({
url, platform: social.platform,
platform: new URL(url).hostname.match(/([\w-]+)?\.(\w+)$/)?.[1], handle: social.handle,
url: social.url,
actor_id: profile.actorId, actor_id: profile.actorId,
profile_id: profileId, // profile_id: profileId,
}))) })))
.onConflict() .onConflict()
.ignore(); .ignore();

View File

@ -2,7 +2,6 @@
const unprint = require('unprint'); const unprint = require('unprint');
const http = require('../utils/http');
const slugify = require('../utils/slugify'); const slugify = require('../utils/slugify');
const { stripQuery } = require('../utils/url'); const { stripQuery } = require('../utils/url');
@ -40,11 +39,12 @@ function scrapeAll(scenes, entity) {
})); }));
try { try {
release.photos = JSON.parse(query.attribute('.ratio-thumbnail img', 'data-cycle')).map((src) => [ release.photos = JSON.parse(query.attribute('.ratio-thumbnail img', 'data-cycle'))
.map((src) => Array.from(new Set([
stripQuery(src).replace('_thumb', '_full'), stripQuery(src).replace('_thumb', '_full'),
stripQuery(src), stripQuery(src),
src, src,
].filter(Boolean).map((source) => ({ ])).filter(Boolean).map((source) => ({
src: source, src: source,
expectType: { expectType: {
PNG: 'image/png', PNG: 'image/png',
@ -56,7 +56,7 @@ function scrapeAll(scenes, entity) {
release.trailer = `https://cdnp.kink.com/imagedb/${release.entryId}/trailer/${release.entryId}_trailer_high.mp4`; release.trailer = `https://cdnp.kink.com/imagedb/${release.entryId}/trailer/${release.entryId}_trailer_high.mp4`;
release.channel = slugify(query.content('.shoot-detail-legend a[href*="/channel"]'), ''); release.channel = slugify(query.content('.shoot-thumbnail-footer a[href*="/channel"]'), '');
release.rating = query.number('.thumb-up') / 10; release.rating = query.number('.thumb-up') / 10;
return release; return release;
@ -64,25 +64,21 @@ function scrapeAll(scenes, entity) {
} }
async function fetchLatest(channel, page = 1) { async function fetchLatest(channel, page = 1) {
const { tab } = await http.getBrowserSession('kink', { useGlobalBrowser: false, useProxy: true });
const url = `${channel.parent.url}/search?type=shoots&channelIds=${channel.parameters?.slug || channel.slug}&sort=published&page=${page}`; const url = `${channel.parent.url}/search?type=shoots&channelIds=${channel.parameters?.slug || channel.slug}&sort=published&page=${page}`;
const res = await tab.goto(url);
const status = res.status();
if (status === 200) { const res = await unprint.browserRequest(url, {
const html = await tab.content(); selectAll: '.container .card',
const items = unprint.initAll(html, '.container .card'); });
const scenes = scrapeAll(items, channel); if (res.status === 200) {
// const items = unprint.initAll(html, '.container .card');
await tab.close(); const scenes = scrapeAll(res.context, channel);
return scenes; return scenes;
} }
await tab.close(); return res.status;
return status;
} }
function scrapeScene({ query }, url, entity) { function scrapeScene({ query }, url, entity) {
@ -149,29 +145,19 @@ function scrapeScene({ query }, url, entity) {
} }
async function fetchScene(url, channel) { async function fetchScene(url, channel) {
const { tab } = await http.getBrowserSession('kink', { useGlobalBrowser: false, useProxy: true }); const res = await unprint.browserRequest(url);
const res = await tab.goto(url);
const status = res.status(); if (res.status === 200) {
const scene = scrapeScene(res.context, url, channel);
if (status === 200) {
const html = await tab.content();
const item = unprint.init(html);
const scene = scrapeScene(item, url, channel);
await tab.close();
return scene; return scene;
} }
await tab.close(); return res.status;
return status;
} }
async function scrapeProfile({ query }, actorUrl) { async function scrapeProfile({ query }, actorUrl) {
const profile = {}; const profile = { url: actorUrl };
profile.entryId = actorUrl.match(/\/model\/(\d+)\//)?.[1] || query.attribute('h1 + button[data-id]', 'data-id'); profile.entryId = actorUrl.match(/\/model\/(\d+)\//)?.[1] || query.attribute('h1 + button[data-id]', 'data-id');
profile.description = query.content('.content-container #expand-text')?.trim(); profile.description = query.content('.content-container #expand-text')?.trim();
@ -204,42 +190,43 @@ async function scrapeProfile({ query }, actorUrl) {
return profile; return profile;
} }
async function fetchProfile({ name: actorName }, entity) { async function getActorUrl({ name: actorName, url }, networkUrl) {
const networkUrl = entity.type === 'channel' ? entity.parent.url : entity.url; if (url) {
const { tab } = await http.getBrowserSession('kink', { useGlobalBrowser: false, useProxy: true }); return url;
}
// const searchRes = await tab.goto(`${networkUrl}/search?type=performers&q=${actorName}`); // const searchRes = await tab.goto(`${networkUrl}/search?type=performers&q=${actorName}`);
const searchApiRes = await tab.goto(`https://www.kink.com/api/v2/search/suggestions/performers?term=${actorName}`); const searchApiRes = await unprint.browserRequest(`https://www.kink.com/api/v2/search/suggestions/performers?term=${actorName}`);
const searchStatus = searchApiRes.status();
if (searchStatus === 200) { if (searchApiRes.status === 200) {
const searchHtml = await tab.content(); const data = searchApiRes.context.query.json('body pre');
const data = unprint.init(searchHtml).query.json('body pre');
const actorId = data.find((actor) => actor.label === actorName)?.id; const actorId = data.find((actor) => actor.label === actorName)?.id;
if (actorId) { if (actorId) {
const actorUrl = `${networkUrl}/model/${actorId}/${slugify(actorName)}`; const actorUrl = `${networkUrl}/model/${actorId}/${slugify(actorName)}`;
const actorRes = await tab.goto(actorUrl);
const actorStatus = actorRes.status();
if (actorStatus === 200) { return actorUrl;
const actorHtml = await tab.content(); }
const item = unprint.init(actorHtml);
await tab.close();
return scrapeProfile(item, actorUrl);
} }
await tab.close(); return null;
}
async function fetchProfile(actor, entity) {
const networkUrl = entity.type === 'channel' ? entity.parent.url : entity.url;
const actorUrl = await getActorUrl(actor, networkUrl);
if (actorUrl) {
const actorRes = await unprint.browserRequest(actorUrl);
if (actorRes.status === 200) {
return scrapeProfile(actorRes.context, actorUrl);
}
return actorRes.status; return actorRes.status;
} }
return null; return null;
}
return searchStatus;
} }
module.exports = { module.exports = {