forked from DebaucheryLibrarian/traxxx
Added puppeteer to http module, refactored Kink scraper to utilize it.
This commit is contained in:
@@ -208,6 +208,7 @@ async function init() {
|
||||
}
|
||||
|
||||
await http.destroyBypassSessions();
|
||||
await http.destroyBrowserSessions();
|
||||
|
||||
knex.destroy();
|
||||
done = true;
|
||||
|
||||
@@ -221,10 +221,14 @@ async function scrapeRelease(baseRelease, entitiesBySlug, type = 'scene') {
|
||||
|
||||
async function scrapeReleases(baseReleases, entitiesBySlug, type) {
|
||||
const entitiesWithBeforeDataEntries = await Promise.all(Object.entries(entitiesBySlug).map(async ([slug, entity]) => {
|
||||
console.log('scraper', entity.scraper?.beforeFetchScenes);
|
||||
|
||||
if (entity.scraper?.beforeFetchScenes) {
|
||||
const parameters = getRecursiveParameters(entity);
|
||||
const preData = await entity.scraper.beforeFetchScenes(entity, parameters);
|
||||
|
||||
console.log('pre data', preData);
|
||||
|
||||
return [slug, { ...entity, preData }];
|
||||
}
|
||||
|
||||
|
||||
@@ -1,26 +1,34 @@
|
||||
'use strict';
|
||||
|
||||
const unprint = require('unprint');
|
||||
|
||||
const qu = require('../utils/qu');
|
||||
const http = require('../utils/http');
|
||||
const slugify = require('../utils/slugify');
|
||||
|
||||
function scrapeAll(scenes) {
|
||||
return scenes.map(({ query }) => {
|
||||
const release = {};
|
||||
|
||||
const href = query.url('.shoot-thumb-info > a');
|
||||
release.url = `https://kink.com${href}`;
|
||||
const href = query.url('.shoot-link');
|
||||
|
||||
release.url = `https://www.kink.com${href}`;
|
||||
|
||||
release.shootId = href.split('/').slice(-1)[0];
|
||||
release.entryId = release.shootId;
|
||||
|
||||
release.title = query.q('.shoot-thumb-title a', true);
|
||||
release.title = query.content('.shoot-thumb-title a', true);
|
||||
release.date = query.date('.date', 'MMM DD, YYYY');
|
||||
|
||||
release.actors = query.all('.shoot-thumb-models a', true);
|
||||
release.stars = query.q('.average-rating', 'data-rating') / 10;
|
||||
release.actors = query.all('.shoot-thumb-models a').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl),
|
||||
url: unprint.query.url(actorEl, null, { origin: 'https://www.kink.com' }),
|
||||
}));
|
||||
|
||||
release.rating = query.number('.thumb-ratings') / 10;
|
||||
|
||||
release.poster = query.img('.adimage');
|
||||
release.photos = query.imgs('.rollover .roll-image', 'data-imagesrc').map((photo) => [
|
||||
release.photos = query.imgs('.rollover .roll-image', { attribute: 'data-imagesrc' }).map((photo) => [
|
||||
photo.replace('410/', '830/'),
|
||||
photo,
|
||||
]);
|
||||
@@ -31,25 +39,28 @@ function scrapeAll(scenes) {
|
||||
});
|
||||
}
|
||||
|
||||
async function scrapeScene({ query }, url) {
|
||||
function scrapeScene({ query }, url) {
|
||||
const release = { url };
|
||||
|
||||
release.shootId = new URL(url).pathname.split('/')[2];
|
||||
release.entryId = release.shootId;
|
||||
|
||||
release.title = query.q('.shoot-title span.favorite-button', 'data-title');
|
||||
release.description = query.q('.description-text', true);
|
||||
release.title = query.attribute('.shoot-title .favorite-button', 'data-title') || query.content('.shoot-title');
|
||||
release.description = query.content('.description-text');
|
||||
|
||||
release.date = query.date('.shoot-date', 'MMMM DD, YYYY');
|
||||
release.actors = query.all('.names a', true).map((actor) => actor.replace(/,\s*/, ''));
|
||||
release.director = query.q('.director-name', true);
|
||||
release.actors = query.elements('.names a').map((actorEl) => ({
|
||||
name: unprint.query.content(actorEl).replace(/,\s*/, ''),
|
||||
url: unprint.query.url(actorEl, null, { origin: 'https://www.kink.com' }),
|
||||
}));
|
||||
release.director = query.content('.director-name');
|
||||
|
||||
release.photos = query.imgs('.gallery .thumb img, #gallerySlider .gallery-img', 'data-image-file');
|
||||
release.poster = query.poster();
|
||||
|
||||
release.tags = query.all('.tag-list a[href*="/tag"]', true).map((tag) => tag.replace(/,\s*/, ''));
|
||||
release.tags = query.contents('.tag-list a[href*="/tag"]').map((tag) => tag.replace(/,\s*/, ''));
|
||||
|
||||
const trailer = query.q('.player span[data-type="trailer-src"]', 'data-url');
|
||||
const trailer = query.attribute('.player span[data-type="trailer-src"]', 'data-url');
|
||||
|
||||
if (trailer) {
|
||||
release.trailer = [
|
||||
@@ -72,7 +83,9 @@ async function scrapeScene({ query }, url) {
|
||||
];
|
||||
}
|
||||
|
||||
release.channel = query.url('.shoot-logo a').split('/').slice(-1)[0];
|
||||
release.channel = slugify(query.url('.shoot-logo a')?.split('/').slice(-1)[0], '');
|
||||
|
||||
console.log(release);
|
||||
|
||||
return release;
|
||||
}
|
||||
@@ -131,33 +144,43 @@ async function scrapeProfile({ query }, actorUrl, include) {
|
||||
return profile;
|
||||
}
|
||||
|
||||
/**
 * Opens a puppeteer tab on the kink.com front page and clicks through the
 * two dialogs addressed by the selectors below before any listing is fetched.
 *
 * @returns {Promise<Object>} The puppeteer tab, left open for the caller.
 */
async function beforeFetchLatest() {
	// getBrowserSession resolves to { browser, tab }, not to the tab itself,
	// so the tab has to be destructured before it can be navigated.
	const { tab } = await http.getBrowserSession('kink');

	await tab.goto('https://www.kink.com');

	// NOTE(review): presumably the cookie-consent and content-type dialogs — confirm against the live page.
	// `delay` is the pause between mousedown and mouseup, in milliseconds.
	await tab.click('#ccc-recommended-settings', { delay: 1000 });
	await tab.click('#contentTypeModal button', { delay: 1000 });

	return tab;
}
|
||||
|
||||
/**
 * Fetches one page of the latest scenes for a channel through a puppeteer tab.
 *
 * @param {Object} site - Channel entity; `site.slug` is used as the `channelIds` search parameter.
 * @param {number} [page=1] - Result page to request.
 * @returns {Promise<Array|number>} Scraped scenes on HTTP 200, otherwise the HTTP status code.
 */
async function fetchLatest(site, page = 1) {
	const { tab } = await http.getBrowserSession('kink', { headless: false });

	try {
		const res = await tab.goto(`https://www.kink.com/search?type=shoots&channelIds=${site.slug}&sort=published&page=${page}`);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const items = unprint.initAll(html, '.results .shoot-card');

		return scrapeAll(items, site);
	} finally {
		// Close the tab on every path (success, non-200 status, or a thrown error)
		// so failed requests do not accumulate open tabs in the shared browser.
		await tab.close();
	}
}
|
||||
|
||||
/**
 * Loads a scene page in a puppeteer tab and scrapes it.
 *
 * @param {string} url - Scene page URL to navigate to.
 * @param {Object} channel - Forwarded unchanged to scrapeScene as its third argument.
 * @returns {Promise<Object|number>} Scraped release on HTTP 200, otherwise the HTTP status code.
 */
async function fetchScene(url, channel) {
	const { tab } = await http.getBrowserSession('kink');

	try {
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const item = unprint.init(html);

		return scrapeScene(item, url, channel);
	} finally {
		// Close the tab on every path (success, non-200 status, or a thrown error)
		// so failed requests do not accumulate open tabs in the shared browser.
		await tab.close();
	}
}
|
||||
|
||||
async function fetchProfile({ name: actorName }, entity, include) {
|
||||
@@ -185,8 +208,8 @@ async function fetchProfile({ name: actorName }, entity, include) {
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
beforeFetchLatest,
|
||||
// beforeNetwork,
|
||||
fetchLatest,
|
||||
fetchScene,
|
||||
fetchProfile,
|
||||
scrapeScene,
|
||||
};
|
||||
|
||||
@@ -235,7 +235,7 @@ async function scrapeChannel(channelEntity, accNetworkReleases, beforeNetwork) {
|
||||
}
|
||||
|
||||
try {
|
||||
const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity);
|
||||
const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity, { beforeNetwork });
|
||||
|
||||
return await scrapeChannelReleases(layoutScraper, channelEntity, {
|
||||
...accNetworkReleases,
|
||||
|
||||
@@ -9,7 +9,8 @@ const stream = require('stream');
|
||||
const tunnel = require('tunnel');
|
||||
const Bottleneck = require('bottleneck');
|
||||
const { JSDOM, toughCookie } = require('jsdom');
|
||||
const puppeteer = require('puppeteer');
|
||||
const puppeteer = require('puppeteer-extra');
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
|
||||
const windows = require('./http-windows');
|
||||
|
||||
@@ -30,12 +31,13 @@ const limiters = {
|
||||
const bypassSessions = new Map();
|
||||
|
||||
let browser = null;
|
||||
const browserSessions = new Map();
|
||||
|
||||
Promise.config({
|
||||
cancellation: true,
|
||||
});
|
||||
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
const defaultOptions = {
|
||||
timeout: argv.requestTimeout,
|
||||
encodeJSON: true,
|
||||
@@ -156,26 +158,27 @@ function extractJson(solution) {
|
||||
return solution.response;
|
||||
}
|
||||
|
||||
/**
 * Opens a new tab in the shared puppeteer browser, launching the browser on first use.
 * The work is queued through the `bypass` limiter.
 *
 * @param {string} [identifier] - Label used only in the log message.
 * @param {Object} [options] - Session options.
 * @param {boolean|string} [options.headless=true] - Passed to puppeteer.launch; only takes
 *   effect on the call that actually launches the browser.
 * @returns {Promise<{browser: Object, tab: Object}>} The shared browser and the newly opened tab.
 */
async function getBrowserSession(identifier, options = {}) {
	return limiters.bypass.schedule(async () => {
		if (!browser) {
			// destructuring default applies only when headless is undefined
			const { headless = true } = options;

			browser = await puppeteer.launch({ headless });

			logger.info('Initialized puppeteer browser');
		}

		const tab = await browser.newPage();

		// leading space inside the suffix keeps the message readable ("tab for kink")
		logger.verbose(`Opened puppeteer tab${identifier ? ` for ${identifier}` : ''}`);

		return { browser, tab };
	});
}
|
||||
|
||||
async function bypassBrowserRequest(url, _options) {
|
||||
const page = await limiters.bypass.schedule(async () => getBrowserSession(new URL(url).hostname));
|
||||
const page = await getBrowserSession(new URL(url).hostname);
|
||||
|
||||
const res = await page.goto(url);
|
||||
const body = await page.content();
|
||||
@@ -254,6 +257,10 @@ async function destroyBypassSessions() {
|
||||
await Promise.map(sessionListRes.body.sessions, async (sessionId) => destroyBypassSession(sessionId), { concurrency: 5 });
|
||||
}
|
||||
|
||||
/**
 * Closes the shared puppeteer browser, if one was launched.
 *
 * @returns {Promise<void>}
 */
async function destroyBrowserSessions() {
	if (!browser) {
		return;
	}

	// Clear the module-level reference before closing so a later getBrowserSession
	// launches a fresh browser instead of reusing the closed one.
	const activeBrowser = browser;

	browser = null;

	await activeBrowser.close();
}
|
||||
|
||||
async function bypassCloudflareRequest(url, method, body, cloudflareBypass, options, attempts = 0) {
|
||||
const sessionId = await limiters.bypass.schedule(async () => getBypassSession(url, cloudflareBypass));
|
||||
|
||||
@@ -456,9 +463,11 @@ module.exports = {
|
||||
patch,
|
||||
session: getSession,
|
||||
cookieJar: getCookieJar,
|
||||
browser,
|
||||
getBrowserSession,
|
||||
getBypassSession,
|
||||
getSession,
|
||||
getCookieJar,
|
||||
destroyBypassSessions,
|
||||
destroyBrowserSessions,
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user