Added puppeteer to http module, refactored Kink scraper to utilize it.

This commit is contained in:
DebaucheryLibrarian
2022-12-28 01:34:12 +01:00
parent 17feadbc15
commit 74214bc060
14 changed files with 537 additions and 62 deletions

View File

@@ -1,26 +1,34 @@
'use strict';
const unprint = require('unprint');
const qu = require('../utils/qu');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
function scrapeAll(scenes) {
return scenes.map(({ query }) => {
const release = {};
const href = query.url('.shoot-thumb-info > a');
release.url = `https://kink.com${href}`;
const href = query.url('.shoot-link');
release.url = `https://www.kink.com${href}`;
release.shootId = href.split('/').slice(-1)[0];
release.entryId = release.shootId;
release.title = query.q('.shoot-thumb-title a', true);
release.title = query.content('.shoot-thumb-title a', true);
release.date = query.date('.date', 'MMM DD, YYYY');
release.actors = query.all('.shoot-thumb-models a', true);
release.stars = query.q('.average-rating', 'data-rating') / 10;
release.actors = query.all('.shoot-thumb-models a').map((actorEl) => ({
name: unprint.query.content(actorEl),
url: unprint.query.url(actorEl, null, { origin: 'https://www.kink.com' }),
}));
release.rating = query.number('.thumb-ratings') / 10;
release.poster = query.img('.adimage');
release.photos = query.imgs('.rollover .roll-image', 'data-imagesrc').map((photo) => [
release.photos = query.imgs('.rollover .roll-image', { attribute: 'data-imagesrc' }).map((photo) => [
photo.replace('410/', '830/'),
photo,
]);
@@ -31,25 +39,28 @@ function scrapeAll(scenes) {
});
}
async function scrapeScene({ query }, url) {
function scrapeScene({ query }, url) {
const release = { url };
release.shootId = new URL(url).pathname.split('/')[2];
release.entryId = release.shootId;
release.title = query.q('.shoot-title span.favorite-button', 'data-title');
release.description = query.q('.description-text', true);
release.title = query.attribute('.shoot-title .favorite-button', 'data-title') || query.content('.shoot-title');
release.description = query.content('.description-text');
release.date = query.date('.shoot-date', 'MMMM DD, YYYY');
release.actors = query.all('.names a', true).map((actor) => actor.replace(/,\s*/, ''));
release.director = query.q('.director-name', true);
release.actors = query.elements('.names a').map((actorEl) => ({
name: unprint.query.content(actorEl).replace(/,\s*/, ''),
url: unprint.query.url(actorEl, null, { origin: 'https://www.kink.com' }),
}));
release.director = query.content('.director-name');
release.photos = query.imgs('.gallery .thumb img, #gallerySlider .gallery-img', 'data-image-file');
release.poster = query.poster();
release.tags = query.all('.tag-list a[href*="/tag"]', true).map((tag) => tag.replace(/,\s*/, ''));
release.tags = query.contents('.tag-list a[href*="/tag"]').map((tag) => tag.replace(/,\s*/, ''));
const trailer = query.q('.player span[data-type="trailer-src"]', 'data-url');
const trailer = query.attribute('.player span[data-type="trailer-src"]', 'data-url');
if (trailer) {
release.trailer = [
@@ -72,7 +83,9 @@ async function scrapeScene({ query }, url) {
];
}
release.channel = query.url('.shoot-logo a').split('/').slice(-1)[0];
release.channel = slugify(query.url('.shoot-logo a')?.split('/').slice(-1)[0], '');
console.log(release);
return release;
}
@@ -131,33 +144,43 @@ async function scrapeProfile({ query }, actorUrl, include) {
return profile;
}
/**
 * Opens kink.com in a browser session and clicks through the two dialogs
 * matched by the selectors below, so later navigation is not blocked by them.
 *
 * NOTE(review): the other call sites in this file destructure `{ tab }` from
 * `http.getBrowserSession(...)` and pass a session name; here the result is
 * used as the tab directly and no name is passed — confirm which is intended.
 *
 * @returns {Promise<object>} the value obtained from `http.getBrowserSession()`.
 */
async function beforeFetchLatest() {
	// Selectors are clicked in this order, each with a 1000 ms click delay.
	const dialogSelectors = [
		'#ccc-recommended-settings',
		'#contentTypeModal button',
	];

	const tab = await http.getBrowserSession();

	await tab.goto('https://www.kink.com');

	for (const selector of dialogSelectors) {
		// Sequential on purpose: the second click must follow the first.
		await tab.click(selector, { delay: 1000 });
	}

	return tab;
}
/**
 * Fetches one page of the latest shoots for a channel through a browser tab
 * and scrapes the result cards.
 *
 * The tab is closed in every case: success, non-200 status, or an error thrown
 * while navigating or scraping.
 *
 * @param {object} site - channel entity; `site.slug` is used as the `channelIds` query value.
 * @param {number} [page=1] - results page to load.
 * @returns {Promise<Array|number>} scraped releases when the page responds with
 *   200, otherwise the HTTP status code.
 */
async function fetchLatest(site, page = 1) {
	// NOTE(review): headless mode is explicitly disabled here; presumably required by the site — confirm.
	const { tab } = await http.getBrowserSession('kink', { headless: false });

	try {
		const res = await tab.goto(`https://www.kink.com/search?type=shoots&channelIds=${site.slug}&sort=published&page=${page}`);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const items = unprint.initAll(html, '.results .shoot-card');

		return scrapeAll(items, site);
	} finally {
		// Always release the tab, including on non-200 responses and thrown errors.
		await tab.close();
	}
}
/**
 * Loads a scene page in a browser tab and scrapes it.
 *
 * The tab is closed in every case: success, non-200 status, or an error thrown
 * while navigating or scraping.
 *
 * @param {string} url - scene page URL to open.
 * @param {object} channel - passed through to `scrapeScene` as its third argument.
 * @returns {Promise<object|number>} the scraped release when the page responds
 *   with 200, otherwise the HTTP status code.
 */
async function fetchScene(url, channel) {
	const { tab } = await http.getBrowserSession('kink');

	try {
		const res = await tab.goto(url);
		const status = res.status();

		if (status !== 200) {
			return status;
		}

		const html = await tab.content();
		const item = unprint.init(html);

		return scrapeScene(item, url, channel);
	} finally {
		// Always release the tab, including on non-200 responses and thrown errors.
		await tab.close();
	}
}
async function fetchProfile({ name: actorName }, entity, include) {
@@ -185,8 +208,8 @@ async function fetchProfile({ name: actorName }, entity, include) {
}
// Functions this scraper module exposes to its consumers.
// NOTE(review): presumably these names are looked up by the scraping framework — confirm against the caller.
module.exports = {
beforeFetchLatest,
// beforeNetwork,
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};