From 8c3707114578b2f2ed27de334f897887bc20ae69 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Wed, 12 Jun 2024 03:21:45 +0200 Subject: [PATCH] Expanded puppeteer options. Fixed Mike Adriano scraper. Fixed convert utility. --- .eslintrc | 1 + package-lock.json | 8 +- package.json | 2 +- seeds/00_tags.js | 4 + seeds/02_sites.js | 21 +++- src/.eslintrc | 1 + src/actors.js | 10 +- src/scrapers/mikeadriano.js | 208 +++++++++++++++++++++++------------- src/scrapers/scrapers.js | 1 + src/utils/convert.js | 2 +- src/utils/http.js | 15 ++- 11 files changed, 180 insertions(+), 93 deletions(-) diff --git a/.eslintrc b/.eslintrc index 0f9b4e5d..4f4ae6ef 100755 --- a/.eslintrc +++ b/.eslintrc @@ -11,6 +11,7 @@ "no-tabs": "off", "no-unused-vars": ["error", {"argsIgnorePattern": "^_"}], "no-console": 0, + "arrow-body-style": 0, "default-param-last": 0, "template-curly-spacing": "off", "max-len": 0, diff --git a/package-lock.json b/package-lock.json index 171da7a0..393d1e2e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -88,7 +88,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.10.12", + "unprint": "^0.11.2", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -18293,9 +18293,9 @@ } }, "node_modules/unprint": { - "version": "0.10.12", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.10.12.tgz", - "integrity": "sha512-EbRGhkoOcmnMmQBaKZA6Tky6gpEwrhy4tDB1KeajSGhqli7zhlNe3WqsTQPtLBNKa/4M2PJZS8l0GOOjvTLndQ==", + "version": "0.11.2", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.11.2.tgz", + "integrity": "sha512-i4WJxmEnd6LKYbcnKAjX8bkaPRdyDlhAAqpxej0qIX0pjK5d17hp51x/RGDMfEe63dlcJtGCn9bhZrGcMY4PXQ==", "dependencies": { "axios": "^0.27.2", "bottleneck": "^2.19.5", diff --git a/package.json b/package.json index 29bbc924..06bd0109 100755 --- a/package.json +++ b/package.json @@ -147,7 +147,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.10.12", + "unprint": "^0.11.2", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/seeds/00_tags.js b/seeds/00_tags.js index 7087a682..1627dc2e 100755 --- a/seeds/00_tags.js +++ b/seeds/00_tags.js @@ -166,6 +166,10 @@ const tags = [ name: 'ass worship', slug: 'ass-worship', }, + { + name: 'audition', + slug: 'audition', + }, { name: 'babe', slug: 'babe', diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 4ac2be4f..13515b92 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -6026,7 +6026,7 @@ const sites = [ { slug: 'trueanal', name: 'True Anal', - url: 'https://trueanal.com', + url: 'https://tour.trueanal.com', description: 'TrueAnal is the hottest site with all hardcore Anal content and only the most popular pornstars getting their asses pounded and gapped with huge cock and more!', tags: ['anal'], parent: 'mikeadriano', @@ -6034,14 +6034,14 @@ const sites = [ { slug: 'analonly', name: 'Anal Only', - url: 'https://analonly.com', + url: 'https://tour.analonly.com', tags: ['anal'], parent: 'mikeadriano', }, { slug: 'allanal', name: 'All Anal', - url: 'https://allanal.com', + url: 'https://tour.allanal.com', description: 'Popular babes getting their tight asses filled with cock! Pure anal fucking only at AllAnal!', tags: ['anal', 'mff'], parent: 'mikeadriano', @@ -6049,18 +6049,29 @@ const sites = [ { slug: 'nympho', name: 'Nympho', - url: 'https://nympho.com', + url: 'https://tour.nympho.com', description: 'These Babes have an appetite for nasty, sloppy fucking!', parent: 'mikeadriano', }, { slug: 'swallowed', name: 'Swallowed', - url: 'https://swallowed.com', + url: 'https://tour.swallowed.com', description: 'Swallowed is a Premium adult website for the hottest Blowjobs content online with only the most popular pornstars swallowing cock!', tags: ['blowjob', 'deepthroat', 'facefucking'], parent: 'mikeadriano', }, + { + slug: 'dirtyauditions', + name: 'Dirty Auditions', + url: 'https://dirtyauditions.com', + description: 'Watch hot pornstars tryout for DirtyAuditions in exclusive and extreme HD videos!', + tags: ['audition'], + parent: 'mikeadriano', + parameters: { + useBrowser: true, + }, + }, // MILE HIGH MEDIA { slug: 'doghousedigital', diff --git a/src/.eslintrc b/src/.eslintrc index f11cb59c..e4db6f58 100755 --- a/src/.eslintrc +++ b/src/.eslintrc @@ -14,6 +14,7 @@ "no-underscore-dangle": 0, "default-param-last": 0, "prefer-destructuring": "off", + "arrow-body-style": 0, "template-curly-spacing": "off", "object-curly-newline": "off" } diff --git a/src/actors.js b/src/actors.js index f379bb9c..b133d18e 100755 --- a/src/actors.js +++ b/src/actors.js @@ -443,13 +443,13 @@ async function curateProfile(profile, actor) { curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null; // combined measurement value - const measurements = profile.measurements?.match(/(\d+)(\w+)\s*[-x]\s*(\d+)\s*[-x]\s*(\d+)/); // ExCoGi uses x, Jules Jordan has spaces between the dashes + const measurements = profile.measurements?.match(/(\d+)(\w+)(?:\s*[-x]\s*(\d+)\s*[-x]\s*(\d+))?/); // ExCoGi uses x, Jules Jordan has spaces between the dashes if (measurements) { - curatedProfile.bust = Number(measurements[1]); - curatedProfile.cup = measurements[2]; - curatedProfile.waist = Number(measurements[3]); - curatedProfile.hip = Number(measurements[4]); + curatedProfile.bust = Number(measurements[1]) || null; + curatedProfile.cup = measurements[2] || null; + curatedProfile.waist = Number(measurements[3]) || null; + curatedProfile.hip = Number(measurements[4]) || null; } curatedProfile.penisLength = Number(profile.penisLength) || profile.penisLength?.match?.(/\d+/)?.[0] || null; diff --git a/src/scrapers/mikeadriano.js b/src/scrapers/mikeadriano.js index 74c79aef..8e4e7137 100755 --- a/src/scrapers/mikeadriano.js +++ b/src/scrapers/mikeadriano.js @@ -1,88 +1,166 @@ 'use strict'; -const qu = require('../utils/qu'); -const http = require('../utils/http'); +const unprint = require('unprint'); -function scrapeAll(scenes) { +const http = require('../utils/http'); +const slugify = require('../utils/slugify'); +const { convert } = require('../utils/convert'); + +function scrapeAll(scenes, channel) { return scenes.map(({ query }) => { const release = {}; - release.title = query.cnt('h3.title a, .content-title-wrap a'); - release.url = query.url('h3.title a, .content-title-wrap a'); + release.title = query.content('h3.title a, .content-title-wrap a'); + release.url = query.url('h3.title a, h1.title a, .content-title-wrap a', { origin: channel.url }); const pathname = new URL(release.url).pathname; - release.entryId = pathname.match(/\/view\/(\d+)/)?.[1] || pathname.match(/\/view\/([\w-]+)/)?.[1]; - release.description = query.cnt('.desc, .content-description'); - release.date = query.date('.date, time, .hide', 'Do MMM YYYY'); + release.entryId = pathname.match(/\/scenes\/([\w-]+)/)?.[1]; - release.actors = query.cnts('h4.models a, .content-models a'); - release.duration = query.dur('.total-time'); + release.description = query.content('.desc, .content-description'); + release.date = query.date('.date, time, .hide', 'Do MMM YYYY', { match: null }); + + release.actors = query.contents('h4.models a, .content-models a'); + release.duration = query.duration('//span[contains(@class, "total-time") and text()[contains(., ":")]]'); // total-time is also used for photo counts on True Anal const [poster, ...primaryPhotos] = query.imgs('a img'); - const secondaryPhotos = query.styles('.thumb-top, .thumb-bottom, .thumb-mouseover', 'background-image').map((style) => style.match(/url\((.*)\)/)[1]); + const secondaryPhotos = query.styles('.thumb-top, .thumb-bottom, .thumb-mouseover', { styleAttribute: 'background-image' }).map((style) => style.match(/url\((.*)\)/)?.[1]); + + release.poster = [ + poster.replace(/-c\d+x\d+/, ''), + poster, + ]; - release.poster = poster; release.photos = primaryPhotos.concat(secondaryPhotos); return release; }); } -async function scrapeScene({ query }, url) { +async function scrapeScene({ query }, url, channel) { const release = {}; const pathname = new URL(url).pathname; - release.entryId = pathname.match(/\/view\/(\d+)/)?.[1] || pathname.match(/\/view\/([\w-]+)/)?.[1]; + const data = query.json('#__NEXT_DATA__')?.props?.pageProps?.content; - release.title = query.cnt('.content-page-info .title'); - release.description = query.cnt('.content-page-info .desc'); - release.date = query.date('.content-page-info .date, .content-page-info .hide, .post-date', 'Do MMM YYYY'); + release.entryId = data?.slug || pathname.match(/\/scenes\/([\w-]+)/)?.[1]; - release.actors = query.cnts('.content-page-info .models a'); - release.duration = query.dur('.content-page-info .total-time:last-child'); + release.title = data?.title || query.content('.content-page-info .title'); + release.description = data?.description || query.content('.content-page-info .desc'); + release.date = data?.formatted_date + ? unprint.extractDate(data.formatted_date, 'Do MMM YYYY', { match: null }) + : query.date('.content-page-info .date, .content-page-info .hide, .post-date', 'Do MMM YYYY', { match: null }); - release.poster = query.poster('.content-page-header video, .content-page-header-inner video') || query.poster('#main-player', 'data-screenshot'); - release.trailer = query.video('.content-page-header source, .content-page-header-inner source') || query.q('#main-player', 'data-url'); + release.actors = data?.models_thumbs?.map((actor) => ({ + name: actor.name, + url: actor.slug && `${channel.url}/models/${actor.slug}`, + avatar: actor.thumb, + })) + || query.elements('.content-page-info .models a').map((actorEl) => ({ + name: unprint.query(actorEl), + url: unprint.url(actorEl, null), + })); + + release.duration = data?.seconds_duration || query.duration('.content-page-info .total-time:last-child'); + + release.poster = [data?.trailer_screencap, data?.thumb, data?.extra_thumbails?.[0]].filter(Boolean); + release.photos = data?.extra_thumbnails?.slice(1); // first photo is poster + + release.trailer = data?.trailer_url || null; + release.caps = data?.thumbs; + + release.tags = data?.tags; + + release.qualities = data?.videos && Object.values(data.videos).map((video) => video.height); return release; } -async function fetchLatest(channel, page = 1) { - const { host } = new URL(channel.url); - const url = `https://tour.${host}/videos?page=${page}`; +async function scrapeProfile({ query }) { + const profile = {}; - const res = await qu.get(url); + const bio = Object.fromEntries(query.all('.model-info li, .model-desc li').map((el) => [ + slugify(unprint.query.content(el, 'span')), + unprint.query.text(el), + ])); - if (res.ok) { - if (res.item.query.exists('a[href*="stackpath.com"]')) { - throw new Error('URL blocked by StackPath'); - } + const avatar = query.img('.model-photo img, img[alt="model"]'); - return scrapeAll(qu.initAll(res.item.el, '.content-item-large, .content-item, .content-border'), channel); + if (avatar) { + profile.avatar = [ + avatar.replace(/-\d+x\d+/, ''), + avatar, + ]; } - return res.status; + if (bio && Object.keys(bio).length > 0) { + profile.description = bio.bio; + + profile.dateOfBirth = bio.birthdate && unprint.extractDate(bio.birthdate, 'YYYY-MM-DD'); + profile.birthPlace = bio.born; + + profile.measurements = bio.measurements; + + profile.height = convert(bio.height, 'cm'); + profile.weight = convert(bio.weight, 'lb', 'kg'); + + profile.eyes = bio.eyes; + profile.hairColor = bio.hair; + } + + return profile; } -async function fetchUpcoming(channel) { - const { host } = new URL(channel.url); - const url = `https://tour.${host}`; +async function fetchLatestContent(url, parameters) { + if (parameters.useBrowser) { + const res = await http.get(url, { + bypassBrowser: 'shared', + bypass: { + evaluate: async () => { + // images lazy loaded by JS, gradually scroll through page + return Array.from(document.querySelectorAll('.content-item ')).reduce(async (chain, el) => { + await chain; - const res = await qu.get(url); + return new Promise((resolve) => { + el.scrollIntoView(); + setTimeout(resolve, 20); + }); + }, Promise.resolve()); + }, + }, + }); + + if (res.statusCode !== 200) { + return { + ok: false, + status: res.statusCode, + }; + } + + const context = unprint.init(res.body); + + return { + ok: true, + status: res.statusCode, + context, + }; + } + + const res = await unprint.get(url); + + return res; +} + +async function fetchLatest(channel, page = 1, { parameters }) { + const url = `${channel.url}/scenes?page=${page}`; + const res = await fetchLatestContent(url, parameters); if (res.ok) { - if (res.item.query.exists('a[href*="stackpath.com"]')) { + if (res.context.query.exists('a[href*="stackpath.com"]')) { throw new Error('URL blocked by StackPath'); } - const sceneItem = qu.init(res.item.el, '#upcoming-content'); - - if (sceneItem) { - return scrapeAll([sceneItem], channel); - } - - return null; + return scrapeAll(unprint.initAll(res.context.query.all('.content-item-large, .content-item, .content-border')), channel); } return res.status; @@ -92,58 +170,40 @@ async function fetchScene(url, channel) { const cookieJar = http.cookieJar(); const session = http.session({ cookieJar }); - /* not working - const resA = await http.get(url, { - session, - extract: { - runScripts: 'dangerously', - }, - }); - - cookieJar.setCookieSync(http.toughCookie.Cookie.parse(resA.document.cookie), url); - - console.log(res.req); - */ - const res = await http.get(url, { session, }); if (res.ok) { - const item = qu.init(res.document); + const context = unprint.init(res.body); - if (item.query.exists('a[href*="stackpath.com"]')) { + if (context.query.exists('a[href*="stackpath.com"]')) { throw new Error('URL blocked by StackPath'); } - return scrapeScene(item, url, channel); + return scrapeScene(context, url, channel); } return res.status; } -/* API protected -async function fetchProfile({ name: actorName }, context , site) { +async function fetchProfile(actor, context) { const session = http.session(); - await http.get(`https://tour.${site.slug}.com`, { session }); + await http.get(context.channel.url, { session }); - const url = `https://tour.${site.slug}.com/search-preview`; - const res = await http.post(url, { q: actorName }, { - session, - headers: { - 'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36', - origin: `https://tour.${site.slug}.com`, - }, - }); + const url = `${context.channel.url}/models/${actor.slug}`; + const res = await unprint.get(url); - console.log(res.body.toString()); + if (res.ok) { + return scrapeProfile(res.context, context.channel); + } + + return res.status; } -*/ module.exports = { fetchLatest, - fetchUpcoming, - // fetchProfile, + fetchProfile, fetchScene, }; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 00618095..27f0b950 100755 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -211,6 +211,7 @@ const scrapers = { deeplush: nubiles, devilsfilm: famedigital, digitalplayground: aylo, + dirtyauditions: mikeadriano, dorcelclub: dorcel, doubleviewcasting: firstanalquest, dtfsluts: fullpornnetwork, diff --git a/src/utils/convert.js b/src/utils/convert.js index 5965e914..1374c575 100755 --- a/src/utils/convert.js +++ b/src/utils/convert.js @@ -79,7 +79,7 @@ function convertApi(input, fromOrTo, to) { const inputNumber = Number(typeof input === 'string' ? input.match(/\d+(\.\d+)?/)?.[0] : input); - return Math.round(convert(inputNumber).from(fromOrTo).to(to)) || null; + return Math.round(convert(inputNumber, fromOrTo).to(to)) || null; } catch (error) { logger.error(error); return null; diff --git a/src/utils/http.js b/src/utils/http.js index 4be6e303..85eec706 100755 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -162,7 +162,7 @@ async function getBrowserSession(identifier, options = {}) { return limiters.bypass.schedule(async () => { if (!browser) { browser = await puppeteer.launch({ - headless: typeof options.headless === 'undefined' ? 'new' : options.headless, + headless: typeof options.bypass?.headless === 'undefined' ? 'new' : options.bypass.headless, // headless: false, }); @@ -177,10 +177,19 @@ async function getBrowserSession(identifier, options = {}) { }); } -async function bypassBrowserRequest(url, _options) { - const { tab } = await getBrowserSession(new URL(url).hostname); +async function bypassBrowserRequest(url, options) { + const { tab } = await getBrowserSession(new URL(url).hostname, options); const res = await tab.goto(url); + + if (options.bypass?.delay) { + await Promise.delay(options.bypass.delay); + } + + if (typeof options.bypass?.evaluate === 'function') { + await tab.evaluate(options.bypass.evaluate, options.bypass); + } + const rawBody = await tab.content(); const headers = res.headers();