From 0cf43f6eabe4f74f850d9a71cc94eabf3d83ac77 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Tue, 19 Nov 2024 01:35:28 +0100 Subject: [PATCH] Upgraded Spizoo scraper to unprint, added to default proxy list. --- config/default.js | 11 ++++++ src/scrapers/scrapers.js | 1 + src/scrapers/spizoo.js | 83 +++++++++++----------------------------- 3 files changed, 35 insertions(+), 60 deletions(-) diff --git a/config/default.js b/config/default.js index 9da6da2d..42d3ddc4 100755 --- a/config/default.js +++ b/config/default.js @@ -353,6 +353,17 @@ module.exports = { 'www.badteenspunished.com', 'www.cumlouder.com', 'im0.imgcm.com', + // Spizoo + 'https://www.spizoo.com', + 'https://www.creamher.com', + 'https://www.gothgirlfriends.com', + 'https://mrluckypov.com', + 'https://mrluckyvip.com', + 'https://mrluckyraw.com', + 'https://firstclasspov.com', + 'https://rawattack.com', + 'https://realsensual.com', + 'https://vlogxxx.com', ], }, bypass: { diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 8898fb56..cb275ff6 100755 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -323,6 +323,7 @@ const scrapers = { cutebutts: snowvalley, transexjapan: snowvalley, uralesbian: snowvalley, + rawattack: spizoo, spizoo, swallowed: mikeadriano, milfcandy: archangel, diff --git a/src/scrapers/spizoo.js b/src/scrapers/spizoo.js index 8929af79..ac933a99 100755 --- a/src/scrapers/spizoo.js +++ b/src/scrapers/spizoo.js @@ -1,11 +1,8 @@ 'use strict'; -const config = require('config'); const unprint = require('unprint'); const format = require('template-format'); -const { HttpsProxyAgent } = require('https-proxy-agent'); -const qu = require('../utils/qu'); const slugify = require('../utils/slugify'); function getEntryId(url) { @@ -30,29 +27,27 @@ function scrapeAll(scenes) { release.poster = query.img('a img'); release.teaser = query.video('.leVideo source'); - console.log(release); - return release; }); } -function scrapeScene({ query }, url) { +function scrapeScene({ query }, { url, entity }) { const release = {}; release.entryId = getEntryId(url); - release.title = query.cnt(['#media-holder .title', '.content-holder h1', '#scene h1', 'h2.titular', 'title'])?.replace(/\s+-$/, ''); + release.title = query.content(['#media-holder .title', '.content-holder h1', '#scene h1', 'h2.titular', 'title'])?.replace(/\s+-$/, ''); release.date = query.date('#sceneInfo .date, #trailer-data .date', 'YYYY-MM-DD'); release.duration = query.duration('#sceneInfo .data-others, #trailer-data', /\d+:\d+/); - release.description = query.cnt('#sceneInfo .description, #trailer-data > div:first-child p'); + release.description = query.content('#sceneInfo .description, #trailer-data > div:first-child p'); release.actors = query.all('#sceneInfo .data-others a[href*="/models"], #trailer-data a[href*="/models"]').map((el) => ({ - name: query.el(el, null, 'title'), - url: query.url(el, null), + name: unprint.query.attribute(el, null, 'title'), + url: unprint.query.url(el, null, { origin: entity.url }), })); - release.tags = query.cnts('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]'); + release.tags = query.contents('.categories-holder a, #sceneInfo a[href*="/categories"], #trailer-data a[href*="/categories"]').map((tag) => tag.trim()); const poster = query.img(['#video-holder .update_thumb', '#video-holder .thumb', '#noMore .update_thumb', '#hpromo .update_thumb', '.trailer-thumb']) || query.poster('#trailervideo'); const posterPathname = poster && new URL(poster)?.pathname; @@ -69,53 +64,29 @@ function scrapeScene({ query }, url) { release.trailer = query.video('#trailervideo source[type="video/mp4"], #FulsSizeVideo source[type="video/mp4"]'); // sic release.teaser = query.video('#trailer-video source[src*="/videothumbs"]'); - console.log(release); - return release; } -function scrapeProfileScenes(scenes) { - return scenes.map(({ query }) => { - const release = {}; - - release.url = query.url('a[href*="/updates"]'); - release.entryId = getEntryId(release.url); - - release.title = query.cnt('.titular, h3 a'); - release.date = query.date('.date-label', 'YYYY-MM-DD'); - release.duration = query.number('.length-label') * 60; - - release.description = query.cnt('.model-update-description'); - - release.actors = query.all('.model-labels a').map((el) => ({ - name: query.cnt(el), - url: query.url(el, null), - })); - - const poster = query.img('.update_thumb'); - - release.poster = [poster, poster?.replace(/imgw=\w+/, 'imgw=680')]; - release.tags = query.cnts('.categories-holder a'); - - return release; - }); -} - -function scrapeProfile({ query, el }) { +function scrapeProfile({ query }) { const profile = {}; - const bioKeys = query.cnts('.statsText b'); - const bioValues = query.texts('.statsText'); + const bioKeys = query.contents('.statsText b'); + const bioValues = query.text('.statsText', { join: false }); const bio = bioKeys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: bioValues[index], }), {}); - profile.description = query.cnt('.descriptionText'); - profile.avatar = query.img('.model-bio-pic img'); + profile.description = query.contents('.descriptionText'); + + profile.avatar = [ + query.img('.model-bio-pic img', { attribute: 'src0_2x' }), + query.img('.model-bio-pic img', { attribute: 'src0_3x' }), // unnecessarily big + query.img('.model-bio-pic img', { attribute: 'src0_1x' }), + ]; profile.height = Number(bio.height?.match(/(\d+)\s?cm/i)?.[1]); - profile.dateOfBirth = qu.extractDate(bio.date_of_birth, 'MMMM D, YYYY'); + profile.dateOfBirth = unprint.extractDate(bio.date_of_birth, 'MMMM D, YYYY'); profile.measurements = bio.measurements; profile.butt = bio.ass_type; @@ -134,19 +105,12 @@ function scrapeProfile({ query, el }) { profile.hasPiercings = true; } - profile.scenes = scrapeProfileScenes(qu.initAll(el, '.model-update')); - return profile; } -const agent = new HttpsProxyAgent(`http://${config.proxy.host}:${config.proxy.port}`); - async function fetchLatest(channel, page) { - // const res = await qu.getAll(`${channel.url}/categories/movies_${page}_d.html`, '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail'); - const res = await unprint.get(`${channel.url}${format(channel.parameters?.latest || '/categories/movies_{page}_d.html', { page })}`, { selectAll: '.thumb-big, .thumb-video, .thumbnail, .thumbnail-popular, .full-thumbnail', - httpsAgent: agent, }); if (res.ok) { @@ -158,23 +122,23 @@ async function fetchLatest(channel, page) { async function fetchProfile(actor, channel) { if (actor.url) { - const res = await qu.get(actor.url); + const res = await unprint.get(actor.url); if (res.ok) { - return scrapeProfile(res.item); + return scrapeProfile(res.context); } } - const resA = await qu.get(`${channel.url}/models/${slugify(actor.name)}.html`); + const resA = await unprint.get(`${channel.url}/models/${slugify(actor.name)}.html`); if (resA.ok) { - return scrapeProfile(resA.item, channel); + return scrapeProfile(resA.context, channel); } - const resB = await qu.get(`${channel.url}/models/${slugify(actor.name, '')}.html`); + const resB = await unprint.get(`${channel.url}/models/${slugify(actor.name, '')}.html`); if (resB.ok) { - return scrapeProfile(resB.item, channel); + return scrapeProfile(resB.context, channel); } return resB.status; @@ -184,5 +148,4 @@ module.exports = { fetchLatest, fetchProfile, scrapeScene, - deprecated: true, };