From 9c8cfe3bdb7ddc9484605770f17f829ecaa27d98 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sun, 12 Jul 2020 05:10:23 +0200 Subject: [PATCH] Re-wrote broken Perv City scraper, added profile scraping. --- seeds/02_sites.js | 27 ++++- src/actors.js | 12 +- src/scrapers/pervcity.js | 195 ++++++++++++++------------------ src/scrapers/pervcity_legacy.js | 144 +++++++++++++++++++++++ src/scrapers/scrapers.js | 1 + src/utils/convert.js | 16 ++- 6 files changed, 270 insertions(+), 125 deletions(-) create mode 100644 src/scrapers/pervcity_legacy.js diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 94370957..c141581d 100644 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -4347,7 +4347,10 @@ const sites = [ url: 'http://www.analoverdose.com', description: 'Before proceeding, use caution: the stunning pornstars of Anal Overdose are so fiery that they cause heavy breathing, throbbing cocks and volcanic loads of cum. If you think you can handle the heat of smoking tits, sweltering pussy and red hot ass.', parent: 'pervcity', - parameters: { tourId: 3 }, + parameters: { + siteId: 2, + tourId: 3, + }, }, { slug: 'bangingbeauties', @@ -4355,7 +4358,10 @@ const sites = [ description: "Banging Beauties isn't just a porn site; it's the gateway to all your pussy-obsessed fantasies! Our members' area is flowing with beautiful pornstars anticipating big dick throbbing in their syrupy pink slits. These experienced babes love brutal vaginal pounding! Similarly, they're eager for anal switch-hitting to shake things up. However, it's not only about gorgeous sexperts filling their hungry holes. Sometimes, it's all about innocent rookies earning their pornstar status in first time threesomes and premier interracial scenes.", url: 'http://www.bangingbeauties.com', parent: 'pervcity', - parameters: { tourId: 7 }, + parameters: { + siteId: 3, + tourId: 7, + }, }, { slug: 'oraloverdose', @@ -4363,7 +4369,10 @@ const sites = [ description: "Oral Overdose is the only site you need to live out every saliva soaked blowjob of your dreams in HD POV! We've got the most stunning cocksuckers in the world going to town on big dick. These babes not only love cock, they can't get enough of it! In fact, there is no prick too huge for our hungry girls' throats. You'll find gorgeous, big tits pornstars exercising their gag reflex in intense balls deep facefuck scenes. We also feature fresh, young newbies taking on the gagging deepthroat challenge.", url: 'http://www.oraloverdose.com', parent: 'pervcity', - parameters: { tourId: 4 }, + parameters: { + siteId: 4, + tourId: 4, + }, }, { slug: 'chocolatebjs', @@ -4371,15 +4380,21 @@ const sites = [ description: "You've just won the golden ticket to the best Chocolate BJs on the planet! We've sought far and wide to bring you the most beautiful black and ethnic pornstars. And they're in our members' area now! They can't wait to suck your white lollipop and lick the thick cream shooting from your big dick. Of course, no matter how sweet the booty or juicy the big tits, these brown foxes aren't all sugar and spice. In fact, when it comes to giving head, these big ass ebony babes know what they want: huge white cocks filling their throats!", url: 'http://www.chocolatebjs.com', parent: 'pervcity', - parameters: { tourId: 6 }, + parameters: { + siteId: 5, + tourId: 6, + }, }, { slug: 'upherasshole', name: 'Up Her Asshole', description: "You don't need to travel the globe in search of the anal wonders of the world, because you get your own private tour right here on Up Her Asshole! Our stunning pornstars and rookie starlets welcome all ass fetish and anal sex fans, with their twerking bubble butts and winking assholes. However, big booty worship is just a slice of the fun. Combined with juicy tits (big and small, wet pussy (hairy and bald, these girls deliver a spectacular sensory experience in HD POV. Not only are you in danger of busting a nut before the going gets good, but also when the good turns remarkable with rimming, fingering and butt toys!", - url: 'http://www.upherasshole.com', + url: 'http://upherasshole.com', parent: 'pervcity', - parameters: { tourId: 9 }, + parameters: { + siteId: 6, + tourId: 9, + }, }, // PIMP XXX { diff --git a/src/actors.js b/src/actors.js index b6f31d2f..6ff262f7 100644 --- a/src/actors.js +++ b/src/actors.js @@ -302,12 +302,12 @@ async function curateProfile(profile) { curatedProfile.dateOfDeath = Number.isNaN(Number(profile.dateOfDeath)) ? null : profile.dateOfDeath; - curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match(/[a-zA-Z]+/)?.[0]) || null; - curatedProfile.bust = Number(profile.bust) || profile.bust?.match(/\d+/)?.[0] || null; - curatedProfile.waist = Number(profile.waist) || profile.waist?.match(/\d+/)?.[0] || null; - curatedProfile.hip = Number(profile.hip) || profile.hip?.match(/\d+/)?.[0] || null; - curatedProfile.height = Number(profile.height) || profile.height?.match(/\d+/)?.[0] || null; - curatedProfile.weight = Number(profile.weight) || profile.weight?.match(/\d+/)?.[0] || null; + curatedProfile.cup = profile.cup || (typeof profile.bust === 'string' && profile.bust?.match?.(/[a-zA-Z]+/)?.[0]) || null; + curatedProfile.bust = Number(profile.bust) || profile.bust?.match?.(/\d+/)?.[0] || null; + curatedProfile.waist = Number(profile.waist) || profile.waist?.match?.(/\d+/)?.[0] || null; + curatedProfile.hip = Number(profile.hip) || profile.hip?.match?.(/\d+/)?.[0] || null; + curatedProfile.height = Number(profile.height) || profile.height?.match?.(/\d+/)?.[0] || null; + curatedProfile.weight = Number(profile.weight) || profile.weight?.match?.(/\d+/)?.[0] || null; curatedProfile.naturalBoobs = typeof profile.naturalBoobs === 'boolean' ? profile.naturalBoobs : null; curatedProfile.hasTattoos = typeof profile.hasTattoos === 'boolean' ? profile.hasTattoos : null; diff --git a/src/scrapers/pervcity.js b/src/scrapers/pervcity.js index 3500ff86..1f827988 100644 --- a/src/scrapers/pervcity.js +++ b/src/scrapers/pervcity.js @@ -1,140 +1,113 @@ 'use strict'; -const bhttp = require('bhttp'); -const cheerio = require('cheerio'); -const { JSDOM } = require('jsdom'); -const moment = require('moment'); +const qu = require('../utils/qu'); +const slugify = require('../utils/slugify'); +const { feetInchesToCm, lbsToKg } = require('../utils/convert'); -async function getTrailer(entryId) { - const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', { - setId: entryId, +function scrapeAll(scenes) { + return scenes.map(({ query }) => { + const release = {}; + + release.url = query.url('.videoPic a'); + release.entryId = query.q('.videoPic img', 'id').match(/set-target-(\d+)/)[1]; + + release.title = query.q('h3 a', true); + release.description = query.q('.runtime + p', true); + + release.date = query.date('.date', 'MM-DD-YYYY'); + release.duration = query.dur('.runtime'); + + release.actors = query.all('.tour_update_models a', true); + + release.poster = query.img('.videoPic img'); + + return release; }); +} - if (trailerRes.statusCode === 200) { - return { - poster: trailerRes.body.TrailerImg, - trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback, +function scrapeScene({ query }) { + const release = {}; + + release.entryId = query.q('.trailerLeft img', 'id').match(/set-target-(\d+)/)[1]; + + release.title = query.q('.infoHeader h1', true); + release.description = query.q('.infoBox p', true); + + release.actors = query.all('.tour_update_models a', true); + + release.poster = query.img('.posterimg'); + release.photos = query.imgs('.trailerSnaps img').slice(1); // first photo is poster in lower quality + + const trailer = query.q('script')?.textContent.match(/\/trailers\/.+\.mp4/)?.[0]; + + if (trailer) { + release.trailer = { + src: `https://pervcity.com${trailer}`, }; } - return null; -} - -function scrapeLatestScene(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - - const entryId = $('li').attr('id'); - const sceneLinkElement = $('#scene_title_border a'); - const url = `${site.url}/${sceneLinkElement.attr('href')}`; - const title = sceneLinkElement.attr('title').replace(/\u00E2\u0080\u0099/g, '\''); // replace weird apostrophes - - const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas - const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate(); - - const poster = $('a:nth-child(2) > img').attr('src'); - const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray(); - - const stars = $('img[src*="/star.png"]') - .toArray() - .map(element => $(element).attr('src')) - .length || 0; - - return { - url, - entryId, - title, - actors, - date, - poster, - photos, - rating: { - stars, - }, - site, - }; -} - -async function scrapeScene(html, url, site) { - const { document } = new JSDOM(html).window; - - const release = { url, site }; - - release.entryId = document.querySelector('input#set_ID').value; - - release.title = document.querySelector('title').textContent; - release.description = document.querySelector('.player_data').textContent.trim(); - - const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent; - const [minutes, seconds] = durationString.match(/\d+/g); - - release.duration = Number(minutes) * 60 + Number(seconds); - release.tags = document.querySelector('meta[name="keywords"]').content.split(','); - - const { poster, trailer } = await getTrailer(release.entryId); - - release.poster = poster; - release.trailer = { src: trailer }; - return release; } -function scrapeFallbackLanding(html) { - const { document } = new JSDOM(html).window; +function scrapeProfile({ query }) { + const profile = {}; - return document.querySelector('input#set_ID').value; + const bio = query.all('.moreInfo li').reduce((acc, el) => ({ + ...acc, + [slugify(query.q(el, 'span', true), '_')]: query.text(el), + }), {}); + + profile.description = query.q('.aboutModel p', true); + profile.dateOfBirth = qu.extractDate(bio.date_of_birth, ['MMMM D, YYYY', 'DD-MMM-YY']); + + profile.birthPlace = bio.birth_location; + profile.ethnicity = bio.ethnicity; + + profile.height = feetInchesToCm(bio.height); + profile.weight = lbsToKg(bio.weight); + + profile.eyes = bio.eye_color; + profile.hairColor = bio.hair_color; + + profile.avatar = query.img('.starPic img'); + profile.releases = scrapeAll(qu.initAll(query.all('.aboutScenes .videoBlock'))); + + return profile; } -async function scrapeFallbackScene(html, entryId, url, site) { - const { document } = new JSDOM(html).window; - const release = { url, entryId, site }; +async function fetchLatest(channel, page = 1) { + const url = `https://pervcity.com/search.php?site[]=${channel.parameters.siteId}&page=${page}`; + const res = await qu.getAll(url, '.videoBlock'); - release.title = document.querySelector('.popup_data_set_head label').textContent; - release.description = document.querySelector('.popup_data_set_des p').textContent.trim(); - release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate(); - release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent); - - const { poster, trailer } = await getTrailer(release.entryId); - - release.poster = poster; - release.trailer = { src: trailer }; - - release.channel = document.querySelector('.popup_left_top div img').alt; - - return release; + return res.ok ? scrapeAll(res.items, channel) : res.status; } -async function fetchLatest(site, page = 1) { - const res = page === 1 - ? await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`) - : await bhttp.get(`${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`); - const elements = JSON.parse(res.body.toString()); +async function fetchScene(url, entity) { + const res = await qu.get(url, '.trailerArea'); - const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php - - return latest; + return res.ok ? scrapeScene(res.item, entity) : res.status; } -async function fetchScene(url, site) { - const res = await bhttp.get(url); +async function fetchProfile(actorName) { + const url = `https://pervcity.com/models/${slugify(actorName)}.html`; + const res = await qu.get(url); - if (res.statusCode === 200) { - if (site.isNetwork) { - const entryId = scrapeFallbackLanding(res.body.toString(), url); - - const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', { - setId: entryId, - }); - - return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site); - } - - return scrapeScene(res.body.toString(), url, site); + if (res.ok) { + return scrapeProfile(res.item); } - return null; + const url2 = `https://pervcity.com/models/${slugify(actorName, '')}.html`; + const res2 = await qu.get(url2); + + if (res2.ok) { + return scrapeProfile(res2.item); + } + + return res2.status; } module.exports = { fetchLatest, fetchScene, + fetchProfile, }; diff --git a/src/scrapers/pervcity_legacy.js b/src/scrapers/pervcity_legacy.js new file mode 100644 index 00000000..ff39ba70 --- /dev/null +++ b/src/scrapers/pervcity_legacy.js @@ -0,0 +1,144 @@ +'use strict'; + +const bhttp = require('bhttp'); +const cheerio = require('cheerio'); +const { JSDOM } = require('jsdom'); +const moment = require('moment'); + +async function getTrailer(entryId) { + const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', { + setId: entryId, + }); + + if (trailerRes.statusCode === 200) { + return { + poster: trailerRes.body.TrailerImg, + trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback, + }; + } + + return null; +} + +function scrapeLatestScene(html, site) { + const $ = cheerio.load(html, { normalizeWhitespace: true }); + + const entryId = $('li').attr('id'); + const sceneLinkElement = $('#scene_title_border a'); + const url = `${site.url}/${sceneLinkElement.attr('href')}`; + const title = sceneLinkElement.attr('title').replace(/\u00E2\u0080\u0099/g, '\''); // replace weird apostrophes + + const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas + const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate(); + + const poster = $('a:nth-child(2) > img').attr('src'); + const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray(); + + const stars = $('img[src*="/star.png"]') + .toArray() + .map(element => $(element).attr('src')) + .length || 0; + + return { + url, + entryId, + title, + actors, + date, + poster, + photos, + rating: { + stars, + }, + site, + }; +} + +async function scrapeScene(html, url, site) { + const { document } = new JSDOM(html).window; + + const release = { url, site }; + + release.entryId = document.querySelector('input#set_ID').value; + + release.title = document.querySelector('title').textContent; + release.description = document.querySelector('.player_data').textContent.trim(); + + const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent; + const [minutes, seconds] = durationString.match(/\d+/g); + + release.duration = Number(minutes) * 60 + Number(seconds); + release.tags = document.querySelector('meta[name="keywords"]').content.split(','); + + const { poster, trailer } = await getTrailer(release.entryId); + + release.poster = poster; + release.trailer = { src: trailer }; + + return release; +} + +function scrapeFallbackLanding(html) { + const { document } = new JSDOM(html).window; + + return document.querySelector('input#set_ID').value; +} + +async function scrapeFallbackScene(html, entryId, url, site) { + const { document } = new JSDOM(html).window; + const release = { url, entryId, site }; + + release.title = document.querySelector('.popup_data_set_head label').textContent; + release.description = document.querySelector('.popup_data_set_des p').textContent.trim(); + release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate(); + release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent); + + const { poster, trailer } = await getTrailer(release.entryId); + + release.poster = poster; + release.trailer = { src: trailer }; + + release.channel = document.querySelector('.popup_left_top div img').alt; + + return release; +} + +async function fetchLatest(channel, page = 1) { + const url = `${channel.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`; + const pagedUrl = `${channel.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&webchannelid=0&deviceview=browser&tourId=${channel.parameters.tourId}`; + + const res = page === 1 + ? await bhttp.get(url) + : await bhttp.get(pagedUrl); + + const elements = JSON.parse(res.body.toString()); + + const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, channel)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php + + return latest; +} + +async function fetchScene(url, site) { + const res = await bhttp.get(url); + + if (res.statusCode === 200) { + if (site.isNetwork) { + const entryId = scrapeFallbackLanding(res.body.toString(), url); + + const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', { + setId: entryId, + }); + + return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site); + } + + return scrapeScene(res.body.toString(), url, site); + } + + return null; +} + +module.exports = { + fetchLatest, + fetchScene, +}; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 7e557006..9aed7412 100644 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -193,6 +193,7 @@ module.exports = { nubilesporn: nubiles, nympho: mikeadriano, onlyprince: fullpornnetwork, + pervcity, pervertgallery: fullpornnetwork, peternorth: famedigital, pimpxxx: cherrypimps, diff --git a/src/utils/convert.js b/src/utils/convert.js index d90dd5c3..5e4e8d22 100644 --- a/src/utils/convert.js +++ b/src/utils/convert.js @@ -1,19 +1,25 @@ 'use strict'; function inchesToCm(inches) { + if (!inches) return null; + return Math.round(Number(inches) * 2.54); } function feetInchesToCm(feet, inches) { + if (!feet && !inches) return null; + if (typeof feet === 'string' && !inches) { const [feetPart, inchesPart] = feet.match(/\d+/g); - return feetInchesToCm(feetPart, inchesPart); + return feetInchesToCm(Number(feetPart), Number(inchesPart)); } - return Math.round((Number(feet) * 30.48) + (Number(inches) * 2.54)); + return Math.round((Number(feet) * 30.48) + ((Number(inches) || 0) * 2.54)); } function cmToFeetInches(centimeters) { + if (!centimeters) return null; + const feet = Math.floor(centimeters / 30.48); const inches = Math.round((centimeters / 2.54) % (feet * 12)); @@ -21,18 +27,24 @@ function cmToFeetInches(centimeters) { } function heightToCm(height) { + if (!height) return null; + const [feet, inches] = height.match(/\d+/g); return feetInchesToCm(feet, inches); } function lbsToKg(lbs) { + if (!lbs) return null; + const pounds = lbs.toString().match(/\d+/)[0]; return Math.round(Number(pounds) * 0.453592); } function kgToLbs(kgs) { + if (!kgs) return null; + const kilos = kgs.toString().match(/\d+/)[0]; return Math.round(Number(kilos) / 0.453592);