From 65f98c63879fbd12b2ad7232a8ebef8437891f85 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Thu, 6 Feb 2020 23:51:13 +0100 Subject: [PATCH] Refactored 21Sextury module to use Gamma API. --- seeds/01_sites.js | 1 + src/scrape-releases.js | 5 ++ src/scrape-sites.js | 8 +- src/scrapers/21sextury.js | 149 +------------------------------------- src/scrapers/gamma.js | 48 +++++++----- 5 files changed, 45 insertions(+), 166 deletions(-) diff --git a/seeds/01_sites.js b/seeds/01_sites.js index 70167389..84372fad 100644 --- a/seeds/01_sites.js +++ b/seeds/01_sites.js @@ -16,6 +16,7 @@ const sites = [ url: 'https://www.assholefever.com', description: 'Welcome to AssholeFever, the most hardcore anal site on the net. Watch your favorite pornstars and anal sluts from all over the world in big booty hardcore porn, anal gape, beads, anal creampie and more! Look inside if you dare!', network: '21sextury', + parameters: { networkReferer: true }, }, { slug: 'buttplays', diff --git a/src/scrape-releases.js b/src/scrape-releases.js index ed7f7722..512393ac 100644 --- a/src/scrape-releases.js +++ b/src/scrape-releases.js @@ -82,6 +82,11 @@ async function scrapeReleases(sources, release = null, type = 'scene') { const curatedReleases = scrapedReleases.map(scrapedRelease => ({ ...scrapedRelease, type })); + if (argv.scene && argv.inspect) { + // only show when fetching from URL + console.log(curatedReleases); + } + if (argv.save) { /* const movie = scrapedRelease.movie diff --git a/src/scrape-sites.js b/src/scrape-sites.js index 6843d634..b57baaf6 100644 --- a/src/scrape-sites.js +++ b/src/scrape-sites.js @@ -158,8 +158,14 @@ async function scrapeSites() { concurrency: 5, }); + const releases = scrapedNetworks.flat(2); + + if (argv.inspect) { + console.log(releases); + } + if (argv.save) { - await storeReleases(scrapedNetworks.flat(2)); + await storeReleases(releases); } } diff --git a/src/scrapers/21sextury.js b/src/scrapers/21sextury.js index a6a08f95..8d972832 100644 --- a/src/scrapers/21sextury.js +++ b/src/scrapers/21sextury.js @@ -1,151 +1,10 @@ 'use strict'; -const bhttp = require('bhttp'); -const cheerio = require('cheerio'); -const moment = require('moment'); - -const { getPhotos, fetchProfile } = require('./gamma'); - -function scrape(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const scenesElements = $('li[data-itemtype=scene]').toArray(); - - return scenesElements.reduce((accReleases, element) => { - const siteName = $(element).find('.studioName a').attr('title'); - - if (!site.url && siteName.toLowerCase() !== site.name.toLowerCase()) { - // using generic overview as fallback, scene from different site - return accReleases; - } - - const sceneLinkElement = $(element).find('.sceneTitle a'); - const url = `${site.url || 'https://www.21sextury.com'}${sceneLinkElement.attr('href')}`; - const title = sceneLinkElement.attr('title').trim(); - - const entryId = $(element).attr('data-itemid'); - - const date = moment - .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY') - .toDate(); - - const actors = $(element).find('.sceneActors a') - .map((actorIndex, actorElement) => $(actorElement).attr('title')) - .toArray(); - - const poster = $(element).find('.imgLink img').attr('data-original'); - const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`; - - const [likes, dislikes] = $(element).find('.value') - .toArray() - .map(value => Number($(value).text())); - - return [ - ...accReleases, - { - url, - entryId, - title, - actors, - date, - poster, - trailer: { - src: trailer, - }, - rating: { - likes, - dislikes, - }, - site, - }, - ]; - }, []); -} - -async function scrapeScene(html, url, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const sceneElement = $('#videoWrapper'); - const json = $('script[type="application/ld+json"]').html(); - const videoJson = $('script:contains("ScenePlayerOptions")').html(); - const videoDataString = videoJson.slice(videoJson.indexOf('= {') + 2, videoJson.indexOf('};') + 1); - - const data = JSON.parse(json)[0]; - const videoData = JSON.parse(videoDataString); - const entryId = new URL(url).pathname.split('/').slice(-1)[0]; - - const title = videoData?.playerOptions?.sceneInfos?.sceneTitle || (data.isPartOf && data.isPartOf !== 'TBD' ? data.isPartOf.name : data.name); - const dataDate = moment.utc(videoData?.playerOptions?.sceneInfos?.sceneReleaseDate, 'YYYY-MM-DD'); - - const date = dataDate.isValid() - ? dataDate.toDate() - : moment.utc(sceneElement.find('.updatedDate').text().trim(), 'MM-DD-YYYY').toDate(); - - const actors = data.actor.map(actor => actor.name); - - const description = data.description || null; // prevent empty string - const likes = Number(sceneElement.find('.rating .state_1 .value').text()); - const dislikes = Number(sceneElement.find('#infoWrapper .rating .state_2 .value').text()); - - const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds(); - - const poster = videoData.picPreview; - const trailer = `${videoData.playerOptions.host}${videoData.url}`; - - const photos = await getPhotos($('.picturesItem a').attr('href'), site); - - const tags = data.keywords.split(', '); - const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title'); - const channel = siteName && siteName.replace(/\s+/g, '').toLowerCase(); - - return { - url, - entryId, - title, - date, - actors, - description, - duration, - tags, - poster, - photos, - trailer: { - src: trailer, - }, - rating: { - likes, - dislikes, - }, - site, - channel, - }; -} - -async function fetchLatest(site, page = 1) { - const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`; - const res = await bhttp.get(url); - - return scrape(res.body.toString(), site); -} - -async function fetchUpcoming(site) { - const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`; - const res = await bhttp.get(url); - - return scrape(res.body.toString(), site); -} - -async function fetchScene(url, site) { - const res = await bhttp.get(url); - - return scrapeScene(res.body.toString(), url, site); -} - -async function networkFetchProfile(actorName) { - return fetchProfile(actorName, '21sextury', true); -} +const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma'); module.exports = { - fetchLatest, - fetchProfile: networkFetchProfile, - fetchUpcoming, + fetchLatest: fetchApiLatest, + fetchProfile: fetchApiProfile, + fetchUpcoming: fetchApiUpcoming, fetchScene, }; diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js index afe767e2..0c224628 100644 --- a/src/scrapers/gamma.js +++ b/src/scrapers/gamma.js @@ -6,6 +6,8 @@ const { JSDOM } = require('jsdom'); const cheerio = require('cheerio'); const moment = require('moment'); +const { ex } = require('../utils/q'); + async function fetchPhotos(url) { const res = await bhttp.get(url); @@ -237,39 +239,45 @@ function scrapeActorSearch(html, url, actorName) { function scrapeProfile(html, url, actorName, siteSlug) { const { document } = new JSDOM(html).window; + const { q, qu } = ex(html); - const avatarEl = document.querySelector('img.actorPicture'); - const descriptionEl = document.querySelector('.actorBio p:not(.bioTitle)'); - const hairEl = document.querySelector('.actorProfile .attribute_hair_color'); - const heightEl = document.querySelector('.actorProfile .attribute_height'); - const weightEl = document.querySelector('.actorProfile .attribute_weight'); - const aliasEl = document.querySelector('.actorProfile .attribute_alternate_names'); - const nationalityEl = document.querySelector('.actorProfile .attribute_home'); + const avatar = q('img.actorPicture'); + const hair = q('.actorProfile .attribute_hair_color', true); + const height = q('.actorProfile .attribute_height', true); + const weight = q('.actorProfile .attribute_weight', true); + const alias = q('.actorProfile .attribute_alternate_names', true); + const nationality = q('.actorProfile .attribute_home', true); const profile = { name: actorName, }; - if (avatarEl) { + if (avatar) { // larger sizes usually available, provide fallbacks const avatars = [ - avatarEl.src.replace(/\d+x\d+/, '500x750'), - avatarEl.src.replace(/\d+x\d+/, '240x360'), - avatarEl.src.replace(/\d+x\d+/, '200x300'), - avatarEl.src, + avatar.src.replace(/\d+x\d+/, '500x750'), + avatar.src.replace(/\d+x\d+/, '240x360'), + avatar.src.replace(/\d+x\d+/, '200x300'), + avatar.src, ]; profile.avatar = avatars; } - if (descriptionEl) profile.description = descriptionEl.textContent.trim(); - if (hairEl) profile.hair = hairEl.textContent.split(':')[1].trim(); - if (heightEl) profile.height = Number(heightEl.textContent.match(/\d+/)[0]); - if (weightEl) profile.weight = Number(weightEl.textContent.match(/\d+/)[0]); - if (aliasEl) profile.aliases = aliasEl.textContent.split(':')[1].trim().split(', '); - if (nationalityEl) profile.nationality = nationalityEl.textContent.split(':')[1].trim(); + profile.description = q('.actorBio p:not(.bioTitle)', true); + if (hair) profile.hair = hair.split(':')[1].trim(); + if (height) profile.height = Number(height.match(/\d+/)[0]); + if (weight) profile.weight = Number(weight.match(/\d+/)[0]); + if (alias) profile.aliases = alias.split(':')[1].trim().split(', '); + if (nationality) profile.nationality = nationality.split(':')[1].trim(); + + /* not fetching all releases profile.releases = Array.from(document.querySelectorAll('.sceneList .scene a.imgLink'), el => `https://${siteSlug}.com${el.href}`); + const moreReleases = qu('.seeAllTop a'); + + console.log(moreReleases); + */ return profile; } @@ -322,7 +330,7 @@ async function fetchApiCredentials(referer) { } async function fetchApiLatest(site, page = 1, upcoming = false) { - const referer = `${site.url}/en/videos`; + const referer = `${site.parameters?.networkReferer ? site.network.url : site.url}/en/videos`; const { apiUrl } = await fetchApiCredentials(referer); const res = await bhttp.post(apiUrl, { @@ -339,7 +347,7 @@ async function fetchApiLatest(site, page = 1, upcoming = false) { encodeJSON: true, }); - if (res.statuscode === 200 && res.body.results?.[0]?.hits) { + if (res.statusCode === 200 && res.body.results?.[0]?.hits) { return scrapeApiReleases(res.body.results[0].hits, site); }