forked from DebaucheryLibrarian/traxxx
				
			Integrated Blowpass into generic Gamma scraper.
This commit is contained in:
		
							parent
							
								
									5dfaa4c126
								
							
						
					
					
						commit
						3541a9c402
					
				|  | @ -9,36 +9,8 @@ module.exports = { | |||
|         host: '0.0.0.0', | ||||
|         port: 5000, | ||||
|     }, | ||||
|     include: [ | ||||
|         '21sextury', | ||||
|         'babes', | ||||
|         'bang', | ||||
|         'bangbros', | ||||
|         'blowpass', | ||||
|         'brazzers', | ||||
|         'ddfnetwork', | ||||
|         'digitalplayground', | ||||
|         'dogfartnetwork', | ||||
|         'evilangel', | ||||
|         'fakehub', | ||||
|         'jayrock', | ||||
|         'julesjordan', | ||||
|         'kellymadison', | ||||
|         'kink', | ||||
|         'legalporno', | ||||
|         'mikeadriano', | ||||
|         'milehighmedia', | ||||
|         'mofos', | ||||
|         'naughtyamerica', | ||||
|         'perfectgonzo', | ||||
|         'pervcity', | ||||
|         'pornpros', | ||||
|         'private', | ||||
|         'realitykings', | ||||
|         'teamskeet', | ||||
|         'vixen', | ||||
|         'xempire', | ||||
|     ], | ||||
|     // include: [], | ||||
|     // exclude: [], | ||||
|     fetchAfter: [1, 'week'], | ||||
|     media: { | ||||
|         path: './media', | ||||
|  |  | |||
|  | @ -105,7 +105,7 @@ async function scrapeSiteReleases(scraper, site) { | |||
|     ]); | ||||
| 
 | ||||
|     if (argv.upcoming) { | ||||
|         logger.info(`${site.name}: ${argv.latest ? 'Found' : 'Ignoring'} ${newReleases.length || ''}latest releases, ${argv.upcoming ? '' : 'ignoring '}${upcomingReleases.length || ''} upcoming releases`); | ||||
|         logger.info(`${site.name}: ${argv.latest ? `Found ${newReleases.length}` : 'Ignoring'} latest releases, ${argv.upcoming ? '' : 'ignoring '}${upcomingReleases.length || ''} upcoming releases`); | ||||
|     } | ||||
| 
 | ||||
|     const baseReleases = [...newReleases, ...upcomingReleases]; | ||||
|  |  | |||
|  | @ -1,132 +1,38 @@ | |||
| 'use strict'; | ||||
| 
 | ||||
| /* eslint-disable newline-per-chained-call */ | ||||
| const bhttp = require('bhttp'); | ||||
| const cheerio = require('cheerio'); | ||||
| const moment = require('moment'); | ||||
| 
 | ||||
| const { getPhotos, fetchProfile } = require('./gamma'); | ||||
| const { scrapeAll, scrapeScene, fetchProfile } = require('./gamma'); | ||||
| 
 | ||||
| function scrape(html, site) { | ||||
|     const $ = cheerio.load(html, { normalizeWhitespace: true }); | ||||
|     const sceneElements = $('.sceneList .scene').toArray(); | ||||
| async function fetchScene(url, site) { | ||||
|     // const res = await bhttp.get(url); | ||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`); | ||||
| 
 | ||||
|     return sceneElements.map((element) => { | ||||
|         const entryId = $(element).attr('data-itemid'); | ||||
|     const release = await scrapeScene(res.body.toString(), url, site); | ||||
|     release.channel = release.$('.siteNameSpan').text().trim().toLowerCase(); | ||||
| 
 | ||||
|         const sceneLinkElement = $(element).find('.sceneTitle a'); | ||||
|         const title = sceneLinkElement.attr('title'); | ||||
|         const url = `${site.url}/en/scene/${sceneLinkElement.attr('href').split('/').slice(-2).join('/')}`; | ||||
|     if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/'); | ||||
|     else release.url = url.replace(/video\/\w+\//, 'video/'); | ||||
| 
 | ||||
|         const date = moment.utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY').toDate(); | ||||
|         const actors = $(element).find('.sceneActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); | ||||
| 
 | ||||
|         const poster = $(element).find('a.imgLink img.img').attr('data-original'); | ||||
|         const trailer = `https://videothumb.gammacdn.com/600x339/${entryId}.mp4`; | ||||
| 
 | ||||
|         const likes = Number($(element).find('.rating .state_1 .value').text()); | ||||
| 
 | ||||
|         return { | ||||
|             url, | ||||
|             entryId, | ||||
|             title, | ||||
|             actors, | ||||
|             date, | ||||
|             poster, | ||||
|             trailer: { | ||||
|                 src: trailer, | ||||
|                 quality: 339, | ||||
|             }, | ||||
|             rating: { | ||||
|                 likes, | ||||
|             }, | ||||
|             site, | ||||
|         }; | ||||
|     }); | ||||
| } | ||||
| 
 | ||||
| async function scrapeScene(html, url, site) { | ||||
|     const $ = cheerio.load(html, { normalizeWhitespace: true }); | ||||
|     const json = $('script[type="application/ld+json"]').html(); | ||||
|     const data = JSON.parse(json).slice(-1)[0]; | ||||
|     const sceneElement = $('#wrapper'); | ||||
| 
 | ||||
|     const videoScript = $('script:contains("window.ScenePlayerOptions")').html(); | ||||
|     const playerObject = videoScript.slice(videoScript.indexOf('{'), videoScript.indexOf('};') + 1); | ||||
|     const playerData = JSON.parse(playerObject); | ||||
| 
 | ||||
|     // const workName = data.isPartOf.name.split(' - '); | ||||
|     // const shootId = workName.length > 1 ? workName[1] : null; | ||||
|     const entryId = url.split('/').slice(-1)[0]; | ||||
|     const title = data.title || $('meta[name="twitter:title"]').attr('content'); | ||||
|     const description = data.description || $('meta[name="twitter:description"]').attr('content'); | ||||
|     // date in data object is not the release date of the scene, but the date the entry was added | ||||
|     const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate(); | ||||
| 
 | ||||
|     const actors = data.actor.map(({ name }) => name); | ||||
| 
 | ||||
|     const likes = Number(sceneElement.find('.rating .state_1 .value').text()); | ||||
|     const dislikes = Number(sceneElement.find('.rating .state_2 .value').text()); | ||||
| 
 | ||||
|     const channel = $('.siteNameSpan').text().trim().toLowerCase(); | ||||
| 
 | ||||
|     const poster = playerData.picPreview; | ||||
|     const trailer = `${playerData.playerOptions.host}${playerData.url}`; | ||||
|     const photos = await getPhotos($('.picturesItem a').attr('href'), 'blowpass.com', site); | ||||
| 
 | ||||
|     const duration = moment.duration(data.duration.slice(2)).asSeconds(); | ||||
|     const tags = data.keywords.split(', '); | ||||
| 
 | ||||
|     return { | ||||
|         url, | ||||
|         // shootId, | ||||
|         entryId, | ||||
|         title, | ||||
|         description, | ||||
|         actors, | ||||
|         date, | ||||
|         duration, | ||||
|         poster, | ||||
|         photos, | ||||
|         trailer: { | ||||
|             src: trailer, | ||||
|             quality: playerData.sizeOnLoad.slice(0, -1), | ||||
|         }, | ||||
|         tags, | ||||
|         rating: { | ||||
|             likes, | ||||
|             dislikes, | ||||
|         }, | ||||
|         site, | ||||
|         channel, | ||||
|     }; | ||||
|     return release; | ||||
| } | ||||
| 
 | ||||
| async function fetchLatest(site, page = 1) { | ||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/latest/All-Categories/0/All-Pornstars/0/${page}`); | ||||
| 
 | ||||
|     return scrape(res.body.toString(), site); | ||||
|     return scrapeAll(res.body.toString(), site); | ||||
| } | ||||
| 
 | ||||
| async function fetchUpcoming(site) { | ||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/upcoming`); | ||||
| 
 | ||||
|     return scrape(res.body.toString(), site); | ||||
|     return scrapeAll(res.body.toString(), site); | ||||
| } | ||||
| 
 | ||||
| async function fetchScene(url, site) { | ||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`); | ||||
| 
 | ||||
|     return scrapeScene(res.body.toString(), url, site); | ||||
| } | ||||
| 
 | ||||
| async function blowpassFetchProfile(actorName) { | ||||
|     return fetchProfile(actorName, 'blowpass'); | ||||
| } | ||||
| 
 | ||||
| module.exports = { | ||||
|     fetchLatest, | ||||
|     fetchProfile: blowpassFetchProfile, | ||||
|     fetchScene, | ||||
|     fetchProfile, | ||||
|     fetchUpcoming, | ||||
|     fetchScene, | ||||
| }; | ||||
|  |  | |||
|  | @ -106,14 +106,14 @@ async function scrapeApiReleases(json, site) { | |||
|     }); | ||||
| } | ||||
| 
 | ||||
| function scrapeAll(html, site) { | ||||
| function scrapeAll(html, site, useNetworkUrl) { | ||||
|     const $ = cheerio.load(html, { normalizeWhitespace: true }); | ||||
|     const scenesElements = $('li[data-itemtype=scene]').toArray(); | ||||
| 
 | ||||
|     return scenesElements.map((element) => { | ||||
|         const sceneLinkElement = $(element).find('.sceneTitle a'); | ||||
| 
 | ||||
|         const url = `${site.url}${sceneLinkElement.attr('href')}`; | ||||
|         const url = `${useNetworkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`; | ||||
|         const title = sceneLinkElement.attr('title'); | ||||
| 
 | ||||
|         const entryId = $(element).attr('data-itemid'); | ||||
|  | @ -175,8 +175,8 @@ async function scrapeScene(html, url, site) { | |||
|     release.date = moment.utc(dateMatch, ['MM-DD-YYYY', 'YYYY-MM-DD']).toDate(); | ||||
| 
 | ||||
|     release.director = data.director?.[0].name || data2?.director?.[0].name; | ||||
|     release.actors = data.actor.map(actor => actor.name); | ||||
|     const hasTrans = data.actor.some(actor => actor.gender === 'shemale'); | ||||
|     release.actors = (data.actor || data2.actor).map(actor => actor.name); | ||||
|     const hasTrans = (data.actor || data2.actor).some(actor => actor.gender === 'shemale'); | ||||
| 
 | ||||
|     const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5; | ||||
|     if (stars) release.rating = { stars }; | ||||
|  | @ -339,7 +339,8 @@ async function fetchApiUpcoming(site) { | |||
| } | ||||
| 
 | ||||
| async function fetchLatest(site, page = 1) { | ||||
|     const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`); | ||||
|     const url = `${site.url}/en/videos/AllCategories/0/${page}`; | ||||
|     const res = await bhttp.get(url); | ||||
| 
 | ||||
|     return scrapeAll(res.body.toString(), site); | ||||
| } | ||||
|  |  | |||
							
								
								
									
										17
									
								
								src/sites.js
								
								
								
								
							
							
						
						
									
										17
									
								
								src/sites.js
								
								
								
								
							|  | @ -39,7 +39,7 @@ function curateSites(sites, includeParameters) { | |||
|     return Promise.all(sites.map(async site => curateSite(site, includeParameters))); | ||||
| } | ||||
| 
 | ||||
| function destructConfigNetworks(networks) { | ||||
| function destructConfigNetworks(networks = []) { | ||||
|     return networks.reduce((acc, network) => { | ||||
|         if (Array.isArray(network)) { | ||||
|             // network specifies sites | ||||
|  | @ -119,6 +119,7 @@ async function fetchSitesFromArgv() { | |||
| 
 | ||||
| async function fetchSitesFromConfig() { | ||||
|     const included = destructConfigNetworks(config.include); | ||||
|     const excluded = destructConfigNetworks(config.exclude); | ||||
| 
 | ||||
|     const rawSites = await knex('sites') | ||||
|         .select( | ||||
|  | @ -126,8 +127,18 @@ async function fetchSitesFromConfig() { | |||
|             'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', | ||||
|         ) | ||||
|         .leftJoin('networks', 'sites.network_id', 'networks.id') | ||||
|         .whereIn('sites.slug', included.sites || []) | ||||
|         .orWhereIn('networks.slug', included.networks || []); | ||||
|         .where((builder) => { | ||||
|             if (config.include) { | ||||
|                 builder | ||||
|                     .whereIn('sites.slug', included.sites) | ||||
|                     .orWhereIn('networks.slug', included.networks); | ||||
|             } | ||||
|         }) | ||||
|         .whereNot((builder) => { | ||||
|             builder | ||||
|                 .whereIn('sites.slug', excluded.sites) | ||||
|                 .orWhereIn('networks.slug', excluded.networks); | ||||
|         }); | ||||
| 
 | ||||
|     const curatedSites = await curateSites(rawSites, true); | ||||
|     logger.info(`Found ${curatedSites.length} sites in database`); | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue