forked from DebaucheryLibrarian/traxxx
				
			Integrated Blowpass into generic Gamma scraper.
This commit is contained in:
		
							parent
							
								
									5dfaa4c126
								
							
						
					
					
						commit
						3541a9c402
					
				|  | @ -9,36 +9,8 @@ module.exports = { | ||||||
|         host: '0.0.0.0', |         host: '0.0.0.0', | ||||||
|         port: 5000, |         port: 5000, | ||||||
|     }, |     }, | ||||||
|     include: [ |     // include: [],
 | ||||||
|         '21sextury', |     // exclude: [],
 | ||||||
|         'babes', |  | ||||||
|         'bang', |  | ||||||
|         'bangbros', |  | ||||||
|         'blowpass', |  | ||||||
|         'brazzers', |  | ||||||
|         'ddfnetwork', |  | ||||||
|         'digitalplayground', |  | ||||||
|         'dogfartnetwork', |  | ||||||
|         'evilangel', |  | ||||||
|         'fakehub', |  | ||||||
|         'jayrock', |  | ||||||
|         'julesjordan', |  | ||||||
|         'kellymadison', |  | ||||||
|         'kink', |  | ||||||
|         'legalporno', |  | ||||||
|         'mikeadriano', |  | ||||||
|         'milehighmedia', |  | ||||||
|         'mofos', |  | ||||||
|         'naughtyamerica', |  | ||||||
|         'perfectgonzo', |  | ||||||
|         'pervcity', |  | ||||||
|         'pornpros', |  | ||||||
|         'private', |  | ||||||
|         'realitykings', |  | ||||||
|         'teamskeet', |  | ||||||
|         'vixen', |  | ||||||
|         'xempire', |  | ||||||
|     ], |  | ||||||
|     fetchAfter: [1, 'week'], |     fetchAfter: [1, 'week'], | ||||||
|     media: { |     media: { | ||||||
|         path: './media', |         path: './media', | ||||||
|  |  | ||||||
|  | @ -105,7 +105,7 @@ async function scrapeSiteReleases(scraper, site) { | ||||||
|     ]); |     ]); | ||||||
| 
 | 
 | ||||||
|     if (argv.upcoming) { |     if (argv.upcoming) { | ||||||
|         logger.info(`${site.name}: ${argv.latest ? 'Found' : 'Ignoring'} ${newReleases.length || ''}latest releases, ${argv.upcoming ? '' : 'ignoring '}${upcomingReleases.length || ''} upcoming releases`); |         logger.info(`${site.name}: ${argv.latest ? `Found ${newReleases.length}` : 'Ignoring'} latest releases, ${argv.upcoming ? '' : 'ignoring '}${upcomingReleases.length || ''} upcoming releases`); | ||||||
|     } |     } | ||||||
| 
 | 
 | ||||||
|     const baseReleases = [...newReleases, ...upcomingReleases]; |     const baseReleases = [...newReleases, ...upcomingReleases]; | ||||||
|  |  | ||||||
|  | @ -1,132 +1,38 @@ | ||||||
| 'use strict'; | 'use strict'; | ||||||
| 
 | 
 | ||||||
| /* eslint-disable newline-per-chained-call */ |  | ||||||
| const bhttp = require('bhttp'); | const bhttp = require('bhttp'); | ||||||
| const cheerio = require('cheerio'); |  | ||||||
| const moment = require('moment'); |  | ||||||
| 
 | 
 | ||||||
| const { getPhotos, fetchProfile } = require('./gamma'); | const { scrapeAll, scrapeScene, fetchProfile } = require('./gamma'); | ||||||
| 
 | 
 | ||||||
| function scrape(html, site) { | async function fetchScene(url, site) { | ||||||
|     const $ = cheerio.load(html, { normalizeWhitespace: true }); |     // const res = await bhttp.get(url);
 | ||||||
|     const sceneElements = $('.sceneList .scene').toArray(); |     const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`); | ||||||
| 
 | 
 | ||||||
|     return sceneElements.map((element) => { |     const release = await scrapeScene(res.body.toString(), url, site); | ||||||
|         const entryId = $(element).attr('data-itemid'); |     release.channel = release.$('.siteNameSpan').text().trim().toLowerCase(); | ||||||
| 
 | 
 | ||||||
|         const sceneLinkElement = $(element).find('.sceneTitle a'); |     if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/'); | ||||||
|         const title = sceneLinkElement.attr('title'); |     else release.url = url.replace(/video\/\w+\//, 'video/'); | ||||||
|         const url = `${site.url}/en/scene/${sceneLinkElement.attr('href').split('/').slice(-2).join('/')}`; |  | ||||||
| 
 | 
 | ||||||
|         const date = moment.utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY').toDate(); |     return release; | ||||||
|         const actors = $(element).find('.sceneActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); |  | ||||||
| 
 |  | ||||||
|         const poster = $(element).find('a.imgLink img.img').attr('data-original'); |  | ||||||
|         const trailer = `https://videothumb.gammacdn.com/600x339/${entryId}.mp4`; |  | ||||||
| 
 |  | ||||||
|         const likes = Number($(element).find('.rating .state_1 .value').text()); |  | ||||||
| 
 |  | ||||||
|         return { |  | ||||||
|             url, |  | ||||||
|             entryId, |  | ||||||
|             title, |  | ||||||
|             actors, |  | ||||||
|             date, |  | ||||||
|             poster, |  | ||||||
|             trailer: { |  | ||||||
|                 src: trailer, |  | ||||||
|                 quality: 339, |  | ||||||
|             }, |  | ||||||
|             rating: { |  | ||||||
|                 likes, |  | ||||||
|             }, |  | ||||||
|             site, |  | ||||||
|         }; |  | ||||||
|     }); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| async function scrapeScene(html, url, site) { |  | ||||||
|     const $ = cheerio.load(html, { normalizeWhitespace: true }); |  | ||||||
|     const json = $('script[type="application/ld+json"]').html(); |  | ||||||
|     const data = JSON.parse(json).slice(-1)[0]; |  | ||||||
|     const sceneElement = $('#wrapper'); |  | ||||||
| 
 |  | ||||||
|     const videoScript = $('script:contains("window.ScenePlayerOptions")').html(); |  | ||||||
|     const playerObject = videoScript.slice(videoScript.indexOf('{'), videoScript.indexOf('};') + 1); |  | ||||||
|     const playerData = JSON.parse(playerObject); |  | ||||||
| 
 |  | ||||||
|     // const workName = data.isPartOf.name.split(' - ');
 |  | ||||||
|     // const shootId = workName.length > 1 ? workName[1] : null;
 |  | ||||||
|     const entryId = url.split('/').slice(-1)[0]; |  | ||||||
|     const title = data.title || $('meta[name="twitter:title"]').attr('content'); |  | ||||||
|     const description = data.description || $('meta[name="twitter:description"]').attr('content'); |  | ||||||
|     // date in data object is not the release date of the scene, but the date the entry was added
 |  | ||||||
|     const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate(); |  | ||||||
| 
 |  | ||||||
|     const actors = data.actor.map(({ name }) => name); |  | ||||||
| 
 |  | ||||||
|     const likes = Number(sceneElement.find('.rating .state_1 .value').text()); |  | ||||||
|     const dislikes = Number(sceneElement.find('.rating .state_2 .value').text()); |  | ||||||
| 
 |  | ||||||
|     const channel = $('.siteNameSpan').text().trim().toLowerCase(); |  | ||||||
| 
 |  | ||||||
|     const poster = playerData.picPreview; |  | ||||||
|     const trailer = `${playerData.playerOptions.host}${playerData.url}`; |  | ||||||
|     const photos = await getPhotos($('.picturesItem a').attr('href'), 'blowpass.com', site); |  | ||||||
| 
 |  | ||||||
|     const duration = moment.duration(data.duration.slice(2)).asSeconds(); |  | ||||||
|     const tags = data.keywords.split(', '); |  | ||||||
| 
 |  | ||||||
|     return { |  | ||||||
|         url, |  | ||||||
|         // shootId,
 |  | ||||||
|         entryId, |  | ||||||
|         title, |  | ||||||
|         description, |  | ||||||
|         actors, |  | ||||||
|         date, |  | ||||||
|         duration, |  | ||||||
|         poster, |  | ||||||
|         photos, |  | ||||||
|         trailer: { |  | ||||||
|             src: trailer, |  | ||||||
|             quality: playerData.sizeOnLoad.slice(0, -1), |  | ||||||
|         }, |  | ||||||
|         tags, |  | ||||||
|         rating: { |  | ||||||
|             likes, |  | ||||||
|             dislikes, |  | ||||||
|         }, |  | ||||||
|         site, |  | ||||||
|         channel, |  | ||||||
|     }; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function fetchLatest(site, page = 1) { | async function fetchLatest(site, page = 1) { | ||||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/latest/All-Categories/0/All-Pornstars/0/${page}`); |     const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/latest/All-Categories/0/All-Pornstars/0/${page}`); | ||||||
| 
 | 
 | ||||||
|     return scrape(res.body.toString(), site); |     return scrapeAll(res.body.toString(), site); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function fetchUpcoming(site) { | async function fetchUpcoming(site) { | ||||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/upcoming`); |     const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/upcoming`); | ||||||
| 
 | 
 | ||||||
|     return scrape(res.body.toString(), site); |     return scrapeAll(res.body.toString(), site); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function fetchScene(url, site) { |  | ||||||
|     const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`); |  | ||||||
| 
 |  | ||||||
|     return scrapeScene(res.body.toString(), url, site); |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| async function blowpassFetchProfile(actorName) { |  | ||||||
|     return fetchProfile(actorName, 'blowpass'); |  | ||||||
| } |  | ||||||
| 
 | 
 | ||||||
| module.exports = { | module.exports = { | ||||||
|     fetchLatest, |     fetchLatest, | ||||||
|     fetchProfile: blowpassFetchProfile, |     fetchProfile, | ||||||
|     fetchScene, |  | ||||||
|     fetchUpcoming, |     fetchUpcoming, | ||||||
|  |     fetchScene, | ||||||
| }; | }; | ||||||
|  |  | ||||||
|  | @ -106,14 +106,14 @@ async function scrapeApiReleases(json, site) { | ||||||
|     }); |     }); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| function scrapeAll(html, site) { | function scrapeAll(html, site, useNetworkUrl) { | ||||||
|     const $ = cheerio.load(html, { normalizeWhitespace: true }); |     const $ = cheerio.load(html, { normalizeWhitespace: true }); | ||||||
|     const scenesElements = $('li[data-itemtype=scene]').toArray(); |     const scenesElements = $('li[data-itemtype=scene]').toArray(); | ||||||
| 
 | 
 | ||||||
|     return scenesElements.map((element) => { |     return scenesElements.map((element) => { | ||||||
|         const sceneLinkElement = $(element).find('.sceneTitle a'); |         const sceneLinkElement = $(element).find('.sceneTitle a'); | ||||||
| 
 | 
 | ||||||
|         const url = `${site.url}${sceneLinkElement.attr('href')}`; |         const url = `${useNetworkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`; | ||||||
|         const title = sceneLinkElement.attr('title'); |         const title = sceneLinkElement.attr('title'); | ||||||
| 
 | 
 | ||||||
|         const entryId = $(element).attr('data-itemid'); |         const entryId = $(element).attr('data-itemid'); | ||||||
|  | @ -175,8 +175,8 @@ async function scrapeScene(html, url, site) { | ||||||
|     release.date = moment.utc(dateMatch, ['MM-DD-YYYY', 'YYYY-MM-DD']).toDate(); |     release.date = moment.utc(dateMatch, ['MM-DD-YYYY', 'YYYY-MM-DD']).toDate(); | ||||||
| 
 | 
 | ||||||
|     release.director = data.director?.[0].name || data2?.director?.[0].name; |     release.director = data.director?.[0].name || data2?.director?.[0].name; | ||||||
|     release.actors = data.actor.map(actor => actor.name); |     release.actors = (data.actor || data2.actor).map(actor => actor.name); | ||||||
|     const hasTrans = data.actor.some(actor => actor.gender === 'shemale'); |     const hasTrans = (data.actor || data2.actor).some(actor => actor.gender === 'shemale'); | ||||||
| 
 | 
 | ||||||
|     const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5; |     const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5; | ||||||
|     if (stars) release.rating = { stars }; |     if (stars) release.rating = { stars }; | ||||||
|  | @ -339,7 +339,8 @@ async function fetchApiUpcoming(site) { | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function fetchLatest(site, page = 1) { | async function fetchLatest(site, page = 1) { | ||||||
|     const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`); |     const url = `${site.url}/en/videos/AllCategories/0/${page}`; | ||||||
|  |     const res = await bhttp.get(url); | ||||||
| 
 | 
 | ||||||
|     return scrapeAll(res.body.toString(), site); |     return scrapeAll(res.body.toString(), site); | ||||||
| } | } | ||||||
|  |  | ||||||
							
								
								
									
										17
									
								
								src/sites.js
								
								
								
								
							
							
						
						
									
										17
									
								
								src/sites.js
								
								
								
								
							|  | @ -39,7 +39,7 @@ function curateSites(sites, includeParameters) { | ||||||
|     return Promise.all(sites.map(async site => curateSite(site, includeParameters))); |     return Promise.all(sites.map(async site => curateSite(site, includeParameters))); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| function destructConfigNetworks(networks) { | function destructConfigNetworks(networks = []) { | ||||||
|     return networks.reduce((acc, network) => { |     return networks.reduce((acc, network) => { | ||||||
|         if (Array.isArray(network)) { |         if (Array.isArray(network)) { | ||||||
|             // network specifies sites
 |             // network specifies sites
 | ||||||
|  | @ -119,6 +119,7 @@ async function fetchSitesFromArgv() { | ||||||
| 
 | 
 | ||||||
| async function fetchSitesFromConfig() { | async function fetchSitesFromConfig() { | ||||||
|     const included = destructConfigNetworks(config.include); |     const included = destructConfigNetworks(config.include); | ||||||
|  |     const excluded = destructConfigNetworks(config.exclude); | ||||||
| 
 | 
 | ||||||
|     const rawSites = await knex('sites') |     const rawSites = await knex('sites') | ||||||
|         .select( |         .select( | ||||||
|  | @ -126,8 +127,18 @@ async function fetchSitesFromConfig() { | ||||||
|             'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', |             'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', | ||||||
|         ) |         ) | ||||||
|         .leftJoin('networks', 'sites.network_id', 'networks.id') |         .leftJoin('networks', 'sites.network_id', 'networks.id') | ||||||
|         .whereIn('sites.slug', included.sites || []) |         .where((builder) => { | ||||||
|         .orWhereIn('networks.slug', included.networks || []); |             if (config.include) { | ||||||
|  |                 builder | ||||||
|  |                     .whereIn('sites.slug', included.sites) | ||||||
|  |                     .orWhereIn('networks.slug', included.networks); | ||||||
|  |             } | ||||||
|  |         }) | ||||||
|  |         .whereNot((builder) => { | ||||||
|  |             builder | ||||||
|  |                 .whereIn('sites.slug', excluded.sites) | ||||||
|  |                 .orWhereIn('networks.slug', excluded.networks); | ||||||
|  |         }); | ||||||
| 
 | 
 | ||||||
|     const curatedSites = await curateSites(rawSites, true); |     const curatedSites = await curateSites(rawSites, true); | ||||||
|     logger.info(`Found ${curatedSites.length} sites in database`); |     logger.info(`Found ${curatedSites.length} sites in database`); | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue