Fixed slug lookup in Perfect Gonzo scraper.
This commit is contained in:
		
							parent
							
								
									aade7490f8
								
							
						
					
					
						commit
						4b5cd50122
					
				|  | @ -3,14 +3,13 @@ | ||||||
| const blake2 = require('blake2'); | const blake2 = require('blake2'); | ||||||
| const knex = require('../knex'); | const knex = require('../knex'); | ||||||
| 
 | 
 | ||||||
| const { ex, ctxa } = require('../utils/q'); | const qu = require('../utils/qu'); | ||||||
| const http = require('../utils/http'); |  | ||||||
| 
 | 
 | ||||||
| async function getSiteSlugs() { | async function getSiteSlugs() { | ||||||
| 	return knex('sites') | 	return knex('entities') | ||||||
| 		.pluck('sites.slug') | 		.pluck('entities.slug') | ||||||
| 		.join('networks', 'networks.id', 'sites.network_id') | 		.join('entities AS parents', 'parents.id', 'entities.parent_id') | ||||||
| 		.where('networks.slug', 'perfectgonzo'); | 		.where('parents.slug', 'perfectgonzo'); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| function getHash(identifier) { | function getHash(identifier) { | ||||||
|  | @ -39,8 +38,10 @@ function extractMaleModelsFromTags(tagContainer) { | ||||||
| 	return []; | 	return []; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function extractChannelFromPhoto(photo, metaSiteSlugs) { | async function extractChannelFromPhoto(photo, channel) { | ||||||
| 	const siteSlugs = metaSiteSlugs || await getSiteSlugs(); | 	const siteSlugs = (channel.type === 'network' ? channel.children : channel.parent?.children)?.map(child => child.slug) | ||||||
|  | 		|| await getSiteSlugs(); | ||||||
|  | 
 | ||||||
| 	const channelMatch = photo.match(new RegExp(siteSlugs.join('|'))); | 	const channelMatch = photo.match(new RegExp(siteSlugs.join('|'))); | ||||||
| 
 | 
 | ||||||
| 	if (channelMatch) { | 	if (channelMatch) { | ||||||
|  | @ -50,66 +51,50 @@ async function extractChannelFromPhoto(photo, metaSiteSlugs) { | ||||||
| 	return null; | 	return null; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function scrapeLatest(html, site) { | async function scrapeLatest(scenes, site) { | ||||||
| 	const siteSlugs = await getSiteSlugs(); | 	return scenes.map(({ query }) => { | ||||||
| 	const { element } = ex(html); | 		const release = {}; | ||||||
| 
 | 
 | ||||||
| 	return ctxa(element, '#content-main .itemm').map(({ | 		release.title = query.q('a', 'title'); | ||||||
| 		q, qa, qlength, qdate, qimages, | 		release.url = query.url('a', 'href', { origin: site.url }); | ||||||
| 	}) => { | 		release.date = query.date('.nm-date', 'MM/DD/YYYY'); | ||||||
| 		const release = { |  | ||||||
| 			site, |  | ||||||
| 			meta: { |  | ||||||
| 				siteSlugs, |  | ||||||
| 			}, |  | ||||||
| 		}; |  | ||||||
| 
 |  | ||||||
| 		const sceneLink = q('a'); |  | ||||||
| 
 |  | ||||||
| 		release.title = sceneLink.title; |  | ||||||
| 		release.url = `${site.url}${sceneLink.href}`; |  | ||||||
| 		release.date = qdate('.nm-date', 'MM/DD/YYYY'); |  | ||||||
| 
 | 
 | ||||||
| 		const slug = new URL(release.url).pathname.split('/')[2]; | 		const slug = new URL(release.url).pathname.split('/')[2]; | ||||||
| 		release.entryId = getHash(`${site.slug}${slug}${release.date.toISOString()}`); | 		release.entryId = getHash(`${site.slug}${slug}${release.date.toISOString()}`); | ||||||
| 
 | 
 | ||||||
| 		release.actors = release.title.split('&').map(actor => actor.trim()); | 		release.actors = release.title.split('&').map(actor => actor.trim()); | ||||||
| 
 | 
 | ||||||
| 		[release.poster, ...release.photos] = qimages('.bloc-link img'); | 		[release.poster, ...release.photos] = query.imgs('.bloc-link img'); | ||||||
| 
 | 
 | ||||||
| 		release.tags = qa('.dropdown ul a', true).slice(1); | 		release.tags = query.cnts('.dropdown ul a').slice(1); | ||||||
| 		release.duration = qlength('.dropdown p:first-child'); | 		release.duration = query.duration('.dropdown p:first-child'); | ||||||
| 
 | 
 | ||||||
| 		return release; | 		return release; | ||||||
| 	}); | 	}); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function scrapeScene(html, site, url, metaSiteSlugs) { | async function scrapeScene({ query }, site, url) { | ||||||
| 	const { |  | ||||||
| 		q, qa, qlength, qdate, qposter, qtrailer, |  | ||||||
| 	} = ex(html); |  | ||||||
| 
 |  | ||||||
| 	const release = { url, site }; | 	const release = { url, site }; | ||||||
| 
 | 
 | ||||||
| 	release.title = q('#movie-header h2', true); | 	release.title = query.cnt('#movie-header h2'); | ||||||
| 	release.date = qdate('#movie-header div span', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/); | 	release.date = query.date('#movie-header div span', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/); | ||||||
| 
 | 
 | ||||||
| 	release.description = q('.container .mg-md', true); | 	release.description = query.cnt('.container .mg-md'); | ||||||
| 	release.duration = qlength('#video-ribbon .container > div > span:nth-child(3)'); | 	release.duration = query.duration('#video-ribbon .container > div > span:nth-child(3)'); | ||||||
| 
 | 
 | ||||||
| 	release.actors = qa('#video-info a', true).concat(extractMaleModelsFromTags(q('.tag-container'))); | 	release.actors = query.cnts('#video-info a').concat(extractMaleModelsFromTags(query.q('.tag-container'))); | ||||||
| 	release.tags = qa('.tag-container a', true); | 	release.tags = query.cnts('.tag-container a'); | ||||||
| 
 | 
 | ||||||
| 	const uhd = q('#video-ribbon .container > div > span:nth-child(2)', true); | 	const uhd = query.cnt('#video-ribbon .container > div > span:nth-child(2)'); | ||||||
| 	if (/4K/.test(uhd)) release.tags = release.tags.concat('4k'); | 	if (/4K/.test(uhd)) release.tags = release.tags.concat('4k'); | ||||||
| 
 | 
 | ||||||
| 	release.photos = qa('.bxslider_pics img').map(el => el.dataset.original || el.src); | 	release.photos = query.all('.bxslider_pics img').map(el => el.dataset.original || el.src); | ||||||
| 	release.poster = qposter(); | 	release.poster = query.poster(); | ||||||
| 
 | 
 | ||||||
| 	const trailer = qtrailer(); | 	const trailer = query.trailer(); | ||||||
| 	if (trailer) release.trailer = { src: trailer }; | 	if (trailer) release.trailer = { src: trailer }; | ||||||
| 
 | 
 | ||||||
| 	if (release.photos.length > 0) release.channel = await extractChannelFromPhoto(release.photos[0], metaSiteSlugs); | 	if (release.photos.length > 0) release.channel = await extractChannelFromPhoto(release.photos[0], site); | ||||||
| 
 | 
 | ||||||
| 	if (release.channel) { | 	if (release.channel) { | ||||||
| 		const { pathname } = new URL(url); | 		const { pathname } = new URL(url); | ||||||
|  | @ -124,23 +109,23 @@ async function scrapeScene(html, site, url, metaSiteSlugs) { | ||||||
| 
 | 
 | ||||||
| async function fetchLatest(site, page = 1) { | async function fetchLatest(site, page = 1) { | ||||||
| 	const url = `${site.url}/movies/page-${page}`; | 	const url = `${site.url}/movies/page-${page}`; | ||||||
| 	const res = await http.get(url); | 	const res = await qu.getAll(url, '#content-main [class^="item"]'); | ||||||
| 
 | 
 | ||||||
| 	if (res.statusCode === 200) { | 	if (res.ok) { | ||||||
| 		return scrapeLatest(res.body.toString(), site); | 		return scrapeLatest(res.items, site); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return []; | 	return res.status; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| async function fetchScene(url, site, release) { | async function fetchScene(url, channel) { | ||||||
| 	const res = await http.get(url); | 	const res = await qu.get(url); | ||||||
| 
 | 
 | ||||||
| 	if (res.statusCode === 200) { | 	if (res.ok) { | ||||||
| 		return scrapeScene(res.body.toString(), site, url, release?.meta.siteSlugs); | 		return scrapeScene(res.item, channel, url); | ||||||
| 	} | 	} | ||||||
| 
 | 
 | ||||||
| 	return []; | 	return res.status; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| module.exports = { | module.exports = { | ||||||
|  |  | ||||||
|  | @ -42,39 +42,6 @@ function getAvatarFallbacks(avatar) { | ||||||
| 		.flat(); | 		.flat(); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* |  | ||||||
| async function getTrailerLegacy(scene, site, url) { |  | ||||||
| 	const qualities = [360, 480, 720, 1080, 2160]; |  | ||||||
| 
 |  | ||||||
| 	const tokenRes = await http.post(`${site.url}/api/__record_tknreq`, { |  | ||||||
| 		file: scene.previewVideoUrl1080P, |  | ||||||
| 		sizes: qualities.join('+'), |  | ||||||
| 		type: 'trailer', |  | ||||||
| 	}, { |  | ||||||
| 		headers: { |  | ||||||
| 			referer: url, |  | ||||||
| 			origin: site.url, |  | ||||||
| 		}, |  | ||||||
| 	}); |  | ||||||
| 
 |  | ||||||
| 	if (!tokenRes.ok) { |  | ||||||
| 		return null; |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	const trailerUrl = `${site.url}/api${tokenRes.body.data.url}`; |  | ||||||
| 	const trailersRes = await http.post(trailerUrl, null, { headers: { referer: url } }); |  | ||||||
| 
 |  | ||||||
| 	if (trailersRes.ok) { |  | ||||||
| 		return qualities.map(quality => (trailersRes.body[quality] ? { |  | ||||||
| 			src: trailersRes.body[quality].token, |  | ||||||
| 			quality, |  | ||||||
| 		} : null)).filter(Boolean); |  | ||||||
| 	} |  | ||||||
| 
 |  | ||||||
| 	return null; |  | ||||||
| } |  | ||||||
| */ |  | ||||||
| 
 |  | ||||||
| async function getTrailer(scene, channel, url) { | async function getTrailer(scene, channel, url) { | ||||||
| 	const res = await http.post(`${channel.url}/graphql`, { | 	const res = await http.post(`${channel.url}/graphql`, { | ||||||
| 		operationName: 'getToken', | 		operationName: 'getToken', | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue