forked from DebaucheryLibrarian/traxxx
				
			Changed sites from argument query to group by network.
This commit is contained in:
		
							parent
							
								
									79465d9634
								
							
						
					
					
						commit
						1907ce1e54
					
				
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 1.4 MiB | 
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 8.4 KiB | 
										
											Binary file not shown.
										
									
								
							| After Width: | Height: | Size: 36 KiB | 
										
											Binary file not shown.
										
									
								
							| Before Width: | Height: | Size: 47 KiB After Width: | Height: | Size: 36 KiB | 
|  | @ -437,14 +437,14 @@ const networks = [ | |||
| 
 | ||||
| exports.seed = knex => Promise.resolve() | ||||
| 	.then(async () => { | ||||
| 		const { inserted, updated } = await upsert('entities', parentNetworks, 'slug', knex); | ||||
| 		const { inserted, updated } = await upsert('entities', parentNetworks.map(network => ({ ...network, type: 1 })), ['slug', 'type'], knex); | ||||
| 		const parentNetworksBySlug = [].concat(inserted, updated).reduce((acc, network) => ({ ...acc, [network.slug]: network.id }), {}); | ||||
| 
 | ||||
| 		const networksWithParent = networks.map(network => ({ | ||||
| 			slug: network.slug, | ||||
| 			name: network.name, | ||||
| 			type: network.type || 1, | ||||
| 			alias: (network.alias || []).join(','), | ||||
| 			alias: network.alias ? network.alias.join(',') : null, | ||||
| 			url: network.url, | ||||
| 			description: network.description, | ||||
| 			parameters: network.parameters, | ||||
|  |  | |||
|  | @ -596,7 +596,7 @@ const tagPosters = [ | |||
| 	['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'], | ||||
| 	['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'], | ||||
| 	['blonde', 0, 'Anikka Albrite and Lena Nicole or Cherie DeVille in the BTS of "New Zealand Holiday" for In The Crack'], | ||||
| 	['blowbang', 'poster', 'Marsha May in "Feeding Frenzy 12" for Jules Jordan'], | ||||
| 	['blowbang', 0, 'Lacy Lennon in "Lacy Lennon\'s First Blowbang" for HardX'], | ||||
| 	['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'], | ||||
| 	['brunette', 0, 'Nicole Black in GIO971 for LegalPorno'], | ||||
| 	['bukkake', 0, 'Jaye Summers in "Facialized 5" for HardX'], | ||||
|  | @ -670,7 +670,8 @@ const tagPhotos = [ | |||
| 	// ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'],
 | ||||
| 	// ['anal', 0, 'Veronica Leal'],
 | ||||
| 	['behind-the-scenes', 1, 'Madison Ivy in "Day With A Pornstar" for Brazzers'], | ||||
| 	['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'], | ||||
| 	['blowbang', 'poster', 'Marsha May in "Feeding Frenzy 12" for Jules Jordan'], | ||||
| 	// ['bukkake', 'poster', 'Mia Malkova in "Facialized 2" for HardX'],
 | ||||
| 	['caucasian', 1, 'Sheena Shaw for Brazzers'], | ||||
| 	['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'], | ||||
| 	['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'], | ||||
|  |  | |||
							
								
								
									
										112
									
								
								src/entities.js
								
								
								
								
							
							
						
						
									
										112
									
								
								src/entities.js
								
								
								
								
							|  | @ -14,16 +14,10 @@ function curateEntity(entity, includeParameters = false) { | |||
| 		url: entity.url, | ||||
| 		description: entity.description, | ||||
| 		slug: entity.slug, | ||||
| 		independent: !!entity.parameters && entity.parameters.independent, | ||||
| 		type: entity.type, | ||||
| 		parameters: includeParameters ? entity.parameters : null, | ||||
| 		network: { | ||||
| 			id: entity.network_id, | ||||
| 			name: entity.network_name, | ||||
| 			description: entity.network_description, | ||||
| 			slug: entity.network_slug, | ||||
| 			url: entity.network_url, | ||||
| 			parameters: includeParameters ? entity.network_parameters : null, | ||||
| 		}, | ||||
| 		parent: entity.parent, | ||||
| 		children: (entity.children || []).map(child => curateEntity(child)), | ||||
| 	}; | ||||
| 
 | ||||
| 	return curatedEntity; | ||||
|  | @ -33,56 +27,59 @@ async function curateEntities(entities, includeParameters) { | |||
| 	return Promise.all(entities.map(async entity => curateEntity(entity, includeParameters))); | ||||
| } | ||||
| 
 | ||||
| async function findSiteByUrl(url) { | ||||
| 	const { origin, hostname, pathname } = new URL(url); | ||||
| 	// const domain = hostname.replace(/www.|tour./, '');
 | ||||
| 	const dirUrl = `${origin}${pathname.split('/').slice(0, 2).join('/')}`; // allow for sites on URI directory
 | ||||
| 
 | ||||
| 	const site = await knex('sites') | ||||
| 		.leftJoin('networks', 'sites.network_id', 'networks.id') | ||||
| 		.select( | ||||
| 			'sites.*', | ||||
| 			'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', | ||||
| 		) | ||||
| 		.where('sites.url', url) | ||||
| 		.orWhere('sites.url', origin) | ||||
| 		.orWhere('sites.url', origin.replace(/www\.|tour\./, '')) | ||||
| 		.orWhere('sites.url', `https://www.${hostname}`) | ||||
| 		.orWhere('sites.url', `http://www.${hostname}`) | ||||
| 		.orWhere('sites.url', dirUrl) | ||||
| 	// .orWhere('sites.url', 'like', `%${domain}`)
 | ||||
| 		.first(); | ||||
| 
 | ||||
| 	if (site) { | ||||
| 		const curatedSite = curateSite(site, true, false); | ||||
| 
 | ||||
| 		return curatedSite; | ||||
| 	} | ||||
| 
 | ||||
| 	return null; | ||||
| } | ||||
| 
 | ||||
| async function fetchEntitiesFromArgv() { | ||||
| async function fetchSitesFromArgv() { | ||||
| 	const rawEntities = await knex.raw(` | ||||
| 		WITH RECURSIVE temp AS ( | ||||
| 			SELECT id, parent_id, name, slug, type FROM entities WHERE slug IN (?) | ||||
| 			UNION | ||||
| 				SELECT entities.id, entities.parent_id, entities.name, entities.slug, entities.type FROM entities | ||||
| 				INNER JOIN temp ON temp.id = entities.parent_id | ||||
| 		) SELECT * FROM temp; | ||||
| 	`, argv.sites || argv.networks || argv.entities);
 | ||||
| 
 | ||||
| 	console.log(rawEntities.rows); | ||||
| 			SELECT | ||||
| 				id, parent_id, name, slug, type, url, description, parameters | ||||
| 			FROM | ||||
| 				entities | ||||
| 			WHERE | ||||
| 				slug = ANY(?) AND entities.type = 1 | ||||
| 			UNION ALL | ||||
| 			SELECT | ||||
| 				entities.id, entities.parent_id, entities.name, entities.slug, entities.type, entities.url, entities.description, entities.parameters | ||||
| 			FROM | ||||
| 				entities | ||||
| 			INNER JOIN | ||||
| 				temp ON temp.id = entities.parent_id | ||||
| 		) | ||||
| 		SELECT | ||||
| 			entities.*, row_to_json(parents) as parent, json_agg(temp) as children | ||||
| 		FROM | ||||
| 			temp | ||||
| 		LEFT JOIN | ||||
| 			entities ON entities.id = temp.parent_id | ||||
| 		LEFT JOIN | ||||
| 			entities AS parents ON parents.id = entities.parent_id | ||||
| 		WHERE | ||||
| 			temp.type = 2 | ||||
| 		GROUP BY | ||||
| 			temp.parent_id, entities.id, entities.name, parents.id | ||||
| 		UNION ALL | ||||
| 		SELECT | ||||
| 			entities.*, row_to_json(parents) as parent, json_build_array(row_to_json(children)) | ||||
| 		FROM | ||||
| 			entities AS children | ||||
| 		LEFT JOIN | ||||
| 			entities ON entities.id = children.parent_id | ||||
| 		LEFT JOIN | ||||
| 			entities AS parents ON parents.id = entities.parent_id | ||||
| 		WHERE | ||||
| 			children.slug = ANY(?) AND children.type = 2 | ||||
| 		GROUP BY | ||||
| 			entities.id, parents.id, children.id; | ||||
| 	`, [argv.networks || [], argv.sites || []]);
 | ||||
| 
 | ||||
| 	const curatedEntities = await curateEntities(rawEntities.rows, true); | ||||
| 	logger.info(`Found ${curatedEntities.length} entities in database`); | ||||
| 
 | ||||
| 	console.log(curatedEntities); | ||||
| 	console.log(rawEntities.rows); | ||||
| 
 | ||||
| 	return curatedEntities; | ||||
| } | ||||
| 
 | ||||
| async function fetchEntitiesFromConfig() { | ||||
| async function fetchSitesFromConfig() { | ||||
| 	const rawSites = await knex('entities') | ||||
| 		.select('entities.*') | ||||
| 		.leftJoin('entities as entities_parents', 'entities_parents.id', 'entities.id') | ||||
|  | @ -95,18 +92,18 @@ async function fetchEntitiesFromConfig() { | |||
| 			builder.whereIn('entities.slug', config.exclude || []); | ||||
| 		}); | ||||
| 
 | ||||
| 	const curatedSites = await curateSites(rawSites, true); | ||||
| 	logger.info(`Found ${curatedSites.length} sites in database`); | ||||
| 	const curatedSites = await curateEntities(rawSites, true); | ||||
| 	logger.info(`Found ${curatedSites.length} entities in database`); | ||||
| 
 | ||||
| 	return curatedSites; | ||||
| } | ||||
| 
 | ||||
| async function fetchIncludedEntities() { | ||||
| 	if (argv.networks || argv.sites) { | ||||
| 		return fetchEntitiesFromArgv(); | ||||
| 		return fetchSitesFromArgv(); | ||||
| 	} | ||||
| 
 | ||||
| 	return fetchEntitiesFromConfig(); | ||||
| 	return fetchSitesFromConfig(); | ||||
| } | ||||
| 
 | ||||
| async function fetchSites(queryObject) { | ||||
|  | @ -119,7 +116,7 @@ async function fetchSites(queryObject) { | |||
| 		.leftJoin('networks', 'sites.network_id', 'networks.id') | ||||
| 		.limit(100); | ||||
| 
 | ||||
| 	return curateSites(sites); | ||||
| 	return curateEntities(sites); | ||||
| } | ||||
| 
 | ||||
| async function fetchSitesFromReleases() { | ||||
|  | @ -129,7 +126,7 @@ async function fetchSitesFromReleases() { | |||
| 		.groupBy('sites.id') | ||||
| 		.limit(100); | ||||
| 
 | ||||
| 	return curateSites(sites); | ||||
| 	return curateEntities(sites); | ||||
| } | ||||
| 
 | ||||
| module.exports = { | ||||
|  | @ -137,8 +134,7 @@ module.exports = { | |||
| 	curateEntities, | ||||
| 	fetchIncludedEntities, | ||||
| 	fetchSites, | ||||
| 	fetchEntitiesFromConfig, | ||||
| 	fetchEntitiesFromArgv, | ||||
| 	fetchSitesFromConfig, | ||||
| 	fetchSitesFromArgv, | ||||
| 	fetchSitesFromReleases, | ||||
| 	findSiteByUrl, | ||||
| }; | ||||
|  |  | |||
|  | @ -108,7 +108,7 @@ function scrapeAllT1(scenes, site, accSiteReleases) { | |||
| 		release.entryId = deriveEntryId(release); | ||||
| 
 | ||||
| 		if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) { | ||||
| 			// filter out releases that were already scraped from a categorized site
 | ||||
| 			// filter out releases that were already scraped from a categorized site, requires sequential site scraping
 | ||||
| 			return null; | ||||
| 		} | ||||
| 
 | ||||
|  |  | |||
|  | @ -8,7 +8,7 @@ const logger = require('./logger')(__filename); | |||
| const knex = require('./knex'); | ||||
| const include = require('./utils/argv-include')(argv); | ||||
| const scrapers = require('./scrapers/scrapers'); | ||||
| const { fetchEntitiesFromArgv, fetchEntitiesFromConfig } = require('./entities'); | ||||
| const { fetchSitesFromArgv, fetchSitesFromConfig } = require('./entities'); | ||||
| 
 | ||||
| const afterDate = (() => { | ||||
| 	if (/\d{2,4}-\d{2}-\d{2,4}/.test(argv.after)) { | ||||
|  | @ -219,8 +219,10 @@ async function scrapeNetworkParallel(network) { | |||
| 
 | ||||
| async function fetchUpdates() { | ||||
| 	const includedNetworks = argv.sites || argv.networks || argv.from | ||||
| 		? await fetchEntitiesFromArgv() | ||||
| 		: await fetchEntitiesFromConfig(); | ||||
| 		? await fetchSitesFromArgv() | ||||
| 		: await fetchSitesFromConfig(); | ||||
| 
 | ||||
| 	// console.log('included', includedNetworks);
 | ||||
| 
 | ||||
| 	const scrapedNetworks = await Promise.map( | ||||
| 		includedNetworks, | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue