Merge branch 'experimental' into master
commit 42247449f8
@@ -230,24 +230,26 @@ const networks = [
	{
		slug: 'hussiepass',
		name: 'Hussie Pass',
		url: 'http://www.hussiepass.com',
		url: 'https://www.hussiepass.com',
		parent: 'hush',
	},
	{
		slug: 'hushpass',
		name: 'Hush Pass',
		url: 'http://www.hushpass.com',
		url: 'https://www.hushpass.com',
		parent: 'hush',
		parameters: {
			t1: true,
			sequential: true,
		},
	},
	{
		slug: 'interracialpass',
		name: 'Interracial Pass',
		url: 'http://www.interracialpass.com',
		url: 'https://www.interracialpass.com',
		parent: 'hush',
		parameters: {
			t1: true,
			sequential: true,
		},
	},

@@ -2681,6 +2681,7 @@ const sites = [
		name: 'Booty Annihilation',
		tags: ['interracial'],
		parent: 'interracialpass',
		hasLogo: false,
		parameters: {
			latest: 'https://www.interracialpass.com/t1/categories/BootyAnnihilation_%d_d.html',
			media: 'https://www.interracialpass.com',

@@ -33,9 +33,11 @@ async function findEntities(baseReleases) {
	));

	const entities = await knex('entities')
		.select(knex.raw('entities.*, row_to_json(parents) as parent'))
		.select(knex.raw('entities.*, row_to_json(parents) as parent, json_agg(children) as children'))
		.leftJoin('entities as parents', 'parents.id', 'entities.parent_id')
		.leftJoin('entities as children', 'children.parent_id', 'entities.id')
		.whereIn('entities.slug', entitySlugs)
		.groupBy('entities.id', 'parents.id')
		.orderBy('entities.type', 'asc');

	// channel entity will overwrite network entity

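For context, a minimal sketch of the row shape the amended findEntities query produces; the values are hypothetical. The GROUP BY on entities.id and parents.id is what lets json_agg fold the multiplied child rows back into a single row per entity.

const exampleEntity = {
	id: 21,
	slug: 'interracialpass',
	type: 'channel',
	// row_to_json(parents) embeds the joined parent row as a plain object
	parent: { id: 5, slug: 'hush', type: 'network' },
	// json_agg(children) collects the joined child rows into an array;
	// note it can yield [null] when the LEFT JOIN finds no children
	children: [
		{ id: 102, slug: 'bootyannihilation', type: 'channel', parent_id: 21 },
	],
};
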
@@ -86,7 +86,7 @@ async function fetchIncludedEntities() {
		)
		/* select recursive channels as children of networks */
		SELECT
			entities.*, json_agg(channels) as children
			entities.*, json_agg(channels ORDER BY channels.id) as children
		FROM
			channels
		LEFT JOIN

@@ -122,6 +122,7 @@ async function searchReleases(query, limit = 100) {
}

module.exports = {
	curateRelease,
	fetchRelease,
	fetchReleases,
	searchReleases,

@@ -2,19 +2,10 @@

const util = require('util');

const knex = require('../knex');
const { get, geta, ed, formatDate, ctxa } = require('../utils/q');
const slugify = require('../utils/slugify');
const { feetInchesToCm } = require('../utils/convert');

async function getChannelRegExp(site) {
	if (!['hushpass', 'interracialpass'].includes(site.parent.slug)) return null;

	const sites = await knex('sites').where('network_id', site.parent.id);

	return new RegExp(sites.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
}

function deriveEntryId(release) {
	if (release.date && release.title) {
		return `${slugify(formatDate(release.date, 'YYYY-MM-DD'))}-${slugify(release.title)}`;
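
A hypothetical input/output pair for deriveEntryId, which the T1 scraper below leans on to recognize releases it has already seen:

// hypothetical release; output assumes slugify lowercases and hyphenates
deriveEntryId({
	date: new Date('2021-03-01'),
	title: 'Young And Thick',
});
// => '2021-03-01-young-and-thick'
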
@@ -80,7 +71,7 @@ function scrapeAll(scenes, site) {
	});
}

function scrapeAllT1(scenes, site, accSiteReleases) {
function scrapeAllT1(scenes, site, accNetworkReleases) {
	return scenes.map(({ qu }) => {
		const release = {};

@@ -107,7 +98,7 @@ function scrapeAllT1(scenes, site, accSiteReleases) {
		// release.entryId = q('.img-div img', 'id')?.match(/set-target-(\d+)/)[1];
		release.entryId = deriveEntryId(release);

		if (site.parameters?.accFilter && accSiteReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
		if (site.parameters?.accFilter && accNetworkReleases?.map(accRelease => accRelease.entryId).includes(release.entryId)) {
			// filter out releases that were already scraped from a categorized site, requires sequential site scraping
			return null;
		}
@@ -160,7 +151,7 @@ function scrapeScene({ html, qu }, site, url, baseRelease) {
	return release;
}

function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) {
function scrapeSceneT1({ html, qu }, site, url, baseRelease) {
	const release = { url };

	release.title = qu.q('.trailer-section-head .section-title', true);
@@ -187,14 +178,12 @@ function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) {
	const stars = qu.q('.update-rating', true).match(/\d.\d/)?.[0];
	if (stars) release.stars = Number(stars);

	if (channelRegExp) {
	if (site.type === 'network') {
		const channelRegExp = new RegExp(site.children.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
		const channel = release.tags.find(tag => channelRegExp.test(tag));

		if (channel) {
			release.channel = {
				force: true,
				slug: slugify(channel, ''),
			};
			release.channel = slugify(channel, '');
		}
	}

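A minimal sketch of that channel matching with hypothetical children and tags, to show how the regular expression is assembled and consumed:

// hypothetical children of a T1 network entity
const children = [
	{ name: 'Booty Annihilation', parameters: null },
	{ name: 'Hussie Auditions', parameters: { match: 'Hussie\\s?Auditions' } },
];

const channelRegExp = new RegExp(children.map(channel => channel.parameters?.match || channel.name).join('|'), 'i');
// => /Booty Annihilation|Hussie\s?Auditions/i

const channel = ['interracial', 'Booty Annihilation'].find(tag => channelRegExp.test(tag));
// => 'Booty Annihilation', which slugify(channel, '') turns into something like 'bootyannihilation'
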
@@ -356,7 +345,7 @@ function scrapeProfileTour({ el, qu }, site) {
	return profile;
}

async function fetchLatest(site, page = 1, include, preflight, accSiteReleases) {
async function fetchLatest(site, page = 1, include, { uniqueReleases, duplicateReleases }) {
	const url = (site.parameters?.latest && util.format(site.parameters.latest, page))
        || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`)
        || `${site.url}/categories/movies_${page}_d.html`;
@@ -364,18 +353,17 @@ async function fetchLatest(site, page = 1, include, preflight, accSiteReleases)
	const res = await geta(url, '.modelfeature, .item-video, .updateItem');

	if (!res.ok) return res.status;
	if (site.parameters?.t1) return scrapeAllT1(res.items, site, accSiteReleases);
	if (site.parameters?.tour) return scrapeAllTour(res.items, site, accSiteReleases);
	if (site.parameters?.t1) return scrapeAllT1(res.items, site, [...uniqueReleases, ...duplicateReleases]);
	if (site.parameters?.tour) return scrapeAllTour(res.items, site);

	return scrapeAll(res.items, site, accSiteReleases);
	return scrapeAll(res.items, site, uniqueReleases);
}

async function fetchScene(url, site, baseRelease, beforeFetchLatest) {
	const channelRegExp = beforeFetchLatest || await getChannelRegExp(site);
async function fetchScene(url, site, baseRelease) {
	const res = await get(url);

	if (!res.ok) return res.status;
	if (site.parameters?.t1) return scrapeSceneT1(res.item, site, url, baseRelease, channelRegExp);
	if (site.parameters?.t1) return scrapeSceneT1(res.item, site, url, baseRelease);
	if (site.parameters?.tour) return scrapeSceneTour(res.item, site, url, baseRelease);

	return scrapeScene(res.item, site, url, baseRelease);
@@ -403,7 +391,6 @@ async function fetchProfile({ name: actorName }, { site }) {
}

module.exports = {
	beforeFetchLatest: getChannelRegExp,
	fetchLatest,
	fetchScene,
	fetchProfile,

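A hypothetical call into the reworked fetchLatest, only to make the new fourth argument concrete; in practice the update runner in updates.js supplies these arrays from its accumulator:

// hypothetical invocation; both arrays start empty on the first channel of a run
const releases = await fetchLatest(site, 1, include, {
	uniqueReleases: [],
	duplicateReleases: [],
});
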
@@ -6,24 +6,31 @@ const moment = require('moment');
const argv = require('./argv');
const logger = require('./logger')(__filename);
const knex = require('./knex');
const { curateRelease } = require('./releases');
const include = require('./utils/argv-include')(argv);
const scrapers = require('./scrapers/scrapers');
const { fetchIncludedEntities } = require('./entities');

const emptyReleases = { uniqueReleases: [], duplicateReleases: [] };

async function filterUniqueReleases(latestReleases, accReleases) {
	const latestReleaseIdentifiers = latestReleases
		.map(release => [release.entity.id, release.entryId]);

	const duplicateReleases = await knex('releases')
	const duplicateReleaseEntries = await knex('releases')
		.select(knex.raw('releases.*, row_to_json(entities) as entity'))
		.leftJoin('entities', 'entities.id', 'releases.entity_id')
		.whereIn(['entity_id', 'entry_id'], latestReleaseIdentifiers);

	const duplicateReleases = duplicateReleaseEntries.map(release => curateRelease(release));

	// add entry IDs of accumulated releases to prevent an infinite scrape loop
	// when one page contains the same release as the previous
	const duplicateReleasesSiteIdAndEntryIds = duplicateReleases
		.concat(accReleases)
		.reduce((acc, release) => {
			const entityId = release.entity_id || release.entity.id;
			const entryId = release.entry_id || release.entryId;
			const entityId = release.entityId || release.entity.id;
			const entryId = release.entryId || release.entryId;

			if (!acc[entityId]) acc[entityId] = {};
			acc[entityId][entryId] = true;
@@ -31,10 +38,9 @@ async function filterUniqueReleases(latestReleases, accReleases) {
			return acc;
		}, {});

	const uniqueReleases = latestReleases
		.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);
	const uniqueReleases = latestReleases.filter(release => !duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);

	return uniqueReleases;
	return { uniqueReleases, duplicateReleases };
}
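
A small sketch of the lookup structure that filter consults, with made-up entity and entry IDs:

// made-up IDs: entity 21 already has these entries stored or accumulated,
// so matching latest releases are dropped as duplicates
const duplicateReleasesSiteIdAndEntryIds = {
	21: { '2021-03-01-young-and-thick': true },
	34: { '2021-02-27-older-scene': true },
};

const isDuplicate = release => Boolean(duplicateReleasesSiteIdAndEntryIds[release.entity.id]?.[release.entryId]);
isDuplicate({ entity: { id: 21 }, entryId: '2021-03-01-young-and-thick' }); // true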

function needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcoming) {
@@ -66,7 +72,7 @@ function needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcomin
	return false;
}

async function scrapeReleases(scraper, entity, preData, upcoming = false, page = 1, accReleases = [], totalReleases = 0) {
async function scrapeReleases(scraper, entity, preData, upcoming = false, page = 1, acc = emptyReleases, totalReleases = 0) {
	const releases = upcoming
		? await scraper.fetchUpcoming(entity, page, include, preData)
		: await scraper.fetchLatest(entity, page, include, preData);
@@ -74,7 +80,7 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false, page =
	if (!Array.isArray(releases)) {
		// scraper is unable to fetch the releases and returned a HTTP code or null
		logger.warn(`Scraper returned ${releases} when fetching latest from '${entity.name}' (${entity.parent?.name})`);
		return accReleases;
		return acc;
	}

	const releasesWithEntity = releases.map(release => ({
@@ -88,20 +94,25 @@ async function scrapeReleases(scraper, entity, preData, upcoming = false, page =
		|| (hasDates && releasesWithEntity.filter(release => moment(release.date).isAfter(argv.after)))
		|| releasesWithEntity.slice(0, Math.max(argv.nullDateLimit - totalReleases, 0));

	const uniqueReleases = argv.force
		? limitedReleases
		: await filterUniqueReleases(limitedReleases, accReleases);
	const { uniqueReleases, duplicateReleases } = argv.force
		? { uniqueReleases: limitedReleases, duplicateReleases: [] }
		: await filterUniqueReleases(limitedReleases, acc.uniqueReleases);

	const accReleases = {
		uniqueReleases: acc.uniqueReleases.concat(uniqueReleases),
		duplicateReleases: acc.duplicateReleases.concat(duplicateReleases),
	};

	if (needNextPage(releases, uniqueReleases, totalReleases, hasDates, upcoming)) {
		return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases.concat(uniqueReleases), totalReleases + releases.length);
		return scrapeReleases(scraper, entity, preData, upcoming, page + 1, accReleases, totalReleases + releases.length);
	}

	return accReleases.concat(uniqueReleases);
	return accReleases;
}
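
For orientation, the accumulator threaded through those recursive calls now looks roughly like this (contents hypothetical); duplicates are carried along so deeper pages and, for sequential networks, sibling channels can be filtered against them:

const acc = {
	// new releases still to be stored
	uniqueReleases: [{ entryId: '2021-03-02-new-scene', entity: { id: 21 } }],
	// releases already in the database, kept only for duplicate filtering
	duplicateReleases: [{ entryId: '2021-02-27-older-scene', entity: { id: 21 } }],
};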

async function scrapeLatestReleases(scraper, entity, preData) {
	if (!scraper.fetchLatest) {
		return [];
	if (!argv.latest || !scraper.fetchLatest) {
		return emptyReleases;
	}

	try {
@@ -114,12 +125,12 @@ async function scrapeLatestReleases(scraper, entity, preData) {
		logger.warn(`Failed to scrape latest updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
	}

	return [];
	return emptyReleases;
}

async function scrapeUpcomingReleases(scraper, entity, preData) {
	if (!scraper.fetchUpcoming) {
		return [];
	if (!argv.upcoming || !scraper.fetchUpcoming) {
		return emptyReleases;
	}

	try {
@@ -132,11 +143,11 @@ async function scrapeUpcomingReleases(scraper, entity, preData) {
		logger.warn(`Failed to scrape upcoming updates for '${entity.slug}' (${entity.parent?.slug}): ${error.message}`);
	}

	return [];
	return emptyReleases;
}

async function scrapeMovies(scraper, entity) {
	if (!scraper.fetchMovies) {
	if (!argv.movies || !scraper.fetchMovies) {
		return [];
	}

@@ -152,20 +163,17 @@ async function scrapeMovies(scraper, entity) {

async function scrapeChannelReleases(scraper, channelEntity, preData) {
	const [latestReleases, upcomingReleases] = await Promise.all([
		argv.latest
			? scrapeLatestReleases(scraper, channelEntity, preData)
			: [],
		argv.upcoming
			? scrapeUpcomingReleases(scraper, channelEntity, preData)
			: [],
		argv.movies
			? scrapeMovies(scraper, channelEntity, preData)
			: [],
		scrapeLatestReleases(scraper, channelEntity, preData),
		scrapeUpcomingReleases(scraper, channelEntity, preData),
		scrapeMovies(scraper, channelEntity, preData),
	]);

	logger.info(`Fetching ${latestReleases.length} latest and ${upcomingReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);
	logger.info(`Fetching ${latestReleases.uniqueReleases.length} latest and ${upcomingReleases.uniqueReleases.length} upcoming updates for '${channelEntity.name}' (${channelEntity.parent?.name})`);

	return [...latestReleases, ...upcomingReleases];
	return {
		uniqueReleases: [...latestReleases.uniqueReleases, ...upcomingReleases.uniqueReleases],
		duplicateReleases: [...latestReleases.duplicateReleases, ...upcomingReleases.duplicateReleases],
	};
}

async function scrapeChannel(channelEntity, accNetworkReleases) {
@@ -181,12 +189,10 @@ async function scrapeChannel(channelEntity, accNetworkReleases) {
	try {
		const beforeFetchLatest = await scraper.beforeFetchLatest?.(channelEntity);

		const channelEntityReleases = await scrapeChannelReleases(scraper, channelEntity, {
			accNetworkReleases,
		return await scrapeChannelReleases(scraper, channelEntity, {
			...accNetworkReleases,
			beforeFetchLatest,
		});

		return channelEntityReleases.map(release => ({ ...release, channelEntity }));
	} catch (error) {
		logger.error(`Failed to scrape releases from ${channelEntity.name} using ${scraper.slug}: ${error.message}`);

@@ -195,22 +201,31 @@ async function scrapeChannel(channelEntity, accNetworkReleases) {
}

async function scrapeNetworkSequential(networkEntity) {
	return Promise.reduce(
	const releases = await Promise.reduce(
		networkEntity.children,
		async (chain, channelEntity) => {
			const accNetworkReleases = await chain;
			const channelReleases = await scrapeChannel(channelEntity, accNetworkReleases);
			const { uniqueReleases, duplicateReleases } = await scrapeChannel(channelEntity, accNetworkReleases);

			return accNetworkReleases.concat(channelReleases);
			return {
				uniqueReleases: accNetworkReleases.uniqueReleases.concat(uniqueReleases),
				duplicateReleases: accNetworkReleases.duplicateReleases.concat(duplicateReleases),
			};
		},
		Promise.resolve([]),
		Promise.resolve(emptyReleases),
	);

	return releases.uniqueReleases;
}

async function scrapeNetworkParallel(networkEntity) {
	return Promise.map(
		networkEntity.children,
		async channelEntity => scrapeChannel(channelEntity, networkEntity),
		async (channelEntity) => {
			const { uniqueReleases } = await scrapeChannel(channelEntity, networkEntity);

			return uniqueReleases;
		},
		{ concurrency: 3 },
	);
}