Added upcoming, profile and detailed scene actor scraping to InTheCrack. Fixed clip upsert.

2020-08-20 23:35:18 +02:00 · 2020-08-20 23:35:18 +02:00 · 4ec89e2cc8
parent 552e6da392
commit 4ec89e2cc8
16 changed files with 188 additions and 35 deletions
--- a/assets/components/releases/clips.vue
+++ b/assets/components/releases/clips.vue
@ -86,12 +86,14 @@ export default {
 }

 .clip-duration {
+	background: var(--darken);
 	color: var(--text-light);
 	display: block;
 	position: absolute;
-	bottom: 0;
-	left: 0;
-	padding: .5rem .5rem .75rem 1rem;
+	top: 0;
+	right: 0;
+	padding: .25rem .5rem;
+	font-size: .9rem;
 	font-weight: bold;
 	text-shadow: 0 0 2px var(--darken-strong);
 }
--- a/assets/components/releases/details.vue
+++ b/assets/components/releases/details.vue
@ -3,7 +3,6 @@
 		<div class="column">
 			<div class="tidbits">
 				<a
-					v-if="release.date"
 					:title="release.url && `View scene on ${release.entity.name}`"
 					:href="release.url"
 					:class="{ link: release.url }"
@ -11,8 +10,8 @@
 					rel="noopener noreferrer"
 					class="tidbit date nolink"
 				>
-					<span class="date-compact">{{ formatDate(release.date, 'MMM D, YYYY', release.datePrecision) }}</span>
-					<span class="date-full">{{ formatDate(release.date, 'MMMM D, YYYY', release.datePrecision) }}</span>
+					<span class="date-compact">{{ release.date ? formatDate(release.date, 'MMM D, YYYY', release.datePrecision) : 'Date N/A' }}</span>
+					<span class="date-full">{{ release.date ? formatDate(release.date, 'MMMM D, YYYY', release.datePrecision) : 'Date unknown' }}</span>

 					<Icon
 						v-if="release.url"
--- a/assets/components/tags/tags.vue
+++ b/assets/components/tags/tags.vue
@ -103,6 +103,7 @@ async function mounted() {
 			'femdom',
 		],
 		toys: [
+			'anal-toys',
 			'double-dildo',
 			'double-dildo-blowjob',
 		],
--- a/public/img/tags/anal-toys/0.jpeg
+++ b/public/img/tags/anal-toys/0.jpeg
--- a/public/img/tags/anal-toys/lazy/0.jpeg
+++ b/public/img/tags/anal-toys/lazy/0.jpeg
--- a/public/img/tags/anal-toys/thumbs/0.jpeg
+++ b/public/img/tags/anal-toys/thumbs/0.jpeg
--- a/public/img/tags/double-dildo/0.jpeg
+++ b/public/img/tags/double-dildo/0.jpeg
--- a/public/img/tags/double-dildo/lazy/0.jpeg
+++ b/public/img/tags/double-dildo/lazy/0.jpeg
--- a/public/img/tags/double-dildo/thumbs/0.jpeg
+++ b/public/img/tags/double-dildo/thumbs/0.jpeg
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@ -2649,7 +2649,7 @@ const sites = [
 	{
 		slug: 'inthecrack',
 		name: 'InTheCrack',
-		url: 'https://inthecrack.com/',
+		url: 'https://inthecrack.com',
 	},
 	// INTERRACIAL PASS
 	{
--- a/seeds/04_media.js
+++ b/seeds/04_media.js
@ -589,6 +589,7 @@ const tagPosters = [
 	['airtight', 6, 'Remy Lacroix in "Ass Worship 14" for Jules Jordan'],
 	['anal', 0, 'Adriana Chechik in "Manuel Creampies Their Asses 3" for Jules Jordan'],
 	['anal-creampie', 1, 'Aleska Diamond in "Aleska Wants More" for Asshole Fever'],
+	['anal-toys', 0, 'Kira Noir in 1225 for InTheCrack'],
 	['ass-eating', 0, 'Angelica Heart and Leanna Sweet in "ATM Bitches" for Asshole Fever'],
 	['asian', 0, 'Jade Kush for Erotica X'],
 	['atm', 2, 'Jureka Del Mar in "Stretched Out" for Her Limit'],
--- a/src/argv.js
+++ b/src/argv.js
@ -67,6 +67,7 @@ const { argv } = yargs
 		describe: 'Fetch all scenes for an actor',
 		type: 'boolean',
 		default: false,
+		alias: 'actor-scenes',
 	})
 	.option('actors-sources', {
 		describe: 'Use these scrapers for actor data',
--- a/src/deep.js
+++ b/src/deep.js
@ -135,6 +135,10 @@ async function scrapeRelease(baseRelease, entities, type = 'scene') {
 	} catch (error) {
 		logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`);

+		if (argv.debug) {
+			console.error(error);
+		}
+
 		if (error.code === 'NO_ENTRY_ID') {
 			return null;
 		}
--- a/src/scrapers/inthecrack.js
+++ b/src/scrapers/inthecrack.js
@ -4,17 +4,19 @@ const moment = require('moment');

 const qu = require('../utils/q');
 const slugify = require('../utils/slugify');
+const { feetInchesToCm, lbsToKg } = require('../utils/convert');

 function scrapeAll(scenes, channel) {
 	return scenes.map(({ query }) => {
 		const release = {};

 		release.url = query.url('a', 'href', { origin: channel.url });
-		release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1];
+		// release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1]; can't be matched with upcoming scenes

 		release.shootId = query.cnt('a span:nth-of-type(1)').match(/^\d+/)?.[0];
-		release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD');
+		release.entryId = release.shootId;

+		release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD');
 		release.actors = (query.q('a img', 'alt') || query.cnt('a span:nth-of-type(1)'))?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);

 		release.poster = release.shootId
@ -25,13 +27,145 @@ function scrapeAll(scenes, channel) {
 	});
 }

-function scrapeScene({ query, html }, url, channel) {
+function scrapeUpcoming(scenes, channel) { // builds one release object per item in `scenes`, parsed from the item's <span> text
+	return scenes.map(({ query }) => {
+		const release = {};
+
+		const title = query.cnt('span'); // presumably the text content of the item's <span>; every field below is parsed out of it
+
+		release.entryId = title.match(/^\d+/)[0]; // leading digits of the title; throws when the title does not start with a digit
+		release.actors = title.slice(0, title.indexOf('-')).match(/[a-zA-Z]+(\s[a-zA-Z]+)*/g); // names are the letter runs before the first '-'; NOTE(review): with no '-' the slice only drops the last character
+
+		const date = moment.utc(title.match(/\w+ \d+\w+$/)[0], 'MMM Do'); // trailing "<word> <day>" of the title; the format has no year, so moment presumably fills in the current one
+
+		if (date.isBefore()) { // called without an argument; presumably compares against the current time
+			// the parsed date already lies in the past, so the announcement is taken to refer to next year
+			release.date = date.add(1, 'year').toDate();
+		} else {
+			release.date = date.toDate();
+		}
+
+		release.poster = [ // two candidates; presumably tried in order by the consumer — TODO confirm
+			`https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`, // URL derived from the entry ID
+			query.img('img', 'src', { origin: channel.url }), // the item's own <img> src, with channel.url passed as origin
+		];
+
+		return release;
+	});
+}
+
+function scrapeProfileScenes(items, actorName, channel) { // builds one release per item for a single actor; items mapped to null are dropped by the final filter
+	return items.map(({ query }) => {
+		const release = {};
+
+		if (slugify(query.cnt()) === 'no-other-collections') { // skip the item whose whole text slugifies to this placeholder
+			return null;
+		}
+
+		const details = query.cnts('figure p').reduce((acc, info) => { // folds the 'figure p' texts into a key/value object
+			const [key, value] = info.split(':'); // only the first two segments are kept; anything after a second ':' is discarded
+
+			return {
+				...acc,
+				[slugify(key, '_')]: value?.trim(), // presumably '_' is the slug delimiter; value is undefined when the text has no ':'
+			};
+		}, {});
+
+		release.url = query.url('a', 'href', { origin: channel.url });
+
+		release.shootId = details.collection.match(/\d+/)[0]; // first digit run of the `collection` detail; NOTE(review): throws when that key is absent or has no digits
+		release.entryId = release.shootId; // entry ID is set to the shoot ID
+
+		release.date = qu.parseDate(details.release_date, 'YYYY-MM-DD');
+		release.actors = [actorName]; // only the actor this list was scraped for is recorded
+
+		/* rely on clip length
+		const durationString = Object.keys(details).find(info => /\d+_min_video/.test(info));
+		release.duration = durationString && Number(durationString.match(/^\d+/)?.[0]) * 60;
+		*/
+
+		release.productionLocation = details.shoot_location; // undefined when the item has no such detail
+
+		release.poster = [ // two candidates; presumably tried in order by the consumer — TODO confirm
+			`https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`, // URL derived from the entry ID
+			query.img('img', 'src', { origin: channel.url }), // the item's own <img> src, with channel.url passed as origin
+		];
+
+		return release;
+	}).filter(Boolean); // removes the nulls returned for placeholder items
+}
+
+function scrapeProfile({ query }, actorName, actorAvatar, channel, releasesFromScene) { // builds an actor profile; releasesFromScene is an optional object of release lists keyed by actor name
+	const profile = {};
+
+	const bio = query.cnts(releasesFromScene ? 'ul li' : 'div.modelInfo li').reduce((acc, info) => { // selector depends on whether releasesFromScene was passed
+		const [key, value] = info.split(':'); // only the first two segments are kept
+
+		return {
+			...acc,
+			[slugify(key, '_')]: value.trim(), // NOTE(review): no optional chaining here, unlike scrapeProfileScenes; an entry without ':' makes this throw
+		};
+	}, {});
+
+	profile.name = actorName || bio.name; // falls back to the scraped name when no actorName is given (fetchSceneActors passes null)
+	profile.gender = 'female'; // hard-coded for every profile from this scraper
+	profile.birthPlace = bio.nationality; // the `nationality` bio entry is stored as birth place
+
+	if (bio.height) profile.height = feetInchesToCm(bio.height); // converted only when the bio has a height entry
+	if (bio.weight) profile.weight = lbsToKg(bio.weight); // converted only when the bio has a weight entry
+
+	profile.releases = releasesFromScene?.[profile.name] || scrapeProfileScenes(qu.initAll(query.all('.Models li')), actorName, channel); // prefer releases supplied by the caller; the fallback passes actorName, not profile.name
+
+	// avatar is the poster of a scene, find scene and use its high quality poster instead
+	const avatarRelease = profile.releases.find(release => new URL(release.poster[1]).pathname === new URL(actorAvatar).pathname); // NOTE(review): new URL throws if poster[1] or actorAvatar is missing — confirm query.img never returns null here
+	profile.avatar = avatarRelease?.poster[0]; // undefined when no release poster matches the avatar path
+
+	return profile;
+}
+
+async function fetchSceneActors(entryId, _release, channel) { // resolves to an array of actor profiles for one collection, or null when the response is not ok; _release is not read in this body
+	const url = `https://inthecrack.com/Collection/Biography/${entryId}`; // host is hard-coded rather than taken from channel.url
+	const res = await qu.get(url);
+
+	if (res.ok) {
+		const actorTabs = qu.initAll(res.item.query.all('#ModelTabs li')).map(({ query }) => ({ // one { name, id } pair per '#ModelTabs li' element
+			name: query.cnt('a'),
+			id: query.q('a', 'data-model'), // used below to build the matching '#Model-<id>' selector
+		}));
+
+		const actorReleasesByActorName = actorTabs.reduce((acc, { name, id }) => { // object of release lists keyed by the tab's actor name
+			const releaseEls = qu.initAll(res.item.query.all(`#Model-${id} li`));
+			const releases = scrapeProfileScenes(releaseEls, name, channel);
+
+			return {
+				...acc,
+				[name]: releases, // a later tab with the same name overwrites an earlier one
+			};
+		}, {});
+
+		const actors = qu.initAll(res.item.query.all('.modelInfo > li')).map((item) => { // one profile per '.modelInfo > li' element
+			const avatar = item.query.img('img', 'src', { origin: channel.url });
+			const profile = scrapeProfile(item, null, avatar, channel, actorReleasesByActorName); // null actorName makes scrapeProfile take the name from the scraped bio
+
+			return profile;
+		});
+
+		return actors;
+	}
+
+	return null; // request failed; the status code is not propagated
+}
+
+async function scrapeScene({ query, html }, url, channel) {
 	const release = {};

-	release.entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1];
-	release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0];
+	const entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1];

-	release.actors = query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
+	release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0];
+	release.entryId = release.shootId; // site entry ID can't be matched with upcoming scenes
+
+	const actors = await fetchSceneActors(entryId, release, channel);
+	release.actors = actors || query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);

 	release.description = query.cnt('p#CollectionDescription');
 	release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: (.*)/)?.[1];
@ -67,22 +201,6 @@ function scrapeScene({ query, html }, url, channel) {
 	return release;
 }

-function scrapeProfile({ query, el }, actorName, entity, include) {
-	const profile = {};
-
-	profile.description = query.cnt('.bio-text');
-	profile.birthPlace = query.cnt('.birth-place span');
-
-	profile.avatar = query.img('.actor-photo img');
-
-	if (include.releases) {
-		return scrapeAll(qu.initAll(el, '.scene'));
-	}
-
-	console.log(profile);
-	return profile;
-}
-
 async function fetchLatest(channel, page = 1) {
 	const year = moment().subtract(page - 1, ' year').year();

@ -96,6 +214,16 @@ async function fetchLatest(channel, page = 1) {
 	return res.status;
 }

+async function fetchUpcoming(channel) { // resolves to the scraped upcoming releases, or to res.status when the request is not ok
+	const res = await qu.getAll(channel.url, '#ComingSoon li'); // requests channel.url with the '#ComingSoon li' selector
+
+	if (res.ok) {
+		return scrapeUpcoming(res.items, channel); // one release per item in res.items
+	}
+
+	return res.status; // failure is reported as the status value rather than thrown
+}
+
 async function fetchScene(url, channel) {
 	const res = await qu.get(url);

@ -106,12 +234,27 @@ async function fetchScene(url, channel) {
 	return res.status;
 }

-async function fetchProfile({ name: actorName }, entity, include) {
-	const url = `${entity.url}/actors/${slugify(actorName, '_')}`;
-	const res = await qu.get(url);
+async function fetchProfile({ name: actorName }, channel, _include) {
+	const firstLetter = actorName.charAt(0).toUpperCase();
+	const url = `${channel.url}/Collections/Name/${firstLetter}`;
+	const res = await qu.getAll(url, '.collectionGridLayout li');

 	if (res.ok) {
-		return scrapeProfile(res.item, actorName, entity, include);
+		const actorItem = res.items.find(({ query }) => slugify(query.cnt('span')) === slugify(actorName));
+
+		if (actorItem) {
+			const actorUrl = actorItem.query.url('a', 'href', { origin: channel.url });
+			const actorAvatar = actorItem.query.img('img', 'src', { origin: channel.url });
+			const actorRes = await qu.get(actorUrl);
+
+			if (actorRes.ok) {
+				return scrapeProfile(actorRes.item, actorName, actorAvatar, channel);
+			}
+
+			return actorRes.status;
+		}
+
+		return null;
 	}

 	return res.status;
@ -119,6 +262,7 @@ async function fetchProfile({ name: actorName }, entity, include) {

 module.exports = {
 	fetchLatest,
+	fetchUpcoming,
 	fetchScene,
-	// fetchProfile,
+	fetchProfile,
 };
--- a/src/scrapers/scrapers.js
+++ b/src/scrapers/scrapers.js
@ -197,6 +197,7 @@ module.exports = {
 		iconmale,
 		interracialpass: hush,
 		interracialpovs: hush,
+		inthecrack,
 		jamesdeen: fullpornnetwork,
 		julesjordan,
 		kellymadison,
--- a/src/store-releases.js
+++ b/src/store-releases.js
@ -263,7 +263,7 @@ async function storeClips(releases) {
 		clip: clip.clip,
 	}));

-	const storedClips = await bulkInsert('clips', curatedClipEntries);
+	const storedClips = await bulkInsert('clips', curatedClipEntries, ['release_id', 'clip']);
 	const clipIdsByReleaseIdAndClip = storedClips.reduce((acc, clip) => ({
 		...acc,
 		[clip.release_id]: {