Refactored In The Crack. Added chapter videos (unused) and dates. Added stylized entity name field.

2026-02-07 05:53:16 +01:00 · 2026-02-07 05:53:16 +01:00 · 9a8527a780
parent 33179c0829
commit 9a8527a780
6 changed files with 249 additions and 211 deletions
--- a/migrations/20260207034922_chapter_details.js
+++ b/migrations/20260207034922_chapter_details.js
@ -0,0 +1,50 @@
+exports.up = async function(knex) { // adds chapters.date, the chapters_trailers and chapters_teasers link tables, and entities.name_stylized
+	await knex.schema.alterTable('chapters', (table) => {
+		table.datetime('date'); // no notNullable(), so the column accepts NULL
+	});
+
+	await knex.schema.createTable('chapters_trailers', (table) => { // links a chapter to a media row; deleting either side cascades to the link
+		table.integer('chapter_id')
+			.notNullable()
+			.references('id')
+			.inTable('chapters')
+			.onDelete('cascade');
+
+		table.text('media_id')
+			.notNullable()
+			.references('id')
+			.inTable('media')
+			.onDelete('cascade');
+	});
+
+	await knex.schema.createTable('chapters_teasers', (table) => { // same shape as chapters_trailers; NOTE(review): neither table declares a primary key or unique constraint - confirm duplicate links are acceptable
+		table.integer('chapter_id')
+			.notNullable()
+			.references('id')
+			.inTable('chapters')
+			.onDelete('cascade');
+
+		table.text('media_id')
+			.notNullable()
+			.references('id')
+			.inTable('media')
+			.onDelete('cascade');
+	});
+
+	await knex.schema.alterTable('entities', (table) => {
+		table.string('name_stylized'); // populated from each site's `style` value in seeds/02_sites.js
+	});
+};
+
+exports.down = async function(knex) { // undoes up(): drops both added columns and both link tables
+	await knex.schema.alterTable('chapters', (table) => {
+		table.dropColumn('date');
+	});
+
+	await knex.schema.alterTable('entities', (table) => {
+		table.dropColumn('name_stylized');
+	});
+
+	await knex.schema.dropTable('chapters_trailers');
+	await knex.schema.dropTable('chapters_teasers');
+};
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@ -5753,6 +5753,7 @@ const sites = [
 	{
 		slug: 'inthecrack',
 		name: 'In The Crack',
+		style: 'InTheCrack',
 		url: 'https://inthecrack.com',
 	},
 	// TODO: INDIE https://nats.indiebucks.com/external.php?page=sites
@ -15502,6 +15503,7 @@ exports.seed = (knex) => Promise.resolve()
 		const sitesWithNetworks = sites.filter((site) => !site.delete).map((site) => ({
 			slug: site.slug,
 			name: site.name,
+			name_stylized: site.style,
 			type: site.type || 'channel',
 			alias: site.alias,
 			description: site.description,
--- a/src/deep.js
+++ b/src/deep.js
@ -212,9 +212,10 @@ async function scrapeRelease(baseRelease, entitiesByHostname, type = 'scene') {
 				...[].concat(curatedScrapedRelease.poster),
 				...[].concat(baseRelease.poster),
 			])).filter(Boolean),
-			photos: curatedScrapedRelease.photos?.length > 0
-				? curatedScrapedRelease.photos
-				: baseRelease.photos,
+			photos: [
+				...curatedScrapedRelease.photos || [],
+				...baseRelease.photos || [],
+			],
 			deep: !!scrapedRelease,
 			entity,
 		};
@ -267,7 +268,7 @@ async function scrapeReleases(baseReleases, entitiesByHostname, type) {

 async function fetchReleases(baseReleasesOrUrls, type = 'scene') {
 	const baseReleases = toBaseReleases(baseReleasesOrUrls);
-	const entitiesByHostname = await fetchReleaseEntities(baseReleases);
+	const entitiesByHostname = await fetchReleaseEntities(baseReleases, { appendBySlug: false });

 	const deepReleases = await scrapeReleases(baseReleases, entitiesByHostname, type);

--- a/src/entities.js
+++ b/src/entities.js
@ -219,7 +219,7 @@ async function fetchIncludedEntities() {
 	return curatedNetworks;
 }

-async function fetchEntitiesBySlug(entitySlugs, options = { prefer: 'channel' }) {
+async function fetchEntitiesBySlug(entitySlugs, options = { prefer: 'channel', appendBySlug: true }) {
 	const entities = await knex.raw(`
 		WITH RECURSIVE entity_tree as (
 			SELECT to_jsonb(entities) as entity,
@ -267,7 +267,7 @@ async function fetchEntitiesBySlug(entitySlugs, options = { prefer: 'channel' })

 		return {
 			...accEntities,
-			[entity.slug]: curatedEntity,
+			...(options.appendBySlug !== false ? { [entity.slug]: curatedEntity } : null),
 			[host]: curatedEntity,
 		};
 	}, {});
--- a/src/scrapers/inthecrack.js
+++ b/src/scrapers/inthecrack.js
@ -1,263 +1,244 @@
 'use strict';

-const moment = require('moment');
+const unprint = require('unprint');

-const qu = require('../utils/q');
 const slugify = require('../utils/slugify');
-const { feetInchesToCm, lbsToKg } = require('../utils/convert');

-function scrapeAll(scenes, channel) {
-	return scenes.map(({ query }) => {
-		const release = {};
+function scrapeProfile(model, channel) {
+	const profile = {};

-		release.url = query.url('a', 'href', { origin: channel.url });
-		// release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1]; can't be matched with upcoming scenes
+	profile.name = model.name; // used by shallow scrape
+	profile.entryId = model.id;

-		release.shootId = query.cnt('a span:nth-of-type(1)').match(/^\d+/)?.[0];
-		release.entryId = release.shootId;
+	profile.dateOfBirth = unprint.extractDate(model.birthdate, 'YYYY-MM-DD');

-		release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD');
-		release.actors = (query.q('a img', 'alt') || query.cnt('a span:nth-of-type(1)'))?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
-
-		release.poster = release.shootId
-			? `https://inthecrack.com/assets/images/posters/collections/${release.shootId}.jpg`
-			: query.img('a img', 'src', { origin: channel.url });
-
-		return release;
-	});
+	profile.birthPlace = model.countries?.map((country) => {
+		if (country.name) {
+			return country.name;
 		}

-function scrapeUpcoming(scenes, channel) {
-	return scenes.map(({ query }) => {
-		const release = {};
-
-		const title = query.cnt('span');
-
-		release.entryId = title.match(/^\d+/)[0];
-		release.actors = title.slice(0, title.indexOf('-')).match(/[a-zA-Z]+(\s[a-zA-Z]+)*/g);
-
-		const date = moment.utc(title.match(/\w+ \d+\w+$/)[0], 'MMM Do');
-
-		if (date.isBefore()) {
-			// date is next year
-			release.date = date.add(1, 'year').toDate();
-		} else {
-			release.date = date.toDate();
+		if (country.isO2 || country.iso2) { // sic
+			return country.isO2 || country.iso2;
 		}

-		release.poster = [
-			`https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`,
-			query.img('img', 'src', { origin: channel.url }),
-		];
-
-		return release;
-	});
+		if (typeof country === 'string') {
+			return country;
 		}

-function scrapeProfileScenes(items, actorName, channel) {
-	return items.map(({ query }) => {
-		const release = {};
+		return null;
+	}).filter(Boolean)[0];

-		if (slugify(query.cnt()) === 'no-other-collections') {
+	profile.height = model.height;
+	profile.weight = model.weight;
+
+	const ethnicity = model.ethnicity?.title || model.ethnicity;
+
+	if (!/none/i.test(ethnicity)) {
+		profile.ethnicity = ethnicity;
+	}
+
+	if (model.id) {
+		profile.url = `${channel.origin}/modelcollections/${model.id}`;
+	}
+
+	return profile;
+}
+
+function mergeModels(sceneModels, models, channel) {
+	if (!Array.isArray(sceneModels) || !models) {
+		return [];
+	}
+
+	return sceneModels.map((modelId) => {
+		const model = models[modelId?.id || modelId];
+
+		if (!model) {
 			return null;
 		}

-		const details = query.cnts('figure p').reduce((acc, info) => {
-			const [key, value] = info.split(':');
-
-			return {
-				...acc,
-				[slugify(key, '_')]: value?.trim(),
-			};
-		}, {});
-
-		release.url = query.url('a', 'href', { origin: channel.url });
-
-		release.shootId = details.collection.match(/\d+/)[0];
-		release.entryId = release.shootId;
-
-		release.date = qu.parseDate(details.release_date, 'YYYY-MM-DD');
-		release.actors = [actorName];
-
-		/* rely on clip length
-		const durationString = Object.keys(details).find(info => /\d+_min_video/.test(info));
-		release.duration = durationString && Number(durationString.match(/^\d+/)?.[0]) * 60;
-		*/
-
-		release.productionLocation = details.shoot_location;
-
-		release.poster = [
-			`https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`,
-			query.img('img', 'src', { origin: channel.url }),
-		];
-
-		return release;
+		return scrapeProfile(model, channel);
 	}).filter(Boolean);
 }

-function scrapeProfile({ query }, actorName, actorAvatar, channel, releasesFromScene) {
-	const profile = {};
-
-	const bio = query.cnts(releasesFromScene ? 'ul li' : 'div.modelInfo li').reduce((acc, info) => {
-		const [key, value] = info.split(':');
-
-		return {
-			...acc,
-			[slugify(key, '_')]: value.trim(),
-		};
-	}, {});
-
-	profile.name = actorName || bio.name;
-	profile.gender = 'female';
-	profile.birthPlace = bio.nationality;
-
-	if (bio.height) profile.height = feetInchesToCm(bio.height);
-	if (bio.weight) profile.weight = lbsToKg(bio.weight);
-
-	profile.releases = releasesFromScene?.[profile.name] || scrapeProfileScenes(qu.initAll(query.all('.Models li')), actorName, channel);
-
-	// avatar is the poster of a scene, find scene and use its high quality poster instead
-	const avatarRelease = profile.releases.find((release) => new URL(release.poster[1]).pathname === new URL(actorAvatar).pathname);
-	profile.avatar = avatarRelease?.poster[0];
-
-	return profile;
-}
-
-async function fetchSceneActors(entryId, _release, channel) {
-	const url = `https://inthecrack.com/Collection/Biography/${entryId}`;
-	const res = await qu.get(url);
-
-	if (res.ok) {
-		const actorTabs = qu.initAll(res.item.query.all('#ModelTabs li')).map(({ query }) => ({
-			name: query.cnt('a'),
-			id: query.q('a', 'data-model'),
-		}));
-
-		const actorReleasesByActorName = actorTabs.reduce((acc, { name, id }) => {
-			const releaseEls = qu.initAll(res.item.query.all(`#Model-${id} li`));
-			const releases = scrapeProfileScenes(releaseEls, name, channel);
-
-			return {
-				...acc,
-				[name]: releases,
-			};
-		}, {});
-
-		const actors = qu.initAll(res.item.query.all('.modelInfo > li')).map((item) => {
-			const avatar = item.query.img('img', 'src', { origin: channel.url });
-			const profile = scrapeProfile(item, null, avatar, channel, actorReleasesByActorName);
-
-			return profile;
-		});
-
-		return actors;
-	}
-
-	return null;
-}
-
-async function scrapeScene({ query, html }, url, channel) {
+function scrapeAll(scenes, channel, models = {}, isUpcoming = false) {
+	return scenes.map((scene) => {
 		const release = {};

-	const entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1];
+		release.entryId = scene.id;
+		release.shootId = scene.id;

-	release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0];
-	release.entryId = release.shootId; // site entry ID can't be matched with upcoming scenes
+		release.title = scene.title;
+		release.date = unprint.extractDate(scene.releaseDate, 'YYYY-MM-DD');

-	const actors = await fetchSceneActors(entryId, release, channel);
-	release.actors = actors || query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g);
+		release.poster = `https://api.inthecrack.com/image/resize/images/posters/collections/${scene.id}.jpg?w=1400`;

-	release.description = query.cnt('p#CollectionDescription');
-	release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: (.*)/)?.[1];
-
-	release.poster = qu.prefixUrl(html.match(/background-image: url\('(.*)'\)/)?.[1], channel.url);
-
-	release.chapters = query.all('.ClipOuter').map((el) => {
-		const chapter = {};
-
-		chapter.title = query.text(el, 'h4');
-		chapter.description = query.cnt(el, 'p');
-		chapter.duration = query.dur(el, '.InlineDuration');
-
-		const posterStyle = query.style(el, '.clipImage', 'background-image');
-		const poster = qu.prefixUrl(posterStyle.match(/url\((.*)\)/)?.[1], channel.url);
-
-		if (poster) {
-			const { origin, pathname } = new URL(poster);
-
-			chapter.poster = [
-				`${origin}${pathname}`, // full size
-				poster,
-			];
-		}
-
-		if (query.exists(el, '.ThreeDInfo')) {
-			chapter.tags = ['3d'];
-		}
-
-		return chapter;
-	});
+		// the coming-soon photo remains available after the release date
+		release.photos = [`https://api.inthecrack.com/FileStore/images/coming_soon/${scene.id}.jpg`];

+		if (isUpcoming) {
 			return release;
 		}

-async function fetchLatest(channel, page = 1) {
-	const year = moment().subtract(page - 1, ' year').year();
+		release.url = `${channel.origin}/collection/${scene.id}`;

-	const url = `${channel.url}/Collections/Date/${year}`;
-	const res = await qu.getAll(url, '.collectionGridLayout li');
+		release.duration = scene.clipMinutesTotal * 60 || null;
+		release.actors = mergeModels(scene.models, models, channel);
+
+		release.productionDate = unprint.extractDate(scene.shootDate, 'YYYY-MM-DD');
+		release.photoCount = scene.picTotal;
+
+		release.productionLocation = scene.shootLocation;
+
+		return release;
+	});
+}
+
+async function fetchLatest(channel, page = 1, context) { // page defaults to 1 as it did before the refactor; an omitted page would otherwise slice NaN..NaN and return nothing
+	const res = await unprint.get('https://api.inthecrack.com/Collection/');

 	if (res.ok) {
-		return scrapeAll(res.items, channel);
+		// the API returns every collection in one response; emulate pages of 100 items, a page past the end yields an empty list. without a context, scrapeAll falls back to its empty model map
+		return scrapeAll(res.data.slice((page - 1) * 100, page * 100), channel, context?.beforeFetchLatest);
 	}

 	return res.status;
 }

 async function fetchUpcoming(channel) {
-	const res = await qu.getAll(channel.url, '#ComingSoon li');
+	const res = await unprint.get('https://api.inthecrack.com/Home/coming_soon');

 	if (res.ok) {
-		return scrapeUpcoming(res.items, channel);
+		// the whole response is passed on unpaginated; isUpcoming makes scrapeAll return before it merges models, so no model map is passed
+		return scrapeAll(res.data, channel, null, true);
 	}

 	return res.status;
 }

-async function fetchScene(url, channel) {
-	const res = await qu.get(url);
+const qualityMap = { // keyed by the API's videoResolutionId; values are presumably the video height in pixels
+	// unsure about 2 and 5; IDs missing here are dropped by the .filter(Boolean) in scrapeScene
+	1: 360,
+	3: 720,
+	4: 1080,
+	6: 2160,
+};
+
+function scrapeScene(scene, channel, baseRelease, models = {}) { // maps one API collection object to a release; models is presumably the ID-keyed map built by fetchModels
+	const release = {};
+
+	release.entryId = scene.id;
+	release.shootId = scene.id;
+
+	release.url = `${channel.origin}/collection/${scene.id}`;
+
+	release.title = scene.title;
+	release.description = scene.description;
+
+	release.actors = mergeModels(scene.models, models, channel);
+
+	release.productionDate = unprint.extractDate(scene.shootDate, 'YYYY-MM-DD');
+	release.productionLocation = scene.shootLocation;
+
+	release.poster = `https://api.inthecrack.com/image/resize/images/posters/collections/${scene.id}.jpg?w=1400`;
+
+	release.photos = scene.galleryImages
+		?.filter((image) => image.imageType === 1) // type 1 and 2 are dupes as far as thumbs are concerned
+		.slice(0, 15) // only first 15 photos have a free thumb
+		.map((image) => image.filename && `https://api.inthecrack.com/FileStore/images/gallerysamples/${scene.id}/${image.filename}`).filter(Boolean);
+
+	release.chapters = scene.clips?.map((clip) => ({
+		entryId: clip.id,
+		title: clip.title,
+		description: clip.description,
+		date: unprint.extractDate(clip.releaseDate, 'YYYY-MM-DD'),
+		duration: clip.length,
+		// the site itself derives clip thumbnails from the first video's filename minus its WxH.mp4 suffix; no valid image is returned without ?w. null when the clip has no video or the filename has no such suffix
+		poster: ((name) => (name ? `https://api.inthecrack.com/image/resize/images/posters/clips/${name}.jpg?w=1400` : null))(clip.videos?.[0]?.filename?.match(/^(.*?)(?=\d+x\d+\.mp4)/)?.[0]),
+	}));
+
+	release.qualities = scene.clips?.[0]?.videos?.map((video) => qualityMap[video.videoResolutionId]).filter(Boolean);
+
+	if (!baseRelease?.date) {
+		// base release has 'official' release date, deep data only has chapter dates
+		// though, this is probably how they calculate the collection date, too
+		release.date = release.chapters
+			?.map((chapter) => chapter.date)
+			.filter(Boolean)
+			.toSorted((dateA, dateB) => dateA - dateB)[0];
+	}
+
+	return release;
+}
+
+async function fetchScene(url, channel, baseRelease, context) { // resolves the collection ID from a site URL and scrapes the API record for it
+	const entryId = new URL(url).pathname.match(/\/collection\/(\d+)/i)?.[1]; // case-insensitive: pre-refactor URLs used /Collection/<id>
+
+	if (!entryId) {
+		return null;
+	}
+
+	const res = await unprint.get(`https://api.inthecrack.com/Collection/${entryId}`);

 	if (res.ok) {
-		return scrapeScene(res.item, url, channel);
+		return scrapeScene(res.data, channel, baseRelease, context?.beforeFetchScenes);
 	}

 	return res.status;
 }

-async function fetchProfile({ name: actorName }, channel, _include) {
-	const firstLetter = actorName.charAt(0).toUpperCase();
-	const url = `${channel.url}/Collections/Name/${firstLetter}`;
-	const res = await qu.getAll(url, '.collectionGridLayout li');
+async function fetchModels() {
+	const res = await unprint.get('https://api.inthecrack.com/Model/');

 	if (res.ok) {
-		const actorItem = res.items.find(({ query }) => slugify(query.cnt('span')) === slugify(actorName));
+		try {
+			const modelsById = Object.fromEntries(res.data.map((model) => [model.id, model]));

-		if (actorItem) {
-			const actorUrl = actorItem.query.url('a', 'href', { origin: channel.url });
-			const actorAvatar = actorItem.query.img('img', 'src', { origin: channel.url });
-			const actorRes = await qu.get(actorUrl);
-
-			if (actorRes.ok) {
-				return scrapeProfile(actorRes.item, actorName, actorAvatar, channel);
+			return modelsById;
+		} catch (error) {
+			// we can continue, we just won't have model names
+		}
 	}

-			return actorRes.status;
+	return {};
+}
+
+async function getModelId(actor) { // resolves a model ID from the actor's entryId, then its profile URL, then a name lookup in the full model list
+	if (actor.entryId) {
+		return actor.entryId;
+	}
+
+	if (actor.url) {
+		const modelId = new URL(actor.url).pathname.match(/\/modelcollections?\/(\d+)/i)?.[1]; // scrapeProfile writes /modelcollections/<id> (plural); the singular form is still accepted
+
+		if (modelId) {
+			return modelId;
+		}
+	}
+
+	const modelsById = await fetchModels();
+	const model = Object.values(modelsById).find((searchModel) => slugify(searchModel.name) === slugify(actor.name));
+
+	if (model) {
+		return model.id;
 	}

 	return null;
 }

-	return res.status;
+async function fetchProfile(actor, channel) { // returns a scraped profile, null when no model ID can be resolved, or the HTTP status when the request fails
+	const modelId = await getModelId(actor);
+
+	if (!modelId) {
+		return null;
+	}
+
+	const res = await unprint.get(`https://api.inthecrack.com/Model/${modelId}`);
+
+	if (res.ok) {
+		return scrapeProfile(res.data, channel);
+	}
+
+	return res.status; // same failure convention as fetchLatest, fetchUpcoming and fetchScene
 }

 module.exports = {
@ -265,4 +246,6 @@ module.exports = {
 	fetchUpcoming,
 	fetchScene,
 	fetchProfile,
+	beforeFetchLatest: fetchModels,
+	beforeFetchScenes: fetchModels,
 };
--- a/src/store-releases.js
+++ b/src/store-releases.js
@ -253,6 +253,7 @@ async function storeChapters(releases) {
 			releaseId: release.id,
 			index: index + 1,
 			time: chapter.time,
+			date: chapter.date,
 			duration: chapter.duration,
 			title: chapter.title,
 			description: chapter.description,
@ -268,6 +269,7 @@ async function storeChapters(releases) {
 		index: chapter.index,
 		time: chapter.time,
 		duration: chapter.duration,
+		date: chapter.date,
 		title: chapter.title,
 		description: chapter.description,
 		release_id: chapter.releaseId,