From 9a8527a780c94e60da7c163b60ce8cbb94bbf83b Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sat, 7 Feb 2026 05:53:16 +0100 Subject: [PATCH] Refactored In The Crack. Added chapter videos (unused) and dates. Added stylized entity name field. --- migrations/20260207034922_chapter_details.js | 50 +++ seeds/02_sites.js | 4 +- src/deep.js | 9 +- src/entities.js | 4 +- src/scrapers/inthecrack.js | 391 +++++++++---------- src/store-releases.js | 2 + 6 files changed, 249 insertions(+), 211 deletions(-) create mode 100644 migrations/20260207034922_chapter_details.js diff --git a/migrations/20260207034922_chapter_details.js b/migrations/20260207034922_chapter_details.js new file mode 100644 index 00000000..8521ff8c --- /dev/null +++ b/migrations/20260207034922_chapter_details.js @@ -0,0 +1,50 @@ +exports.up = async function(knex) { + await knex.schema.alterTable('chapters', (table) => { + table.datetime('date'); + }); + + await knex.schema.createTable('chapters_trailers', (table) => { + table.integer('chapter_id') + .notNullable() + .references('id') + .inTable('chapters') + .onDelete('cascade'); + + table.text('media_id') + .notNullable() + .references('id') + .inTable('media') + .onDelete('cascade'); + }); + + await knex.schema.createTable('chapters_teasers', (table) => { + table.integer('chapter_id') + .notNullable() + .references('id') + .inTable('chapters') + .onDelete('cascade'); + + table.text('media_id') + .notNullable() + .references('id') + .inTable('media') + .onDelete('cascade'); + }); + + await knex.schema.alterTable('entities', (table) => { + table.string('name_stylized'); + }); +}; + +exports.down = async function(knex) { + await knex.schema.alterTable('chapters', (table) => { + table.dropColumn('date'); + }); + + await knex.schema.alterTable('entities', (table) => { + table.dropColumn('name_stylized'); + }); + + await knex.schema.dropTable('chapters_trailers'); + await knex.schema.dropTable('chapters_teasers'); +}; diff --git a/seeds/02_sites.js 
b/seeds/02_sites.js index 58a82f3f..37dff94c 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -5752,7 +5752,8 @@ const sites = [ // IN THE CRACK { slug: 'inthecrack', - name: 'InTheCrack', + name: 'In The Crack', + style: 'InTheCrack', url: 'https://inthecrack.com', }, // TODO: INDIE https://nats.indiebucks.com/external.php?page=sites @@ -15502,6 +15503,7 @@ exports.seed = (knex) => Promise.resolve() const sitesWithNetworks = sites.filter((site) => !site.delete).map((site) => ({ slug: site.slug, name: site.name, + name_stylized: site.style, type: site.type || 'channel', alias: site.alias, description: site.description, diff --git a/src/deep.js b/src/deep.js index a55411ea..665e7535 100755 --- a/src/deep.js +++ b/src/deep.js @@ -212,9 +212,10 @@ async function scrapeRelease(baseRelease, entitiesByHostname, type = 'scene') { ...[].concat(curatedScrapedRelease.poster), ...[].concat(baseRelease.poster), ])).filter(Boolean), - photos: curatedScrapedRelease.photos?.length > 0 - ? curatedScrapedRelease.photos - : baseRelease.photos, + photos: [ + ...curatedScrapedRelease.photos || [], + ...baseRelease.photos || [], + ], deep: !!scrapedRelease, entity, }; @@ -267,7 +268,7 @@ async function scrapeReleases(baseReleases, entitiesByHostname, type) { async function fetchReleases(baseReleasesOrUrls, type = 'scene') { const baseReleases = toBaseReleases(baseReleasesOrUrls); - const entitiesByHostname = await fetchReleaseEntities(baseReleases); + const entitiesByHostname = await fetchReleaseEntities(baseReleases, { appendBySlug: false }); const deepReleases = await scrapeReleases(baseReleases, entitiesByHostname, type); diff --git a/src/entities.js b/src/entities.js index 39086c60..7ec4982c 100755 --- a/src/entities.js +++ b/src/entities.js @@ -219,7 +219,7 @@ async function fetchIncludedEntities() { return curatedNetworks; } -async function fetchEntitiesBySlug(entitySlugs, options = { prefer: 'channel' }) { +async function fetchEntitiesBySlug(entitySlugs, options = { 
prefer: 'channel', appendBySlug: true }) { const entities = await knex.raw(` WITH RECURSIVE entity_tree as ( SELECT to_jsonb(entities) as entity, @@ -267,7 +267,7 @@ async function fetchEntitiesBySlug(entitySlugs, options = { prefer: 'channel' }) return { ...accEntities, - [entity.slug]: curatedEntity, + ...(options.appendBySlug !== false ? { [entity.slug]: curatedEntity } : null), [host]: curatedEntity, }; }, {}); diff --git a/src/scrapers/inthecrack.js b/src/scrapers/inthecrack.js index 0b17aa0e..df6e9ebb 100755 --- a/src/scrapers/inthecrack.js +++ b/src/scrapers/inthecrack.js @@ -1,268 +1,251 @@ 'use strict'; -const moment = require('moment'); +const unprint = require('unprint'); -const qu = require('../utils/q'); const slugify = require('../utils/slugify'); -const { feetInchesToCm, lbsToKg } = require('../utils/convert'); -function scrapeAll(scenes, channel) { - return scenes.map(({ query }) => { - const release = {}; - - release.url = query.url('a', 'href', { origin: channel.url }); - // release.entryId = new URL(release.url).pathname.match(/\/Collection\/(\d+)/)[1]; can't be matched with upcoming scenes - - release.shootId = query.cnt('a span:nth-of-type(1)').match(/^\d+/)?.[0]; - release.entryId = release.shootId; - - release.date = query.date('a span:nth-of-type(2)', 'YYYY-MM-DD'); - release.actors = (query.q('a img', 'alt') || query.cnt('a span:nth-of-type(1)'))?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g); - - release.poster = release.shootId - ? 
`https://inthecrack.com/assets/images/posters/collections/${release.shootId}.jpg` - : query.img('a img', 'src', { origin: channel.url }); - - return release; - }); -} - -function scrapeUpcoming(scenes, channel) { - return scenes.map(({ query }) => { - const release = {}; - - const title = query.cnt('span'); - - release.entryId = title.match(/^\d+/)[0]; - release.actors = title.slice(0, title.indexOf('-')).match(/[a-zA-Z]+(\s[a-zA-Z]+)*/g); - - const date = moment.utc(title.match(/\w+ \d+\w+$/)[0], 'MMM Do'); - - if (date.isBefore()) { - // date is next year - release.date = date.add(1, 'year').toDate(); - } else { - release.date = date.toDate(); - } - - release.poster = [ - `https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`, - query.img('img', 'src', { origin: channel.url }), - ]; - - return release; - }); -} - -function scrapeProfileScenes(items, actorName, channel) { - return items.map(({ query }) => { - const release = {}; - - if (slugify(query.cnt()) === 'no-other-collections') { - return null; - } - - const details = query.cnts('figure p').reduce((acc, info) => { - const [key, value] = info.split(':'); - - return { - ...acc, - [slugify(key, '_')]: value?.trim(), - }; - }, {}); - - release.url = query.url('a', 'href', { origin: channel.url }); - - release.shootId = details.collection.match(/\d+/)[0]; - release.entryId = release.shootId; - - release.date = qu.parseDate(details.release_date, 'YYYY-MM-DD'); - release.actors = [actorName]; - - /* rely on clip length - const durationString = Object.keys(details).find(info => /\d+_min_video/.test(info)); - release.duration = durationString && Number(durationString.match(/^\d+/)?.[0]) * 60; - */ - - release.productionLocation = details.shoot_location; - - release.poster = [ - `https://inthecrack.com/assets/images/posters/collections/${release.entryId}.jpg`, - query.img('img', 'src', { origin: channel.url }), - ]; - - return release; - }).filter(Boolean); -} - -function scrapeProfile({ 
query }, actorName, actorAvatar, channel, releasesFromScene) { +function scrapeProfile(model, channel) { const profile = {}; - const bio = query.cnts(releasesFromScene ? 'ul li' : 'div.modelInfo li').reduce((acc, info) => { - const [key, value] = info.split(':'); + profile.name = model.name; // used by shallow scrape + profile.entryId = model.id; - return { - ...acc, - [slugify(key, '_')]: value.trim(), - }; - }, {}); + profile.dateOfBirth = unprint.extractDate(model.birthdate, 'YYYY-MM-DD'); - profile.name = actorName || bio.name; - profile.gender = 'female'; - profile.birthPlace = bio.nationality; + profile.birthPlace = model.countries?.map((country) => { + if (country.name) { + return country.name; + } - if (bio.height) profile.height = feetInchesToCm(bio.height); - if (bio.weight) profile.weight = lbsToKg(bio.weight); + if (country.isO2 || country.iso2) { // sic + return country.isO2 || country.iso2; + } - profile.releases = releasesFromScene?.[profile.name] || scrapeProfileScenes(qu.initAll(query.all('.Models li')), actorName, channel); + if (typeof country === 'string') { + return country; + } - // avatar is the poster of a scene, find scene and use its high quality poster instead - const avatarRelease = profile.releases.find((release) => new URL(release.poster[1]).pathname === new URL(actorAvatar).pathname); - profile.avatar = avatarRelease?.poster[0]; + return null; + }).filter(Boolean)[0]; + + profile.height = model.height; + profile.weight = model.weight; + + const ethnicity = model.ethnicity?.title || model.ethnicity; + + if (!/none/i.test(ethnicity)) { + profile.ethnicity = ethnicity; + } + + if (model.id) { + profile.url = `${channel.origin}/modelcollections/${model.id}`; + } return profile; } -async function fetchSceneActors(entryId, _release, channel) { - const url = `https://inthecrack.com/Collection/Biography/${entryId}`; - const res = await qu.get(url); - - if (res.ok) { - const actorTabs = qu.initAll(res.item.query.all('#ModelTabs li')).map(({ 
query }) => ({ - name: query.cnt('a'), - id: query.q('a', 'data-model'), - })); - - const actorReleasesByActorName = actorTabs.reduce((acc, { name, id }) => { - const releaseEls = qu.initAll(res.item.query.all(`#Model-${id} li`)); - const releases = scrapeProfileScenes(releaseEls, name, channel); - - return { - ...acc, - [name]: releases, - }; - }, {}); - - const actors = qu.initAll(res.item.query.all('.modelInfo > li')).map((item) => { - const avatar = item.query.img('img', 'src', { origin: channel.url }); - const profile = scrapeProfile(item, null, avatar, channel, actorReleasesByActorName); - - return profile; - }); - - return actors; +function mergeModels(sceneModels, models, channel) { + if (!Array.isArray(sceneModels) || !models) { + return []; } - return null; + return sceneModels.map((modelId) => { + const model = models[modelId?.id || modelId]; + + if (!model) { + return null; + } + + return scrapeProfile(model, channel); + }).filter(Boolean); } -async function scrapeScene({ query, html }, url, channel) { - const release = {}; +function scrapeAll(scenes, channel, models = {}, isUpcoming = false) { + return scenes.map((scene) => { + const release = {}; - const entryId = new URL(url).pathname.match(/\/Collection\/(\d+)/)[1]; + release.entryId = scene.id; + release.shootId = scene.id; - release.shootId = query.cnt('h2 span').match(/^\d+/)?.[0]; - release.entryId = release.shootId; // site entry ID can't be matched with upcoming scenes + release.title = scene.title; + release.date = unprint.extractDate(scene.releaseDate, 'YYYY-MM-DD'); - const actors = await fetchSceneActors(entryId, release, channel); - release.actors = actors || query.cnt('h2 span')?.match(/[a-zA-Z]+(\s[A-Za-z]+)*/g); + release.poster = `https://api.inthecrack.com/image/resize/images/posters/collections/${scene.id}.jpg?w=1400`; - release.description = query.cnt('p#CollectionDescription'); - release.productionLocation = query.cnt('.modelCollectionHeader p')?.match(/Shoot Location: 
(.*)/)?.[1]; + // coming soon photo remains available after release date + release.photos = [`https://api.inthecrack.com/FileStore/images/coming_soon/${scene.id}.jpg`]; - release.poster = qu.prefixUrl(html.match(/background-image: url\('(.*)'\)/)?.[1], channel.url); - - release.chapters = query.all('.ClipOuter').map((el) => { - const chapter = {}; - - chapter.title = query.text(el, 'h4'); - chapter.description = query.cnt(el, 'p'); - chapter.duration = query.dur(el, '.InlineDuration'); - - const posterStyle = query.style(el, '.clipImage', 'background-image'); - const poster = qu.prefixUrl(posterStyle.match(/url\((.*)\)/)?.[1], channel.url); - - if (poster) { - const { origin, pathname } = new URL(poster); - - chapter.poster = [ - `${origin}${pathname}`, // full size - poster, - ]; + if (isUpcoming) { + return release; } - if (query.exists(el, '.ThreeDInfo')) { - chapter.tags = ['3d']; - } + release.url = `${channel.origin}/collection/${scene.id}`; - return chapter; + release.duration = scene.clipMinutesTotal * 60 || null; + release.actors = mergeModels(scene.models, models, channel); + + release.productionDate = unprint.extractDate(scene.shootDate, 'YYYY-MM-DD'); + release.photoCount = scene.picTotal; + + release.productionLocation = scene.shootLocation; + + return release; }); - - return release; } -async function fetchLatest(channel, page = 1) { - const year = moment().subtract(page - 1, ' year').year(); - - const url = `${channel.url}/Collections/Date/${year}`; - const res = await qu.getAll(url, '.collectionGridLayout li'); +async function fetchLatest(channel, page, context) { + const res = await unprint.get('https://api.inthecrack.com/Collection/'); if (res.ok) { - return scrapeAll(res.items, channel); + // API has no pagination, simulate so it doesn't blow up the rest of the guts + return scrapeAll(res.data.slice((page - 1) * 100, page * 100), channel, context.beforeFetchLatest); } return res.status; } async function fetchUpcoming(channel) { - const res = 
await qu.getAll(channel.url, '#ComingSoon li'); + const res = await unprint.get('https://api.inthecrack.com/Home/coming_soon'); if (res.ok) { - return scrapeUpcoming(res.items, channel); + // API has no pagination, the full coming-soon list is scraped in one go + return scrapeAll(res.data, channel, null, true); } return res.status; } -async function fetchScene(url, channel) { - const res = await qu.get(url); +const qualityMap = { + // unsure about 2 and 5 + 1: 360, + 3: 720, + 4: 1080, + 6: 2160, +}; - if (res.ok) { - return scrapeScene(res.item, url, channel); +function scrapeScene(scene, channel, baseRelease, models = {}) { + const release = {}; + + release.entryId = scene.id; + release.shootId = scene.id; + + release.url = `${channel.origin}/collection/${scene.id}`; + + release.title = scene.title; + release.description = scene.description; + + release.actors = mergeModels(scene.models, models, channel); + + release.productionDate = unprint.extractDate(scene.shootDate, 'YYYY-MM-DD'); + release.productionLocation = scene.shootLocation; + + release.poster = `https://api.inthecrack.com/image/resize/images/posters/collections/${scene.id}.jpg?w=1400`; + + release.photos = scene.galleryImages + ?.filter((image) => image.imageType === 1) // type 1 and 2 are dupes as far as thumbs are concerned + .slice(0, 15) // only first 15 photos have a free thumb + .map((image) => image.filename && `https://api.inthecrack.com/FileStore/images/gallerysamples/${scene.id}/${image.filename}`).filter(Boolean); + + release.chapters = scene.clips?.map((clip) => ({ + entryId: clip.id, + title: clip.title, + description: clip.description, + date: unprint.extractDate(clip.releaseDate, 'YYYY-MM-DD'), + duration: clip.length, + // this is how the site itself renders the thumbnails, I shit you not. 
does not return valid image without ?w parameter + poster: `https://api.inthecrack.com/image/resize/images/posters/clips/${clip.videos?.[0]?.filename.match(/^(.*?)(?=\d+x\d+\.mp4)/)[0]}.jpg?w=1400`, + })); + + release.qualities = scene.clips?.[0]?.videos?.map((video) => qualityMap[video.videoResolutionId]).filter(Boolean); + + if (!baseRelease.date) { + // base release has 'official' release date, deep data only has chapter dates + // though, this is probably how they calculate the collection date, too + release.date = release.chapters + ?.map((chapter) => chapter.date) + .filter(Boolean) + .toSorted((dateA, dateB) => dateA - dateB)[0]; } - return res.status; + return release; } -async function fetchProfile({ name: actorName }, channel, _include) { - const firstLetter = actorName.charAt(0).toUpperCase(); - const url = `${channel.url}/Collections/Name/${firstLetter}`; - const res = await qu.getAll(url, '.collectionGridLayout li'); - - if (res.ok) { - const actorItem = res.items.find(({ query }) => slugify(query.cnt('span')) === slugify(actorName)); - - if (actorItem) { - const actorUrl = actorItem.query.url('a', 'href', { origin: channel.url }); - const actorAvatar = actorItem.query.img('img', 'src', { origin: channel.url }); - const actorRes = await qu.get(actorUrl); - - if (actorRes.ok) { - return scrapeProfile(actorRes.item, actorName, actorAvatar, channel); - } - - return actorRes.status; - } +async function fetchScene(url, channel, baseRelease, context) { + const entryId = new URL(url).pathname.match(/\/collection\/(\d+)/)?.[1]; + if (!entryId) { return null; } + const res = await unprint.get(`https://api.inthecrack.com/Collection/${entryId}`); + + if (res.ok) { + return scrapeScene(res.data, channel, baseRelease, context.beforeFetchScenes); + } + return res.status; } +async function fetchModels() { + const res = await unprint.get('https://api.inthecrack.com/Model/'); + + if (res.ok) { + try { + const modelsById = Object.fromEntries(res.data.map((model) => 
[model.id, model])); + + return modelsById; + } catch (error) { + // we can continue, we just won't have model names + } + } + + return {}; +} + +async function getModelId(actor) { + if (actor.entryId) { + return actor.entryId; + } + + if (actor.url) { + const modelId = new URL(actor.url).pathname.match(/\/modelcollection\/(\d+)/)?.[1]; + + if (modelId) { + return modelId; + } + } + + const modelsById = await fetchModels(); + const model = Object.values(modelsById).find((searchModel) => slugify(searchModel.name) === slugify(actor.name)); + + if (model) { + return model.id; + } + + return null; +} + +async function fetchProfile(actor, channel) { + const modelId = await getModelId(actor); + + if (!modelId) { + return null; + } + + const res = await unprint.get(`https://api.inthecrack.com/Model/${modelId}`); + + if (res.ok) { + return scrapeProfile(res.data, channel); + } + + return null; +} + module.exports = { fetchLatest, fetchUpcoming, fetchScene, fetchProfile, + beforeFetchLatest: fetchModels, + beforeFetchScenes: fetchModels, }; diff --git a/src/store-releases.js b/src/store-releases.js index 650d0146..ebdc400a 100755 --- a/src/store-releases.js +++ b/src/store-releases.js @@ -253,6 +253,7 @@ async function storeChapters(releases) { releaseId: release.id, index: index + 1, time: chapter.time, + date: chapter.date, duration: chapter.duration, title: chapter.title, description: chapter.description, @@ -268,6 +269,7 @@ async function storeChapters(releases) { index: chapter.index, time: chapter.time, duration: chapter.duration, + date: chapter.date, title: chapter.title, description: chapter.description, release_id: chapter.releaseId,