Storing scene language and production date precision. Refactored Teen Core Club.

This commit is contained in:
DebaucheryLibrarian
2026-04-03 00:26:13 +02:00
parent ed07a6c249
commit a492401db0
7 changed files with 553 additions and 478 deletions

View File

@@ -2,83 +2,82 @@
const unprint = require('unprint');
const slugify = require('../utils/slugify');
function pickLocale(item) {
if (!item) {
return null;
}
function scrapeAll(scenes) {
return scenes.map(({ query }) => {
const release = {};
if (item.en) {
return item.en;
}
release.url = query.url('.title a');
release.entryId = new URL(release.url).pathname.match(/\/scene\/(\d+)/)[1];
release.title = query.content('.title a');
release.date = query.date('.date', 'MMM DD, YYYY');
release.duration = query.duration('.duration');
release.actors = query.all('.models a.model').map((actorEl) => ({
name: unprint.query.content(actorEl),
url: unprint.query.url(actorEl, null),
}));
release.poster = query.img('img.poster');
release.teaser = query.video('.teaser video');
console.log(release);
return release;
});
return Object.values(item)[0];
}
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/${page}`;
const res = await unprint.get(url, { selectAll: '.scene' });
/**
 * Convert one scene record from the JSON API into a release object.
 *
 * @param {object} scene - Scene record; only the fields read below are used. Every nested
 *   object (meta, artwork, artwork_f16, cover, preview, ...) is treated as optional.
 * @param {object} channel - Channel entity; only `channel.url` is read, to build the scene and actor URLs.
 * @returns {object} Release with entryId, url, title, description, date, duration, actors,
 *   poster, photos, caps, teaser, tags and language. `productionDate` and
 *   `productionDatePrecision` are only set when the record carries a production year.
 */
function scrapeScene(scene, channel) {
  const mediaHost = 'https://s02.uni73d.net';
  const release = {};

  release.entryId = scene.id;
  release.url = `${channel.url}/video/${scene.id}/${scene.slug}`;

  release.title = pickLocale(scene.title);
  release.description = pickLocale(scene.description);

  // new Date(null) is the Unix epoch and new Date(undefined) is an Invalid Date; report a missing date as null instead
  release.date = scene.publication_date ? new Date(scene.publication_date) : null;
  release.duration = scene.meta?.duration_seconds || unprint.extractDuration(scene.meta?.duration);

  const productionYear = scene.meta?.year;

  if (productionYear) {
    // only the year is known, so pin the date to January 1st UTC and record the precision alongside it;
    // the precision must not be set without a date, or it would be stored for a null production date
    release.productionDate = new Date(Date.UTC(productionYear, 0, 1));
    release.productionDatePrecision = 'year';
  }

  release.actors = scene.actors?.map((actor) => ({
    name: actor.name,
    entryId: actor.id,
    url: `${channel.url}/videos/browse/cast/${actor.id}`,
  }));

  const poster = scene.artwork?.original;
  // either artwork entry may be absent from the record, same as scene.artwork
  const photos = [scene.artwork_f16, scene.cover].map((art) => art?.original).filter(Boolean);

  if (poster) {
    release.poster = poster;
    release.photos = photos;
  } else {
    // not observed, but artwork_f16 is suitable as poster
    release.poster = photos[0];
    release.photos = photos.slice(1);
  }

  release.caps = scene.screenshots?.map((src) => unprint.prefixUrl(src, mediaHost)) || [];
  release.teaser = unprint.prefixUrl(scene.preview?.url, mediaHost);

  release.tags = scene.display_genres?.map((genre) => pickLocale(genre.title)).filter(Boolean) || [];

  if (scene.is_gay) {
    release.tags = release.tags.concat('gay');
  }

  release.language = scene.meta?.language;

  return release;
}
/**
 * Fetch one page of the newest scenes for a label and convert each record with scrapeScene.
 *
 * @param {object} channel - Channel entity, passed through to scrapeScene.
 * @param {number} [page=1] - Page number sent as the `page` query parameter.
 * @param {object} options - Must carry `parameters.legacySiteId`, the label ID used in the request path.
 * @returns {Promise<object[]|number>} The converted scenes, or the response status when the
 *   request was not OK or the payload has no `videos.data`.
 */
async function fetchLatest(channel, page = 1, { parameters }) {
  const response = await unprint.get(`https://api.fundorado.com/api/videos/browse/labels/${parameters.legacySiteId}?page=${page}&sg=false&sort=release&video_type=scene&lang=en`);
  const scenes = response.ok ? response.data?.videos?.data : null;

  if (!scenes) {
    return response.status;
  }

  return scenes.map((scene) => scrapeScene(scene, channel));
}
function scrapeScene({ query }, { url }) {
const release = {};
async function fetchScene(url, channel) {
const entryId = new URL(url).pathname.match(/\/video\/(\d+)/)[1];
const res = await unprint.get(`https://api.fundorado.com/api/videodetail/${entryId}`);
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)[1];
release.title = query.content('h3.title');
release.description = query.content('p.description');
release.date = query.date('.date', 'MMMM D, YYYY');
release.duration = query.duration('.duration');
[release.poster, ...release.photos] = query.imgs('.preview-thumb');
release.trailer = query.video('.trailer video');
console.log(release);
return release;
}
/**
 * Extract an actor profile from a queried profile page.
 *
 * @param {object} context - Page context; only its `query` helper is used.
 * @param {object} context.query - Must provide `content(selector)` and `img(selector)`.
 * @returns {object} Profile with description, birthPlace and avatar, each taken straight
 *   from the corresponding query call.
 */
function scrapeProfile({ query }) {
  // the debug console.log of the result was removed; callers receive the profile as the return value
  return {
    description: query.content('.bio-text'),
    birthPlace: query.content('.birth-place span'),
    avatar: query.img('.actor-photo img'),
  };
}
async function fetchProfile({ name: actorName }, entity) {
const url = `${entity.url}/actors/${slugify(actorName, '_')}`;
const res = await unprint.get(url);
if (res.ok) {
return scrapeProfile(res.context, entity);
if (res.ok && res.data?.video) {
return scrapeScene(res.data.video, channel);
}
return res.status;
@@ -86,6 +85,5 @@ async function fetchProfile({ name: actorName }, entity) {
module.exports = {
fetchLatest,
fetchProfile,
scrapeScene,
fetchScene,
};

View File

@@ -70,6 +70,7 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
if (type === 'scene') {
curatedRelease.shoot_id = release.shootId || null;
curatedRelease.production_date = Number(release.productionDate) ? release.productionDate : null;
curatedRelease.production_date_precision = release.productionDatePrecision;
curatedRelease.duration = Math.round(release.duration) || null; // float may happen if scraper converts duration from milliseconds with a simple / 1000
curatedRelease.qualities = Array.from(new Set(release.qualities?.map(Number).filter(Boolean))).sort((qualityA, qualityB) => qualityB - qualityA);
}
@@ -89,6 +90,20 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
}
}
if (release.language) {
const curatedLanguage = release.language.toLowerCase();
const language = await knex('languages')
.where(knex.raw('lower(alpha2)'), curatedLanguage)
.orWhere(knex.raw('lower(name)'), curatedLanguage)
.orWhere(knex.raw('lower(name_native)'), curatedLanguage)
.first();
if (language) {
curatedRelease.language_alpha2 = language.alpha2;
}
}
if (!existingRelease && !release.id) {
curatedRelease.created_batch_id = batchId;
}
@@ -443,12 +458,15 @@ async function storeScenes(releases, useBatchId) {
description = COALESCE(new.description, releases.description),
shoot_id = COALESCE(new.shoot_id, releases.shoot_id),
duration = COALESCE(new.duration, releases.duration),
production_date = COALESCE(new.production_date, releases.production_date),
production_date_precision = COALESCE(new.production_date_precision, releases.production_date_precision),
language_alpha2 = COALESCE(new.language_alpha2, releases.language_alpha2),
comment = COALESCE(new.comment, releases.comment),
attributes = COALESCE(new.attributes::jsonb || releases.attributes::jsonb, new.attributes::jsonb, releases.attributes::jsonb),
deep = new.url IS NOT NULL,
updated_at = NOW()
FROM json_to_recordset(:scenes)
AS new(id int, url text, date timestamptz, entity json, title text, description text, shoot_id text, duration integer, comment text, attributes json, deep boolean)
AS new(id int, url text, date timestamptz, entity json, title text, description text, shoot_id text, duration integer, production_date timestamptz, production_date_precision text, language_alpha2 text, comment text, attributes json, deep boolean)
WHERE releases.id = new.id
RETURNING releases.*
`, {