Refactored media fetch with undici for http/2 support.

This commit is contained in:
DebaucheryLibrarian
2026-04-02 04:45:31 +02:00
parent 6cabfc3090
commit a96ec64d61
9 changed files with 205 additions and 934 deletions

29
package-lock.json generated
View File

@@ -93,7 +93,7 @@
"tough-cookie": "^4.1.3",
"tunnel": "0.0.6",
"ua-parser-js": "^1.0.37",
"undici": "^5.28.1",
"undici": "^7.24.7",
"unprint": "^0.19.13",
"url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3",
@@ -3064,14 +3064,6 @@
"npm": ">=6.14.13"
}
},
"node_modules/@fastify/busboy": {
"version": "2.1.0",
"resolved": "https://registry.npmjs.org/@fastify/busboy/-/busboy-2.1.0.tgz",
"integrity": "sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA==",
"engines": {
"node": ">=14"
}
},
"node_modules/@gar/promisify": {
"version": "1.1.3",
"resolved": "https://registry.npmjs.org/@gar/promisify/-/promisify-1.1.3.tgz",
@@ -20576,14 +20568,11 @@
}
},
"node_modules/undici": {
"version": "5.28.1",
"resolved": "https://registry.npmjs.org/undici/-/undici-5.28.1.tgz",
"integrity": "sha512-xcIIvj1LOQH9zAL54iWFkuDEaIVEjLrru7qRpa3GrEEHk6OBhb/LycuUY2m7VCcTuDeLziXCxobQVyKExyGeIA==",
"dependencies": {
"@fastify/busboy": "^2.0.0"
},
"version": "7.24.7",
"resolved": "https://registry.npmjs.org/undici/-/undici-7.24.7.tgz",
"integrity": "sha512-H/nlJ/h0ggGC+uRL3ovD+G0i4bqhvsDOpbDv7At5eFLlj2b41L8QliGbnl2H7SnDiYhENphh1tQFJZf+MyfLsQ==",
"engines": {
"node": ">=14.0"
"node": ">=20.18.1"
}
},
"node_modules/undici-types": {
@@ -21305,14 +21294,6 @@
"node": ">= 0.6"
}
},
"node_modules/unprint/node_modules/undici": {
"version": "7.18.2",
"resolved": "https://registry.npmjs.org/undici/-/undici-7.18.2.tgz",
"integrity": "sha512-y+8YjDFzWdQlSE9N5nzKMT3g4a5UBX1HKowfdXh0uvAnTaqqwqB92Jt4UXBAeKekDs5IaDKyJFR4X1gYVCgXcw==",
"engines": {
"node": ">=20.18.1"
}
},
"node_modules/unprint/node_modules/w3c-xmlserializer": {
"version": "2.0.0",
"resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-2.0.0.tgz",

View File

@@ -152,7 +152,7 @@
"tough-cookie": "^4.1.3",
"tunnel": "0.0.6",
"ua-parser-js": "^1.0.37",
"undici": "^5.28.1",
"undici": "^7.24.7",
"unprint": "^0.19.13",
"url-pattern": "^1.0.3",
"v-tooltip": "^2.1.3",

View File

@@ -796,6 +796,9 @@ const networks = [
slug: 'teencoreclub',
name: 'Teen Core Club',
url: 'https://teencoreclub.com',
parameters: {
studioId: 1624,
},
},
{
slug: 'teenmegaworld',

View File

@@ -13510,7 +13510,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 178,
legacySiteId: 178,
},
},
{
@@ -13520,7 +13520,7 @@ const sites = [
parent: 'teencoreclub',
hasLogo: false,
parameters: {
siteId: 482,
legacySiteId: 482,
},
},
{
@@ -13537,7 +13537,8 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 180,
legacySiteId: 180,
siteId: 17,
},
},
{
@@ -13552,7 +13553,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 182,
legacySiteId: 182,
},
},
{
@@ -13564,7 +13565,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 184,
legacySiteId: 184,
},
},
{
@@ -13579,7 +13580,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 362,
legacySiteId: 362,
},
},
{
@@ -13591,7 +13592,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 278,
legacySiteId: 278,
},
},
{
@@ -13608,7 +13609,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 186,
legacySiteId: 186,
},
},
{
@@ -13620,7 +13621,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 280,
legacySiteId: 280,
},
},
{
@@ -13632,7 +13633,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 282,
legacySiteId: 282,
},
},
{
@@ -13644,7 +13645,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 188,
legacySiteId: 188,
},
},
{
@@ -13656,7 +13657,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 284,
legacySiteId: 284,
},
},
{
@@ -13672,7 +13673,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 190,
legacySiteId: 190,
},
},
{
@@ -13689,7 +13690,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 192,
legacySiteId: 192,
},
},
{
@@ -13706,7 +13707,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 288,
legacySiteId: 288,
},
},
{
@@ -13718,7 +13719,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 290,
legacySiteId: 290,
},
},
{
@@ -13733,7 +13734,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 292,
legacySiteId: 292,
},
},
{
@@ -13745,7 +13746,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 194,
legacySiteId: 194,
},
},
{
@@ -13757,7 +13758,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 196,
legacySiteId: 196,
},
},
{
@@ -13769,7 +13770,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 198,
legacySiteId: 198,
},
},
{
@@ -13784,7 +13785,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 294,
legacySiteId: 294,
},
},
{
@@ -13795,7 +13796,7 @@ const sites = [
visible: false,
hasLogo: false,
parameters: {
siteId: 566,
legacySiteId: 566,
},
},
{
@@ -13815,7 +13816,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 200,
legacySiteId: 200,
},
},
{
@@ -13830,7 +13831,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 296,
legacySiteId: 296,
},
},
{
@@ -13842,7 +13843,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 298,
legacySiteId: 298,
},
},
{
@@ -13857,7 +13858,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 300,
legacySiteId: 300,
},
},
{
@@ -13872,7 +13873,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 302,
legacySiteId: 302,
},
},
{
@@ -13888,7 +13889,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 304,
legacySiteId: 304,
},
},
{
@@ -13903,7 +13904,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 306,
legacySiteId: 306,
},
},
{
@@ -13915,7 +13916,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 308,
legacySiteId: 308,
},
},
{
@@ -13926,7 +13927,7 @@ const sites = [
visible: false,
hasLogo: false,
parameters: {
siteId: 568,
legacySiteId: 568,
},
},
{
@@ -13937,7 +13938,7 @@ const sites = [
visible: false,
hasLogo: false,
parameters: {
siteId: 570,
legacySiteId: 570,
},
},
{
@@ -13950,7 +13951,7 @@ const sites = [
parent: 'teencoreclub',
hasLogo: false,
parameters: {
siteId: 360,
legacySiteId: 360,
},
},
{
@@ -13962,7 +13963,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 310,
legacySiteId: 310,
},
},
{
@@ -13976,7 +13977,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 202,
legacySiteId: 202,
},
},
{
@@ -13988,7 +13989,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 312,
legacySiteId: 312,
},
},
{
@@ -14003,7 +14004,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 314,
legacySiteId: 314,
},
},
{
@@ -14014,7 +14015,7 @@ const sites = [
visible: false,
hasLogo: false,
parameters: {
siteId: 556,
legacySiteId: 556,
},
},
{
@@ -14029,7 +14030,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 316,
legacySiteId: 316,
},
},
{
@@ -14043,7 +14044,7 @@ const sites = [
parent: 'teencoreclub',
hasLogo: false,
parameters: {
siteId: 418,
legacySiteId: 418,
},
},
{
@@ -14055,7 +14056,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 318,
legacySiteId: 318,
},
},
{
@@ -14067,7 +14068,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 204,
legacySiteId: 204,
},
},
{
@@ -14083,7 +14084,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 320,
legacySiteId: 320,
},
},
{
@@ -14095,7 +14096,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 322,
legacySiteId: 322,
},
},
{
@@ -14107,7 +14108,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 324,
legacySiteId: 324,
},
},
{
@@ -14119,7 +14120,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 366,
legacySiteId: 366,
},
},
{
@@ -14132,7 +14133,7 @@ const sites = [
parent: 'teencoreclub',
hasLogo: false,
parameters: {
siteId: 176,
legacySiteId: 176,
},
},
{
@@ -14147,7 +14148,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 368,
legacySiteId: 368,
},
},
{
@@ -14162,7 +14163,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 326,
legacySiteId: 326,
},
},
{
@@ -14180,7 +14181,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 206,
legacySiteId: 206,
},
},
{
@@ -14195,7 +14196,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 208,
legacySiteId: 208,
},
},
{
@@ -14210,7 +14211,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 210,
legacySiteId: 210,
},
},
{
@@ -14227,7 +14228,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 328,
legacySiteId: 328,
},
},
{
@@ -14242,7 +14243,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 212,
legacySiteId: 212,
},
},
{
@@ -14258,7 +14259,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 330,
legacySiteId: 330,
},
},
{
@@ -14275,7 +14276,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 214,
legacySiteId: 214,
},
},
{
@@ -14291,7 +14292,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 332,
legacySiteId: 332,
},
},
{
@@ -14310,7 +14311,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 216,
legacySiteId: 216,
},
},
{
@@ -14325,7 +14326,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 334,
legacySiteId: 334,
},
},
{
@@ -14336,7 +14337,7 @@ const sites = [
visible: false,
hasLogo: false,
parameters: {
siteId: 558,
legacySiteId: 558,
},
},
{
@@ -14352,7 +14353,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 336,
legacySiteId: 336,
},
},
{
@@ -14369,7 +14370,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 218,
legacySiteId: 218,
},
},
{
@@ -14386,7 +14387,7 @@ const sites = [
],
parent: 'teencoreclub',
parameters: {
siteId: 220,
legacySiteId: 220,
},
},
/* TCC VOD services and unused brands

View File

@@ -651,9 +651,10 @@ async function fetchHttpSource(source, tempFileTarget, hashStream) {
const res = await http.get(source.src, {
limits: 'media',
headers: {
host: new URL(source.src).hostname,
// explicit host not allowed in HTTP/2
// host: new URL(source.src).hostname,
// ...(source.host && { host: source.host }),
...(source.referer && { referer: source.referer }),
...(source.host && { host: source.host }),
},
stream: true, // sources are fetched in parallel, don't gobble up memory
followRedirects: source.followRedirects,

View File

@@ -85,7 +85,7 @@ async function scrapeScene({ query: pageQuery, html }, { url, entity, include })
}
if (include.photos && capsUrl) {
release.caps = await fetchCaps(capsUrl);
release.caps = await fetchCaps(capsUrl, entity);
}
release.trailer = pageQuery.video('#download_select option[value*=".mp4"]', { attribute: 'value' });

View File

@@ -1,155 +1,84 @@
'use strict';
const moment = require('moment');
const unprint = require('unprint');
const logger = require('../logger')(__filename);
const http = require('../utils/http');
const qu = require('../utils/qu');
const slugify = require('../utils/slugify');
const { prefixUrl } = require('../utils/qu');
function scrapeAll(scenes, entity) {
return scenes.map((scene) => {
function scrapeAll(scenes) {
return scenes.map(({ query }) => {
const release = {};
release.entryId = scene.id;
release.url = `${new URL(entity.url).origin}/video/${scene.id}/${scene.slug}`;
release.url = query.url('.title a');
release.entryId = new URL(release.url).pathname.match(/\/scene\/(\d+)/)[1];
if (/bic/i.test(scene.title)) {
release.shootId = scene.title.toUpperCase().replace('-', '_');
} else {
release.title = scene.title;
}
release.title = query.content('.title a');
release.description = scene.description;
release.date = moment.utc(scene.year, 'YYYY').toDate();
release.datePrecision = 'year';
release.date = query.date('.date', 'MMM DD, YYYY');
release.duration = query.duration('.duration');
release.actors = scene.actors.map((actor) => ({
name: actor.name.trim(),
avatar: actor.image || null,
})).filter((actor) => actor.name && slugify(actor.name) !== 'amateur-girl');
release.actors = query.all('.models a.model').map((actorEl) => ({
name: unprint.query.content(actorEl),
url: unprint.query.url(actorEl, null),
}));
release.duration = scene.duration;
release.stars = scene.video_rating_score;
release.poster = query.img('img.poster');
release.teaser = query.video('.teaser video');
[release.poster, ...release.photos] = scene.screenshots.map((url) => prefixUrl(url));
if (scene.is_gay) {
release.tags = ['gay'];
}
console.log(release);
return release;
});
}
async function scrapeScene({ query }, url) {
const release = {};
const { pathname, origin, host } = new URL(url);
const entryId = pathname.match(/\/video\/(\d+)/)[1];
release.entryId = entryId;
const title = query.meta('name=title');
if (/bic/i.test(title)) {
release.shootId = title.toUpperCase().replace('-', '_');
} else {
release.title = title;
}
release.date = query.date('.detail-meta li:nth-child(2)', 'YYYY');
release.datePrecision = 'year';
release.description = query.q('.detail-description', true);
release.duration = query.dur('.detail-meta li:first-child');
const actors = [query.q('.detail-hero-title h1', true)?.trim()].filter((name) => name && slugify(name) !== 'amateur-girl');
if (actors.length > 0) {
release.actors = actors;
}
release.poster = query.q('.detail-hero').style['background-image'].match(/url\((.+)\)/)[1];
release.photos = query.imgs('.detail-grabs img');
const streamData = await http.get(`${origin}/video/source/${entryId}`, {
headers: {
host,
referer: url,
},
}, {
interval: 5000,
concurrency: 1,
});
if (streamData.ok && streamData.body.status === 'success') {
release.trailer = {
stream: streamData.body.link,
};
} else {
logger.warn(`Failed to fetch trailer for ${url}: ${streamData.ok ? streamData.body.status : streamData.status }`);
}
return release;
}
async function scrapeProfile(actor, entity, include) {
const profile = {};
if (actor.image) {
profile.avatar = `https://teencoreclub.com${actor.image}`;
}
if (include.releases) {
const res = await http.get(`https://teencoreclub.com/browsevideos/api/all?actor=${actor.id}`);
if (res.ok) {
profile.releases = scrapeAll(res.body.data, entity);
}
}
return profile;
}
async function fetchLatest(entity, page = 1) {
// console.log(entity, page);
if (entity.parameters?.siteId) {
const res = await http.get(`https://teencoreclub.com/browsevideos/api/all?resType=latest&page=${page}&label=${entity.parameters.siteId}`);
if (res.ok) {
return scrapeAll(res.body.data, entity);
}
return res.status;
}
return null;
}
async function fetchScene(url, entity) {
const { pathname } = new URL(url);
const res = await qu.get(`https://teencoreclub.com${pathname}`);
async function fetchLatest(channel, page = 1) {
const url = `${channel.url}/${page}`;
const res = await unprint.get(url, { selectAll: '.scene' });
if (res.ok) {
return scrapeScene(res.item, url, entity);
return scrapeAll(res.context, channel);
}
return res.status;
}
async function fetchProfile({ name: actorName }, { entity }, include) {
const res = await http.get(`https://teencoreclub.com/api/actors?query=${actorName}`);
function scrapeScene({ query }, { url }) {
const release = {};
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)[1];
release.title = query.content('h3.title');
release.description = query.content('p.description');
release.date = query.date('.date', 'MMMM D, YYYY');
release.duration = query.duration('.duration');
[release.poster, ...release.photos] = query.imgs('.preview-thumb');
release.trailer = query.video('.trailer video');
console.log(release);
return release;
}
function scrapeProfile({ query }) {
const profile = {};
profile.description = query.content('.bio-text');
profile.birthPlace = query.content('.birth-place span');
profile.avatar = query.img('.actor-photo img');
console.log(profile);
return profile;
}
async function fetchProfile({ name: actorName }, entity) {
const url = `${entity.url}/actors/${slugify(actorName, '_')}`;
const res = await unprint.get(url);
if (res.ok) {
const actor = res.body.data.find((item) => slugify(item.name) === slugify(actorName));
if (actor) {
return scrapeProfile(actor, entity, include);
}
return null;
return scrapeProfile(res.context, entity);
}
return res.status;
@@ -157,6 +86,6 @@ async function fetchProfile({ name: actorName }, { entity }, include) {
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};

View File

@@ -1,655 +0,0 @@
'use strict';
const config = require('config');
const fs = require('fs');
const path = require('path');
const moment = require('moment');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const { nanoid } = require('nanoid/non-secure');
const { Upload } = require('@aws-sdk/lib-storage');
const { S3Client } = require('@aws-sdk/client-s3');
const { graphql } = require('../web/graphql');
const knex = require('../knex');
const args = require('../argv');
// S3 client handed to the Upload in transferMedia. Region and endpoint are
// hard-coded to the Wasabi eu-central-1 service; the key pair is read from
// config.s3.
const s3 = new S3Client({
region: 'eu-central-1',
endpoint: 'https://s3.eu-central-1.wasabisys.com',
credentials: {
accessKeyId: config.s3.accessKey,
secretAccessKey: config.s3.secretKey,
},
});
// NOT TRANSFERRED, unutilized on old server: production location, available qualities, actor alias for, actor entry id, chapter posters, chapter photos
const sceneFields = `
entryId
shootId
title
url
date
datePrecision
productionDate
description
duration
entity {
slug
type
}
studio {
slug
}
movies: moviesScenesBySceneId {
movie {
title
entryId
entity {
slug
type
}
}
}
actors: releasesActors {
actor {
name
slug
entryId
entity {
slug
type
}
}
}
directors: releasesDirectors {
director {
name
slug
entryId
entity {
slug
type
}
}
}
tags: releasesTags {
tag {
slug
}
}
chapters(orderBy: TIME_ASC) {
index
time
duration
title
description
tags: chaptersTags {
tag {
slug
}
}
}
poster: releasesPoster {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
photos: releasesPhotos {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
covers: releasesCovers {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
trailer: releasesTrailer {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
teaser: releasesTeaser {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
createdAt
`;
const movieFields = `
entryId
title
url
date
datePrecision
entity {
slug
type
}
poster: moviesPoster {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
covers: moviesCovers {
media {
hash
path
thumbnail
lazy
s3: isS3
mime
index
width
height
size
source
sourcePage
}
}
createdAt
`;
/**
 * Export one page of scenes and movies to a newline-delimited JSON file.
 *
 * The page window comes from `args.limit` (default 1000) and `args.start`
 * (default 0). Both GraphQL queries are issued with 'owner' as the third
 * argument. Join wrappers (`{ actor }`, `{ tag }`, `{ media }`, ...) are
 * flattened so that every line in the output file is a self-contained entry
 * tagged with `type: 'release'` or `type: 'movie'`.
 *
 * The file is written to the working directory as
 * `export-<start>-<end>-<timestamp>.json`, and the process exits when done.
 */
async function save() {
  const limit = args.limit || 1000;
  const offset = args.start || 0;

  const { releases } = await graphql(`
    query SearchScenes(
      $limit: Int = 20
      $offset: Int = 0
    ) {
      releases(
        first: $limit
        offset: $offset
        orderBy: DATE_DESC
      ) {
        ${sceneFields}
      }
    }
  `, {
    limit,
    offset,
  }, 'owner');

  const { movies } = await graphql(`
    query SearchScenes(
      $limit: Int = 20
      $offset: Int = 0
    ) {
      movies(
        first: $limit
        offset: $offset
        orderBy: DATE_DESC
      ) {
        ${movieFields}
      }
    }
  `, {
    limit,
    offset,
  }, 'owner');

  // HH is the 24-hour clock. The previous 12-hour `hh` produced the same
  // filename for an AM and a PM export, and because entries are appended,
  // the second export would silently be merged into the first file.
  const filename = `export-${offset}-${offset + limit}-${moment().format('YYYY-MM-DD_HH_mm_ss')}.json`;

  let savedScenes = 0;
  let savedMovies = 0;

  // Entries are appended one at a time so the line order matches the query order.
  for (const release of releases) {
    const entry = JSON.stringify({
      ...release,
      type: 'release',
      actors: release.actors.filter(Boolean).map(({ actor }) => actor),
      directors: release.directors.filter(Boolean).map(({ director }) => director),
      studio: release.studio?.slug,
      tags: release.tags.map(({ tag }) => tag?.slug).filter(Boolean),
      movies: release.movies?.map(({ movie }) => movie) || [],
      chapters: release.chapters.filter(Boolean).map((chapter) => ({
        ...chapter,
        tags: chapter.tags.map(({ tag }) => tag?.slug).filter(Boolean),
      })),
      poster: release.poster?.media,
      trailer: release.trailer?.media,
      teaser: release.teaser?.media,
      photos: release.photos.filter(Boolean).map(({ media }) => media),
      covers: release.covers.filter(Boolean).map(({ media }) => media),
    });

    await fs.promises.appendFile(filename, `${entry}\n`);
    savedScenes += 1;
  }

  for (const movie of movies) {
    const entry = JSON.stringify({
      ...movie,
      type: 'movie',
      poster: movie.poster?.media,
      covers: movie.covers.filter(Boolean).map(({ media }) => media),
    });

    await fs.promises.appendFile(filename, `${entry}\n`);
    savedMovies += 1;
  }

  console.log(`Saved ${savedScenes} scenes and ${savedMovies} movies to ${filename}`);

  process.exit();
}
/**
 * Associate a release with its tags in `releases_tags`.
 *
 * @param {object} release - Needs `id` and `tags` (array of tag slugs).
 * @param {object} context - Needs `tagIdsBySlug`, mapping tag slug to tag id.
 * @returns {Promise<void>} Resolves once the rows are inserted; nothing is
 *   inserted for a release without tags.
 */
async function addReleaseTags(release, context) {
  const { tags } = release;

  if (tags.length > 0) {
    // The original slug is stored next to the resolved id; the id is
    // undefined for slugs missing from the lookup table.
    const rows = tags.map((slug) => ({
      tag_id: context.tagIdsBySlug[slug],
      release_id: release.id,
      original_tag: slug,
    }));

    await knex('releases_tags').insert(rows);
  }
}
/**
 * Insert a new row into `actors` and return its id.
 *
 * @param {object} actor - Needs `name` and `slug`.
 * @param {object|null} entity - Entity the actor belongs to, if any; its `id`
 *   is stored as `entity_id`.
 * @param {object} context - Needs `batchId`, stored as `batch_id`.
 * @returns {Promise<*>} The `id` of the inserted actor row.
 */
async function addNewActor(actor, entity, context) {
  const actorRow = {
    name: actor.name,
    slug: actor.slug,
    entity_id: entity?.id,
    batch_id: context.batchId,
  };

  const insertedRows = await knex('actors')
    .insert(actorRow)
    .returning('id');

  return insertedRows[0].id;
}
/**
 * Link every actor (or director) of a release, creating actor rows on demand.
 *
 * For each person in `release[`${target}s`]` the matching `actors` row is
 * looked up by slug and entity (or by slug with a NULL entity when the person
 * has none). A missing row is created through addNewActor. The association is
 * then inserted into `releases_<target>s`. People are processed one at a
 * time, in order.
 *
 * @param {object} release - Needs `id` and the `${target}s` array.
 * @param {object} context - Passed on to addNewActor.
 * @param {string} [target='actor'] - 'actor' or 'director'; selects both the
 *   release property and the association table/column.
 * @returns {Promise<void>}
 * @throws {Error} When a person references an entity that is not in `entities`.
 */
async function addReleaseActors(release, context, target = 'actor') {
  for (const actor of release[`${target}s`]) {
    const entity = actor.entity
      ? await knex('entities').where(actor.entity).first()
      : null;

    if (actor.entity && !entity) {
      // Report the actor's own entity: that is the lookup that failed. The
      // message previously printed the release's entity, which does exist.
      throw new Error(`Actor ${actor.slug} contains non-existent ${actor.entity.type} '${actor.entity.slug}'`);
    }

    const existingActor = await knex('actors')
      .where('slug', actor.slug)
      .where((builder) => {
        if (entity) {
          builder.where('entity_id', entity.id);
          return;
        }

        builder.whereNull('entity_id');
      })
      .first();

    const actorId = existingActor?.id
      || await addNewActor(actor, entity, context);

    await knex(`releases_${target}s`).insert({
      release_id: release.id,
      [`${target}_id`]: actorId,
    });
  }
}
/**
 * Link the directors of a release. Directors live in the same `actors` table
 * as performers, so this delegates to addReleaseActors with the 'director'
 * target.
 */
async function addReleaseDirectors(release, context) {
  const target = 'director';

  return addReleaseActors(release, context, target);
}
/**
 * Insert the chapters of a release, followed by each chapter's tags.
 *
 * Chapters are written one after another. The id returned by each `chapters`
 * insert is used for that chapter's `chapters_tags` rows, which are only
 * inserted when the chapter has tags.
 *
 * @param {object} release - Needs `id` and `chapters`.
 * @param {object} context - Needs `tagIdsBySlug`, mapping tag slug to tag id.
 * @returns {Promise<void>}
 */
async function addReleaseChapters(release, context) {
  for (const chapter of release.chapters) {
    const chapterRow = {
      release_id: release.id,
      index: chapter.index,
      time: chapter.time,
      duration: chapter.duration,
      description: chapter.description,
    };

    const [{ id: chapterId }] = await knex('chapters')
      .insert(chapterRow)
      .returning('id');

    if (chapter.tags.length > 0) {
      const tagRows = chapter.tags.map((slug) => ({
        tag_id: context.tagIdsBySlug[slug],
        chapter_id: chapterId,
        original_tag: slug,
      }));

      await knex('chapters_tags').insert(tagRows);
    }
  }
}
// Sub-directory for each media variant, keyed by the media field that holds
// the variant's path. transferMedia joins it between the target directory and
// the filename; the empty string keeps the 'path' variant directly in the
// target directory.
const dirs = {
path: '',
thumbnail: 'thumbs',
lazy: 'lazy',
};
/**
 * Download one variant of a media item to a temp file and upload it to S3.
 *
 * The source URL is resolved against the configured S3 or local transfer
 * source, depending on `media.s3`. A non-200 response is logged and skipped.
 * The temp file is removed whether or not the download or upload succeeds.
 */
async function transferMediaVariant(media, target, type) {
  const filename = `${media.hash}${path.extname(media[type])}`;
  const filepath = path.join(target, dirs[type], filename);
  const temp = path.join('media/temp', filepath);
  const url = new URL(media[type], `${media.s3 ? config.media.transferSources.s3 : config.media.transferSources.local}/`).href;

  if (args.logLevel === 'debug') {
    console.log('Transferring media', url);
  }

  const res = await bhttp.get(url, { stream: true });

  if (res.statusCode !== 200) {
    console.warn(`Missing ${target} ${url}`);
    return;
  }

  await fs.promises.mkdir(path.dirname(temp), { recursive: true });

  try {
    await new Promise((resolve, reject) => {
      const fileStream = fs.createWriteStream(temp);

      // Reject with the underlying error so the failure can be diagnosed.
      // The handlers used to reject with no value at all.
      res.on('error', (error) => {
        fileStream.destroy();
        reject(error);
      });

      fileStream.on('error', reject);
      fileStream.on('finish', () => { resolve(); });

      res.pipe(fileStream);
    });

    await new Upload({
      client: s3,
      params: {
        Bucket: config.s3.bucket,
        Body: fs.createReadStream(temp),
        Key: filepath,
        ContentType: media.mime,
      },
    }).done();
  } finally {
    // Also clean up after a failed download or upload; `force` tolerates a
    // temp file that was never created.
    await fs.promises.rm(temp, { force: true });
  }
}

/**
 * Transfer the full-size file, thumbnail and lazy-load variant of a media
 * item to S3, one after another.
 *
 * @param {object} media - Needs `hash`, `mime`, `s3`, and the `path`,
 *   `thumbnail` and `lazy` source paths.
 * @param {string} target - Destination directory, e.g. 'posters'.
 * @returns {Promise<void>}
 */
async function transferMedia(media, target) {
  for (const type of ['path', 'thumbnail', 'lazy']) {
    await transferMediaVariant(media, target, type);
  }
}
/**
 * Register media items for a release and associate them with it.
 *
 * Items are handled one at a time; empty entries are ignored. A media row is
 * reused when one with the same hash or source already exists. Otherwise a
 * new row is inserted under a fresh nanoid and the files are transferred with
 * transferMedia. The item is then linked in `<release.type>s_<target>`; a
 * failing link insert is logged as a duplicate and ignored.
 *
 * @param {Array<object|null>} medias - Media entries from the export file.
 * @param {object} release - Needs `type`, `id` and `title`.
 * @param {string} target - Media kind, e.g. 'posters', 'photos' or 'covers'.
 * @returns {Promise<void>}
 */
async function addReleaseMedia(medias, release, target) {
  const presentMedias = medias.filter(Boolean);

  for (const media of presentMedias) {
    const storedMedia = await knex('media')
      .where('hash', media.hash)
      .orWhere('source', media.source)
      .first();

    const mediaId = storedMedia?.id || nanoid();

    if (!storedMedia) {
      // Stored files are named after the hash and keep the source extension.
      const hashedName = (variant) => `${media.hash}${path.extname(media[variant])}`;

      await knex('media').insert({
        id: mediaId,
        hash: media.hash,
        path: path.join(target, '', hashedName('path')),
        thumbnail: path.join(target, 'thumbs', hashedName('thumbnail')),
        lazy: path.join(target, 'lazy', hashedName('lazy')),
        // Always flagged as S3 rather than copying media.s3.
        is_s3: true,
        index: media.index,
        mime: media.mime,
        size: media.size,
        width: media.width,
        height: media.height,
        source: media.source,
        source_page: media.sourcePage,
      });

      await transferMedia(media, target);
    }

    const association = {
      [`${release.type}_id`]: release.id,
      media_id: mediaId,
    };

    try {
      await knex(`${release.type}s_${target}`).insert(association);
    } catch (error) {
      console.warn(`Ignored duplicate ${release.type} ${target} association ${media.hash} with ${release.id} "${release.title}"`);
    }
  }
}
/**
 * Link a scene to the movies it belongs to in `movies_scenes`.
 *
 * Each movie referenced by the scene is matched against `context.movies` on
 * entry id, entity slug and entity type. Links are inserted one at a time.
 *
 * @param {object} release - Needs `id`, `title` and `movies`.
 * @param {object} context - Needs `movies`, the stored movies with their `id`.
 * @returns {Promise<void>}
 * @throws {Error} When a referenced movie is not among `context.movies`.
 */
async function linkMovieScenes(release, context) {
  for (const linkedMovie of release.movies) {
    const isLinkedMovie = (storedMovie) => storedMovie.entryId === linkedMovie.entryId
      && storedMovie.entity.slug === linkedMovie.entity.slug
      && storedMovie.entity.type === linkedMovie.entity.type;

    const storedMatch = context.movies.find(isLinkedMovie);

    if (!storedMatch) {
      throw new Error(`Missing ${linkedMovie.entity.slug} movie '${linkedMovie.title}' in '${release.title}'`);
    }

    await knex('movies_scenes').insert({
      movie_id: storedMatch.id,
      scene_id: release.id,
    });
  }
}
/**
 * Import one exported entry (scene or movie) together with its relations.
 *
 * `release.type` is 'release' for scenes and 'movie' for movies, and selects
 * the `releases` or `movies` table. An entry whose entry id and entity are
 * already in that table is skipped. Otherwise the row is inserted, the poster
 * is added, and then tags, actors, directors, chapters, movie links and
 * photos (scenes) or covers (movies) are added.
 *
 * @param {object} release - Entry parsed from the export file.
 * @param {object} context - Needs `batchId` and `studioIdsBySlug`; scenes
 *   also use `tagIdsBySlug` and `movies` through the helpers.
 * @returns {Promise<object>} The entry extended with `id` and `entityName`,
 *   plus `skipped: true` when it already existed.
 * @throws {Error} When the entry's entity does not exist.
 */
async function addRelease(release, context) {
  const table = `${release.type}s`;
  // Exported scenes carry type 'release' (see the `releases` table name
  // above). This used to test for 'scene', which never matched, so the
  // scene-only columns below were silently dropped on import.
  const isScene = release.type === 'release';

  const existingRelease = await knex(table)
    .select(`${table}.*`, 'entities.name as entity_name')
    .leftJoin('entities', 'entities.id', `${table}.entity_id`)
    .where('entry_id', release.entryId)
    .where('entities.slug', release.entity.slug)
    .where('entities.type', release.entity.type)
    .first();

  if (existingRelease) {
    console.log(`Skipping ${release.entity.slug} release "${release.title}", already in database`);

    return {
      ...release,
      skipped: true,
      id: existingRelease.id,
      entityName: existingRelease.entity_name,
    };
  }

  const entity = await knex('entities')
    .select(['id', 'name'])
    .where(release.entity)
    .first();

  if (!entity) {
    throw new Error(`Release "${release.title}" contains non-existent ${release.entity.type} '${release.entity.slug}'`);
  }

  const [releaseEntry] = await knex(table)
    .insert({
      entry_id: release.entryId,
      entity_id: entity.id,
      url: release.url,
      title: release.title,
      slug: release.slug,
      date: release.date,
      date_precision: release.datePrecision,
      created_batch_id: context.batchId,
      updated_batch_id: context.batchId,
      ...(isScene && {
        shoot_id: release.shootId,
        studio_id: context.studioIdsBySlug[release.studio],
        production_date: release.productionDate,
        description: release.description,
        duration: release.duration,
      }),
    })
    .returning(['id', 'entry_id']);

  const releaseWithId = {
    ...release,
    id: releaseEntry.id,
    entityName: entity.name,
  };

  await addReleaseMedia([releaseWithId.poster], releaseWithId, 'posters', context);

  if (isScene) {
    await Promise.all([
      addReleaseTags(releaseWithId, context),
      addReleaseActors(releaseWithId, context),
      addReleaseDirectors(releaseWithId, context),
      addReleaseChapters(releaseWithId, context),
      linkMovieScenes(releaseWithId, context),
      addReleaseMedia(releaseWithId.photos, releaseWithId, 'photos', context),
    ]);
  }

  if (release.type === 'movie') {
    await addReleaseMedia(releaseWithId.covers, releaseWithId, 'covers', context);
  }

  return releaseWithId;
}
/**
 * Import the entries of an export file into the database.
 *
 * Reads `args.file` as newline-delimited JSON, optionally keeps only entries
 * of `args.entity`, and takes the window given by `args.start` and
 * `args.limit`. A batch row is created, the tag and studio ids needed by the
 * window are resolved once, and then movies are imported before scenes so
 * that scenes can be linked to them. Entries are imported one at a time.
 * The process exits after the import; when the window is empty the function
 * just returns.
 */
async function load() {
  const file = await fs.promises.readFile(args.file, 'utf8');

  const start = args.start || 0;
  const end = args.limit ? start + args.limit : Infinity;

  const releases = file.split('\n')
    .filter(Boolean)
    .map((data) => JSON.parse(data))
    .filter((release) => (args.entity ? release.entity.slug === args.entity : true))
    .slice(start, end);

  if (releases.length === 0) {
    console.log('Nothing to load');
    return;
  }

  const movieEntries = releases.filter((release) => release.type === 'movie');
  const sceneEntries = releases.filter((release) => release.type === 'release');

  const [{ id: batchId }] = await knex('batches').insert({ comment: `import ${args.file}` }).returning('id');

  const aggTags = Array.from(new Set(sceneEntries
    .flatMap((release) => [...release.tags, ...release.chapters.flatMap((chapter) => chapter.tags)])
    .filter(Boolean)));

  const aggStudios = Array.from(new Set(releases.map((release) => release.studio).filter(Boolean)));

  const tags = await knex('tags')
    .select('id', 'slug')
    .whereIn('slug', aggTags);

  const studios = await knex('entities')
    .select('id', 'slug')
    .where('type', 'studio')
    .whereIn('slug', aggStudios);

  const tagIdsBySlug = Object.fromEntries(tags.map((tag) => [tag.slug, tag.id]));
  const studioIdsBySlug = Object.fromEntries(studios.map((studio) => [studio.slug, studio.id]));

  const addedMovies = [];

  for (const [index, release] of movieEntries.entries()) {
    const movie = await addRelease(release, { batchId, tagIdsBySlug, studioIdsBySlug });

    if (!movie.skipped) {
      // Progress is 1-based so the last entry reads N/N instead of N-1/N.
      console.log(`Loaded ${index + 1}/${movieEntries.length} '${movie.entityName}' movie "${movie.title}"`);
    }

    addedMovies.push(movie);
  }

  // Scene objects are collected, like the movies above. Booleans used to be
  // collected here and were then filtered as if they were release objects.
  const addedScenes = [];

  for (const [index, release] of sceneEntries.entries()) {
    const scene = await addRelease(release, { batchId, movies: addedMovies, tagIdsBySlug, studioIdsBySlug });

    if (!scene.skipped) {
      console.log(`Loaded ${index + 1}/${sceneEntries.length} '${scene.entityName}' scene "${scene.title}"`);
    }

    addedScenes.push(scene);
  }

  console.log(`Loaded ${addedMovies.filter((movie) => movie && !movie.skipped).length}/${movieEntries.length} movies in batch ${batchId}`);
  console.log(`Loaded ${addedScenes.filter((scene) => scene && !scene.skipped).length}/${sceneEntries.length} scenes in batch ${batchId}`);

  process.exit();
}
// Entry point: look up the sub-command named by args._ ('save' or 'load')
// and run it. Any other value leaves the lookup undefined, so the call throws
// a TypeError.
// NOTE(review): args._ is presumably the positional-argument array, turned
// into a string by the property lookup, so only a single positional argument
// resolves to a command — confirm against ../argv.
({
save,
load,
})[args._]();

View File

@@ -3,9 +3,11 @@
const config = require('config');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const undici = require('undici');
const fs = require('fs').promises;
const util = require('util');
const stream = require('stream');
// const util = require('util');
// const stream = require('stream');
const { pipeline } = require('stream/promises');
const tunnel = require('tunnel');
const Bottleneck = require('bottleneck');
const { JSDOM, toughCookie } = require('jsdom');
@@ -18,7 +20,7 @@ const logger = require('../logger')(__filename);
const virtualConsole = require('./virtual-console')(__filename);
const argv = require('../argv');
const pipeline = util.promisify(stream.pipeline);
// const pipeline = util.promisify(stream.pipeline);
const limiters = {
bypass: new Bottleneck({
@@ -47,13 +49,6 @@ const defaultOptions = {
},
};
const proxyAgent = tunnel.httpsOverHttp({
proxy: {
host: config.proxy.host,
port: config.proxy.port,
},
});
function useProxy(url) {
if (!config.proxy.enable) {
return false;
@@ -326,87 +321,103 @@ async function bypassCloudflareRequest(url, method, body, cloudflareBypass, opti
};
}
// Shared undici dispatcher for direct (non-proxied) requests, with HTTP/2 enabled.
// NOTE(review): rejectUnauthorized: false turns off TLS certificate verification for every
// direct request; presumably deliberate for hosts with broken certificates — confirm.
const defaultAgent = new undici.Agent({
	allowH2: true,
	connect: {
		rejectUnauthorized: false,
	},
});

// Legacy `tunnel` agent, retained in case other code in this module still references it.
// It is a Node http(s) agent, not an undici dispatcher, so it must never be handed to undici.
const proxyAgent = tunnel.httpsOverHttp({
	proxy: {
		host: config.proxy.host,
		port: config.proxy.port,
	},
});

// Status codes that denote an actual redirect; a Location header on any other status is informational.
const REDIRECT_STATUS_CODES = new Set([301, 302, 303, 307, 308]);
const MAX_REDIRECTS = 3;

let proxyDispatcher = null;

/**
 * Lazily build the undici dispatcher used for proxied requests.
 * Created on first use so that an unconfigured proxy (missing host/port) cannot break module load.
 *
 * @returns {undici.ProxyAgent} dispatcher routing requests through the configured HTTP proxy
 */
function getProxyDispatcher() {
	proxyDispatcher ??= new undici.ProxyAgent({
		uri: `http://${config.proxy.host}:${config.proxy.port}`,
	});

	return proxyDispatcher;
}

/**
 * Return a copy of the request options without headers that describe a request body.
 * Used when a redirect downgrades the request to a body-less GET.
 *
 * @param {object} requestOptions - options as passed to request()
 * @returns {object} shallow copy with content-type/-length/-encoding headers removed
 */
function withoutBodyHeaders(requestOptions) {
	const headers = Object.fromEntries(Object.entries(requestOptions.headers || {})
		.filter(([name]) => !/^content-(type|length|encoding)$/i.test(name)));

	return { ...requestOptions, headers };
}

/**
 * Perform a single HTTP request through undici, or through one of the bypass helpers when the
 * URL is configured for it, following up to MAX_REDIRECTS redirects manually.
 *
 * @param {string} [method='get'] - lowercase HTTP method name
 * @param {string} url - absolute URL to request
 * @param {*} [body] - request body, passed to undici as-is (NOTE(review): undici does not accept
 *   plain objects as body; confirm callers serialize beforehand)
 * @param {object} [requestOptions={}] - per-request options; `headers` is forwarded to undici
 * @param {object} limiter - Bottleneck limiter, only inspected here for debug logging
 * @param {number} [redirects=0] - number of redirects already followed (internal)
 * @returns {Promise<object>} the undici response, or whatever the bypass helper returns
 * @throws {Error} when the browser bypass is selected for a non-GET request
 */
async function request(method = 'get', url, body, requestOptions = {}, limiter, redirects = 0) {
	const withProxy = useProxy(url);
	const withBrowserBypass = useBrowserBypass(url, requestOptions);
	const withCloudflareBypass = useCloudflareBypass(url, requestOptions);

	logger.debug(`${redirects > 0 ? 'REDIRECT ' : ''}${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}${withBrowserBypass || withCloudflareBypass ? ' bypass' : ''}) ${url}`);

	if (withBrowserBypass) {
		if (method !== 'get') {
			throw new Error('Browser bypass only supports GET');
		}

		return bypassBrowserRequest(url, requestOptions);
	}

	if (withCloudflareBypass) {
		return bypassCloudflareRequest(url, method, body, withCloudflareBypass, requestOptions);
	}

	const headers = {
		...requestOptions.headers,
	};

	const res = await undici.request(url, {
		method: method.toUpperCase(),
		headers,
		body: body ?? null,
		// undici needs one of its own dispatchers here; the tunnel agent has no dispatch()
		dispatcher: withProxy
			? getProxyDispatcher()
			: defaultAgent,
		maxRedirections: 0, // handle manually
	});

	const isRedirect = REDIRECT_STATUS_CODES.has(res.statusCode) && Boolean(res.headers.location);

	if (!isRedirect || redirects >= MAX_REDIRECTS) {
		return res;
	}

	// Drain the body to free the socket before redirecting
	await res.body.dump();

	const nextUrl = new URL(res.headers.location, url).href;
	const currentMethod = method.toLowerCase();

	// Same method rewriting as the Fetch standard: 303 after anything but GET/HEAD, and 301/302
	// after POST, continue as a body-less GET instead of replaying the original payload.
	const switchToGet = (res.statusCode === 303 && !['get', 'head'].includes(currentMethod))
		|| ([301, 302].includes(res.statusCode) && currentMethod === 'post');

	if (switchToGet) {
		return request('get', nextUrl, null, withoutBodyHeaders(requestOptions), limiter, redirects + 1);
	}

	return request(method, nextUrl, body, requestOptions, limiter, redirects + 1);
}
/**
 * Read a response body into a string, whatever form it arrived in.
 *
 * undici responses expose a body with an arrayBuffer() reader; a Buffer or string body is also
 * accepted (the pre-undici code handled Buffer bodies, and request() returns the bypass helpers'
 * results directly — NOTE(review): confirm the body shape those helpers return).
 *
 * @param {*} body - response body
 * @returns {Promise<string>} decoded body text, empty string when there is no body
 * @throws {TypeError} when the body is of an unsupported type
 */
async function readBodyAsString(body) {
	if (body === null || body === undefined) {
		return '';
	}

	if (typeof body === 'string') {
		return body;
	}

	if (Buffer.isBuffer(body)) {
		return body.toString();
	}

	if (typeof body.arrayBuffer === 'function') {
		return Buffer.from(await body.arrayBuffer()).toString();
	}

	throw new TypeError(`Unsupported response body type: ${Object.prototype.toString.call(body)}`);
}

/**
 * Turn a raw response into the result object handed back to callers.
 *
 * With `options.destination` the body is streamed through `options.transforms` into the
 * destination and no text or DOM is produced. Otherwise the body is read as text, optionally
 * parsed with JSDOM (`options.parse`), and optionally written to ./html (`argv.saveHtml`).
 *
 * @param {object} res - response with `statusCode`, `headers` and `body`
 * @param {string} url - requested URL, used to derive the key for the window cache and HTML dump
 * @param {object} [options={}] - `destination`, `transforms`, `parse`, `extract`
 * @returns {Promise<object>} response fields plus `status`, `ok` and, when the body was read,
 *   `body`/`html` (string), `window` and `document` (null unless parsed)
 */
async function finalizeResult(res, url, options = {}) {
	const status = res.statusCode;
	const ok = status >= 200 && status <= 299;

	if (options.destination) {
		await pipeline(
			res.body,
			...(options.transforms || []),
			options.destination,
		);

		return {
			...res,
			status,
			statusCode: status,
			headers: res.headers,
			ok,
		};
	}

	const html = await readBodyAsString(res.body);
	const window = options.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null;
	const pathname = new URL(url).pathname.replace(/\//g, '_');

	// allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper
	if (window && /fetchScene|fetchMovie/.test(new Error().stack)) {
		windows.set(pathname, window);
	}

	if (argv.saveHtml) {
		await fs.writeFile(`./html/${pathname}.html`, html);
	}

	return {
		...res,
		statusCode: status,
		status,
		headers: res.headers,
		body: html,
		html,
		document: window?.document || null,
		window,
		ok,
	};
}
function getTimeout(options, url) {
return new Promise((resolve, reject, onCancel) => {
return new Promise((_resolve, reject, onCancel) => {
const timeout = setTimeout(() => {
logger.debug(`Canceled timed out request to ${url}`);
reject(new Error(`URL ${url} timed out`));
@@ -441,7 +452,7 @@ async function scheduleRequest(method = 'get', url, body, requestOptions = {}) {
timeout.cancel();
const curatedResult = await finalizeResult(result, options);
const curatedResult = await finalizeResult(result, url, options);
logger.silly(`Response ${curatedResult.status} for ${method.toUpperCase()} ${url}`);