diff --git a/package-lock.json b/package-lock.json index 6426e066..7bbcfe4c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -93,7 +93,7 @@ "tough-cookie": "^4.1.3", "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", - "undici": "^5.28.1", + "undici": "^7.24.7", "unprint": "^0.19.13", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", @@ -3064,14 +3064,6 @@ "npm": ">=6.14.13" } }, - "node_modules/@fastify/busboy": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/@fastify/busboy/-/busboy-2.1.0.tgz", - "integrity": "sha512-+KpH+QxZU7O4675t3mnkQKcZZg56u+K/Ct2K+N2AZYNVK8kyeo/bI18tI8aPm3tvNNRyTWfj6s5tnGNlcbQRsA==", - "engines": { - "node": ">=14" - } - }, "node_modules/@gar/promisify": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/@gar/promisify/-/promisify-1.1.3.tgz", @@ -20576,14 +20568,11 @@ } }, "node_modules/undici": { - "version": "5.28.1", - "resolved": "https://registry.npmjs.org/undici/-/undici-5.28.1.tgz", - "integrity": "sha512-xcIIvj1LOQH9zAL54iWFkuDEaIVEjLrru7qRpa3GrEEHk6OBhb/LycuUY2m7VCcTuDeLziXCxobQVyKExyGeIA==", - "dependencies": { - "@fastify/busboy": "^2.0.0" - }, + "version": "7.24.7", + "resolved": "https://registry.npmjs.org/undici/-/undici-7.24.7.tgz", + "integrity": "sha512-H/nlJ/h0ggGC+uRL3ovD+G0i4bqhvsDOpbDv7At5eFLlj2b41L8QliGbnl2H7SnDiYhENphh1tQFJZf+MyfLsQ==", "engines": { - "node": ">=14.0" + "node": ">=20.18.1" } }, "node_modules/undici-types": { @@ -21305,14 +21294,6 @@ "node": ">= 0.6" } }, - "node_modules/unprint/node_modules/undici": { - "version": "7.18.2", - "resolved": "https://registry.npmjs.org/undici/-/undici-7.18.2.tgz", - "integrity": "sha512-y+8YjDFzWdQlSE9N5nzKMT3g4a5UBX1HKowfdXh0uvAnTaqqwqB92Jt4UXBAeKekDs5IaDKyJFR4X1gYVCgXcw==", - "engines": { - "node": ">=20.18.1" - } - }, "node_modules/unprint/node_modules/w3c-xmlserializer": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-2.0.0.tgz", diff --git a/package.json b/package.json index c6d31b4e..11438b45 100755 --- a/package.json +++ b/package.json @@ -152,7 +152,7 @@ "tough-cookie": "^4.1.3", "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", - "undici": "^5.28.1", + "undici": "^7.24.7", "unprint": "^0.19.13", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", diff --git a/seeds/01_networks.js b/seeds/01_networks.js index 4c90d5d2..af10f3b6 100755 --- a/seeds/01_networks.js +++ b/seeds/01_networks.js @@ -796,6 +796,9 @@ const networks = [ slug: 'teencoreclub', name: 'Teen Core Club', url: 'https://teencoreclub.com', + parameters: { + studioId: 1624, + }, }, { slug: 'teenmegaworld', diff --git a/seeds/02_sites.js b/seeds/02_sites.js index fc980516..a6338eec 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -13510,7 +13510,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 178, + legacySiteId: 178, }, }, { @@ -13520,7 +13520,7 @@ const sites = [ parent: 'teencoreclub', hasLogo: false, parameters: { - siteId: 482, + legacySiteId: 482, }, }, { @@ -13537,7 +13537,8 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 180, + legacySiteId: 180, + siteId: 17, }, }, { @@ -13552,7 +13553,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 182, + legacySiteId: 182, }, }, { @@ -13564,7 +13565,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 184, + legacySiteId: 184, }, }, { @@ -13579,7 +13580,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 362, + legacySiteId: 362, }, }, { @@ -13591,7 +13592,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 278, + legacySiteId: 278, }, }, { @@ -13608,7 +13609,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 186, + legacySiteId: 186, }, }, { @@ -13620,7 +13621,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 280, + legacySiteId: 280, }, }, { @@ -13632,7 +13633,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 282, + legacySiteId: 282, }, }, { @@ -13644,7 +13645,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 188, + legacySiteId: 188, }, }, { @@ -13656,7 +13657,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 284, + legacySiteId: 284, }, }, { @@ -13672,7 +13673,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 190, + legacySiteId: 190, }, }, { @@ -13689,7 +13690,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 192, + legacySiteId: 192, }, }, { @@ -13706,7 +13707,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 288, + legacySiteId: 288, }, }, { @@ -13718,7 +13719,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 290, + legacySiteId: 290, }, }, { @@ -13733,7 +13734,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 292, + legacySiteId: 292, }, }, { @@ -13745,7 +13746,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 194, + legacySiteId: 194, }, }, { @@ -13757,7 +13758,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 196, + legacySiteId: 196, }, }, { @@ -13769,7 +13770,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 198, + legacySiteId: 198, }, }, { @@ -13784,7 +13785,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 294, + legacySiteId: 294, }, }, { @@ -13795,7 +13796,7 @@ const sites = [ visible: false, hasLogo: false, parameters: { - siteId: 566, + legacySiteId: 566, }, }, { @@ -13815,7 +13816,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 200, + legacySiteId: 200, }, }, { @@ -13830,7 +13831,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 296, + legacySiteId: 296, }, }, { @@ -13842,7 +13843,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 298, + legacySiteId: 298, }, }, { @@ -13857,7 +13858,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 300, + legacySiteId: 300, }, }, { @@ -13872,7 +13873,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 302, + legacySiteId: 302, }, }, { @@ -13888,7 +13889,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 304, + legacySiteId: 304, }, }, { @@ -13903,7 +13904,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 306, + legacySiteId: 306, }, }, { @@ -13915,7 +13916,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 308, + legacySiteId: 308, }, }, { @@ -13926,7 +13927,7 @@ const sites = [ visible: false, hasLogo: false, parameters: { - siteId: 568, + legacySiteId: 568, }, }, { @@ -13937,7 +13938,7 @@ const sites = [ visible: false, hasLogo: false, parameters: { - siteId: 570, + legacySiteId: 570, }, }, { @@ -13950,7 +13951,7 @@ const sites = [ parent: 'teencoreclub', hasLogo: false, parameters: { - siteId: 360, + legacySiteId: 360, }, }, { @@ -13962,7 +13963,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 310, + legacySiteId: 310, }, }, { @@ -13976,7 +13977,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 202, + legacySiteId: 202, }, }, { @@ -13988,7 +13989,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 312, + legacySiteId: 312, }, }, { @@ -14003,7 +14004,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 314, + legacySiteId: 314, }, }, { @@ -14014,7 +14015,7 @@ const sites = [ visible: false, hasLogo: false, parameters: { - siteId: 556, + legacySiteId: 556, }, }, { @@ -14029,7 +14030,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 316, + legacySiteId: 316, }, }, { @@ -14043,7 +14044,7 @@ const sites = [ parent: 'teencoreclub', hasLogo: false, parameters: { - siteId: 418, + legacySiteId: 418, }, }, { @@ -14055,7 +14056,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 318, + legacySiteId: 318, }, }, { @@ -14067,7 +14068,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 204, + legacySiteId: 204, }, }, { @@ -14083,7 +14084,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 320, + legacySiteId: 320, }, }, { @@ -14095,7 +14096,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 322, + legacySiteId: 322, }, }, { @@ -14107,7 +14108,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 324, + legacySiteId: 324, }, }, { @@ -14119,7 +14120,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 366, + legacySiteId: 366, }, }, { @@ -14132,7 +14133,7 @@ const sites = [ parent: 'teencoreclub', hasLogo: false, parameters: { - siteId: 176, + legacySiteId: 176, }, }, { @@ -14147,7 +14148,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 368, + legacySiteId: 368, }, }, { @@ -14162,7 +14163,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 326, + legacySiteId: 326, }, }, { @@ -14180,7 +14181,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 206, + legacySiteId: 206, }, }, { @@ -14195,7 +14196,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 208, + legacySiteId: 208, }, }, { @@ -14210,7 +14211,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 210, + legacySiteId: 210, }, }, { @@ -14227,7 +14228,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 328, + legacySiteId: 328, }, }, { @@ -14242,7 +14243,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 212, + legacySiteId: 212, }, }, { @@ -14258,7 +14259,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 330, + legacySiteId: 330, }, }, { @@ -14275,7 +14276,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 214, + legacySiteId: 214, }, }, { @@ -14291,7 +14292,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 332, + legacySiteId: 332, }, }, { @@ -14310,7 +14311,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 216, + legacySiteId: 216, }, }, { @@ -14325,7 +14326,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 334, + legacySiteId: 334, }, }, { @@ -14336,7 +14337,7 @@ const sites = [ visible: false, hasLogo: false, parameters: { - siteId: 558, + legacySiteId: 558, }, }, { @@ -14352,7 +14353,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 336, + legacySiteId: 336, }, }, { @@ -14369,7 +14370,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 218, + legacySiteId: 218, }, }, { @@ -14386,7 +14387,7 @@ const sites = [ ], parent: 'teencoreclub', parameters: { - siteId: 220, + legacySiteId: 220, }, }, /* TCC VOD services and unused brands diff --git a/src/media.js b/src/media.js index 491a668e..2c86e5b6 100755 --- a/src/media.js +++ b/src/media.js @@ -651,9 +651,10 @@ async function fetchHttpSource(source, tempFileTarget, hashStream) { const res = await http.get(source.src, { limits: 'media', headers: { - host: new URL(source.src).hostname, + // explicit host not allowed in HTTP/2 + // host: new URL(source.src).hostname, + // ...(source.host && { host: source.host }), ...(source.referer && { referer: source.referer }), - ...(source.host && { host: source.host }), }, stream: true, // sources are fetched in parallel, don't gobble up memory followRedirects: source.followRedirects, diff --git a/src/scrapers/darkkotv.js b/src/scrapers/darkkotv.js index fe1e93fa..38f948dc 100755 --- a/src/scrapers/darkkotv.js +++ b/src/scrapers/darkkotv.js @@ -85,7 +85,7 @@ async function scrapeScene({ query: pageQuery, html }, { url, entity, include }) } if (include.photos && capsUrl) { - release.caps = await fetchCaps(capsUrl); + release.caps = await fetchCaps(capsUrl, entity); } release.trailer = pageQuery.video('#download_select option[value*=".mp4"]', { attribute: 'value' }); diff --git a/src/scrapers/teencoreclub.js b/src/scrapers/teencoreclub.js index 540aeb23..217bf0f8 100755 --- a/src/scrapers/teencoreclub.js +++ b/src/scrapers/teencoreclub.js @@ -1,155 +1,84 @@ 'use strict'; -const moment = require('moment'); +const unprint = require('unprint'); -const logger = require('../logger')(__filename); -const http = require('../utils/http'); -const qu = require('../utils/qu'); const slugify = require('../utils/slugify'); -const { prefixUrl } = require('../utils/qu'); -function scrapeAll(scenes, entity) { - return scenes.map((scene) => { +function scrapeAll(scenes) { + return scenes.map(({ query }) => { const release = {}; - release.entryId = scene.id; - release.url = `${new URL(entity.url).origin}/video/${scene.id}/${scene.slug}`; + release.url = query.url('.title a'); + release.entryId = new URL(release.url).pathname.match(/\/scene\/(\d+)/)[1]; - if (/bic/i.test(scene.title)) { - release.shootId = scene.title.toUpperCase().replace('-', '_'); - } else { - release.title = scene.title; - } + release.title = query.content('.title a'); - release.description = scene.description; - release.date = moment.utc(scene.year, 'YYYY').toDate(); - release.datePrecision = 'year'; + release.date = query.date('.date', 'MMM DD, YYYY'); + release.duration = query.duration('.duration'); - release.actors = scene.actors.map((actor) => ({ - name: actor.name.trim(), - avatar: actor.image || null, - })).filter((actor) => actor.name && slugify(actor.name) !== 'amateur-girl'); + release.actors = query.all('.models a.model').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null), + })); - release.duration = scene.duration; - release.stars = scene.video_rating_score; + release.poster = query.img('img.poster'); + release.teaser = query.video('.teaser video'); - [release.poster, ...release.photos] = scene.screenshots.map((url) => prefixUrl(url)); - - if (scene.is_gay) { - release.tags = ['gay']; - } + console.log(release); return release; }); } -async function scrapeScene({ query }, url) { - const release = {}; - const { pathname, origin, host } = new URL(url); - - const entryId = pathname.match(/\/video\/(\d+)/)[1]; - release.entryId = entryId; - - const title = query.meta('name=title'); - - if (/bic/i.test(title)) { - release.shootId = title.toUpperCase().replace('-', '_'); - } else { - release.title = title; - } - - release.date = query.date('.detail-meta li:nth-child(2)', 'YYYY'); - release.datePrecision = 'year'; - - release.description = query.q('.detail-description', true); - release.duration = query.dur('.detail-meta li:first-child'); - - const actors = [query.q('.detail-hero-title h1', true)?.trim()].filter((name) => name && slugify(name) !== 'amateur-girl'); - - if (actors.length > 0) { - release.actors = actors; - } - - release.poster = query.q('.detail-hero').style['background-image'].match(/url\((.+)\)/)[1]; - release.photos = query.imgs('.detail-grabs img'); - - const streamData = await http.get(`${origin}/video/source/${entryId}`, { - headers: { - host, - referer: url, - }, - }, { - interval: 5000, - concurrency: 1, - }); - - if (streamData.ok && streamData.body.status === 'success') { - release.trailer = { - stream: streamData.body.link, - }; - } else { - logger.warn(`Failed to fetch trailer for ${url}: ${streamData.ok ? streamData.body.status : streamData.status }`); - } - - return release; -} - -async function scrapeProfile(actor, entity, include) { - const profile = {}; - - if (actor.image) { - profile.avatar = `https://teencoreclub.com${actor.image}`; - } - - if (include.releases) { - const res = await http.get(`https://teencoreclub.com/browsevideos/api/all?actor=${actor.id}`); - - if (res.ok) { - profile.releases = scrapeAll(res.body.data, entity); - } - } - - return profile; -} - -async function fetchLatest(entity, page = 1) { - // console.log(entity, page); - - if (entity.parameters?.siteId) { - const res = await http.get(`https://teencoreclub.com/browsevideos/api/all?resType=latest&page=${page}&label=${entity.parameters.siteId}`); - - if (res.ok) { - return scrapeAll(res.body.data, entity); - } - - return res.status; - } - - return null; -} - -async function fetchScene(url, entity) { - const { pathname } = new URL(url); - const res = await qu.get(`https://teencoreclub.com${pathname}`); +async function fetchLatest(channel, page = 1) { + const url = `${channel.url}/${page}`; + const res = await unprint.get(url, { selectAll: '.scene' }); if (res.ok) { - return scrapeScene(res.item, url, entity); + return scrapeAll(res.context, channel); } return res.status; } -async function fetchProfile({ name: actorName }, { entity }, include) { - const res = await http.get(`https://teencoreclub.com/api/actors?query=${actorName}`); +function scrapeScene({ query }, { url }) { + const release = {}; + + release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)/)[1]; + + release.title = query.content('h3.title'); + release.description = query.content('p.description'); + + release.date = query.date('.date', 'MMMM D, YYYY'); + release.duration = query.duration('.duration'); + + [release.poster, ...release.photos] = query.imgs('.preview-thumb'); + release.trailer = query.video('.trailer video'); + + console.log(release); + + return release; +} + +function scrapeProfile({ query }) { + const profile = {}; + + profile.description = query.content('.bio-text'); + profile.birthPlace = query.content('.birth-place span'); + + profile.avatar = query.img('.actor-photo img'); + + console.log(profile); + + return profile; +} + +async function fetchProfile({ name: actorName }, entity) { + const url = `${entity.url}/actors/${slugify(actorName, '_')}`; + const res = await unprint.get(url); if (res.ok) { - const actor = res.body.data.find((item) => slugify(item.name) === slugify(actorName)); - - if (actor) { - return scrapeProfile(actor, entity, include); - } - - return null; + return scrapeProfile(res.context, entity); } return res.status; @@ -157,6 +86,6 @@ async function fetchProfile({ name: actorName }, { entity }, include) { module.exports = { fetchLatest, - fetchScene, fetchProfile, + scrapeScene, }; diff --git a/src/tools/transfer.js b/src/tools/transfer.js deleted file mode 100644 index e1a5369e..00000000 --- a/src/tools/transfer.js +++ /dev/null @@ -1,655 +0,0 @@ -'use strict'; - -const config = require('config'); -const fs = require('fs'); -const path = require('path'); -const moment = require('moment'); -const Promise = require('bluebird'); -const bhttp = require('bhttp'); -const { nanoid } = require('nanoid/non-secure'); -const { Upload } = require('@aws-sdk/lib-storage'); -const { S3Client } = require('@aws-sdk/client-s3'); - -const { graphql } = require('../web/graphql'); -const knex = require('../knex'); -const args = require('../argv'); - -const s3 = new S3Client({ - region: 'eu-central-1', - endpoint: 'https://s3.eu-central-1.wasabisys.com', - credentials: { - accessKeyId: config.s3.accessKey, - secretAccessKey: config.s3.secretKey, - }, -}); - -// NOT TRANSFERRED, unutilized on old server: production location, availabile qualities, actor alias for, actor entry id, chapter posters, chapter photos - -const sceneFields = ` - entryId - shootId - title - url - date - datePrecision - productionDate - description - duration - entity { - slug - type - } - studio { - slug - } - movies: moviesScenesBySceneId { - movie { - title - entryId - entity { - slug - type - } - } - } - actors: releasesActors { - actor { - name - slug - entryId - entity { - slug - type - } - } - } - directors: releasesDirectors { - director { - name - slug - entryId - entity { - slug - type - } - } - } - tags: releasesTags { - tag { - slug - } - } - chapters(orderBy: TIME_ASC) { - index - time - duration - title - description - tags: chaptersTags { - tag { - slug - } - } - } - poster: releasesPoster { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - photos: releasesPhotos { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - covers: releasesCovers { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - trailer: releasesTrailer { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - teaser: releasesTeaser { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - createdAt -`; - -const movieFields = ` - entryId - title - url - date - datePrecision - entity { - slug - type - } - poster: moviesPoster { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - covers: moviesCovers { - media { - hash - path - thumbnail - lazy - s3: isS3 - mime - index - width - height - size - source - sourcePage - } - } - createdAt -`; - -async function save() { - const limit = args.limit || 1000; - const offset = args.start || 0; - - const { releases } = await graphql(` - query SearchScenes( - $limit: Int = 20 - $offset: Int = 0 - ) { - releases( - first: $limit - offset: $offset - orderBy: DATE_DESC - ) { - ${sceneFields} - } - } - `, { - limit, - offset, - }, 'owner'); - - const { movies } = await graphql(` - query SearchScenes( - $limit: Int = 20 - $offset: Int = 0 - ) { - movies( - first: $limit - offset: $offset - orderBy: DATE_DESC - ) { - ${movieFields} - } - } - `, { - limit, - offset, - }, 'owner'); - - const filename = `export-${offset}-${offset + limit}-${moment().format('YYYY-MM-DD_hh_mm_ss')}.json`; - - let savedScenes = 0; - let savedMovies = 0; - - await releases.reduce(async (chain, release) => { - await chain; - - const entry = JSON.stringify({ - ...release, - type: 'release', - actors: release.actors.filter(Boolean).map(({ actor }) => actor), - directors: release.directors.filter(Boolean).map(({ director }) => director), - studio: release.studio?.slug, - tags: release.tags.map(({ tag }) => tag?.slug).filter(Boolean), - movies: release.movies?.map(({ movie }) => movie) || [], - chapters: release.chapters.filter(Boolean).map((chapter) => ({ - ...chapter, - tags: chapter.tags.map(({ tag }) => tag?.slug).filter(Boolean), - })), - poster: release.poster?.media, - trailer: release.trailer?.media, - teaser: release.teaser?.media, - photos: release.photos.filter(Boolean).map(({ media }) => media), - covers: release.covers.filter(Boolean).map(({ media }) => media), - }); - - await fs.promises.appendFile(filename, `${entry}\n`); - - savedScenes += 1; - }, Promise.resolve()); - - await movies.reduce(async (chain, movie) => { - await chain; - - const entry = JSON.stringify({ - ...movie, - type: 'movie', - poster: movie.poster?.media, - covers: movie.covers.filter(Boolean).map(({ media }) => media), - }); - - await fs.promises.appendFile(filename, `${entry}\n`); - - savedMovies += 1; - }, Promise.resolve()); - - console.log(`Saved ${savedScenes} scenes and ${savedMovies} movies to ${filename}`); - - process.exit(); -} - -async function addReleaseTags(release, context) { - if (release.tags.length === 0) { - return; - } - - await knex('releases_tags').insert(release.tags.map((tag) => ({ - tag_id: context.tagIdsBySlug[tag], - release_id: release.id, - original_tag: tag, - }))); -} - -async function addNewActor(actor, entity, context) { - const [{ id: actorId }] = await knex('actors') - .insert({ - name: actor.name, - slug: actor.slug, - entity_id: entity?.id, - batch_id: context.batchId, - }) - .returning('id'); - - return actorId; -} - -async function addReleaseActors(release, context, target = 'actor') { - await release[`${target}s`].reduce(async (chain, actor) => { - await chain; - - const entity = actor.entity - ? await knex('entities').where(actor.entity).first() - : null; - - if (actor.entity && !entity) { - throw new Error(`Actor ${actor.slug} contains non-existent ${release.entity.type} '${release.entity.slug}'`); - } - - const existingActor = await knex('actors') - .where('slug', actor.slug) - .where((builder) => { - if (entity) { - builder.where('entity_id', entity.id); - return; - } - - builder.whereNull('entity_id'); - }) - .first(); - - const actorId = existingActor?.id - || await addNewActor(actor, entity, context); - - await knex(`releases_${target}s`).insert({ - release_id: release.id, - [`${target}_id`]: actorId, - }); - }, Promise.resolve()); -} - -async function addReleaseDirectors(release, context) { - return addReleaseActors(release, context, 'director'); -} - -async function addReleaseChapters(release, context) { - await release.chapters.reduce(async (chain, chapter) => { - await chain; - - const [{ id: chapterId }] = await knex('chapters') - .insert({ - release_id: release.id, - index: chapter.index, - time: chapter.time, - duration: chapter.duration, - description: chapter.description, - }) - .returning('id'); - - if (chapter.tags.length > 0) { - await knex('chapters_tags').insert(chapter.tags.map((tag) => ({ - tag_id: context.tagIdsBySlug[tag], - chapter_id: chapterId, - original_tag: tag, - }))); - } - }, Promise.resolve()); -} - -const dirs = { - path: '', - thumbnail: 'thumbs', - lazy: 'lazy', -}; - -async function transferMedia(media, target) { - return ['path', 'thumbnail', 'lazy'].reduce(async (chain, type) => { - await chain; - - const filename = `${media.hash}${path.extname(media[type])}`; - const filepath = path.join(target, dirs[type], filename); - const temp = path.join('media/temp', filepath); - const url = new URL(media[type], `${media.s3 ? config.media.transferSources.s3 : config.media.transferSources.local}/`).href; - - if (args.logLevel === 'debug') { - console.log('Transferring media', url); - } - - const res = await bhttp.get(url, { stream: true }); - - if (res.statusCode !== 200) { - console.warn(`Missing ${target} ${url}`); - return; - } - - await fs.promises.mkdir(path.dirname(temp), { recursive: true }); - - await new Promise((resolve, reject) => { - const fileStream = fs.createWriteStream(temp); - - res.pipe(fileStream); - - res.on('error', () => { reject(); }); - - fileStream.on('finish', () => { resolve(); }); - fileStream.on('error', () => { reject(); }); - }); - - await new Upload({ - client: s3, - params: { - Bucket: config.s3.bucket, - Body: fs.createReadStream(temp), - Key: filepath, - ContentType: media.mime, - }, - }).done(); - - await fs.promises.unlink(temp); - }, Promise.resolve()); -} - -async function addReleaseMedia(medias, release, target) { - await medias.filter(Boolean).reduce(async (chain, media) => { - await chain; - - const existingMedia = await knex('media') - .where('hash', media.hash) - .orWhere('source', media.source) - .first(); - - const id = existingMedia?.id || nanoid(); - - if (!existingMedia) { - await knex('media').insert({ - id, - hash: media.hash, - path: path.join(target, '', `${media.hash}${path.extname(media.path)}`), - thumbnail: path.join(target, 'thumbs', `${media.hash}${path.extname(media.thumbnail)}`), - lazy: path.join(target, 'lazy', `${media.hash}${path.extname(media.lazy)}`), - // is_s3: media.s3, - is_s3: true, - index: media.index, - mime: media.mime, - size: media.size, - width: media.width, - height: media.height, - source: media.source, - source_page: media.sourcePage, - }); - - await transferMedia(media, target); - } - - try { - await knex(`${release.type}s_${target}`).insert({ - [`${release.type}_id`]: release.id, - media_id: id, - }); - } catch (error) { - console.warn(`Ignored duplicate ${release.type} ${target} association ${media.hash} with ${release.id} "${release.title}"`); - } - }, Promise.resolve()); -} - -async function linkMovieScenes(release, context) { - await release.movies.reduce(async (chain, linkedMovie) => { - await chain; - - const movie = context.movies.find((storedMovie) => storedMovie.entryId === linkedMovie.entryId - && storedMovie.entity.slug === linkedMovie.entity.slug - && storedMovie.entity.type === linkedMovie.entity.type); - - if (!movie) { - throw new Error(`Missing ${linkedMovie.entity.slug} movie '${linkedMovie.title}' in '${release.title}'`); - } - - await knex('movies_scenes').insert({ - movie_id: movie.id, - scene_id: release.id, - }); - }, Promise.resolve()); -} - -async function addRelease(release, context) { - const existingRelease = await knex(`${release.type}s`) - .select(`${release.type}s.*`, 'entities.name as entity_name') - .leftJoin('entities', 'entities.id', `${release.type}s.entity_id`) - .where('entry_id', release.entryId) - .where('entities.slug', release.entity.slug) - .where('entities.type', release.entity.type) - .first(); - - if (existingRelease) { - console.log(`Skipping ${release.entity.slug} release "${release.title}", already in database`); - - return { - ...release, - skipped: true, - id: existingRelease.id, - entityName: existingRelease.entity_name, - }; - } - - const [entity] = await Promise.all([ - knex('entities').select(['id', 'name']).where(release.entity).first(), - ]); - - if (!entity) { - throw new Error(`Release "${release.title}" contains non-existent ${release.entity.type} '${release.entity.slug}'`); - } - - const [releaseEntry] = await knex(`${release.type}s`) - .insert({ - entry_id: release.entryId, - entity_id: entity.id, - url: release.url, - title: release.title, - slug: release.slug, - date: release.date, - date_precision: release.datePrecision, - created_batch_id: context.batchId, - updated_batch_id: context.batchId, - ...(release.type === 'scene' && { - shoot_id: release.shootId, - studio_id: context.studioIdsBySlug[release.studio], - production_date: release.productionDate, - description: release.description, - duration: release.duration, - }), - }) - .returning(['id', 'entry_id']); - - const releaseWithId = { - ...release, - id: releaseEntry.id, - entityName: entity.name, - }; - - await addReleaseMedia([releaseWithId.poster], releaseWithId, 'posters', context); - - if (release.type === 'release') { - await Promise.all([ - addReleaseTags(releaseWithId, context), - addReleaseActors(releaseWithId, context), - addReleaseDirectors(releaseWithId, context), - addReleaseChapters(releaseWithId, context), - linkMovieScenes(releaseWithId, context), - addReleaseMedia(releaseWithId.photos, releaseWithId, 'photos', context), - ]); - } - - if (release.type === 'movie') { - await addReleaseMedia(releaseWithId.covers, releaseWithId, 'covers', context); - } - - return releaseWithId; -} - -async function load() { - const file = await fs.promises.readFile(args.file, 'utf8'); - const start = args.start || 0; - const end = args.limit ? start + args.limit : Infinity; - - const releases = file.split('\n') - .filter(Boolean) - .map((data) => JSON.parse(data)) - .filter((release) => (args.entity ? release.entity.slug === args.entity : true)) - .slice(start, end); - - if (releases.length === 0) { - console.log('Nothing to load'); - return; - } - - const [{ id: batchId }] = await knex('batches').insert({ comment: `import ${args.file}` }).returning('id'); - - const aggTags = Array.from(new Set(releases.filter((release) => release.type === 'release').flatMap((release) => [...release.tags, ...release.chapters.flatMap((chapter) => chapter.tags)]).filter(Boolean))); - const aggStudios = Array.from(new Set(releases.map((release) => release.studio).filter(Boolean))); - - const tags = await knex('tags') - .select('id', 'slug') - .whereIn('slug', aggTags); - - const studios = await knex('entities') - .select('id', 'slug') - .where('type', 'studio') - .whereIn('slug', aggStudios); - - const tagIdsBySlug = Object.fromEntries(tags.map((tag) => [tag.slug, tag.id])); - const studioIdsBySlug = Object.fromEntries(studios.map((studio) => [studio.slug, studio.id])); - - const addedMovies = await releases.filter((release) => release.type === 'movie').reduce(async (chain, release, index, array) => { - const acc = await chain; - const movie = await addRelease(release, { batchId, tagIdsBySlug, studioIdsBySlug }); - - if (!movie.skipped) { - console.log(`Loaded ${index}/${array.length} '${movie.entityName}' movie "${movie.title}"`); - } - - return acc.concat(movie); - }, Promise.resolve([])); - - const addedScenes = await releases.filter((release) => release.type === 'release').reduce(async (chain, release, index, array) => { - const acc = await chain; - const scene = await addRelease(release, { batchId, movies: addedMovies, tagIdsBySlug, studioIdsBySlug }); - - if (!scene.skipped) { - console.log(`Loaded ${index}/${array.length} '${scene.entityName}' scene "${scene.title}"`); - } - - return acc.concat((!!scene && !scene.skipped)); - }, Promise.resolve([])); - - console.log(`Loaded ${addedMovies.filter((movie) => movie && !movie.skipped).length}/${releases.filter((release) => release.type === 'movie').length} movies in batch ${batchId}`); - console.log(`Loaded ${addedScenes.filter((scene) => scene && !scene.skipped).length}/${releases.filter((release) => release.type === 'release').length} scenes in batch ${batchId}`); - - process.exit(); -} - -({ - save, - load, -})[args._](); diff --git a/src/utils/http.js b/src/utils/http.js index b913879b..a2ba75ba 100755 --- a/src/utils/http.js +++ b/src/utils/http.js @@ -3,9 +3,11 @@ const config = require('config'); const Promise = require('bluebird'); const bhttp = require('bhttp'); +const undici = require('undici'); const fs = require('fs').promises; -const util = require('util'); -const stream = require('stream'); +// const util = require('util'); +// const stream = require('stream'); +const { pipeline } = require('stream/promises'); const tunnel = require('tunnel'); const Bottleneck = require('bottleneck'); const { JSDOM, toughCookie } = require('jsdom'); @@ -18,7 +20,7 @@ const logger = require('../logger')(__filename); const virtualConsole = require('./virtual-console')(__filename); const argv = require('../argv'); -const pipeline = util.promisify(stream.pipeline); +// const pipeline = util.promisify(stream.pipeline); const limiters = { bypass: new Bottleneck({ @@ -47,13 +49,6 @@ const defaultOptions = { }, }; -const proxyAgent = tunnel.httpsOverHttp({ - proxy: { - host: config.proxy.host, - port: config.proxy.port, - }, -}); - function useProxy(url) { if (!config.proxy.enable) { return false; @@ -326,87 +321,103 @@ async function bypassCloudflareRequest(url, method, body, cloudflareBypass, opti }; } -async function request(method = 'get', url, body, requestOptions = {}, limiter) { - const http = requestOptions.session || bhttp; +const defaultAgent = new undici.Agent({ + allowH2: true, + connect: { + rejectUnauthorized: false, + }, +}); - const options = { - ...requestOptions, - session: null, - }; +const proxyAgent = tunnel.httpsOverHttp({ + proxy: { + host: config.proxy.host, + port: config.proxy.port, + }, +}); +async function request(method = 'get', url, body, requestOptions = {}, limiter, redirects = 0) { const withProxy = useProxy(url); - const withBrowserBypass = useBrowserBypass(url, options); - const withCloudflareBypass = useCloudflareBypass(url, options); + const withBrowserBypass = useBrowserBypass(url, requestOptions); + const withCloudflareBypass = useCloudflareBypass(url, requestOptions); - if (withProxy) { - options.agent = proxyAgent; - } - - logger.debug(`${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}${withBrowserBypass || withCloudflareBypass ? ' bypass' : ''}) ${url}`); + logger.debug(`${redirects > 0 ? 'REDIRECT ' : ''}${method.toUpperCase()} (${limiter._store.storeOptions.minTime}ms/${limiter._store.storeOptions.maxConcurrent}p${withProxy ? ' proxy' : ''}${withBrowserBypass || withCloudflareBypass ? ' bypass' : ''}) ${url}`); if (withBrowserBypass) { - if (method !== 'get') { - throw new Error('Browser bypass only supports GET'); - } - - return bypassBrowserRequest(url, options); + if (method !== 'get') throw new Error('Browser bypass only supports GET'); + return bypassBrowserRequest(url, requestOptions); } if (withCloudflareBypass) { - return bypassCloudflareRequest(url, method, body, withCloudflareBypass, options); + return bypassCloudflareRequest(url, method, body, withCloudflareBypass, requestOptions); } - const res = await (body - ? http[method](url, body, options) - : http[method](url, options)); + const headers = { + ...requestOptions.headers, + }; + + const res = await undici.request(url, { + method: method.toUpperCase(), + headers, + body: body ?? null, + dispatcher: withProxy + ? proxyAgent + : defaultAgent, + maxRedirections: 0, // handle manually + }); + + if (res.headers.location && redirects < 3) { + // Drain the body to free the socket before redirecting + await res.body.dump(); + const nextUrl = new URL(res.headers.location, url).href; + return request(method, nextUrl, body, requestOptions, limiter, redirects + 1); + } return res; } -async function finalizeResult(res, options) { +async function finalizeResult(res, url, options) { if (options.destination) { - // res.on('progress', (bytes, totalBytes) => logger.silly(`Downloaded ${Math.round((bytes / totalBytes) * 100)}% of ${url}`)); - - await pipeline(res, ...(options.transforms || []), options.destination); - } - - if (Buffer.isBuffer(res.body)) { - const html = res.body.toString(); - const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; - const pathname = new URL(res.request.url).pathname.replace(/\//g, '_'); - - // allow window.close to be called after scraping is done, only for deep scrapes where the URL is known outside the scraper - if (window && /fetchScene|fetchMovie/.test(new Error().stack)) { - windows.set(pathname, window); - } - - if (argv.saveHtml) { - await fs.writeFile(`./html/${pathname}.html`, html); - } + await pipeline( + res.body, + ...(options.transforms || []), + options.destination, + ); return { - ...res, - body: html, - html, - status: res.statusCode, + statusCode: res.statusCode, headers: res.headers, - document: window?.document || null, - window, + status: res.statusCode, ok: res.statusCode >= 200 && res.statusCode <= 299, }; } + const buffer = await res.body.arrayBuffer(); + const html = Buffer.from(buffer).toString(); + const window = options?.parse ? new JSDOM(html, { virtualConsole, ...options.extract }).window : null; + const pathname = new URL(url).pathname.replace(/\//g, '_'); + + if (window && /fetchScene|fetchMovie/.test(new Error().stack)) { + windows.set(pathname, window); + } + + if (argv.saveHtml) { + await fs.writeFile(`./html/${pathname}.html`, html); + } + return { - ...res, - body: res.body, + statusCode: res.statusCode, status: res.statusCode, headers: res.headers, + body: html, + html, + document: window?.document || null, + window, ok: res.statusCode >= 200 && res.statusCode <= 299, }; } function getTimeout(options, url) { - return new Promise((resolve, reject, onCancel) => { + return new Promise((_resolve, reject, onCancel) => { const timeout = setTimeout(() => { logger.debug(`Canceled timed out request to ${url}`); reject(new Error(`URL ${url} timed out`)); @@ -441,7 +452,7 @@ async function scheduleRequest(method = 'get', url, body, requestOptions = {}) { timeout.cancel(); - const curatedResult = await finalizeResult(result, options); + const curatedResult = await finalizeResult(result, url, options); logger.silly(`Response ${curatedResult.status} for ${method.toUpperCase()} ${url}`);