22 Commits

Author SHA1 Message Date
DebaucheryLibrarian
430c7e124d 1.214.0 2022-04-04 00:23:40 +02:00
DebaucheryLibrarian
153f28c494 Added last login column to users table. 2022-04-04 00:23:37 +02:00
DebaucheryLibrarian
a586413240 1.213.9 2022-04-03 23:31:38 +02:00
DebaucheryLibrarian
25e0575c2b Fixed description query in Dogfart scraper. 2022-04-03 23:31:36 +02:00
DebaucheryLibrarian
acca75e2b5 1.213.8 2022-04-03 23:00:08 +02:00
DebaucheryLibrarian
5cbf122d6f Scraping Dogfart scenes from native sites. 2022-04-03 23:00:05 +02:00
DebaucheryLibrarian
08df432665 1.213.7 2022-04-03 01:29:18 +02:00
DebaucheryLibrarian
762b3984a3 Ignore join links for trailers in Dogfart scraper. 2022-04-03 01:29:16 +02:00
DebaucheryLibrarian
505ff0767c 1.213.6 2022-04-03 00:53:30 +02:00
DebaucheryLibrarian
9be80e2be9 Returning unextracted scenes from Kelly Madison / Teen Fidelity scraper. Fixed Dogfart profile scraper to use extract scenes. 2022-04-03 00:53:27 +02:00
DebaucheryLibrarian
e202e887f9 1.213.5 2022-04-03 00:49:42 +02:00
DebaucheryLibrarian
574c117ab0 Refactored Dogfart scraper to use qu and return unextracted scenes. 2022-04-03 00:49:39 +02:00
DebaucheryLibrarian
d59a57f311 1.213.4 2022-04-02 00:32:29 +02:00
DebaucheryLibrarian
5e499c3685 Added chunking to media duplicate queries to prevent overloading parameters. Added DP Diva to Perv City (coming soon). 2022-04-02 00:32:23 +02:00
DebaucheryLibrarian
17e5ce71b2 1.213.3 2022-03-31 23:01:56 +02:00
DebaucheryLibrarian
5352186319 Insex not fetching video when not required. 2022-03-31 23:01:54 +02:00
DebaucheryLibrarian
e9ba02d65d 1.213.2 2022-03-31 22:46:56 +02:00
DebaucheryLibrarian
39813d4461 Updated Insex scraper. 2022-03-31 22:46:54 +02:00
DebaucheryLibrarian
829a285a2d 1.213.1 2022-03-31 14:34:12 +02:00
DebaucheryLibrarian
a19a77e165 Optionalized qualities. 2022-03-31 14:34:10 +02:00
DebaucheryLibrarian
122dd3eaee 1.213.0 2022-03-31 14:11:23 +02:00
DebaucheryLibrarian
18b219850e Storing scene qualities. Updated Perv City scraper. 2022-03-31 14:11:13 +02:00
42 changed files with 306 additions and 282 deletions

View File

@@ -203,6 +203,19 @@
</div>
</div>
<!-- Only render the row when at least one quality is known: an empty array is truthy and would leave a dangling label. -->
<div
	v-if="release.qualities && release.qualities.length > 0"
	class="row"
>
	<span class="row-label">Available qualities</span>

	<span
		v-for="quality in release.qualities"
		:key="quality"
		class="quality"
	>{{ quality }}</span>
</div>
<div
v-if="release.comment"
class="row"
@@ -470,6 +483,16 @@ export default {
text-overflow: ellipsis;
}
/* Suffix every quality with 'p' (e.g. 1080p) and comma-separate all but the last one. */
.quality {
	&::after {
		content: 'p, ';
	}

	&:last-child::after {
		/* declarations are terminated with a semicolon; the previous trailing comma made this value malformed */
		content: 'p';
	}
}
.releases {
margin: 0 0 .5rem 0;
}

View File

@@ -12,6 +12,7 @@ export default {
selectableTags: [
'airtight',
'anal',
'bdsm',
'blowbang',
'blowjob',
'creampie',

View File

@@ -367,6 +367,7 @@ const releaseFields = `
date
datePrecision
slug
qualities
shootId
productionDate
comment
@@ -475,6 +476,7 @@ const releaseFragment = `
duration
createdAt
shootId
qualities
productionDate
createdBatchId
productionLocation

View File

@@ -89,6 +89,10 @@ module.exports = {
'uksinners',
// mindgeek
'pornhub',
// insex
'paintoy',
'aganmedon',
'sensualpain',
],
networks: [
// dummy network for testing

View File

@@ -0,0 +1,7 @@
exports.up = async (knex) => knex.schema.alterTable('releases', (table) => {
table.specificType('qualities', 'text[]');
});
exports.down = async (knex) => knex.schema.alterTable('releases', (table) => {
table.dropColumn('qualities');
});

View File

@@ -0,0 +1,7 @@
exports.up = async (knex) => knex.schema.alterTable('users', (table) => {
table.datetime('last_login');
});
exports.down = async (knex) => knex.schema.alterTable('users', (table) => {
table.dropColumn('last_login');
});

View File

@@ -0,0 +1,25 @@
// Creates the `entities_stats` materialized view. The view selects, per row of the
// recursive `relations` CTE, the entity id (as entity_id) and its releases_count; the CTE
// joins entities to their releases and then walks from each entity to its children via parent_id.
//
// NOTE(review): the anchor SELECT yields four columns (id, parent_id, releases_count,
// total_count) while the recursive SELECT yields three (entity_id, releases_count,
// total_count); UNION ALL needs matching column lists, so this probably fails as written — confirm.
// NOTE(review): the recursive SELECT uses count() with GROUP BY and also reads
// relations.total_count outside the GROUP BY; assuming PostgreSQL, aggregates are documented
// as not allowed in the recursive term — verify before relying on this migration.
// NOTE(review): total_count is computed but not exposed by the final SELECT — intended?
exports.up = async (knex) => knex.raw(`
CREATE MATERIALIZED VIEW entities_stats
AS
WITH RECURSIVE relations AS (
SELECT entities.id, entities.parent_id, count(releases.id) AS releases_count, count(releases.id) AS total_count
FROM entities
LEFT JOIN releases ON releases.entity_id = entities.id
GROUP BY entities.id
UNION ALL
SELECT entities.id AS entity_id, count(releases.id) AS releases_count, count(releases.id) + relations.total_count AS total_count
FROM entities
INNER JOIN relations ON relations.id = entities.parent_id
LEFT JOIN releases ON releases.entity_id = entities.id
GROUP BY entities.id
)
SELECT relations.id AS entity_id, relations.releases_count
FROM relations;
`);
// Reverts the migration by dropping the `entities_stats` materialized view.
exports.down = async (knex) => knex.raw(`
DROP MATERIALIZED VIEW entities_stats;
`);

23
package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "traxxx",
"version": "1.212.9",
"version": "1.214.0",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "traxxx",
"version": "1.212.9",
"version": "1.214.0",
"license": "ISC",
"dependencies": {
"@casl/ability": "^5.2.2",
@@ -11650,25 +11650,6 @@
"webidl-conversions": "^3.0.0"
}
},
"node_modules/node-fetch/node_modules/tr46": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
"integrity": "sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o="
},
"node_modules/node-fetch/node_modules/webidl-conversions": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
},
"node_modules/node-fetch/node_modules/whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
"integrity": "sha1-lmRU6HZUYuN2RNNib2dCzotwll0=",
"dependencies": {
"tr46": "~0.0.3",
"webidl-conversions": "^3.0.0"
}
},
"node_modules/node-gyp": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/node-gyp/-/node-gyp-7.1.2.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "traxxx",
"version": "1.212.9",
"version": "1.214.0",
"description": "All the latest porn releases in one place",
"main": "src/app.js",
"scripts": {

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.5 KiB

After

Width:  |  Height:  |  Size: 2.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.4 KiB

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.5 KiB

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.4 KiB

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 27 KiB

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -2714,163 +2714,142 @@ const sites = [
{
slug: 'blacksonblondes',
name: 'Blacks On Blondes',
url: 'https://www.blacksonblondes.com/tour',
url: 'https://www.blacksonblondes.com',
description: 'Blacks On Blondes is the Worlds Largest and Best Interracial Sex and Interracial Porn website. Black Men and White Women. BlacksOnBlondes has 23 years worth of Hardcore Interracial Content. Featuring the entire Legendary Dogfart Movie Archive',
parent: 'dogfartnetwork',
},
{
slug: 'cuckoldsessions',
name: 'Cuckold Sessions',
url: 'https://www.cuckoldsessions.com/tour',
description: 'Dogfart, the #1 Interracial Network in the World Presents CuckoldSessions.com/tour - Hardcore Cuckold Fetish Videos',
url: 'https://www.cuckoldsessions.com',
description: 'Dogfart, the #1 Interracial Network in the World Presents CuckoldSessions.com - Hardcore Cuckold Fetish Videos',
parent: 'dogfartnetwork',
},
{
slug: 'gloryhole',
name: 'Glory Hole',
url: 'https://www.gloryhole.com/tour',
description: '',
url: 'https://www.gloryhole.com',
parent: 'dogfartnetwork',
},
{
slug: 'blacksoncougars',
name: 'Blacks On Cougars',
url: 'https://www.blacksoncougars.com/tour',
description: '',
url: 'https://www.blacksoncougars.com',
parent: 'dogfartnetwork',
},
{
slug: 'wefuckblackgirls',
name: 'We Fuck Black Girls',
alias: ['wfbg'],
url: 'https://www.wefuckblackgirls.com/tour',
description: '',
url: 'https://www.wefuckblackgirls.com',
parent: 'dogfartnetwork',
},
{
slug: 'watchingmymomgoblack',
name: 'Watching My Mom Go Black',
url: 'https://www.watchingmymomgoblack.com/tour',
description: '',
url: 'https://www.watchingmymomgoblack.com',
parent: 'dogfartnetwork',
},
{
slug: 'interracialblowbang',
name: 'Interracial Blowbang',
url: 'https://www.interracialblowbang.com/tour',
description: '',
url: 'https://www.interracialblowbang.com',
parent: 'dogfartnetwork',
},
{
slug: 'cumbang',
name: 'Cumbang',
url: 'https://www.cumbang.com/tour',
description: '',
url: 'https://www.cumbang.com',
parent: 'dogfartnetwork',
},
{
slug: 'interracialpickups',
name: 'Interracial Pickups',
url: 'https://www.interracialpickups.com/tour',
description: '',
url: 'https://www.interracialpickups.com',
parent: 'dogfartnetwork',
},
{
slug: 'watchingmydaughtergoblack',
name: 'Watching My Daughter Go Black',
url: 'https://www.watchingmydaughtergoblack.com/tour',
description: '',
url: 'https://www.watchingmydaughtergoblack.com',
parent: 'dogfartnetwork',
},
{
slug: 'zebragirls',
name: 'Zebra Girls',
url: 'https://www.zebragirls.com/tour',
description: '',
url: 'https://www.zebragirls.com',
parent: 'dogfartnetwork',
},
{
slug: 'gloryholeinitiations',
name: 'Gloryhole Initiations',
url: 'https://www.gloryhole-initiations.com/tour',
description: '',
url: 'https://www.gloryhole-initiations.com',
parent: 'dogfartnetwork',
},
{
slug: 'dogfartbehindthescenes',
name: 'Dogfart Behind The Scenes',
url: 'https://www.dogfartbehindthescenes.com/tour',
description: '',
url: 'https://www.dogfartbehindthescenes.com',
parent: 'dogfartnetwork',
},
{
slug: 'blackmeatwhitefeet',
name: 'Black Meat White Feet',
url: 'https://www.blackmeatwhitefeet.com/tour',
description: '',
url: 'https://www.blackmeatwhitefeet.com',
parent: 'dogfartnetwork',
},
{
slug: 'springthomas',
name: 'Spring Thomas',
url: 'https://www.springthomas.com/tour',
description: '',
url: 'https://www.springthomas.com',
parent: 'dogfartnetwork',
},
{
slug: 'katiethomas',
name: 'Katie Thomas',
url: 'https://www.katiethomas.com/tour',
description: '',
url: 'https://www.katiethomas.com',
parent: 'dogfartnetwork',
},
{
slug: 'ruthblackwell',
name: 'Ruth Blackwell',
url: 'https://www.ruthblackwell.com/tour',
description: '',
url: 'https://www.ruthblackwell.com',
parent: 'dogfartnetwork',
},
{
slug: 'candymonroe',
name: 'Candy Monroe',
url: 'https://www.candymonroe.com/tour',
description: '',
url: 'https://www.candymonroe.com',
parent: 'dogfartnetwork',
},
{
slug: 'wifewriting',
name: 'Wife Writing',
url: 'https://www.wifewriting.com/tour',
description: '',
url: 'https://www.wifewriting.com',
parent: 'dogfartnetwork',
},
{
slug: 'barbcummings',
name: 'Barb Cummings',
url: 'https://www.barbcummings.com/tour',
description: '',
url: 'https://www.barbcummings.com',
parent: 'dogfartnetwork',
},
{
slug: 'theminion',
name: 'The Minion',
url: 'https://www.theminion.com/tour',
description: '',
url: 'https://www.theminion.com',
parent: 'dogfartnetwork',
},
{
slug: 'blacksonboys',
name: 'Blacks On Boys',
url: 'https://www.blacksonboys.com/tour',
description: '',
url: 'https://www.blacksonboys.com',
parent: 'dogfartnetwork',
},
{
slug: 'gloryholesandhandjobs',
name: 'Gloryholes And Handjobs',
url: 'https://www.gloryholesandhandjobs.com/tour',
description: '',
url: 'https://www.gloryholesandhandjobs.com',
parent: 'dogfartnetwork',
},
// DORCEL
@@ -4219,7 +4198,6 @@ const sites = [
tags: ['bdsm'],
parent: 'insex',
parameters: {
scraper: 'alt',
latest: 'https://www.sexuallybroken.com/sb',
},
},
@@ -4230,13 +4208,20 @@ const sites = [
url: 'https://www.infernalrestraints.com',
tags: ['bdsm'],
parent: 'insex',
parameters: {
latest: 'https://www.infernalrestraints.com/ir',
},
},
{
slug: 'hardtied',
name: 'Hardtied',
alias: ['ht'],
url: 'https://www.hardtied.com',
tags: ['bdsm'],
parent: 'insex',
parameters: {
latest: 'https://www.hardtied.com/ht',
},
},
{
slug: 'realtimebondage',
@@ -4245,6 +4230,9 @@ const sites = [
url: 'https://www.realtimebondage.com',
tags: ['bdsm', 'live'],
parent: 'insex',
parameters: {
latest: 'https://www.realtimebondage.com/rtb',
},
},
{
slug: 'topgrl',
@@ -4254,7 +4242,6 @@ const sites = [
tags: ['bdsm', 'femdom'],
parent: 'insex',
parameters: {
scraper: 'alt',
latest: 'https://www.topgrl.com/tg',
},
},
@@ -6909,6 +6896,13 @@ const sites = [
tourId: 9,
},
},
{
slug: 'dpdiva',
name: 'DP Diva',
url: 'http://dpdiva.com',
parent: 'pervcity',
tags: ['dp', 'anal'],
},
// PIERRE WOODMAN
{
slug: 'woodmancastingx',

View File

@@ -85,23 +85,6 @@ async function startMemorySample(snapshotTriggers = []) {
}, config.memorySampling.sampleDuration);
}
async function startMemorySample() {
await inspector.heap.enable();
await inspector.heap.startSampling();
// monitorMemory();
logger.info(`Start heap sampling, memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
setTimeout(async () => {
await stopMemorySample();
if (!done) {
await startMemorySample();
}
}, 30000);
}
async function init() {
try {
if (argv.server) {

View File

@@ -194,6 +194,7 @@ const { argv } = yargs
alias: 'pics',
})
.option('videos', {
alias: 'video',
describe: 'Include any trailers or teasers',
type: 'boolean',
default: true,

View File

@@ -34,6 +34,10 @@ async function login(credentials) {
await verifyPassword(credentials.password, user.password);
await knex('users')
.update('last_login', 'NOW()')
.where('id', user.id);
return curateUser(user);
}

View File

@@ -21,6 +21,7 @@ const argv = require('./argv');
const knex = require('./knex');
const http = require('./utils/http');
const bulkInsert = require('./utils/bulk-insert');
const chunk = require('./utils/chunk');
const { get } = require('./utils/qu');
const pipeline = util.promisify(stream.pipeline);
@@ -63,10 +64,10 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
? chunks.slice(0, -1).concat(chunks.slice(-1).reverse())
: chunks;
const groupedMedias = lastPreferredChunks.map((chunk) => {
const groupedMedias = lastPreferredChunks.map((mediaChunk) => {
// merge chunked medias into single media with grouped fallback priorities,
// so the first sources of each media is preferred over all second sources, etc.
const sources = chunk
const sources = mediaChunk
.reduce((accSources, media) => {
media.sources.forEach((source, index) => {
if (!accSources[index]) {
@@ -82,8 +83,8 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
.flat();
return {
id: chunk[0].id,
role: chunk[0].role,
id: mediaChunk[0].id,
role: mediaChunk[0].role,
sources,
};
});
@@ -235,22 +236,41 @@ async function findSourceDuplicates(baseMedias) {
.filter(Boolean);
const [existingSourceMedia, existingExtractMedia] = await Promise.all([
knex('media').whereIn('source', sourceUrls),
knex('media').whereIn('source_page', extractUrls),
// may try to check thousands of URLs at once, don't pass all of them to a single query
chunk(sourceUrls).reduce(async (chain, sourceUrlsChunk) => {
const accUrls = await chain;
const existingUrls = await knex('media').whereIn('source', sourceUrlsChunk);
return [...accUrls, ...existingUrls];
}, []),
chunk(extractUrls).reduce(async (chain, extractUrlsChunk) => {
const accUrls = await chain;
const existingUrls = await knex('media').whereIn('source_page', extractUrlsChunk);
return [...accUrls, ...existingUrls];
}, []),
]);
const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');
return { existingSourceMediaByUrl, existingExtractMediaByUrl };
return {
existingSourceMediaByUrl,
existingExtractMediaByUrl,
};
}
async function findHashDuplicates(medias) {
const hashes = medias.map((media) => media.meta?.hash || media.entry?.hash).filter(Boolean);
const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
const existingHashMediaEntries = await chunk(hashes, 2).reduce(async (chain, hashesChunk) => {
const accHashes = await chain;
const existingHashes = await knex('media').whereIn('hash', hashesChunk);
return [...accHashes, ...existingHashes];
}, []);
const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
const uniqueHashMedias = medias.filter((media) => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);
const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => {
@@ -600,11 +620,11 @@ async function fetchSource(source, baseMedia) {
const hashStream = new stream.PassThrough();
let size = 0;
hashStream.on('data', (chunk) => {
size += chunk.length;
hashStream.on('data', (streamChunk) => {
size += streamChunk.length;
if (hasherReady) {
hasher.write(chunk);
hasher.write(streamChunk);
}
});

View File

@@ -1,20 +1,16 @@
'use strict';
/* eslint-disable newline-per-chained-call */
// const Promise = require('bluebird');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const qu = require('../utils/qu');
async function getPhotos(albumUrl) {
const res = await http.get(albumUrl);
const html = res.body.toString();
const { document } = new JSDOM(html).window;
const res = await qu.get(albumUrl);
const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
if (!res.ok) {
return [];
}
const lastPhotoPage = res.item.query.urls('.pics-container .preview-image-container a').at(-1);
const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
const photoUrls = Array.from({ length: lastPhotoIndex }, (value, index) => {
@@ -29,125 +25,95 @@ async function getPhotos(albumUrl) {
return photoUrls;
}
function scrapeLatest(html, site, filter = true) {
const { document } = new JSDOM(html).window;
const sceneElements = Array.from(document.querySelectorAll('.recent-updates'));
function scrapeLatest(scenes, site, filter = true) {
return scenes.reduce((acc, { query }) => {
const release = {};
return sceneElements.map((element) => {
const siteUrl = element.querySelector('.recent-details-title .help-block, .model-details-title .site-name').textContent;
const siteUrl = query.cnt('.recent-details-title .help-block, .model-details-title .site-name');
if (filter && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) {
release.url = query.url('.thumbnail, .preview-image-container > a', 'href', { origin: site.url });
release.entryId = `${site.slug}_${new URL(release.url).pathname.split('/')[4]}`;
release.title = query.cnt('.scene-title');
// release.actors = release.title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim()); // the titles don't always list the actors, e.g. BarbCummings.com
// release.poster = `https:${element.querySelector('img').src}`;
release.poster = query.img();
release.teaser = query.video('.thumbnail, .preview-thumbnail', 'data-preview_clip_url');
release.channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase();
if (filter && siteUrl && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) {
// different dogfart site
return null;
return { ...acc, unextracted: [...acc.unextracted, release] };
}
const sceneLinkElement = element.querySelector('.thumbnail');
const url = qu.prefixUrl(sceneLinkElement.href, 'https://dogfartnetwork.com');
const { pathname } = new URL(url);
const entryId = `${site.slug}_${pathname.split('/')[4]}`;
const title = element.querySelector('.scene-title').textContent;
const actors = title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim());
const poster = `https:${element.querySelector('img').src}`;
const teaser = sceneLinkElement.dataset.preview_clip_url;
const channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase();
return {
url,
entryId,
title,
actors,
poster,
teaser: {
src: teaser,
},
site,
channel,
};
}).filter(Boolean);
return { ...acc, scenes: [...acc.scenes, release] };
}, {
scenes: [],
unextracted: [],
});
}
async function scrapeScene(html, url, site) {
const { document } = new JSDOM(html).window;
const title = document.querySelector('.description-title').textContent;
const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent);
const metaDescription = document.querySelector('meta[itemprop="description"]').content;
const description = metaDescription
? metaDescription.content
: document.querySelector('.description')
.textContent
.replace(/[ \t\n]{2,}/g, ' ')
.replace('...read more', '')
.trim();
const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
async function scrapeScene({ query }, url, channel, baseScene, parameters) {
const release = {};
const { origin, pathname } = new URL(url);
const entryId = `${channel}_${pathname.split('/').slice(-2)[0]}`;
const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
const duration = moment
.duration(`00:${document
.querySelectorAll('.extra-info p')[1]
.textContent
.match(/\d+:\d+$/)[0]}`)
.asSeconds();
release.channel = channel.type === 'channel' ? channel.slug : query.cnt('.site-name').split('.')[0].toLowerCase();
release.entryId = `${release.channel}_${pathname.split('/').slice(-2)[0]}`;
const trailerElement = document.querySelector('.html5-video');
const poster = `https:${trailerElement.dataset.poster}`;
const { trailer } = trailerElement.dataset;
release.title = query.cnt('.description-title') || query.text('.scene-title');
release.actors = query.all('.more-scenes a, .starring-list a').map((actorEl) => ({
name: query.cnt(actorEl),
url: query.url(actorEl, null, 'href', { origin: channel.url }),
}));
const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0]?.href;
const photos = lastPhotosUrl ? await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url) : [];
release.description = query.meta('meta[itemprop="description"]') || query.cnt('.description, [itemprop="description"]')?.replace(/[ \t\n]{2,}/g, ' ').replace('...read more', '').trim();
const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]')?.textContent || document.querySelector('span[itemprop="ratingValue"]')?.textContent) / 2);
const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
release.date = query.date('meta[itemprop="uploadDate"]', null, null, 'content');
release.duration = query.duration('.extra-info p:nth-child(2), .run-time-container');
return {
entryId,
url: `${origin}${pathname}`,
title,
description,
actors,
date,
duration,
poster,
photos,
trailer: {
src: trailer,
},
tags,
rating: {
stars,
},
site,
channel,
};
release.tags = query.exists('.scene-details .categories a') ? query.cnts('.scene-details .categories a') : query.text('.categories')?.split(/,\s+/);
const trailer = query.video('.html5-video', 'data-trailer');
const lastPhotosUrl = query.urls('.pagination a').at(-1);
release.poster = query.poster('.html5-video', 'data-poster') || query.img('.trailer-image');
if (trailer && !trailer?.includes('join')) {
release.trailer = trailer;
}
if (lastPhotosUrl && parameters.includePhotos) {
release.photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, channel, url);
}
release.stars = Number(((query.number('span[itemprop="average"], span[itemprop="ratingValue"]') || query.number('canvas[data-score]', null, 'data-score')) / 2).toFixed(2));
return release;
}
async function fetchLatest(site, page = 1) {
const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
async function fetchLatest(channel, page = 1) {
// const res = await qu.getAll(`https://dogfartnetwork.com/tour/scenes/?p=${page}`, '.recent-updates');
const res = await qu.getAll(`${channel.url}/tour/scenes/?p=${page}`, '.recent-updates, .preview-image-container');
return scrapeLatest(res.body.toString(), site);
}
if (res.ok) {
return scrapeLatest(res.items, channel);
}
async function fetchScene(url, site) {
const res = await http.get(url);
return scrapeScene(res.body.toString(), url, site);
return res.status;
}
async function fetchProfile(baseActor, entity) {
const slug = slugify(baseActor.name, '+');
const url = `https://www.dogfartnetwork.com/tour/girls/${slug}/`;
const res = await http.get(url);
const res = await qu.getAll(url, '.recent-updates');
if (res.ok) {
const scenes = scrapeLatest(res.body, entity, false);
const { scenes } = scrapeLatest(res.items, entity, false);
// no bio available
return { scenes };
}
@@ -156,6 +122,6 @@ async function fetchProfile(baseActor, entity) {
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};

View File

@@ -5,6 +5,27 @@ const http = require('../utils/http');
const slugify = require('../utils/slugify');
/**
 * Scrapes the scene tiles of a latest-updates page into release objects.
 *
 * @param {Array<{ query: object }>} scenes - one query context per scene tile
 * @param {object} site - channel entity; `site.parameters.latest` is passed as the origin for scene URLs
 * @returns {object[]} one release per tile with url, title, date, actors and entryId,
 *   plus poster and covers when the tile has an image
 */
function scrapeLatest(scenes, site) {
	return scenes.map(({ query }) => {
		const release = {};

		release.url = query.url('figure a', 'href', { origin: site.parameters.latest });
		release.title = query.cnt('.has-text-weight-bold, .is-size-6');

		release.date = query.date('span.tag', 'YYYY-MM-DD');
		release.actors = query.cnts('a.tag');

		const cover = query.img('.image img');

		// a single tile without an image must not throw and abort the whole page
		if (cover) {
			release.poster = cover.replace('poster_noplay', 'trailer_noplay');
			release.covers = [cover];
		}

		// the entry ID is derived from the release date and the first five words of the title
		release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title.split(/\s+/).slice(0, 5).join(' '))}`;

		return release;
	});
}
function scrapeLatestLegacy(scenes, site) {
return scenes.map(({ query }) => {
// if (q('.articleTitleText')) return scrapeFirstLatest(ctx(el), site);
const release = {};
@@ -47,28 +68,35 @@ function scrapeLatest(scenes, site) {
});
}
function scrapeLatestAlt(scenes, site) {
return scenes.map(({ query }) => {
const release = {};
async function scrapeScene({ query }, url, channel, parameters, session) {
const release = {};
release.url = query.url('figure a', 'href', { origin: site.parameters.latest });
release.title = query.cnt('.columns div.is-size-5.has-text-weight-bold');
release.description = query.cnt('.has-background-black-ter > div:nth-child(4)');
release.date = query.date('.has-text-white-ter span.tag', 'YYYY-MM-DD');
release.title = query.cnt('.has-text-weight-bold');
release.date = query.date('span.tag', 'YYYY-MM-DD');
release.actors = query.cnts('a.tag');
release.actors = query.cnts('.has-text-white-ter a.tag[href*="home.php"]');
release.tags = query.cnts('.has-background-black-ter > div:nth-child(6) > span');
const cover = query.img('.image img');
release.poster = query.img('#videoPlayer, #iodvideo', 'poster');
release.photos = Array.from(query.html('body > div:nth-child(6)').matchAll(/src="(http.*jpg)"/g), (match) => match[1]);
release.poster = cover.replace('poster_noplay', 'trailer_noplay');
release.covers = [cover];
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
release.trailer = query.video();
return release;
});
if (!release.trailer && parameters.includeTrailers) {
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
if (trailerRes.ok) {
release.trailer = trailerRes.body;
}
}
return release;
}
function scrapeScene({ query }, site) {
function scrapeSceneLegacy({ query }, site) {
const release = {};
const titleEl = query.q('.articleTitleText');
@@ -97,70 +125,34 @@ function scrapeScene({ query }, site) {
return release;
}
async function scrapeSceneAlt({ query }, url, channel, session) {
const release = {};
release.title = query.cnt('.columns div.is-size-5');
release.description = query.cnt('.has-background-black-ter > div:nth-child(4)');
release.date = query.date('.has-text-white-ter span.tag', 'YYYY-MM-DD');
release.actors = query.cnts('.has-text-white-ter a.tag[href*="home.php"]');
release.tags = query.cnts('.has-background-black-ter > div:nth-child(6) > span');
release.poster = query.img('#videoPlayer, #iodvideo', 'poster');
release.photos = query.imgs('body > div:nth-child(6) img');
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
release.trailer = query.video();
if (!release.trailer) {
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
if (trailerRes.ok) {
release.trailer = trailerRes.body;
}
}
return release;
}
async function fetchLatest(site, page = 1) {
const url = (site.parameters?.scraper === 'alt' && `${site.parameters.latest}/home.php?o=latest&p=${page}`)
// || (site.slug === 'paintoy' && `${site.url}/corporal/punishment/gallery.php?type=brief&page=${page}`) // paintoy's site is (was?) partially broken, use front page
|| `${site.url}/scripts/switch_tour.php?type=brief&page=${page}`;
const res = await ((site.parameters?.scraper === 'alt' && qu.getAll(url, 'body > .columns .column'))
// || (site.slug === 'paintoy' && qu.getAll(url, '#articleTable table[cellspacing="2"]'))
|| qu.get(url)); // JSON containing html as a property
const url = `${site.parameters.latest}/home.php?o=latest&p=${page}`;
const res = await qu.getAll(url, 'body > .columns .column', { cookie: 'consent=yes' });
if (res.ok) {
if (site.parameters?.scraper === 'alt') {
return scrapeLatestAlt(res.items, site);
}
/*
if (site.slug === 'paintoy') {
return scrapeLatest(res.items, site);
}
*/
return scrapeLatest(qu.extractAll(res.body.html, '#articleTable > tbody > tr:nth-child(2) > td > table'), site);
return scrapeLatest(res.items, site);
}
return res.status;
}
async function fetchScene(url, site) {
const session = http.session();
const res = await qu.get(url, null, null, { session });
async function fetchLatestLegacy(site, page = 1) {
const url = `${site.url}/scripts/switch_tour.php?type=brief&page=${page}`;
const res = await qu.get(url); // JSON containing html as a property
if (res.ok) {
if (site.parameters?.scraper === 'alt') {
return scrapeSceneAlt(res.item, url, site, session);
}
return scrapeLatestLegacy(qu.extractAll(res.body.html, '#articleTable > tbody > tr:nth-child(2) > td > table'), site);
}
return scrapeScene(res.item, site);
return res.status;
}
async function fetchScene(url, site, baseRelease, parameters) {
const session = http.session();
const res = await qu.get(url, null, { cookie: 'consent=yes' }, { session });
if (res.ok) {
return scrapeScene(res.item, url, site, parameters, session);
}
return res.status;
@@ -169,4 +161,8 @@ async function fetchScene(url, site) {
module.exports = {
fetchLatest,
fetchScene,
legacy: {
fetchLatest: fetchLatestLegacy,
scrapeScene: scrapeSceneLegacy,
},
};

View File

@@ -16,7 +16,7 @@ const siteMapByKey = {
const siteMapBySlug = Object.entries(siteMapByKey).reduce((acc, [key, value]) => ({ ...acc, [value]: key }), {});
function scrapeLatest(scenes, site) {
return scenes.map(({ query }) => {
return scenes.reduce((acc, { query }) => {
const release = {};
release.shootId = query.q('.card-meta .text-right, .row .text-right, .card-footer-item:last-child', true);
@@ -24,11 +24,6 @@ function scrapeLatest(scenes, site) {
const siteId = release.shootId.match(/\d?\w{2}/)[0];
const siteSlug = siteMapByKey[siteId];
if (site.slug !== siteSlug) {
// using generic network overview, scene is not from the site we want
return null;
}
const { pathname } = new URL(query.url('h5 a, .ep-title a, .title a'));
[release.entryId] = pathname.match(/\d+$/);
release.url = `${site.url}${pathname}`;
@@ -52,8 +47,16 @@ function scrapeLatest(scenes, site) {
};
}
return release;
}).filter((scene) => scene);
if (site.slug !== siteSlug) {
// using generic network overview, scene is not from the site we want
return { ...acc, unextracted: [...acc.unextracted, release] };
}
return { ...acc, scenes: [...acc.scenes, release] };
}, {
scenes: [],
unextracted: [],
});
}
async function scrapeScene({ query, html }, url, baseRelease, channel, session) {

View File

@@ -12,6 +12,13 @@ const channelCodes = {
uha: 'upherasshole',
};
// Lookup from a format name (e.g. `vFullHD`) to a numeric quality.
// NOTE(review): the numbers are presumably vertical resolutions in pixels (2160p, 1080p, ...) — confirm.
const qualities = {
v4k: 2160,
vFullHD: 1080,
vHD: 720,
vSD: 480,
};
const channelRegExp = new RegExp(Object.keys(channelCodes).join('|'), 'i');
function scrapeAll(scenes, entity) {
@@ -42,9 +49,12 @@ function scrapeScene({ query }) {
release.entryId = query.q('.trailerLeft img', 'id').match(/set-target-(\d+)/)[1];
release.title = query.cnt('.infoHeader h1');
release.description = query.cnt('.infoBox p');
release.description = query.cnt('.description');
release.duration = query.duration('.tRuntime');
release.actors = query.cnts('.infoBox .tour_update_models a');
release.tags = query.cnts('.tagcats a');
release.qualities = query.imgs('.avaiFormate img').map((src) => qualities[src.match(/\/(\w+)\.png/)[1]]).filter(Boolean);
release.poster = query.img('.posterimg');
release.photos = query.imgs('.trailerSnaps img').slice(1); // first photo is poster in lower quality

View File

@@ -38,11 +38,8 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
date_precision: release.datePrecision,
slug,
description: release.description,
qualities: release.qualities?.map(Number).filter(Boolean),
comment: release.comment,
// director: release.director,
// likes: release.rating && release.rating.likes,
// dislikes: release.rating && release.rating.dislikes,
// rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
deep: typeof release.deep === 'boolean' ? release.deep : false,
deep_url: release.deepUrl,
updated_batch_id: batchId,