12 Commits

Author SHA1 Message Date
DebaucheryLibrarian
e202e887f9 1.213.5 2022-04-03 00:49:42 +02:00
DebaucheryLibrarian
574c117ab0 Refactored Dogfart scraper to use qu and return unextracted scenes. 2022-04-03 00:49:39 +02:00
DebaucheryLibrarian
d59a57f311 1.213.4 2022-04-02 00:32:29 +02:00
DebaucheryLibrarian
5e499c3685 Added chunking to media duplicate queries to prevent overloading parameters. Added DP Diva to Perv City (coming soon). 2022-04-02 00:32:23 +02:00
DebaucheryLibrarian
17e5ce71b2 1.213.3 2022-03-31 23:01:56 +02:00
DebaucheryLibrarian
5352186319 Insex not fetching video when not required. 2022-03-31 23:01:54 +02:00
DebaucheryLibrarian
e9ba02d65d 1.213.2 2022-03-31 22:46:56 +02:00
DebaucheryLibrarian
39813d4461 Updated Insex scraper. 2022-03-31 22:46:54 +02:00
DebaucheryLibrarian
829a285a2d 1.213.1 2022-03-31 14:34:12 +02:00
DebaucheryLibrarian
a19a77e165 Optionalized qualities. 2022-03-31 14:34:10 +02:00
DebaucheryLibrarian
122dd3eaee 1.213.0 2022-03-31 14:11:23 +02:00
DebaucheryLibrarian
18b219850e Storing scene qualities. Updated Perv City scraper. 2022-03-31 14:11:13 +02:00
40 changed files with 264 additions and 227 deletions

View File

@@ -203,6 +203,19 @@
</div>
</div>
<!-- Lists the available video qualities; hidden when the release has none, including an empty list. -->
<div
	v-if="release.qualities && release.qualities.length > 0"
	class="row"
>
	<span class="row-label">Available qualities</span>

	<!-- The 'p' suffix and separators are appended by the .quality::after style rule. -->
	<span
		v-for="quality in release.qualities"
		:key="quality"
		class="quality"
	>{{ quality }}</span>
</div>
<div
v-if="release.comment"
class="row"
@@ -470,6 +483,16 @@ export default {
text-overflow: ellipsis;
}
// Renders each quality as e.g. "1080p", comma-separating all but the last entry.
.quality {
	&::after {
		content: 'p, ';
	}

	// The last quality gets the suffix without a trailing separator.
	// Fixed: the declaration previously ended in a comma instead of a semicolon.
	&:last-child::after {
		content: 'p';
	}
}
.releases {
margin: 0 0 .5rem 0;
}

View File

@@ -12,6 +12,7 @@ export default {
selectableTags: [
'airtight',
'anal',
'bdsm',
'blowbang',
'blowjob',
'creampie',

View File

@@ -367,6 +367,7 @@ const releaseFields = `
date
datePrecision
slug
qualities
shootId
productionDate
comment
@@ -475,6 +476,7 @@ const releaseFragment = `
duration
createdAt
shootId
qualities
productionDate
createdBatchId
productionLocation

View File

@@ -89,6 +89,10 @@ module.exports = {
'uksinners',
// mindgeek
'pornhub',
// insex
'paintoy',
'aganmedon',
'sensualpain',
],
networks: [
// dummy network for testing

View File

@@ -0,0 +1,25 @@
// Migration: creates the `entities_stats` materialized view from a recursive CTE (`relations`)
// that joins entities to their releases and walks from each entity to its children via parent_id.
// The view itself only exposes `entity_id` and `releases_count`; `total_count` is computed but not selected.
// NOTE(review): the anchor SELECT yields four columns (id, parent_id, releases_count, total_count)
// while the recursive SELECT yields three (entity_id, releases_count, total_count). Both sides of a
// UNION ALL need matching column lists, so this statement looks like it cannot run as written - confirm.
// NOTE(review): the recursive SELECT uses count() with GROUP BY and also selects the ungrouped
// relations.total_count; presumably this targets PostgreSQL, which rejects aggregates in the
// recursive term of a WITH RECURSIVE query - verify against the target database.
exports.up = async (knex) => knex.raw(`
CREATE MATERIALIZED VIEW entities_stats
AS
WITH RECURSIVE relations AS (
SELECT entities.id, entities.parent_id, count(releases.id) AS releases_count, count(releases.id) AS total_count
FROM entities
LEFT JOIN releases ON releases.entity_id = entities.id
GROUP BY entities.id
UNION ALL
SELECT entities.id AS entity_id, count(releases.id) AS releases_count, count(releases.id) + relations.total_count AS total_count
FROM entities
INNER JOIN relations ON relations.id = entities.parent_id
LEFT JOIN releases ON releases.entity_id = entities.id
GROUP BY entities.id
)
SELECT relations.id AS entity_id, relations.releases_count
FROM relations;
`);
// Rollback: drops the `entities_stats` materialized view created by `up`.
exports.down = async (knex) => knex.raw(`
DROP MATERIALIZED VIEW entities_stats;
`);

View File

@@ -0,0 +1,7 @@
exports.up = async (knex) => knex.schema.alterTable('releases', (table) => {
table.specificType('qualities', 'text[]');
});
exports.down = async (knex) => knex.schema.alterTable('releases', (table) => {
table.dropColumn('qualities');
});

View File

@@ -0,0 +1,12 @@
exports.up = async (knex) => knex.raw(`
CREATE MATERIALIZED VIEW entities_stats
AS
SELECT entities.id AS entity_id, count(releases.id) AS releases_count
FROM entities
LEFT JOIN releases ON releases.entity_id = entities.id
GROUP BY entities.id;
`);
exports.down = async (knex) => knex.raw(`
DROP MATERIALIZED VIEW entities_stats;
`);

23
package-lock.json generated
View File

@@ -1,12 +1,12 @@
{
"name": "traxxx",
"version": "1.212.9",
"version": "1.213.5",
"lockfileVersion": 2,
"requires": true,
"packages": {
"": {
"name": "traxxx",
"version": "1.212.9",
"version": "1.213.5",
"license": "ISC",
"dependencies": {
"@casl/ability": "^5.2.2",
@@ -11650,25 +11650,6 @@
"webidl-conversions": "^3.0.0"
}
},
"node_modules/node-fetch/node_modules/tr46": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
"integrity": "sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o="
},
"node_modules/node-fetch/node_modules/webidl-conversions": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
},
"node_modules/node-fetch/node_modules/whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
"integrity": "sha1-lmRU6HZUYuN2RNNib2dCzotwll0=",
"dependencies": {
"tr46": "~0.0.3",
"webidl-conversions": "^3.0.0"
}
},
"node_modules/node-gyp": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/node-gyp/-/node-gyp-7.1.2.tgz",

View File

@@ -1,6 +1,6 @@
{
"name": "traxxx",
"version": "1.212.9",
"version": "1.213.5",
"description": "All the latest porn releases in one place",
"main": "src/app.js",
"scripts": {

Binary file not shown.

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.5 KiB

After

Width:  |  Height:  |  Size: 2.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.4 KiB

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.6 KiB

After

Width:  |  Height:  |  Size: 3.6 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 11 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.5 KiB

After

Width:  |  Height:  |  Size: 3.5 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.8 KiB

After

Width:  |  Height:  |  Size: 3.8 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.4 KiB

After

Width:  |  Height:  |  Size: 2.4 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 27 KiB

After

Width:  |  Height:  |  Size: 27 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 17 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 31 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 3.7 KiB

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 3.7 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 15 KiB

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 30 KiB

After

Width:  |  Height:  |  Size: 30 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@@ -4219,7 +4219,6 @@ const sites = [
tags: ['bdsm'],
parent: 'insex',
parameters: {
scraper: 'alt',
latest: 'https://www.sexuallybroken.com/sb',
},
},
@@ -4230,13 +4229,20 @@ const sites = [
url: 'https://www.infernalrestraints.com',
tags: ['bdsm'],
parent: 'insex',
parameters: {
latest: 'https://www.infernalrestraints.com/ir',
},
},
{
slug: 'hardtied',
name: 'Hardtied',
alias: ['ht'],
url: 'https://www.hardtied.com',
tags: ['bdsm'],
parent: 'insex',
parameters: {
latest: 'https://www.hardtied.com/ht',
},
},
{
slug: 'realtimebondage',
@@ -4245,6 +4251,9 @@ const sites = [
url: 'https://www.realtimebondage.com',
tags: ['bdsm', 'live'],
parent: 'insex',
parameters: {
latest: 'https://www.realtimebondage.com/rtb',
},
},
{
slug: 'topgrl',
@@ -4254,7 +4263,6 @@ const sites = [
tags: ['bdsm', 'femdom'],
parent: 'insex',
parameters: {
scraper: 'alt',
latest: 'https://www.topgrl.com/tg',
},
},
@@ -6909,6 +6917,13 @@ const sites = [
tourId: 9,
},
},
{
slug: 'dpdiva',
name: 'DP Diva',
url: 'http://dpdiva.com',
parent: 'pervcity',
tags: ['dp', 'anal'],
},
// PIERRE WOODMAN
{
slug: 'woodmancastingx',

View File

@@ -85,23 +85,6 @@ async function startMemorySample(snapshotTriggers = []) {
}, config.memorySampling.sampleDuration);
}
async function startMemorySample() {
await inspector.heap.enable();
await inspector.heap.startSampling();
// monitorMemory();
logger.info(`Start heap sampling, memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
setTimeout(async () => {
await stopMemorySample();
if (!done) {
await startMemorySample();
}
}, 30000);
}
async function init() {
try {
if (argv.server) {

View File

@@ -194,6 +194,7 @@ const { argv } = yargs
alias: 'pics',
})
.option('videos', {
alias: 'video',
describe: 'Include any trailers or teasers',
type: 'boolean',
default: true,

View File

@@ -21,6 +21,7 @@ const argv = require('./argv');
const knex = require('./knex');
const http = require('./utils/http');
const bulkInsert = require('./utils/bulk-insert');
const chunk = require('./utils/chunk');
const { get } = require('./utils/qu');
const pipeline = util.promisify(stream.pipeline);
@@ -63,10 +64,10 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
? chunks.slice(0, -1).concat(chunks.slice(-1).reverse())
: chunks;
const groupedMedias = lastPreferredChunks.map((chunk) => {
const groupedMedias = lastPreferredChunks.map((mediaChunk) => {
// merge chunked medias into single media with grouped fallback priorities,
// so the first sources of each media is preferred over all second sources, etc.
const sources = chunk
const sources = mediaChunk
.reduce((accSources, media) => {
media.sources.forEach((source, index) => {
if (!accSources[index]) {
@@ -82,8 +83,8 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
.flat();
return {
id: chunk[0].id,
role: chunk[0].role,
id: mediaChunk[0].id,
role: mediaChunk[0].role,
sources,
};
});
@@ -235,22 +236,41 @@ async function findSourceDuplicates(baseMedias) {
.filter(Boolean);
const [existingSourceMedia, existingExtractMedia] = await Promise.all([
knex('media').whereIn('source', sourceUrls),
knex('media').whereIn('source_page', extractUrls),
// may try to check thousands of URLs at once; don't pass all of them to a single query
chunk(sourceUrls).reduce(async (chain, sourceUrlsChunk) => {
const accUrls = await chain;
const existingUrls = await knex('media').whereIn('source', sourceUrlsChunk);
return [...accUrls, ...existingUrls];
}, []),
chunk(extractUrls).reduce(async (chain, extractUrlsChunk) => {
const accUrls = await chain;
const existingUrls = await knex('media').whereIn('source_page', extractUrlsChunk);
return [...accUrls, ...existingUrls];
}, []),
]);
const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');
return { existingSourceMediaByUrl, existingExtractMediaByUrl };
return {
existingSourceMediaByUrl,
existingExtractMediaByUrl,
};
}
async function findHashDuplicates(medias) {
const hashes = medias.map((media) => media.meta?.hash || media.entry?.hash).filter(Boolean);
const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
const existingHashMediaEntries = await chunk(hashes, 2).reduce(async (chain, hashesChunk) => {
const accHashes = await chain;
const existingHashes = await knex('media').whereIn('hash', hashesChunk);
return [...accHashes, ...existingHashes];
}, []);
const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
const uniqueHashMedias = medias.filter((media) => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);
const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => {
@@ -600,11 +620,11 @@ async function fetchSource(source, baseMedia) {
const hashStream = new stream.PassThrough();
let size = 0;
hashStream.on('data', (chunk) => {
size += chunk.length;
hashStream.on('data', (streamChunk) => {
size += streamChunk.length;
if (hasherReady) {
hasher.write(chunk);
hasher.write(streamChunk);
}
});

View File

@@ -1,20 +1,16 @@
'use strict';
/* eslint-disable newline-per-chained-call */
// const Promise = require('bluebird');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
const slugify = require('../utils/slugify');
const qu = require('../utils/qu');
async function getPhotos(albumUrl) {
const res = await http.get(albumUrl);
const html = res.body.toString();
const { document } = new JSDOM(html).window;
const res = await qu.get(albumUrl);
const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
if (!res.ok) {
return [];
}
const lastPhotoPage = res.item.query.urls('.preview-image-container a').at(-1);
const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
const photoUrls = Array.from({ length: lastPhotoIndex }, (value, index) => {
@@ -29,124 +25,88 @@ async function getPhotos(albumUrl) {
return photoUrls;
}
function scrapeLatest(html, site, filter = true) {
const { document } = new JSDOM(html).window;
const sceneElements = Array.from(document.querySelectorAll('.recent-updates'));
function scrapeLatest(scenes, site, filter = true) {
return scenes.reduce((acc, { query }) => {
const release = {};
return sceneElements.map((element) => {
const siteUrl = element.querySelector('.recent-details-title .help-block, .model-details-title .site-name').textContent;
const siteUrl = query.cnt('.recent-details-title .help-block, .model-details-title .site-name');
release.url = query.url('.thumbnail', 'href', { origin: site.type === 'network' ? site.url : site.parent.url });
release.entryId = `${site.slug}_${new URL(release.url).pathname.split('/')[4]}`;
release.title = query.cnt('.scene-title');
release.actors = release.title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim());
// release.poster = `https:${element.querySelector('img').src}`;
release.poster = query.img();
release.teaser = query.el('.thumbnail', 'data-preview_clip_url');
release.channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase();
if (filter && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) {
// different dogfart site
return null;
return { ...acc, unextracted: [...acc.unextracted, release] };
}
const sceneLinkElement = element.querySelector('.thumbnail');
const url = qu.prefixUrl(sceneLinkElement.href, 'https://dogfartnetwork.com');
const { pathname } = new URL(url);
const entryId = `${site.slug}_${pathname.split('/')[4]}`;
const title = element.querySelector('.scene-title').textContent;
const actors = title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim());
const poster = `https:${element.querySelector('img').src}`;
const teaser = sceneLinkElement.dataset.preview_clip_url;
const channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase();
return {
url,
entryId,
title,
actors,
poster,
teaser: {
src: teaser,
},
site,
channel,
};
}).filter(Boolean);
return { ...acc, scenes: [...acc.scenes, release] };
}, {
scenes: [],
unextracted: [],
});
}
async function scrapeScene(html, url, site) {
const { document } = new JSDOM(html).window;
const title = document.querySelector('.description-title').textContent;
const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent);
const metaDescription = document.querySelector('meta[itemprop="description"]').content;
const description = metaDescription
? metaDescription.content
: document.querySelector('.description')
.textContent
.replace(/[ \t\n]{2,}/g, ' ')
.replace('...read more', '')
.trim();
const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
async function scrapeScene({ query }, url, channel, baseScene, parameters) {
const release = {};
const { origin, pathname } = new URL(url);
const entryId = `${channel}_${pathname.split('/').slice(-2)[0]}`;
const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
const duration = moment
.duration(`00:${document
.querySelectorAll('.extra-info p')[1]
.textContent
.match(/\d+:\d+$/)[0]}`)
.asSeconds();
release.channel = query.cnt('.site-name').split('.')[0].toLowerCase();
release.entryId = `${release.channel}_${pathname.split('/').slice(-2)[0]}`;
const trailerElement = document.querySelector('.html5-video');
const poster = `https:${trailerElement.dataset.poster}`;
const { trailer } = trailerElement.dataset;
release.title = query.cnt('.description-title');
release.actors = query.all('.more-scenes a').map((actorEl) => ({
name: query.cnt(actorEl),
url: query.url(actorEl, null, 'href', { origin: channel.url }),
}));
const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0]?.href;
const photos = lastPhotosUrl ? await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url) : [];
release.description = query.meta('meta[itemprop="description"]') || qu.cnt('.description').replace(/[ \t\n]{2,}/g, ' ').replace('...read more', '').trim();
const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]')?.textContent || document.querySelector('span[itemprop="ratingValue"]')?.textContent) / 2);
const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
release.date = query.date('meta[itemprop="uploadDate"]', null, null, 'content');
release.duration = query.duration('.extra-info p:nth-child(2)');
return {
entryId,
url: `${origin}${pathname}`,
title,
description,
actors,
date,
duration,
poster,
photos,
trailer: {
src: trailer,
},
tags,
rating: {
stars,
},
site,
channel,
};
release.tags = query.cnts('.scene-details .categories a');
release.trailer = query.video('.html5-video', 'data-trailer');
release.poster = query.poster('.html5-video', 'data-poster');
const lastPhotosUrl = query.urls('.pagination a').at(-1);
if (lastPhotosUrl && parameters.includePhotos) {
release.photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, channel, url);
}
release.stars = Number(((query.number('span[itemprop="average"]') || query.number('span[itemprop="ratingValue"]')) / 2).toFixed(2));
return release;
}
async function fetchLatest(site, page = 1) {
const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
const res = await qu.getAll(`https://dogfartnetwork.com/tour/scenes/?p=${page}`, '.recent-updates');
return scrapeLatest(res.body.toString(), site);
}
if (res.ok) {
return scrapeLatest(res.items, site);
}
async function fetchScene(url, site) {
const res = await http.get(url);
return scrapeScene(res.body.toString(), url, site);
return res.status;
}
async function fetchProfile(baseActor, entity) {
const slug = slugify(baseActor.name, '+');
const url = `https://www.dogfartnetwork.com/tour/girls/${slug}/`;
const res = await http.get(url);
const res = await qu.getAll(url, '.recent-updates');
if (res.ok) {
const scenes = scrapeLatest(res.body, entity, false);
const scenes = scrapeLatest(res.items, entity, false);
return { scenes };
}
@@ -156,6 +116,6 @@ async function fetchProfile(baseActor, entity) {
module.exports = {
fetchLatest,
fetchScene,
fetchProfile,
scrapeScene,
};

View File

@@ -5,6 +5,27 @@ const http = require('../utils/http');
const slugify = require('../utils/slugify');
/**
 * Builds one release per scene tile on a latest-updates page.
 *
 * @param {Array<{ query: object }>} scenes - contexts exposing the `query` helpers for each tile
 * @param {object} site - channel whose `parameters.latest` is passed as the origin for scene URLs
 * @returns {object[]} releases with url, title, date, actors, poster, covers and entryId
 */
function scrapeLatest(scenes, site) {
	return scenes.map(({ query }) => {
		const url = query.url('figure a', 'href', { origin: site.parameters.latest });
		const title = query.cnt('.has-text-weight-bold, .is-size-6');
		const date = query.date('span.tag', 'YYYY-MM-DD');
		const actors = query.cnts('a.tag');

		// the tile image is kept as cover; the poster is the same path with the trailer variant swapped in
		const cover = query.img('.image img');
		const poster = cover.replace('poster_noplay', 'trailer_noplay');

		// the tile exposes no ID here, so one is composed from the date and the first five words of the title
		const datePart = qu.formatDate(date, 'YYYY-MM-DD');
		const titlePart = slugify(title.split(/\s+/).slice(0, 5).join(' '));

		return {
			url,
			title,
			date,
			actors,
			poster,
			covers: [cover],
			entryId: `${datePart}-${titlePart}`,
		};
	});
}
function scrapeLatestLegacy(scenes, site) {
return scenes.map(({ query }) => {
// if (q('.articleTitleText')) return scrapeFirstLatest(ctx(el), site);
const release = {};
@@ -47,28 +68,35 @@ function scrapeLatest(scenes, site) {
});
}
function scrapeLatestAlt(scenes, site) {
return scenes.map(({ query }) => {
const release = {};
async function scrapeScene({ query }, url, channel, parameters, session) {
const release = {};
release.url = query.url('figure a', 'href', { origin: site.parameters.latest });
release.title = query.cnt('.columns div.is-size-5.has-text-weight-bold');
release.description = query.cnt('.has-background-black-ter > div:nth-child(4)');
release.date = query.date('.has-text-white-ter span.tag', 'YYYY-MM-DD');
release.title = query.cnt('.has-text-weight-bold');
release.date = query.date('span.tag', 'YYYY-MM-DD');
release.actors = query.cnts('a.tag');
release.actors = query.cnts('.has-text-white-ter a.tag[href*="home.php"]');
release.tags = query.cnts('.has-background-black-ter > div:nth-child(6) > span');
const cover = query.img('.image img');
release.poster = query.img('#videoPlayer, #iodvideo', 'poster');
release.photos = Array.from(query.html('body > div:nth-child(6)').matchAll(/src="(http.*jpg)"/g), (match) => match[1]);
release.poster = cover.replace('poster_noplay', 'trailer_noplay');
release.covers = [cover];
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
release.trailer = query.video();
return release;
});
if (!release.trailer && parameters.includeTrailers) {
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
if (trailerRes.ok) {
release.trailer = trailerRes.body;
}
}
return release;
}
function scrapeScene({ query }, site) {
function scrapeSceneLegacy({ query }, site) {
const release = {};
const titleEl = query.q('.articleTitleText');
@@ -97,70 +125,34 @@ function scrapeScene({ query }, site) {
return release;
}
async function scrapeSceneAlt({ query }, url, channel, session) {
const release = {};
release.title = query.cnt('.columns div.is-size-5');
release.description = query.cnt('.has-background-black-ter > div:nth-child(4)');
release.date = query.date('.has-text-white-ter span.tag', 'YYYY-MM-DD');
release.actors = query.cnts('.has-text-white-ter a.tag[href*="home.php"]');
release.tags = query.cnts('.has-background-black-ter > div:nth-child(6) > span');
release.poster = query.img('#videoPlayer, #iodvideo', 'poster');
release.photos = query.imgs('body > div:nth-child(6) img');
release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
release.trailer = query.video();
if (!release.trailer) {
const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
if (trailerRes.ok) {
release.trailer = trailerRes.body;
}
}
return release;
}
async function fetchLatest(site, page = 1) {
const url = (site.parameters?.scraper === 'alt' && `${site.parameters.latest}/home.php?o=latest&p=${page}`)
// || (site.slug === 'paintoy' && `${site.url}/corporal/punishment/gallery.php?type=brief&page=${page}`) // paintoy's site is (was?) partially broken, use front page
|| `${site.url}/scripts/switch_tour.php?type=brief&page=${page}`;
const res = await ((site.parameters?.scraper === 'alt' && qu.getAll(url, 'body > .columns .column'))
// || (site.slug === 'paintoy' && qu.getAll(url, '#articleTable table[cellspacing="2"]'))
|| qu.get(url)); // JSON containing html as a property
const url = `${site.parameters.latest}/home.php?o=latest&p=${page}`;
const res = await qu.getAll(url, 'body > .columns .column', { cookie: 'consent=yes' });
if (res.ok) {
if (site.parameters?.scraper === 'alt') {
return scrapeLatestAlt(res.items, site);
}
/*
if (site.slug === 'paintoy') {
return scrapeLatest(res.items, site);
}
*/
return scrapeLatest(qu.extractAll(res.body.html, '#articleTable > tbody > tr:nth-child(2) > td > table'), site);
return scrapeLatest(res.items, site);
}
return res.status;
}
async function fetchScene(url, site) {
const session = http.session();
const res = await qu.get(url, null, null, { session });
async function fetchLatestLegacy(site, page = 1) {
const url = `${site.url}/scripts/switch_tour.php?type=brief&page=${page}`;
const res = await qu.get(url); // JSON containing html as a property
if (res.ok) {
if (site.parameters?.scraper === 'alt') {
return scrapeSceneAlt(res.item, url, site, session);
}
return scrapeLatestLegacy(qu.extractAll(res.body.html, '#articleTable > tbody > tr:nth-child(2) > td > table'), site);
}
return scrapeScene(res.item, site);
return res.status;
}
async function fetchScene(url, site, baseRelease, parameters) {
const session = http.session();
const res = await qu.get(url, null, { cookie: 'consent=yes' }, { session });
if (res.ok) {
return scrapeScene(res.item, url, site, parameters, session);
}
return res.status;
@@ -169,4 +161,8 @@ async function fetchScene(url, site) {
module.exports = {
fetchLatest,
fetchScene,
legacy: {
fetchLatest: fetchLatestLegacy,
scrapeScene: scrapeSceneLegacy,
},
};

View File

@@ -12,6 +12,13 @@ const channelCodes = {
uha: 'upherasshole',
};
// Quality value for each format badge, keyed by the badge image's file name without the .png extension.
const qualities = Object.fromEntries([
	['v4k', 2160],
	['vFullHD', 1080],
	['vHD', 720],
	['vSD', 480],
]);
const channelRegExp = new RegExp(Object.keys(channelCodes).join('|'), 'i');
function scrapeAll(scenes, entity) {
@@ -42,9 +49,12 @@ function scrapeScene({ query }) {
release.entryId = query.q('.trailerLeft img', 'id').match(/set-target-(\d+)/)[1];
release.title = query.cnt('.infoHeader h1');
release.description = query.cnt('.infoBox p');
release.description = query.cnt('.description');
release.duration = query.duration('.tRuntime');
release.actors = query.cnts('.infoBox .tour_update_models a');
release.tags = query.cnts('.tagcats a');
release.qualities = query.imgs('.avaiFormate img').map((src) => qualities[src.match(/\/(\w+)\.png/)[1]]).filter(Boolean);
release.poster = query.img('.posterimg');
release.photos = query.imgs('.trailerSnaps img').slice(1); // first photo is poster in lower quality

View File

@@ -38,11 +38,8 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
date_precision: release.datePrecision,
slug,
description: release.description,
qualities: release.qualities?.map(Number).filter(Boolean),
comment: release.comment,
// director: release.director,
// likes: release.rating && release.rating.likes,
// dislikes: release.rating && release.rating.dislikes,
// rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
deep: typeof release.deep === 'boolean' ? release.deep : false,
deep_url: release.deepUrl,
updated_batch_id: batchId,