1.213.5

Refactored Dogfart scraper to use qu and return unextracted scenes.
1.213.4
2022-04-03 00:49:42 +02:00 · 2022-04-03 00:49:39 +02:00 · 2022-04-02 00:32:29 +02:00 · 2022-04-02 00:32:23 +02:00 · 2022-03-31 23:01:56 +02:00 · 2022-03-31 23:01:54 +02:00
40 changed files with 264 additions and 227 deletions
--- a/assets/components/releases/release.vue
+++ b/assets/components/releases/release.vue
@@ -203,6 +203,19 @@
 				</div>
 			</div>

+			<div
+				v-if="release.qualities"
+				class="row"
+			>
+				<span class="row-label">Available qualities</span>
+
+				<span
+					v-for="quality in release.qualities"
+					:key="quality"
+					class="quality"
+				>{{ quality }}</span>
+			</div>
+
 			<div
 				v-if="release.comment"
 				class="row"
@@ -470,6 +483,16 @@ export default {
 	text-overflow: ellipsis;
 }

+/* render every quality as "<value>p", comma-separated; the last entry gets no separator */
+.quality {
+	&::after {
+		content: 'p, ';
+	}
+
+	&:last-child::after {
+		content: 'p';
+	}
+}
+
 .releases {
 	margin: 0 0 .5rem 0;
 }
--- a/assets/js/config/default.js
+++ b/assets/js/config/default.js
@@ -12,6 +12,7 @@ export default {
 	selectableTags: [
 		'airtight',
 		'anal',
+		'bdsm',
 		'blowbang',
 		'blowjob',
 		'creampie',
--- a/assets/js/fragments.js
+++ b/assets/js/fragments.js
@@ -367,6 +367,7 @@ const releaseFields = `
    date
 	datePrecision
    slug
+	qualities
 	shootId
 	productionDate
 	comment
@@ -475,6 +476,7 @@ const releaseFragment = `
    duration
    createdAt
    shootId
+	qualities
 	productionDate
 	createdBatchId
 	productionLocation
--- a/config/default.js
+++ b/config/default.js
@@ -89,6 +89,10 @@ module.exports = {
 			'uksinners',
 			// mindgeek
 			'pornhub',
+			// insex
+			'paintoy',
+			'aganmedon',
+			'sensualpain',
 		],
 		networks: [
 			// dummy network for testing
--- a/migrations/20220330230122_stats.js
+++ b/migrations/20220330230122_stats.js
@@ -0,0 +1,25 @@
+exports.up = async (knex) => knex.raw(`
+	CREATE MATERIALIZED VIEW entities_stats
+	AS
+		WITH RECURSIVE relations AS (
+			SELECT entities.id, entities.parent_id, count(releases.id) AS releases_count, count(releases.id) AS total_count
+			FROM entities
+			LEFT JOIN releases ON releases.entity_id = entities.id
+			GROUP BY entities.id
+
+			UNION ALL
+
+			SELECT entities.id AS entity_id, count(releases.id) AS releases_count, count(releases.id) + relations.total_count AS total_count
+			FROM entities
+			INNER JOIN relations ON relations.id = entities.parent_id
+			LEFT JOIN releases ON releases.entity_id = entities.id
+			GROUP BY entities.id
+		)
+
+		SELECT relations.id AS entity_id, relations.releases_count
+		FROM relations;
+`);
+
+exports.down = async (knex) => knex.raw(`
+	DROP MATERIALIZED VIEW entities_stats;
+`);
--- a/migrations/20220331135618_qualities.js
+++ b/migrations/20220331135618_qualities.js
@@ -0,0 +1,7 @@
+exports.up = async (knex) => knex.schema.alterTable('releases', (table) => {
+	table.specificType('qualities', 'text[]');
+});
+
+exports.down = async (knex) => knex.schema.alterTable('releases', (table) => {
+	table.dropColumn('qualities');
+});
--- a/migrations/_20220330230122_stats.js
+++ b/migrations/_20220330230122_stats.js
@@ -0,0 +1,12 @@
+exports.up = async (knex) => knex.raw(`
+	CREATE MATERIALIZED VIEW entities_stats
+	AS
+		SELECT entities.id AS entity_id, count(releases.id) AS releases_count
+		FROM entities
+		LEFT JOIN releases ON releases.entity_id = entities.id
+		GROUP BY entities.id;
+`);
+
+exports.down = async (knex) => knex.raw(`
+	DROP MATERIALIZED VIEW entities_stats;
+`);
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,12 @@
 {
    "name": "traxxx",
-    "version": "1.212.9",
+    "version": "1.213.5",
    "lockfileVersion": 2,
    "requires": true,
    "packages": {
        "": {
            "name": "traxxx",
-            "version": "1.212.9",
+            "version": "1.213.5",
            "license": "ISC",
            "dependencies": {
                "@casl/ability": "^5.2.2",
@@ -11650,25 +11650,6 @@
                "webidl-conversions": "^3.0.0"
            }
        },
-        "node_modules/node-fetch/node_modules/tr46": {
-            "version": "0.0.3",
-            "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
-            "integrity": "sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o="
-        },
-        "node_modules/node-fetch/node_modules/webidl-conversions": {
-            "version": "3.0.1",
-            "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
-            "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
-        },
-        "node_modules/node-fetch/node_modules/whatwg-url": {
-            "version": "5.0.0",
-            "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
-            "integrity": "sha1-lmRU6HZUYuN2RNNib2dCzotwll0=",
-            "dependencies": {
-                "tr46": "~0.0.3",
-                "webidl-conversions": "^3.0.0"
-            }
-        },
        "node_modules/node-gyp": {
            "version": "7.1.2",
            "resolved": "https://registry.npmjs.org/node-gyp/-/node-gyp-7.1.2.tgz",
--- a/package.json
+++ b/package.json
@@ -1,6 +1,6 @@
 {
    "name": "traxxx",
-    "version": "1.212.9",
+    "version": "1.213.5",
    "description": "All the latest porn releases in one place",
    "main": "src/app.js",
    "scripts": {
--- a/public/img/logos/pervcity/dpdiva.png
+++ b/public/img/logos/pervcity/dpdiva.png
--- a/public/img/logos/pervcity/lazy/analoverdose.png
+++ b/public/img/logos/pervcity/lazy/analoverdose.png
--- a/public/img/logos/pervcity/lazy/bangingbeauties.png
+++ b/public/img/logos/pervcity/lazy/bangingbeauties.png
--- a/public/img/logos/pervcity/lazy/chocolatebjs.png
+++ b/public/img/logos/pervcity/lazy/chocolatebjs.png
--- a/public/img/logos/pervcity/lazy/dpdiva.png
+++ b/public/img/logos/pervcity/lazy/dpdiva.png
--- a/public/img/logos/pervcity/lazy/favicon.png
+++ b/public/img/logos/pervcity/lazy/favicon.png
--- a/public/img/logos/pervcity/lazy/favicon_dark.png
+++ b/public/img/logos/pervcity/lazy/favicon_dark.png
--- a/public/img/logos/pervcity/lazy/favicon_light.png
+++ b/public/img/logos/pervcity/lazy/favicon_light.png
--- a/public/img/logos/pervcity/lazy/network.png
+++ b/public/img/logos/pervcity/lazy/network.png
--- a/public/img/logos/pervcity/lazy/oraloverdose.png
+++ b/public/img/logos/pervcity/lazy/oraloverdose.png
--- a/public/img/logos/pervcity/lazy/pervcity.png
+++ b/public/img/logos/pervcity/lazy/pervcity.png
--- a/public/img/logos/pervcity/lazy/upherasshole.png
+++ b/public/img/logos/pervcity/lazy/upherasshole.png
--- a/public/img/logos/pervcity/thumbs/analoverdose.png
+++ b/public/img/logos/pervcity/thumbs/analoverdose.png
--- a/public/img/logos/pervcity/thumbs/bangingbeauties.png
+++ b/public/img/logos/pervcity/thumbs/bangingbeauties.png
--- a/public/img/logos/pervcity/thumbs/chocolatebjs.png
+++ b/public/img/logos/pervcity/thumbs/chocolatebjs.png
--- a/public/img/logos/pervcity/thumbs/dpdiva.png
+++ b/public/img/logos/pervcity/thumbs/dpdiva.png
--- a/public/img/logos/pervcity/thumbs/favicon.png
+++ b/public/img/logos/pervcity/thumbs/favicon.png
--- a/public/img/logos/pervcity/thumbs/favicon_dark.png
+++ b/public/img/logos/pervcity/thumbs/favicon_dark.png
--- a/public/img/logos/pervcity/thumbs/favicon_light.png
+++ b/public/img/logos/pervcity/thumbs/favicon_light.png
--- a/public/img/logos/pervcity/thumbs/network.png
+++ b/public/img/logos/pervcity/thumbs/network.png
--- a/public/img/logos/pervcity/thumbs/oraloverdose.png
+++ b/public/img/logos/pervcity/thumbs/oraloverdose.png
--- a/public/img/logos/pervcity/thumbs/pervcity.png
+++ b/public/img/logos/pervcity/thumbs/pervcity.png
--- a/public/img/logos/pervcity/thumbs/upherasshole.png
+++ b/public/img/logos/pervcity/thumbs/upherasshole.png
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@@ -4219,7 +4219,6 @@ const sites = [
 		tags: ['bdsm'],
 		parent: 'insex',
 		parameters: {
-			scraper: 'alt',
 			latest: 'https://www.sexuallybroken.com/sb',
 		},
 	},
@@ -4230,13 +4229,20 @@ const sites = [
 		url: 'https://www.infernalrestraints.com',
 		tags: ['bdsm'],
 		parent: 'insex',
+		parameters: {
+			latest: 'https://www.infernalrestraints.com/ir',
+		},
 	},
 	{
 		slug: 'hardtied',
 		name: 'Hardtied',
+		alias: ['ht'],
 		url: 'https://www.hardtied.com',
 		tags: ['bdsm'],
 		parent: 'insex',
+		parameters: {
+			latest: 'https://www.hardtied.com/ht',
+		},
 	},
 	{
 		slug: 'realtimebondage',
@@ -4245,6 +4251,9 @@ const sites = [
 		url: 'https://www.realtimebondage.com',
 		tags: ['bdsm', 'live'],
 		parent: 'insex',
+		parameters: {
+			latest: 'https://www.realtimebondage.com/rtb',
+		},
 	},
 	{
 		slug: 'topgrl',
@@ -4254,7 +4263,6 @@ const sites = [
 		tags: ['bdsm', 'femdom'],
 		parent: 'insex',
 		parameters: {
-			scraper: 'alt',
 			latest: 'https://www.topgrl.com/tg',
 		},
 	},
@@ -6909,6 +6917,13 @@ const sites = [
 			tourId: 9,
 		},
 	},
+	{
+		slug: 'dpdiva',
+		name: 'DP Diva',
+		url: 'http://dpdiva.com',
+		parent: 'pervcity',
+		tags: ['dp', 'anal'],
+	},
 	// PIERRE WOODMAN
 	{
 		slug: 'woodmancastingx',
--- a/src/app.js
+++ b/src/app.js
@@ -85,23 +85,6 @@ async function startMemorySample(snapshotTriggers = []) {
 	}, config.memorySampling.sampleDuration);
 }

-async function startMemorySample() {
-	await inspector.heap.enable();
-	await inspector.heap.startSampling();
-
-	// monitorMemory();
-
-	logger.info(`Start heap sampling, memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
-
-	setTimeout(async () => {
-		await stopMemorySample();
-
-		if (!done) {
-			await startMemorySample();
-		}
-	}, 30000);
-}
-
 async function init() {
 	try {
 		if (argv.server) {
--- a/src/argv.js
+++ b/src/argv.js
@@ -194,6 +194,7 @@ const { argv } = yargs
 		alias: 'pics',
 	})
 	.option('videos', {
+		alias: 'video',
 		describe: 'Include any trailers or teasers',
 		type: 'boolean',
 		default: true,
--- a/src/media.js
+++ b/src/media.js
@@ -21,6 +21,7 @@ const argv = require('./argv');
 const knex = require('./knex');
 const http = require('./utils/http');
 const bulkInsert = require('./utils/bulk-insert');
+const chunk = require('./utils/chunk');
 const { get } = require('./utils/qu');

 const pipeline = util.promisify(stream.pipeline);
@@ -63,10 +64,10 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
 		? chunks.slice(0, -1).concat(chunks.slice(-1).reverse())
 		: chunks;

-	const groupedMedias = lastPreferredChunks.map((chunk) => {
+	const groupedMedias = lastPreferredChunks.map((mediaChunk) => {
 		// merge chunked medias into single media with grouped fallback priorities,
 		// so the first sources of each media is preferred over all second sources, etc.
-		const sources = chunk
+		const sources = mediaChunk
 			.reduce((accSources, media) => {
 				media.sources.forEach((source, index) => {
 					if (!accSources[index]) {
@@ -82,8 +83,8 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
 			.flat();

 		return {
-			id: chunk[0].id,
-			role: chunk[0].role,
+			id: mediaChunk[0].id,
+			role: mediaChunk[0].role,
 			sources,
 		};
 	});
@@ -235,22 +236,41 @@ async function findSourceDuplicates(baseMedias) {
 		.filter(Boolean);

 	const [existingSourceMedia, existingExtractMedia] = await Promise.all([
-		knex('media').whereIn('source', sourceUrls),
-		knex('media').whereIn('source_page', extractUrls),
+		// may try to check thousands of URLs at once, don't pass all of them to a single query
+		chunk(sourceUrls).reduce(async (chain, sourceUrlsChunk) => {
+			const accUrls = await chain;
+			const existingUrls = await knex('media').whereIn('source', sourceUrlsChunk);
+
+			return [...accUrls, ...existingUrls];
+		}, []),
+		chunk(extractUrls).reduce(async (chain, extractUrlsChunk) => {
+			const accUrls = await chain;
+			const existingUrls = await knex('media').whereIn('source_page', extractUrlsChunk);
+
+			return [...accUrls, ...existingUrls];
+		}, []),
 	]);

 	const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
 	const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');

-	return { existingSourceMediaByUrl, existingExtractMediaByUrl };
+	return {
+		existingSourceMediaByUrl,
+		existingExtractMediaByUrl,
+	};
 }

 async function findHashDuplicates(medias) {
 	const hashes = medias.map((media) => media.meta?.hash || media.entry?.hash).filter(Boolean);

-	const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
-	const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
+	const existingHashMediaEntries = await chunk(hashes, 2).reduce(async (chain, hashesChunk) => {
+		const accHashes = await chain;
+		const existingHashes = await knex('media').whereIn('hash', hashesChunk);

+		return [...accHashes, ...existingHashes];
+	}, []);
+
+	const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
 	const uniqueHashMedias = medias.filter((media) => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);

 	const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => {
@@ -600,11 +620,11 @@ async function fetchSource(source, baseMedia) {
 			const hashStream = new stream.PassThrough();
 			let size = 0;

-			hashStream.on('data', (chunk) => {
-				size += chunk.length;
+			hashStream.on('data', (streamChunk) => {
+				size += streamChunk.length;

 				if (hasherReady) {
-					hasher.write(chunk);
+					hasher.write(streamChunk);
 				}
 			});

--- a/src/scrapers/dogfart.js
+++ b/src/scrapers/dogfart.js
@@ -1,20 +1,16 @@
 'use strict';

-/* eslint-disable newline-per-chained-call */
-// const Promise = require('bluebird');
-const { JSDOM } = require('jsdom');
-const moment = require('moment');
-
-const http = require('../utils/http');
 const slugify = require('../utils/slugify');
 const qu = require('../utils/qu');

 async function getPhotos(albumUrl) {
-	const res = await http.get(albumUrl);
-	const html = res.body.toString();
-	const { document } = new JSDOM(html).window;
+	const res = await qu.get(albumUrl);

-	const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
+	if (!res.ok) {
+		return [];
+	}
+
+	const lastPhotoPage = res.item.query.urls('.preview-image-container a').at(-1);
 	const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);

 	const photoUrls = Array.from({ length: lastPhotoIndex }, (value, index) => {
@@ -29,124 +25,88 @@ async function getPhotos(albumUrl) {
 	return photoUrls;
 }

-function scrapeLatest(html, site, filter = true) {
-	const { document } = new JSDOM(html).window;
-	const sceneElements = Array.from(document.querySelectorAll('.recent-updates'));
+function scrapeLatest(scenes, site, filter = true) {
+	return scenes.reduce((acc, { query }) => {
+		const release = {};

-	return sceneElements.map((element) => {
-		const siteUrl = element.querySelector('.recent-details-title .help-block, .model-details-title .site-name').textContent;
+		const siteUrl = query.cnt('.recent-details-title .help-block, .model-details-title .site-name');
+
+		release.url = query.url('.thumbnail', 'href', { origin: site.type === 'network' ? site.url : site.parent.url });
+		release.entryId = `${site.slug}_${new URL(release.url).pathname.split('/')[4]}`;
+
+		release.title = query.cnt('.scene-title');
+		release.actors = release.title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim());
+
+		// release.poster = `https:${element.querySelector('img').src}`;
+		release.poster = query.img();
+		release.teaser = query.el('.thumbnail', 'data-preview_clip_url');
+
+		release.channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase();

 		if (filter && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) {
 			// different dogfart site
-			return null;
+			return { ...acc, unextracted: [...acc.unextracted, release] };
 		}

-		const sceneLinkElement = element.querySelector('.thumbnail');
-		const url = qu.prefixUrl(sceneLinkElement.href, 'https://dogfartnetwork.com');
-		const { pathname } = new URL(url);
-		const entryId = `${site.slug}_${pathname.split('/')[4]}`;
-
-		const title = element.querySelector('.scene-title').textContent;
-		const actors = title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim());
-
-		const poster = `https:${element.querySelector('img').src}`;
-		const teaser = sceneLinkElement.dataset.preview_clip_url;
-
-		const channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase();
-
-		return {
-			url,
-			entryId,
-			title,
-			actors,
-			poster,
-			teaser: {
-				src: teaser,
-			},
-			site,
-			channel,
-		};
-	}).filter(Boolean);
+		return { ...acc, scenes: [...acc.scenes, release] };
+	}, {
+		scenes: [],
+		unextracted: [],
+	});
 }

-async function scrapeScene(html, url, site) {
-	const { document } = new JSDOM(html).window;
-
-	const title = document.querySelector('.description-title').textContent;
-	const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent);
-	const metaDescription = document.querySelector('meta[itemprop="description"]').content;
-	const description = metaDescription
-		? metaDescription.content
-		: document.querySelector('.description')
-			.textContent
-			.replace(/[ \t\n]{2,}/g, ' ')
-			.replace('...read more', '')
-			.trim();
-
-	const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
+/**
+ * Builds a release object from a Dogfart scene page.
+ *
+ * @param {object} context - qu context; only its `query` helpers are used
+ * @param {string} url - scene URL, used to derive the entry ID and the photo album URL
+ * @param {object} channel - entity whose `url` serves as origin for actor links
+ * @param {object} baseScene - not used by this scraper
+ * @param {object} parameters - photos are only fetched when `includePhotos` is set
+ * @returns {Promise<object>} the scraped release
+ */
+async function scrapeScene({ query }, url, channel, baseScene, parameters) {
+	const release = {};
 	const { origin, pathname } = new URL(url);
-	const entryId = `${channel}_${pathname.split('/').slice(-2)[0]}`;

-	const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
-	const duration = moment
-		.duration(`00:${document
-			.querySelectorAll('.extra-info p')[1]
-			.textContent
-			.match(/\d+:\d+$/)[0]}`)
-		.asSeconds();
+	release.channel = query.cnt('.site-name').split('.')[0].toLowerCase();
+	release.entryId = `${release.channel}_${pathname.split('/').slice(-2)[0]}`;

-	const trailerElement = document.querySelector('.html5-video');
-	const poster = `https:${trailerElement.dataset.poster}`;
-	const { trailer } = trailerElement.dataset;
+	release.title = query.cnt('.description-title');
+	release.actors = query.all('.more-scenes a').map((actorEl) => ({
+		name: query.cnt(actorEl),
+		url: query.url(actorEl, null, 'href', { origin: channel.url }),
+	}));

-	const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0]?.href;
-	const photos = lastPhotosUrl ? await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url) : [];
+	// fall back to the on-page description through the page's own query helper; tolerate a missing element
+	release.description = query.meta('meta[itemprop="description"]') || query.cnt('.description')?.replace(/[ \t\n]{2,}/g, ' ').replace('...read more', '').trim();

-	const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]')?.textContent || document.querySelector('span[itemprop="ratingValue"]')?.textContent) / 2);
-	const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
+	release.date = query.date('meta[itemprop="uploadDate"]', null, null, 'content');
+	release.duration = query.duration('.extra-info p:nth-child(2)');

-	return {
-		entryId,
-		url: `${origin}${pathname}`,
-		title,
-		description,
-		actors,
-		date,
-		duration,
-		poster,
-		photos,
-		trailer: {
-			src: trailer,
-		},
-		tags,
-		rating: {
-			stars,
-		},
-		site,
-		channel,
-	};
+	release.tags = query.cnts('.scene-details .categories a');
+
+	release.trailer = query.video('.html5-video', 'data-trailer');
+	release.poster = query.poster('.html5-video', 'data-poster');
+
+	const lastPhotosUrl = query.urls('.pagination a').at(-1);
+
+	if (lastPhotosUrl && parameters.includePhotos) {
+		release.photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, channel, url);
+	}
+
+	release.stars = Number(((query.number('span[itemprop="average"]') || query.number('span[itemprop="ratingValue"]')) / 2).toFixed(2));
+
+	return release;
 }

 async function fetchLatest(site, page = 1) {
-	const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
+	const res = await qu.getAll(`https://dogfartnetwork.com/tour/scenes/?p=${page}`, '.recent-updates');

-	return scrapeLatest(res.body.toString(), site);
-}
+	if (res.ok) {
+		return scrapeLatest(res.items, site);
+	}

-async function fetchScene(url, site) {
-	const res = await http.get(url);
-
-	return scrapeScene(res.body.toString(), url, site);
+	return res.status;
 }

 async function fetchProfile(baseActor, entity) {
 	const slug = slugify(baseActor.name, '+');
 	const url = `https://www.dogfartnetwork.com/tour/girls/${slug}/`;

-	const res = await http.get(url);
+	const res = await qu.getAll(url, '.recent-updates');

 	if (res.ok) {
-		const scenes = scrapeLatest(res.body, entity, false);
+		const scenes = scrapeLatest(res.items, entity, false);

 		return { scenes };
 	}
@@ -156,6 +116,6 @@ async function fetchProfile(baseActor, entity) {

 module.exports = {
 	fetchLatest,
-	fetchScene,
 	fetchProfile,
+	scrapeScene,
 };
--- a/src/scrapers/insex.js
+++ b/src/scrapers/insex.js
@@ -5,6 +5,27 @@ const http = require('../utils/http');
 const slugify = require('../utils/slugify');

 function scrapeLatest(scenes, site) {
+	return scenes.map(({ query }) => {
+		const release = {};
+
+		release.url = query.url('figure a', 'href', { origin: site.parameters.latest });
+
+		release.title = query.cnt('.has-text-weight-bold, .is-size-6');
+		release.date = query.date('span.tag', 'YYYY-MM-DD');
+		release.actors = query.cnts('a.tag');
+
+		const cover = query.img('.image img');
+
+		release.poster = cover.replace('poster_noplay', 'trailer_noplay');
+		release.covers = [cover];
+
+		release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title.split(/\s+/).slice(0, 5).join(' '))}`;
+
+		return release;
+	});
+}
+
+function scrapeLatestLegacy(scenes, site) {
 	return scenes.map(({ query }) => {
 		// if (q('.articleTitleText')) return scrapeFirstLatest(ctx(el), site);
 		const release = {};
@@ -47,28 +68,35 @@ function scrapeLatest(scenes, site) {
 	});
 }

-function scrapeLatestAlt(scenes, site) {
-	return scenes.map(({ query }) => {
-		const release = {};
+async function scrapeScene({ query }, url, channel, parameters, session) {
+	const release = {};

-		release.url = query.url('figure a', 'href', { origin: site.parameters.latest });
+	release.title = query.cnt('.columns div.is-size-5.has-text-weight-bold');
+	release.description = query.cnt('.has-background-black-ter > div:nth-child(4)');
+	release.date = query.date('.has-text-white-ter span.tag', 'YYYY-MM-DD');

-		release.title = query.cnt('.has-text-weight-bold');
-		release.date = query.date('span.tag', 'YYYY-MM-DD');
-		release.actors = query.cnts('a.tag');
+	release.actors = query.cnts('.has-text-white-ter a.tag[href*="home.php"]');
+	release.tags = query.cnts('.has-background-black-ter > div:nth-child(6) > span');

-		const cover = query.img('.image img');
+	release.poster = query.img('#videoPlayer, #iodvideo', 'poster');
+	release.photos = Array.from(query.html('body > div:nth-child(6)').matchAll(/src="(http.*jpg)"/g), (match) => match[1]);

-		release.poster = cover.replace('poster_noplay', 'trailer_noplay');
-		release.covers = [cover];
+	release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;

-		release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
+	release.trailer = query.video();

-		return release;
-	});
+	if (!release.trailer && parameters.includeTrailers) {
+		const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
+
+		if (trailerRes.ok) {
+			release.trailer = trailerRes.body;
+		}
+	}
+
+	return release;
 }

-function scrapeScene({ query }, site) {
+function scrapeSceneLegacy({ query }, site) {
 	const release = {};

 	const titleEl = query.q('.articleTitleText');
@@ -97,70 +125,34 @@ function scrapeScene({ query }, site) {
 	return release;
 }

-async function scrapeSceneAlt({ query }, url, channel, session) {
-	const release = {};
-
-	release.title = query.cnt('.columns div.is-size-5');
-	release.description = query.cnt('.has-background-black-ter > div:nth-child(4)');
-	release.date = query.date('.has-text-white-ter span.tag', 'YYYY-MM-DD');
-
-	release.actors = query.cnts('.has-text-white-ter a.tag[href*="home.php"]');
-	release.tags = query.cnts('.has-background-black-ter > div:nth-child(6) > span');
-
-	release.poster = query.img('#videoPlayer, #iodvideo', 'poster');
-	release.photos = query.imgs('body > div:nth-child(6) img');
-
-	release.entryId = `${qu.formatDate(release.date, 'YYYY-MM-DD')}-${slugify(release.title)}`;
-
-	release.trailer = query.video();
-
-	if (!release.trailer) {
-		const trailerRes = await http.get(`${channel.url}/api/play-api.php`, { session });
-
-		if (trailerRes.ok) {
-			release.trailer = trailerRes.body;
-		}
-	}
-
-	return release;
-}
-
 async function fetchLatest(site, page = 1) {
-	const url = (site.parameters?.scraper === 'alt' && `${site.parameters.latest}/home.php?o=latest&p=${page}`)
-		// || (site.slug === 'paintoy' && `${site.url}/corporal/punishment/gallery.php?type=brief&page=${page}`) // paintoy's site is (was?) partially broken, use front page
-		|| `${site.url}/scripts/switch_tour.php?type=brief&page=${page}`;
-
-	const res = await ((site.parameters?.scraper === 'alt' && qu.getAll(url, 'body > .columns .column'))
-		// || (site.slug === 'paintoy' && qu.getAll(url, '#articleTable table[cellspacing="2"]'))
-		|| qu.get(url)); // JSON containing html as a property
+	const url = `${site.parameters.latest}/home.php?o=latest&p=${page}`;
+	const res = await qu.getAll(url, 'body > .columns .column', { cookie: 'consent=yes' });

 	if (res.ok) {
-		if (site.parameters?.scraper === 'alt') {
-			return scrapeLatestAlt(res.items, site);
-		}
-
-		/*
-		if (site.slug === 'paintoy') {
-			return scrapeLatest(res.items, site);
-		}
-		*/
-
-		return scrapeLatest(qu.extractAll(res.body.html, '#articleTable > tbody > tr:nth-child(2) > td > table'), site);
+		return scrapeLatest(res.items, site);
 	}

 	return res.status;
 }

-async function fetchScene(url, site) {
-	const session = http.session();
-	const res = await qu.get(url, null, null, { session });
+async function fetchLatestLegacy(site, page = 1) {
+	const url = `${site.url}/scripts/switch_tour.php?type=brief&page=${page}`;
+	const res = await qu.get(url); // JSON containing html as a property

 	if (res.ok) {
-		if (site.parameters?.scraper === 'alt') {
-			return scrapeSceneAlt(res.item, url, site, session);
-		}
+		return scrapeLatestLegacy(qu.extractAll(res.body.html, '#articleTable > tbody > tr:nth-child(2) > td > table'), site);
+	}

-		return scrapeScene(res.item, site);
+	return res.status;
+}
+
+async function fetchScene(url, site, baseRelease, parameters) {
+	const session = http.session();
+	const res = await qu.get(url, null, { cookie: 'consent=yes' }, { session });
+
+	if (res.ok) {
+		return scrapeScene(res.item, url, site, parameters, session);
 	}

 	return res.status;
@@ -169,4 +161,8 @@ async function fetchScene(url, site) {
 module.exports = {
 	fetchLatest,
 	fetchScene,
+	legacy: {
+		fetchLatest: fetchLatestLegacy,
+		scrapeScene: scrapeSceneLegacy,
+	},
 };
--- a/src/scrapers/pervcity.js
+++ b/src/scrapers/pervcity.js
@@ -12,6 +12,13 @@ const channelCodes = {
 	uha: 'upherasshole',
 };

+const qualities = {
+	v4k: 2160,
+	vFullHD: 1080,
+	vHD: 720,
+	vSD: 480,
+};
+
 const channelRegExp = new RegExp(Object.keys(channelCodes).join('|'), 'i');

 function scrapeAll(scenes, entity) {
@@ -42,9 +49,12 @@ function scrapeScene({ query }) {
 	release.entryId = query.q('.trailerLeft img', 'id').match(/set-target-(\d+)/)[1];

 	release.title = query.cnt('.infoHeader h1');
-	release.description = query.cnt('.infoBox p');
+	release.description = query.cnt('.description');
+	release.duration = query.duration('.tRuntime');

 	release.actors = query.cnts('.infoBox .tour_update_models a');
+	release.tags = query.cnts('.tagcats a');
+	release.qualities = query.imgs('.avaiFormate img').map((src) => qualities[src.match(/\/(\w+)\.png/)[1]]).filter(Boolean);

 	release.poster = query.img('.posterimg');
 	release.photos = query.imgs('.trailerSnaps img').slice(1); // first photo is poster in lower quality
--- a/src/store-releases.js
+++ b/src/store-releases.js
@@ -38,11 +38,8 @@ async function curateReleaseEntry(release, batchId, existingRelease, type = 'sce
 		date_precision: release.datePrecision,
 		slug,
 		description: release.description,
+		qualities: release.qualities?.map(Number).filter(Boolean),
 		comment: release.comment,
-		// director: release.director,
-		// likes: release.rating && release.rating.likes,
-		// dislikes: release.rating && release.rating.dislikes,
-		// rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
 		deep: typeof release.deep === 'boolean' ? release.deep : false,
 		deep_url: release.deepUrl,
 		updated_batch_id: batchId,
Author	SHA1	Message	Date
DebaucheryLibrarian	e202e887f9	1.213.5	2022-04-03 00:49:42 +02:00
DebaucheryLibrarian	574c117ab0	Refactored Dogfart scraper to use qu and return unextracted scenes.	2022-04-03 00:49:39 +02:00
DebaucheryLibrarian	d59a57f311	1.213.4	2022-04-02 00:32:29 +02:00
DebaucheryLibrarian	5e499c3685	Added chunking to media duplicate queries to prevent overloading parameters. Added DP Diva to Perv City (coming soon).	2022-04-02 00:32:23 +02:00
DebaucheryLibrarian	17e5ce71b2	1.213.3	2022-03-31 23:01:56 +02:00
DebaucheryLibrarian	5352186319	Insex not fetching video when not required.	2022-03-31 23:01:54 +02:00
DebaucheryLibrarian	e9ba02d65d	1.213.2	2022-03-31 22:46:56 +02:00
DebaucheryLibrarian	39813d4461	Updated Insex scraper.	2022-03-31 22:46:54 +02:00
DebaucheryLibrarian	829a285a2d	1.213.1	2022-03-31 14:34:12 +02:00
DebaucheryLibrarian	a19a77e165	Optionalized qualities.	2022-03-31 14:34:10 +02:00
DebaucheryLibrarian	122dd3eaee	1.213.0	2022-03-31 14:11:23 +02:00
DebaucheryLibrarian	18b219850e	Storing scene qualities. Updated Perv City scraper.	2022-03-31 14:11:13 +02:00