Added chunking to media duplicate queries to avoid exceeding query parameter limits. Added DP Diva to Perv City (coming soon).

DebaucheryLibrarian 2022-04-02 00:32:23 +02:00
parent 17e5ce71b2
commit 5e499c3685
28 changed files with 65 additions and 29 deletions

View File

@@ -12,6 +12,7 @@ export default {
   selectableTags: [
     'airtight',
     'anal',
+    'bdsm',
     'blowbang',
     'blowjob',
     'creampie',

View File

@@ -0,0 +1,25 @@
+exports.up = async (knex) => knex.raw(`
+  CREATE MATERIALIZED VIEW entities_stats
+  AS
+  -- releases_count per entity, plus a running total_count that adds each parent's total onto its children
+  WITH RECURSIVE entity_releases AS (
+    SELECT entities.id, entities.parent_id, count(releases.id) AS releases_count
+    FROM entities
+    LEFT JOIN releases ON releases.entity_id = entities.id
+    GROUP BY entities.id
+  ), relations AS (
+    SELECT entity_releases.id, entity_releases.parent_id, entity_releases.releases_count, entity_releases.releases_count AS total_count
+    FROM entity_releases
+    UNION ALL
+    SELECT entity_releases.id, entity_releases.parent_id, entity_releases.releases_count, entity_releases.releases_count + relations.total_count AS total_count
+    FROM entity_releases
+    INNER JOIN relations ON relations.id = entity_releases.parent_id
+  )
+  SELECT relations.id AS entity_id, relations.releases_count
+  FROM relations;
+`);
+
+exports.down = async (knex) => knex.raw(`
+  DROP MATERIALIZED VIEW entities_stats;
+`);
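The materialized view only reflects the data as of its last refresh, so something will have to refresh it after new releases are imported; that call is not part of this commit. A minimal sketch of what it could look like through the same knex instance (helper name hypothetical):

// Hypothetical helper, not included in this commit: rebuild the stats view after an import run.
async function refreshEntitiesStats(knex) {
  // A plain refresh briefly blocks readers; REFRESH ... CONCURRENTLY would need a unique index on the view.
  await knex.raw('REFRESH MATERIALIZED VIEW entities_stats;');
}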

23 binary image files changed (between 2.4 KiB and 36 KiB); previews not shown.

View File

@@ -6917,6 +6917,13 @@ const sites = [
       tourId: 9,
     },
   },
+  {
+    slug: 'dpdiva',
+    name: 'DP Diva',
+    url: 'http://dpdiva.com',
+    parent: 'pervcity',
+    tags: ['dp', 'anal'],
+  },
   // PIERRE WOODMAN
   {
     slug: 'woodmancastingx',
View File

@@ -85,23 +85,6 @@ async function startMemorySample(snapshotTriggers = []) {
   }, config.memorySampling.sampleDuration);
 }

-async function startMemorySample() {
-  await inspector.heap.enable();
-  await inspector.heap.startSampling();
-
-  // monitorMemory();
-
-  logger.info(`Start heap sampling, memory usage: ${process.memoryUsage.rss() / 1000000} MB`);
-
-  setTimeout(async () => {
-    await stopMemorySample();
-
-    if (!done) {
-      await startMemorySample();
-    }
-  }, 30000);
-}
-
 async function init() {
   try {
     if (argv.server) {

View File

@@ -21,6 +21,7 @@ const argv = require('./argv');
 const knex = require('./knex');
 const http = require('./utils/http');
 const bulkInsert = require('./utils/bulk-insert');
+const chunk = require('./utils/chunk');
 const { get } = require('./utils/qu');

 const pipeline = util.promisify(stream.pipeline);
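The chunk helper required here is not included in this diff. Judging from its call sites further down, chunk(sourceUrls) with no size argument and chunk(hashes, 2) with an explicit one, it presumably splits an array into fixed-size slices with some default size. A minimal sketch under that assumption (the default of 1000 is a guess, not the project's actual value):

// Hypothetical sketch of utils/chunk.js; the real implementation is not part of this diff.
function chunk(items, chunkSize = 1000) {
  const chunks = [];

  for (let i = 0; i < items.length; i += chunkSize) {
    chunks.push(items.slice(i, i + chunkSize));
  }

  return chunks;
}

module.exports = chunk;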
@@ -63,10 +64,10 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
     ? chunks.slice(0, -1).concat(chunks.slice(-1).reverse())
     : chunks;

-  const groupedMedias = lastPreferredChunks.map((chunk) => {
+  const groupedMedias = lastPreferredChunks.map((mediaChunk) => {
     // merge chunked medias into single media with grouped fallback priorities,
     // so the first sources of each media is preferred over all second sources, etc.
-    const sources = chunk
+    const sources = mediaChunk
       .reduce((accSources, media) => {
         media.sources.forEach((source, index) => {
           if (!accSources[index]) {
@@ -82,8 +83,8 @@ function sampleMedias(medias, limit = argv.mediaLimit, preferLast = true) {
       .flat();

     return {
-      id: chunk[0].id,
-      role: chunk[0].role,
+      id: mediaChunk[0].id,
+      role: mediaChunk[0].role,
       sources,
     };
   });
@@ -235,22 +236,41 @@ async function findSourceDuplicates(baseMedias) {
     .filter(Boolean);

   const [existingSourceMedia, existingExtractMedia] = await Promise.all([
-    knex('media').whereIn('source', sourceUrls),
-    knex('media').whereIn('source_page', extractUrls),
+    // we may try to check thousands of URLs at once, don't pass all of them to a single query
+    chunk(sourceUrls).reduce(async (chain, sourceUrlsChunk) => {
+      const accUrls = await chain;
+      const existingUrls = await knex('media').whereIn('source', sourceUrlsChunk);
+
+      return [...accUrls, ...existingUrls];
+    }, []),
+    chunk(extractUrls).reduce(async (chain, extractUrlsChunk) => {
+      const accUrls = await chain;
+      const existingUrls = await knex('media').whereIn('source_page', extractUrlsChunk);
+
+      return [...accUrls, ...existingUrls];
+    }, []),
   ]);

   const existingSourceMediaByUrl = itemsByKey(existingSourceMedia, 'source');
   const existingExtractMediaByUrl = itemsByKey(existingExtractMedia, 'source_page');

-  return { existingSourceMediaByUrl, existingExtractMediaByUrl };
+  return {
+    existingSourceMediaByUrl,
+    existingExtractMediaByUrl,
+  };
 }

 async function findHashDuplicates(medias) {
   const hashes = medias.map((media) => media.meta?.hash || media.entry?.hash).filter(Boolean);

-  const existingHashMediaEntries = await knex('media').whereIn('hash', hashes);
-  const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');
+  const existingHashMediaEntries = await chunk(hashes, 2).reduce(async (chain, hashesChunk) => {
+    const accHashes = await chain;
+    const existingHashes = await knex('media').whereIn('hash', hashesChunk);
+
+    return [...accHashes, ...existingHashes];
+  }, []);
+
+  const existingHashMediaEntriesByHash = itemsByKey(existingHashMediaEntries, 'hash');

   const uniqueHashMedias = medias.filter((media) => !media.entry && !existingHashMediaEntriesByHash[media.meta?.hash]);

   const { selfDuplicateMedias, selfUniqueMediasByHash } = uniqueHashMedias.reduce((acc, media) => {
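Reducing over a promise chain like this runs the chunked queries one after another, which keeps the number of concurrent connections and bound parameters small at the cost of some latency. If throughput mattered more than connection pressure, the same lookup could run the chunks in parallel; a sketch of that alternative (not what this commit does, chunk size picked arbitrarily):

// Alternative sketch, not used by this commit: query each chunk in parallel and flatten the rows.
const existingHashMediaEntries = (await Promise.all(
  chunk(hashes, 100).map((hashesChunk) => knex('media').whereIn('hash', hashesChunk)),
)).flat();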
@@ -600,11 +620,11 @@ async function fetchSource(source, baseMedia) {
   const hashStream = new stream.PassThrough();
   let size = 0;

-  hashStream.on('data', (chunk) => {
-    size += chunk.length;
+  hashStream.on('data', (streamChunk) => {
+    size += streamChunk.length;

     if (hasherReady) {
-      hasher.write(chunk);
+      hasher.write(streamChunk);
     }
   });
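Renaming the callback parameters chunk to mediaChunk and streamChunk above presumably keeps them from shadowing the newly required chunk helper: with the old name, any call to the helper inside those callbacks would resolve to the parameter instead. A minimal illustration, with hypothetical names:

const chunk = require('./utils/chunk'); // the helper added in this commit

someStream.on('data', (chunk) => { // parameter shadows the required helper
  chunk(['a', 'b']); // TypeError: chunk is not a function, because `chunk` is the stream buffer here
});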