Replaced bhttp with patched fork. Improved Jesse Loads Monster Facials scraper reliability (WIP). Added various tag photos.

This commit is contained in:
DebaucheryLibrarian
2020-10-30 17:37:10 +01:00
parent 4af7597441
commit 39f8c037a5
43 changed files with 128 additions and 33 deletions

View File

@@ -792,6 +792,8 @@ async function associateActors(releases, batchId) {
await bulkInsert('releases_actors', releaseActorAssociations, false);
logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`);
return actors;
}

View File

@@ -262,11 +262,13 @@ async function flushEntities(networkSlugs = [], channelSlugs = []) {
return;
}
await Promise.all([
const [deletedScenesCount, deletedMoviesCount] = await Promise.all([
deleteScenes(sceneIds),
deleteMovies(movieIds),
]);
logger.info(`Removed ${deletedScenesCount} scenes and ${deletedMoviesCount} movies for ${entitySlugs}`);
await flushOrphanedMedia();
}

View File

@@ -279,6 +279,8 @@ async function extractSource(baseSource, { existingExtractMediaByUrl }) {
}
async function storeImageFile(media, hashDir, hashSubDir, filename, filedir, filepath) {
logger.silly(`Storing permanent media files for ${media.id} from ${media.src} at ${filepath}`);
try {
const thumbdir = path.join(media.role, 'thumbs', hashDir, hashSubDir);
const thumbpath = path.join(thumbdir, filename);
@@ -620,6 +622,7 @@ async function storeMedias(baseMedias) {
const fetchedMedias = await Promise.map(
baseMedias,
async baseMedia => fetchMedia(baseMedia, { existingSourceMediaByUrl, existingExtractMediaByUrl }),
{ concurrency: 100 }, // don't overload disk (or network, although this has its own throttling)
);
const [uniqueHashMedias, existingHashMedias] = await findHashDuplicates(fetchedMedias);
@@ -627,6 +630,7 @@ async function storeMedias(baseMedias) {
const savedMedias = await Promise.map(
uniqueHashMedias,
async baseMedia => storeFile(baseMedia),
{ concurrency: 100 }, // don't overload disk
);
if (argv.force) {
@@ -634,6 +638,7 @@ async function storeMedias(baseMedias) {
await Promise.map(
existingHashMedias,
async baseMedia => storeFile(baseMedia),
{ concurrency: 100 }, // don't overload disk
);
}
@@ -784,7 +789,7 @@ async function flushOrphanedMedia() {
await fsPromises.rmdir(path.join(config.media.path, 'temp'), { recursive: true });
logger.info('Removed temporary media directory');
logger.info('Cleared temporary media directory');
}
module.exports = {

View File

@@ -126,15 +126,31 @@ async function searchReleases(query, limit = 100) {
}
/**
 * Delete the releases (scenes) with the given IDs.
 *
 * @param {Array} sceneIds - IDs of the rows in the `releases` table to remove.
 * @returns {Promise<number>} The value returned by the delete query (logged as
 *   the number of removed scenes), or 0 when no IDs were supplied.
 */
async function deleteScenes(sceneIds) {
  // Nothing to delete: return before building or running any query.
  if (sceneIds.length === 0) {
    return 0;
  }

  const deleteCount = await knex('releases')
    .whereIn('id', sceneIds)
    .delete();

  logger.info(`Removed ${deleteCount}/${sceneIds.length} scenes`);

  return deleteCount;
}
/**
 * Delete the movies with the given IDs.
 *
 * @param {Array} movieIds - IDs of the rows in the `movies` table to remove.
 * @returns {Promise<number>} The value returned by the delete query (logged as
 *   the number of removed movies), or 0 when no IDs were supplied.
 */
async function deleteMovies(movieIds) {
  // Nothing to delete: return before building or running any query.
  if (movieIds.length === 0) {
    return 0;
  }

  const deleteCount = await knex('movies')
    .whereIn('id', movieIds)
    .delete();

  logger.info(`Removed ${deleteCount}/${movieIds.length} movies`);

  return deleteCount;
}
async function flushBatches(batchIds) {
@@ -161,11 +177,13 @@ async function flushBatches(batchIds) {
return;
}
await Promise.all([
const [deletedScenesCount, deletedMoviesCount] = await Promise.all([
deleteScenes(sceneIds),
deleteMovies(movieIds),
]);
logger.info(`Removed ${deletedScenesCount} scenes and ${deletedMoviesCount} movies for batches ${batchIds}`);
await flushOrphanedMedia();
}

View File

@@ -1,6 +1,6 @@
'use strict';
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const { post } = require('../utils/http');
const { extractDate } = require('../utils/qu');

View File

@@ -1,7 +1,7 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const moment = require('moment');

View File

@@ -2,7 +2,7 @@
/* eslint-disable newline-per-chained-call */
// const Promise = require('bluebird');
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');

View File

@@ -1,6 +1,6 @@
'use strict';
const { get, initAll, formatDate } = require('../utils/qu');
const { get, initAll } = require('../utils/qu');
function scrapeLatest(scenes, dates, site) {
return scenes.map(({ qu }, index) => {
@@ -8,21 +8,23 @@ function scrapeLatest(scenes, dates, site) {
const path = qu.url('a[href*="videos/"]');
if (path) {
release.url = `${site.url}/visitors/${path}`;
if (/\.wmv$/.test(path)) {
release.trailer = `${site.url}/visitors/${path}`;
} else {
release.url = `${site.url}/visitors/${path}`;
}
}
console.log(dates, dates[index], path);
if (dates && dates[index]) {
release.date = dates[index].qu.date(null, 'MM/DD/YYYY');
}
const entryId = path?.match(/videos\/([a-zA-Z0-9]+)(?:_hd)?_trailer/)?.[1]
// release.entryId = release.date ? `${formatDate(release.date, 'YYYY-MM-DD')}-${entryId}` : entryId;
release.entryId = path?.match(/videos\/([a-zA-Z0-9]+)(?:_hd)?_trailer/)?.[1]
|| qu.img('img[src*="graphics/fft"]')?.match(/fft_(\w+).gif/)?.[1];
if (!entryId) {
return null;
}
release.entryId = release.date ? `${formatDate(release.date, 'YYYY-MM-DD')}-${entryId}` : entryId;
release.description = qu.q('tbody tr:nth-child(3) font', true);
const infoLine = qu.q('font[color="#663366"]', true);
@@ -43,7 +45,14 @@ function scrapeScene({ qu }, url, site) {
const release = { url };
const { pathname } = new URL(url);
release.entryId = pathname.match(/videos\/(\w+)_hd_trailer/)[1];
release.entryId = pathname?.match(/videos\/([a-zA-Z0-9]+)(?:_hd)?_trailer/)?.[1];
if (/\.wmv$/.test(pathname)) {
release.trailer = url;
return release;
}
const actor = qu.q('font[color="#990033"] strong', true);
release.actors = [actor];

View File

@@ -2,7 +2,7 @@
const util = require('util');
const Promise = require('bluebird');
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const { JSDOM } = require('jsdom');
const moment = require('moment');

View File

@@ -2,7 +2,7 @@
/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const { CookieJar } = Promise.promisifyAll(require('tough-cookie'));
const moment = require('moment');

View File

@@ -1,6 +1,6 @@
'use strict';
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');

View File

@@ -1,7 +1,7 @@
'use strict';
/* eslint-disable newline-per-chained-call */
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const moment = require('moment');

View File

@@ -1,6 +1,6 @@
'use strict';
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const cheerio = require('cheerio');
const {

View File

@@ -258,8 +258,7 @@ async function fetchLatest(entity, page, options) {
.limit(faker.random.number({ min: 2, max: 15 }))
.pluck('name');
// release.actors = actors(release);
release.actors = [null, 'Charles Darwin'];
release.actors = [...actors(release), null]; // include empty actor to ensure proper handling
release.title = title(release);
return release;

View File

@@ -1,7 +1,7 @@
'use strict';
/* eslint-disable no-unused-vars */
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const { get, ed } = require('../utils/q');
const { fetchApiLatest, fetchApiUpcoming, fetchScene, fetchApiProfile } = require('./gamma');

View File

@@ -1,6 +1,6 @@
'use strict';
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const { fetchLatest, fetchUpcoming, scrapeScene, fetchProfile } = require('./gamma');

View File

@@ -21,12 +21,10 @@ async function bulkUpsert(table, items, conflict, update = true, chunkSize) {
return knex.transaction(async (transaction) => {
const chunked = chunk(items, chunkSize);
// console.log(items.length, chunkSize, chunked.length, chunked[0]?.length);
const queries = chunked
.map(chunkItems => knex.raw(updated || ':query RETURNING *;', {
query: knex(table).insert(chunkItems).transacting(transaction),
}));
query: knex(table).insert(chunkItems),
}).transacting(transaction));
const responses = await Promise.all(queries);

View File

@@ -4,7 +4,7 @@ const util = require('util');
const stream = require('stream');
const config = require('config');
const tunnel = require('tunnel');
const bhttp = require('bhttp');
const bhttp = require('@thependulum/bhttp');
const taskQueue = require('promise-task-queue');
const pipeline = util.promisify(stream.pipeline);