Filtered out empty or unidentified scenes from the update scraper, with a warning. Improved Jesse Loads Monster Facials scraper reliability.

2020-10-29 15:20:59 +01:00
parent f4b1fb4831
commit b188bc5744
6 changed files with 30 additions and 10 deletions
--- a/public/img/logos/evilangel/evilangel.png
+++ b/public/img/logos/evilangel/evilangel.png
--- a/public/img/logos/evilangel/misc/evil-angel_halloween-2020.svg
+++ b/public/img/logos/evilangel/misc/evil-angel_halloween-2020.svg
--- a/public/img/logos/evilangel/network.png
+++ b/public/img/logos/evilangel/network.png
--- a/seeds/02_sites.js
+++ b/seeds/02_sites.js
@@ -3415,21 +3415,21 @@ const sites = [
 	{
 		slug: 'paintoy',
 		name: 'Paintoy',
-		url: 'https://www.paintoy.com',
+		url: 'http://www.paintoy.com',
 		tags: ['bdsm'],
 		parent: 'insex',
 	},
 	{
 		slug: 'aganmedon',
 		name: 'Agan Medon',
-		url: 'https://www.aganmedon.com',
+		url: 'http://www.aganmedon.com',
 		tags: ['bdsm', 'animated'],
 		parent: 'insex',
 	},
 	{
 		slug: 'sensualpain',
 		name: 'Sensual Pain',
-		url: 'https://www.sensualpain.com',
+		url: 'http://www.sensualpain.com',
 		tags: ['bdsm'],
 		parent: 'insex',
 	},
--- a/src/scrapers/jesseloadsmonsterfacials.js
+++ b/src/scrapers/jesseloadsmonsterfacials.js
@@ -1,23 +1,35 @@
 'use strict';

-const { get, initAll } = require('../utils/qu');
+const { get, initAll, formatDate } = require('../utils/qu');

 function scrapeLatest(scenes, dates, site) {
 	return scenes.map(({ qu }, index) => {
 		const release = {};
+		const path = qu.url('a[href*="videos/"]');

-		const path = qu.url('a');
-		release.url = `${site.url}/visitors/${path}`;
-		release.entryId = path.match(/videos\/([a-zA-Z0-9]+)(?:_hd)?_trailer/)?.[1];
+		if (path) {
+			release.url = `${site.url}/visitors/${path}`;
+		}

 		if (dates && dates[index]) {
 			release.date = dates[index].qu.date(null, 'MM/DD/YYYY');
 		}

+		const entryId = path?.match(/videos\/([a-zA-Z0-9]+)(?:_hd)?_trailer/)?.[1]
+			|| qu.img('img[src*="graphics/fft"]')?.match(/fft_(\w+).gif/)?.[1];
+
+		if (!entryId) {
+			return null;
+		}
+
+		release.entryId = release.date ? `${formatDate(release.date, 'YYYY-MM-DD')}-${entryId}` : entryId;
 		release.description = qu.q('tbody tr:nth-child(3) font', true);

 		const infoLine = qu.q('font[color="#663366"]', true);
-		if (infoLine) release.duration = Number(infoLine.match(/(\d+) min/)[1]) * 60;
+
+		if (infoLine) {
+			release.duration = Number(infoLine.match(/(\d+) min/i)?.[1] || infoLine.match(/video: (\d+)/i)?.[1]) * 60 || null;
+		}

 		const poster = qu.img('img[src*="photos/"][width="400"]');
 		release.poster = `${site.url}/visitors/${poster}`;
--- a/src/updates.js
+++ b/src/updates.js
@@ -109,7 +109,12 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
 			return accReleases;
 		}

-		const pageReleasesWithEntity = pageReleases.map(release => ({ ...release, entity: release.entity || entity }));
+		const validPageReleases = pageReleases.filter(release => release?.entryId); // filter out empty and unidentified releases
+		const pageReleasesWithEntity = validPageReleases.map(release => ({ ...release, entity: release.entity || entity }));
+
+		if (pageReleases.length > validPageReleases.length) {
+			logger.warn(`Found ${pageReleases.length - validPageReleases.length} empty or unidentified releases on page ${page} for '${entity.name}'`);
+		}

 		if (needNextPage(pageReleasesWithEntity, accReleases, isUpcoming)) {
 			return scrapeReleasesPage(page + 1, accReleases.concat(pageReleasesWithEntity), isUpcoming);
@@ -119,6 +124,7 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
 	}

 	const releases = await scrapeReleasesPage(argv.page || 1, []);
+
 	const hasDates = releases.every(release => !!release.date);

 	const limitedReleases = (argv.last && releases.slice(0, Math.max(argv.last, 0)))
@@ -133,7 +139,7 @@ async function scrapeReleases(scraper, entity, preData, isUpcoming) {
 }

 async function scrapeLatestReleases(scraper, entity, preData) {
-	if ((!argv.latest && !argv.last && !argv.after) || !scraper.fetchLatest) {
+	if ((!argv.latest && !argv.last) || !scraper.fetchLatest) {
 		return emptyReleases;
 	}