Compare commits

2 Commits

Author SHA1 Message Date
DebaucheryLibrarian 5b886b3917 Improved actor extraction for fcuk scraper. Changed 'copyright' to 'credit'. Reduced entity page favicon size. 2020-07-15 05:12:29 +02:00
DebaucheryLibrarian c62df2228b Added scraper for FCUK's coed sites. 2020-07-15 04:51:39 +02:00
33 changed files with 113 additions and 19 deletions

View File

@ -43,7 +43,7 @@
>
<img
:src="sfw ? `/img/${actor.avatar.sfw.thumbnail}` : `/media/${actor.avatar.thumbnail}`"
- :title="actor.avatar.copyright && `© ${actor.avatar.copyright}`"
+ :title="actor.avatar.credit && `© ${actor.avatar.credit}`"
class="avatar"
>
</a>

View File

@ -18,7 +18,7 @@
:src="sfw ? `/img/${actor.avatar.sfw.thumbnail}` : `/media/${actor.avatar.thumbnail}`"
:data-src="sfw ? `/img/${actor.avatar.sfw.thumbnail}` : `/media/${actor.avatar.thumbnail}`"
:data-loading="sfw ? `/img/${actor.avatar.sfw.lazy}` : `/media/${actor.avatar.lazy}`"
- :title="actor.avatar.copyright && `© ${actor.avatar.copyright}`"
+ :title="actor.avatar.credit && `© ${actor.avatar.credit}`"
class="avatar photo"
@load="$parent.$emit('load')"
>
@ -36,7 +36,7 @@
:src="sfw ? `/img/${photo.sfw.thumbnail}` : `/media/${photo.thumbnail}`"
:data-src="sfw ? `/img/${photo.sfw.thumbnail}` : `/media/${photo.thumbnail}`"
:data-loading="sfw ? `/img/${photo.sfw.lazy}` : `/media/${photo.lazy}`"
- :title="`© ${photo.copyright || photo.entity.name}`"
+ :title="`© ${photo.credit || photo.entity.name}`"
class="photo"
@load="$parent.$emit('load')"
>

View File

@ -215,11 +215,14 @@ export default {
height: 2.5rem;
}
- .logo-parent,
- .favicon {
+ .logo-parent {
height: 1.5rem;
}
+ .favicon {
+ height: 1rem;
+ }
.name {
color: var(--text-light);
display: flex;

View File

@ -116,7 +116,7 @@ function initUiActions(_store, _router) {
thumbnail
lazy
comment
- copyright
+ credit
}
birthCountry: countryByBirthCountryAlpha2 {
alpha2
@ -135,7 +135,7 @@ function initUiActions(_store, _router) {
thumbnail
lazy
comment
- copyright
+ credit
}
birthCountry: countryByBirthCountryAlpha2 {
alpha2

25 binary files changed (previews not shown); before → after sizes:

932 B → 1006 B
2.3 KiB → 22 KiB
3.0 KiB → 3.0 KiB
1.9 KiB → 1.9 KiB
3.1 KiB → 3.1 KiB
9.9 KiB → 9.9 KiB
7.8 KiB → 7.8 KiB
(new) → 1.1 KiB
2.1 KiB → 3.7 KiB
3.6 KiB → 3.6 KiB
2.1 KiB → 3.7 KiB
5.1 KiB → 5.1 KiB
2.2 KiB → 2.2 KiB
2.3 KiB → 22 KiB
24 KiB → 24 KiB
8.7 KiB → 8.7 KiB
31 KiB → 31 KiB
33 KiB → 33 KiB
26 KiB → 26 KiB
(new) → 1.1 KiB
3.5 KiB → 5.6 KiB
49 KiB → 49 KiB
3.5 KiB → 5.6 KiB
84 KiB → 84 KiB
16 KiB → 16 KiB

View File

@ -180,6 +180,7 @@ const tags = [
{
name: 'behind the scenes',
slug: 'behind-the-scenes',
+ priority: 6,
},
{
name: 'big dick',

View File

@ -2139,6 +2139,12 @@ const sites = [
url: 'https://eurocoeds.com',
parent: 'fcuk',
},
+ {
+ name: 'After Hours Exposed',
+ slug: 'afterhoursexposed',
+ url: 'https://afterhoursexposed.com',
+ parent: 'fcuk',
+ },
// FOR BONDAGE
{
name: 'Crowd Bondage',

View File

@ -2,7 +2,9 @@
const qu = require('../utils/qu');
- function scrapeLatest(scenes, channel) {
+ // TODO: profile scraping
+ function scrapeLatestBlog(scenes, channel) {
return scenes.map(({ query }) => {
const release = {};
@ -18,6 +20,14 @@ function scrapeLatest(scenes, channel) {
release.description = query.text('p');
release.date = query.date('h5 strong, .videos h3', 'MMM. DD, YYYY', /\w+. \d{2}, \d{4}/);
+ // remove common patterns so only the name is left
+ const curatedTitle = release.title.replace(/\b(part \d|\banal|bts)\b/gi, '').trim();
+ if (!/\band\b/.test(curatedTitle) && new RegExp(curatedTitle).test(release.description)) {
+ // scene title is probably the actor name
+ release.actors = [release.title];
+ }
release.poster = query.img('.bigthumb', null, { origin: channel.url });
release.photos = query.imgs('.smallthumb', null, { origin: channel.url });
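Note on the actor heuristic added above: it strips common title suffixes and, if the remainder also recurs in the scene description, treats the title as the performer's name. A minimal sketch of the idea, using a made-up title and description (only the curatedTitle and if lines come from the diff itself):

// hypothetical input; illustrates the heuristic introduced above
const release = {
  title: 'Roxy Part 2',
  description: 'Roxy is back for another after-hours session.',
};

const curatedTitle = release.title.replace(/\b(part \d|\banal|bts)\b/gi, '').trim(); // 'Roxy'

// no 'and' in the curated title, and it recurs in the description,
// so the title is probably just an actor name; the full original title is stored, as in the diff
if (!/\band\b/.test(curatedTitle) && new RegExp(curatedTitle).test(release.description)) {
  release.actors = [release.title];
}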
@ -27,15 +37,41 @@ function scrapeLatest(scenes, channel) {
});
}
- function scrapeScene({ query }, url, channel) {
+ function scrapeAll(scenes, channel) {
+ return scenes.map(({ query }) => {
+ const release = {};
+ release.url = query.url('.updateInfo h5 a:not([href*="content/"]):not([href*="#coming"])');
+ release.entryId = query.url('.updateThumb img', 'alt');
+ release.title = query.q('.updateInfo h5 a', true);
+ release.actors = query.all('.tour_update_models a', true);
+ release.date = query.date('.availdate, .updateInfo p span:nth-child(2)', 'MM/DD/YYYY');
+ release.poster = query.img('.updateThumb img');
+ const trailer = query.q('.updateInfo h5 a', 'onclick')?.match(/'(.+)'/)?.[1];
+ if (trailer) {
+ release.trailer = {
+ src: `${channel.url}${trailer}`,
+ };
+ }
+ return release;
+ });
+ }
+ function scrapeSceneBlog({ query }, url, channel) {
const release = {};
release.entryId = new URL(url).pathname.match(/\/scene\/(\d+)\/(\d+)/).slice(1, 3).join('-');
- release.title = query.q('h4 strong, .videos h3', true);
+ release.title = query.text('h4 strong, .videos h3');
release.description = query.q('#about p, .videos p', true);
- const actors = query.urls('a[href*="/girl/"]').map(actorUrl => actorUrl.match(/video-(\w+)/)?.[1]).filter(Boolean);
+ const actors = query.urls('a[href*="/girl/"]').map(actorUrl => actorUrl.match(/video-([\w\s]+)/)?.[1]).filter(Boolean);
if (actors.length > 0) {
release.actors = actors;
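Note on the regex change above: widening the capture group from \w+ to [\w\s]+ lets it span spaces, so multi-word names in the girl-page URLs are no longer cut off at the first word. A quick illustration with a made-up URL (only the two patterns come from the diff):

// hypothetical URL shape, only meant to show the difference between the two patterns
const actorUrl = 'https://example.com/girl/video-jane doe';

actorUrl.match(/video-(\w+)/)?.[1];     // 'jane'
actorUrl.match(/video-([\w\s]+)/)?.[1]; // 'jane doe'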
@ -52,20 +88,68 @@ function scrapeScene({ query }, url, channel) {
return release;
}
- async function fetchLatest(channel, page = 1) {
+ function scrapeScene({ query, html }, url, channel) {
+ const release = {};
+ release.title = query.q('.updatesBlock h2', true);
+ release.poster = query.meta('property="og:image"');
+ release.entryId = release.poster.match(/\/content\/(.*)\//)?.[1];
+ const trailer = html.match(/src="(.+\.mp4)"/)?.[1];
+ if (trailer) {
+ release.trailer = {
+ src: `${channel.url}${trailer}`,
+ };
+ }
+ return release;
+ }
+ async function fetchLatestBlog(channel, page) {
const url = `${channel.url}/free/updates/videos/${(page - 1) * 10}`;
const res = await qu.getAll(url, '.videos');
- return res.ok ? scrapeLatest(res.items, channel) : res.status;
+ return res.ok ? scrapeLatestBlog(res.items, channel) : res.status;
}
+ async function fetchLatest(channel, page = 1) {
+ if (channel.parameters?.blog) {
+ return fetchLatestBlog(channel, page);
+ }
+ const url = `${channel.url}/categories/Movies_${page}_d.html`;
+ const res = await qu.getAll(url, '.bodyArea .updateItem');
+ return res.ok ? scrapeAll(res.items, channel) : res.status;
+ }
+ async function fetchUpcoming(channel) {
+ if (channel.parameters?.blog) {
+ return [];
+ }
+ const res = await qu.getAll(channel.url, '#owl-upcomingScenes .updateItem');
+ return res.ok ? scrapeAll(res.items, channel) : res.status;
+ }
async function fetchScene(url, channel) {
const res = await qu.get(url);
- return res.ok ? scrapeScene(res.item, url, channel) : res.status;
+ if (res.ok) {
+ if (channel.parameters?.blog) {
+ return scrapeSceneBlog(res.item, url, channel);
+ }
+ return scrapeScene(res.item, url, channel);
+ }
+ return res.status;
}
module.exports = {
fetchLatest,
fetchScene,
+ fetchUpcoming,
};
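Note on the reworked scraper above: channels whose configuration sets a blog parameter keep the original blog-style scrapers (fetchLatestBlog / scrapeLatestBlog / scrapeSceneBlog), while the other FCUK channels go through the new tour-site scrapers (scrapeAll and the new scrapeScene), which also enables fetchUpcoming. A rough usage sketch with hypothetical channel objects; which sites actually carry the blog flag is configured elsewhere, not shown in this diff:

// illustrative channel objects; only the parameters.blog switch comes from the diff above
const blogChannel = { url: 'https://eurocoeds.com', parameters: { blog: true } };
const tourChannel = { url: 'https://afterhoursexposed.com' };

fetchLatest(blogChannel, 1); // blog path: fetchLatestBlog -> scrapeLatestBlog
fetchLatest(tourChannel, 1); // tour path: scrapeAll on /categories/Movies_1_d.html
fetchUpcoming(tourChannel);  // tour path only; blog channels return an empty list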

View File

@ -53,15 +53,15 @@ async function filterUniqueReleases(latestReleases, accReleases) {
}
function needNextPage(uniqueReleases, pageAccReleases) {
- if (argv.last && pageAccReleases.length < argv.last) {
- // request for last N releases not yet satisfied
- return true;
- }
if (uniqueReleases.length === 0) {
return false;
}
+ if (argv.last && pageAccReleases.length < argv.last) {
+ // TODO: find a way to paginate if scraper filters page with multiple channels, see Kelly Madison
+ return true;
+ }
if (uniqueReleases.every(release => !!release.date)) {
const oldestReleaseOnPage = uniqueReleases
.sort((releaseA, releaseB) => releaseB.date - releaseA.date)
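Note on the reordering above: the empty-page check now runs before the argv.last check, so a page that yields no unique releases stops pagination even when fewer than the requested last N releases have been collected; the new TODO comment records the known downside (scrapers that mix several channels on one page can be cut off early, as with Kelly Madison). A simplified sketch of the resulting early-return order, not the full function; argv holds the CLI options defined elsewhere in the module:

// simplified control flow of needNextPage after this change
function needNextPage(uniqueReleases, pageAccReleases) {
  if (uniqueReleases.length === 0) {
    return false; // nothing new on this page: stop, even if --last is not yet satisfied
  }

  if (argv.last && pageAccReleases.length < argv.last) {
    return true; // still short of the requested last N releases
  }

  return false; // placeholder for the date-based checks shown in the diff above
}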