Switched to tabs. Added missing actor entries when scraping actors, with batch ID.

2020-05-14 04:26:05 +02:00
parent f1eb29c713
commit 11eb66f834
178 changed files with 16594 additions and 16929 deletions
--- a/src/scrapers/bangbros.js
+++ b/src/scrapers/bangbros.js
@@ -10,44 +10,44 @@ const slugify = require('../utils/slugify');
 const { ex } = require('../utils/q');

 function scrape(html, site) {
-    const $ = cheerio.load(html, { normalizeWhitespace: true });
-    const sceneElements = $('.echThumb').toArray();
+	const $ = cheerio.load(html, { normalizeWhitespace: true });
+	const sceneElements = $('.echThumb').toArray();

-    return sceneElements.map((element) => {
-        const sceneLinkElement = $(element).find('.thmb_lnk');
-        const title = sceneLinkElement.attr('title');
-        const url = `https://bangbros.com${sceneLinkElement.attr('href')}`;
-        const shootId = sceneLinkElement.attr('id') && sceneLinkElement.attr('id').split('-')[1];
-        const entryId = url.split('/')[3].slice(5);
+	return sceneElements.map((element) => {
+		const sceneLinkElement = $(element).find('.thmb_lnk');
+		const title = sceneLinkElement.attr('title');
+		const url = `https://bangbros.com${sceneLinkElement.attr('href')}`;
+		const shootId = sceneLinkElement.attr('id') && sceneLinkElement.attr('id').split('-')[1];
+		const entryId = url.split('/')[3].slice(5);

-        const date = moment.utc($(element).find('.thmb_mr_2 span.faTxt').text(), 'MMM D, YYYY').toDate();
-        const actors = $(element).find('.cast-wrapper a.cast').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();
+		const date = moment.utc($(element).find('.thmb_mr_2 span.faTxt').text(), 'MMM D, YYYY').toDate();
+		const actors = $(element).find('.cast-wrapper a.cast').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();

-        const photoElement = $(element).find('.rollover-image');
-        const poster = `https:${photoElement.attr('data-original')}`;
+		const photoElement = $(element).find('.rollover-image');
+		const poster = `https:${photoElement.attr('data-original')}`;

-        const photosUrl = photoElement.attr('data-rollover-url');
-        const photosMaxIndex = photoElement.attr('data-rollover-max-index');
-        const photos = Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`);
+		const photosUrl = photoElement.attr('data-rollover-url');
+		const photosMaxIndex = photoElement.attr('data-rollover-max-index');
+		const photos = Array.from({ length: photosMaxIndex }, (val, index) => `https:${photosUrl}big${index + 1}.jpg`);

-        const duration = moment.duration(`0:${$(element).find('.thmb_pic b.tTm').text()}`).asSeconds();
-        const channel = $(element).find('a[href*="/websites"]').attr('href').split('/').slice(-1)[0];
+		const duration = moment.duration(`0:${$(element).find('.thmb_pic b.tTm').text()}`).asSeconds();
+		const channel = $(element).find('a[href*="/websites"]').attr('href').split('/').slice(-1)[0];

-        return {
-            url,
-            entryId,
-            shootId,
-            title,
-            actors,
-            date,
-            duration,
-            poster,
-            photos,
-            rating: null,
-            site,
-            channel,
-        };
-    });
+		return {
+			url,
+			entryId,
+			shootId,
+			title,
+			actors,
+			date,
+			duration,
+			poster,
+			photos,
+			rating: null,
+			site,
+			channel,
+		};
+	});
 }

 /* no dates available, breaks database
@@ -80,63 +80,63 @@ function scrapeUpcoming(html, site) {
 */

 function scrapeScene(html, url, _site) {
-    const { qu } = ex(html, '.playerSection');
-    const release = {};
+	const { qu } = ex(html, '.playerSection');
+	const release = {};

-    [release.shootId] = qu.q('.vdoTags + .vdoCast', true).match(/\w+$/);
-    [release.entryId] = url.split('/')[3].match(/\d+$/);
-    release.title = qu.q('.ps-vdoHdd h1', true);
-    release.description = qu.q('.vdoDesc', true);
+	[release.shootId] = qu.q('.vdoTags + .vdoCast', true).match(/\w+$/);
+	[release.entryId] = url.split('/')[3].match(/\d+$/);
+	release.title = qu.q('.ps-vdoHdd h1', true);
+	release.description = qu.q('.vdoDesc', true);

-    release.actors = qu.all('a[href*="/model"]', true);
-    release.tags = qu.all('.vdoTags a', true);
+	release.actors = qu.all('a[href*="/model"]', true);
+	release.tags = qu.all('.vdoTags a', true);

-    release.stars = Number(qu.q('div[class*="like"]', true).match(/^\d+/)[0]) / 20;
+	release.stars = Number(qu.q('div[class*="like"]', true).match(/^\d+/)[0]) / 20;

-    const poster = qu.img('img#player-overlay-image');
-    release.poster = [
-        poster,
-        poster.replace('/big_trailer', '/members/450x340'), // load error fallback
-    ];
+	const poster = qu.img('img#player-overlay-image');
+	release.poster = [
+		poster,
+		poster.replace('/big_trailer', '/members/450x340'), // load error fallback
+	];

-    release.trailer = { src: qu.trailer() };
+	release.trailer = { src: qu.trailer() };

-    // all scenes seem to have 12 album photos available, not always included on the page
-    const firstPhotoUrl = ex(html).qu.img('img[data-slider-index="1"]');
-    release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));
+	// all scenes seem to have 12 album photos available, not always included on the page
+	const firstPhotoUrl = ex(html).qu.img('img[data-slider-index="1"]');
+	release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));

-    const [channel] = qu.url('a[href*="/websites"]').match(/\w+$/);
+	const [channel] = qu.url('a[href*="/websites"]').match(/\w+$/);

-    if (channel === 'bangcasting') release.channel = 'bangbroscasting';
-    if (channel === 'remaster') release.channel = 'bangbrosremastered';
-    else release.channel = channel;
+	if (channel === 'bangcasting') release.channel = 'bangbroscasting';
+	if (channel === 'remaster') release.channel = 'bangbrosremastered';
+	else release.channel = channel;

-    return release;
+	return release;
 }

 function scrapeProfile(html) {
-    const { q } = ex(html);
-    const profile = {};
+	const { q } = ex(html);
+	const profile = {};

-    const avatar = q('.profilePic img', 'src');
-    if (avatar) profile.avatar = `https:${avatar}`;
+	const avatar = q('.profilePic img', 'src');
+	if (avatar) profile.avatar = `https:${avatar}`;

-    profile.releases = scrape(html);
+	profile.releases = scrape(html);

-    return profile;
+	return profile;
 }

 function scrapeProfileSearch(html, actorName) {
-    const { qu } = ex(html);
-    const actorLink = qu.url(`a[title="${actorName}" i][href*="model"]`);
+	const { qu } = ex(html);
+	const actorLink = qu.url(`a[title="${actorName}" i][href*="model"]`);

-    return actorLink ? `https://bangbros.com${actorLink}` : null;
+	return actorLink ? `https://bangbros.com${actorLink}` : null;
 }

 async function fetchLatest(site, page = 1) {
-    const res = await bhttp.get(`${site.url}/${page}`);
+	const res = await bhttp.get(`${site.url}/${page}`);

-    return scrape(res.body.toString(), site);
+	return scrape(res.body.toString(), site);
 }

 /*
@@ -148,43 +148,43 @@ async function fetchUpcoming(site) {
 */

 async function fetchScene(url, site, release) {
-    if (!release?.date) {
-        logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`);
-    }
+	if (!release?.date) {
+		logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`);
+	}

-    const { origin } = new URL(url);
-    const res = await bhttp.get(url);
+	const { origin } = new URL(url);
+	const res = await bhttp.get(url);

-    if (!/https?:\/\/(www.)?bangbros.com\/?$/.test(origin)) {
-        throw new Error('Cannot fetch from this URL. Please find the scene on https://bangbros.com and try again.');
-    }
+	if (!/https?:\/\/(www.)?bangbros.com\/?$/.test(origin)) {
+		throw new Error('Cannot fetch from this URL. Please find the scene on https://bangbros.com and try again.');
+	}

-    return scrapeScene(res.body.toString(), url, site);
+	return scrapeScene(res.body.toString(), url, site);
 }

 async function fetchProfile(actorName) {
-    const actorSlug = slugify(actorName);
-    const url = `https://bangbros.com/search/${actorSlug}`;
-    const res = await bhttp.get(url);
+	const actorSlug = slugify(actorName);
+	const url = `https://bangbros.com/search/${actorSlug}`;
+	const res = await bhttp.get(url);

-    if (res.statusCode === 200) {
-        const actorUrl = scrapeProfileSearch(res.body.toString(), actorName);
+	if (res.statusCode === 200) {
+		const actorUrl = scrapeProfileSearch(res.body.toString(), actorName);

-        if (actorUrl) {
-            const actorRes = await bhttp.get(actorUrl);
+		if (actorUrl) {
+			const actorRes = await bhttp.get(actorUrl);

-            if (actorRes.statusCode === 200) {
-                return scrapeProfile(actorRes.body.toString());
-            }
-        }
-    }
+			if (actorRes.statusCode === 200) {
+				return scrapeProfile(actorRes.body.toString());
+			}
+		}
+	}

-    return null;
+	return null;
 }

 module.exports = {
-    fetchLatest,
-    fetchScene,
-    fetchProfile,
-    // fetchUpcoming, no dates available
+	fetchLatest,
+	fetchScene,
+	fetchProfile,
+	// fetchUpcoming, no dates available
 };