Added Naughty America scraper.
This commit is contained in:
@@ -1,5 +1,6 @@
|
||||
'use strict';
|
||||
|
||||
const twentyonesextury = require('./21sextury');
|
||||
const blowpass = require('./blowpass');
|
||||
const brazzers = require('./brazzers');
|
||||
const ddfnetwork = require('./ddfnetwork');
|
||||
@@ -9,8 +10,8 @@ const legalporno = require('./legalporno');
|
||||
const mofos = require('./mofos');
|
||||
const pervcity = require('./pervcity');
|
||||
const privateNetwork = require('./private'); // reserved keyword
|
||||
const naughtyamerica = require('./naughtyamerica');
|
||||
const realitykings = require('./realitykings');
|
||||
const twentyonesextury = require('./21sextury');
|
||||
const vixen = require('./vixen');
|
||||
const xempire = require('./xempire');
|
||||
|
||||
@@ -25,6 +26,7 @@ module.exports = {
|
||||
mofos,
|
||||
pervcity,
|
||||
private: privateNetwork,
|
||||
naughtyamerica,
|
||||
realitykings,
|
||||
vixen,
|
||||
xempire,
|
||||
|
||||
108
src/scrapers/naughtyamerica.js
Normal file
108
src/scrapers/naughtyamerica.js
Normal file
@@ -0,0 +1,108 @@
|
||||
'use strict';
|
||||
|
||||
/* eslint-disable newline-per-chained-call */
|
||||
const bhttp = require('bhttp');
|
||||
const cheerio = require('cheerio');
|
||||
const moment = require('moment');
|
||||
|
||||
const knex = require('../knex');
|
||||
const { matchTags } = require('../tags');
|
||||
|
||||
// Derives a display title and the scene's entry ID from a scene URL pathname.
// The path's third segment is assumed to look like 'some-scene-title-12345':
// hyphen-separated words with a trailing numeric ID.
function titleExtractor(pathname) {
	const words = pathname.split('/')[2].split('-');
	const entryId = words[words.length - 1];

	// Title-case every word except the trailing ID and join with spaces.
	const title = words
		.slice(0, -1)
		.map(word => `${word.charAt(0).toUpperCase()}${word.slice(1)}`)
		.join(' ');

	return { title, entryId };
}
|
||||
|
||||
// Parses a listing page's HTML into an array of release objects for `site`.
function scrapeLatest(html, site) {
	const $ = cheerio.load(html, { normalizeWhitespace: true });

	return $('.site-list .scene-item').toArray().map((item) => {
		const element = $(item);

		// Canonicalize the scene link: keep protocol/host/path, drop query and hash.
		const { protocol, hostname, pathname } = new URL(element.find('a').first().attr('href'));
		const url = `${protocol}//${hostname}${pathname}`;
		const { title, entryId } = titleExtractor(pathname);

		const date = moment.utc(element.find('.entry-date').text(), 'MMM D, YYYY').toDate();

		const actors = element
			.find('.contain-actors a')
			.map((actorIndex, actorElement) => $(actorElement).text())
			.toArray();

		// Runtime text presumably ends in a 4-char suffix like ' min' — the slice
		// drops it, leaving the minute count, converted to seconds. TODO confirm markup.
		const duration = Number(element.find('.scene-runtime').text().slice(0, -4)) * 60;

		return {
			url,
			entryId,
			title,
			actors,
			date,
			duration,
			rating: null,
			site,
		};
	});
}
|
||||
|
||||
// Parses a single scene page. Looks up the owning channel site (by derived ID
// or display name) and normalized tags concurrently, then returns the release.
async function scrapeScene(html, url, site) {
	const $ = cheerio.load(html, { normalizeWhitespace: true });
	const scene = $('.scene-info');

	// Canonical URL without query string or hash; the entry ID is its last
	// hyphen-separated component.
	const { protocol, hostname, pathname } = new URL(url);
	const originalUrl = `${protocol}//${hostname}${pathname}`;
	const entryId = originalUrl.split('-').pop();

	const title = scene.find('h1.scene-title.grey-text').text();

	// Skip the first two synopsis child nodes, then collapse whitespace runs.
	const description = scene
		.find('.synopsis')
		.contents()
		.slice(2)
		.text()
		.replace(/[\s\n]+/g, ' ')
		.trim();

	const date = moment.utc(scene.find('span.entry-date').text(), 'MMM D, YYYY').toDate();

	const actors = $('a.scene-title.grey-text.link')
		.map((index, el) => $(el).text())
		.toArray();

	// Duration text presumably has a 10-char prefix and 4-char suffix around the
	// minute count (e.g. 'Duration: 32 min') — TODO confirm markup. Stored as seconds.
	const duration = Number(scene.find('.duration-ratings .duration').text().slice(10, -4)) * 60;

	// Channel name → slug: strip whitespace and apostrophes, lowercase.
	const siteName = scene.find('a.site-title').text();
	const siteId = siteName.replace(/[\s']+/g, '').toLowerCase();

	const rawTags = $('.categories a.cat-tag')
		.map((index, el) => $(el).text())
		.toArray();

	const [channelSite, tags] = await Promise.all([
		knex('sites')
			.where({ id: siteId })
			.orWhere({ name: siteName })
			.first(),
		matchTags(rawTags),
	]);

	return {
		url,
		entryId,
		title,
		description,
		actors,
		date,
		duration,
		tags,
		rating: null,
		// Prefer the channel site resolved from the page; fall back to the
		// network site the caller passed in.
		site: channelSite || site,
	};
}
|
||||
|
||||
// Fetches one page of the site's release listing and scrapes it.
async function fetchLatest(site, page = 1) {
	const listUrl = `${site.url}?page=${page}`;
	const res = await bhttp.get(listUrl);

	return scrapeLatest(res.body.toString(), site);
}
|
||||
|
||||
// Fetches a single scene page and scrapes it into a release object.
async function fetchScene(url, site) {
	const res = await bhttp.get(url);
	const html = res.body.toString();

	return scrapeScene(html, url, site);
}
|
||||
|
||||
// Scraper interface: listing fetcher and single-scene fetcher, as wired up in
// the scrapers index (see the `naughtyamerica` require added there).
module.exports = {
	fetchLatest,
	fetchScene,
};
|
||||
Reference in New Issue
Block a user