Added tags and duration to scraping. Added LegalPorno scraper.

2019-03-24 01:29:22 +01:00
parent e8d4b76403
commit 4fcabb4aae
11 changed files with 273 additions and 15 deletions
--- a/src/scrapers/legalporno.js
+++ b/src/scrapers/legalporno.js
@@ -1 +1,101 @@
 'use strict';
+
+const bhttp = require('bhttp');
+const cheerio = require('cheerio');
+const moment = require('moment');
+
+// Maps tag labels as shown on the site to the tag names stored on scenes.
+// Built on a null prototype so that a lookup with an arbitrary scraped label can
+// only ever hit the entries listed here (never inherited members such as
+// `constructor`), and frozen because it is a shared module-level constant.
+const tagMap = Object.freeze(Object.assign(Object.create(null), {
+    '3+ on 1': 'gangbang',
+    anal: 'anal',
+    bbc: 'big black cock',
+    'cum swallowing': 'swallowing',
+    rough: 'rough',
+    'deep throat': 'deepthroat',
+    'double penetration (DP)': 'DP',
+    'double anal (DAP)': 'DAP',
+    'double vaginal (DPP)': 'DVP',
+    'gapes (gaping asshole)': 'gaping',
+    'huge toys': 'toys',
+    interracial: 'interracial',
+    'triple penetration': 'TP',
+}));
+
+/**
+ * Splits a raw scene title into the studio scene ID and the remaining title.
+ *
+ * The ID is only looked for in the last space-separated word, and consists of a
+ * known studio prefix followed by digits (e.g. `GP123`).
+ *
+ * @param {string} originalTitle - Title text as found on the page.
+ * @returns {{ id: ?string, title: ?string }} The detected ID (null when absent)
+ *   and the title without the ID word. Without an ID the trimmed title is
+ *   returned as-is; non-string input yields `{ id: null, title: null }`.
+ */
+function extractTitle(originalTitle) {
+    // an attribute lookup yields undefined when the attribute is missing; don't crash on it
+    if (typeof originalTitle !== 'string') {
+        return { id: null, title: null };
+    }
+
+    // surrounding whitespace would make the last "word" empty and hide the ID
+    const trimmedTitle = originalTitle.trim();
+    const titleComponents = trimmedTitle.split(' ');
+    const lastComponent = titleComponents[titleComponents.length - 1];
+    const sceneIdMatch = lastComponent.match(/(GP|SZ|IV|GIO|AA|GL|BZ|FS)\d+/); // detect studio prefixes
+
+    if (!sceneIdMatch) {
+        return { id: null, title: trimmedTitle };
+    }
+
+    return {
+        id: sceneIdMatch[0],
+        title: titleComponents.slice(0, -1).join(' ').trim(),
+    };
+}
+
+/**
+ * Extracts the scenes listed on an overview page.
+ *
+ * @param {string} html - Markup of the overview page.
+ * @param {Object} site - Site descriptor, attached unchanged to every scene.
+ * @returns {Object[]} One `{ url, id, title, date, site }` entry per
+ *   `.thumbnails > div` element.
+ */
+function scrapeLatest(html, site) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+
+    function toScene(thumbnail) {
+        const $thumbnail = $(thumbnail);
+        const $link = $thumbnail.find('.thumbnail-title a');
+
+        // the link's title attribute carries the full title; extractTitle splits off the ID
+        const { id, title } = extractTitle($link.attr('title'));
+
+        return {
+            url: $link.attr('href'),
+            id,
+            title,
+            date: moment.utc($thumbnail.attr('release'), 'YYYY/MM/DD').toDate(),
+            site,
+        };
+    }
+
+    return $('.thumbnails > div')
+        .toArray()
+        .map((thumbnail) => toScene(thumbnail));
+}
+
+/**
+ * Converts a colon-separated runtime such as `1:02:03` or `32:10` to seconds.
+ *
+ * @param {string} runtime - Runtime text with one to three numeric components.
+ * @returns {?number} Total seconds, or null when the text is empty, has more
+ *   than three components, or contains a non-numeric component.
+ */
+function parseDuration(runtime) {
+    const parts = runtime.trim().split(':').map((part) => part.trim());
+
+    if (parts.length > 3 || parts.some((part) => !/^\d+$/.test(part))) {
+        return null;
+    }
+
+    // fold left to right so the rightmost component always counts as seconds,
+    // whether or not hours (or minutes) are present
+    return parts.reduce((seconds, part) => (seconds * 60) + Number(part), 0);
+}
+
+/**
+ * Extracts the details of a single scene from its page.
+ *
+ * @param {string} html - Markup of the scene page.
+ * @param {string} url - URL of the scene page, passed through to the result.
+ * @param {Object} site - Site descriptor, passed through to the result.
+ * @returns {Object} `{ url, id, title, date, actors, duration, tags, site }`,
+ *   where `duration` is in seconds (null when the runtime cannot be parsed) and
+ *   `tags` only contains labels that have an entry in `tagMap`.
+ */
+function scrapeScene(html, url, site) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+
+    const originalTitle = $('h1.watchpage-title').text().trim();
+    const { id, title } = extractTitle(originalTitle);
+
+    const date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();
+
+    // the first description row is read for actors, the second for tags
+    const [actorsElement, tagsElement] = $('.scene-description__row').toArray();
+    const actors = $(actorsElement)
+        .find('a[href*="com/model"]')
+        .map((actorIndex, actorElement) => $(actorElement).text())
+        .toArray();
+
+    const duration = parseDuration($('span[title="Runtime"]').text());
+
+    const tags = $(tagsElement)
+        .find('a')
+        .map((tagIndex, tagElement) => $(tagElement).text().trim())
+        .toArray()
+        // own-property check so a label like "constructor" can't pick up an inherited member
+        .filter((tag) => Object.prototype.hasOwnProperty.call(tagMap, tag))
+        .map((tag) => tagMap[tag]);
+
+    return {
+        url,
+        id,
+        title,
+        date,
+        actors,
+        duration,
+        tags,
+        site,
+    };
+}
+
+/**
+ * Downloads the site's `/new-videos` page and scrapes the scenes listed on it.
+ *
+ * @param {Object} site - Site descriptor; `site.url` is used as the base URL.
+ * @returns {Promise<Object[]>} The result of `scrapeLatest` for that page.
+ */
+async function fetchLatest(site) {
+    const latestUrl = `${site.url}/new-videos`;
+    const response = await bhttp.get(latestUrl);
+    const html = response.body.toString();
+
+    return scrapeLatest(html, site);
+}
+
+/**
+ * Downloads a scene page and scrapes its details.
+ *
+ * @param {string} url - URL of the scene page.
+ * @param {Object} site - Site descriptor, handed on to `scrapeScene`.
+ * @returns {Promise<Object>} The result of `scrapeScene` for that page.
+ */
+async function fetchScene(url, site) {
+    const response = await bhttp.get(url);
+    const html = response.body.toString();
+
+    return scrapeScene(html, url, site);
+}
+
+// Functions this scraper module exposes to the rest of the application.
+module.exports = { fetchLatest, fetchScene };