Added 21Sextury scraper. Various improvements.

2019-04-07 20:51:14 +02:00
parent e78e12a3c5
commit ec056a177a
9 changed files with 294 additions and 75 deletions
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@@ -4,6 +4,7 @@ const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

+const knex = require('../knex');
 const { matchTags } = require('../tags');

 function scrape(html, site) {
@@ -15,7 +16,7 @@ function scrape(html, site) {
        const url = `${site.url}${sceneLinkElement.attr('href')}`;
        const title = sceneLinkElement.attr('title');

-        const shootId = $(element).attr('data-itemid');
+        const entryId = $(element).attr('data-itemid');

        const date = moment
            .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY')
@@ -31,7 +32,7 @@ function scrape(html, site) {

        return {
            url,
-            shootId,
+            entryId,
            title,
            actors,
            director: 'Mason',
@@ -45,44 +46,12 @@ function scrape(html, site) {
    });
 }

-async function scrapeSceneFallback($, url, site) {
-    const shootId = new URL(url).pathname.split('/').slice(-1)[0];
-    const title = $('h1.title').text();
-    const date = moment.utc($('.updatedDate').text(), 'MM-DD-YYYY').toDate();
-    const actors = $('.sceneColActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
-
-    const description = ($('.sceneDesc').text() || '').replace(/Video Description:/g, ' ').trim();
-    const stars = $('.currentRating').text().split('/')[0] / 2;
-
-    const rawTags = $('.sceneColCategories > a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-    const tags = await matchTags(rawTags);
-
-    return {
-        url,
-        shootId,
-        title,
-        date,
-        actors,
-        director: 'Mason',
-        description,
-        tags,
-        rating: {
-            stars,
-        },
-        site,
-    };
-}
-
 async function scrapeScene(html, url, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const json = $('script[type="application/ld+json"]').html();

-    if (!json) {
-        return scrapeSceneFallback($, url, site);
-    }
-
    const data = JSON.parse(json)[0];
-    const shootId = new URL(url).pathname.split('/').slice(-1)[0];
+    const entryId = new URL(url).pathname.split('/').slice(-1)[0];

    const title = data.isPartOf.name;
    const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();
@@ -102,11 +71,23 @@ async function scrapeScene(html, url, site) {
    const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

    const rawTags = data.keywords.split(', ');
-    const tags = await matchTags(rawTags);
+    const siteDomain = $('meta[name="twitter:domain"]').attr('content');
+    const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
+    const siteUrl = siteDomain && `https://www.${siteDomain}`;
+
+    const [channelSite, tags] = await Promise.all([
+        site.isFallback
+            ? knex('sites')
+                .where({ url: siteUrl })
+                .orWhere({ id: siteId })
+                .first()
+            : site,
+        matchTags(rawTags),
+    ]);

    return {
-        url,
-        shootId,
+        url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
+        entryId,
        title,
        date,
        actors,
@@ -117,7 +98,7 @@ async function scrapeScene(html, url, site) {
        rating: {
            stars,
        },
-        site,
+        site: channelSite || site,
    };
 }