Added 21Sextury scraper. Various improvements.

2019-04-07 20:51:14 +02:00
parent e78e12a3c5
commit ec056a177a
9 changed files with 294 additions and 75 deletions
--- a/src/fetch-releases.js
+++ b/src/fetch-releases.js
@@ -96,6 +96,10 @@ async function storeReleases(releases) {
 async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
    const latestReleases = await scraper.fetchLatest(site, page);

+    if (latestReleases.length === 0) {
+        return [];
+    }
+
    const duplicateReleases = await findDuplicateReleases(latestReleases, site.id);
    const duplicateReleasesIds = new Set(
        duplicateReleases
--- a/src/scrapers/21sextury.js
+++ b/src/scrapers/21sextury.js
@@ -0,0 +1,141 @@
+'use strict';
+
+const bhttp = require('bhttp');
+const cheerio = require('cheerio');
+const moment = require('moment');
+
+const knex = require('../knex');
+const { matchTags } = require('../tags');
+
+/**
+ * Parse a scene listing page into release summaries.
+ *
+ * @param {string} html - Markup of a listing page.
+ * @param {Object} site - Site being scraped; `url`, `name` and `parameters.filter` are read.
+ * @returns {Object[]} One release per `li[data-itemtype=scene]` element that passes the
+ *   optional site filter.
+ */
+function scrape(html, site) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const releases = [];
+
+    $('li[data-itemtype=scene]').toArray().forEach((element) => {
+        const sceneElement = $(element);
+        const siteName = sceneElement.find('.studioName a').attr('title');
+        const isFiltered = site.parameters && site.parameters.filter;
+
+        // when filtering is enabled, drop scenes whose studio name is not this site's name
+        if (isFiltered && siteName.toLowerCase() !== site.name.toLowerCase()) {
+            return;
+        }
+
+        const titleLink = sceneElement.find('.sceneTitle a');
+        const url = `${site.url}${titleLink.attr('href')}`;
+        const title = titleLink.attr('title').trim();
+
+        const entryId = sceneElement.attr('data-itemid');
+
+        const dateText = sceneElement.find('.sceneDate').text();
+        const date = moment.utc(dateText, 'MM-DD-YYYY').toDate();
+
+        const actors = sceneElement.find('.sceneActors a')
+            .map((actorIndex, actorElement) => $(actorElement).attr('title'))
+            .toArray();
+
+        // the first two .value elements hold the like and dislike counts, in that order
+        const ratingValues = sceneElement.find('.value')
+            .toArray()
+            .map(valueElement => Number($(valueElement).text()));
+
+        releases.push({
+            url,
+            entryId,
+            title,
+            actors,
+            date,
+            rating: {
+                likes: ratingValues[0],
+                dislikes: ratingValues[1],
+            },
+            site,
+        });
+    });
+
+    return releases;
+}
+
+/**
+ * Parse a scene page into a release, using the page's JSON-LD metadata.
+ *
+ * @param {string} html - Markup of the scene page.
+ * @param {string} url - URL the page was fetched from; its last path segment becomes the entry ID.
+ * @param {Object} site - Site the URL was matched to. When `site.isFallback` is set, the channel
+ *   is looked up in the `sites` table by the production company name instead.
+ * @returns {Promise<Object>} The scraped release.
+ */
+async function scrapeScene(html, url, site) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const sceneElement = $('#videoWrapper');
+    const json = $('script[type="application/ld+json"]').html();
+
+    const data = JSON.parse(json)[0];
+    const entryId = new URL(url).pathname.split('/').slice(-1)[0];
+
+    const title = data.name;
+    const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();
+
+    // compare on each actor's `gender` property; with these return values male performers
+    // sort before female performers. NOTE(review): confirm this is the intended order
+    const actors = data.actor
+        .sort(({ gender: genderA }, { gender: genderB }) => {
+            if (genderA === 'female' && genderB === 'male') return 1;
+            if (genderA === 'male' && genderB === 'female') return -1;
+
+            return 0;
+        })
+        .map(actor => actor.name);
+
+    const description = data.description || null; // prevent empty string
+    const likes = Number(sceneElement.find('.rating .state_1 .value').text());
+    const dislikes = Number(sceneElement.find('#infoWrapper .rating .state_2 .value').text());
+
+    // NOTE(review): an array of string parts is passed to moment.duration; confirm moment
+    // interprets this as intended for the duration format used in the JSON-LD data
+    const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
+
+    const rawTags = data.keywords.split(', ');
+    const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');
+    const siteId = siteName && siteName.replace(/\s+/g, '').toLowerCase();
+
+    const [channelSite, tags] = await Promise.all([
+        site.isFallback
+            ? knex('sites')
+                .where({ id: siteId })
+                .orWhereRaw('name = ? collate NOCASE', [siteName])
+                .first()
+            : site,
+        matchTags(rawTags),
+    ]);
+
+    // parameters may be a JSON string or an already parsed object (the listing functions in
+    // this file read site.parameters.filter directly), so only parse strings
+    const rawParameters = channelSite && channelSite.parameters;
+    const channelParameters = typeof rawParameters === 'string' && rawParameters
+        ? JSON.parse(rawParameters)
+        : rawParameters;
+
+    // only replace generic URL with site URL if site is not marked to fetch scenes from generic site
+    const originalUrl = channelSite && !(channelParameters && channelParameters.filter)
+        ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`
+        : url;
+
+    return {
+        url: originalUrl,
+        entryId,
+        title,
+        date,
+        actors,
+        description,
+        duration,
+        tags,
+        rating: {
+            likes,
+            dislikes,
+        },
+        site: channelSite || site,
+    };
+}
+
+/**
+ * Fetch and scrape one page of the latest scenes for a site.
+ *
+ * @param {Object} site - Site to fetch; `url` and `parameters.filter` are read.
+ * @param {number} [page=1] - Page number inserted into the listing URL.
+ * @returns {Promise<Object[]>} Releases scraped from the listing page.
+ */
+async function fetchLatest(site, page = 1) {
+    // same guard as scrape(): a site without parameters is fetched from its own URL
+    const baseUrl = site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url;
+    const res = await bhttp.get(`${baseUrl}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`);
+
+    return scrape(res.body.toString(), site);
+}
+
+/**
+ * Fetch and scrape the upcoming scenes listing for a site.
+ *
+ * @param {Object} site - Site to fetch; `url` and `parameters.filter` are read.
+ * @returns {Promise<Object[]>} Releases scraped from the listing page.
+ */
+async function fetchUpcoming(site) {
+    // same guard as scrape(): a site without parameters is fetched from its own URL
+    const baseUrl = site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url;
+    const res = await bhttp.get(`${baseUrl}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`);
+
+    return scrape(res.body.toString(), site);
+}
+
+/**
+ * Fetch a scene page and scrape it into a release.
+ *
+ * @param {string} url - Scene page URL to request.
+ * @param {Object} site - Site passed through to scrapeScene.
+ * @returns {Promise<Object>} The release returned by scrapeScene.
+ */
+async function fetchScene(url, site) {
+    const response = await bhttp.get(url);
+    const html = response.body.toString();
+
+    return scrapeScene(html, url, site);
+}
+
+// only the fetch functions are exported; scrape and scrapeScene stay private to this module
+module.exports = {
+    fetchLatest,
+    fetchUpcoming,
+    fetchScene,
+};
--- a/src/scrapers/index.js
+++ b/src/scrapers/index.js
@@ -9,10 +9,12 @@ const legalporno = require('./legalporno');
 const mofos = require('./mofos');
 const pervcity = require('./pervcity');
 const privateNetwork = require('./private'); // reserved keyword
+const twentyonesextury = require('./21sextury');
 const vixen = require('./vixen');
 const xempire = require('./xempire');

 module.exports = {
+    '21sextury': twentyonesextury,
    blowpass,
    brazzers,
    ddfnetwork,
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@@ -4,6 +4,7 @@ const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

+const knex = require('../knex');
 const { matchTags } = require('../tags');

 function scrape(html, site) {
@@ -15,7 +16,7 @@ function scrape(html, site) {
        const url = `${site.url}${sceneLinkElement.attr('href')}`;
        const title = sceneLinkElement.attr('title');

-        const shootId = $(element).attr('data-itemid');
+        const entryId = $(element).attr('data-itemid');

        const date = moment
            .utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY')
@@ -31,7 +32,7 @@ function scrape(html, site) {

        return {
            url,
-            shootId,
+            entryId,
            title,
            actors,
            director: 'Mason',
@@ -45,44 +46,12 @@ function scrape(html, site) {
    });
 }

-async function scrapeSceneFallback($, url, site) {
-    const shootId = new URL(url).pathname.split('/').slice(-1)[0];
-    const title = $('h1.title').text();
-    const date = moment.utc($('.updatedDate').text(), 'MM-DD-YYYY').toDate();
-    const actors = $('.sceneColActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
-
-    const description = ($('.sceneDesc').text() || '').replace(/Video Description:/g, ' ').trim();
-    const stars = $('.currentRating').text().split('/')[0] / 2;
-
-    const rawTags = $('.sceneColCategories > a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-    const tags = await matchTags(rawTags);
-
-    return {
-        url,
-        shootId,
-        title,
-        date,
-        actors,
-        director: 'Mason',
-        description,
-        tags,
-        rating: {
-            stars,
-        },
-        site,
-    };
-}
-
 async function scrapeScene(html, url, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const json = $('script[type="application/ld+json"]').html();

-    if (!json) {
-        return scrapeSceneFallback($, url, site);
-    }
-
    const data = JSON.parse(json)[0];
-    const shootId = new URL(url).pathname.split('/').slice(-1)[0];
+    const entryId = new URL(url).pathname.split('/').slice(-1)[0];

    const title = data.isPartOf.name;
    const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate();
@@ -102,11 +71,23 @@ async function scrapeScene(html, url, site) {
    const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

    const rawTags = data.keywords.split(', ');
-    const tags = await matchTags(rawTags);
+    const siteDomain = $('meta[name="twitter:domain"]').attr('content');
+    const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
+    const siteUrl = siteDomain && `https://www.${siteDomain}`;
+
+    const [channelSite, tags] = await Promise.all([
+        site.isFallback
+            ? knex('sites')
+                .where({ url: siteUrl })
+                .orWhere({ id: siteId })
+                .first()
+            : site,
+        matchTags(rawTags),
+    ]);

    return {
-        url,
-        shootId,
+        url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
+        entryId,
        title,
        date,
        actors,
@@ -117,7 +98,7 @@ async function scrapeScene(html, url, site) {
        rating: {
            stars,
        },
-        site,
+        site: channelSite || site,
    };
 }