From d5073a73ce5b09e3ba96dc20286863b5ada8410b Mon Sep 17 00:00:00 2001
From: Niels Simenon <niels.simenon@gmail.com>
Date: Tue, 29 Oct 2019 03:13:56 +0100
Subject: [PATCH] Added media support to XEmpire (HardX) scraper.

---
 assets/components/release/release.vue |  2 +-
 src/scrapers/xempire.js               | 66 +++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/assets/components/release/release.vue b/assets/components/release/release.vue
index 7bd5464b..4f59333a 100644
--- a/assets/components/release/release.vue
+++ b/assets/components/release/release.vue
@@ -101,7 +101,7 @@ function scrollBanner(event) {
 
 function photos() {
     if (this.release.photos.length) {
-        return this.release.photos;
+        return this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
     }
 
     if (this.release.poster && !this.release.trailer) {
diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js
index e7cbb110..99e8d22b 100644
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@@ -1,5 +1,6 @@
 'use strict';
 
+const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');
@@ -7,12 +8,55 @@ const moment = require('moment');
 const knex = require('../knex');
 const { matchTags } = require('../tags');
 
+// Request `url` through bhttp and resolve to the response body converted to a string.
+async function fetchPhotos(url) {
+    const { body } = await bhttp.get(url);
+    return body.toString();
+}
+
+// Collect photo URLs from album page HTML: the hrefs of unlocked photo links, followed by
+// the sources of locked thumbnails with `_tb.jpg` replaced by `.jpg`.
+function scrapePhotos(html) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
+        .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
+
+    const lockedThumbnails = $('.preview .imgLink.lockedPicture img').map((photoIndex, photoElement) => {
+        const src = $(photoElement).attr('src');
+        // cheerio's map() drops undefined results, so an <img> without src is skipped, not a TypeError
+        return src && src.replace('_tb.jpg', '.jpg');
+    }).toArray();
+    return unlockedPhotos.concat(lockedThumbnails);
+}
+
+// Gather photo URLs from the album at `albumPath` on `siteDomain`, including every page
+// linked from the album's paginator. Resolves to a flat array of URL strings.
+async function getPhotos(albumPath, siteDomain) {
+    // no album link or domain means nothing to fetch; the URL would otherwise end in "undefined"
+    if (!albumPath || !siteDomain) return [];
+
+    const html = await fetchPhotos(`https://${siteDomain}${albumPath}`);
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const pageLinks = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
+
+    // drop repeated hrefs so that no page is fetched (and its photos added) twice
+    const pages = Array.from(new Set(pageLinks));
+
+    // Promise is bluebird here: map() preserves input order and limits parallel requests
+    const otherPhotos = await Promise.map(pages, async (page) => {
+        const pageHtml = await fetchPhotos(`https://${siteDomain}${page}`);
+        return scrapePhotos(pageHtml);
+    }, { concurrency: 2 });
+    return scrapePhotos(html).concat(otherPhotos.flat());
+}
+
 function scrape(html, site) {
     const $ = cheerio.load(html, { normalizeWhitespace: true });
     const scenesElements = $('li[data-itemtype=scene]').toArray();
 
     return scenesElements.map((element) => {
         const sceneLinkElement = $(element).find('.sceneTitle a');
+
         const url = `${site.url}${sceneLinkElement.attr('href')}`;
         const title = sceneLinkElement.attr('title');
 
@@ -30,6 +74,9 @@ function scrape(html, site) {
             .toArray()
             .map(value => Number($(value).text()));
 
+        const poster = $(element).find('.imgLink img').attr('data-original');
+        const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
+
         return {
             url,
             entryId,
@@ -37,6 +84,11 @@ function scrape(html, site) {
             actors,
             director: 'Mason',
             date,
+            poster,
+            trailer: {
+                src: trailer,
+                quality: 224,
+            },
             rating: {
                 likes,
                 dislikes,
@@ -49,8 +101,11 @@ function scrape(html, site) {
 async function scrapeScene(html, url, site) {
     const $ = cheerio.load(html, { normalizeWhitespace: true });
     const json = $('script[type="application/ld+json"]').html();
+    const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
 
     const data = JSON.parse(json)[0];
+    const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
+
     const entryId = new URL(url).pathname.split('/').slice(-1)[0];
 
     const title = $('meta[name="twitter:title"]').attr('content');
@@ -76,6 +131,11 @@ async function scrapeScene(html, url, site) {
     const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
     const siteUrl = siteDomain && `https://www.${siteDomain}`;
 
+    const poster = videoData.picPreview;
+    const trailer = `${videoData.playerOptions.host}${videoData.url}`;
+
+    const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
+
     const [channelSite, tags] = await Promise.all([
         site.isFallback
             ? knex('sites')
@@ -95,6 +155,12 @@ async function scrapeScene(html, url, site) {
         director: 'Mason',
         description,
         duration,
+        poster,
+        photos,
+        trailer: {
+            src: trailer,
+            quality: parseInt(videoData.sizeOnLoad, 10),
+        },
         tags,
         rating: {
             stars,