Added media support to XEmpire (HardX) scraper.

2019-10-29 03:13:56 +01:00 · 2019-10-29 03:13:56 +01:00 · d5073a73ce
parent e204f56370
commit d5073a73ce
2 changed files with 67 additions and 1 deletions
--- a/assets/components/release/release.vue
+++ b/assets/components/release/release.vue
@ -101,7 +101,7 @@ function scrollBanner(event) {

 function photos() {
    if (this.release.photos.length) {
-        return this.release.photos;
+        return this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
    }

    if (this.release.poster && !this.release.trailer) {
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@ -1,5 +1,6 @@
 'use strict';

+const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');
@ -7,12 +8,55 @@ const moment = require('moment');
 const knex = require('../knex');
 const { matchTags } = require('../tags');

+/**
+ * Download a page and return its body as a string.
+ *
+ * Issues a GET request for `url` through bhttp and converts the response
+ * body with `toString()`. No error handling is done here, so a rejected
+ * request propagates to the caller.
+ *
+ * @param {string} url - Address of the page to request.
+ * @returns {Promise<string>} The response body as a string.
+ */
+async function fetchPhotos(url) {
+    const res = await bhttp.get(url);
+
+    return res.body.toString();
+}
+
+/**
+ * Extract photo URLs from the HTML of a single album page.
+ *
+ * Two groups of elements are read from the markup:
+ *  - `.preview .imgLink.pgUnlocked`: the `href` attribute is used as-is.
+ *  - `.preview .imgLink.lockedPicture img`: the `src` attribute is used with
+ *    `_tb.jpg` replaced by `.jpg`.
+ *
+ * @param {string} html - Markup of one album page.
+ * @returns {string[]} Unlocked photo links followed by the rewritten locked
+ *     thumbnail sources, each group in document order.
+ */
+function scrapePhotos(html) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+
+    const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
+        .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
+
+    // presumably `_tb.jpg` marks a thumbnail and dropping the suffix yields the
+    // full-size image — TODO confirm against the site.
+    // NOTE(review): `.replace` with a string pattern only rewrites the first
+    // occurrence, and an <img> without a `src` attribute would make this throw.
+    const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
+        .map((photoIndex, photoElement) => $(photoElement)
+            .attr('src')
+            .replace('_tb.jpg', '.jpg'))
+        .toArray();
+
+    return unlockedPhotos.concat(lockedThumbnails);
+}
+
+/**
+ * Collect the photo URLs of an album, following its pagination links.
+ *
+ * The first album page is fetched from `https://${siteDomain}${albumPath}`
+ * and scraped with `scrapePhotos`. Every `href` found under
+ * `.paginatorPages a` is then fetched and scraped the same way, and the
+ * results are appended after the first page's photos.
+ *
+ * @param {string} albumPath - Path of the album page, appended directly to
+ *     the domain (presumably starts with `/` — verify against caller).
+ * @param {string} siteDomain - Host name used to build the absolute URLs.
+ * @returns {Promise<string[]>} Photo URLs of the first page followed by those
+ *     of each paginated page, in the order the page links appear.
+ */
+async function getPhotos(albumPath, siteDomain) {
+    // NOTE(review): an undefined albumPath is interpolated as the literal
+    // text "undefined" here — confirm callers always pass a path.
+    const albumUrl = `https://${siteDomain}${albumPath}`;
+
+    const html = await fetchPhotos(albumUrl);
+    // The same markup is parsed here for the paginator and again inside
+    // scrapePhotos for the photo links.
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const photos = scrapePhotos(html);
+
+    // NOTE(review): if the paginator also links to the current page or repeats
+    // a page, its photos would appear more than once — confirm the markup.
+    const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
+
+    // `Promise` is bluebird (required at the top of the file); `Promise.map`
+    // is called with a concurrency of 2 and resolves to one array per page.
+    const otherPhotos = await Promise.map(pages, async (page) => {
+        const pageUrl = `https://${siteDomain}${page}`;
+        const pageHtml = await fetchPhotos(pageUrl);
+
+        return scrapePhotos(pageHtml);
+    }, {
+        concurrency: 2,
+    });
+
+    // Flatten the per-page arrays into a single list after the first page.
+    return photos.concat(otherPhotos.flat());
+}
+
 function scrape(html, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const scenesElements = $('li[data-itemtype=scene]').toArray();

    return scenesElements.map((element) => {
        const sceneLinkElement = $(element).find('.sceneTitle a');
+
        const url = `${site.url}${sceneLinkElement.attr('href')}`;
        const title = sceneLinkElement.attr('title');

@ -30,6 +74,9 @@ function scrape(html, site) {
            .toArray()
            .map(value => Number($(value).text()));

+        const poster = $(element).find('.imgLink img').attr('data-original');
+        const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
+
        return {
            url,
            entryId,
@ -37,6 +84,11 @@ function scrape(html, site) {
            actors,
            director: 'Mason',
            date,
+            poster,
+            trailer: {
+                src: trailer,
+                quality: 224,
+            },
            rating: {
                likes,
                dislikes,
@ -49,8 +101,11 @@ function scrape(html, site) {
 async function scrapeScene(html, url, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const json = $('script[type="application/ld+json"]').html();
+    const videoJson = $('script:contains("window.ScenePlayerOptions")').html();

    const data = JSON.parse(json)[0];
+    const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
+
    const entryId = new URL(url).pathname.split('/').slice(-1)[0];

    const title = $('meta[name="twitter:title"]').attr('content');
@ -76,6 +131,11 @@ async function scrapeScene(html, url, site) {
    const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
    const siteUrl = siteDomain && `https://www.${siteDomain}`;

+    const poster = videoData.picPreview;
+    const trailer = `${videoData.playerOptions.host}${videoData.url}`;
+
+    const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
+
    const [channelSite, tags] = await Promise.all([
        site.isFallback
            ? knex('sites')
@ -95,6 +155,12 @@ async function scrapeScene(html, url, site) {
        director: 'Mason',
        description,
        duration,
+        poster,
+        photos,
+        trailer: {
+            src: trailer,
+            quality: parseInt(videoData.sizeOnLoad, 10),
+        },
        tags,
        rating: {
            stars,