From d5073a73ce5b09e3ba96dc20286863b5ada8410b Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Tue, 29 Oct 2019 03:13:56 +0100
Subject: [PATCH] Added media support to XEmpire (HardX) scraper.

---
 assets/components/release/release.vue |  3 ++-
 src/scrapers/xempire.js               | 97 +++++++++++++++++++++++++++
 2 files changed, 99 insertions(+), 1 deletion(-)

diff --git a/assets/components/release/release.vue b/assets/components/release/release.vue
index 7bd5464b..4f59333a 100644
--- a/assets/components/release/release.vue
+++ b/assets/components/release/release.vue
@@ -101,7 +101,8 @@ function scrollBanner(event) {
 
 function photos() {
     if (this.release.photos.length) {
-        return this.release.photos;
+        // sort a copy: Array#sort works in place and would otherwise reorder this.release.photos itself
+        return [...this.release.photos].sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
     }
 
     if (this.release.poster && !this.release.trailer) {
diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js
index e7cbb110..99e8d22b 100644
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@@ -1,5 +1,6 @@
 'use strict';
 
+const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');
@@ -7,12 +8,86 @@ const moment = require('moment');
 const knex = require('../knex');
 const { matchTags } = require('../tags');
 
+/**
+ * Download a page and return its body as text.
+ *
+ * @param {string} url - Absolute URL to request.
+ * @returns {Promise<string>} The response body converted with `toString()`.
+ */
+async function fetchPhotos(url) {
+    const res = await bhttp.get(url);
+
+    return res.body.toString();
+}
+
+/**
+ * Extract photo URLs from the markup of one album page.
+ *
+ * Unlocked photos are read from the link target; for locked photos the
+ * thumbnail URL is rewritten by replacing `_tb.jpg` with `.jpg`.
+ *
+ * @param {string} html - Album page markup.
+ * @returns {string[]} Unlocked photo URLs followed by the rewritten thumbnail URLs.
+ */
+function scrapePhotos(html) {
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+
+    const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
+        .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
+
+    const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
+        .map((photoIndex, photoElement) => {
+            const src = $(photoElement).attr('src');
+
+            // cheerio's map() drops null results, so an image without src is skipped instead of throwing
+            return src ? src.replace('_tb.jpg', '.jpg') : null;
+        })
+        .toArray();
+
+    return unlockedPhotos.concat(lockedThumbnails);
+}
+
+/**
+ * Collect the photo URLs of an album, following its pagination links.
+ *
+ * @param {string} [albumPath] - Site-relative path of the album's first page.
+ * @param {string} [siteDomain] - Domain the album is hosted on.
+ * @returns {Promise<string[]>} Photo URLs of the first page followed by those
+ *   of the linked pages; empty when the path or the domain is missing.
+ */
+async function getPhotos(albumPath, siteDomain) {
+    // without an album link or a resolved domain there is no valid URL to request
+    if (!albumPath || !siteDomain) {
+        return [];
+    }
+
+    const albumUrl = `https://${siteDomain}${albumPath}`;
+
+    const html = await fetchPhotos(albumUrl);
+    const $ = cheerio.load(html, { normalizeWhitespace: true });
+    const photos = scrapePhotos(html);
+
+    const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
+
+    const otherPhotos = await Promise.map(pages, async (page) => {
+        const pageUrl = `https://${siteDomain}${page}`;
+        const pageHtml = await fetchPhotos(pageUrl);
+
+        return scrapePhotos(pageHtml);
+    }, {
+        concurrency: 2,
+    });
+
+    return photos.concat(otherPhotos.flat());
+}
+
 function scrape(html, site) {
     const $ = cheerio.load(html, { normalizeWhitespace: true });
     const scenesElements = $('li[data-itemtype=scene]').toArray();
 
     return scenesElements.map((element) => {
         const sceneLinkElement = $(element).find('.sceneTitle a');
+
         const url = `${site.url}${sceneLinkElement.attr('href')}`;
         const title = sceneLinkElement.attr('title');
 
@@ -30,6 +105,9 @@ function scrape(html, site) {
             .toArray()
             .map(value => Number($(value).text()));
 
+        const poster = $(element).find('.imgLink img').attr('data-original');
+        const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
+
         return {
             url,
             entryId,
@@ -37,6 +115,11 @@ function scrape(html, site) {
             actors,
             director: 'Mason',
             date,
+            poster,
+            trailer: {
+                src: trailer,
+                quality: 224,
+            },
             rating: {
                 likes,
                 dislikes,
@@ -49,8 +132,11 @@ function scrape(html, site) {
 async function scrapeScene(html, url, site) {
     const $ = cheerio.load(html, { normalizeWhitespace: true });
     const json = $('script[type="application/ld+json"]').html();
+    const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
 
     const data = JSON.parse(json)[0];
+    const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
+
     const entryId = new URL(url).pathname.split('/').slice(-1)[0];
 
     const title = $('meta[name="twitter:title"]').attr('content');
@@ -76,6 +162,11 @@ async function scrapeScene(html, url, site) {
     const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
     const siteUrl = siteDomain && `https://www.${siteDomain}`;
 
+    const poster = videoData.picPreview;
+    const trailer = `${videoData.playerOptions.host}${videoData.url}`;
+
+    const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
+
     const [channelSite, tags] = await Promise.all([
         site.isFallback
             ? knex('sites')
@@ -95,6 +186,12 @@ async function scrapeScene(html, url, site) {
         director: 'Mason',
         description,
         duration,
+        poster,
+        photos,
+        trailer: {
+            src: trailer,
+            quality: parseInt(videoData.sizeOnLoad, 10),
+        },
         tags,
         rating: {
             stars,