Added media support to XEmpire (HardX) scraper.

This commit is contained in:
ThePendulum 2019-10-29 03:13:56 +01:00
parent e204f56370
commit d5073a73ce
2 changed files with 67 additions and 1 deletions

View File

@ -101,7 +101,7 @@ function scrollBanner(event) {
function photos() {
if (this.release.photos.length) {
return this.release.photos;
return this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
}
if (this.release.poster && !this.release.trailer) {

View File

@ -1,5 +1,6 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
@ -7,12 +8,55 @@ const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
/**
 * Download a page and hand back its body as text.
 *
 * @param {string} url - Address passed straight to `bhttp.get`.
 * @returns {Promise<string>} The response body converted with `toString()`.
 */
async function fetchPhotos(url) {
  const { body } = await bhttp.get(url);

  return body.toString();
}
/**
 * Collect photo URLs from the HTML of a single album page.
 *
 * Two kinds of entries are read from the `.preview` container:
 * - `.imgLink.pgUnlocked` links, whose `href` is used as-is;
 * - `.imgLink.lockedPicture` images, whose `src` has `_tb.jpg` swapped for
 *   `.jpg` (presumably turning a thumbnail URL into the full-size one —
 *   verify against the site's markup).
 *
 * @param {string} html - Markup of one album page.
 * @returns {string[]} Unlocked photo URLs followed by the derived locked ones.
 */
function scrapePhotos(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  // cheerio's map() drops null/undefined results, so links lacking an href are skipped.
  const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
    .map((photoIndex, photoElement) => $(photoElement).attr('href'))
    .toArray();

  const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
    .map((photoIndex, photoElement) => {
      const src = $(photoElement).attr('src');

      // An <img> without a src would make .replace() throw and abort the whole
      // scrape; returning null lets map() leave that element out instead.
      return src ? src.replace('_tb.jpg', '.jpg') : null;
    })
    .toArray();

  return unlockedPhotos.concat(lockedThumbnails);
}
/**
 * Gather the photo URLs of an album, following its pagination links.
 *
 * The first page is fetched from `https://{siteDomain}{albumPath}`; every
 * link found under `.paginatorPages` is then fetched (two at a time) and
 * scraped the same way.
 *
 * @param {string} [albumPath] - Site-relative path of the album's first page.
 * @param {string} [siteDomain] - Host name the paths are resolved against.
 * @returns {Promise<string[]>} Photo URLs of the first page followed by those
 *   of the other pages; an empty array when the path or domain is missing.
 */
async function getPhotos(albumPath, siteDomain) {
  // Without both parts there is no valid album URL to request; treat the
  // scene as having no photos rather than fetching "https://<domain>undefined".
  if (!albumPath || !siteDomain) {
    return [];
  }

  const albumUrl = `https://${siteDomain}${albumPath}`;
  const html = await fetchPhotos(albumUrl);
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  const photos = scrapePhotos(html);

  const pagePaths = $('.paginatorPages a')
    .map((pageIndex, pageElement) => $(pageElement).attr('href'))
    .toArray();

  // A paginator may link the same page more than once (e.g. a numbered link
  // plus a "next" arrow) or link back to the page already fetched above;
  // fetch each remaining page only once so photos are not duplicated.
  const pages = [...new Set(pagePaths)].filter(page => page !== albumPath);

  const otherPhotos = await Promise.map(pages, async (page) => {
    const pageUrl = `https://${siteDomain}${page}`;
    const pageHtml = await fetchPhotos(pageUrl);

    return scrapePhotos(pageHtml);
  }, {
    concurrency: 2,
  });

  return photos.concat(otherPhotos.flat());
}
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const scenesElements = $('li[data-itemtype=scene]').toArray();
return scenesElements.map((element) => {
const sceneLinkElement = $(element).find('.sceneTitle a');
const url = `${site.url}${sceneLinkElement.attr('href')}`;
const title = sceneLinkElement.attr('title');
@ -30,6 +74,9 @@ function scrape(html, site) {
.toArray()
.map(value => Number($(value).text()));
const poster = $(element).find('.imgLink img').attr('data-original');
const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
return {
url,
entryId,
@ -37,6 +84,11 @@ function scrape(html, site) {
actors,
director: 'Mason',
date,
poster,
trailer: {
src: trailer,
quality: 224,
},
rating: {
likes,
dislikes,
@ -49,8 +101,11 @@ function scrape(html, site) {
async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const json = $('script[type="application/ld+json"]').html();
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
const data = JSON.parse(json)[0];
const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
const title = $('meta[name="twitter:title"]').attr('content');
@ -76,6 +131,11 @@ async function scrapeScene(html, url, site) {
const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
const siteUrl = siteDomain && `https://www.${siteDomain}`;
const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
const [channelSite, tags] = await Promise.all([
site.isFallback
? knex('sites')
@ -95,6 +155,12 @@ async function scrapeScene(html, url, site) {
director: 'Mason',
description,
duration,
poster,
photos,
trailer: {
src: trailer,
quality: parseInt(videoData.sizeOnLoad, 10),
},
tags,
rating: {
stars,