Added media support to XEmpire (HardX) scraper.
This commit is contained in:
parent
e204f56370
commit
d5073a73ce
|
@ -101,7 +101,7 @@ function scrollBanner(event) {
|
|||
|
||||
function photos() {
|
||||
if (this.release.photos.length) {
|
||||
return this.release.photos;
|
||||
return this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
|
||||
}
|
||||
|
||||
if (this.release.poster && !this.release.trailer) {
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
'use strict';
|
||||
|
||||
const Promise = require('bluebird');
|
||||
const bhttp = require('bhttp');
|
||||
const cheerio = require('cheerio');
|
||||
const moment = require('moment');
|
||||
|
@ -7,12 +8,55 @@ const moment = require('moment');
|
|||
const knex = require('../knex');
|
||||
const { matchTags } = require('../tags');
|
||||
|
||||
/**
 * Request a page over HTTP and return its body as text.
 *
 * @param {string} url - Absolute URL of the page to download.
 * @returns {Promise<string>} The response body converted with `toString()`.
 */
async function fetchPhotos(url) {
  const { body } = await bhttp.get(url);

  return body.toString();
}
|
||||
|
||||
/**
 * Extract the photo URLs from the markup of one photo album page.
 *
 * Unlocked photos are read from the `href` of their link. Locked photos only
 * provide a thumbnail `<img>`; its `_tb.jpg` suffix is swapped for `.jpg`
 * (presumably the full-size image - the site's URL scheme is not verified here).
 *
 * @param {string} html - Markup of an album page.
 * @returns {string[]} Unlocked photo URLs followed by the URLs derived from
 *   locked thumbnails; elements lacking the expected attribute are skipped.
 */
function scrapePhotos(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
    .map((photoIndex, photoElement) => $(photoElement).attr('href'))
    .toArray()
    // a link without a usable href is not a photo
    .filter(Boolean);

  const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
    .map((photoIndex, photoElement) => $(photoElement).attr('src'))
    .toArray()
    // an <img> without src would otherwise throw on .replace() and abort the whole scrape
    .filter(Boolean)
    .map(src => src.replace('_tb.jpg', '.jpg'));

  return unlockedPhotos.concat(lockedThumbnails);
}
|
||||
|
||||
/**
 * Collect the photo URLs of a photo album, following its paginator.
 *
 * The first page is downloaded and scraped, then every other page linked from
 * `.paginatorPages` is fetched (two at a time) and scraped as well.
 *
 * @param {string} albumPath - Site-relative path of the album's first page.
 * @param {string} siteDomain - Host name the album is served from.
 * @returns {Promise<string[]>} Photo URLs of the first page followed by those of
 *   the remaining pages in paginator order; an empty array when there is no
 *   album path or domain to build a URL from.
 */
async function getPhotos(albumPath, siteDomain) {
  // without an album link or domain there is nothing valid to request
  if (!albumPath || !siteDomain) {
    return [];
  }

  const albumUrl = `https://${siteDomain}${albumPath}`;

  const html = await fetchPhotos(albumUrl);
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const photos = scrapePhotos(html);

  const pageLinks = $('.paginatorPages a')
    .map((pageIndex, pageElement) => $(pageElement).attr('href'))
    .toArray();

  // a paginator may link the same page more than once, or link back to the page
  // already scraped above; fetch each remaining page only once to avoid duplicates
  const pages = [...new Set(pageLinks)].filter(page => page && page !== albumPath);

  const otherPhotos = await Promise.map(pages, async (page) => {
    const pageUrl = `https://${siteDomain}${page}`;
    const pageHtml = await fetchPhotos(pageUrl);

    return scrapePhotos(pageHtml);
  }, {
    concurrency: 2,
  });

  return photos.concat(otherPhotos.flat());
}
|
||||
|
||||
function scrape(html, site) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
const scenesElements = $('li[data-itemtype=scene]').toArray();
|
||||
|
||||
return scenesElements.map((element) => {
|
||||
const sceneLinkElement = $(element).find('.sceneTitle a');
|
||||
|
||||
const url = `${site.url}${sceneLinkElement.attr('href')}`;
|
||||
const title = sceneLinkElement.attr('title');
|
||||
|
||||
|
@ -30,6 +74,9 @@ function scrape(html, site) {
|
|||
.toArray()
|
||||
.map(value => Number($(value).text()));
|
||||
|
||||
const poster = $(element).find('.imgLink img').attr('data-original');
|
||||
const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
|
||||
|
||||
return {
|
||||
url,
|
||||
entryId,
|
||||
|
@ -37,6 +84,11 @@ function scrape(html, site) {
|
|||
actors,
|
||||
director: 'Mason',
|
||||
date,
|
||||
poster,
|
||||
trailer: {
|
||||
src: trailer,
|
||||
quality: 224,
|
||||
},
|
||||
rating: {
|
||||
likes,
|
||||
dislikes,
|
||||
|
@ -49,8 +101,11 @@ function scrape(html, site) {
|
|||
async function scrapeScene(html, url, site) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
const json = $('script[type="application/ld+json"]').html();
|
||||
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
|
||||
|
||||
const data = JSON.parse(json)[0];
|
||||
const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
|
||||
|
||||
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
|
||||
|
||||
const title = $('meta[name="twitter:title"]').attr('content');
|
||||
|
@ -76,6 +131,11 @@ async function scrapeScene(html, url, site) {
|
|||
const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
|
||||
const siteUrl = siteDomain && `https://www.${siteDomain}`;
|
||||
|
||||
const poster = videoData.picPreview;
|
||||
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
|
||||
|
||||
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
|
||||
|
||||
const [channelSite, tags] = await Promise.all([
|
||||
site.isFallback
|
||||
? knex('sites')
|
||||
|
@ -95,6 +155,12 @@ async function scrapeScene(html, url, site) {
|
|||
director: 'Mason',
|
||||
description,
|
||||
duration,
|
||||
poster,
|
||||
photos,
|
||||
trailer: {
|
||||
src: trailer,
|
||||
quality: parseInt(videoData.sizeOnLoad, 10),
|
||||
},
|
||||
tags,
|
||||
rating: {
|
||||
stars,
|
||||
|
|
Loading…
Reference in New Issue