Added media support to XEmpire (HardX) scraper.
This commit is contained in:
parent
e204f56370
commit
d5073a73ce
|
@ -101,7 +101,7 @@ function scrollBanner(event) {
|
||||||
|
|
||||||
function photos() {
|
function photos() {
|
||||||
if (this.release.photos.length) {
|
if (this.release.photos.length) {
|
||||||
return this.release.photos;
|
return this.release.photos.sort(({ index: indexA }, { index: indexB }) => indexA - indexB);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (this.release.poster && !this.release.trailer) {
|
if (this.release.poster && !this.release.trailer) {
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
|
const Promise = require('bluebird');
|
||||||
const bhttp = require('bhttp');
|
const bhttp = require('bhttp');
|
||||||
const cheerio = require('cheerio');
|
const cheerio = require('cheerio');
|
||||||
const moment = require('moment');
|
const moment = require('moment');
|
||||||
|
@ -7,12 +8,55 @@ const moment = require('moment');
|
||||||
const knex = require('../knex');
|
const knex = require('../knex');
|
||||||
const { matchTags } = require('../tags');
|
const { matchTags } = require('../tags');
|
||||||
|
|
||||||
|
/**
 * Request a page over HTTP and hand back its body as text.
 *
 * Despite the name, this returns the raw page markup, not photo URLs;
 * callers feed the result to `scrapePhotos`.
 *
 * @param {string} url - Absolute URL to request with `bhttp.get`.
 * @returns {Promise<string>} The response body converted via `toString()`.
 */
async function fetchPhotos(url) {
    const { body } = await bhttp.get(url);

    return body.toString();
}
|
||||||
|
|
||||||
|
/**
 * Extract photo URLs from one album page.
 *
 * Unlocked previews contribute the `href` of their link. Locked previews
 * contribute their thumbnail `src` with the `_tb.jpg` suffix swapped for
 * `.jpg`.
 *
 * @param {string} html - Markup of an album page.
 * @returns {string[]} Unlocked photo URLs followed by the URLs derived from
 *   locked thumbnails, in document order within each group.
 */
function scrapePhotos(html) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

    const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
        .map((photoIndex, photoElement) => $(photoElement).attr('href'))
        .toArray();

    const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
        .map((photoIndex, photoElement) => {
            const src = $(photoElement).attr('src');

            // An <img> without a src would make `.replace` throw and abort the
            // whole scrape; returning null makes cheerio's map skip the element.
            return src ? src.replace('_tb.jpg', '.jpg') : null;
        })
        .toArray();

    return unlockedPhotos.concat(lockedThumbnails);
}
|
||||||
|
|
||||||
|
/**
 * Collect the photo URLs of an album, following its paginator links.
 *
 * @param {string} [albumPath] - Path (or absolute URL) of the album page,
 *   resolved against `https://${siteDomain}`.
 * @param {string} [siteDomain] - Host name the album lives on.
 * @returns {Promise<string[]>} De-duplicated photo URLs: those of the first
 *   page, then those of the other pages in paginator order. Empty when either
 *   argument is missing.
 */
async function getPhotos(albumPath, siteDomain) {
    // A scene without an album link (or without a resolvable domain) has no
    // photos; bail out instead of requesting a malformed URL.
    if (!albumPath || !siteDomain) {
        return [];
    }

    const baseUrl = `https://${siteDomain}`;
    const albumUrl = new URL(albumPath, baseUrl).href;

    const html = await fetchPhotos(albumUrl);
    // Parsed here only to read the paginator; scrapePhotos parses on its own.
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const photos = scrapePhotos(html);

    const pageUrls = $('.paginatorPages a')
        .map((pageIndex, pageElement) => $(pageElement).attr('href'))
        .toArray()
        .map(page => new URL(page, baseUrl).href);

    // The paginator may link the same page more than once, or link back to the
    // page already fetched; request every distinct page a single time.
    const pages = [...new Set(pageUrls)].filter(pageUrl => pageUrl !== albumUrl);

    const otherPhotos = await Promise.map(pages, async (pageUrl) => {
        const pageHtml = await fetchPhotos(pageUrl);

        return scrapePhotos(pageHtml);
    }, {
        concurrency: 2,
    });

    return [...new Set(photos.concat(otherPhotos.flat()))];
}
|
||||||
|
|
||||||
function scrape(html, site) {
|
function scrape(html, site) {
|
||||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||||
const scenesElements = $('li[data-itemtype=scene]').toArray();
|
const scenesElements = $('li[data-itemtype=scene]').toArray();
|
||||||
|
|
||||||
return scenesElements.map((element) => {
|
return scenesElements.map((element) => {
|
||||||
const sceneLinkElement = $(element).find('.sceneTitle a');
|
const sceneLinkElement = $(element).find('.sceneTitle a');
|
||||||
|
|
||||||
const url = `${site.url}${sceneLinkElement.attr('href')}`;
|
const url = `${site.url}${sceneLinkElement.attr('href')}`;
|
||||||
const title = sceneLinkElement.attr('title');
|
const title = sceneLinkElement.attr('title');
|
||||||
|
|
||||||
|
@ -30,6 +74,9 @@ function scrape(html, site) {
|
||||||
.toArray()
|
.toArray()
|
||||||
.map(value => Number($(value).text()));
|
.map(value => Number($(value).text()));
|
||||||
|
|
||||||
|
const poster = $(element).find('.imgLink img').attr('data-original');
|
||||||
|
const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
|
||||||
|
|
||||||
return {
|
return {
|
||||||
url,
|
url,
|
||||||
entryId,
|
entryId,
|
||||||
|
@ -37,6 +84,11 @@ function scrape(html, site) {
|
||||||
actors,
|
actors,
|
||||||
director: 'Mason',
|
director: 'Mason',
|
||||||
date,
|
date,
|
||||||
|
poster,
|
||||||
|
trailer: {
|
||||||
|
src: trailer,
|
||||||
|
quality: 224,
|
||||||
|
},
|
||||||
rating: {
|
rating: {
|
||||||
likes,
|
likes,
|
||||||
dislikes,
|
dislikes,
|
||||||
|
@ -49,8 +101,11 @@ function scrape(html, site) {
|
||||||
async function scrapeScene(html, url, site) {
|
async function scrapeScene(html, url, site) {
|
||||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||||
const json = $('script[type="application/ld+json"]').html();
|
const json = $('script[type="application/ld+json"]').html();
|
||||||
|
const videoJson = $('script:contains("window.ScenePlayerOptions")').html();
|
||||||
|
|
||||||
const data = JSON.parse(json)[0];
|
const data = JSON.parse(json)[0];
|
||||||
|
const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));
|
||||||
|
|
||||||
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
|
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
|
||||||
|
|
||||||
const title = $('meta[name="twitter:title"]').attr('content');
|
const title = $('meta[name="twitter:title"]').attr('content');
|
||||||
|
@ -76,6 +131,11 @@ async function scrapeScene(html, url, site) {
|
||||||
const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
|
const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
|
||||||
const siteUrl = siteDomain && `https://www.${siteDomain}`;
|
const siteUrl = siteDomain && `https://www.${siteDomain}`;
|
||||||
|
|
||||||
|
const poster = videoData.picPreview;
|
||||||
|
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
|
||||||
|
|
||||||
|
const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
|
||||||
|
|
||||||
const [channelSite, tags] = await Promise.all([
|
const [channelSite, tags] = await Promise.all([
|
||||||
site.isFallback
|
site.isFallback
|
||||||
? knex('sites')
|
? knex('sites')
|
||||||
|
@ -95,6 +155,12 @@ async function scrapeScene(html, url, site) {
|
||||||
director: 'Mason',
|
director: 'Mason',
|
||||||
description,
|
description,
|
||||||
duration,
|
duration,
|
||||||
|
poster,
|
||||||
|
photos,
|
||||||
|
trailer: {
|
||||||
|
src: trailer,
|
||||||
|
quality: parseInt(videoData.sizeOnLoad, 10),
|
||||||
|
},
|
||||||
tags,
|
tags,
|
||||||
rating: {
|
rating: {
|
||||||
stars,
|
stars,
|
||||||
|
|
Loading…
Reference in New Issue