2019-03-04 01:46:33 +00:00
'use strict' ;
2019-10-29 02:13:56 +00:00
const Promise = require ( 'bluebird' ) ;
2019-03-04 01:46:33 +00:00
const bhttp = require ( 'bhttp' ) ;
const cheerio = require ( 'cheerio' ) ;
2019-11-30 04:55:32 +00:00
const { JSDOM } = require ( 'jsdom' ) ;
2019-03-04 03:19:03 +00:00
const moment = require ( 'moment' ) ;
2019-03-04 01:46:33 +00:00
2019-11-13 02:14:24 +00:00
// Tags implied by a channel itself, keyed by channel slug. scrapeScene puts
// these in front of the keywords it reads from the scene page.
const defaultTags = {
  hardx: [],
  darkx: ['interracial'],
  eroticax: [],
  lesbianx: ['lesbian'],
  allblackx: ['ebony', 'bbc'],
};
2019-10-29 02:13:56 +00:00
/**
 * Requests a page with bhttp and hands back its body as a string.
 *
 * @param {string} url - Address passed to `bhttp.get`.
 * @returns {Promise<string>} The response body converted with `toString()`.
 */
async function fetchPhotos(url) {
  const { body } = await bhttp.get(url);

  return body.toString();
}
/**
 * Extracts photo sources from the markup of an album page.
 *
 * Each `.preview .imgLink` anchor contributes at most one entry:
 * - the anchor's `href` when it does not point at the join page;
 * - a `[highRes, thumbnail]` pair when it points at the join page and the
 *   thumbnail lives under `previews/`, so the thumbnail can serve as fallback;
 * - the bare thumbnail `src` for any other join link.
 *
 * Anchors without an `href`, and join links without a thumbnail `src`, are
 * skipped rather than throwing, so a single malformed anchor does not cost
 * the caller every other photo on the page.
 *
 * @param {string} html - Markup of the album page.
 * @returns {Array<string|string[]>} One entry per usable anchor, in document order.
 */
function scrapePhotos(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const photos = [];

  $('.preview .imgLink').toArray().forEach((linkEl) => {
    const url = $(linkEl).attr('href');

    if (!url) {
      return;
    }

    if (!url.includes('/join')) {
      // URL links to full photo
      photos.push(url);
      return;
    }

    // URL links to join page instead of full photo, extract thumbnail
    const src = $(linkEl).find('img').attr('src');

    if (!src) {
      return;
    }

    if (src.includes('previews/')) {
      // resource often serves full photo at a modified URL anyway, add as primary source
      const highRes = src
        .replace('previews/', '')
        .replace('_tb.jpg', '.jpg');

      // keep original thumbnail as fallback in case full photo is not available
      photos.push([highRes, src]);
      return;
    }

    photos.push(src);
  });

  return photos;
}
2019-12-09 04:00:49 +00:00
/**
 * Collects the photos of an album, following its pagination links.
 *
 * The first album page is fetched and scraped, then every distinct
 * `.paginatorPages a` link found on it is fetched (two at a time, through
 * bluebird's `Promise.map`) and scraped as well.
 *
 * Best effort: any failure is logged with `console.error` and an empty array
 * is returned, so a broken album does not fail the caller.
 *
 * @param {string} [albumPath] - Path of the album, appended to the domain.
 *   When falsy no request is made and `[]` is returned.
 * @param {string} siteDomain - Host name the album and its pages are fetched from.
 * @returns {Promise<Array<string|string[]>>} Entries as produced by `scrapePhotos`.
 */
async function getPhotos(albumPath, siteDomain) {
  if (!albumPath) {
    // nothing to fetch; avoids requesting `https://<domain>undefined`
    return [];
  }

  try {
    const html = await fetchPhotos(`https://${siteDomain}${albumPath}`);
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const photos = scrapePhotos(html);

    // drop links without href, links back to the page already scraped, and
    // repeated links, so no page is fetched (and no photo listed) twice
    const pagePaths = $('.paginatorPages a').toArray()
      .map((pageElement) => $(pageElement).attr('href'))
      .filter((pagePath) => pagePath && pagePath !== albumPath);
    const pages = Array.from(new Set(pagePaths));

    const otherPhotos = await Promise.map(pages, async (page) => {
      const pageHtml = await fetchPhotos(`https://${siteDomain}${page}`);

      return scrapePhotos(pageHtml);
    }, {
      concurrency: 2,
    });

    // flatten one level only: per-page arrays are merged, [highRes, thumbnail] pairs stay intact
    return photos.concat(otherPhotos.flat());
  } catch (error) {
    console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);

    return [];
  }
}
2019-03-04 01:46:33 +00:00
/**
 * Turns a scene listing page into release summaries.
 *
 * One object is produced per `li[data-itemtype=scene]` element.
 *
 * @param {string} html - Markup of the listing page.
 * @param {Object} site - Site record; `site.url` is prefixed to each scene
 *   link and the record itself is attached to every result.
 * @returns {Object[]} Releases with `url`, `entryId`, `title`, `actors`,
 *   `director`, `date`, `poster`, `trailer`, `rating` and `site`.
 */
function scrape(html, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  return $('li[data-itemtype=scene]').toArray().map((element) => {
    const sceneEl = $(element);
    const sceneLinkEl = sceneEl.find('.sceneTitle a');

    const url = `${site.url}${sceneLinkEl.attr('href')}`;
    const title = sceneLinkEl.attr('title');
    const entryId = sceneEl.attr('data-itemid');

    const date = moment
      .utc(sceneEl.find('.sceneDate').text(), 'MM-DD-YYYY')
      .toDate();

    const actors = sceneEl.find('.sceneActors a')
      .map((actorIndex, actorElement) => $(actorElement).attr('title'))
      .toArray();

    // the first two `.value` elements are read as likes and dislikes, in that order
    const [likes, dislikes] = sceneEl.find('.value')
      .toArray()
      .map((valueEl) => Number($(valueEl).text()));

    const poster = sceneEl.find('.imgLink img').attr('data-original');
    const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;

    return {
      url,
      entryId,
      title,
      actors,
      director: 'Mason',
      date,
      poster,
      trailer: {
        src: trailer,
        quality: 224,
      },
      rating: {
        likes,
        dislikes,
      },
      site,
    };
  });
}
2019-03-25 02:57:33 +00:00
/**
 * Builds a full release record from a scene page.
 *
 * Data is combined from three script blocks embedded in the page (the
 * `application/ld+json` block, the `dataLayer = ` assignment and the
 * `window.ScenePlayerOptions` assignment), a few meta tags, and the photo
 * album linked from `.picturesItem a`.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - Address the page was fetched from; its last two path
 *   segments are reused for the returned `url`, and its last segment is the
 *   fallback entry ID.
 * @param {Object} site - Site record, attached to the result as-is.
 * @returns {Promise<Object>} Release with `url`, `entryId`, `title`, `date`,
 *   `actors`, `director`, `description`, `duration`, `poster`, `photos`,
 *   `trailer`, `tags`, `rating`, `site` and `channel`.
 * @throws {SyntaxError|TypeError} When an expected script block is missing or
 *   cannot be parsed, or `url` is not a valid URL.
 */
async function scrapeScene(html, url, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  const json = $('script[type="application/ld+json"]').html();
  const json2 = $('script:contains("dataLayer = ")').html();
  const videoJson = $('script:contains("window.ScenePlayerOptions")').html();

  const data = JSON.parse(json)[0];
  // keep everything from the first `[{` up to, but excluding, the script's last
  // character (presumably the `;` closing the assignment)
  const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
  // keep the object literal from `{"id":` through the `}` of the first `};`
  const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));

  const pathSegments = new URL(url).pathname.split('/');

  const entryId = data2.sceneDetails.sceneId || pathSegments.slice(-1)[0];
  const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
  const description = data2.sceneDetails.sceneDescription
    || data.description
    || $('meta[name="twitter:description"]').attr('content');

  // date in data object is not the release date of the scene, but the date the entry was added
  const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();

  const actors = (data2.sceneDetails.sceneActors || data.actor).map((actor) => actor.actorName || actor.name);
  // rescale the site's rating to a five-star range
  const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;
  // the first two characters of the duration are dropped before splitting on `:`
  const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

  // only AllBlackX has no twitter domain, no other useful hints available
  const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com';
  const siteSlug = siteDomain.split('.')[0].toLowerCase();
  const siteUrl = `https://www.${siteDomain}`;

  const poster = videoData.picPreview;
  const trailer = `${videoData.playerOptions.host}${videoData.url}`;
  const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);

  const rawTags = data.keywords.split(', ');
  // channels without an entry in defaultTags simply contribute no extra tags
  const tags = [...(defaultTags[siteSlug] || []), ...rawTags];

  return {
    url: `${siteUrl}/en/video/${pathSegments.slice(-2).join('/')}`,
    entryId,
    title,
    date,
    actors,
    director: 'Mason',
    description,
    duration,
    poster,
    photos,
    trailer: {
      src: trailer,
      quality: parseInt(videoData.sizeOnLoad, 10),
    },
    tags,
    rating: {
      stars,
    },
    site,
    channel: siteSlug,
  };
}
2019-11-30 04:55:32 +00:00
/**
 * Finds the link to an actor in a search results page.
 *
 * The match is on the anchor's `title` attribute, compared
 * case-insensitively (`i` attribute selector flag).
 *
 * @param {string} html - Markup of the search results page.
 * @param {string} url - Address of the search page (currently unused).
 * @param {string} actorName - Name to look for.
 * @returns {?string} The `href` of the first matching anchor, or `null`.
 */
function scrapeActorSearch(html, url, actorName) {
  const { document } = new JSDOM(html).window;

  // escape quotes and backslashes so the name cannot terminate the quoted attribute value
  const escapedName = actorName.replace(/["\\]/g, '\\$&');
  const actorLink = document.querySelector(`a[title="${escapedName}" i]`);

  return actorLink ? actorLink.href : null;
}
/**
 * Builds an actor profile from the actor's page.
 *
 * @param {string} html - Markup of the actor page.
 * @param {string} url - Address of the actor page (currently unused).
 * @param {string} actorName - Name stored on the profile.
 * @returns {Object} Profile with `name` and `releases` (scene links prefixed
 *   with `https://xempire.com`), plus `avatar` and `description` when the
 *   corresponding elements exist on the page.
 */
function scrapeProfile(html, url, actorName) {
  const { document } = new JSDOM(html).window;

  const avatarEl = document.querySelector('img.actorPicture');
  const descriptionEl = document.querySelector('.actorBio p:not(.bioTitle)');
  const releaseEls = document.querySelectorAll('.sceneList .scene a.imgLink');

  const profile = {
    name: actorName,
  };

  if (avatarEl) {
    profile.avatar = avatarEl.src;
  }

  if (descriptionEl) {
    profile.description = descriptionEl.textContent.trim();
  }

  profile.releases = Array.from(releaseEls, (el) => `https://xempire.com${el.href}`);

  return profile;
}
2019-04-05 01:45:40 +00:00
/**
 * Fetches one page of a site's scene listing and scrapes it.
 *
 * @param {Object} site - Site record; `site.url` is the base of the listing URL.
 * @param {number} [page=1] - Page number placed at the end of the listing URL.
 * @returns {Promise<Object[]>} Releases as returned by `scrape`.
 */
async function fetchLatest(site, page = 1) {
  const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`);

  return scrape(res.body.toString(), site);
}
/**
 * Fetches the first page of a site's upcoming-scenes listing and scrapes it.
 *
 * @param {Object} site - Site record; `site.url` is the base of the listing URL.
 * @returns {Promise<Object[]>} Releases as returned by `scrape`.
 */
async function fetchUpcoming(site) {
  const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/1/upcoming`);

  return scrape(res.body.toString(), site);
}
/**
 * Fetches a scene page and scrapes it into a release record.
 *
 * @param {string} url - Address of the scene page.
 * @param {Object} site - Site record, passed through to `scrapeScene`.
 * @returns {Promise<Object>} Release as returned by `scrapeScene`.
 */
async function fetchScene(url, site) {
  const res = await bhttp.get(url);

  return scrapeScene(res.body.toString(), url, site);
}
2019-11-30 04:55:32 +00:00
/**
 * Looks an actor up through the XEmpire search and scrapes their profile.
 *
 * @param {string} actorName - Name to search for; lower-cased, with every run
 *   of whitespace replaced by `+`, to form the search slug.
 * @returns {Promise<?Object>} Profile as returned by `scrapeProfile`, or
 *   `null` when either request does not answer with status 200 or the search
 *   results contain no matching actor link.
 */
async function fetchProfile(actorName) {
  // global flag: names with more than two parts contain several whitespace runs
  const actorSlug = actorName.toLowerCase().replace(/\s+/g, '+');
  const searchUrl = `https://www.xempire.com/en/search/xempire/actor/${actorSlug}`;
  const searchRes = await bhttp.get(searchUrl);

  if (searchRes.statusCode !== 200) {
    return null;
  }

  const actorUrl = scrapeActorSearch(searchRes.body.toString(), searchUrl, actorName);

  if (!actorUrl) {
    return null;
  }

  const url = `https://xempire.com${actorUrl}`;
  const actorRes = await bhttp.get(url);

  if (actorRes.statusCode !== 200) {
    return null;
  }

  return scrapeProfile(actorRes.body.toString(), url, actorName);
}
2019-03-23 21:48:39 +00:00
module . exports = {
fetchLatest ,
2019-11-30 04:55:32 +00:00
fetchProfile ,
2019-03-23 21:48:39 +00:00
fetchUpcoming ,
fetchScene ,
} ;