2019-03-23 21:48:39 +00:00
'use strict' ;
2019-03-24 00:29:22 +00:00
const bhttp = require ( 'bhttp' ) ;
2019-11-20 03:53:36 +00:00
const { JSDOM } = require ( 'jsdom' ) ;
2019-03-24 00:29:22 +00:00
const cheerio = require ( 'cheerio' ) ;
const moment = require ( 'moment' ) ;
2019-03-25 02:57:33 +00:00
2019-03-24 00:29:22 +00:00
/**
 * Split a scraped scene title into a display title and a shoot ID.
 *
 * Only the last space-separated word of the title is inspected. When it
 * contains one of the known studio prefixes followed by digits, the matched
 * portion becomes the shoot ID and the whole last word is dropped from the
 * title. Otherwise the title is returned untouched.
 *
 * @param {string} originalTitle - Title text as scraped from the page.
 * @returns {{ shootId: (string|null), title: string }} Shoot ID (or null when
 *   none was detected) and the title without the shoot ID word.
 */
function extractTitle(originalTitle) {
    const titleComponents = originalTitle.split(' ');
    const sceneIdMatch = titleComponents.slice(-1)[0].match(/(AB|AF|GP|SZ|IV|GIO|RS|TW|MA|FM|SAL|NR|AA|GL|BZ|FS|KS|OT)\d+/); // detect studio prefixes

    const shootId = sceneIdMatch ? sceneIdMatch[0] : null;
    const title = sceneIdMatch ? titleComponents.slice(0, -1).join(' ') : originalTitle;

    return { shootId, title };
}
2019-10-29 00:47:16 +00:00
/**
 * Resolve a poster URL for a scene thumbnail element.
 *
 * Prefers the URL embedded in the element's inline `style` attribute. When no
 * style is present, falls back to the `data-casting` attribute, which is
 * parsed as a JSON array of entries; one entry is picked at random and turned
 * into a casting URL for the scene.
 *
 * @param {{ attr: function(string): (string|undefined) }} posterElement -
 *   Element wrapper exposing `attr(name)` (a cheerio selection in this file).
 * @param {string} sceneId - Scene identifier used in the casting URL.
 * @returns {(string|null)} Poster URL, or null when none can be derived.
 * @throws {SyntaxError} When `data-casting` is present but not valid JSON.
 */
function getPoster(posterElement, sceneId) {
    const posterStyle = posterElement.attr('style');

    if (posterStyle) {
        // takes everything between the first '(' and the final character;
        // presumably the style looks like `background-image: url(...)` — confirm
        return posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
    }

    const posterRange = posterElement.attr('data-casting');
    const posterRangeData = posterRange ? JSON.parse(posterRange) : null;

    // missing attribute, non-array payload or empty list: nothing to pick from
    if (!Array.isArray(posterRangeData) || posterRangeData.length === 0) {
        return null;
    }

    const posterTimeRange = posterRangeData[Math.floor(Math.random() * posterRangeData.length)];

    if (!posterTimeRange) {
        return null;
    }

    if (typeof posterTimeRange === 'number') {
        // poster time is already a single time value
        return `https://legalporno.com/casting/${sceneId}/${posterTimeRange}`;
    }

    if (typeof posterTimeRange !== 'string') {
        return null;
    }

    // range entries are 'a-b' strings; the order of the bounds is not
    // established here, so accept either order and pick inclusively
    const [first, second = first] = posterTimeRange.split('-').map(Number);

    if (Number.isNaN(first) || Number.isNaN(second)) {
        return null;
    }

    const min = Math.min(first, second);
    const max = Math.max(first, second);
    const posterTime = Math.floor(Math.random() * (max - min + 1) + min);

    return `https://legalporno.com/casting/${sceneId}/${posterTime}`;
}
2019-03-24 00:29:22 +00:00
/**
 * Extract the list of scenes from a "new videos" overview page.
 *
 * Every direct `div` child of `.thumbnails` is treated as one scene.
 *
 * @param {string} html - Overview page markup.
 * @param {object} site - Site descriptor; attached unchanged to every release.
 * @returns {Array<{ url: string, shootId: (string|null), entryId: string,
 *   title: string, date: Date, poster: *, site: object }>} One entry per scene.
 * @throws {TypeError} When a scene link has no absolute `href`, since the
 *   entry ID is read from the parsed URL path.
 */
function scrapeLatest(html, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const scenesElements = $('.thumbnails > div').toArray();

    return scenesElements.map((element) => {
        const sceneLinkElement = $(element).find('.thumbnail-title a');

        const url = sceneLinkElement.attr('href');
        const originalTitle = sceneLinkElement.text().trim(); // title attribute breaks when they use \\ escaping
        const { shootId, title } = extractTitle(originalTitle);

        // the entry ID is the second path segment of the scene URL
        const entryId = new URL(url).pathname.split('/')[2];
        const date = moment.utc($(element).attr('release'), 'YYYY/MM/DD').toDate();

        const sceneId = $(element).attr('data-content');
        const posterElement = $(element).find('.thumbnail-avatar');
        const poster = getPoster(posterElement, sceneId);

        return {
            url,
            shootId,
            entryId,
            title,
            date,
            poster,
            site,
        };
    });
}
2019-10-29 00:47:16 +00:00
/**
 * Extract release details from a scene page.
 *
 * @param {string} html - Scene page markup.
 * @param {string} url - Absolute scene URL; stored on the release and parsed
 *   for the entry ID (second path segment).
 * @param {object} site - Site descriptor (not used by this function).
 * @param {boolean} useGallery - When true, photos are read from `.gallery`,
 *   otherwise from `.screenshots`.
 * @returns {Promise<object>} Release with url, shootId, entryId, title, date,
 *   description, actors, duration, tags, photos, poster, studio and, when
 *   player data is available, trailer.
 * @throws {SyntaxError} When the embedded player data is found but is not
 *   valid JSON.
 * @throws {TypeError} When `url` or a photo source is not an absolute URL.
 */
async function scrapeScene(html, url, site, useGallery) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

    // the player configuration is an object literal inside an inline script
    const playerObject = $('script:contains("new WatchPage")').html();
    const playerDataStart = playerObject ? playerObject.indexOf('{"swf":') : -1;
    const playerData = playerDataStart >= 0
        ? playerObject.slice(playerDataStart, playerObject.lastIndexOf('},') + 1)
        : null;
    const data = playerData && JSON.parse(playerData);

    const release = { url };

    const originalTitle = $('h1.watchpage-title').text().trim();
    const { shootId, title } = extractTitle(originalTitle);

    release.shootId = shootId;
    release.entryId = new URL(url).pathname.split('/')[2];
    release.title = title;
    release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();

    // description rows are positional: actors, tags, description
    const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();

    release.description = $('meta[name="description"]')?.attr('content')?.trim()
        || (descriptionElement && $(descriptionElement).find('dd').text().trim());

    release.actors = $(actorsElement)
        .find('a[href*="com/model"]')
        .map((actorIndex, actorElement) => $(actorElement).text())
        .toArray();

    release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
    release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();

    const photoSelector = useGallery ? '.gallery a img' : '.screenshots img';
    const photos = $(photoSelector).map((photoIndex, photoElement) => $(photoElement).attr('src')).toArray();

    release.photos = photos.map((source) => {
        // source without parameters sometimes serves larger preview photo;
        // the thumbnail fallback is disabled, usually enough high res photos are available
        const { origin, pathname } = new URL(source);

        return `${origin}${pathname}`;
    });

    // the player element may lack an inline style, in which case there is no poster
    const posterStyle = $('#player').attr('style');
    const poster = posterStyle ? posterStyle.slice(posterStyle.indexOf('(') + 1, -1) : null;

    // poster unavailable, try last 1/3rd of high res photos as fallback
    release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1);

    if (data && Array.isArray(data.clip?.qualities)) {
        const qualityMap = {
            web: 240,
            vga: 480,
            hd: 720,
            '1080p': 1080,
        };

        release.trailer = data.clip.qualities.map(trailer => ({
            src: trailer.src,
            type: trailer.type,
            quality: qualityMap[trailer.quality] || trailer.quality,
        }));
    }

    const studioName = $('.watchpage-studioname').first().text().trim();
    release.studio = studioName.replace(/[\s.']+/g, '').toLowerCase();

    return release;
}
2019-11-29 04:46:06 +00:00
/**
 * Build an actor profile from a model page.
 *
 * Each row of the `.model--description` table is read as a `label: value`
 * pair; the `Nationality` and `Age` labels feed the profile.
 *
 * @param {string} html - Model page markup.
 * @param {string} _url - Model page URL (unused).
 * @param {string} actorName - Name to store on the profile.
 * @returns {Promise<object>} Profile with `name` and `birthPlace`, plus `age`
 *   and `avatar` when those are found on the page.
 */
async function scrapeProfile(html, _url, actorName) {
    const dom = new JSDOM(html);
    const { document } = dom.window;

    const avatarEl = document.querySelector('.model--avatar img[src^="http"]');
    const rows = [...document.querySelectorAll('.model--description tr')];

    const pairs = rows
        .map(row => row.textContent.replace(/\n/g, '').split(':'))
        // keep only rows with exactly one ':' (the About section has none, see Blanche Bradburry)
        .filter(parts => parts.length === 2)
        .map(([label, text]) => [label.trim(), text.trim()]);

    const bio = Object.fromEntries(pairs);

    const profile = {
        name: actorName,
        birthPlace: bio.Nationality,
    };

    if (bio.Age) {
        profile.age = bio.Age;
    }

    if (avatarEl) {
        profile.avatar = avatarEl.src;
    }

    return profile;
}
2019-04-05 01:45:40 +00:00
/**
 * Fetch one page of the site's newest videos and scrape its scenes.
 *
 * @param {{ url: string }} site - Site descriptor; `url` is the base URL the
 *   `/new-videos/<page>` path is appended to.
 * @param {number} [page=1] - Page number to request.
 * @returns {Promise<Array<object>>} Scenes as returned by `scrapeLatest`.
 */
async function fetchLatest(site, page = 1) {
    const res = await bhttp.get(`${site.url}/new-videos/${page}`);

    return scrapeLatest(res.body.toString(), site);
}
/**
 * Fetch a scene page and scrape its release details.
 *
 * @param {string} url - Absolute scene URL; the gallery (or screenshots)
 *   sub-page is requested so the photos are part of the markup.
 * @param {object} site - Site descriptor, passed through to `scrapeScene`.
 * @returns {Promise<object>} Release as returned by `scrapeScene`.
 */
async function fetchScene(url, site) {
    // hard-wired switch between the gallery and the screenshots sub-page
    const useGallery = true;

    const res = useGallery
        ? await bhttp.get(`${url}/gallery#gallery`)
        : await bhttp.get(`${url}/screenshots#screenshots`);

    return scrapeScene(res.body.toString(), url, site, useGallery);
}
2019-11-20 03:53:36 +00:00
/**
 * Look up an actor through the autocomplete search API and scrape the
 * profile page of the first result of type `model`.
 *
 * @param {string} actorName - Name to search for.
 * @returns {Promise<(object|null)>} Profile as returned by `scrapeProfile`,
 *   or null when the search yields no model.
 */
async function fetchProfile(actorName) {
    // URLSearchParams encodes every space as '+' and escapes reserved characters
    const query = new URLSearchParams({ q: actorName });
    const res = await bhttp.get(`https://www.legalporno.com/api/autocomplete/search?${query}`);

    // an unexpected body (no decoded `terms` list) is treated as "no results"
    const terms = Array.isArray(res.body?.terms) ? res.body.terms : [];
    const result = terms.find(item => item.type === 'model');

    if (!result) {
        return null;
    }

    const bioRes = await bhttp.get(result.url);
    const html = bioRes.body.toString();

    return scrapeProfile(html, result.url, actorName);
}
2019-03-24 00:29:22 +00:00
module . exports = {
fetchLatest ,
2019-11-20 03:53:36 +00:00
fetchProfile ,
2019-03-24 00:29:22 +00:00
fetchScene ,
} ;