2019-03-23 21:48:39 +00:00
'use strict' ;
2019-03-24 00:29:22 +00:00
2019-11-20 03:53:36 +00:00
const { JSDOM } = require ( 'jsdom' ) ;
2019-03-24 00:29:22 +00:00
const cheerio = require ( 'cheerio' ) ;
const moment = require ( 'moment' ) ;
2019-03-25 02:57:33 +00:00
2020-09-18 20:43:45 +00:00
const http = require ( '../utils/http' ) ;
2020-04-11 20:49:37 +00:00
const slugify = require ( '../utils/slugify' ) ;
2019-03-24 00:29:22 +00:00
/**
 * Split a scraped scene title into its display title and trailing shoot ID.
 *
 * The last space-separated token is treated as the shoot ID when it ends in
 * one or more word characters followed by at least one digit (e.g. `GIO123`).
 * Any studio prefix is accepted; there is no fixed prefix list.
 *
 * @param {string} originalTitle - Title text as found on the page.
 * @returns {{ shootId: (string|null), title: (string|null) }} The shoot ID
 *   (or null when the last token does not look like one) and the title with
 *   the shoot ID token removed. Both are null-safe for a missing title.
 */
function extractTitle(originalTitle) {
  if (originalTitle == null) {
    return { shootId: null, title: null };
  }

  // trim once up front so tabs, newlines or non-breaking spaces around the
  // title can never end up inside the shoot ID
  const trimmedTitle = String(originalTitle).trim();
  const titleComponents = trimmedTitle.split(' ');
  const lastComponent = titleComponents[titleComponents.length - 1];
  const sceneIdMatch = lastComponent.match(/\w+\d+$/);

  if (!sceneIdMatch) {
    return { shootId: null, title: trimmedTitle };
  }

  return {
    shootId: sceneIdMatch[0],
    // trim again: a double space before the ID leaves an empty component behind
    title: titleComponents.slice(0, -1).join(' ').trim(),
  };
}
2019-10-29 00:47:16 +00:00
/**
 * Resolve a poster URL for a scene thumbnail element.
 *
 * Resolution order:
 *  1. the element's inline `style` attribute, taking the text between the
 *     first `(` and the final character (presumably `url(...)` - confirm
 *     against live markup);
 *  2. a randomly picked entry from the JSON array in `data-casting`, which is
 *     either a single numeric time or a `"<a>-<b>"` range string from which a
 *     random integer time is drawn.
 *
 * @param {{ attr: function(string): (string|undefined) }} posterElement -
 *   Element wrapper exposing `attr(name)` (a cheerio selection in this file).
 * @param {string} sceneId - Scene identifier used in the casting poster URL.
 * @returns {(string|null)} Poster URL, or null when none can be derived.
 * @throws {SyntaxError} When `data-casting` is present but not valid JSON.
 */
function getPoster(posterElement, sceneId) {
  const posterStyle = posterElement.attr('style');

  if (posterStyle) {
    return posterStyle.slice(posterStyle.indexOf('(') + 1, -1);
  }

  const posterRange = posterElement.attr('data-casting');

  if (!posterRange) {
    // neither a style nor casting data: nothing to derive a poster from
    return null;
  }

  const posterRangeData = JSON.parse(posterRange);

  if (!Array.isArray(posterRangeData) || posterRangeData.length === 0) {
    return null;
  }

  const posterTimeRange = posterRangeData[Math.floor(Math.random() * posterRangeData.length)];

  if (!posterTimeRange) {
    return null;
  }

  if (typeof posterTimeRange === 'number') {
    // poster time is already a single time value
    return `https://legalporno.com/casting/${sceneId}/${posterTimeRange}`;
  }

  const bounds = String(posterTimeRange).split('-').map(Number);

  if (bounds.some(Number.isNaN)) {
    return null;
  }

  // accept the bounds in either order so both ends of the range stay reachable
  const min = Math.min(...bounds);
  const max = Math.max(...bounds);
  const posterTime = Math.floor(Math.random() * (max - min + 1) + min);

  return `https://legalporno.com/casting/${sceneId}/${posterTime}`;
}
2020-07-19 17:40:21 +00:00
/**
 * Parse a listing page into one summary object per scene thumbnail.
 *
 * Every direct `div` child of `.thumbnails` is read as a scene: the link in
 * `.thumbnail-title` supplies the URL and title text, the `release` attribute
 * is parsed as a UTC `YYYY/MM/DD` date, and the poster is derived from the
 * `.thumbnail-avatar` element together with the `data-content` attribute.
 *
 * @param {string} html - Markup of the listing page.
 * @returns {Array<{ url: string, shootId: (string|null), entryId: string, title: string, date: Date, poster: (string|null) }>}
 */
function scrapeAll(html) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  return $('.thumbnails > div')
    .toArray()
    .map((sceneEl) => {
      const $scene = $(sceneEl);
      const $link = $scene.find('.thumbnail-title a');

      const url = $link.attr('href');

      // use the link text: the title attribute breaks when they use \\ escaping
      const { shootId, title } = extractTitle($link.text().trim());

      // second path segment of the scene URL
      const [, , entryId] = new URL(url).pathname.split('/');
      const date = moment.utc($scene.attr('release'), 'YYYY/MM/DD').toDate();

      const sceneId = $scene.attr('data-content');
      const poster = getPoster($scene.find('.thumbnail-avatar'), sceneId);

      return { url, shootId, entryId, title, date, poster };
    });
}
2019-10-29 00:47:16 +00:00
/**
 * Parse a scene watch page into a release object.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - Absolute URL of the scene; also the base for resolving
 *   photo sources and the source of `entryId` (second path segment).
 * @param {object} site - Site descriptor; accepted for interface parity, not
 *   read by this function.
 * @param {boolean} useGallery - Read photos from `.gallery a img` when truthy,
 *   otherwise from `.screenshots img`.
 * @returns {Promise<object>} Release with url, shootId, entryId, title, date,
 *   description, actors, duration, tags, photos, poster, studio and, when the
 *   player data lists clip qualities, trailer.
 * @throws {SyntaxError} When the embedded player data is found but is not
 *   valid JSON.
 */
async function scrapeScene(html, url, site, useGallery) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });

  // the player configuration is a JSON object embedded in an inline script
  const playerObject = $('script:contains("new WatchPage")').html();
  const playerStart = playerObject ? playerObject.indexOf('{"swf":') : -1;
  const playerEnd = playerObject ? playerObject.lastIndexOf('},') + 1 : -1;

  // only parse when both markers were found, a missing marker used to slice garbage
  const data = playerStart >= 0 && playerEnd > playerStart
    ? JSON.parse(playerObject.slice(playerStart, playerEnd))
    : null;

  const release = { url };

  const originalTitle = $('h1.watchpage-title').text().trim();
  const { shootId, title } = extractTitle(originalTitle);

  release.shootId = shootId;
  release.entryId = new URL(url).pathname.split('/')[2];

  release.title = title;
  release.date = moment.utc($('span[title="Release date"] a').text(), 'YYYY-MM-DD').toDate();

  const [actorsElement, tagsElement, descriptionElement] = $('.scene-description__row').toArray();

  // prefer the meta description, fall back on the third description row
  release.description = $('meta[name="description"]')?.attr('content')?.trim()
    || (descriptionElement && $(descriptionElement).find('dd').text().trim());

  release.actors = $(actorsElement)
    .find('a[href*="com/model"]')
    .map((actorIndex, actorElement) => $(actorElement).text())
    .toArray();

  release.duration = moment.duration($('span[title="Runtime"]').text().trim()).asSeconds();
  release.tags = $(tagsElement).find('a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();

  const photoSelector = useGallery ? '.gallery a img' : '.screenshots img';

  release.photos = $(photoSelector)
    .map((photoIndex, photoElement) => $(photoElement).attr('src'))
    .toArray()
    .filter(Boolean) // an empty src cannot be turned into a URL
    .map((source) => {
      // source without parameters sometimes serves larger preview photo;
      // resolving against the page URL also accepts relative sources
      const { origin, pathname } = new URL(source, url);

      return `${origin}${pathname}`;
    });

  // the player may lack an inline style, in which case no poster can be read from it
  const posterStyle = $('#player').attr('style');
  const poster = posterStyle ? posterStyle.slice(posterStyle.indexOf('(') + 1, -1) : null;

  // poster unavailable, try last 1/3rd of high res photos as fallback;
  // the fallback is an array, and with fewer than three photos it is all of them
  release.poster = poster || release.photos.slice(Math.floor(release.photos.length / 3) * -1);

  if (Array.isArray(data?.clip?.qualities)) {
    const qualityMap = {
      web: 240,
      vga: 480,
      hd: 720,
      '1080p': 1080,
    };

    release.trailer = data.clip.qualities.map((trailer) => ({
      src: trailer.src,
      type: trailer.type,
      // unknown quality labels are passed through unchanged
      quality: qualityMap[trailer.quality] || trailer.quality,
    }));
  }

  const studioName = $('.watchpage-studioname').first().text().trim();
  release.studio = slugify(studioName, '');

  return release;
}
2019-11-29 04:46:06 +00:00
/**
 * Build an actor profile from a model page.
 *
 * Rows of the `.model--description` table are read as `key: value` pairs.
 * `Nationality` becomes `birthPlace`, `Age` becomes `age` when present, and
 * the first `.model--avatar` image with an http(s) source becomes `avatar`.
 * The scenes listed on the same page are attached as `releases`.
 *
 * @param {string} html - Markup of the model page.
 * @param {string} _url - Page URL; unused.
 * @param {string} actorName - Name stored on the profile.
 * @returns {Promise<object>} The assembled profile.
 */
async function scrapeProfile(html, _url, actorName) {
  const { document } = new JSDOM(html).window;
  const profile = { name: actorName };

  const avatarEl = document.querySelector('.model--avatar img[src^="http"]');
  const rows = [...document.querySelectorAll('.model--description tr')];

  const bio = Object.fromEntries(
    rows
      .map((row) => row.textContent.replace(/\n/g, '').split(':'))
      // ignore entries without ':' (About section, see Blanche Bradburry)
      .filter((parts) => parts.length === 2)
      .map(([key, value]) => [key.trim(), value.trim()]),
  );

  profile.birthPlace = bio.Nationality;

  if (bio.Age) {
    profile.age = bio.Age;
  }

  if (avatarEl) {
    profile.avatar = avatarEl.src;
  }

  profile.releases = scrapeAll(html);

  return profile;
}
2019-04-05 01:45:40 +00:00
/**
 * Fetch one page of the site's newest videos and scrape its scenes.
 *
 * @param {{ url: string }} site - Site descriptor; `url` is the base URL the
 *   `/new-videos/<page>` path is appended to.
 * @param {number} [page=1] - Page number placed in the request path.
 * @returns {Promise<Array<object>>} Scenes as returned by `scrapeAll`.
 */
async function fetchLatest(site, page = 1) {
  // the URL must not contain whitespace around the interpolated parts
  const res = await http.get(`${site.url}/new-videos/${page}`);

  return scrapeAll(res.body.toString(), site);
}
/**
 * Fetch a scene page and scrape it into a release.
 *
 * The gallery view of the scene is requested; the screenshots view is kept as
 * the alternative for when `useGallery` is switched off.
 *
 * @param {string} url - Scene URL without the trailing view segment.
 * @param {object} site - Site descriptor, passed through to `scrapeScene`.
 * @returns {Promise<object>} Release as returned by `scrapeScene`.
 */
async function fetchScene(url, site) {
  // TODO: fall back on screenshots when gallery is not available
  const useGallery = true;

  // the URL must not contain whitespace around the interpolated parts
  const pageUrl = useGallery
    ? `${url}/gallery#gallery`
    : `${url}/screenshots#screenshots`;

  const res = await http.get(pageUrl);

  return scrapeScene(res.body.toString(), url, site, useGallery);
}
2020-07-20 23:44:51 +00:00
/**
 * Look an actor up through the autocomplete search and scrape their profile.
 *
 * The first search term of type `model` is used; its `url` is fetched and
 * handed to `scrapeProfile`.
 *
 * @param {{ name: string }} actor - Actor descriptor; only `name` is read.
 * @returns {Promise<(object|null)>} Scraped profile, or null when the search
 *   returns no model.
 */
async function fetchProfile({ name: actorName }) {
  // let URLSearchParams encode the name: every space becomes '+' (not just the
  // first one) and reserved characters are escaped
  const searchUrl = new URL('https://www.legalporno.com/api/autocomplete/search');
  searchUrl.searchParams.set('q', actorName);

  const res = await http.get(searchUrl.href);
  const data = res.body;

  // a response without a terms list is treated the same as no match
  const terms = Array.isArray(data?.terms) ? data.terms : [];
  const result = terms.find((item) => item.type === 'model');

  if (!result) {
    return null;
  }

  const bioRes = await http.get(result.url);
  const html = bioRes.body.toString();

  return scrapeProfile(html, result.url, actorName);
}
2019-03-24 00:29:22 +00:00
// Public scraper interface: latest scenes, a single scene, and an actor profile.
module.exports = { fetchLatest, fetchProfile, fetchScene };