2019-03-18 03:46:53 +00:00
'use strict' ;
2020-03-01 04:28:08 +00:00
const util = require ( 'util' ) ;
2020-02-28 02:56:58 +00:00
const Promise = require ( 'bluebird' ) ;
2023-07-05 22:14:38 +00:00
const unprint = require ( 'unprint' ) ;
2019-03-18 03:46:53 +00:00
2023-07-09 02:35:30 +00:00
const argv = require ( '../argv' ) ;
2020-11-22 03:07:09 +00:00
const qu = require ( '../utils/qu' ) ;
2019-11-21 03:05:32 +00:00
const { heightToCm } = require ( '../utils/convert' ) ;
2020-02-24 02:12:58 +00:00
const slugify = require ( '../utils/slugify' ) ;
2019-03-24 00:29:22 +00:00
2020-03-09 01:02:29 +00:00
/**
 * Extract the numeric entry ID embedded in a page's HTML.
 *
 * A `showtagform(<digits>)` call is preferred; otherwise the first run of digits
 * following a `setid:"` marker (up to the next comma, if any) is used.
 *
 * @param {string} html - Raw HTML to search.
 * @returns {string|null} The ID digits as a string, or null when neither marker yields digits.
 */
function getEntryId(html) {
  const tagFormMatch = html.match(/showtagform\((\d+)\)/);

  if (tagFormMatch) {
    return tagFormMatch[1];
  }

  const setIdIndex = html.indexOf('setid:"');

  // indexOf() reports "not found" as -1 (truthy) and a hit at offset 0 as 0 (falsy),
  // so the result must be compared explicitly instead of tested for truthiness
  if (setIdIndex === -1) {
    return null;
  }

  // without a trailing comma, search the remainder of the string rather than
  // letting slice() interpret -1 as "all but the last character"
  const commaIndex = html.indexOf(',', setIdIndex);
  const setIdSegment = commaIndex === -1
    ? html.slice(setIdIndex)
    : html.slice(setIdIndex, commaIndex);

  return setIdSegment.match(/\d+/)?.[0] ?? null;
}
2020-09-04 23:56:54 +00:00
/**
 * Scrape a list of scene tiles into release objects.
 *
 * @param {Array<{element: object, query: object}>} scenes - One context per tile; `element` must
 *   expose `dataset`, `query` is the tile-scoped query helper.
 * @param {object} site - Entity the tiles belong to; `site.url` is used as image prefix and referer.
 * @param {boolean} [entryIdFromTitle] - When truthy, the slugified title is preferred as entry ID.
 * @returns {object[]} One release per tile with title, url, date, entryId, actors, poster, photos
 *   and, when a teaser URL is found in the tile's script, teaser.
 */
function scrapeAll(scenes, site, entryIdFromTitle) {
  return scenes.map(({ element, query }) => {
    const release = {};
    const title = query.content('.content_img div, .dvd_info > a, a.update_title, a[title] + a[title], .overlay-text') || query.content('a[title*=" "]');

    // cut the title off where a "Starring:" credit begins
    release.title = title?.slice(0, title.match(/starring:/i)?.index || Infinity).trim();
    release.url = query.url('.content_img a, .dvd_info > a, a.update_title, a[title]');
    release.date = query.date('.update_date', ['MM/DD/YYYY', 'YYYY-MM-DD']);

    release.entryId = (entryIdFromTitle && slugify(release.title))
      || element.dataset.setid
      || query.element('.rating_box')?.dataset.id
      || query.attribute('a img', 'id')?.match(/set-target-(\d+)/)?.[1];

    release.actors = query.all('.content_img .update_models a, .update_models a').map((actorEl) => ({
      name: unprint.query.content(actorEl),
      url: unprint.query.url(actorEl, null),
    }));

    const dvdPhotos = query.imgs('.dvd_preview_thumb');
    const photoCount = Number(query.attribute('a img.thumbs', 'cnt')) || 1;

    // the first entry becomes the poster, the remainder the photo set
    [release.poster, ...release.photos] = dvdPhotos.length
      ? dvdPhotos
      : Array.from({ length: photoCount }, (value, index) => {
        const src = query.img('a img.thumbs', { attribute: `src${index}_1x` })
          || query.img('a img.thumbs', { attribute: `src${index}` })
          || query.img('a img.thumbs');

        if (!src) {
          return null;
        }

        const prefixedSrc = qu.prefixUrl(src, site.url);

        // candidate variants of the same image, ending with the thumbnail that was actually found;
        // the Set drops variants that are identical because a pattern did not match
        return Array.from(new Set([
          prefixedSrc.replace(/\.jpg$/, '-full.jpg'),
          prefixedSrc.replace(/-1x\.jpg$/, '-4x.jpg'),
          prefixedSrc.replace(/-1x\.jpg$/, '-2x.jpg'),
          prefixedSrc,
        ])).map((source) => ({
          src: source,
          referer: site.url,
          verifyType: 'image',
        }));
      }).filter(Boolean);

    const teaserScript = query.html('script');

    if (teaserScript) {
      // only slice when both delimiters are present; indexOf() returns -1 otherwise and
      // slice() would silently produce an empty or unrelated fragment
      const teaserStart = teaserScript.indexOf('http');
      const teaserEnd = teaserStart === -1 ? -1 : teaserScript.indexOf('.mp4', teaserStart);

      if (teaserEnd !== -1) {
        release.teaser = teaserScript.slice(teaserStart, teaserEnd + 4);
      }
    }

    return release;
  });
}
2023-07-09 02:35:30 +00:00
/**
 * Scrape "coming soon" tiles into release objects.
 *
 * @param {Array<{query: object, html: string}>} scenes - One context per upcoming tile.
 * @param {object} channel - Entity the tiles belong to; `channel.parameters.entryIdFromTitle`
 *   selects between a title slug and an ID extracted from the tile HTML.
 * @returns {object[]} One release per tile with title, date, actors, poster, teaser and entryId.
 */
function scrapeUpcoming(scenes, channel) {
  return scenes.map(({ query, html }) => {
    const release = {};

    // the overlay holds several text nodes; the first one is used as the title
    release.title = query.text('.overlay-text', { join: false })?.[0];
    release.date = query.date('.overlay-text', ['MM/DD/YYYY', 'YYYY-MM-DD']);

    release.actors = query.all('.update_models a').map((actorEl) => ({
      name: unprint.query.content(actorEl),
      url: unprint.query.url(actorEl, null),
    }));

    release.poster = query.img('img') || query.img('img', { attribute: 'src0_1x' });

    // stop at the closing quote of the attribute; a greedy `.*` would run on to the
    // last `.mp4"` on the line and swallow any attributes in between
    release.teaser = html.match(/src=['"](https:\/\/[^'"]*\.mp4)['"]/)?.[1];

    release.entryId = channel.parameters?.entryIdFromTitle
      ? slugify(release.title)
      : getEntryId(html);

    return release;
  });
}
2023-07-05 22:14:38 +00:00
/**
 * Extract trailer sources from inline `movie["trailer..."][...]` script assignments.
 *
 * @param {string} html - Raw page HTML; scanned line by line.
 * @param {object} context - Scrape context; `context.entity.url` prefixes non-absolute paths.
 * @returns {Array<{src: string, quality: (number|undefined)}>|null} One entry per trailer line
 *   that carries a path, or null when the page has no trailer lines at all.
 */
function extractLegacyTrailer(html, context) {
  const trailerLines = html
    .split('\n')
    .filter((line) => /movie\["trailer\w*"\]\[/i.test(line));

  if (trailerLines.length === 0) {
    return null;
  }

  return trailerLines.map((trailerLine) => {
    // stop at the closing quote so later double-quoted fields on the same line are not captured
    const src = trailerLine.match(/path:"([^"]+)"/)?.[1];
    const quality = trailerLine.match(/movie_height:'(\d+)/)?.[1];

    return src && {
      src: /^http/.test(src) ? src : `${context.entity.url}${src}`,
      // a reported height of 558 is recorded as 540
      quality: quality && Number(quality.replace('558', '540')),
    };
  }).filter(Boolean);
}
// photo directory names to try for each photo, in order of preference
const qualities = [
  'photos',
  '1600watermarked',
  '1280watermarked',
  '1024watermarked',
  'thumbs',
];

/**
 * Build candidate photo URLs for a scene from its actor names and advertised photo count.
 *
 * Known URL shapes (note `.com` vs `_com` in the filename, both are generated):
 * https://thumbs.julesjordan.com/members/content//upload/dl03/julesjordan/whitney_wright_dredd/1024watermarked/whitney_wright_julesjordan.com-20.jpg
 * https://thumbs.julesjordan.com/members/content//upload/dl03/julesjordan/bambi_barton_manuel_ferrara/1024watermarked/bambi_barton_julesjordan_com-13.jpg
 *
 * @param {object} query - Scene-scoped query helper (`number`, `dataset` are used).
 * @param {object} release - Release scraped so far; `release.actors` holds `{ name }` objects or plain names.
 * @param {object} context - Scrape context; `context.entity.slug` is used for host and filename.
 * @returns {Array<Array<{src: string, attempts: number}>>|null} Per photo, the list of candidate
 *   sources; null when there are no actors or no photo count.
 */
function getPhotos(query, release, context) {
  if (!release.actors?.length) {
    return null;
  }

  const photoCount = query.number('//div[contains(@class, "title-heading-content")][contains(text(), "Photos")]');

  if (!photoCount) {
    return null;
  }

  const { slug } = context.entity;
  const actorSlugs = release.actors.map((actor) => slugify(actor.name || actor, '_'));

  // slug actor order is not always the same as actor list order, prefer trailer slug if available
  const path = query.dataset('.movieformat_button', 'src')?.match(/:(.*)_trailer/)?.[1] || actorSlugs.join('_');

  // filenames start with the lead actor only, so strip the co-actors from the directory slug;
  // with a single actor there is nothing to strip (a bare '_' suffix would eat the
  // underscore inside the actor's own name)
  const coActorSuffix = actorSlugs.length > 1 ? `_${actorSlugs.slice(1).join('_')}` : '';
  const derivedActorSlug = coActorSuffix ? path.replace(coActorSuffix, '') : path;

  const actorSlug = derivedActorSlug === path // no replacement took place, so the slug is likely invalid
    ? actorSlugs[0]
    : derivedActorSlug;

  const baseUrl = `https://thumbs.${slug}.com/trial/content//upload/dl03/${slug}/${path}`;

  return Array.from({ length: photoCount }, (value, index) => qualities
    .flatMap((quality) => [
      `${baseUrl}/${quality}/${actorSlug}_${slug}_com-${index + 1}.jpg`,
      `${baseUrl}/${quality}/${actorSlug}_${slug}.com-${index + 1}.jpg`, // .com instead of _com
    ])
    .map((src) => ({ src, attempts: 1 })));
}
/**
 * Scrape a single scene page into a release object.
 *
 * @param {{html: string, query: object}} page - Raw page HTML and the page-scoped query helper.
 * @param {object} context - Scrape context; reads `entity` (url, slug, parameters),
 *   `include.trailers` and, when present, `baseRelease.photos`.
 * @returns {Promise<object>} Release with title, description, entryId, date, actors, tags,
 *   director, photos and stars, plus poster, trailer and movie when available.
 */
async function scrapeScene({ html, query }, context) {
  const release = {};

  release.title = query.content('.title_bar_hilite, .movie_title');
  release.description = query.content('.update_description') || query.text('//div[./span[contains(text(), "Description")]]');
  release.entryId = context.entity.parameters?.entryIdFromTitle ? slugify(release.title) : getEntryId(html);

  release.date = query.date(['.update_date', '//div[./span[contains(text(), "Date")]]'], ['MM/DD/YYYY', 'YYYY-MM-DD']);

  release.actors = query.all('.backgroundcolor_info > .update_models a, .item .update_models a, .player-scene-description .update_models a').map((actorEl) => ({
    name: unprint.query.content(actorEl),
    url: unprint.query.url(actorEl, null),
  }));

  release.tags = query.contents('.update_tags a, .player-scene-description a[href*="/categories"]');

  // directors are listed among the tags
  release.director = release.tags?.find((tag) => ['mike john', 'van styles'].includes(tag?.trim().toLowerCase()));

  const posterPath = query.poster('#video-player') || html.match(/useimage = "(.*)"/)?.[1];

  if (posterPath) {
    release.poster = {
      src: /^http/.test(posterPath) ? posterPath : `${context.entity.url}${posterPath}`,
      referer: context.entity.url,
    };
  }

  if (query.exists('source[data-bitrate="trailer"]')) {
    // not every bitrate is present on every page; drop the ones that were not found
    release.trailer = [
      query.video('source[data-bitrate="trailer_1080" i]'),
      query.video('source[data-bitrate="trailer_720" i]'),
      query.video('source[data-bitrate="trailer" i]'), // also seems to be 720p
      query.video('source[data-bitrate="trailer_mobile" i]'), // also seems to be 720p
    ].filter(Boolean);
  } else if (context.include.trailers && context.entity.slug !== 'manuelferrara') {
    release.trailer = extractLegacyTrailer(html, context);
  }

  if (argv.jjFullPhotos) {
    release.photos = getPhotos(query, release, context);
  } else {
    // base release photos are usually better, but deep photos have additional thumbs
    // the filenames are not chronological, so sorting after appending only worsens the mix
    const basePhotoUrls = (context.baseRelease?.photos || []).map((sources) => {
      // a base photo may be a list of fallback sources (last one is the plain thumbnail),
      // a single source object, or a bare URL string
      const lastSource = [].concat(sources).at(-1);

      return typeof lastSource === 'string' ? lastSource : lastSource?.src;
    });

    release.photos = [...basePhotoUrls, ...query.imgs('#images img')]
      .filter((source) => typeof source === 'string')
      .map((source) => Array.from(new Set([
        source.replace(/\.jpg$/, '-full.jpg'),
        source.replace(/-1x\.jpg$/, '-4x.jpg'),
        source.replace(/-1x\.jpg$/, '-2x.jpg'),
        source,
      ])).map((fallbackSource) => ({
        src: fallbackSource,
        referer: context.entity.url,
        verifyType: 'image',
      })));
  }

  if (query.exists('.update_dvds a')) {
    const movieUrl = query.url('.update_dvds a');

    // new URL() throws on a missing URL; skip the movie rather than failing the whole scene
    if (movieUrl) {
      release.movie = {
        url: movieUrl,
        // same accessor as the title lookup above
        title: query.content('.update_dvds a'),
        // last path segment without the .html extension
        entryId: new URL(movieUrl).pathname.split('/').at(-1)?.replace('.html', ''),
      };
    }
  }

  release.stars = query.number('.avg_rating');

  return release;
}
2020-08-01 13:11:07 +00:00
/**
 * Scrape a movie (DVD) page, including the scenes listed on it.
 *
 * @param {{el: object, query: object}} item - Page root element and page-scoped query helper.
 * @param {string} url - Absolute URL of the movie page; its last path segment becomes the entry ID.
 * @param {object} site - Entity the movie belongs to; passed through to the scene scraper.
 * @returns {object} Movie with url, site, entryId, title, covers, channel, date and, when
 *   scenes were scraped, `scenes` ordered by date.
 */
function scrapeMovie({ el, query }, url, site) {
  const movie = { url, site };

  // last path segment without the .html extension, lowercased
  movie.entryId = new URL(url).pathname.split('/').at(-1)?.replace('.html', '').toLowerCase();
  movie.title = query.cnt('.title_bar span');
  movie.covers = query.urls('#dvd-cover-flip > a');
  movie.channel = slugify(query.q('.update_date a', true), '');

  const sceneQus = qu.initAll(el, '.dvd_details');
  const scenes = scrapeAll(sceneQus, site);

  // oldest scene first; scenes without a date go last so they cannot be mistaken for the
  // earliest one (a missing date would otherwise coerce to 0 in the subtraction)
  const curatedScenes = scenes
    ?.map((scene) => ({ ...scene, movie }))
    .sort((sceneA, sceneB) => {
      if (!sceneA.date && !sceneB.date) {
        return 0;
      }

      if (!sceneA.date) {
        return 1;
      }

      if (!sceneB.date) {
        return -1;
      }

      return sceneA.date - sceneB.date;
    });

  // the movie is dated by its earliest scene
  movie.date = curatedScenes?.[0]?.date;

  return {
    ...movie,
    ...(curatedScenes && { scenes: curatedScenes }),
  };
}
2023-07-06 03:09:05 +00:00
/**
 * Scrape a model profile page.
 *
 * @param {{query: object}} page - Page-scoped query helper.
 * @param {string} url - URL the profile was fetched from; stored on the profile.
 * @param {string} name - Actor name (not used by this function).
 * @param {object} entity - Entity passed through to the scene scraper.
 * @returns {object} Profile with url, description, height, measurements, either
 *   dateOfBirth or age, avatar candidates and the scenes listed on the page.
 */
function scrapeProfile({ query }, url, name, entity) {
  // value span that follows the span holding the given label
  const valueAfterLabel = (label) => `//span[contains(text(), "${label}")]/following-sibling::span`;

  const profile = {
    url,
    // the spaces are important to avoid selecting a similar comment
    description: query.content('//comment()[contains(., " Bio Extra Field ")]/following-sibling::span'),
    height: heightToCm(query.content(valueAfterLabel('Height'))),
    measurements: query.content(valueAfterLabel('Measurements')),
  };

  // the age field holds either a full date ("Month D, YYYY") or a plain number
  const age = query.content(valueAfterLabel('Age'))?.trim();
  const looksLikeDate = Boolean(age) && /\w+ \d+, \d{4}/.test(age);

  if (looksLikeDate) {
    profile.dateOfBirth = unprint.extractDate(age, 'MMMM D, YYYY');
  } else {
    profile.age = Number(age) || null;
  }

  // one lookup per source attribute, in this order; attributes that are absent are dropped
  const avatarSelector = '.model_bio_pic img, .model_bio_thumb';

  profile.avatar = ['src0_3x', 'src0_2x', 'src0_1x', 'src0', 'src']
    .map((attribute) => query.img(avatarSelector, { attribute }))
    .filter(Boolean);

  const sceneContexts = unprint.initAll(query.all('.grid-item'));

  profile.scenes = scrapeAll(sceneContexts, entity, true);

  return profile;
}
2019-11-21 03:05:32 +00:00
2020-09-10 21:49:24 +00:00
/**
 * Fetch and scrape one page of latest scenes.
 *
 * @param {object} site - Entity to fetch for; `site.parameters.latest` may supply a
 *   util.format() URL pattern that receives the page number.
 * @param {number} [page=1] - Page number.
 * @param {object} [include] - Not used by this function.
 * @param {object} [preData] - Not used by this function.
 * @param {boolean} [entryIdFromTitle=false] - Fallback for the entry ID strategy when
 *   `site.parameters.entryIdFromTitle` is not a boolean.
 * @returns {Promise<object[]|number>} Scraped releases, or the response status when the request fails.
 */
async function fetchLatest(site, page = 1, include, preData, entryIdFromTitle = false) {
  const url = site.parameters?.latest
    ? util.format(site.parameters.latest, page)
    : `${site.url}/trial/categories/movies_${page}_d.html`;

  const res = await unprint.get(url, { selectAll: '.update_details, .grid-item' });

  if (!res.ok) {
    return res.status;
  }

  // an explicit boolean on the entity wins over the argument
  const useTitleForEntryId = typeof site.parameters?.entryIdFromTitle === 'boolean'
    ? site.parameters.entryIdFromTitle
    : entryIdFromTitle;

  return scrapeAll(res.context, site, useTitleForEntryId);
}
/**
 * Fetch and scrape the "coming soon" tiles for an entity.
 *
 * @param {object} site - Entity to fetch for; `site.parameters.upcoming` may be `false`
 *   to disable the lookup or a URL to use instead of the default index page.
 * @returns {Promise<object[]|number|null>} Scraped releases, the response status when the
 *   request fails, or null when upcoming scenes are disabled for the entity.
 */
async function fetchUpcoming(site) {
  if (site.parameters?.upcoming === false) {
    return null;
  }

  const url = site.parameters?.upcoming
    ? util.format(site.parameters.upcoming)
    : `${site.url}/trial/index.php`;

  // each upcoming scene is the parent div of a "Coming Soon" image
  const res = await unprint.get(url, { selectAll: '//img[contains(@alt, "Coming Soon")]/parent::div' });

  if (res.ok) {
    return scrapeUpcoming(res.context, site);
  }

  return res.status;
}
2019-03-18 03:46:53 +00:00
2019-12-13 02:28:52 +00:00
/**
 * Fetch a movie page and scrape it.
 *
 * @param {string} url - URL of the movie page.
 * @param {object} site - Entity the movie belongs to; passed through to scrapeMovie.
 * @returns {Promise<object|number>} The scraped movie, or the response status when the request fails.
 */
async function fetchMovie(url, site) {
  const res = await qu.get(url);

  if (!res.ok) {
    return res.status;
  }

  return scrapeMovie(res.item, url, site);
}
2020-09-04 23:56:54 +00:00
/**
 * Fetch and scrape an actor profile, trying several candidate URLs in order.
 *
 * The actor's own URL (when known) is tried first, followed by model pages built from
 * the name slug without delimiter and with a hyphen delimiter.
 *
 * @param {{name: string, url: (string|undefined)}} actor - Actor name and optional known profile URL.
 * @param {object} entity - Entity to look the actor up on; `entity.parameters.profile`
 *   overrides the default `<entity.url>/trial/models` base.
 * @returns {Promise<object|null>} The first profile that could be fetched, or null when no candidate succeeds.
 */
async function fetchProfile({ name: actorName, url }, entity) {
  const profileBase = entity.parameters?.profile || `${entity.url}/trial/models`;

  // both slug variants coincide for single-word names; the Set keeps the order of first
  // appearance while avoiding a second request to the same address
  const candidateUrls = [...new Set([
    url,
    `${profileBase}/${slugify(actorName, '')}.html`,
    `${profileBase}/${slugify(actorName, '-')}.html`,
  ].filter(Boolean))];

  // requests are deliberately sequential: later candidates are only fallbacks
  for (const profileUrl of candidateUrls) {
    const res = await unprint.get(profileUrl); // eslint-disable-line no-await-in-loop

    if (res.ok) {
      return scrapeProfile(res.context, profileUrl, actorName, entity);
    }
  }

  return null;
}
2019-03-23 21:48:39 +00:00
module . exports = {
2020-05-14 02:26:05 +00:00
fetchLatest ,
fetchMovie ,
fetchProfile ,
fetchUpcoming ,
2023-07-05 22:14:38 +00:00
scrapeScene : {
scraper : scrapeScene ,
unprint : true ,
} ,
2019-03-23 21:48:39 +00:00
} ;