diff --git a/package-lock.json b/package-lock.json index fcfdce52..361f165c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -94,7 +94,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.18.16", + "unprint": "^0.18.18", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", @@ -20380,9 +20380,10 @@ } }, "node_modules/unprint": { - "version": "0.18.16", - "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.16.tgz", - "integrity": "sha512-BiqHfGmQIHjTgAta3d2zAnw+jDzlrlJ3IYEkRQe9f3kNMZRbhOTOmWlkRYIzKpJBAEn2ECRwfoiYUaW8gtI5rQ==", + "version": "0.18.18", + "resolved": "https://registry.npmjs.org/unprint/-/unprint-0.18.18.tgz", + "integrity": "sha512-M4sjzpPPAemZ1ND+FNlyGnCO4MP1qSupLiary1qZWfFQwSohePG962BWvja4r4AfwqzviV4mAC2RJILVKIPhYg==", + "license": "ISC", "dependencies": { "bottleneck": "^2.19.5", "cookie": "^1.1.1", diff --git a/package.json b/package.json index 328c8b28..20c1615b 100755 --- a/package.json +++ b/package.json @@ -153,7 +153,7 @@ "tunnel": "0.0.6", "ua-parser-js": "^1.0.37", "undici": "^5.28.1", - "unprint": "^0.18.16", + "unprint": "^0.18.18", "url-pattern": "^1.0.3", "v-tooltip": "^2.1.3", "video.js": "^8.6.1", diff --git a/src/scrapers/actors.js b/src/scrapers/actors.js index 5f0e1740..0131646f 100644 --- a/src/scrapers/actors.js +++ b/src/scrapers/actors.js @@ -190,6 +190,9 @@ module.exports = { julesjordan, amateurallure: julesjordan, // different company, same scraper swallowsalon: julesjordan, // different company, same scraper + // first anal quest + doubleviewcasting: firstanalquest, + firstanalquest, // etc '18vr': badoink, theflourishxxx: theflourish, @@ -211,8 +214,6 @@ module.exports = { cherrypimps, cumlouder, dorcelclub: dorcel, - doubleviewcasting: firstanalquest, - firstanalquest, freeones, hitzefrei, hookuphotshot, diff --git a/src/scrapers/firstanalquest.js b/src/scrapers/firstanalquest.js index 416de2a6..1c974ba4 100755 --- a/src/scrapers/firstanalquest.js +++ b/src/scrapers/firstanalquest.js @@ -1,200 +1,224 @@ 'use strict'; -const qu = require('../utils/qu'); +const unprint = require('unprint'); + const slugify = require('../utils/slugify'); +const { stripQuery } = require('../utils/url'); function scrapeAllA(scenes, channel) { return scenes.map(({ query }) => { const release = {}; - release.url = query.url('a.thumb-img, a.thumb', 'href', { origin: channel.url }); + release.url = query.url('a.thumb-img, a.thumb', { origin: channel.url, protocol: 'http' }); release.entryId = new URL(release.url).pathname.match(/(\d+)\/?$/)?.[1]; release.title = query.text('.thumb-title, .title'); release.date = query.date('.thumb-added, .date', ['MMM D, YYYY', 'MMMM DD, YYYY'], /\w+ \d{1,2}, \d{4}/); - release.duration = query.dur('.thumb-duration'); + release.duration = query.duration('.thumb-duration'); release.actors = query.all('.thumb-models a, .models a').map((actorEl) => ({ - name: query.cnt(actorEl), - url: query.url(actorEl, null, 'href', { origin: channel.url }), + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null, { origin: channel.url, protocol: 'http' }), })); - const [, photoUrl, photoCount] = query.q('.thumb-img img', 'onmouseover')?.match(/'(.*)', (\d+)\)/) || []; + const [, photoUrl, photoCount] = query.attribute('.thumb-img img', 'onmouseover')?.match(/'(.*)', (\d+)\)/) || []; if (photoUrl && photoCount) { - [release.poster, ...release.photos] = Array.from({ length: 5 }, (value, index) => `${photoUrl}${index + 1}.jpg`); + [release.poster, ...release.photos] = Array.from({ length: 5 }, (_value, index) => unprint.prefixUrl(`${photoUrl}${index + 1}.jpg`, channel.origin, { protocol: 'http' })); } else { - release.poster = query.img('.thumb-img img, .thumb img', 'src', { origin: channel.url }); + release.poster = query.img('.thumb-img img, .thumb img', { origin: channel.url, protocol: 'http' }); } - release.tags = query.cnts('.tags a'); + release.tags = query.contents('.tags a'); release.rating = query.number('.thumb-rating'); return release; }); } -function scrapeAllB(scenes, channel) { - return scenes.map(({ query }) => { - const release = {}; - - release.title = query.cnt('.title, h2'); - release.description = query.cnt('.description, p textarea'); - release.duration = query.dur('.time'); - - const previewHtml = query.html('script')?.match(/document.write\("(.*)"\);/)?.[1]; - const previewEl = qu.extract(previewHtml); - const previewQuery = previewEl?.query.q('param[name="flashvars"]', 'value') || query.q('param[name="flashvars"]', 'value'); - const previewParams = previewQuery && new URLSearchParams(previewQuery); - - if (previewParams) { - release.poster = qu.prefixUrl(previewParams.get('image') || previewParams.get('poster'), channel.url); - release.trailer = previewParams.get('file'); - } - - release.photos = query.imgs('img[src*="sets/"], img[src*="thumbnails/"]', 'src', { origin: channel.url }); - - release.entryId = release.poster?.match(/\/sets\/(.*)\//)?.[1] || slugify(release.title); - - return release; - }); -} - -function scrapeSceneA({ query }, url, channel) { - const release = {}; - - release.entryId = new URL(url).pathname.match(/(\d+)\/?$/)?.[1]; - - release.title = query.cnt('.title, .scene-title h3').replace(/:$/, ''); - release.description = query.cnt('.text-desc p, .info-description p'); - - release.duration = query.dur('.media-body li span, .duration'); - - release.actors = query.all('.media-body a[href*="models/"], .models a').map((actorEl) => ({ - name: query.cnt(actorEl), - url: query.url(actorEl, null, 'href', { origin: channel.url }), - })); - - release.tags = query.cnts('.media-body a[href*="tags/"], .tags a'); - - release.poster = [ - query.img('.player-preview'), - qu.prefixUrl(`/contents/videos_screenshots/0/${release.entryId}/preview_trailer.mp4.jpg`, channel.url), - qu.prefixUrl(query.q('param[name="flashvars"]', 'value')?.match(/poster=(.*\.jpg)/)?.[1], channel.url), - qu.prefixUrl(`/contents/scenes/${release.entyId}/thumbnails/920x518.jpg`, channel.url), - ]; - - release.photos = query.urls('.thumb-album a:not([href="#"]), .thumbs-photo a:not([href*="signup"])', 'href', { origin: channel.url }) - .concat(query.imgs('.thumb-album a[href="#"] img, .thumbs-photo a[href*="signup"] img', 'src', { origin: channel.url })); - - release.trailer = query.url('a[href*="get_file/"], .download a'); - - return release; -} - -function scrapeProfileA({ query, el }, entity) { - const profile = {}; - - const bio = query.all('.list-model-info li, .profile-info li').reduce((acc, bioEl) => ({ - ...acc, - [slugify(query.cnt(bioEl, '.title, span'), '_')]: query.cnt(bioEl, ':nth-child(2)') || query.q(bioEl, ':nth-child(2)', 'title') || query.text(bioEl), - }), {}); - - profile.dateOfBirth = qu.parseDate(bio.birth_date || bio.date_of_birth, 'DD MMMM, YYYY'); - profile.birthPlace = bio.nationality || bio.place_of_birth || null; - - profile.weight = Number(bio.weight?.match(/\d+/)?.[0]); - profile.height = Number(bio.height?.match(/\d+/)?.[0]); - - profile.eyes = bio.eye_color; - profile.hairColor = bio.hair || bio.hair_color; - - profile.aliases = query.text('.sub-title')?.replace(/:\s*/, '').split(/,\s*/); - - if (bio.measurements || bio.body_shape_dimensions) { - const [, bust, cup, waist, hip] = (bio.measurements || bio.body_shape_dimensions).match(/(\d+)(\w+)-(\d+)-(\d+)/); - - profile.bust = Number(bust); - profile.cup = cup; - profile.waist = Number(waist); - profile.hip = Number(hip); - } - - const description = query.cnt('.model-biography p'); - const avatar = query.img('.model-box img, .profile-model-photo', 'src', { origin: entity.url }); - - if (!/there is no description/.test(description)) { - profile.description = description; - } - - if (avatar) { - profile.avatar = [ - avatar, - avatar.replace('s2_', 's1_'), - ]; - } - - profile.scenes = scrapeAllA(qu.initAll(el, '.list-thumbs .thumb, .main-thumbs > li'), entity); - - return profile; -} - async function fetchLatestA(channel, page) { const url = channel.parameters?.latest ? `${channel.parameters.latest}/${page}` : `${channel.url}/latest-updates/${page}/`; - const res = await qu.getAll(url, '.list-thumbs ul > li, .main-thumbs > li'); + const res = await unprint.get(url, { selectAll: '.list-thumbs ul > li, .main-thumbs > li' }); if (res.ok) { - return scrapeAllA(res.items, channel); + return scrapeAllA(res.context, channel); } return res.status; } +function scrapeAllB(scenes, channel) { + return scenes.map(({ query }) => { + const release = {}; + + release.title = query.content('.title, h2'); + release.duration = query.duration('.time'); + + const description = query.content('.description, p textarea'); + + if (!/there is no description/i.test(description)) { + release.description = description; + } + + release.poster = query.poster('#player, #example_video_1', { origin: channel.origin, protocol: 'http' }); + release.trailer = query.video('#player source, #example_video_1 source', { origin: channel.origin, protocol: 'http' }); + + release.photos = query.imgs('img[src*="sets/"], img[src*="thumbnails/"]', { origin: channel.origin, protocol: 'http' }); + + release.entryId = release.poster?.match(/\/sets\/(.*)\//)?.[1] || slugify(release.title); + + return release; + }); +} + async function fetchLatestB(channel, page) { const url = channel.parameters?.paginated ? `${channel.url}/page/${page}` : channel.url; - const res = await qu.getAll(url, '#container, article:not(.sortby)'); + const res = await unprint.get(url, { + selectAll: '#container, article:not(.sortby)', + parser: { + runScripts: 'dangerously', + }, + }); if (res.ok) { - return scrapeAllB(res.items, channel); + return scrapeAllB(res.context, channel); } return res.status; } +function scrapeSceneA({ query }, url, channel) { + const release = {}; + + release.entryId = new URL(url).pathname.match(/(\d+)\/?$/)?.[1]; + + release.title = query.content('.title, .scene-title h3').replace(/:$/, ''); + + const description = query.content('.text-desc p, .info-description p'); + + if (!/there is no description/i.test(description)) { + release.description = description; + } + + release.duration = query.duration('.media-body li span, .duration'); + + release.actors = query.all('.media-body a[href*="models/"], .models a').map((actorEl) => ({ + name: unprint.query.content(actorEl), + url: unprint.query.url(actorEl, null, { origin: channel.url, protocol: 'http' }), + })); + + release.tags = query.contents('.media-body a[href*="tags/"], .tags a'); + + release.poster = Array.from(new Set([ + query.img('.player-preview', { protocol: 'http' }), + unprint.prefixUrl(`/contents/videos_screenshots/0/${release.entryId}/preview_trailer.mp4.jpg`, channel.url, { protocol: 'http' }), + unprint.prefixUrl(query.attribute('param[name="flashvars"]', 'value')?.match(/poster=(.*\.jpg)/)?.[1], channel.url, { protocol: 'http' }), + unprint.prefixUrl(`/contents/scenes/${release.entryId}/thumbnails/920x518.jpg`, channel.url, { protocol: 'http' }), + ].filter(Boolean))); + + release.photos = query.urls('.thumb-album a:not([href="#"]), .thumbs-photo a:not([href*="signup"])', { origin: channel.url, protocol: 'http' }) + .concat(query.imgs('.thumb-album a[href="#"] img, .thumbs-photo a[href*="signup"] img', { origin: channel.url, protocol: 'http' })); + + release.trailer = stripQuery(query.url('a[href*="get_file/"], .download a')); + + return release; +} + async function fetchSceneA(url, channel) { - const res = await qu.get(url, '.main, .main-content'); + const res = await unprint.get(url, { select: '.main, .main-content' }); if (res.ok) { - return scrapeSceneA(res.item, url, channel); + return scrapeSceneA(res.context, url, channel); } return res.status; } -async function fetchProfileA({ name, slug }, { entity }) { - const searchRes = await qu.getAll(`${entity.url}/models/search/?q=${name}`, '.thumb-modal, .big-thumb'); +function scrapeProfileA({ query }, entity) { + const profile = {}; - if (!searchRes.ok) { - return searchRes.status; + const bio = query.all('.list-model-info li, .profile-info li').reduce((acc, bioEl) => ({ + ...acc, + [slugify(unprint.query.content(bioEl, '.title, span'), '_')]: unprint.query.content(bioEl, ':nth-child(2)') + || unprint.query.attribute(bioEl, ':nth-child(2)', 'title') + || unprint.query.text(bioEl), + }), {}); + + profile.dateOfBirth = unprint.extractDate(bio.birth_date || bio.date_of_birth, 'DD MMMM, YYYY', { match: null }); + profile.birthPlace = bio.nationality || bio.place_of_birth || null; + + profile.weight = unprint.extractNumber(bio.weight); + profile.height = unprint.extractNumber(bio.height); + + profile.eyes = bio.eye_color; + profile.hairColor = bio.hair || bio.hair_color; + + profile.aliases = query.text('.sub-title')?.replace(/:\s*/, '').split(/,\s*/); + + profile.measurements = bio.measurements || bio.body_shape_dimensions; + + const description = query.content('.model-biography p'); + const avatar = query.img('.model-box img, .profile-model-photo', { origin: entity.url, protocol: 'http' }); + + if (!/there is no description/i.test(description)) { + profile.description = description; } - const actor = searchRes.items.find(({ query }) => slugify(query.cnt('.thumb-title a, .title')) === slug); + if (avatar) { + profile.avatar = Array.from(new Set([ + avatar, + avatar.replace('s2_', 's1_'), + avatar.replace('s1_', 's2_'), + ])); + } - if (!actor) { + profile.scenes = scrapeAllA(unprint.initAll(query.all('.list-thumbs .thumb, .main-thumbs > li')), entity); + + return profile; +} + +async function getActorUrl(actor, entity) { + if (actor.url) { + return actor.url; + } + + // Double View Casting seems to be case sensitive... + const res = await unprint.get(`${entity.origin}/models/search/?q=${actor.name}`, { selectAll: '.thumb-modal, .big-thumb' }); + + if (!res.ok) { + return res.status; + } + + const actorItem = res.context.find(({ query }) => slugify(query.content('.thumb-title a, .title')) === actor.slug); + + if (!actorItem) { return null; } - const actorUrl = actor.query.url('a', 'href', { origin: entity.url }); - const actorRes = await qu.get(actorUrl); + const actorUrl = actorItem.query.url('a', { origin: entity.url, protocol: 'http' }); - if (actorRes.ok) { - return scrapeProfileA(actorRes.item, entity); + if (actorUrl) { + return actorUrl; + } + + return null; +} + +async function fetchProfileA(actor, { entity }) { + const actorUrl = await getActorUrl(actor, entity); + + if (actorUrl) { + const actorRes = await unprint.get(actorUrl); + + if (actorRes.ok) { + return scrapeProfileA(actorRes.context, entity); + } } return null; diff --git a/tests/profiles.js b/tests/profiles.js index 579a7bf8..288b29b9 100644 --- a/tests/profiles.js +++ b/tests/profiles.js @@ -232,6 +232,9 @@ const actors = [ { entity: 'bang', name: 'Riley Reid', fields: ['avatar', 'dateOfBirth', 'birthPlace', 'ethnicity', 'hairColor', 'eyes'] }, { entity: 'littlecapricedreams', name: 'Littlecaprice', fields: ['avatar', 'nationality', 'cup', 'measurements', 'height', 'description'] }, // sic { entity: 'pascalssubsluts', name: 'Zlata Shine', fields: ['avatar', 'gender', 'nationality', 'hairColor', 'height', 'description'] }, // sic + { entity: 'nebraskacoeds', name: 'Mary Beth Haglin', fields: ['avatar'] }, // sic + { entity: 'firstanalquest', name: 'Abigaile Johnson', fields: ['avatar', 'dateOfBirth', 'birthPlace', 'weight', 'height', 'measurements'] }, // sic + { entity: 'doubleviewcasting', name: 'Abigaile Johnson', fields: ['avatar', 'dateOfBirth', 'birthPlace', 'weight', 'height', 'measurements'] }, // sic ]; const actorScrapers = scrapers.actors;