From 6cbb7f9c1e322c66aef72d8110fc571360882d67 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Mon, 9 Mar 2020 02:02:29 +0100 Subject: [PATCH] Major API change for 'q', renamed to 'qu', refactored modules. Fixed Gamma URL entry ID regex. --- assets/components/releases/banner.vue | 30 +-- seeds/02_sites.js | 16 +- src/scrape-releases.js | 8 +- src/scrapers/assylum.js | 20 +- src/scrapers/bangbros.js | 22 +- src/scrapers/brazzers.js | 8 +- src/scrapers/cherrypimps.js | 20 +- src/scrapers/famedigital.js | 14 +- src/scrapers/fullpornnetwork.js | 18 +- src/scrapers/gamma.js | 10 +- src/scrapers/hush.js | 38 +-- src/scrapers/insex.js | 40 +-- src/scrapers/julesjordan.js | 104 ++++---- src/scrapers/newsensations.js | 41 +-- src/scrapers/nubiles.js | 56 ++--- src/scrapers/private.js | 10 +- src/scrapers/score.js | 46 ++-- src/scrapers/template.js | 18 +- src/scrapers/vivid.js | 4 +- src/utils/q.js | 303 +--------------------- src/utils/qu.js | 346 ++++++++++++++++++++++++++ 21 files changed, 611 insertions(+), 561 deletions(-) create mode 100644 src/utils/qu.js diff --git a/assets/components/releases/banner.vue b/assets/components/releases/banner.vue index b1b47ab4..4f1a7ad6 100644 --- a/assets/components/releases/banner.vue +++ b/assets/components/releases/banner.vue @@ -3,21 +3,6 @@ class="banner" @wheel.prevent="scrollBanner" > - -
+ + movie.scenes).flat(); - console.log(movieScenes); + // console.log(movieScenes); if (storedReleases) { logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join('')); @@ -120,7 +120,7 @@ async function scrapeReleases(sources, release = null, type = 'scene', preflight } async function deepFetchReleases(baseReleases, beforeFetchLatest) { - return Promise.map(baseReleases, async (release) => { + const deepReleases = await Promise.map(baseReleases, async (release) => { if (release.url || (release.path && release.site)) { try { const fullRelease = await scrapeRelease(release.url, release, 'scene', beforeFetchLatest); @@ -150,6 +150,10 @@ async function deepFetchReleases(baseReleases, beforeFetchLatest) { }, { concurrency: 2, }); + + // console.log(deepReleases); + + return deepReleases; } module.exports = { diff --git a/src/scrapers/assylum.js b/src/scrapers/assylum.js index 1702444b..09039858 100644 --- a/src/scrapers/assylum.js +++ b/src/scrapers/assylum.js @@ -18,23 +18,23 @@ function matchActors(actorString, models) { } function scrapeLatest(scenes, site, models) { - return scenes.map(({ q, qd, qu, qi }) => { + return scenes.map(({ qu }) => { const release = {}; - const pathname = qu('a.itemimg').slice(1); + const pathname = qu.url('a.itemimg').slice(1); [release.entryId] = pathname.split('/').slice(-1); release.url = `${site.url}${pathname}`; - release.title = q('.itemimg img', 'alt') || q('h4 a', true); - release.description = q('.mas_longdescription', true); - release.date = qd('.movie_info2', 'MM/DD/YY', /\d{2}\/\d{2}\/\d{2}/); + release.title = qu.q('.itemimg img', 'alt') || qu.q('h4 a', true); + release.description = qu.q('.mas_longdescription', true); + release.date = qu.date('.movie_info2', 'MM/DD/YY', /\d{2}\/\d{2}\/\d{2}/); - const actorString = q('.mas_description', true); + const actorString = qu.q('.mas_description', true); const actors = 
matchActors(actorString, models); if (actors.length > 0) release.actors = actors; else release.actors = extractActors(actorString); - const posterPath = qi('.itemimg img'); + const posterPath = qu.img('.itemimg img'); release.poster = `${site.url}/${posterPath}`; return release; @@ -72,17 +72,17 @@ function scrapeScene({ html, q, qa, qd, qis }, url, site, models) { function extractModels({ el }, site) { const models = ctxa(el, '.item'); - return models.map(({ q, qu }) => { + return models.map(({ qu }) => { const actor = { gender: 'female' }; - const avatar = q('.itemimg img'); + const avatar = qu.q('.itemimg img'); actor.avatar = `${site.url}/${avatar.src}`; actor.name = avatar.alt .split(':').slice(-1)[0] .replace(/xtreme girl|nurse/ig, '') .trim(); - const actorPath = qu('.itemimg'); + const actorPath = qu.url('.itemimg'); actor.url = `${site.url}${actorPath.slice(1)}`; return actor; diff --git a/src/scrapers/bangbros.js b/src/scrapers/bangbros.js index 49f5ad1f..b87af897 100644 --- a/src/scrapers/bangbros.js +++ b/src/scrapers/bangbros.js @@ -79,32 +79,32 @@ function scrapeUpcoming(html, site) { */ function scrapeScene(html, url, _site) { - const { q, qa, qu, qi, qt } = ex(html, '.playerSection'); + const { qu } = ex(html, '.playerSection'); const release = {}; - [release.shootId] = q('.vdoTags + .vdoCast', true).match(/\w+$/); + [release.shootId] = qu.q('.vdoTags + .vdoCast', true).match(/\w+$/); [release.entryId] = url.split('/')[3].match(/\d+$/); - release.title = q('.ps-vdoHdd h1', true); - release.description = q('.vdoDesc', true); + release.title = qu.q('.ps-vdoHdd h1', true); + release.description = qu.q('.vdoDesc', true); - release.actors = qa('a[href*="/model"]', true); - release.tags = qa('.vdoTags a', true); + release.actors = qu.all('a[href*="/model"]', true); + release.tags = qu.all('.vdoTags a', true); - release.stars = Number(q('div[class*="like"]', true).match(/^\d+/)[0]) / 20; + release.stars = Number(qu.q('div[class*="like"]', 
true).match(/^\d+/)[0]) / 20; - const poster = qi('img#player-overlay-image'); + const poster = qu.img('img#player-overlay-image'); release.poster = [ poster, poster.replace('/big_trailer', '/members/450x340'), // load error fallback ]; - release.trailer = { src: qt() }; + release.trailer = { src: qu.trailer() }; // all scenes seem to have 12 album photos available, not always included on the page - const firstPhotoUrl = ex(html).qi('img[data-slider-index="1"]'); + const firstPhotoUrl = ex(html).qu.img('img[data-slider-index="1"]'); release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`)); - const [channel] = qu('a[href*="/websites"]').match(/\w+$/); + const [channel] = qu.url('a[href*="/websites"]').match(/\w+$/); release.channel = channel === 'bangcasting' ? 'bangbroscasting' : channel; return release; diff --git a/src/scrapers/brazzers.js b/src/scrapers/brazzers.js index d4366697..c2006874 100644 --- a/src/scrapers/brazzers.js +++ b/src/scrapers/brazzers.js @@ -127,13 +127,15 @@ function scrapeActorSearch(html, url, actorName) { async function fetchActorReleases({ qu, html }, accReleases = []) { const releases = scrapeAll(html); - const next = qu('.pagination .next a'); + const next = qu.url('.pagination .next a'); if (next) { const url = `https://www.brazzers.com${next}`; - const qNext = await get(url); + const res = await get(url); - return fetchActorReleases(qNext, accReleases.concat(releases)); + if (res.ok) { + return fetchActorReleases(res.item, accReleases.concat(releases)); + } } return accReleases.concat(releases); diff --git a/src/scrapers/cherrypimps.js b/src/scrapers/cherrypimps.js index f79e2ccd..f0950349 100644 --- a/src/scrapers/cherrypimps.js +++ b/src/scrapers/cherrypimps.js @@ -4,12 +4,12 @@ const { get, geta, ctxa, ed } = require('../utils/q'); const slugify = require('../utils/slugify'); function scrapeAll(scenes, site) { - return scenes.map(({ q, qa, qu, qd, ql, qi, qt }) => { - const 
url = qu('.text-thumb a'); + return scenes.map(({ qu }) => { + const url = qu.url('.text-thumb a'); const { pathname } = new URL(url); - const channelUrl = qu('.badge'); + const channelUrl = qu.url('.badge'); - if (site?.parameters?.extract && q('.badge', true) !== site.name) { + if (site?.parameters?.extract && qu.q('.badge', true) !== site.name) { return null; } @@ -17,15 +17,15 @@ function scrapeAll(scenes, site) { release.url = channelUrl ? `${channelUrl}${pathname}` : url; release.entryId = pathname.match(/\/\d+/)[0].slice(1); - release.title = q('.text-thumb a', true); + release.title = qu.q('.text-thumb a', true); - release.date = qd('.date', 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/); - release.duration = ql('.date', /(\d{2}:)?\d{2}:\d{2}/); + release.date = qu.date('.date', 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/); + release.duration = qu.dur('.date', /(\d{2}:)?\d{2}:\d{2}/); - release.actors = qa('.category a', true); + release.actors = qu.all('.category a', true); - release.poster = qi('img.video_placeholder, .video-images img'); - release.teaser = { src: qt() }; + release.poster = qu.img('img.video_placeholder, .video-images img'); + release.teaser = { src: qu.trailer() }; return release; }).filter(Boolean); diff --git a/src/scrapers/famedigital.js b/src/scrapers/famedigital.js index 06369914..8e60b063 100644 --- a/src/scrapers/famedigital.js +++ b/src/scrapers/famedigital.js @@ -62,19 +62,23 @@ async function fetchClassicProfile(actorName, siteSlug) { const actorSlug = slugify(actorName); const url = `https://${siteSlug}.com/en/pornstars`; - const { qa } = await get(url); + const pornstarsRes = await get(url); - const actorPath = qa('option[value*="/pornstar"]') + if (!pornstarsRes.ok) return null; + + const actorPath = pornstarsRes.item.qa('option[value*="/pornstar"]') .find(el => slugify(el.textContent) === actorSlug) ?.value; if (actorPath) { const actorUrl = `https://${siteSlug}.com${actorPath}`; - const { html } = await get(actorUrl); + const res = await 
get(actorUrl); - const releases = scrapeAll(html, null, `https://www.${siteSlug}.com`, false); + if (res.ok) { + const releases = scrapeAll(res.item, null, `https://www.${siteSlug}.com`, false); - return { releases }; + return { releases }; + } } return null; diff --git a/src/scrapers/fullpornnetwork.js b/src/scrapers/fullpornnetwork.js index 51bd20ac..d818adbe 100644 --- a/src/scrapers/fullpornnetwork.js +++ b/src/scrapers/fullpornnetwork.js @@ -4,21 +4,21 @@ const { get, geta, ctxa } = require('../utils/q'); const slugify = require('../utils/slugify'); function scrapeAll(scenes) { - return scenes.map(({ el, q, qa, qd, qu, ql }) => { + return scenes.map(({ el, qu }) => { const release = {}; - release.entryId = el.dataset.setid || q('.update_thumb', 'id').match(/\w+-\w+-(\d+)-\d+/)[1]; - release.url = qu('.title'); + release.entryId = el.dataset.setid || qu.q('.update_thumb', 'id').match(/\w+-\w+-(\d+)-\d+/)[1]; + release.url = qu.url('.title'); - release.title = q('.title', true); - release.description = q('.title', 'title'); + release.title = qu.q('.title', true); + release.description = qu.q('.title', 'title'); - release.date = qd('.video-data > span:last-child', 'YYYY-MM-DD'); - release.duration = ql('.video-data > span'); + release.date = qu.date('.video-data > span:last-child', 'YYYY-MM-DD'); + release.duration = qu.dur('.video-data > span'); - release.actors = qa('.update_models a', true); + release.actors = qu.all('.update_models a', true); - const poster = q('.update_thumb', 'src0_1x'); + const poster = qu.q('.update_thumb', 'src0_1x'); release.poster = [ poster.replace('-1x', '-2x'), poster, diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js index 3c891f3e..d60ba89e 100644 --- a/src/scrapers/gamma.js +++ b/src/scrapers/gamma.js @@ -198,7 +198,7 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml) { const [data, data2] = json ? 
JSON.parse(json) : []; const videoData = videoJson && JSON.parse(videoJson.slice(videoJson.indexOf('{'), videoJson.indexOf('};') + 1)); - release.entryId = (baseRelease?.path || new URL(url).pathname).match(/\/(\d{2,})\//)[1]; + release.entryId = (baseRelease?.path || new URL(url).pathname).match(/\/(\d{2,})(\/|$)/)?.[1]; release.title = videoData?.playerOptions?.sceneInfos.sceneTitle || data?.name; // date in data object is not the release date of the scene, but the date the entry was added; only use as fallback @@ -298,10 +298,12 @@ async function fetchActorReleases(profileUrl, getActorReleasesUrl, page = 1, acc const profilePath = `/${pathname.split('/').slice(-2).join('/')}`; const url = getActorReleasesUrl(profilePath, page); - const { html, qu } = await get(url); + const res = await get(url); - const releases = scrapeAll(html, null, origin); - const nextPage = qu('.Gamma_Paginator a.next'); + if (!res.ok) return []; + + const releases = scrapeAll(res.html, null, origin); + const nextPage = res.item.qu.url('.Gamma_Paginator a.next'); if (nextPage) { return fetchActorReleases(profileUrl, getActorReleasesUrl, page + 1, accReleases.concat(releases)); diff --git a/src/scrapers/hush.js b/src/scrapers/hush.js index 9029a56a..8fa828b3 100644 --- a/src/scrapers/hush.js +++ b/src/scrapers/hush.js @@ -60,18 +60,18 @@ function getImageWithFallbacks(q, selector, site, el) { } function scrapeAll(scenes, site) { - return scenes.map(({ q, qu, qd, ql }) => { + return scenes.map(({ qu }) => { const release = {}; - release.title = q('h3 a', 'title') || q('h3 a', true); - release.url = qu('h3 a'); + release.title = qu.q('h3 a', 'title') || qu.q('h3 a', true); + release.url = qu.url('h3 a'); - release.date = qd('.modeldata p', 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/); - release.duration = ql('.modeldata p'); + release.date = qu.date('.modeldata p', 'YYYY-MM-DD', /\d{4}-\d{2}-\d{2}/); + release.duration = qu.dur('.modeldata p'); if (/bts|behind the scenes/i.test(release.title)) 
release.tags = ['behind the scenes']; - release.poster = getImageWithFallbacks(q, '.modelimg img', site); + release.poster = getImageWithFallbacks(qu.q, '.modelimg img', site); // release.entryId = q('.modelimg img', 'id').match(/set-target-(\d+)/)[1]; release.entryId = deriveEntryId(release); @@ -81,18 +81,18 @@ function scrapeAll(scenes, site) { } function scrapeAllT1(scenes, site, accSiteReleases) { - return scenes.map(({ q, qi, qd, ql, qu }) => { + return scenes.map(({ qu }) => { const release = {}; - release.title = q('h4 a', 'title') || q('h4 a', true); - release.url = qu('h4 a'); + release.title = qu.q('h4 a', 'title') || qu.q('h4 a', true); + release.url = qu.url('h4 a'); - release.date = qd('.more-info-div', 'MMM D, YYYY'); - release.duration = ql('.more-info-div'); + release.date = qu.date('.more-info-div', 'MMM D, YYYY'); + release.duration = qu.dur('.more-info-div'); if (/bts|behind the scenes/i.test(release.title)) release.tags = ['behind the scenes']; - const posterPath = q('.img-div img', 'src0_1x') || qi('img.video_placeholder'); + const posterPath = qu.q('.img-div img', 'src0_1x') || qu.img('img.video_placeholder'); if (posterPath) { const poster = /^http/.test(posterPath) ? 
posterPath : `${site.parameters?.media || site.url}${posterPath}`; @@ -117,16 +117,16 @@ function scrapeAllT1(scenes, site, accSiteReleases) { } function scrapeAllTour(scenes) { - return scenes.map(({ q, qa, qu, qd, qi }) => { + return scenes.map(({ qu }) => { const release = {}; - release.title = q('h4 a', true); - release.url = qu('a'); - release.date = qd('.tour_update_models + span', 'YYYY-MM-DD'); + release.title = qu.q('h4 a', true); + release.url = qu.url('a'); + release.date = qu.date('.tour_update_models + span', 'YYYY-MM-DD'); - release.actors = qa('.tour_update_models a', true); + release.actors = qu.all('.tour_update_models a', true); - release.poster = qi('a img'); + release.poster = qu.img('a img'); release.entryId = deriveEntryId(release); @@ -343,7 +343,7 @@ function scrapeProfileTour({ el, q, qtxs }, site) { const qReleases = ctxa(el, '.update_block'); profile.releases = qReleases.map((qRelease) => { - const url = qRelease.qu('.update_image a[href]'); + const url = qRelease.qu.url('.update_image a[href]'); const release = scrapeSceneTour(qRelease, site); if (!/\/(signup|join)/i.test(url)) release.url = url; diff --git a/src/scrapers/insex.js b/src/scrapers/insex.js index 7a5c1f2a..713e206a 100644 --- a/src/scrapers/insex.js +++ b/src/scrapers/insex.js @@ -8,15 +8,15 @@ function scrapeLatest(html, site) { ? 
exa(html, '#articleTable table[cellspacing="2"]') : exa(html, 'body > table'); - return scenes.map(({ q, qd, qi, qu, ql }) => { + return scenes.map(({ qu }) => { // if (q('.articleTitleText')) return scrapeFirstLatest(ctx(el), site); const release = {}; - const titleEl = q('.galleryTitleText, .articleTitleText'); + const titleEl = qu.q('.galleryTitleText, .articleTitleText'); const [title, ...actors] = titleEl.textContent.split('|'); - const date = qd('.articlePostDateText td', 'MMM D, YYYY'); + const date = qu.date('.articlePostDateText td', 'MMM D, YYYY'); - const url = qu(titleEl, 'a'); + const url = qu.url(titleEl, 'a'); [release.entryId] = url.split('/').slice(-2); release.url = `${site.url}${url}`; @@ -31,15 +31,15 @@ function scrapeLatest(html, site) { release.actors = actors.map(actor => actor.trim()); - const description = q('.articleCopyText', true); + const description = qu.q('.articleCopyText', true); if (description) release.description = description.slice(0, description.lastIndexOf('(')); - const duration = ql('.articleCopyText a:nth-child(2)'); + const duration = qu.dur('.articleCopyText a:nth-child(2)'); if (duration) release.duration = duration; - release.likes = parseInt(q('.articlePostDateText td:nth-child(3)', true), 10); + release.likes = parseInt(qu.q('.articlePostDateText td:nth-child(3)', true), 10); - const cover = qi('a img'); + const cover = qu.img('a img'); release.covers = [[ cover.replace('_thumbnail', ''), cover, @@ -49,31 +49,31 @@ function scrapeLatest(html, site) { }); } -function scrapeScene({ q, qd, ql, qu, qis, qp, qt }, site) { +function scrapeScene({ qu }, site) { const release = {}; - const titleEl = q('.articleTitleText'); + const titleEl = qu.q('.articleTitleText'); const [title, ...actors] = titleEl.textContent.split('|'); - const url = qu(titleEl, 'a'); + const url = qu.url(titleEl, 'a'); [release.entryId] = url.split('/').slice(-2); release.url = `${site.url}${url}`; release.title = title.trim(); - release.description = 
q('.articleCopyText', true); + release.description = qu.q('.articleCopyText', true); release.actors = actors.map(actor => actor.trim()); - release.date = qd('.articlePostDateText', 'MMMM D, YYYY'); - release.duration = ql('.articlePostDateText a:nth-child(2)'); + release.date = qu.date('.articlePostDateText', 'MMMM D, YYYY'); + release.duration = qu.dur('.articlePostDateText a:nth-child(2)'); - const [cover, ...photos] = qis('img[src*="images"]'); + const [cover, ...photos] = qu.imgs('img[src*="images"]'); release.covers = [cover]; release.photos = photos; - release.poster = qp(); + release.poster = qu.poster(); - const trailer = qt(); - release.trailer = { src: trailer }; + const trailer = qu.trailer(); + if (trailer) release.trailer = { src: trailer }; return release; } @@ -96,9 +96,9 @@ async function fetchLatest(site, page = 1) { } async function fetchScene(url, site) { - const qScene = await get(url); + const res = await get(url); - return qScene && scrapeScene(qScene, site); + return res.ok ? 
scrapeScene(res.item, site) : res.status; } module.exports = { diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index 94caf208..2799123f 100644 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -118,25 +118,41 @@ async function getPhotos(entryId, site, type = 'highres', page = 1) { return getPhotosLegacy(entryId, site, 'highres', 1); } +function getEntryId(html) { + const entryId = html.match(/showtagform\((\d+)\)/); + + if (entryId) { + return entryId[1]; + } + + const setIdIndex = html.indexOf('setid:"'); + + if (setIdIndex) { + return html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0]; + } + + return null; +} + function scrapeAll(scenes, site) { - return scenes.map(({ el, q, qa, qh, qu, qd, qi, qis }) => { + return scenes.map(({ qu }) => { const release = {}; - release.entryId = el.dataset.setid || q('.rating_box')?.dataset.id; + release.entryId = qu.el.dataset.setid || qu.q('.rating_box')?.dataset.id; - release.url = qu('.update_title, .dvd_info > a, a ~ a'); - release.title = q('.update_title, .dvd_info > a, a ~ a', true); - release.date = qd('.update_date', 'MM/DD/YYYY'); + release.url = qu.url('.update_title, .dvd_info > a, a ~ a'); + release.title = qu.q('.update_title, .dvd_info > a, a ~ a', true); + release.date = qu.date('.update_date', 'MM/DD/YYYY'); - release.actors = qa('.update_models a', true); + release.actors = qu.all('.update_models a', true); - const dvdPhotos = qis('.dvd_preview_thumb'); - const photoCount = Number(q('a img.thumbs', 'cnt')) || 1; + const dvdPhotos = qu.imgs('.dvd_preview_thumb'); + const photoCount = Number(qu.q('a img.thumbs', 'cnt')) || 1; [release.poster, ...release.photos] = dvdPhotos.length ? 
dvdPhotos : Array.from({ length: photoCount }).map((value, index) => { - const src = qi('a img.thumbs', `src${index}_1x`) || qi('a img.thumbs', `src${index}`) || qi('a img.thumbs'); + const src = qu.img('a img.thumbs', `src${index}_1x`) || qu.img('a img.thumbs', `src${index}`) || qu.img('a img.thumbs'); return src ? { src: /^http/.test(src) ? src : `${site.url}${src}`, @@ -144,7 +160,7 @@ function scrapeAll(scenes, site) { } : null; }).filter(Boolean); - const teaserScript = qh('script'); + const teaserScript = qu.content('script'); if (teaserScript) { const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4); if (src) release.teaser = { src }; @@ -204,50 +220,17 @@ function scrapeUpcoming(html, site) { }); } -async function scrapeScene(html, url, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - +async function scrapeScene({ qu }, url, site) { const release = { url, site }; - release.title = $('.title_bar_hilite').text().trim(); + release.entryId = getEntryId(qu.html); + release.title = qu.q('.title_bar_hilite', true); + release.description = qu.q('.update_description', true); - const entryId = html.match(/showtagform\((\d+)\)/); + release.date = qu.date('.update_date', 'MM/DD/YYYY', null, 'innerHTML'); + release.actors = qu.all('.backgroundcolor_info > .update_models a, .item .update_models a', true); - if (entryId) release.entryId = entryId[1]; - else { - const setIdIndex = html.indexOf('setid:"'); - if (setIdIndex) release.entryId = html.slice(setIdIndex, html.indexOf(',', setIdIndex)).match(/\d+/)[0]; - } - - const dateElement = $('.update_date').text().trim(); - const dateComment = $('*') - .contents() - .toArray() - .find(({ type, data }) => type === 'comment' && data.match('Date OFF')); - - if (dateElement) { - release.date = moment - .utc($('.update_date').text(), 'MM/DD/YYYY') - .toDate(); - } - - if (dateComment) { - release.date = moment - 
.utc(dateComment.nodeValue.match(/\d{2}\/\d{2}\/\d{4}/), 'MM/DD/YYYY') - .toDate(); - } - - release.description = $('.update_description').text().trim(); - - release.actors = $('.backgroundcolor_info > .update_models a, .item .update_models a') - .map((_actorIndex, actorElement) => $(actorElement).text()) - .toArray(); - - const infoLines = $('script:contains("useimage")') - .html() - .split('\n'); - - const posterPath = infoLines.find(line => line.match('useimage')).replace('useimage = "', '').slice(0, -2); + const posterPath = qu.html.match(/useimage = "(.*)"/)?.[1]; if (posterPath) { const poster = /^http/.test(posterPath) ? posterPath : `${site.url}${posterPath}`; @@ -261,7 +244,7 @@ async function scrapeScene(html, url, site) { } if (site.slug !== 'manuelferrara') { - const trailerLines = infoLines.filter(line => /movie\["Trailer\w*"\]\[/.test(line)); + const trailerLines = qu.html.split('\n').filter(line => /movie\["trailer\w*"\]\[/i.test(line)); if (trailerLines.length) { release.trailer = trailerLines.map((trailerLine) => { @@ -270,19 +253,24 @@ async function scrapeScene(html, url, site) { return src && { src: /^http/.test(src) ? 
src : `${site.url}${src}`, - quality: quality && Number(quality), + quality: quality && Number(quality.replace('558', '540')), }; }).filter(Boolean); } } release.photos = await getPhotos(release.entryId, site); - release.tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray(); + release.tags = qu.all('.update_tags a', true); - const movie = $('.update_dvds a').attr('href'); - if (movie) release.movie = movie; + if (qu.exists('.update_dvds a')) { + release.movie = { + url: qu.url('.update_dvds a'), + title: qu.q('.update_dvds a', true), + }; + } - release.stars = Number($('.avg_rating').text().trim().replace(/[\s|Avg Rating:]/g, '')); + const stars = Number(qu.q('.avg_rating', true)?.replace(/[\s|Avg Rating:]/g, '')); + if (stars) release.stars = stars; return release; } @@ -371,9 +359,9 @@ async function fetchUpcoming(site) { } async function fetchScene(url, site) { - const res = await bhttp.get(url); + const res = await get(url); - return scrapeScene(res.body.toString(), url, site); + return res.ok ? 
scrapeScene(res.item, url, site) : res.status; } async function fetchMovie(url, site) { diff --git a/src/scrapers/newsensations.js b/src/scrapers/newsensations.js index 2eee837a..6f2b9a7b 100644 --- a/src/scrapers/newsensations.js +++ b/src/scrapers/newsensations.js @@ -3,52 +3,55 @@ const { geta, ed } = require('../utils/q'); function scrapeBlockLatest(scenes) { - return scenes.map(({ html, q, qa, qu, qt }) => { + return scenes.map(({ html, qu }) => { const release = {}; - const entryId = q('div[class*="videothumb"]', 'class').match(/videothumb_(\d+)/) - || q('div[id*="videothumb"]', 'id').match(/videothumb_(\d+)/); + const entryId = qu.q('div[class*="videothumb"]', 'class').match(/videothumb_(\d+)/) + || qu.q('div[id*="videothumb"]', 'id').match(/videothumb_(\d+)/); release.entryId = entryId[1]; - release.title = q('h4 a', true); - release.url = qu('h4 a'); + release.title = qu.q('h4 a', true); + release.url = qu.url('h4 a'); release.date = ed(html, 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/); - release.actors = qa('.tour_update_models a', true); + release.actors = qu.all('.tour_update_models a', true); - release.poster = q('div img').dataset.src; - release.photos = [q('div img', 'src0_4x') || q('div img', 'src0_3x') || q('div img', 'src0_2x')]; + release.poster = qu.q('div img').dataset.src; + release.photos = [qu.q('div img', 'src0_4x') || qu.q('div img', 'src0_3x') || qu.q('div img', 'src0_2x')]; - release.teaser = qt(); + release.teaser = qu.video(); return release; }); } function scrapeClassicLatest(scenes) { - return scenes.map(({ el, q, qa, qd, qu }) => { + return scenes.map(({ el, qu }) => { const release = {}; release.entryId = el.dataset.setid; - release.url = qu('a'); + release.url = qu.url('a'); - release.title = q('.update_title_small', true) || q('a:nth-child(2)', true); + release.title = qu.q('.update_title_small', true) || qu.q('a:nth-child(2)', true); - const description = q('a', 'title'); + const description = qu.q('a', 'title'); if (description) 
release.description = description; - const date = qd('.date_small, .update_date', 'MM/DD/YYYY'); + const date = qu.date('.date_small, .update_date', 'MM/DD/YYYY'); if (date) release.date = date; - const durationLine = q('.update_counts', true); + const durationLine = qu.q('.update_counts', true); if (durationLine) release.duration = Number(durationLine.match(/(\d+) min/i)[1]) * 60; - const actors = qa('.update_models a', true); - release.actors = actors.length > 0 ? actors : q('.update_models', true).split(/,\s*/); + const actors = qu.all('.update_models a', true); + release.actors = actors.length > 0 ? actors : qu.q('.update_models', true).split(/,\s*/); - const photoCount = q('.update_thumb', 'cnt'); - [release.poster, ...release.photos] = Array.from({ length: photoCount }).map((value, index) => q('.update_thumb', `src${index}_3x`) || q('.update_thumb', `src${index}_2x`) || q('.update_thumb', `src${index}_1x`)); + const photoCount = qu.q('.update_thumb', 'cnt'); + [release.poster, ...release.photos] = Array.from({ length: photoCount }) + .map((value, index) => qu.q('.update_thumb', `src${index}_3x`) + || qu.q('.update_thumb', `src${index}_2x`) + || qu.q('.update_thumb', `src${index}_1x`)); return release; }); diff --git a/src/scrapers/nubiles.js b/src/scrapers/nubiles.js index 570169df..9c72a05a 100644 --- a/src/scrapers/nubiles.js +++ b/src/scrapers/nubiles.js @@ -18,13 +18,13 @@ async function getPhotos(albumUrl) { } function scrapeAll(scenes, site, origin) { - return scenes.map(({ q, qa, qu, qd }) => { + return scenes.map(({ qu }) => { const release = {}; - release.title = q('.title a', true); + release.title = qu.q('.title a', true); - const url = qu('.title a').split('?')[0]; - const channelUrl = qu('.site-link'); + const url = qu.url('.title a').split('?')[0]; + const channelUrl = qu.url('.site-link'); if (/^http/.test(url)) { const { pathname } = new URL(url); @@ -39,74 +39,74 @@ function scrapeAll(scenes, site, origin) { else if (site?.url) release.url = 
`${site.url}${url}`; else if (origin) release.url = `${origin}${url}`; } else { - release.entryId = q('a img', 'tube_tour_thumb_id'); + release.entryId = qu.q('a img', 'tube_tour_thumb_id'); } - release.date = qd('.date', 'MMM D, YYYY'); - release.actors = qa('.models a.model', true); + release.date = qu.date('.date', 'MMM D, YYYY'); + release.actors = qu.all('.models a.model', true); - const poster = q('img').dataset.original; + const poster = qu.q('img').dataset.original; release.poster = [ poster.replace('_640', '_1280'), poster, ]; - release.stars = Number(q('.rating', true)); - release.likes = Number(q('.likes', true)); + release.stars = Number(qu.q('.rating', true)); + release.likes = Number(qu.q('.likes', true)); return release; }); } -async function scrapeScene({ q, qa, qd, qp, qu, qi }, url, site) { +async function scrapeScene(qu, url, site) { const release = {}; const { origin, pathname } = new URL(url); release.url = `${origin}${pathname}`; release.entryId = new URL(url).pathname.split('/')[3]; - release.title = q('.content-pane-title h2', true); - release.description = q('.content-pane-column div', true); + release.title = qu.q('.content-pane-title h2', true); + release.description = qu.q('.content-pane-column div', true); - release.date = qd('.date', 'MMM D, YYYY'); + release.date = qu.q('.date', 'MMM D, YYYY'); - release.actors = qa('.content-pane-performers .model', true); - release.tags = qa('.categories a', true); + release.actors = qu.all('.content-pane-performers .model', true); + release.tags = qu.all('.categories a', true); - release.poster = qp() || qi('.fake-video-player img'); - release.trailer = qa('source').map(source => ({ + release.poster = qu.poster() || qu.img('.fake-video-player img'); + release.trailer = qu.all('source').map(source => ({ src: source.src, quality: Number(source.getAttribute('res')), })); - release.stars = Number(q('.score', true)); - release.likes = Number(q('#likecount', true)); + release.stars = 
Number(qu.q('.score', true)); + release.likes = Number(qu.q('#likecount', true)); - const albumLink = qu('.content-pane-related-links a[href*="gallery"]'); + const albumLink = qu.url('.content-pane-related-links a[href*="gallery"]'); if (albumLink) release.photos = await getPhotos(`${site.url}${albumLink}`); return release; } -function scrapeProfile({ q, qa, qi, qu }, _actorName, origin) { +function scrapeProfile({ qu }, _actorName, origin) { const profile = {}; - const keys = qa('.model-profile h5', true); - const values = qa('.model-profile h5 + p', true); + const keys = qu.all('.model-profile h5', true); + const values = qu.all('.model-profile h5 + p', true); const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {}); profile.age = Number(bio.age); - profile.description = q('.model-bio', true); + profile.description = qu.q('.model-bio', true); profile.residencePlace = bio.location; profile.height = heightToCm(bio.height); [profile.bust, profile.waist, profile.hip] = bio.figure.split('-').map(v => Number(v) || v); - profile.avatar = qi('.model-profile img'); + profile.avatar = qu.img('.model-profile img'); - const releases = qa('.content-grid-item').filter(el => /video\//.test(qu(el, '.img-wrapper a'))); // filter out photos + const releases = qu.all('.content-grid-item').filter(el => /video\//.test(qu.url(el, '.img-wrapper a'))); // filter out photos profile.releases = scrapeAll(ctxa(releases), null, origin); return profile; @@ -145,7 +145,7 @@ async function fetchProfile(actorName, siteSlug) { if (!resModels.ok) return resModels.status; - const modelPath = resModels.item.qa('.content-grid-item a.title').find(el => slugify(el.textContent) === slugify(actorName)); + const modelPath = resModels.item.qu.all('.content-grid-item a.title').find(el => slugify(el.textContent) === slugify(actorName)); if (modelPath) { const modelUrl = `${origin}${modelPath}`; diff --git a/src/scrapers/private.js 
b/src/scrapers/private.js index 502aad72..c80073b3 100644 --- a/src/scrapers/private.js +++ b/src/scrapers/private.js @@ -178,16 +178,16 @@ async function fetchScene(url, site) { async function fetchProfile(actorName) { const actorSearchSlug = slugify(actorName, { delimiter: '+' }); const url = `https://www.private.com/search.php?query=${actorSearchSlug}`; - const modelLinks = await geta(url, '.model h3 a'); + const modelRes = await geta(url, '.model h3 a'); - if (modelLinks) { + if (modelRes.ok) { const actorSlug = slugify(actorName); - const model = modelLinks.find(({ text }) => slugify(text) === actorSlug); + const model = modelRes.items.find(({ text }) => slugify(text) === actorSlug); if (model) { - const qProfile = await get(model.el.href); + const res = await get(model.el.href); - return qProfile && scrapeProfile(qProfile); + return res.ok ? scrapeProfile(res.item) : res.status; } } diff --git a/src/scrapers/score.js b/src/scrapers/score.js index 59fb7a10..3bd4e1cf 100644 --- a/src/scrapers/score.js +++ b/src/scrapers/score.js @@ -65,49 +65,49 @@ function scrapeAll(html, site) { } async function scrapeScene(html, url, site) { - const { q, qa, qtext, qi, qd, ql, qu, qis, qp } = ex(html, '#videos-page, #content'); + const { qu } = ex(html, '#videos-page, #content'); const release = {}; [release.entryId] = new URL(url).pathname.split('/').slice(-2); - release.title = q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true) - || q('h1.m-title', true)?.split(/»|\//).slice(-1)[0].trim(); - release.description = qtext('.p-desc, .desc'); + release.title = qu.q('h2.text-uppercase, h2.title, #breadcrumb-top + h1', true) + || qu.q('h1.m-title', true)?.split(/»|\//).slice(-1)[0].trim(); + release.description = qu.text('.p-desc, .desc'); - release.actors = qa('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true); + release.actors = qu.all('.value a[href*=models], .value a[href*=performer], .value a[href*=teen-babes]', true); if 
(release.actors.length === 0) { - const actorEl = qa('.stat').find(stat => /Featuring/.test(stat.textContent)); - const actorString = qtext(actorEl); + const actorEl = qu.all('.stat').find(stat => /Featuring/.test(stat.textContent)); + const actorString = qu.text(actorEl); release.actors = actorString?.split(/,\band\b|,/g).map(actor => actor.trim()) || []; } if (release.actors.length === 0 && site.parameters?.actors) release.actors = site.parameters.actors; - release.tags = qa('a[href*=tag]', true); + release.tags = qu.all('a[href*=tag]', true); - const dateEl = qa('.value').find(el => /\w+ \d+\w+, \d{4}/.test(el.textContent)); - release.date = qd(dateEl, null, 'MMMM Do, YYYY') - || qd('.date', 'MMMM Do, YYYY', /\w+ \d{1,2}\w+, \d{4}/) - || qd('.info .holder', 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/); + const dateEl = qu.all('.value').find(el => /\w+ \d+\w+, \d{4}/.test(el.textContent)); + release.date = qu.date(dateEl, null, 'MMMM Do, YYYY') + || qu.date('.date', 'MMMM Do, YYYY', /\w+ \d{1,2}\w+, \d{4}/) + || qu.date('.info .holder', 'MM/DD/YYYY', /\d{2}\/\d{2}\/\d{4}/); - const durationEl = qa('value').find(el => /\d{1,3}:\d{2}/.test(el.textContent)); - release.duration = ql(durationEl); + const durationEl = qu.all('value').find(el => /\d{1,3}:\d{2}/.test(el.textContent)); + release.duration = qu.dur(durationEl); - release.poster = qp('video') || qi('.flowplayer img') || qi('img'); // _800.jpg is larger than _xl.jpg in landscape - const photosUrl = qu('.stat a[href*=photos]'); + release.poster = qu.poster('video') || qu.img('.flowplayer img') || qu.img('img'); // _800.jpg is larger than _xl.jpg in landscape + const photosUrl = qu.url('.stat a[href*=photos]'); if (photosUrl) { release.photos = await fetchPhotos(photosUrl); } else { - release.photos = qis('img[src*=ThumbNails], .p-photos .tn img').map(photo => [ + release.photos = qu.imgs('img[src*=ThumbNails], .p-photos .tn img').map(photo => [ photo.replace('_tn', ''), photo, ]); } - const trailers = 
qa('a[href*=Trailers]'); + const trailers = qu.all('a[href*=Trailers]'); if (trailers) { release.trailer = trailers.map((trailer) => { @@ -119,7 +119,7 @@ async function scrapeScene(html, url, site) { }).filter(Boolean); } - const stars = q('.rate-box').dataset.score; + const stars = qu.q('.rate-box').dataset.score; if (stars) release.rating = { stars }; return release; @@ -133,11 +133,11 @@ function scrapeModels(html, actorName) { } async function fetchActorReleases(url, accReleases = []) { - const { document, qu } = await get(url); + const res = await get(url); - if (document) { - const releases = accReleases.concat(scrapeAll(document.body.outerHTML)); - const nextPage = qu('.next-pg'); + if (res.ok) { + const releases = accReleases.concat(scrapeAll(res.item.document.body.outerHTML)); + const nextPage = res.item.qu.url('.next-pg'); if (nextPage && new URL(nextPage).searchParams.has('page')) { // last page has 'next' button linking to join page return fetchActorReleases(nextPage, releases); diff --git a/src/scrapers/template.js b/src/scrapers/template.js index 6cd24768..fae753e5 100644 --- a/src/scrapers/template.js +++ b/src/scrapers/template.js @@ -3,29 +3,29 @@ const { get, geta } = require('../utils/q'); function scrapeLatest(scenes, site) { - return scenes.map(({ q, qa, qu, qd }) => { + return scenes.map(({ qu }) => { const release = {}; - release.title = q('.title a', true); + release.title = qu.q('.title a', true); - const pathname = qu('.title a'); + const pathname = qu.url('.title a'); release.entryId = pathname.split('/')[3]; release.url = `${site.url}${pathname}`; - release.date = qd('.date', 'MMM DD, YYYY'); - release.actors = qa('.models a.model', true); + release.date = qu.date('.date', 'MMM DD, YYYY'); + release.actors = qu.all('.models a.model', true); - release.poster = q('img').dataset.original; + release.poster = qu.q('img').dataset.original; - release.stars = Number(q('.rating', true)); - release.likes = Number(q('.likes', true)); + 
release.stars = Number(qu.q('.rating', true)); + release.likes = Number(qu.q('.likes', true)); console.log(release); return release; }); } -function scrapeScene({ q }, _site) { +function scrapeScene({ qu }, _site) { const release = {}; console.log(release); diff --git a/src/scrapers/vivid.js b/src/scrapers/vivid.js index b844bc78..45086283 100644 --- a/src/scrapers/vivid.js +++ b/src/scrapers/vivid.js @@ -96,9 +96,9 @@ async function fetchSceneNative(url, site, release) { return fetchScene(url, site, release); } - const qScene = await get(url); + const res = await get(url); - return qScene && scrapeSceneNative(qScene, url, site); + return res.ok ? scrapeSceneNative(res.item, url, site) : res.status; } async function fetchSceneWrapper(url, site, release) { diff --git a/src/utils/q.js b/src/utils/q.js index 68e54cb4..fc445832 100644 --- a/src/utils/q.js +++ b/src/utils/q.js @@ -1,304 +1,5 @@ 'use strict'; -const { JSDOM } = require('jsdom'); -const moment = require('moment'); -const http = require('./http'); +const qu = require('./qu'); -function trim(str) { - if (!str) return null; - return str.trim().replace(/\s+/g, ' '); -} - -function extractDate(dateString, format, match) { - if (match) { - const dateStamp = trim(dateString).match(match); - - if (dateStamp) { - const date = moment.utc(dateStamp[0], format); - - return date.isValid() ? date.toDate() : null; - } - return null; - } - - const date = moment.utc(trim(dateString), format); - - return date.isValid() ? date.toDate() : null; -} - -function formatDate(date, format, inputFormat) { - if (inputFormat) return moment(date, inputFormat).format(format); - - return moment(date).format(format); -} - -function prefixProtocol(url, protocol = 'https') { - if (protocol && /^\/\//.test(url)) { - return `${protocol}:${url}`; - } - - return url; -} - -function q(context, selector, attrArg, applyTrim = true) { - const attr = attrArg === true ? 'textContent' : attrArg; - - if (attr) { - const value = selector - ? 
context.querySelector(selector)?.[attr] || context.querySelector(selector)?.attributes[attr]?.value - : context[attr] || context[attr]?.attributes[attr]?.value; - - return applyTrim && value ? trim(value) : value; - } - - return selector ? context.querySelector(selector) : context; -} - -function qall(context, selector, attrArg, applyTrim = true) { - const attr = attrArg === true ? 'textContent' : attrArg; - - if (attr) { - return Array.from(context.querySelectorAll(selector), el => (applyTrim && el[attr] ? trim(el[attr]) : el[attr])); - } - - return Array.from(context.querySelectorAll(selector)); -} - -function qhtml(context, selector) { - const el = q(context, selector, null, true); - - return el && el.innerHTML; -} - -function qtexts(context, selector, applyTrim = true, filter = true) { - const el = q(context, selector, null, applyTrim); - if (!el) return null; - - const nodes = Array.from(el.childNodes) - .filter(node => node.nodeName === '#text') - .map(node => (applyTrim ? trim(node.textContent) : node.textContent)); - - return filter ? nodes.filter(Boolean) : nodes; -} - -function qtext(context, selector, applyTrim = true) { - const nodes = qtexts(context, selector, applyTrim, true); - if (!nodes) return null; - - const text = nodes.join(' '); - - return applyTrim ? trim(text) : text; -} - -function qmeta(context, selector, attrArg = 'content', applyTrim = true) { - if (/meta\[.*\]/.test(selector)) { - return q(context, selector, attrArg, applyTrim); - } - - return q(context, `meta[${selector}]`, attrArg, applyTrim); -} - -function qdate(context, selector, format, match, attr = 'textContent') { - const dateString = q(context, selector, attr, true); - - if (!dateString) return null; - - return extractDate(dateString, format, match); -} - -function qimage(context, selector = 'img', attr = 'src', protocol = 'https') { - const image = q(context, selector, attr); - - // no attribute means q output will be HTML element - return attr ? 
prefixProtocol(image, protocol) : image; -} - -function qimages(context, selector = 'img', attr = 'src', protocol = 'https') { - const images = qall(context, selector, attr); - - return attr ? images.map(image => prefixProtocol(image, protocol)) : images; -} - -function qurl(context, selector = 'a', attr = 'href', protocol = 'https') { - const url = q(context, selector, attr); - - return attr ? prefixProtocol(url, protocol) : url; -} - -function qurls(context, selector = 'a', attr = 'href', protocol = 'https') { - const urls = qall(context, selector, attr); - - return attr ? urls.map(url => prefixProtocol(url, protocol)) : urls; -} - -function qposter(context, selector = 'video', attr = 'poster', protocol = 'https') { - const poster = q(context, selector, attr); - - return attr ? prefixProtocol(poster, protocol) : poster; -} - -function qtrailer(context, selector = 'source', attr = 'src', protocol = 'https') { - const trailer = q(context, selector, attr); - - return attr ? prefixProtocol(trailer, protocol) : trailer; -} - -function qtrailers(context, selector = 'source', attr = 'src', protocol = 'https') { - const trailers = qall(context, selector, attr); - - return attr ? 
trailers.map(trailer => prefixProtocol(trailer, protocol)) : trailers; -} - -function qlength(context, selector, match, attr = 'textContent') { - const durationString = q(context, selector, attr); - - if (!durationString) return null; - const duration = durationString.match(match || /(\d+:)?\d+:\d+/); - - if (duration) { - const segments = ['00'].concat(duration[0].split(':')).slice(-3); - - return moment.duration(segments.join(':')).asSeconds(); - } - - return null; -} - -const funcs = { - q, - qa: qall, - qall, - qd: qdate, - qdate, - qh: qhtml, - qhtml, - qi: qimage, - qimage, - qimages, - qis: qimages, - ql: qlength, - qlength, - qm: qmeta, - qmeta, - qp: qposter, - qposter, - qs: qall, - qt: qtrailer, - qtext, - qtexts, - qtrailer, - qtrailers, - qts: qtrailers, - qtx: qtext, - qtxs: qtexts, - qtxt: qtext, - qtxts: qtexts, - qu: qurl, - qurl, - qurls, - qus: qurls, -}; - -function init(element, window) { - if (!element) return null; - - const contextFuncs = Object.entries(funcs) // dynamically attach methods with context - .reduce((acc, [key, func]) => ({ - ...acc, - [key]: (...args) => (window && args[0] instanceof window.HTMLElement // allow for different context - ? 
func(...args) - : func(element, ...args)), - }), {}); - - return { - element, - el: element, - html: element.outerHTML || element.body.outerHTML, - text: trim(element.textContent), - ...(window && { - window, - document: window.document, - }), - ...contextFuncs, - }; -} - -function initAll(context, selector, window) { - if (Array.isArray(context)) { - return context.map(element => init(element, window)); - } - - return Array.from(context.querySelectorAll(selector)) - .map(element => init(element, window)); -} - -function extract(html, selector) { - const { window } = new JSDOM(html); - - if (selector) { - return init(window.document.querySelector(selector), window); - } - - return init(window.document, window); -} - -function extractAll(html, selector) { - const { window } = new JSDOM(html); - - return initAll(window.document, selector, window); -} - -async function get(url, selector, headers, all = false) { - const res = await http.get(url, { - headers, - }); - - if (res.statusCode === 200) { - const item = all - ? extractAll(res.body.toString(), selector) - : extract(res.body.toString(), selector); - - return { - item, - items: all ? 
item : [item], - res, - ok: true, - status: res.statusCode, - }; - } - - return { - item: null, - items: [], - res, - ok: false, - status: res.statusCode, - }; -} - -async function getAll(url, selector, headers) { - return get(url, selector, headers, true); -} - -module.exports = { - extractDate, - extract, - extractAll, - init, - initAll, - formatDate, - get, - getAll, - context: init, - contextAll: initAll, - ed: extractDate, - ex: extract, - exa: extractAll, - fd: formatDate, - ctx: init, - ctxa: initAll, - geta: getAll, - edate: extractDate, - fdate: formatDate, - ...funcs, -}; +module.exports = qu; diff --git a/src/utils/qu.js b/src/utils/qu.js new file mode 100644 index 00000000..d9563092 --- /dev/null +++ b/src/utils/qu.js @@ -0,0 +1,346 @@ +'use strict'; + +const { JSDOM } = require('jsdom'); +const moment = require('moment'); +const http = require('./http'); + +function trim(str) { + if (!str) return null; + return str.trim().replace(/\s+/g, ' '); +} + +function extractDate(dateString, format, match) { + if (match) { + const dateStamp = trim(dateString).match(match); + + if (dateStamp) { + const dateValue = moment.utc(dateStamp[0], format); + + return dateValue.isValid() ? dateValue.toDate() : null; + } + return null; + } + + const dateValue = moment.utc(trim(dateString), format); + + return dateValue.isValid() ? dateValue.toDate() : null; +} + +function formatDate(dateValue, format, inputFormat) { + if (inputFormat) { + return moment(dateValue, inputFormat).format(format); + } + + return moment(dateValue).format(format); +} + +function prefixProtocol(urlValue, protocol = 'https') { + if (protocol && /^\/\//.test(urlValue)) { + return `${protocol}:${urlValue}`; + } + + return urlValue; +} + +function q(context, selector, attrArg, applyTrim = true) { + const attr = attrArg === true ? 'textContent' : attrArg; + + if (attr) { + const value = selector + ? 
context.querySelector(selector)?.[attr] || context.querySelector(selector)?.attributes[attr]?.value + : context[attr] || context.attributes?.[attr]?.value; + + return applyTrim && value ? trim(value) : value; + } + + return selector ? context.querySelector(selector) : context; +} + +function all(context, selector, attrArg, applyTrim = true) { + const attr = attrArg === true ? 'textContent' : attrArg; + + if (attr) { + return Array.from(context.querySelectorAll(selector), el => (applyTrim && el[attr] ? trim(el[attr]) : el[attr])); + } + + return Array.from(context.querySelectorAll(selector)); +} + +function exists(context, selector) { + return !!q(context, selector); +} + +function content(context, selector) { + const el = q(context, selector, null, true); + + return el && el.innerHTML; +} + +function texts(context, selector, applyTrim = true, filter = true) { + const el = q(context, selector, null, applyTrim); + if (!el) return null; + + const nodes = Array.from(el.childNodes) + .filter(node => node.nodeName === '#text') + .map(node => (applyTrim ? trim(node.textContent) : node.textContent)); + + return filter ? nodes.filter(Boolean) : nodes; +} + +function text(context, selector, applyTrim = true) { + const nodes = texts(context, selector, applyTrim, true); + if (!nodes) return null; + + const textValue = nodes.join(' '); + + return applyTrim ? 
trim(textValue) : textValue; +} + +function meta(context, selector, attrArg = 'content', applyTrim = true) { + if (/meta\[.*\]/.test(selector)) { + return q(context, selector, attrArg, applyTrim); + } + + return q(context, `meta[${selector}]`, attrArg, applyTrim); +} + +function date(context, selector, format, match, attr = 'textContent') { + const dateString = q(context, selector, attr, true); + + if (!dateString) return null; + + return extractDate(dateString, format, match); +} + +function image(context, selector = 'img', attr = 'src', protocol = 'https') { + const imageEl = q(context, selector, attr); + + // no attribute means q output will be HTML element + return attr ? prefixProtocol(imageEl, protocol) : imageEl; +} + +function images(context, selector = 'img', attr = 'src', protocol = 'https') { + const imageEls = all(context, selector, attr); + + return attr ? imageEls.map(imageEl => prefixProtocol(imageEl, protocol)) : imageEls; +} + +function url(context, selector = 'a', attr = 'href', protocol = 'https') { + const urlEl = q(context, selector, attr); + + return attr ? prefixProtocol(urlEl, protocol) : urlEl; +} + +function urls(context, selector = 'a', attr = 'href', protocol = 'https') { + const urlEls = all(context, selector, attr); + + return attr ? urlEls.map(urlEl => prefixProtocol(urlEl, protocol)) : urlEls; +} + +function poster(context, selector = 'video', attr = 'poster', protocol = 'https') { + const posterEl = q(context, selector, attr); + + return attr ? prefixProtocol(posterEl, protocol) : posterEl; +} + +function video(context, selector = 'source', attr = 'src', protocol = 'https') { + const trailerEl = q(context, selector, attr); + + return attr ? prefixProtocol(trailerEl, protocol) : trailerEl; +} + +function videos(context, selector = 'source', attr = 'src', protocol = 'https') { + const trailerEls = all(context, selector, attr); + + return attr ? 
trailerEls.map(trailerEl => prefixProtocol(trailerEl, protocol)) : trailerEls; +} + +function duration(context, selector, match, attr = 'textContent') { + const durationString = q(context, selector, attr); + + if (!durationString) return null; + const durationMatch = durationString.match(match || /(\d+:)?\d+:\d+/); + + if (durationMatch) { + const segments = ['00'].concat(durationMatch[0].split(':')).slice(-3); + + return moment.duration(segments.join(':')).asSeconds(); + } + + return null; +} + +const legacyFuncs = { + q, + qa: all, + qall: all, + qd: date, + qdate: date, + qh: content, + qhtml: content, + qi: image, + qimage: image, + qimages: images, + qis: images, + ql: duration, + qlength: duration, + qm: meta, + qmeta: meta, + qp: poster, + qposter: poster, + qs: all, + qt: video, + qtext: text, + qtexts: texts, + qtrailer: video, + qtrailers: videos, + qts: videos, + qtx: text, + qtxs: texts, + qtxt: text, + qtxts: texts, + // qu: url, + qurl: url, + qurls: urls, + qus: urls, +}; + +const quFuncs = { + all, + body: content, + content, + date, + dur: duration, + duration, + exists, + image, + images, + img: image, + imgs: images, + inner: content, + length: duration, + meta, + poster, + q, + text, + texts, + trailer: video, + url, + urls, + video, + videos, +}; + +function init(element, window) { + if (!element) return null; + + const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context + .reduce((acc, [key, func]) => ({ + ...acc, + [key]: (...args) => (window && args[0] instanceof window.HTMLElement // allow for different context + ? func(...args) + : func(element, ...args)), + }), {}); + + const quContextFuncs = Object.entries(quFuncs) // dynamically attach methods with context + .reduce((acc, [key, func]) => ({ + ...acc, + [key]: (...args) => (window && args[0] instanceof window.HTMLElement // allow for different context + ? 
func(...args) + : func(element, ...args)), + }), {}); + + return { + element, + el: element, + html: element.outerHTML || element.body.outerHTML, + text: trim(element.textContent), + ...(window && { + window, + document: window.document, + }), + ...legacyContextFuncs, + qu: quContextFuncs, + }; +} + +function initAll(context, selector, window) { + if (Array.isArray(context)) { + return context.map(element => init(element, window)); + } + + return Array.from(context.querySelectorAll(selector)) + .map(element => init(element, window)); +} + +function extract(htmlValue, selector) { + const { window } = new JSDOM(htmlValue); + + if (selector) { + return init(window.document.querySelector(selector), window); + } + + return init(window.document, window); +} + +function extractAll(htmlValue, selector) { + const { window } = new JSDOM(htmlValue); + + return initAll(window.document, selector, window); +} + +async function get(urlValue, selector, headers, queryAll = false) { + const res = await http.get(urlValue, { + headers, + }); + + if (res.statusCode === 200) { + const item = queryAll + ? extractAll(res.body.toString(), selector) + : extract(res.body.toString(), selector); + + return { + item, + items: queryAll ? item : [item], + res, + ok: true, + status: res.statusCode, + }; + } + + return { + item: null, + items: [], + res, + ok: false, + status: res.statusCode, + }; +} + +async function getAll(urlValue, selector, headers) { + return get(urlValue, selector, headers, true); +} + +module.exports = { + extractDate, + extract, + extractAll, + init, + initAll, + formatDate, + get, + getAll, + context: init, + contextAll: initAll, + ed: extractDate, + ex: extract, + exa: extractAll, + fd: formatDate, + ctx: init, + ctxa: initAll, + geta: getAll, + edate: extractDate, + fdate: formatDate, + qu: quFuncs, + ...legacyFuncs, +};