'use strict'; const { JSDOM } = require('jsdom'); const moment = require('moment'); const http = require('./http'); const virtualConsole = require('./virtual-console')(__filename); const { window: globalWindow } = new JSDOM('', { virtualConsole }); function trim(str) { if (typeof str !== 'string') { return str; } return str.trim().replace(/\s+/g, ' '); } function extractDate(dateString, format, match) { if (match) { const dateStamp = trim(dateString).match(match); if (dateStamp) { const dateValue = moment.utc(dateStamp[0], format); return dateValue.isValid() ? dateValue.toDate() : null; } return null; } const dateValue = moment.utc(trim(dateString), format); return dateValue.isValid() ? dateValue.toDate() : null; } function formatDate(dateValue, format, inputFormat) { if (inputFormat) { return moment(dateValue, inputFormat).format(format); } return moment(dateValue).format(format); } function durationToSeconds(durationString, match) { const durationMatch = durationString.match(match || /(\d+:)?\d+:\d+/); if (durationMatch) { const segments = ['00'].concat(durationMatch[0].split(/[:hm]/)).slice(-3); return moment.duration(segments.join(':')).asSeconds(); } return null; } function prefixUrl(urlValue, origin, protocol = 'https') { if (!urlValue) { return null; } if (/^http/.test(urlValue)) { return urlValue; } if (protocol && /^\/\//.test(urlValue)) { return `${protocol}:${urlValue}`; } if (origin && /^\//.test(urlValue)) { return `${origin}${urlValue}`; } if (origin && /^\.\//.test(urlValue)) { return `${origin}${urlValue.slice(1)}`; } if (origin) { return `${origin}/${urlValue}`; } return urlValue; } function q(context, selector, attrArg, applyTrim = true) { if (!selector && context.nodeName === '#document') { return null; } const attr = attrArg === true ? 'textContent' : attrArg; if (attr) { const value = selector ? context.querySelector(selector)?.[attr] || context.querySelector(selector)?.attributes[attr]?.value : context[attr] || context.getAttribute(attr); return applyTrim && typeof value === 'string' ? trim(value) : value; } return selector ? context.querySelector(selector) : context; } function all(context, selector, attrArg, applyTrim = true) { const attr = attrArg === true ? 'textContent' : attrArg; if (attr) { return Array.from(context.querySelectorAll(selector), (el) => q(el, null, attr, applyTrim)); } return Array.from(context.querySelectorAll(selector)); } function exists(context, selector) { return !!q(context, selector); } function count(context, selector) { return all(context, selector)?.length || 0; } function content(context, selector, applyTrim = true) { return q(context, selector, 'textContent', applyTrim); } function contents(context, selector, applyTrim) { return all(context, selector, 'textContent', applyTrim); } function html(context, selector) { const el = q(context, selector, null, true); return el && el.innerHTML; } function json(context, selector) { const el = q(context, selector, null, true); try { return JSON.parse(el?.innerHTML); } catch (error) { return null; } } function jsons(context, selector) { const els = all(context, selector, null, true); return els.map((el) => { try { return JSON.parse(el?.innerHTML); } catch (error) { return null; } }); } function htmls(context, selector) { const els = all(context, selector, null, true); return els.map((el) => el.innerHTML); } function texts(context, selector, applyTrim = true, filter = true) { const el = q(context, selector, null, applyTrim); if (!el) return null; const nodes = Array.from(el.childNodes) .filter((node) => node.nodeName === '#text') .map((node) => (applyTrim ? trim(node.textContent) : node.textContent)); return filter ? nodes.filter(Boolean) : nodes; } function text(context, selector, applyTrim = true) { const nodes = texts(context, selector, applyTrim, true); if (!nodes) return null; const textValue = nodes.join(' '); return applyTrim ? trim(textValue) : textValue; } function removeStyleFunctionSpaces(el) { // jsdom appears to have a bug where it ignores inline CSS attributes set to a function() containing spaces, e.g. url( image.png ) el.setAttribute('style', el.getAttribute('style').replace(/\(\s+(.*)\s+\)/g, (match, cssArgs) => `(${cssArgs})`)); } function style(context, selector, styleAttr) { const el = q(context, selector); if (el?.hasAttribute('style')) { removeStyleFunctionSpaces(el); return styleAttr ? el.style[styleAttr] : el.style; } return null; } function styles(context, selector, styleAttr) { const elStyles = Array.from(context.querySelectorAll(selector), (el) => { removeStyleFunctionSpaces(el); return styleAttr ? el.style[styleAttr] : el.style; }); return elStyles; } function number(context, selector, match = /\d+(\.\d*)?/, attr = 'textContent') { const value = q(context, selector, attr); if (value && match) { return Number(value.match(match)?.[0]); } if (value) { return Number(value); } return null; } function meta(context, selector, attrArg = 'content', applyTrim = true) { if (/meta\[.*\]/.test(selector)) { return q(context, selector, attrArg, applyTrim); } return q(context, `meta[${selector}]`, attrArg, applyTrim); } function date(context, selector, format, match, attr = 'textContent') { const dateString = q(context, selector, attr, true); if (!dateString) return null; return extractDate(dateString, format, match); } function dateAgo(context, selector, match = /(\d+)\s*(\w+)/, attr = 'textContent') { const timeString = q(context, selector, attr, 'textContent'); if (!timeString) { return null; } const timeMatch = timeString.match(match); if (timeMatch) { const [n, period] = timeMatch.slice(1); const thenDate = moment.utc().subtract(Number(n), period); return { date: thenDate.toDate(), precision: period.replace(/s$/, ''), }; } return null; } function image(context, selector = 'img', attr, { origin, protocol = 'https' } = {}) { const imageEl = (attr && q(context, selector, attr)) || q(context, selector, 'data-src') || q(context, selector, 'src'); return prefixUrl(imageEl, origin, protocol); } function images(context, selector = 'img', attr, { origin, protocol = 'https' } = {}) { const attribute = attr || (q(context, selector, 'data-src') && 'data-src') || (q(context, selector, 'src') && 'src'); const imageEls = all(context, selector, attribute); return imageEls.map((imageEl) => prefixUrl(imageEl, origin, protocol)); } function url(context, selector = 'a', attr = 'href', { origin, protocol = 'https', object = false } = {}) { const urlEl = q(context, selector, attr); const prefixedUrl = prefixUrl(urlEl, origin, protocol); if (prefixedUrl && object) { return new URL(prefixedUrl); } return prefixedUrl; } function urls(context, selector = 'a', attr = 'href', { origin, protocol = 'https' } = {}) { const urlEls = all(context, selector, attr); return attr ? urlEls.map((urlEl) => prefixUrl(urlEl, origin, protocol)) : urlEls; } function sourceSet(context, selector, attr = 'srcset', options = {}) { const srcset = q(context, selector, attr); if (!srcset) { return null; } const sources = srcset .split(/\s*,\s*/) .map((source) => { const [link, descriptor] = source.split(' '); if (link) { return { descriptor: descriptor || 'fallback', url: prefixUrl(link, options.origin, options.protocol), }; } return null; }) .filter(Boolean) .sort((sourceA, sourceB) => { if (sourceB.descriptor === 'fallback' || parseInt(sourceA.descriptor, 10) > parseInt(sourceB.descriptor, 10)) { return -1; } if (parseInt(sourceA.descriptor, 10) < parseInt(sourceB.descriptor, 10)) { return 1; } return 0; }); if (options.includeDescriptor) { return sources; } return sources.map((source) => source.url); } function poster(context, selector = 'video', attr = 'poster', { origin, protocol = 'https' } = {}) { const posterEl = q(context, selector, attr); return attr ? prefixUrl(posterEl, origin, protocol) : posterEl; } function video(context, selector = 'source', attr = 'src', { origin, protocol = 'https' } = {}) { const trailerEl = q(context, selector, attr); return attr ? prefixUrl(trailerEl, origin, protocol) : trailerEl; } function videos(context, selector = 'source', attr = 'src', { origin, protocol = 'https' } = {}) { const trailerEls = all(context, selector, attr); return attr ? trailerEls.map((trailerEl) => prefixUrl(trailerEl, origin, protocol)) : trailerEls; } function duration(context, selector, match, attr = 'textContent') { const durationString = q(context, selector, attr); if (!durationString) { return null; } const durationMatch = durationToSeconds(durationString, match); if (durationMatch) { return durationMatch; } const timestampMatch = durationString.match(/(\d+H)?\s*(\d+M)?\s*\d+S?/i); if (timestampMatch) { const hours = timestampMatch[0].match(/(\d+)H/i)?.[1] || 0; const minutes = timestampMatch[0].match(/(\d+)M/i)?.[1] || 0; const seconds = timestampMatch[0].match(/(\d+)(S|$)/i)?.[1] || 0; return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds); } return null; } const legacyFuncs = { q, qa: all, qall: all, qd: date, qdate: date, qh: html, qhtml: html, qi: image, qimage: image, qimages: images, qis: images, ql: duration, qlength: duration, qm: meta, qmeta: meta, qp: poster, qposter: poster, qs: all, qt: video, qtext: text, qtexts: texts, qtrailer: video, qtrailers: videos, qts: videos, qtx: text, qtxs: texts, qtxt: text, qtxts: texts, // qu: url, qurl: url, qurls: urls, qus: urls, }; const quFuncs = { all, cnt: content, cnts: contents, content, contents, count, date, dateAgo, dur: duration, duration, el: q, element: q, exists, html, htmls, image, images, img: image, imgs: images, json, jsons, length: duration, meta, num: number, number, poster, q, sourceSet, sources: sourceSet, srcs: sourceSet, srcset: sourceSet, style, styles, text, texts, trailer: video, url, urls, video, videos, }; function init(context, selector, window) { if (!context) { return null; } const element = selector ? context.querySelector(selector) : context; if (!element) { return null; } const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context .reduce((acc, [key, func]) => ({ ...acc, [key]: (...args) => (args[0] instanceof globalWindow.HTMLElement // allow for different context ? func(...args) : func(element, ...args)), }), {}); const quContextFuncs = Object.entries(quFuncs) // dynamically attach methods with context .reduce((acc, [key, func]) => ({ ...acc, [key]: (...args) => (args[0]?.nodeType === undefined // allow for different context ? func(element, ...args) : func(...args)), }), {}); return { element, el: element, html: element.outerHTML || element.body.outerHTML, text: trim(element.textContent), ...(window && { window, document: window.document, }), ...legacyContextFuncs, qu: quContextFuncs, query: quContextFuncs, }; } function initAll(context, selector, window) { if (Array.isArray(context)) { return context.map((element) => init(element, null, window)); } return Array.from(context.querySelectorAll(selector)) .map((element) => init(element, null, window)); } function extract(htmlValue, selector, options) { const { window } = new JSDOM(htmlValue, { virtualConsole, ...options }); return init(window.document, selector, window); } function extractAll(htmlValue, selector, options) { const { window } = new JSDOM(htmlValue, { virtualConsole, ...options }); return initAll(window.document, selector, window); } async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) { const res = await (method === 'post' ? http.post(urlValue, body, { ...options, headers }) : http[method](urlValue, { ...options, headers, parse: true, })); if (res.ok) { const item = queryAll ? initAll(res.document, selector, res.window) : init(res.document, selector, res.window); return { item, items: all ? item : [item], res, ok: true, status: res.statusCode, ...res, }; } return { item: null, items: [], res, ok: false, status: res.statusCode, ...res, }; } async function get(urlValue, selector, headers, options) { return request('get', urlValue, null, selector, headers, options, false); } async function post(urlValue, body, selector, headers, options) { return request('post', urlValue, body, selector, headers, options, false); } async function getAll(urlValue, selector, headers, options) { return request('get', urlValue, null, selector, headers, options, true); } async function postAll(urlValue, body, selector, headers, options) { return request('post', urlValue, body, selector, headers, options, true); } function session(headers, options) { return http.session(headers, options); } module.exports = { extractDate, extract, extractAll, durationToSeconds, init, initAll, formatDate, get, getAll, http, fetch: get, fetchAll: getAll, context: init, contextAll: initAll, ed: extractDate, ex: extract, exa: extractAll, fd: formatDate, parseDate: extractDate, ctx: init, ctxa: initAll, geta: getAll, qu: quFuncs, query: quFuncs, post, postAll, prefixUrl, session, ...legacyFuncs, };