'use strict'; const { JSDOM } = require('jsdom'); const moment = require('moment'); const http = require('./http'); const { window: globalWindow } = new JSDOM(''); function trim(str) { if (typeof str !== 'string') { return str; } return str.trim().replace(/\s+/g, ' '); } function extractDate(dateString, format, match) { if (match) { const dateStamp = trim(dateString).match(match); if (dateStamp) { const dateValue = moment.utc(dateStamp[0], format); return dateValue.isValid() ? dateValue.toDate() : null; } return null; } const dateValue = moment.utc(trim(dateString), format); return dateValue.isValid() ? dateValue.toDate() : null; } function formatDate(dateValue, format, inputFormat) { if (inputFormat) { return moment(dateValue, inputFormat).format(format); } return moment(dateValue).format(format); } function prefixUrl(urlValue, origin, protocol = 'https') { if (protocol && /^\/\//.test(urlValue)) { return `${protocol}:${urlValue}`; } if (origin && /^\//.test(urlValue)) { return `${origin}${urlValue}`; } return urlValue; } function q(context, selector, attrArg, applyTrim = true) { const attr = attrArg === true ? 'textContent' : attrArg; if (attr) { const value = selector ? context.querySelector(selector)?.[attr] || context.querySelector(selector)?.attributes[attr]?.value : context[attr] || context.getAttribute(attr); return applyTrim && typeof value === 'string' ? trim(value) : value; } return selector ? context.querySelector(selector) : context; } function all(context, selector, attrArg, applyTrim = true) { const attr = attrArg === true ? 'textContent' : attrArg; if (attr) { return Array.from(context.querySelectorAll(selector), el => q(el, null, attr, applyTrim)); } return Array.from(context.querySelectorAll(selector)); } function exists(context, selector) { return !!q(context, selector); } function content(context, selector, applyTrim = true) { return q(context, selector, 'textContent', applyTrim); } function contents(context, selector, applyTrim) { return all(context, selector, 'textContent', applyTrim); } function html(context, selector) { const el = q(context, selector, null, true); return el && el.innerHTML; } function texts(context, selector, applyTrim = true, filter = true) { const el = q(context, selector, null, applyTrim); if (!el) return null; const nodes = Array.from(el.childNodes) .filter(node => node.nodeName === '#text') .map(node => (applyTrim ? trim(node.textContent) : node.textContent)); return filter ? nodes.filter(Boolean) : nodes; } function text(context, selector, applyTrim = true) { const nodes = texts(context, selector, applyTrim, true); if (!nodes) return null; const textValue = nodes.join(' '); return applyTrim ? trim(textValue) : textValue; } function removeStyleFunctionSpaces(el) { // jsdom appears to have a bug where it ignores inline CSS attributes set to a function() containing spaces, e.g. url( image.png ) el.setAttribute('style', el.getAttribute('style').replace(/\(\s+(.*)\s+\)/g, (match, cssArgs) => `(${cssArgs})`)); } function style(context, selector, styleAttr) { const el = q(context, selector); if (el) { removeStyleFunctionSpaces(el); return styleAttr ? el.style[styleAttr] : el.style; } return null; } function styles(context, selector, styleAttr) { const elStyles = Array.from(context.querySelectorAll(selector), (el) => { removeStyleFunctionSpaces(el); return styleAttr ? el.style[styleAttr] : el.style; }); return elStyles; } function number(context, selector, match = /\d+/, attr = 'textContent') { const value = q(context, selector, attr); if (value && match) { return Number(value.match(match)?.[0]); } if (value) { return Number(value); } return null; } function meta(context, selector, attrArg = 'content', applyTrim = true) { if (/meta\[.*\]/.test(selector)) { return q(context, selector, attrArg, applyTrim); } return q(context, `meta[${selector}]`, attrArg, applyTrim); } function date(context, selector, format, match, attr = 'textContent') { const dateString = q(context, selector, attr, true); if (!dateString) return null; return extractDate(dateString, format, match); } function image(context, selector = 'img', attr, { origin, protocol = 'https' } = {}) { const imageEl = (attr && q(context, selector, attr)) || q(context, selector, 'data-src') || q(context, selector, 'src'); return prefixUrl(imageEl, origin, protocol); } function images(context, selector = 'img', attr, { origin, protocol = 'https' } = {}) { const attribute = attr || (q(context, selector, 'data-src') && 'data-src') || (q(context, selector, 'src') && 'src'); const imageEls = all(context, selector, attribute); return imageEls.map(imageEl => prefixUrl(imageEl, origin, protocol)); } function url(context, selector = 'a', attr = 'href', { origin, protocol = 'https' } = {}) { const urlEl = q(context, selector, attr); return attr ? prefixUrl(urlEl, origin, protocol) : urlEl; } function urls(context, selector = 'a', attr = 'href', { origin, protocol = 'https' } = {}) { const urlEls = all(context, selector, attr); return attr ? urlEls.map(urlEl => prefixUrl(urlEl, origin, protocol)) : urlEls; } function poster(context, selector = 'video', attr = 'poster', { origin, protocol = 'https' } = {}) { const posterEl = q(context, selector, attr); return attr ? prefixUrl(posterEl, origin, protocol) : posterEl; } function video(context, selector = 'source', attr = 'src', { origin, protocol = 'https' } = {}) { const trailerEl = q(context, selector, attr); return attr ? prefixUrl(trailerEl, origin, protocol) : trailerEl; } function videos(context, selector = 'source', attr = 'src', { origin, protocol = 'https' } = {}) { const trailerEls = all(context, selector, attr); return attr ? trailerEls.map(trailerEl => prefixUrl(trailerEl, origin, protocol)) : trailerEls; } function duration(context, selector, match, attr = 'textContent') { const durationString = q(context, selector, attr); if (!durationString) return null; const durationMatch = durationString.match(match || /(\d+:)?\d+:\d+/); if (durationMatch) { const segments = ['00'].concat(durationMatch[0].split(':')).slice(-3); return moment.duration(segments.join(':')).asSeconds(); } const timestampMatch = durationString.match(/T(\d+H)?(\d+M)?\d+S/); if (timestampMatch) { const hours = timestampMatch[0].match(/(\d+)H/)?.[1] || 0; const minutes = timestampMatch[0].match(/(\d+)M/)?.[1] || 0; const seconds = timestampMatch[0].match(/(\d+)S/)?.[1] || 0; return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds); } return null; } const legacyFuncs = { q, qa: all, qall: all, qd: date, qdate: date, qh: html, qhtml: html, qi: image, qimage: image, qimages: images, qis: images, ql: duration, qlength: duration, qm: meta, qmeta: meta, qp: poster, qposter: poster, qs: all, qt: video, qtext: text, qtexts: texts, qtrailer: video, qtrailers: videos, qts: videos, qtx: text, qtxs: texts, qtxt: text, qtxts: texts, // qu: url, qurl: url, qurls: urls, qus: urls, }; const quFuncs = { all, html, content, contents, cnt: content, cnts: contents, date, dur: duration, duration, element: q, el: q, exists, image, images, img: image, imgs: images, length: duration, meta, number, num: number, poster, q, style, styles, text, texts, trailer: video, url, urls, video, videos, }; function init(element, window) { if (!element) return null; const legacyContextFuncs = Object.entries(legacyFuncs) // dynamically attach methods with context .reduce((acc, [key, func]) => ({ ...acc, [key]: (...args) => (args[0] instanceof globalWindow.HTMLElement // allow for different context ? func(...args) : func(element, ...args)), }), {}); const quContextFuncs = Object.entries(quFuncs) // dynamically attach methods with context .reduce((acc, [key, func]) => ({ ...acc, [key]: (...args) => (args[0].nodeType === undefined // allow for different context ? func(element, ...args) : func(...args)), }), {}); return { element, el: element, html: element.outerHTML || element.body.outerHTML, text: trim(element.textContent), ...(window && { window, document: window.document, }), ...legacyContextFuncs, qu: quContextFuncs, query: quContextFuncs, }; } function initAll(context, selector, window) { if (Array.isArray(context)) { return context.map(element => init(element, window)); } return Array.from(context.querySelectorAll(selector)) .map(element => init(element, window)); } function extract(htmlValue, selector) { const { window } = new JSDOM(htmlValue); if (selector) { return init(window.document.querySelector(selector), window); } return init(window.document, window); } function extractAll(htmlValue, selector) { const { window } = new JSDOM(htmlValue); return initAll(window.document, selector, window); } async function get(urlValue, selector, headers, options, queryAll = false) { const res = await http.get(urlValue, headers, options); if (res.statusCode === 200) { const item = queryAll ? extractAll(res.body.toString(), selector) : extract(res.body.toString(), selector); return { item, items: all ? item : [item], res, ok: true, status: res.statusCode, ...res, }; } return { item: null, items: [], res, ok: false, status: res.statusCode, ...res, }; } async function getAll(urlValue, selector, headers, options) { return get(urlValue, selector, headers, options, true); } module.exports = { extractDate, extract, extractAll, init, initAll, formatDate, get, getAll, context: init, contextAll: initAll, ed: extractDate, ex: extract, exa: extractAll, fd: formatDate, parseDate: extractDate, ctx: init, ctxa: initAll, geta: getAll, qu: quFuncs, query: quFuncs, prefixUrl, ...legacyFuncs, };