606 lines
13 KiB
JavaScript
606 lines
13 KiB
JavaScript
'use strict';
|
|
|
|
const { JSDOM } = require('jsdom');
|
|
const moment = require('moment');
|
|
const http = require('./http');
|
|
const virtualConsole = require('./virtual-console')(__filename);
|
|
|
|
// Module-wide JSDOM window built from an empty document; gives this module access to DOM interfaces such as HTMLElement.
const { window: globalWindow } = new JSDOM('', { virtualConsole });
|
|
|
|
/**
 * Strip leading/trailing whitespace and collapse every inner whitespace run
 * into a single space.
 *
 * @param {*} str - Value to normalise.
 * @returns {*} The cleaned string, or the untouched input when it is not a string.
 */
function trim(str) {
  return typeof str === 'string'
    ? str.trim().replace(/\s+/g, ' ')
    : str;
}
|
|
|
|
/**
 * Parse a date out of a string as UTC using a moment format.
 *
 * @param {string} dateString - Raw text containing the date.
 * @param {string|string[]} format - Format handed to `moment.utc`.
 * @param {RegExp|string} [match] - Optional pattern; when given, only the
 *   first match of the whitespace-normalised string is parsed.
 * @returns {Date|null} The parsed date, or null when the pattern does not
 *   match or moment reports the value as invalid.
 */
function extractDate(dateString, format, match) {
  let source = trim(dateString);

  if (match) {
    const dateStamp = source.match(match);

    if (!dateStamp) {
      return null;
    }

    [source] = dateStamp;
  }

  const parsed = moment.utc(source, format);

  if (!parsed.isValid()) {
    return null;
  }

  return parsed.toDate();
}
|
|
|
|
/**
 * Render a date value as a string through moment.
 *
 * @param {*} dateValue - Anything moment can parse.
 * @param {string} format - Output format passed to `.format()`.
 * @param {string|string[]} [inputFormat] - Format used to parse `dateValue`
 *   when it is supplied; otherwise moment's default parsing is used.
 * @returns {string} The formatted date.
 */
function formatDate(dateValue, format, inputFormat) {
  const parsed = inputFormat
    ? moment(dateValue, inputFormat)
    : moment(dateValue);

  return parsed.format(format);
}
|
|
|
|
/**
 * Convert a clock-style duration (`mm:ss`, `h:mm:ss`, or a custom pattern
 * using `h`/`m` separators) into seconds.
 *
 * @param {string} durationString - Text containing the duration.
 * @param {RegExp|string} [match] - Pattern locating the duration; defaults to
 *   `/(\d+:)?\d+:\d+/`.
 * @returns {number|null} Seconds as computed by `moment.duration`, or null
 *   when the pattern does not match.
 */
function durationToSeconds(durationString, match) {
  const pattern = match || /(\d+:)?\d+:\d+/;
  const found = durationString.match(pattern);

  if (!found) {
    return null;
  }

  // prepend a zero hours segment, then keep the last three segments
  const parts = ['00', ...found[0].split(/[:hm]/)].slice(-3);
  const clock = parts.join(':');

  return moment.duration(clock).asSeconds();
}
|
|
|
|
/**
 * Turn a possibly relative URL into an absolute one.
 *
 * Resolution order:
 *  1. empty value            -> null
 *  2. value with a scheme    -> returned untouched (`https:`, `data:`, `blob:`, ...)
 *  3. protocol-relative `//` -> `${protocol}:` is prepended
 *  4. root-relative `/`      -> origin is prepended
 *  5. `./` relative          -> origin is prepended, the leading dot dropped
 *  6. anything else          -> `${origin}/${urlValue}`
 * Without an origin, steps 4-6 return the value unchanged.
 *
 * @param {string} urlValue - URL or path to resolve.
 * @param {string} [origin] - Origin such as `https://example.com` (no trailing slash).
 * @param {string} [protocol='https'] - Scheme used for protocol-relative URLs.
 * @returns {string|null} The resolved URL, or null for an empty value.
 */
function prefixUrl(urlValue, origin, protocol = 'https') {
  if (!urlValue) {
    return null;
  }

  // A leading RFC 3986 scheme marks the value as absolute. The previous /^http/
  // check mistook relative paths like "httpdocs/x.jpg" for absolute URLs and
  // glued the origin onto data:/blob: URIs and upper-case schemes.
  if (/^[a-z][a-z\d+.-]*:/i.test(urlValue)) {
    return urlValue;
  }

  if (protocol && /^\/\//.test(urlValue)) {
    return `${protocol}:${urlValue}`;
  }

  if (!origin) {
    return urlValue;
  }

  if (/^\//.test(urlValue)) {
    return `${origin}${urlValue}`;
  }

  if (/^\.\//.test(urlValue)) {
    return `${origin}${urlValue.slice(1)}`;
  }

  return `${origin}/${urlValue}`;
}
|
|
|
|
/**
 * Core query helper: fetch an element, or one of its properties/attributes.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector; when omitted the context itself is used.
 * @param {string|boolean} [attrArg] - Property or attribute name to read;
 *   `true` is shorthand for `'textContent'`. When falsy the element is returned.
 * @param {boolean} [applyTrim=true] - Normalise whitespace of string results.
 * @returns {*} The element, the property/attribute value, or null when called
 *   on a document without a selector.
 */
function q(context, selector, attrArg, applyTrim = true) {
  if (!selector && context.nodeName === '#document') {
    return null;
  }

  const attr = attrArg === true ? 'textContent' : attrArg;

  if (!attr) {
    return selector ? context.querySelector(selector) : context;
  }

  let value;

  if (selector) {
    const target = context.querySelector(selector);

    // prefer the DOM property, fall back to the raw attribute
    value = target?.[attr] || target?.attributes[attr]?.value;
  } else {
    value = context[attr] || context.getAttribute(attr);
  }

  if (applyTrim && typeof value === 'string') {
    return trim(value);
  }

  return value;
}
|
|
|
|
/**
 * Query every element matching a selector, optionally mapping each one to a
 * property/attribute value through `q`.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @param {string|boolean} [attrArg] - Property/attribute to read; `true` means `'textContent'`.
 * @param {boolean} [applyTrim=true] - Forwarded to `q` for each element.
 * @returns {Array} Elements, or their extracted values when `attrArg` is set.
 */
function all(context, selector, attrArg, applyTrim = true) {
  const attr = attrArg === true ? 'textContent' : attrArg;
  const matches = Array.from(context.querySelectorAll(selector));

  if (!attr) {
    return matches;
  }

  return matches.map(el => q(el, null, attr, applyTrim));
}
|
|
|
|
/**
 * Tell whether `q` finds anything for the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @returns {boolean} True when `q` returns a truthy result.
 */
function exists(context, selector) {
  const found = q(context, selector);

  return Boolean(found);
}
|
|
|
|
/**
 * Count the elements matching a selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @returns {number} Number of matches, 0 when there are none.
 */
function count(context, selector) {
  const matches = all(context, selector);

  return matches?.length || 0;
}
|
|
|
|
/**
 * Read the `textContent` of the first element matching the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {boolean} [applyTrim=true] - Normalise whitespace in the result.
 * @returns {*} Whatever `q` yields for the `textContent` property.
 */
function content(context, selector, applyTrim = true) {
  const textValue = q(context, selector, 'textContent', applyTrim);

  return textValue;
}
|
|
|
|
/**
 * Read the `textContent` of every element matching the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @param {boolean} [applyTrim] - Forwarded untouched to `all`; when left
 *   undefined, `all` applies its own default.
 * @returns {Array} Whatever `all` yields for the `textContent` property.
 */
function contents(context, selector, applyTrim) {
  const textValues = all(context, selector, 'textContent', applyTrim);

  return textValues;
}
|
|
|
|
/**
 * Return the inner HTML of the first element matching the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @returns {string|null} The `innerHTML`, or the falsy `q` result when nothing was found.
 */
function html(context, selector) {
  const target = q(context, selector, null, true);

  if (!target) {
    return target;
  }

  return target.innerHTML;
}
|
|
|
|
/**
 * Parse the inner HTML of the first matching element as JSON.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @returns {*} The parsed value, or null when the element is missing or its
 *   content is not valid JSON (parse errors are deliberately swallowed).
 */
function json(context, selector) {
  const target = q(context, selector, null, true);

  try {
    const raw = target?.innerHTML;

    return JSON.parse(raw);
  } catch {
    return null;
  }
}
|
|
|
|
/**
 * Parse the inner HTML of every matching element as JSON.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @returns {Array} One entry per element: the parsed value, or null where
 *   parsing failed.
 */
function jsons(context, selector) {
  const parsed = [];

  for (const el of all(context, selector, null, true)) {
    let value;

    try {
      value = JSON.parse(el?.innerHTML);
    } catch {
      value = null;
    }

    parsed.push(value);
  }

  return parsed;
}
|
|
|
|
/**
 * Return the inner HTML of every element matching the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @returns {string[]} The `innerHTML` of each match, in document order.
 */
function htmls(context, selector) {
  return all(context, selector, null, true).map(({ innerHTML }) => innerHTML);
}
|
|
|
|
/**
 * Collect the direct text nodes of an element, ignoring text nested inside
 * child elements.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {boolean} [applyTrim=true] - Normalise whitespace of each text node.
 * @param {boolean} [filter=true] - Drop empty entries from the result.
 * @returns {string[]|null} Text of each direct `#text` child, or null when
 *   no element was found.
 */
function texts(context, selector, applyTrim = true, filter = true) {
  const el = q(context, selector, null, applyTrim);

  if (!el) {
    return null;
  }

  const nodes = [];

  for (const node of Array.from(el.childNodes)) {
    if (node.nodeName === '#text') {
      nodes.push(applyTrim ? trim(node.textContent) : node.textContent);
    }
  }

  if (!filter) {
    return nodes;
  }

  return nodes.filter(Boolean);
}
|
|
|
|
/**
 * Join the direct text nodes of an element into one string.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {boolean} [applyTrim=true] - Normalise whitespace of the nodes and of the joined result.
 * @returns {string|null} Space-joined text, or null when no element was found.
 */
function text(context, selector, applyTrim = true) {
  const nodes = texts(context, selector, applyTrim, true);

  if (!nodes) {
    return null;
  }

  const joined = nodes.join(' ');

  if (!applyTrim) {
    return joined;
  }

  return trim(joined);
}
|
|
|
|
/**
 * Rewrite an element's inline `style` attribute so CSS function arguments are
 * not padded with whitespace, e.g. `url( image.png )` becomes `url(image.png)`.
 *
 * jsdom appears to have a bug where it ignores inline CSS attributes set to a
 * function() containing spaces, hence this normalisation.
 *
 * Elements without a `style` attribute are left untouched (previously this
 * threw because `getAttribute` returned null).
 *
 * @param {Element} el - Element whose `style` attribute is normalised in place.
 * @returns {void}
 */
function removeStyleFunctionSpaces(el) {
  const inlineStyle = el.getAttribute('style');

  if (typeof inlineStyle !== 'string') {
    return;
  }

  // Trim after every "(" and before every ")" independently, so several
  // functions in one declaration (and one-sided padding) are all handled; the
  // former greedy /\(\s+(.*)\s+\)/ spanned from the first "(" to the last ")".
  const normalised = inlineStyle
    .replace(/\(\s+/g, '(')
    .replace(/\s+\)/g, ')');

  el.setAttribute('style', normalised);
}
|
|
|
|
/**
 * Read the inline style of the first element matching the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {string} [styleAttr] - Single style property to return.
 * @returns {*} The requested style property, the whole `style` object when
 *   `styleAttr` is omitted, or null when the element is missing or has no
 *   `style` attribute.
 */
function style(context, selector, styleAttr) {
  const el = q(context, selector);

  if (!el?.hasAttribute('style')) {
    return null;
  }

  // normalise the attribute first so the style object is populated correctly
  removeStyleFunctionSpaces(el);

  if (styleAttr) {
    return el.style[styleAttr];
  }

  return el.style;
}
|
|
|
|
/**
 * Read the inline style of every element matching the selector.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - CSS selector.
 * @param {string} [styleAttr] - Single style property to return per element.
 * @returns {Array} Per element: the requested style property, or the whole
 *   `style` object when `styleAttr` is omitted.
 */
function styles(context, selector, styleAttr) {
  return Array.from(context.querySelectorAll(selector), (el) => {
    // Only normalise elements that actually carry a style attribute, matching
    // the guard in style(); without it a single unstyled match made
    // removeStyleFunctionSpaces throw on a null attribute value.
    if (el.hasAttribute('style')) {
      removeStyleFunctionSpaces(el);
    }

    return styleAttr ? el.style[styleAttr] : el.style;
  });
}
|
|
|
|
/**
 * Extract a number from an element's property or attribute.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {RegExp|string|null} [match=/\d+(\.\d*)?/] - Pattern locating the
 *   number; pass a falsy value to convert the whole string instead.
 * @param {string} [attr='textContent'] - Property or attribute to read.
 * @returns {number|null} The number, or null when the value is empty or the
 *   pattern does not match (previously a failed match produced NaN).
 */
function number(context, selector, match = /\d+(\.\d*)?/, attr = 'textContent') {
  const value = q(context, selector, attr);

  if (!value) {
    return null;
  }

  if (!match) {
    return Number(value);
  }

  const matched = value.match(match)?.[0];

  // no match means "no number found", not NaN
  return matched === undefined ? null : Number(matched);
}
|
|
|
|
/**
 * Read an attribute from a `<meta>` element.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} selector - Either a full `meta[...]` selector or just the
 *   attribute filter, which is then wrapped as `meta[<selector>]`.
 * @param {string|boolean} [attrArg='content'] - Attribute to read.
 * @param {boolean} [applyTrim=true] - Normalise whitespace in the result.
 * @returns {*} Whatever `q` yields for the resolved selector.
 */
function meta(context, selector, attrArg = 'content', applyTrim = true) {
  const metaSelector = /meta\[.*\]/.test(selector)
    ? selector
    : `meta[${selector}]`;

  return q(context, metaSelector, attrArg, applyTrim);
}
|
|
|
|
/**
 * Read a date from an element and parse it with `extractDate`.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {string|string[]} format - Format forwarded to `extractDate`.
 * @param {RegExp|string} [match] - Optional pattern forwarded to `extractDate`.
 * @param {string} [attr='textContent'] - Property or attribute holding the date.
 * @returns {Date|null} Result of `extractDate`, or null when the value is empty.
 */
function date(context, selector, format, match, attr = 'textContent') {
  const dateString = q(context, selector, attr, true);

  return dateString
    ? extractDate(dateString, format, match)
    : null;
}
|
|
|
|
/**
 * Convert a relative time such as "3 days ago" into an absolute UTC date.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {RegExp} [match=/(\d+)\s*(\w+)/] - Pattern whose first capture group
 *   is the amount and whose second is the unit.
 * @param {string} [attr='textContent'] - Property or attribute to read.
 * @returns {{date: Date, precision: string}|null} The current UTC time minus
 *   the matched amount, plus the unit with one trailing "s" removed; null when
 *   the value is empty or the pattern does not yield an amount and a unit.
 */
function dateAgo(context, selector, match = /(\d+)\s*(\w+)/, attr = 'textContent') {
  // applyTrim is a boolean flag; the string 'textContent' used to be passed here by mistake
  const timeString = q(context, selector, attr, true);

  if (!timeString) {
    return null;
  }

  const timeMatch = timeString.match(match);

  if (!timeMatch) {
    return null;
  }

  const [, n, period] = timeMatch;

  // a custom pattern may leave either group unmatched; treat that as "no date"
  if (n === undefined || period === undefined) {
    return null;
  }

  const thenDate = moment.utc().subtract(Number(n), period);

  return {
    date: thenDate.toDate(),
    precision: period.replace(/s$/, ''),
  };
}
|
|
|
|
/**
 * Resolve the URL of the first matching image.
 *
 * The explicit `attr` (when given) is tried first, then `data-src`, then `src`;
 * the first truthy value wins.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='img'] - CSS selector.
 * @param {string} [attr] - Preferred attribute holding the image URL.
 * @param {{origin?: string, protocol?: string}} [options] - Passed to `prefixUrl`;
 *   `protocol` defaults to `'https'`.
 * @returns {string|null} The prefixed URL, or null when no value was found.
 */
function image(context, selector = 'img', attr, { origin, protocol = 'https' } = {}) {
  const candidates = attr
    ? [attr, 'data-src', 'src']
    : ['data-src', 'src'];

  let imageUrl;

  for (const name of candidates) {
    imageUrl = q(context, selector, name);

    if (imageUrl) {
      break;
    }
  }

  return prefixUrl(imageUrl, origin, protocol);
}
|
|
|
|
/**
 * Resolve the URL of every matching image.
 *
 * With an explicit `attr`, that attribute is read from each element. Without
 * one, each element is resolved on its own: `data-src` first, then `src`.
 * Previously the attribute was picked from the first match only, so mixed
 * lists produced nulls, and a first match without either attribute caused the
 * raw elements to be passed to `prefixUrl`.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='img'] - CSS selector.
 * @param {string} [attr] - Attribute holding the image URL.
 * @param {{origin?: string, protocol?: string}} [options] - Passed to `prefixUrl`;
 *   `protocol` defaults to `'https'`.
 * @returns {Array<string|null>} One prefixed URL (or null) per matched element.
 */
function images(context, selector = 'img', attr, { origin, protocol = 'https' } = {}) {
  return all(context, selector).map((imageEl) => {
    const source = attr
      ? q(imageEl, null, attr)
      : q(imageEl, null, 'data-src') || q(imageEl, null, 'src');

    return prefixUrl(source, origin, protocol);
  });
}
|
|
|
|
/**
 * Resolve the URL found on the first matching element.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='a'] - CSS selector.
 * @param {string} [attr='href'] - Attribute holding the URL.
 * @param {{origin?: string, protocol?: string, object?: boolean}} [options] -
 *   `origin`/`protocol` go to `prefixUrl`; `object` requests a `URL` instance.
 * @returns {string|URL|null} The prefixed URL, wrapped in `URL` when `object`
 *   is set and a URL was found.
 */
function url(context, selector = 'a', attr = 'href', { origin, protocol = 'https', object = false } = {}) {
  const prefixedUrl = prefixUrl(q(context, selector, attr), origin, protocol);

  return prefixedUrl && object
    ? new URL(prefixedUrl)
    : prefixedUrl;
}
|
|
|
|
/**
 * Resolve the URLs found on every matching element.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='a'] - CSS selector.
 * @param {string} [attr='href'] - Attribute holding the URL; when falsy the
 *   matched elements are returned as-is.
 * @param {{origin?: string, protocol?: string}} [options] - Passed to `prefixUrl`.
 * @returns {Array} Prefixed URLs, or the raw `all` result when `attr` is falsy.
 */
function urls(context, selector = 'a', attr = 'href', { origin, protocol = 'https' } = {}) {
  const urlEls = all(context, selector, attr);

  if (!attr) {
    return urlEls;
  }

  return urlEls.map(urlEl => prefixUrl(urlEl, origin, protocol));
}
|
|
|
|
/**
 * Parse a `srcset`-style attribute into a list of sources ordered from the
 * largest descriptor to the smallest, with descriptor-less entries last.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {string} [attr='srcset'] - Attribute holding the source set.
 * @param {{origin?: string, protocol?: string, includeDescriptor?: boolean}} [options] -
 *   `origin`/`protocol` go to `prefixUrl`; `includeDescriptor` returns
 *   `{ descriptor, url }` objects instead of bare URLs.
 * @returns {Array<string|{descriptor: string, url: string}>|null} Ordered
 *   sources, or null when the attribute is missing or empty.
 */
function sourceSet(context, selector, attr = 'srcset', options = {}) {
  const srcset = q(context, selector, attr);

  if (!srcset) {
    return null;
  }

  // Numeric weight of a descriptor ("200w" -> 200, "1.5x" -> 1.5); entries
  // without a numeric descriptor (the fallback) sort below everything else.
  const weigh = ({ descriptor }) => {
    const weight = Number.parseFloat(descriptor);

    return Number.isNaN(weight) ? -1 : weight;
  };

  const sources = srcset
    .split(/\s*,\s*/)
    .map((source) => {
      const [link, descriptor] = source.trim().split(/\s+/);

      if (!link) {
        return null;
      }

      return {
        descriptor: descriptor || 'fallback',
        url: prefixUrl(link, options.origin, options.protocol),
      };
    })
    .filter(Boolean)
    // A plain numeric difference keeps the comparator consistent in both
    // directions; the previous one returned 0 for (fallback, X) but -1 for
    // (X, fallback), so the fallback could end up anywhere.
    .sort((sourceA, sourceB) => weigh(sourceB) - weigh(sourceA));

  if (options.includeDescriptor) {
    return sources;
  }

  return sources.map(source => source.url);
}
|
|
|
|
/**
 * Resolve the poster image URL of the first matching element.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='video'] - CSS selector.
 * @param {string} [attr='poster'] - Attribute holding the URL; when falsy the
 *   raw `q` result is returned.
 * @param {{origin?: string, protocol?: string}} [options] - Passed to `prefixUrl`.
 * @returns {*} The prefixed URL, or the raw `q` result when `attr` is falsy.
 */
function poster(context, selector = 'video', attr = 'poster', { origin, protocol = 'https' } = {}) {
  const posterEl = q(context, selector, attr);

  if (!attr) {
    return posterEl;
  }

  return prefixUrl(posterEl, origin, protocol);
}
|
|
|
|
/**
 * Resolve the video URL of the first matching element.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='source'] - CSS selector.
 * @param {string} [attr='src'] - Attribute holding the URL; when falsy the raw
 *   `q` result is returned.
 * @param {{origin?: string, protocol?: string}} [options] - Passed to `prefixUrl`.
 * @returns {*} The prefixed URL, or the raw `q` result when `attr` is falsy.
 */
function video(context, selector = 'source', attr = 'src', { origin, protocol = 'https' } = {}) {
  const trailerEl = q(context, selector, attr);

  if (!attr) {
    return trailerEl;
  }

  return prefixUrl(trailerEl, origin, protocol);
}
|
|
|
|
/**
 * Resolve the video URLs of every matching element.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector='source'] - CSS selector.
 * @param {string} [attr='src'] - Attribute holding the URL; when falsy the
 *   matched elements are returned as-is.
 * @param {{origin?: string, protocol?: string}} [options] - Passed to `prefixUrl`.
 * @returns {Array} Prefixed URLs, or the raw `all` result when `attr` is falsy.
 */
function videos(context, selector = 'source', attr = 'src', { origin, protocol = 'https' } = {}) {
  const trailerEls = all(context, selector, attr);

  if (!attr) {
    return trailerEls;
  }

  return trailerEls.map(trailerEl => prefixUrl(trailerEl, origin, protocol));
}
|
|
|
|
/**
 * Read a duration from an element and convert it to seconds.
 *
 * Clock notation (`mm:ss`, `h:mm:ss`, or the custom `match`) is tried first
 * through `durationToSeconds`. When that yields nothing, unit notation such as
 * `1H 30M 20S`, `PT30M` or a bare number of seconds is parsed.
 *
 * @param {Element|Document} context - Node to query from.
 * @param {string} [selector] - CSS selector.
 * @param {RegExp|string} [match] - Pattern forwarded to `durationToSeconds`.
 * @param {string} [attr='textContent'] - Property or attribute to read.
 * @returns {number|null} Duration in seconds, or null when nothing parseable was found.
 */
function duration(context, selector, match, attr = 'textContent') {
  const durationString = q(context, selector, attr);

  if (!durationString) {
    return null;
  }

  const clockSeconds = durationToSeconds(durationString, match);

  if (clockSeconds) {
    return clockSeconds;
  }

  // Every unit is optional, and the lookahead forces the match to start on a
  // digit so it can never be empty. The previous pattern demanded trailing
  // digits, so "1H 30M" was read as 1h + 30s and "PT30M" as 30 seconds.
  const timestampMatch = durationString.match(/(?=\d)(?:(\d+)H)?\s*(?:(\d+)M)?\s*(?:(\d+)S?)?/i);

  if (!timestampMatch) {
    return null;
  }

  const [, hours = 0, minutes = 0, seconds = 0] = timestampMatch;

  return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds);
}
|
|
|
|
// Legacy q-prefixed aliases. Each key maps to one of the query helpers in this
// file; init() wraps every entry with a default context and the module
// exports spread them at the top level.
const legacyFuncs = {
  q,
  // element lists
  qa: all,
  qall: all,
  // dates
  qd: date,
  qdate: date,
  // inner HTML
  qh: html,
  qhtml: html,
  // images
  qi: image,
  qimage: image,
  qimages: images,
  qis: images,
  // durations
  ql: duration,
  qlength: duration,
  // meta tags
  qm: meta,
  qmeta: meta,
  // posters
  qp: poster,
  qposter: poster,
  qs: all,
  // videos and direct text nodes (aliases are interleaved to keep the key order stable)
  qt: video,
  qtext: text,
  qtexts: texts,
  qtrailer: video,
  qtrailers: videos,
  qts: videos,
  qtx: text,
  qtxs: texts,
  qtxt: text,
  qtxts: texts,
  // `qu` is left disabled: that key is used for the quFuncs namespace on both
  // the init() result and the module exports.
  // qu: url,
  qurl: url,
  qurls: urls,
  qus: urls,
};
|
|
|
|
// Query helpers exposed under the `qu` / `query` namespace, including short
// aliases (cnt, dur, el, img, num, srcs, ...). Key order is kept alphabetical.
const quFuncs = {
  all,
  cnt: content,
  cnts: contents,
  content,
  contents,
  count,
  date,
  dateAgo,
  dur: duration,
  duration,
  el: q,
  element: q,
  exists,
  html,
  htmls,
  image,
  images,
  img: image,
  imgs: images,
  json,
  jsons,
  length: duration,
  meta,
  num: number,
  number,
  poster,
  q,
  sourceSet,
  sources: sourceSet,
  srcs: sourceSet,
  srcset: sourceSet,
  style,
  styles,
  text,
  texts,
  trailer: video,
  url,
  urls,
  video,
  videos,
};
|
|
|
|
/**
 * Wrap every function of a helper table so it runs against a default element.
 * A call whose first argument is a DOM node (anything exposing `nodeType`)
 * uses that node as the context instead.
 *
 * @param {Object<string, Function>} funcs - Helper table, e.g. `quFuncs`.
 * @param {Element|Document} element - Default context for the wrapped helpers.
 * @returns {Object<string, Function>} Table with the same keys, bound to `element`.
 */
function bindContextFuncs(funcs, element) {
  return Object.fromEntries(Object.entries(funcs).map(([key, func]) => [
    key,
    (...args) => (args[0]?.nodeType === undefined
      ? func(element, ...args)
      : func(...args)), // allow for different context
  ]));
}

/**
 * Build a query context around an element: the element itself, its HTML and
 * text, and every query helper pre-bound to it.
 *
 * @param {Element|Document} context - Root node.
 * @param {string} [selector] - When given, the context becomes the first match inside `context`.
 * @param {Window} [window] - Owning window; exposed as `window` and `document` when supplied.
 * @returns {Object|null} The context object, or null when `context` is falsy
 *   or the selector matches nothing.
 */
function init(context, selector, window) {
  if (!context) {
    return null;
  }

  const element = selector ? context.querySelector(selector) : context;

  if (!element) {
    return null;
  }

  // Both tables use the nodeType check. The legacy wrappers used to test
  // `instanceof globalWindow.HTMLElement`, which is false for nodes created by
  // any other JSDOM window, so an explicit context was mistaken for a selector.
  const legacyContextFuncs = bindContextFuncs(legacyFuncs, element);
  const quContextFuncs = bindContextFuncs(quFuncs, element);

  return {
    element,
    el: element,
    // documents have no outerHTML of their own; fall back to their body
    html: element.outerHTML || element.body?.outerHTML,
    text: trim(element.textContent),
    ...(window && {
      window,
      document: window.document,
    }),
    ...legacyContextFuncs,
    qu: quContextFuncs,
    query: quContextFuncs,
  };
}
|
|
|
|
/**
 * Build one query context per element.
 *
 * @param {Element|Document|Array} context - Either an array of elements, or a
 *   node whose matches for `selector` are used.
 * @param {string} [selector] - CSS selector; ignored when `context` is an array.
 * @param {Window} [window] - Owning window, forwarded to `init`.
 * @returns {Array} One `init` result per element; empty when `context` is
 *   falsy, mirroring the guard in `init`.
 */
function initAll(context, selector, window) {
  if (!context) {
    return [];
  }

  const elements = Array.isArray(context)
    ? context
    : Array.from(context.querySelectorAll(selector));

  return elements.map(element => init(element, null, window));
}
|
|
|
|
/**
 * Parse an HTML string with JSDOM and build a query context on its document.
 *
 * @param {string} htmlValue - Markup to parse.
 * @param {string} [selector] - Selector forwarded to `init`.
 * @param {Object} [options] - Extra JSDOM options; merged after the shared `virtualConsole`.
 * @returns {Object|null} Result of `init` for the parsed document.
 */
function extract(htmlValue, selector, options) {
  const dom = new JSDOM(htmlValue, { virtualConsole, ...options });

  return init(dom.window.document, selector, dom.window);
}
|
|
|
|
/**
 * Parse an HTML string with JSDOM and build one query context per element
 * matching the selector.
 *
 * @param {string} htmlValue - Markup to parse.
 * @param {string} selector - Selector forwarded to `initAll`.
 * @param {Object} [options] - Extra JSDOM options; merged after the shared `virtualConsole`.
 * @returns {Array} Result of `initAll` for the parsed document.
 */
function extractAll(htmlValue, selector, options) {
  const dom = new JSDOM(htmlValue, { virtualConsole, ...options });

  return initAll(dom.window.document, selector, dom.window);
}
|
|
|
|
/**
 * Perform an HTTP request through the project's `http` module and wrap the
 * returned document in query contexts.
 *
 * @param {string} [method='get'] - Name of the `http` method to call. `'post'`
 *   sends `body`; any other method is called with `parse: true`.
 * @param {string} urlValue - Target URL.
 * @param {*} body - Request body, only used for `'post'`.
 * @param {string} [selector] - Selector forwarded to `init` / `initAll`.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Extra options for the `http` call.
 * @param {boolean} [queryAll=false] - Build one context per match instead of a single one.
 * @returns {Promise<Object>} `{ item, items, res, ok, status }` merged with
 *   the response's own properties (the response properties win on clashes).
 *   `items` is always an array: the matches when `queryAll` is set, otherwise
 *   the single `item` wrapped in an array. On a non-ok response `item` is
 *   null and `items` is empty.
 */
async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) {
  const res = await (method === 'post'
    ? http.post(urlValue, body, { ...options, headers })
    : http[method](urlValue, {
      ...options,
      headers,
      parse: true,
    }));

  if (!res.ok) {
    return {
      item: null,
      items: [],
      res,
      ok: false,
      status: res.statusCode,
      ...res,
    };
  }

  const item = queryAll
    ? initAll(res.document, selector, res.window)
    : init(res.document, selector, res.window);

  return {
    item,
    // This used to test `all` (the always-truthy helper function) instead of
    // the queryAll flag, so a single item was never wrapped in an array.
    items: queryAll ? item : [item],
    res,
    ok: true,
    status: res.statusCode,
    ...res,
  };
}
|
|
|
|
/**
 * GET a page and wrap the response in a single query context.
 *
 * @param {string} urlValue - Target URL.
 * @param {string} [selector] - Selector for the context element.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Extra options for the `http` call.
 * @returns {Promise<Object>} Result of `request`.
 */
async function get(urlValue, selector, headers, options) {
  const noBody = null;
  const queryAll = false;

  return request('get', urlValue, noBody, selector, headers, options, queryAll);
}
|
|
|
|
/**
 * POST to a page and wrap the response in a single query context.
 *
 * @param {string} urlValue - Target URL.
 * @param {*} body - Request body.
 * @param {string} [selector] - Selector for the context element.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Extra options for the `http` call.
 * @returns {Promise<Object>} Result of `request`.
 */
async function post(urlValue, body, selector, headers, options) {
  const queryAll = false;

  return request('post', urlValue, body, selector, headers, options, queryAll);
}
|
|
|
|
/**
 * GET a page and wrap every element matching the selector in its own query context.
 *
 * @param {string} urlValue - Target URL.
 * @param {string} selector - Selector for the context elements.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Extra options for the `http` call.
 * @returns {Promise<Object>} Result of `request`.
 */
async function getAll(urlValue, selector, headers, options) {
  const noBody = null;
  const queryAll = true;

  return request('get', urlValue, noBody, selector, headers, options, queryAll);
}
|
|
|
|
/**
 * POST to a page and wrap every element matching the selector in its own query context.
 *
 * @param {string} urlValue - Target URL.
 * @param {*} body - Request body.
 * @param {string} selector - Selector for the context elements.
 * @param {Object} [headers] - Request headers.
 * @param {Object} [options] - Extra options for the `http` call.
 * @returns {Promise<Object>} Result of `request`.
 */
async function postAll(urlValue, body, selector, headers, options) {
  const queryAll = true;

  return request('post', urlValue, body, selector, headers, options, queryAll);
}
|
|
|
|
/**
 * Create a session through the project's `http` module.
 *
 * @param {Object} [headers] - Forwarded to `http.session`.
 * @param {Object} [options] - Forwarded to `http.session`.
 * @returns {*} Whatever `http.session` returns.
 */
function session(headers, options) {
  const httpSession = http.session(headers, options);

  return httpSession;
}
|
|
|
|
module.exports = {
|
|
extractDate,
|
|
extract,
|
|
extractAll,
|
|
durationToSeconds,
|
|
init,
|
|
initAll,
|
|
formatDate,
|
|
get,
|
|
getAll,
|
|
http,
|
|
fetch: get,
|
|
fetchAll: getAll,
|
|
context: init,
|
|
contextAll: initAll,
|
|
ed: extractDate,
|
|
ex: extract,
|
|
exa: extractAll,
|
|
fd: formatDate,
|
|
parseDate: extractDate,
|
|
ctx: init,
|
|
ctxa: initAll,
|
|
geta: getAll,
|
|
qu: quFuncs,
|
|
query: quFuncs,
|
|
post,
|
|
postAll,
|
|
prefixUrl,
|
|
session,
|
|
...legacyFuncs,
|
|
};
|