227 lines
5.5 KiB
JavaScript
227 lines
5.5 KiB
JavaScript
'use strict';
|
|
|
|
const { JSDOM } = require('jsdom');
|
|
const moment = require('moment');
|
|
const bhttp = require('bhttp');
|
|
|
|
/**
 * Turn a protocol-relative URL (one starting with `//`) into an absolute one.
 *
 * @param {*} url - Value to inspect; anything not starting with `//` is returned untouched.
 * @param {string} [protocol='https'] - Protocol to prepend, without the colon. A falsy value disables prefixing.
 * @returns {*} `${protocol}:${url}` for protocol-relative URLs, otherwise `url` as given.
 */
function prefixProtocol(url, protocol = 'https') {
  if (!protocol) return url;

  const isProtocolRelative = /^\/\//.test(url);
  if (!isProtocolRelative) return url;

  return `${protocol}:${url}`;
}
|
|
|
|
/**
 * Look up a single element, or read one property from it.
 *
 * @param {object} context - Node to search in (must offer `querySelector` when a selector is given).
 * @param {string} [selector] - CSS selector; when falsy the context itself is the target.
 * @param {string|boolean} [attrArg] - Property name to read from the target (read with bracket
 *   access, i.e. as an object property); `true` is shorthand for `'textContent'`. When falsy the
 *   target itself is returned.
 * @param {boolean} [trim=true] - Whether to call `.trim()` on the property value.
 * @returns {*} The target (whatever `querySelector` returned, or the context), or the property
 *   value; `undefined` when a property was requested but the selector matched nothing.
 */
function q(context, selector, attrArg, trim = true) {
  const property = attrArg === true ? 'textContent' : attrArg;

  // no property requested: hand back the element itself
  if (!property) {
    if (selector) return context.querySelector(selector);
    return context;
  }

  let value;

  if (selector) {
    const target = context.querySelector(selector);
    value = target?.[property];
  } else {
    value = context[property];
  }

  if (!trim) return value;

  return value?.trim();
}
|
|
|
|
/**
 * Look up every element matching a selector, or read one property from each of them.
 *
 * @param {object} context - Node to search in (must offer `querySelectorAll`).
 * @param {string} selector - CSS selector handed to `querySelectorAll`.
 * @param {string|boolean} [attrArg] - Property name to read from each match; `true` is shorthand
 *   for `'textContent'`. When falsy the matched elements themselves are returned.
 * @param {boolean} [trim=true] - Whether to call `.trim()` on each property value.
 * @returns {Array} Matched elements, or one property value per match (in match order).
 */
function qall(context, selector, attrArg, trim = true) {
  const property = attrArg === true ? 'textContent' : attrArg;
  const matches = Array.from(context.querySelectorAll(selector));

  if (!property) return matches;

  return matches.map((match) => {
    const value = match[property];

    return trim ? value?.trim() : value;
  });
}
|
|
|
|
/**
 * Collect the text that sits directly inside an element, ignoring text nested in child elements.
 *
 * Only child nodes whose `nodeName` is `#text` contribute; their contents are joined with a
 * single space.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector] - CSS selector; when falsy the context itself is used.
 * @param {boolean} [trim=true] - When true, every text node is trimmed and whitespace-only nodes
 *   are dropped before joining; when false the raw node text is joined untouched.
 * @returns {string|null} The joined text, or `null` when no element was found.
 */
function qtext(context, selector, trim = true) {
  const el = q(context, selector, null, trim);
  if (!el) return null;

  const pieces = Array.from(el.childNodes)
    .filter(node => node.nodeName === '#text')
    .map(node => node.textContent);

  if (!trim) return pieces.join(' ');

  // The trim flag used to be applied inverted (nodes were trimmed only when trim was false).
  // Trim each node here and skip the whitespace-only ones that sit between child elements,
  // so they do not leave runs of spaces in the result.
  return pieces
    .map(piece => piece.trim())
    .filter(Boolean)
    .join(' ');
}
|
|
|
|
/**
 * Read a property from a single element, defaulting to `content`.
 *
 * Thin wrapper around `q` whose only difference is the default property name.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector] - CSS selector forwarded to `q`.
 * @param {string|boolean} [attrArg='content'] - Property to read, forwarded to `q`.
 * @param {boolean} [trim=true] - Forwarded to `q`.
 * @returns {*} Whatever `q` returns for these arguments.
 */
function qmeta(context, selector, attrArg = 'content', trim = true) {
  const metaValue = q(context, selector, attrArg, trim);

  return metaValue;
}
|
|
|
|
/**
 * Read a date string from an element and parse it as UTC with moment.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector] - CSS selector; when falsy the property is read from the context itself.
 * @param {string} format - Format passed to `moment.utc` as its second argument.
 * @param {RegExp|string} [match] - Optional pattern; when given, only the first match inside the
 *   trimmed string is parsed.
 * @param {string} [attr='textContent'] - Property holding the date string.
 * @returns {Date|null} Result of `moment.utc(...).toDate()`, or `null` when the property is
 *   missing/empty or `match` does not match.
 */
function qdate(context, selector, format, match, attr = 'textContent') {
  let rawDate;

  if (selector) {
    const target = context.querySelector(selector);
    rawDate = target?.[attr];
  } else {
    rawDate = context[attr];
  }

  if (!rawDate) return null;

  const trimmedDate = rawDate.trim();

  if (!match) return moment.utc(trimmedDate, format).toDate();

  const found = trimmedDate.match(match);

  return found ? moment.utc(found[0], format).toDate() : null;
}
|
|
|
|
/**
 * Read an image source from a single element and make protocol-relative URLs absolute.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector='img'] - CSS selector forwarded to `q`.
 * @param {string} [attr='src'] - Property to read; when falsy, `q` yields the element itself and
 *   it is returned as-is.
 * @param {string} [protocol='https'] - Protocol handed to `prefixProtocol`.
 * @returns {*} The prefixed property value, or the element when `attr` is falsy.
 */
function qimage(context, selector = 'img', attr = 'src', protocol = 'https') {
  const found = q(context, selector, attr);

  // without an attribute, q hands back the HTML element rather than a URL
  if (!attr) return found;

  return prefixProtocol(found, protocol);
}
|
|
|
|
/**
 * Read image sources from every matching element and make protocol-relative URLs absolute.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector='img'] - CSS selector forwarded to `qall`.
 * @param {string} [attr='src'] - Property to read; when falsy, `qall` yields the elements
 *   themselves and they are returned as-is.
 * @param {string} [protocol='https'] - Protocol handed to `prefixProtocol`.
 * @returns {Array} Prefixed property values, or the matched elements when `attr` is falsy.
 */
function qimages(context, selector = 'img', attr = 'src', protocol = 'https') {
  const found = qall(context, selector, attr);

  if (!attr) return found;

  return found.map(source => prefixProtocol(source, protocol));
}
|
|
|
|
/**
 * Read a link target from a single element and make protocol-relative URLs absolute.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector='a'] - CSS selector forwarded to `q`.
 * @param {string} [attr='href'] - Property to read; when falsy, `q` yields the element itself and
 *   it is returned as-is.
 * @param {string} [protocol='https'] - Protocol handed to `prefixProtocol`.
 * @returns {*} The prefixed property value, or the element when `attr` is falsy.
 */
function qurl(context, selector = 'a', attr = 'href', protocol = 'https') {
  const found = q(context, selector, attr);

  if (!attr) return found;

  return prefixProtocol(found, protocol);
}
|
|
|
|
/**
 * Read link targets from every matching element and make protocol-relative URLs absolute.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector='a'] - CSS selector forwarded to `qall`.
 * @param {string} [attr='href'] - Property to read; when falsy, `qall` yields the elements
 *   themselves and they are returned as-is.
 * @param {string} [protocol='https'] - Protocol handed to `prefixProtocol`.
 * @returns {Array} Prefixed property values, or the matched elements when `attr` is falsy.
 */
function qurls(context, selector = 'a', attr = 'href', protocol = 'https') {
  const found = qall(context, selector, attr);

  if (!attr) return found;

  return found.map(link => prefixProtocol(link, protocol));
}
|
|
|
|
/**
 * Read a video poster URL from a single element and make protocol-relative URLs absolute.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector='video'] - CSS selector forwarded to `q`.
 * @param {string} [attr='poster'] - Property to read; when falsy, `q` yields the element itself
 *   and it is returned as-is.
 * @param {string} [protocol='https'] - Protocol handed to `prefixProtocol`.
 * @returns {*} The prefixed property value, or the element when `attr` is falsy.
 */
function qposter(context, selector = 'video', attr = 'poster', protocol = 'https') {
  const found = q(context, selector, attr);

  if (!attr) return found;

  return prefixProtocol(found, protocol);
}
|
|
|
|
/**
 * Read a trailer source URL from a single element and make protocol-relative URLs absolute.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector='source'] - CSS selector forwarded to `q`.
 * @param {string} [attr='src'] - Property to read; when falsy, `q` yields the element itself and
 *   it is returned as-is.
 * @param {string} [protocol='https'] - Protocol handed to `prefixProtocol`.
 * @returns {*} The prefixed property value, or the element when `attr` is falsy.
 */
function qtrailer(context, selector = 'source', attr = 'src', protocol = 'https') {
  const found = q(context, selector, attr);

  if (!attr) return found;

  return prefixProtocol(found, protocol);
}
|
|
|
|
/**
 * Read a duration such as `12:34` or `1:02:03` from an element and convert it to seconds.
 *
 * The first `[h:]m:s` timestamp found in the string is used; a missing hour part is filled in
 * with `00` before the value is handed to `moment.duration`.
 *
 * @param {object} context - Node to search in.
 * @param {string} [selector] - CSS selector forwarded to `q`.
 * @param {string} [attr='textContent'] - Property holding the duration string.
 * @returns {number|null} `moment.duration(...).asSeconds()`, or `null` when the property is
 *   missing/empty or holds no timestamp.
 */
function qlength(context, selector, attr = 'textContent') {
  const rawDuration = q(context, selector, attr);

  if (!rawDuration) return null;

  const [timestamp] = rawDuration.match(/(\d+:)?\d+:\d+/) || [];

  if (!timestamp) return null;

  // pad with an hour segment, then keep the last three so m:s and h:m:s both end up as h:m:s
  const hms = ['00', ...timestamp.split(':')].slice(-3).join(':');

  return moment.duration(hms).asSeconds();
}
|
|
|
|
// Registry of query helpers. `init` wraps every entry so it is bound to a context element,
// and the module also re-exports all of them unbound.
const funcs = {
  // full names
  q,
  qall,
  qdate,
  qimage,
  qimages,
  qposter,
  qlength,
  qmeta,
  qtext,
  qtrailer,
  qurls,
  qurl,

  // short aliases
  qa: qall,
  qd: qdate,
  qi: qimage,
  qis: qimages,
  qp: qposter,
  ql: qlength,
  qm: qmeta,
  qt: qtrailer, // note: `qt` is the trailer helper; text is `qtx`
  qtx: qtext,
  qu: qurl,
  qus: qurls,
};
|
|
|
|
/**
 * Wrap an element in a query context: the element itself plus every helper from `funcs`
 * pre-bound to it.
 *
 * Each bound helper passes `element` as the first argument, so callers start at the selector.
 * When a `window` was supplied and the first argument of a call is an instance of that window's
 * `HTMLElement`, the call is forwarded unchanged so the helper runs against that element instead.
 * NOTE(review): only `HTMLElement` instances are recognised as an override; other node types
 * (e.g. a document) are treated as ordinary arguments.
 *
 * @param {object} element - Element (or document) the helpers should query; falsy yields `null`.
 * @param {object} [window] - Window owning the element; adds `window` and `document` to the
 *   result and enables the per-call context override.
 * @returns {object|null} `{ element, [window, document], ...boundHelpers }`, or `null` when no
 *   element was given.
 */
function init(element, window) {
  if (!element) return null;

  const isContext = value => Boolean(window) && value instanceof window.HTMLElement;

  // Built in a single pass; spreading an accumulator inside reduce would copy every
  // previously added helper again for each new one.
  const contextFuncs = Object.fromEntries(
    Object.entries(funcs).map(([key, func]) => [
      key,
      (...args) => (isContext(args[0]) // allow for different context
        ? func(...args)
        : func(element, ...args)),
    ]),
  );

  return {
    element,
    ...(window && {
      window,
      document: window.document,
    }),
    ...contextFuncs,
  };
}
|
|
|
|
/**
 * Create one query context (see `init`) for every element matching a selector.
 *
 * @param {object} context - Node to search in (must offer `querySelectorAll`).
 * @param {string} selector - CSS selector handed to `querySelectorAll`.
 * @param {object} [window] - Window forwarded to `init` for each match.
 * @returns {Array} One `init` result per matched element, in match order.
 */
function initAll(context, selector, window) {
  const matches = context.querySelectorAll(selector);

  return Array.from(matches, element => init(element, window));
}
|
|
|
|
/**
 * Parse an HTML string with JSDOM and wrap the document, or one element of it, in a query context.
 *
 * @param {string} html - Markup handed to the `JSDOM` constructor.
 * @param {string} [selector] - CSS selector for the element to wrap; when falsy the whole
 *   document is wrapped.
 * @returns {object|null} The `init` result for the chosen root (`init` yields `null` when the
 *   selector matched nothing).
 */
function extract(html, selector) {
  const { window } = new JSDOM(html);
  const { document } = window;

  const root = selector ? document.querySelector(selector) : document;

  return init(root, window);
}
|
|
|
|
/**
 * Parse an HTML string with JSDOM and wrap every element matching a selector in a query context.
 *
 * @param {string} html - Markup handed to the `JSDOM` constructor.
 * @param {string} selector - CSS selector forwarded to `initAll`.
 * @returns {Array} Whatever `initAll` returns for the parsed document.
 */
function extractAll(html, selector) {
  const dom = new JSDOM(html);
  const { window: parsedWindow } = dom;
  const { document: parsedDocument } = parsedWindow;

  return initAll(parsedDocument, selector, parsedWindow);
}
|
|
|
|
/**
 * Fetch a page with bhttp and wrap its body in one query context, or in one per match.
 *
 * @param {string} url - Address passed to `bhttp.get`.
 * @param {string} [selector] - CSS selector forwarded to `extract` / `extractAll`.
 * @param {object} [headers] - Request headers passed to bhttp.
 * @param {boolean} [all=false] - When true use `extractAll`, otherwise `extract`.
 * @returns {Promise<object|Array|null>} The extraction result, or `null` for any response whose
 *   status code is not exactly 200.
 */
async function get(url, selector, headers, all = false) {
  const res = await bhttp.get(url, { headers });

  if (res.statusCode !== 200) return null;

  const html = res.body.toString();

  return all
    ? extractAll(html, selector)
    : extract(html, selector);
}
|
|
|
|
/**
 * Fetch a page and wrap every element matching a selector in a query context.
 *
 * Shorthand for `get` with its `all` flag set.
 *
 * @param {string} url - Address to fetch.
 * @param {string} selector - CSS selector for the elements to wrap.
 * @param {object} [headers] - Request headers.
 * @returns {Promise<Array|null>} Whatever `get` resolves to in `all` mode.
 */
async function getAll(url, selector, headers) {
  const all = true;

  return get(url, selector, headers, all);
}
|
|
|
|
module.exports = {
|
|
extract,
|
|
extractAll,
|
|
init,
|
|
initAll,
|
|
get,
|
|
getAll,
|
|
context: init,
|
|
contextAll: initAll,
|
|
ex: extract,
|
|
exa: extractAll,
|
|
ctx: init,
|
|
ctxa: initAll,
|
|
geta: getAll,
|
|
...funcs,
|
|
};
|