Updated Dorcel scraper, added movie support.

This commit is contained in:
DebaucheryLibrarian
2020-11-19 02:01:13 +01:00
parent ecc90be12c
commit 77f9193669
16 changed files with 240 additions and 73 deletions

View File

@@ -242,6 +242,42 @@ function urls(context, selector = 'a', attr = 'href', { origin, protocol = 'http
return attr ? urlEls.map(urlEl => prefixUrl(urlEl, origin, protocol)) : urlEls;
}
/**
 * Parse a `srcset`-style attribute into its candidate sources, ordered from
 * the largest descriptor to the smallest, with descriptor-less ("fallback")
 * candidates last.
 *
 * The raw attribute value is read with `q(context, selector, attr)`; each
 * candidate URL is passed through `prefixUrl` together with `options.origin`
 * and `options.protocol`.
 *
 * @param {*} context - Query context forwarded to `q`.
 * @param {string} selector - Selector forwarded to `q`.
 * @param {string} attr - Attribute name forwarded to `q`.
 * @param {object} [options]
 * @param {string} [options.origin] - Forwarded to `prefixUrl`.
 * @param {string} [options.protocol] - Forwarded to `prefixUrl`.
 * @param {boolean} [options.includeDescriptor] - Return `{ descriptor, url }`
 *   objects instead of bare URLs.
 * @returns {?Array<string|{descriptor: string, url: string}>} Sorted sources,
 *   or `null` when `q` yields nothing.
 */
function sourceSet(context, selector, attr, options = {}) {
	const srcset = q(context, selector, attr);

	if (!srcset) {
		return null;
	}

	// Numeric weight of a descriptor ('640w' -> 640, '1.5x' -> 1.5). Anything
	// without a leading number, including 'fallback', sorts below all numbers.
	const descriptorSize = (descriptor) => {
		const size = Number.parseFloat(descriptor);

		return Number.isNaN(size) ? -Infinity : size;
	};

	const sources = srcset
		.split(/\s*,\s*/)
		// Attribute values are often pretty-printed over several lines; trim
		// each candidate and drop empty ones left by a trailing comma.
		.map(source => source.trim())
		.filter(Boolean)
		.map((source) => {
			// Split on any whitespace run so tabs, newlines or repeated spaces
			// between URL and descriptor do not lose the descriptor.
			const [link, descriptor] = source.split(/\s+/);

			return {
				descriptor: descriptor || 'fallback',
				url: prefixUrl(link, options.origin, options.protocol),
			};
		})
		.sort((sourceA, sourceB) => {
			// Symmetric comparator: compare(a, b) always mirrors compare(b, a),
			// so the resulting order does not depend on the sort algorithm.
			const sizeA = descriptorSize(sourceA.descriptor);
			const sizeB = descriptorSize(sourceB.descriptor);

			if (sizeA === sizeB) {
				return 0;
			}

			return sizeA > sizeB ? -1 : 1;
		});

	if (options.includeDescriptor) {
		return sources;
	}

	return sources.map(source => source.url);
}
function poster(context, selector = 'video', attr = 'poster', { origin, protocol = 'https' } = {}) {
const posterEl = q(context, selector, attr);
@@ -267,17 +303,17 @@ function duration(context, selector, match, attr = 'textContent') {
const durationMatch = durationString.match(match || /(\d+:)?\d+:\d+/);
if (durationMatch) {
const segments = ['00'].concat(durationMatch[0].split(':')).slice(-3);
const segments = ['00'].concat(durationMatch[0].split(/[:hm]/)).slice(-3);
return moment.duration(segments.join(':')).asSeconds();
}
const timestampMatch = durationString.match(/T(\d+H)?(\d+M)?\d+S/);
const timestampMatch = durationString.match(/(\d+H)?\s*(\d+M)?\s*\d+S?/i);
if (timestampMatch) {
const hours = timestampMatch[0].match(/(\d+)H/)?.[1] || 0;
const minutes = timestampMatch[0].match(/(\d+)M/)?.[1] || 0;
const seconds = timestampMatch[0].match(/(\d+)S/)?.[1] || 0;
const hours = timestampMatch[0].match(/(\d+)H/i)?.[1] || 0;
const minutes = timestampMatch[0].match(/(\d+)M/i)?.[1] || 0;
const seconds = timestampMatch[0].match(/(\d+)(S|$)/i)?.[1] || 0;
return (Number(hours) * 3600) + (Number(minutes) * 60) + Number(seconds);
}
@@ -345,6 +381,10 @@ const quFuncs = {
num: number,
poster,
q,
sourceSet,
sources: sourceSet,
srcs: sourceSet,
srcset: sourceSet,
style,
styles,
text,
@@ -415,10 +455,12 @@ function extractAll(htmlValue, selector) {
return initAll(window.document, selector, window);
}
async function get(urlValue, selector, headers, options, queryAll = false) {
const res = await http.get(urlValue, headers, options);
async function request(method = 'get', urlValue, body, selector, headers, options, queryAll = false) {
const res = await (method === 'post'
? http.post(urlValue, body, headers, options)
: http[method](urlValue, headers, options));
if (res.statusCode === 200) {
if (res.ok) {
const item = queryAll
? extractAll(res.body.toString(), selector)
: extract(res.body.toString(), selector);
@@ -443,8 +485,20 @@ async function get(urlValue, selector, headers, options, queryAll = false) {
};
}
/**
 * Issue a GET request through `request` and query the response body in
 * single-result mode (`queryAll` is false).
 *
 * @param {string} urlValue - URL to fetch.
 * @param {string} selector - Selector applied to the response body.
 * @param {object} [headers] - Forwarded to `request`.
 * @param {object} [options] - Forwarded to `request`.
 * @returns {Promise<*>} Whatever `request` resolves to.
 */
async function get(urlValue, selector, headers, options) {
	const body = null; // GET carries no request body
	const queryAll = false;

	return request('get', urlValue, body, selector, headers, options, queryAll);
}
/**
 * Issue a POST request through `request` and query the response body in
 * single-result mode (`queryAll` is false).
 *
 * @param {string} urlValue - URL to post to.
 * @param {*} body - Request body, forwarded to `request`.
 * @param {string} selector - Selector applied to the response body.
 * @param {object} [headers] - Forwarded to `request`.
 * @param {object} [options] - Forwarded to `request`.
 * @returns {Promise<*>} Whatever `request` resolves to.
 */
async function post(urlValue, body, selector, headers, options) {
	const queryAll = false;

	return request('post', urlValue, body, selector, headers, options, queryAll);
}
/**
 * Issue a GET request through `request` and query the response body for all
 * matches of `selector` (`queryAll` is true).
 *
 * @param {string} urlValue - URL to fetch.
 * @param {string} selector - Selector applied to the response body.
 * @param {object} [headers] - Forwarded to `request`.
 * @param {object} [options] - Forwarded to `request`.
 * @returns {Promise<*>} Whatever `request` resolves to.
 */
async function getAll(urlValue, selector, headers, options) {
	// `request` takes (method, url, body, selector, headers, options, queryAll):
	// the method must be exactly 'get' and the body slot must be filled with
	// null, otherwise every later argument shifts one position to the left.
	return request('get', urlValue, null, selector, headers, options, true);
}
/**
 * Issue a POST request through `request` and query the response body for all
 * matches of `selector` (`queryAll` is true).
 *
 * @param {string} urlValue - URL to post to.
 * @param {*} body - Request body, forwarded to `request`.
 * @param {string} selector - Selector applied to the response body.
 * @param {object} [headers] - Forwarded to `request`.
 * @param {object} [options] - Forwarded to `request`.
 * @returns {Promise<*>} Whatever `request` resolves to.
 */
async function postAll(urlValue, body, selector, headers, options) {
	const queryAll = true;

	return request('post', urlValue, body, selector, headers, options, queryAll);
}
module.exports = {
@@ -470,6 +524,8 @@ module.exports = {
geta: getAll,
qu: quFuncs,
query: quFuncs,
post,
postAll,
prefixUrl,
...legacyFuncs,
};