// traxxx/src/scrapers/whalemember.js
'use strict';
const { JSDOM } = require('jsdom');
const moment = require('moment');
const http = require('../utils/http');
/**
 * Scrape the release cards from a "latest updates" listing page.
 *
 * Only the last `.video-releases-list` container in the document is read;
 * every `.card` inside it yields one release.
 *
 * @param {string} html - HTML of the listing page.
 * @param {Object} site - Site record; `site.url` supplies the origin for scene
 *   links and the referer attached to every media item.
 * @returns {Object[]} One release per card, or an empty array when the page
 *   contains no release list.
 */
function scrapeLatest(html, site) {
	const { document } = new JSDOM(html).window;
	const { origin } = new URL(site.url);

	const lists = document.querySelectorAll('.video-releases-list');
	const videos = lists[lists.length - 1];

	// A page without a list container has nothing to scrape; report no releases instead of throwing.
	if (!videos) {
		return [];
	}

	// Image sources that do not start with "http" are treated as protocol-relative.
	const withScheme = (src) => (/^http/.test(src) ? src : `https:${src}`);

	// Download settings shared by every media item (the original note here read "slow CDN?").
	const toMedia = (src) => ({
		src,
		referer: site.url,
		attempts: 5,
		interval: 5000,
		concurrency: 1,
	});

	return Array.from(videos.querySelectorAll('.card'), (scene) => {
		const release = { site };

		// Resolve against the origin so relative and absolute hrefs both give a valid URL.
		release.url = new URL(scene.querySelector(':scope > a').href, origin).href;
		release.entryId = scene.dataset.videoId;
		release.title = scene.querySelector('.card-title').textContent;
		release.date = moment.utc(scene.dataset.date, 'MMMM DD, YYYY').toDate();
		release.actors = Array.from(scene.querySelectorAll('.actors a'), (el) => el.textContent);

		const posterEl = scene.querySelector('.single-image');
		const posterSrc = posterEl && posterEl.dataset.src;

		// Leave the poster out when the card has no `data-src`, rather than emitting "https:undefined".
		if (posterSrc) {
			release.poster = toMedia(withScheme(posterSrc));
		}

		// Thumbnails without a `data-src` are dropped for the same reason.
		release.photos = Array.from(scene.querySelectorAll('.rollover-thumbs img'), (el) => el.dataset.src)
			.filter(Boolean)
			.map((src) => toMedia(withScheme(src)));

		const teaserEl = scene.querySelector('source');

		if (teaserEl) {
			const teaserSrc = teaserEl.dataset.src;

			// Give protocol-relative teaser URLs a scheme; any other value is passed through untouched.
			release.teaser = toMedia(/^\/\//.test(teaserSrc) ? `https:${teaserSrc}` : teaserSrc);
		}

		return release;
	});
}
/**
 * Scrape a single scene page.
 *
 * All data is read from inside the `#t2019-2col` element.
 *
 * @param {string} html - HTML of the scene page.
 * @param {Object} site - Site record; `site.url` is used as the referer for
 *   every media item.
 * @param {string} url - URL the page was fetched from; stored as `release.url`.
 * @returns {Object|null} The scraped release, or `null` when the page has no
 *   `#t2019-2col` container.
 */
function scrapeScene(html, site, url) {
	const { document } = new JSDOM(html).window;
	const scene = document.querySelector('#t2019-2col');

	// Without the scene container there is nothing to read; mirror the "not found" result of fetchScene.
	if (!scene) {
		return null;
	}

	// Trimmed text of the first match, or null when the element is absent.
	const textOf = (selector) => {
		const el = scene.querySelector(selector);

		return el ? el.textContent.trim() : null;
	};

	// Image sources that do not start with "http" are treated as protocol-relative.
	const withScheme = (src) => (/^http/.test(src) ? src : `https:${src}`);

	// Download settings shared by every media item (the original note here read "unreliable CDN").
	const toMedia = (src) => ({
		src,
		referer: site.url,
		attempts: 5,
		interval: 5000,
		concurrency: 1,
	});

	const release = { site };

	release.url = url;
	release.title = textOf('.t2019-stitle');
	release.description = textOf('#t2019-description');
	release.actors = Array.from(scene.querySelectorAll('#t2019-models a'), (el) => el.textContent);

	// With two or more spans the first holds the date and the second the duration;
	// a lone span holds only the duration.
	const timeEls = Array.from(scene.querySelectorAll('#t2019-stime span'));
	const durationEl = timeEls.length > 1 ? timeEls[1] : timeEls[0];

	if (timeEls.length > 1) {
		release.date = moment.utc(timeEls[0].textContent, 'MMMM DD, YYYY').toDate();
	}

	const digits = durationEl ? durationEl.textContent.match(/\d+/) : null;

	// The first number in the text is multiplied by 60 (presumably minutes to seconds);
	// the duration is left unset when no number is present.
	if (digits) {
		release.duration = Number(digits[0]) * 60;
	}

	// Thumbnails with an empty `src` are dropped rather than emitted as a bare "https:".
	release.photos = Array.from(scene.querySelectorAll('#t2019-main .t2019-thumbs img'), (el) => el.src)
		.filter(Boolean)
		.map((src) => toMedia(withScheme(src)));

	const posterEl = scene.querySelector('#no-player-image');
	const videoEl = scene.querySelector('video');

	// Prefer the placeholder image; fall back to the video element's poster.
	const posterSrc = (posterEl && posterEl.src) || (videoEl && videoEl.poster);

	if (posterSrc) {
		release.poster = toMedia(withScheme(posterSrc));
	}

	const trailerEl = scene.querySelector('#t2019-video source');

	if (trailerEl) {
		const trailerSrc = trailerEl.src;

		// Give protocol-relative trailer URLs a scheme; any other value is passed through untouched.
		release.trailer = toMedia(/^\/\//.test(trailerSrc) ? `https:${trailerSrc}` : trailerSrc);
	}

	return release;
}
/**
 * Download one page of the site's latest releases and scrape it.
 *
 * @param {Object} site - Site record; `site.url` is the listing URL.
 * @param {number} [page=1] - Page number, sent as the `page` query parameter.
 * @returns {Promise<Object[]>} Releases scraped from the page, or an empty
 *   array when the response status is not 200.
 */
async function fetchLatest(site, page = 1) {
	// Set the parameter through URL so a listing URL that already carries a
	// query string is extended correctly instead of gaining a second "?".
	const url = new URL(site.url);
	url.searchParams.set('page', page);

	const res = await http.get(url.href);

	if (res.statusCode !== 200) {
		return [];
	}

	return scrapeLatest(res.body.toString(), site);
}
/**
 * Download a scene page and scrape it.
 *
 * @param {string} url - URL of the scene page.
 * @param {Object} site - Site record, passed through to the scraper.
 * @returns {Promise<Object|null>} The scraped release, or `null` when the
 *   response status is not 200.
 */
async function fetchScene(url, site) {
	const res = await http.get(url);

	if (res.statusCode !== 200) {
		return null;
	}

	return scrapeScene(res.body.toString(), site, url);
}
module.exports = {
fetchLatest,
fetchScene,
2020-01-14 20:45:30 +00:00
};