traxxx/src/scrapers/perfectgonzo.js

135 lines
3.7 KiB
JavaScript

'use strict';
const blake2 = require('blake2');
const knex = require('../knex');
const qu = require('../utils/qu');
/**
 * Query the `entities` table for the slugs of every entity whose parent
 * entity has the slug 'perfectgonzo'.
 *
 * @returns {Promise<*>} whatever the knex `pluck('entities.slug')` query
 *   resolves with (presumably an array of slug strings — confirm against knex).
 */
async function getSiteSlugs() {
  const childSlugs = await knex('entities')
    .join('entities AS parents', 'parents.id', 'entities.parent_id')
    .where('parents.slug', 'perfectgonzo')
    .pluck('entities.slug');

  return childSlugs;
}
/**
 * Hash an identifier with blake2b (digestLength 8) and return the digest
 * in hexadecimal encoding.
 *
 * @param {string} identifier - value to hash; it is converted with `Buffer.from`.
 * @returns {string} the result of `digest('hex')`.
 */
function getHash(identifier) {
  const hasher = blake2.createHash('blake2b', { digestLength: 8 });
  const payload = Buffer.from(identifier);

  hasher.update(payload);

  return hasher.digest('hex');
}
/**
 * Collect the names listed after the "Male Models" label in a tag container.
 *
 * The container's child nodes are read in order and nodes whose trimmed text
 * is empty are ignored. The names are the remaining nodes between the node
 * whose text is exactly "Male Models" and the next text node (nodeType 3),
 * or the end of the container when no further text node follows.
 *
 * @param {?{childNodes: Iterable<{nodeType: number, textContent: string}>}} tagContainer
 *   element holding the labels and tags; a falsy value yields an empty list.
 * @returns {string[]} trimmed names, or an empty array when the label is absent.
 */
function extractMaleModelsFromTags(tagContainer) {
  if (!tagContainer) {
    return [];
  }

  const TEXT_NODE = 3;

  const nodes = Array.from(tagContainer.childNodes, node => ({
    type: node.nodeType,
    text: node.textContent.trim(),
  })).filter(node => node.text.length > 0);

  const labelIndex = nodes.findIndex(node => node.text === 'Male Models');

  if (labelIndex === -1) {
    return [];
  }

  const nextLabelIndex = nodes.findIndex((node, index) => index > labelIndex && node.type === TEXT_NODE);

  // findIndex yields -1 when "Male Models" is the last label. Passing -1 to
  // slice would cut off the final name, so run to the end of the list instead.
  const endIndex = nextLabelIndex === -1 ? nodes.length : nextLabelIndex;

  return nodes.slice(labelIndex + 1, endIndex).map(node => node.text);
}
/**
 * Find which site slug occurs in a photo URL.
 *
 * Candidate slugs come from `channel.children` when `channel.type` is
 * 'network', otherwise from `channel.parent.children`; when neither list is
 * available they are loaded with `getSiteSlugs()`.
 *
 * @param {string} photo - photo URL to search.
 * @param {Object} channel - entity with `type` and either `children` or `parent.children`.
 * @returns {Promise<?string>} the matched slug, or null when no slug occurs
 *   in the URL or there are no slugs to look for.
 */
async function extractChannelFromPhoto(photo, channel) {
  const siblings = channel.type === 'network' ? channel.children : channel.parent?.children;
  const candidates = siblings?.map(child => child.slug) || await getSiteSlugs();

  // Slugs are matched literally, so regex metacharacters are escaped. Empty
  // slugs are dropped because an empty alternative matches at every position.
  const alternatives = candidates
    .filter(Boolean)
    .map(slug => String(slug).replace(/[.*+?^${}()|[\]\\]/g, '\\$&'));

  // An empty pattern would "match" the empty string and report '' as a channel.
  if (alternatives.length === 0) {
    return null;
  }

  const channelMatch = photo.match(new RegExp(alternatives.join('|')));

  return channelMatch ? channelMatch[0] : null;
}
/**
 * Turn the items of a listing page into release objects.
 *
 * For each item the title and link are read from its `a` element, the date
 * from `.nm-date`, images from `.bloc-link img` (first one becomes the poster,
 * the rest the photos), tags from `.dropdown ul a` (first entry skipped) and
 * the duration from `.dropdown p:first-child`. Actors are the title split on
 * '&'. The entry ID hashes the site slug, the third path segment of the
 * release URL and the ISO date.
 *
 * @param {Array<{query: Object}>} scenes - listing items exposing a `query` helper.
 * @param {Object} site - entity providing `url` and `slug`.
 * @returns {Promise<Object[]>} one release per item.
 */
async function scrapeLatest(scenes, site) {
  const toRelease = ({ query }) => {
    const title = query.q('a', 'title');
    const url = query.url('a', 'href', { origin: site.url });
    const date = query.date('.nm-date', 'MM/DD/YYYY');

    const [, , slug] = new URL(url).pathname.split('/');
    const entryId = getHash(`${site.slug}${slug}${date.toISOString()}`);
    const actors = title.split('&').map(name => name.trim());

    const [poster, ...photos] = query.imgs('.bloc-link img');
    const tags = query.cnts('.dropdown ul a').slice(1);
    const duration = query.duration('.dropdown p:first-child');

    return {
      title,
      url,
      date,
      entryId,
      actors,
      poster,
      photos,
      tags,
      duration,
    };
  };

  return scenes.map(toRelease);
}
/**
 * Build a release object from a scene page.
 *
 * Title, date, description, duration, actors, tags, photos, poster and trailer
 * are read through the page's `query` helper. Names returned by
 * `extractMaleModelsFromTags` for `.tag-container` are appended to the actors,
 * and a '4k' tag is added when the second ribbon span contains "4K".
 *
 * When the page has photos, the channel is looked up from the first photo.
 * If one is found, the release URL is rewritten to `https://<channel>.com`
 * plus the original path, and an entry ID is hashed from the channel, the
 * third path segment and the ISO date. Otherwise no entry ID is set.
 *
 * @param {{query: Object}} item - scene page exposing a `query` helper.
 * @param {Object} site - entity the scene was requested for; stored as `release.site`.
 * @param {string} url - URL the scene page was fetched from.
 * @returns {Promise<Object>} the release.
 */
async function scrapeScene({ query }, site, url) {
  const release = {
    url,
    site,
    title: query.cnt('#movie-header h2'),
    date: query.date('#movie-header div span', 'MMMM DD, YYYY', /\w+ \d{1,2}, \d{4}/),
    description: query.cnt('.container .mg-md'),
    duration: query.duration('#video-ribbon .container > div > span:nth-child(3)'),
  };

  const listedActors = query.cnts('#video-info a');
  const maleModels = extractMaleModelsFromTags(query.q('.tag-container'));
  release.actors = listedActors.concat(maleModels);

  const sceneTags = query.cnts('.tag-container a');
  const ribbonLabel = query.cnt('#video-ribbon .container > div > span:nth-child(2)');
  release.tags = /4K/.test(ribbonLabel) ? sceneTags.concat('4k') : sceneTags;

  release.photos = query.all('.bxslider_pics img').map(image => image.dataset.original || image.src);
  release.poster = query.poster();

  const trailerSrc = query.trailer();

  if (trailerSrc) {
    release.trailer = { src: trailerSrc };
  }

  // Without photos there is nothing to derive the channel from.
  if (release.photos.length === 0) {
    return release;
  }

  release.channel = await extractChannelFromPhoto(release.photos[0], site);

  if (!release.channel) {
    return release;
  }

  const { pathname } = new URL(url);
  const [, , slug] = pathname.split('/');

  release.url = `https://${release.channel}.com${pathname}`;
  release.entryId = getHash(`${release.channel}${slug}${release.date.toISOString()}`);

  return release;
}
/**
 * Fetch one page of a site's movie listing and scrape the items on it.
 *
 * @param {Object} site - entity whose `url` is the base of the listing URL.
 * @param {number} [page=1] - page number inserted into `/movies/page-<page>`.
 * @returns {Promise<*>} the result of `scrapeLatest` when the response is ok,
 *   otherwise the response's `status`.
 */
async function fetchLatest(site, page = 1) {
  const listingUrl = `${site.url}/movies/page-${page}`;
  const response = await qu.getAll(listingUrl, '#content-main [class^="item"]');

  if (!response.ok) {
    return response.status;
  }

  return scrapeLatest(response.items, site);
}
/**
 * Fetch a scene page and scrape it into a release.
 *
 * @param {string} url - scene page URL.
 * @param {Object} channel - entity passed through to `scrapeScene`.
 * @returns {Promise<*>} the result of `scrapeScene` when the response is ok,
 *   otherwise the response's `status`.
 */
async function fetchScene(url, channel) {
  const response = await qu.get(url);

  if (!response.ok) {
    return response.status;
  }

  return scrapeScene(response.item, channel, url);
}
// Exported scraper interface: the listing fetcher and the single-scene fetcher.
// The scrape* and extract* helpers above stay private to this module.
module.exports = {
fetchLatest,
fetchScene,
};