forked from DebaucheryLibrarian/traxxx
Allowing scrapers to return raw tags and site URLs or slugs, to gradually remove site and tag fetching from individual scrapers. Added media and deep fetching support to Perv City scraper.
This commit is contained in:
@@ -2,9 +2,25 @@
|
||||
|
||||
const bhttp = require('bhttp');
|
||||
const cheerio = require('cheerio');
|
||||
const { JSDOM } = require('jsdom');
|
||||
const moment = require('moment');
|
||||
|
||||
function scrape(html, site) {
|
||||
/**
 * Request poster and trailer information for a set from gettoken.php.
 *
 * @param {*} entryId - Set ID, posted as the `setId` field.
 * @returns {Promise<{poster: *, trailer: *}|null>} `poster` is the body's
 *   `TrailerImg`; `trailer` is `TrailerPath`, or `Trailerfallback` when the
 *   former is falsy. Resolves to null when the status code is not 200.
 */
async function getTrailer(entryId) {
    const response = await bhttp.post('https://www.pervcity.com/gettoken.php', { setId: entryId });

    // Anything other than a 200 is treated as "no trailer data available".
    if (response.statusCode !== 200) {
        return null;
    }

    const { body } = response;
    const trailer = body.TrailerPath || body.Trailerfallback;

    return { poster: body.TrailerImg, trailer };
}
|
||||
|
||||
function scrapeLatestScene(html, site) {
|
||||
const $ = cheerio.load(html, { normalizeWhitespace: true });
|
||||
|
||||
const entryId = $('li').attr('id');
|
||||
@@ -15,6 +31,9 @@ function scrape(html, site) {
|
||||
const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas
|
||||
const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate();
|
||||
|
||||
const poster = $('a:nth-child(2) > img').attr('src');
|
||||
const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray();
|
||||
|
||||
const stars = $('img[src*="/star.png"]')
|
||||
.toArray()
|
||||
.map(element => $(element).attr('src'))
|
||||
@@ -26,6 +45,8 @@ function scrape(html, site) {
|
||||
title,
|
||||
actors,
|
||||
date,
|
||||
poster,
|
||||
photos,
|
||||
rating: {
|
||||
stars,
|
||||
},
|
||||
@@ -33,17 +54,87 @@ function scrape(html, site) {
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Build a release object from a scene page.
 *
 * Reads the entry ID, title, description, duration and tags from the page, then
 * asks getTrailer for the poster and trailer belonging to the entry ID.
 *
 * @param {string} html - Markup of the scene page.
 * @param {string} url - URL the page was fetched from; copied onto the release.
 * @param {object} site - Site the scene belongs to; copied onto the release.
 * @returns {Promise<object>} Release with url, site, entryId, title, description,
 *   duration (in seconds) and tags, plus poster and trailer when getTrailer
 *   returned data.
 */
async function scrapeScene(html, url, site) {
    const { document } = new JSDOM(html).window;

    const release = { url, site };

    release.entryId = document.querySelector('input#set_ID').value;

    release.title = document.querySelector('title').textContent;
    release.description = document.querySelector('.player_data').textContent.trim();

    // The first two numbers in the duration text are taken as minutes and seconds.
    const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent;
    const [minutes, seconds] = durationString.match(/\d+/g);

    release.duration = Number(minutes) * 60 + Number(seconds);
    release.tags = document.querySelector('meta[name="keywords"]').content.split(',');

    // getTrailer resolves to null on a non-200 response; destructuring that
    // result directly would throw, so the media fields are only set when present.
    const trailerInfo = await getTrailer(release.entryId);

    if (trailerInfo) {
        release.poster = trailerInfo.poster;
        release.trailer = { src: trailerInfo.trailer };
    }

    return release;
}
|
||||
|
||||
/**
 * Read the set ID from a fallback landing page.
 *
 * @param {string} html - Markup of the landing page.
 * @returns {string} Value of the `input#set_ID` element.
 */
function scrapeFallbackLanding(html) {
    const dom = new JSDOM(html);
    const setIdInput = dom.window.document.querySelector('input#set_ID');

    return setIdInput.value;
}
|
||||
|
||||
/**
 * Build a release object from the popup markup used for fallback scenes.
 *
 * @param {string} html - Markup of the popup.
 * @param {*} entryId - Set ID; copied onto the release and passed to getTrailer.
 * @param {string} url - Scene URL; copied onto the release.
 * @param {object} site - Site the scene was requested for; copied onto the release.
 * @returns {Promise<object>} Release with url, entryId, site, title, description,
 *   date, actors and channel, plus poster and trailer when getTrailer returned data.
 */
async function scrapeFallbackScene(html, entryId, url, site) {
    const { document } = new JSDOM(html).window;
    const release = { url, entryId, site };

    release.title = document.querySelector('.popup_data_set_head label').textContent;
    release.description = document.querySelector('.popup_data_set_des p').textContent.trim();
    release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate();
    release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent);

    // getTrailer resolves to null on a non-200 response; destructuring that
    // result directly would throw, so the media fields are only set when present.
    const trailerInfo = await getTrailer(release.entryId);

    if (trailerInfo) {
        release.poster = trailerInfo.poster;
        release.trailer = { src: trailerInfo.trailer };
    }

    // The channel is taken from the alt text of the image in the popup header.
    release.channel = document.querySelector('.popup_left_top div img').alt;

    return release;
}
|
||||
|
||||
/**
 * Fetch one page of latest updates for a site and scrape every entry on it.
 *
 * Page 1 is requested from final_latestupdateview.php; any other page is
 * requested from final_load_latestupdate_grid_view.php with a limit derived
 * from the page number.
 *
 * @param {object} site - Site to fetch; `site.url` and `site.parameters.tourId` are read.
 * @param {number} [page=1] - Page number.
 * @returns {Promise<Array>} One scrapeLatestScene result per entry in `total_arr`.
 */
async function fetchLatest(site, page = 1) {
    const res = page === 1
        ? await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`)
        : await bhttp.get(`${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`);
    const elements = JSON.parse(res.body.toString());

    // total_arr is a key-value object for final_load_latestupdate_grid_view.php,
    // so its values are mapped rather than the object itself
    return Object.values(elements.total_arr).map(html => scrapeLatestScene(html, site));
}
|
||||
|
||||
/**
 * Fetch a scene page and scrape it into a release.
 *
 * For fallback sites the landing page only supplies the set ID; the scene data
 * is then requested from set_popupvideo.php and scraped from that response.
 *
 * @param {string} url - Scene URL.
 * @param {object} site - Site the scene belongs to; `site.isFallback` selects the fallback flow.
 * @returns {Promise<object|null>} The scraped release, or null when the scene
 *   page or the fallback popup does not respond with status 200.
 */
async function fetchScene(url, site) {
    const res = await bhttp.get(url);

    if (res.statusCode !== 200) {
        return null;
    }

    if (!site.isFallback) {
        return scrapeScene(res.body.toString(), url, site);
    }

    const entryId = scrapeFallbackLanding(res.body.toString());

    const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {
        setId: entryId,
    });

    // Treat a failed popup request like a failed page request instead of
    // handing an error page to the scraper.
    if (fallbackRes.statusCode !== 200) {
        return null;
    }

    return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site);
}
|
||||
|
||||
// Public scraper interface: latest-updates listing and single-scene fetching.
module.exports = { fetchLatest, fetchScene };
|
||||
|
||||
Reference in New Issue
Block a user