Refactored 21sextury scraper.

This commit is contained in:
2019-12-09 05:00:49 +01:00
parent d874c508de
commit 04a89efa58
52 changed files with 2621 additions and 2068 deletions

View File

@@ -6,7 +6,6 @@ const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
/* eslint-disable newline-per-chained-call */
function scrapeLatest(html, site) {
@@ -49,13 +48,16 @@ async function scrapeScene(html, url, site) {
const title = $('meta[itemprop="name"]').attr('content');
const description = $('.descr-box p').text(); // meta tags don't contain full description
const date = moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate();
const dateProp = $('meta[itemprop="uploadDate"]').attr('content');
const date = dateProp
? moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate()
: moment.utc($('.title-border:nth-child(2) p').text(), 'MM.DD.YYYY').toDate();
const actors = $('.pornstar-card > a').map((actorIndex, actorElement) => $(actorElement).attr('title')).toArray();
const likes = Number($('.info-panel.likes .likes').text());
const duration = Number($('.info-panel.duration .duration').text().slice(0, -4)) * 60;
const rawTags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const tags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const poster = $('#video').attr('poster');
const photos = $('.photo-slider-guest .card a').map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
@@ -63,21 +65,7 @@ async function scrapeScene(html, url, site) {
const trailer540 = $('source[res="540"]').attr('src');
const trailer720 = $('source[res="720"]').attr('src');
/*
* broken as of nov 2019
const { origin } = new URL($('.pornstar-card meta[itemprop="url"]').first().attr('content'));
const [channelSite, tags] = await Promise.all([
// don't find site if original is already specific
site.isFallback ? knex('sites').where({ url: origin }).first() : site,
matchTags(rawTags),
]);
*/
const tags = await matchTags(rawTags);
return {
// url: channelSite ? `${channelSite.url}${new URL(url).pathname}` : url,
url,
entryId,
title,
@@ -88,20 +76,19 @@ async function scrapeScene(html, url, site) {
tags,
poster,
photos,
trailer: trailer540
? {
src: trailer540,
quality: 540,
}
: {
// backup
trailer: [
{
src: trailer720,
quality: 720,
},
{
src: trailer540,
quality: 540,
},
],
rating: {
likes,
},
// site: channelSite || site,
site,
};
}