Refactored 21sextury scraper.

2019-12-09 05:00:49 +01:00
parent d874c508de
commit 04a89efa58
52 changed files with 2621 additions and 2068 deletions

View File

@@ -1,11 +1,51 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
+async function fetchPhotos(photoPath) {
+  const res = await bhttp.get(`https://21sextury.com${photoPath}`);
+  return res.body.toString();
+}
+
+function scrapePhotos(html) {
+  const $ = cheerio.load(html, { normalizeWhitespace: true });
+
+  const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
+    .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
+
+  const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
+    .map((photoIndex, photoElement) => $(photoElement)
+      .attr('src'))
+    // .replace('_tb.jpg', '.jpg')) does not always work
+    .toArray();
+
+  return unlockedPhotos.concat(lockedThumbnails);
+}
+
+async function getPhotos(photoPath) {
+  if (!photoPath || photoPath.match('join')) {
+    return [];
+  }
+
+  const html = await fetchPhotos(photoPath);
+  const $ = cheerio.load(html, { normalizeWhitespace: true });
+
+  const photos = scrapePhotos(html);
+  const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
+
+  const otherPhotos = await Promise.map(pages, async (pagePath) => {
+    const pageHtml = await fetchPhotos(pagePath);
+    return scrapePhotos(pageHtml);
+  }, {
+    concurrency: 2,
+  });
+
+  return photos.concat(otherPhotos.flat());
+}

function scrape(html, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
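
The new getPhotos walks every paginator link with bluebird's Promise.map, capped at two concurrent requests so gallery pages are fetched politely. In isolation the pattern looks like this; a minimal sketch, with fetchPage and the example URLs as placeholders:

'use strict';

const Promise = require('bluebird');
const bhttp = require('bhttp');

async function fetchPage(url) {
  const res = await bhttp.get(url);

  return res.body.toString();
}

async function fetchAllPages(urls) {
  // resolves like Promise.all, but never runs more than two requests at once
  return Promise.map(urls, async pageUrl => fetchPage(pageUrl), { concurrency: 2 });
}

fetchAllPages(['https://example.com/page/1', 'https://example.com/page/2'])
  .then(pages => console.log(pages.length)); // 2
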
@@ -14,12 +54,13 @@ function scrape(html, site) {
  return scenesElements.reduce((accReleases, element) => {
    const siteName = $(element).find('.studioName a').attr('title');

-    if (site.parameters && site.parameters.filter && siteName.toLowerCase() !== site.name.toLowerCase()) {
+    if (!site.url && siteName.toLowerCase() !== site.name.toLowerCase()) {
+      // using generic overview as fallback, scene from different site
      return accReleases;
    }

    const sceneLinkElement = $(element).find('.sceneTitle a');
-    const url = `${site.url}${sceneLinkElement.attr('href')}`;
+    const url = `${site.url || 'https://www.21sextury.com'}${sceneLinkElement.attr('href')}`;
    const title = sceneLinkElement.attr('title').trim();

    const entryId = $(element).attr('data-itemid');
@@ -32,6 +73,9 @@ function scrape(html, site) {
      .map((actorIndex, actorElement) => $(actorElement).attr('title'))
      .toArray();
    const poster = $(element).find('.imgLink img').attr('data-original');
+
+    const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
+
    const [likes, dislikes] = $(element).find('.value')
      .toArray()
      .map(value => Number($(value).text()));
@@ -44,6 +88,10 @@ function scrape(html, site) {
      title,
      actors,
      date,
+      poster,
+      trailer: {
+        src: trailer,
+      },
      rating: {
        likes,
        dislikes,
@@ -58,25 +106,21 @@ async function scrapeScene(html, url, site) {
  const $ = cheerio.load(html, { normalizeWhitespace: true });
  const sceneElement = $('#videoWrapper');

  const json = $('script[type="application/ld+json"]').html();
+  const videoJson = $('script:contains("ScenePlayerOptions")').html();
+  const videoDataString = videoJson.slice(videoJson.indexOf('= {') + 2, videoJson.indexOf('};') + 1);
+
  const data = JSON.parse(json)[0];
+  const videoData = JSON.parse(videoDataString);

  const entryId = new URL(url).pathname.split('/').slice(-1)[0];

-  const title = data.isPartOf ? data.isPartOf.name : data.name;
-  const dataDate = moment.utc(data.dateCreated, 'YYYY-MM-DD');
+  const title = videoData?.playerOptions?.sceneInfos?.sceneTitle || (data.isPartOf && data.isPartOf !== 'TBD' ? data.isPartOf.name : data.name);
+  const dataDate = moment.utc(videoData?.playerOptions?.sceneInfos?.sceneReleaseDate, 'YYYY-MM-DD');

  const date = dataDate.isValid()
    ? dataDate.toDate()
    : moment.utc(sceneElement.find('.updatedDate').text().trim(), 'MM-DD-YYYY').toDate();

-  const actors = data.actor
-    .sort(({ gender: genderA }, { gender: genderB }) => {
-      if (genderA === 'female' && genderB === 'male') return -1;
-      if (genderA === 'male' && genderB === 'female') return 1;
-
-      return 0;
-    })
-    .map(actor => actor.name);
+  const actors = data.actor.map(actor => actor.name);

  const description = data.description || null; // prevent empty string

  const likes = Number(sceneElement.find('.rating .state_1 .value').text());
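
The videoDataString slice cuts the ScenePlayerOptions object literal out of an inline script: indexOf('= {') + 2 lands exactly on the opening brace, and indexOf('};') + 1 makes the exclusive slice end fall just past the closing brace. A toy version of that arithmetic, with a made-up script snippet:

const script = 'window.ScenePlayerOptions = {"playerOptions":{"sceneInfos":{"sceneTitle":"Demo"}}};';

const start = script.indexOf('= {') + 2; // index of the opening brace
const end = script.indexOf('};') + 1; // slice end is exclusive, so the closing brace is kept

const videoData = JSON.parse(script.slice(start, end));

console.log(videoData.playerOptions.sceneInfos.sceneTitle); // 'Demo'
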
@@ -84,27 +128,18 @@ async function scrapeScene(html, url, site) {
  const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

-  const rawTags = data.keywords.split(', ');
+  const poster = videoData.picPreview;
+  const trailer = `${videoData.playerOptions.host}${videoData.url}`;
+
+  const photoPath = $('.picturesItem a').attr('href');
+  const photos = await getPhotos(photoPath, site);
+
+  const tags = data.keywords.split(', ');

  const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');
-  const siteId = siteName && siteName.replace(/\s+/g, '').toLowerCase();
-
-  const [channelSite, tags] = await Promise.all([
-    site.isFallback
-      ? knex('sites')
-        .where({ slug: siteId })
-        .orWhereRaw('name = ? collate NOCASE', [siteName])
-        .first()
-      : site,
-    matchTags(rawTags),
-  ]);
-
-  // only replace generic URL with site URL if site is not marked to fetch scenes from generic site
-  const originalUrl = channelSite && !(channelSite.parameters && JSON.parse(channelSite.parameters).filter)
-    ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`
-    : url;
+  const channel = siteName && siteName.replace(/\s+/g, '').toLowerCase();

  return {
-    url: originalUrl,
+    url,
    entryId,
    title,
    date,
@@ -112,22 +147,30 @@ async function scrapeScene(html, url, site) {
    description,
    duration,
    tags,
+    poster,
+    photos,
+    trailer: {
+      src: trailer,
+    },
    rating: {
      likes,
      dislikes,
    },
-    site: channelSite || site,
+    site,
+    channel,
  };
}

async function fetchLatest(site, page = 1) {
-  const res = await bhttp.get(`${site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`);
+  const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`;
+  const res = await bhttp.get(url);

  return scrape(res.body.toString(), site);
}

async function fetchUpcoming(site) {
-  const res = await bhttp.get(`${site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`);
+  const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`;
+  const res = await bhttp.get(url);

  return scrape(res.body.toString(), site);
}
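
fetchLatest and fetchUpcoming now decide between a dedicated site URL and the generic network overview with a plain site.url || ... fallback instead of inspecting site.parameters.filter, and the !site.url guard in scrape discards scenes belonging to other studios whenever the overview is used. A hypothetical call; the site object here is illustrative, not the real database row:

const site = {
  name: 'Footsie Babes',
  url: null, // no dedicated site URL, so the generic 21sextury.com overview is scraped
};

fetchLatest(site, 1).then((releases) => {
  // only scenes whose studio name matches the site name survive the fallback filter
  console.log(releases.map(release => release.title));
});
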

View File

@@ -6,7 +6,6 @@ const { JSDOM } = require('jsdom');
const moment = require('moment');

-const knex = require('../knex');
const { matchTags } = require('../tags');

/* eslint-disable newline-per-chained-call */
function scrapeLatest(html, site) {
@@ -49,13 +48,16 @@ async function scrapeScene(html, url, site) {
  const title = $('meta[itemprop="name"]').attr('content');
  const description = $('.descr-box p').text(); // meta tags don't contain full description

-  const date = moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate();
+  const dateProp = $('meta[itemprop="uploadDate"]').attr('content');
+  const date = dateProp
+    ? moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate()
+    : moment.utc($('.title-border:nth-child(2) p').text(), 'MM.DD.YYYY').toDate();

  const actors = $('.pornstar-card > a').map((actorIndex, actorElement) => $(actorElement).attr('title')).toArray();
  const likes = Number($('.info-panel.likes .likes').text());
  const duration = Number($('.info-panel.duration .duration').text().slice(0, -4)) * 60;

-  const rawTags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
+  const tags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
  const poster = $('#video').attr('poster');
  const photos = $('.photo-slider-guest .card a').map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
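
The new dateProp guard covers pages whose uploadDate meta tag is missing by falling back to the visible date text, which uses dots rather than dashes; passing moment an explicit format string keeps both paths strict. With illustrative values:

const moment = require('moment');

console.log(moment.utc('2019-12-09', 'YYYY-MM-DD').toDate()); // from the uploadDate meta tag
console.log(moment.utc('12.09.2019', 'MM.DD.YYYY').toDate()); // from the on-page fallback text
// both print 2019-12-09T00:00:00.000Z
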
@@ -63,21 +65,7 @@ async function scrapeScene(html, url, site) {
  const trailer540 = $('source[res="540"]').attr('src');
  const trailer720 = $('source[res="720"]').attr('src');

-  /*
-   * broken as of nov 2019
-  const { origin } = new URL($('.pornstar-card meta[itemprop="url"]').first().attr('content'));
-
-  const [channelSite, tags] = await Promise.all([
-    // don't find site if original is already specific
-    site.isFallback ? knex('sites').where({ url: origin }).first() : site,
-    matchTags(rawTags),
-  ]);
-  */
-
-  const tags = await matchTags(rawTags);

  return {
-    // url: channelSite ? `${channelSite.url}${new URL(url).pathname}` : url,
    url,
    entryId,
    title,
@@ -88,20 +76,19 @@ async function scrapeScene(html, url, site) {
    tags,
    poster,
    photos,
-    trailer: trailer540
-      ? {
-        src: trailer540,
-        quality: 540,
-      }
-      : {
-        // backup
+    trailer: [
+      {
        src: trailer720,
        quality: 720,
      },
+      {
+        src: trailer540,
+        quality: 540,
+      },
+    ],
    rating: {
      likes,
    },
-    // site: channelSite || site,
    site,
  };
}
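
trailer is now always an array of quality variants rather than a single object chosen with a ternary, which moves the pick-a-source decision to the consumer; note that a missing <source> tag still yields an entry whose src is undefined. One way a caller might select the best available source; a hypothetical helper, not part of this commit:

function pickTrailer(trailers) {
  // drop variants whose <source> tag was missing, then prefer the highest quality
  return trailers
    .filter(trailer => trailer.src)
    .sort((trailerA, trailerB) => trailerB.quality - trailerA.quality)[0] || null;
}

console.log(pickTrailer([
  { src: undefined, quality: 720 },
  { src: 'https://cdn.example.com/trailer-540.mp4', quality: 540 },
])); // { src: 'https://cdn.example.com/trailer-540.mp4', quality: 540 }
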

View File

@@ -8,7 +8,6 @@ const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
-const pluckPhotos = require('../utils/pluck-photos');

async function getPhoto(url) {
const res = await bhttp.get(url);
@@ -20,7 +19,7 @@ async function getPhoto(url) {
  return photoUrl;
}

-async function getPhotos(albumUrl, site, siteUrl) {
+async function getPhotos(albumUrl) {
  const res = await bhttp.get(albumUrl);
  const html = res.body.toString();
const { document } = new JSDOM(html).window;
@@ -28,15 +27,7 @@ async function getPhotos(albumUrl, site, siteUrl) {
  const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
  const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);

-  // dogfart has massive albums, pick 25 or specified number of photos: first, last and evenly inbetween
-  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
-  const photoIndexes = pluckPhotos(lastPhotoIndex, photoLimit);
-
-  if (photoLimit > 25) {
-    console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
-  }
-
-  const photoUrls = await Promise.map(photoIndexes, async (index) => {
+  const photoUrls = await Promise.map(Array.from({ length: lastPhotoIndex }, (value, index) => index + 1), async (index) => {
    const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;

    return getPhoto(pageUrl);
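
This commit drops the pluck-photos sampling here and in the two scrapers below, so albums are now fetched in full. The removed comment documents what the helper did: sample a large album down to photoLimit images, keeping the first and the last and spacing the rest evenly. The actual utils/pluck-photos implementation is not part of this diff; a plausible sketch of such a helper:

// sketch only: pick `limit` indexes from 0..lastIndex, keeping the first
// and the last and spacing the rest as evenly as integer rounding allows
function pluckPhotos(lastIndex, limit = 25) {
  if (lastIndex + 1 <= limit) {
    return Array.from({ length: lastIndex + 1 }, (value, index) => index);
  }

  const step = lastIndex / (limit - 1);

  return Array.from({ length: limit }, (value, index) => Math.round(index * step));
}

console.log(pluckPhotos(100, 5)); // [ 0, 25, 50, 75, 100 ]
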

View File

@@ -9,8 +9,6 @@ const moment = require('moment');
const { heightToCm } = require('../utils/convert');
const { matchTags } = require('../tags');
-const pluckPhotos = require('../utils/pluck-photos');

async function fetchPhotos(url) {
const res = await bhttp.get(url);
@@ -58,14 +56,7 @@ async function getPhotos(entryId, site, page = 1) {
    })
    : [];

-  const allPhotos = photos.concat(otherPhotos.flat());
-
-  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
-  const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
-  const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);
-
-  return pluckedPhotos;
+  return photos.concat(otherPhotos.flat());
}
function scrapeLatest(html, site) {

View File

@@ -8,7 +8,6 @@ const moment = require('moment');
const { fetchSites } = require('../sites');
const { matchTags } = require('../tags');
-const pluckPhotos = require('../utils/pluck-photos');
const defaultTags = {
hardx: [],
@@ -38,7 +37,7 @@ function scrapePhotos(html) {
  return unlockedPhotos.concat(lockedThumbnails);
}

-async function getPhotos(albumPath, siteDomain, site) {
+async function getPhotos(albumPath, siteDomain) {
  const albumUrl = `https://${siteDomain}${albumPath}`;
  const html = await fetchPhotos(albumUrl);
@@ -56,14 +55,7 @@ async function getPhotos(albumPath, siteDomain, site) {
    concurrency: 2,
  });

-  const allPhotos = photos.concat(otherPhotos.flat());
-
-  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
-  const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
-  const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);
-
-  return pluckedPhotos;
+  return photos.concat(otherPhotos.flat());
}
function scrape(html, site) {