Added Dogfart scraper. Added 'date added' property to release page.

2019-11-04 05:47:37 +01:00
parent d734b1f0b5
commit 5745cd33d8
25 changed files with 747 additions and 102 deletions

View File

@@ -91,7 +91,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     site.isFallback
       ? knex('sites')
-        .where({ id: siteId })
+        .where({ slug: siteId })
         .orWhereRaw('name = ? collate NOCASE', [siteName])
         .first()
       : site,

View File

@@ -69,10 +69,12 @@ async function scrapeScene(html, url, site) {
   const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
 
   const [channelSite, tags] = await Promise.all([
-    knex('sites')
-      .where({ slug: siteId })
-      .orWhere({ name: siteName })
-      .first(),
+    site.isFallback
+      ? knex('sites')
+        .where({ slug: siteId })
+        .orWhere({ name: siteName })
+        .first()
+      : site,
     matchTags(rawTags),
   ]);

src/scrapers/dogfart.js (new file, +170)
View File

@@ -0,0 +1,170 @@
'use strict';

/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');

const { matchTags } = require('../tags');
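
// Resolves a single photo gallery page to the URL of the full-size image it displays.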
async function getPhoto(url) {
  const res = await bhttp.get(url);
  const html = res.body.toString();
  const { document } = new JSDOM(html).window;

  const photoUrl = document.querySelector('.scenes-module img').src;

  return photoUrl;
}
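
// Samples an album: fetches the first, last and evenly spaced gallery pages up to photoLimit, and resolves each to its full-size photo URL with limited concurrency.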
async function getPhotos(albumUrl, site, siteUrl) {
  const res = await bhttp.get(albumUrl);
  const html = res.body.toString();
  const { document } = new JSDOM(html).window;

  const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
  const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+\.jpg/)[0], 10);

  // dogfart has massive albums; pick photoLimit (default 25) photos: the first, the last, and evenly spaced ones in between
  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
  const photoIndexes = [1]
    .concat(Array.from({ length: photoLimit - 2 }, (value, index) => Math.floor((index + 1) * (lastPhotoIndex / (photoLimit - 2)))))
    .concat(lastPhotoIndex);

  if (photoLimit > 25) {
    console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
  }

  const photoUrls = await Promise.map(photoIndexes, async (index) => {
    const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+\.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;

    return getPhoto(pageUrl);
  }, {
    concurrency: 5,
  });

  return photoUrls;
}
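
// Scrapes the network-wide 'recent updates' overview, keeping only the scenes that belong to the requested site.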
function scrapeLatest(html, site) {
  const { document } = new JSDOM(html).window;
  const sceneElements = Array.from(document.querySelectorAll('.recent-updates'));

  return sceneElements.reduce((acc, element) => {
    const siteUrl = element.querySelector('.help-block').textContent;

    if (siteUrl.toLowerCase() !== new URL(site.url).host) {
      // different dogfart site
      return acc;
    }

    const sceneLinkElement = element.querySelector('.thumbnail');
    const url = `https://dogfartnetwork.com${sceneLinkElement.href}`;
    const { pathname } = new URL(url);
    const entryId = `${site.slug}_${pathname.split('/')[4]}`;

    const title = element.querySelector('.scene-title').textContent;
    const actors = title.split(/[,&]|\band\b/).map(actor => actor.trim());

    const poster = `https:${element.querySelector('img').src}`;
    const trailer = sceneLinkElement.dataset.preview_clip_url;

    return [
      ...acc,
      {
        url,
        entryId,
        title,
        actors,
        poster,
        trailer: {
          src: trailer,
        },
        site,
      },
    ];
  }, []);
}
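
// Scrapes a scene page into a release object; when scraping through the network-wide fallback site, the concrete channel site is looked up in the database.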
async function scrapeScene(html, url, site) {
  const { document } = new JSDOM(html).window;

  const title = document.querySelector('.description-title').textContent;
  const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent);

  const metaDescription = document.querySelector('meta[itemprop="description"]');
  const description = metaDescription
    ? metaDescription.content
    : document.querySelector('.description')
      .textContent
      .replace(/[ \t\n]{2,}/g, ' ')
      .replace('...read more', '')
      .trim();

  const siteSlug = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
  const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
  const duration = moment
    .duration(document
      .querySelectorAll('.extra-info p')[1]
      .textContent
      .match(/\d+:\d+$/)[0])
    .asSeconds();

  const trailerElement = document.querySelector('.html5-video');
  const poster = `https:${trailerElement.dataset.poster}`;
  const { trailer } = trailerElement.dataset;

  const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0].href;
  const { origin, pathname } = new URL(url);
  const photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url);

  const stars = Number(document.querySelector('span[itemprop="average"]').textContent) / 2;
  const rawTags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);

  const [channelSite, tags] = await Promise.all([
    site.isFallback
      ? knex('sites')
        .where({ slug: siteSlug })
        .orWhere({ url: `https://${siteSlug}.com` })
        .first()
      : site,
    matchTags(rawTags),
  ]);

  return {
    url,
    title,
    description,
    actors,
    date,
    duration,
    poster,
    photos,
    trailer: {
      src: trailer,
    },
    tags,
    rating: {
      stars,
    },
    site: channelSite || site,
  };
}
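
// Fetches a page of the network-wide scene overview for scrapeLatest.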
async function fetchLatest(site, page = 1) {
  const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);

  return scrapeLatest(res.body.toString(), site);
}
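
// Fetches and scrapes a single scene page.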
async function fetchScene(url, site) {
  const res = await bhttp.get(url);

  return scrapeScene(res.body.toString(), url, site);
}

module.exports = {
  fetchLatest,
  fetchScene,
};

View File

@@ -5,6 +5,7 @@ const bangbros = require('./bangbros');
 const blowpass = require('./blowpass');
 const brazzers = require('./brazzers');
 const ddfnetwork = require('./ddfnetwork');
+const dogfart = require('./dogfart');
 const evilangel = require('./evilangel');
 const julesjordan = require('./julesjordan');
 const kink = require('./kink');
@@ -23,6 +24,8 @@ module.exports = {
   blowpass,
   brazzers,
   ddfnetwork,
+  dogfart,
+  dogfartnetwork: dogfart,
   evilangel,
   julesjordan,
   kink,

View File

@@ -3,8 +3,8 @@
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');
-const knex = require('knex');
+const knex = require('../knex');
 
 const { matchTags } = require('../tags');
 
 function scrapeLatest(html, site) {
@@ -75,7 +75,9 @@ async function scrapeScene(html, url, shootId, ratingRes, site) {
   const rawTags = $('.tag-list > a[href*="/tag"]').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
 
   const [channelSite, tags] = await Promise.all([
-    knex('sites').where({ slug: sitename }).first(),
+    site.isFallback
+      ? knex('sites').where({ slug: sitename }).first()
+      : site,
     matchTags(rawTags),
   ]);

View File

@@ -58,7 +58,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     knex('sites')
-      .where({ id: siteId })
+      .where({ slug: siteId })
       .orWhere({ url: `https://www.mofos.com${siteUrl}` })
       .orWhere({ name: sitename })
       .first(),

View File

@@ -70,7 +70,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     knex('sites')
-      .where({ id: siteId })
+      .where({ slug: siteId })
       .orWhere({ name: siteName })
       .first(),
     matchTags(rawTags),

View File

@@ -13,6 +13,8 @@ function scrapeLatest(html, site) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
   const sceneElements = $('.card.card--release').toArray();
 
+  console.log(sceneElements);
+
   return sceneElements.map((element) => {
     const sceneLinkElement = $(element).find('.card-info__title a');
     const title = sceneLinkElement.attr('title');
@@ -22,6 +24,8 @@ function scrapeLatest(html, site) {
     const date = moment.utc($(element).find('.card-info__meta-date').text(), 'MMMM DD, YYYY').toDate();
     const actors = $(element).find('.card-info__cast a').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();
 
+    console.log(date, actors, title);
+
     return {
       url,
       entryId,
@@ -54,6 +58,8 @@ async function scrapeScene(data, url, site) {
   const { likes, dislikes } = data.stats;
   const duration = data.videos.mediabook.length;
 
+  console.log(data);
+
   const rawTags = data.tags.map(tag => tag.name);
   const tags = await matchTags(rawTags);

View File

@@ -3,9 +3,9 @@
 const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
-const knex = require('knex');
 const moment = require('moment');
 
+const knex = require('../knex');
 const { matchTags } = require('../tags');
 
 async function fetchPhotos(url) {
@@ -126,7 +126,6 @@ async function scrapeScene(html, url, site) {
   const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
-  const rawTags = data.keywords.split(', ');
 
   const siteDomain = $('meta[name="twitter:domain"]').attr('content');
   const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
   const siteUrl = siteDomain && `https://www.${siteDomain}`;
@@ -136,11 +135,13 @@ async function scrapeScene(html, url, site) {
   const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
 
+  const rawTags = data.keywords.split(', ');
+
   const [channelSite, tags] = await Promise.all([
-    knex('sites')
-      .where({ url: siteUrl })
-      .orWhere({ id: siteId })
-      .first(),
+    site.isFallback
+      ? knex('sites')
+        .where({ url: siteUrl })
+        .orWhere({ slug: siteId })
+        .first()
+      : site,
     matchTags(rawTags),