Added Dogfart scraper. Added 'date added' property to release page.

2019-11-04 05:47:37 +01:00
parent d734b1f0b5
commit 5745cd33d8
25 changed files with 747 additions and 102 deletions

View File

@@ -91,7 +91,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     site.isFallback
       ? knex('sites')
-        .where({ id: siteId })
+        .where({ slug: siteId })
         .orWhereRaw('name = ? collate NOCASE', [siteName])
         .first()
       : site,

View File

@@ -69,10 +69,12 @@ async function scrapeScene(html, url, site) {
   const rawTags = $('.vdoTags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
 
   const [channelSite, tags] = await Promise.all([
-    knex('sites')
-      .where({ slug: siteId })
-      .orWhere({ name: siteName })
-      .first(),
+    site.isFallback
+      ? knex('sites')
+        .where({ slug: siteId })
+        .orWhere({ name: siteName })
+        .first()
+      : site,
     matchTags(rawTags),
   ]);

src/scrapers/dogfart.js (new file, +170)
View File

@@ -0,0 +1,170 @@
'use strict';

/* eslint-disable newline-per-chained-call */
const Promise = require('bluebird');
const bhttp = require('bhttp');
const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');

const { matchTags } = require('../tags');
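
// Resolves a single photo gallery page to the URL of the full-size image it displays.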
async function getPhoto(url) {
  const res = await bhttp.get(url);
  const html = res.body.toString();
  const { document } = new JSDOM(html).window;

  const photoUrl = document.querySelector('.scenes-module img').src;

  return photoUrl;
}
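
// Samples an album: fetches the first, last and evenly spaced gallery pages up to photoLimit, and resolves each to its full-size photo URL with limited concurrency.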
async function getPhotos(albumUrl, site, siteUrl) {
  const res = await bhttp.get(albumUrl);
  const html = res.body.toString();
  const { document } = new JSDOM(html).window;

  const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
  const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+\.jpg/)[0], 10);

  // dogfart has massive albums; pick photoLimit (default 25) photos: the first, the last, and evenly spaced ones in between
  const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
  const photoIndexes = [1]
    .concat(Array.from({ length: photoLimit - 2 }, (value, index) => Math.floor((index + 1) * (lastPhotoIndex / (photoLimit - 2)))))
    .concat(lastPhotoIndex);

  if (photoLimit > 25) {
    console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
  }

  const photoUrls = await Promise.map(photoIndexes, async (index) => {
    const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+\.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;

    return getPhoto(pageUrl);
  }, {
    concurrency: 5,
  });

  return photoUrls;
}
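
// Scrapes the network-wide 'recent updates' overview, keeping only the scenes that belong to the requested site.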
function scrapeLatest(html, site) {
  const { document } = new JSDOM(html).window;
  const sceneElements = Array.from(document.querySelectorAll('.recent-updates'));

  return sceneElements.reduce((acc, element) => {
    const siteUrl = element.querySelector('.help-block').textContent;

    if (siteUrl.toLowerCase() !== new URL(site.url).host) {
      // different dogfart site
      return acc;
    }

    const sceneLinkElement = element.querySelector('.thumbnail');
    const url = `https://dogfartnetwork.com${sceneLinkElement.href}`;
    const { pathname } = new URL(url);
    const entryId = `${site.slug}_${pathname.split('/')[4]}`;

    const title = element.querySelector('.scene-title').textContent;
    const actors = title.split(/[,&]|\band\b/).map(actor => actor.trim());

    const poster = `https:${element.querySelector('img').src}`;
    const trailer = sceneLinkElement.dataset.preview_clip_url;

    return [
      ...acc,
      {
        url,
        entryId,
        title,
        actors,
        poster,
        trailer: {
          src: trailer,
        },
        site,
      },
    ];
  }, []);
}
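
// Scrapes a scene page into a release object; when scraping through the network-wide fallback site, the concrete channel site is looked up in the database.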
async function scrapeScene(html, url, site) {
  const { document } = new JSDOM(html).window;

  const title = document.querySelector('.description-title').textContent;
  const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent);

  const metaDescription = document.querySelector('meta[itemprop="description"]');
  const description = metaDescription
    ? metaDescription.content
    : document.querySelector('.description')
      .textContent
      .replace(/[ \t\n]{2,}/g, ' ')
      .replace('...read more', '')
      .trim();

  const siteSlug = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
  const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
  const duration = moment
    .duration(document
      .querySelectorAll('.extra-info p')[1]
      .textContent
      .match(/\d+:\d+$/)[0])
    .asSeconds();

  const trailerElement = document.querySelector('.html5-video');
  const poster = `https:${trailerElement.dataset.poster}`;
  const { trailer } = trailerElement.dataset;

  const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0].href;
  const { origin, pathname } = new URL(url);
  const photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url);

  const stars = Number(document.querySelector('span[itemprop="average"]').textContent) / 2;
  const rawTags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);

  const [channelSite, tags] = await Promise.all([
    site.isFallback
      ? knex('sites')
        .where({ slug: siteSlug })
        .orWhere({ url: `https://${siteSlug}.com` })
        .first()
      : site,
    matchTags(rawTags),
  ]);

  return {
    url,
    title,
    description,
    actors,
    date,
    duration,
    poster,
    photos,
    trailer: {
      src: trailer,
    },
    tags,
    rating: {
      stars,
    },
    site: channelSite || site,
  };
}
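
// Fetches a page of the network-wide scene overview for scrapeLatest.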
async function fetchLatest(site, page = 1) {
  const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);

  return scrapeLatest(res.body.toString(), site);
}
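
// Fetches and scrapes a single scene page.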
async function fetchScene(url, site) {
  const res = await bhttp.get(url);

  return scrapeScene(res.body.toString(), url, site);
}

module.exports = {
  fetchLatest,
  fetchScene,
};

View File

@@ -5,6 +5,7 @@ const bangbros = require('./bangbros');
 const blowpass = require('./blowpass');
 const brazzers = require('./brazzers');
 const ddfnetwork = require('./ddfnetwork');
+const dogfart = require('./dogfart');
 const evilangel = require('./evilangel');
 const julesjordan = require('./julesjordan');
 const kink = require('./kink');
@@ -23,6 +24,8 @@ module.exports = {
   blowpass,
   brazzers,
   ddfnetwork,
+  dogfart,
+  dogfartnetwork: dogfart,
   evilangel,
   julesjordan,
   kink,

View File

@@ -3,8 +3,8 @@
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');
-const knex = require('knex');
+const knex = require('../knex');
 
 const { matchTags } = require('../tags');
 
 function scrapeLatest(html, site) {
@@ -75,7 +75,9 @@ async function scrapeScene(html, url, shootId, ratingRes, site) {
   const rawTags = $('.tag-list > a[href*="/tag"]').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
 
   const [channelSite, tags] = await Promise.all([
-    knex('sites').where({ slug: sitename }).first(),
+    site.isFallback
+      ? knex('sites').where({ slug: sitename }).first()
+      : site,
     matchTags(rawTags),
   ]);

View File

@@ -58,7 +58,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     knex('sites')
-      .where({ id: siteId })
+      .where({ slug: siteId })
       .orWhere({ url: `https://www.mofos.com${siteUrl}` })
       .orWhere({ name: sitename })
       .first(),

View File

@@ -70,7 +70,7 @@ async function scrapeScene(html, url, site) {
   const [channelSite, tags] = await Promise.all([
     knex('sites')
-      .where({ id: siteId })
+      .where({ slug: siteId })
       .orWhere({ name: siteName })
       .first(),
     matchTags(rawTags),

View File

@@ -13,6 +13,8 @@ function scrapeLatest(html, site) {
   const $ = cheerio.load(html, { normalizeWhitespace: true });
   const sceneElements = $('.card.card--release').toArray();
 
+  console.log(sceneElements);
+
   return sceneElements.map((element) => {
     const sceneLinkElement = $(element).find('.card-info__title a');
     const title = sceneLinkElement.attr('title');
@@ -22,6 +24,8 @@ function scrapeLatest(html, site) {
     const date = moment.utc($(element).find('.card-info__meta-date').text(), 'MMMM DD, YYYY').toDate();
     const actors = $(element).find('.card-info__cast a').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray();
 
+    console.log(date, actors, title);
+
     return {
       url,
       entryId,
@@ -54,6 +58,8 @@ async function scrapeScene(data, url, site) {
   const { likes, dislikes } = data.stats;
   const duration = data.videos.mediabook.length;
 
+  console.log(data);
+
   const rawTags = data.tags.map(tag => tag.name);
   const tags = await matchTags(rawTags);

View File

@@ -3,9 +3,9 @@
 const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
-const knex = require('knex');
 const moment = require('moment');
 
+const knex = require('../knex');
 const { matchTags } = require('../tags');
 
 async function fetchPhotos(url) {
@@ -126,7 +126,6 @@ async function scrapeScene(html, url, site) {
   const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
-  const rawTags = data.keywords.split(', ');
 
   const siteDomain = $('meta[name="twitter:domain"]').attr('content');
   const siteId = siteDomain && siteDomain.split('.')[0].toLowerCase();
   const siteUrl = siteDomain && `https://www.${siteDomain}`;
@@ -136,11 +135,13 @@ async function scrapeScene(html, url, site) {
   const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain);
 
+  const rawTags = data.keywords.split(', ');
+
   const [channelSite, tags] = await Promise.all([
-    knex('sites')
-      .where({ url: siteUrl })
-      .orWhere({ id: siteId })
-      .first(),
+    site.isFallback
+      ? knex('sites')
+        .where({ url: siteUrl })
+        .orWhere({ slug: siteId })
+        .first()
+      : site,
     matchTags(rawTags),