Refactored 21sextury scraper.

2019-12-09 05:00:49 +01:00
parent d874c508de
commit 04a89efa58
52 changed files with 2621 additions and 2068 deletions

View File

@@ -1,13 +1,15 @@
{
"extends": "airbnb-base",
"parserOptions": {
"parser": "babel-eslint",
"sourceType": "script"
},
"rules": {
"strict": 0,
"no-unused-vars": ["error", {"argsIgnorePattern": "^_"}],
"no-console": 0,
"indent": ["error", 4],
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}]
"indent": "off",
"max-len": [2, {"code": 300, "tabWidth": 4, "ignoreUrls": true}],
"template-curly-spacing": "off"
}
}

View File

@@ -271,6 +271,8 @@ async function updateActor(actor, scraped = false, scrapeSuccess = false) {
}
async function mergeProfiles(profiles, actor) {
console.log(profiles);
const mergedProfile = profiles.reduce((prevProfile, profile) => {
if (profile === null) {
return prevProfile;
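Only the top of the reducer is visible in this hunk; a minimal sketch of the merge pattern it starts, with the actor argument omitted and the precedence rule assumed rather than taken from this commit:

```js
// sketch: fold the per-network profiles into one, skipping failed (null)
// scrapes; letting earlier profiles win is an assumption
function mergeProfiles(profiles) {
    return profiles.reduce((prevProfile, profile) => {
        if (profile === null) return prevProfile;

        return {
            ...profile,
            ...prevProfile, // values gathered earlier take precedence
        };
    }, {});
}
```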

View File

@@ -46,4 +46,4 @@ async function init() {
await initServer();
}
init();
module.exports = init;

View File

@@ -63,6 +63,7 @@ const { argv } = yargs
.option('debug', {
describe: 'Show error stack traces',
type: 'boolean',
default: process.env.NODE_ENV === 'development',
});
module.exports = argv;

src/init.js Normal file (4 additions)
View File

@@ -0,0 +1,4 @@
require('babel-polyfill');
const init = require('./app');
init();
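With src/app.js now only exporting init and the new src/init.js loading babel-polyfill before booting, other code can require the app without side effects. A sketch of the pattern this enables (the require.main guard and consumer are illustrative, not part of the commit):

```js
// sketch (hypothetical consumer): because src/app.js no longer self-invokes,
// tooling and tests can require it without starting the server; only a direct
// invocation boots it
require('babel-polyfill');

const init = require('./app');

if (require.main === module) {
    init();
}
```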

View File

@@ -10,6 +10,7 @@ const sharp = require('sharp');
const blake2 = require('blake2');
const knex = require('./knex');
const pluckPhotos = require('./utils/pluck-photos');
function getHash(buffer) {
const hash = blake2.createHash('blake2b', { digestLength: 24 });
@@ -94,10 +95,10 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
}
async function fetchPhoto(photoUrl, index, identifier) {
const { pathname } = new URL(photoUrl);
const mimetype = mime.getType(pathname);
try {
const { pathname } = new URL(photoUrl);
const mimetype = mime.getType(pathname);
const res = await bhttp.get(photoUrl);
if (res.statusCode === 200) {
@@ -176,7 +177,11 @@ async function storePhotos(release, releaseId) {
return;
}
const newPhotos = await filterSourceDuplicates(release.photos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
const pluckedPhotos = pluckPhotos(release.photos, release);
console.log(release.photos, pluckedPhotos);
const newPhotos = await filterSourceDuplicates(pluckedPhotos, 'releases', 'photo', `(${release.site.name}, ${releaseId}) "${release.title}"`);
if (newPhotos.length === 0) return;
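A few lines up, fetchPhoto now parses the photo URL and resolves its mimetype inside the try block. Since new URL() throws synchronously on malformed input, that keeps a bad photo URL from escaping the existing error handling; a minimal sketch of the idea (getPhotoMimetype is a hypothetical helper, not part of this commit):

```js
const mime = require('mime');

// sketch: new URL() throws synchronously on malformed input, so parsing inside
// the try block lets a bad photo URL be handled like any failed download
function getPhotoMimetype(photoUrl) {
    try {
        const { pathname } = new URL(photoUrl);

        return mime.getType(pathname);
    } catch (error) {
        return null; // the real code logs and skips the photo instead
    }
}
```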

View File

@@ -125,7 +125,7 @@ async function curateScrapedRelease(release) {
likes: release.rating && release.rating.likes,
dislikes: release.rating && release.rating.dislikes,
rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
deep: Boolean(argv.deep && release.url && !release.upcoming),
deep: typeof release.deep === 'boolean' ? release.deep : false,
};
if (release.site.isFallback && release.channel) {
@@ -275,6 +275,12 @@ async function storeRelease(release) {
async function storeReleases(releases) {
const storedReleases = await Promise.map(releases, async (release) => {
if (release.site.isFallback && !release.channel) {
console.error(`Unable to derive channel site from generic URL: ${release.url}.`);
return null;
}
try {
const releaseId = await storeRelease(release);
@@ -289,7 +295,7 @@ async function storeReleases(releases) {
}
}, {
concurrency: 10,
});
}).filter(release => release);
const actors = storedReleases.reduce((acc, release) => {
if (!release.actors) return acc;
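storeReleases now skips fallback-site releases without a derived channel and drops the resulting nulls by chaining .filter() straight onto Promise.map(), which works because Bluebird promises carry array helpers. A reduced sketch of the pattern (storeAll and its arguments are stand-ins):

```js
const Promise = require('bluebird');

// sketch: Bluebird's promise-level .filter() drops the null results produced
// for releases that could not be stored, without an intermediate await
async function storeAll(releases, storeRelease) {
    return Promise.map(releases, async (release) => {
        try {
            return await storeRelease(release);
        } catch (error) {
            return null;
        }
    }, { concurrency: 10 })
        .filter(release => release);
}
```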

View File

@@ -51,7 +51,9 @@ async function scrapeRelease(url, release, deep = false) {
// don't store release when called by site scraper
const [storedRelease] = await storeReleases([scene]);
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
if (storedRelease) {
console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
}
}
return scene;

View File

@@ -69,12 +69,20 @@ async function scrapeUpcomingReleases(scraper, site) {
async function deepFetchReleases(baseReleases) {
return Promise.map(baseReleases, async (release) => {
if (release.url) {
const fullRelease = await scrapeRelease(release.url, release, true);
try {
const fullRelease = await scrapeRelease(release.url, release, true);
return {
...release,
...fullRelease,
};
return {
...release,
...fullRelease,
deep: true,
};
} catch (error) {
return {
...release,
deep: false,
};
}
}
return release;
@@ -116,7 +124,7 @@ async function scrapeReleases() {
return await scrapeSiteReleases(scraper, site);
} catch (error) {
if (argv.debug) {
console.error(`${site.id}: Failed to scrape releases`, error);
console.error(`${site.name}: Failed to scrape releases`, error);
}
console.warn(`${site.id}: Failed to scrape releases`);
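deepFetchReleases now wraps each deep scrape in its own try/catch, so one failing URL no longer rejects the whole Promise.map batch; the release keeps its shallow data and records whether the deep pass succeeded via the deep flag that curateScrapedRelease reads above. A condensed sketch of that shape (deepFetch is a stand-in name):

```js
// sketch: per-release error handling, so a failed deep scrape degrades to the
// shallow release instead of aborting the batch
async function deepFetch(release, scrapeRelease) {
    try {
        const fullRelease = await scrapeRelease(release.url, release, true);

        return { ...release, ...fullRelease, deep: true };
    } catch (error) {
        return { ...release, deep: false };
    }
}
```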

View File

@@ -1,11 +1,51 @@
'use strict';
const Promise = require('bluebird');
const bhttp = require('bhttp');
const cheerio = require('cheerio');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
async function fetchPhotos(photoPath) {
const res = await bhttp.get(`https://21sextury.com${photoPath}`);
return res.body.toString();
}
function scrapePhotos(html) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
.map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
.map((photoIndex, photoElement) => $(photoElement)
.attr('src'))
// .replace('_tb.jpg', '.jpg')) does not always work
.toArray();
return unlockedPhotos.concat(lockedThumbnails);
}
async function getPhotos(photoPath) {
if (!photoPath || photoPath.match('join')) {
return [];
}
const html = await fetchPhotos(photoPath);
const $ = cheerio.load(html, { normalizeWhitespace: true });
const photos = scrapePhotos(html);
const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
const otherPhotos = await Promise.map(pages, async (pagePath) => {
const pageHtml = await fetchPhotos(pagePath);
return scrapePhotos(pageHtml);
}, {
concurrency: 2,
});
return photos.concat(otherPhotos.flat());
}
function scrape(html, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
@@ -14,12 +54,13 @@ function scrape(html, site) {
return scenesElements.reduce((accReleases, element) => {
const siteName = $(element).find('.studioName a').attr('title');
if (site.parameters && site.parameters.filter && siteName.toLowerCase() !== site.name.toLowerCase()) {
if (!site.url && siteName.toLowerCase() !== site.name.toLowerCase()) {
// using generic overview as fallback, scene from different site
return accReleases;
}
const sceneLinkElement = $(element).find('.sceneTitle a');
const url = `${site.url}${sceneLinkElement.attr('href')}`;
const url = `${site.url || 'https://www.21sextury.com'}${sceneLinkElement.attr('href')}`;
const title = sceneLinkElement.attr('title').trim();
const entryId = $(element).attr('data-itemid');
@@ -32,6 +73,9 @@ function scrape(html, site) {
.map((actorIndex, actorElement) => $(actorElement).attr('title'))
.toArray();
const poster = $(element).find('.imgLink img').attr('data-original');
const trailer = `https://videothumb.gammacdn.com/307x224/${entryId}.mp4`;
const [likes, dislikes] = $(element).find('.value')
.toArray()
.map(value => Number($(value).text()));
@@ -44,6 +88,10 @@ function scrape(html, site) {
title,
actors,
date,
poster,
trailer: {
src: trailer,
},
rating: {
likes,
dislikes,
@@ -58,25 +106,21 @@ async function scrapeScene(html, url, site) {
const $ = cheerio.load(html, { normalizeWhitespace: true });
const sceneElement = $('#videoWrapper');
const json = $('script[type="application/ld+json"]').html();
const videoJson = $('script:contains("ScenePlayerOptions")').html();
const videoDataString = videoJson.slice(videoJson.indexOf('= {') + 2, videoJson.indexOf('};') + 1);
const data = JSON.parse(json)[0];
const videoData = JSON.parse(videoDataString);
const entryId = new URL(url).pathname.split('/').slice(-1)[0];
const title = data.isPartOf ? data.isPartOf.name : data.name;
const dataDate = moment.utc(data.dateCreated, 'YYYY-MM-DD');
const title = videoData?.playerOptions?.sceneInfos?.sceneTitle || (data.isPartOf && data.isPartOf !== 'TBD' ? data.isPartOf.name : data.name);
const dataDate = moment.utc(videoData?.playerOptions?.sceneInfos?.sceneReleaseDate, 'YYYY-MM-DD');
const date = dataDate.isValid()
? dataDate.toDate()
: moment.utc(sceneElement.find('.updatedDate').text().trim(), 'MM-DD-YYYY').toDate();
const actors = data.actor
.sort(({ gender: genderA }, { gender: genderB }) => {
if (genderA === 'female' && genderB === 'male') return -1;
if (genderA === 'male' && genderB === 'female') return 1;
return 0;
})
.map(actor => actor.name);
const actors = data.actor.map(actor => actor.name);
const description = data.description || null; // prevent empty string
const likes = Number(sceneElement.find('.rating .state_1 .value').text());
@@ -84,27 +128,18 @@ async function scrapeScene(html, url, site) {
const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();
const rawTags = data.keywords.split(', ');
const poster = videoData.picPreview;
const trailer = `${videoData.playerOptions.host}${videoData.url}`;
const photoPath = $('.picturesItem a').attr('href');
const photos = await getPhotos(photoPath, site);
const tags = data.keywords.split(', ');
const siteName = data.productionCompany ? data.productionCompany.name : $('#logoLink a').attr('title');
const siteId = siteName && siteName.replace(/\s+/g, '').toLowerCase();
const [channelSite, tags] = await Promise.all([
site.isFallback
? knex('sites')
.where({ slug: siteId })
.orWhereRaw('name = ? collate NOCASE', [siteName])
.first()
: site,
matchTags(rawTags),
]);
// only replace generic URL with site URL if site is not marked to fetch scenes from generic site
const originalUrl = channelSite && !(channelSite.parameters && JSON.parse(channelSite.parameters).filter)
? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`
: url;
const channel = siteName && siteName.replace(/\s+/g, '').toLowerCase();
return {
url: originalUrl,
url,
entryId,
title,
date,
@@ -112,22 +147,30 @@ async function scrapeScene(html, url, site) {
description,
duration,
tags,
poster,
photos,
trailer: {
src: trailer,
},
rating: {
likes,
dislikes,
},
site: channelSite || site,
site,
channel,
};
}
async function fetchLatest(site, page = 1) {
const res = await bhttp.get(`${site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`);
const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/latest/${page}`;
const res = await bhttp.get(url);
return scrape(res.body.toString(), site);
}
async function fetchUpcoming(site) {
const res = await bhttp.get(`${site.parameters && site.parameters.filter ? 'https://21sextury.com' : site.url}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`);
const url = `${site.url || 'https://21sextury.com'}/en/videos/All-Categories/0/All-Pornstars/0/upcoming`;
const res = await bhttp.get(url);
return scrape(res.body.toString(), site);
}
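The hunks stop before the bottom of the 21sextury scraper, so the scene fetcher and exports are not shown. A sketch of the assumed wiring, in line with how fetchLatest and fetchUpcoming above are built (fetchScene and the export list are assumptions, not taken from this diff):

```js
// sketch (assumed, not shown in the hunks): the scraper exposes its entry
// points so the site scraper can dispatch to them
async function fetchScene(url, site) {
    const res = await bhttp.get(url);

    return scrapeScene(res.body.toString(), url, site);
}

module.exports = {
    fetchLatest,
    fetchUpcoming,
    fetchScene,
};
```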

View File

@@ -6,7 +6,6 @@ const { JSDOM } = require('jsdom');
const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
/* eslint-disable newline-per-chained-call */
function scrapeLatest(html, site) {
@@ -49,13 +48,16 @@ async function scrapeScene(html, url, site) {
const title = $('meta[itemprop="name"]').attr('content');
const description = $('.descr-box p').text(); // meta tags don't contain full description
const date = moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate();
const dateProp = $('meta[itemprop="uploadDate"]').attr('content');
const date = dateProp
? moment.utc($('meta[itemprop="uploadDate"]').attr('content'), 'YYYY-MM-DD').toDate()
: moment.utc($('.title-border:nth-child(2) p').text(), 'MM.DD.YYYY').toDate();
const actors = $('.pornstar-card > a').map((actorIndex, actorElement) => $(actorElement).attr('title')).toArray();
const likes = Number($('.info-panel.likes .likes').text());
const duration = Number($('.info-panel.duration .duration').text().slice(0, -4)) * 60;
const rawTags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const tags = $('.tags-tab .tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
const poster = $('#video').attr('poster');
const photos = $('.photo-slider-guest .card a').map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
@@ -63,21 +65,7 @@ async function scrapeScene(html, url, site) {
const trailer540 = $('source[res="540"]').attr('src');
const trailer720 = $('source[res="720"]').attr('src');
/*
* broken as of nov 2019
const { origin } = new URL($('.pornstar-card meta[itemprop="url"]').first().attr('content'));
const [channelSite, tags] = await Promise.all([
// don't find site if original is already specific
site.isFallback ? knex('sites').where({ url: origin }).first() : site,
matchTags(rawTags),
]);
*/
const tags = await matchTags(rawTags);
return {
// url: channelSite ? `${channelSite.url}${new URL(url).pathname}` : url,
url,
entryId,
title,
@@ -88,20 +76,19 @@ async function scrapeScene(html, url, site) {
tags,
poster,
photos,
trailer: trailer540
? {
src: trailer540,
quality: 540,
}
: {
// backup
trailer: [
{
src: trailer720,
quality: 720,
},
{
src: trailer540,
quality: 540,
},
],
rating: {
likes,
},
// site: channelSite || site,
site,
};
}
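The 21naturals scraper now returns trailer as an array of { src, quality } candidates rather than a single object, so whatever stores trailers has to accept both shapes. A hedged sketch of such a normalizer (pickTrailer is hypothetical and not part of this commit):

```js
// sketch: normalize a scraped trailer that may be a single { src, quality }
// object or an array of candidates, preferring the highest listed quality
function pickTrailer(trailer) {
    if (!trailer) return null;

    if (Array.isArray(trailer)) {
        return trailer
            .filter(candidate => candidate && candidate.src)
            .sort((a, b) => (b.quality || 0) - (a.quality || 0))[0] || null;
    }

    return trailer.src ? trailer : null;
}
```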

View File

@@ -8,7 +8,6 @@ const moment = require('moment');
const knex = require('../knex');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
async function getPhoto(url) {
const res = await bhttp.get(url);
@@ -20,7 +19,7 @@ async function getPhoto(url) {
return photoUrl;
}
async function getPhotos(albumUrl, site, siteUrl) {
async function getPhotos(albumUrl) {
const res = await bhttp.get(albumUrl);
const html = res.body.toString();
const { document } = new JSDOM(html).window;
@@ -28,15 +27,7 @@ async function getPhotos(albumUrl, site, siteUrl) {
const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
// dogfart has massive albums, pick 25 or specified number of photos: first, last and evenly in between
const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
const photoIndexes = pluckPhotos(lastPhotoIndex, photoLimit);
if (photoLimit > 25) {
console.log(`${site.name}: Scraping ${photoLimit} album photos from ${siteUrl}, this may take some time...`);
}
const photoUrls = await Promise.map(photoIndexes, async (index) => {
const photoUrls = await Promise.map(Array.from({ length: lastPhotoIndex }, (value, index) => index + 1), async (index) => {
const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;
return getPhoto(pageUrl);

View File

@@ -9,8 +9,6 @@ const moment = require('moment');
const { heightToCm } = require('../utils/convert');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
async function fetchPhotos(url) {
const res = await bhttp.get(url);
@@ -58,14 +56,7 @@ async function getPhotos(entryId, site, page = 1) {
})
: [];
const allPhotos = photos.concat(otherPhotos.flat());
const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);
return pluckedPhotos;
return photos.concat(otherPhotos.flat());
}
function scrapeLatest(html, site) {

View File

@@ -8,7 +8,6 @@ const moment = require('moment');
const { fetchSites } = require('../sites');
const { matchTags } = require('../tags');
const pluckPhotos = require('../utils/pluck-photos');
const defaultTags = {
hardx: [],
@@ -38,7 +37,7 @@ function scrapePhotos(html) {
return unlockedPhotos.concat(lockedThumbnails);
}
async function getPhotos(albumPath, siteDomain, site) {
async function getPhotos(albumPath, siteDomain) {
const albumUrl = `https://${siteDomain}${albumPath}`;
const html = await fetchPhotos(albumUrl);
@@ -56,14 +55,7 @@ async function getPhotos(albumPath, siteDomain, site) {
concurrency: 2,
});
const allPhotos = photos.concat(otherPhotos.flat());
const photoLimit = (site.network.parameters && site.network.parameters.photoLimit) || 25;
const photoIndexes = pluckPhotos(allPhotos.length - 1, photoLimit);
const pluckedPhotos = photoIndexes.map(photoIndex => allPhotos[photoIndex]);
return pluckedPhotos;
return photos.concat(otherPhotos.flat());
}
function scrape(html, site) {

View File

@@ -1,13 +1,18 @@
'use strict';
const config = require('config');
// pick a limited number of evenly distributed photos from the full set
function pluckPhotos(photoTotal, photoLimit) {
function pluckPhotos(photos, release, specifiedLimit) {
const limit = specifiedLimit || config.media.limit;
console.log(limit);
const plucked = [1]
.concat(
Array.from({ length: photoLimit - 1 }, (value, index) => Math.round((index + 1) * (photoTotal / (photoLimit - 1)))),
Array.from({ length: limit - 1 }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
);
return Array.from(new Set(plucked)); // remove duplicates, may happen when photo total and photo limit are close
return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex]); // remove duplicates, may happen when photo total and photo limit are close
}
module.exports = pluckPhotos;
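pluckPhotos now takes the scraped photos directly and reads its default limit from config.media.limit rather than each scraper passing a per-site photoLimit; the config entry itself is not part of this diff. A sketch of what it might look like (the key name comes from the code above, the value of 25 mirrors the old default and is an assumption):

```js
// config/default.js (sketch): the limit consumed by pluckPhotos via
// config.media.limit
module.exports = {
    media: {
        limit: 25,
    },
};
```

Scrapers can then return every album photo and let storePhotos trim the set once, instead of each scraper trimming its own.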