Added support for release photo fallbacks. Limited photo fetching concurrency. Modified XEmpire scraper for AllBlackX support and improved photo scraping. Added movie property to Jules Jordan scraper.

2019-12-12 03:12:05 +01:00 · 2019-12-12 03:12:05 +01:00 · a310f9bb1d
parent c26d5b8655
commit a310f9bb1d
9 changed files with 113 additions and 70 deletions
--- a/public/img/logos/xempire/allblackx.png
+++ b/public/img/logos/xempire/allblackx.png
--- a/seeds/01_sites.js
+++ b/seeds/01_sites.js
@ -582,7 +582,7 @@ function getSites(networksMap) {
        },
        {
            slug: 'pornstarslikeitbig',
-            name: 'Pornstars Like it Big',
+            name: 'Pornstars Like It Big',
            url: 'https://www.brazzers.com/sites/view/id/24/pornstars-like-it-big',
            description: "A real big dick, that's what everyone wants. Porn-stars are no exception, all the biggest stars agree; BIG COCK is for them. Check out how it stretches their tiny pussies and cums on their round tits. We've got the best chicks jocking the biggest dicks.",
            network_id: networksMap.brazzers,
@ -2397,6 +2397,13 @@ function getSites(networksMap) {
            url: 'https://www.darkx.com',
            network_id: networksMap.xempire,
        },
+        {
+            slug: 'allblackx',
+            name: 'AllBlackX',
+            description: 'AllBlackX.com features the hottest ebony pornstar beauties in hardcore black on black gonzo porn. From director Mason, watch 4k ultra HD videos inside',
+            url: 'https://www.allblackx.com',
+            network_id: networksMap.xempire,
+        },
        {
            slug: 'lesbianx',
            name: 'LesbianX',
--- a/seeds/03_tags.js
+++ b/seeds/03_tags.js
@ -1166,6 +1166,10 @@ function getTagAliases(tagsMap) {
            name: 'dp',
            alias_for: tagsMap['double-penetration'],
        },
+        {
+            name: 'first dp',
+            alias_for: tagsMap['double-penetration'],
+        },
        {
            name: 'double penetration (dp)',
            alias_for: tagsMap['double-penetration'],
--- a/src/media.js
+++ b/src/media.js
@ -10,7 +10,6 @@ const sharp = require('sharp');
 const blake2 = require('blake2');

 const knex = require('./knex');
-const pluckPhotos = require('./utils/pluck-photos');

 function getHash(buffer) {
    const hash = blake2.createHash('blake2b', { digestLength: 24 });
@ -20,6 +19,21 @@ function getHash(buffer) {
    return hash.digest('hex');
 }

+function pluckPhotos(photos, release, specifiedLimit) {
+    const limit = specifiedLimit || config.media.limit;
+
+    if (photos.length <= limit) {
+        return photos;
+    }
+
+    const plucked = [1]
+        .concat(
+            Array.from({ length: limit }, (value, index) => Math.round((index + 1) * (photos.length / (limit)))),
+        );
+
+    return Array.from(new Set(plucked)).map(photoIndex => photos[photoIndex - 1]); // remove duplicates, may happen when photo total and photo limit are close
+}
+
 async function getThumbnail(buffer) {
    return sharp(buffer)
        .resize({
@ -94,7 +108,12 @@ async function filterHashDuplicates(files, domains = ['releases'], roles = ['pho
    return files.filter(file => file && !photoHashes.has(file.hash));
 }

-async function fetchPhoto(photoUrl, index, identifier) {
+async function fetchPhoto(photoUrl, index, identifier, attempt = 1) {
+    if (Array.isArray(photoUrl)) {
+        return fetchPhoto(photoUrl[0], index, identifier);
+        // return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => fetchPhoto(url, index, identifier)), Promise.reject());
+    }
+
    try {
        const { pathname } = new URL(photoUrl);
        const mimetype = mime.getType(pathname);
@ -116,7 +135,12 @@ async function fetchPhoto(photoUrl, index, identifier) {

        throw new Error(`Response ${res.statusCode} not OK`);
    } catch (error) {
-        console.warn(`Failed to store photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+        console.warn(`Failed attempt ${attempt}/3 to fetch photo ${index + 1} (${photoUrl}) for ${identifier}: ${error}`);
+
+        if (attempt < 3) {
+            await Promise.delay(1000);
+            return fetchPhoto(photoUrl, index, identifier, attempt + 1);
+        }

        return null;
    }
--- a/src/networks.js
+++ b/src/networks.js
@ -39,6 +39,7 @@ async function findNetworkByUrl(url) {

    const network = await knex('networks')
        .where('networks.url', 'like', `%${domain}`)
+        .orWhere('networks.url', url)
        .first();

    if (network) {
--- a/src/releases.js
+++ b/src/releases.js
@ -248,7 +248,6 @@ async function storeReleaseAssets(release, releaseId) {

    try {
        await Promise.all([
-            associateTags(release, releaseId),
            storePhotos(release, releaseId),
            storePoster(release, releaseId),
            storeTrailer(release, releaseId),
@ -275,17 +274,22 @@ async function storeRelease(release) {
            })
            .returning('*');

-        // await storeReleaseAssets(release, existingRelease.id);
-        console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
+        if (updatedRelease) {
+            await associateTags(release, updatedRelease.id);
+            console.log(`Updated release "${release.title}" (${existingRelease.id}, ${release.site.name})`);
+        }

-        return updatedRelease ? updatedRelease.id : existingRelease.id;
+        await associateTags(release, existingRelease.id);
+
+        return existingRelease.id;
    }

    const [releaseEntry] = await knex('releases')
        .insert(curatedRelease)
        .returning('*');

-    // await storeReleaseAssets(release, releaseEntry.id);
+    await associateTags(release, releaseEntry.id);
+
    console.log(`Stored release "${release.title}" (${releaseEntry.id}, ${release.site.name})`);

    return releaseEntry.id;
@ -334,7 +338,9 @@ async function storeReleases(releases) {

    await Promise.all([
        associateActors(actors, storedReleases),
-        Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
+        Promise.map(storedReleases, async release => storeReleaseAssets(release, release.id), {
+            concurrency: 10,
+        }),
    ]);

    return storedReleases;
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@ -7,7 +7,6 @@ const { JSDOM } = require('jsdom');
 const moment = require('moment');

 const { heightToCm } = require('../utils/convert');
-const { matchTags } = require('../tags');

 async function fetchPhotos(url) {
    const res = await bhttp.get(url);
@ -22,13 +21,8 @@ function scrapePhotos(html) {
        .map((photoIndex, photoElement) => {
            const src = $(photoElement).attr('src');

-            if (src.match(/dl\d+/)) {
-                // thumbnail URLs containing dl02/ or dl03/ don't appear to have
-                // a full photo available, fall back to thumbnail
-                return src;
-            }
-
-            return src.replace('thumbs/', 'photos/');
+            // high res often available in photos/ directory, but not always, provide original as fallback
+            return [src.replace('thumbs/', 'photos/'), src];
        })
        .toArray();

@ -172,8 +166,8 @@ async function scrapeScene(html, url, site) {

    const photos = await getPhotos(entryId, site);

-    const rawTags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
-    const tags = await matchTags(rawTags);
+    const tags = $('.update_tags a').map((tagIndex, tagElement) => $(tagElement).text()).toArray();
+    const movie = $('.update_dvds a').href;

    return {
        url,
@ -184,6 +178,7 @@ async function scrapeScene(html, url, site) {
        description,
        poster,
        photos,
+        movie,
        trailer: {
            src: trailer,
            quality: 720,
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@ -6,14 +6,12 @@ const cheerio = require('cheerio');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');

-const { fetchSites } = require('../sites');
-const { matchTags } = require('../tags');
-
 const defaultTags = {
    hardx: [],
    darkx: ['interracial'],
    eroticax: [],
    lesbianx: ['lesbian'],
+    allblackx: ['ebony', 'bbc'],
 };

 async function fetchPhotos(url) {
@ -25,37 +23,56 @@ async function fetchPhotos(url) {
 function scrapePhotos(html) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });

-    const unlockedPhotos = $('.preview .imgLink.pgUnlocked')
-        .map((photoIndex, photoElement) => $(photoElement).attr('href')).toArray();
+    return $('.preview .imgLink').toArray().map((linkEl) => {
+        const url = $(linkEl).attr('href');

-    const lockedThumbnails = $('.preview .imgLink.lockedPicture img')
-        .map((photoIndex, photoElement) => $(photoElement)
-            .attr('src'))
-        // .replace('_tb.jpg', '.jpg')) does not always work
-        .toArray();
+        if (url.match('/join')) {
+            // URL links to join page instead of full photo, extract thumbnail
+            const src = $(linkEl).find('img').attr('src');

-    return unlockedPhotos.concat(lockedThumbnails);
+            if (src.match('previews/')) {
+                // resource often serves full photo at a modifier URL anyway, add as primary source
+                const highRes = src
+                    .replace('previews/', '')
+                    .replace('_tb.jpg', '.jpg');
+
+                // keep original thumbnail as fallback in case full photo is not available
+                return [highRes, src];
+            }
+
+            return src;
+        }
+
+        // URL links to full photo
+        return url;
+    });
 }

 async function getPhotos(albumPath, siteDomain) {
    const albumUrl = `https://${siteDomain}${albumPath}`;

-    const html = await fetchPhotos(albumUrl);
-    const $ = cheerio.load(html, { normalizeWhitespace: true });
-    const photos = scrapePhotos(html);
+    try {
+        const html = await fetchPhotos(albumUrl);
+        const $ = cheerio.load(html, { normalizeWhitespace: true });
+        const photos = scrapePhotos(html);

-    const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();
+        const pages = $('.paginatorPages a').map((pageIndex, pageElement) => $(pageElement).attr('href')).toArray();

-    const otherPhotos = await Promise.map(pages, async (page) => {
-        const pageUrl = `https://${siteDomain}${page}`;
-        const pageHtml = await fetchPhotos(pageUrl);
+        const otherPhotos = await Promise.map(pages, async (page) => {
+            const pageUrl = `https://${siteDomain}${page}`;
+            const pageHtml = await fetchPhotos(pageUrl);

-        return scrapePhotos(pageHtml);
-    }, {
-        concurrency: 2,
-    });
+            return scrapePhotos(pageHtml);
+        }, {
+            concurrency: 2,
+        });

-    return photos.concat(otherPhotos.flat());
+        return photos.concat(otherPhotos.flat());
+    } catch (error) {
+        console.error(`Failed to fetch XEmpire photos from ${albumPath}: ${error.message}`);
+
+        return [];
+    }
 }

 function scrape(html, site) {
@ -109,32 +126,26 @@ function scrape(html, site) {
 async function scrapeScene(html, url, site) {
    const $ = cheerio.load(html, { normalizeWhitespace: true });
    const json = $('script[type="application/ld+json"]').html();
+    const json2 = $('script:contains("dataLayer = ")').html();
    const videoJson = $('script:contains("window.ScenePlayerOptions")').html();

    const data = JSON.parse(json)[0];
+    const data2 = JSON.parse(json2.slice(json2.indexOf('[{'), -1))[0];
    const videoData = JSON.parse(videoJson.slice(videoJson.indexOf('{"id":'), videoJson.indexOf('};') + 1));

-    const entryId = new URL(url).pathname.split('/').slice(-1)[0];
+    const entryId = data2.sceneDetails.sceneId || new URL(url).pathname.split('/').slice(-1)[0];

-    const title = $('meta[name="twitter:title"]').attr('content');
-    const description = data.description || $('meta[name="twitter:description"]').attr('content');
+    const title = data2.sceneDetails.sceneTitle || $('meta[name="twitter:title"]').attr('content');
+    const description = data2.sceneDetails.sceneDescription || data.description || $('meta[name="twitter:description"]').attr('content');
    // date in data object is not the release date of the scene, but the date the entry was added
    const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate();

-    const actors = data.actor
-        .sort(({ gender: genderA }, { gender: genderB }) => {
-            if (genderA === 'female' && genderB === 'male') return -1;
-            if (genderA === 'male' && genderB === 'female') return 1;
-
-            return 0;
-        })
-        .map(actor => actor.name);
-
+    const actors = (data2.sceneDetails.sceneActors || data.actor).map(actor => actor.actorName || actor.name);
    const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5;

    const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();

-    const siteDomain = $('meta[name="twitter:domain"]').attr('content');
+    const siteDomain = $('meta[name="twitter:domain"]').attr('content') || 'allblackx.com'; // only AllBlackX has no twitter domain, no other useful hints available
    const siteSlug = siteDomain && siteDomain.split('.')[0].toLowerCase();
    const siteUrl = siteDomain && `https://www.${siteDomain}`;

@ -144,19 +155,10 @@ async function scrapeScene(html, url, site) {
    const photos = await getPhotos($('.picturesItem a').attr('href'), siteDomain, site);

    const rawTags = data.keywords.split(', ');
-
-    const [[channelSite], tags] = await Promise.all([
-        site.isFallback
-            ? fetchSites({
-                url: siteUrl,
-                slug: siteSlug,
-            })
-            : [site],
-        matchTags([...defaultTags[siteSlug], ...rawTags]),
-    ]);
+    const tags = [...defaultTags[siteSlug], ...rawTags];

    return {
-        url: channelSite ? `${channelSite.url}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}` : url,
+        url: `${siteUrl}/en/video/${new URL(url).pathname.split('/').slice(-2).join('/')}`,
        entryId,
        title,
        date,
@ -174,7 +176,8 @@ async function scrapeScene(html, url, site) {
        rating: {
            stars,
        },
-        site: channelSite || site,
+        site,
+        channel: siteSlug,
    };
 }

--- a/src/sites.js
+++ b/src/sites.js
@ -62,11 +62,14 @@ async function findSiteByUrl(url) {
            'sites.*',
            'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters',
        )
-        .where('sites.url', 'like', `%${domain}%`)
+        .where('sites.url', 'like', `%${domain}`)
+        .orWhere('sites.url', url)
        .first();

    if (site) {
-        return curateSite(site, true);
+        const curatedSite = curateSite(site, true);
+
+        return curatedSite;
    }

    return null;