Fixed qu issues. Fixed media issues. Simplified and expanded date component in search query.

2020-03-10 00:17:57 +01:00 · 2020-03-10 00:17:57 +01:00 · 5c55750c0c
parent 61a795d634
commit 5c55750c0c
9 changed files with 113 additions and 116 deletions
--- a/config/default.js
+++ b/config/default.js
@ -66,6 +66,8 @@ module.exports = {
        '21sextury',
        'julesjordan',
        'naughtyamerica',
+        'cherrypimps',
+        'pimpxxx',
        [
            'hussiepass',
            'hushpass',
@ -75,16 +77,6 @@ module.exports = {
            'seehimfuck',
            'eyeontheguy',
        ],
-        [
-            'cherrypimps',
-            'drilledxxx',
-            'wildoncam',
-            'bcmxxx',
-            'familyxxx',
-            'petitexxx',
-            'confessionsxxx',
-            'cuckedxxx',
-        ],
        [
            // Full Porn Network
            'analized',
--- a/src/media.js
+++ b/src/media.js
@ -85,87 +85,91 @@ async function extractItem(source) {
    const res = await get(source.src);

    if (res.statusCode === 200) {
-        const { q } = ex(res.body.toString());
+        const { qu } = ex(res.body.toString());

-        return source.extract(q);
+        return source.extract(qu);
    }

    return null;
 }

+// Download one media source and return the file together with its descriptive fields.
+//
+// source:         URL string, or an object whose `src` is the URL. On an object, `referer` and
+//                 `host` (when set) are sent as request headers, and `quality`, `scraper` and
+//                 `copyright` are copied into the result.
+// domain, role:   only used in the log message.
+// originalSource: when truthy, its `src` (or the value itself) is reported as the item's
+//                 `source` instead of the URL that was actually fetched.
+//
+// Resolves to the media item object built below; throws when the response status is not 200.
+async function fetchSource(source, domain, role, originalSource) {
+    logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
+
+    // const res = await bhttp.get(source.src || source);
+    // the conditional spreads only add a header when the source object defines it
+    const res = await get(source.src || source, {
+        headers: {
+            ...(source.referer && { referer: source.referer }),
+            ...(source.host && { host: source.host }),
+        },
+    });
+
+    if (res.statusCode === 200) {
+        // the mimetype is derived from the URL path, not from the response
+        const { pathname } = new URL(source.src || source);
+        const mimetype = mime.getType(pathname);
+        const extension = mime.getExtension(mimetype);
+        const hash = getHash(res.body);
+        // getMeta is only called for image mimetypes; otherwise these fields end up null below
+        const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
+
+        logger.verbose(`Fetched media item from ${source.src || source}`);
+
+        return {
+            file: res.body,
+            mimetype,
+            extension,
+            hash,
+            entropy: entropy || null,
+            size: size || null,
+            width: width || null,
+            height: height || null,
+            quality: source.quality || null,
+            // prefer the source passed in by the caller over the URL that was fetched
+            source: originalSource?.src || originalSource || source.src || source,
+            scraper: source.scraper,
+            copyright: source.copyright,
+        };
+    }
+
+    throw new Error(`Response ${res.statusCode} not OK`);
+}
+
 async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
    if (!source) return null;

    try {
        if (Array.isArray(source)) {
-            if (source.every(sourceX => !!sourceX.quality)) {
+            if (source.every(sourceX => sourceX.quality)) {
                // various video qualities provided
                const selectedSource = pickQuality(source);
                return fetchItem(selectedSource, index, existingItemsBySource, domain, role, attempt, originalSource);
            }

            // fallbacks provided
-            return source.reduce(
-                (outcome, sourceX, sourceIndexX) => outcome.catch(async () => fetchItem(sourceX, index, existingItemsBySource, domain, role, attempt, originalSource, sourceIndexX)),
-                Promise.reject(new Error()),
-            );
+            return source.reduce((outcome, sourceX, sourceIndexX) => outcome.catch(
+                async () => fetchItem(sourceX, index, existingItemsBySource, domain, role, attempt, source, sourceIndexX),
+            ), Promise.reject(new Error()));
        }

        if (source.src && source.extract) {
            // source links to page containing a (presumably) tokenized photo
            const itemSource = await extractItem(source);

-            return fetchItem(itemSource, index, existingItemsBySource, domain, role, attempt, source);
+            return fetchItem(itemSource, index, existingItemsBySource, domain, role, attempt, source, sourceIndex);
        }

-
        if (existingItemsBySource[source]) {
            return null;
        }

-        logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
-
-        // const res = await bhttp.get(source.src || source);
-        const res = await get(source.src || source, {
-            headers: {
-                ...(source.referer && { referer: source.referer }),
-                ...(source.host && { host: source.host }),
-            },
-        });
-
-        if (res.statusCode === 200) {
-            const { pathname } = new URL(source.src || source);
-            const mimetype = mime.getType(pathname);
-            const extension = mime.getExtension(mimetype);
-            const hash = getHash(res.body);
-            const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
-
-            logger.verbose(`Fetched media item from ${source.src || source}`);
-
-            return {
-                file: res.body,
-                mimetype,
-                extension,
-                hash,
-                entropy: entropy || null,
-                size: size || null,
-                width: width || null,
-                height: height || null,
-                quality: source.quality || null,
-                source: originalSource?.src || originalSource || source.src || source,
-                scraper: source.scraper,
-                copyright: source.copyright,
-            };
-        }
-
-        throw new Error(`Response ${res.statusCode} not OK`);
+        return fetchSource(source, domain, role, originalSource);
    } catch (error) {
        logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);

+        /*
        if (attempt < 3) {
            await Promise.delay(5000);
-            return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource);
+            return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
        }
+        */

        if (originalSource && sourceIndex < originalSource.length) {
            throw error;
@ -351,7 +355,7 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr
            if (!source) return null;

            const mediaItem = Array.isArray(source)
-                ? source.reduce((acc, sourceX) => acc || mediaBySource[sourceX.src || sourceX], null)
+                ? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()]
                : mediaBySource[source.src || source];

            // return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
--- a/src/releases.js
+++ b/src/releases.js
@ -369,24 +369,19 @@ async function storeReleaseAssets(releases) {
 async function updateReleasesSearch(releaseIds) {
    const documents = await knex.raw(`
        SELECT
-            releases.id as release_id,
-            to_tsvector(
+            releases.id AS release_id,
+            TO_TSVECTOR(
                'traxxx',
                releases.title || ' ' ||
                sites.name || ' ' ||
                sites.slug || ' ' ||
                networks.name || ' ' ||
                networks.slug || ' ' ||
-                coalesce(releases.shoot_id, '') || ' ' ||
-                EXTRACT(YEAR FROM releases.date) || ' ' ||
-                CAST(EXTRACT(MONTH FROM releases.date) AS VARCHAR) || ' ' ||
-                CAST(EXTRACT(DAY FROM releases.date) AS VARCHAR) || ' ' ||
-                SUBSTRING(CAST(EXTRACT(YEAR FROM releases.date) AS VARCHAR) FROM 3 for 2) || ' ' ||
-                LPAD(CAST(EXTRACT(MONTH FROM releases.date) AS VARCHAR), 2, '0') || ' ' ||
-                LPAD(CAST(EXTRACT(DAY FROM releases.date) AS VARCHAR), 2, '0') || ' ' ||
-                string_agg(coalesce(actors.name, ''), ' ') || ' ' ||
-                string_agg(coalesce(tags.name, ''), ' ') || ' ' ||
-                string_agg(coalesce(tags_aliases.name, ''), ' ')
+                COALESCE(releases.shoot_id, '') || ' ' ||
+                COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
+                STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
+                STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
+                STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
            ) as document
        FROM releases
        LEFT JOIN sites ON releases.site_id = sites.id
--- a/src/scrapers/bangbros.js
+++ b/src/scrapers/bangbros.js
@ -5,6 +5,7 @@ const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

+const logger = require('../logger')(__filename);
 const slugify = require('../utils/slugify');
 const { ex } = require('../utils/q');

@ -105,7 +106,10 @@ function scrapeScene(html, url, _site) {
    release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));

    const [channel] = qu.url('a[href*="/websites"]').match(/\w+$/);
-    release.channel = channel === 'bangcasting' ? 'bangbroscasting' : channel;
+
+    if (channel === 'bangcasting') release.channel = 'bangbroscasting';
+    else if (channel === 'remaster') release.channel = 'bangbrosremastered'; // chained, so the final else cannot overwrite the bangcasting mapping
+    else release.channel = channel;

    return release;
 }
@ -123,8 +127,8 @@ function scrapeProfile(html) {
 }

 function scrapeProfileSearch(html, actorName) {
-    const { q } = ex(html);
-    const actorLink = q(`a[title="${actorName}"]`, 'href');
+    const { qu } = ex(html);
+    const actorLink = qu.url(`a[title="${actorName}" i][href*="model"]`);

    return actorLink ? `https://bangbros.com${actorLink}` : null;
 }
@ -145,7 +149,7 @@ async function fetchUpcoming(site) {

 async function fetchScene(url, site, release) {
    if (!release?.date) {
-        throw new Error(`Cannot fetch Bang Bros scenes from argument URL, as scene pages do not have release dates: ${url}`);
+        logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`);
    }

    const { origin } = new URL(url);
--- a/src/scrapers/boobpedia.js
+++ b/src/scrapers/boobpedia.js
@ -5,11 +5,11 @@ const bhttp = require('bhttp');
 const { ex } = require('../utils/q');

 function scrapeProfile(html) {
-    const { q, qa, qd, qi, qus } = ex(html); /* eslint-disable-line object-curly-newline */
+    const { qu } = ex(html); /* eslint-disable-line object-curly-newline */
    const profile = {};

-    const bio = qa('.infobox tr[valign="top"]')
-        .map(detail => qa(detail, 'td', true))
+    const bio = qu.all('.infobox tr[valign="top"]')
+        .map(detail => qu.all(detail, 'td', true))
        .reduce((acc, [key, value]) => ({ ...acc, [key.slice(0, -1).replace(/[\s+|/]/g, '_')]: value }), {});


@ -19,9 +19,9 @@ function scrapeProfile(html) {
    profile.gender = isTrans ? 'transsexual' : 'female';
    */

-    profile.birthdate = qd('.bday', 'YYYY-MM-DD');
+    profile.birthdate = qu.date('.bday', 'YYYY-MM-DD');

-    profile.description = q('#mw-content-text > p', true);
+    profile.description = qu.q('#mw-content-text > p', true);

    if (bio.Born) profile.birthPlace = bio.Born.slice(bio.Born.lastIndexOf(')') + 1);
    if (bio.Ethnicity) profile.ethnicity = bio.Ethnicity;
@ -62,7 +62,7 @@ function scrapeProfile(html) {
    if (bio.Blood_group) profile.blood = bio.Blood_group;
    if (bio.Also_known_as) profile.aliases = bio.Also_known_as.split(', ');

-    const avatarThumbPath = qi('.image img');
+    const avatarThumbPath = qu.img('.image img');

    if (avatarThumbPath && !/NoImageAvailable/.test(avatarThumbPath)) {
        const avatarPath = avatarThumbPath.slice(0, avatarThumbPath.lastIndexOf('/')).replace('thumb/', '');
@ -73,7 +73,7 @@ function scrapeProfile(html) {
        };
    }

-    profile.social = qus('.infobox a.external');
+    profile.social = qu.urls('.infobox a.external');

    return profile;
 }
--- a/src/scrapers/ddfnetwork.js
+++ b/src/scrapers/ddfnetwork.js
@ -27,26 +27,26 @@ function scrapeAll(html, site, origin) {
 }

 async function scrapeScene(html, url, _site) {
-    const { q, qa, qd, qm, qp, qus } = ex(html);
+    const { qu } = ex(html);
    const release = {};

    [release.entryId] = url.split('/').slice(-1);

-    release.title = qm('itemprop=name');
-    release.description = q('.descr-box p', true);
-    release.date = qd('meta[itemprop=uploadDate]', 'YYYY-MM-DD', null, 'content')
-        || qd('.title-border:nth-child(2) p', 'MM.DD.YYYY');
+    release.title = qu.meta('itemprop=name');
+    release.description = qu.q('.descr-box p', true);
+    release.date = qu.date('meta[itemprop=uploadDate]', 'YYYY-MM-DD', null, 'content')
+        || qu.date('.title-border:nth-child(2) p', 'MM.DD.YYYY');

-    release.actors = qa('.pornstar-card > a', 'title');
-    release.tags = qa('.tags-tab .tags a', true);
+    release.actors = qu.all('.pornstar-card > a', 'title');
+    release.tags = qu.all('.tags-tab .tags a', true);

-    release.duration = parseInt(q('.icon-video-red + span', true), 10) * 60;
-    release.likes = Number(q('.icon-like-red + span', true));
+    release.duration = parseInt(qu.q('.icon-video-red + span', true), 10) * 60;
+    release.likes = Number(qu.q('.icon-like-red + span', true));

-    release.poster = qp();
-    release.photos = qus('.photo-slider-guest .card a');
+    release.poster = qu.poster();
+    release.photos = qu.urls('.photo-slider-guest .card a');

-    release.trailer = qa('source[type="video/mp4"]').map(trailer => ({
+    release.trailer = qu.all('source[type="video/mp4"]').map(trailer => ({
        src: trailer.src,
        quality: Number(trailer.attributes.res.value),
    }));
@ -72,10 +72,10 @@ async function fetchActorReleases(urls) {
 }

 async function scrapeProfile(html, _url, actorName) {
-    const { q, qa, qus } = ex(html);
+    const { qu } = ex(html);

-    const keys = qa('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
-    const values = qa('.about-info').map((el) => {
+    const keys = qu.all('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
+    const values = qu.all('.about-info').map((el) => {
        if (el.children.length > 0) {
            return Array.from(el.children, child => child.textContent.trim()).join(', ');
        }
@ -96,7 +96,7 @@ async function scrapeProfile(html, _url, actorName) {
        name: actorName,
    };

-    profile.description = q('.description-box', true);
+    profile.description = qu.q('.description-box', true);
    profile.birthdate = ed(bio.birthday, 'MMMM DD, YYYY');

    if (bio.nationality) profile.nationality = bio.nationality;
@ -118,10 +118,10 @@ async function scrapeProfile(html, _url, actorName) {

    if (bio.shoe_size) profile.shoes = Number(bio.shoe_size.split('|')[1]);

-    const avatarEl = q('.pornstar-details .card-img-top');
+    const avatarEl = qu.q('.pornstar-details .card-img-top');
    if (avatarEl && avatarEl.dataset.src.match('^//')) profile.avatar = `https:${avatarEl.dataset.src}`;

-    profile.releases = await fetchActorReleases(qus('.find-me-tab li a'));
+    profile.releases = await fetchActorReleases(qu.urls('.find-me-tab li a'));

    return profile;
 }
--- a/src/scrapers/naughtyamerica.js
+++ b/src/scrapers/naughtyamerica.js
@ -101,22 +101,24 @@ function scrapeScene(html, url, site) {
 }

 async function fetchActorReleases(url) {
-    const { qus } = await get(url);
+    const res = await get(url);

-    return qus('.contain-block:not(.live-scenes) .scene-item > a:first-child'); // live scenes repeat on all pages
+    return res.ok
+        ? res.item.qu.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
+        : [];
 }

 async function scrapeProfile(html) {
-    const { q, qus } = ex(html);
+    const { qu } = ex(html);
    const profile = {};

-    profile.description = q('.bio_about_text', true);
+    profile.description = qu.q('.bio_about_text', true);

-    const avatar = q('img.performer-pic', 'src');
+    const avatar = qu.q('img.performer-pic', 'src');
    if (avatar) profile.avatar = `https:${avatar}`;

-    const releases = qus('.scene-item > a:first-child');
-    const otherPages = qus('.pagination a:not([rel=next]):not([rel=prev])');
+    const releases = qu.urls('.scene-item > a:first-child');
+    const otherPages = qu.urls('.pagination a:not([rel=next]):not([rel=prev])');
    const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));

    profile.releases = releases.concat(olderReleases.flat());
--- a/src/scrapers/nubiles.js
+++ b/src/scrapers/nubiles.js
@ -58,7 +58,7 @@ function scrapeAll(scenes, site, origin) {
    });
 }

-async function scrapeScene(qu, url, site) {
+async function scrapeScene({ qu }, url, site) {
    const release = {};

    const { origin, pathname } = new URL(url);
--- a/src/scrapers/vogov.js
+++ b/src/scrapers/vogov.js
@ -116,23 +116,23 @@ function scrapeLatest(html) {
 }

 function scrapeScene(html, url) {
-    const { q, qa, qd, qus, ql, qm } = ex(html);
+    const { qu } = ex(html);
    const release = { url };

    // release.entryId = slugify(release.title);
-    [release.entryId] = q('link[rel="canonical"]').href.match(/\d+/);
+    [release.entryId] = qu.q('link[rel="canonical"]').href.match(/\d+/);

-    release.title = qm('meta[property="og:title"]') || q('.video-page-header h1', true);
+    release.title = qu.meta('meta[property="og:title"]') || qu.q('.video-page-header h1', true);
+    release.description = qu.meta('meta[property="og:description"]') || qu.q('.info-video-description', true);

-    release.description = qm('meta[property="og:description"]') || q('.info-video-description', true);
-    release.date = qd('.info-video-details li:first-child span', 'MMM DD, YYYY');
-    release.duration = ql('.info-video-details li:nth-child(2) span');
+    release.date = qu.date('.info-video-details li:first-child span', 'MMM DD, YYYY');
+    release.duration = qu.dur('.info-video-details li:nth-child(2) span');

-    release.actors = qa('.info-video-models a', true);
-    release.tags = qa('.info-video-category a', true);
+    release.actors = qu.all('.info-video-models a', true);
+    release.tags = qu.all('.info-video-category a', true);

-    release.photos = qus('.swiper-wrapper .swiper-slide a').map(source => source.replace('.jpg/', '.jpg'));
-    release.poster = qm('meta[property="og:image"');
+    release.photos = qu.urls('.swiper-wrapper .swiper-slide a').map(source => source.replace('.jpg/', '.jpg'));
+    release.poster = qu.meta('meta[property="og:image"]'); // attribute selector closed with ]

    if (!release.poster) {
        const previewStart = html.indexOf('preview_url');