From 5c55750c0cdecd01f98347fb903e7d0848de4fe6 Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Tue, 10 Mar 2020 00:17:57 +0100
Subject: [PATCH] Fixed qu usage in scrapers. Fixed media fetching and fallback
 handling. Simplified and expanded date component in search query.

---
 config/default.js              | 12 +----
 src/media.js                   | 98 ++++++++++++++++++----------------
 src/releases.js                | 19 +++----
 src/scrapers/bangbros.js       | 12 +++--
 src/scrapers/boobpedia.js      | 14 ++---
 src/scrapers/ddfnetwork.js     | 36 ++++++-------
 src/scrapers/naughtyamerica.js | 16 +++---
 src/scrapers/nubiles.js        |  2 +-
 src/scrapers/vogov.js          | 20 +++----
 9 files changed, 113 insertions(+), 116 deletions(-)

diff --git a/config/default.js b/config/default.js
index d0811814..8ec5c143 100644
--- a/config/default.js
+++ b/config/default.js
@@ -66,6 +66,8 @@ module.exports = {
     '21sextury',
     'julesjordan',
     'naughtyamerica',
+    'cherrypimps',
+    'pimpxxx',
     [
       'hussiepass',
       'hushpass',
@@ -75,16 +77,6 @@
       'seehimfuck',
       'eyeontheguy',
     ],
-    [
-      'cherrypimps',
-      'drilledxxx',
-      'wildoncam',
-      'bcmxxx',
-      'familyxxx',
-      'petitexxx',
-      'confessionsxxx',
-      'cuckedxxx',
-    ],
     [
       // Full Porn Network
       'analized',
diff --git a/src/media.js b/src/media.js
index a8c82af0..1f6091da 100644
--- a/src/media.js
+++ b/src/media.js
@@ -85,87 +85,91 @@ async function extractItem(source) {
   const res = await get(source.src);

   if (res.statusCode === 200) {
-    const { q } = ex(res.body.toString());
+    const { qu } = ex(res.body.toString());

-    return source.extract(q);
+    return source.extract(qu);
   }

   return null;
 }

+async function fetchSource(source, domain, role, originalSource) {
+  logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
+
+  // const res = await bhttp.get(source.src || source);
+  const res = await get(source.src || source, {
+    headers: {
+      ...(source.referer && { referer: source.referer }),
+      ...(source.host && { host: source.host }),
+    },
+  });
+
+  if (res.statusCode === 200) {
+    const { pathname } = new URL(source.src || source);
+    const mimetype = mime.getType(pathname);
+    const extension = mime.getExtension(mimetype);
+    const hash = getHash(res.body);
+    const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
+
+    logger.verbose(`Fetched media item from ${source.src || source}`);
+
+    return {
+      file: res.body,
+      mimetype,
+      extension,
+      hash,
+      entropy: entropy || null,
+      size: size || null,
+      width: width || null,
+      height: height || null,
+      quality: source.quality || null,
+      source: originalSource?.src || originalSource || source.src || source,
+      scraper: source.scraper,
+      copyright: source.copyright,
+    };
+  }
+
+  throw new Error(`Response ${res.statusCode} not OK`);
+}
+
 async function fetchItem(source, index, existingItemsBySource, domain, role, attempt = 1, originalSource = null, sourceIndex = 0) {
   if (!source) return null;

   try {
     if (Array.isArray(source)) {
-      if (source.every(sourceX => !!sourceX.quality)) {
+      if (source.every(sourceX => sourceX.quality)) {
         // various video qualities provided
         const selectedSource = pickQuality(source);
         return fetchItem(selectedSource, index, existingItemsBySource, domain, role, attempt, originalSource);
       }

       // fallbacks provided
-      return source.reduce(
-        (outcome, sourceX, sourceIndexX) => outcome.catch(async () => fetchItem(sourceX, index, existingItemsBySource, domain, role, attempt, originalSource, sourceIndexX)),
-        Promise.reject(new Error()),
-      );
+      return source.reduce((outcome, sourceX, sourceIndexX) => outcome.catch(
+        async () => fetchItem(sourceX, index, existingItemsBySource, domain, role, attempt, source, sourceIndexX),
+      ), Promise.reject(new Error()));
     }

     if (source.src && source.extract) {
       // source links to page containing a (presumably) tokenized photo
       const itemSource = await extractItem(source);

-      return fetchItem(itemSource, index, existingItemsBySource, domain, role, attempt, source);
+      return fetchItem(itemSource, index, existingItemsBySource, domain, role, attempt, source, sourceIndex);
     }
-
     if (existingItemsBySource[source]) {
       return null;
     }

-    logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);
-
-    // const res = await bhttp.get(source.src || source);
-    const res = await get(source.src || source, {
-      headers: {
-        ...(source.referer && { referer: source.referer }),
-        ...(source.host && { host: source.host }),
-      },
-    });
-
-    if (res.statusCode === 200) {
-      const { pathname } = new URL(source.src || source);
-      const mimetype = mime.getType(pathname);
-      const extension = mime.getExtension(mimetype);
-      const hash = getHash(res.body);
-      const { entropy, size, width, height } = /image/.test(mimetype) ? await getMeta(res.body) : {};
-
-      logger.verbose(`Fetched media item from ${source.src || source}`);
-
-      return {
-        file: res.body,
-        mimetype,
-        extension,
-        hash,
-        entropy: entropy || null,
-        size: size || null,
-        width: width || null,
-        height: height || null,
-        quality: source.quality || null,
-        source: originalSource?.src || originalSource || source.src || source,
-        scraper: source.scraper,
-        copyright: source.copyright,
-      };
-    }
-
-    throw new Error(`Response ${res.statusCode} not OK`);
+    return fetchSource(source, domain, role, originalSource);
   } catch (error) {
     logger.warn(`Failed attempt ${attempt}/3 to fetch ${domain} ${role} ${index + 1} (${source.src || source}): ${error}`);

+    /*
     if (attempt < 3) {
       await Promise.delay(5000);
-      return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource);
+      return fetchItem(source, index, existingItemsBySource, domain, role, attempt + 1, originalSource, sourceIndex);
     }
+    */

     if (originalSource && sourceIndex < originalSource.length) {
       throw error;
@@ -351,7 +355,7 @@ function associateTargetMedia(targetId, sources, mediaBySource, domain, role, pr
   if (!source) return null;

   const mediaItem = Array.isArray(source)
-    ? source.reduce((acc, sourceX) => acc || mediaBySource[sourceX.src || sourceX], null)
+    ? mediaBySource[source.map(sourceX => sourceX.src || sourceX).toString()]
     : mediaBySource[source.src || source];

   // return mediaItem && { [`${domain}_id`]: targetId, media_id: mediaItem.id };
diff --git a/src/releases.js b/src/releases.js
index 27636035..d982ab67 100644
--- a/src/releases.js
+++ b/src/releases.js
@@ -369,24 +369,19 @@ async function storeReleaseAssets(releases) {
 async function updateReleasesSearch(releaseIds) {
   const documents = await knex.raw(`
     SELECT
-      releases.id as release_id,
-      to_tsvector(
+      releases.id AS release_id,
+      TO_TSVECTOR(
         'traxxx',
         releases.title || ' ' ||
         sites.name || ' ' ||
         sites.slug || ' ' ||
         networks.name || ' ' ||
         networks.slug || ' ' ||
-        coalesce(releases.shoot_id, '') || ' ' ||
-        EXTRACT(YEAR FROM releases.date) || ' ' ||
-        CAST(EXTRACT(MONTH FROM releases.date) AS VARCHAR) || ' ' ||
-        CAST(EXTRACT(DAY FROM releases.date) AS VARCHAR) || ' ' ||
-        SUBSTRING(CAST(EXTRACT(YEAR FROM releases.date) AS VARCHAR) FROM 3 for 2) || ' ' ||
-        LPAD(CAST(EXTRACT(MONTH FROM releases.date) AS VARCHAR), 2, '0') || ' ' ||
-        LPAD(CAST(EXTRACT(DAY FROM releases.date) AS VARCHAR), 2, '0') || ' ' ||
-        string_agg(coalesce(actors.name, ''), ' ') || ' ' ||
-        string_agg(coalesce(tags.name, ''), ' ') || ' ' ||
-        string_agg(coalesce(tags_aliases.name, ''), ' ')
+        COALESCE(releases.shoot_id, '') || ' ' ||
+        COALESCE(TO_CHAR(releases.date, 'YYYY YY MM FMMM FMmonth mon DD FMDD'), '') || ' ' ||
+        STRING_AGG(COALESCE(actors.name, ''), ' ') || ' ' ||
+        STRING_AGG(COALESCE(tags.name, ''), ' ') || ' ' ||
+        STRING_AGG(COALESCE(tags_aliases.name, ''), ' ')
       ) as document
     FROM releases
     LEFT JOIN sites ON releases.site_id = sites.id
diff --git a/src/scrapers/bangbros.js b/src/scrapers/bangbros.js
index b87af897..e4d15cb5 100644
--- a/src/scrapers/bangbros.js
+++ b/src/scrapers/bangbros.js
@@ -5,6 +5,7 @@ const bhttp = require('bhttp');
 const cheerio = require('cheerio');
 const moment = require('moment');

+const logger = require('../logger')(__filename);
 const slugify = require('../utils/slugify');
 const { ex } = require('../utils/q');

@@ -105,7 +106,10 @@ function scrapeScene(html, url, _site) {
   release.photos = Array.from({ length: 12 }, (val, index) => firstPhotoUrl.replace(/big\d+/, `big${index + 1}`));

   const [channel] = qu.url('a[href*="/websites"]').match(/\w+$/);
-  release.channel = channel === 'bangcasting' ? 'bangbroscasting' : channel;
+
+  if (channel === 'bangcasting') release.channel = 'bangbroscasting';
+  else if (channel === 'remaster') release.channel = 'bangbrosremastered';
+  else release.channel = channel;

   return release;
 }
@@ -123,8 +127,8 @@ function scrapeProfile(html) {
 }

 function scrapeProfileSearch(html, actorName) {
-  const { q } = ex(html);
-  const actorLink = q(`a[title="${actorName}"]`, 'href');
+  const { qu } = ex(html);
+  const actorLink = qu.url(`a[title="${actorName}" i][href*="model"]`);

   return actorLink ? `https://bangbros.com${actorLink}` : null;
 }
@@ -145,7 +149,7 @@ async function fetchUpcoming(site) {

 async function fetchScene(url, site, release) {
   if (!release?.date) {
-    throw new Error(`Cannot fetch Bang Bros scenes from argument URL, as scene pages do not have release dates: ${url}`);
+    logger.warn(`Scraping Bang Bros scene from URL without release date: ${url}`);
   }

   const { origin } = new URL(url);
diff --git a/src/scrapers/boobpedia.js b/src/scrapers/boobpedia.js
index dca76e73..a77e6866 100644
--- a/src/scrapers/boobpedia.js
+++ b/src/scrapers/boobpedia.js
@@ -5,11 +5,11 @@ const bhttp = require('bhttp');
 const { ex } = require('../utils/q');

 function scrapeProfile(html) {
-  const { q, qa, qd, qi, qus } = ex(html); /* eslint-disable-line object-curly-newline */
+  const { qu } = ex(html);
   const profile = {};

-  const bio = qa('.infobox tr[valign="top"]')
-    .map(detail => qa(detail, 'td', true))
+  const bio = qu.all('.infobox tr[valign="top"]')
+    .map(detail => qu.all(detail, 'td', true))
     .reduce((acc, [key, value]) => ({ ...acc, [key.slice(0, -1).replace(/[\s+|/]/g, '_')]: value }), {});
@@ -19,9 +19,9 @@
   profile.gender = isTrans ? 'transsexual' : 'female'; */

-  profile.birthdate = qd('.bday', 'YYYY-MM-DD');
+  profile.birthdate = qu.date('.bday', 'YYYY-MM-DD');

-  profile.description = q('#mw-content-text > p', true);
+  profile.description = qu.q('#mw-content-text > p', true);

   if (bio.Born) profile.birthPlace = bio.Born.slice(bio.Born.lastIndexOf(')') + 1);
   if (bio.Ethnicity) profile.ethnicity = bio.Ethnicity;
@@ -62,7 +62,7 @@ function scrapeProfile(html) {
   if (bio.Blood_group) profile.blood = bio.Blood_group;
   if (bio.Also_known_as) profile.aliases = bio.Also_known_as.split(', ');

-  const avatarThumbPath = qi('.image img');
+  const avatarThumbPath = qu.img('.image img');

   if (avatarThumbPath && !/NoImageAvailable/.test(avatarThumbPath)) {
     const avatarPath = avatarThumbPath.slice(0, avatarThumbPath.lastIndexOf('/')).replace('thumb/', '');
@@ -73,7 +73,7 @@ function scrapeProfile(html) {
     };
   }

-  profile.social = qus('.infobox a.external');
+  profile.social = qu.urls('.infobox a.external');

   return profile;
 }
diff --git a/src/scrapers/ddfnetwork.js b/src/scrapers/ddfnetwork.js
index 44eae802..8bda6e8e 100644
--- a/src/scrapers/ddfnetwork.js
+++ b/src/scrapers/ddfnetwork.js
@@ -27,26 +27,26 @@ function scrapeAll(html, site, origin) {
 }

 async function scrapeScene(html, url, _site) {
-  const { q, qa, qd, qm, qp, qus } = ex(html);
+  const { qu } = ex(html);

   const release = {};
   [release.entryId] = url.split('/').slice(-1);

-  release.title = qm('itemprop=name');
-  release.description = q('.descr-box p', true);
-  release.date = qd('meta[itemprop=uploadDate]', 'YYYY-MM-DD', null, 'content')
-    || qd('.title-border:nth-child(2) p', 'MM.DD.YYYY');
+  release.title = qu.meta('itemprop=name');
+  release.description = qu.q('.descr-box p', true);
+  release.date = qu.date('meta[itemprop=uploadDate]', 'YYYY-MM-DD', null, 'content')
+    || qu.date('.title-border:nth-child(2) p', 'MM.DD.YYYY');

-  release.actors = qa('.pornstar-card > a', 'title');
-  release.tags = qa('.tags-tab .tags a', true);
+  release.actors = qu.all('.pornstar-card > a', 'title');
+  release.tags = qu.all('.tags-tab .tags a', true);

-  release.duration = parseInt(q('.icon-video-red + span', true), 10) * 60;
-  release.likes = Number(q('.icon-like-red + span', true));
+  release.duration = parseInt(qu.q('.icon-video-red + span', true), 10) * 60;
+  release.likes = Number(qu.q('.icon-like-red + span', true));

-  release.poster = qp();
-  release.photos = qus('.photo-slider-guest .card a');
+  release.poster = qu.poster();
+  release.photos = qu.urls('.photo-slider-guest .card a');

-  release.trailer = qa('source[type="video/mp4"]').map(trailer => ({
+  release.trailer = qu.all('source[type="video/mp4"]').map(trailer => ({
     src: trailer.src,
     quality: Number(trailer.attributes.res.value),
   }));
@@ -72,10 +72,10 @@ async function fetchActorReleases(urls) {
 }

 async function scrapeProfile(html, _url, actorName) {
-  const { q, qa, qus } = ex(html);
+  const { qu } = ex(html);

-  const keys = qa('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
-  const values = qa('.about-info').map((el) => {
+  const keys = qu.all('.about-title', true).map(key => slugify(key, { delimiter: '_' }));
+  const values = qu.all('.about-info').map((el) => {
     if (el.children.length > 0) {
       return Array.from(el.children, child => child.textContent.trim()).join(', ');
     }
@@ -96,7 +96,7 @@ async function scrapeProfile(html, _url, actorName) {
     name: actorName,
   };

-  profile.description = q('.description-box', true);
+  profile.description = qu.q('.description-box', true);

   profile.birthdate = ed(bio.birthday, 'MMMM DD, YYYY');
   if (bio.nationality) profile.nationality = bio.nationality;
@@ -118,10 +118,10 @@ async function scrapeProfile(html, _url, actorName) {

   if (bio.shoe_size) profile.shoes = Number(bio.shoe_size.split('|')[1]);

-  const avatarEl = q('.pornstar-details .card-img-top');
+  const avatarEl = qu.q('.pornstar-details .card-img-top');

   if (avatarEl && avatarEl.dataset.src.match('^//')) profile.avatar = `https:${avatarEl.dataset.src}`;

-  profile.releases = await fetchActorReleases(qus('.find-me-tab li a'));
+  profile.releases = await fetchActorReleases(qu.urls('.find-me-tab li a'));

   return profile;
 }
diff --git a/src/scrapers/naughtyamerica.js b/src/scrapers/naughtyamerica.js
index b3a50e5b..95911c52 100644
--- a/src/scrapers/naughtyamerica.js
+++ b/src/scrapers/naughtyamerica.js
@@ -101,22 +101,24 @@ function scrapeScene(html, url, site) {
 }

 async function fetchActorReleases(url) {
-  const { qus } = await get(url);
+  const res = await get(url);

-  return qus('.contain-block:not(.live-scenes) .scene-item > a:first-child'); // live scenes repeat on all pages
+  return res.ok
+    ? res.item.qu.urls('.contain-block:not(.live-scenes) .scene-item > a:first-child') // live scenes repeat on all pages
+    : [];
 }

 async function scrapeProfile(html) {
-  const { q, qus } = ex(html);
+  const { qu } = ex(html);

   const profile = {};

-  profile.description = q('.bio_about_text', true);
+  profile.description = qu.q('.bio_about_text', true);

-  const avatar = q('img.performer-pic', 'src');
+  const avatar = qu.q('img.performer-pic', 'src');

   if (avatar) profile.avatar = `https:${avatar}`;

-  const releases = qus('.scene-item > a:first-child');
-  const otherPages = qus('.pagination a:not([rel=next]):not([rel=prev])');
+  const releases = qu.urls('.scene-item > a:first-child');
+  const otherPages = qu.urls('.pagination a:not([rel=next]):not([rel=prev])');
   const olderReleases = await Promise.all(otherPages.map(async page => fetchActorReleases(page)));

   profile.releases = releases.concat(olderReleases.flat());
diff --git a/src/scrapers/nubiles.js b/src/scrapers/nubiles.js
index 9c72a05a..c7208609 100644
--- a/src/scrapers/nubiles.js
+++ b/src/scrapers/nubiles.js
@@ -58,7 +58,7 @@ function scrapeAll(scenes, site, origin) {
   });
 }

-async function scrapeScene(qu, url, site) {
+async function scrapeScene({ qu }, url, site) {
   const release = {};

   const { origin, pathname } = new URL(url);
diff --git a/src/scrapers/vogov.js b/src/scrapers/vogov.js
index db2e8358..91e8a07b 100644
--- a/src/scrapers/vogov.js
+++ b/src/scrapers/vogov.js
@@ -116,23 +116,23 @@ function scrapeLatest(html) {
 }

 function scrapeScene(html, url) {
-  const { q, qa, qd, qus, ql, qm } = ex(html);
+  const { qu } = ex(html);

   const release = { url };

   // release.entryId = slugify(release.title);
-  [release.entryId] = q('link[rel="canonical"]').href.match(/\d+/);
+  [release.entryId] = qu.q('link[rel="canonical"]').href.match(/\d+/);

-  release.title = qm('meta[property="og:title"]') || q('.video-page-header h1', true);
+  release.title = qu.meta('meta[property="og:title"]') || qu.q('.video-page-header h1', true);
+  release.description = qu.meta('meta[property="og:description"]') || qu.q('.info-video-description', true);

-  release.description = qm('meta[property="og:description"]') || q('.info-video-description', true);
-  release.date = qd('.info-video-details li:first-child span', 'MMM DD, YYYY');
-  release.duration = ql('.info-video-details li:nth-child(2) span');
+  release.date = qu.date('.info-video-details li:first-child span', 'MMM DD, YYYY');
+  release.duration = qu.dur('.info-video-details li:nth-child(2) span');

-  release.actors = qa('.info-video-models a', true);
-  release.tags = qa('.info-video-category a', true);
+  release.actors = qu.all('.info-video-models a', true);
+  release.tags = qu.all('.info-video-category a', true);

-  release.photos = qus('.swiper-wrapper .swiper-slide a').map(source => source.replace('.jpg/', '.jpg'));
-  release.poster = qm('meta[property="og:image"');
+  release.photos = qu.urls('.swiper-wrapper .swiper-slide a').map(source => source.replace('.jpg/', '.jpg'));
+  release.poster = qu.meta('meta[property="og:image"]');

   if (!release.poster) {
     const previewStart = html.indexOf('preview_url');
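
Note on the consolidated date component, since the TO_CHAR template above is dense: the single TO_CHAR call replaces the old EXTRACT/LPAD/SUBSTRING chain and emits the full year, two-digit year, padded and unpadded month and day numbers, and the full and abbreviated month names as search tokens. A minimal sketch of what it yields, assuming PostgreSQL's documented TO_CHAR template patterns and the same knex instance used in src/releases.js (the sample date and the date_tokens alias are made up for this example):

    // Sketch only: token breakdown for a release dated 2020-03-10.
    // YYYY=2020, YY=20, MM=03, FMMM=3, FMmonth=march, mon=mar, DD=10, FMDD=10.
    const { rows } = await knex.raw(
      "SELECT TO_CHAR(DATE '2020-03-10', 'YYYY YY MM FMMM FMmonth mon DD FMDD') AS date_tokens",
    );
    // rows[0].date_tokens === '2020 20 03 3 march mar 10 10'

For the scraper changes, the old helpers map one-to-one onto the new qu accessor, as the hunks above show: q -> qu.q, qa -> qu.all, qd -> qu.date, qi -> qu.img, qus -> qu.urls, qm -> qu.meta, qp -> qu.poster, ql -> qu.dur.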