From 0ed1b2eff9679705b36fbc83f101f2e58694c17d Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Sun, 2 Feb 2020 22:36:33 +0100
Subject: [PATCH] Added generic photo page extract method to media module, to
 allow pre-filtering sources and relieve the Dogfart scraper. Added
 'transsexual' site tag to Trans Angels.

---
 seeds/03_tags.js         |  1 +
 src/media.js             | 38 ++++++++++++++++++++++++++++++++++----
 src/scrapers/blowpass.js |  8 ++++++--
 src/scrapers/dogfart.js  | 32 +++++++++++---------------------
 4 files changed, 52 insertions(+), 27 deletions(-)

diff --git a/seeds/03_tags.js b/seeds/03_tags.js
index 46d1de24..c11dc8d2 100644
--- a/seeds/03_tags.js
+++ b/seeds/03_tags.js
@@ -1654,6 +1654,7 @@ function getSiteTags() {
     thegayoffice: ['gay'],
     tiny4k: ['4k'],
     toptobottom: ['gay'],
+    transangels: ['transsexual'],
     trueanal: ['anal'],
     tspussyhunters: ['transsexual'],
   };
diff --git a/src/media.js b/src/media.js
index a6ee75bb..f6022ccb 100644
--- a/src/media.js
+++ b/src/media.js
@@ -12,6 +12,7 @@ const blake2 = require('blake2');
 const logger = require('./logger');
 const knex = require('./knex');
 const upsert = require('./utils/upsert');
+const { ex } = require('./utils/q');
 
 function getHash(buffer) {
   const hash = blake2.createHash('blake2b', { digestLength: 24 });
@@ -81,12 +82,22 @@
 async function findDuplicates(photos, identifier, prop = null, label) {
   const duplicates = await knex('media')
-    .whereIn(identifier, photos.flat().map(photo => (prop ? photo[prop] : photo)));
+    .whereIn(identifier, photos.flat().map((photo) => {
+      if (prop) return photo[prop];
+      if (photo.src) return photo.src;
+
+      return photo;
+    }));
 
   const duplicateLookup = new Set(duplicates.map(photo => photo[prop || identifier]));
-  const originals = photos.filter(source => (Array.isArray(source) // fallbacks provided?
-    ? !source.some(sourceX => duplicateLookup.has(prop ? sourceX[prop] : sourceX)) // ensure none of the sources match
-    : !duplicateLookup.has(prop ? source[prop] : source)));
+
+  const originals = photos.filter((source) => {
+    if (Array.isArray(source)) { // fallbacks provided?
+      return !source.some(sourceX => duplicateLookup.has((prop && sourceX[prop]) || sourceX.src || sourceX)); // ensure none of the sources match
+    }
+
+    return !duplicateLookup.has((prop && source[prop]) || source.src || source);
+  });
 
   if (duplicates.length > 0) {
     logger.info(`${duplicates.length} media items already present by ${identifier} for ${label}`);
   }
@@ -99,7 +110,26 @@ async function findDuplicates(photos, identifier, prop = null, label) {
   return [duplicates, originals];
 }
 
+async function extractPhoto(source) {
+  const res = await bhttp.get(source.src);
+
+  if (res.statusCode === 200) {
+    const { q } = ex(res.body.toString());
+
+    return source.extract(q);
+  }
+
+  return null;
+}
+
 async function fetchPhoto(photoUrl, index, label, attempt = 1) {
+  if (photoUrl.src && photoUrl.extract) {
+    // source links to a page containing a (presumably) tokenized photo; bail out if extraction fails
+    const photo = await extractPhoto(photoUrl);
+
+    return photo ? fetchPhoto(photo, index, label) : null;
+  }
+
   if (Array.isArray(photoUrl)) {
     return photoUrl.reduce(async (outcome, url) => outcome.catch(async () => {
       const photo = await fetchPhoto(url, index, label);
diff --git a/src/scrapers/blowpass.js b/src/scrapers/blowpass.js
index d816f2ce..60eb3f99 100644
--- a/src/scrapers/blowpass.js
+++ b/src/scrapers/blowpass.js
@@ -9,7 +9,11 @@ async function fetchScene(url, site) {
   const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`);
   const release = await scrapeScene(res.body.toString(), url, site);
 
-  release.channel = release.$('.siteNameSpan').text().trim().toLowerCase();
+  release.channel = release.$('.siteNameSpan')
+    .text()
+    .trim()
+    .toLowerCase()
+    .replace('.com', '');
 
-  if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/');
+  if (['onlyteenblowjobs', 'mommyblowsbest'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/');
   else release.url = url.replace(/video\/\w+\//, 'video/');
diff --git a/src/scrapers/dogfart.js b/src/scrapers/dogfart.js
index 8aadaefc..e807ad17 100644
--- a/src/scrapers/dogfart.js
+++ b/src/scrapers/dogfart.js
@@ -1,21 +1,11 @@
 'use strict';
 
 /* eslint-disable newline-per-chained-call */
-const Promise = require('bluebird');
+// const Promise = require('bluebird');
 const bhttp = require('bhttp');
 const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-async function getPhoto(url) {
-  const res = await bhttp.get(url);
-  const html = res.body.toString();
-  const { document } = new JSDOM(html).window;
-
-  const photoUrl = document.querySelector('.scenes-module img').src;
-
-  return photoUrl;
-}
-
 async function getPhotos(albumUrl) {
   const res = await bhttp.get(albumUrl);
   const html = res.body.toString();
   const { document } = new JSDOM(html).window;
 
   const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href;
   const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10);
 
-  const photoUrls = await Promise.map(Array.from({ length: lastPhotoIndex }), async (value, index) => {
-    const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+.jpg/, `${index.toString().padStart(3, '0')}.jpg`)}`;
-    return getPhoto(pageUrl);
-  }, {
-    concurrency: 5,
+  const photoUrls = Array.from({ length: lastPhotoIndex }, (value, index) => {
+    const pageUrl = `https://blacksonblondes.com${lastPhotoPage.replace(/\d+\.jpg/, `${(index + 1).toString().padStart(3, '0')}.jpg`)}`;
+    return {
+      src: pageUrl,
+      extract: q => q('.scenes-module img', 'src'),
+    };
   });
 
   return photoUrls;
 }
@@ -90,6 +81,9 @@ async function scrapeScene(html, url, site) {
     .trim();
 
   const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase();
+  const { origin, pathname } = new URL(url);
+  const entryId = `${channel}_${pathname.split('/').slice(-2)[0]}`;
+
   const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content);
   const duration = moment
     .duration(`00:${document
@@ -103,13 +97,13 @@ async function scrapeScene(html, url, site) {
   const { trailer } = trailerElement.dataset;
 
   const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0].href;
-  const { origin, pathname } = new URL(url);
   const photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url);
 
   const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]').textContent) / 2);
   const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent);
 
   return {
+    entryId,
     url: `${origin}${pathname}`,
     title,
     description,
     duration,
     date,
     actors,
     photos,
     trailer: {
       src: trailer,
       quality: 720,
     },
     rating: {
       stars,
     },
     site,
     channel,
     tags,
   };
 }
 
 async function fetchLatest(site, page = 1) {
-  console.time('dogfart');
-  console.log('scraping...', site.name);
   const res = await bhttp.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`);
-  console.timeEnd('dogfart');
-  console.log('done!', site.name);
 
   return scrapeLatest(res.body.toString(), site);
 }
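
Reviewer note, not part of the patch itself: with this change a scraper no
longer has to download every photo page up front. It returns plain
{ src, extract } descriptors, findDuplicates() in src/media.js filters them by
src before any request is made, and fetchPhoto() resolves the survivors
through the new extractPhoto(). A minimal sketch of a consuming scraper
follows; the function name, album URL, page count, and CSS selector are
hypothetical, while q is the query helper handed over by ex() from
src/utils/q, as used above.

// Hypothetical sketch, not part of this patch.
function getExamplePhotos(albumUrl, pageCount) {
  // Build one descriptor per photo page without fetching anything here.
  // media.js only fetches the pages whose src survives duplicate filtering,
  // then calls extract(q) on the page body to pull the tokenized photo URL.
  return Array.from({ length: pageCount }, (value, index) => ({
    // The page URL doubles as the duplicate-lookup key in findDuplicates().
    src: `${albumUrl}/${(index + 1).toString().padStart(3, '0')}.jpg`,
    // Selector is illustrative; the Dogfart scraper above uses '.scenes-module img'.
    extract: q => q('.photo-module img', 'src'),
  }));
}

Since fetchPhoto() still accepts plain URLs and fallback arrays as well,
descriptors like these can be mixed freely with the existing source formats.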