From 3541a9c40231277a2ff2809a9e6e1e9ba9b985b3 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sat, 1 Feb 2020 02:26:00 +0100 Subject: [PATCH] Integrated Blowpass into generic Gamma scraper. --- config/default.js | 32 +---------- src/scrape-sites.js | 2 +- src/scrapers/blowpass.js | 120 +++++---------------------------------- src/scrapers/gamma.js | 11 ++-- src/sites.js | 17 +++++- 5 files changed, 36 insertions(+), 146 deletions(-) diff --git a/config/default.js b/config/default.js index 5d16b626..82f50dd5 100644 --- a/config/default.js +++ b/config/default.js @@ -9,36 +9,8 @@ module.exports = { host: '0.0.0.0', port: 5000, }, - include: [ - '21sextury', - 'babes', - 'bang', - 'bangbros', - 'blowpass', - 'brazzers', - 'ddfnetwork', - 'digitalplayground', - 'dogfartnetwork', - 'evilangel', - 'fakehub', - 'jayrock', - 'julesjordan', - 'kellymadison', - 'kink', - 'legalporno', - 'mikeadriano', - 'milehighmedia', - 'mofos', - 'naughtyamerica', - 'perfectgonzo', - 'pervcity', - 'pornpros', - 'private', - 'realitykings', - 'teamskeet', - 'vixen', - 'xempire', - ], + // include: [], + // exclude: [], fetchAfter: [1, 'week'], media: { path: './media', diff --git a/src/scrape-sites.js b/src/scrape-sites.js index 2f181d68..e9707d8d 100644 --- a/src/scrape-sites.js +++ b/src/scrape-sites.js @@ -105,7 +105,7 @@ async function scrapeSiteReleases(scraper, site) { ]); if (argv.upcoming) { - logger.info(`${site.name}: ${argv.latest ? 'Found' : 'Ignoring'} ${newReleases.length || ''}latest releases, ${argv.upcoming ? '' : 'ignoring '}${upcomingReleases.length || ''} upcoming releases`); + logger.info(`${site.name}: ${argv.latest ? `Found ${newReleases.length}` : 'Ignoring'} latest releases, ${argv.upcoming ? '' : 'ignoring '}${upcomingReleases.length || ''} upcoming releases`); } const baseReleases = [...newReleases, ...upcomingReleases]; diff --git a/src/scrapers/blowpass.js b/src/scrapers/blowpass.js index 59d38f4f..d83774e7 100644 --- a/src/scrapers/blowpass.js +++ b/src/scrapers/blowpass.js @@ -1,132 +1,38 @@ 'use strict'; -/* eslint-disable newline-per-chained-call */ const bhttp = require('bhttp'); -const cheerio = require('cheerio'); -const moment = require('moment'); -const { getPhotos, fetchProfile } = require('./gamma'); +const { scrapeAll, scrapeScene, fetchProfile } = require('./gamma'); -function scrape(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const sceneElements = $('.sceneList .scene').toArray(); +async function fetchScene(url, site) { + // const res = await bhttp.get(url); + const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`); - return sceneElements.map((element) => { - const entryId = $(element).attr('data-itemid'); + const release = await scrapeScene(res.body.toString(), url, site); + release.channel = release.$('.siteNameSpan').text().trim().toLowerCase(); - const sceneLinkElement = $(element).find('.sceneTitle a'); - const title = sceneLinkElement.attr('title'); - const url = `${site.url}/en/scene/${sceneLinkElement.attr('href').split('/').slice(-2).join('/')}`; + if (['onlyteenblowjobs.com', 'mommyblowsbest.com'].includes(release.channel)) release.url = url.replace(/video\/\w+\//, 'scene/'); + else release.url = url.replace(/video\/\w+\//, 'video/'); - const date = moment.utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY').toDate(); - const actors = $(element).find('.sceneActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); - - const poster = $(element).find('a.imgLink img.img').attr('data-original'); - const trailer = `https://videothumb.gammacdn.com/600x339/${entryId}.mp4`; - - const likes = Number($(element).find('.rating .state_1 .value').text()); - - return { - url, - entryId, - title, - actors, - date, - poster, - trailer: { - src: trailer, - quality: 339, - }, - rating: { - likes, - }, - site, - }; - }); -} - -async function scrapeScene(html, url, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const json = $('script[type="application/ld+json"]').html(); - const data = JSON.parse(json).slice(-1)[0]; - const sceneElement = $('#wrapper'); - - const videoScript = $('script:contains("window.ScenePlayerOptions")').html(); - const playerObject = videoScript.slice(videoScript.indexOf('{'), videoScript.indexOf('};') + 1); - const playerData = JSON.parse(playerObject); - - // const workName = data.isPartOf.name.split(' - '); - // const shootId = workName.length > 1 ? workName[1] : null; - const entryId = url.split('/').slice(-1)[0]; - const title = data.title || $('meta[name="twitter:title"]').attr('content'); - const description = data.description || $('meta[name="twitter:description"]').attr('content'); - // date in data object is not the release date of the scene, but the date the entry was added - const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate(); - - const actors = data.actor.map(({ name }) => name); - - const likes = Number(sceneElement.find('.rating .state_1 .value').text()); - const dislikes = Number(sceneElement.find('.rating .state_2 .value').text()); - - const channel = $('.siteNameSpan').text().trim().toLowerCase(); - - const poster = playerData.picPreview; - const trailer = `${playerData.playerOptions.host}${playerData.url}`; - const photos = await getPhotos($('.picturesItem a').attr('href'), 'blowpass.com', site); - - const duration = moment.duration(data.duration.slice(2)).asSeconds(); - const tags = data.keywords.split(', '); - - return { - url, - // shootId, - entryId, - title, - description, - actors, - date, - duration, - poster, - photos, - trailer: { - src: trailer, - quality: playerData.sizeOnLoad.slice(0, -1), - }, - tags, - rating: { - likes, - dislikes, - }, - site, - channel, - }; + return release; } async function fetchLatest(site, page = 1) { const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/latest/All-Categories/0/All-Pornstars/0/${page}`); - return scrape(res.body.toString(), site); + return scrapeAll(res.body.toString(), site); } async function fetchUpcoming(site) { const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/upcoming`); - return scrape(res.body.toString(), site); + return scrapeAll(res.body.toString(), site); } -async function fetchScene(url, site) { - const res = await bhttp.get(`https://www.blowpass.com/en/video/${site.id}/${new URL(url).pathname.split('/').slice(-2).join('/')}`); - - return scrapeScene(res.body.toString(), url, site); -} - -async function blowpassFetchProfile(actorName) { - return fetchProfile(actorName, 'blowpass'); -} module.exports = { fetchLatest, - fetchProfile: blowpassFetchProfile, - fetchScene, + fetchProfile, fetchUpcoming, + fetchScene, }; diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js index 33243b3c..d4da5501 100644 --- a/src/scrapers/gamma.js +++ b/src/scrapers/gamma.js @@ -106,14 +106,14 @@ async function scrapeApiReleases(json, site) { }); } -function scrapeAll(html, site) { +function scrapeAll(html, site, useNetworkUrl) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const scenesElements = $('li[data-itemtype=scene]').toArray(); return scenesElements.map((element) => { const sceneLinkElement = $(element).find('.sceneTitle a'); - const url = `${site.url}${sceneLinkElement.attr('href')}`; + const url = `${useNetworkUrl ? site.network.url : site.url}${sceneLinkElement.attr('href')}`; const title = sceneLinkElement.attr('title'); const entryId = $(element).attr('data-itemid'); @@ -175,8 +175,8 @@ async function scrapeScene(html, url, site) { release.date = moment.utc(dateMatch, ['MM-DD-YYYY', 'YYYY-MM-DD']).toDate(); release.director = data.director?.[0].name || data2?.director?.[0].name; - release.actors = data.actor.map(actor => actor.name); - const hasTrans = data.actor.some(actor => actor.gender === 'shemale'); + release.actors = (data.actor || data2.actor).map(actor => actor.name); + const hasTrans = (data.actor || data2.actor).some(actor => actor.gender === 'shemale'); const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5; if (stars) release.rating = { stars }; @@ -339,7 +339,8 @@ async function fetchApiUpcoming(site) { } async function fetchLatest(site, page = 1) { - const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`); + const url = `${site.url}/en/videos/AllCategories/0/${page}`; + const res = await bhttp.get(url); return scrapeAll(res.body.toString(), site); } diff --git a/src/sites.js b/src/sites.js index 79b54ed9..3515f507 100644 --- a/src/sites.js +++ b/src/sites.js @@ -39,7 +39,7 @@ function curateSites(sites, includeParameters) { return Promise.all(sites.map(async site => curateSite(site, includeParameters))); } -function destructConfigNetworks(networks) { +function destructConfigNetworks(networks = []) { return networks.reduce((acc, network) => { if (Array.isArray(network)) { // network specifies sites @@ -119,6 +119,7 @@ async function fetchSitesFromArgv() { async function fetchSitesFromConfig() { const included = destructConfigNetworks(config.include); + const excluded = destructConfigNetworks(config.exclude); const rawSites = await knex('sites') .select( @@ -126,8 +127,18 @@ async function fetchSitesFromConfig() { 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', ) .leftJoin('networks', 'sites.network_id', 'networks.id') - .whereIn('sites.slug', included.sites || []) - .orWhereIn('networks.slug', included.networks || []); + .where((builder) => { + if (config.include) { + builder + .whereIn('sites.slug', included.sites) + .orWhereIn('networks.slug', included.networks); + } + }) + .whereNot((builder) => { + builder + .whereIn('sites.slug', excluded.sites) + .orWhereIn('networks.slug', excluded.networks); + }); const curatedSites = await curateSites(rawSites, true); logger.info(`Found ${curatedSites.length} sites in database`);