From e5c6ccd252b3ae776fed7fdbaeb4eb6fc2ecea67 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Sat, 22 Feb 2020 04:37:48 +0100 Subject: [PATCH] Scraping upcoming Vixen scenes. Fetching release media groups sequentially to prevent collisions. --- config/default.js | 2 +- seeds/02_sites.js | 4 +++ src/media.js | 4 +-- src/releases.js | 34 +++++++++--------- src/scrapers/vixen.js | 83 +++++++++++++++++++++++++++++++++---------- 5 files changed, 87 insertions(+), 40 deletions(-) diff --git a/config/default.js b/config/default.js index b64c7e6f..22cb8714 100644 --- a/config/default.js +++ b/config/default.js @@ -99,7 +99,7 @@ module.exports = { path: './media', thumbnailSize: 320, // width for 16:9 will be exactly 576px thumbnailQuality: 100, - videoQuality: [480, 360, 320, 540, 720, 1080, 2160], + videoQuality: [480, 360, 320, 540, 720, 1080, 2160, 240, 180], limit: 25, // max number of photos per release }, titleSlugLength: 50, diff --git a/seeds/02_sites.js b/seeds/02_sites.js index c36fc62b..04a8476b 100644 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -5304,6 +5304,7 @@ const sites = [ name: 'Blacked', description: 'Porn videos of beautiful girls in first time interracial porn videos. BLACKED has the hottest pornstars in HD sex videos.', url: 'https://www.blacked.com', + tags: ['interracial', 'bbc'], network: 'vixen', }, { @@ -5311,6 +5312,7 @@ const sites = [ name: 'Tushy', description: 'Watch the world\'s best HD Anal videos! Featuring beautiful, never before seen girls in first time anal. Exclusively on Tushy.com', url: 'https://www.tushy.com', + tags: ['anal'], network: 'vixen', }, { @@ -5318,6 +5320,7 @@ const sites = [ name: 'Blacked Raw', description: 'Experience real women in interracial sex videos. Passionate sex with beautiful pornstars. No photoshop just the highest quality porn. Everything you see is real.', url: 'https://www.blackedraw.com', + tags: ['interracial', 'bbc'], network: 'vixen', }, { @@ -5325,6 +5328,7 @@ const sites = [ name: 'Tushy Raw', description: 'Anal sex videos with beautiful models and pornstars being fucked in the ass. TUSHY RAW features famous pornstars in high quality anal porn videos.', url: 'https://www.tushyraw.com', + tags: ['anal'], network: 'vixen', }, { diff --git a/src/media.js b/src/media.js index 83ca4da0..63cf2a5c 100644 --- a/src/media.js +++ b/src/media.js @@ -166,7 +166,7 @@ async function saveItems(items, domain, role) { ? `${item.hash.slice(4)}_${item.quality}.${item.extension}` : `${item.hash.slice(4)}.${item.extension}`; - const filedir = path.join(`${domain}s`, `${role}s`, dir, subdir); + const filedir = path.join(`${role}s`, dir, subdir); const filepath = path.join(filedir, filename); await fs.mkdir(path.join(config.media.path, filedir), { recursive: true }); @@ -175,7 +175,7 @@ async function saveItems(items, domain, role) { if (/image/.test(item.mimetype)) { const thumbnail = await createThumbnail(item.file); - const thumbdir = path.join(`${domain}s`, `${role}s`, 'thumbs', dir, subdir); + const thumbdir = path.join(`${role}s`, 'thumbs', dir, subdir); const thumbpath = path.join(thumbdir, filename); await fs.mkdir(path.join(config.media.path, thumbdir), { recursive: true }); diff --git a/src/releases.js b/src/releases.js index 5e8a2ca0..084025ea 100644 --- a/src/releases.js +++ b/src/releases.js @@ -343,32 +343,30 @@ async function storeReleaseAssets(releases) { [release.id]: pluckItems(release.photos), }), {}); - const [posters, covers] = await Promise.all([ - argv.posters && storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'), - argv.covers && storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'), - ]); + if (argv.posters) { + const posters = await storeMedia(Object.values(releasePostersById).flat(), 'release', 'poster'); + if (posters) await associateMedia(releasePostersById, posters, 'release', 'poster'); + } - // ensure posters are available before fetching supplementary media - await Promise.all([ - posters && associateMedia(releasePostersById, posters, 'release', 'poster'), - covers && associateMedia(releaseCoversById, covers, 'release', 'cover'), - ]); + if (argv.covers) { + const covers = await storeMedia(Object.values(releaseCoversById).flat(), 'release', 'cover'); + if (covers) await associateMedia(releaseCoversById, covers, 'release', 'cover'); + } if (argv.photos) { const photos = await storeMedia(Object.values(releasePhotosById).flat(), 'release', 'photo'); if (photos) await associateMedia(releasePhotosById, photos, 'release', 'photo'); } - // videos take a long time, fetch last - const [trailers, teasers] = await Promise.all([ - argv.trailers && storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'), - argv.teasers && storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'), - ]); + if (argv.trailers) { + const trailers = await storeMedia(Object.values(releaseTrailersById).flat(), 'release', 'trailer'); + if (trailers) await associateMedia(releaseTrailersById, trailers, 'release', 'trailer'); + } - await Promise.all([ - trailers && associateMedia(releaseTrailersById, trailers, 'release', 'trailer'), - teasers && associateMedia(releaseTeasersById, teasers, 'release', 'teaser'), - ]); + if (argv.teasers) { + const teasers = await storeMedia(Object.values(releaseTeasersById).flat(), 'release', 'teaser'); + if (teasers) await associateMedia(releaseTeasersById, teasers, 'release', 'teaser'); + } } async function storeRelease(release) { diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js index a618dad8..cd10adec 100644 --- a/src/scrapers/vixen.js +++ b/src/scrapers/vixen.js @@ -5,17 +5,6 @@ const bhttp = require('bhttp'); const cheerio = require('cheerio'); const moment = require('moment'); -const { matchTags } = require('../tags'); - -const defaultTags = { - blacked: ['bbc'], - blackedraw: ['bbc'], - tushy: ['anal'], - tushyraw: ['anal'], - vixen: [], - deeper: [], -}; - function scrapeLatest(html, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); @@ -36,7 +25,7 @@ function scrapeLatest(html, site) { // largest thumbnail. poster is the same image but bigger, too large for storage space efficiency const poster = scene.images.listing.slice(-1)[0].src; - const trailer = scene.previews.listing.slice(-1)[0]; + const teaser = scene.previews.listing.slice(-1)[0]; return { url, @@ -45,10 +34,10 @@ function scrapeLatest(html, site) { actors, date, poster, - trailer: { - src: trailer.src, - type: trailer.type, - quality: trailer.height, + teaser: { + src: teaser.src, + type: teaser.type, + quality: teaser.height, }, rating: { stars, @@ -58,6 +47,50 @@ function scrapeLatest(html, site) { }); } +function scrapeUpcoming(html, site) { + const statePrefix = html.indexOf('__INITIAL_STATE__'); + const stateString = html.slice(html.indexOf('{', statePrefix), html.indexOf('};', statePrefix) + 1); + const data = JSON.parse(stateString); + + const scene = data.page.data['/'].data?.nextScene; + if (!scene) return null; + + const release = {}; + + release.title = scene.targetUrl + .slice(1) + .split('-') + .map(component => `${component.charAt(0).toUpperCase()}${component.slice(1)}`) + .join(' '); + + release.date = moment.utc(scene.releaseDate).toDate(); + release.url = `${site.url}${scene.targetUrl}`; + + release.actors = scene.models; + + release.poster = scene.images.poster + .filter(image => /landscape/i.test(image.name)) + .sort((imageA, imageB) => imageB.height - imageA.height) + .map((image) => { + const sources = [image.src, image.highdpi?.['2x'], image.highdpi?.['3x']]; + // high DPI images for full HD source are huge, only prefer for smaller fallback sources + return image.height === 1080 ? sources : sources.reverse(); + }) + .flat(); + + release.teaser = scene.previews.poster + .filter(teaser => /landscape/i.test(teaser.name)) + .map(teaser => ({ + src: teaser.src, + type: teaser.type, + quality: Number(String(teaser.height).replace('353', '360')), + })); + + release.entryId = (release.poster[0] || release.teaser[0])?.match(/\/(\d+)/)?.[1]; + + return [release]; +} + async function scrapeScene(html, url, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); @@ -68,6 +101,8 @@ async function scrapeScene(html, url, site) { const entryId = pageData.video; const scene = data.videos.find(video => video.newId === entryId); + console.log(scene, data, pageData); + const poster = scene.rotatingThumbsUrlSizes[0]['1040w']; const photos = pageData.pictureset.map(photo => photo.main[0].src); const trailer = scene.previews.listing.find(preview => preview.height === 353) || null; @@ -79,11 +114,10 @@ async function scrapeScene(html, url, site) { totalRateVal: stars, runLength: duration, directorNames: director, - tags: rawTags, + tags, } = scene; const date = new Date(scene.releaseDate); - const tags = await matchTags([...defaultTags[site.slug], ...rawTags]); return { url, @@ -117,7 +151,17 @@ async function fetchLatest(site, page = 1) { return scrapeLatest(res.body.toString(), site); } - throw new Error(`Vixen response not OK for latest: ${res.statusCode}`); + return res.statusCode; +} + +async function fetchUpcoming(site) { + const res = await bhttp.get(site.url); + + if (res.statusCode === 200) { + return scrapeUpcoming(res.body.toString(), site); + } + + return res.statusCode; } async function fetchScene(url, site) { @@ -132,5 +176,6 @@ async function fetchScene(url, site) { module.exports = { fetchLatest, + fetchUpcoming, fetchScene, };