From 298eabe56e0a2cd552d4e616741ca43214cedde3 Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Thu, 26 Sep 2019 03:27:01 +0200 Subject: [PATCH] Improved Vixen and XEmpire scrapers. Added media to Blowpass scraper. Improved release fetch code. --- assets/components/header/header.vue | 2 +- assets/components/release/release.vue | 1 + assets/css/_theme.scss | 2 + migrations/20190325001339_releases.js | 4 +- package-lock.json | 5 ++ package.json | 4 +- public/css/style.css | 1 + src/fetch-releases.js | 121 ++++++++++++++------------ src/scrapers/blowpass.js | 43 +++++---- src/scrapers/vixen.js | 14 ++- src/scrapers/xempire.js | 5 +- 11 files changed, 123 insertions(+), 79 deletions(-) diff --git a/assets/components/header/header.vue b/assets/components/header/header.vue index 70281e32..8db1d61a 100644 --- a/assets/components/header/header.vue +++ b/assets/components/header/header.vue @@ -4,7 +4,7 @@ :to="{ name: 'home' }" class="logo-link" > -

Porn Radar

+

traxxx

diff --git a/assets/components/release/release.vue b/assets/components/release/release.vue index a8339dc0..bef14582 100644 --- a/assets/components/release/release.vue +++ b/assets/components/release/release.vue @@ -136,6 +136,7 @@ export default { @import 'theme'; .banner { + background: $empty; white-space: nowrap; overflow-x: auto; margin: 0 0 1rem 0; diff --git a/assets/css/_theme.scss b/assets/css/_theme.scss index f1583247..71d35f04 100644 --- a/assets/css/_theme.scss +++ b/assets/css/_theme.scss @@ -8,3 +8,5 @@ $shadow-weak: rgba(0, 0, 0, .2); $shadow-hint: rgba(0, 0, 0, .1); $highlight: rgba(255, 255, 255, .5); + +$empty: #222; diff --git a/migrations/20190325001339_releases.js b/migrations/20190325001339_releases.js index eb174636..89680e1c 100644 --- a/migrations/20190325001339_releases.js +++ b/migrations/20190325001339_releases.js @@ -1,5 +1,3 @@ -'use strict'; - exports.up = knex => Promise.resolve() .then(() => knex.schema.createTable('actors', (table) => { table.increments('id', 12); @@ -109,6 +107,8 @@ exports.up = knex => Promise.resolve() table.integer('rating') .unsigned(); + table.boolean('deep'); + table.datetime('created_at') .defaultTo(knex.fn.now()); })) diff --git a/package-lock.json b/package-lock.json index ab73d8df..0c81077c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -2866,6 +2866,11 @@ } } }, + "cli-confirm": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/cli-confirm/-/cli-confirm-1.0.1.tgz", + "integrity": "sha512-AF9fOBZPflNFm2wtq5BNqLr2ZmOB0nwgYQn0tgfbk5OYDMmLFbP+FrelmGK5apvp67ybbpUVTBZZffgR2yv+CQ==" + }, "cli-cursor": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/cli-cursor/-/cli-cursor-2.1.0.tgz", diff --git a/package.json b/package.json index c77b9b81..52a1c042 100644 --- a/package.json +++ b/package.json @@ -16,7 +16,8 @@ "migrate": "knex-migrate up", "rollback": "knex-migrate down", "seed-make": "knex seed:make", - "seed": "knex seed:run" + "seed": "knex seed:run", + "flush": "cli-confirm \"This completely purges the database, are you sure?\" && knex-migrate down && knex-migrate up && knex seed:run" }, "repository": { "type": "git", @@ -66,6 +67,7 @@ "bluebird": "^3.5.4", "body-parser": "^1.19.0", "cheerio": "^1.0.0-rc.2", + "cli-confirm": "^1.0.1", "clipboardy": "^1.2.3", "config": "^3.0.1", "dayjs": "^1.8.14", diff --git a/public/css/style.css b/public/css/style.css index 52d2b3fc..5a74caa7 100644 --- a/public/css/style.css +++ b/public/css/style.css @@ -109,6 +109,7 @@ } .banner[data-v-2bc41e74] { + background: #222; white-space: nowrap; overflow-x: auto; margin: 0 0 1rem 0; diff --git a/src/fetch-releases.js b/src/fetch-releases.js index 07ca82ce..92184ecd 100644 --- a/src/fetch-releases.js +++ b/src/fetch-releases.js @@ -86,6 +86,30 @@ async function findDuplicateReleases(latestReleases, _siteId) { .orWhereIn('entry_id', latestReleasesEntryIds); } +async function storeActors(release, releaseEntry) { + const actors = await knex('actors').whereIn('name', release.actors); + const newActors = release.actors.filter(actorName => !actors.some(actor => actor.name === actorName)); + + const { rows: insertedActors } = newActors.length + ? await knex.raw(`${knex('actors').insert(newActors.map(actorName => ({ + name: actorName, + slug: actorName.toLowerCase().replace(/\s+/g, '-'), + })))} ON CONFLICT DO NOTHING RETURNING *`) + : { rows: [] }; + + return knex('actors_associated').insert(actors.concat(insertedActors).map(actor => ({ + release_id: releaseEntry.id, + actor_id: actor.id, + })), '*'); +} + +async function storeTags(release, releaseEntry) { + return knex('tags_associated').insert(release.tags.map(tag => ({ + tag_id: tag, + release_id: releaseEntry.id, + }))); +} + async function storePhotos(release, releaseEntry) { console.log(`Storing ${release.photos.length} photos for (${release.site.name}, ${releaseEntry.id}) "${release.title}"`); @@ -170,51 +194,36 @@ async function storeReleases(releases = []) { likes: release.rating && release.rating.likes, dislikes: release.rating && release.rating.dislikes, rating: release.rating && release.rating.stars, + deep: argv.deep, }; const releaseQuery = `${knex('releases').insert(curatedRelease).toString()} ON CONFLICT DO NOTHING RETURNING *`; const releaseEntry = await knex.raw(releaseQuery); - console.log(`Stored (${release.site.name}, ${releaseEntry.rows[0].id}) "${release.title}"`); + if (releaseEntry.rows.length > 0) { + console.log(`Stored (${release.site.name}, ${releaseEntry.rows[0].id}) "${release.title}"`); - if (release.actors && release.actors.length > 0) { - const actors = await knex('actors').whereIn('name', release.actors); - const newActors = release.actors.filter(actorName => !actors.some(actor => actor.name === actorName)); - const { rows: insertedActors } = newActors.length - ? await knex.raw(`${knex('actors').insert(newActors.map(actorName => ({ - name: actorName, - slug: actorName.toLowerCase().replace(/\s+/g, '-'), - })))} ON CONFLICT DO NOTHING RETURNING *`) - : { rows: [] }; + if (release.poster || (release.photos && release.photos.length)) { + await fs.mkdir(path.join(config.photoPath, release.site.slug, releaseEntry.rows[0].id.toString()), { recursive: true }); + } - await knex('actors_associated').insert(actors.concat(insertedActors).map(actor => ({ - release_id: releaseEntry.rows[0].id, - actor_id: actor.id, - })), '*'); + await Promise.all([ + release.actors && release.actors.length > 0 + ? storeActors(release, releaseEntry.rows[0]) : Promise.resolve(), + release.tags && release.tags.length > 0 + ? storeTags(release, releaseEntry.rows[0]) : Promise.resolve(), + release.photos && release.photos.length > 0 + ? storePhotos(release, releaseEntry.rows[0]) : Promise.resolve(), + release.poster + ? storePoster(release, releaseEntry.rows[0]) : Promise.resolve(), + release.trailer + ? storeTrailer(release, releaseEntry.rows[0]) : Promise.resolve(), + ]); + + return; } - if (release.tags && release.tags.length > 0) { - await knex('tags_associated').insert(release.tags.map(tag => ({ - tag_id: tag, - release_id: releaseEntry.rows[0].id, - }))); - } - - if (release.poster || (release.photos && release.photos.length)) { - await fs.mkdir(path.join(config.photoPath, release.site.slug, releaseEntry.rows[0].id.toString()), { recursive: true }); - } - - if (release.photos && release.photos.length > 0) { - await storePhotos(release, releaseEntry.rows[0]); - } - - if (release.poster) { - await storePoster(release, releaseEntry.rows[0]); - } - - if (release.trailer) { - await storeTrailer(release, releaseEntry.rows[0]); - } + console.error(`Unable to save scene to database, possible collision: "${release.title}" (${release.site.name})`); }, { concurrency: 2, }); @@ -228,13 +237,15 @@ async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page } const duplicateReleases = await findDuplicateReleases(latestReleases, site.id); + const duplicateReleasesIds = new Set( duplicateReleases .map(release => release.shoot_id || release.entry_id) + .concat(duplicateReleases.map(release => release.entry_id || release.shoot_id)) // exclude accumulated releases to prevent an infinite loop if the next page contains the same releases as the previous - .concat(duplicateReleases.map(release => release.shoot_id || release.entry_id)) .concat(accReleases.map(release => release.shootId || release.entryId)), ); + const uniqueReleases = latestReleases.filter(release => !duplicateReleasesIds.has(String(release.shootId)) && !duplicateReleasesIds.has(String(release.entryId)) && moment(release.date).isAfter(afterDate)); @@ -267,29 +278,29 @@ async function fetchReleases() { console.log(`${site.name}: Found ${newReleases.length} recent releases, ${upcomingReleases.length} upcoming releases`); + const finalReleases = argv.deep + ? await Promise.map(newReleases, async (release) => { + if (release.url) { + const scene = await fetchScene(release.url, release); + + return { + ...release, + ...scene, + }; + } + + return release; + }, { + concurrency: 2, + }) + : newReleases; + if (argv.save) { - const finalReleases = argv.deep - ? await Promise.map(newReleases, async (release) => { - if (release.url) { - const scene = await fetchScene(release.url, release); - - return { - ...release, - ...scene, - }; - } - - return release; - }, { - concurrency: 2, - }) - : newReleases; - await storeReleases(finalReleases); } return [ - ...newReleases.map(release => ({ + ...finalReleases.map(release => ({ ...release, network: site.network, })), diff --git a/src/scrapers/blowpass.js b/src/scrapers/blowpass.js index 46d03383..8a75fbc9 100644 --- a/src/scrapers/blowpass.js +++ b/src/scrapers/blowpass.js @@ -21,6 +21,9 @@ function scrape(html, site) { const date = moment.utc($(element).find('.sceneDate').text(), 'MM-DD-YYYY').toDate(); const actors = $(element).find('.sceneActors a').map((actorIndex, actorElement) => $(actorElement).text()).toArray(); + const poster = $(element).find('a.imgLink img.img').attr('data-original'); + const trailer = `https://videothumb.gammacdn.com/600x339/${entryId}.mp4`; + const likes = Number($(element).find('.rating .state_1 .value').text()); return { @@ -29,6 +32,11 @@ function scrape(html, site) { title, actors, date, + poster, + trailer: { + src: trailer, + quality: 339, + }, rating: { likes, }, @@ -43,26 +51,26 @@ async function scrapeScene(html, url, site) { const data = JSON.parse(json).slice(-1)[0]; const sceneElement = $('#wrapper'); + const videoScript = $('script:contains("window.ScenePlayerOptions")').html(); + const playerObject = videoScript.slice(videoScript.indexOf('{'), videoScript.indexOf('};') + 1); + const playerData = JSON.parse(playerObject); + const workName = data.isPartOf.name.split(' - '); - const shootId = workName.length > 1 ? workName[0] : null; + const shootId = workName.length > 1 ? workName[1] : null; const entryId = url.split('/').slice(-1)[0]; - const title = data.isPartOf ? data.isPartOf.name : data.name; - const { description } = data; - const date = moment.utc(data.isPartOf.datePublished, 'YYYY-MM-DD').toDate(); + const title = data.title || $('meta[name="twitter:title"]').attr('content'); + const description = data.description || $('meta[name="twitter:description"]').attr('content'); + // date in data object is not the release date of the scene, but the date the entry was added + const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate(); - // const actors = sceneElement.find('.sceneActors a').map((actorIndex, actorElement) => $(actorElement).text().trim()).toArray(); - const actors = data.actor - .sort(({ gender: genderA }, { gender: genderB }) => { - if (genderA === 'female' && genderB === 'male') return -1; - if (genderA === 'male' && genderB === 'female') return 1; - - return 0; - }) - .map(actor => actor.name); + const actors = data.actor.map(({ name }) => name); const likes = Number(sceneElement.find('.rating .state_1 .value').text()); const dislikes = Number(sceneElement.find('.rating .state_2 .value').text()); + const poster = playerData.picPreview; + const trailer = `${playerData.playerOptions.host}${playerData.url}`; + const duration = moment.duration(data.duration.slice(2)).asSeconds(); const rawTags = data.keywords.split(', '); @@ -77,6 +85,11 @@ async function scrapeScene(html, url, site) { actors, date, duration, + poster, + trailer: { + src: trailer, + quality: playerData.sizeOnLoad.slice(0, -1), + }, tags, rating: { likes, @@ -87,13 +100,13 @@ async function scrapeScene(html, url, site) { } async function fetchLatest(site, page = 1) { - const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.id}/latest/All-Categories/0/All-Pornstars/0/${page}`); + const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/latest/All-Categories/0/All-Pornstars/0/${page}`); return scrape(res.body.toString(), site); } async function fetchUpcoming(site) { - const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.id}/upcoming`); + const res = await bhttp.get(`https://www.blowpass.com/en/videos/${site.slug}/upcoming`); return scrape(res.body.toString(), site); } diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js index 870dfa4e..f772246e 100644 --- a/src/scrapers/vixen.js +++ b/src/scrapers/vixen.js @@ -10,8 +10,8 @@ const { matchTags } = require('../tags'); function scrapeLatest(html, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); - const stateObject = $('script:contains("INITIAL_STATE")'); - const { videos: scenes } = JSON.parse(stateObject.html().trim().slice(27, -1)); + const stateScript = $('script:contains("INITIAL_STATE")').html(); + const { videos: scenes } = JSON.parse(stateScript.slice(stateScript.indexOf('{'), stateScript.indexOf('};') + 1)); return scenes.map((scene) => { const shootId = String(scene.newId); @@ -59,7 +59,8 @@ async function scrapeScene(html, url, site) { const shootId = data.page.data[`${pathname}${search}`].data.video; const scene = data.videos.find(video => video.newId === shootId); - // console.log(scene); + const [poster, ...photos] = scene.rotatingThumbsUrlSizes.map(photo => photo['1040w']); + const trailer = scene.previews.listing.find(preview => preview.height === 353) || null; const { title, @@ -84,6 +85,13 @@ async function scrapeScene(html, url, site) { date, duration, tags, + photos, + poster, + trailer: trailer && { + src: trailer.src, + type: trailer.type, + quality: 353, + }, rating: { stars, }, diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js index f958d5d9..e7cbb110 100644 --- a/src/scrapers/xempire.js +++ b/src/scrapers/xempire.js @@ -54,7 +54,9 @@ async function scrapeScene(html, url, site) { const entryId = new URL(url).pathname.split('/').slice(-1)[0]; const title = $('meta[name="twitter:title"]').attr('content'); - const date = moment.utc(data.dateCreated, 'YYYY-MM-DD').toDate(); + const description = data.description || $('meta[name="twitter:description"]').attr('content'); + // date in data object is not the release date of the scene, but the date the entry was added + const date = moment.utc($('.updatedDate').first().text(), 'MM-DD-YYYY').toDate(); const actors = data.actor .sort(({ gender: genderA }, { gender: genderB }) => { @@ -65,7 +67,6 @@ async function scrapeScene(html, url, site) { }) .map(actor => actor.name); - const description = data.description || undefined; const stars = (data.aggregateRating.ratingValue / data.aggregateRating.bestRating) * 5; const duration = moment.duration(data.duration.slice(2).split(':')).asSeconds();