From 2b818e379a1171bfc83b8fc6406185cb1694967a Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Fri, 5 Apr 2019 03:45:40 +0200
Subject: [PATCH] Scrapers can now iterate through pages. Filtering unique
 releases before saving to the database. Improved scrapers and rendering.

---
 config/default.js           |  1 +
 seeds/networks.js           |  2 +-
 src/argv.js                 | 11 +++++++
 src/fetch-releases.js       | 61 +++++++++++++++++++++++++++++--------
 src/fetch-scene.js          | 12 ++++++--
 src/scrapers/brazzers.js    |  4 +--
 src/scrapers/julesjordan.js |  4 +--
 src/scrapers/kink.js        |  6 ++--
 src/scrapers/legalporno.js  |  9 +++---
 src/scrapers/pervcity.js    |  8 +++--
 src/scrapers/private.js     |  4 +--
 src/scrapers/vixen.js       |  8 ++---
 src/scrapers/xempire.js     | 11 ++-----
 src/tui/render.js           |  7 ++---
 14 files changed, 99 insertions(+), 49 deletions(-)
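
Note (placed after the --- separator, so it is not part of the commit
message): the core of this change is fetchNewReleases() in
src/fetch-releases.js. It walks a site's listing pages, keeps only
releases that are neither in the database nor older than the --after
cutoff, and stops as soon as a page contributes nothing new. Below is a
simplified, self-contained sketch of that control flow; fetchPage and
isKnownShootId are hypothetical stand-ins for scraper.fetchLatest() and
the whereIn('shoot_id', ...) lookup, and the real implementation also
dedupes against releases accumulated from earlier pages:

    async function fetchNewReleases(fetchPage, isKnownShootId, afterDate, acc = [], page = 1) {
      const latest = await fetchPage(page); // one listing page, newest release first
      const unique = [];

      for (const release of latest) {
        // skip releases already in the database and releases older than the cutoff
        if (!(await isKnownShootId(String(release.shootId))) && release.date > afterDate) {
          unique.push(release);
        }
      }

      // pages are newest-first, so the last release on the page is the oldest
      const oldestDate = latest.length > 0 ? latest[latest.length - 1].date : null;

      if (unique.length > 0 && oldestDate > afterDate) {
        return fetchNewReleases(fetchPage, isKnownShootId, afterDate, acc.concat(unique), page + 1);
      }

      return acc.concat(unique);
    }

Combined with the new options, an invocation with --after '1 year' and
--no-save should do a deep scrape without writing anything (yargs
derives --no-save from the boolean 'save' option).
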
diff --git a/config/default.js b/config/default.js
index 9476d281..af3fdd36 100644
--- a/config/default.js
+++ b/config/default.js
@@ -69,6 +69,7 @@ module.exports = {
     'vixen',
     'xempire',
   ],
+  fetchAfter: [3, 'months'],
   columns: [
     {
       value: 'date',
diff --git a/seeds/networks.js b/seeds/networks.js
index 2e838dc3..f3e4f280 100644
--- a/seeds/networks.js
+++ b/seeds/networks.js
@@ -42,7 +42,7 @@ exports.seed = knex => Promise.resolve()
     {
       id: 'vixen',
       name: 'Vixen',
-      url: 'https://www.vixen.com/',
+      url: 'https://www.vixen.com',
       description: 'Vixen.com features the world’s finest cinematic adult films with 4K quality and high-end erotic photography.',
     },
     {
diff --git a/src/argv.js b/src/argv.js
index 28b6c587..a37948a6 100644
--- a/src/argv.js
+++ b/src/argv.js
@@ -1,5 +1,6 @@
 'use strict';
 
+const config = require('config');
 const yargs = require('yargs');
 
 const { argv } = yargs
@@ -14,6 +15,16 @@ const { argv } = yargs
     type: 'array',
     alias: 'site',
   })
+  .option('after', {
+    describe: 'Don\'t fetch scenes older than this, e.g. "3 months"',
+    type: 'string',
+    default: config.fetchAfter.join(' '),
+  })
+  .option('save', {
+    describe: 'Save fetched releases to database',
+    type: 'boolean',
+    default: true,
+  })
   .option('render', {
     describe: 'Fetch data without rendering interface',
     type: 'boolean',
diff --git a/src/fetch-releases.js b/src/fetch-releases.js
index 073838bb..2f212c92 100644
--- a/src/fetch-releases.js
+++ b/src/fetch-releases.js
@@ -56,11 +56,11 @@ async function accumulateIncludedSites() {
   return curateSites(rawSites);
 }
 
-async function getStoredReleases(siteId, limit) {
+async function findDuplicateReleases(latestReleases) {
+  const latestReleasesIds = latestReleases.map(release => release.shootId);
+
   return knex('releases')
-    .where({ site_id: siteId })
-    .orderBy('date', 'desc')
-    .limit(limit);
+    .whereIn('shoot_id', latestReleasesIds);
 }
 
 async function storeReleases(releases) {
@@ -79,7 +79,7 @@ async function storeReleases(releases) {
   }));
 
   if (curatedReleases.length) {
-    console.log(`Adding ${curatedReleases.length} releases to database (if unique)`);
+    console.log(`Saving ${curatedReleases.length} new releases to database`);
 
     const insertQuery = knex('releases').insert(curatedReleases).toString();
     await knex.raw(insertQuery.replace('insert', 'INSERT OR IGNORE'));
@@ -90,6 +90,29 @@
   return [];
 }
 
+async function fetchNewReleases(scraper, site, afterDate, accReleases = [], page = 1) {
+  const latestReleases = await scraper.fetchLatest(site, page);
+
+  const duplicateReleases = await findDuplicateReleases(latestReleases);
+  const duplicateReleasesShootIds = new Set(
+    duplicateReleases
+      .map(release => release.shoot_id)
+      // exclude accumulated releases to prevent an infinite loop if the next page contains the same releases as the previous page
+      .concat(accReleases.map(release => String(release.shootId))),
+  );
+  const uniqueReleases = latestReleases.filter(release => !duplicateReleasesShootIds.has(String(release.shootId)) && moment(release.date).isAfter(afterDate));
+
+  console.log(`${site.name}: Scraped page ${page}, ${uniqueReleases.length} unique releases`);
+
+  const oldestReleaseOnPage = (latestReleases.slice(-1)[0] || {}).date; // tolerate an empty page
+
+  if (uniqueReleases.length > 0 && moment(oldestReleaseOnPage).isAfter(afterDate)) {
+    return fetchNewReleases(scraper, site, afterDate, accReleases.concat(uniqueReleases), page + 1);
+  }
+
+  return accReleases.concat(uniqueReleases);
+}
+
 async function fetchReleases() {
   const sites = await accumulateIncludedSites();
   // const releases = await getExistingReleases();
@@ -98,18 +121,30 @@ async function fetchReleases() {
     const scraper = scrapers[site.id] || scrapers[site.networkId];
 
     if (scraper) {
-      const storedReleases = await getStoredReleases(site.id, 100);
+      try {
+        const afterDate = moment.utc().subtract(...argv.after.split(' ')).toDate();
 
-      const [latest, upcoming] = await Promise.all([
-        scraper.fetchLatest(site, storedReleases),
-        scraper.fetchUpcoming ? scraper.fetchUpcoming(site) : [],
-      ]);
+        const [newReleases, upcomingReleases] = await Promise.all([
+          fetchNewReleases(scraper, site, afterDate),
+          scraper.fetchUpcoming ? scraper.fetchUpcoming(site) : [],
+        ]);
 
-      console.log(`${latest.length} published releases and ${upcoming.length} upcoming releases found`);
+        console.log(`${site.name}: Found ${newReleases.length} new releases, ${upcomingReleases.length} upcoming releases`);
 
-      await storeReleases(latest);
+        if (argv.save) {
+          await storeReleases(newReleases);
+        }
 
-      return [...latest, ...upcoming];
+        return [...newReleases, ...upcomingReleases.map(release => ({ ...release, upcoming: true }))];
+      } catch (error) {
+        if (argv.debug) {
+          console.error(`${site.id}: Failed to fetch releases`, error);
+          return [];
+        }
+
+        console.log(`${site.id}: Failed to fetch releases`);
+        return [];
+      }
     }
 
     return [];
diff --git a/src/fetch-scene.js b/src/fetch-scene.js
index 4f0da572..f9630bf7 100644
--- a/src/fetch-scene.js
+++ b/src/fetch-scene.js
@@ -7,11 +7,17 @@ const knex = require('./knex');
 const scrapers = require('./scrapers');
 
 async function findSite(url) {
-  const { origin } = new URL(url);
+  const { protocol, hostname } = new URL(url);
 
-  const site = await knex('sites').where({ url: origin }).first()
+  const site = await knex('sites')
+    .where({ url: `${protocol}//www.${hostname}` })
+    .orWhere({ url: `${protocol}//${hostname}` })
+    .first()
     // scenes might be listed on network site, let network scraper find channel site
-    || await knex('networks').where({ url: origin }).first();
+    || await knex('networks')
+      .where({ url: `${protocol}//www.${hostname}` })
+      .orWhere({ url: `${protocol}//${hostname}` })
+      .first();
 
   return {
     id: site.id,
diff --git a/src/scrapers/brazzers.js b/src/scrapers/brazzers.js
index ab869f49..aabc9750 100644
--- a/src/scrapers/brazzers.js
+++ b/src/scrapers/brazzers.js
@@ -93,8 +93,8 @@ async function scrapeScene(html, url, site) {
   };
 }
 
-async function fetchLatest(site) {
-  const res = await bhttp.get(`${site.url}/`);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/page/${page}/`);
 
   return scrape(res.body.toString(), site, false);
 }
diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js
index c8ed186a..f142bdb6 100644
--- a/src/scrapers/julesjordan.js
+++ b/src/scrapers/julesjordan.js
@@ -107,8 +107,8 @@ async function scrapeScene(html, url, site) {
   };
 }
 
-async function fetchLatest(site, _date) {
-  const res = await bhttp.get(`${site.url}/trial/categories/movies_1_d.html`);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/trial/categories/movies_${page}_d.html`);
 
   return scrapeLatest(res.body.toString(), site);
 }
diff --git a/src/scrapers/kink.js b/src/scrapers/kink.js
index b178e43a..4dcb6f4c 100644
--- a/src/scrapers/kink.js
+++ b/src/scrapers/kink.js
@@ -16,7 +16,7 @@ function scrapeLatest(html, site) {
     const href = sceneLinkElement.attr('href');
     const url = `https://kink.com${href}`;
     const shootId = href.split('/')[2];
-    const title = sceneLinkElement.text();
+    const title = sceneLinkElement.text().trim();
     const date = moment.utc($(element).find('.date').text(), 'MMM DD, YYYY').toDate();
     const actors = $(element).find('.shoot-thumb-models a').map((actorIndex, actorElement) => $(actorElement).text()).toArray();
 
@@ -85,8 +85,8 @@ async function scrapeScene(html, url, shootId, ratingRes, site) {
   };
 }
 
-async function fetchLatest(site) {
-  const res = await bhttp.get(`${site.url}/latest`);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/latest/page/${page}`);
 
   return scrapeLatest(res.body.toString(), site);
 }
diff --git a/src/scrapers/legalporno.js b/src/scrapers/legalporno.js
index f58804d3..ab17cf0e 100644
--- a/src/scrapers/legalporno.js
+++ b/src/scrapers/legalporno.js
@@ -23,14 +23,15 @@ function scrapeLatest(html, site) {
     const sceneLinkElement = $(element).find('.thumbnail-title a');
 
     const url = sceneLinkElement.attr('href');
-    const originalTitle = sceneLinkElement.attr('title');
+    const originalTitle = sceneLinkElement.text().trim(); // title attribute breaks when they use \\ escaping
     const { shootId, title } = extractTitle(originalTitle);
+    const internalId = new URL(url).pathname.split('/')[2];
 
     const date = moment.utc($(element).attr('release'), 'YYYY/MM/DD').toDate();
 
     return {
       url,
-      shootId,
+      shootId: shootId || internalId,
       title,
       date,
       site,
@@ -68,8 +69,8 @@ async function scrapeScene(html, url, site) {
   };
 }
 
-async function fetchLatest(site) {
-  const res = await bhttp.get(`${site.url}/new-videos`);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/new-videos/${page}`);
 
   return scrapeLatest(res.body.toString(), site);
 }
diff --git a/src/scrapers/pervcity.js b/src/scrapers/pervcity.js
index f5300e6b..ce67e42e 100644
--- a/src/scrapers/pervcity.js
+++ b/src/scrapers/pervcity.js
@@ -33,11 +33,13 @@ function scrape(html, site) {
   };
 }
 
-async function fetchLatest(site) {
-  const res = await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=0&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`);
+async function fetchLatest(site, page = 1) {
+  const res = page === 1
+    ? await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`)
+    : await bhttp.get(`${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`);
   const elements = JSON.parse(res.body.toString());
 
-  const latest = elements.total_arr.map(html => scrape(html, site));
+  const latest = Object.values(elements.total_arr).map(html => scrape(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php
 
   return latest;
 }
diff --git a/src/scrapers/private.js b/src/scrapers/private.js
index 0169302f..29765f8c 100644
--- a/src/scrapers/private.js
+++ b/src/scrapers/private.js
@@ -85,8 +85,8 @@ async function scrapeScene(html, url, site) {
   };
 }
 
-async function fetchLatest(site) {
-  const res = await bhttp.get(`${site.url}/`);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/${page}/`);
 
   return scrapeLatest(res.body.toString(), site);
 }
diff --git a/src/scrapers/vixen.js b/src/scrapers/vixen.js
index efa9dfde..5349e469 100644
--- a/src/scrapers/vixen.js
+++ b/src/scrapers/vixen.js
@@ -14,7 +14,7 @@ function scrapeLatest(html, site) {
   const { videos: scenes } = JSON.parse(stateObject.html().trim().slice(27, -1));
 
   return scenes.map((scene) => {
-    const shootId = scene.newId;
+    const shootId = String(scene.newId);
     const title = scene.title;
     const url = `${site.url}${scene.targetUrl}`;
     const date = moment.utc(scene.releaseDateFormatted, 'MMMM DD, YYYY').toDate();
@@ -44,7 +44,7 @@ async function scrapeScene(html, url, site) {
 
   const scene = data.page.data[`${pathname}${search}`].data.video;
 
-  const shootId = scene.newId;
+  const shootId = String(scene.newId);
   const title = scene.title;
   const date = new Date(scene.releaseDate);
   const actors = scene.models;
@@ -72,8 +72,8 @@ async function scrapeScene(html, url, site) {
   };
 }
 
-async function fetchLatest(site) {
-  const res = await bhttp.get(`${site.url}/videos`);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/videos?page=${page}&size=7`);
 
   return scrapeLatest(res.body.toString(), site);
 }
diff --git a/src/scrapers/xempire.js b/src/scrapers/xempire.js
index f4483134..ab7108e3 100644
--- a/src/scrapers/xempire.js
+++ b/src/scrapers/xempire.js
@@ -121,15 +121,10 @@ async function scrapeScene(html, url, site) {
   };
 }
 
-async function fetchLatest(site, storedReleases) {
-  const res = await bhttp.get(`${site.url}/en/videos`);
-  const releases = scrape(res.body.toString(), site);
+async function fetchLatest(site, page = 1) {
+  const res = await bhttp.get(`${site.url}/en/videos/AllCategories/0/${page}`);
 
-  const storedShootIds = new Set(storedReleases.map(release => release.shoot_id));
-
-  const newReleases = releases.filter(release => !storedShootIds.has(release.shootId));
-
-  console.log(newReleases);
+  return scrape(res.body.toString(), site);
 }
 
 async function fetchUpcoming(site) {
diff --git a/src/tui/render.js b/src/tui/render.js
index 8e175a16..bba249e4 100644
--- a/src/tui/render.js
+++ b/src/tui/render.js
@@ -2,18 +2,17 @@
 
 const config = require('config');
 const blessed = require('neo-blessed');
-const moment = require('moment');
 const opn = require('opn');
 
 const formatters = require('./formatters');
 
 function renderReleases(scenes, screen) {
+  screen.realloc();
+
   const tableTop = blessed.Text({
     content: config.columns.reduce((acc, column, index) => `${acc}${'─'.repeat(column.width)}${index < config.columns.length - 1 ? '┬' : '┐\x1b[0m'}`, '\x1b[30m┌'),
   });
 
   const items = scenes.map((scene, sceneIndex) => {
-    const isFuture = moment(scene.date).isAfter();
-
     const row = config.columns.reduce((acc, column) => {
       const value = (scene[column.value] && (formatters[column.value] ? formatters[column.value](scene[column.value], column)
@@ -25,7 +24,7 @@
       const truncatedValue = realLength > column.width - 2 ? `${value.slice(0, column.width - 2 - 3)}...` : value;
       const paddedValue = truncatedValue.padEnd(column.width + entityLength - 1).padStart(column.width + entityLength);
 
-      const coloredValue = isFuture ? `\x1b[92m${paddedValue}\x1b[0m` : `\x1b[97m${paddedValue}\x1b[0m`;
+      const coloredValue = scene.upcoming ? `\x1b[92m${paddedValue}\x1b[0m` : `\x1b[97m${paddedValue}\x1b[0m`;
 
       return `${acc}${coloredValue}\x1b[90m│\x1b[0m`;
     }, '\x1b[90m│\x1b[0m');