From 71cb85c3e1ae2845190c8bf50039a8621d8c147e Mon Sep 17 00:00:00 2001 From: Niels Simenon Date: Thu, 5 Dec 2019 01:26:22 +0100 Subject: [PATCH] Allowing scrapers to return raw tags and site URLs or slugs, to gradually remove site and tag fetching from individual scrapers. Added media and deep fetching support to Perv City scraper. --- assets/components/tags/tags.vue | 40 ++++++------- assets/components/tile/release.vue | 2 +- seeds/03_tags.js | 2 +- seeds/04_media.js | 2 +- src/releases.js | 39 +++++++++++- src/scrape-release.js | 4 +- src/scrapers/pervcity.js | 95 +++++++++++++++++++++++++++++- src/tags.js | 52 ++++++++-------- 8 files changed, 182 insertions(+), 54 deletions(-) diff --git a/assets/components/tags/tags.vue b/assets/components/tags/tags.vue index 379d7fba..4064b979 100644 --- a/assets/components/tags/tags.vue +++ b/assets/components/tags/tags.vue @@ -1,25 +1,5 @@ diff --git a/assets/components/tile/release.vue b/assets/components/tile/release.vue index 6189d80f..9b924d47 100644 --- a/assets/components/tile/release.vue +++ b/assets/components/tile/release.vue @@ -35,7 +35,7 @@ target="_blank" rel="noopener noreferrer" class="date" - >{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}` + >{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }} curateRelease(release))); } -function curateScrapedRelease(release) { - return { +async function getChannelSite(release) { + try { + const site = await findSiteByUrl(release.channel); + + return site || null; + } catch (error) { + const [site] = await fetchSites({ + name: release.channel, + slug: release.channel, + }); + + return site || null; + } +} + +async function curateScrapedRelease(release) { + const curatedRelease = { site_id: release.site.id, studio_id: release.studio ?
release.studio.id : null, shoot_id: release.shootId || null, @@ -108,6 +124,17 @@ function curateScrapedRelease(release) { rating: release.rating && release.rating.stars && Math.floor(release.rating.stars), deep: Boolean(argv.deep && release.url && !release.upcoming), }; + + if (release.site.isFallback && release.channel) { + const site = await getChannelSite(release); + + if (site) { + curatedRelease.site_id = site.id; + return curatedRelease; + } + } + + return curatedRelease; } function commonQuery(queryBuilder, { @@ -138,7 +165,9 @@ function commonQuery(queryBuilder, { .andWhereRaw('tags_associated.release_id = releases.id'); }) .andWhere('date', '>', after) + .orWhere('releases.created_at', '>', after) .andWhere('date', '<=', before) + .orWhere('releases.created_at', '<=', before) .orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }]) .limit(limit); } @@ -206,7 +235,7 @@ async function storeReleaseAssets(release, releaseId) { async function storeRelease(release) { const existingRelease = await knex('releases').where('entry_id', release.entryId).first(); - const curatedRelease = curateScrapedRelease(release); + const curatedRelease = await curateScrapedRelease(release); if (existingRelease && !argv.redownload) { return existingRelease.id; @@ -256,6 +285,8 @@ async function storeReleases(releases) { }); const actors = storedReleases.reduce((acc, release) => { + if (!release.actors) return acc; + release.actors.forEach((actor) => { const trimmedActor = actor.trim(); @@ -274,6 +305,8 @@ async function storeReleases(releases) { associateActors(actors, storedReleases), Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))), ]); + + return storedReleases; } module.exports = { diff --git a/src/scrape-release.js b/src/scrape-release.js index e33e471c..a6619c8e 100644 --- a/src/scrape-release.js +++ b/src/scrape-release.js @@ -48,9 +48,9 @@ async function scrapeRelease(url, release, deep = false) { if 
(!deep && argv.save) { // don't store release when called by site scraper - const [releaseId] = await storeReleases([scene]); + const [storedRelease] = await storeReleases([scene]); - console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`); + console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`); } return scene; diff --git a/src/scrapers/pervcity.js b/src/scrapers/pervcity.js index 49f94076..17f34fb6 100644 --- a/src/scrapers/pervcity.js +++ b/src/scrapers/pervcity.js @@ -2,9 +2,25 @@ const bhttp = require('bhttp'); const cheerio = require('cheerio'); +const { JSDOM } = require('jsdom'); const moment = require('moment'); -function scrape(html, site) { +async function getTrailer(entryId) { + const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', { + setId: entryId, + }); + + if (trailerRes.statusCode === 200) { + return { + poster: trailerRes.body.TrailerImg, + trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback, + }; + } + + return null; +} + +function scrapeLatestScene(html, site) { const $ = cheerio.load(html, { normalizeWhitespace: true }); const entryId = $('li').attr('id'); @@ -15,6 +31,9 @@ function scrape(html, site) { const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate(); + const poster = $('a:nth-child(2) > img').attr('src'); + const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray(); + const stars = $('img[src*="/star.png"]') .toArray() .map(element => $(element).attr('src')) @@ -26,6 +45,8 @@ function scrape(html, site) { title, actors, date, + poster, + photos, rating: { stars, }, @@ -33,17 +54,87 @@ function scrape(html, site) { }; } +async function scrapeScene(html, url, site) { + const { document } = new JSDOM(html).window; + + 
const release = { url, site }; + + release.entryId = document.querySelector('input#set_ID').value; + + release.title = document.querySelector('title').textContent; + release.description = document.querySelector('.player_data').textContent.trim(); + + const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent; + const [minutes, seconds] = durationString.match(/\d+/g); + + release.duration = Number(minutes) * 60 + Number(seconds); + release.tags = document.querySelector('meta[name="keywords"]').content.split(','); + + const { poster, trailer } = await getTrailer(release.entryId); + + release.poster = poster; + release.trailer = { src: trailer }; + + return release; +} + +function scrapeFallbackLanding(html) { + const { document } = new JSDOM(html).window; + + return document.querySelector('input#set_ID').value; +} + +async function scrapeFallbackScene(html, entryId, url, site) { + const { document } = new JSDOM(html).window; + const release = { url, entryId, site }; + + release.title = document.querySelector('.popup_data_set_head label').textContent; + release.description = document.querySelector('.popup_data_set_des p').textContent.trim(); + release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate(); + release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent); + + const { poster, trailer } = await getTrailer(release.entryId); + + release.poster = poster; + release.trailer = { src: trailer }; + + release.channel = document.querySelector('.popup_left_top div img').alt; + + return release; +} + async function fetchLatest(site, page = 1) { const res = page === 1 ? 
await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`) : await bhttp.get(`${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`); const elements = JSON.parse(res.body.toString()); - const latest = Object.values(elements.total_arr).map(html => scrape(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php + const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php return latest; } +async function fetchScene(url, site) { + const res = await bhttp.get(url); + + if (res.statusCode === 200) { + if (site.isFallback) { + const entryId = scrapeFallbackLanding(res.body.toString(), url); + + const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', { + setId: entryId, + }); + + return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site); + } + + return scrapeScene(res.body.toString(), url, site); + } + + return null; +} + module.exports = { fetchLatest, + fetchScene, }; diff --git a/src/tags.js b/src/tags.js index ce68ef93..da8cfa4a 100644 --- a/src/tags.js +++ b/src/tags.js @@ -33,14 +33,41 @@ function curateTags(tags) { return Promise.all(tags.map(async tag => curateTag(tag))); } +async function matchTags(rawTags) { + const tags = rawTags + .concat(rawTags.map(tag => tag.toLowerCase())) + .concat(rawTags.map(tag => tag.toUpperCase())); + + const tagEntries = await knex('tags') + .pluck('aliases.id') + .whereIn('tags.name', tags) + .where(function where() { + this + .whereNull('tags.alias_for') + .orWhereNull('aliases.alias_for'); + }) + .join('tags as aliases', function join() { + this + .on('tags.alias_for', 'aliases.id') + .orOn('tags.id', 'aliases.id'); + }) + 
.groupBy('aliases.id'); + + return tagEntries; +} + async function associateTags(release, releaseId) { if (!release.tags || release.tags.length === 0) { console.warn(`No tags available for (${release.site.name}, ${releaseId}}) "${release.title}"`); return; } + const tags = release.tags.some(tag => typeof tag === 'string') + ? await matchTags(release.tags) // scraper returned raw tags + : release.tags; // tags already matched by scraper + try { - await knex('tags_associated').insert(release.tags.map(tagId => ({ + await knex('tags_associated').insert(tags.map(tagId => ({ tag_id: tagId, release_id: releaseId, }))); @@ -65,29 +92,6 @@ async function fetchTags(queryObject, groupsQueryObject, limit = 100) { return curateTags(tags); } -async function matchTags(rawTags) { - const tags = rawTags - .concat(rawTags.map(tag => tag.toLowerCase())) - .concat(rawTags.map(tag => tag.toUpperCase())); - - const tagEntries = await knex('tags') - .pluck('aliases.id') - .whereIn('tags.name', tags) - .where(function where() { - this - .whereNull('tags.alias_for') - .orWhereNull('aliases.alias_for'); - }) - .join('tags as aliases', function join() { - this - .on('tags.alias_for', 'aliases.id') - .orOn('tags.id', 'aliases.id'); - }) - .groupBy('aliases.id'); - - return tagEntries; -} - module.exports = { associateTags, fetchTags,