diff --git a/assets/components/tags/tags.vue b/assets/components/tags/tags.vue
index 379d7fba..4064b979 100644
--- a/assets/components/tags/tags.vue
+++ b/assets/components/tags/tags.vue
@@ -1,25 +1,5 @@
diff --git a/assets/components/tile/release.vue b/assets/components/tile/release.vue
index 6189d80f..9b924d47 100644
--- a/assets/components/tile/release.vue
+++ b/assets/components/tile/release.vue
@@ -35,7 +35,7 @@
                 target="_blank"
                 rel="noopener noreferrer"
                 class="date"
-            >{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}`
+            >{{ `(${formatDate(release.dateAdded, 'MMM D, YYYY')})` }}
 curateRelease(release)));
 }
 
-function curateScrapedRelease(release) {
-    return {
+// resolve the site for release.channel: try it as a URL first; if that lookup throws, query fetchSites with it as name and slug
+async function getChannelSite(release) {
+    try {
+        const site = await findSiteByUrl(release.channel);
+
+        return site || null;
+    } catch (error) {
+        const [site] = await fetchSites({
+            name: release.channel,
+            slug: release.channel,
+        });
+
+        return site || null;
+    }
+}
+
+async function curateScrapedRelease(release) {
+    const curatedRelease = {
         site_id: release.site.id,
         studio_id: release.studio ? release.studio.id : null,
         shoot_id: release.shootId || null,
@@ -108,6 +125,17 @@ function curateScrapedRelease(release) {
         rating: release.rating && release.rating.stars && Math.floor(release.rating.stars),
         deep: Boolean(argv.deep && release.url && !release.upcoming),
     };
+
+    if (release.site.isFallback && release.channel) {
+        const site = await getChannelSite(release);
+
+        if (site) {
+            curatedRelease.site_id = site.id;
+            return curatedRelease;
+        }
+    }
+
+    return curatedRelease;
 }
 
 function commonQuery(queryBuilder, {
@@ -138,7 +166,16 @@ function commonQuery(queryBuilder, {
                 .andWhereRaw('tags_associated.release_id = releases.id');
         })
-        .andWhere('date', '>', after)
-        .andWhere('date', '<=', before)
+        // keep each created_at alternative grouped with its own bound, so the OR cannot bypass the filters above
+        .andWhere(function whereAfter() {
+            this
+                .where('date', '>', after)
+                .orWhere('releases.created_at', '>', after);
+        })
+        .andWhere(function whereBefore() {
+            this
+                .where('date', '<=', before)
+                .orWhere('releases.created_at', '<=', before);
+        })
         .orderBy([{ column: 'date', order: 'desc' }, { column: 'created_at', order: 'desc' }])
         .limit(limit);
 }
@@ -206,7 +243,7 @@ async function storeReleaseAssets(release, releaseId) {
 async function storeRelease(release) {
     const existingRelease = await knex('releases').where('entry_id', release.entryId).first();
 
-    const curatedRelease = curateScrapedRelease(release);
+    const curatedRelease = await curateScrapedRelease(release);
 
     if (existingRelease && !argv.redownload) {
         return existingRelease.id;
@@ -256,6 +293,8 @@ async function storeReleases(releases) {
     });
 
     const actors = storedReleases.reduce((acc, release) => {
+        if (!release.actors) return acc;
+
         release.actors.forEach((actor) => {
             const trimmedActor = actor.trim();
 
@@ -274,6 +313,8 @@ async function storeReleases(releases) {
         associateActors(actors, storedReleases),
         Promise.all(storedReleases.map(async release => storeReleaseAssets(release, release.id))),
     ]);
+
+    return storedReleases;
 }
 
 module.exports = {
diff --git a/src/scrape-release.js b/src/scrape-release.js
index e33e471c..a6619c8e 100644
--- a/src/scrape-release.js
+++ b/src/scrape-release.js
@@ -48,9 +48,9 @@ async function scrapeRelease(url, release, deep = false) {
 
     if (!deep && argv.save) {
         // don't store release when called by site scraper
-        const [releaseId] = await storeReleases([scene]);
+        const [storedRelease] = await storeReleases([scene]);
 
-        console.log(`http://${config.web.host}:${config.web.port}/scene/${releaseId}`);
+        console.log(`http://${config.web.host}:${config.web.port}/scene/${storedRelease.id}`);
     }
 
     return scene;
diff --git a/src/scrapers/pervcity.js b/src/scrapers/pervcity.js
index 49f94076..17f34fb6 100644
--- a/src/scrapers/pervcity.js
+++ b/src/scrapers/pervcity.js
@@ -2,9 +2,26 @@
 
 const bhttp = require('bhttp');
 const cheerio = require('cheerio');
+const { JSDOM } = require('jsdom');
 const moment = require('moment');
 
-function scrape(html, site) {
+// request poster and trailer paths for a set from gettoken.php; resolves to null unless the response status is 200
+async function getTrailer(entryId) {
+    const trailerRes = await bhttp.post('https://www.pervcity.com/gettoken.php', {
+        setId: entryId,
+    });
+
+    if (trailerRes.statusCode === 200) {
+        return {
+            poster: trailerRes.body.TrailerImg,
+            trailer: trailerRes.body.TrailerPath || trailerRes.body.Trailerfallback,
+        };
+    }
+
+    return null;
+}
+
+function scrapeLatestScene(html, site) {
     const $ = cheerio.load(html, { normalizeWhitespace: true });
     const entryId = $('li').attr('id');
 
@@ -15,6 +32,9 @@ function scrape(html, site) {
     const actors = $('.home_model_name a').toArray().map(element => $(element).text().replace(/,[\u0020\u00A0\u202F]/, '')); // replace weird commas
     const date = moment.utc($('.add_date').text(), 'DD-MM-YYYY').toDate();
 
+    const poster = $('a:nth-child(2) > img').attr('src');
+    const photos = $('.sample-picker img').map((index, element) => $(element).attr('src').replace('tourpics', 'trailer')).toArray();
+
     const stars = $('img[src*="/star.png"]')
         .toArray()
         .map(element => $(element).attr('src'))
@@ -26,6 +46,8 @@ function scrape(html, site) {
         title,
         actors,
         date,
+        poster,
+        photos,
         rating: {
             stars,
         },
@@ -33,17 +55,99 @@ function scrape(html, site) {
     };
 }
 
+// scrape a scene page; poster and trailer stay unset when getTrailer resolves to null
+async function scrapeScene(html, url, site) {
+    const { document } = new JSDOM(html).window;
+
+    const release = { url, site };
+
+    release.entryId = document.querySelector('input#set_ID').value;
+
+    release.title = document.querySelector('title').textContent;
+    release.description = document.querySelector('.player_data').textContent.trim();
+
+    const durationString = document.querySelector('.tag_lineR div:nth-child(2) span').textContent;
+    const [minutes, seconds] = durationString.match(/\d+/g);
+
+    release.duration = Number(minutes) * 60 + Number(seconds);
+    release.tags = document.querySelector('meta[name="keywords"]').content.split(',').map(tag => tag.trim());
+
+    const media = await getTrailer(release.entryId);
+
+    if (media) {
+        release.poster = media.poster;
+        release.trailer = { src: media.trailer };
+    }
+
+    return release;
+}
+
+// read the set ID from the input#set_ID element of the landing page markup
+function scrapeFallbackLanding(html) {
+    const { document } = new JSDOM(html).window;
+
+    return document.querySelector('input#set_ID').value;
+}
+
+// scrape the markup returned by set_popupvideo.php; poster and trailer stay unset when getTrailer resolves to null
+async function scrapeFallbackScene(html, entryId, url, site) {
+    const { document } = new JSDOM(html).window;
+    const release = { url, entryId, site };
+
+    release.title = document.querySelector('.popup_data_set_head label').textContent;
+    release.description = document.querySelector('.popup_data_set_des p').textContent.trim();
+    release.date = moment.utc(document.querySelector('.popup_left_top div span').textContent, 'MM-DD-YYYY').toDate();
+    release.actors = Array.from(document.querySelectorAll('.popup_data_set_models a'), el => el.textContent);
+
+    const media = await getTrailer(release.entryId);
+
+    if (media) {
+        release.poster = media.poster;
+        release.trailer = { src: media.trailer };
+    }
+
+    release.channel = document.querySelector('.popup_left_top div img').alt;
+
+    return release;
+}
+
 async function fetchLatest(site, page = 1) {
     const res = page === 1
         ? await bhttp.get(`${site.url}/final_latestupdateview.php?limitstart=${(page - 1) * 9}&limitend=9&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`)
         : await bhttp.get(`${site.url}/final_load_latestupdate_grid_view.php?limitstart=0&limitend=${(page - 1) * 8 + 1}&websiteid=0&deviceview=browser&tourId=${site.parameters.tourId}`);
 
     const elements = JSON.parse(res.body.toString());
-    const latest = Object.values(elements.total_arr).map(html => scrape(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php
+    const latest = Object.values(elements.total_arr).map(html => scrapeLatestScene(html, site)); // total_arr is a key-value object for final_load_latestupdate_grid_view.php
 
     return latest;
 }
 
+// fetch and scrape a scene; resolves to null when a request does not respond with status 200
+async function fetchScene(url, site) {
+    const res = await bhttp.get(url);
+
+    if (res.statusCode === 200) {
+        if (site.isFallback) {
+            const entryId = scrapeFallbackLanding(res.body.toString());
+
+            const fallbackRes = await bhttp.post('https://www.pervcity.com/set_popupvideo.php', {
+                setId: entryId,
+            });
+
+            if (fallbackRes.statusCode !== 200) {
+                return null;
+            }
+
+            return scrapeFallbackScene(fallbackRes.body.toString(), entryId, url, site);
+        }
+
+        return scrapeScene(res.body.toString(), url, site);
+    }
+
+    return null;
+}
+
 module.exports = {
     fetchLatest,
+    fetchScene,
 };
diff --git a/src/tags.js b/src/tags.js
index ce68ef93..da8cfa4a 100644
--- a/src/tags.js
+++ b/src/tags.js
@@ -33,14 +33,41 @@ function curateTags(tags) {
     return Promise.all(tags.map(async tag => curateTag(tag)));
 }
 
+async function matchTags(rawTags) {
+    const tags = rawTags
+        .concat(rawTags.map(tag => tag.toLowerCase()))
+        .concat(rawTags.map(tag => tag.toUpperCase()));
+
+    const tagEntries = await knex('tags')
+        .pluck('aliases.id')
+        .whereIn('tags.name', tags)
+        .where(function where() {
+            this
+                .whereNull('tags.alias_for')
+                .orWhereNull('aliases.alias_for');
+        })
+        .join('tags as aliases', function join() {
+            this
+                .on('tags.alias_for', 'aliases.id')
+                .orOn('tags.id', 'aliases.id');
+        })
+        .groupBy('aliases.id');
+
+    return tagEntries;
+}
+
 async function associateTags(release, releaseId) {
     if (!release.tags || release.tags.length === 0) {
-        console.warn(`No tags available for (${release.site.name}, ${releaseId}}) "${release.title}"`);
+        console.warn(`No tags available for (${release.site.name}, ${releaseId}) "${release.title}"`);
         return;
     }
 
+    const tags = release.tags.some(tag => typeof tag === 'string')
+        ? await matchTags(release.tags) // scraper returned raw tags
+        : release.tags; // tags already matched by scraper
+
     try {
-        await knex('tags_associated').insert(release.tags.map(tagId => ({
+        await knex('tags_associated').insert(tags.map(tagId => ({
             tag_id: tagId,
             release_id: releaseId,
         })));
@@ -65,29 +92,6 @@ async function fetchTags(queryObject, groupsQueryObject, limit = 100) {
     return curateTags(tags);
 }
 
-async function matchTags(rawTags) {
-    const tags = rawTags
-        .concat(rawTags.map(tag => tag.toLowerCase()))
-        .concat(rawTags.map(tag => tag.toUpperCase()));
-
-    const tagEntries = await knex('tags')
-        .pluck('aliases.id')
-        .whereIn('tags.name', tags)
-        .where(function where() {
-            this
-                .whereNull('tags.alias_for')
-                .orWhereNull('aliases.alias_for');
-        })
-        .join('tags as aliases', function join() {
-            this
-                .on('tags.alias_for', 'aliases.id')
-                .orOn('tags.id', 'aliases.id');
-        })
-        .groupBy('aliases.id');
-
-    return tagEntries;
-}
-
 module.exports = {
     associateTags,
     fetchTags,