From acad99bdfe85ef41edfeb13afa7b82be63f7977d Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Sun, 8 Mar 2020 04:23:10 +0100
Subject: [PATCH] Changed q get and geta APIs to include status, refactored
 scrapers. Showing front- and back-cover on movie tiles and release page
 (fix). Removed icons from main navigation. Returning scenes from Jules
 Jordan movie scraper.

---
 assets/components/header/header.vue |  4 ++
 assets/components/tile/release.vue  | 22 +++++++
 assets/js/fragments.js              |  1 +
 assets/js/releases/actions.js       |  8 +++
 src/scrape-releases.js              |  6 +-
 src/scrapers/assylum.js             | 16 ++---
 src/scrapers/cherrypimps.js         | 14 +++--
 src/scrapers/fullpornnetwork.js     | 12 ++--
 src/scrapers/hush.js                | 40 +++++++------
 src/scrapers/julesjordan.js         | 91 +++++++++++++++--------------
 src/scrapers/newsensations.js       |  8 +--
 src/scrapers/nubiles.js             | 28 ++++-----
 src/scrapers/template.js            |  8 +--
 src/utils/q.js                      | 52 ++++++++++++-----
 src/utils/timeout.js                | 31 ++++++++++
 15 files changed, 222 insertions(+), 119 deletions(-)
 create mode 100644 src/utils/timeout.js

diff --git a/assets/components/header/header.vue b/assets/components/header/header.vue
index 4780c0eb..117e7f24 100644
--- a/assets/components/header/header.vue
+++ b/assets/components/header/header.vue
@@ -202,5 +202,9 @@ export default {
   .nav-item {
     flex-grow: 1;
   }
+
+  .logo {
+    display: none;
+  }
 }

diff --git a/assets/components/tile/release.vue b/assets/components/tile/release.vue
index bf716026..3b3206e9 100644
--- a/assets/components/tile/release.vue
+++ b/assets/components/tile/release.vue
@@ -74,6 +74,19 @@
         class="thumbnail"
       >
[13 added template lines showing the front- and back-cover thumbnails; the markup was lost in extraction]

[diffs for assets/js/fragments.js (1 addition) and assets/js/releases/actions.js (8 additions) lost in extraction]

diff --git a/src/scrape-releases.js b/src/scrape-releases.js
--- a/src/scrape-releases.js
+++ b/src/scrape-releases.js
@@ ... @@
   ({ ...scrapedRelease, type }));
 
-  if (argv.scene && argv.inspect) {
+  if ((argv.scene || argv.movie) && argv.inspect) {
     // only show when fetching from URL
-    console.log(curatedReleases);
   }
 
   if (argv.save) {
@@ -110,6 +109,9 @@ async function scrapeReleases(sources, release = null, type = 'scene', preflight
    */
   const { releases: storedReleases } = await storeReleases(curatedReleases);
+  const movieScenes = storedReleases.map(movie => movie.scenes).flat();
+
+  console.log(movieScenes);
 
   if (storedReleases) {
     logger.info(storedReleases.map(storedRelease => `\nhttp://${config.web.host}:${config.web.port}/scene/${storedRelease.id}/${storedRelease.slug}`).join(''));

diff --git a/src/scrapers/assylum.js b/src/scrapers/assylum.js
index 5ed37d30..1702444b 100644
--- a/src/scrapers/assylum.js
+++ b/src/scrapers/assylum.js
@@ -91,11 +91,11 @@ function extractModels({ el }, site) {
 async function fetchModels(site, page = 1, accModels = []) {
   const url = `${site.url}/?models/${page}`;
-  const qModels = await get(url);
+  const res = await get(url);
 
-  if (qModels) {
-    const models = extractModels(qModels, site);
-    const nextPage = qModels.qa('.pagenumbers', true)
+  if (res.ok) {
+    const models = extractModels(res.item, site);
+    const nextPage = res.item.qa('.pagenumbers', true)
       .map(pageX => Number(pageX))
       .filter(Boolean) // remove << and >>
       .includes(page + 1);
@@ -112,16 +112,16 @@ async function fetchModels(site, page = 1, accModels = []) {
 async function fetchLatest(site, page = 1, models) {
   const url = `${site.url}/show.php?a=${site.parameters.a}_${page}`;
-  const qLatest = await geta(url, '.item');
+  const res = await geta(url, '.item');
 
-  return qLatest && scrapeLatest(qLatest, site, models);
+  return res.ok ? 
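// Note on the refactor at this call site: get/geta now resolve to a result
// object, { ok, status, item, items, res } per src/utils/q.js below, instead
// of a bare query context, so scrapers branch on res.ok and surface the HTTP
// status code on failure rather than returning null:
//
//   const res = await geta(url, '.item');
//   if (!res.ok) return res.status; // e.g. 404, propagated to the caller
//   return scrapeLatest(res.items, site, models);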
scrapeLatest(res.items, site, models) : res.status; } async function fetchScene(url, site, release, beforeFetchLatest) { const models = beforeFetchLatest || await fetchModels(site); - const qScene = await get(url); + const res = await get(url); - return qScene && scrapeScene(qScene, url, site, models); + return res.ok ? scrapeScene(res.item, url, site, models) : res.status; } module.exports = { diff --git a/src/scrapers/cherrypimps.js b/src/scrapers/cherrypimps.js index dfda83b1..f79e2ccd 100644 --- a/src/scrapers/cherrypimps.js +++ b/src/scrapers/cherrypimps.js @@ -109,15 +109,15 @@ async function fetchLatest(site, page = 1) { const url = site.parameters?.extract ? `https://cherrypimps.com/categories/movies_${page}.html` : `${site.url}/categories/movies_${page}.html`; - const qLatest = await geta(url, 'div.video-thumb'); + const res = await geta(url, 'div.video-thumb'); - return qLatest && scrapeAll(qLatest, site); + return res.ok ? scrapeAll(res.items, site) : res.status; } async function fetchScene(url, site, release) { - const qScene = await get(url); + const res = await get(url); - return qScene && scrapeScene(qScene, url, site, release); + return res.ok ? scrapeScene(res.item, url, site, release) : res.status; } async function fetchProfile(actorName, scraperSlug) { @@ -128,9 +128,11 @@ async function fetchProfile(actorName, scraperSlug) { ? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`] : [`https://${scraperSlug.replace('xxx', '')}.xxx/models/${actorSlug}.html`, `https://${scraperSlug.replace('xxx', '')}.xxx/models/${actorSlug2}.html`]; - const qActor = await get(url) || await get(url2); + const res = await get(url); + if (res.ok) return scrapeProfile(res.item); - return qActor && scrapeProfile(qActor); + const res2 = await get(url2); + return res2.ok ? scrapeProfile(res2.item) : res2.status; } module.exports = { diff --git a/src/scrapers/fullpornnetwork.js b/src/scrapers/fullpornnetwork.js index 44b15efd..51bd20ac 100644 --- a/src/scrapers/fullpornnetwork.js +++ b/src/scrapers/fullpornnetwork.js @@ -67,15 +67,15 @@ function scrapeProfile({ el, q, qtx }) { async function fetchLatest(site, page = 1) { const url = `${site.url}/categories/movies_${page}_d.html`; - const qLatest = await geta(url, '.latest-updates .update'); + const res = await geta(url, '.latest-updates .update'); - return qLatest && scrapeAll(qLatest, site); + return res.ok ? scrapeAll(res.items, site) : res.status; } async function fetchScene(url, site) { - const qScene = await get(url, '.content-wrapper'); + const res = await get(url, '.content-wrapper'); - return qScene && scrapeScene(qScene, url, site); + return res.ok ? scrapeScene(res.item, url, site) : res.status; } async function fetchProfile(actorName, scraperSlug) { @@ -84,9 +84,9 @@ async function fetchProfile(actorName, scraperSlug) { ? `https://povperverts.net/models/${actorSlug}.html` : `https://${scraperSlug}.com/models/${actorSlug}.html`; - const qProfile = await get(url); + const res = await get(url); - return qProfile && scrapeProfile(qProfile, actorName); + return res.ok ? 
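// The two-URL fallback in cherrypimps' fetchProfile above could share one
// helper with this pattern; a minimal sketch (getFirstOk is hypothetical,
// not part of this patch):
//
//   async function getFirstOk(urls) {
//     let res;
//     for (const url of urls) {
//       res = await get(url); // sequential on purpose: stop at the first OK response
//       if (res.ok) break;
//     }
//     return res; // the last failed result when nothing matched
//   }
//
//   const res = await getFirstOk([url, url2]);
//   return res.ok ? scrapeProfile(res.item) : res.status;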
scrapeProfile(res.item, actorName) : res.status; } module.exports = { diff --git a/src/scrapers/hush.js b/src/scrapers/hush.js index 6aea297c..9029a56a 100644 --- a/src/scrapers/hush.js +++ b/src/scrapers/hush.js @@ -361,25 +361,24 @@ async function fetchLatest(site, page = 1, _beforeFetchLatest, accSiteReleases) || (site.parameters?.t1 && `${site.url}/t1/categories/movies_${page}_d.html`) || `${site.url}/categories/movies_${page}_d.html`; - const qLatest = await geta(url, '.modelfeature, .item-video, .updateItem'); + const res = await geta(url, '.modelfeature, .item-video, .updateItem'); - if (!qLatest) return null; - if (site.parameters?.t1) return scrapeAllT1(qLatest, site, accSiteReleases); - if (site.parameters?.tour) return scrapeAllTour(qLatest, site, accSiteReleases); + if (!res.ok) return res.status; + if (site.parameters?.t1) return scrapeAllT1(res.items, site, accSiteReleases); + if (site.parameters?.tour) return scrapeAllTour(res.items, site, accSiteReleases); - return scrapeAll(qLatest, site, accSiteReleases); + return scrapeAll(res.items, site, accSiteReleases); } async function fetchScene(url, site, baseRelease, beforeFetchLatest) { const channelRegExp = beforeFetchLatest || await getChannelRegExp(site); - const qScene = await get(url); + const res = await get(url); - if (!qScene) return null; + if (!res.ok) return res.status; + if (site.parameters?.t1) return scrapeSceneT1(res.item, site, url, baseRelease, channelRegExp); + if (site.parameters?.tour) return scrapeSceneTour(res.item, site, url, baseRelease); - if (site.parameters?.t1) return scrapeSceneT1(qScene, site, url, baseRelease, channelRegExp); - if (site.parameters?.tour) return scrapeSceneTour(qScene, site, url, baseRelease); - - return scrapeScene(qScene, site, url, baseRelease); + return scrapeScene(res.item, site, url, baseRelease); } async function fetchProfile(actorName, scraperSlug, site) { @@ -387,14 +386,21 @@ async function fetchProfile(actorName, scraperSlug, site) { const actorSlugB = slugify(actorName); const t1 = site.parameters?.t1 ? 't1/' : ''; - const qProfile = site.parameters?.profile - ? (await get(util.format(site.parameters.profile, actorSlugA)) || await get(site.parameters.profile, actorSlugB)) - : (await get(`${site.url}/${t1}models/${actorSlugA}.html`) || await get(`${site.url}/${t1}models/${actorSlugB}.html`)); - if (site.parameters?.t1) return qProfile && scrapeProfileT1(qProfile, site); - if (site.parameters?.tour) return qProfile && scrapeProfileTour(qProfile, site); + const res1 = site.parameters?.profile + ? await get(util.format(site.parameters.profile, actorSlugA)) + : await get(`${site.url}/${t1}models/${actorSlugA}.html`); - return qProfile && scrapeProfile(qProfile, site); + const res = (res1.ok && res1) + || (site.parameters?.profile + ? 
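// Fallback logic: `(res1.ok && res1) || ...` keeps res1 when the actorSlugA
// lookup succeeded; a result with ok === false is still a truthy object, so
// the explicit .ok check is what makes the left operand falsy and triggers
// this second request for actorSlugB.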
await get(util.format(site.parameters.profile, actorSlugB)) + : await get(`${site.url}/${t1}models/${actorSlugB}.html`)); + + if (!res.ok) return res.status; + if (site.parameters?.t1) return scrapeProfileT1(res.item, site); + if (site.parameters?.tour) return scrapeProfileTour(res.item, site); + + return scrapeProfile(res.item, site); } module.exports = { diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index d81f8bc7..94caf208 100644 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -8,6 +8,7 @@ const { JSDOM } = require('jsdom'); const moment = require('moment'); const logger = require('../logger')(__filename); +const { get, geta, ctxa } = require('../utils/q'); const { heightToCm } = require('../utils/convert'); const slugify = require('../utils/slugify'); @@ -117,41 +118,33 @@ async function getPhotos(entryId, site, type = 'highres', page = 1) { return getPhotosLegacy(entryId, site, 'highres', 1); } -function scrapeLatest(html, site) { - const $ = cheerio.load(html, { normalizeWhitespace: true }); - const scenesElements = $('.update_details').toArray(); - - return scenesElements.map((element) => { +function scrapeAll(scenes, site) { + return scenes.map(({ el, q, qa, qh, qu, qd, qi, qis }) => { const release = {}; - const sceneLinkElement = $(element).find('a[title], .update_title a'); - release.url = sceneLinkElement.attr('href'); - release.title = sceneLinkElement.text()?.trim() || sceneLinkElement.attr('alt')?.trim(); + release.entryId = el.dataset.setid || q('.rating_box')?.dataset.id; - release.entryId = $(element).attr('data-setid'); + release.url = qu('.update_title, .dvd_info > a, a ~ a'); + release.title = q('.update_title, .dvd_info > a, a ~ a', true); + release.date = qd('.update_date', 'MM/DD/YYYY'); - release.date = moment - .utc($(element).find('.update_date').text(), 'MM/DD/YYYY') - .toDate(); + release.actors = qa('.update_models a', true); - release.actors = $(element).find('.update_models a') - .map((actorIndex, actorElement) => $(actorElement).text()) - .toArray(); + const dvdPhotos = qis('.dvd_preview_thumb'); + const photoCount = Number(q('a img.thumbs', 'cnt')) || 1; - const photoElement = $(element).find('a img.thumbs'); - const photoCount = Number(photoElement.attr('cnt')) || 1; - [release.poster, ...release.photos] = Array.from({ length: photoCount }, (value, index) => { - const src = photoElement.attr(`src${index}_1x`) || photoElement.attr(`src${index}`) || photoElement.attr('src'); + [release.poster, ...release.photos] = dvdPhotos.length + ? dvdPhotos + : Array.from({ length: photoCount }).map((value, index) => { + const src = qi('a img.thumbs', `src${index}_1x`) || qi('a img.thumbs', `src${index}`) || qi('a img.thumbs'); - if (!src) return null; + return src ? { + src: /^http/.test(src) ? src : `${site.url}${src}`, + referer: site.url, + } : null; + }).filter(Boolean); - return { - src: /^http/.test(src) ? 
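// Photo sources: each `a img.thumbs` element carries per-photo attributes
// (src0_1x, src1_1x, ..., with plain srcN and src as fallbacks); relative
// paths are resolved against site.url in this branch.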
src : `${site.url}${src}`, - referer: site.url, - }; - }).filter(photoUrl => photoUrl); - - const teaserScript = $(element).find('script').html(); + const teaserScript = qh('script'); if (teaserScript) { const src = teaserScript.slice(teaserScript.indexOf('http'), teaserScript.indexOf('.mp4') + 4); if (src) release.teaser = { src }; @@ -294,18 +287,28 @@ async function scrapeScene(html, url, site) { return release; } -function scrapeMovie(html, url, site) { - const { document } = new JSDOM(html).window; +function scrapeMovie({ el, q, qus }, url, site) { const movie = { url, site }; - movie.entryId = document.querySelector('.dvd_details_overview .rating_box').dataset.id; - movie.title = document.querySelector('.title_bar span').textContent; - movie.covers = Array.from(document.querySelectorAll('#dvd-cover-flip > a'), el => el.href); - movie.channel = document.querySelector('.update_date a').textContent; - movie.date = new Date(); - movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href); + movie.entryId = q('.dvd_details_overview .rating_box').dataset.id; + movie.title = q('.title_bar span', true); + movie.covers = qus('#dvd-cover-flip > a'); + movie.channel = q('.update_date a', true); - return movie; + // movie.releases = Array.from(document.querySelectorAll('.cell.dvd_info > a'), el => el.href); + const sceneQs = ctxa(el, '.dvd_details'); + const scenes = scrapeAll(sceneQs, site); + + const curatedScenes = scenes + .map(scene => ({ ...scene, movie })) + .sort((sceneA, sceneB) => sceneA.date - sceneB.date); + + movie.date = curatedScenes[0].date; + + return { + ...movie, + scenes: curatedScenes, + }; } function scrapeProfile(html, url, actorName) { @@ -344,14 +347,14 @@ function scrapeProfile(html, url, actorName) { } async function fetchLatest(site, page = 1) { - const url = site.parameters?.latest ? util.format(site.parameters.latest, page) : `${site.url}/trial/categories/movies_${page}_d.html`; - const res = await bhttp.get(url); + const url = site.parameters?.latest + ? util.format(site.parameters.latest, page) + : `${site.url}/trial/categories/movies_${page}_d.html`; - if (res.statusCode === 200) { - return scrapeLatest(res.body.toString(), site); - } + // const res = await bhttp.get(url); + const res = await geta(url, '.update_details'); - return res.statusCode; + return res.ok ? scrapeAll(res.items, site) : res.status; } async function fetchUpcoming(site) { @@ -374,9 +377,9 @@ async function fetchScene(url, site) { } async function fetchMovie(url, site) { - const res = await bhttp.get(url); + const res = await get(url); - return scrapeMovie(res.body.toString(), url, site); + return res.ok ? 
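// scrapeMovie (above) now returns the DVD's scenes scraped from .dvd_details
// via scrapeAll, each carrying a movie back-reference, and movie.date is
// taken from the earliest scene (curatedScenes[0] after the date sort); note
// a movie page without scene tiles would throw on curatedScenes[0].date.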
scrapeMovie(res.item, url, site) : res.status; } async function fetchProfile(actorName) { diff --git a/src/scrapers/newsensations.js b/src/scrapers/newsensations.js index 700426df..2eee837a 100644 --- a/src/scrapers/newsensations.js +++ b/src/scrapers/newsensations.js @@ -60,13 +60,13 @@ async function fetchLatest(site, page = 1) { } const url = `${site.url}/tour_${site.parameters.siteId}/categories/movies_${page}_d.html`; - const qLatest = await geta(url, '.updatesBlock .movieBlock, .updatesBlock .videoBlock, .latest_updates_block .update_details, .category_listing_block .update_details'); + const res = await geta(url, '.updatesBlock .movieBlock, .updatesBlock .videoBlock, .latest_updates_block .update_details, .category_listing_block .update_details'); - if (qLatest && site.parameters.block) { - return scrapeBlockLatest(qLatest, site); + if (res.ok && site.parameters.block) { + return scrapeBlockLatest(res.items, site); } - return qLatest && scrapeClassicLatest(qLatest, site); + return res.ok ? scrapeClassicLatest(res.items, site) : res.status; } module.exports = { diff --git a/src/scrapers/nubiles.js b/src/scrapers/nubiles.js index ef5fe7d5..570169df 100644 --- a/src/scrapers/nubiles.js +++ b/src/scrapers/nubiles.js @@ -10,10 +10,10 @@ const slugUrlMap = { }; async function getPhotos(albumUrl) { - const thumbnails = await geta(albumUrl, '.photo-thumb'); + const res = await geta(albumUrl, '.photo-thumb'); - return thumbnails - ? thumbnails.map(({ q }) => q('source').srcset) + return res.ok + ? res.items.map(({ q }) => q('source').srcset) : []; } @@ -114,26 +114,26 @@ function scrapeProfile({ q, qa, qi, qu }, _actorName, origin) { async function fetchLatest(site, page = 1) { const url = `${site.url}/video/gallery/${(page - 1) * 12}`; - const qLatest = await geta(url, '.content-grid-item'); + const res = await geta(url, '.content-grid-item'); - return qLatest && scrapeAll(qLatest, site); + return res.ok ? scrapeAll(res.items, site) : res.status; } async function fetchUpcoming(site) { if (site.parameters?.upcoming) { const url = `${site.url}/video/upcoming`; - const qUpcoming = await geta(url, '.content-grid-item'); + const res = await geta(url, '.content-grid-item'); - return qUpcoming && scrapeAll(qUpcoming, site); + return res.ok ? scrapeAll(res.items, site) : res.status; } return []; } async function fetchScene(url, site) { - const qScene = await get(url); + const res = await get(url); - return qScene && scrapeScene(qScene, url, site); + return res.ok ? scrapeScene(res.item, url, site) : res.status; } async function fetchProfile(actorName, siteSlug) { @@ -141,15 +141,17 @@ async function fetchProfile(actorName, siteSlug) { const origin = slugUrlMap[siteSlug] || `https://www.${siteSlug}.com`; const url = `${origin}/model/alpha/${firstLetter}`; - const { qa } = await get(url); + const resModels = await get(url); - const modelPath = qa('.content-grid-item a.title').find(el => slugify(el.textContent) === slugify(actorName)); + if (!resModels.ok) return resModels.status; + + const modelPath = resModels.item.qa('.content-grid-item a.title').find(el => slugify(el.textContent) === slugify(actorName)); if (modelPath) { const modelUrl = `${origin}${modelPath}`; - const qModel = await get(modelUrl); + const resModel = await get(modelUrl); - if (qModel) return scrapeProfile(qModel, actorName, origin); + return resModel.ok ? 
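// Profile lookup is two requests: the /model/alpha/<first letter> index is
// searched for a link whose slugified text matches the actor name, then the
// matched model page is fetched and scraped; an unmatched name falls through
// to the `return null` below.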
scrapeProfile(resModel.item, actorName, origin) : resModel.status; } return null; diff --git a/src/scrapers/template.js b/src/scrapers/template.js index 73531665..6cd24768 100644 --- a/src/scrapers/template.js +++ b/src/scrapers/template.js @@ -34,15 +34,15 @@ function scrapeScene({ q }, _site) { async function fetchLatest(site, page = 1) { const url = `${site.url}/${page}`; - const qLatest = await geta(url, '.selector'); + const res = await geta(url, '.selector'); - return qLatest && scrapeLatest(qLatest, site); + return res.ok ? scrapeLatest(res.items, site) : res.status; } async function fetchScene(url, site) { - const qScene = await get(url); + const res = await get(url); - return qScene && scrapeScene(qScene, site); + return res.ok ? scrapeScene(res.item, site) : res.status; } module.exports = { diff --git a/src/utils/q.js b/src/utils/q.js index 0eaaf16b..68e54cb4 100644 --- a/src/utils/q.js +++ b/src/utils/q.js @@ -64,6 +64,12 @@ function qall(context, selector, attrArg, applyTrim = true) { return Array.from(context.querySelectorAll(selector)); } +function qhtml(context, selector) { + const el = q(context, selector, null, true); + + return el && el.innerHTML; +} + function qtexts(context, selector, applyTrim = true, filter = true) { const el = q(context, selector, null, applyTrim); if (!el) return null; @@ -160,34 +166,36 @@ function qlength(context, selector, match, attr = 'textContent') { const funcs = { q, + qa: qall, qall, + qd: qdate, qdate, + qh: qhtml, + qhtml, + qi: qimage, qimage, qimages, - qposter, + qis: qimages, + ql: qlength, qlength, + qm: qmeta, qmeta, + qp: qposter, + qposter, + qs: qall, + qt: qtrailer, qtext, qtexts, qtrailer, qtrailers, - qurl, - qurls, - qa: qall, - qs: qall, - qd: qdate, - qi: qimage, - qis: qimages, - qp: qposter, - ql: qlength, - qm: qmeta, - qt: qtrailer, qts: qtrailers, qtx: qtext, - qtxt: qtext, qtxs: qtexts, + qtxt: qtext, qtxts: qtexts, qu: qurl, + qurl, + qurls, qus: qurls, }; @@ -246,12 +254,26 @@ async function get(url, selector, headers, all = false) { }); if (res.statusCode === 200) { - return all + const item = all ? extractAll(res.body.toString(), selector) : extract(res.body.toString(), selector); + + return { + item, + items: all ? item : [item], + res, + ok: true, + status: res.statusCode, + }; } - return null; + return { + item: null, + items: [], + res, + ok: false, + status: res.statusCode, + }; } async function getAll(url, selector, headers) { diff --git a/src/utils/timeout.js b/src/utils/timeout.js new file mode 100644 index 00000000..6327ed90 --- /dev/null +++ b/src/utils/timeout.js @@ -0,0 +1,31 @@ +'use strict'; + +const bhttp = require('bhttp'); + +const sleep = 5000; +const timeout = 1000; + +async function init() { + try { + const res = await bhttp.get(`https://httpstat.us/200?sleep=${sleep}`, { + responseTimeout: timeout, + }); + + console.log(res.statusCode); + } catch (error) { + console.log(error); + } +} + +/* +/home/pendulum/projectx/node_modules/bhttp/lib/bhttp.js:159 + err.response = response; + ^ + + TypeError: Cannot assign to read only property 'response' of object '[object Object]' + at addErrorData (/home/pendulum/projectx/node_modules/bhttp/lib/bhttp.js:159:16) + at Timeout.timeoutHandler [as _onTimeout] (/home/pendulum/projectx/node_modules/bhttp/lib/bhttp.js:525:27) +*/ + + +init();
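A note on src/utils/timeout.js: the repro script above demonstrates bhttp's
responseTimeout option crashing inside the library ("Cannot assign to read
only property 'response'") instead of rejecting cleanly. Until that is fixed
upstream, one application-side option is to race the request against a timer.
A minimal sketch (the withTimeout helper is hypothetical, not part of this
patch, and it rejects without aborting the underlying request):

'use strict';

const bhttp = require('bhttp');

// Rejects if the wrapped promise does not settle within `ms` milliseconds.
// The losing request keeps running in the background; only its result is dropped.
function withTimeout(promise, ms) {
  let timer;

  const timeout = new Promise((resolve, reject) => {
    timer = setTimeout(() => reject(new Error(`Request timed out after ${ms}ms`)), ms);
  });

  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer));
}

async function init() {
  try {
    const res = await withTimeout(bhttp.get('https://httpstat.us/200?sleep=5000'), 1000);

    console.log(res.statusCode);
  } catch (error) {
    console.log(error.message); // "Request timed out after 1000ms"
  }
}

init();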