From 574c117ab06af924667693b4b35f515a326c72b5 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sun, 3 Apr 2022 00:49:39 +0200 Subject: [PATCH] Refactored Dogfart scraper to use qu and return unextracted scenes. --- src/scrapers/dogfart.js | 164 +++++++++++++++------------------------- 1 file changed, 62 insertions(+), 102 deletions(-) diff --git a/src/scrapers/dogfart.js b/src/scrapers/dogfart.js index 5629c123..0362182f 100644 --- a/src/scrapers/dogfart.js +++ b/src/scrapers/dogfart.js @@ -1,20 +1,16 @@ 'use strict'; -/* eslint-disable newline-per-chained-call */ -// const Promise = require('bluebird'); -const { JSDOM } = require('jsdom'); -const moment = require('moment'); - -const http = require('../utils/http'); const slugify = require('../utils/slugify'); const qu = require('../utils/qu'); async function getPhotos(albumUrl) { - const res = await http.get(albumUrl); - const html = res.body.toString(); - const { document } = new JSDOM(html).window; + const res = await qu.get(albumUrl); - const lastPhotoPage = Array.from(document.querySelectorAll('.preview-image-container a')).slice(-1)[0].href; + if (!res.ok) { + return []; + } + + const lastPhotoPage = res.item.query.urls('.preview-image-container a').at(-1); const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10); const photoUrls = Array.from({ length: lastPhotoIndex }, (value, index) => { @@ -29,124 +25,88 @@ async function getPhotos(albumUrl) { return photoUrls; } -function scrapeLatest(html, site, filter = true) { - const { document } = new JSDOM(html).window; - const sceneElements = Array.from(document.querySelectorAll('.recent-updates')); +function scrapeLatest(scenes, site, filter = true) { + return scenes.reduce((acc, { query }) => { + const release = {}; - return sceneElements.map((element) => { - const siteUrl = element.querySelector('.recent-details-title .help-block, .model-details-title .site-name').textContent; + const siteUrl = query.cnt('.recent-details-title .help-block, .model-details-title .site-name'); + + release.url = query.url('.thumbnail', 'href', { origin: site.type === 'network' ? site.url : site.parent.url }); + release.entryId = `${site.slug}_${new URL(release.url).pathname.split('/')[4]}`; + + release.title = query.cnt('.scene-title'); + release.actors = release.title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim()); + + // release.poster = `https:${element.querySelector('img').src}`; + release.poster = query.img(); + release.teaser = query.el('.thumbnail', 'data-preview_clip_url'); + + release.channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase(); if (filter && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) { // different dogfart site - return null; + return { ...acc, unextracted: [...acc.unextracted, release] }; } - const sceneLinkElement = element.querySelector('.thumbnail'); - const url = qu.prefixUrl(sceneLinkElement.href, 'https://dogfartnetwork.com'); - const { pathname } = new URL(url); - const entryId = `${site.slug}_${pathname.split('/')[4]}`; - - const title = element.querySelector('.scene-title').textContent; - const actors = title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim()); - - const poster = `https:${element.querySelector('img').src}`; - const teaser = sceneLinkElement.dataset.preview_clip_url; - - const channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase(); - - return { - url, - entryId, - title, - actors, - poster, - teaser: { - src: teaser, - }, - site, - channel, - }; - }).filter(Boolean); + return { ...acc, scenes: [...acc.scenes, release] }; + }, { + scenes: [], + unextracted: [], + }); } -async function scrapeScene(html, url, site) { - const { document } = new JSDOM(html).window; - - const title = document.querySelector('.description-title').textContent; - const actors = Array.from(document.querySelectorAll('.more-scenes a')).map(({ textContent }) => textContent); - const metaDescription = document.querySelector('meta[itemprop="description"]').content; - const description = metaDescription - ? metaDescription.content - : document.querySelector('.description') - .textContent - .replace(/[ \t\n]{2,}/g, ' ') - .replace('...read more', '') - .trim(); - - const channel = document.querySelector('.site-name').textContent.split('.')[0].toLowerCase(); +async function scrapeScene({ query }, url, channel, baseScene, parameters) { + const release = {}; const { origin, pathname } = new URL(url); - const entryId = `${channel}_${pathname.split('/').slice(-2)[0]}`; - const date = new Date(document.querySelector('meta[itemprop="uploadDate"]').content); - const duration = moment - .duration(`00:${document - .querySelectorAll('.extra-info p')[1] - .textContent - .match(/\d+:\d+$/)[0]}`) - .asSeconds(); + release.channel = query.cnt('.site-name').split('.')[0].toLowerCase(); + release.entryId = `${release.channel}_${pathname.split('/').slice(-2)[0]}`; - const trailerElement = document.querySelector('.html5-video'); - const poster = `https:${trailerElement.dataset.poster}`; - const { trailer } = trailerElement.dataset; + release.title = query.cnt('.description-title'); + release.actors = query.all('.more-scenes a').map((actorEl) => ({ + name: query.cnt(actorEl), + url: query.url(actorEl, null, 'href', { origin: channel.url }), + })); - const lastPhotosUrl = Array.from(document.querySelectorAll('.pagination a')).slice(-1)[0]?.href; - const photos = lastPhotosUrl ? await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, site, url) : []; + release.description = query.meta('meta[itemprop="description"]') || qu.cnt('.description').replace(/[ \t\n]{2,}/g, ' ').replace('...read more', '').trim(); - const stars = Math.floor(Number(document.querySelector('span[itemprop="average"]')?.textContent || document.querySelector('span[itemprop="ratingValue"]')?.textContent) / 2); - const tags = Array.from(document.querySelectorAll('.scene-details .categories a')).map(({ textContent }) => textContent); + release.date = query.date('meta[itemprop="uploadDate"]', null, null, 'content'); + release.duration = query.duration('.extra-info p:nth-child(2)'); - return { - entryId, - url: `${origin}${pathname}`, - title, - description, - actors, - date, - duration, - poster, - photos, - trailer: { - src: trailer, - }, - tags, - rating: { - stars, - }, - site, - channel, - }; + release.tags = query.cnts('.scene-details .categories a'); + + release.trailer = query.video('.html5-video', 'data-trailer'); + release.poster = query.poster('.html5-video', 'data-poster'); + + const lastPhotosUrl = query.urls('.pagination a').at(-1); + + if (lastPhotosUrl && parameters.includePhotos) { + release.photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, channel, url); + } + + release.stars = Number(((query.number('span[itemprop="average"]') || query.number('span[itemprop="ratingValue"]')) / 2).toFixed(2)); + + return release; } async function fetchLatest(site, page = 1) { - const res = await http.get(`https://dogfartnetwork.com/tour/scenes/?p=${page}`); + const res = await qu.getAll(`https://dogfartnetwork.com/tour/scenes/?p=${page}`, '.recent-updates'); - return scrapeLatest(res.body.toString(), site); -} + if (res.ok) { + return scrapeLatest(res.items, site); + } -async function fetchScene(url, site) { - const res = await http.get(url); - - return scrapeScene(res.body.toString(), url, site); + return res.status; } async function fetchProfile(baseActor, entity) { const slug = slugify(baseActor.name, '+'); const url = `https://www.dogfartnetwork.com/tour/girls/${slug}/`; - const res = await http.get(url); + const res = await qu.getAll(url, '.recent-updates'); if (res.ok) { - const scenes = scrapeLatest(res.body, entity, false); + const scenes = scrapeLatest(res.items, entity, false); return { scenes }; } @@ -156,6 +116,6 @@ async function fetchProfile(baseActor, entity) { module.exports = { fetchLatest, - fetchScene, fetchProfile, + scrapeScene, };