From 5cbf122d6f7a5abb8dc5f71418b5e1d52d88e6a3 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sun, 3 Apr 2022 23:00:05 +0200 Subject: [PATCH] Scraping Dogfart scenes from native sites. --- seeds/02_sites.js | 69 ++++++++++++++--------------------------- src/scrapers/dogfart.js | 33 ++++++++++---------- 2 files changed, 41 insertions(+), 61 deletions(-) diff --git a/seeds/02_sites.js b/seeds/02_sites.js index f04934ce..ffd82772 100644 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -2714,163 +2714,142 @@ const sites = [ { slug: 'blacksonblondes', name: 'Blacks On Blondes', - url: 'https://www.blacksonblondes.com/tour', + url: 'https://www.blacksonblondes.com', description: 'Blacks On Blondes is the Worlds Largest and Best Interracial Sex and Interracial Porn website. Black Men and White Women. BlacksOnBlondes has 23 years worth of Hardcore Interracial Content. Featuring the entire Legendary Dogfart Movie Archive', parent: 'dogfartnetwork', }, { slug: 'cuckoldsessions', name: 'Cuckold Sessions', - url: 'https://www.cuckoldsessions.com/tour', - description: 'Dogfart, the #1 Interracial Network in the World Presents CuckoldSessions.com/tour - Hardcore Cuckold Fetish Videos', + url: 'https://www.cuckoldsessions.com', + description: 'Dogfart, the #1 Interracial Network in the World Presents CuckoldSessions.com - Hardcore Cuckold Fetish Videos', parent: 'dogfartnetwork', }, { slug: 'gloryhole', name: 'Glory Hole', - url: 'https://www.gloryhole.com/tour', - description: '', + url: 'https://www.gloryhole.com', parent: 'dogfartnetwork', }, { slug: 'blacksoncougars', name: 'Blacks On Cougars', - url: 'https://www.blacksoncougars.com/tour', - description: '', + url: 'https://www.blacksoncougars.com', parent: 'dogfartnetwork', }, { slug: 'wefuckblackgirls', name: 'We Fuck Black Girls', alias: ['wfbg'], - url: 'https://www.wefuckblackgirls.com/tour', - description: '', + url: 'https://www.wefuckblackgirls.com', parent: 'dogfartnetwork', }, { slug: 'watchingmymomgoblack', name: 'Watching My Mom Go Black', - url: 'https://www.watchingmymomgoblack.com/tour', - description: '', + url: 'https://www.watchingmymomgoblack.com', parent: 'dogfartnetwork', }, { slug: 'interracialblowbang', name: 'Interracial Blowbang', - url: 'https://www.interracialblowbang.com/tour', - description: '', + url: 'https://www.interracialblowbang.com', parent: 'dogfartnetwork', }, { slug: 'cumbang', name: 'Cumbang', - url: 'https://www.cumbang.com/tour', - description: '', + url: 'https://www.cumbang.com', parent: 'dogfartnetwork', }, { slug: 'interracialpickups', name: 'Interracial Pickups', - url: 'https://www.interracialpickups.com/tour', - description: '', + url: 'https://www.interracialpickups.com', parent: 'dogfartnetwork', }, { slug: 'watchingmydaughtergoblack', name: 'Watching My Daughter Go Black', - url: 'https://www.watchingmydaughtergoblack.com/tour', - description: '', + url: 'https://www.watchingmydaughtergoblack.com', parent: 'dogfartnetwork', }, { slug: 'zebragirls', name: 'Zebra Girls', - url: 'https://www.zebragirls.com/tour', - description: '', + url: 'https://www.zebragirls.com', parent: 'dogfartnetwork', }, { slug: 'gloryholeinitiations', name: 'Gloryhole Initiations', - url: 'https://www.gloryhole-initiations.com/tour', - description: '', + url: 'https://www.gloryhole-initiations.com', parent: 'dogfartnetwork', }, { slug: 'dogfartbehindthescenes', name: 'Dogfart Behind The Scenes', - url: 'https://www.dogfartbehindthescenes.com/tour', - description: '', + url: 'https://www.dogfartbehindthescenes.com', parent: 'dogfartnetwork', }, { slug: 'blackmeatwhitefeet', name: 'Black Meat White Feet', - url: 'https://www.blackmeatwhitefeet.com/tour', - description: '', + url: 'https://www.blackmeatwhitefeet.com', parent: 'dogfartnetwork', }, { slug: 'springthomas', name: 'Spring Thomas', - url: 'https://www.springthomas.com/tour', - description: '', + url: 'https://www.springthomas.com', parent: 'dogfartnetwork', }, { slug: 'katiethomas', name: 'Katie Thomas', - url: 'https://www.katiethomas.com/tour', - description: '', + url: 'https://www.katiethomas.com', parent: 'dogfartnetwork', }, { slug: 'ruthblackwell', name: 'Ruth Blackwell', - url: 'https://www.ruthblackwell.com/tour', - description: '', + url: 'https://www.ruthblackwell.com', parent: 'dogfartnetwork', }, { slug: 'candymonroe', name: 'Candy Monroe', - url: 'https://www.candymonroe.com/tour', - description: '', + url: 'https://www.candymonroe.com', parent: 'dogfartnetwork', }, { slug: 'wifewriting', name: 'Wife Writing', - url: 'https://www.wifewriting.com/tour', - description: '', + url: 'https://www.wifewriting.com', parent: 'dogfartnetwork', }, { slug: 'barbcummings', name: 'Barb Cummings', - url: 'https://www.barbcummings.com/tour', - description: '', + url: 'https://www.barbcummings.com', parent: 'dogfartnetwork', }, { slug: 'theminion', name: 'The Minion', - url: 'https://www.theminion.com/tour', - description: '', + url: 'https://www.theminion.com', parent: 'dogfartnetwork', }, { slug: 'blacksonboys', name: 'Blacks On Boys', - url: 'https://www.blacksonboys.com/tour', - description: '', + url: 'https://www.blacksonboys.com', parent: 'dogfartnetwork', }, { slug: 'gloryholesandhandjobs', name: 'Gloryholes And Handjobs', - url: 'https://www.gloryholesandhandjobs.com/tour', - description: '', + url: 'https://www.gloryholesandhandjobs.com', parent: 'dogfartnetwork', }, // DORCEL diff --git a/src/scrapers/dogfart.js b/src/scrapers/dogfart.js index 8300842a..37b1b48e 100644 --- a/src/scrapers/dogfart.js +++ b/src/scrapers/dogfart.js @@ -10,7 +10,7 @@ async function getPhotos(albumUrl) { return []; } - const lastPhotoPage = res.item.query.urls('.preview-image-container a').at(-1); + const lastPhotoPage = res.item.query.urls('.pics-container .preview-image-container a').at(-1); const lastPhotoIndex = parseInt(lastPhotoPage.match(/\d+.jpg/)[0], 10); const photoUrls = Array.from({ length: lastPhotoIndex }, (value, index) => { @@ -31,19 +31,19 @@ function scrapeLatest(scenes, site, filter = true) { const siteUrl = query.cnt('.recent-details-title .help-block, .model-details-title .site-name'); - release.url = query.url('.thumbnail', 'href', { origin: site.type === 'network' ? site.url : site.parent.url }); + release.url = query.url('.thumbnail, .preview-image-container > a', 'href', { origin: site.url }); release.entryId = `${site.slug}_${new URL(release.url).pathname.split('/')[4]}`; release.title = query.cnt('.scene-title'); - release.actors = release.title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim()); + // release.actors = release.title.split(/[,&]|\band\b/).map((actor) => actor.replace(/BTS/i, '').trim()); // the titles don't always list the actors, e.g. BarbCummings.com // release.poster = `https:${element.querySelector('img').src}`; release.poster = query.img(); - release.teaser = query.el('.thumbnail', 'data-preview_clip_url'); + release.teaser = query.video('.thumbnail, .preview-thumbnail', 'data-preview_clip_url'); release.channel = siteUrl?.match(/(.*).com/)?.[1].toLowerCase(); - if (filter && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) { + if (filter && siteUrl && `www.${siteUrl.toLowerCase()}` !== new URL(site.url).host) { // different dogfart site return { ...acc, unextracted: [...acc.unextracted, release] }; } @@ -59,26 +59,26 @@ async function scrapeScene({ query }, url, channel, baseScene, parameters) { const release = {}; const { origin, pathname } = new URL(url); - release.channel = query.cnt('.site-name').split('.')[0].toLowerCase(); + release.channel = channel.type === 'channel' ? channel.slug : query.cnt('.site-name').split('.')[0].toLowerCase(); release.entryId = `${release.channel}_${pathname.split('/').slice(-2)[0]}`; - release.title = query.cnt('.description-title'); - release.actors = query.all('.more-scenes a').map((actorEl) => ({ + release.title = query.cnt('.description-title') || query.text('.scene-title'); + release.actors = query.all('.more-scenes a, .starring-list a').map((actorEl) => ({ name: query.cnt(actorEl), url: query.url(actorEl, null, 'href', { origin: channel.url }), })); - release.description = query.meta('meta[itemprop="description"]') || qu.cnt('.description').replace(/[ \t\n]{2,}/g, ' ').replace('...read more', '').trim(); + release.description = query.meta('meta[itemprop="description"]') || query.cnt('.description').replace(/[ \t\n]{2,}/g, ' ').replace('...read more', '').trim(); release.date = query.date('meta[itemprop="uploadDate"]', null, null, 'content'); - release.duration = query.duration('.extra-info p:nth-child(2)'); + release.duration = query.duration('.extra-info p:nth-child(2), .run-time-container'); - release.tags = query.cnts('.scene-details .categories a'); + release.tags = query.exists('.scene-details .categories a') ? query.cnts('.scene-details .categories a') : query.text('.categories')?.split(/,\s+/); const trailer = query.video('.html5-video', 'data-trailer'); const lastPhotosUrl = query.urls('.pagination a').at(-1); - release.poster = query.poster('.html5-video', 'data-poster'); + release.poster = query.poster('.html5-video', 'data-poster') || query.img('.trailer-image'); if (trailer && !trailer?.includes('join')) { release.trailer = trailer; @@ -88,16 +88,17 @@ async function scrapeScene({ query }, url, channel, baseScene, parameters) { release.photos = await getPhotos(`${origin}${pathname}${lastPhotosUrl}`, channel, url); } - release.stars = Number(((query.number('span[itemprop="average"]') || query.number('span[itemprop="ratingValue"]')) / 2).toFixed(2)); + release.stars = Number(((query.number('span[itemprop="average"], span[itemprop="ratingValue"]') || query.number('canvas[data-score]', null, 'data-score')) / 2).toFixed(2)); return release; } -async function fetchLatest(site, page = 1) { - const res = await qu.getAll(`https://dogfartnetwork.com/tour/scenes/?p=${page}`, '.recent-updates'); +async function fetchLatest(channel, page = 1) { + // const res = await qu.getAll(`https://dogfartnetwork.com/tour/scenes/?p=${page}`, '.recent-updates'); + const res = await qu.getAll(`${channel.url}/tour/scenes/?p=${page}`, '.recent-updates, .preview-image-container'); if (res.ok) { - return scrapeLatest(res.items, site); + return scrapeLatest(res.items, channel); } return res.status;