From 71c884fe48915405019702cde5a32dcad344c91c Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sat, 28 Nov 2020 00:46:30 +0100 Subject: [PATCH] Improved Little Caprice Dreams scraper. --- config/default.js | 1 + src/actors.js | 2 +- src/argv.js | 2 +- src/scrapers/littlecapricedreams.js | 87 ++++++++++++++++++++++++++--- src/scrapers/scrapers.js | 1 + 5 files changed, 82 insertions(+), 11 deletions(-) diff --git a/config/default.js b/config/default.js index 6f32ed84..83e76c85 100644 --- a/config/default.js +++ b/config/default.js @@ -166,6 +166,7 @@ module.exports = { 'hitzefrei', 'porncz', 'czechav', + 'littlecapricedreams', 'gangbangcreampie', 'gloryholesecrets', 'aziani', diff --git a/src/actors.js b/src/actors.js index e718c775..ef0c3fbe 100644 --- a/src/actors.js +++ b/src/actors.js @@ -661,7 +661,7 @@ async function scrapeActors(argNames) { logger.info(`Scraping profiles for ${actorNames.length} actors`); - const sources = argv.actorsSources || config.profiles || Object.keys(scrapers.actors); + const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors); const entitySlugs = sources.flat(); const [entities, existingActorEntries] = await Promise.all([ diff --git a/src/argv.js b/src/argv.js index a610776d..4d6170c1 100644 --- a/src/argv.js +++ b/src/argv.js @@ -72,7 +72,7 @@ const { argv } = yargs .option('actor-sources', { describe: 'Use these scrapers for actor data', type: 'array', - alias: ['actor-source', 'profile-sources', 'profile-source', 'source'], + alias: ['actor-source', 'profile-sources', 'profile-source', 'source', 'sources'], }) .option('movie-scenes', { describe: 'Fetch all scenes for a movie', diff --git a/src/scrapers/littlecapricedreams.js b/src/scrapers/littlecapricedreams.js index cb75a77e..bfbbe2cc 100644 --- a/src/scrapers/littlecapricedreams.js +++ b/src/scrapers/littlecapricedreams.js @@ -17,13 +17,21 @@ function matchChannel(release, channel) { serieNames.superprivate = serieNames.superprivatex; serieNames.nasst = serieNames.nassty; serieNames.sexlesson = serieNames.sexlessons; - serieNames['sex lesson'] = serieNames.sexlessons; - const serieName = release.title.match(new RegExp(Object.keys(serieNames).join('|'), 'i'))?.[0]; + // ensure longest key matches first + const serieKeys = Object.keys(serieNames).sort((nameA, nameB) => nameB.length - nameA.length); + + const serieName = release.title.match(new RegExp(serieKeys.join('|'), 'i'))?.[0]; const serie = serieName && serieNames[slugify(serieName, '')]; - return serie?.slug || null; - // title: release.title.replace(new RegExp(`(${serieName}|${serie.name}|${serie.slug})[\\s:–-]*`, 'i'), ''), + if (serie) { + return { + slug: serie.slug, + title: release.title.replace(new RegExp(`(${serieName}|${serie.name}|${serie.slug})\\s*[-–:/]+\\s*`, 'ig'), ''), + }; + } + + return null; } function scrapeAll(scenes, channel) { @@ -41,9 +49,10 @@ function scrapeAll(scenes, channel) { referer: channel.url, }; - release.channel = matchChannel(release, channel); - - return release; + return { + ...release, + ...matchChannel(release, channel), + }; }); } @@ -100,9 +109,32 @@ async function scrapeScene({ query }, url, channel, include) { release.photos = await fetchPhotos(query.url('.vid_buttons a[href*="project/"]')); } - release.channel = matchChannel(release, channel); + return { + ...release, + ...matchChannel(release, channel), + }; +} - return release; +function scrapeProfile({ query }, url) { + const profile = {}; + + const bio = query.cnts('div p').reduce((acc, item) => { + const [key, value] = item.split(/\s*:\s*/); + + return { + ...acc, + [slugify(key, '_')]: value.trim(), + }; + }, {}); + + profile.avatar = { + src: query.img('.model-page'), + referer: url, + }; + + console.log(bio); + console.log(profile); + return profile; } async function fetchLatest(channel) { @@ -128,7 +160,44 @@ async function fetchScene(url, channel, baseRelease, include) { return res.status; } +async function getActorUrl(baseActor) { + if (baseActor.url) { + return baseActor.url; + } + + const overviewRes = await qu.getAll('https://www.littlecaprice-dreams.com/pornstars', '.models'); + + if (!overviewRes.ok) { + return overviewRes.status; + } + + const actorItem = overviewRes.items.find(({ query }) => slugify(query.q('img', 'title')) === baseActor.slug); + + if (!actorItem) { + return null; + } + + return actorItem.query.url('a'); +} + +async function fetchProfile(baseActor, entity) { + const actorUrl = await getActorUrl(baseActor); + + if (!actorUrl) { + return null; + } + + const actorRes = await qu.get(actorUrl, '#main-content'); + + if (actorRes.ok) { + return scrapeProfile(actorRes.item, actorUrl, entity); + } + + return actorRes.status; +} + module.exports = { fetchLatest, fetchScene, + fetchProfile, }; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 8dccccc1..edb4bc96 100644 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -223,6 +223,7 @@ const scrapers = { killergram, kink, legalporno, + littlecapricedreams, men, metrohd, milehighmedia,