From 8359f78e2e6a0ba56f069e89884905f9682d1805 Mon Sep 17 00:00:00 2001
From: Niels Simenon
Date: Sun, 23 Feb 2020 22:01:12 +0100
Subject: [PATCH] Fixed RK scraper returning dick size as bust size.

---
 src/actors.js            |  2 +-
 src/media.js             | 11 ++++++++---
 src/scrapers/mindgeek.js |  8 +++++---
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/src/actors.js b/src/actors.js
index c5f9b793..8da740d3 100644
--- a/src/actors.js
+++ b/src/actors.js
@@ -315,7 +315,7 @@ async function mergeProfiles(profiles, actor) {
     residencePlace: prevProfile.residencePlace || profile.residencePlace,
     nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available
     ethnicity: prevProfile.ethnicity || profile.ethnicity,
-    bust: prevProfile.bust || profile.bust,
+    bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) && profile.bust),
     waist: prevProfile.waist || profile.waist,
     hip: prevProfile.hip || profile.hip,
     naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs,
diff --git a/src/media.js b/src/media.js
index 73bea226..e2eabb49 100644
--- a/src/media.js
+++ b/src/media.js
@@ -115,7 +115,7 @@ async function fetchItem(source, index, existingItemsBySource, domain, role, att
     return null;
   }

-  logger.verbose(`Fetching media item from ${source.src || source}`);
+  logger.verbose(`Fetching ${domain} ${role} from ${source.src || source}`);

  // const res = await bhttp.get(source.src || source);
  const res = await get(source.src || source);
@@ -199,8 +199,13 @@ async function saveItems(items, domain, role) {
      logger.verbose(`Saved ${domain} ${role} to ${filepath}`);

      return {
-        ...item,
        filepath,
+        mimetype: item.mimetype,
+        extension: item.extension,
+        hash: item.hash,
+        entropy: item.entropy,
+        quality: item.quality,
+        source: item.source,
      };
    } catch (error) {
      logger.error(`Failed to store ${domain} ${role} from ${item.source}: ${error.message}`);
@@ -250,7 +255,7 @@ async function storeMedia(sources, domain, role, { entropyFilter = 2.5 } = {}) {
  const { hash: fetchedItemsByHash } = groupItems(fetchedItems);

  // find hash duplicates that don't need to be re-saved
-  const uniqueFetchedItems = Object.values(fetchedItemsByHash).filter(item => !entropyFilter || item.entropy >= entropyFilter);
+  const uniqueFetchedItems = Object.values(fetchedItemsByHash).filter(item => !entropyFilter || item.entropy === null || item.entropy >= entropyFilter);

  const existingHashItems = await knex('media').whereIn('hash', uniqueFetchedItems.map(item => item.hash));
  const { hash: existingHashItemsByHash } = groupItems(existingHashItems);
diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js
index fac69001..c2f45c2a 100644
--- a/src/scrapers/mindgeek.js
+++ b/src/scrapers/mindgeek.js
@@ -150,9 +150,11 @@ function scrapeProfile(data, html, releases = [], networkName) {
  profile.gender = data.gender === 'other' ? 'transsexual' : data.gender;

-  if (bust) profile.bust = bust.toUpperCase();
-  if (waist) profile.waist = waist;
-  if (hip) profile.hip = hip;
+  if (profile.gender === 'female') {
+    if (bust) profile.bust = bust.toUpperCase();
+    if (waist) profile.waist = waist;
+    if (hip) profile.hip = hip;
+  }

  if (data.birthPlace) profile.birthPlace = data.birthPlace;
  if (data.height) profile.height = inchesToCm(data.height);