diff --git a/public/img/tags/anal/1.jpeg b/public/img/tags/anal/1.jpeg new file mode 100644 index 00000000..7fc78a35 Binary files /dev/null and b/public/img/tags/anal/1.jpeg differ diff --git a/public/img/tags/anal/1_thumb.jpeg b/public/img/tags/anal/1_thumb.jpeg new file mode 100644 index 00000000..61da318f Binary files /dev/null and b/public/img/tags/anal/1_thumb.jpeg differ diff --git a/public/img/tags/deepthroat/0.jpeg b/public/img/tags/deepthroat/0.jpeg new file mode 100644 index 00000000..35dc8642 Binary files /dev/null and b/public/img/tags/deepthroat/0.jpeg differ diff --git a/public/img/tags/deepthroat/0_thumb.jpeg b/public/img/tags/deepthroat/0_thumb.jpeg new file mode 100644 index 00000000..71077370 Binary files /dev/null and b/public/img/tags/deepthroat/0_thumb.jpeg differ diff --git a/public/img/tags/mff/0.jpeg b/public/img/tags/mff/0.jpeg new file mode 100755 index 00000000..985adec0 Binary files /dev/null and b/public/img/tags/mff/0.jpeg differ diff --git a/public/img/tags/mff/0_thumb.jpeg b/public/img/tags/mff/0_thumb.jpeg new file mode 100755 index 00000000..c3f432bb Binary files /dev/null and b/public/img/tags/mff/0_thumb.jpeg differ diff --git a/public/img/tags/mff/poster.jpeg b/public/img/tags/mff/poster.jpeg deleted file mode 100755 index 05baf42b..00000000 Binary files a/public/img/tags/mff/poster.jpeg and /dev/null differ diff --git a/public/img/tags/mff/poster_thumb.jpeg b/public/img/tags/mff/poster_thumb.jpeg deleted file mode 100755 index c2b06d98..00000000 Binary files a/public/img/tags/mff/poster_thumb.jpeg and /dev/null differ diff --git a/seeds/04_media.js b/seeds/04_media.js index cfa3cfa7..690a06b3 100644 --- a/seeds/04_media.js +++ b/seeds/04_media.js @@ -6,6 +6,7 @@ const tagPosters = [ ['bdsm', 0, 'Dani Daniels in "The Traning of Dani Daniels, Day 2" for The Training of O at Kink'], ['behind-the-scenes', 0, 'Janice Griffith in "Day With A Pornstar: Janice" for Brazzers'], ['da-tp', 0, 'Natasha Teen in LegalPorno SZ2164'], + ['deepthroat', 0, 'Chanel Grey in "Deepthroating Is Fun" for Throated'], ['double-anal', 5, 'Riley Reid in "The Gangbang of Riley Reid" for Jules Jordan'], ['double-penetration', 'poster', 'Mia Malkova in "DP!" for HardX'], ['double-vaginal', 'poster', 'Riley Reid in "Pizza That Ass" for Reid My Lips'], @@ -13,7 +14,7 @@ const tagPosters = [ ['oral-creampie', 1, 'Keisha Grey in Brazzers House'], ['triple-anal', 'poster', 'Kristy Black in SZ1986 for LegalPorno'], ['airtight', 1, 'Jynx Maze in "Pump My Ass Full of Cum 3" for Jules Jordan'], - ['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'], + ['anal', 1, 'Veronica Leal and Tina Kay in "Agents On Anal Mission" for Asshole Fever'], ['asian', 'poster', 'Vina Sky in "Young and Glamorous 10" for Jules Jordan'], ['blowjob', 0, 'Adriana Chechik in "The Dinner Party" for Real Wife Stories (Brazzers)'], ['blowbang', 'poster'], @@ -27,7 +28,7 @@ const tagPosters = [ ['gaping', 1, 'Vina Sky in "Vina Sky Does Anal" for HardX'], ['interracial', 'poster'], ['latina', 'poster'], - ['mff', 'poster'], + ['mff', 0, 'Madison Ivy and Adriana Chechik in "Day With A Pornstar" for Brazzers'], ['mfm', 'poster'], ['orgy', 'poster'], ['schoolgirl', 1, 'Eliza Ibarra for Brazzers'], @@ -47,6 +48,7 @@ const tagPosters = [ const tagPhotos = [ ['airtight', 2, 'Dakota Skye in "Dakota Goes Nuts" for ArchAngel'], ['airtight', 3, 'Anita Bellini in "Triple Dick Gangbang" for Hands On Hardcore (DDF Network)'], + ['anal', 'poster', 'Jynx Maze in "Anal Buffet 6" for Evil Angel'], ['anal', 0], ['da-tp', 1, 'Francys Belle in SZ1702 for LegalPorno'], ['da-tp', 2, 'Angel Smalls in GIO408 for LegalPorno'], diff --git a/src/actors-legacy.js b/src/actors-legacy.js new file mode 100644 index 00000000..d26c86b8 --- /dev/null +++ b/src/actors-legacy.js @@ -0,0 +1,539 @@ +'use strict'; + +const config = require('config'); +const Promise = require('bluebird'); +const UrlPattern = require('url-pattern'); +const moment = require('moment'); + +const logger = require('./logger')(__filename); +const knex = require('./knex'); +const argv = require('./argv'); +const include = require('./utils/argv-include')(argv); +const scrapers = require('./scrapers/scrapers'); +const whereOr = require('./utils/where-or'); +const resolvePlace = require('./utils/resolve-place'); +const slugify = require('./utils/slugify'); +const capitalize = require('./utils/capitalize'); +const { curateSites } = require('./sites'); +const { storeMedia, associateMedia } = require('./media'); + +async function curateActor(actor) { + const [aliases, avatar, photos, social] = await Promise.all([ + knex('actors').where({ alias_for: actor.id }), + knex('actors_avatars') + .where('actor_id', actor.id) + .join('media', 'media.id', 'actors_avatars.media_id') + .first(), + knex('actors_photos') + .where('actor_id', actor.id) + .join('media', 'media.id', 'actors_photos.media_id') + .orderBy('index'), + knex('actors_social') + .where('actor_id', actor.id) + .orderBy('platform', 'desc'), + ]); + + const curatedActor = { + id: actor.id, + gender: actor.gender, + name: actor.name, + description: actor.description, + birthdate: actor.birthdate && new Date(actor.birthdate), + country: actor.country_alpha2, + origin: (actor.birth_city || actor.birth_state || actor.birth_country_alpha2) ? {} : null, + residence: (actor.residence_city || actor.residence_state || actor.residence_country_alpha2) ? {} : null, + ethnicity: actor.ethnicity, + height: actor.height, + weight: actor.weight, + bust: actor.bust, + waist: actor.waist, + hip: actor.hip, + naturalBoobs: actor.natural_boobs, + aliases: aliases.map(({ name }) => name), + slug: actor.slug, + avatar, + photos, + hasTattoos: actor.has_tattoos, + hasPiercings: actor.has_piercings, + tattoos: actor.tattoos, + piercings: actor.piercings, + social, + scrapedAt: actor.scraped_at, + }; + + if (curatedActor.birthdate) { + curatedActor.age = moment().diff(curatedActor.birthdate, 'years'); + } + + if (actor.birth_city) curatedActor.origin.city = actor.birth_city; + if (actor.birth_state) curatedActor.origin.state = actor.birth_state; + + if (actor.birth_country_alpha2) { + curatedActor.origin.country = { + alpha2: actor.birth_country_alpha2, + name: actor.birth_country_name, + alias: actor.birth_country_alias, + }; + } + + if (actor.residence_city) curatedActor.residence.city = actor.residence_city; + if (actor.residence_state) curatedActor.residence.state = actor.residence_state; + + if (actor.residence_country_alpha2) { + curatedActor.residence.country = { + alpha2: actor.residence_country_alpha2, + name: actor.residence_country_name, + alias: actor.residence_country_alias, + }; + } + + return curatedActor; +} + +function curateActors(releases) { + return Promise.all(releases.map(async release => curateActor(release))); +} + +function curateActorEntry(actor, scraped, scrapeSuccess) { + const curatedActor = { + name: capitalize(actor.name), + slug: slugify(actor.name), + birthdate: actor.birthdate, + description: actor.description, + gender: actor.gender, + ethnicity: actor.ethnicity, + bust: actor.bust, + waist: actor.waist, + hip: actor.hip, + natural_boobs: actor.naturalBoobs, + height: actor.height, + weight: actor.weight, + hair: actor.hair, + eyes: actor.eyes, + has_tattoos: actor.hasTattoos, + has_piercings: actor.hasPiercings, + tattoos: actor.tattoos, + piercings: actor.piercings, + }; + + if (actor.id) { + curatedActor.id = actor.id; + } + + if (actor.birthPlace) { + curatedActor.birth_city = actor.birthPlace.city; + curatedActor.birth_state = actor.birthPlace.state; + curatedActor.birth_country_alpha2 = actor.birthPlace.country; + } + + if (actor.residencePlace) { + curatedActor.residence_city = actor.residencePlace.city; + curatedActor.residence_state = actor.residencePlace.state; + curatedActor.residence_country_alpha2 = actor.residencePlace.country; + } + + if (scraped) { + curatedActor.scraped_at = new Date(); + curatedActor.scrape_success = scrapeSuccess; + } + + return curatedActor; +} + +function curateSocialEntry(url, actorId) { + const platforms = [ + // links supplied by PH often look like domain.com/domain.com/username + { + label: 'twitter', + pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)', + format: username => `https://www.twitter.com/${username}`, + }, + { + label: 'youtube', + pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)', + format: username => `https://www.youtube.com/channel/${username}`, + }, + { + label: 'instagram', + pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)', + format: username => `https://www.instagram.com/${username}`, + }, + { + label: 'snapchat', + pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)', + format: username => `https://www.snapchat.com/add/${username}`, + }, + { + label: 'tumblr', + pattern: 'http(s)\\://:username.tumblr.com(*)', + format: username => `https://${username}.tumblr.com`, + }, + { + label: 'onlyfans', + pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)', + format: username => `https://www.onlyfans.com/${username}`, + }, + { + label: 'fancentro', + pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)', + format: username => `https://www.fancentro.com/${username}`, + }, + { + label: 'modelhub', + pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)', + format: username => `https://www.modelhub.com/${username}`, + }, + ]; + + const match = platforms.reduce((acc, platform) => { + if (acc) return acc; + + const patternMatch = new UrlPattern(platform.pattern).match(url); + + if (patternMatch) { + return { + platform: platform.label, + original: url, + username: patternMatch.username, + url: platform.format ? platform.format(patternMatch.username) : url, + }; + } + + return null; + }, null) || { url }; + + return { + url: match.url, + platform: match.platform, + actor_id: actorId, + }; +} + +async function curateSocialEntries(urls, actorId) { + if (!urls) { + return []; + } + + const existingSocialLinks = await knex('actors_social').where('actor_id', actorId); + + return urls.reduce((acc, url) => { + const socialEntry = curateSocialEntry(url, actorId); + + if (acc.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase()) || existingSocialLinks.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase())) { + // prevent duplicates + return acc; + } + + return [...acc, socialEntry]; + }, []); +} + +async function fetchActors(queryObject, limit = 100) { + const releases = await knex('actors') + .select( + 'actors.*', + 'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias', + 'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name', 'residence_countries.alias as residence_country_alias', + ) + .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2') + .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2') + .orderBy(['actors.name', 'actors.gender']) + .where(builder => whereOr(queryObject, 'actors', builder)) + .limit(limit); + + return curateActors(releases); +} + +async function storeSocialLinks(urls, actorId) { + const curatedSocialEntries = await curateSocialEntries(urls, actorId); + + await knex('actors_social').insert(curatedSocialEntries); +} + +async function storeAvatars(avatars, actorId) { + if (!avatars || avatars.length === 0) { + return []; + } + + const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar'); + await associateMedia({ [actorId]: avatars }, avatarsBySource, 'actor', 'photo', 'avatar'); + + return avatarsBySource; +} + +async function storeActor(actor, scraped = false, scrapeSuccess = false) { + const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); + + const [actorEntry] = await knex('actors') + .insert(curatedActor) + .returning('*'); + + await storeSocialLinks(actor.social, actorEntry.id); + + if (actor.avatars) { + await storeAvatars(actor.avatars, actorEntry.id); + } + + logger.info(`Added new entry for actor '${actor.name}'`); + + return actorEntry; +} + +async function updateActor(actor, scraped = false, scrapeSuccess = false) { + const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); + + const [actorEntry] = await knex('actors') + .where({ id: actor.id }) + .update(curatedActor) + .returning('*'); + + await storeSocialLinks(actor.social, actor.id); + + logger.info(`Updated entry for actor '${actor.name}'`); + + return actorEntry; +} + +async function mergeProfiles(profiles, actor) { + if (profiles.filter(Boolean).length === 0) { + return null; + } + + const mergedProfile = profiles.reduce((prevProfile, profile) => { + if (profile === null) { + return prevProfile; + } + + const accProfile = { + id: actor ? actor.id : null, + name: actor ? actor.name : (prevProfile.name || profile.name), + description: prevProfile.description || profile.description, + gender: prevProfile.gender || profile.gender, + birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate, + birthPlace: prevProfile.birthPlace || profile.birthPlace, + residencePlace: prevProfile.residencePlace || profile.residencePlace, + nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available + ethnicity: prevProfile.ethnicity || profile.ethnicity, + bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null), + waist: prevProfile.waist || profile.waist, + hip: prevProfile.hip || profile.hip, + naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs, + height: prevProfile.height || profile.height, + weight: prevProfile.weight || profile.weight, + hair: prevProfile.hair || profile.hair, + eyes: prevProfile.eyes || profile.eyes, + hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings, + hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos, + piercings: prevProfile.piercings || profile.piercings, + tattoos: prevProfile.tattoos || profile.tattoos, + social: prevProfile.social.concat(profile.social || []), + releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks + }; + + if (profile.avatar) { + const avatar = Array.isArray(profile.avatar) + ? profile.avatar.map(avatarX => ({ + src: avatarX.src || avatarX, + scraper: profile.scraper, + copyright: avatarX.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright, + })) + : { + src: profile.avatar.src || profile.avatar, + scraper: profile.scraper, + copyright: profile.avatar.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright, + }; + + accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks + } else { + accProfile.avatars = prevProfile.avatars; + } + + return accProfile; + }, { + social: [], + avatars: [], + releases: [], + }); + + const [birthPlace, residencePlace] = await Promise.all([ + resolvePlace(mergedProfile.birthPlace), + resolvePlace(mergedProfile.residencePlace), + ]); + + mergedProfile.birthPlace = birthPlace; + mergedProfile.residencePlace = residencePlace; + + if (!mergedProfile.birthPlace && mergedProfile.nationality) { + const country = await knex('countries') + .where('nationality', 'ilike', `%${mergedProfile.nationality}%`) + .orderBy('priority', 'desc') + .first(); + + mergedProfile.birthPlace = { + country: country.alpha2, + }; + } + + return mergedProfile; +} + +async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) { + return Promise.map(sources, async (source) => { + // const [scraperSlug, scraper] = source; + const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] })); + + try { + return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => { + if (!scraper) { + logger.warn(`No profile profile scraper available for ${scraperSlug}`); + throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`)); + } + + logger.verbose(`Searching '${actorName}' on ${scraperSlug}`); + + const site = sitesBySlug[scraperSlug] || null; + const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include); + + if (profile) { + logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`); + + return { + ...profile, + name: actorName, + scraper: scraperSlug, + site, + releases: profile.releases?.map(release => (typeof release === 'string' + ? { url: release, site } + : { ...release, site: release.site || site } + )), + }; + } + + logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`); + throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false }); + }), Promise.reject(new Error())); + } catch (error) { + if (error.warn !== false) { + logger.warn(`Error in scraper ${source}: ${error.message}`); + // logger.error(error.stack); + } + } + + return null; + }); +} + +async function scrapeActors(actorNames) { + return Promise.map(actorNames || argv.actors, async (actorName) => { + try { + const actorSlug = slugify(actorName); + const actorEntry = await knex('actors').where({ slug: actorSlug }).first(); + const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); + + const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested + + const [siteEntries, networkEntries] = await Promise.all([ + knex('sites') + .leftJoin('networks', 'sites.network_id', 'networks.id') + .select( + 'sites.*', + 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', + ) + .whereIn('sites.slug', finalSources.flat()), + knex('networks').select('*').whereIn('slug', finalSources.flat()), + ]); + + const sites = await curateSites(siteEntries, true); + const networks = networkEntries.map(network => ({ ...network, isFallback: true })); + const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {}); + + const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug); + const profile = await mergeProfiles(profiles, actorEntry); + + if (profile === null) { + logger.warn(`Could not find profile for actor '${actorName}'`); + + if (argv.save && !actorEntry) { + await storeActor({ name: actorName }, false, false); + } + + return null; + } + + if (argv.inspect) { + console.log(profile); + logger.info(`Found ${profile.releases.length} releases for ${actorName}`); + } + + if (argv.save) { + if (actorEntry && profile) { + await Promise.all([ + updateActor(profile, true, true), + storeAvatars(profile.avatars, actorEntry.id), + ]); + + return profile; + } + + await storeActor(profile, true, true); + } + + return profile; + } catch (error) { + console.log(error); + logger.warn(`${actorName}: ${error}`); + + return null; + } + }, { + concurrency: 3, + }); +} + +async function scrapeBasicActors() { + const basicActors = await knex('actors').where('scraped_at', null); + + return scrapeActors(basicActors.map(actor => actor.name)); +} + +async function associateActors(mappedActors, releases) { + const [existingActorEntries, existingAssociationEntries] = await Promise.all([ + knex('actors') + .whereIn('name', Object.values(mappedActors).map(actor => actor.name)) + .orWhereIn('slug', Object.keys(mappedActors)), + knex('releases_actors').whereIn('release_id', releases.map(release => release.id)), + ]); + + const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => { + try { + const actorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug) + || await storeActor(actor); + + // if a scene + return Array.from(actor.releaseIds) + .map(releaseId => ({ + release_id: releaseId, + actor_id: actorEntry.id, + })) + .filter(association => !existingAssociationEntries + // remove associations already in database + .some(associationEntry => associationEntry.actor_id === association.actor_id + && associationEntry.release_id === association.release_id)); + } catch (error) { + logger.error(actor.name, error); + return null; + } + }); + + await knex('releases_actors').insert(associations.filter(association => association).flat()); + + // basic actor scraping is failure prone, don't run together with actor association + // await scrapebasicactors(), +} + +module.exports = { + associateActors, + fetchActors, + scrapeActors, + scrapeBasicActors, +}; diff --git a/src/actors.js b/src/actors.js index d26c86b8..61d09176 100644 --- a/src/actors.js +++ b/src/actors.js @@ -1,539 +1,26 @@ 'use strict'; -const config = require('config'); -const Promise = require('bluebird'); -const UrlPattern = require('url-pattern'); -const moment = require('moment'); - -const logger = require('./logger')(__filename); -const knex = require('./knex'); -const argv = require('./argv'); -const include = require('./utils/argv-include')(argv); -const scrapers = require('./scrapers/scrapers'); -const whereOr = require('./utils/where-or'); -const resolvePlace = require('./utils/resolve-place'); const slugify = require('./utils/slugify'); -const capitalize = require('./utils/capitalize'); -const { curateSites } = require('./sites'); -const { storeMedia, associateMedia } = require('./media'); -async function curateActor(actor) { - const [aliases, avatar, photos, social] = await Promise.all([ - knex('actors').where({ alias_for: actor.id }), - knex('actors_avatars') - .where('actor_id', actor.id) - .join('media', 'media.id', 'actors_avatars.media_id') - .first(), - knex('actors_photos') - .where('actor_id', actor.id) - .join('media', 'media.id', 'actors_photos.media_id') - .orderBy('index'), - knex('actors_social') - .where('actor_id', actor.id) - .orderBy('platform', 'desc'), - ]); +async function storeReleaseActors(releases) { + const releaseIdsByActor = releases.reduce( + (acc, release) => release.actors.reduce((actorAcc, actor) => { + const releaseActor = actor.name ? actor : { name: actor }; + const actorSlug = slugify(releaseActor.name); - const curatedActor = { - id: actor.id, - gender: actor.gender, - name: actor.name, - description: actor.description, - birthdate: actor.birthdate && new Date(actor.birthdate), - country: actor.country_alpha2, - origin: (actor.birth_city || actor.birth_state || actor.birth_country_alpha2) ? {} : null, - residence: (actor.residence_city || actor.residence_state || actor.residence_country_alpha2) ? {} : null, - ethnicity: actor.ethnicity, - height: actor.height, - weight: actor.weight, - bust: actor.bust, - waist: actor.waist, - hip: actor.hip, - naturalBoobs: actor.natural_boobs, - aliases: aliases.map(({ name }) => name), - slug: actor.slug, - avatar, - photos, - hasTattoos: actor.has_tattoos, - hasPiercings: actor.has_piercings, - tattoos: actor.tattoos, - piercings: actor.piercings, - social, - scrapedAt: actor.scraped_at, - }; - - if (curatedActor.birthdate) { - curatedActor.age = moment().diff(curatedActor.birthdate, 'years'); - } - - if (actor.birth_city) curatedActor.origin.city = actor.birth_city; - if (actor.birth_state) curatedActor.origin.state = actor.birth_state; - - if (actor.birth_country_alpha2) { - curatedActor.origin.country = { - alpha2: actor.birth_country_alpha2, - name: actor.birth_country_name, - alias: actor.birth_country_alias, - }; - } - - if (actor.residence_city) curatedActor.residence.city = actor.residence_city; - if (actor.residence_state) curatedActor.residence.state = actor.residence_state; - - if (actor.residence_country_alpha2) { - curatedActor.residence.country = { - alpha2: actor.residence_country_alpha2, - name: actor.residence_country_name, - alias: actor.residence_country_alias, - }; - } - - return curatedActor; -} - -function curateActors(releases) { - return Promise.all(releases.map(async release => curateActor(release))); -} - -function curateActorEntry(actor, scraped, scrapeSuccess) { - const curatedActor = { - name: capitalize(actor.name), - slug: slugify(actor.name), - birthdate: actor.birthdate, - description: actor.description, - gender: actor.gender, - ethnicity: actor.ethnicity, - bust: actor.bust, - waist: actor.waist, - hip: actor.hip, - natural_boobs: actor.naturalBoobs, - height: actor.height, - weight: actor.weight, - hair: actor.hair, - eyes: actor.eyes, - has_tattoos: actor.hasTattoos, - has_piercings: actor.hasPiercings, - tattoos: actor.tattoos, - piercings: actor.piercings, - }; - - if (actor.id) { - curatedActor.id = actor.id; - } - - if (actor.birthPlace) { - curatedActor.birth_city = actor.birthPlace.city; - curatedActor.birth_state = actor.birthPlace.state; - curatedActor.birth_country_alpha2 = actor.birthPlace.country; - } - - if (actor.residencePlace) { - curatedActor.residence_city = actor.residencePlace.city; - curatedActor.residence_state = actor.residencePlace.state; - curatedActor.residence_country_alpha2 = actor.residencePlace.country; - } - - if (scraped) { - curatedActor.scraped_at = new Date(); - curatedActor.scrape_success = scrapeSuccess; - } - - return curatedActor; -} - -function curateSocialEntry(url, actorId) { - const platforms = [ - // links supplied by PH often look like domain.com/domain.com/username - { - label: 'twitter', - pattern: 'http(s)\\://(*)twitter.com/:username(/)(?*)', - format: username => `https://www.twitter.com/${username}`, - }, - { - label: 'youtube', - pattern: 'http(s)\\://(*)youtube.com/channel/:username(?*)', - format: username => `https://www.youtube.com/channel/${username}`, - }, - { - label: 'instagram', - pattern: 'http(s)\\://(*)instagram.com/:username(/)(?*)', - format: username => `https://www.instagram.com/${username}`, - }, - { - label: 'snapchat', - pattern: 'http(s)\\://(*)snapchat.com/add/:username(/)(?*)', - format: username => `https://www.snapchat.com/add/${username}`, - }, - { - label: 'tumblr', - pattern: 'http(s)\\://:username.tumblr.com(*)', - format: username => `https://${username}.tumblr.com`, - }, - { - label: 'onlyfans', - pattern: 'http(s)\\://(*)onlyfans.com/:username(/)(?*)', - format: username => `https://www.onlyfans.com/${username}`, - }, - { - label: 'fancentro', - pattern: 'http(s)\\://(*)fancentro.com/:username(/)(?*)', - format: username => `https://www.fancentro.com/${username}`, - }, - { - label: 'modelhub', - pattern: 'http(s)\\://(*)modelhub.com/:username(/)(?*)', - format: username => `https://www.modelhub.com/${username}`, - }, - ]; - - const match = platforms.reduce((acc, platform) => { - if (acc) return acc; - - const patternMatch = new UrlPattern(platform.pattern).match(url); - - if (patternMatch) { return { - platform: platform.label, - original: url, - username: patternMatch.username, - url: platform.format ? platform.format(patternMatch.username) : url, + ...actorAcc, + [actorSlug]: actorAcc[actorSlug] + ? actorAcc[actorSlug].concat(release.id) + : [release.id], }; - } + }, acc), + {}, + ); - return null; - }, null) || { url }; - - return { - url: match.url, - platform: match.platform, - actor_id: actorId, - }; -} - -async function curateSocialEntries(urls, actorId) { - if (!urls) { - return []; - } - - const existingSocialLinks = await knex('actors_social').where('actor_id', actorId); - - return urls.reduce((acc, url) => { - const socialEntry = curateSocialEntry(url, actorId); - - if (acc.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase()) || existingSocialLinks.some(entry => socialEntry.url.toLowerCase() === entry.url.toLowerCase())) { - // prevent duplicates - return acc; - } - - return [...acc, socialEntry]; - }, []); -} - -async function fetchActors(queryObject, limit = 100) { - const releases = await knex('actors') - .select( - 'actors.*', - 'birth_countries.alpha2 as birth_country_alpha2', 'birth_countries.name as birth_country_name', 'birth_countries.alias as birth_country_alias', - 'residence_countries.alpha2 as residence_country_alpha2', 'residence_countries.name as residence_country_name', 'residence_countries.alias as residence_country_alias', - ) - .leftJoin('countries as birth_countries', 'actors.birth_country_alpha2', 'birth_countries.alpha2') - .leftJoin('countries as residence_countries', 'actors.residence_country_alpha2', 'residence_countries.alpha2') - .orderBy(['actors.name', 'actors.gender']) - .where(builder => whereOr(queryObject, 'actors', builder)) - .limit(limit); - - return curateActors(releases); -} - -async function storeSocialLinks(urls, actorId) { - const curatedSocialEntries = await curateSocialEntries(urls, actorId); - - await knex('actors_social').insert(curatedSocialEntries); -} - -async function storeAvatars(avatars, actorId) { - if (!avatars || avatars.length === 0) { - return []; - } - - const avatarsBySource = await storeMedia(avatars, 'actor', 'avatar'); - await associateMedia({ [actorId]: avatars }, avatarsBySource, 'actor', 'photo', 'avatar'); - - return avatarsBySource; -} - -async function storeActor(actor, scraped = false, scrapeSuccess = false) { - const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); - - const [actorEntry] = await knex('actors') - .insert(curatedActor) - .returning('*'); - - await storeSocialLinks(actor.social, actorEntry.id); - - if (actor.avatars) { - await storeAvatars(actor.avatars, actorEntry.id); - } - - logger.info(`Added new entry for actor '${actor.name}'`); - - return actorEntry; -} - -async function updateActor(actor, scraped = false, scrapeSuccess = false) { - const curatedActor = curateActorEntry(actor, scraped, scrapeSuccess); - - const [actorEntry] = await knex('actors') - .where({ id: actor.id }) - .update(curatedActor) - .returning('*'); - - await storeSocialLinks(actor.social, actor.id); - - logger.info(`Updated entry for actor '${actor.name}'`); - - return actorEntry; -} - -async function mergeProfiles(profiles, actor) { - if (profiles.filter(Boolean).length === 0) { - return null; - } - - const mergedProfile = profiles.reduce((prevProfile, profile) => { - if (profile === null) { - return prevProfile; - } - - const accProfile = { - id: actor ? actor.id : null, - name: actor ? actor.name : (prevProfile.name || profile.name), - description: prevProfile.description || profile.description, - gender: prevProfile.gender || profile.gender, - birthdate: !prevProfile.birthdate || Number.isNaN(Number(prevProfile.birthdate)) ? profile.birthdate : prevProfile.birthdate, - birthPlace: prevProfile.birthPlace || profile.birthPlace, - residencePlace: prevProfile.residencePlace || profile.residencePlace, - nationality: prevProfile.nationality || profile.nationality, // used to derive country when not available - ethnicity: prevProfile.ethnicity || profile.ethnicity, - bust: prevProfile.bust || (/\d+\w+/.test(profile.bust) ? profile.bust : null), - waist: prevProfile.waist || profile.waist, - hip: prevProfile.hip || profile.hip, - naturalBoobs: prevProfile.naturalBoobs === undefined ? profile.naturalBoobs : prevProfile.naturalBoobs, - height: prevProfile.height || profile.height, - weight: prevProfile.weight || profile.weight, - hair: prevProfile.hair || profile.hair, - eyes: prevProfile.eyes || profile.eyes, - hasPiercings: prevProfile.hasPiercings === undefined ? profile.hasPiercings : prevProfile.hasPiercings, - hasTattoos: prevProfile.hasTattoos === undefined ? profile.hasTattoos : prevProfile.hasTattoos, - piercings: prevProfile.piercings || profile.piercings, - tattoos: prevProfile.tattoos || profile.tattoos, - social: prevProfile.social.concat(profile.social || []), - releases: prevProfile.releases.concat(profile.releases ? profile.releases : []), // don't flatten fallbacks - }; - - if (profile.avatar) { - const avatar = Array.isArray(profile.avatar) - ? profile.avatar.map(avatarX => ({ - src: avatarX.src || avatarX, - scraper: profile.scraper, - copyright: avatarX.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright, - })) - : { - src: profile.avatar.src || profile.avatar, - scraper: profile.scraper, - copyright: profile.avatar.copyright === undefined ? capitalize(profile.site?.name || profile.scraper) : profile.avatar.copyright, - }; - - accProfile.avatars = prevProfile.avatars.concat([avatar]); // don't flatten fallbacks - } else { - accProfile.avatars = prevProfile.avatars; - } - - return accProfile; - }, { - social: [], - avatars: [], - releases: [], - }); - - const [birthPlace, residencePlace] = await Promise.all([ - resolvePlace(mergedProfile.birthPlace), - resolvePlace(mergedProfile.residencePlace), - ]); - - mergedProfile.birthPlace = birthPlace; - mergedProfile.residencePlace = residencePlace; - - if (!mergedProfile.birthPlace && mergedProfile.nationality) { - const country = await knex('countries') - .where('nationality', 'ilike', `%${mergedProfile.nationality}%`) - .orderBy('priority', 'desc') - .first(); - - mergedProfile.birthPlace = { - country: country.alpha2, - }; - } - - return mergedProfile; -} - -async function scrapeProfiles(sources, actorName, actorEntry, sitesBySlug) { - return Promise.map(sources, async (source) => { - // const [scraperSlug, scraper] = source; - const profileScrapers = [].concat(source).map(slug => ({ scraperSlug: slug, scraper: scrapers.actors[slug] })); - - try { - return await profileScrapers.reduce(async (outcome, { scraper, scraperSlug }) => outcome.catch(async () => { - if (!scraper) { - logger.warn(`No profile profile scraper available for ${scraperSlug}`); - throw Object.assign(new Error(`No profile scraper available for ${scraperSlug}`)); - } - - logger.verbose(`Searching '${actorName}' on ${scraperSlug}`); - - const site = sitesBySlug[scraperSlug] || null; - const profile = await scraper.fetchProfile(actorEntry ? actorEntry.name : actorName, scraperSlug, site, include); - - if (profile) { - logger.verbose(`Found profile for '${actorName}' on ${scraperSlug}`); - - return { - ...profile, - name: actorName, - scraper: scraperSlug, - site, - releases: profile.releases?.map(release => (typeof release === 'string' - ? { url: release, site } - : { ...release, site: release.site || site } - )), - }; - } - - logger.verbose(`No profile for '${actorName}' available on ${scraperSlug}`); - throw Object.assign(new Error(`Profile for ${actorName} not available on ${scraperSlug}`), { warn: false }); - }), Promise.reject(new Error())); - } catch (error) { - if (error.warn !== false) { - logger.warn(`Error in scraper ${source}: ${error.message}`); - // logger.error(error.stack); - } - } - - return null; - }); -} - -async function scrapeActors(actorNames) { - return Promise.map(actorNames || argv.actors, async (actorName) => { - try { - const actorSlug = slugify(actorName); - const actorEntry = await knex('actors').where({ slug: actorSlug }).first(); - const sources = argv.sources || config.profiles || Object.keys(scrapers.actors); - - const finalSources = argv.withReleases ? sources.flat() : sources; // ignore race-to-success grouping when scenes are requested - - const [siteEntries, networkEntries] = await Promise.all([ - knex('sites') - .leftJoin('networks', 'sites.network_id', 'networks.id') - .select( - 'sites.*', - 'networks.name as network_name', 'networks.slug as network_slug', 'networks.url as network_url', 'networks.description as network_description', 'networks.parameters as network_parameters', - ) - .whereIn('sites.slug', finalSources.flat()), - knex('networks').select('*').whereIn('slug', finalSources.flat()), - ]); - - const sites = await curateSites(siteEntries, true); - const networks = networkEntries.map(network => ({ ...network, isFallback: true })); - const sitesBySlug = [].concat(networks, sites).reduce((acc, site) => ({ ...acc, [site.slug]: site }), {}); - - const profiles = await scrapeProfiles(sources, actorName, actorEntry, sitesBySlug); - const profile = await mergeProfiles(profiles, actorEntry); - - if (profile === null) { - logger.warn(`Could not find profile for actor '${actorName}'`); - - if (argv.save && !actorEntry) { - await storeActor({ name: actorName }, false, false); - } - - return null; - } - - if (argv.inspect) { - console.log(profile); - logger.info(`Found ${profile.releases.length} releases for ${actorName}`); - } - - if (argv.save) { - if (actorEntry && profile) { - await Promise.all([ - updateActor(profile, true, true), - storeAvatars(profile.avatars, actorEntry.id), - ]); - - return profile; - } - - await storeActor(profile, true, true); - } - - return profile; - } catch (error) { - console.log(error); - logger.warn(`${actorName}: ${error}`); - - return null; - } - }, { - concurrency: 3, - }); -} - -async function scrapeBasicActors() { - const basicActors = await knex('actors').where('scraped_at', null); - - return scrapeActors(basicActors.map(actor => actor.name)); -} - -async function associateActors(mappedActors, releases) { - const [existingActorEntries, existingAssociationEntries] = await Promise.all([ - knex('actors') - .whereIn('name', Object.values(mappedActors).map(actor => actor.name)) - .orWhereIn('slug', Object.keys(mappedActors)), - knex('releases_actors').whereIn('release_id', releases.map(release => release.id)), - ]); - - const associations = await Promise.map(Object.entries(mappedActors), async ([actorSlug, actor]) => { - try { - const actorEntry = existingActorEntries.find(actorX => actorX.slug === actorSlug) - || await storeActor(actor); - - // if a scene - return Array.from(actor.releaseIds) - .map(releaseId => ({ - release_id: releaseId, - actor_id: actorEntry.id, - })) - .filter(association => !existingAssociationEntries - // remove associations already in database - .some(associationEntry => associationEntry.actor_id === association.actor_id - && associationEntry.release_id === association.release_id)); - } catch (error) { - logger.error(actor.name, error); - return null; - } - }); - - await knex('releases_actors').insert(associations.filter(association => association).flat()); - - // basic actor scraping is failure prone, don't run together with actor association - // await scrapebasicactors(), + console.log(releaseIdsByActor); } module.exports = { - associateActors, - fetchActors, - scrapeActors, - scrapeBasicActors, + storeReleaseActors, }; diff --git a/src/app.js b/src/app.js index 99f57d58..a393df29 100644 --- a/src/app.js +++ b/src/app.js @@ -5,7 +5,10 @@ const argv = require('./argv'); const initServer = require('./web/server'); const knex = require('./knex'); -const fetchUpdates = require('./fetch-updates'); +const fetchUpdates = require('./updates'); +const fetchDeep = require('./deep'); +const { storeReleases } = require('./store-releases'); +// const { storeReleaseActors } = require('./actors'); async function init() { if (argv.server) { @@ -13,7 +16,15 @@ async function init() { return; } - await fetchUpdates(); + const updateBaseReleases = (argv.scrape || argv.sites || argv.networks) && await fetchUpdates(); + + const updateDeepReleases = updateBaseReleases && await fetchDeep(updateBaseReleases); + const argvDeepReleases = argv.scenes && await fetchDeep(argv.scenes); + + await storeReleases([...(updateDeepReleases || []), ...(argvDeepReleases || [])]); + + // await storeReleaseActors(updateReleases); + knex.destroy(); } diff --git a/src/deep.js b/src/deep.js new file mode 100644 index 00000000..7d3044ce --- /dev/null +++ b/src/deep.js @@ -0,0 +1,145 @@ +'use strict'; + +const argv = require('./argv'); +const logger = require('./logger')(__filename); +const knex = require('./knex'); +const scrapers = require('./scrapers/scrapers'); +const { curateSites } = require('./sites'); +const { curateNetworks } = require('./networks'); + +function urlToSiteSlug(url) { + try { + const slug = new URL(url) + .hostname + .match(/([\w-]+)\.\w+$/)?.[1]; + + return slug; + } catch (error) { + logger.warn(`Failed to derive site slug from '${url}': ${error.message}`); + + return null; + } +} + +async function findSites(baseReleases) { + const baseReleasesWithoutSite = baseReleases.filter(release => release.url && !release.site); + + const siteSlugs = Array.from(new Set( + baseReleasesWithoutSite + .map(baseRelease => urlToSiteSlug(baseRelease.url)) + .filter(Boolean), + )); + + const siteEntries = await knex('sites').whereIn('slug', siteSlugs); + const networkEntries = await knex('networks').whereIn('slug', siteSlugs); + + const sites = await curateSites(siteEntries, true, false); + const networks = await curateNetworks(networkEntries, true, false, false); + const markedNetworks = networks.map(network => ({ ...network, isFallback: true })); + + const sitesBySlug = [] + .concat(sites, markedNetworks) + .reduce((accSites, site) => ({ ...accSites, [site.slug]: site }), {}); + + return sitesBySlug; +} + +function toBaseReleases(baseReleasesOrUrls) { + return baseReleasesOrUrls + .map((baseReleaseOrUrl) => { + if (baseReleaseOrUrl.url) { + // base release with URL + return { + ...baseReleaseOrUrl, + deep: false, + }; + } + + if (/^http/.test(baseReleaseOrUrl)) { + // URL + return { + url: baseReleaseOrUrl, + deep: false, + }; + } + + if (typeof baseReleaseOrUrl === 'object' && !Array.isArray(baseReleaseOrUrl)) { + // base release without URL, prepare for passthrough + return { + ...baseReleaseOrUrl, + deep: false, + }; + } + + logger.warn(`Malformed base release, discarding '${baseReleaseOrUrl}'`); + return null; + }) + .filter(Boolean); +} + +async function scrapeRelease(baseRelease, sites, type = 'scene') { + const site = baseRelease.site || sites[urlToSiteSlug(baseRelease.url)]; + + if (!site) { + logger.warn(`No site available for ${baseRelease.url}`); + return baseRelease; + } + + if ((!baseRelease.url && !baseRelease.path) || !argv.deep) { + return { + ...baseRelease, + site, + }; + } + + const scraper = scrapers.releases[site.slug]; + + if (!scraper) { + logger.warn(`Could not find scraper for ${baseRelease.url}`); + return baseRelease; + } + + if ((type === 'scene' && !scraper.fetchScene) || (type === 'movie' && !scraper.fetchMovie)) { + logger.warn(`The '${site.name}'-scraper cannot fetch individual ${type}s`); + return baseRelease; + } + + try { + const scrapedRelease = type === 'scene' + ? await scraper.fetchScene(baseRelease.url, site, baseRelease) + : await scraper.fetchMovie(baseRelease.url, site, baseRelease); + + const mergedRelease = { + ...baseRelease, + ...scrapedRelease, + deep: !!scrapedRelease, + site, + }; + + if (scrapedRelease && baseRelease?.tags) { + mergedRelease.tags = baseRelease.tags.concat(scrapedRelease.tags); + } + + console.log(mergedRelease); + + return mergedRelease; + } catch (error) { + logger.error(`Deep scrape failed for ${baseRelease.url}: ${error.message}`); + return baseRelease; + } +} + +async function scrapeReleases(baseReleases, sites) { + return Promise.all(baseReleases.map(baseRelease => scrapeRelease(baseRelease, sites))); +} + +async function fetchReleases(baseReleasesOrUrls) { + const baseReleases = toBaseReleases(baseReleasesOrUrls); + const sites = await findSites(baseReleases); + + const deepReleases = await scrapeReleases(baseReleases, sites); + + return deepReleases; +} + +module.exports = fetchReleases; diff --git a/src/networks.js b/src/networks.js index d0fdd643..983bfac2 100644 --- a/src/networks.js +++ b/src/networks.js @@ -4,29 +4,33 @@ const knex = require('./knex'); const whereOr = require('./utils/where-or'); const { fetchSites } = require('./sites'); -async function curateNetwork(network, includeParameters = false) { - const [sites, studios] = await Promise.all([ - fetchSites({ network_id: network.id }), - knex('studios') - .where({ network_id: network.id }), - ]); - - return { +async function curateNetwork(network, includeParameters = false, includeSites = true, includeStudios = false) { + const curatedNetwork = { id: network.id, name: network.name, url: network.url, description: network.description, slug: network.slug, - sites, parameters: includeParameters ? network.parameters : null, - studios: studios.map(studio => ({ + }; + + if (includeSites) { + curatedNetwork.sites = await fetchSites({ network_id: network.id }); + } + + if (includeStudios) { + const studios = await knex('studios').where({ network_id: network.id }); + + curatedNetwork.studios = studios.map(studio => ({ id: studio.id, name: studio.name, url: studio.url, description: studio.description, slug: studio.slug, - })), - }; + })); + } + + return curatedNetwork; } function curateNetworks(releases) { @@ -69,6 +73,8 @@ async function fetchNetworksFromReleases() { } module.exports = { + curateNetwork, + curateNetworks, fetchNetworks, fetchNetworksFromReleases, findNetworkByUrl, diff --git a/src/releases.js b/src/releases.js index e4054561..ae5dedb6 100644 --- a/src/releases.js +++ b/src/releases.js @@ -15,7 +15,7 @@ const { storeMedia, associateMedia, } = require('./media'); -const { fetchSites, findSiteByUrl } = require('./sites'); +const { fetchSites } = require('./sites'); const slugify = require('./utils/slugify'); const capitalize = require('./utils/capitalize'); @@ -174,16 +174,7 @@ async function attachChannelSite(release) { }; } - try { - const urlSite = await findSiteByUrl(release.channel.url || release.channel); - - return { - ...release, - site: urlSite, - }; - } catch (error) { - throw new Error(`Unable to derive channel site from generic URL: ${release.url}`); - } + throw new Error(`Unable to match channel '${release.channel.slug || release.channel}' from generic URL: ${release.url}`); } async function attachStudio(release) { diff --git a/src/scrapers/bamvisions.js b/src/scrapers/bamvisions.js index a2d05f95..116073f0 100644 --- a/src/scrapers/bamvisions.js +++ b/src/scrapers/bamvisions.js @@ -90,7 +90,7 @@ async function scrapeProfile({ qu }, site, withScenes) { const bio = qu.all('.stats li', true).reduce((acc, row) => { const [key, value] = row.split(':'); - return { ...acc, [slugify(key, { delimiter: '_' })]: value.trim() }; + return { ...acc, [slugify(key, '_')]: value.trim() }; }, {}); if (bio.height) profile.height = feetInchesToCm(bio.height); @@ -133,7 +133,7 @@ async function fetchScene(url, site) { } async function fetchProfile(actorName, scraperSlug, site, include) { - const actorSlugA = slugify(actorName, { delimiter: '' }); + const actorSlugA = slugify(actorName, ''); const actorSlugB = slugify(actorName); const resA = await get(`${site.url}/models/${actorSlugA}.html`); diff --git a/src/scrapers/brazzers.js b/src/scrapers/brazzers.js index c2006874..36a24fbf 100644 --- a/src/scrapers/brazzers.js +++ b/src/scrapers/brazzers.js @@ -43,7 +43,7 @@ function scrapeAll(html, site, upcoming) { const poster = `https:${$(element).find('.card-main-img').attr('data-src')}`; const photos = $(element).find('.card-overlay .image-under').map((photoIndex, photoElement) => `https:${$(photoElement).attr('data-src')}`).toArray(); - const channel = slugify($(element).find('.collection').attr('title'), { delimiter: '' }); + const channel = slugify($(element).find('.collection').attr('title'), ''); return acc.concat({ url, diff --git a/src/scrapers/cherrypimps.js b/src/scrapers/cherrypimps.js index f0950349..7f573957 100644 --- a/src/scrapers/cherrypimps.js +++ b/src/scrapers/cherrypimps.js @@ -61,7 +61,7 @@ function scrapeProfile({ q, qa, qtx }) { const keys = qa('.model-descr_line:not(.model-descr_rait) p.text span', true); const values = qa('.model-descr_line:not(.model-descr_rait) p.text').map(el => qtx(el)); - const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {}); + const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {}); if (bio.height) profile.height = Number(bio.height.match(/\((\d+)cm\)/)[1]); if (bio.weight) profile.weight = Number(bio.weight.match(/\((\d+)kg\)/)[1]); @@ -122,7 +122,7 @@ async function fetchScene(url, site, release) { async function fetchProfile(actorName, scraperSlug) { const actorSlug = slugify(actorName); - const actorSlug2 = slugify(actorName, { delimiter: '' }); + const actorSlug2 = slugify(actorName, ''); const [url, url2] = ['cherrypimps', 'wildoncam'].includes(scraperSlug) ? [`https://${scraperSlug}.com/models/${actorSlug}.html`, `https://${scraperSlug}.com/models/${actorSlug2}.html`] diff --git a/src/scrapers/ddfnetwork.js b/src/scrapers/ddfnetwork.js index 8bda6e8e..590d7bf8 100644 --- a/src/scrapers/ddfnetwork.js +++ b/src/scrapers/ddfnetwork.js @@ -74,7 +74,7 @@ async function fetchActorReleases(urls) { async function scrapeProfile(html, _url, actorName) { const { qu } = ex(html); - const keys = qu.all('.about-title', true).map(key => slugify(key, { delimiter: '_' })); + const keys = qu.all('.about-title', true).map(key => slugify(key, '_')); const values = qu.all('.about-info').map((el) => { if (el.children.length > 0) { return Array.from(el.children, child => child.textContent.trim()).join(', '); diff --git a/src/scrapers/fullpornnetwork.js b/src/scrapers/fullpornnetwork.js index d818adbe..2b8521b8 100644 --- a/src/scrapers/fullpornnetwork.js +++ b/src/scrapers/fullpornnetwork.js @@ -79,7 +79,7 @@ async function fetchScene(url, site) { } async function fetchProfile(actorName, scraperSlug) { - const actorSlug = slugify(actorName, { delimiter: '' }); + const actorSlug = slugify(actorName, ''); const url = scraperSlug === 'povperverts' ? `https://povperverts.net/models/${actorSlug}.html` : `https://${scraperSlug}.com/models/${actorSlug}.html`; diff --git a/src/scrapers/gamma.js b/src/scrapers/gamma.js index b54cddad..44fd3b31 100644 --- a/src/scrapers/gamma.js +++ b/src/scrapers/gamma.js @@ -233,7 +233,7 @@ async function scrapeScene(html, url, site, baseRelease, mobileHtml) { release.tags = hasTrans ? [...rawTags, 'transsexual'] : rawTags; const channel = data?.productionCompany?.name || $('.studioLink a, .siteLink a').attr('title')?.trim() || $('.siteNameSpan').text()?.trim().toLowerCase().replace('.com', ''); - if (channel) release.channel = slugify(channel, { delimiter: '' }); + if (channel) release.channel = slugify(channel, ''); if (videoData.picPreview && new URL(videoData.picPreview).pathname.length > 1) release.poster = videoData.picPreview; // sometimes links to just https://images02-fame.gammacdn.com/ diff --git a/src/scrapers/hush.js b/src/scrapers/hush.js index e013986f..7cfa8c46 100644 --- a/src/scrapers/hush.js +++ b/src/scrapers/hush.js @@ -193,7 +193,7 @@ function scrapeSceneT1({ html, qu }, site, url, baseRelease, channelRegExp) { if (channel) { release.channel = { force: true, - slug: slugify(channel, { delimiter: '' }), + slug: slugify(channel, ''), }; } } @@ -239,7 +239,7 @@ function scrapeProfile({ el, qu }, site) { return { ...acc, - [slugify(key, { delimiter: '_' })]: value.trim(), + [slugify(key, '_')]: value.trim(), }; }, {}); @@ -272,7 +272,7 @@ function scrapeProfileT1({ el, qu }, site) { return { ...acc, - [slugify(key, { delimiter: '_' })]: value.trim(), + [slugify(key, '_')]: value.trim(), }; }, {}); @@ -308,7 +308,7 @@ function scrapeProfileTour({ el, qu }, site) { return { ...acc, - [slugify(key, { delimiter: '_' })]: value.trim(), + [slugify(key, '_')]: value.trim(), }; }, {}); @@ -382,7 +382,7 @@ async function fetchScene(url, site, baseRelease, beforeFetchLatest) { } async function fetchProfile(actorName, scraperSlug, site) { - const actorSlugA = slugify(actorName, { delimiter: '' }); + const actorSlugA = slugify(actorName, ''); const actorSlugB = slugify(actorName); const t1 = site.parameters?.t1 ? 't1/' : ''; diff --git a/src/scrapers/julesjordan.js b/src/scrapers/julesjordan.js index 2c7dee78..162ccefc 100644 --- a/src/scrapers/julesjordan.js +++ b/src/scrapers/julesjordan.js @@ -384,8 +384,8 @@ async function fetchMovie(url, site) { } async function fetchProfile(actorName) { - const actorSlugA = slugify(actorName, { delimiter: '-' }); - const actorSlugB = slugify(actorName, { delimiter: '' }); + const actorSlugA = slugify(actorName, '-'); + const actorSlugB = slugify(actorName, ''); const urlA = `https://julesjordan.com/trial/models/${actorSlugA}.html`; const urlB = `https://julesjordan.com/trial/models/${actorSlugB}.html`; diff --git a/src/scrapers/mindgeek.js b/src/scrapers/mindgeek.js index bc515f2f..1d74499e 100644 --- a/src/scrapers/mindgeek.js +++ b/src/scrapers/mindgeek.js @@ -98,7 +98,7 @@ function scrapeScene(data, url, _site, networkName) { } const siteName = data.collections[0]?.name || data.brand; - release.channel = slugify(siteName, { delimiter: '' }); + release.channel = slugify(siteName, ''); release.url = url || `https://www.${networkName || data.brand}.com/scene/${entryId}/`; diff --git a/src/scrapers/nubiles.js b/src/scrapers/nubiles.js index c7208609..a2c4a77c 100644 --- a/src/scrapers/nubiles.js +++ b/src/scrapers/nubiles.js @@ -94,7 +94,7 @@ function scrapeProfile({ qu }, _actorName, origin) { const keys = qu.all('.model-profile h5', true); const values = qu.all('.model-profile h5 + p', true); - const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, { delimiter: '_' })]: values[index] }), {}); + const bio = keys.reduce((acc, key, index) => ({ ...acc, [slugify(key, '_')]: values[index] }), {}); profile.age = Number(bio.age); profile.description = qu.q('.model-bio', true); diff --git a/src/scrapers/private.js b/src/scrapers/private.js index c80073b3..aa8e7bd0 100644 --- a/src/scrapers/private.js +++ b/src/scrapers/private.js @@ -95,7 +95,7 @@ async function scrapeScene(html, url, site) { release.movie = $('a[data-track="FULL MOVIE"]').attr('href'); const siteElement = $('.content-wrapper .logos-sites a'); - if (siteElement) release.channel = slugify(siteElement.text(), { delimiter: '' }); + if (siteElement) release.channel = slugify(siteElement.text(), ''); return release; } @@ -108,7 +108,7 @@ function scrapeProfile({ html, q, qa, qtx }) { const trimmedValue = value.trim(); if (trimmedValue.length === 0 || trimmedValue === '-') return acc; - return { ...acc, [slugify(key, { delimiter: '_' })]: trimmedValue }; + return { ...acc, [slugify(key, '_')]: trimmedValue }; }, {}); const description = q('.model-facts-long', true); @@ -176,7 +176,7 @@ async function fetchScene(url, site) { } async function fetchProfile(actorName) { - const actorSearchSlug = slugify(actorName, { delimiter: '+' }); + const actorSearchSlug = slugify(actorName, '+'); const url = `https://www.private.com/search.php?query=${actorSearchSlug}`; const modelRes = await geta(url, '.model h3 a'); diff --git a/src/scrapers/score.js b/src/scrapers/score.js index b6be7bdd..509254a3 100644 --- a/src/scrapers/score.js +++ b/src/scrapers/score.js @@ -155,7 +155,7 @@ async function scrapeProfile(html, actorUrl, withReleases) { const bio = qa('.stat').reduce((acc, el) => { const prop = q(el, '.label', true).slice(0, -1); - const key = slugify(prop, { delimiter: '_' }); + const key = slugify(prop, '_'); const value = q(el, '.value', true); return { diff --git a/src/sites.js b/src/sites.js index 5854edff..bbe3f6e0 100644 --- a/src/sites.js +++ b/src/sites.js @@ -7,19 +7,13 @@ const argv = require('./argv'); const knex = require('./knex'); const whereOr = require('./utils/where-or'); -async function curateSite(site, includeParameters = false) { - const tags = await knex('sites_tags') - .select('tags.*', 'sites_tags.inherit') - .where('site_id', site.id) - .join('tags', 'tags.id', 'sites_tags.tag_id'); - - return { +async function curateSite(site, includeParameters = false, includeTags = true) { + const curatedSite = { id: site.id, name: site.name, url: site.url, description: site.description, slug: site.slug, - tags, independent: !!site.parameters && site.parameters.independent, parameters: includeParameters ? site.parameters : null, network: { @@ -31,6 +25,15 @@ async function curateSite(site, includeParameters = false) { parameters: includeParameters ? site.network_parameters : null, }, }; + + if (includeTags) { + curatedSite.tags = await knex('sites_tags') + .select('tags.*', 'sites_tags.inherit') + .where('site_id', site.id) + .join('tags', 'tags.id', 'sites_tags.tag_id'); + } + + return curatedSite; } function curateSites(sites, includeParameters) { @@ -78,7 +81,7 @@ async function findSiteByUrl(url) { .first(); if (site) { - const curatedSite = curateSite(site, true); + const curatedSite = curateSite(site, true, false); return curatedSite; } @@ -182,6 +185,7 @@ async function fetchSitesFromReleases() { } module.exports = { + curateSite, curateSites, fetchIncludedSites, fetchSites, diff --git a/src/store-releases.js b/src/store-releases.js new file mode 100644 index 00000000..e3f526f5 --- /dev/null +++ b/src/store-releases.js @@ -0,0 +1,71 @@ +'use strict'; + +const config = require('config'); + +const knex = require('./knex'); +const slugify = require('./utils/slugify'); + +function curateReleaseEntry(release, batchId, existingRelease) { + const slug = slugify(release.title, '-', { + encode: true, + limit: config.titleSlugLength, + }); + + const curatedRelease = { + title: release.title, + entry_id: release.entryId || null, + site_id: release.site.id, + shoot_id: release.shootId || null, + studio_id: release.studio?.id || null, + url: release.url, + date: release.date, + slug, + description: release.description, + duration: release.duration, + type: release.type, + // director: release.director, + // likes: release.rating && release.rating.likes, + // dislikes: release.rating && release.rating.dislikes, + // rating: release.rating && release.rating.stars && Math.floor(release.rating.stars), + deep: typeof release.deep === 'boolean' ? release.deep : false, + deep_url: release.deepUrl, + updated_batch_id: batchId, + }; + + if (!existingRelease) { + curatedRelease.created_batch_id = batchId; + } + + return curatedRelease; +} + +async function attachSite(releases) { + const releasesWithoutSite = releases.filter(release => !release.site || release.site.isFallback); + + // console.log(releases, releasesWithoutSite); +} + +async function extractUniqueReleases(releases) { + const duplicateReleaseEntries = await knex('releases') + .whereIn(['entry_id', 'site_id'], releases.map(release => [release.entryId, release.site.id])); + + const duplicateReleaseEntryKeys = new Set(duplicateReleaseEntries.map(releaseEntry => `${releaseEntry.site_id}_${releaseEntry.entry_id}`)); + const uniqueReleases = releases.filter(release => !duplicateReleaseEntryKeys.has(`${release.site.id}_${release.entryId}`)); + + return uniqueReleases; +} + +async function storeReleases(releases) { + const [batchId] = await knex('batches').insert({ comment: null }).returning('id'); + + const uniqueReleases = await extractUniqueReleases(releases); + const releasesWithSites = await attachSite(releases); + + const curatedReleaseEntries = uniqueReleases.slice(0, 2).map(release => curateReleaseEntry(release, batchId)); + + await knex('releases').insert(curatedReleaseEntries); +} + +module.exports = { + storeReleases, +}; diff --git a/src/fetch-updates.js b/src/updates.js similarity index 96% rename from src/fetch-updates.js rename to src/updates.js index bed6ff6c..c7b0b1e8 100644 --- a/src/fetch-updates.js +++ b/src/updates.js @@ -83,7 +83,10 @@ async function scrapeLatestReleases(scraper, site, preData) { const latestReleasesWithSite = latestReleases.map(release => ({ ...release, site: release.site || site })); // attach site release is assigned to when stored const oldestReleaseOnPage = latestReleases.sort((releaseA, releaseB) => releaseB.date - releaseA.date).slice(-1)[0]; - const uniqueReleases = await extractUniqueReleases(latestReleasesWithSite, accReleases); + const uniqueReleases = argv.redownload + ? latestReleasesWithSite + : await extractUniqueReleases(latestReleasesWithSite, accReleases); + const pageAccReleases = accReleases.concat(uniqueReleases); logger.verbose(`Scraped '${site.name}' (${site.network.name}) page ${page}, found ${uniqueReleases.length} unique releases`); @@ -204,7 +207,9 @@ async function fetchUpdates() { { concurrency: 5 }, ); - return scrapedNetworks; + const releases = scrapedNetworks.flat(2); + + return releases; } module.exports = fetchUpdates; diff --git a/src/utils/slugify.js b/src/utils/slugify.js index 8bf237f3..f14371a3 100644 --- a/src/utils/slugify.js +++ b/src/utils/slugify.js @@ -1,13 +1,14 @@ 'use strict'; -function slugify(string, { +function slugify(string, delimiter = '-', { encode = false, - delimiter = '-', limit = 1000, } = {}) { const slugComponents = string.trim().toLowerCase().match(/\w+/g); - if (!slugComponents) return ''; + if (!slugComponents) { + return ''; + } const slug = slugComponents.reduce((acc, component, index) => { const accSlug = `${acc}${index > 0 ? delimiter : ''}${component}`;