diff --git a/seeds/01_networks.js b/seeds/01_networks.js index b4cc9df8..e9ead215 100644 --- a/seeds/01_networks.js +++ b/seeds/01_networks.js @@ -112,6 +112,12 @@ const networks = [ url: 'https://www.babes.com', parent: 'mindgeek', }, + { + slug: 'babepedia', + name: 'Babepedia', + url: 'https://www.babepedia.com', + type: 'info', + }, { slug: 'badoink', name: 'BaDoink', diff --git a/src/actors.js b/src/actors.js index 3f5146cc..56e4cb12 100644 --- a/src/actors.js +++ b/src/actors.js @@ -283,7 +283,7 @@ function curateActorEntries(baseActors, batchId) { function curateProfileEntry(profile) { if (!profile.id) { return null; - } + } const curatedProfileEntry = { ...(profile.update !== false && { id: profile.update }), @@ -322,6 +322,8 @@ function curateProfileEntry(profile) { avatar_media_id: profile.avatarMediaId || null, }; + if (profile.update) curatedProfileEntry.updated_at = new Date().toDateString(); + return curatedProfileEntry; } @@ -453,7 +455,7 @@ async function curateProfile(profile, actor) { if (profile.ethnicity && !curatedProfile.ethnicity) logger.warn(`Unrecognized ethnicity returned by '${profile.entity.name}' scraper: ${profile.ethnicity}`); if ((profile.hairColor || profile.hair) && !curatedProfile.hairColor) logger.warn(`Unrecognized hair color returned by '${profile.entity.name}' scraper: ${profile.hairColor || profile.hair}`); if (profile.eyes && !curatedProfile.eyes) logger.warn(`Unrecognized eye color returned by '${profile.entity.name}' scraper: ${profile.eyes}`); - + return curatedProfile; } catch (error) { logger.error(`Failed to curate '${profile.name}': ${error.message}`); @@ -691,7 +693,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy } logger.verbose(`Found profile for '${actor.name}' on '${label}'`); - + return await curateProfile({ ...actor, ...profile, @@ -727,14 +729,15 @@ async function getActorNames(actorNames) { } const actorsWithoutProfiles = await knex.raw(` - SELECT actors.name - FROM 
actors - WHERE NOT EXISTS ( - SELECT * - FROM actors_profiles - WHERE actors_profiles.actor_id = actors.id - AND actors_profiles.updated_at <= (?) - ) + SELECT actors.name + FROM actors + WHERE NOT EXISTS ( + SELECT * + FROM actors_profiles + WHERE actors_profiles.actor_id = actors.id + AND actors_profiles.updated_at >= (?) + ) AND alias_for IS NULL + ORDER BY actors.name `, [argv.actorsUpdate || new Date()]); return actorsWithoutProfiles.rows.map(actor => actor.name); @@ -750,9 +753,27 @@ async function storeProfiles(profiles) { async function scrapeActors(argNames) { const actorNames = await getActorNames(argNames); + const profiles = []; + + const batchSize = argv.actorsBatch; + logger.info(`Scraping profiles for ${actorNames.length} actors`); + + if (batchSize > 0) { + for (let i=0; i < actorNames.length; i=i+batchSize) { + logger.info(`Scraping profiles ${((i/actorNames.length)*100).toFixed(2)}%`); + profiles.push.apply(profiles, await scrapeActorsBatch(actorNames.slice(i, i + batchSize))); + } + } else { + profiles.push.apply(profiles, await scrapeActorsBatch(actorNames)); + } + + return profiles; +} + +async function scrapeActorsBatch(actorNames) { const baseActors = toBaseActors(actorNames); - logger.info(`Scraping profiles for ${actorNames.length} actors`); + logger.info(`Actors: ${actorNames.join(', ')}`); const sources = argv.profileSources || config.profiles || Object.keys(scrapers.actors); const entitySlugs = sources.flat(); @@ -760,7 +781,7 @@ async function scrapeActors(argNames) { const [entitiesBySlug, existingActorEntries] = await Promise.all([ fetchEntitiesBySlug(entitySlugs, 'desc'), knex('actors') - .select(['id', 'name', 'slug', 'entry_id']) + .select(['id', 'name', 'slug', 'entry_id', 'gender']) .whereIn('slug', baseActors.map(baseActor => baseActor.slug)) .whereNull('alias_for'), ]); @@ -820,132 +841,132 @@ async function scrapeActors(argNames) { async function getOrCreateActors(baseActors, batchId) { // WHERE IN causes stack depth error and 
performance issues with a large amount of values, no knex VALUES helper available - const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { - slug: actor.slug, - entityId: actor.entity.id, - entryId: actor.entryId, - collisionLikely: getCollisionLikely(actor), - })).join(', '); +const actorValues = baseActors.map(actor => knex.raw('(:slug, :entityId, :entryId, :collisionLikely)', { + slug: actor.slug, + entityId: actor.entity.id, + entryId: actor.entryId, + collisionLikely: getCollisionLikely(actor), +})).join(', '); - const existingActors = await knex - .select('actors.*') - .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) - .whereRaw(` - actors.slug = base_actors.slug - AND actors.entity_id IS NULL - AND NOT base_actors.collision_likely - `) - .orWhereRaw(` - actors.slug = base_actors.slug - AND actors.entity_id = base_actors.entity_id - AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL) - OR actors.entry_id = base_actors.entry_id) - `); +const existingActors = await knex + .select('actors.*') + .from(knex.raw(`actors, (VALUES ${actorValues}) AS base_actors (slug, entity_id, entry_id, collision_likely)`)) + .whereRaw(` + actors.slug = base_actors.slug + AND actors.entity_id IS NULL + AND NOT base_actors.collision_likely + `) + .orWhereRaw(` + actors.slug = base_actors.slug + AND actors.entity_id = base_actors.entity_id + AND ((actors.entry_id IS NULL AND base_actors.entry_id IS NULL) + OR actors.entry_id = base_actors.entry_id) + `); - // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); - const existingActorSlugs = existingActors.reduce((acc, actor) => ({ - ...acc, - [actor.entity_id]: { - ...acc[actor.entity_id], - [actor.entry_id]: { - ...acc[actor.entity_id]?.[actor.entry_id], - [actor.slug]: true, - }, +// const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); +const existingActorSlugs = 
existingActors.reduce((acc, actor) => ({ + ...acc, + [actor.entity_id]: { + ...acc[actor.entity_id], + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: true, }, - }), {}); + }, +}), {}); - const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); - const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); +const uniqueBaseActors = baseActors.filter(baseActor => !existingActorSlugs[baseActor.entity.id]?.[baseActor.entryId]?.[baseActor.slug] && !existingActorSlugs.null?.null?.[baseActor.slug]); +const curatedActorEntries = curateActorEntries(uniqueBaseActors, batchId); - const newActors = await bulkInsert('actors', curatedActorEntries); +const newActors = await bulkInsert('actors', curatedActorEntries); - const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ - ...acc, - [actor.entity_id]: { - ...acc[actor.entity_id], - [actor.entry_id]: { - ...acc[actor.entity_id]?.[actor.entry_id], - [actor.slug]: actor.id, - }, +const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ + ...acc, + [actor.entity_id]: { + ...acc[actor.entity_id], + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: actor.id, }, - }), {}); + }, +}), {}); - const newActorProfiles = await Promise.all(baseActors - .filter(actor => actor.hasProfile) - .map(actor => ({ - ...actor, - id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], - })) - .filter(actor => !!actor.id) - .map(actor => curateProfile(actor))); +const newActorProfiles = await Promise.all(baseActors + .filter(actor => actor.hasProfile) + .map(actor => ({ + ...actor, + id: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || 
newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], + })) + .filter(actor => !!actor.id) + .map(actor => curateProfile(actor))); - await storeProfiles(newActorProfiles); +await storeProfiles(newActorProfiles); - if (Array.isArray(newActors)) { - return newActors.concat(existingActors); - } +if (Array.isArray(newActors)) { + return newActors.concat(existingActors); +} - return existingActors; +return existingActors; } async function associateActors(releases, batchId) { - const baseActorsByReleaseId = releases.reduce((acc, release) => { - if (release.actors) { - acc[release.id] = toBaseActors(release.actors, release); - } - - return acc; - }, {}); - - const baseActors = Object.values(baseActorsByReleaseId).flat(); - - if (baseActors.length === 0) { - return []; +const baseActorsByReleaseId = releases.reduce((acc, release) => { + if (release.actors) { + acc[release.id] = toBaseActors(release.actors, release); } - const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({ - ...acc, - [baseActor.slug]: baseActor, - }), {}); + return acc; +}, {}); - const uniqueBaseActors = Object.values(baseActorsBySlug); +const baseActors = Object.values(baseActorsByReleaseId).flat(); - const actors = await getOrCreateActors(uniqueBaseActors, batchId); +if (baseActors.length === 0) { + return []; +} - /* - const actorIdsBySlug = actors.reduce((acc, actor) => ({ - ...acc, - [actor.slug]: actor.alias_for || actor.id, - }), {}); - */ +const baseActorsBySlug = baseActors.reduce((acc, baseActor) => ({ + ...acc, + [baseActor.slug]: baseActor, +}), {}); - const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({ - ...acc, - [actor.entity_id]: { - ...acc[actor.entity_id], - [actor.entry_id]: { - ...acc[actor.entity_id]?.[actor.entry_id], - [actor.slug]: { - actor_id: actor.alias_for || actor.id, - alias_id: actor.alias_for ? 
actor.id : null, - }, +const uniqueBaseActors = Object.values(baseActorsBySlug); + +const actors = await getOrCreateActors(uniqueBaseActors, batchId); + +/* +const actorIdsBySlug = actors.reduce((acc, actor) => ({ + ...acc, + [actor.slug]: actor.alias_for || actor.id, +}), {}); +*/ + +const actorIdsByEntityIdEntryIdAndSlug = actors.reduce((acc, actor) => ({ + ...acc, + [actor.entity_id]: { + ...acc[actor.entity_id], + [actor.entry_id]: { + ...acc[actor.entity_id]?.[actor.entry_id], + [actor.slug]: { + actor_id: actor.alias_for || actor.id, + alias_id: actor.alias_for ? actor.id : null, }, }, - }), {}); + }, +}), {}); - const releaseActorAssociations = Object.entries(baseActorsByReleaseId) - .map(([releaseId, releaseActors]) => releaseActors - .map(releaseActor => ({ - release_id: releaseId, - ...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]), - }))) - .flat(); +const releaseActorAssociations = Object.entries(baseActorsByReleaseId) + .map(([releaseId, releaseActors]) => releaseActors + .map(releaseActor => ({ + release_id: releaseId, + ...(actorIdsByEntityIdEntryIdAndSlug[releaseActor.entity?.id]?.[releaseActor.entryId]?.[releaseActor.slug] || actorIdsByEntityIdEntryIdAndSlug.null.null[releaseActor.slug]), + }))) + .flat(); - await bulkInsert('releases_actors', releaseActorAssociations, false); +await bulkInsert('releases_actors', releaseActorAssociations, false); - logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`); +logger.verbose(`Associated ${releaseActorAssociations.length} actors to ${releases.length} scenes`); - return actors; +return actors; } async function fetchActor(actorId) { diff --git a/src/argv.js b/src/argv.js index 1dda020f..bf2c9205 100644 --- a/src/argv.js +++ b/src/argv.js @@ -23,6 +23,13 @@ function interpretAfter(after) { .toDate(); } +function interpretActorAfter(after) { + 
if (!after) { + return new Date(); + } + return interpretAfter(after); +} + const { argv } = yargs .command('npm start') .option('server', { @@ -69,6 +76,11 @@ const { argv } = yargs default: false, alias: 'actor-scenes', }) + .option('actors-batch', { + describe: 'Batch size to scrape actors; if not set then all are scraped in one pass', + type: 'number', + default: config?.actors?.batchSize === null ? 0 : config?.actors?.batchSize, + }) .option('actor-sources', { describe: 'Use these scrapers for actor data', type: 'array', @@ -307,6 +319,6 @@ const { argv } = yargs alias: ['delete-movie', 'remove-movies', 'remove-movies'], }) .coerce('after', interpretAfter) - .coerce('actors-update', interpretActorAfter); + .coerce('actors-update', interpretActorAfter); module.exports = argv; diff --git a/src/scrapers/babepedia.js b/src/scrapers/babepedia.js new file mode 100644 index 00000000..30cbba67 --- /dev/null +++ b/src/scrapers/babepedia.js @@ -0,0 +1,169 @@ +'use strict'; + +const qu = require('../utils/q'); +const slugify = require('../utils/slugify'); +const moment = require('moment'); + +function scrapeProfile({ query, el }, actorName, entity, include) { + let profile = { name: actorName }; + + if (!query) return {}; + + const name = query.cnt('#babename') + + if (!name) return {}; + + if (actorName !== name) { + profile.aliasFor = name; + } + + const avatar = query.url('#profimg a'); + if (avatar && avatar.indexOf('javascript:alert(') === -1) profile.avatar = { src: `${entity.url}${avatar}`, credit: 'Babepedia' }; + + const aka = query.cnt('#aka'); + if (aka) profile.aliases = aka?.replace('aka ', '')?.split('/').map(alias => alias.trim()); + + function measurementsFromString(str){ + const [bra, waist, hip] = str.split("-"); + if (bra && waist && hip) { + const measurements = {}; + measurements.bust = parseInt(bra); + measurements.cup = measurements.bust ?
bra.replace(measurements.bust, "") : null; + measurements.waist = parseInt(waist); + measurements.hip = parseInt(hip); + return measurements; + } + return null; + } + + const allowedKeys = ['birthPlace', 'eyes', 'hair', 'birthdate', 'weight', + 'height', 'naturalBoobs', 'tattoos', 'piercings'] ; + + const bio = query.all('#biolist li').reduce((acc, item) => { + const keyMatch = query.cnt(item).split(':'); + + if (keyMatch && keyMatch.length === 2) { + let key = keyMatch[0].toLowerCase(); + let value = keyMatch[1].trim(); + + if (key === 'birthplace') key = 'birthPlace'; + if (key === 'eye color') key = 'eyes'; + if (key === 'hair color') key = 'hair'; + + if (key == 'measurements' && value) { + const measurements = measurementsFromString(value); + if (measurements) { + if (measurements.bust) acc.bust = measurements.bust; + if (measurements.cup) acc.cup = measurements.cup; + if (measurements.waist) acc.waist = measurements.waist; + if (measurements.hip) acc.hip = measurements.hip; + } + } + + if (key === 'born' && value) { + key = 'birthdate'; + value = moment.utc(value.replace(' of ', ' '), 'dddd Do MMMM YYYY')?.toDate(); + } + + if (key == 'height' && value) { + const rawHeightMatch = value.match(/\d+ cm/); + const cm = rawHeightMatch ? rawHeightMatch[0] : null; + value = cm ? parseInt(cm.replace("cm", "")) : null; + } + + if (key == 'weight' && value) { + const rawWeightMatch = value.match(/\d+ kg/); + const kg = rawWeightMatch ? rawWeightMatch[0] : null; + value = kg ? 
parseInt(kg.replace("kg", "")) : null; + } + + if (key == 'boobs' && value) { + if (value.match(/fake/i)) { + key = 'naturalBoobs'; + value = false; + } else if (value.match(/real/i)) { + key = 'naturalBoobs'; + value = true; + } + } + + if (key == 'tattoos' && value) { + if (value.match(/none/i)) { + acc.hasTattoos = false; + value = ''; + } else { + acc.hasTattoos = true; + } + } + + if (key == 'piercings' && value) { + if (value.match(/none/i)) { + acc.hasPiercings = false; + value = ''; + } else { + acc.hasPiercings = true; + } + } + + if (allowedKeys.includes(key)) { + acc[key] = value; + } + } + + return acc; + }, {}); + + profile = Object.assign(profile, bio); + + return profile; +} + +function scrapeSearch({ query, el }, actorName, entity, include) { + + const links = query.all('.results .thumbshot'); + + return links.map(link => { + const linkName = query.cnt(link, 'a'); + const linkUrl = query.url(link, 'a'); + const actorNameMatch = new RegExp( actorName, 'g' ); + + if (linkName?.match(actorNameMatch)) { + return linkUrl; + } + }).filter(Boolean); +} + +async function fetchProfile(actor, entity, include) { + const actorName = actor.name; + const searchName = actorName.replace('\'', ''); + + if (actor?.gender === 'male') { + return null; + } + + let url = `${entity.url}/search/${searchName}`; + let res = await qu.get(url); + + // Check if search redirects + let result = res.ok ? scrapeProfile(res.item, actorName, entity, include) : {}; + if (result.name === actorName) { + return result; + } + + if (res.ok) { + const actorPath = scrapeSearch(res.item, actorName, entity, include); + + if (actorPath.length === 1) { + url = `${entity.url}${actorPath[0]}`; + res = await qu.get(url); + + return res.ok ? 
scrapeProfile(res.item, actorName, entity, include) : res.status; + } + } + + return res.status; +} + +module.exports = { + fetchProfile, +}; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index 2f5fa707..84392443 100644 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -64,6 +64,7 @@ const xempire = require('./xempire'); // profiles const boobpedia = require('./boobpedia'); const freeones = require('./freeones'); +const babepedia = require('./babepedia'); const scrapers = { releases: { @@ -156,6 +157,7 @@ const scrapers = { anilos: nubiles, aziani, babes: mindgeek, + babepedia, babevr: badoink, backroomcastingcouch: elevatedx, baddaddypov: fullpornnetwork,