From fbfd52e831ab1b7841e27c89972983def07672f6 Mon Sep 17 00:00:00 2001 From: DebaucheryLibrarian Date: Sun, 24 Nov 2024 06:10:21 +0100 Subject: [PATCH] Refactored Aziani scraper. Improved actor profile update logic. --- seeds/00_tags.js | 28 +++++ seeds/01_networks.js | 6 +- seeds/02_sites.js | 140 +++++++++++++++------ src/actors.js | 71 ++++++++--- src/scrapers/aziani.js | 256 ++++++++++++++++++++++----------------- src/scrapers/scrapers.js | 5 + src/utils/convert.js | 11 +- 7 files changed, 354 insertions(+), 163 deletions(-) diff --git a/seeds/00_tags.js b/seeds/00_tags.js index 20b9a408..7f1847ce 100755 --- a/seeds/00_tags.js +++ b/seeds/00_tags.js @@ -1325,6 +1325,18 @@ const tags = [ name: 'ahegao', slug: 'ahegao', }, + { + name: 'compilation', + slug: 'compilation', + }, + { + name: 'hotwife', + slug: 'hotwife', + }, + { + name: 'interview', + slug: 'interview', + }, ]; const aliases = [ @@ -1870,6 +1882,10 @@ const aliases = [ name: 'double vaginal (dpp)', for: 'dvp', }, + { + name: 'double vagina', + for: 'dvp', + }, { name: 'double pussy penetration', for: 'dvp', @@ -2691,6 +2707,18 @@ const aliases = [ name: 'oral cumshot', for: 'cum-in-mouth', }, + { + name: 'compilations', + for: 'compilation', + }, + { + name: 'hot wife', + for: 'hotwife', + }, + { + name: 'interviews', + for: 'interview', + }, ]; const priorities = [ // higher index is higher priority diff --git a/seeds/01_networks.js b/seeds/01_networks.js index 9f92f7e2..0dd0f877 100755 --- a/seeds/01_networks.js +++ b/seeds/01_networks.js @@ -155,10 +155,12 @@ const networks = [ { slug: 'aziani', name: 'Aziani', - parent: 'gamma', url: 'https://www.aziani.com', parameters: { - layout: 'api', + areaId: 3, + blockId: 114458, + scene: 'https://aziani.com', + cdn: 'https://c75c0c3063.mjedge.net', }, }, { diff --git a/seeds/02_sites.js b/seeds/02_sites.js index 6f6ecdfb..1afe08e5 100755 --- a/seeds/02_sites.js +++ b/seeds/02_sites.js @@ -849,46 +849,82 @@ const sites = [ }, }, // AZIANI - { - slug: 'gangbangcreampie', - name: 'Gangbang Creampie', - url: 'https://www.gangbangcreampie.com', - parent: 'aziani', - tags: ['gangbang', 'creampie'], - parameters: { - scene: 'https://www.gangbangcreampie.com/en/video/gangbangcreampie', - }, - }, - { - slug: 'gloryholesecrets', - name: 'Glory Hole Secrets', - url: 'https://www.gloryholesecrets.com', - parent: 'aziani', - tags: ['gloryhole'], - parameters: { - scene: 'https://www.gloryholesecrets.com/en/video/gloryholesecrets', - }, - }, { slug: 'aziani', name: 'Aziani', - url: 'https://www.aziani.com', + url: 'https://www.aziani.com/series/aziani', parent: 'aziani', + parameters: { + seriesId: 268, + areaId: 3, + blockId: 114458, + scene: 'https://aziani.com', + }, + }, + { + slug: '2poles1hole', + name: '2 Poles 1 Hole', + url: 'https://2poles1hole.com', + parent: 'aziani', + independent: true, + parameters: { + areaId: 2, + blockId: 114064, + seriesId: 107, + modelBlockId: 114129, + }, + }, + { + slug: 'creampiled', + name: 'CreamPiled', + url: 'https://creampiled.com', + parent: 'aziani', + independent: true, + parameters: { + areaId: 11, + blockId: 115990, + seriesId: 436, + modelBlockId: 115345, + }, + }, + { + slug: 'popuporgies', + name: 'PopUpOrgies', + url: 'https://popuporgies.com', + parent: 'aziani', + independent: true, + parameters: { + areaId: 8, + blockId: 116531, + seriesId: 395, + }, + }, + { + slug: 'azianiiron', + name: 'Aziani Iron', + url: 'https://aziani.com/azianiiron', + parent: 'aziani', + parameters: { + areaId: 3, + blockId: 114458, + seriesId: 105, + scene: 'https://aziani.com', + }, + }, + { + slug: 'mrsaltys', + name: 'Mr. Saltys', + url: 'https://aziani.com/mrsaltys', + parent: 'aziani', + hasLogo: false, + parameters: { + areaId: 3, + blockId: 114458, + seriesId: 106, + scene: 'https://aziani.com', + }, }, /* offline - { - slug: 'portagloryhole', - name: 'Porta Gloryhole', - url: 'https://www.portagloryhole.com', - parent: 'aziani', - tags: ['gloryhole'], - }, - { - slug: 'azianiiron', - name: 'Aziani Iron', - url: 'https://www.azianiiron.com', - parent: 'aziani', - }, { slug: 'azianixposed', name: 'Aziani Xposed', @@ -914,6 +950,42 @@ const sites = [ parent: 'aziani', }, */ + // AZIANI GAMMA + { + slug: 'gangbangcreampie', + name: 'Gangbang Creampie', + url: 'https://www.gangbangcreampie.com', + parent: 'gamma', + tags: ['gangbang', 'creampie'], + independent: true, + parameters: { + scene: 'https://www.gangbangcreampie.com/en/video/gangbangcreampie', + }, + }, + { + slug: 'gloryholesecrets', + name: 'Glory Hole Secrets', + url: 'https://www.gloryholesecrets.com', + parent: 'gamma', + tags: ['gloryhole'], + independent: true, + parameters: { + scene: 'https://www.gloryholesecrets.com/en/video/gloryholesecrets', + }, + }, + /* different layout + { + slug: 'portagloryhole', + name: 'Porta Gloryhole', + url: 'https://www.portagloryhole.com', + parent: 'gamma', + tags: ['gloryhole'], + independent: true, + parameters: { + scene: 'https://www.portagloryhole.com/scenes', + }, + }, + */ // BABES { name: 'Babes', diff --git a/src/actors.js b/src/actors.js index 808f0831..3507003c 100755 --- a/src/actors.js +++ b/src/actors.js @@ -168,7 +168,7 @@ function toBaseActors(actorsOrNames, release) { // using top level parent widens the scope too much, e.g. different Gamma sites may not use the same actor database // const entity = getRecursiveParent(release?.entity); - const entity = (release?.entity?.indepdendent && release?.entity) + const entity = (release?.entity?.independent && release?.entity) || release?.entity?.parent || release?.entity || null; @@ -308,7 +308,7 @@ function curateProfileEntry(profile) { } const curatedProfileEntry = { - ...(profile.update !== false && { id: profile.update }), + ...(typeof profile.update === 'number' && { id: profile.update }), actor_id: profile.actorId, entity_id: profile.entity?.id || null, date_of_birth: profile.dateOfBirth, @@ -552,21 +552,34 @@ async function upsertProfiles(profiles) { const newProfileEntries = profiles.filter((profile) => !profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean); const updatingProfileEntries = profiles.filter((profile) => profile.update).map((profile) => curateProfileEntry(profile)).filter(Boolean); - const newProfiles = await insertProfiles(newProfileEntries); + const newProfiles = newProfileEntries.length > 0 + ? await insertProfiles(newProfileEntries) + : []; if (argv.force && updatingProfileEntries.length > 0) { const transaction = await knex.transaction(); - const queries = updatingProfileEntries.map((profileEntry) => knex('actors_profiles') - .where('id', profileEntry.id) + const queries = updatingProfileEntries.map(async (profileEntry) => knex('actors_profiles') + .modify((builder) => { + if (profileEntry.id) { + builder.where('id', profileEntry.id); + } else { + builder + .where('actor_id', profileEntry.actor_id) + .where('entity_id', profileEntry.entity_id); + } + }) .update(profileEntry) .returning(['id', 'actor_id']) .transacting(transaction)); await Promise.all(queries) .then(transaction.commit) - .catch(transaction.rollback); + .catch((error) => { + logger.error(error.message); + return transaction.rollback(); + }); - logger.info(`Updated ${updatingProfileEntries.length} new actor profiles`); + logger.info(`Updated ${updatingProfileEntries.length} actor profiles`); } if (profiles.length > 0) { @@ -586,10 +599,12 @@ async function upsertProfiles(profiles) { media_id: profile.avatarMediaId, })); - await knex('actors_avatars') - .insert(avatars) - .onConflict() - .ignore(); + if (avatars.length > 0) { + await knex('actors_avatars') + .insert(avatars) + .onConflict() + .ignore(); + } } } @@ -599,7 +614,7 @@ async function scrapeProfiles(actor, sources, entitiesBySlug, existingProfilesBy const profiles = Promise.map(validSources, async (source) => { try { // config may group sources to try until success - return await [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => { + return [].concat(source).reduce(async (outcome, scraperSlug) => outcome.catch(async () => { try { const entity = entitiesBySlug[scraperSlug] || null; @@ -846,6 +861,13 @@ async function getOrCreateActors(baseActors, batchId) { OR actors.entry_id = base_actors.entry_id) `); + const actorIds = existingActors.map((actor) => actor.id); + const entityIds = Array.from(new Set(baseActors.map((actor) => actor.entity?.id).filter(Boolean))); + + const existingProfiles = await knex('actors_profiles') + .whereIn('actor_id', actorIds) + .whereIn('entity_id', entityIds); + // const existingActorSlugs = new Set(existingActors.map(actor => actor.slug)); const existingActorSlugs = existingActors.reduce((acc, actor) => ({ ...acc, @@ -863,7 +885,7 @@ async function getOrCreateActors(baseActors, batchId) { const newActors = await bulkInsert('actors', curatedActorEntries); - const newActorIdsByEntityIdEntryIdAndSlug = newActors.reduce((acc, actor) => ({ + const actorIdsByEntityIdEntryIdAndSlug = [...existingActors, ...newActors].reduce((acc, actor) => ({ ...acc, [actor.entity_id]: { ...acc[actor.entity_id], @@ -874,13 +896,26 @@ async function getOrCreateActors(baseActors, batchId) { }, }), {}); + const profileIdsByActorIdAndEntityId = existingProfiles.reduce((acc, profile) => ({ + ...acc, + [profile.actor_id]: { + ...acc[profile.actor_id], + [profile.entity_id]: profile.id, + }, + }), {}); + const newActorProfiles = await Promise.all(baseActors .filter((actor) => actor.hasProfile) - .map((actor) => ({ - ...actor, - actorId: newActorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || newActorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug], - })) - .filter((actor) => !!actor.id) + .map((actor) => { + const actorId = actorIdsByEntityIdEntryIdAndSlug[actor.entity?.id]?.[actor.entryId]?.[actor.slug] || actorIdsByEntityIdEntryIdAndSlug.null?.null?.[actor.slug]; + + return { + ...actor, + actorId, + update: profileIdsByActorIdAndEntityId[actorId]?.[actor.entity?.id], + }; + }) + .filter((actor) => !!actor.actorId) .map((actor) => curateProfile(actor))); await storeProfiles(newActorProfiles); diff --git a/src/scrapers/aziani.js b/src/scrapers/aziani.js index 4a56f77a..52d9212c 100755 --- a/src/scrapers/aziani.js +++ b/src/scrapers/aziani.js @@ -1,147 +1,187 @@ 'use strict'; +const unprint = require('unprint'); +const { decode } = require('html-entities'); + const slugify = require('../utils/slugify'); -const { get, getAll, initAll, extractDate } = require('../utils/qu'); -const { feetInchesToCm } = require('../utils/convert'); +const { feetInchesToCm, femaleFeetUsToEu } = require('../utils/convert'); -const imageRegex = /-\dx.jpg/; +function scrapeScene(data, channel, parameters) { + const release = {}; -function getFallbacks(source) { - if (!source || source.includes('join.jpg')) { - return null; - } + release.entryId = data.cms_set_id; + release.url = `${parameters.scene || channel.url}/video/${data.cms_set_id}`; - return Array.from(new Set([ - source.replace(imageRegex, '-4x.jpg'), - source.replace(imageRegex, '-3x.jpg'), - source.replace(imageRegex, '-2x.jpg'), - source.replace(imageRegex, '-1x.jpg'), - source, - ])); -} + release.title = data.name; + release.description = data.description; -function scrapeAll(scenes, site) { - return scenes.map(({ qu }) => { - const release = {}; + release.date = unprint.extractDate(data.added_nice, 'YYYY-MM-DD'); + release.duration = Number(data.lengths.total); - release.url = qu.url('a'); - - release.title = qu.q('h5 a', true); - release.date = qu.date('.icon-calendar + strong', 'MM/DD/YYYY'); - - release.entryId = qu.q('.stdimage', 'id', true)?.match(/set-target-(\d+)/)?.[1] || new URL(release.url).pathname.match(/trailers\/(.*).html/)?.[1]; - - release.actors = qu.q('h3', true).replace(/featuring:\s?/i, '').split(', ').filter(Boolean); - - const photoCount = qu.q('.stdimage', 'cnt'); - - [release.poster, ...release.photos] = Array.from({ length: Number(photoCount) }, (value, index) => { - const source = qu.img('.stdimage', `src${index}_1x`, { origin: site.url }); - - return getFallbacks(source); - }); - - return release; - }); -} - -function scrapeScene({ html, qu }, url, channel) { - const release = { url }; - - release.entryId = qu.q('.stdimage', 'id', true)?.match(/set-target-(\d+)/)?.[1] || new URL(url).pathname.match(/trailers\/(.*).html/)?.[1]; - - release.title = qu.q('h2', true); - release.description = qu.q('p', true); - - release.date = extractDate(html, 'MM/DD/YYYY', /\b\d{2}\/\d{2}\/\d{4}\b/); - - release.actors = qu.all('h5:not(.video_categories) a').map((actor) => ({ - name: qu.q(actor, null, true), - url: qu.url(actor, null), + release.actors = data.data_types.find((dataType) => dataType.data_type === 'Models')?.data_values.map((actor) => ({ + name: actor.name, + url: `${channel.url}/model/${actor.cms_data_value_id}?models=${encodeURI(actor.name)}`, // slug does not work unless it's also the ID })); - release.tags = qu.all('.video_categories a', true); + release.directors = data.data_types.find((dataType) => dataType.data_type === 'Videographers')?.data_values.map((director) => ({ + name: director.name, + // url: `${channel.url}/model/${director.slug}?models=${director.name}`, + })); - release.duration = qu.dur('.video_categories + p'); + release.tags = data.data_types + .filter((dataType) => dataType.data_type === 'Tags' || dataType.data_type === 'Category') + .flatMap((tags) => tags.data_values.map((tag) => tag.name)); - release.poster = getFallbacks(qu.img('a img')) || getFallbacks(qu.img('#preview video', 'poster', { origin: channel.url })); - release.caps = qu.imgs('.featured-video img', 'src0_1x', { origin: channel.url }).map((source) => getFallbacks(source)).filter(Boolean); + const poster = data.preview_formatted.thumb; + const teaser = data.preview_formatted.clip?.[0]; + const trailer = data.preview_formatted.trailer?.formats?.[0]?.content?.[0]; - release.trailer = qu.video('#preview source'); + if (poster) { + release.poster = Object.keys(poster) + .filter((key) => poster[key].length > 0) + .toSorted((keyA, keyB) => keyB.split('-')[0] - keyA.split('-')[0]) + .map((key) => unprint.prefixUrl(`${poster[key][0].fileuri}?${poster[key][0].signature}`, parameters.cdn)); + } + + if (teaser && teaser.fileuri !== trailer?.fileuri) { + release.teaser = unprint.prefixUrl(`${teaser.fileuri}?${teaser.signature}`, parameters.cdn); + } + + if (trailer) { + release.trailer = unprint.prefixUrl(`${trailer.fileuri}?${trailer.signature}`, parameters.cdn); + } + + release.channel = slugify(data.data_types.find((dataType) => dataType.data_type === 'Series')?.data_values[0]?.name, ''); return release; } -function scrapeProfile({ el, qu }) { - const profile = {}; +async function fetchLatest(channel, page = 1, { parameters }) { + const query = new URLSearchParams({ + cms_area_id: parameters.areaId, + cms_block_id: parameters.blockId, + count: 100, + start: (page - 1) * 100, + orderby: 'published_desc', + content_type: 'video', + // unsure what this does + status: 'enabled', + cms_set_ids: undefined, + data_types: 1, + content_count: 1, + data_type_search: parameters.seriesId && JSON.stringify({ 7: parameters.seriesId.toString() }), // doesn't seem relevant + }).toString(); - const bio = Array.from(qu.q('.widget-content').childNodes).reduce((acc, node, index, nodes) => { - const nextNode = nodes[index + 1]; + const url = `https://azianistudios.com/tour_api.php/content/sets?${query}`; - if (node.tagName === 'STRONG' && nextNode?.nodeType === 3) { - acc[slugify(node.textContent, '_')] = nextNode.textContent.trim(); - } + const res = await unprint.get(url, { + headers: { + Referer: channel.url, + 'x-nats-cms-area-id': parameters.areaId, + }, + }); - return acc; - }, {}); - - if (bio.ethnicity) profile.ethnicity = bio.ethnicity; - if (bio.age) profile.age = Number(bio.age); - - if (bio.height && /\d{3}/.test(bio.height)) profile.height = Number(bio.height.match(/\d+/)[0]); - if (bio.height && /\d[;']\d/.test(bio.height)) profile.height = feetInchesToCm(bio.height); - - if (bio.measurements) { - const [bust, waist, hip] = bio.measurements.split('-'); - - if (bust && /\d+[a-zA-Z]+/.test(bust)) profile.bust = bust; - if (waist) profile.waist = Number(waist); - if (hip) profile.hip = Number(hip); + if (res.ok && res.data.success) { + return res.data.sets.map((data) => scrapeScene(data, channel, parameters)); } - if (bio.bust_size && !profile.bust) profile.bust = bio.bust_size.toUpperCase(); + return res.status; +} - if (bio.birth_location) profile.birthPlace = bio.birth_location; - if (bio.status_married_or_single) profile.relationship = bio.status_married_or_single; +async function fetchScene(url, entity, _baseRelease, { parameters }) { + const entryId = new URL(url).pathname.match(/\/video\/(\w+)/)[1]; - if (bio.eye_color) profile.eyes = bio.eye_color; + const query = new URLSearchParams({ + cms_set_ids: entryId, + cms_area_id: parameters.areaId, + cms_block_id: parameters.blockId, + content: 1, + orderby: 'published_desc', + content_type: 'video', + // unsure what this does + data_types: 1, + content_count: 1, + }).toString(); - const avatar = qu.img('.tac img'); - profile.avatar = getFallbacks(avatar); + const apiUrl = `https://azianistudios.com/tour_api.php/content/sets?${query}`; - profile.releases = scrapeAll(initAll(el, '.featured-video')); + const res = await unprint.get(apiUrl, { + headers: { + Referer: entity.url, + 'x-nats-cms-area-id': parameters.areaId, + }, + }); + + if (res.ok && res.data.success) { + return scrapeScene(res.data.sets[0], entity, parameters); + } + + return res.status; +} + +function scrapeProfile(data, entity, parameters) { + const profile = {}; + + const bio = Object.fromEntries(Object.values(data.data_detail_values).map((detail) => [ + slugify(detail.name, '_'), + decode(detail.value || detail.content_formatted || detail.content), + ])); + + profile.url = `${entity.url}/model/${data.cms_data_value_id}`; + profile.entryId = data.cms_data_value_id; + + profile.description = data.description; + + profile.gender = bio.gender?.toLowerCase(); + profile.age = bio.age; + profile.dateOfBirth = unprint.extractDate(`${bio.born} 0`, 'MMMM Do YYYY', { match: /\w+ \d+\w{2} \d{1,4}/ }); + + profile.measurements = bio.measurements; + profile.height = feetInchesToCm(bio.height); + profile.foot = femaleFeetUsToEu(bio.foot_size); + + profile.hairColor = bio.hair_color; + profile.eyeColor = bio.eye_color; + + const avatar = bio.thumbnail?.image[0]; + + if (avatar) { + profile.avatar = { + src: `${parameters.cdn}${avatar.fileuri}?${avatar.signature}`, + expectType: { + 'application/octet-stream': 'image/jpeg', + }, + }; + } return profile; } -async function fetchLatest(site, page) { - const url = `${site.url}/tour/categories/movies_${page}_d.html`; - const res = await getAll(url, '.featured-video'); - - if (res.ok) { - return scrapeAll(res.items, site); +async function fetchProfile({ url }, { entity, parameters }) { + if (!url) { + // no easy search option + return null; } - return res.status; -} + const actorId = new URL(url).pathname.match(/model\/(\d+)/)[1]; -async function fetchScene(url, site) { - const res = await get(url, '.trailer'); + const query = new URLSearchParams({ + cms_data_value_ids: actorId, + cms_block_id: entity.parameters.modelBlockId, + cms_data_type_id: 4, + }).toString(); - if (res.ok) { - return scrapeScene(res.item, url, site); - } + const apiUrl = `https://azianistudios.com/tour_api.php/content/data-values?${query}`; - return res.status; -} + const res = await unprint.get(apiUrl, { + headers: { + Referer: entity.url, + 'x-nats-cms-area-id': entity.parameters.areaId, + }, + }); -async function fetchProfile({ name: actorName }, { site }) { - const actorSlug = slugify(actorName, ''); - const url = `${site.url}/tour/models/${actorSlug}.html`; - const res = await get(url, '.page-content .row'); - - if (res.ok) { - return scrapeProfile(res.item); + if (res.ok && res.data.success) { + return scrapeProfile(res.data.data_values[0], entity, parameters); } return res.status; diff --git a/src/scrapers/scrapers.js b/src/scrapers/scrapers.js index cb275ff6..d02993a1 100755 --- a/src/scrapers/scrapers.js +++ b/src/scrapers/scrapers.js @@ -7,6 +7,7 @@ const assylum = require('./assylum'); const amateurallure = require('./amateurallure'); const americanpornstar = require('./americanpornstar'); const amnesiac = require('./amnesiac'); +const aziani = require('./aziani'); const badoink = require('./badoink'); const bamvisions = require('./bamvisions'); const bang = require('./bang'); @@ -94,6 +95,7 @@ const scrapers = { archangel, asiam: modelmedia, assylum, + aziani, badoink, bamvisions, bang, @@ -201,6 +203,9 @@ const scrapers = { anilos: nubiles, archangel, asiam: modelmedia, + aziani, + '2poles1hole': aziani, + creampiled: aziani, babes: aylo, babevr: badoink, baddaddypov: fullpornnetwork, diff --git a/src/utils/convert.js b/src/utils/convert.js index 1374c575..e893887a 100755 --- a/src/utils/convert.js +++ b/src/utils/convert.js @@ -87,15 +87,24 @@ function convertApi(input, fromOrTo, to) { } function maleFeetUsToEu(input) { + if (!input) { + return null; + } + const size = Number(input.toString().match(/\d+(\.\d+)?/)?.[0]); return Math.round((1.27 * size + 29.94) / 0.5) * 0.5; // round to nearest half } function femaleFeetUsToEu(input) { + if (!input) { + return null; + } + const size = Number(input.toString().match(/\d+(\.\d+)?/)?.[0]); - return Math.round((1.27 * size + 28.67) / 0.5) * 0.5; // round to nearest half + // return Math.round((1.27 * size + 28.67) / 0.5) * 0.5; // round to nearest half + return Math.round(1.27 * size + 28.67); } module.exports = {