# Ops/DevOps Portfolio: Technical Specifications ## Ops Ecosystem Architecture ```text ┌─────────────────────────────────────────────────────────────────────────────┐ │ INTERFACE LAYER │ ├─────────────────────────────────────────────────────────────────────────────┤ │ Leptos WASM (Vapora Kanban) │ Ratatui TUI │ Axum REST │ CLI (clap) │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ │ ORCHESTRATION LAYER │ ├─────────────────────────────────────────────────────────────────────────────┤ │ Vapora (NATS Agents) │ Provisioning Orchestrator │ TypeDialog Backends │ │ DevOps/Monitor/Security │ (Rust + Nushell hybrid) │ (prov-gen IaC) │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ │ SECURITY LAYER │ ├─────────────────────────────────────────────────────────────────────────────┤ │ SecretumVault (PQC) │ Cedar Policies (ABAC) │ JWT + MFA (Auth) │ │ KV/Transit/PKI/DB │ Audit Logging │ TLS/mTLS (Transport) │ └─────────────────────────────────────────────────────────────────────────────┘ │ ▼ ┌─────────────────────────────────────────────────────────────────────────────┐ │ PERSISTENCE LAYER │ ├─────────────────────────────────────────────────────────────────────────────┤ │ SurrealDB (multi-tenant, time-series) │ NATS JetStream (messaging) │ │ etcd (distributed KV) │ PostgreSQL (ACID, enterprise) │ │ Filesystem (git-native markdown) │ Kogral (.kogral/ directory) │ └─────────────────────────────────────────────────────────────────────────────┘ ``` --- ## 1. 
Provisioning: Ops Specifications ### Directory Structure ```text provisioning/ ├── core/ │ ├── cli/ # Main CLI (211 lines, 80+ shortcuts) │ ├── nulib/ # Nushell libraries (476+ config accessors) │ └── scripts/ # Utility scripts (Nushell) ├── extensions/ │ ├── providers/ # AWS, UpCloud, Local (LXD) │ │ ├── aws/ # EC2, VPC, S3, RDS provisioners │ │ ├── upcloud/ # Servers, networking, storage │ │ └── local/ # LXD containers, networking │ ├── taskservs/ # 50+ infrastructure services │ │ ├── containerd/ # Container runtime │ │ ├── etcd/ # Distributed KV store │ │ ├── kubernetes/ # K8s control-plane + workers │ │ ├── cilium/ # eBPF-based CNI │ │ ├── postgresql/ # Database │ │ ├── prometheus/ # Metrics │ │ └── ... # 44 more services │ ├── clusters/ # Pre-configured cluster templates │ │ ├── k8s-ha/ # HA Kubernetes (3 control-plane, N workers) │ │ ├── k8s-dev/ # Dev Kubernetes (single-node) │ │ └── db-cluster/ # PostgreSQL HA with Patroni │ └── workflows/ # Automation workflows │ ├── backup/ # Backup automation │ ├── monitoring/ # Observability setup │ └── security/ # Security hardening ├── platform/ │ ├── orchestrator/ # Workflow execution engine (Rust) │ ├── control-center/ # Backend API (Axum + RBAC) │ ├── control-center-ui/ # Web dashboard (Leptos) │ ├── installer/ # Multi-mode installer │ ├── mcp-server/ # MCP server (Rust, 1000x Python) │ ├── ai-service/ # AI operations (LLM integration) │ ├── rag/ # RAG system (1200+ docs) │ ├── vault-service/ # Secrets management (integrates SecretumVault) │ └── detector/ # Anomaly detection └── schemas/ # Nickel IaC schemas (typed) ├── server.ncl # Server definition contract ├── networking.ncl # VPC, subnets, security groups ├── kubernetes.ncl # K8s cluster contract └── ... 
# 20+ schemas ``` ### Nickel IaC Schema Examples #### Server Schema ```nickel # schemas/server.ncl let Server = { name | String, provider | [ | 'aws, 'upcloud, 'local |], spec | { cpu | Number | default = 2, memory_gb | Number | default = 4, disk_gb | Number | default = 50, os | { family | [ | 'ubuntu, 'debian, 'rocky, 'alpine |], version | String, }, }, networking | { vpc | String | optional, subnet | String | optional, public_ip | Bool | default = false, security_groups | Array String | default = [], }, tags | { _ : String } | default = {}, # Validation constraints } | { spec.cpu | Number | std.number.is_positive | doc "CPU cores must be positive", spec.memory_gb | Number | std.number.is_positive | doc "Memory must be positive GB", spec.disk_gb | Number | std.number.greater_eq 20 | doc "Disk must be at least 20GB", } in Server ``` #### Kubernetes Cluster Schema ```nickel # schemas/kubernetes.ncl let KubernetesCluster = { name | String, provider | [ | 'aws, 'upcloud, 'local |], region | String, control_plane | { count | Number | default = 3, plan | [ | 'small, 'medium, 'large |] | default = 'medium, high_availability | Bool | default = true, }, workers | { count | Number | default = 3, plan | [ | 'small, 'medium, 'large, 'xlarge |] | default = 'medium, auto_scaling | { enabled | Bool | default = false, min | Number | default = 3, max | Number | default = 10, } | optional, }, networking | { vpc_cidr | String | default = "10.0.0.0/16", pod_cidr | String | default = "10.244.0.0/16", service_cidr | String | default = "10.96.0.0/12", cni | [ | 'cilium, 'calico, 'flannel |] | default = 'cilium, }, addons | { ingress_nginx | Bool | default = true, cert_manager | Bool | default = true, metrics_server | Bool | default = true, prometheus | Bool | default = false, }, version | String | default = "1.28", } in KubernetesCluster ``` ### Orchestrator API (Rust) ```rust // platform/orchestrator/src/lib.rs use std::collections::HashMap; use anyhow::Result; use serde::{Deserialize, 
Serialize}; use tokio::sync::RwLock; pub struct Orchestrator { state: Arc>, executor: WorkflowExecutor, scheduler: Scheduler, checkpoint_store: CheckpointStore, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Workflow { pub id: String, pub name: String, pub tasks: Vec, pub dependencies: HashMap>, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Task { pub id: String, pub task_type: TaskType, pub provider: Provider, pub config: serde_json::Value, pub retry_policy: RetryPolicy, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum TaskType { ProvisionServer, ConfigureNetworking, InstallService, RunHealthCheck, CreateBackup, } impl Orchestrator { pub async fn execute_workflow(&self, workflow: Workflow) -> Result { // 1. Resolve dependencies (topological sort) let ordered_tasks = self.resolve_dependencies(&workflow)?; tracing::info!("Resolved {} tasks", ordered_tasks.len()); // 2. Create execution checkpoint let checkpoint = self.checkpoint_store.create(&workflow).await?; tracing::info!("Created checkpoint: {}", checkpoint.id); // 3. Execute tasks with retry logic for (index, task) in ordered_tasks.iter().enumerate() { tracing::info!("Executing task {}/{}: {}", index + 1, ordered_tasks.len(), task.id); match self.executor.run(task).await { Ok(result) => { self.state.write().await.record_success(task, &result)?; self.checkpoint_store.update_progress(&checkpoint.id, index + 1).await?; } Err(e) => { tracing::error!("Task {} failed: {}", task.id, e); // Exponential backoff retry if let Some(result) = self.retry_with_backoff(task).await? 
{ self.state.write().await.record_success(task, &result)?; } else { // Rollback to checkpoint tracing::warn!("Rollback to checkpoint {}", checkpoint.id); self.rollback(&checkpoint).await?; return Err(e); } } } } Ok(ExecutionResult::from_state(&*self.state.read().await)) } async fn retry_with_backoff(&self, task: &Task) -> Result> { let mut delay_ms = task.retry_policy.initial_delay_ms; for attempt in 1..=task.retry_policy.max_retries { tracing::info!("Retry attempt {}/{} for task {}", attempt, task.retry_policy.max_retries, task.id); tokio::time::sleep(tokio::time::Duration::from_millis(delay_ms)).await; match self.executor.run(task).await { Ok(result) => return Ok(Some(result)), Err(e) => { tracing::warn!("Retry {} failed: {}", attempt, e); delay_ms = (delay_ms * task.retry_policy.backoff_multiplier).min(task.retry_policy.max_delay_ms); } } } Ok(None) } async fn rollback(&self, checkpoint: &Checkpoint) -> Result<()> { let completed_tasks = self.state.read().await.get_completed_tasks(&checkpoint.workflow_id)?; // Reverse order rollback for task in completed_tasks.iter().rev() { tracing::info!("Rolling back task: {}", task.id); self.executor.rollback(task).await?; } self.checkpoint_store.delete(&checkpoint.id).await?; Ok(()) } fn resolve_dependencies(&self, workflow: &Workflow) -> Result> { // Topological sort let mut in_degree: HashMap = HashMap::new(); let mut graph: HashMap> = HashMap::new(); for task in &workflow.tasks { in_degree.insert(task.id.clone(), 0); graph.insert(task.id.clone(), vec![]); } for (task_id, deps) in &workflow.dependencies { for dep in deps { graph.get_mut(dep).unwrap().push(task_id.clone()); *in_degree.get_mut(task_id).unwrap() += 1; } } let mut queue: Vec = in_degree .iter() .filter( | (_, °ree) | degree == 0) .map( | (id, _) | id.clone()) .collect(); let mut sorted = Vec::new(); while let Some(task_id) = queue.pop() { sorted.push(task_id.clone()); for neighbor in &graph[&task_id] { *in_degree.get_mut(neighbor).unwrap() -= 1; if 
in_degree[neighbor] == 0 { queue.push(neighbor.clone()); } } } if sorted.len() != workflow.tasks.len() { anyhow::bail!("Cyclic dependency detected"); } Ok(sorted.into_iter().map( | id | workflow.tasks.iter().find( | t| t.id == id).unwrap().clone()).collect()) } } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct RetryPolicy { pub max_retries: u32, pub initial_delay_ms: u64, pub max_delay_ms: u64, pub backoff_multiplier: u64, } impl Default for RetryPolicy { fn default() -> Self { Self { max_retries: 3, initial_delay_ms: 1000, max_delay_ms: 60000, backoff_multiplier: 2, } } } ``` ### MCP Server Tools (Provisioning) ```rust // platform/mcp-server/src/tools.rs use serde_json::json; pub const MCP_TOOLS: &[Tool] = &[ Tool { name: "query_infrastructure", description: "Query infrastructure state using natural language (RAG-powered)", parameters: json!({ "query": { "type": "string", "description": "Natural language query" }, "provider": { "type": "string", "optional": true, "enum": ["aws", "upcloud", "local"] } }), }, Tool { name: "generate_config", description: "Generate Nickel configuration from natural language description", parameters: json!({ "description": { "type": "string", "description": "Infrastructure description" }, "provider": { "type": "string", "enum": ["aws", "upcloud", "local"] }, "resource_type": { "type": "string", "enum": ["server", "network", "cluster", "database"] } }), }, Tool { name: "validate_config", description: "Validate Nickel configuration against schemas", parameters: json!({ "config": { "type": "string", "description": "Nickel configuration code" }, "strict": { "type": "boolean", "default": true, "description": "Strict validation mode" } }), }, Tool { name: "estimate_cost", description: "Estimate monthly cost for infrastructure configuration", parameters: json!({ "config": { "type": "string", "description": "Nickel configuration" }, "region": { "type": "string", "optional": true } }), }, Tool { name: "check_compliance", description: 
"Check configuration against compliance frameworks", parameters: json!({ "config": { "type": "string" }, "framework": { "type": "string", "enum": ["soc2", "hipaa", "gdpr", "pci"] } }), }, Tool { name: "plan_migration", description: "Generate migration plan between configurations", parameters: json!({ "current": { "type": "string", "description": "Current Nickel config" }, "target": { "type": "string", "description": "Target Nickel config" } }), }, Tool { name: "execute_workflow", description: "Execute provisioning workflow with rollback support", parameters: json!({ "workflow_id": { "type": "string" }, "dry_run": { "type": "boolean", "default": true } }), }, ]; ``` ### CLI Shortcuts (80+) ```bash # Core operations prov init # Initialize provisioning workspace prov plan # Generate execution plan (dry-run) prov apply # Apply configuration with rollback prov destroy # Destroy infrastructure prov state list # List resources in state prov state show # Show resource details # Provider management prov provider add aws # Add AWS provider credentials prov provider add upcloud # Add UpCloud provider credentials prov provider list # List configured providers prov provider test # Test provider connectivity # Service installation (taskservs) prov service install containerd --servers server-01,server-02 prov service install kubernetes --cluster k8s-prod prov service install cilium --cluster k8s-prod --version 1.14 prov service list # List available services prov service status # Check service status # Cluster operations prov cluster create k8s-ha --template extensions/clusters/k8s-ha/ prov cluster scale k8s-prod --workers 10 prov cluster upgrade k8s-prod --version 1.28 prov cluster backup k8s-prod --output /backups/ prov cluster restore k8s-prod --from /backups/2026-01-22/ # Workflow operations prov workflow run backup --cluster k8s-prod prov workflow run monitoring --cluster k8s-prod prov workflow list # List available workflows prov workflow status # Check workflow status # 
Guided wizards prov wizard cluster # Interactive cluster setup prov wizard database # Interactive database setup prov wizard monitoring # Interactive monitoring setup # AI-assisted operations (MCP) prov mcp query "Show me all AWS servers in us-east-1" prov mcp generate "Create a 3-node K8s cluster with Cilium on UpCloud" prov mcp validate cluster.ncl prov mcp estimate cluster.ncl # Security operations prov vault init # Initialize SecretumVault integration prov vault store secret/myapp key=value prov vault read secret/myapp prov cert generate --domain example.com --engine pki prov cert rotate --cluster k8s-prod # Observability prov logs # View resource logs prov metrics # View cluster metrics prov health # Health check prov events # View events # Configuration management prov config get # Get config value (476+ accessors) prov config set prov config list # List all configuration prov config validate # Validate configuration ``` --- ## 2. SecretumVault: Ops Specifications ### Architecture Overview ```text ┌─────────────────────────────────────────────────────────────────┐ │ SecretumVault (~11K LOC, 50+ tests) │ ├─────────────────────────────────────────────────────────────────┤ │ │ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────────────────┐ │ │ │ CLI │ │ REST API │ │ Secrets Engines │ │ │ │ (svault) │ │ (Axum) │ │ KV/Transit/PKI/DB │ │ │ └──────┬──────┘ └──────┬──────┘ └────────────┬────────────┘ │ │ │ │ │ │ │ ┌──────┴────────────────┴──────────────────────┴─────────────┐ │ │ │ VaultCore │ │ │ │ Seal (Shamir) │ TokenManager │ Cedar ABAC │ Metrics │ │ │ └────────────────────────────────────────────────────────────┘ │ │ │ │ │ ┌───────────────────────┴───────────────────────────────────┐ │ │ │ Crypto Backends (Pluggable) │ │ │ │ OpenSSL │ OQS (PQC) │ AWS-LC │ RustCrypto │ │ │ └───────────────────────────────────────────────────────────┘ │ │ │ │ │ ┌───────────────────────┴───────────────────────────────────┐ │ │ │ Storage Backends (Pluggable) │ │ │ │ Filesystem │ etcd 
│ SurrealDB │ PostgreSQL │ │ │ └───────────────────────────────────────────────────────────┘ │ └─────────────────────────────────────────────────────────────────┘ ``` ### Core Types ```rust // src/core/vault.rs use std::collections::HashMap; use std::sync::Arc; use tokio::sync::Mutex; pub struct VaultCore { pub engines: HashMap>, pub storage: Arc, pub crypto: Arc, pub seal: Arc>, pub token_manager: Arc, pub authorizer: Arc, pub metrics: Arc, } impl VaultCore { pub async fn init(&self, shares: u8, threshold: u8) -> Result> { // Initialize Shamir unsealing let mut seal = self.seal.lock().await; let shares = seal.init(shares, threshold)?; // Setup default engines self.mount_engine("secret", Box::new(KvEngine::new(self.storage.clone()))).await?; Ok(shares) } pub async fn unseal(&self, share: SecretShare) -> Result { let mut seal = self.seal.lock().await; let progress = seal.unseal(share)?; if let UnsealProgress::Complete = progress { tracing::info!("Vault unsealed successfully"); self.metrics.record_unseal().await; } Ok(progress) } pub async fn mount_engine(&self, path: &str, engine: Box) -> Result<()> { self.engines.insert(path.to_string(), engine); tracing::info!("Mounted engine at path: {}", path); Ok(()) } } ``` ### Crypto Backends (Post-Quantum) ```rust // src/crypto/backends/oqs.rs (Post-Quantum) use oqs::{kem, sig}; use anyhow::Result; pub struct OqsBackend { kem_algorithm: kem::Algorithm, // MlKem768 sig_algorithm: sig::Algorithm, // MlDsa65 kem_cache: Arc>>, sig_cache: Arc>>, } impl OqsBackend { pub fn new() -> Result { Ok(Self { kem_algorithm: kem::Algorithm::MlKem768, sig_algorithm: sig::Algorithm::MlDsa65, kem_cache: Arc::new(Mutex::new(None)), sig_cache: Arc::new(Mutex::new(None)), }) } pub async fn kem_keypair(&self) -> CryptoResult { let mut cache = self.kem_cache.lock().await; if cache.is_none() { *cache = Some(kem::Kem::new(self.kem_algorithm)?); } let kem = cache.as_ref().unwrap(); let (pk, sk) = kem.keypair()?; // ML-KEM-768: 1184 bytes public key, 
2400 bytes secret key Ok(KemKeyPair { public_key: pk.into_vec(), secret_key: sk.into_vec(), }) } pub async fn kem_encapsulate(&self, public_key: &[u8]) -> CryptoResult { let mut cache = self.kem_cache.lock().await; if cache.is_none() { *cache = Some(kem::Kem::new(self.kem_algorithm)?); } let kem = cache.as_ref().unwrap(); let pk = kem::PublicKey::from_bytes(public_key)?; let (ciphertext, shared_secret) = kem.encapsulate(&pk)?; // ML-KEM-768: 1088 bytes ciphertext, 32 bytes shared secret Ok(KemResult { ciphertext: ciphertext.into_vec(), shared_secret: shared_secret.into_vec(), }) } pub async fn kem_decapsulate(&self, secret_key: &[u8], ciphertext: &[u8]) -> CryptoResult> { let mut cache = self.kem_cache.lock().await; if cache.is_none() { *cache = Some(kem::Kem::new(self.kem_algorithm)?); } let kem = cache.as_ref().unwrap(); let sk = kem::SecretKey::from_bytes(secret_key)?; let ct = kem::Ciphertext::from_bytes(ciphertext)?; let shared_secret = kem.decapsulate(&sk, &ct)?; Ok(shared_secret.into_vec()) } pub async fn sign(&self, secret_key: &[u8], message: &[u8]) -> CryptoResult> { let mut cache = self.sig_cache.lock().await; if cache.is_none() { *cache = Some(sig::Sig::new(self.sig_algorithm)?); } let sig_obj = cache.as_ref().unwrap(); let sk = sig::SecretKey::from_bytes(secret_key)?; let signature = sig_obj.sign(message, &sk)?; // ML-DSA-65: 3309 bytes signature Ok(signature.into_vec()) } pub async fn verify(&self, public_key: &[u8], message: &[u8], signature: &[u8]) -> CryptoResult { let mut cache = self.sig_cache.lock().await; if cache.is_none() { *cache = Some(sig::Sig::new(self.sig_algorithm)?); } let sig_obj = cache.as_ref().unwrap(); let pk = sig::PublicKey::from_bytes(public_key)?; let sig_bytes = sig::Signature::from_bytes(signature)?; match sig_obj.verify(message, &sig_bytes, &pk) { Ok(_) => Ok(true), Err(_) => Ok(false), } } } #[async_trait] impl CryptoBackend for OqsBackend { async fn generate_keypair(&self, algorithm: KeyAlgorithm) -> CryptoResult { match 
algorithm { KeyAlgorithm::MlKem768 => { let kem_pair = self.kem_keypair().await?; Ok(KeyPair { public_key: kem_pair.public_key, private_key: kem_pair.secret_key, }) } KeyAlgorithm::MlDsa65 => { let mut cache = self.sig_cache.lock().await; if cache.is_none() { *cache = Some(sig::Sig::new(self.sig_algorithm)?); } let sig_obj = cache.as_ref().unwrap(); let (pk, sk) = sig_obj.keypair()?; Ok(KeyPair { public_key: pk.into_vec(), private_key: sk.into_vec(), }) } _ => Err(CryptoError::UnsupportedAlgorithm), } } async fn encrypt(&self, plaintext: &[u8]) -> CryptoResult> { // Generate ephemeral keypair let kem_pair = self.kem_keypair().await?; // Encapsulate to get shared secret let kem_result = self.kem_encapsulate(&kem_pair.public_key).await?; // Use shared secret for AES-256-GCM encryption let cipher = Aes256Gcm::new_from_slice(&kem_result.shared_secret)?; let nonce = Aes256Gcm::generate_nonce(&mut OsRng); let ciphertext = cipher.encrypt(&nonce, plaintext)?; // Prepend ciphertext with KEM ciphertext and nonce let mut result = Vec::new(); result.extend_from_slice(&kem_result.ciphertext); result.extend_from_slice(&nonce); result.extend_from_slice(&ciphertext); Ok(result) } async fn decrypt(&self, ciphertext: &[u8]) -> CryptoResult> { // Extract KEM ciphertext, nonce, and encrypted data let kem_ct = &ciphertext[..1088]; // ML-KEM-768 ciphertext size let nonce = &ciphertext[1088..1088+12]; let encrypted = &ciphertext[1088+12..]; // Decapsulate to get shared secret (requires secret key, stored in vault) // This is simplified - in practice, secret key would be retrieved securely // Use shared secret for AES-256-GCM decryption // ... 
(implementation details) Ok(plaintext) } } ``` ### Secrets Engines ```rust // src/engines/mod.rs #[async_trait] pub trait Engine: Send + Sync { fn name(&self) -> &str; fn engine_type(&self) -> &str; async fn read(&self, path: &str) -> Result>; async fn write(&self, path: &str, data: &Value) -> Result<()>; async fn delete(&self, path: &str) -> Result<()>; async fn list(&self, prefix: &str) -> Result>; } // src/engines/kv.rs (Key-Value Engine) pub struct KvEngine { storage: Arc, max_versions: usize, } impl KvEngine { pub fn new(storage: Arc) -> Self { Self { storage, max_versions: 10, // Keep 10 versions by default } } } #[async_trait] impl Engine for KvEngine { fn name(&self) -> &str { "kv" } fn engine_type(&self) -> &str { "kv-v2" } async fn write(&self, path: &str, data: &Value) -> Result<()> { let full_path = format!("secret/data/{}", path); // Get current version let current_version = self.get_current_version(path).await?; let new_version = current_version + 1; // Create versioned entry let entry = VersionedSecret { version: new_version, created_time: Utc::now(), data: data.clone(), }; // Store self.storage.store_secret(&full_path, &entry).await?; // Cleanup old versions self.cleanup_old_versions(path, new_version).await?; Ok(()) } async fn read(&self, path: &str) -> Result> { let full_path = format!("secret/data/{}", path); match self.storage.get_secret(&full_path).await? 
{ Some(entry) => Ok(Some(entry.data)), None => Ok(None), } } } // src/engines/database.rs (Dynamic Credentials) pub struct DatabaseEngine { storage: Arc, connections: Arc>>, } #[async_trait] impl Engine for DatabaseEngine { fn name(&self) -> &str { "database" } fn engine_type(&self) -> &str { "database" } async fn write(&self, path: &str, data: &Value) -> Result<()> { // Configure database connection let config: DatabaseConfig = serde_json::from_value(data.clone())?; let connection = DatabaseConnection::new(&config).await?; self.connections.write().await.insert(path.to_string(), connection); Ok(()) } async fn read(&self, path: &str) -> Result> { // Generate dynamic credentials // path format: "database/creds/{role}" if !path.starts_with("creds/") { return Ok(None); } let role = path.strip_prefix("creds/").unwrap(); let role_config: RoleConfig = self.get_role_config(role).await?; // Generate username/password let username = format!("v-{}-{}", role, Uuid::new_v4()); let password = generate_secure_password(32); // Create user in database let connections = self.connections.read().await; let db_conn = connections.get(&role_config.db_name) .ok_or_else(|| anyhow!("Database connection not found"))?; db_conn.execute(&role_config.creation_statements .replace("{{name}}", &username) .replace("{{password}}", &password) .replace("{{expiration}}", &format_expiration(role_config.default_ttl)) ).await?; // Create lease for cleanup let lease_id = format!("database/creds/{}/{}", role, Uuid::new_v4()); self.create_lease(&lease_id, role_config.default_ttl, move |vault | { // Revoke credentials on lease expiration async move { db_conn.execute(&format!("DROP USER '{}'", username)).await?; Ok(()) } }).await?; Ok(Some(json!({ "lease_id": lease_id, "lease_duration": role_config.default_ttl.as_secs(), "username": username, "password": password, }))) } } ``` ### Configuration (TOML) ```toml # svault.toml [vault] # Crypto backend: openssl | oqs | aws-lc | rustcrypto crypto_backend = "oqs" # 
Post-quantum by default [server] address = "0.0.0.0:8200" tls_cert = "/etc/svault/certs/server.pem" tls_key = "/etc/svault/certs/server-key.pem" # Client certificate verification (mTLS) client_ca = "/etc/svault/certs/ca.pem" require_client_cert = false [storage] # Backend: filesystem | etcd | surrealdb | postgresql backend = "etcd" [storage.etcd] endpoints = ["http://etcd-01:2379", "http://etcd-02:2379", "http://etcd-03:2379"] username = "svault" password = "secret" # TLS for etcd ca_cert = "/etc/svault/etcd-ca.pem" client_cert = "/etc/svault/etcd-client.pem" client_key = "/etc/svault/etcd-client-key.pem" [seal.shamir] # Shamir secret sharing configuration shares = 5 threshold = 3 [auth] # Token TTL token_ttl = "24h" token_max_ttl = "720h" # 30 days [audit] # Audit log retention enabled = true retention_days = 2555 # 7 years backend = "file" path = "/var/log/svault/audit.log" [engines] # Default engines to mount on init kv = { path = "secret", version = 2 } transit = { path = "transit" } pki = { path = "pki", max_lease_ttl = "87600h" } # 10 years database = { path = "database" } [metrics] # Prometheus metrics enabled = true address = "0.0.0.0:9090" ``` --- ## 3. 
Vapora: Ops Agents Specifications ### Agent Roles for Ops ```rust // crates/vapora-agents/src/roles.rs #[derive(Debug, Clone, Serialize, Deserialize)] pub enum AgentRole { // Development roles Architect, Developer, CodeReviewer, Tester, Documenter, // Marketing/Communication Marketer, Presenter, // Ops/DevOps roles DevOps, // CI/CD, deployment, automation Monitor, // Health checks, alerting, metrics Security, // Vulnerability scanning, compliance // Management ProjectManager, DecisionMaker, } impl AgentRole { pub fn default_provider(&self) -> LLMProvider { match self { // High-complexity ops tasks: Claude Opus AgentRole::Security => LLMProvider::Claude { model: "claude-opus-4-20250514" }, AgentRole::DecisionMaker => LLMProvider::Claude { model: "claude-opus-4-20250514" }, // Standard ops tasks: Claude Sonnet AgentRole::DevOps => LLMProvider::Claude { model: "claude-sonnet-4-20250514" }, AgentRole::ProjectManager => LLMProvider::Claude { model: "claude-sonnet-4-20250514" }, // Real-time monitoring: Gemini Flash (low latency) AgentRole::Monitor => LLMProvider::Gemini { model: "gemini-2.0-flash-exp" }, _ => LLMProvider::Claude { model: "claude-sonnet-4-20250514" }, } } pub fn can_block_pipeline(&self) -> bool { matches!(self, AgentRole::Security) } pub fn requires_approval(&self) -> bool { matches!(self, AgentRole::DevOps | AgentRole::Security) } } ``` ### NATS Message Patterns (Ops) ```rust // crates/vapora-agents/src/messages.rs #[derive(Debug, Clone, Serialize, Deserialize)] pub enum AgentMessage { TaskAssignment { task_id: String, agent_id: String, agent_role: AgentRole, task_type: String, payload: serde_json::Value, priority: Priority, }, TaskResult { task_id: String, agent_id: String, agent_role: AgentRole, status: TaskStatus, output: Option, duration_ms: u64, tokens_used: u32, cost_cents: f64, }, Heartbeat { agent_id: String, agent_role: AgentRole, status: AgentStatus, current_load: f64, last_task_completed_at: Option>, }, Alert { severity: AlertSeverity, 
source: String, message: String, metadata: serde_json::Value, }, ApprovalRequest { task_id: String, requester: AgentRole, action: String, details: serde_json::Value, }, ApprovalResponse { task_id: String, approved: bool, approver: String, reason: Option, }, } // NATS subjects pub const TASK_ASSIGNMENT: &str = "vapora.tasks.assign"; pub const TASK_RESULTS: &str = "vapora.tasks.results"; pub const AGENT_HEARTBEAT: &str = "vapora.agents.heartbeat"; pub const ALERTS: &str = "vapora.alerts"; pub const APPROVALS_REQUEST: &str = "vapora.approvals.request"; pub const APPROVALS_RESPONSE: &str = "vapora.approvals.response"; #[derive(Debug, Clone, Serialize, Deserialize)] pub enum AlertSeverity { Info, Warning, Error, Critical, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum Priority { Low = 1, Normal = 2, High = 3, Critical = 4, } ``` ### Budget Control (Ops Agents) ```rust // crates/vapora-llm-router/src/budget.rs use std::collections::HashMap; use serde::{Deserialize, Serialize}; use chrono::{DateTime, Utc}; #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BudgetConfig { pub role: AgentRole, pub monthly_limit_cents: u32, pub weekly_limit_cents: Option, pub enforcement: BudgetEnforcement, pub fallback_chain: Vec, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum BudgetEnforcement { Normal, // Under 80% of limit NearThreshold, // 80-100% of limit, use fallback Exceeded, // Over limit, block or use cheapest fallback only } pub struct BudgetTracker { configs: HashMap, usage: Arc>>, } impl BudgetTracker { pub async fn check_budget(&self, role: AgentRole, estimated_cost_cents: f64) -> BudgetEnforcement { let config = self.configs.get(&role).unwrap(); let usage = self.usage.read().await; let stats = usage.get(&role).unwrap_or(&UsageStats::default()); let monthly_usage = stats.monthly_cost_cents; let weekly_usage = stats.weekly_cost_cents; // Check weekly limit first (if set) if let Some(weekly_limit) = config.weekly_limit_cents { if weekly_usage + 
estimated_cost_cents > weekly_limit as f64 { return BudgetEnforcement::Exceeded; } else if weekly_usage + estimated_cost_cents > (weekly_limit as f64 * 0.8) { return BudgetEnforcement::NearThreshold; } } // Check monthly limit if monthly_usage + estimated_cost_cents > config.monthly_limit_cents as f64 { BudgetEnforcement::Exceeded } else if monthly_usage + estimated_cost_cents > (config.monthly_limit_cents as f64 * 0.8) { BudgetEnforcement::NearThreshold } else { BudgetEnforcement::Normal } } pub async fn select_provider(&self, role: AgentRole, task_type: &str) -> LLMProvider { let enforcement = self.check_budget(role, self.estimate_cost(task_type)).await; let config = self.configs.get(&role).unwrap(); match enforcement { BudgetEnforcement::Normal => { // Use default provider for role role.default_provider() } BudgetEnforcement::NearThreshold => { // Use first fallback (cheaper) config.fallback_chain.get(0) .cloned() .unwrap_or_else(|| role.default_provider()) } BudgetEnforcement::Exceeded => { // Use cheapest fallback (typically Ollama local) config.fallback_chain.last() .cloned() .unwrap_or_else(|| LLMProvider::Ollama { model: "llama3.1:8b" }) } } } pub async fn record_usage(&self, role: AgentRole, cost_cents: f64) { let mut usage = self.usage.write().await; let stats = usage.entry(role).or_insert_with(UsageStats::default); stats.monthly_cost_cents += cost_cents; stats.weekly_cost_cents += cost_cents; stats.total_requests += 1; stats.last_updated = Utc::now(); } } #[derive(Debug, Clone, Serialize, Deserialize, Default)] pub struct UsageStats { pub monthly_cost_cents: f64, pub weekly_cost_cents: f64, pub total_requests: u64, pub last_updated: DateTime, } ``` ### Prometheus Metrics (Ops) ```rust // crates/vapora-telemetry/src/metrics.rs use prometheus::{Encoder, Gauge, Counter, Histogram, Registry}; pub struct VaporaMetrics { registry: Registry, // Budget metrics budget_utilization: Gauge, budget_exceeded_total: Counter, fallback_triggers_total: Counter, // Agent 
metrics active_agents: Gauge, task_duration_seconds: Histogram, task_status_total: Counter, // Cost metrics llm_cost_cents_total: Counter, tokens_used_total: Counter, } impl VaporaMetrics { pub fn new() -> Self { let registry = Registry::new(); let budget_utilization = Gauge::new( "vapora_budget_utilization_ratio", "Budget utilization ratio (0.0-1.0) per agent role" ).unwrap(); let budget_exceeded_total = Counter::new( "vapora_budget_exceeded_total", "Total number of budget exceeded events per agent role" ).unwrap(); let fallback_triggers_total = Counter::new( "vapora_fallback_triggers_total", "Total number of fallback provider triggers due to budget" ).unwrap(); let active_agents = Gauge::new( "vapora_active_agents", "Number of active agents by role and status" ).unwrap(); let task_duration_seconds = Histogram::new( "vapora_task_duration_seconds", "Task execution duration in seconds" ).unwrap(); let task_status_total = Counter::new( "vapora_task_status_total", "Total tasks by status (success, failed, timeout)" ).unwrap(); let llm_cost_cents_total = Counter::new( "vapora_llm_cost_cents_total", "Total LLM cost in cents per provider and role" ).unwrap(); let tokens_used_total = Counter::new( "vapora_tokens_used_total", "Total tokens used per provider and role" ).unwrap(); registry.register(Box::new(budget_utilization.clone())).unwrap(); registry.register(Box::new(budget_exceeded_total.clone())).unwrap(); registry.register(Box::new(fallback_triggers_total.clone())).unwrap(); registry.register(Box::new(active_agents.clone())).unwrap(); registry.register(Box::new(task_duration_seconds.clone())).unwrap(); registry.register(Box::new(task_status_total.clone())).unwrap(); registry.register(Box::new(llm_cost_cents_total.clone())).unwrap(); registry.register(Box::new(tokens_used_total.clone())).unwrap(); Self { registry, budget_utilization, budget_exceeded_total, fallback_triggers_total, active_agents, task_duration_seconds, task_status_total, llm_cost_cents_total, 
tokens_used_total, } } pub fn export(&self) -> String { let encoder = prometheus::TextEncoder::new(); let metric_families = self.registry.gather(); let mut buffer = Vec::new(); encoder.encode(&metric_families, &mut buffer).unwrap(); String::from_utf8(buffer).unwrap() } } ``` --- ## 4. TypeDialog (prov-gen): IaC Generation Specifications ### Prov-Gen Backend ```rust // crates/typedialog-prov-gen/src/lib.rs use tera::{Tera, Context}; use serde::{Deserialize, Serialize}; pub struct ProvGenBackend { templates: Tera, validators: Vec<Box<dyn Validator>>, } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct InfrastructureConfig { pub provider: CloudProvider, pub region: String, pub resources: Vec<ResourceConfig>, pub networking: NetworkConfig, pub security: SecurityConfig, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum CloudProvider { Aws, Gcp, Azure, Hetzner, UpCloud, Local, // LXD } pub struct Generator { templates: tera::Tera, validators: Vec<Box<dyn Validator>>, } impl Generator { pub async fn generate(&self, config: &InfrastructureConfig) -> Result<GeneratedIaC> { // 1. Validate input config (7-layer validation) self.validate_config(config)?; // 2. Load provider-specific template let template_name = format!("{}.ncl.tera", config.provider.as_str()); let template = self.templates.get_template(&template_name)?; // 3. Create template context let mut context = Context::new(); context.insert("provider", &config.provider); context.insert("region", &config.region); context.insert("resources", &config.resources); context.insert("networking", &config.networking); context.insert("security", &config.security); // 4. Render Nickel configuration let nickel_code = template.render(&context)?; // 5. Validate generated Nickel self.validate_nickel(&nickel_code)?; // 6. 
Split into logical files let files = self.split_to_files(&nickel_code)?; Ok(GeneratedIaC { provider: config.provider.clone(), main_file: nickel_code, files, validation_passed: true, }) } fn validate_config(&self, config: &InfrastructureConfig) -> Result<()> { for validator in &self.validators { validator.validate(config)?; } Ok(()) } fn validate_nickel(&self, code: &str) -> Result<()> { // Run nickel typecheck let output = std::process::Command::new("nickel") .arg("typecheck") .arg("--stdin") .stdin(std::process::Stdio::piped()) .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) .spawn()? .stdin.unwrap().write_all(code.as_bytes())?; // Check exit status // ... (implementation details) Ok(()) } } ``` ### Templates (Tera + Nickel) ```tera {# templates/aws.ncl.tera - AWS multi-cloud template #} {# Generated by TypeDialog prov-gen backend #} { provider = "aws", region = "{{ region }}", {% if resources.servers %} servers = [ {% for server in resources.servers %} { name = "{{ server.name }}", plan = "{{ server.plan }}", role = {% if server.role %}"{{ server.role }}"{% else %}null{% endif %}, provider = "aws", spec = { cpu = {{ server.cpu | default(value=2) }}, memory_gb = {{ server.memory_gb | default(value=4) }}, disk_gb = {{ server.disk_gb | default(value=50) }}, os = { family = "ubuntu", version = "{{ server.os_version | default(value='22.04') }}", }, }, networking = { vpc = "{{ networking.vpc_id }}", subnet = "{{ networking.subnet_id }}", public_ip = {{ server.public_ip | default(value=false) }}, security_groups = {{ server.security_groups | default(value=[]) | json_encode }}, }, tags = { Environment = "{{ environment | default(value='production') }}", ManagedBy = "provisioning", {% for key, value in server.tags %} "{{ key }}" = "{{ value }}", {% endfor %} }, }, {% endfor %} ], {% endif %} {% if resources.taskservs %} taskservs = {{ resources.taskservs | json_encode }}, {% endif %} networking = { vpc_cidr = "{{ networking.vpc_cidr | 
default(value='10.0.0.0/16') }}", {% if networking.pod_cidr %} pod_cidr = "{{ networking.pod_cidr }}", service_cidr = "{{ networking.service_cidr }}", {% endif %} }, {% if security %} security = { {% if security.enable_encryption %} encryption_at_rest = true, kms_key_id = "{{ security.kms_key_id }}", {% endif %} {% if security.enable_audit_logging %} audit_logging = { enabled = true, retention_days = {{ security.audit_retention_days | default(value=2555) }}, }, {% endif %} }, {% endif %} } ``` --- ## 5. Kogral: Knowledge Management Specifications ### Node Types (Ops Focus) ```rust // kogral-core/src/models.rs #[derive(Debug, Clone, Serialize, Deserialize)] pub enum NodeType { Note, // General notes, documentation Decision, // ADRs (Architectural Decision Records) Guideline, // Team/org standards, policies Pattern, // Reusable solutions, best practices Journal, // Daily development/ops log Execution, // Agent execution records, postmortems, incidents } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct Node { pub id: String, pub node_type: NodeType, pub title: String, pub content: String, // Markdown body pub metadata: HashMap<String, String>, pub tags: Vec<String>, pub created_at: DateTime<Utc>, pub updated_at: DateTime<Utc>, pub author: Option<String>, } // Example: Execution node for incident postmortem impl Node { pub fn new_execution(title: &str, incident_details: IncidentDetails) -> Self { let content = format!( "# {}\n\n\ ## Timeline\n\ - **Started**: {}\n\ - **Detected**: {}\n\ - **Resolved**: {}\n\ - **Duration**: {:?}\n\n\ ## Root Cause\n\ {}\n\n\ ## Resolution\n\ {}\n\n\ ## Action Items\n\ {}\n\n\ ## Related Resources\n\ {}", title, incident_details.started_at, incident_details.detected_at, incident_details.resolved_at, incident_details.duration, incident_details.root_cause, incident_details.resolution, incident_details.action_items.join("\n"), incident_details.related_resources.join("\n"), ); Self { id: Uuid::new_v4().to_string(), node_type: NodeType::Execution, title: title.to_string(), 
content, metadata: incident_details.metadata, tags: incident_details.tags, created_at: Utc::now(), updated_at: Utc::now(), author: Some(incident_details.author), } } } #[derive(Debug, Clone, Serialize, Deserialize)] pub struct IncidentDetails { pub started_at: DateTime<Utc>, pub detected_at: DateTime<Utc>, pub resolved_at: DateTime<Utc>, pub duration: std::time::Duration, pub root_cause: String, pub resolution: String, pub action_items: Vec<String>, pub related_resources: Vec<String>, pub metadata: HashMap<String, String>, pub tags: Vec<String>, pub author: String, } ``` ### MCP Tools (Ops Workflows) ```bash # Search troubleshooting runbooks kogral-mcp search "nginx 502 error troubleshooting" --type note # Add incident postmortem kogral-mcp add-execution \ --title "2026-01-22 PostgreSQL Connection Pool Exhaustion" \ --context "Production database connections maxed out at 100/100" \ --root-cause "Connection leak in application code, connections not released" \ --resolution "Increased max_connections from 100 to 200, added PgBouncer pooler, fixed connection leak" \ --action-items "Implement connection pool monitoring, add alerts at 80% utilization" \ --tags "database,incident,postgresql,production" # Get deployment guidelines kogral-mcp get-guidelines "kubernetes deployment" --include-shared true # Create infrastructure decision ADR kogral-mcp add-decision \ --title "Choose Cilium over Calico for CNI" \ --context "Need Kubernetes CNI with eBPF support and service mesh capabilities" \ --decision "Selected Cilium for better performance (eBPF) and built-in service mesh" \ --consequences "Higher complexity initially, better performance long-term, requires Linux kernel 4.9+" # List all postmortems (Execution nodes) kogral-mcp list --type execution --tags "incident" # Export knowledge graph to markdown kogral-mcp export --format markdown --output /docs/ops-knowledge/ ``` --- ## 6. 
Integration between Projects (Ops Stack) ### Data Flow Diagram ```text ┌───────────────────┐ │ Kogral │ │ (Runbooks, ADRs) │ └─────────┬─────────┘ │ MCP (operational knowledge) │ ┌─────────────────────────┼─────────────────────────┐ │ │ │ ▼ ▼ ▼ ┌─────────────┐ ┌─────────────────┐ ┌─────────────────┐ │ TypeDialog │ │ Vapora │ │ Provisioning │ │ (Wizards) │ │ (Ops Agents) │ │ (IaC Deploy) │ └──────┬──────┘ └────────┬────────┘ └────────┬────────┘ │ │ │ │ ┌─────────────┴─────────────┐ │ │ │ │ │ ▼ ▼ ▼ ▼ ┌───────────────────────────────────────────────┐ │ SECRETUMVAULT │ │ PKI certs │ DB creds │ API keys │ Encryption │ └───────────────────────────────────────────────┘ │ ▼ ┌───────────────────────────────────────────────┐ │ PERSISTENCE LAYER │ │ SurrealDB │ NATS JetStream │ etcd │ Git │ └───────────────────────────────────────────────┘ ``` ### Shared Dependencies (Ops Stack) ```toml # Common dependencies (Cargo.toml) [dependencies] # Runtime tokio = { version = "1.48", features = ["full"] } # Serialization serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" toml = "0.8" # Database surrealdb = "2.3" etcd-client = "0.14" # Web/API axum = { version = "0.8", features = ["macros"] } tower = "0.5" tower-http = { version = "0.6", features = ["cors", "compression-gzip"] } # Config nickel-lang-core = "1.15" # Logging/Tracing tracing = "0.1" tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] } tracing-opentelemetry = "0.27" # Metrics prometheus = "0.13" # Security cedar-policy = "4.3" jsonwebtoken = "9.3" # Crypto openssl = { version = "0.10", optional = true } oqs = { version = "0.10", optional = true } # Post-Quantum # Error handling anyhow = "1.0" thiserror = "2.0" ``` --- ## 7. 
Quality Metrics (Ops Perspective) | Project | Tests | Coverage | Clippy | Unsafe Blocks | Performance | | --------- | ------- | ---------- | -------- | --------------- | ------------- | | **Provisioning** | 218 | ~65% | 0 warnings | 0 | Rust orchestrator 10-50x Python | | **SecretumVault** | 50+ | ~75% | 0 warnings | 0 | Crypto ops <10ms (classical), <20ms (PQC) | | **Vapora** | 218 | ~70% | 0 warnings | 0 | NATS latency <5ms, task assignment <100ms | | **TypeDialog** | 3,818 | ~85% | 0 warnings | 0 | Form validation <1ms, IaC gen <500ms | | **Kogral** | 56 | ~80% | 0 warnings | 0 | Semantic search <200ms (fastembed local) | ### Ops Verification Commands ```bash # Provisioning cd provisioning cargo clippy --all-targets --all-features -- -D warnings cargo test --workspace just ci-test # Run CI tests locally # SecretumVault cd secretumvault cargo test --all-features cargo bench # Crypto benchmarks # Vapora cd vapora cargo test --workspace docker-compose up -d # Integration tests with NATS + SurrealDB # TypeDialog cd typedialog cargo test --workspace --all-features cargo run --example prov-gen # Test IaC generation # Kogral cd kogral cargo test kogral serve & # Start MCP server curl http://localhost:3100/health # Health check ``` --- *Document generated: 2026-01-22* *Type: info (Ops/DevOps technical specifications)*