//! Monitoring and metrics system for workflow orchestrator //! //! This module provides comprehensive monitoring, metrics collection, and //! health checking functionality with Prometheus-compatible metrics export //! and real-time updates via WebSocket/SSE. use std::{ collections::HashMap, sync::{ atomic::{AtomicU64, AtomicUsize, Ordering}, Arc, }, time::{Duration, Instant}, }; use anyhow::Result; use async_trait::async_trait; use axum::{ extract::{ws::WebSocket, WebSocketUpgrade}, routing::get, Router, }; use serde::{Deserialize, Serialize}; use tokio::sync::{broadcast, Mutex, RwLock}; use tracing::{debug, error, info, warn}; use crate::{ state::{ComponentHealth, HealthStatus, WorkflowStateManager}, storage::{TaskEvent, TaskStorage}, }; /// Configuration for monitoring system #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MonitoringConfig { /// Metrics collection interval in seconds pub metrics_interval_seconds: u64, /// Health check interval in seconds pub health_check_interval_seconds: u64, /// Maximum number of metrics to retain in memory pub max_metrics_in_memory: usize, /// Enable Prometheus metrics export pub enable_prometheus: bool, /// Prometheus metrics path pub prometheus_path: String, /// Enable WebSocket real-time updates pub enable_websocket: bool, /// WebSocket endpoint path pub websocket_path: String, /// Enable Server-Sent Events pub enable_sse: bool, /// SSE endpoint path pub sse_path: String, } impl Default for MonitoringConfig { fn default() -> Self { Self { metrics_interval_seconds: 30, health_check_interval_seconds: 60, max_metrics_in_memory: 1000, enable_prometheus: true, prometheus_path: "/metrics".to_string(), enable_websocket: true, websocket_path: "/ws".to_string(), enable_sse: true, sse_path: "/events".to_string(), } } } /// Real-time monitoring event for WebSocket/SSE #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MonitoringEvent { pub event_type: MonitoringEventType, pub timestamp: chrono::DateTime, pub data: serde_json::Value, pub metadata: HashMap, } #[derive(Debug, Clone, Serialize, Deserialize)] pub enum MonitoringEventType { TaskStatusChanged, WorkflowCompleted, HealthStatusChanged, MetricsUpdated, SystemAlert, PerformanceWarning, } /// Performance metrics collector #[derive(Debug)] pub struct MetricsCollector { /// Task execution metrics task_counter: AtomicUsize, completed_tasks: AtomicUsize, failed_tasks: AtomicUsize, average_task_duration_ms: AtomicU64, /// System performance metrics memory_usage_mb: AtomicU64, cpu_usage_percent: AtomicU64, // Store as integer (percent * 100) /// Workflow metrics active_workflows: AtomicUsize, completed_workflows: AtomicUsize, /// Storage metrics storage_operations: AtomicUsize, storage_errors: AtomicUsize, /// Custom metrics storage custom_metrics: Arc>>, /// Metrics history for trends metrics_history: Arc>>, start_time: Instant, } impl Default for MetricsCollector { fn default() -> Self { Self::new() } } impl MetricsCollector { /// Create new metrics collector pub fn new() -> Self { Self { task_counter: AtomicUsize::new(0), completed_tasks: AtomicUsize::new(0), failed_tasks: AtomicUsize::new(0), average_task_duration_ms: AtomicU64::new(0), memory_usage_mb: AtomicU64::new(0), cpu_usage_percent: AtomicU64::new(0), active_workflows: AtomicUsize::new(0), completed_workflows: AtomicUsize::new(0), storage_operations: AtomicUsize::new(0), storage_errors: AtomicUsize::new(0), custom_metrics: Arc::new(RwLock::new(HashMap::new())), metrics_history: Arc::new(RwLock::new(Vec::new())), start_time: Instant::now(), } } /// Increment task counter pub fn increment_task_counter(&self) { self.task_counter.fetch_add(1, Ordering::Relaxed); } /// Record task completion pub fn record_task_completion(&self, duration_ms: u64) { self.completed_tasks.fetch_add(1, Ordering::Relaxed); // Update average duration (simple moving average) let current_avg = self.average_task_duration_ms.load(Ordering::Relaxed); let completed = self.completed_tasks.load(Ordering::Relaxed); if completed > 0 { let new_avg = ((current_avg * (completed - 1) as u64) + duration_ms) / completed as u64; self.average_task_duration_ms .store(new_avg, Ordering::Relaxed); } } /// Record task failure pub fn record_task_failure(&self) { self.failed_tasks.fetch_add(1, Ordering::Relaxed); } /// Update system metrics pub fn update_system_metrics(&self, memory_mb: u64, cpu_percent: f64) { self.memory_usage_mb.store(memory_mb, Ordering::Relaxed); self.cpu_usage_percent .store((cpu_percent * 100.0) as u64, Ordering::Relaxed); } /// Update workflow metrics pub fn update_workflow_metrics(&self, active: usize, completed: usize) { self.active_workflows.store(active, Ordering::Relaxed); self.completed_workflows.store(completed, Ordering::Relaxed); } /// Record storage operation pub fn record_storage_operation(&self, success: bool) { self.storage_operations.fetch_add(1, Ordering::Relaxed); if !success { self.storage_errors.fetch_add(1, Ordering::Relaxed); } } /// Set custom metric pub async fn set_custom_metric(&self, name: String, value: f64) { let mut metrics = self.custom_metrics.write().await; metrics.insert(name, value); } /// Get custom metric pub async fn get_custom_metric(&self, name: &str) -> Option { let metrics = self.custom_metrics.read().await; metrics.get(name).copied() } /// Get all current metrics pub async fn get_current_metrics(&self) -> MetricsSnapshot { let custom_metrics = { let metrics = self.custom_metrics.read().await; metrics.clone() }; MetricsSnapshot { timestamp: chrono::Utc::now(), total_tasks: self.task_counter.load(Ordering::Relaxed), completed_tasks: self.completed_tasks.load(Ordering::Relaxed), failed_tasks: self.failed_tasks.load(Ordering::Relaxed), average_task_duration_ms: self.average_task_duration_ms.load(Ordering::Relaxed), memory_usage_mb: self.memory_usage_mb.load(Ordering::Relaxed), cpu_usage_percent: self.cpu_usage_percent.load(Ordering::Relaxed) as f64 / 100.0, active_workflows: self.active_workflows.load(Ordering::Relaxed), completed_workflows: self.completed_workflows.load(Ordering::Relaxed), storage_operations: self.storage_operations.load(Ordering::Relaxed), storage_errors: self.storage_errors.load(Ordering::Relaxed), uptime_seconds: self.start_time.elapsed().as_secs(), custom_metrics, } } /// Take snapshot of current metrics pub async fn take_snapshot(&self) -> Result<()> { let snapshot = self.get_current_metrics().await; let mut history = self.metrics_history.write().await; history.push(snapshot); // Keep only last 100 snapshots let history_len = history.len(); if history_len > 100 { history.drain(0..history_len - 100); } Ok(()) } /// Get metrics history pub async fn get_metrics_history(&self) -> Vec { let history = self.metrics_history.read().await; history.clone() } /// Generate Prometheus-compatible metrics string pub async fn generate_prometheus_metrics(&self) -> String { let metrics = self.get_current_metrics().await; let mut output = String::new(); // Basic metrics output.push_str("# HELP orchestrator_tasks_total Total number of tasks processed\n"); output.push_str("# TYPE orchestrator_tasks_total counter\n"); output.push_str(&format!( "orchestrator_tasks_total {}\n", metrics.total_tasks )); output.push_str("# HELP orchestrator_tasks_completed Total number of completed tasks\n"); output.push_str("# TYPE orchestrator_tasks_completed counter\n"); output.push_str(&format!( "orchestrator_tasks_completed {}\n", metrics.completed_tasks )); output.push_str("# HELP orchestrator_tasks_failed Total number of failed tasks\n"); output.push_str("# TYPE orchestrator_tasks_failed counter\n"); output.push_str(&format!( "orchestrator_tasks_failed {}\n", metrics.failed_tasks )); output.push_str( "# HELP orchestrator_task_duration_ms Average task duration in milliseconds\n", ); output.push_str("# TYPE orchestrator_task_duration_ms gauge\n"); output.push_str(&format!( "orchestrator_task_duration_ms {}\n", metrics.average_task_duration_ms )); // System metrics output.push_str("# HELP orchestrator_memory_usage_mb Current memory usage in MB\n"); output.push_str("# TYPE orchestrator_memory_usage_mb gauge\n"); output.push_str(&format!( "orchestrator_memory_usage_mb {}\n", metrics.memory_usage_mb )); output.push_str("# HELP orchestrator_cpu_usage_percent Current CPU usage percentage\n"); output.push_str("# TYPE orchestrator_cpu_usage_percent gauge\n"); output.push_str(&format!( "orchestrator_cpu_usage_percent {}\n", metrics.cpu_usage_percent )); // Workflow metrics output.push_str("# HELP orchestrator_workflows_active Currently active workflows\n"); output.push_str("# TYPE orchestrator_workflows_active gauge\n"); output.push_str(&format!( "orchestrator_workflows_active {}\n", metrics.active_workflows )); output.push_str("# HELP orchestrator_workflows_completed Total completed workflows\n"); output.push_str("# TYPE orchestrator_workflows_completed counter\n"); output.push_str(&format!( "orchestrator_workflows_completed {}\n", metrics.completed_workflows )); // Storage metrics output.push_str("# HELP orchestrator_storage_operations_total Total storage operations\n"); output.push_str("# TYPE orchestrator_storage_operations_total counter\n"); output.push_str(&format!( "orchestrator_storage_operations_total {}\n", metrics.storage_operations )); output.push_str("# HELP orchestrator_storage_errors_total Total storage errors\n"); output.push_str("# TYPE orchestrator_storage_errors_total counter\n"); output.push_str(&format!( "orchestrator_storage_errors_total {}\n", metrics.storage_errors )); // Uptime output.push_str("# HELP orchestrator_uptime_seconds System uptime in seconds\n"); output.push_str("# TYPE orchestrator_uptime_seconds gauge\n"); output.push_str(&format!( "orchestrator_uptime_seconds {}\n", metrics.uptime_seconds )); // Custom metrics for (name, value) in &metrics.custom_metrics { let metric_name = format!("orchestrator_custom_{}", name.replace("-", "_")); output.push_str(&format!("# HELP {} Custom metric: {}\n", metric_name, name)); output.push_str(&format!("# TYPE {} gauge\n", metric_name)); output.push_str(&format!("{} {}\n", metric_name, value)); } output } } /// Snapshot of metrics at a point in time #[derive(Debug, Clone, Serialize, Deserialize)] pub struct MetricsSnapshot { pub timestamp: chrono::DateTime, pub total_tasks: usize, pub completed_tasks: usize, pub failed_tasks: usize, pub average_task_duration_ms: u64, pub memory_usage_mb: u64, pub cpu_usage_percent: f64, pub active_workflows: usize, pub completed_workflows: usize, pub storage_operations: usize, pub storage_errors: usize, pub uptime_seconds: u64, pub custom_metrics: HashMap, } /// System health monitor pub struct HealthMonitor { storage: Arc, state_manager: Arc, health_checks: Arc>>>, last_health_check: Arc>, } impl HealthMonitor { /// Create new health monitor pub fn new(storage: Arc, state_manager: Arc) -> Self { Self { storage, state_manager, health_checks: Arc::new(RwLock::new(HashMap::new())), last_health_check: Arc::new(Mutex::new(Instant::now())), } } /// Register a health check pub async fn register_health_check(&self, name: String, health_check: H) where H: HealthCheck + Send + Sync + 'static, { let mut checks = self.health_checks.write().await; checks.insert(name, Box::new(health_check)); } /// Run all health checks pub async fn run_health_checks(&self) -> HashMap { let mut results = HashMap::new(); let checks = self.health_checks.read().await; // Default storage health check let storage_health = self.check_storage_health().await; results.insert("storage".to_string(), storage_health); // Run registered health checks for (name, check) in checks.iter() { match check.check().await { Ok(status) => { results.insert(name.clone(), status); } Err(e) => { let error_status = HealthStatus { component: name.clone(), status: ComponentHealth::Unhealthy, last_check: chrono::Utc::now(), details: HashMap::new(), error: Some(e.to_string()), }; results.insert(name.clone(), error_status); } } } // Update state manager for (component, status) in &results { self.state_manager .update_health_status( component, status.status.clone(), status.details.clone(), status.error.clone(), ) .await; } // Update last check time { let mut last_check = self.last_health_check.lock().await; *last_check = Instant::now(); } results } /// Check storage backend health async fn check_storage_health(&self) -> HealthStatus { let mut details = HashMap::new(); match self.storage.health_check().await { Ok(true) => { details.insert("status".to_string(), "operational".to_string()); // Check storage statistics if let Ok(stats) = self.storage.get_statistics().await { details.insert("total_tasks".to_string(), stats.total_tasks.to_string()); details.insert("pending_tasks".to_string(), stats.pending_tasks.to_string()); details.insert( "storage_size".to_string(), stats.total_storage_size.to_string(), ); } HealthStatus { component: "storage".to_string(), status: ComponentHealth::Healthy, last_check: chrono::Utc::now(), details, error: None, } } Ok(false) => HealthStatus { component: "storage".to_string(), status: ComponentHealth::Degraded, last_check: chrono::Utc::now(), details, error: Some("Storage health check returned false".to_string()), }, Err(e) => HealthStatus { component: "storage".to_string(), status: ComponentHealth::Unhealthy, last_check: chrono::Utc::now(), details, error: Some(e.to_string()), }, } } /// Get overall system health pub async fn get_system_health(&self) -> SystemHealthStatus { let health_results = self.run_health_checks().await; let total_components = health_results.len(); let healthy_components = health_results .values() .filter(|status| status.status == ComponentHealth::Healthy) .count(); let degraded_components = health_results .values() .filter(|status| status.status == ComponentHealth::Degraded) .count(); let unhealthy_components = health_results .values() .filter(|status| status.status == ComponentHealth::Unhealthy) .count(); let overall_status = if unhealthy_components > 0 { ComponentHealth::Unhealthy } else if degraded_components > 0 { ComponentHealth::Degraded } else { ComponentHealth::Healthy }; SystemHealthStatus { overall_status, total_components, healthy_components, degraded_components, unhealthy_components, component_details: health_results, last_check: chrono::Utc::now(), } } } /// Overall system health status #[derive(Debug, Clone, Serialize, Deserialize)] pub struct SystemHealthStatus { pub overall_status: ComponentHealth, pub total_components: usize, pub healthy_components: usize, pub degraded_components: usize, pub unhealthy_components: usize, pub component_details: HashMap, pub last_check: chrono::DateTime, } /// Health check trait for components #[async_trait] pub trait HealthCheck { async fn check(&self) -> Result; } /// Main monitoring system that coordinates all monitoring components pub struct MonitoringSystem { config: MonitoringConfig, metrics_collector: Arc, health_monitor: Arc, state_manager: Arc, event_broadcaster: broadcast::Sender, storage: Arc, } impl MonitoringSystem { /// Create new monitoring system pub fn new( config: MonitoringConfig, storage: Arc, state_manager: Arc, ) -> Self { let metrics_collector = Arc::new(MetricsCollector::new()); let health_monitor = Arc::new(HealthMonitor::new(storage.clone(), state_manager.clone())); let (event_broadcaster, _) = broadcast::channel(1000); Self { config, metrics_collector, health_monitor, state_manager, event_broadcaster, storage, } } /// Initialize monitoring system pub async fn init(&self) -> Result<()> { info!("Initializing monitoring system"); // Start background monitoring tasks self.start_monitoring_tasks().await?; info!("Monitoring system initialized successfully"); Ok(()) } /// Get metrics collector pub fn metrics_collector(&self) -> Arc { self.metrics_collector.clone() } /// Get health monitor pub fn health_monitor(&self) -> Arc { self.health_monitor.clone() } /// Create monitoring routes for web server pub fn create_routes(&self) -> Router where S: Clone + Send + Sync + 'static, { let mut router = Router::new(); if self.config.enable_prometheus { let metrics_collector = self.metrics_collector.clone(); router = router.route( &self.config.prometheus_path, get({ let metrics_collector = metrics_collector.clone(); move || async move { metrics_collector.generate_prometheus_metrics().await } }), ); } if self.config.enable_websocket { let event_broadcaster = self.event_broadcaster.clone(); let websocket_handler = |ws: WebSocketUpgrade| async move { ws.on_upgrade(move |socket| { handle_websocket_connection(socket, event_broadcaster.clone()) }) }; router = router.route(&self.config.websocket_path, get(websocket_handler)); } router } /// Publish monitoring event pub async fn publish_event(&self, event: MonitoringEvent) -> Result<()> { match self.event_broadcaster.send(event.clone()) { Ok(subscriber_count) => { debug!( "Published monitoring event to {} subscribers", subscriber_count ); } Err(_) => { // No subscribers, which is fine } } // Also store in storage if it's a task event if let MonitoringEventType::TaskStatusChanged = event.event_type { if let Ok(task_event) = serde_json::from_value::(event.data) { let _ = self.storage.publish_event(task_event).await; } } Ok(()) } /// Start background monitoring tasks async fn start_monitoring_tasks(&self) -> Result<()> { // Metrics collection task let metrics_collector = self.metrics_collector.clone(); let metrics_interval = Duration::from_secs(self.config.metrics_interval_seconds); tokio::spawn(async move { let mut interval = tokio::time::interval(metrics_interval); loop { interval.tick().await; if let Err(e) = metrics_collector.take_snapshot().await { error!("Failed to take metrics snapshot: {}", e); } } }); // Health monitoring task let health_monitor = self.health_monitor.clone(); let health_interval = Duration::from_secs(self.config.health_check_interval_seconds); let event_broadcaster = self.event_broadcaster.clone(); tokio::spawn(async move { let mut interval = tokio::time::interval(health_interval); loop { interval.tick().await; Self::process_health_checks(&health_monitor, &event_broadcaster).await; } }); Ok(()) } async fn process_health_checks( health_monitor: &Arc, event_broadcaster: &tokio::sync::broadcast::Sender, ) { let health_results = health_monitor.run_health_checks().await; // Broadcast health status changes for (component, status) in health_results { if status.status == ComponentHealth::Healthy { continue; } let metadata = HashMap::from([("component".to_string(), component)]); let event = MonitoringEvent { event_type: MonitoringEventType::HealthStatusChanged, timestamp: chrono::Utc::now(), data: serde_json::to_value(&status).unwrap_or_default(), metadata, }; let _ = event_broadcaster.send(event); } } } /// Handle WebSocket connection for real-time monitoring async fn handle_websocket_connection( mut socket: WebSocket, event_broadcaster: broadcast::Sender, ) { let mut event_receiver = event_broadcaster.subscribe(); loop { tokio::select! { event_result = event_receiver.recv() => { match event_result { Ok(event) => { if let Ok(json) = serde_json::to_string(&event) { if socket.send(axum::extract::ws::Message::Text(json.into())).await.is_err() { break; } } } Err(broadcast::error::RecvError::Lagged(_)) => { warn!("WebSocket client lagged behind, skipping events"); continue; } Err(broadcast::error::RecvError::Closed) => { break; } } } // Handle incoming WebSocket messages (ping/pong, client requests) msg_result = socket.recv() => { match msg_result { Some(Ok(msg)) => { match msg { axum::extract::ws::Message::Close(_) => break, axum::extract::ws::Message::Pong(_) => { // Handle pong response } _ => { // Handle other message types if needed } } } Some(Err(_)) | None => break, } } } } debug!("WebSocket connection closed"); } /// Example health check implementation for system resources pub struct SystemResourceHealthCheck { memory_threshold_mb: u64, cpu_threshold_percent: f64, } impl SystemResourceHealthCheck { pub fn new(memory_threshold_mb: u64, cpu_threshold_percent: f64) -> Self { Self { memory_threshold_mb, cpu_threshold_percent, } } async fn get_system_info(&self) -> Result<(u64, f64)> { // In a real implementation, this would collect actual system metrics // For now, return mock values let memory_mb = 512; // Mock memory usage let cpu_percent = 25.0; // Mock CPU usage Ok((memory_mb, cpu_percent)) } } #[async_trait] impl HealthCheck for SystemResourceHealthCheck { async fn check(&self) -> Result { let (memory_mb, cpu_percent) = self.get_system_info().await?; let mut details = HashMap::new(); details.insert("memory_usage_mb".to_string(), memory_mb.to_string()); details.insert("cpu_usage_percent".to_string(), cpu_percent.to_string()); details.insert( "memory_threshold_mb".to_string(), self.memory_threshold_mb.to_string(), ); details.insert( "cpu_threshold_percent".to_string(), self.cpu_threshold_percent.to_string(), ); let status = if memory_mb > self.memory_threshold_mb || cpu_percent > self.cpu_threshold_percent { if memory_mb > self.memory_threshold_mb * 2 || cpu_percent > self.cpu_threshold_percent * 2.0 { ComponentHealth::Unhealthy } else { ComponentHealth::Degraded } } else { ComponentHealth::Healthy }; let error = if status != ComponentHealth::Healthy { Some(format!( "Resource usage exceeds thresholds - Memory: {}MB (max: {}MB), CPU: {:.1}% (max: \ {:.1}%)", memory_mb, self.memory_threshold_mb, cpu_percent, self.cpu_threshold_percent )) } else { None }; Ok(HealthStatus { component: "system_resources".to_string(), status, last_check: chrono::Utc::now(), details, error, }) } }