Platform restructured into crates/, added AI service and detector,
migrated control-center-ui to Leptos 0.8
827 lines
28 KiB
Rust
827 lines
28 KiB
Rust
//! Monitoring and metrics system for workflow orchestrator
|
|
//!
|
|
//! This module provides comprehensive monitoring, metrics collection, and
|
|
//! health checking functionality with Prometheus-compatible metrics export
|
|
//! and real-time updates via WebSocket/SSE.
|
|
|
|
use std::{
|
|
collections::HashMap,
|
|
sync::{
|
|
atomic::{AtomicU64, AtomicUsize, Ordering},
|
|
Arc,
|
|
},
|
|
time::{Duration, Instant},
|
|
};
|
|
|
|
use anyhow::Result;
|
|
use async_trait::async_trait;
|
|
use axum::{
|
|
extract::{ws::WebSocket, WebSocketUpgrade},
|
|
routing::get,
|
|
Router,
|
|
};
|
|
use serde::{Deserialize, Serialize};
|
|
use tokio::sync::{broadcast, Mutex, RwLock};
|
|
use tracing::{debug, error, info, warn};
|
|
|
|
use crate::{
|
|
state::{ComponentHealth, HealthStatus, WorkflowStateManager},
|
|
storage::{TaskEvent, TaskStorage},
|
|
};
|
|
|
|
/// Configuration for monitoring system
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct MonitoringConfig {
|
|
/// Metrics collection interval in seconds
|
|
pub metrics_interval_seconds: u64,
|
|
/// Health check interval in seconds
|
|
pub health_check_interval_seconds: u64,
|
|
/// Maximum number of metrics to retain in memory
|
|
pub max_metrics_in_memory: usize,
|
|
/// Enable Prometheus metrics export
|
|
pub enable_prometheus: bool,
|
|
/// Prometheus metrics path
|
|
pub prometheus_path: String,
|
|
/// Enable WebSocket real-time updates
|
|
pub enable_websocket: bool,
|
|
/// WebSocket endpoint path
|
|
pub websocket_path: String,
|
|
/// Enable Server-Sent Events
|
|
pub enable_sse: bool,
|
|
/// SSE endpoint path
|
|
pub sse_path: String,
|
|
}
|
|
|
|
impl Default for MonitoringConfig {
|
|
fn default() -> Self {
|
|
Self {
|
|
metrics_interval_seconds: 30,
|
|
health_check_interval_seconds: 60,
|
|
max_metrics_in_memory: 1000,
|
|
enable_prometheus: true,
|
|
prometheus_path: "/metrics".to_string(),
|
|
enable_websocket: true,
|
|
websocket_path: "/ws".to_string(),
|
|
enable_sse: true,
|
|
sse_path: "/events".to_string(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Real-time monitoring event for WebSocket/SSE
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct MonitoringEvent {
|
|
pub event_type: MonitoringEventType,
|
|
pub timestamp: chrono::DateTime<chrono::Utc>,
|
|
pub data: serde_json::Value,
|
|
pub metadata: HashMap<String, String>,
|
|
}
|
|
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub enum MonitoringEventType {
|
|
TaskStatusChanged,
|
|
WorkflowCompleted,
|
|
HealthStatusChanged,
|
|
MetricsUpdated,
|
|
SystemAlert,
|
|
PerformanceWarning,
|
|
}
|
|
|
|
/// Performance metrics collector
|
|
#[derive(Debug)]
|
|
pub struct MetricsCollector {
|
|
/// Task execution metrics
|
|
task_counter: AtomicUsize,
|
|
completed_tasks: AtomicUsize,
|
|
failed_tasks: AtomicUsize,
|
|
average_task_duration_ms: AtomicU64,
|
|
|
|
/// System performance metrics
|
|
memory_usage_mb: AtomicU64,
|
|
cpu_usage_percent: AtomicU64, // Store as integer (percent * 100)
|
|
|
|
/// Workflow metrics
|
|
active_workflows: AtomicUsize,
|
|
completed_workflows: AtomicUsize,
|
|
|
|
/// Storage metrics
|
|
storage_operations: AtomicUsize,
|
|
storage_errors: AtomicUsize,
|
|
|
|
/// Custom metrics storage
|
|
custom_metrics: Arc<RwLock<HashMap<String, f64>>>,
|
|
|
|
/// Metrics history for trends
|
|
metrics_history: Arc<RwLock<Vec<MetricsSnapshot>>>,
|
|
|
|
start_time: Instant,
|
|
}
|
|
|
|
impl Default for MetricsCollector {
|
|
fn default() -> Self {
|
|
Self::new()
|
|
}
|
|
}
|
|
|
|
impl MetricsCollector {
|
|
/// Create new metrics collector
|
|
pub fn new() -> Self {
|
|
Self {
|
|
task_counter: AtomicUsize::new(0),
|
|
completed_tasks: AtomicUsize::new(0),
|
|
failed_tasks: AtomicUsize::new(0),
|
|
average_task_duration_ms: AtomicU64::new(0),
|
|
memory_usage_mb: AtomicU64::new(0),
|
|
cpu_usage_percent: AtomicU64::new(0),
|
|
active_workflows: AtomicUsize::new(0),
|
|
completed_workflows: AtomicUsize::new(0),
|
|
storage_operations: AtomicUsize::new(0),
|
|
storage_errors: AtomicUsize::new(0),
|
|
custom_metrics: Arc::new(RwLock::new(HashMap::new())),
|
|
metrics_history: Arc::new(RwLock::new(Vec::new())),
|
|
start_time: Instant::now(),
|
|
}
|
|
}
|
|
|
|
/// Increment task counter
|
|
pub fn increment_task_counter(&self) {
|
|
self.task_counter.fetch_add(1, Ordering::Relaxed);
|
|
}
|
|
|
|
/// Record task completion
|
|
pub fn record_task_completion(&self, duration_ms: u64) {
|
|
self.completed_tasks.fetch_add(1, Ordering::Relaxed);
|
|
|
|
// Update average duration (simple moving average)
|
|
let current_avg = self.average_task_duration_ms.load(Ordering::Relaxed);
|
|
let completed = self.completed_tasks.load(Ordering::Relaxed);
|
|
|
|
if completed > 0 {
|
|
let new_avg = ((current_avg * (completed - 1) as u64) + duration_ms) / completed as u64;
|
|
self.average_task_duration_ms
|
|
.store(new_avg, Ordering::Relaxed);
|
|
}
|
|
}
|
|
|
|
/// Record task failure
|
|
pub fn record_task_failure(&self) {
|
|
self.failed_tasks.fetch_add(1, Ordering::Relaxed);
|
|
}
|
|
|
|
/// Update system metrics
|
|
pub fn update_system_metrics(&self, memory_mb: u64, cpu_percent: f64) {
|
|
self.memory_usage_mb.store(memory_mb, Ordering::Relaxed);
|
|
self.cpu_usage_percent
|
|
.store((cpu_percent * 100.0) as u64, Ordering::Relaxed);
|
|
}
|
|
|
|
/// Update workflow metrics
|
|
pub fn update_workflow_metrics(&self, active: usize, completed: usize) {
|
|
self.active_workflows.store(active, Ordering::Relaxed);
|
|
self.completed_workflows.store(completed, Ordering::Relaxed);
|
|
}
|
|
|
|
/// Record storage operation
|
|
pub fn record_storage_operation(&self, success: bool) {
|
|
self.storage_operations.fetch_add(1, Ordering::Relaxed);
|
|
if !success {
|
|
self.storage_errors.fetch_add(1, Ordering::Relaxed);
|
|
}
|
|
}
|
|
|
|
/// Set custom metric
|
|
pub async fn set_custom_metric(&self, name: String, value: f64) {
|
|
let mut metrics = self.custom_metrics.write().await;
|
|
metrics.insert(name, value);
|
|
}
|
|
|
|
/// Get custom metric
|
|
pub async fn get_custom_metric(&self, name: &str) -> Option<f64> {
|
|
let metrics = self.custom_metrics.read().await;
|
|
metrics.get(name).copied()
|
|
}
|
|
|
|
/// Get all current metrics
|
|
pub async fn get_current_metrics(&self) -> MetricsSnapshot {
|
|
let custom_metrics = {
|
|
let metrics = self.custom_metrics.read().await;
|
|
metrics.clone()
|
|
};
|
|
|
|
MetricsSnapshot {
|
|
timestamp: chrono::Utc::now(),
|
|
total_tasks: self.task_counter.load(Ordering::Relaxed),
|
|
completed_tasks: self.completed_tasks.load(Ordering::Relaxed),
|
|
failed_tasks: self.failed_tasks.load(Ordering::Relaxed),
|
|
average_task_duration_ms: self.average_task_duration_ms.load(Ordering::Relaxed),
|
|
memory_usage_mb: self.memory_usage_mb.load(Ordering::Relaxed),
|
|
cpu_usage_percent: self.cpu_usage_percent.load(Ordering::Relaxed) as f64 / 100.0,
|
|
active_workflows: self.active_workflows.load(Ordering::Relaxed),
|
|
completed_workflows: self.completed_workflows.load(Ordering::Relaxed),
|
|
storage_operations: self.storage_operations.load(Ordering::Relaxed),
|
|
storage_errors: self.storage_errors.load(Ordering::Relaxed),
|
|
uptime_seconds: self.start_time.elapsed().as_secs(),
|
|
custom_metrics,
|
|
}
|
|
}
|
|
|
|
/// Take snapshot of current metrics
|
|
pub async fn take_snapshot(&self) -> Result<()> {
|
|
let snapshot = self.get_current_metrics().await;
|
|
|
|
let mut history = self.metrics_history.write().await;
|
|
history.push(snapshot);
|
|
|
|
// Keep only last 100 snapshots
|
|
let history_len = history.len();
|
|
if history_len > 100 {
|
|
history.drain(0..history_len - 100);
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Get metrics history
|
|
pub async fn get_metrics_history(&self) -> Vec<MetricsSnapshot> {
|
|
let history = self.metrics_history.read().await;
|
|
history.clone()
|
|
}
|
|
|
|
/// Generate Prometheus-compatible metrics string
|
|
pub async fn generate_prometheus_metrics(&self) -> String {
|
|
let metrics = self.get_current_metrics().await;
|
|
let mut output = String::new();
|
|
|
|
// Basic metrics
|
|
output.push_str("# HELP orchestrator_tasks_total Total number of tasks processed\n");
|
|
output.push_str("# TYPE orchestrator_tasks_total counter\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_tasks_total {}\n",
|
|
metrics.total_tasks
|
|
));
|
|
|
|
output.push_str("# HELP orchestrator_tasks_completed Total number of completed tasks\n");
|
|
output.push_str("# TYPE orchestrator_tasks_completed counter\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_tasks_completed {}\n",
|
|
metrics.completed_tasks
|
|
));
|
|
|
|
output.push_str("# HELP orchestrator_tasks_failed Total number of failed tasks\n");
|
|
output.push_str("# TYPE orchestrator_tasks_failed counter\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_tasks_failed {}\n",
|
|
metrics.failed_tasks
|
|
));
|
|
|
|
output.push_str(
|
|
"# HELP orchestrator_task_duration_ms Average task duration in milliseconds\n",
|
|
);
|
|
output.push_str("# TYPE orchestrator_task_duration_ms gauge\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_task_duration_ms {}\n",
|
|
metrics.average_task_duration_ms
|
|
));
|
|
|
|
// System metrics
|
|
output.push_str("# HELP orchestrator_memory_usage_mb Current memory usage in MB\n");
|
|
output.push_str("# TYPE orchestrator_memory_usage_mb gauge\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_memory_usage_mb {}\n",
|
|
metrics.memory_usage_mb
|
|
));
|
|
|
|
output.push_str("# HELP orchestrator_cpu_usage_percent Current CPU usage percentage\n");
|
|
output.push_str("# TYPE orchestrator_cpu_usage_percent gauge\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_cpu_usage_percent {}\n",
|
|
metrics.cpu_usage_percent
|
|
));
|
|
|
|
// Workflow metrics
|
|
output.push_str("# HELP orchestrator_workflows_active Currently active workflows\n");
|
|
output.push_str("# TYPE orchestrator_workflows_active gauge\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_workflows_active {}\n",
|
|
metrics.active_workflows
|
|
));
|
|
|
|
output.push_str("# HELP orchestrator_workflows_completed Total completed workflows\n");
|
|
output.push_str("# TYPE orchestrator_workflows_completed counter\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_workflows_completed {}\n",
|
|
metrics.completed_workflows
|
|
));
|
|
|
|
// Storage metrics
|
|
output.push_str("# HELP orchestrator_storage_operations_total Total storage operations\n");
|
|
output.push_str("# TYPE orchestrator_storage_operations_total counter\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_storage_operations_total {}\n",
|
|
metrics.storage_operations
|
|
));
|
|
|
|
output.push_str("# HELP orchestrator_storage_errors_total Total storage errors\n");
|
|
output.push_str("# TYPE orchestrator_storage_errors_total counter\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_storage_errors_total {}\n",
|
|
metrics.storage_errors
|
|
));
|
|
|
|
// Uptime
|
|
output.push_str("# HELP orchestrator_uptime_seconds System uptime in seconds\n");
|
|
output.push_str("# TYPE orchestrator_uptime_seconds gauge\n");
|
|
output.push_str(&format!(
|
|
"orchestrator_uptime_seconds {}\n",
|
|
metrics.uptime_seconds
|
|
));
|
|
|
|
// Custom metrics
|
|
for (name, value) in &metrics.custom_metrics {
|
|
let metric_name = format!("orchestrator_custom_{}", name.replace("-", "_"));
|
|
output.push_str(&format!("# HELP {} Custom metric: {}\n", metric_name, name));
|
|
output.push_str(&format!("# TYPE {} gauge\n", metric_name));
|
|
output.push_str(&format!("{} {}\n", metric_name, value));
|
|
}
|
|
|
|
output
|
|
}
|
|
}
|
|
|
|
/// Snapshot of metrics at a point in time
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct MetricsSnapshot {
|
|
pub timestamp: chrono::DateTime<chrono::Utc>,
|
|
pub total_tasks: usize,
|
|
pub completed_tasks: usize,
|
|
pub failed_tasks: usize,
|
|
pub average_task_duration_ms: u64,
|
|
pub memory_usage_mb: u64,
|
|
pub cpu_usage_percent: f64,
|
|
pub active_workflows: usize,
|
|
pub completed_workflows: usize,
|
|
pub storage_operations: usize,
|
|
pub storage_errors: usize,
|
|
pub uptime_seconds: u64,
|
|
pub custom_metrics: HashMap<String, f64>,
|
|
}
|
|
|
|
/// System health monitor
|
|
pub struct HealthMonitor {
|
|
storage: Arc<dyn TaskStorage>,
|
|
state_manager: Arc<WorkflowStateManager>,
|
|
health_checks: Arc<RwLock<HashMap<String, Box<dyn HealthCheck + Send + Sync>>>>,
|
|
last_health_check: Arc<Mutex<Instant>>,
|
|
}
|
|
|
|
impl HealthMonitor {
|
|
/// Create new health monitor
|
|
pub fn new(storage: Arc<dyn TaskStorage>, state_manager: Arc<WorkflowStateManager>) -> Self {
|
|
Self {
|
|
storage,
|
|
state_manager,
|
|
health_checks: Arc::new(RwLock::new(HashMap::new())),
|
|
last_health_check: Arc::new(Mutex::new(Instant::now())),
|
|
}
|
|
}
|
|
|
|
/// Register a health check
|
|
pub async fn register_health_check<H>(&self, name: String, health_check: H)
|
|
where
|
|
H: HealthCheck + Send + Sync + 'static,
|
|
{
|
|
let mut checks = self.health_checks.write().await;
|
|
checks.insert(name, Box::new(health_check));
|
|
}
|
|
|
|
/// Run all health checks
|
|
pub async fn run_health_checks(&self) -> HashMap<String, HealthStatus> {
|
|
let mut results = HashMap::new();
|
|
let checks = self.health_checks.read().await;
|
|
|
|
// Default storage health check
|
|
let storage_health = self.check_storage_health().await;
|
|
results.insert("storage".to_string(), storage_health);
|
|
|
|
// Run registered health checks
|
|
for (name, check) in checks.iter() {
|
|
match check.check().await {
|
|
Ok(status) => {
|
|
results.insert(name.clone(), status);
|
|
}
|
|
Err(e) => {
|
|
let error_status = HealthStatus {
|
|
component: name.clone(),
|
|
status: ComponentHealth::Unhealthy,
|
|
last_check: chrono::Utc::now(),
|
|
details: HashMap::new(),
|
|
error: Some(e.to_string()),
|
|
};
|
|
results.insert(name.clone(), error_status);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Update state manager
|
|
for (component, status) in &results {
|
|
self.state_manager
|
|
.update_health_status(
|
|
component,
|
|
status.status.clone(),
|
|
status.details.clone(),
|
|
status.error.clone(),
|
|
)
|
|
.await;
|
|
}
|
|
|
|
// Update last check time
|
|
{
|
|
let mut last_check = self.last_health_check.lock().await;
|
|
*last_check = Instant::now();
|
|
}
|
|
|
|
results
|
|
}
|
|
|
|
/// Check storage backend health
|
|
async fn check_storage_health(&self) -> HealthStatus {
|
|
let mut details = HashMap::new();
|
|
|
|
match self.storage.health_check().await {
|
|
Ok(true) => {
|
|
details.insert("status".to_string(), "operational".to_string());
|
|
|
|
// Check storage statistics
|
|
if let Ok(stats) = self.storage.get_statistics().await {
|
|
details.insert("total_tasks".to_string(), stats.total_tasks.to_string());
|
|
details.insert("pending_tasks".to_string(), stats.pending_tasks.to_string());
|
|
details.insert(
|
|
"storage_size".to_string(),
|
|
stats.total_storage_size.to_string(),
|
|
);
|
|
}
|
|
|
|
HealthStatus {
|
|
component: "storage".to_string(),
|
|
status: ComponentHealth::Healthy,
|
|
last_check: chrono::Utc::now(),
|
|
details,
|
|
error: None,
|
|
}
|
|
}
|
|
Ok(false) => HealthStatus {
|
|
component: "storage".to_string(),
|
|
status: ComponentHealth::Degraded,
|
|
last_check: chrono::Utc::now(),
|
|
details,
|
|
error: Some("Storage health check returned false".to_string()),
|
|
},
|
|
Err(e) => HealthStatus {
|
|
component: "storage".to_string(),
|
|
status: ComponentHealth::Unhealthy,
|
|
last_check: chrono::Utc::now(),
|
|
details,
|
|
error: Some(e.to_string()),
|
|
},
|
|
}
|
|
}
|
|
|
|
/// Get overall system health
|
|
pub async fn get_system_health(&self) -> SystemHealthStatus {
|
|
let health_results = self.run_health_checks().await;
|
|
|
|
let total_components = health_results.len();
|
|
let healthy_components = health_results
|
|
.values()
|
|
.filter(|status| status.status == ComponentHealth::Healthy)
|
|
.count();
|
|
let degraded_components = health_results
|
|
.values()
|
|
.filter(|status| status.status == ComponentHealth::Degraded)
|
|
.count();
|
|
let unhealthy_components = health_results
|
|
.values()
|
|
.filter(|status| status.status == ComponentHealth::Unhealthy)
|
|
.count();
|
|
|
|
let overall_status = if unhealthy_components > 0 {
|
|
ComponentHealth::Unhealthy
|
|
} else if degraded_components > 0 {
|
|
ComponentHealth::Degraded
|
|
} else {
|
|
ComponentHealth::Healthy
|
|
};
|
|
|
|
SystemHealthStatus {
|
|
overall_status,
|
|
total_components,
|
|
healthy_components,
|
|
degraded_components,
|
|
unhealthy_components,
|
|
component_details: health_results,
|
|
last_check: chrono::Utc::now(),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Overall system health status
|
|
#[derive(Debug, Clone, Serialize, Deserialize)]
|
|
pub struct SystemHealthStatus {
|
|
pub overall_status: ComponentHealth,
|
|
pub total_components: usize,
|
|
pub healthy_components: usize,
|
|
pub degraded_components: usize,
|
|
pub unhealthy_components: usize,
|
|
pub component_details: HashMap<String, HealthStatus>,
|
|
pub last_check: chrono::DateTime<chrono::Utc>,
|
|
}
|
|
|
|
/// Health check trait for components
|
|
#[async_trait]
|
|
pub trait HealthCheck {
|
|
async fn check(&self) -> Result<HealthStatus>;
|
|
}
|
|
|
|
/// Main monitoring system that coordinates all monitoring components
|
|
pub struct MonitoringSystem {
|
|
config: MonitoringConfig,
|
|
metrics_collector: Arc<MetricsCollector>,
|
|
health_monitor: Arc<HealthMonitor>,
|
|
state_manager: Arc<WorkflowStateManager>,
|
|
event_broadcaster: broadcast::Sender<MonitoringEvent>,
|
|
storage: Arc<dyn TaskStorage>,
|
|
}
|
|
|
|
impl MonitoringSystem {
|
|
/// Create new monitoring system
|
|
pub fn new(
|
|
config: MonitoringConfig,
|
|
storage: Arc<dyn TaskStorage>,
|
|
state_manager: Arc<WorkflowStateManager>,
|
|
) -> Self {
|
|
let metrics_collector = Arc::new(MetricsCollector::new());
|
|
let health_monitor = Arc::new(HealthMonitor::new(storage.clone(), state_manager.clone()));
|
|
let (event_broadcaster, _) = broadcast::channel(1000);
|
|
|
|
Self {
|
|
config,
|
|
metrics_collector,
|
|
health_monitor,
|
|
state_manager,
|
|
event_broadcaster,
|
|
storage,
|
|
}
|
|
}
|
|
|
|
/// Initialize monitoring system
|
|
pub async fn init(&self) -> Result<()> {
|
|
info!("Initializing monitoring system");
|
|
|
|
// Start background monitoring tasks
|
|
self.start_monitoring_tasks().await?;
|
|
|
|
info!("Monitoring system initialized successfully");
|
|
Ok(())
|
|
}
|
|
|
|
/// Get metrics collector
|
|
pub fn metrics_collector(&self) -> Arc<MetricsCollector> {
|
|
self.metrics_collector.clone()
|
|
}
|
|
|
|
/// Get health monitor
|
|
pub fn health_monitor(&self) -> Arc<HealthMonitor> {
|
|
self.health_monitor.clone()
|
|
}
|
|
|
|
/// Create monitoring routes for web server
|
|
pub fn create_routes<S>(&self) -> Router<S>
|
|
where
|
|
S: Clone + Send + Sync + 'static,
|
|
{
|
|
let mut router = Router::new();
|
|
|
|
if self.config.enable_prometheus {
|
|
let metrics_collector = self.metrics_collector.clone();
|
|
router = router.route(
|
|
&self.config.prometheus_path,
|
|
get({
|
|
let metrics_collector = metrics_collector.clone();
|
|
move || async move { metrics_collector.generate_prometheus_metrics().await }
|
|
}),
|
|
);
|
|
}
|
|
|
|
if self.config.enable_websocket {
|
|
let event_broadcaster = self.event_broadcaster.clone();
|
|
let websocket_handler = |ws: WebSocketUpgrade| async move {
|
|
ws.on_upgrade(move |socket| {
|
|
handle_websocket_connection(socket, event_broadcaster.clone())
|
|
})
|
|
};
|
|
router = router.route(&self.config.websocket_path, get(websocket_handler));
|
|
}
|
|
|
|
router
|
|
}
|
|
|
|
/// Publish monitoring event
|
|
pub async fn publish_event(&self, event: MonitoringEvent) -> Result<()> {
|
|
match self.event_broadcaster.send(event.clone()) {
|
|
Ok(subscriber_count) => {
|
|
debug!(
|
|
"Published monitoring event to {} subscribers",
|
|
subscriber_count
|
|
);
|
|
}
|
|
Err(_) => {
|
|
// No subscribers, which is fine
|
|
}
|
|
}
|
|
|
|
// Also store in storage if it's a task event
|
|
if let MonitoringEventType::TaskStatusChanged = event.event_type {
|
|
if let Ok(task_event) = serde_json::from_value::<TaskEvent>(event.data) {
|
|
let _ = self.storage.publish_event(task_event).await;
|
|
}
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Start background monitoring tasks
|
|
async fn start_monitoring_tasks(&self) -> Result<()> {
|
|
// Metrics collection task
|
|
let metrics_collector = self.metrics_collector.clone();
|
|
let metrics_interval = Duration::from_secs(self.config.metrics_interval_seconds);
|
|
tokio::spawn(async move {
|
|
let mut interval = tokio::time::interval(metrics_interval);
|
|
loop {
|
|
interval.tick().await;
|
|
if let Err(e) = metrics_collector.take_snapshot().await {
|
|
error!("Failed to take metrics snapshot: {}", e);
|
|
}
|
|
}
|
|
});
|
|
|
|
// Health monitoring task
|
|
let health_monitor = self.health_monitor.clone();
|
|
let health_interval = Duration::from_secs(self.config.health_check_interval_seconds);
|
|
let event_broadcaster = self.event_broadcaster.clone();
|
|
tokio::spawn(async move {
|
|
let mut interval = tokio::time::interval(health_interval);
|
|
loop {
|
|
interval.tick().await;
|
|
Self::process_health_checks(&health_monitor, &event_broadcaster).await;
|
|
}
|
|
});
|
|
|
|
Ok(())
|
|
}
|
|
|
|
async fn process_health_checks(
|
|
health_monitor: &Arc<HealthMonitor>,
|
|
event_broadcaster: &tokio::sync::broadcast::Sender<MonitoringEvent>,
|
|
) {
|
|
let health_results = health_monitor.run_health_checks().await;
|
|
|
|
// Broadcast health status changes
|
|
for (component, status) in health_results {
|
|
if status.status == ComponentHealth::Healthy {
|
|
continue;
|
|
}
|
|
let metadata = HashMap::from([("component".to_string(), component)]);
|
|
let event = MonitoringEvent {
|
|
event_type: MonitoringEventType::HealthStatusChanged,
|
|
timestamp: chrono::Utc::now(),
|
|
data: serde_json::to_value(&status).unwrap_or_default(),
|
|
metadata,
|
|
};
|
|
let _ = event_broadcaster.send(event);
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Handle WebSocket connection for real-time monitoring
|
|
async fn handle_websocket_connection(
|
|
mut socket: WebSocket,
|
|
event_broadcaster: broadcast::Sender<MonitoringEvent>,
|
|
) {
|
|
let mut event_receiver = event_broadcaster.subscribe();
|
|
|
|
loop {
|
|
tokio::select! {
|
|
event_result = event_receiver.recv() => {
|
|
match event_result {
|
|
Ok(event) => {
|
|
if let Ok(json) = serde_json::to_string(&event) {
|
|
if socket.send(axum::extract::ws::Message::Text(json.into())).await.is_err() {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
Err(broadcast::error::RecvError::Lagged(_)) => {
|
|
warn!("WebSocket client lagged behind, skipping events");
|
|
continue;
|
|
}
|
|
Err(broadcast::error::RecvError::Closed) => {
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
// Handle incoming WebSocket messages (ping/pong, client requests)
|
|
msg_result = socket.recv() => {
|
|
match msg_result {
|
|
Some(Ok(msg)) => {
|
|
match msg {
|
|
axum::extract::ws::Message::Close(_) => break,
|
|
axum::extract::ws::Message::Pong(_) => {
|
|
// Handle pong response
|
|
}
|
|
_ => {
|
|
// Handle other message types if needed
|
|
}
|
|
}
|
|
}
|
|
Some(Err(_)) | None => break,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
debug!("WebSocket connection closed");
|
|
}
|
|
|
|
/// Example [`HealthCheck`] that compares system resource usage with
/// configured thresholds.
pub struct SystemResourceHealthCheck {
    /// Memory usage, in megabytes, above which the check is no longer healthy.
    memory_threshold_mb: u64,

    /// CPU usage, in percent, above which the check is no longer healthy.
    cpu_threshold_percent: f64,
}
|
|
|
|
impl SystemResourceHealthCheck {
|
|
pub fn new(memory_threshold_mb: u64, cpu_threshold_percent: f64) -> Self {
|
|
Self {
|
|
memory_threshold_mb,
|
|
cpu_threshold_percent,
|
|
}
|
|
}
|
|
|
|
async fn get_system_info(&self) -> Result<(u64, f64)> {
|
|
// In a real implementation, this would collect actual system metrics
|
|
// For now, return mock values
|
|
let memory_mb = 512; // Mock memory usage
|
|
let cpu_percent = 25.0; // Mock CPU usage
|
|
Ok((memory_mb, cpu_percent))
|
|
}
|
|
}
|
|
|
|
#[async_trait]
|
|
impl HealthCheck for SystemResourceHealthCheck {
|
|
async fn check(&self) -> Result<HealthStatus> {
|
|
let (memory_mb, cpu_percent) = self.get_system_info().await?;
|
|
|
|
let mut details = HashMap::new();
|
|
details.insert("memory_usage_mb".to_string(), memory_mb.to_string());
|
|
details.insert("cpu_usage_percent".to_string(), cpu_percent.to_string());
|
|
details.insert(
|
|
"memory_threshold_mb".to_string(),
|
|
self.memory_threshold_mb.to_string(),
|
|
);
|
|
details.insert(
|
|
"cpu_threshold_percent".to_string(),
|
|
self.cpu_threshold_percent.to_string(),
|
|
);
|
|
|
|
let status =
|
|
if memory_mb > self.memory_threshold_mb || cpu_percent > self.cpu_threshold_percent {
|
|
if memory_mb > self.memory_threshold_mb * 2
|
|
|| cpu_percent > self.cpu_threshold_percent * 2.0
|
|
{
|
|
ComponentHealth::Unhealthy
|
|
} else {
|
|
ComponentHealth::Degraded
|
|
}
|
|
} else {
|
|
ComponentHealth::Healthy
|
|
};
|
|
|
|
let error = if status != ComponentHealth::Healthy {
|
|
Some(format!(
|
|
"Resource usage exceeds thresholds - Memory: {}MB (max: {}MB), CPU: {:.1}% (max: \
|
|
{:.1}%)",
|
|
memory_mb, self.memory_threshold_mb, cpu_percent, self.cpu_threshold_percent
|
|
))
|
|
} else {
|
|
None
|
|
};
|
|
|
|
Ok(HealthStatus {
|
|
component: "system_resources".to_string(),
|
|
status,
|
|
last_check: chrono::Utc::now(),
|
|
details,
|
|
error,
|
|
})
|
|
}
|
|
}
|