prvng_platform/crates/orchestrator/src/monitor.rs

//! Monitoring and metrics system for workflow orchestrator
//!
//! This module provides comprehensive monitoring, metrics collection, and
//! health checking functionality with Prometheus-compatible metrics export
//! and real-time updates via WebSocket/SSE.

use std::{
    collections::HashMap,
    sync::{
        atomic::{AtomicU64, AtomicUsize, Ordering},
        Arc,
    },
    time::{Duration, Instant},
};

use anyhow::Result;
use async_trait::async_trait;
use axum::{
    extract::{ws::WebSocket, WebSocketUpgrade},
    routing::get,
    Router,
};
use serde::{Deserialize, Serialize};
use tokio::sync::{broadcast, Mutex, RwLock};
use tracing::{debug, error, info, warn};

use crate::{
    state::{ComponentHealth, HealthStatus, WorkflowStateManager},
    storage::{TaskEvent, TaskStorage},
};

/// Configuration for monitoring system.
///
/// Plain, serde-serializable settings; the `Default` impl supplies the
/// out-of-the-box values.
///
/// NOTE(review): in the code visible in this file only the two interval fields,
/// the Prometheus fields and the WebSocket fields are read.
/// `max_metrics_in_memory`, `enable_sse` and `sse_path` are not consumed here —
/// confirm whether they are used elsewhere.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringConfig {
    /// Metrics collection interval in seconds (period of the snapshot task)
    pub metrics_interval_seconds: u64,
    /// Health check interval in seconds (period of the health-check task)
    pub health_check_interval_seconds: u64,
    /// Maximum number of metrics to retain in memory
    pub max_metrics_in_memory: usize,
    /// Enable Prometheus metrics export (registers a GET route on `prometheus_path`)
    pub enable_prometheus: bool,
    /// Prometheus metrics path
    pub prometheus_path: String,
    /// Enable WebSocket real-time updates (registers a GET route on `websocket_path`)
    pub enable_websocket: bool,
    /// WebSocket endpoint path
    pub websocket_path: String,
    /// Enable Server-Sent Events
    pub enable_sse: bool,
    /// SSE endpoint path
    pub sse_path: String,
}

impl Default for MonitoringConfig {
    fn default() -> Self {
        Self {
            metrics_interval_seconds: 30,
            health_check_interval_seconds: 60,
            max_metrics_in_memory: 1000,
            enable_prometheus: true,
            prometheus_path: "/metrics".to_string(),
            enable_websocket: true,
            websocket_path: "/ws".to_string(),
            enable_sse: true,
            sse_path: "/events".to_string(),
        }
    }
}

/// Real-time monitoring event for WebSocket/SSE.
///
/// Instances are sent over the monitoring broadcast channel and serialized to
/// JSON before being written to a WebSocket client.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MonitoringEvent {
    /// Category of the event.
    pub event_type: MonitoringEventType,
    /// UTC time attached by whoever built the event.
    pub timestamp: chrono::DateTime<chrono::Utc>,
    /// Free-form JSON payload. For `TaskStatusChanged` events the publisher tries
    /// to deserialize this value as a `TaskEvent`.
    pub data: serde_json::Value,
    /// String key/value annotations (the health task sets a `component` key).
    pub metadata: HashMap<String, String>,
}

/// Category tag carried by every `MonitoringEvent`.
///
/// NOTE(review): only `TaskStatusChanged` and `HealthStatusChanged` are
/// referenced by the code visible in this file; the remaining variants are
/// presumably produced by other modules — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum MonitoringEventType {
    TaskStatusChanged,
    WorkflowCompleted,
    HealthStatusChanged,
    MetricsUpdated,
    SystemAlert,
    PerformanceWarning,
}

/// Performance metrics collector.
///
/// Counters and gauges are atomics so they can be updated through a shared
/// reference; the custom-metric map and the snapshot history sit behind async
/// `RwLock`s.
#[derive(Debug)]
pub struct MetricsCollector {
    /// Task execution metrics: number of tasks counted so far
    task_counter: AtomicUsize,
    // Number of recorded task completions
    completed_tasks: AtomicUsize,
    // Number of recorded task failures
    failed_tasks: AtomicUsize,
    // Running average of completed-task durations, in milliseconds
    average_task_duration_ms: AtomicU64,

    /// System performance metrics: last reported memory usage in MB
    memory_usage_mb: AtomicU64,
    cpu_usage_percent: AtomicU64, // Stored as an integer: percent * 100

    /// Workflow metrics: last reported number of active workflows
    active_workflows: AtomicUsize,
    // Last reported number of completed workflows
    completed_workflows: AtomicUsize,

    /// Storage metrics: number of recorded storage operations
    storage_operations: AtomicUsize,
    // Number of recorded storage operations that failed
    storage_errors: AtomicUsize,

    /// Custom metrics storage, keyed by metric name
    custom_metrics: Arc<RwLock<HashMap<String, f64>>>,

    /// Metrics history for trends (appended to by `take_snapshot`)
    metrics_history: Arc<RwLock<Vec<MetricsSnapshot>>>,

    // Creation time of the collector; used to compute uptime
    start_time: Instant,
}

impl Default for MetricsCollector {
    fn default() -> Self {
        Self::new()
    }
}

impl MetricsCollector {
    /// Create new metrics collector
    pub fn new() -> Self {
        Self {
            task_counter: AtomicUsize::new(0),
            completed_tasks: AtomicUsize::new(0),
            failed_tasks: AtomicUsize::new(0),
            average_task_duration_ms: AtomicU64::new(0),
            memory_usage_mb: AtomicU64::new(0),
            cpu_usage_percent: AtomicU64::new(0),
            active_workflows: AtomicUsize::new(0),
            completed_workflows: AtomicUsize::new(0),
            storage_operations: AtomicUsize::new(0),
            storage_errors: AtomicUsize::new(0),
            custom_metrics: Arc::new(RwLock::new(HashMap::new())),
            metrics_history: Arc::new(RwLock::new(Vec::new())),
            start_time: Instant::now(),
        }
    }

    /// Adds one to the total-task counter.
    pub fn increment_task_counter(&self) {
        let _previous = self.task_counter.fetch_add(1, Ordering::Relaxed);
    }

    /// Record task completion.
    ///
    /// Increments the completed-task counter and folds `duration_ms` into the
    /// cumulative (running) mean of task durations. The mean is stored as whole
    /// milliseconds, so each update truncates any fractional part.
    ///
    /// The mean is updated with a compare-and-swap loop, so concurrent callers
    /// never overwrite each other's sample. Samples may still be folded in a
    /// different order than their counter increments, which makes the stored
    /// mean an approximation under contention.
    pub fn record_task_completion(&self, duration_ms: u64) {
        // `fetch_add` returns the value before the increment, so this call knows
        // how many samples preceded it without a second, racy load.
        let previous_count = self.completed_tasks.fetch_add(1, Ordering::Relaxed) as u128;
        let count = previous_count + 1;
        let sample = u128::from(duration_ms);

        // The closure always returns `Some`, so `fetch_update` cannot fail.
        let _ = self.average_task_duration_ms.fetch_update(
            Ordering::Relaxed,
            Ordering::Relaxed,
            |current_avg| {
                // 128-bit intermediate: avg * count cannot overflow here.
                let total = u128::from(current_avg) * previous_count + sample;
                // The mean never exceeds max(avg, sample), so it fits in u64.
                Some((total / count) as u64)
            },
        );
    }

    /// Adds one to the failed-task counter.
    pub fn record_task_failure(&self) {
        let _previous = self.failed_tasks.fetch_add(1, Ordering::Relaxed);
    }

    /// Update system metrics.
    ///
    /// Stores the memory usage as given and the CPU usage as hundredths of a
    /// percent (`cpu_percent * 100`, rounded to the nearest integer).
    ///
    /// Rounding matters because many two-decimal values are not exactly
    /// representable: `0.29 * 100.0` is `28.999…`, which truncation would store
    /// as 28. NaN and negative inputs are stored as 0 and overly large inputs
    /// saturate at `u64::MAX` (float-to-int `as` casts saturate).
    pub fn update_system_metrics(&self, memory_mb: u64, cpu_percent: f64) {
        self.memory_usage_mb.store(memory_mb, Ordering::Relaxed);

        let hundredths = (cpu_percent * 100.0).round() as u64;
        self.cpu_usage_percent.store(hundredths, Ordering::Relaxed);
    }

    /// Overwrites the active and completed workflow gauges with the given values.
    pub fn update_workflow_metrics(&self, active: usize, completed: usize) {
        let updates = [
            (&self.active_workflows, active),
            (&self.completed_workflows, completed),
        ];
        for (gauge, value) in updates {
            gauge.store(value, Ordering::Relaxed);
        }
    }

    /// Counts one storage operation; a failed one (`success == false`) is also
    /// added to the storage-error counter.
    pub fn record_storage_operation(&self, success: bool) {
        self.storage_operations.fetch_add(1, Ordering::Relaxed);
        if success {
            return;
        }
        self.storage_errors.fetch_add(1, Ordering::Relaxed);
    }

    /// Stores `value` under `name`, replacing any value previously stored there.
    pub async fn set_custom_metric(&self, name: String, value: f64) {
        self.custom_metrics.write().await.insert(name, value);
    }

    /// Returns the value stored under `name`, or `None` when no such metric exists.
    pub async fn get_custom_metric(&self, name: &str) -> Option<f64> {
        let value = self.custom_metrics.read().await.get(name).copied();
        value
    }

    /// Builds a `MetricsSnapshot` from the current counter and gauge values.
    ///
    /// The custom-metric map is cloned under its read lock first. CPU usage is
    /// converted back from stored hundredths to a percentage, and uptime is the
    /// whole number of seconds since the collector was created.
    pub async fn get_current_metrics(&self) -> MetricsSnapshot {
        let relaxed = Ordering::Relaxed;
        let custom_metrics = self.custom_metrics.read().await.clone();

        MetricsSnapshot {
            timestamp: chrono::Utc::now(),
            total_tasks: self.task_counter.load(relaxed),
            completed_tasks: self.completed_tasks.load(relaxed),
            failed_tasks: self.failed_tasks.load(relaxed),
            average_task_duration_ms: self.average_task_duration_ms.load(relaxed),
            memory_usage_mb: self.memory_usage_mb.load(relaxed),
            cpu_usage_percent: self.cpu_usage_percent.load(relaxed) as f64 / 100.0,
            active_workflows: self.active_workflows.load(relaxed),
            completed_workflows: self.completed_workflows.load(relaxed),
            storage_operations: self.storage_operations.load(relaxed),
            storage_errors: self.storage_errors.load(relaxed),
            uptime_seconds: self.start_time.elapsed().as_secs(),
            custom_metrics,
        }
    }

    /// Appends the current metrics to the history and trims the history to its
    /// 100 most recent snapshots. Always returns `Ok(())`.
    pub async fn take_snapshot(&self) -> Result<()> {
        const MAX_SNAPSHOTS: usize = 100;

        let latest = self.get_current_metrics().await;

        let mut history = self.metrics_history.write().await;
        history.push(latest);

        // Drop the oldest entries once the cap is exceeded.
        let excess = history.len().saturating_sub(MAX_SNAPSHOTS);
        if excess > 0 {
            history.drain(..excess);
        }

        Ok(())
    }

    /// Returns a copy of the stored snapshot history, oldest entry first.
    pub async fn get_metrics_history(&self) -> Vec<MetricsSnapshot> {
        let guard = self.metrics_history.read().await;
        guard.iter().cloned().collect()
    }

    /// Generate Prometheus-compatible metrics string
    pub async fn generate_prometheus_metrics(&self) -> String {
        let metrics = self.get_current_metrics().await;
        let mut output = String::new();

        // Basic metrics
        output.push_str("# HELP orchestrator_tasks_total Total number of tasks processed\n");
        output.push_str("# TYPE orchestrator_tasks_total counter\n");
        output.push_str(&format!(
            "orchestrator_tasks_total {}\n",
            metrics.total_tasks
        ));

        output.push_str("# HELP orchestrator_tasks_completed Total number of completed tasks\n");
        output.push_str("# TYPE orchestrator_tasks_completed counter\n");
        output.push_str(&format!(
            "orchestrator_tasks_completed {}\n",
            metrics.completed_tasks
        ));

        output.push_str("# HELP orchestrator_tasks_failed Total number of failed tasks\n");
        output.push_str("# TYPE orchestrator_tasks_failed counter\n");
        output.push_str(&format!(
            "orchestrator_tasks_failed {}\n",
            metrics.failed_tasks
        ));

        output.push_str(
            "# HELP orchestrator_task_duration_ms Average task duration in milliseconds\n",
        );
        output.push_str("# TYPE orchestrator_task_duration_ms gauge\n");
        output.push_str(&format!(
            "orchestrator_task_duration_ms {}\n",
            metrics.average_task_duration_ms
        ));

        // System metrics
        output.push_str("# HELP orchestrator_memory_usage_mb Current memory usage in MB\n");
        output.push_str("# TYPE orchestrator_memory_usage_mb gauge\n");
        output.push_str(&format!(
            "orchestrator_memory_usage_mb {}\n",
            metrics.memory_usage_mb
        ));

        output.push_str("# HELP orchestrator_cpu_usage_percent Current CPU usage percentage\n");
        output.push_str("# TYPE orchestrator_cpu_usage_percent gauge\n");
        output.push_str(&format!(
            "orchestrator_cpu_usage_percent {}\n",
            metrics.cpu_usage_percent
        ));

        // Workflow metrics
        output.push_str("# HELP orchestrator_workflows_active Currently active workflows\n");
        output.push_str("# TYPE orchestrator_workflows_active gauge\n");
        output.push_str(&format!(
            "orchestrator_workflows_active {}\n",
            metrics.active_workflows
        ));

        output.push_str("# HELP orchestrator_workflows_completed Total completed workflows\n");
        output.push_str("# TYPE orchestrator_workflows_completed counter\n");
        output.push_str(&format!(
            "orchestrator_workflows_completed {}\n",
            metrics.completed_workflows
        ));

        // Storage metrics
        output.push_str("# HELP orchestrator_storage_operations_total Total storage operations\n");
        output.push_str("# TYPE orchestrator_storage_operations_total counter\n");
        output.push_str(&format!(
            "orchestrator_storage_operations_total {}\n",
            metrics.storage_operations
        ));

        output.push_str("# HELP orchestrator_storage_errors_total Total storage errors\n");
        output.push_str("# TYPE orchestrator_storage_errors_total counter\n");
        output.push_str(&format!(
            "orchestrator_storage_errors_total {}\n",
            metrics.storage_errors
        ));

        // Uptime
        output.push_str("# HELP orchestrator_uptime_seconds System uptime in seconds\n");
        output.push_str("# TYPE orchestrator_uptime_seconds gauge\n");
        output.push_str(&format!(
            "orchestrator_uptime_seconds {}\n",
            metrics.uptime_seconds
        ));

        // Custom metrics
        for (name, value) in &metrics.custom_metrics {
            let metric_name = format!("orchestrator_custom_{}", name.replace("-", "_"));
            output.push_str(&format!("# HELP {} Custom metric: {}\n", metric_name, name));
            output.push_str(&format!("# TYPE {} gauge\n", metric_name));
            output.push_str(&format!("{} {}\n", metric_name, value));
        }

        output
    }
}

/// Snapshot of metrics at a point in time.
///
/// Produced by `MetricsCollector::get_current_metrics`; every field is a copy
/// of the corresponding collector value at the moment the snapshot was built.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MetricsSnapshot {
    /// UTC time at which the snapshot was built.
    pub timestamp: chrono::DateTime<chrono::Utc>,
    pub total_tasks: usize,
    pub completed_tasks: usize,
    pub failed_tasks: usize,
    /// Running average of completed-task durations, in whole milliseconds.
    pub average_task_duration_ms: u64,
    pub memory_usage_mb: u64,
    /// CPU usage as a percentage (the collector's stored hundredths divided by 100).
    pub cpu_usage_percent: f64,
    pub active_workflows: usize,
    pub completed_workflows: usize,
    pub storage_operations: usize,
    pub storage_errors: usize,
    /// Whole seconds elapsed since the collector was created.
    pub uptime_seconds: u64,
    /// Copy of the custom-metric map, keyed by metric name.
    pub custom_metrics: HashMap<String, f64>,
}

/// System health monitor.
///
/// Runs a built-in storage check plus any registered `HealthCheck`s and
/// forwards the results to the workflow state manager.
pub struct HealthMonitor {
    // Storage backend probed by the built-in "storage" check
    storage: Arc<dyn TaskStorage>,
    // Receives every component's status after each run
    state_manager: Arc<WorkflowStateManager>,
    // Registered checks, keyed by component name
    health_checks: Arc<RwLock<HashMap<String, Box<dyn HealthCheck + Send + Sync>>>>,
    // Time the last run finished; written by `run_health_checks`.
    // NOTE(review): never read in the code visible in this file.
    last_health_check: Arc<Mutex<Instant>>,
}

impl HealthMonitor {
    /// Create new health monitor
    pub fn new(storage: Arc<dyn TaskStorage>, state_manager: Arc<WorkflowStateManager>) -> Self {
        Self {
            storage,
            state_manager,
            health_checks: Arc::new(RwLock::new(HashMap::new())),
            last_health_check: Arc::new(Mutex::new(Instant::now())),
        }
    }

    /// Registers `health_check` under `name`, replacing any check already
    /// registered with that name.
    pub async fn register_health_check<H>(&self, name: String, health_check: H)
    where
        H: HealthCheck + Send + Sync + 'static,
    {
        let boxed: Box<dyn HealthCheck + Send + Sync> = Box::new(health_check);
        self.health_checks.write().await.insert(name, boxed);
    }

    /// Run all health checks.
    ///
    /// Runs the built-in storage check followed by every registered check, one
    /// after another. A check that returns `Err` is reported as `Unhealthy`
    /// with the error text attached. Each result is then pushed to the state
    /// manager and the last-check time is refreshed.
    ///
    /// Returns the per-component results keyed by component name. A registered
    /// check named `"storage"` replaces the built-in storage result.
    pub async fn run_health_checks(&self) -> HashMap<String, HealthStatus> {
        let mut results = HashMap::new();

        // Default storage health check; it does not need the registry lock.
        let storage_health = self.check_storage_health().await;
        results.insert("storage".to_string(), storage_health);

        // Run registered health checks. The read lock is held only for this
        // loop so registrations are not blocked by the storage probe or the
        // state-manager updates below.
        {
            let checks = self.health_checks.read().await;
            for (name, check) in checks.iter() {
                let status = match check.check().await {
                    Ok(status) => status,
                    Err(e) => HealthStatus {
                        component: name.clone(),
                        status: ComponentHealth::Unhealthy,
                        last_check: chrono::Utc::now(),
                        details: HashMap::new(),
                        error: Some(e.to_string()),
                    },
                };
                results.insert(name.clone(), status);
            }
        }

        // Update state manager
        for (component, status) in &results {
            self.state_manager
                .update_health_status(
                    component,
                    status.status.clone(),
                    status.details.clone(),
                    status.error.clone(),
                )
                .await;
        }

        // Update last check time
        *self.last_health_check.lock().await = Instant::now();

        results
    }

    /// Probes the storage backend and maps the outcome to a `HealthStatus`.
    ///
    /// - `Ok(true)`  → `Healthy`; details carry `status = operational` plus task
    ///   and size statistics when those can be fetched.
    /// - `Ok(false)` → `Degraded` with a fixed error message.
    /// - `Err(e)`    → `Unhealthy` with the error text.
    async fn check_storage_health(&self) -> HealthStatus {
        let mut details = HashMap::new();

        let (status, error) = match self.storage.health_check().await {
            Ok(true) => {
                details.insert(String::from("status"), String::from("operational"));

                // Statistics are best-effort: a failure here leaves them out.
                if let Ok(stats) = self.storage.get_statistics().await {
                    details.insert(String::from("total_tasks"), stats.total_tasks.to_string());
                    details.insert(
                        String::from("pending_tasks"),
                        stats.pending_tasks.to_string(),
                    );
                    details.insert(
                        String::from("storage_size"),
                        stats.total_storage_size.to_string(),
                    );
                }

                (ComponentHealth::Healthy, None)
            }
            Ok(false) => (
                ComponentHealth::Degraded,
                Some(String::from("Storage health check returned false")),
            ),
            Err(e) => (ComponentHealth::Unhealthy, Some(e.to_string())),
        };

        HealthStatus {
            component: String::from("storage"),
            status,
            last_check: chrono::Utc::now(),
            details,
            error,
        }
    }

    /// Runs every health check and summarizes the results.
    ///
    /// The overall status is `Unhealthy` if any component is unhealthy,
    /// otherwise `Degraded` if any component is degraded, otherwise `Healthy`.
    pub async fn get_system_health(&self) -> SystemHealthStatus {
        let component_details = self.run_health_checks().await;

        // Tally all three states in a single pass.
        let (mut healthy, mut degraded, mut unhealthy) = (0usize, 0usize, 0usize);
        for entry in component_details.values() {
            if entry.status == ComponentHealth::Healthy {
                healthy += 1;
            }
            if entry.status == ComponentHealth::Degraded {
                degraded += 1;
            }
            if entry.status == ComponentHealth::Unhealthy {
                unhealthy += 1;
            }
        }

        let overall_status = match (unhealthy, degraded) {
            (0, 0) => ComponentHealth::Healthy,
            (0, _) => ComponentHealth::Degraded,
            _ => ComponentHealth::Unhealthy,
        };

        SystemHealthStatus {
            overall_status,
            total_components: component_details.len(),
            healthy_components: healthy,
            degraded_components: degraded,
            unhealthy_components: unhealthy,
            component_details,
            last_check: chrono::Utc::now(),
        }
    }
}

/// Overall system health status.
///
/// Aggregate view built by `HealthMonitor::get_system_health` from one run of
/// all health checks.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SystemHealthStatus {
    /// Worst state found: unhealthy beats degraded, degraded beats healthy.
    pub overall_status: ComponentHealth,
    /// Number of components that reported a status.
    pub total_components: usize,
    pub healthy_components: usize,
    pub degraded_components: usize,
    pub unhealthy_components: usize,
    /// Per-component results, keyed by component name.
    pub component_details: HashMap<String, HealthStatus>,
    /// UTC time at which this summary was assembled.
    pub last_check: chrono::DateTime<chrono::Utc>,
}

/// Health check trait for components.
///
/// Implementations can be registered on a `HealthMonitor`, which calls `check`
/// on every run.
#[async_trait]
pub trait HealthCheck {
    /// Probes the component and reports its current status.
    ///
    /// Returning `Err` is allowed: `HealthMonitor::run_health_checks` records
    /// the component as `Unhealthy` with the error text.
    async fn check(&self) -> Result<HealthStatus>;
}

/// Main monitoring system that coordinates all monitoring components.
pub struct MonitoringSystem {
    // Settings controlling intervals and which HTTP routes are exposed
    config: MonitoringConfig,
    // Shared metrics collector, handed out via `metrics_collector()`
    metrics_collector: Arc<MetricsCollector>,
    // Shared health monitor, handed out via `health_monitor()`
    health_monitor: Arc<HealthMonitor>,
    // NOTE(review): stored here but not read by any method visible in this file
    state_manager: Arc<WorkflowStateManager>,
    // Sending half of the broadcast channel that fans events out to subscribers
    event_broadcaster: broadcast::Sender<MonitoringEvent>,
    // Storage backend that receives task events from `publish_event`
    storage: Arc<dyn TaskStorage>,
}

impl MonitoringSystem {
    /// Create new monitoring system
    pub fn new(
        config: MonitoringConfig,
        storage: Arc<dyn TaskStorage>,
        state_manager: Arc<WorkflowStateManager>,
    ) -> Self {
        let metrics_collector = Arc::new(MetricsCollector::new());
        let health_monitor = Arc::new(HealthMonitor::new(storage.clone(), state_manager.clone()));
        let (event_broadcaster, _) = broadcast::channel(1000);

        Self {
            config,
            metrics_collector,
            health_monitor,
            state_manager,
            event_broadcaster,
            storage,
        }
    }

    /// Starts the background monitoring tasks, logging before and after.
    ///
    /// Returns the error from task start-up unchanged if it fails.
    pub async fn init(&self) -> Result<()> {
        info!("Initializing monitoring system");

        // Start background monitoring tasks
        match self.start_monitoring_tasks().await {
            Ok(()) => {
                info!("Monitoring system initialized successfully");
                Ok(())
            }
            Err(err) => Err(err),
        }
    }

    /// Returns a new shared handle to the metrics collector.
    pub fn metrics_collector(&self) -> Arc<MetricsCollector> {
        Arc::clone(&self.metrics_collector)
    }

    /// Returns a new shared handle to the health monitor.
    pub fn health_monitor(&self) -> Arc<HealthMonitor> {
        Arc::clone(&self.health_monitor)
    }

    /// Create monitoring routes for web server
    pub fn create_routes<S>(&self) -> Router<S>
    where
        S: Clone + Send + Sync + 'static,
    {
        let mut router = Router::new();

        if self.config.enable_prometheus {
            let metrics_collector = self.metrics_collector.clone();
            router = router.route(
                &self.config.prometheus_path,
                get({
                    let metrics_collector = metrics_collector.clone();
                    move || async move { metrics_collector.generate_prometheus_metrics().await }
                }),
            );
        }

        if self.config.enable_websocket {
            let event_broadcaster = self.event_broadcaster.clone();
            let websocket_handler = |ws: WebSocketUpgrade| async move {
                ws.on_upgrade(move |socket| {
                    handle_websocket_connection(socket, event_broadcaster.clone())
                })
            };
            router = router.route(&self.config.websocket_path, get(websocket_handler));
        }

        router
    }

    /// Publish monitoring event
    pub async fn publish_event(&self, event: MonitoringEvent) -> Result<()> {
        match self.event_broadcaster.send(event.clone()) {
            Ok(subscriber_count) => {
                debug!(
                    "Published monitoring event to {} subscribers",
                    subscriber_count
                );
            }
            Err(_) => {
                // No subscribers, which is fine
            }
        }

        // Also store in storage if it's a task event
        if let MonitoringEventType::TaskStatusChanged = event.event_type {
            if let Ok(task_event) = serde_json::from_value::<TaskEvent>(event.data) {
                let _ = self.storage.publish_event(task_event).await;
            }
        }

        Ok(())
    }

    /// Start background monitoring tasks
    async fn start_monitoring_tasks(&self) -> Result<()> {
        // Metrics collection task
        let metrics_collector = self.metrics_collector.clone();
        let metrics_interval = Duration::from_secs(self.config.metrics_interval_seconds);
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(metrics_interval);
            loop {
                interval.tick().await;
                if let Err(e) = metrics_collector.take_snapshot().await {
                    error!("Failed to take metrics snapshot: {}", e);
                }
            }
        });

        // Health monitoring task
        let health_monitor = self.health_monitor.clone();
        let health_interval = Duration::from_secs(self.config.health_check_interval_seconds);
        let event_broadcaster = self.event_broadcaster.clone();
        tokio::spawn(async move {
            let mut interval = tokio::time::interval(health_interval);
            loop {
                interval.tick().await;
                Self::process_health_checks(&health_monitor, &event_broadcaster).await;
            }
        });

        Ok(())
    }

    /// Runs all health checks once and broadcasts a `HealthStatusChanged` event
    /// for every component whose status is not `Healthy`.
    ///
    /// The event is sent on every run in which the component is not healthy,
    /// not only when its status changes. Send failures are ignored.
    async fn process_health_checks(
        health_monitor: &Arc<HealthMonitor>,
        event_broadcaster: &tokio::sync::broadcast::Sender<MonitoringEvent>,
    ) {
        let results = health_monitor.run_health_checks().await;

        let not_healthy = results
            .into_iter()
            .filter(|(_, status)| status.status != ComponentHealth::Healthy);

        for (component, status) in not_healthy {
            let event = MonitoringEvent {
                metadata: HashMap::from([("component".to_string(), component)]),
                event_type: MonitoringEventType::HealthStatusChanged,
                timestamp: chrono::Utc::now(),
                // Falls back to the default JSON value if serialization fails.
                data: serde_json::to_value(&status).unwrap_or_default(),
            };
            let _ = event_broadcaster.send(event);
        }
    }
}

/// Handle WebSocket connection for real-time monitoring
async fn handle_websocket_connection(
    mut socket: WebSocket,
    event_broadcaster: broadcast::Sender<MonitoringEvent>,
) {
    let mut event_receiver = event_broadcaster.subscribe();

    loop {
        tokio::select! {
            event_result = event_receiver.recv() => {
                match event_result {
                    Ok(event) => {
                        if let Ok(json) = serde_json::to_string(&event) {
                            if socket.send(axum::extract::ws::Message::Text(json.into())).await.is_err() {
                                break;
                            }
                        }
                    }
                    Err(broadcast::error::RecvError::Lagged(_)) => {
                        warn!("WebSocket client lagged behind, skipping events");
                        continue;
                    }
                    Err(broadcast::error::RecvError::Closed) => {
                        break;
                    }
                }
            }

            // Handle incoming WebSocket messages (ping/pong, client requests)
            msg_result = socket.recv() => {
                match msg_result {
                    Some(Ok(msg)) => {
                        match msg {
                            axum::extract::ws::Message::Close(_) => break,
                            axum::extract::ws::Message::Pong(_) => {
                                // Handle pong response
                            }
                            _ => {
                                // Handle other message types if needed
                            }
                        }
                    }
                    Some(Err(_)) | None => break,
                }
            }
        }
    }

    debug!("WebSocket connection closed");
}

/// Example health check implementation for system resources.
///
/// Compares memory and CPU readings against the configured thresholds. The
/// readings currently come from fixed mock values, not from the host.
pub struct SystemResourceHealthCheck {
    // Memory usage (MB) above which the component is no longer healthy
    memory_threshold_mb: u64,
    // CPU usage (percent) above which the component is no longer healthy
    cpu_threshold_percent: f64,
}

impl SystemResourceHealthCheck {
    pub fn new(memory_threshold_mb: u64, cpu_threshold_percent: f64) -> Self {
        Self {
            memory_threshold_mb,
            cpu_threshold_percent,
        }
    }

    async fn get_system_info(&self) -> Result<(u64, f64)> {
        // In a real implementation, this would collect actual system metrics
        // For now, return mock values
        let memory_mb = 512; // Mock memory usage
        let cpu_percent = 25.0; // Mock CPU usage
        Ok((memory_mb, cpu_percent))
    }
}

#[async_trait]
impl HealthCheck for SystemResourceHealthCheck {
    /// Classifies current resource usage against the configured thresholds.
    ///
    /// - `Healthy`: memory and CPU are both at or below their thresholds.
    /// - `Degraded`: at least one threshold is exceeded.
    /// - `Unhealthy`: a threshold is exceeded and at least one reading is above
    ///   twice its threshold.
    ///
    /// The readings and thresholds are always included in `details`; an error
    /// message is attached whenever the status is not `Healthy`. Fails only if
    /// the system readings cannot be obtained.
    async fn check(&self) -> Result<HealthStatus> {
        let (memory_mb, cpu_percent) = self.get_system_info().await?;

        let mut details = HashMap::new();
        details.insert("memory_usage_mb".to_string(), memory_mb.to_string());
        details.insert("cpu_usage_percent".to_string(), cpu_percent.to_string());
        details.insert(
            "memory_threshold_mb".to_string(),
            self.memory_threshold_mb.to_string(),
        );
        details.insert(
            "cpu_threshold_percent".to_string(),
            self.cpu_threshold_percent.to_string(),
        );

        let over_threshold =
            memory_mb > self.memory_threshold_mb || cpu_percent > self.cpu_threshold_percent;

        // Saturating: doubling a very large threshold (e.g. `u64::MAX` used to
        // disable the memory check) must not overflow.
        let critical_memory_mb = self.memory_threshold_mb.saturating_mul(2);
        let over_critical =
            memory_mb > critical_memory_mb || cpu_percent > self.cpu_threshold_percent * 2.0;

        let status = if over_threshold {
            if over_critical {
                ComponentHealth::Unhealthy
            } else {
                ComponentHealth::Degraded
            }
        } else {
            ComponentHealth::Healthy
        };

        let error = if status != ComponentHealth::Healthy {
            Some(format!(
                "Resource usage exceeds thresholds - Memory: {}MB (max: {}MB), CPU: {:.1}% (max: \
                 {:.1}%)",
                memory_mb, self.memory_threshold_mb, cpu_percent, self.cpu_threshold_percent
            ))
        } else {
            None
        };

        Ok(HealthStatus {
            component: "system_resources".to_string(),
            status,
            last_check: chrono::Utc::now(),
            details,
            error,
        })
    }
}