#!/usr/bin/env nu

# Service Health Check System
#
# Performs health checks on services using one of several methods
# (HTTP request, TCP connect, shell command, file presence) and offers
# retry, wait-until-healthy and continuous-monitoring helpers on top.

# Perform a single health check for a service.
#
# Dispatches on `health_config.type` ("http", "tcp", "command", "file" or
# "none") to the matching checker and passes it the sub-record of the same
# name. An unrecognized type yields an unhealthy result instead of an error.
#
# Returns a record with the fields: service, healthy, message, check_type,
# duration_ms and checked_at (formatted as "%Y-%m-%dT%H:%M:%S%z").
export def perform-health-check [
    service_name: string   # Name copied into the `service` field of the result
    health_config: record  # Needs `type` plus the sub-record named after that type
]: nothing -> record {
    let start_time = (date now)

    let result = match $health_config.type {
        "http" => { http-health-check $health_config.http }
        "tcp" => { tcp-health-check $health_config.tcp }
        "command" => { command-health-check $health_config.command }
        "file" => { file-health-check $health_config.file }
        "none" => { { healthy: true, message: "No health check configured" } }
        _ => { { healthy: false, message: $"Unknown health check type: ($health_config.type)" } }
    }

    let end_time = (date now)
    # A duration converts to an integer number of nanoseconds; scale to milliseconds.
    let duration = (($end_time - $start_time) | into int) / 1_000_000

    {
        service: $service_name
        healthy: $result.healthy
        message: $result.message
        check_type: $health_config.type
        duration_ms: $duration
        checked_at: ($end_time | format date "%Y-%m-%dT%H:%M:%S%z")
    }
}

# HTTP health check.
#
# First tries the built-in `http get`; if that raises, falls back to curl and
# compares the returned status code with `config.expected_status`.
#
# Returns a record `{ healthy: bool, message: string }`.
def http-health-check [
    config: record  # Reads `endpoint`, `expected_status` and optional `timeout` (default 5)
]: nothing -> record {
    # The timeout is treated as a number of seconds (it is also handed to curl -m).
    let timeout = ($config.timeout? | default 5)

    try {
        # Only success or failure matters here, so the response body is discarded.
        http get --max-time ($timeout * 1sec) $config.endpoint | ignore
        { healthy: true, message: "HTTP health check passed" }
    } catch {
        try {
            # Fall back to curl so the raw status code can be compared with the
            # expected one (which may be a non-success status).
            let status_code = (
                ^curl -s -o /dev/null -w "%{http_code}" -m $timeout $config.endpoint
                | str trim
            )
            let expected = ($config.expected_status | into string)

            if $status_code == $expected {
                { healthy: true, message: $"HTTP status ($status_code) matches expected" }
            } else {
                { healthy: false, message: $"HTTP status ($status_code) != expected ($expected)" }
            }
        } catch {
            { healthy: false, message: "HTTP health check failed - endpoint unreachable" }
        }
    }
}

# TCP health check.
#
# Tries to open a connection to `config.host`:`config.port` through bash's
# /dev/tcp pseudo-device, bounded by the coreutils `timeout` command.
#
# Returns a record `{ healthy: bool, message: string }`.
def tcp-health-check [
    config: record  # Reads `host`, `port` and optional `timeout` (default 5)
]: nothing -> record {
    let timeout = ($config.timeout? | default 5)

    try {
        # `complete` captures the exit code so success is decided explicitly
        # rather than depending on whether a failing external raises an error.
        let probe = (
            ^bash -c $"timeout ($timeout) bash -c 'cat < /dev/null > /dev/tcp/($config.host)/($config.port)' 2>&1"
            | complete
        )

        if $probe.exit_code == 0 {
            { healthy: true, message: $"TCP port ($config.port) is open" }
        } else {
            { healthy: false, message: $"TCP port ($config.port) is not accessible" }
        }
    } catch {
        { healthy: false, message: $"TCP port ($config.port) is not accessible" }
    }
}

# Command health check.
#
# Runs `config.command` through `bash -c` and compares its exit code with
# `config.expected_exit_code`. Any error raised while doing so is reported
# as an unhealthy result.
#
# Returns a record `{ healthy: bool, message: string }`.
def command-health-check [
    config: record  # Reads `command` and `expected_exit_code`
]: nothing -> record {
    try {
        let result = (^bash -c $config.command | complete)

        if $result.exit_code == $config.expected_exit_code {
            { healthy: true, message: "Command health check passed" }
        } else {
            {
                healthy: false
                message: $"Command exit code ($result.exit_code) != expected ($config.expected_exit_code)"
            }
        }
    } catch {
        { healthy: false, message: "Command health check failed" }
    }
}

# File health check.
#
# With `config.must_exist` true the check passes when `config.path` exists;
# with it false the check passes when the path is absent.
#
# Returns a record `{ healthy: bool, message: string }`.
def file-health-check [
    config: record  # Reads `path` and `must_exist`
]: nothing -> record {
    let path_exists = ($config.path | path exists)

    if $config.must_exist {
        if $path_exists {
            { healthy: true, message: $"File exists: ($config.path)" }
        } else {
            { healthy: false, message: $"File not found: ($config.path)" }
        }
    } else {
        if not $path_exists {
            { healthy: true, message: $"File does not exist: ($config.path)" }
        } else {
            { healthy: false, message: $"File exists but should not: ($config.path)" }
        }
    }
}

# Run a health check, retrying on failure.
#
# Makes one initial attempt plus up to `health_config.retries` (default 3)
# retries, sleeping `health_config.interval` (default 10) seconds between
# attempts.
#
# Returns true as soon as one attempt is healthy, false when all attempts fail.
export def retry-health-check [
    service_name: string   # Passed through to perform-health-check
    health_config: record  # Health check definition; optional `retries` and `interval`
]: nothing -> bool {
    let max_retries = ($health_config.retries? | default 3)
    let interval = ($health_config.interval? | default 10)
    let total_attempts = $max_retries + 1

    for attempt in 1..$total_attempts {
        let result = (perform-health-check $service_name $health_config)

        if $result.healthy {
            return true
        }

        # No message and no sleep after the final attempt.
        if $attempt < $total_attempts {
            # Literal parentheses must be escaped inside an interpolated string.
            print $"Health check failed \(attempt ($attempt)/($total_attempts)\), retrying in ($interval)s..."
            sleep ($interval * 1sec)
        }
    }

    false
}

# Wait for a service to become healthy.
#
# Looks up the service definition, then polls its health check every
# `interval` seconds (default 5) until it passes or the elapsed time exceeds
# `timeout`. A health check of type "none" returns true immediately.
#
# Returns true when the service became healthy, false on timeout.
export def wait-for-service [
    service_name: string  # Service to look up and poll
    timeout: int          # Maximum time to wait, in seconds
]: nothing -> bool {
    use manager.nu get-service-definition

    let service_def = (get-service-definition $service_name)
    let health_config = $service_def.health_check

    # If no health check, assume healthy if running
    if $health_config.type == "none" {
        return true
    }

    let interval = ($health_config.interval? | default 5)
    let start_time = (date now)
    let timeout_ns = ($timeout * 1_000_000_000)  # Seconds to nanoseconds

    loop {
        let result = (perform-health-check $service_name $health_config)

        if $result.healthy {
            print $"✅ Service ($service_name) is healthy"
            return true
        }

        # The timeout is only evaluated after a failed check, so at least one
        # check always runs.
        let elapsed = (((date now) - $start_time) | into int)
        if $elapsed > $timeout_ns {
            print $"❌ Timeout waiting for ($service_name) to become healthy"
            return false
        }

        print $"Waiting for ($service_name)... \(($result.message)\)"
        sleep ($interval * 1sec)
    }
}

# Get the current health status of a service.
#
# When the service is not running, returns a record with `healthy: false`
# and `status: "stopped"` without running any check. Otherwise returns the
# perform-health-check result extended with `status` set to "healthy" or
# "unhealthy".
export def get-health-status [
    service_name: string  # Service to inspect
]: nothing -> record {
    use manager.nu [get-service-definition is-service-running]

    if not (is-service-running $service_name) {
        return {
            service: $service_name
            healthy: false
            message: "Service not running"
            status: "stopped"
        }
    }

    let service_def = (get-service-definition $service_name)
    let result = (perform-health-check $service_name $service_def.health_check)

    {
        ...$result
        status: (if $result.healthy { "healthy" } else { "unhealthy" })
    }
}

# Continuously monitor a service's health.
#
# Prints one timestamped status line per iteration and loops until the
# process is interrupted.
export def monitor-service-health [
    service_name: string    # Service to monitor
    --interval: int = 30    # Seconds to sleep between checks
    --alert-on-failure      # Also print an alert line whenever the service is unhealthy
] {
    # Literal parentheses must be escaped inside an interpolated string.
    print $"Starting health monitoring for ($service_name) \(interval: ($interval)s\)"
    print "Press Ctrl+C to stop"

    loop {
        let status = (get-health-status $service_name)
        let timestamp = (date now | format date "%Y-%m-%d %H:%M:%S")
        let health_icon = if $status.healthy { "✅" } else { "❌" }

        print $"($timestamp) ($health_icon) ($service_name): ($status.message)"

        if (not $status.healthy) and $alert_on_failure {
            # Could integrate with alerting system here
            print $"⚠️ ALERT: Service ($service_name) is unhealthy!"
        }

        sleep ($interval * 1sec)
    }
}