242 lines
6.8 KiB
Plaintext
Raw Permalink Normal View History

2025-10-07 10:32:04 +01:00
#!/usr/bin/env nu
# Service Health Check System
# Performs health checks on services using various methods
# Perform health check
#
# Dispatches on `health_config.type` to the matching checker and wraps the
# checker's verdict with the service name, check type, elapsed time and a
# timestamp.
#
# Supported types: "http", "tcp", "command", "file" and "none". Each of the
# first four reads its settings from the sub-record of the same name
# (e.g. `health_config.http`). Any other type is reported as unhealthy.
#
# Returns a record with the fields:
#   service, healthy, message, check_type, duration_ms, checked_at
export def perform-health-check [
    service_name: string   # Name copied into the `service` field of the result
    health_config: record  # Must contain `type` plus the sub-record for that type
]: nothing -> record {
    let started = (date now)
    let kind = $health_config.type

    let outcome = match $kind {
        "http" => {
            http-health-check $health_config.http
        }
        "tcp" => {
            tcp-health-check $health_config.tcp
        }
        "command" => {
            command-health-check $health_config.command
        }
        "file" => {
            file-health-check $health_config.file
        }
        "none" => {
            { healthy: true, message: "No health check configured" }
        }
        _ => {
            { healthy: false, message: $"Unknown health check type: ($kind)" }
        }
    }

    let finished = (date now)
    # `into int` on a duration yields nanoseconds; scale down to milliseconds.
    let elapsed_ms = (($finished - $started) | into int) / 1_000_000

    {
        service: $service_name
        healthy: $outcome.healthy
        message: $outcome.message
        check_type: $kind
        duration_ms: $elapsed_ms
        checked_at: ($finished | format date "%Y-%m-%dT%H:%M:%S%z")
    }
}
# HTTP health check
#
# Probes `config.endpoint` with the built-in `http get` first. If that raises
# for any reason, falls back to `curl` and compares the status code it reports
# with the expected one.
#
# Config fields read:
#   endpoint         - URL to probe (required)
#   timeout?         - request timeout in seconds (default 5)
#   expected_status? - status code accepted by the curl fallback (default 200)
#
# Returns a record { healthy: bool, message: string }.
def http-health-check [
    config: record  # HTTP probe settings (see the field list above)
]: nothing -> record {
    let timeout = $config.timeout? | default 5

    try {
        # For simple health endpoints: only success/failure matters, so the
        # response body is discarded.
        http get --max-time ($timeout * 1sec) $config.endpoint | ignore
        { healthy: true, message: "HTTP health check passed" }
    } catch {
        try {
            # Try with curl for more control over the accepted status code.
            let expected = ($config.expected_status? | default 200 | into string)
            # `complete` captures the exit code instead of relying on whether
            # a failing external command raises an error.
            let probe = (^curl -s -o /dev/null -w "%{http_code}" -m $timeout $config.endpoint | complete)

            if $probe.exit_code != 0 {
                # curl itself failed (connection error, timeout, ...).
                { healthy: false, message: "HTTP health check failed - endpoint unreachable" }
            } else {
                let status_code = ($probe.stdout | str trim)
                if $status_code == $expected {
                    { healthy: true, message: $"HTTP status ($status_code) matches expected" }
                } else {
                    { healthy: false, message: $"HTTP status ($status_code) != expected ($expected)" }
                }
            }
        } catch {
            # Reached when curl cannot be started or the config is unusable.
            { healthy: false, message: "HTTP health check failed - endpoint unreachable" }
        }
    }
}
# TCP health check
#
# Tries to open a TCP connection to `config.host`:`config.port` through bash's
# `/dev/tcp` redirection, bounded by the `timeout` utility.
#
# Config fields read:
#   host     - host name or address to connect to (required)
#   port     - TCP port to connect to (required)
#   timeout? - connection timeout in seconds (default 5)
#
# Returns a record { healthy: bool, message: string }.
#
# NOTE: host and port are interpolated into a shell command line, so the
# config must come from a trusted source.
def tcp-health-check [
    config: record  # TCP probe settings (see the field list above)
]: nothing -> record {
    let timeout = $config.timeout? | default 5

    try {
        # `complete` exposes the exit code explicitly, so the verdict does not
        # depend on whether a failing external command raises an error.
        let probe = (^bash -c $"timeout ($timeout) bash -c 'cat < /dev/null > /dev/tcp/($config.host)/($config.port)' 2>&1" | complete)

        if $probe.exit_code == 0 {
            { healthy: true, message: $"TCP port ($config.port) is open" }
        } else {
            { healthy: false, message: $"TCP port ($config.port) is not accessible" }
        }
    } catch {
        # Reached when bash cannot be started at all.
        { healthy: false, message: $"TCP port ($config.port) is not accessible" }
    }
}
# Command health check
#
# Runs `config.command` through `bash -c` and compares its exit code with the
# expected one.
#
# Config fields read:
#   command             - shell command line to execute (required)
#   expected_exit_code? - exit code that counts as healthy (default 0)
#
# Returns a record { healthy: bool, message: string }.
def command-health-check [
    config: record  # Command probe settings (see the field list above)
]: nothing -> record {
    try {
        let expected = ($config.expected_exit_code? | default 0)
        # `complete` collects stdout, stderr and the exit code into a record.
        let outcome = (^bash -c $config.command | complete)

        if $outcome.exit_code == $expected {
            { healthy: true, message: "Command health check passed" }
        } else {
            {
                healthy: false
                message: $"Command exit code ($outcome.exit_code) != expected ($expected)"
            }
        }
    } catch {
        # Reached when bash cannot be started or `command` is missing.
        { healthy: false, message: "Command health check failed" }
    }
}
# File health check
#
# Reports healthy when the presence of `config.path` matches what the config
# asks for.
#
# Config fields read:
#   path        - filesystem path to test (required)
#   must_exist? - true: the path has to exist; false: it has to be absent
#                 (default true)
#
# Returns a record { healthy: bool, message: string }.
def file-health-check [
    config: record  # File probe settings (see the field list above)
]: nothing -> record {
    let target = $config.path
    let present = ($target | path exists)
    let must_exist = ($config.must_exist? | default true)

    if $must_exist {
        if $present {
            { healthy: true, message: $"File exists: ($target)" }
        } else {
            { healthy: false, message: $"File not found: ($target)" }
        }
    } else {
        if $present {
            { healthy: false, message: $"File exists but should not: ($target)" }
        } else {
            { healthy: true, message: $"File does not exist: ($target)" }
        }
    }
}
# Retry health check
#
# Runs `perform-health-check` repeatedly until it reports healthy or the
# attempts are used up, pausing between attempts.
#
# Config fields read (in addition to what `perform-health-check` needs):
#   retries?  - number of retries after the first attempt (default 3)
#   interval? - pause between attempts in seconds (default 10)
#
# Returns true as soon as one attempt is healthy, false otherwise.
export def retry-health-check [
    service_name: string   # Service name passed through to perform-health-check
    health_config: record  # Health check configuration for the service
]: nothing -> bool {
    let max_retries = $health_config.retries? | default 3
    let interval = $health_config.interval? | default 10
    # Nushell ranges are inclusive: one initial attempt plus `max_retries` retries.
    let total_attempts = $max_retries + 1

    for attempt in 1..$total_attempts {
        let result = (perform-health-check $service_name $health_config)
        if $result.healthy {
            return true
        }

        # Skip the pause after the final attempt.
        if $attempt < $total_attempts {
            # The literal "(" is escaped so it is not parsed as a subexpression.
            print $"Health check failed \(attempt ($attempt)/($total_attempts)), retrying in ($interval)s..."
            # `sleep` takes a duration, so scale the number of seconds by 1sec.
            sleep ($interval * 1sec)
        }
    }

    false
}
# Wait for service to be healthy
#
# Looks up the service definition through `manager.nu`, then polls
# `perform-health-check` until it reports healthy or `timeout` seconds have
# elapsed. The pause between polls is `health_check.interval` seconds
# (default 5).
#
# Returns true when the service became healthy, false on timeout.
export def wait-for-service [
    service_name: string  # Service to look up and poll
    timeout: int          # Maximum time to wait, in seconds
]: nothing -> bool {
    use manager.nu get-service-definition

    let service_def = (get-service-definition $service_name)
    let health_config = $service_def.health_check

    # No health check configured: report healthy without probing. Whether the
    # service is actually running is not verified here.
    if $health_config.type == "none" {
        return true
    }

    let interval = $health_config.interval? | default 5
    let start_time = (date now)
    let deadline = $timeout * 1sec

    loop {
        let result = (perform-health-check $service_name $health_config)
        if $result.healthy {
            print $"✅ Service ($service_name) is healthy"
            return true
        }

        # The deadline is evaluated after each failed check, so at least one
        # check always runs, even with a timeout of 0.
        if ((date now) - $start_time) > $deadline {
            print $"❌ Timeout waiting for ($service_name) to become healthy"
            return false
        }

        # The literal "(" is escaped so the parentheses appear in the output.
        print $"Waiting for ($service_name)... \(($result.message))"
        # `sleep` takes a duration, so scale the number of seconds by 1sec.
        sleep ($interval * 1sec)
    }
}
# Get service health status
#
# Returns a status record for the service:
#   - when `is-service-running` reports false:
#       { service, healthy: false, message: "Service not running", status: "stopped" }
#   - otherwise: the `perform-health-check` result plus a `status` field set
#     to "healthy" or "unhealthy".
export def get-health-status [
    service_name: string  # Service to inspect
]: nothing -> record {
    # Several names from one module are imported with the list form.
    use manager.nu [get-service-definition is-service-running]

    if not (is-service-running $service_name) {
        return {
            service: $service_name
            healthy: false
            message: "Service not running"
            status: "stopped"
        }
    }

    let service_def = (get-service-definition $service_name)
    let result = (perform-health-check $service_name $service_def.health_check)
    let status = if $result.healthy { "healthy" } else { "unhealthy" }

    {
        ...$result
        status: $status
    }
}
# Continuous health monitoring
#
# Prints one timestamped status line for the service every `--interval`
# seconds. Loops forever; stop it with Ctrl+C.
export def monitor-service-health [
    service_name: string  # Service to monitor
    --interval: int = 30  # Seconds between consecutive checks
    --alert-on-failure    # Also print an alert line whenever the service is unhealthy
] {
    # The literal "(" is escaped so it is not parsed as a subexpression.
    print $"Starting health monitoring for ($service_name) \(interval: ($interval)s)"
    print "Press Ctrl+C to stop"

    loop {
        let status = (get-health-status $service_name)
        let timestamp = (date now | format date "%Y-%m-%d %H:%M:%S")
        let health_icon = if $status.healthy { "✅" } else { "❌" }

        print $"($timestamp) ($health_icon) ($service_name): ($status.message)"

        if (not $status.healthy) and $alert_on_failure {
            # Could integrate with alerting system here
            print $"⚠️ ALERT: Service ($service_name) is unhealthy!"
        }

        # `sleep` takes a duration, so scale the number of seconds by 1sec.
        sleep ($interval * 1sec)
    }
}