242 lines
6.8 KiB
Plaintext
242 lines
6.8 KiB
Plaintext
|
|
#!/usr/bin/env nu
|
||
|
|
|
||
|
|
# Service Health Check System
# Performs health checks on services using one of several methods:
# http, tcp, command, file, or none.
|
||
|
|
|
||
|
|
# Run the health check configured for a service and report the outcome.
#
# Dispatches on `health_config.type` ("http", "tcp", "command", "file" or
# "none") and passes the sub-record of the same name to the matching checker.
# Any other type produces an unhealthy result that names the unknown type.
#
# Returns a record with the fields: service, healthy, message, check_type,
# duration_ms and checked_at.
export def perform-health-check [
    service_name: string   # Name reported in the `service` field of the result
    health_config: record  # Must carry `type` plus the sub-record for that type
] -> record {
    let began = (date now)
    let kind = $health_config.type

    # Each branch yields a record of the shape { healthy, message }.
    let outcome = if $kind == "http" {
        http-health-check $health_config.http
    } else if $kind == "tcp" {
        tcp-health-check $health_config.tcp
    } else if $kind == "command" {
        command-health-check $health_config.command
    } else if $kind == "file" {
        file-health-check $health_config.file
    } else if $kind == "none" {
        { healthy: true, message: "No health check configured" }
    } else {
        { healthy: false, message: $"Unknown health check type: ($health_config.type)" }
    }

    let finished = (date now)
    # The elapsed duration is turned into an integer and scaled to milliseconds.
    let elapsed_raw = (($finished - $began) | into int)
    let elapsed_ms = $elapsed_raw / 1_000_000
    let stamp = ($finished | format date "%Y-%m-%dT%H:%M:%S%z")

    {
        service: $service_name
        healthy: $outcome.healthy
        message: $outcome.message
        check_type: $kind
        duration_ms: $elapsed_ms
        checked_at: $stamp
    }
}
|
||
|
|
|
||
|
|
# HTTP health check.
#
# Probes `config.endpoint` with the built-in `http get` first. If that raises,
# falls back to `curl` and compares the reported status code with the expected
# one.
#
# Config fields read: endpoint, timeout (optional, default 5),
# expected_status (optional, default 200).
# Returns a record { healthy, message }.
def http-health-check [
    config: record  # HTTP check settings (see fields above)
] -> record {
    let timeout = ($config.timeout? | default 5)
    # Resolved before the try blocks and defaulted, so a config without
    # `expected_status` is not swallowed by `catch` and misreported as an
    # unreachable endpoint.
    let expected = ($config.expected_status? | default 200 | into string)

    try {
        # Only success or failure matters here; the response body is discarded.
        # NOTE(review): `-t` is kept from the original; newer Nushell releases
        # appear to spell this flag `--max-time` - confirm against the
        # targeted Nushell version.
        http get -t $timeout $config.endpoint | ignore

        { healthy: true, message: "HTTP health check passed" }
    } catch {
        try {
            # Fall back to curl, which prints only the numeric status code.
            let status_code = (curl -s -o /dev/null -w "%{http_code}" -m $timeout $config.endpoint)

            if $status_code == $expected {
                { healthy: true, message: $"HTTP status ($status_code) matches expected" }
            } else {
                { healthy: false, message: $"HTTP status ($status_code) != expected ($expected)" }
            }
        } catch {
            { healthy: false, message: "HTTP health check failed - endpoint unreachable" }
        }
    }
}
|
||
|
|
|
||
|
|
# TCP health check.
#
# Tries to open a TCP connection to `config.host`:`config.port` through bash's
# `/dev/tcp` redirection, with the attempt bounded by the `timeout` utility.
#
# Config fields read: host, port, timeout (optional, default 5).
# Returns a record { healthy, message }.
def tcp-health-check [
    config: record  # TCP check settings (see fields above)
] -> record {
    let timeout = ($config.timeout? | default 5)

    try {
        # Host and port are handed to bash as positional arguments ($1, $2)
        # instead of being spliced into the script text, so their content
        # cannot change the script. `complete` captures the exit code, so the
        # verdict does not depend on whether a failing external command raises
        # an error in the running Nushell version.
        let probe = (^timeout $timeout bash -c 'cat < /dev/null > "/dev/tcp/$1/$2"' tcp-health-check $config.host $config.port | complete)

        if $probe.exit_code == 0 {
            { healthy: true, message: $"TCP port ($config.port) is open" }
        } else {
            { healthy: false, message: $"TCP port ($config.port) is not accessible" }
        }
    } catch {
        # Reached when the probe itself raised, e.g. it could not be launched.
        { healthy: false, message: $"TCP port ($config.port) is not accessible" }
    }
}
|
||
|
|
|
||
|
|
# Command health check.
#
# Runs `config.command` through `bash -c` and compares its exit code with the
# expected one.
#
# Config fields read: command, expected_exit_code (optional, default 0).
# Returns a record { healthy, message }.
def command-health-check [
    config: record  # Command check settings (see fields above)
] -> record {
    # Defaulted outside the try block: a config without `expected_exit_code`
    # would otherwise raise inside it and be reported as a generic failure
    # even when the command itself succeeded.
    let expected = ($config.expected_exit_code? | default 0)

    try {
        # `complete` collects stdout, stderr and the exit code into a record.
        let outcome = (bash -c $config.command | complete)

        if $outcome.exit_code == $expected {
            { healthy: true, message: "Command health check passed" }
        } else {
            {
                healthy: false
                message: $"Command exit code ($outcome.exit_code) != expected ($expected)"
            }
        }
    } catch {
        { healthy: false, message: "Command health check failed" }
    }
}
|
||
|
|
|
||
|
|
# File health check.
#
# Compares whether `config.path` exists with what `config.must_exist` demands:
# healthy when the path exists and must exist, or is absent and must not exist.
#
# Returns a record { healthy, message }.
def file-health-check [
    config: record  # Needs `path` and the boolean `must_exist`
] -> record {
    let found = ($config.path | path exists)
    let wanted = $config.must_exist

    # The path is required and present.
    if $wanted and $found {
        return { healthy: true, message: $"File exists: ($config.path)" }
    }

    # The path is required but missing.
    if $wanted {
        return { healthy: false, message: $"File not found: ($config.path)" }
    }

    # The path must be absent but is present.
    if $found {
        return { healthy: false, message: $"File exists but should not: ($config.path)" }
    }

    # The path must be absent and is absent.
    { healthy: true, message: $"File does not exist: ($config.path)" }
}
|
||
|
|
|
||
|
|
# Retry health check.
#
# Runs `perform-health-check` until it reports healthy or the attempts run out.
# The range below is inclusive, so there is one initial attempt plus
# `retries` further attempts.
#
# Config fields read here: retries (optional, default 3) and interval
# (optional, default 10, in seconds between attempts).
# Returns true as soon as a check is healthy, false if every attempt fails.
export def retry-health-check [
    service_name: string   # Service name passed on to perform-health-check
    health_config: record  # Health check settings, including retry options
] -> bool {
    let max_retries = ($health_config.retries? | default 3)
    let interval = ($health_config.interval? | default 10)

    for attempt in 1..($max_retries + 1) {
        let result = (perform-health-check $service_name $health_config)

        if $result.healthy {
            return true
        }

        # No message and no wait after the final attempt.
        if $attempt < ($max_retries + 1) {
            # `\(` keeps the parenthesis literal; an unescaped `(` inside an
            # interpolated string starts a subexpression and would try to run
            # `attempt` as a command.
            print $"Health check failed \(attempt ($attempt)/($max_retries)), retrying in ($interval)s..."
            # `sleep` takes a duration, so scale the number of seconds by 1sec.
            sleep ($interval * 1sec)
        }
    }

    false
}
|
||
|
|
|
||
|
|
# Wait for service to be healthy.
#
# Looks up the service definition, then polls `perform-health-check` until it
# reports healthy or more than `timeout` seconds have elapsed. A service whose
# health check type is "none" counts as healthy immediately.
#
# Returns true when the service became healthy, false on timeout.
export def wait-for-service [
    service_name: string  # Service to look up and poll
    timeout: int          # Maximum time to wait, in seconds
] -> bool {
    use manager.nu get-service-definition

    let service_def = (get-service-definition $service_name)
    let health_config = $service_def.health_check

    # If no health check, assume healthy if running
    if $health_config.type == "none" {
        return true
    }

    let interval = ($health_config.interval? | default 5)
    let start_time = (date now)
    let timeout_ns = ($timeout * 1_000_000_000)  # Convert to nanoseconds

    loop {
        let result = (perform-health-check $service_name $health_config)

        if $result.healthy {
            print $"✅ Service ($service_name) is healthy"
            return true
        }

        # The timeout is checked after each failed probe, before sleeping.
        let elapsed = (((date now) - $start_time) | into int)
        if $elapsed > $timeout_ns {
            print $"❌ Timeout waiting for ($service_name) to become healthy"
            return false
        }

        # `\(` prints a literal parenthesis around the interpolated message;
        # the unescaped form was evaluated as a nested subexpression and the
        # parentheses never appeared.
        print $"Waiting for ($service_name)... \(($result.message))"
        # `sleep` takes a duration, so scale the number of seconds by 1sec.
        sleep ($interval * 1sec)
    }
}
|
||
|
|
|
||
|
|
# Get service health status.
#
# Reports a "stopped" record without probing when the service is not running.
# Otherwise runs the configured health check and adds a `status` field of
# "healthy" or "unhealthy" to its result.
export def get-health-status [
    service_name: string  # Service to inspect
] -> record {
    use manager.nu get-service-definition is-service-running

    if (is-service-running $service_name) {
        let definition = (get-service-definition $service_name)
        let check = (perform-health-check $service_name $definition.health_check)
        let label = if $check.healthy { "healthy" } else { "unhealthy" }

        # Keep every field of the check result and append the status label.
        $check | merge { status: $label }
    } else {
        {
            service: $service_name
            healthy: false
            message: "Service not running"
            status: "stopped"
        }
    }
}
|
||
|
|
|
||
|
|
# Continuous health monitoring.
#
# Prints one timestamped status line per cycle for the given service, looping
# until interrupted. With --alert-on-failure an extra alert line is printed
# whenever the service is unhealthy.
export def monitor-service-health [
    service_name: string    # Service to monitor
    --interval: int = 30    # Seconds to wait between checks
    --alert-on-failure      # Also print an alert line when unhealthy
] {
    # `\(` keeps the parenthesis literal; an unescaped `(` inside an
    # interpolated string starts a subexpression and would try to run
    # `interval:` as a command.
    print $"Starting health monitoring for ($service_name) \(interval: ($interval)s)"
    print "Press Ctrl+C to stop"

    loop {
        let status = (get-health-status $service_name)

        let timestamp = (date now | format date "%Y-%m-%d %H:%M:%S")
        let health_icon = if $status.healthy { "✅" } else { "❌" }

        print $"($timestamp) ($health_icon) ($service_name): ($status.message)"

        if (not $status.healthy) and $alert_on_failure {
            # Could integrate with alerting system here
            print $"⚠️ ALERT: Service ($service_name) is unhealthy!"
        }

        # `sleep` takes a duration, so scale the number of seconds by 1sec.
        sleep ($interval * 1sec)
    }
}
|