prvng_core/nulib/lib_provisioning/services/health.nu

253 lines
6.9 KiB
Text
Raw Normal View History

2025-10-07 10:32:04 +01:00
#!/usr/bin/env nu
# Service Health Check System
# Performs health checks on services using various methods
# Build the failure result used when the per-type settings record is absent.
def missing-health-settings [check_type: string] {
    {
        healthy: false
        message: $"Health check type '($check_type)' has no '($check_type)' settings in its configuration"
    }
}

# Perform health check
#
# Dispatches on `health_config.type` ("http", "tcp", "command", "file" or
# "none") to the matching checker, passing it the sub-record stored under the
# key of the same name, and times the call.
#
# Parameters:
#   service_name  - name copied into the returned record
#   health_config - record with a `type` field plus a settings record keyed by
#                   that type (e.g. `http: { endpoint: ... }`)
#
# Returns a record:
#   { service, healthy, message, check_type, duration_ms, checked_at }
#
# An unknown type, or a known type whose settings record is missing, yields
# `healthy: false` with an explanatory message instead of raising an error.
export def perform-health-check [
    service_name: string
    health_config: record
] {
    let start_time = (date now)
    let check_type = $health_config.type

    let result = match $check_type {
        "http" => {
            # Optional access: a config such as { type: "http" } without an
            # `http` record must not abort the whole check.
            let settings = $health_config.http?
            if $settings == null {
                missing-health-settings $check_type
            } else {
                http-health-check $settings
            }
        }
        "tcp" => {
            let settings = $health_config.tcp?
            if $settings == null {
                missing-health-settings $check_type
            } else {
                tcp-health-check $settings
            }
        }
        "command" => {
            let settings = $health_config.command?
            if $settings == null {
                missing-health-settings $check_type
            } else {
                command-health-check $settings
            }
        }
        "file" => {
            let settings = $health_config.file?
            if $settings == null {
                missing-health-settings $check_type
            } else {
                file-health-check $settings
            }
        }
        "none" => {
            { healthy: true, message: "No health check configured" }
        }
        _ => {
            { healthy: false, message: $"Unknown health check type: ($check_type)" }
        }
    }

    let end_time = (date now)
    # A duration converts to nanoseconds; divide to get milliseconds.
    let duration = (($end_time - $start_time) | into int) / 1_000_000

    {
        service: $service_name
        healthy: $result.healthy
        message: $result.message
        check_type: $check_type
        duration_ms: $duration
        checked_at: ($end_time | format date "%Y-%m-%dT%H:%M:%S%z")
    }
}
# HTTP health check
#
# Issues an HTTP HEAD request against `config.endpoint` and compares the
# response status with the expected one.
#
# Fields read from `config`:
#   endpoint         - URL to probe
#   timeout?         - request timeout in seconds (default 5)
#   expected_status? - status that counts as healthy (default 200)
#
# Returns { healthy: bool, message: string }. Any error raised by the request
# is reported as an unreachable endpoint rather than propagated.
def http-health-check [
    config: record
] {
    let timeout = ($config.timeout? | default 5)
    let expected_status = ($config.expected_status? | default 200)
    let timeout_dur = ($"($timeout)sec" | into duration)

    let response = (try {
        http head --allow-errors --full --max-time $timeout_dur $config.endpoint
    } catch {
        return { healthy: false, message: "HTTP health check failed - endpoint unreachable" }
    })

    let status = $response.status
    if $status == $expected_status {
        { healthy: true, message: $"HTTP status ($status) matches expected" }
    } else {
        { healthy: false, message: $"HTTP status ($status) != expected ($expected_status)" }
    }
}
# TCP health check
#
# Tries to open a TCP connection to `config.host`:`config.port` through bash's
# `/dev/tcp` pseudo-device, bounded by the external `timeout` command.
#
# Fields read from `config`:
#   host     - host to connect to
#   port     - port to connect to
#   timeout? - value passed to `timeout` (default 5)
#
# Returns { healthy: bool, message: string }; healthy when the bash command
# exits with code 0.
def tcp-health-check [
    config: record
] {
    let timeout = ($config.timeout? | default 5)

    let result = (do {
        # Opening /dev/tcp/<host>/<port> for writing succeeds only if the
        # connection can be established.
        ^bash -c $"timeout ($timeout) bash -c 'cat < /dev/null > /dev/tcp/($config.host)/($config.port)' 2>&1"
    } | complete)

    if $result.exit_code == 0 {
        { healthy: true, message: $"TCP port ($config.port) is open" }
    } else {
        { healthy: false, message: $"TCP port ($config.port) is not accessible" }
    }
}
# Command health check
#
# Runs `config.command` through `bash -c` and compares its exit code with the
# expected one.
#
# Fields read from `config`:
#   command             - shell command line to execute
#   expected_exit_code? - exit code that counts as healthy (default 0)
#
# Returns { healthy: bool, message: string }.
def command-health-check [
    config: record
] {
    # Optional access so a config that omits the field does not raise.
    let expected = ($config.expected_exit_code? | default 0)

    let result = (do {
        ^bash -c $config.command
    } | complete)

    if $result.exit_code == $expected {
        { healthy: true, message: "Command health check passed" }
    } else {
        {
            healthy: false
            message: $"Command exit code ($result.exit_code) != expected ($expected)"
        }
    }
}
# File health check
#
# Checks whether `config.path` exists and compares that with the expectation.
#
# Fields read from `config`:
#   path        - filesystem path to test
#   must_exist? - true if the path is required to exist, false if it is
#                 required to be absent (default true)
#
# Returns { healthy: bool, message: string }.
def file-health-check [
    config: record
] {
    let path_exists = ($config.path | path exists)
    # Optional access so a config that omits the field does not raise.
    let must_exist = ($config.must_exist? | default true)

    if $must_exist {
        if $path_exists {
            { healthy: true, message: $"File exists: ($config.path)" }
        } else {
            { healthy: false, message: $"File not found: ($config.path)" }
        }
    } else {
        if $path_exists {
            { healthy: false, message: $"File exists but should not: ($config.path)" }
        } else {
            { healthy: true, message: $"File does not exist: ($config.path)" }
        }
    }
}
# Retry health check
#
# Runs `perform-health-check` repeatedly until it reports healthy or the
# attempts are exhausted.
#
# Fields read from `health_config` (besides those used by the check itself):
#   retries?  - number of retries after the first attempt (default 3), so up
#               to `retries + 1` checks are made
#   interval? - pause between attempts in seconds (default 10)
#
# Returns true as soon as a check is healthy, false if every attempt failed.
export def retry-health-check [
    service_name: string
    health_config: record
] {
    let max_retries = ($health_config.retries? | default 3)
    let interval = ($health_config.interval? | default 10)
    let total_attempts = $max_retries + 1

    # Nushell ranges are inclusive on both ends.
    for attempt in 1..$total_attempts {
        let result = (perform-health-check $service_name $health_config)
        if $result.healthy {
            return true
        }

        # No message or pause after the final attempt.
        if $attempt < $total_attempts {
            # Literal parentheses must be escaped inside $"..." or they are
            # evaluated as a sub-expression.
            print $"Health check failed \(attempt ($attempt)/($max_retries)\), retrying in ($interval)s..."
            sleep ($"($interval)sec" | into duration)
        }
    }

    false
}
# Wait for service to be healthy
#
# Polls `perform-health-check` until the service reports healthy or `timeout`
# seconds have elapsed.
#
# Parameters:
#   service_name  - service to check
#   timeout       - maximum time to wait, in seconds
#   health_config - optional health check configuration; when omitted a
#                   default of { type: "http", interval: 5, timeout: 30 } is used
#
# Returns true when the service became healthy (or its check type is "none"),
# false on timeout. The timeout is evaluated after each failed check, so at
# least one check is always performed.
export def wait-for-service [
    service_name: string
    timeout: int
    health_config?: record
] {
    # If health_config not provided, use default health check config
    let health_check = $health_config | default {
        type: "http"
        interval: 5
        timeout: 30
    }

    # If no health check, assume healthy if running
    if $health_check.type == "none" {
        return true
    }

    let interval = ($health_check.interval? | default 5)
    let start_time = (date now)
    let timeout_ns = ($timeout * 1_000_000_000)  # Convert to nanoseconds

    # Iterate instead of recursing: a recursive helper adds one call frame per
    # poll and long waits would run into Nushell's recursion limit.
    loop {
        let check_result = (perform-health-check $service_name $health_check)
        if $check_result.healthy {
            print $"✅ Service ($service_name) is healthy"
            return true
        }

        let elapsed = (((date now) - $start_time) | into int)
        if $elapsed > $timeout_ns {
            print $"❌ Timeout waiting for ($service_name) to become healthy"
            return false
        }

        print $"Waiting for ($service_name)... (($check_result.message))"
        sleep ($"($interval)sec" | into duration)
    }
}
# Get service health status
#
# Parameters (passed in by the caller to avoid a circular dependency with
# manager.nu):
#   service_name  - service to report on
#   is_running    - whether the caller considers the service running
#                   (default false)
#   health_config - optional health check configuration; when omitted a
#                   default of { type: "http", interval: 5, timeout: 30 } is used
#
# Returns:
#   - when not running: { service, healthy: false, message, status: "stopped" }
#   - otherwise: the record from `perform-health-check` plus
#     status: "healthy" | "unhealthy"
export def get-health-status [
    service_name: string
    is_running: bool = false
    health_config?: record
] {
    # A service that is not running is reported as stopped without any check.
    if not $is_running {
        return {
            service: $service_name
            healthy: false
            message: "Service not running"
            status: "stopped"
        }
    }

    # Use provided health_config or default
    let hc = $health_config | default {
        type: "http"
        interval: 5
        timeout: 30
    }

    let result = (perform-health-check $service_name $hc)
    let status = if $result.healthy { "healthy" } else { "unhealthy" }

    {
        ...$result
        status: $status
    }
}
# Continuous health monitoring
#
# Prints one status line for `service_name` every `--interval` seconds until
# interrupted. With `--alert-on-failure` an extra alert line is printed each
# time the status is unhealthy.
#
# NOTE(review): `get-health-status` is called without `is_running`, which
# defaults to false, so as written every iteration reports
# "Service not running". Confirm whether the running state and health config
# should be supplied here.
export def monitor-service-health [
    service_name: string
    --interval: int = 30
    --alert-on-failure
] {
    # Literal parentheses must be escaped inside $"..." or they are evaluated
    # as a sub-expression.
    print $"Starting health monitoring for ($service_name) \(interval: ($interval)s\)"
    print "Press Ctrl+C to stop"

    loop {
        let status = (get-health-status $service_name)
        let timestamp = (date now | format date "%Y-%m-%d %H:%M:%S")
        let health_icon = if $status.healthy { "✅" } else { "❌" }

        print $"($timestamp) ($health_icon) ($service_name): ($status.message)"

        if (not $status.healthy) and $alert_on_failure {
            # Could integrate with alerting system here
            print $"⚠️ ALERT: Service ($service_name) is unhealthy!"
        }

        sleep ($"($interval)sec" | into duration)
    }
}