Some checks failed
Rust CI / Security Audit (push) Has been cancelled
Rust CI / Check + Test + Lint (nightly) (push) Has been cancelled
Rust CI / Check + Test + Lint (stable) (push) Has been cancelled
mdBook Build & Deploy / Build mdBook (push) Has been cancelled
Nickel Type Check / Nickel Type Checking (push) Has been cancelled
mdBook Build & Deploy / Documentation Quality Check (push) Has been cancelled
mdBook Build & Deploy / Deploy to GitHub Pages (push) Has been cancelled
mdBook Build & Deploy / Notification (push) Has been cancelled
226 lines
6.8 KiB
Plaintext
Executable File
226 lines
6.8 KiB
Plaintext
Executable File
#!/usr/bin/env nu

# VAPORA Health Check and Monitoring Script
# Monitors deployment health across Docker and Kubernetes platforms
# Version: 1.0.0

# Entry point: repeatedly runs the health check for the selected platform and
# prints a summary after each run.
#
# With --count <= 0 the checks repeat until the process is interrupted;
# otherwise exactly --count checks are performed, sleeping --interval seconds
# between consecutive checks (no sleep after the last one).
def main [
    --target: string = "docker"   # Platform to check: "docker" or "kubernetes"
    --interval: int = 30          # Seconds to wait between checks
    --count: int = 0              # Number of checks to run (<= 0 means run until interrupted)
] {
    print "🏥 VAPORA Health Check Monitor"
    print $"Target: ($target) | Interval: ($interval)s"
    print ""

    # int * duration yields a duration, so no unit-conversion command is needed.
    let pause = ($interval * 1sec)

    if $count <= 0 {
        print "⚠️ Running continuous monitoring (Press Ctrl+C to stop)"
        print ""

        loop {
            report-status (run-health-check $target) "❌ Unhealthy services detected!" "✅ All services healthy"
            print ""
            sleep $pause
        }
    } else {
        # `for` rather than `each`: the loop exists only for its side effects,
        # and `each` would make main return (and render) a list.
        for iteration in 1..$count {
            print $"Check ($iteration)/($count):"
            report-status (run-health-check $target) "❌ Unhealthy" "✅ Healthy"

            if $iteration < $count {
                print ""
                sleep $pause
            }
        }
    }
}

# Dispatches to the platform-specific health check and returns its result
# record ({healthy, issues}). Raises an error for an unrecognised target.
def run-health-check [target: string] {
    match $target {
        "docker" => { check-docker-health }
        "kubernetes" => { check-kubernetes-health }
        _ => { error make {msg: $"Unknown target: ($target)"} }
    }
}

# Prints the healthy banner, or the unhealthy banner followed by one bullet
# per entry of `status.issues`.
def report-status [status: record, unhealthy_msg: string, healthy_msg: string] {
    if $status.healthy {
        print $healthy_msg
    } else {
        print $unhealthy_msg
        for issue in $status.issues {
            print $" • ($issue)"
        }
    }
}
|
|
|
|
# Checks the VAPORA Docker containers and their local HTTP endpoints.
#
# For each expected container, `docker ps` is queried by name and the reported
# status is inspected; for each endpoint, `curl` fetches the URL and the HTTP
# status code is inspected. Progress is printed as the checks run.
#
# Returns a record:
#   healthy - true when no issue was recorded
#   issues  - list of strings, one per detected problem
def check-docker-health [] {
    let services = ["vapora-backend", "vapora-agents", "vapora-llm-router", "vapora-frontend"]

    # Must be `mut` and updated by assignment inside `for` loops: `append`
    # returns a new list instead of modifying its input, and closures passed
    # to `each` are not allowed to capture mutable variables.
    mut issues = []

    print "🐳 Checking Docker services..."

    for service in $services {
        let result = (do {
            docker ps --filter $"name=($service)" --format "{{.Status}}"
        } | complete)

        if $result.exit_code == 0 {
            let status = ($result.stdout | str trim)
            if ($status | str contains "Up") {
                print $" ✓ ($service): ($status)"
            } else if ($status | is-empty) {
                # `docker ps` printed nothing for this name filter.
                print $" ✗ ($service): not running"
                $issues = ($issues | append $"($service) not running")
            } else {
                print $" ⚠️ ($service): ($status)"
                $issues = ($issues | append $"($service) in state: ($status)")
            }
        } else {
            print $" ✗ ($service): error checking status"
            $issues = ($issues | append $"Failed to check ($service)")
        }
    }

    print ""
    print "📊 Checking service endpoints..."

    let endpoints = [
        {name: "backend", url: "http://localhost:8001/health"}
        {name: "agents", url: "http://localhost:8002/health"}
        {name: "llm-router", url: "http://localhost:8003/health"}
        {name: "frontend", url: "http://localhost:3000/"}
    ]

    for endpoint in $endpoints {
        let name = $endpoint.name
        let url = $endpoint.url

        # --max-time bounds each probe so one hung endpoint cannot stall the
        # whole monitoring loop; a timeout makes curl exit non-zero.
        let result = (do {
            curl -s -o /dev/null -w "%{http_code}" --max-time 10 $url
        } | complete)

        if $result.exit_code == 0 {
            let status_code = ($result.stdout | str trim)
            if ($status_code | str starts-with "2") {
                print $" ✓ ($name): HTTP ($status_code)"
            } else {
                print $" ⚠️ ($name): HTTP ($status_code)"
                $issues = ($issues | append $"($name) returned HTTP ($status_code)")
            }
        } else {
            print $" ✗ ($name): unreachable"
            $issues = ($issues | append $"($name) endpoint unreachable")
        }
    }

    {
        healthy: ($issues | is-empty)
        issues: $issues
    }
}
|
|
|
|
# Checks the VAPORA workloads in the `vapora` Kubernetes namespace.
#
# Three things are inspected via `kubectl ... -o json`:
#   1. each expected deployment: desired vs. ready vs. updated replica counts
#   2. every pod: phase must be "Running" and its "Ready" condition "True"
#   3. every service: reports its type and cluster IP
# Progress is printed as the checks run.
#
# Returns a record:
#   healthy - true when no issue was recorded
#   issues  - list of strings, one per detected problem
def check-kubernetes-health [] {
    let deployments = ["vapora-backend", "vapora-agents", "vapora-llm-router"]

    # Must be `mut` and updated by assignment inside `for` loops: `append`
    # returns a new list instead of modifying its input, and closures passed
    # to `each` are not allowed to capture mutable variables.
    mut issues = []

    print "☸️ Checking Kubernetes deployments..."

    for deployment in $deployments {
        let result = (do {
            kubectl get deployment $deployment -n vapora -o json
        } | complete)

        if $result.exit_code == 0 {
            let deploy_json = ($result.stdout | from json)
            let desired = $deploy_json.spec.replicas
            # The API omits these counters from the JSON when they are zero,
            # so read them optionally and treat a missing field as 0.
            let ready = ($deploy_json.status?.readyReplicas? | default 0)
            let updated = ($deploy_json.status?.updatedReplicas? | default 0)

            if ($desired == $ready) and ($desired == $updated) {
                print $" ✓ ($deployment): ($ready)/($desired) replicas ready"
            } else {
                print $" ⚠️ ($deployment): ($ready)/($desired) replicas ready"
                $issues = ($issues | append $"($deployment) replicas not ready: ($ready)/($desired)")
            }
        } else {
            print $" ✗ ($deployment): not found"
            $issues = ($issues | append $"($deployment) deployment not found")
        }
    }

    print ""
    print "📊 Checking pod health..."

    let pods_result = (do {
        kubectl get pods -n vapora -o json
    } | complete)

    if $pods_result.exit_code == 0 {
        let pods = ($pods_result.stdout | from json | get items)

        for pod in $pods {
            let name = $pod.metadata.name
            let phase = $pod.status.phase
            # A pod may not have a "Ready" condition (or any conditions) yet;
            # `any` over a possibly-empty list yields false instead of failing.
            let is_ready = (
                $pod.status.conditions?
                | default []
                | any { |cond| ($cond.type == "Ready") and ($cond.status == "True") }
            )

            if ($phase == "Running") and $is_ready {
                print $" ✓ ($name): Running"
            } else {
                print $" ⚠️ ($name): ($phase)"
                $issues = ($issues | append $"Pod ($name) in phase: ($phase)")
            }
        }
    } else {
        print " ✗ Could not get pod status"
        $issues = ($issues | append "Failed to query pods")
    }

    print ""
    print "📊 Checking services..."

    let svc_result = (do {
        kubectl get svc -n vapora -o json
    } | complete)

    if $svc_result.exit_code == 0 {
        let services = ($svc_result.stdout | from json | get items)

        for service in $services {
            let name = $service.metadata.name
            let svc_type = $service.spec.type
            # A missing clusterIP field is handled the same as the literal "None".
            let cluster_ip = ($service.spec.clusterIP? | default "None")

            if ($cluster_ip != "None") {
                print $" ✓ ($name): ($svc_type) - ($cluster_ip)"
            } else {
                print $" ⚠️ ($name): no cluster IP assigned"
                $issues = ($issues | append $"Service ($name) has no cluster IP")
            }
        }
    } else {
        # Report a failed query the same way the pod query does, rather than
        # silently treating it as healthy.
        print " ✗ Could not get service status"
        $issues = ($issues | append "Failed to query services")
    }

    {
        healthy: ($issues | is-empty)
        issues: $issues
    }
}
|
|
|
|
# No explicit `main` call here: when this file is run as a script
# (`nu <file>` or through the shebang), Nushell invokes `main` automatically
# after the top-level code and forwards the command-line flags to it.
# Calling `main` by hand as well would first start a run with the default
# flags (continuous Docker monitoring), so --target/--interval/--count given
# on the command line would never take effect.
|