provisioning/reflection/modes/validate-observability.ncl

117 lines
4.6 KiB
Text
Raw Normal View History

2026-05-12 02:40:14 +01:00
{
# Identity and metadata of this validation mode.
id = "validate-observability",
version = "1.0",
strategy = 'Override,
# Human-readable summary of what the steps below verify.
description = "Verify Vector → Loki → Grafana pipeline is healthy in libre-wuji; Prometheus is scraping ops-controller and keeper-daemon; required queue metrics are present (ADR-037 constraint pending-queue-ttl-monitored).",
# Inputs of the mode. Step commands refer to them with `{name}` placeholders
# (presumably substituted by the mode runner before execution — confirm).
params = {
  workspace
    | String
    | doc "Workspace name (e.g. 'libre-wuji')"
    | default = "libre-wuji",

  # NOTE(review): no step command in this file references `{kubeconfig}`;
  # confirm the runner applies it (e.g. by exporting KUBECONFIG).
  kubeconfig
    | String
    | doc "Path to kubeconfig; uses KUBECONFIG env if empty"
    | default = "",

  # Base URLs of the observability services; defaults are in-cluster DNS names.
  prometheus_url
    | String
    | doc "Prometheus API base URL"
    | default = "http://prometheus.observability.svc.cluster.local:9090",
  grafana_url
    | String
    | doc "Grafana API base URL"
    | default = "http://grafana.observability.svc.cluster.local:3000",
  loki_url
    | String
    | doc "Loki API base URL"
    | default = "http://loki.observability.svc.cluster.local:3100",
},
steps = [
# Waits up to 30s for the `vector` DaemonSet rollout in the `observability`
# namespace to complete.
# NOTE(review): the `kubeconfig` param is not passed to kubectl here —
# presumably the runner exports it as KUBECONFIG; confirm.
{
  id = "vector_daemonset_ready",
  name = "Vector DaemonSet is fully rolled out",
  actor = 'Agent,
  depends_on = [],
  on_error = "abort",
  cmd = "kubectl -n observability rollout status daemonset/vector --timeout=30s",
},
# Probes Loki's readiness endpoint; `curl -f` makes an HTTP error status
# (>= 400) a non-zero exit.
{
  id = "loki_ready",
  name = "Loki /ready endpoint responds 200",
  actor = 'Agent,
  depends_on = [],
  on_error = "abort",
  cmd = "curl -sf {loki_url}/ready",
},
# Probes Prometheus' readiness endpoint; `curl -f` makes an HTTP error status
# (>= 400) a non-zero exit.
{
  id = "prometheus_ready",
  name = "Prometheus /-/ready endpoint responds 200",
  actor = 'Agent,
  depends_on = [],
  on_error = "abort",
  cmd = "curl -sf {prometheus_url}/-/ready",
},
# Probes Grafana's health endpoint and additionally requires the reported
# `database` field to equal "ok" (`jq -e` exits non-zero on false/null).
{
  id = "grafana_ready",
  name = "Grafana /api/health endpoint responds 200",
  actor = 'Agent,
  depends_on = [],
  on_error = "warn",
  cmd = "curl -sf {grafana_url}/api/health | jq -e '.database == \"ok\"'",
},
# Confirms Prometheus has at least one healthy ops-controller scrape target.
# Decoded PromQL: up{job="ops-controller"} == 1
#
# The selector is percent-encoded (as the Loki query in this file already is)
# because a literal `{...}` in a URL is consumed by curl's URL globbing, which
# strips the braces, and because braces are also the placeholder syntax used
# by these commands. The `== 1` filter drops targets whose last scrape failed
# (up == 0); without it a down target would still yield a non-empty result.
{
  id = "prometheus_scrapes_ops_controller",
  name = "Prometheus is scraping ops-controller metrics",
  actor = 'Agent,
  depends_on = ["prometheus_ready"],
  on_error = "abort",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=up%7Bjob%3D%22ops-controller%22%7D%3D%3D1' | jq -e '.data.result | length > 0'",
},
# Instant query for `ops_pending_queue_depth`; passes when Prometheus returns
# at least one series.
{
  id = "metric_ops_pending_queue_depth",
  name = "ops_pending_queue_depth metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "abort",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=ops_pending_queue_depth' | jq -e '.data.result | length > 0'",
},
# Instant query for `ops_pending_oldest_age_seconds`; passes when Prometheus
# returns at least one series.
{
  id = "metric_ops_pending_oldest_age",
  name = "ops_pending_oldest_age_seconds metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "abort",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=ops_pending_oldest_age_seconds' | jq -e '.data.result | length > 0'",
},
# Instant query for `keeper_signs_total`; passes when Prometheus returns at
# least one series. Failure policy here is "warn" rather than "abort".
{
  id = "metric_keeper_signs_total",
  name = "keeper_signs_total metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "warn",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=keeper_signs_total' | jq -e '.data.result | length > 0'",
},
# Instant query for `audit_mirror_lag_seconds`; passes when Prometheus returns
# at least one series. Failure policy here is "warn" rather than "abort".
{
  id = "metric_audit_mirror_lag",
  name = "audit_mirror_lag_seconds metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "warn",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=audit_mirror_lag_seconds' | jq -e '.data.result | length > 0'",
},
# Fetches the dashboard with uid `ops-controller-queue` and checks that the
# returned uid matches.
#
# The step name previously claimed that all four ops dashboards were verified,
# but the command only ever looked up this single uid; the name now states
# what is actually checked so the report is not misleading.
# TODO: add equivalent lookups for the remaining three ops dashboards once
# their uids are known.
# NOTE(review): Grafana's dashboard API normally requires authentication
# unless anonymous access is enabled — confirm this unauthenticated call works.
{
  id = "grafana_dashboards_loaded",
  name = "ops-controller-queue dashboard provisioned in Grafana",
  actor = 'Agent,
  depends_on = ["grafana_ready"],
  on_error = "warn",
  cmd = "curl -sf '{grafana_url}/api/dashboards/uid/ops-controller-queue' | jq -e '.dashboard.uid == \"ops-controller-queue\"'",
},
# Asks Loki for at most one log line from `{namespace="ops-system"}` (query is
# percent-encoded) starting 15 minutes ago, and passes when any stream is
# returned.
#
# The URL must be double-quoted for the shell: inside single quotes the
# `$(date ...)` substitution is never expanded, and the inner
# '15 minutes ago' quotes would terminate the outer quoting and split the URL
# into several curl arguments. `start` is a nanosecond epoch, hence the nine
# zeros appended to the seconds value. `date -d` is the GNU date syntax.
{
  id = "loki_receives_ops_logs",
  name = "Loki has received at least one log entry from ops-system namespace in last 15 minutes",
  actor = 'Agent,
  depends_on = ["loki_ready"],
  on_error = "warn",
  cmd = "curl -sf \"{loki_url}/loki/api/v1/query_range?query=%7Bnamespace%3D%22ops-system%22%7D&limit=1&start=$(date -d '15 minutes ago' +%s)000000000\" | jq -e '.data.result | length > 0'",
},
# Final step: prints a completion line for the workspace.
#
# It depends on every leaf check of the step graph so the summary is only
# emitted after all checks have been evaluated. Previously the Vector rollout
# check and the keeper/audit metric checks were not listed, so the summary
# did not wait for them.
{
  id = "summary_report",
  name = "Emit observability validation summary",
  actor = 'Agent,
  depends_on = [
    "vector_daemonset_ready",
    "metric_ops_pending_queue_depth",
    "metric_ops_pending_oldest_age",
    "metric_keeper_signs_total",
    "metric_audit_mirror_lag",
    "grafana_dashboards_loaded",
    "loki_receives_ops_logs",
  ],
  on_error = "warn",
  cmd = "echo 'observability validation complete for workspace={workspace}'",
},
],
}