# Workflow definition: validate the observability stack for a workspace.
#
# Each step runs a shell command (`cmd`). `depends_on` lists the step ids that
# must run first, and `on_error` is either "abort" or "warn". Placeholders of
# the form `{param_name}` inside `cmd` refer to entries of `params`.
#
# Literal `{`/`}` characters are kept out of URLs on purpose (percent-encoded
# as %7B/%7D): curl applies URL globbing to braces, which would silently
# rewrite the query before it is sent.
{
  id = "validate-observability",
  strategy = 'Override,
  # NOTE(review): the description mentions keeper-daemon scraping, but no step
  # below queries an `up` series for it; only ops-controller is checked.
  # Confirm the keeper-daemon job label and add a matching step if required.
  description = "Verify Vector → Loki → Grafana pipeline is healthy in libre-wuji; Prometheus is scraping ops-controller and keeper-daemon; required queue metrics are present (ADR-037 constraint pending-queue-ttl-monitored).",
  version = "1.0",

  params = {
    workspace
      | String
      | doc "Workspace name (e.g. 'libre-wuji')"
      | default
      = "libre-wuji",
    # NOTE(review): no step command references {kubeconfig}; presumably the
    # runner exports it as KUBECONFIG for kubectl. Verify against the runner.
    kubeconfig
      | String
      | doc "Path to kubeconfig; uses KUBECONFIG env if empty"
      | default
      = "",
    prometheus_url
      | String
      | doc "Prometheus API base URL"
      | default
      = "http://prometheus.observability.svc.cluster.local:9090",
    grafana_url
      | String
      | doc "Grafana API base URL"
      | default
      = "http://grafana.observability.svc.cluster.local:3000",
    loki_url
      | String
      | doc "Loki API base URL"
      | default
      = "http://loki.observability.svc.cluster.local:3100",
  },

  steps = [
    # --- Component readiness -------------------------------------------------
    {
      id = "vector_daemonset_ready",
      actor = 'Agent,
      name = "Vector DaemonSet is fully rolled out",
      cmd = "kubectl -n observability rollout status daemonset/vector --timeout=30s",
      depends_on = [],
      on_error = "abort",
    },
    {
      id = "loki_ready",
      actor = 'Agent,
      name = "Loki /ready endpoint responds 200",
      cmd = "curl -sf {loki_url}/ready",
      depends_on = [],
      on_error = "abort",
    },
    {
      id = "prometheus_ready",
      actor = 'Agent,
      name = "Prometheus /-/ready endpoint responds 200",
      cmd = "curl -sf {prometheus_url}/-/ready",
      depends_on = [],
      on_error = "abort",
    },
    {
      id = "grafana_ready",
      actor = 'Agent,
      name = "Grafana /api/health endpoint responds 200",
      cmd = "curl -sf {grafana_url}/api/health | jq -e '.database == \"ok\"'",
      depends_on = [],
      on_error = "warn",
    },

    # --- Prometheus scrape targets and required metrics ----------------------
    {
      id = "prometheus_scrapes_ops_controller",
      actor = 'Agent,
      name = "Prometheus is scraping ops-controller metrics",
      # Percent-encoded PromQL: up{job="ops-controller"} == 1
      # The `== 1` filter drops targets whose last scrape failed (up == 0), so
      # a registered-but-down target no longer passes the check.
      cmd = "curl -sf '{prometheus_url}/api/v1/query?query=up%7Bjob%3D%22ops-controller%22%7D%20%3D%3D%201' | jq -e '.data.result | length > 0'",
      depends_on = ["prometheus_ready"],
      on_error = "abort",
    },
    {
      id = "metric_ops_pending_queue_depth",
      actor = 'Agent,
      name = "ops_pending_queue_depth metric is present",
      cmd = "curl -sf '{prometheus_url}/api/v1/query?query=ops_pending_queue_depth' | jq -e '.data.result | length > 0'",
      depends_on = ["prometheus_scrapes_ops_controller"],
      on_error = "abort",
    },
    {
      id = "metric_ops_pending_oldest_age",
      actor = 'Agent,
      name = "ops_pending_oldest_age_seconds metric is present",
      cmd = "curl -sf '{prometheus_url}/api/v1/query?query=ops_pending_oldest_age_seconds' | jq -e '.data.result | length > 0'",
      depends_on = ["prometheus_scrapes_ops_controller"],
      on_error = "abort",
    },
    {
      id = "metric_keeper_signs_total",
      actor = 'Agent,
      name = "keeper_signs_total metric is present",
      cmd = "curl -sf '{prometheus_url}/api/v1/query?query=keeper_signs_total' | jq -e '.data.result | length > 0'",
      depends_on = ["prometheus_scrapes_ops_controller"],
      on_error = "warn",
    },
    {
      id = "metric_audit_mirror_lag",
      actor = 'Agent,
      name = "audit_mirror_lag_seconds metric is present",
      cmd = "curl -sf '{prometheus_url}/api/v1/query?query=audit_mirror_lag_seconds' | jq -e '.data.result | length > 0'",
      depends_on = ["prometheus_scrapes_ops_controller"],
      on_error = "warn",
    },

    # --- Grafana and Loki content checks -------------------------------------
    {
      id = "grafana_dashboards_loaded",
      actor = 'Agent,
      # NOTE(review): the name says "All four" but the command only looks up
      # the single dashboard uid "ops-controller-queue". Confirm the other
      # dashboard uids and extend the check, or narrow the name.
      name = "All four ops dashboards provisioned in Grafana",
      cmd = "curl -sf '{grafana_url}/api/dashboards/uid/ops-controller-queue' | jq -e '.dashboard.uid == \"ops-controller-queue\"'",
      depends_on = ["grafana_ready"],
      on_error = "warn",
    },
    {
      id = "loki_receives_ops_logs",
      actor = 'Agent,
      name = "Loki has received at least one log entry from ops-system namespace in last 15 minutes",
      # The URL is double-quoted so the shell expands the arithmetic/command
      # substitution; inside single quotes `$(...)` is sent to curl verbatim.
      # start = (now - 900 s) as a Unix epoch with nine zeros appended, i.e.
      # nanoseconds. `date +%s` plus shell arithmetic avoids GNU-only `date -d`.
      # Percent-encoded LogQL selector: {namespace="ops-system"}
      cmd = "curl -sf \"{loki_url}/loki/api/v1/query_range?query=%7Bnamespace%3D%22ops-system%22%7D&limit=1&start=$(( $(date +%s) - 900 ))000000000\" | jq -e '.data.result | length > 0'",
      depends_on = ["loki_ready"],
      on_error = "warn",
    },

    # --- Final report --------------------------------------------------------
    {
      id = "summary_report",
      actor = 'Agent,
      name = "Emit observability validation summary",
      cmd = "echo 'observability validation complete for workspace={workspace}'",
      # Lists every leaf check so the summary is only emitted after all of
      # them have run; the readiness and scrape steps are reached transitively
      # through these.
      depends_on = [
        "vector_daemonset_ready",
        "metric_ops_pending_queue_depth",
        "metric_ops_pending_oldest_age",
        "metric_keeper_signs_total",
        "metric_audit_mirror_lag",
        "grafana_dashboards_loaded",
        "loki_receives_ops_logs",
      ],
      on_error = "warn",
    },
  ],
}