116 lines
4.6 KiB
Text
116 lines
4.6 KiB
Text
{
|
|
# Identity and metadata of the observability validation workflow.
id = "validate-observability",
version = "1.0",
strategy = 'Override,
# NOTE(review): the description promises a scrape check for keeper-daemon, but
# the only step in this record that queries `up` filters on the ops-controller
# job — confirm whether a keeper-daemon scrape step is missing.
description = "Verify Vector → Loki → Grafana pipeline is healthy in libre-wuji; Prometheus is scraping ops-controller and keeper-daemon; required queue metrics are present (ADR-037 constraint pending-queue-ttl-monitored).",
|
|
# Caller-supplied inputs. Their names appear as `{name}` placeholders inside
# the step `cmd` strings below.
params = {
  # Only referenced by the final summary message.
  workspace
    | String
    | doc "Workspace name (e.g. 'libre-wuji')"
    | default
    = "libre-wuji",

  # NOTE(review): no `cmd` in this record contains `{kubeconfig}`; presumably
  # the runner exports it for kubectl — verify, otherwise this input is unused.
  kubeconfig
    | String
    | doc "Path to kubeconfig; uses KUBECONFIG env if empty"
    | default
    = "",

  # Base URLs of the three observability backends probed by the steps.
  prometheus_url
    | String
    | doc "Prometheus API base URL"
    | default
    = "http://prometheus.observability.svc.cluster.local:9090",

  grafana_url
    | String
    | doc "Grafana API base URL"
    | default
    = "http://grafana.observability.svc.cluster.local:3000",

  loki_url
    | String
    | doc "Loki API base URL"
    | default
    = "http://loki.observability.svc.cluster.local:3100",
},
|
|
steps = [
|
|
{
  # Root step (no dependencies): asks kubectl for the rollout status of the
  # `vector` DaemonSet in the `observability` namespace, waiting at most 30s.
  id = "vector_daemonset_ready",
  name = "Vector DaemonSet is fully rolled out",
  actor = 'Agent,
  depends_on = [],
  on_error = "abort",
  cmd = "kubectl -n observability rollout status daemonset/vector --timeout=30s",
},
|
|
{
  # Root step: probes Loki's /ready endpoint. `curl -f` turns an HTTP error
  # status into a non-zero exit code, `-s` silences progress output.
  id = "loki_ready",
  name = "Loki /ready endpoint responds 200",
  actor = 'Agent,
  depends_on = [],
  on_error = "abort",
  cmd = "curl -sf {loki_url}/ready",
},
|
|
{
  # Root step: probes Prometheus' /-/ready endpoint; every Prometheus query
  # step below is chained (directly or indirectly) behind this one.
  id = "prometheus_ready",
  name = "Prometheus /-/ready endpoint responds 200",
  actor = 'Agent,
  depends_on = [],
  on_error = "abort",
  cmd = "curl -sf {prometheus_url}/-/ready",
},
|
|
{
  # Root step: fetches Grafana's health document and additionally requires its
  # `database` field to equal "ok" (`jq -e` exits non-zero on false/null).
  # Failure is only a warning, unlike the Loki/Prometheus readiness probes.
  id = "grafana_ready",
  name = "Grafana /api/health endpoint responds 200",
  actor = 'Agent,
  depends_on = [],
  on_error = "warn",
  cmd = "curl -sf {grafana_url}/api/health | jq -e '.database == \"ok\"'",
},
|
|
{
  # Confirms Prometheus currently has a healthy scrape of the ops-controller
  # job. Runs only after the Prometheus readiness probe.
  id = "prometheus_scrapes_ops_controller",
  name = "Prometheus is scraping ops-controller metrics",
  actor = 'Agent,
  depends_on = ["prometheus_ready"],
  on_error = "abort",
  # The PromQL expression is percent-encoded; decoded it reads
  #   up{job="ops-controller"} == 1
  # Encoding matters for two reasons:
  #   * curl treats literal `{...}` in a URL as a glob set and strips the
  #     braces, corrupting the selector (and `{...}` is also this file's
  #     parameter placeholder syntax);
  #   * it matches how the Loki log-query step encodes its selector.
  # `== 1` is required because an `up` series still exists (with value 0) when
  # the last scrape failed, so a bare `up{...}` would pass for a down target.
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=up%7Bjob%3D%22ops-controller%22%7D%20%3D%3D%201' | jq -e '.data.result | length > 0'",
},
|
|
{
  # Instant query for `ops_pending_queue_depth`; passes when at least one
  # series is returned. Gated on the ops-controller scrape check.
  id = "metric_ops_pending_queue_depth",
  name = "ops_pending_queue_depth metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "abort",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=ops_pending_queue_depth' | jq -e '.data.result | length > 0'",
},
|
|
{
  # Instant query for `ops_pending_oldest_age_seconds`; passes when at least
  # one series is returned. Gated on the ops-controller scrape check.
  id = "metric_ops_pending_oldest_age",
  name = "ops_pending_oldest_age_seconds metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "abort",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=ops_pending_oldest_age_seconds' | jq -e '.data.result | length > 0'",
},
|
|
{
  # Instant query for `keeper_signs_total`; passes when at least one series is
  # returned. A miss is only a warning.
  # NOTE(review): this is gated on the ops-controller scrape step; if the
  # metric is exported by keeper-daemon (as the name suggests), confirm that
  # dependency is intended.
  id = "metric_keeper_signs_total",
  name = "keeper_signs_total metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "warn",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=keeper_signs_total' | jq -e '.data.result | length > 0'",
},
|
|
{
  # Instant query for `audit_mirror_lag_seconds`; passes when at least one
  # series is returned. A miss is only a warning.
  id = "metric_audit_mirror_lag",
  name = "audit_mirror_lag_seconds metric is present",
  actor = 'Agent,
  depends_on = ["prometheus_scrapes_ops_controller"],
  on_error = "warn",
  cmd = "curl -sf '{prometheus_url}/api/v1/query?query=audit_mirror_lag_seconds' | jq -e '.data.result | length > 0'",
},
|
|
{
  # Fetches one dashboard by UID from Grafana and checks the UID echoed back
  # in the response. Runs after the Grafana health probe; a miss is a warning.
  id = "grafana_dashboards_loaded",
  # The name previously claimed "All four ops dashboards", but the command
  # only ever looked up `ops-controller-queue`; it now states what is checked.
  # TODO(review): add lookups for the remaining three dashboards once their
  # UIDs are known — they are not visible in this file.
  name = "ops-controller-queue dashboard is provisioned in Grafana",
  actor = 'Agent,
  depends_on = ["grafana_ready"],
  on_error = "warn",
  cmd = "curl -sf '{grafana_url}/api/dashboards/uid/ops-controller-queue' | jq -e '.dashboard.uid == \"ops-controller-queue\"'",
},
|
|
{
  # Asks Loki for at most one log line from {namespace="ops-system"} (the
  # selector is percent-encoded in the URL) with a start time 15 minutes in
  # the past, and passes when the result set is non-empty.
  id = "loki_receives_ops_logs",
  name = "Loki has received at least one log entry from ops-system namespace in last 15 minutes",
  actor = 'Agent,
  depends_on = ["loki_ready"],
  on_error = "warn",
  # The URL is wrapped in double quotes on purpose. It used to be wrapped in
  # single quotes, which (a) prevented the shell from expanding `$(date ...)`
  # and (b) were terminated early by the quotes around '15 minutes ago',
  # splitting the URL into several arguments. Inside double quotes the command
  # substitution runs and `&` stays literal.
  # `date +%s` yields seconds; appending nine zeros gives the nanosecond
  # timestamp form. `date -d` is GNU date syntax.
  cmd = "curl -sf \"{loki_url}/loki/api/v1/query_range?query=%7Bnamespace%3D%22ops-system%22%7D&limit=1&start=$(date -d '15 minutes ago' +%s)000000000\" | jq -e '.data.result | length > 0'",
},
|
|
{
  # Final step: prints a completion line naming the workspace.
  id = "summary_report",
  name = "Emit observability validation summary",
  actor = 'Agent,
  # Lists every leaf check of the dependency graph so the "complete" message
  # is not emitted before all of them have run. The Vector rollout check and
  # the keeper/audit metric checks were previously missing from this list.
  # Readiness and scrape steps are covered transitively via their dependents.
  depends_on = [
    "vector_daemonset_ready",
    "metric_ops_pending_queue_depth",
    "metric_ops_pending_oldest_age",
    "metric_keeper_signs_total",
    "metric_audit_mirror_lag",
    "grafana_dashboards_loaded",
    "loki_receives_ops_logs",
  ],
  on_error = "warn",
  cmd = "echo 'observability validation complete for workspace={workspace}'",
},
|
|
],
|
|
}
|