---
# Prometheus alerting rules for the Lifecycle stack.
#
# One rule group, evaluated every 30s. Every rule carries a `severity` label
# (critical / warning) and a `service` label naming the affected component.

groups:
  - name: lifecycle-alerts
    interval: 30s
    rules:
      # API Server alerts

      # Fires when the lifecycle-api scrape target has reported up == 0
      # continuously for 2 minutes.
      - alert: APIServerDown
        expr: up{job="lifecycle-api"} == 0
        for: 2m
        labels:
          severity: critical
          service: api
        annotations:
          summary: "API Server is down"
          description: "The Lifecycle API server has been unreachable for 2 minutes"

      # Ratio of 5xx responses to all responses over a 5m rate window.
      # With no traffic the division yields NaN, which never satisfies the
      # comparison, so an idle API does not alert.
      - alert: APIHighErrorRate
        expr: |
          (
            sum(rate(http_requests_total{job="lifecycle-api", status=~"5.."}[5m]))
            /
            sum(rate(http_requests_total{job="lifecycle-api"}[5m]))
          ) > 0.05
        for: 5m
        labels:
          severity: warning
          service: api
        annotations:
          summary: "High API error rate"
          description: "API error rate is above 5%"

      # p95 request latency, computed from the request-duration histogram
      # aggregated across all series (only the `le` bucket label is kept).
      - alert: APIHighLatency
        expr: |
          histogram_quantile(
            0.95,
            sum(rate(http_request_duration_seconds_bucket{job="lifecycle-api"}[5m])) by (le)
          ) > 1.0
        for: 5m
        labels:
          severity: warning
          service: api
        annotations:
          summary: "High API latency"
          description: "API p95 latency exceeds 1 second"

      # NATS alerts

      - alert: NATSDown
        expr: up{job="nats"} == 0
        for: 1m
        labels:
          severity: critical
          service: nats
        annotations:
          summary: "NATS server is down"
          description: "NATS JetStream broker is unreachable"

      # Memory in use as a fraction of the configured maximum.
      # NOTE(review): metric names are taken as-is from the original rule;
      # confirm they match what the deployed NATS exporter actually exposes.
      - alert: NATSMemoryHigh
        expr: |
          nats_server_memory_bytes / nats_config_max_memory_bytes > 0.9
        for: 5m
        labels:
          severity: warning
          service: nats
        annotations:
          # Summary previously read "critical", contradicting the warning
          # severity and the 90% threshold in the description.
          summary: "NATS memory usage high"
          description: "NATS memory usage is above 90%"

      - alert: NATSConnectionsHigh
        expr: |
          nats_server_connections > 1000
        for: 5m
        labels:
          severity: warning
          service: nats
        annotations:
          summary: "High NATS connection count"
          description: "More than 1000 NATS connections detected"

      # Dashboard alerts

      - alert: DashboardDown
        expr: up{job="lifecycle-dashboard"} == 0
        for: 2m
        labels:
          severity: warning
          service: dashboard
        annotations:
          summary: "Dashboard is down"
          description: "The Lifecycle Dashboard has been unreachable for 2 minutes"

      # Kubernetes alerts

      # rate() returns restarts per second, so it is scaled by 600 to get
      # restarts per 10 minutes and compared against 1, matching the
      # description. The previous threshold (rate > 0.1) required roughly
      # 90 restarts within the 15m window before the rule could match.
      - alert: PodRestartingTooOften
        expr: |
          rate(kube_pod_container_status_restarts_total{namespace="lifecycle"}[15m]) * 600 > 1
        for: 5m
        labels:
          severity: warning
          service: kubernetes
        annotations:
          summary: "Pod restarting too frequently"
          description: "Pod {{ $labels.pod }} is restarting more than once per 10 minutes"

      # Used bytes as a fraction of capacity, per volume series.
      - alert: PersistentVolumeUsageHigh
        expr: |
          kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
        for: 10m
        labels:
          severity: warning
          service: kubernetes
        annotations:
          summary: "PV usage is high"
          description: "PersistentVolume {{ $labels.persistentvolumeclaim }} usage is above 85%"

      # Synthetic monitoring

      # NOTE(review): `up` only reports whether the scrape of the e2e job
      # succeeded. If that job is a prober/exporter, a failing check may still
      # scrape successfully - confirm whether a result metric should be used.
      - alert: EndToEndTestFailed
        expr: |
          up{job="lifecycle-e2e-test"} == 0
        for: 5m
        labels:
          severity: warning
          service: e2e
        annotations:
          summary: "End-to-end test failed"
          description: "Synthetic endpoint monitoring detected failures"

      # SLA monitoring

      # `up` is only ever 0 or 1, so comparing the raw sample with 0.99 is the
      # same as `up == 0`. Averaging it over a window gives the fraction of
      # successful scrapes, which is what the 99% threshold is meant to test.
      # NOTE(review): the 30m window mirrors the `for:` duration; confirm it
      # against the real SLA measurement window and the scrape interval (with
      # few samples in the window a single failed scrape drops below 99%).
      - alert: APIAvailabilityBelowSLA
        expr: |
          avg_over_time(up{job="lifecycle-api"}[30m]) < 0.99
        for: 30m
        labels:
          severity: critical
          service: api
        annotations:
          summary: "API availability below SLA"
          description: "API availability is below 99% SLA for 30 minutes"