Merge _configs/ into config/ for single configuration directory. Update all path references. Changes: - Move _configs/* to config/ - Update .gitignore for new patterns - No code references to _configs/ found Impact: -1 root directory (layout_conventions.md compliance)
131 lines
3.9 KiB
YAML
131 lines
3.9 KiB
YAML
# Prometheus alerting rules, organised as a single rule group.
groups:
  - name: lifecycle-alerts
    # Every rule in this group is evaluated once per 30 seconds.
    interval: 30s
    rules:
# API Server alerts
|
|
- alert: APIServerDown
|
|
expr: up{job="lifecycle-api"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: critical
|
|
service: api
|
|
annotations:
|
|
summary: "API Server is down"
|
|
description: "The Lifecycle API server has been unreachable for 2 minutes"
|
|
|
|
- alert: APIHighErrorRate
|
|
expr: |
|
|
(sum(rate(http_requests_total{job="lifecycle-api", status=~"5.."}[5m]))
|
|
/ sum(rate(http_requests_total{job="lifecycle-api"}[5m]))) > 0.05
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: api
|
|
annotations:
|
|
summary: "High API error rate"
|
|
description: "API error rate is above 5%"
|
|
|
|
- alert: APIHighLatency
|
|
expr: |
|
|
histogram_quantile(0.95,
|
|
sum(rate(http_request_duration_seconds_bucket{job="lifecycle-api"}[5m])) by (le)
|
|
) > 1.0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: api
|
|
annotations:
|
|
summary: "High API latency"
|
|
description: "API p95 latency exceeds 1 second"
|
|
|
|
# NATS alerts
|
|
- alert: NATSDown
|
|
expr: up{job="nats"} == 0
|
|
for: 1m
|
|
labels:
|
|
severity: critical
|
|
service: nats
|
|
annotations:
|
|
summary: "NATS server is down"
|
|
description: "NATS JetStream broker is unreachable"
|
|
|
|
- alert: NATSMemoryHigh
|
|
expr: |
|
|
nats_server_memory_bytes / nats_config_max_memory_bytes > 0.9
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: nats
|
|
annotations:
|
|
summary: "NATS memory usage critical"
|
|
description: "NATS memory usage is above 90%"
|
|
|
|
- alert: NATSConnectionsHigh
|
|
expr: |
|
|
nats_server_connections > 1000
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: nats
|
|
annotations:
|
|
summary: "High NATS connection count"
|
|
description: "More than 1000 NATS connections detected"
|
|
|
|
# Dashboard alerts
|
|
- alert: DashboardDown
|
|
expr: up{job="lifecycle-dashboard"} == 0
|
|
for: 2m
|
|
labels:
|
|
severity: warning
|
|
service: dashboard
|
|
annotations:
|
|
summary: "Dashboard is down"
|
|
description: "The Lifecycle Dashboard has been unreachable for 2 minutes"
|
|
|
|
# Kubernetes alerts
|
|
- alert: PodRestartingTooOften
|
|
expr: |
|
|
rate(kube_pod_container_status_restarts_total{namespace="lifecycle"}[15m]) > 0.1
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: kubernetes
|
|
annotations:
|
|
summary: "Pod restarting too frequently"
|
|
description: "Pod {{ $labels.pod }} is restarting more than once per 10 minutes"
|
|
|
|
- alert: PersistentVolumeUsageHigh
|
|
expr: |
|
|
kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
|
|
for: 10m
|
|
labels:
|
|
severity: warning
|
|
service: kubernetes
|
|
annotations:
|
|
summary: "PV usage is high"
|
|
description: "PersistentVolume {{ $labels.persistentvolumeclaim }} usage is above 85%"
|
|
|
|
# Synthetic monitoring
|
|
- alert: EndToEndTestFailed
|
|
expr: |
|
|
up{job="lifecycle-e2e-test"} == 0
|
|
for: 5m
|
|
labels:
|
|
severity: warning
|
|
service: e2e
|
|
annotations:
|
|
summary: "End-to-end test failed"
|
|
description: "Synthetic endpoint monitoring detected failures"
|
|
|
|
# SLA monitoring
|
|
- alert: APIAvailabilityBelowSLA
|
|
expr: |
|
|
up{job="lifecycle-api"} < 0.99
|
|
for: 30m
|
|
labels:
|
|
severity: critical
|
|
service: api
|
|
annotations:
|
|
summary: "API availability below SLA"
|
|
description: "API availability is below 99% SLA for 30 minutes"
|