syntaxis/config/monitoring/prometheus-alerts.yml

groups:
  - name: lifecycle-alerts
    interval: 30s
    rules:
      # API Server alerts
      - alert: APIServerDown
        expr: up{job="lifecycle-api"} == 0
        for: 2m
        labels:
          severity: critical
          service: api
        annotations:
          summary: "API Server is down"
          description: "The Lifecycle API server has been unreachable for 2 minutes"
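      # 5xx responses as a fraction of all requests, over 5-minute rate windows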
      - alert: APIHighErrorRate
        expr: |
          (sum(rate(http_requests_total{job="lifecycle-api", status=~"5.."}[5m]))
            / sum(rate(http_requests_total{job="lifecycle-api"}[5m]))) > 0.05
        for: 5m
        labels:
          severity: warning
          service: api
        annotations:
          summary: "High API error rate"
          description: "API error rate is above 5%"
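      # p95 request latency derived from the duration histogram buckets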
      - alert: APIHighLatency
        expr: |
          histogram_quantile(0.95,
            sum(rate(http_request_duration_seconds_bucket{job="lifecycle-api"}[5m])) by (le)
          ) > 1.0
        for: 5m
        labels:
          severity: warning
          service: api
        annotations:
          summary: "High API latency"
          description: "API p95 latency exceeds 1 second"
      # NATS alerts
      - alert: NATSDown
        expr: up{job="nats"} == 0
        for: 1m
        labels:
          severity: critical
          service: nats
        annotations:
          summary: "NATS server is down"
          description: "NATS JetStream broker is unreachable"
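      # Memory in use as a fraction of the server's configured maximum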
      - alert: NATSMemoryHigh
        expr: |
          nats_server_memory_bytes / nats_config_max_memory_bytes > 0.9
        for: 5m
        labels:
          severity: warning
          service: nats
        annotations:
          summary: "NATS memory usage high"
          description: "NATS memory usage is above 90% of its configured maximum"
      - alert: NATSConnectionsHigh
        expr: |
          nats_server_connections > 1000
        for: 5m
        labels:
          severity: warning
          service: nats
        annotations:
          summary: "High NATS connection count"
          description: "More than 1000 NATS connections detected"
      # Dashboard alerts
      - alert: DashboardDown
        expr: up{job="lifecycle-dashboard"} == 0
        for: 2m
        labels:
          severity: warning
          service: dashboard
        annotations:
          summary: "Dashboard is down"
          description: "The Lifecycle Dashboard has been unreachable for 2 minutes"
      # Kubernetes alerts
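      # rate() is per-second: 0.1 corresponds to roughly one restart every
      # 10 seconds sustained over the 15-minute window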
      - alert: PodRestartingTooOften
        expr: |
          rate(kube_pod_container_status_restarts_total{namespace="lifecycle"}[15m]) > 0.1
        for: 5m
        labels:
          severity: warning
          service: kubernetes
        annotations:
          summary: "Pod restarting too frequently"
          description: "Pod {{ $labels.pod }} restart rate over the last 15 minutes exceeds 0.1 restarts per second"
      - alert: PersistentVolumeUsageHigh
        expr: |
          kubelet_volume_stats_used_bytes / kubelet_volume_stats_capacity_bytes > 0.85
        for: 10m
        labels:
          severity: warning
          service: kubernetes
        annotations:
          summary: "PV usage is high"
          description: "PersistentVolume {{ $labels.persistentvolumeclaim }} usage is above 85%"
      # Synthetic monitoring
      - alert: EndToEndTestFailed
        expr: |
          up{job="lifecycle-e2e-test"} == 0
        for: 5m
        labels:
          severity: warning
          service: e2e
        annotations:
          summary: "End-to-end test failed"
          description: "Synthetic endpoint monitoring detected failures"
      # SLA monitoring
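      # `up` is a 0/1 metric, so availability is measured as its average over
      # a 30-minute window rather than as an instantaneous comparison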
      - alert: APIAvailabilityBelowSLA
        expr: |
          avg_over_time(up{job="lifecycle-api"}[30m]) < 0.99
        for: 5m
        labels:
          severity: critical
          service: api
        annotations:
          summary: "API availability below SLA"
          description: "API availability over the last 30 minutes is below the 99% SLA target"
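
# ---------------------------------------------------------------------------
# Usage sketch (assumption): this file only defines alerting rules, so the
# Prometheus server must load it via `rule_files` and be pointed at an
# Alertmanager to deliver notifications. The mount path and Alertmanager
# address below are illustrative, not confirmed deployment values.
#
#   # prometheus.yml (excerpt)
#   rule_files:
#     - /etc/prometheus/rules/prometheus-alerts.yml
#   alerting:
#     alertmanagers:
#       - static_configs:
#           - targets: ["alertmanager:9093"]
# ---------------------------------------------------------------------------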