# File-viewer listing metadata (not part of the rules): YAML, 111 lines, 3.8 KiB.
---
groups:
  # The only rule group visible in this file; every alert below belongs to it.
  - name: 'provisioning_platform_alerts'
    # Per-group evaluation interval setting.
    interval: 30s
    # Alerting rules, one list item per alert.
    rules:
# Service availability
      # ServiceDown: critical alert raised once `up == 0` has held for 2 minutes.
      # Summary and description identify the affected series by its `job` label.
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: '{{ $labels.job }} has been down for more than 2 minutes'

# Orchestrator alerts
      # OrchestratorHighTaskQueueSize: warning raised once
      # `orchestrator_task_queue_size` has stayed above 100 for 5 minutes.
      # The description prints the sampled queue size through `$value`.
      - alert: OrchestratorHighTaskQueueSize
        expr: orchestrator_task_queue_size > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Orchestrator task queue is growing'
          description: 'Task queue size is {{ $value }}, may indicate processing issues'

# Orchestrator alerts: task failure rate
      # OrchestratorHighFailureRate: warning raised once the per-second rate of
      # `orchestrator_task_failures_total`, taken over a 5-minute window, has
      # stayed above 0.1 for 5 minutes. The description prints that rate.
      - alert: OrchestratorHighFailureRate
        expr: rate(orchestrator_task_failures_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High task failure rate in orchestrator'
          description: 'Task failure rate is {{ $value }} per second'

# Resource alerts
      # HighMemoryUsage: warning raised once a container has used more than 90%
      # of its memory limit for 5 minutes.
      #
      # The `> 0` filter drops series whose limit gauge is 0. Dividing by a zero
      # limit yields +Inf, which would satisfy `> 0.9` and fire for every such
      # container.
      # NOTE(review): cAdvisor reports a limit of 0 for containers that have no
      # memory limit configured - confirm these metrics come from cAdvisor.
      - alert: HighMemoryUsage
        expr: >-
          container_memory_usage_bytes
          / (container_spec_memory_limit_bytes > 0)
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.container_label_com_docker_compose_service }}"
          description: "Memory usage is at {{ $value | humanizePercentage }}"

# Resource alerts: CPU
      # HighCPUUsage: warning raised once a container has used more than 90% of
      # its CPU limit for 5 minutes.
      #
      # The limit in cores is quota / period: `container_spec_cpu_quota` is
      # expressed in microseconds per scheduling period, so dividing core-seconds
      # per second by the raw quota gives a value around 1e-5 that never
      # exceeds 0.9.
      # `sum without (cpu)` folds away a `cpu` label on the usage series (when
      # one is present) so both sides of the division carry matching label sets.
      # NOTE(review): assumes cAdvisor metric semantics - confirm the exporter.
      - alert: HighCPUUsage
        expr: >-
          sum without (cpu) (rate(container_cpu_usage_seconds_total[5m]))
          / (container_spec_cpu_quota / container_spec_cpu_period)
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.container_label_com_docker_compose_service }}"
          description: "CPU usage is at {{ $value | humanizePercentage }}"

# Disk space alerts
      # LowDiskSpace: warning raised once the ratio of
      # `node_filesystem_avail_bytes` to `node_filesystem_size_bytes` has stayed
      # below 0.1 for 5 minutes. The summary names the `mountpoint` label and
      # the description renders the remaining ratio as a percentage.
      - alert: LowDiskSpace
        expr: '(node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Low disk space on {{ $labels.mountpoint }}'
          description: 'Only {{ $value | humanizePercentage }} disk space remaining'

# PostgreSQL alerts
      # PostgreSQLDown: critical alert raised once `pg_up == 0` has held for
      # 2 minutes.
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'PostgreSQL is down'
          description: 'PostgreSQL database has been down for more than 2 minutes'

# PostgreSQL alerts: connection usage
      # PostgreSQLHighConnections: warning raised once more than 80% of
      # `max_connections` has been in use for 5 minutes.
      #
      # `pg_stat_database_numbackends` is reported per database, while
      # `pg_settings_max_connections` is a single per-server value. The backend
      # counts are therefore summed per scrape target and matched on
      # (instance, job). Dividing the raw series one-to-one finds no matching
      # label sets, so the alert could never fire.
      # NOTE(review): assumes one PostgreSQL server per (instance, job) target
      # - confirm against the exporter setup.
      - alert: PostgreSQLHighConnections
        expr: >-
          sum by (instance, job) (pg_stat_database_numbackends)
          / on (instance, job) pg_settings_max_connections
          > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL connection pool is filling up"
          description: "{{ $value | humanizePercentage }} of max connections in use"

# OCI Registry alerts
      # OCIRegistryHighStorageUsage: warning raised once the ratio of
      # `registry_storage_used_bytes` to `registry_storage_total_bytes` has
      # stayed above 0.85 for 10 minutes. The description renders that ratio as
      # a percentage.
      - alert: OCIRegistryHighStorageUsage
        expr: '(registry_storage_used_bytes / registry_storage_total_bytes) > 0.85'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'OCI Registry storage is filling up'
          description: 'Storage usage is at {{ $value | humanizePercentage }}'

# API latency alerts
      # HighAPILatency: warning raised once the 0.95 quantile, estimated from
      # the 5-minute rate of `http_request_duration_seconds_bucket`, has stayed
      # above 1 for 5 minutes. The summary names the `job` label and the
      # description prints the quantile with an "s" suffix.
      - alert: HighAPILatency
        expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High API latency on {{ $labels.job }}'
          description: '95th percentile latency is {{ $value }}s'

# Error rate alerts
      # HighErrorRate: warning raised once the per-second rate of
      # `http_requests_total` series whose `status` label matches `5..`, taken
      # over a 5-minute window, has stayed above 0.05 for 5 minutes.
      # This is an absolute rate per series, not a share of all requests.
      - alert: HighErrorRate
        expr: 'rate(http_requests_total{status=~"5.."}[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High error rate on {{ $labels.job }}'
          description: 'Error rate is {{ $value }} per second'