---
# Prometheus alerting rules for the provisioning platform.
#
# The group is evaluated every 30 seconds. For each rule, `for` is how long
# the expression must keep returning a series before the alert fires, and
# `severity` is the label routing can key on (critical or warning here).
groups:
  - name: provisioning_platform_alerts
    interval: 30s
    rules:
      # Service availability
      - alert: ServiceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Service {{ $labels.job }} is down"
          description: "{{ $labels.job }} has been down for more than 2 minutes"

      # Orchestrator alerts
      - alert: OrchestratorHighTaskQueueSize
        expr: orchestrator_task_queue_size > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Orchestrator task queue is growing"
          description: "Task queue size is {{ $value }}, may indicate processing issues"

      - alert: OrchestratorHighFailureRate
        expr: rate(orchestrator_task_failures_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High task failure rate in orchestrator"
          description: "Task failure rate is {{ $value }} per second"

      # Resource alerts
      - alert: HighMemoryUsage
        # The `> 0` filter drops containers that have no memory limit (limit
        # reported as 0). Without it the division yields +Inf, which is
        # greater than 0.9 and would raise a false alert for every
        # unlimited container.
        expr: |-
          (
            container_memory_usage_bytes
              / (container_spec_memory_limit_bytes > 0)
          ) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.container_label_com_docker_compose_service }}"
          description: "Memory usage is at {{ $value | humanizePercentage }}"

      - alert: HighCPUUsage
        # Usage is in CPU-seconds per second (cores in use). The quota metric
        # is a CFS quota in microseconds per period, so quota / period is the
        # number of cores the container may use; dividing usage by that gives
        # the fraction of the limit consumed. `sum without (cpu)` removes the
        # per-CPU label from the usage series so both sides of the division
        # carry the same label set and can be matched one-to-one.
        expr: |-
          (
            sum without (cpu) (rate(container_cpu_usage_seconds_total[5m]))
              / (container_spec_cpu_quota / container_spec_cpu_period)
          ) > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.container_label_com_docker_compose_service }}"
          description: "CPU usage is at {{ $value | humanizePercentage }}"

      # Disk space alerts
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Low disk space on {{ $labels.mountpoint }}"
          description: "Only {{ $value | humanizePercentage }} disk space remaining"

      # PostgreSQL alerts
      - alert: PostgreSQLDown
        expr: pg_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL database has been down for more than 2 minutes"

      - alert: PostgreSQLHighConnections
        # Backend counts are reported per database, while max_connections is
        # a single server-wide setting. Summing across databases first gives
        # the server's total connections and leaves a label set that matches
        # the max_connections series.
        # NOTE(review): assumes datid/datname are the only labels that differ
        # between the two metrics - confirm against the exporter in use.
        expr: |-
          (
            sum without (datid, datname) (pg_stat_database_numbackends)
              / pg_settings_max_connections
          ) > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL connection pool is filling up"
          description: "{{ $value | humanizePercentage }} of max connections in use"

      # OCI Registry alerts
      - alert: OCIRegistryHighStorageUsage
        expr: (registry_storage_used_bytes / registry_storage_total_bytes) > 0.85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "OCI Registry storage is filling up"
          description: "Storage usage is at {{ $value | humanizePercentage }}"

      # API latency alerts
      - alert: HighAPILatency
        # No aggregation is applied, so a separate 95th percentile is computed
        # for every distinct label set of the bucket series (ignoring `le`).
        expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency on {{ $labels.job }}"
          description: "95th percentile latency is {{ $value }}s"

      # Error rate alerts
      - alert: HighErrorRate
        # Threshold is an absolute rate of 5xx responses per second for each
        # matching series, not a fraction of total requests.
        expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High error rate on {{ $labels.job }}"
          description: "Error rate is {{ $value }} per second"