# prvng_platform/monitoring/prometheus/rules/alerts.yml

# Prometheus alerting rules for the provisioning platform.
groups:
  # Rule group containing the alert definitions listed under `rules`.
  - name: provisioning_platform_alerts
    # Evaluation interval for this group: rules are evaluated every 30 seconds.
    interval: 30s
    rules:
      # Service availability
      # Critical: a target's `up` series has been 0 for 2 minutes straight.
      - alert: ServiceDown
        expr: 'up == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Service {{ $labels.job }} is down'
          description: '{{ $labels.job }} has been down for more than 2 minutes'

      # Orchestrator alerts
      # Warning: orchestrator_task_queue_size has stayed above 100 for 5 minutes.
      - alert: OrchestratorHighTaskQueueSize
        expr: 'orchestrator_task_queue_size > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Orchestrator task queue is growing'
          description: 'Task queue size is {{ $value }}, may indicate processing issues'

      # Warning: the 5-minute per-second rate of orchestrator_task_failures_total
      # has stayed above 0.1 for 5 minutes.
      - alert: OrchestratorHighFailureRate
        expr: 'rate(orchestrator_task_failures_total[5m]) > 0.1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High task failure rate in orchestrator'
          description: 'Task failure rate is {{ $value }} per second'

      # Resource alerts
      # Warning: a container has used more than 90% of its memory limit for
      # 5 minutes.
      #
      # The `> 0` filter on the denominator drops containers that have no memory
      # limit configured: cAdvisor exports their limit as 0, and dividing by it
      # yields +Inf, which would make this alert fire permanently for every
      # unlimited container.
      - alert: HighMemoryUsage
        expr: >-
          (container_memory_usage_bytes
          / (container_spec_memory_limit_bytes > 0))
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.container_label_com_docker_compose_service }}"
          description: "Memory usage is at {{ $value | humanizePercentage }}"

      # Warning: a container has used more than 90% of its CPU quota for
      # 5 minutes.
      #
      # container_spec_cpu_quota is expressed in microseconds per CFS period,
      # not in cores, so the allowed number of cores is quota / period. Dividing
      # the usage rate (cores) by the raw quota gives a value around 1e-5 that
      # can never exceed 0.9.
      #
      # `sum without (cpu)` removes the `cpu` label that exists only on the
      # usage metric, so both sides of the division carry the same label set.
      # `> 0` skips containers that report no quota (avoids division by zero).
      - alert: HighCPUUsage
        expr: >-
          sum without (cpu) (rate(container_cpu_usage_seconds_total[5m]))
          / ((container_spec_cpu_quota > 0) / container_spec_cpu_period)
          > 0.9
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.container_label_com_docker_compose_service }}"
          description: "CPU usage is at {{ $value | humanizePercentage }}"

      # Disk space alerts
      # Warning: available/size ratio of a filesystem has been below 10% for
      # 5 minutes.
      # NOTE(review): there is no fstype/mountpoint selector, so every
      # filesystem series that is scraped is evaluated (possibly including
      # pseudo or read-only filesystems) — confirm that is intended.
      - alert: LowDiskSpace
        expr: '(node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Low disk space on {{ $labels.mountpoint }}'
          description: 'Only {{ $value | humanizePercentage }} disk space remaining'

      # PostgreSQL alerts
      # Critical: pg_up has been 0 for 2 minutes straight.
      - alert: PostgreSQLDown
        expr: 'pg_up == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'PostgreSQL is down'
          description: 'PostgreSQL database has been down for more than 2 minutes'

      # Warning: more than 80% of the server's max_connections have been in use
      # for 5 minutes.
      #
      # pg_stat_database_numbackends is reported per database (it carries
      # datname/datid labels) while pg_settings_max_connections is a single
      # server-wide value without those labels. Dividing them directly finds no
      # matching label sets, so the expression is always empty and the alert
      # never fires. Both sides are therefore aggregated to the server level
      # first: backends are summed across databases and the limit is reduced to
      # the same (instance, job, server) label set.
      - alert: PostgreSQLHighConnections
        expr: >-
          sum by (instance, job, server) (pg_stat_database_numbackends)
          / max by (instance, job, server) (pg_settings_max_connections)
          > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PostgreSQL connection pool is filling up"
          description: "{{ $value | humanizePercentage }} of max connections in use"

      # OCI Registry alerts
      # Warning: used/total registry storage ratio has been above 85% for
      # 10 minutes.
      - alert: OCIRegistryHighStorageUsage
        expr: '(registry_storage_used_bytes / registry_storage_total_bytes) > 0.85'
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'OCI Registry storage is filling up'
          description: 'Storage usage is at {{ $value | humanizePercentage }}'

      # API latency alerts
      # Warning: the 0.95 quantile estimated from the 5-minute rate of
      # http_request_duration_seconds_bucket has been above 1 for 5 minutes.
      - alert: HighAPILatency
        expr: 'histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High API latency on {{ $labels.job }}'
          description: '95th percentile latency is {{ $value }}s'

      # Error rate alerts
      # Warning: the 5-minute per-second rate of http_requests_total series
      # whose status matches 5.. has been above 0.05 for 5 minutes.
      - alert: HighErrorRate
        expr: 'rate(http_requests_total{status=~"5.."}[5m]) > 0.05'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High error rate on {{ $labels.job }}'
          description: 'Error rate is {{ $value }} per second'
core: init repo and codebase 2025-10-07 10:59:52 +01:00			`groups:`
			`- name: provisioning_platform_alerts`
			`interval: 30s`
			`rules:`
			`# Service availability`
			`- alert: ServiceDown`
			`expr: up == 0`
			`for: 2m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "Service {{ $labels.job }} is down"`
			`description: "{{ $labels.job }} has been down for more than 2 minutes"`

			`# Orchestrator alerts`
			`- alert: OrchestratorHighTaskQueueSize`
			`expr: orchestrator_task_queue_size > 100`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Orchestrator task queue is growing"`
			`description: "Task queue size is {{ $value }}, may indicate processing issues"`

			`- alert: OrchestratorHighFailureRate`
			`expr: rate(orchestrator_task_failures_total[5m]) > 0.1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High task failure rate in orchestrator"`
			`description: "Task failure rate is {{ $value }} per second"`

			`# Resource alerts`
			`- alert: HighMemoryUsage`
			`expr: (container_memory_usage_bytes / container_spec_memory_limit_bytes) > 0.9`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High memory usage on {{ $labels.container_label_com_docker_compose_service }}"`
			`description: "Memory usage is at {{ $value \| humanizePercentage }}"`

			`- alert: HighCPUUsage`
			`expr: (rate(container_cpu_usage_seconds_total[5m]) / container_spec_cpu_quota) > 0.9`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High CPU usage on {{ $labels.container_label_com_docker_compose_service }}"`
			`description: "CPU usage is at {{ $value \| humanizePercentage }}"`

			`# Disk space alerts`
			`- alert: LowDiskSpace`
			`expr: (node_filesystem_avail_bytes / node_filesystem_size_bytes) < 0.1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "Low disk space on {{ $labels.mountpoint }}"`
			`description: "Only {{ $value \| humanizePercentage }} disk space remaining"`

			`# PostgreSQL alerts`
			`- alert: PostgreSQLDown`
			`expr: pg_up == 0`
			`for: 2m`
			`labels:`
			`severity: critical`
			`annotations:`
			`summary: "PostgreSQL is down"`
			`description: "PostgreSQL database has been down for more than 2 minutes"`

			`- alert: PostgreSQLHighConnections`
			`expr: (pg_stat_database_numbackends / pg_settings_max_connections) > 0.8`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "PostgreSQL connection pool is filling up"`
			`description: "{{ $value \| humanizePercentage }} of max connections in use"`

			`# OCI Registry alerts`
			`- alert: OCIRegistryHighStorageUsage`
			`expr: (registry_storage_used_bytes / registry_storage_total_bytes) > 0.85`
			`for: 10m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "OCI Registry storage is filling up"`
			`description: "Storage usage is at {{ $value \| humanizePercentage }}"`

			`# API latency alerts`
			`- alert: HighAPILatency`
			`expr: histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m])) > 1`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High API latency on {{ $labels.job }}"`
			`description: "95th percentile latency is {{ $value }}s"`

			`# Error rate alerts`
			`- alert: HighErrorRate`
			`expr: rate(http_requests_total{status=~"5.."}[5m]) > 0.05`
			`for: 5m`
			`labels:`
			`severity: warning`
			`annotations:`
			`summary: "High error rate on {{ $labels.job }}"`
			`description: "Error rate is {{ $value }} per second"`