Vapora/kubernetes/monitoring/prometheus-alerts.yaml

106 lines
3.9 KiB
YAML
Raw Normal View History

---
# ConfigMap "prometheus-vapora-alerts" (namespace "monitoring").
# Its single data key, vapora-alerts.yml, holds a Prometheus rule file with one
# rule group (vapora_analytics, evaluated every 30s) covering agent success
# rate, cost/budget, system health and analytics query latency.
# NOTE(review): presumably mounted into Prometheus and listed under
# `rule_files` — confirm against the Prometheus deployment.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-vapora-alerts
  namespace: monitoring
data:
  vapora-alerts.yml: |
    groups:
      - name: vapora_analytics
        interval: 30s
        rules:
          # Performance Alerts
          # Warning below 80% for 5m; the critical rule below covers < 60%.
          - alert: LowAgentSuccessRate
            expr: vapora_overall_success_rate < 0.8
            for: 5m
            labels:
              severity: warning
              component: analytics
            annotations:
              summary: "Low agent success rate: {{ $value | humanizePercentage }}"
              description: "Overall agent success rate is below 80% (current: {{ $value | humanizePercentage }})"
          - alert: CriticalAgentSuccessRate
            expr: vapora_overall_success_rate < 0.6
            for: 2m
            labels:
              severity: critical
              component: analytics
            annotations:
              summary: "Critical agent success rate: {{ $value | humanizePercentage }}"
              description: "Overall agent success rate is below 60% (current: {{ $value | humanizePercentage }})"

          # Cost Alerts
          - alert: HighExecutionCost
            expr: vapora_cost_per_task_cents > 100
            for: 10m
            labels:
              severity: warning
              component: cost
            annotations:
              summary: "High average cost per task: {{ $value | humanize }} cents"
              description: "Average cost per task has exceeded 100 cents (current: {{ $value | humanize }} cents)"
          # NOTE(review): this compares the raw value of a `_total` series. If
          # the metric is a monotonically increasing counter, the alert stays
          # firing until the series resets; consider increase(...[<window>])
          # once the intended window is known — confirm metric type.
          - alert: BudgetThresholdExceeded
            expr: vapora_budget_threshold_alerts_total > 0
            for: 1m
            labels:
              severity: warning
              component: budget
            annotations:
              summary: "Budget threshold alerts detected"
              description: "Budget threshold has been exceeded {{ $value | humanize }} times"

          # System Health Alerts
          # NOTE(review): `== 0` returns nothing when the series is missing, so
          # this rule stays silent if the metric is not being scraped — confirm
          # that a target-down alert covers that case.
          - alert: NoActiveAgents
            expr: vapora_active_agents == 0
            for: 1m
            labels:
              severity: critical
              component: agents
            annotations:
              summary: "No active agents"
              description: "No active agents detected. System cannot process tasks."
          # Alert on errors added during the last 5 minutes (as the description
          # states) instead of the lifetime counter value, so the alert can
          # resolve once errors stop.
          - alert: HighAnalyticsQueryErrors
            expr: increase(vapora_analytics_errors_total[5m]) > 10
            for: 5m
            labels:
              severity: warning
              component: analytics
            annotations:
              summary: "High analytics query errors: {{ $value | humanize }} errors"
              description: "More than 10 analytics query errors detected in the last 5 minutes"
          - alert: TaskExecutionStalled
            expr: rate(vapora_total_tasks_executed[5m]) < 0.1
            for: 10m
            labels:
              severity: warning
              component: execution
            annotations:
              summary: "Task execution rate is very low"
              description: "Less than 0.1 tasks/second being executed. System may be stalled."

          # Analytics Query Performance
          # histogram_quantile() needs per-bucket rates that carry the `le`
          # label; the bare metric name yields no usable result.
          # Assumes the metric is exported as a classic histogram with
          # `_bucket` series — TODO confirm against the exporter.
          - alert: SlowAnalyticsQueries
            expr: |
              histogram_quantile(
                0.95,
                sum by (le) (rate(vapora_analytics_query_duration_ms_bucket[5m]))
              ) > 5000
            for: 5m
            labels:
              severity: warning
              component: analytics
            annotations:
              summary: "Slow analytics queries detected"
              description: "95th percentile query duration exceeds 5 seconds (current: {{ $value | humanize }}ms)"

          # Budget Enforcement
          # NOTE(review): same raw `_total` comparison as
          # BudgetThresholdExceeded above; both rules fire once the value
          # exceeds 5 — confirm metric type and intended window.
          - alert: BudgetExceeded
            expr: vapora_budget_threshold_alerts_total > 5
            for: 2m
            labels:
              severity: critical
              component: budget
            annotations:
              summary: "Multiple budget threshold violations"
              description: "Budget has been exceeded multiple times. Cost control measures may be needed."