222 lines
8.2 KiB
YAML
222 lines
8.2 KiB
YAML
|
|
apiVersion: provisioning.vapora.io/v1
|
||
|
|
kind: Workflow
|
||
|
|
metadata:
|
||
|
|
name: scale-agents
|
||
|
|
description: Dynamically scale agent pools based on queue depth and workload
|
||
|
|
spec:
|
||
|
|
version: "0.2.0"
|
||
|
|
namespace: vapora-system
|
||
|
|
# NOTE(review): the per-step timeouts declared below add up to 680s even when
# the four scaling steps run in parallel (3x30 + 60 + 120 + 60 + 320 + 30),
# which exceeds this limit; the "Wait for pods to be ready" step alone may take
# 300s, the same as the 5-minute schedule interval — confirm how the engine
# handles overlapping runs and whether this budget is intended.
timeout: 600s # 10 minutes max
|
||
|
|
schedule: "*/5 * * * *" # Run every 5 minutes
|
||
|
|
|
||
|
|
inputs:
|
||
|
|
- name: target_agent_role
|
||
|
|
type: string
|
||
|
|
required: false
|
||
|
|
description: "Specific agent role to scale (architect, developer, reviewer, etc.)"
|
||
|
|
- name: queue_depth_threshold
|
||
|
|
type: integer
|
||
|
|
required: false
|
||
|
|
default: 50
|
||
|
|
description: "Queue depth threshold before scaling up"
|
||
|
|
- name: cpu_threshold
|
||
|
|
type: float
|
||
|
|
required: false
|
||
|
|
default: 0.75
|
||
|
|
description: "CPU utilization threshold (0-1)"
|
||
|
|
- name: memory_threshold
|
||
|
|
type: float
|
||
|
|
required: false
|
||
|
|
default: 0.80
|
||
|
|
description: "Memory utilization threshold (0-1)"
|
||
|
|
|
||
|
|
phases:
|
||
|
|
|
||
|
|
# Phase 1: Collect metrics
|
||
|
|
- name: "Collect Metrics"
|
||
|
|
description: "Gather queue depth, CPU, and memory metrics"
|
||
|
|
retryable: true
|
||
|
|
steps:
|
||
|
|
- name: "Get queue depth per agent role"
|
||
|
|
command: |
|
||
|
|
provisioning metrics query --metric "agent_queue_depth" \
|
||
|
|
--group-by "agent_role" \
|
||
|
|
--output json > /tmp/queue_depth.json
|
||
|
|
timeout: 30s
|
||
|
|
|
||
|
|
- name: "Get CPU utilization"
|
||
|
|
command: |
|
||
|
|
provisioning metrics query --metric "container_cpu_usage_seconds_total" \
|
||
|
|
--selector "pod=~vapora-agents.*" \
|
||
|
|
--output json > /tmp/cpu_usage.json
|
||
|
|
timeout: 30s
|
||
|
|
|
||
|
|
- name: "Get memory utilization"
|
||
|
|
command: |
|
||
|
|
provisioning metrics query --metric "container_memory_working_set_bytes" \
|
||
|
|
--selector "pod=~vapora-agents.*" \
|
||
|
|
--output json > /tmp/memory_usage.json
|
||
|
|
timeout: 30s
|
||
|
|
|
||
|
|
# Phase 2: Analyze scaling requirements
|
||
|
|
- name: "Analyze Scaling Needs"
|
||
|
|
description: "Determine which agents need scaling up or down"
|
||
|
|
retryable: true
|
||
|
|
steps:
|
||
|
|
- name: "Calculate scale requirements"
|
||
|
|
command: |
|
||
|
|
python3 <<'EOF'
|
||
|
|
"""Decide per-role replica changes from the collected queue-depth metrics.

Reads /tmp/queue_depth.json, writes the decisions to
/tmp/scaling_decisions.json and echoes them on stdout.

Only queue depth drives the decision. The CPU and memory files written by the
metric-collection steps are not consulted here, so they are no longer loaded
(previously a missing CPU file aborted the script although its content was
never used).
"""
import json

QUEUE_DEPTH_PATH = '/tmp/queue_depth.json'
DECISIONS_PATH = '/tmp/scaling_decisions.json'

# Hard bounds applied to every agent pool.
MIN_REPLICAS = 2
MAX_REPLICAS = 20

# Define queue depth thresholds per role
ROLE_THRESHOLDS = {
    "architect": {"scale_up": 10, "scale_down": 3},
    "developer": {"scale_up": 100, "scale_down": 30},
    "reviewer": {"scale_up": 50, "scale_down": 15},
    "tester": {"scale_up": 50, "scale_down": 15},
    "monitor": {"scale_up": 20, "scale_down": 5},
    "devops": {"scale_up": 30, "scale_down": 10},
}

# Used for any role that has no explicit entry above.
DEFAULT_THRESHOLDS = {"scale_up": 50, "scale_down": 15}


def decide_scaling(queue_data):
    """Return a mapping of role -> scaling decision.

    Args:
        queue_data: mapping of role name to a metrics dict. Assumes each
            metrics dict carries integer "queue_depth" and "replicas" keys
            (missing keys default to 0 and 1) — TODO confirm against the
            actual `provisioning metrics query` output format.

    Returns:
        dict keyed by role. Each value has "action", "current", "desired" and
        "reason". Roles that need no change are omitted.
    """
    decisions = {}

    for role, metrics in queue_data.items():
        thresholds = ROLE_THRESHOLDS.get(role, DEFAULT_THRESHOLDS)
        current_queue = metrics.get("queue_depth", 0)
        current_replicas = metrics.get("replicas", 1)

        if current_queue > thresholds["scale_up"]:
            # Grow by roughly 50% plus one, capped at MAX_REPLICAS.
            desired_replicas = min(int(current_replicas * 1.5) + 1, MAX_REPLICAS)
            # A pool already at (or above) the cap must not produce a
            # "scale_up" that keeps or even lowers the replica count.
            if desired_replicas > current_replicas:
                decisions[role] = {
                    "action": "scale_up",
                    "current": current_replicas,
                    "desired": desired_replicas,
                    "reason": f"Queue depth {current_queue} > {thresholds['scale_up']}",
                }
        elif current_queue < thresholds["scale_down"] and current_replicas > MIN_REPLICAS:
            # Shrink to about 70% of the pool, never below MIN_REPLICAS.
            desired_replicas = max(int(current_replicas * 0.7), MIN_REPLICAS)
            decisions[role] = {
                "action": "scale_down",
                "current": current_replicas,
                "desired": desired_replicas,
                "reason": f"Queue depth {current_queue} < {thresholds['scale_down']}",
            }

    return decisions


def main():
    """Load the queue metrics, compute the decisions and persist/print them."""
    with open(QUEUE_DEPTH_PATH, 'r') as f:
        queue_data = json.load(f)

    scaling_decisions = decide_scaling(queue_data)

    with open(DECISIONS_PATH, 'w') as f:
        json.dump(scaling_decisions, f, indent=2)

    print(json.dumps(scaling_decisions, indent=2))


# A script fed to `python3` on stdin runs as __main__, so the heredoc still
# executes main(); the guard only keeps imports of this code side-effect free.
if __name__ == "__main__":
    main()
|
||
|
|
EOF
|
||
|
|
timeout: 60s
|
||
|
|
dependencies: ["Collect Metrics"]
|
||
|
|
|
||
|
|
# Phase 3: Scale agents based on decisions
|
||
|
|
- name: "Execute Scaling"
|
||
|
|
description: "Apply scaling decisions to agent pools"
|
||
|
|
retryable: true
|
||
|
|
parallel: true
|
||
|
|
steps:
|
||
|
|
- name: "Scale developer agents"
|
||
|
|
command: |
|
||
|
|
# Look up the desired replica count for this role with jq. The role key is
# absent from the decisions file when no scaling is needed, in which case jq
# prints nothing. (The previous grep pipeline filtered out the "action" line
# before testing for scale_up/scale_down, so the scale command never ran.)
REPLICAS=$(jq -r '.developer.desired // empty' /tmp/scaling_decisions.json)
if [ -n "$REPLICAS" ]; then
  provisioning taskserv scale vapora-agents --agent developer --replicas "$REPLICAS"
fi
|
||
|
|
timeout: 120s
|
||
|
|
dependencies: ["Calculate scale requirements"]
|
||
|
|
continueOnError: true
|
||
|
|
|
||
|
|
- name: "Scale reviewer agents"
|
||
|
|
command: |
|
||
|
|
# Look up the desired replica count for this role with jq. The role key is
# absent from the decisions file when no scaling is needed, in which case jq
# prints nothing. (The previous grep pipeline filtered out the "action" line
# before testing for scale_up/scale_down, so the scale command never ran.)
REPLICAS=$(jq -r '.reviewer.desired // empty' /tmp/scaling_decisions.json)
if [ -n "$REPLICAS" ]; then
  provisioning taskserv scale vapora-agents --agent reviewer --replicas "$REPLICAS"
fi
|
||
|
|
timeout: 120s
|
||
|
|
dependencies: ["Calculate scale requirements"]
|
||
|
|
continueOnError: true
|
||
|
|
|
||
|
|
- name: "Scale tester agents"
|
||
|
|
command: |
|
||
|
|
# Look up the desired replica count for this role with jq. The role key is
# absent from the decisions file when no scaling is needed, in which case jq
# prints nothing. (The previous grep pipeline filtered out the "action" line
# before testing for scale_up/scale_down, so the scale command never ran.)
REPLICAS=$(jq -r '.tester.desired // empty' /tmp/scaling_decisions.json)
if [ -n "$REPLICAS" ]; then
  provisioning taskserv scale vapora-agents --agent tester --replicas "$REPLICAS"
fi
|
||
|
|
timeout: 120s
|
||
|
|
dependencies: ["Calculate scale requirements"]
|
||
|
|
continueOnError: true
|
||
|
|
|
||
|
|
- name: "Scale devops agents"
|
||
|
|
command: |
|
||
|
|
# Look up the desired replica count for this role with jq. The role key is
# absent from the decisions file when no scaling is needed, in which case jq
# prints nothing. (The previous grep pipeline filtered out the "action" line
# before testing for scale_up/scale_down, so the scale command never ran.)
REPLICAS=$(jq -r '.devops.desired // empty' /tmp/scaling_decisions.json)
if [ -n "$REPLICAS" ]; then
  provisioning taskserv scale vapora-agents --agent devops --replicas "$REPLICAS"
fi
|
||
|
|
timeout: 120s
|
||
|
|
dependencies: ["Calculate scale requirements"]
|
||
|
|
continueOnError: true
|
||
|
|
|
||
|
|
# Phase 4: Verify scaling
|
||
|
|
- name: "Verify Scaling"
|
||
|
|
description: "Confirm scaling operations succeeded"
|
||
|
|
retryable: false
|
||
|
|
steps:
|
||
|
|
- name: "Check agent replicas"
|
||
|
|
command: |
|
||
|
|
provisioning taskserv list --selector "app=vapora-agents" \
|
||
|
|
--output "json" | jq '.items[] | {agent: .metadata.labels.role, replicas: .spec.replicas}'
|
||
|
|
timeout: 60s
|
||
|
|
dependencies: ["Execute Scaling"]
|
||
|
|
|
||
|
|
- name: "Wait for pods to be ready"
|
||
|
|
command: |
|
||
|
|
kubectl wait --for=condition=Ready pod \
|
||
|
|
-l app=vapora-agents \
|
||
|
|
-n vapora-system \
|
||
|
|
--timeout=300s
|
||
|
|
timeout: 320s
|
||
|
|
dependencies: ["Execute Scaling"]
|
||
|
|
|
||
|
|
- name: "Verify queue depth improvement"
|
||
|
|
command: |
|
||
|
|
provisioning metrics query --metric "agent_queue_depth" \
|
||
|
|
--group-by "agent_role" \
|
||
|
|
--compare-to /tmp/queue_depth.json
|
||
|
|
timeout: 30s
|
||
|
|
dependencies: ["Wait for pods to be ready"]
|
||
|
|
|
||
|
|
outputs:
|
||
|
|
- name: scaling_summary
|
||
|
|
# NOTE(review): this output stores a shell command string under `value:`, while
# the sibling output new_replica_counts uses `command:` — confirm the engine
# executes `value`; otherwise the summary is the literal text, not the file
# content. The cleanup section also deletes this file, so confirm outputs are
# evaluated before cleanup runs.
value: "cat /tmp/scaling_decisions.json"
|
||
|
|
- name: new_replica_counts
|
||
|
|
command: "provisioning taskserv list --selector app=vapora-agents -o json | jq '.items[] | {agent: .metadata.labels.role, replicas: .spec.replicas}'"
|
||
|
|
|
||
|
|
# Notifications
|
||
|
|
notifications:
|
||
|
|
onSuccess:
|
||
|
|
- "slack: #ops-automation"
|
||
|
|
- "action: record-metrics"
|
||
|
|
onFailure:
|
||
|
|
- "slack: #ops-automation"
|
||
|
|
- "slack: #alerts"
|
||
|
|
|
||
|
|
# Cleanup
|
||
|
|
cleanup:
|
||
|
|
- "rm -f /tmp/queue_depth.json"
|
||
|
|
- "rm -f /tmp/cpu_usage.json"
|
||
|
|
- "rm -f /tmp/memory_usage.json"
|
||
|
|
- "rm -f /tmp/scaling_decisions.json"
|