---
# Workflow: scale-agents
#
# Scheduled workflow that resizes the vapora agent pools:
#   1. Collect queue-depth, CPU and memory metrics into /tmp/*.json.
#   2. Derive per-role scaling decisions from queue depth
#      (written to /tmp/scaling_decisions.json).
#   3. Apply the decisions with `provisioning taskserv scale`.
#   4. Verify replica counts, pod readiness and queue depth.
#
# Requires `python3`, `jq`, `kubectl` and the `provisioning` CLI on the runner.
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: scale-agents
  description: Dynamically scale agent pools based on queue depth and workload

spec:
  version: "0.2.0"
  namespace: vapora-system
  timeout: 600s  # 10 minutes max
  schedule: "*/5 * * * *"  # Run every 5 minutes

  # NOTE(review): none of the steps below reference these inputs; the
  # per-role thresholds are hard-coded in "Calculate scale requirements".
  # Confirm whether they are meant to be wired in.
  inputs:
    - name: target_agent_role
      type: string
      required: false
      description: "Specific agent role to scale (architect, developer, reviewer, etc.)"

    - name: queue_depth_threshold
      type: integer
      required: false
      default: 50
      description: "Queue depth threshold before scaling up"

    - name: cpu_threshold
      type: float
      required: false
      default: 0.75
      description: "CPU utilization threshold (0-1)"

    - name: memory_threshold
      type: float
      required: false
      default: 0.80
      description: "Memory utilization threshold (0-1)"

  phases:
    # Phase 1: Collect metrics
    - name: "Collect Metrics"
      description: "Gather queue depth, CPU, and memory metrics"
      retryable: true
      steps:
        - name: "Get queue depth per agent role"
          command: |
            provisioning metrics query --metric "agent_queue_depth" \
              --group-by "agent_role" \
              --output json > /tmp/queue_depth.json
          timeout: 30s

        - name: "Get CPU utilization"
          command: |
            provisioning metrics query --metric "container_cpu_usage_seconds_total" \
              --selector "pod=~vapora-agents.*" \
              --output json > /tmp/cpu_usage.json
          timeout: 30s

        - name: "Get memory utilization"
          command: |
            provisioning metrics query --metric "container_memory_working_set_bytes" \
              --selector "pod=~vapora-agents.*" \
              --output json > /tmp/memory_usage.json
          timeout: 30s

    # Phase 2: Analyze scaling requirements
    - name: "Analyze Scaling Needs"
      description: "Determine which agents need scaling up or down"
      retryable: true
      steps:
        # Reads the collected metrics and writes one entry per role that needs
        # resizing to /tmp/scaling_decisions.json. Roles that need no change
        # are omitted from the output.
        - name: "Calculate scale requirements"
          command: |
            python3 <<'EOF'
            import json

            with open('/tmp/queue_depth.json', 'r') as f:
                queue_data = json.load(f)

            # NOTE(review): CPU data is loaded (which validates the file) but
            # is not yet used in the decisions below.
            with open('/tmp/cpu_usage.json', 'r') as f:
                cpu_data = json.load(f)

            scaling_decisions = {}

            # Define queue depth thresholds per role
            role_thresholds = {
                "architect": {"scale_up": 10, "scale_down": 3},
                "developer": {"scale_up": 100, "scale_down": 30},
                "reviewer": {"scale_up": 50, "scale_down": 15},
                "tester": {"scale_up": 50, "scale_down": 15},
                "monitor": {"scale_up": 20, "scale_down": 5},
                "devops": {"scale_up": 30, "scale_down": 10}
            }

            for role, metrics in queue_data.items():
                # Roles without an explicit entry fall back to 50 / 15.
                thresholds = role_thresholds.get(role, {"scale_up": 50, "scale_down": 15})
                current_queue = metrics.get("queue_depth", 0)
                current_replicas = metrics.get("replicas", 1)

                if current_queue > thresholds["scale_up"]:
                    # Scale up
                    desired_replicas = min(int(current_replicas * 1.5) + 1, 20)  # Max 20
                    scaling_decisions[role] = {
                        "action": "scale_up",
                        "current": current_replicas,
                        "desired": desired_replicas,
                        "reason": f"Queue depth {current_queue} > {thresholds['scale_up']}"
                    }
                elif current_queue < thresholds["scale_down"] and current_replicas > 2:
                    # Scale down
                    desired_replicas = max(int(current_replicas * 0.7), 2)  # Min 2
                    scaling_decisions[role] = {
                        "action": "scale_down",
                        "current": current_replicas,
                        "desired": desired_replicas,
                        "reason": f"Queue depth {current_queue} < {thresholds['scale_down']}"
                    }

            with open('/tmp/scaling_decisions.json', 'w') as f:
                json.dump(scaling_decisions, f, indent=2)

            print(json.dumps(scaling_decisions, indent=2))
            EOF
          timeout: 60s
          dependencies: ["Collect Metrics"]

    # Phase 3: Scale agents based on decisions
    #
    # Each step reads its role's entry from /tmp/scaling_decisions.json with jq
    # and scales only when the entry exists and its action is scale_up or
    # scale_down. The previous grep pipeline discarded the "action" line
    # before testing for it, so the scale command was never reached.
    #
    # NOTE(review): "architect" and "monitor" have thresholds in phase 2 but no
    # scale step here; confirm whether that is intentional.
    - name: "Execute Scaling"
      description: "Apply scaling decisions to agent pools"
      retryable: true
      parallel: true
      steps:
        - name: "Scale developer agents"
          command: |
            REPLICAS=$(jq -r '.developer | select(.action == "scale_up" or .action == "scale_down") | .desired' /tmp/scaling_decisions.json)
            if [ -n "$REPLICAS" ]; then
              provisioning taskserv scale vapora-agents --agent developer --replicas "$REPLICAS"
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

        - name: "Scale reviewer agents"
          command: |
            REPLICAS=$(jq -r '.reviewer | select(.action == "scale_up" or .action == "scale_down") | .desired' /tmp/scaling_decisions.json)
            if [ -n "$REPLICAS" ]; then
              provisioning taskserv scale vapora-agents --agent reviewer --replicas "$REPLICAS"
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

        - name: "Scale tester agents"
          command: |
            REPLICAS=$(jq -r '.tester | select(.action == "scale_up" or .action == "scale_down") | .desired' /tmp/scaling_decisions.json)
            if [ -n "$REPLICAS" ]; then
              provisioning taskserv scale vapora-agents --agent tester --replicas "$REPLICAS"
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

        - name: "Scale devops agents"
          command: |
            REPLICAS=$(jq -r '.devops | select(.action == "scale_up" or .action == "scale_down") | .desired' /tmp/scaling_decisions.json)
            if [ -n "$REPLICAS" ]; then
              provisioning taskserv scale vapora-agents --agent devops --replicas "$REPLICAS"
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

    # Phase 4: Verify scaling
    - name: "Verify Scaling"
      description: "Confirm scaling operations succeeded"
      retryable: false
      steps:
        - name: "Check agent replicas"
          command: |
            provisioning taskserv list --selector "app=vapora-agents" \
              --output "json" | jq '.items[] | {agent: .metadata.labels.role, replicas: .spec.replicas}'
          timeout: 60s
          dependencies: ["Execute Scaling"]

        # The step timeout (320s) is slightly longer than kubectl's own 300s
        # wait so that kubectl reports the failure rather than the runner.
        - name: "Wait for pods to be ready"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=vapora-agents \
              -n vapora-system \
              --timeout=300s
          timeout: 320s
          dependencies: ["Execute Scaling"]

        - name: "Verify queue depth improvement"
          command: |
            provisioning metrics query --metric "agent_queue_depth" \
              --group-by "agent_role" \
              --compare-to /tmp/queue_depth.json
          timeout: 30s
          dependencies: ["Wait for pods to be ready"]

  outputs:
    # Uses `command` (not `value`) so the file contents are emitted rather
    # than the literal string "cat ..."; matches new_replica_counts below.
    - name: scaling_summary
      command: "cat /tmp/scaling_decisions.json"

    - name: new_replica_counts
      command: "provisioning taskserv list --selector app=vapora-agents -o json | jq '.items[] | {agent: .metadata.labels.role, replicas: .spec.replicas}'"

  # Notifications
  notifications:
    onSuccess:
      - "slack: #ops-automation"
      - "action: record-metrics"
    onFailure:
      - "slack: #ops-automation"
      - "slack: #alerts"

  # Cleanup
  # Removes every temporary file written by phases 1 and 2.
  cleanup:
    - "rm -f /tmp/queue_depth.json"
    - "rm -f /tmp/cpu_usage.json"
    - "rm -f /tmp/memory_usage.json"
    - "rm -f /tmp/scaling_decisions.json"