Implement intelligent agent learning from Knowledge Graph execution history with per-task-type expertise tracking, recency bias, and learning curves.

## Phase 5.3 Implementation

### Learning Infrastructure (✅ Complete)

- LearningProfileService with per-task-type expertise metrics
- TaskTypeExpertise model tracking success_rate, confidence, learning curves
- Recency bias weighting: recent 7 days weighted 3x higher (exponential decay)
- Confidence scoring prevents overfitting: min(1.0, executions / 20)
- Learning curves computed from daily execution windows

### Agent Scoring Service (✅ Complete)

- Unified AgentScore combining SwarmCoordinator + learning profiles
- Scoring formula: 0.3*base + 0.5*expertise + 0.2*confidence
- Rank agents by combined score for intelligent assignment
- Support for recency-biased scoring (recent_success_rate)
- Methods: rank_agents, select_best, rank_agents_with_recency
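The learning and scoring math above reduces to three formulas. The sketch below is illustrative only: the struct fields, helper names, and the exact shape of the decay curve are assumptions rather than the real vapora-agents API, but the recency weighting, confidence cap, and 0.3/0.5/0.2 combination follow the bullets above.

```rust
// Illustrative sketch only: field names, helpers, and the exact decay curve
// are assumptions, not the real vapora-agents types. The formulas themselves
// follow the Phase 5.3 description.

/// One execution record converted from the Knowledge Graph.
struct ExecutionData {
    success: bool,
    age_days: f64, // days since the execution finished
}

/// Recency weight with a 7-day half-life: ~3x for an execution that just
/// happened, decaying toward 1x for old executions.
fn recency_weight(age_days: f64) -> f64 {
    1.0 + 2.0 * 0.5_f64.powf(age_days / 7.0)
}

/// Recency-biased success rate for one task type.
fn weighted_success_rate(history: &[ExecutionData]) -> f64 {
    let (mut hits, mut total) = (0.0, 0.0);
    for e in history {
        let w = recency_weight(e.age_days);
        total += w;
        if e.success {
            hits += w;
        }
    }
    if total == 0.0 { 0.0 } else { hits / total }
}

/// Confidence grows with sample size; the cap prevents overfitting to a
/// handful of executions.
fn confidence(total_executions: usize) -> f64 {
    (total_executions as f64 / 20.0).min(1.0)
}

/// Combined ranking score: 0.3*base + 0.5*expertise + 0.2*confidence.
fn combined_score(base_load: f64, expertise: f64, confidence: f64) -> f64 {
    0.3 * base_load + 0.5 * expertise + 0.2 * confidence
}

fn main() {
    let history = vec![
        ExecutionData { success: true, age_days: 1.0 },
        ExecutionData { success: true, age_days: 2.0 },
        ExecutionData { success: false, age_days: 30.0 },
        ExecutionData { success: true, age_days: 45.0 },
    ];
    let expertise = weighted_success_rate(&history);
    let conf = confidence(history.len());
    println!("score = {:.2}", combined_score(0.8, expertise, conf));
}
```

Capping confidence at 20 executions keeps a few lucky successes from dominating the expertise term until an agent has accumulated a meaningful history.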
### KG Integration (✅ Complete)

- KGPersistence::get_executions_for_task_type() - query by agent + task type
- KGPersistence::get_agent_executions() - all executions for agent
- Coordinator::load_learning_profile_from_kg() - core KG→Learning integration
- Coordinator::load_all_learning_profiles() - batch load for multiple agents
- Convert PersistedExecution → ExecutionData for learning calculations

### Agent Assignment Integration (✅ Complete)

- AgentCoordinator uses learning profiles for task assignment
- extract_task_type() infers task type from title/description
- assign_task() scores candidates using AgentScoringService
- Fallback to load-based selection if no learning data available
- Learning profiles stored in coordinator.learning_profiles RwLock

### Profile Adapter Enhancements (✅ Complete)

- create_learning_profile() - initialize empty profiles
- add_task_type_expertise() - set task-type expertise
- update_profile_with_learning() - update swarm profiles from learning

## Files Modified

### vapora-knowledge-graph/src/persistence.rs (+30 lines)

- get_executions_for_task_type(agent_id, task_type, limit)
- get_agent_executions(agent_id, limit)

### vapora-agents/src/coordinator.rs (+100 lines)

- load_learning_profile_from_kg() - core KG integration method
- load_all_learning_profiles() - batch loading for agents
- assign_task() already uses learning-based scoring via AgentScoringService

### Existing Complete Implementation

- vapora-knowledge-graph/src/learning.rs - calculation functions
- vapora-agents/src/learning_profile.rs - data structures and expertise
- vapora-agents/src/scoring.rs - unified scoring service
- vapora-agents/src/profile_adapter.rs - adapter methods

## Tests Passing

- learning_profile: 7 tests ✅
- scoring: 5 tests ✅
- profile_adapter: 6 tests ✅
- coordinator: learning-specific tests ✅

## Data Flow

1. Task arrives → AgentCoordinator::assign_task()
2. Extract task_type from description
3. Query KG for task-type executions (load_learning_profile_from_kg)
4. Calculate expertise with recency bias
5. Score candidates (SwarmCoordinator + learning)
6. Assign to top-scored agent
7. Execution result → KG → Update learning profiles

## Key Design Decisions

- ✅ Recency bias: 7-day half-life with 3x weight for recent performance
- ✅ Confidence scoring: min(1.0, total_executions / 20) prevents overfitting
- ✅ Hierarchical scoring: 30% base load, 50% expertise, 20% confidence
- ✅ KG query limit: 100 recent executions per task-type for performance
- ✅ Async loading: load_learning_profile_from_kg supports concurrent loads

## Next: Phase 5.4 - Cost Optimization

Ready to implement budget enforcement and cost-aware provider selection.
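To tie the pieces together, below is a minimal sketch of the assignment path laid out in the Data Flow section (steps 1-6). Types and signatures are simplified assumptions; only the overall shape (task-type extraction, KG-derived learning profiles, the 0.3/0.5/0.2 score, load-based fallback) follows the description above.

```rust
// Illustrative sketch of the assignment path. Signatures are assumptions,
// not the real AgentCoordinator / AgentScoringService API.

use std::collections::HashMap;

struct AgentScore {
    agent_id: String,
    score: f64,
}

/// Per-agent expertise for each task type, derived from KG execution history.
struct LearningProfile {
    expertise: HashMap<String, f64>,  // task_type -> recency-weighted success rate
    confidence: HashMap<String, f64>, // task_type -> min(1.0, executions / 20)
}

/// Rough task-type inference from the task title/description.
fn extract_task_type(title: &str, description: &str) -> String {
    let text = format!("{title} {description}").to_lowercase();
    for t in ["review", "test", "deploy", "design", "implement"] {
        if text.contains(t) {
            return t.to_string();
        }
    }
    "general".to_string()
}

/// Combined score: 0.3*base + 0.5*expertise + 0.2*confidence.
fn score_agent(base: f64, profile: &LearningProfile, task_type: &str) -> f64 {
    let expertise = profile.expertise.get(task_type).copied().unwrap_or(0.0);
    let confidence = profile.confidence.get(task_type).copied().unwrap_or(0.0);
    0.3 * base + 0.5 * expertise + 0.2 * confidence
}

/// Rank candidates and pick the best; fall back to the base (load-based)
/// score when no learning data exists for an agent.
fn assign_task(
    title: &str,
    description: &str,
    candidates: &[(String, f64)],                // (agent_id, base load score)
    profiles: &HashMap<String, LearningProfile>, // loaded from the KG
) -> Option<AgentScore> {
    let task_type = extract_task_type(title, description);
    let mut ranked: Vec<AgentScore> = candidates
        .iter()
        .map(|(id, base)| {
            let score = match profiles.get(id) {
                Some(p) => score_agent(*base, p, &task_type),
                None => *base, // fallback: load-based selection only
            };
            AgentScore { agent_id: id.clone(), score }
        })
        .collect();
    ranked.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
    ranked.into_iter().next()
}

fn main() {
    let mut profiles = HashMap::new();
    profiles.insert(
        "agent-dev-1".to_string(),
        LearningProfile {
            expertise: HashMap::from([("implement".to_string(), 0.9)]),
            confidence: HashMap::from([("implement".to_string(), 0.6)]),
        },
    );
    let candidates = vec![
        ("agent-dev-1".to_string(), 0.5),
        ("agent-dev-2".to_string(), 0.7),
    ];
    let best = assign_task("Implement KG cache", "implement caching layer", &candidates, &profiles);
    println!("{:?}", best.map(|s| (s.agent_id, s.score)));
}
```

In the actual coordinator, profiles are loaded asynchronously from the KG via load_learning_profile_from_kg (capped at 100 recent executions per task type) and held behind the coordinator.learning_profiles RwLock rather than passed in as a plain map.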
```yaml
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: scale-agents
  description: Dynamically scale agent pools based on queue depth and workload
spec:
  version: "0.2.0"
  namespace: vapora-system
  timeout: 600s            # 10 minutes max
  schedule: "*/5 * * * *"  # Run every 5 minutes

  inputs:
    - name: target_agent_role
      type: string
      required: false
      description: "Specific agent role to scale (architect, developer, reviewer, etc.)"
    - name: queue_depth_threshold
      type: integer
      required: false
      default: 50
      description: "Queue depth threshold before scaling up"
    - name: cpu_threshold
      type: float
      required: false
      default: 0.75
      description: "CPU utilization threshold (0-1)"
    - name: memory_threshold
      type: float
      required: false
      default: 0.80
      description: "Memory utilization threshold (0-1)"

  phases:

    # Phase 1: Collect metrics
    - name: "Collect Metrics"
      description: "Gather queue depth, CPU, and memory metrics"
      retryable: true
      steps:
        - name: "Get queue depth per agent role"
          command: |
            provisioning metrics query --metric "agent_queue_depth" \
              --group-by "agent_role" \
              --output json > /tmp/queue_depth.json
          timeout: 30s

        - name: "Get CPU utilization"
          command: |
            provisioning metrics query --metric "container_cpu_usage_seconds_total" \
              --selector "pod=~vapora-agents.*" \
              --output json > /tmp/cpu_usage.json
          timeout: 30s

        - name: "Get memory utilization"
          command: |
            provisioning metrics query --metric "container_memory_working_set_bytes" \
              --selector "pod=~vapora-agents.*" \
              --output json > /tmp/memory_usage.json
          timeout: 30s

    # Phase 2: Analyze scaling requirements
    - name: "Analyze Scaling Needs"
      description: "Determine which agents need scaling up or down"
      retryable: true
      steps:
        - name: "Calculate scale requirements"
          command: |
            python3 <<'EOF'
            import json
            import os

            with open('/tmp/queue_depth.json', 'r') as f:
                queue_data = json.load(f)

            with open('/tmp/cpu_usage.json', 'r') as f:
                cpu_data = json.load(f)

            scaling_decisions = {}

            # Define queue depth thresholds per role
            role_thresholds = {
                "architect": {"scale_up": 10, "scale_down": 3},
                "developer": {"scale_up": 100, "scale_down": 30},
                "reviewer": {"scale_up": 50, "scale_down": 15},
                "tester": {"scale_up": 50, "scale_down": 15},
                "monitor": {"scale_up": 20, "scale_down": 5},
                "devops": {"scale_up": 30, "scale_down": 10}
            }

            for role, metrics in queue_data.items():
                thresholds = role_thresholds.get(role, {"scale_up": 50, "scale_down": 15})
                current_queue = metrics.get("queue_depth", 0)
                current_replicas = metrics.get("replicas", 1)

                if current_queue > thresholds["scale_up"]:
                    # Scale up
                    desired_replicas = min(int(current_replicas * 1.5) + 1, 20)  # Max 20
                    scaling_decisions[role] = {
                        "action": "scale_up",
                        "current": current_replicas,
                        "desired": desired_replicas,
                        "reason": f"Queue depth {current_queue} > {thresholds['scale_up']}"
                    }
                elif current_queue < thresholds["scale_down"] and current_replicas > 2:
                    # Scale down
                    desired_replicas = max(int(current_replicas * 0.7), 2)  # Min 2
                    scaling_decisions[role] = {
                        "action": "scale_down",
                        "current": current_replicas,
                        "desired": desired_replicas,
                        "reason": f"Queue depth {current_queue} < {thresholds['scale_down']}"
                    }

            with open('/tmp/scaling_decisions.json', 'w') as f:
                json.dump(scaling_decisions, f, indent=2)

            print(json.dumps(scaling_decisions, indent=2))
            EOF
          timeout: 60s
          dependencies: ["Collect Metrics"]

    # Phase 3: Scale agents based on decisions
    - name: "Execute Scaling"
      description: "Apply scaling decisions to agent pools"
      retryable: true
      parallel: true
      steps:
        - name: "Scale developer agents"
          command: |
            DECISION=$(grep -E '"developer":|"desired":' /tmp/scaling_decisions.json | grep -A1 'developer')
            if echo "$DECISION" | grep -q 'scale_up\|scale_down'; then
              REPLICAS=$(echo "$DECISION" | grep '"desired"' | grep -oE '[0-9]+')
              provisioning taskserv scale vapora-agents --agent developer --replicas $REPLICAS
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

        - name: "Scale reviewer agents"
          command: |
            DECISION=$(grep -E '"reviewer":|"desired":' /tmp/scaling_decisions.json | grep -A1 'reviewer')
            if echo "$DECISION" | grep -q 'scale_up\|scale_down'; then
              REPLICAS=$(echo "$DECISION" | grep '"desired"' | grep -oE '[0-9]+')
              provisioning taskserv scale vapora-agents --agent reviewer --replicas $REPLICAS
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

        - name: "Scale tester agents"
          command: |
            DECISION=$(grep -E '"tester":|"desired":' /tmp/scaling_decisions.json | grep -A1 'tester')
            if echo "$DECISION" | grep -q 'scale_up\|scale_down'; then
              REPLICAS=$(echo "$DECISION" | grep '"desired"' | grep -oE '[0-9]+')
              provisioning taskserv scale vapora-agents --agent tester --replicas $REPLICAS
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

        - name: "Scale devops agents"
          command: |
            DECISION=$(grep -E '"devops":|"desired":' /tmp/scaling_decisions.json | grep -A1 'devops')
            if echo "$DECISION" | grep -q 'scale_up\|scale_down'; then
              REPLICAS=$(echo "$DECISION" | grep '"desired"' | grep -oE '[0-9]+')
              provisioning taskserv scale vapora-agents --agent devops --replicas $REPLICAS
            fi
          timeout: 120s
          dependencies: ["Calculate scale requirements"]
          continueOnError: true

    # Phase 4: Verify scaling
    - name: "Verify Scaling"
      description: "Confirm scaling operations succeeded"
      retryable: false
      steps:
        - name: "Check agent replicas"
          command: |
            provisioning taskserv list --selector "app=vapora-agents" \
              --output "json" | jq '.items[] | {agent: .metadata.labels.role, replicas: .spec.replicas}'
          timeout: 60s
          dependencies: ["Execute Scaling"]

        - name: "Wait for pods to be ready"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=vapora-agents \
              -n vapora-system \
              --timeout=300s
          timeout: 320s
          dependencies: ["Execute Scaling"]

        - name: "Verify queue depth improvement"
          command: |
            provisioning metrics query --metric "agent_queue_depth" \
              --group-by "agent_role" \
              --compare-to /tmp/queue_depth.json
          timeout: 30s
          dependencies: ["Wait for pods to be ready"]

  outputs:
    - name: scaling_summary
      value: "cat /tmp/scaling_decisions.json"
    - name: new_replica_counts
      command: "provisioning taskserv list --selector app=vapora-agents -o json | jq '.items[] | {agent: .metadata.labels.role, replicas: .spec.replicas}'"

  # Notifications
  notifications:
    onSuccess:
      - "slack: #ops-automation"
      - "action: record-metrics"
    onFailure:
      - "slack: #ops-automation"
      - "slack: #alerts"

  # Cleanup
  cleanup:
    - "rm -f /tmp/queue_depth.json"
    - "rm -f /tmp/cpu_usage.json"
    - "rm -f /tmp/memory_usage.json"
    - "rm -f /tmp/scaling_decisions.json"
```