# Vapora/provisioning/vapora-wrksp/workflows/upgrade-vapora.yaml

# Workflow manifest header: resource type, identity, and run-wide settings.
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: upgrade-vapora
  description: Rolling upgrade of VAPORA services with zero downtime
spec:
  # Quoted so the value is always read as a string, never as a number.
  version: '0.2.0'
  namespace: vapora-system
  # Ceiling for the whole run: 1800 seconds = 30 minutes.
  timeout: 1800s

  # Caller-supplied parameters. Steps in this file consume them by mapping
  # ${input_name} into a step-level env variable.
  inputs:
    # Also used as the image version for the LLM Router and MCP Gateway steps.
    - name: backend_version
      type: string
      required: true
      description: "Target version for backend service (e.g., 0.3.0)"
    - name: frontend_version
      type: string
      required: true
      description: "Target version for frontend service"
    - name: agents_version
      type: string
      required: true
      description: "Target version for agent runtime"
    # NOTE(review): no step in this file reads upgrade_strategy; every upgrade
    # command passes `--strategy rolling` literally. Confirm whether the engine
    # consumes this input elsewhere before relying on it.
    - name: upgrade_strategy
      type: string
      required: false
      default: "rolling"
      description: "rolling | blue-green | canary"
    # The smoke tests guarded by this flag run AFTER a service rollout, not
    # before the upgrade, so the description says so.
    - name: skip_tests
      type: boolean
      required: false
      default: false
      description: "Skip smoke tests that run after each service rollout"
    - name: dry_run
      type: boolean
      required: false
      default: false
      description: "Perform dry-run without actual upgrades"

  phases:

    # Phase 1: Pre-upgrade checks
    # Checks cluster and service health, takes a backup, and records the
    # currently deployed backend image tag as a git tag before any change.
    - name: "Pre-Upgrade Validation"
      description: "Verify cluster health and prepare for upgrade"
      retryable: true
      steps:
        - name: "Check cluster health"
          command: "provisioning health-check --cluster"
          timeout: 300s

        - name: "Backup current state"
          # The label embeds a timestamp, so each run creates a distinct backup.
          command: |
            provisioning backup create --cluster vapora-cluster \
              --label pre-upgrade-$(date +%Y%m%d-%H%M%S)
          timeout: 600s

        - name: "Verify all services are running"
          command: "provisioning health-check --services all --strict"
          timeout: 300s

        - name: "Create git tag for current state"
          # The image tag is whatever follows the LAST ':' in the image
          # reference. Taking the second ':'-separated field instead would
          # return "5000/repo" for an image such as "registry:5000/repo:1.2.3".
          # NOTE(review): `git tag` fails when the tag already exists (for
          # example if this retryable phase is re-run) - confirm that is wanted.
          command: |
            CURRENT_BACKEND=$(kubectl get deployment vapora-backend -n vapora-system -o jsonpath='{.spec.template.spec.containers[0].image}')
            CURRENT_TAG=$(echo "$CURRENT_BACKEND" | sed 's/.*://')
            git tag -a "pre-upgrade-$CURRENT_TAG" -m "Pre-upgrade checkpoint"
          timeout: 60s

    # Phase 2: Drain traffic gracefully
    # Stops new agent work, flips the maintenance flag in the vapora-config
    # ConfigMap, then waits until no HTTP requests are in flight.
    - name: Prepare for Upgrade
      description: Gracefully drain and prepare services for upgrade
      retryable: true
      steps:
        # Step timeout (700s) leaves headroom over the drain's own 600s limit.
        - name: Drain agent queue
          timeout: 700s
          command: |
            provisioning agents drain --timeout 600s \
              --allow-new-work false

        - name: Enable maintenance mode
          timeout: 60s
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"maintenance_mode":"true"}}'

        # Step timeout (320s) leaves headroom over the wait's own 300s limit.
        - name: Wait for in-flight requests to complete
          timeout: 320s
          command: |
            provisioning metrics wait-for \
              --metric "http_requests_in_flight" \
              --target 0 \
              --timeout 300s

    # Phase 3: Database migration (if needed)
    # Backs up the database, applies the migration scripts for the target
    # backend version, then verifies the database. Not retryable.
    - name: "Database Migrations"
      description: "Apply database schema changes"
      retryable: false
      steps:
        - name: "Create database backup"
          command: |
            provisioning db backup --database surrealdb \
              --output backup-pre-upgrade-$(date +%s).sql
          timeout: 600s

        - name: "Run migration scripts"
          # - The migration directory follows the requested backend version
          #   instead of being pinned to a single release.
          # - Dry-run only reports what would be applied; it never touches the DB.
          # - An unmatched glob stays literal in POSIX shells, so non-existent
          #   paths are skipped rather than passed to `db execute`.
          # - On failure the step only exits non-zero; nothing is restored here.
          command: |
            MIGRATION_DIR="scripts/migrations/v$BACKEND_VERSION"
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would run migrations from $MIGRATION_DIR"
              exit 0
            fi
            FOUND=0
            for MIGRATION in "$MIGRATION_DIR"/*.surql; do
              [ -e "$MIGRATION" ] || continue
              FOUND=1
              echo "Running migration: $MIGRATION"
              provisioning db execute --database surrealdb --file "$MIGRATION" || {
                echo "Migration failed: $MIGRATION; aborting"
                exit 1
              }
            done
            if [ "$FOUND" -eq 0 ]; then
              echo "No migration scripts found in $MIGRATION_DIR; nothing to apply"
            fi
          timeout: 600s
          env:
            - name: BACKEND_VERSION
              value: "${backend_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Verify migration success"
          command: "provisioning db verify --database surrealdb"
          timeout: 300s

    # Phase 4: Update backend service
    # Rolls the backend deployment to the requested image, waits for the
    # rollout to finish, then smoke-tests the API unless tests are skipped.
    - name: "Upgrade Backend Service"
      description: "Rolling update of REST API backend"
      retryable: true
      steps:
        - name: "Update backend image"
          # The image reference is quoted so a user-supplied version can never
          # be word-split or glob-expanded by the shell.
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would update backend to vapora/backend:$BACKEND_VERSION"
            else
              provisioning taskserv upgrade vapora-backend \
                --image "vapora/backend:$BACKEND_VERSION" \
                --strategy rolling \
                --max-surge 1 \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: BACKEND_VERSION
              value: "${backend_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        # Step timeout (320s) leaves headroom over kubectl's own 300s limit.
        - name: "Wait for backend rollout"
          command: "kubectl rollout status deployment/vapora-backend -n vapora-system --timeout=300s"
          timeout: 320s

        # NOTE(review): continueOnError is set, so presumably a failing smoke
        # test does not stop the upgrade - confirm that this is intended.
        - name: "Run smoke tests"
          command: |
            if [ "$SKIP_TESTS" != "true" ]; then
              provisioning test smoke --api http://vapora-backend.vapora-system:8080 \
                --endpoints "/api/v1/health" "/api/v1/ready"
            fi
          timeout: 180s
          env:
            - name: SKIP_TESTS
              value: "${skip_tests}"
          continueOnError: true

    # Phase 5: Update LLM Router and MCP Gateway
    # Both components are upgraded to the backend version; the phase is marked
    # parallel. Each step maps the dry_run input into DRY_RUN because the
    # command tests that variable - without the mapping it is unset and a dry
    # run would perform the real upgrade.
    - name: "Upgrade Backend Components"
      description: "Update LLM Router and MCP Gateway in parallel"
      retryable: true
      parallel: true
      steps:
        - name: "Upgrade LLM Router"
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would update LLM Router to vapora/llm-router:$VERSION"
            else
              provisioning taskserv upgrade vapora-llm-router \
                --image "vapora/llm-router:$VERSION" \
                --strategy rolling \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: VERSION
              value: "${backend_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Upgrade MCP Gateway"
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would update MCP Gateway to vapora/mcp-gateway:$VERSION"
            else
              provisioning taskserv upgrade vapora-mcp-gateway \
                --image "vapora/mcp-gateway:$VERSION" \
                --strategy rolling \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: VERSION
              value: "${backend_version}"
            - name: DRY_RUN
              value: "${dry_run}"

    # Phase 6: Update agent runtime
    # Rolls the agent deployment to the requested version, then waits for the
    # agent pods to report Ready.
    - name: "Upgrade Agent Runtime"
      description: "Update agent runtime with safe rollout"
      retryable: true
      steps:
        - name: "Update agent image"
          # DRY_RUN must be mapped from the dry_run input: the command tests it,
          # and without the mapping it is unset, so a dry run would upgrade for
          # real.
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would update agents to vapora/agents:$VERSION"
            else
              provisioning taskserv upgrade vapora-agents \
                --image "vapora/agents:$VERSION" \
                --strategy rolling \
                --max-surge 1 \
                --max-unavailable 1 \
                --drain-timeout 300s
            fi
          timeout: 900s
          env:
            - name: VERSION
              value: "${agents_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        # Step timeout (620s) leaves headroom over kubectl's own 600s limit.
        - name: "Wait for agents to stabilize"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=vapora-agents \
              -n vapora-system \
              --timeout=600s
          timeout: 620s

    # Phase 7: Update frontend service
    # Rolls the frontend deployment to the requested version, waits for the
    # rollout, then smoke-tests the root endpoint unless tests are skipped.
    - name: "Upgrade Frontend Service"
      description: "Update UI frontend with minimal user impact"
      retryable: true
      steps:
        - name: "Update frontend image"
          # DRY_RUN must be mapped from the dry_run input: the command tests it,
          # and without the mapping it is unset, so a dry run would upgrade for
          # real.
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would update frontend to vapora/frontend:$VERSION"
            else
              provisioning taskserv upgrade vapora-frontend \
                --image "vapora/frontend:$VERSION" \
                --strategy rolling \
                --max-surge 1 \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: VERSION
              value: "${frontend_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        # Step timeout (320s) leaves headroom over kubectl's own 300s limit.
        - name: "Wait for frontend rollout"
          command: "kubectl rollout status deployment/vapora-frontend -n vapora-system --timeout=300s"
          timeout: 320s

        - name: "Test frontend endpoints"
          # SKIP_TESTS must be mapped from the skip_tests input; otherwise the
          # variable is unset and the tests run even when skipping was requested.
          command: |
            if [ "$SKIP_TESTS" != "true" ]; then
              provisioning test smoke --frontend http://vapora-frontend.vapora-system:3000 \
                --endpoints "/"
            fi
          timeout: 180s
          env:
            - name: SKIP_TESTS
              value: "${skip_tests}"

    # Phase 8: Post-upgrade verification
    # Turns maintenance mode off, health-checks services and agent messaging,
    # runs integration tests, scans backend logs, and re-enables agent work.
    - name: "Post-Upgrade Verification"
      description: "Comprehensive validation of upgraded system"
      retryable: false
      steps:
        - name: "Disable maintenance mode"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"maintenance_mode":"false"}}'
          timeout: 60s

        - name: "Health check all services"
          command: "provisioning health-check --services all --strict"
          timeout: 300s

        - name: "Verify agent communication"
          command: "provisioning agents health-check --nats nats://nats-0.vapora-system:4222"
          timeout: 120s

        # Step timeout (620s) leaves headroom over the test run's own 600s limit.
        - name: "Run integration tests"
          command: "provisioning test integration --timeout 600s"
          timeout: 620s
          continueOnError: true

        - name: "Check application logs for errors"
          # `grep -c` prints 0 but exits with status 1 when nothing matches, so
          # `|| true` keeps a clean log from failing the step under an errexit
          # shell. `-E` gives portable alternation instead of the GNU-only `\|`.
          # More than 10 matching lines is treated as a failure.
          command: |
            ERROR_COUNT=$(kubectl logs -n vapora-system -l app=vapora-backend --tail=1000 | grep -cE 'ERROR|CRITICAL' || true)
            if [ "$ERROR_COUNT" -gt 10 ]; then
              echo "WARNING: Found $ERROR_COUNT errors in backend logs"
              exit 1
            fi
          timeout: 120s
          continueOnError: true

        - name: "Re-enable agent work"
          command: "provisioning agents drain --disable"
          timeout: 60s

    # Phase 9: Tag and document upgrade
    # Records the completed upgrade as an annotated git tag and generates a
    # markdown upgrade report.
    - name: "Finalize Upgrade"
      description: "Document upgrade completion"
      retryable: false
      steps:
        - name: "Create upgrade completion tag"
          # A dry run deploys nothing, so it must not leave an "upgraded-to-..."
          # tag behind; it only reports the tag it would have created.
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would create git tag upgraded-to-$BACKEND_VERSION"
            else
              git tag -a "upgraded-to-$BACKEND_VERSION-$(date +%Y%m%d-%H%M%S)" \
                -m "Upgrade completed: backend=$BACKEND_VERSION, frontend=$FRONTEND_VERSION, agents=$AGENTS_VERSION"
            fi
          timeout: 60s
          env:
            - name: BACKEND_VERSION
              value: "${backend_version}"
            - name: FRONTEND_VERSION
              value: "${frontend_version}"
            - name: AGENTS_VERSION
              value: "${agents_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Generate upgrade report"
          command: |
            provisioning report generate \
              --type upgrade \
              --format markdown \
              --output "upgrade-report-$(date +%Y%m%d-%H%M%S).md"
          timeout: 120s

  # Rollback procedure
  # Restores the most recent pre-upgrade backup, undoes the maintenance mode
  # and agent drain that the "Prepare for Upgrade" phase may have applied, and
  # finally re-checks cluster health.
  onFailure:
    rollback: true
    procedure:
      # The label pattern is single-quoted so the shell hands the wildcard to
      # the CLI instead of expanding it against files in the working directory.
      - name: "Restore from pre-upgrade backup"
        command: "provisioning backup restore --label 'pre-upgrade-*' --latest"
      # NOTE(review): assumes these two commands are safe to repeat when the
      # failure happened before maintenance mode or the drain was applied.
      - name: "Disable maintenance mode"
        command: |
          kubectl patch configmap vapora-config \
            -n vapora-system \
            -p '{"data":{"maintenance_mode":"false"}}'
      - name: "Re-enable agent work"
        command: "provisioning agents drain --disable"
      - name: "Verify rollback success"
        command: "provisioning health-check --cluster"

  # Values reported when the workflow finishes.
  outputs:
    # The text is a shell command, so it is declared under `command` like the
    # output below rather than under `value`.
    # NOTE(review): assumes `value` is taken literally by the engine - confirm.
    - name: upgrade_status
      command: "echo 'Upgrade completed successfully'"
    - name: versions_deployed
      command: "kubectl get deployment -n vapora-system -o wide"

  # Notification entries per lifecycle event. Every entry is quoted because it
  # contains ': ' (and sometimes '#'), which would otherwise be parsed as YAML
  # syntax.
  # NOTE(review): how the engine interprets entries such as
  # 'slack: notify: ...' and 'severity: critical' is not visible in this file.
  notifications:
    onStart:
      - 'slack: #deployment'
      - 'email: devops@example.com'
    onSuccess:
      - 'slack: #deployment'
      - 'slack: notify: Upgrade successful'
    onFailure:
      - 'slack: #deployment'
      - 'slack: #alerts'
      - 'email: devops@example.com'
      - 'severity: critical'