---
# Workflow: upgrade-vapora
#
# Upgrades the VAPORA backend, LLM router, MCP gateway, agent runtime and
# frontend in nine sequential phases: validation, traffic drain, database
# migration, per-service upgrades, verification and finalization. When a
# phase fails, the `onFailure` procedure at the bottom of the file is used.
#
# Inputs reach step scripts only through each step's `env` list (written as
# "${input_name}"). A script that reads $DRY_RUN or $SKIP_TESTS must
# therefore declare that variable in its own `env`; otherwise the variable
# is unset and the `!= "true"` guards always evaluate to true.
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: upgrade-vapora
  description: Rolling upgrade of VAPORA services with zero downtime

spec:
  version: "0.2.0"
  namespace: vapora-system
  # NOTE(review): the per-step timeouts below add up to far more than this
  # value; confirm whether this is a whole-workflow cap.
  timeout: 1800s  # 30 minutes max

  inputs:
    - name: backend_version
      type: string
      required: true
      description: "Target version for backend service (e.g., 0.3.0)"

    - name: frontend_version
      type: string
      required: true
      description: "Target version for frontend service"

    - name: agents_version
      type: string
      required: true
      description: "Target version for agent runtime"

    # NOTE(review): no step in this file reads upgrade_strategy; every
    # upgrade command hard-codes `--strategy rolling`.
    - name: upgrade_strategy
      type: string
      required: false
      default: "rolling"
      description: "rolling | blue-green | canary"

    - name: skip_tests
      type: boolean
      required: false
      default: false
      description: "Skip smoke tests before upgrade"

    # Only the image-upgrade steps check this flag; the drain, maintenance
    # mode, backup and migration steps run regardless.
    - name: dry_run
      type: boolean
      required: false
      default: false
      description: "Perform dry-run without actual upgrades"

  phases:
    # Phase 1: Pre-upgrade checks
    - name: "Pre-Upgrade Validation"
      description: "Verify cluster health and prepare for upgrade"
      retryable: true
      steps:
        - name: "Check cluster health"
          command: "provisioning health-check --cluster"
          timeout: 300s

        - name: "Backup current state"
          command: |
            provisioning backup create --cluster vapora-cluster \
              --label pre-upgrade-$(date +%Y%m%d-%H%M%S)
          timeout: 600s

        - name: "Verify all services are running"
          command: "provisioning health-check --services all --strict"
          timeout: 300s

        - name: "Create git tag for current state"
          # Tags the repository with the image tag currently deployed.
          # ${VAR##*:} keeps only the text after the LAST colon, so an image
          # reference that includes a registry port (host:5000/img:tag)
          # still yields the tag rather than the port.
          command: |
            CURRENT_BACKEND=$(kubectl get deployment vapora-backend -n vapora-system -o jsonpath='{.spec.template.spec.containers[0].image}')
            git tag -a "pre-upgrade-${CURRENT_BACKEND##*:}" -m "Pre-upgrade checkpoint"
          timeout: 60s

    # Phase 2: Drain traffic gracefully
    - name: "Prepare for Upgrade"
      description: "Gracefully drain and prepare services for upgrade"
      retryable: true
      steps:
        - name: "Drain agent queue"
          command: |
            provisioning agents drain --timeout 600s \
              --allow-new-work false
          timeout: 700s

        - name: "Enable maintenance mode"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"maintenance_mode":"true"}}'
          timeout: 60s

        - name: "Wait for in-flight requests to complete"
          command: |
            provisioning metrics wait-for \
              --metric "http_requests_in_flight" \
              --target 0 \
              --timeout 300s
          timeout: 320s

    # Phase 3: Database migration (if needed)
    - name: "Database Migrations"
      description: "Apply database schema changes"
      retryable: false
      steps:
        - name: "Create database backup"
          command: |
            provisioning db backup --database surrealdb \
              --output backup-pre-upgrade-$(date +%s).sql
          timeout: 600s

        - name: "Run migration scripts"
          # Executes every .surql file in the directory and aborts the step
          # on the first failure. The script itself performs no restore; it
          # only prints the message and exits non-zero.
          # NOTE(review): the migrations path is pinned to v0.3.0 and does
          # not follow the backend_version input - confirm this is intended.
          command: |
            for MIGRATION in scripts/migrations/v0.3.0/*.surql; do
              echo "Running migration: $MIGRATION"
              provisioning db execute --database surrealdb --file "$MIGRATION" || {
                echo "Migration failed, restoring backup"
                exit 1
              }
            done
          timeout: 600s

        - name: "Verify migration success"
          command: "provisioning db verify --database surrealdb"
          timeout: 300s

    # Phase 4: Update backend service
    - name: "Upgrade Backend Service"
      description: "Rolling update of REST API backend"
      retryable: true
      steps:
        - name: "Update backend image"
          command: |
            if [ "$DRY_RUN" = "true" ]; then
              echo "[DRY-RUN] Would update backend to vapora/backend:$BACKEND_VERSION"
            else
              provisioning taskserv upgrade vapora-backend \
                --image vapora/backend:$BACKEND_VERSION \
                --strategy rolling \
                --max-surge 1 \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: BACKEND_VERSION
              value: "${backend_version}"
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Wait for backend rollout"
          command: "kubectl rollout status deployment/vapora-backend -n vapora-system --timeout=300s"
          timeout: 320s

        - name: "Run smoke tests"
          command: |
            if [ "$SKIP_TESTS" != "true" ]; then
              provisioning test smoke --api http://vapora-backend.vapora-system:8080 \
                --endpoints "/api/v1/health" "/api/v1/ready"
            fi
          timeout: 180s
          env:
            - name: SKIP_TESTS
              value: "${skip_tests}"
          continueOnError: true

    # Phase 5: Update LLM Router and MCP Gateway
    - name: "Upgrade Backend Components"
      description: "Update LLM Router and MCP Gateway in parallel"
      retryable: true
      parallel: true
      steps:
        - name: "Upgrade LLM Router"
          command: |
            if [ "$DRY_RUN" != "true" ]; then
              provisioning taskserv upgrade vapora-llm-router \
                --image vapora/llm-router:$VERSION \
                --strategy rolling \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: VERSION
              value: "${backend_version}"
            # Required by the guard above; without it a dry run would still
            # perform the upgrade.
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Upgrade MCP Gateway"
          command: |
            if [ "$DRY_RUN" != "true" ]; then
              provisioning taskserv upgrade vapora-mcp-gateway \
                --image vapora/mcp-gateway:$VERSION \
                --strategy rolling \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: VERSION
              value: "${backend_version}"
            # Required by the guard above; without it a dry run would still
            # perform the upgrade.
            - name: DRY_RUN
              value: "${dry_run}"

    # Phase 6: Update agent runtime
    - name: "Upgrade Agent Runtime"
      description: "Update agent runtime with safe rollout"
      retryable: true
      steps:
        - name: "Update agent image"
          command: |
            if [ "$DRY_RUN" != "true" ]; then
              provisioning taskserv upgrade vapora-agents \
                --image vapora/agents:$VERSION \
                --strategy rolling \
                --max-surge 1 \
                --max-unavailable 1 \
                --drain-timeout 300s
            fi
          timeout: 900s
          env:
            - name: VERSION
              value: "${agents_version}"
            # Required by the guard above; without it a dry run would still
            # perform the upgrade.
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Wait for agents to stabilize"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=vapora-agents \
              -n vapora-system \
              --timeout=600s
          timeout: 620s

    # Phase 7: Update frontend service
    - name: "Upgrade Frontend Service"
      description: "Update UI frontend with minimal user impact"
      retryable: true
      steps:
        - name: "Update frontend image"
          command: |
            if [ "$DRY_RUN" != "true" ]; then
              provisioning taskserv upgrade vapora-frontend \
                --image vapora/frontend:$VERSION \
                --strategy rolling \
                --max-surge 1 \
                --max-unavailable 0
            fi
          timeout: 600s
          env:
            - name: VERSION
              value: "${frontend_version}"
            # Required by the guard above; without it a dry run would still
            # perform the upgrade.
            - name: DRY_RUN
              value: "${dry_run}"

        - name: "Wait for frontend rollout"
          command: "kubectl rollout status deployment/vapora-frontend -n vapora-system --timeout=300s"
          timeout: 320s

        - name: "Test frontend endpoints"
          command: |
            if [ "$SKIP_TESTS" != "true" ]; then
              provisioning test smoke --frontend http://vapora-frontend.vapora-system:3000 \
                --endpoints "/"
            fi
          timeout: 180s
          env:
            # Required by the guard above; without it skip_tests is ignored
            # for the frontend.
            - name: SKIP_TESTS
              value: "${skip_tests}"

    # Phase 8: Post-upgrade verification
    - name: "Post-Upgrade Verification"
      description: "Comprehensive validation of upgraded system"
      retryable: false
      steps:
        - name: "Disable maintenance mode"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"maintenance_mode":"false"}}'
          timeout: 60s

        - name: "Health check all services"
          command: "provisioning health-check --services all --strict"
          timeout: 300s

        - name: "Verify agent communication"
          command: "provisioning agents health-check --nats nats://nats-0.vapora-system:4222"
          timeout: 120s

        - name: "Run integration tests"
          command: "provisioning test integration --timeout 600s"
          timeout: 620s
          continueOnError: true

        - name: "Check application logs for errors"
          # Fails when more than 10 of the last 1000 backend log lines
          # contain ERROR or CRITICAL; continueOnError is set on this step.
          command: |
            ERROR_COUNT=$(kubectl logs -n vapora-system -l app=vapora-backend --tail=1000 | grep -c 'ERROR\|CRITICAL')
            if [ "$ERROR_COUNT" -gt 10 ]; then
              echo "WARNING: Found $ERROR_COUNT errors in backend logs"
              exit 1
            fi
          timeout: 120s
          continueOnError: true

        - name: "Re-enable agent work"
          command: "provisioning agents drain --disable"
          timeout: 60s

    # Phase 9: Tag and document upgrade
    - name: "Finalize Upgrade"
      description: "Document upgrade completion"
      retryable: false
      steps:
        - name: "Create upgrade completion tag"
          command: |
            git tag -a "upgraded-to-$BACKEND_VERSION-$(date +%Y%m%d-%H%M%S)" \
              -m "Upgrade completed: backend=$BACKEND_VERSION, frontend=$FRONTEND_VERSION, agents=$AGENTS_VERSION"
          timeout: 60s
          env:
            - name: BACKEND_VERSION
              value: "${backend_version}"
            - name: FRONTEND_VERSION
              value: "${frontend_version}"
            - name: AGENTS_VERSION
              value: "${agents_version}"

        - name: "Generate upgrade report"
          command: |
            provisioning report generate \
              --type upgrade \
              --format markdown \
              --output "upgrade-report-$(date +%Y%m%d-%H%M%S).md"
          timeout: 120s

  # Rollback procedure
  # NOTE(review): these steps do not switch maintenance mode off or
  # re-enable agent work; confirm the backup restore covers both.
  onFailure:
    rollback: true
    procedure:
      - name: "Restore from pre-upgrade backup"
        # The label pattern is single-quoted so the shell passes it through
        # verbatim instead of expanding it against local file names.
        command: "provisioning backup restore --label 'pre-upgrade-*' --latest"

      - name: "Verify rollback success"
        command: "provisioning health-check --cluster"

  outputs:
    # Declared with `command`, like the output below, so the result is the
    # echoed message rather than the literal text of the echo command.
    - name: upgrade_status
      command: "echo 'Upgrade completed successfully'"

    - name: versions_deployed
      command: "kubectl get deployment -n vapora-system -o wide"

  notifications:
    onStart:
      - "slack: #deployment"
      - "email: devops@example.com"

    onSuccess:
      - "slack: #deployment"
      - "slack: notify: Upgrade successful"

    onFailure:
      - "slack: #deployment"
      - "slack: #alerts"
      - "email: devops@example.com"
      - "severity: critical"