# Vapora/provisioning/vapora-wrksp/workflows/deploy-full-stack.yaml

# Workflow resource describing the full-stack VAPORA deployment.
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: deploy-full-stack
  description: >-
    Complete VAPORA deployment from scratch including cluster,
    databases, and services
spec:
  # Workflow identity and global execution limits.
  version: '0.2.0'
  namespace: vapora-system
  # Ceiling for the whole run: 3600 seconds, i.e. one hour.
  timeout: 3600s
  # Workflow-level retry settings.
  retryPolicy:
    maxRetries: 3
    backoffFactor: 2

  # Preconditions declared for this workflow: two named checks followed by
  # a resource requirement carrying cpu, memory, and disk figures.
  prerequisites:
    - kubeconfig_present
    - provisioning_cli_installed
    - sufficient_resources:
        # Quoted so the value stays a string rather than an integer.
        cpu: '20'
        memory: 64Gi
        disk: 500Gi

  # Workflow phases executed sequentially with gates
  phases:

    # Phase 1: Infrastructure foundation.
    # Builds the cluster, then installs networking (Cilium), the service
    # mesh (Istio), and storage (Rook Ceph) before a final health check.
    - name: "Create K8s Cluster"
      description: "Deploy base Kubernetes cluster with networking"
      retryable: true
      steps:
        # onError names a handler to run on failure; that handler is not
        # defined in this file — TODO confirm where "rollback_cluster" lives.
        - name: "Apply KCL cluster schema"
          command: "provisioning cluster create --config kcl/cluster.k"
          timeout: 1200s
          onError: "rollback_cluster"

        - name: "Install CNI (Cilium)"
          command: "provisioning addon install cilium --helm-values cilium-values.yaml"
          timeout: 300s
          retries: 3

        # Dependencies reference steps by their exact `name`, the same way
        # every other phase in this workflow does.
        - name: "Install service mesh (Istio)"
          command: "provisioning addon install istio --config kcl/cluster.k"
          timeout: 600s
          dependencies:
            - "Install CNI (Cilium)"

        - name: "Install storage (Rook Ceph)"
          command: "provisioning addon install rook-ceph --size 500Gi --replicas 3"
          timeout: 900s
          dependencies:
            - "Install CNI (Cilium)"

        # Runs only after all three addons have been installed.
        - name: "Verify cluster health"
          command: "provisioning health-check --cluster"
          timeout: 300s
          dependencies:
            - "Install CNI (Cilium)"
            - "Install service mesh (Istio)"
            - "Install storage (Rook Ceph)"

    # Phase 2: namespace, service accounts, and RBAC.
    # The first two steps pipe inline manifests to `kubectl apply` through a
    # shell heredoc; the third delegates RBAC to the provisioning CLI.
    - name: 'Setup Namespaces and Security'
      description: 'Create namespaces, service accounts, and RBAC policies'
      retryable: true
      steps:
        # Creates the vapora-system namespace labelled app=vapora.
        - name: 'Create namespaces'
          command: |-
            kubectl apply -f - <<EOF
            apiVersion: v1
            kind: Namespace
            metadata:
              name: vapora-system
              labels:
                app: vapora
            EOF
          timeout: 60s
          dependencies:
            - 'Create K8s Cluster'

        # Creates the vapora-backend and vapora-agents service accounts in
        # vapora-system from a two-document manifest.
        - name: 'Create service accounts'
          command: |-
            kubectl apply -f - <<EOF
            apiVersion: v1
            kind: ServiceAccount
            metadata:
              name: vapora-backend
              namespace: vapora-system
            ---
            apiVersion: v1
            kind: ServiceAccount
            metadata:
              name: vapora-agents
              namespace: vapora-system
            EOF
          timeout: 60s
          dependencies:
            - 'Create namespaces'

        - name: 'Apply RBAC policies'
          command: 'provisioning rbac apply --config kcl/services.k --namespace vapora-system'
          timeout: 120s
          dependencies:
            - 'Create service accounts'

    # Phase 3: Deploy storage layer.
    # SurrealDB, Redis, and NATS are deployed one after another from the
    # same taskserv config file, each gated by its own health check, and the
    # SurrealDB schema is initialised from scripts/migrations.
    - name: "Deploy Storage Infrastructure"
      description: "Deploy SurrealDB, Redis, and NATS"
      retryable: true
      parallel: false  # Must be sequential for data consistency
      steps:
        - name: "Deploy SurrealDB"
          command: "provisioning taskserv deploy --config taskservs/vapora-storage.toml --component surrealdb"
          timeout: 600s
          dependencies: ["Setup Namespaces and Security"]
          healthCheck:
            type: "http"
            endpoint: "http://surrealdb-0.surrealdb.vapora-system:8000/health"
            interval: 10s
            maxRetries: 30

        - name: "Deploy Redis"
          command: "provisioning taskserv deploy --config taskservs/vapora-storage.toml --component redis"
          timeout: 300s
          dependencies: ["Deploy SurrealDB"]
          healthCheck:
            # NOTE(review): where this exec command runs (which pod/host) is
            # decided by the workflow engine and is not visible here — confirm.
            type: "exec"
            command: "redis-cli ping"
            interval: 5s
            maxRetries: 20

        - name: "Deploy NATS JetStream"
          command: "provisioning taskserv deploy --config taskservs/vapora-storage.toml --component nats"
          timeout: 300s
          dependencies: ["Deploy Redis"]
          healthCheck:
            type: "http"
            # Host is namespace-qualified like every other endpoint in this
            # workflow, so the probe does not depend on the caller's namespace.
            endpoint: "http://nats-0.vapora-system:8222/varz"
            interval: 10s
            maxRetries: 20

        # Only needs SurrealDB itself, not Redis or NATS.
        - name: "Initialize database schema"
          command: "provisioning db init --database surrealdb --schema-dir scripts/migrations"
          timeout: 180s
          dependencies: ["Deploy SurrealDB"]

    # Phase 4: backend tier.
    # Three deployments flagged as parallel; each waits on the storage phase
    # and is gated by an HTTP health probe.
    - name: 'Deploy Backend Services'
      description: 'Deploy Axum backend, LLM router, and MCP gateway'
      retryable: true
      parallel: true
      steps:
        # REST API, probed on port 8080.
        - name: 'Deploy REST API backend'
          command: 'provisioning taskserv deploy vapora-backend'
          timeout: 300s
          dependencies:
            - 'Deploy Storage Infrastructure'
          healthCheck:
            type: http
            endpoint: 'http://vapora-backend.vapora-system:8080/api/v1/health'
            interval: 10s
            maxRetries: 20

        # LLM router, probed on port 8899.
        - name: 'Deploy Multi-IA LLM Router'
          command: 'provisioning taskserv deploy vapora-llm-router'
          timeout: 300s
          dependencies:
            - 'Deploy Storage Infrastructure'
          healthCheck:
            type: http
            endpoint: 'http://vapora-llm-router.vapora-system:8899/health'
            interval: 10s
            maxRetries: 20

        # MCP gateway, probed on port 8888.
        - name: 'Deploy MCP Gateway'
          command: 'provisioning taskserv deploy vapora-mcp-gateway'
          timeout: 300s
          dependencies:
            - 'Deploy Storage Infrastructure'
          healthCheck:
            type: http
            endpoint: 'http://vapora-mcp-gateway.vapora-system:8888/health'
            interval: 10s
            maxRetries: 20

    # Phase 5: Deploy agent runtime.
    # Deploys vapora-agents with 3 replicas and waits until the custom
    # health script sees at least 2 pods in the Running phase.
    - name: "Deploy Agent Runtime"
      description: "Deploy 12-agent orchestrator with initial replicas"
      retryable: true
      steps:
        - name: "Deploy agent runtime pods"
          command: "provisioning taskserv deploy vapora-agents --replicas 3"
          timeout: 600s
          dependencies: ["Deploy Backend Services"]
          healthCheck:
            type: "custom"
            # --no-headers keeps kubectl's column-header row out of the
            # `wc -l` count; without it a single Running pod would be
            # counted as 2 and satisfy the threshold.
            script: |
              AGENT_COUNT=$(kubectl get pods -n vapora-system -l app=vapora-agents --field-selector=status.phase=Running --no-headers | wc -l)
              if [ "$AGENT_COUNT" -ge 2 ]; then
                echo "OK: $AGENT_COUNT agents running"
                exit 0
              else
                echo "ERROR: Only $AGENT_COUNT agents running"
                exit 1
              fi
            interval: 30s
            maxRetries: 20

    # Phase 6: frontend.
    # Single deployment that waits on the agent runtime phase and is probed
    # over HTTP at the site root on port 3000.
    - name: 'Deploy Frontend'
      description: 'Deploy Leptos CSR frontend'
      retryable: true
      steps:
        - name: 'Deploy frontend application'
          command: 'provisioning taskserv deploy vapora-frontend'
          timeout: 300s
          dependencies:
            - 'Deploy Agent Runtime'
          healthCheck:
            type: http
            endpoint: 'http://vapora-frontend.vapora-system:3000/'
            interval: 10s
            maxRetries: 20

    # Phase 7: monitoring and observability.
    # Three addon installs flagged as parallel, all targeting the
    # `monitoring` namespace and all waiting on the frontend phase.
    # NOTE(review): no visible step creates the `monitoring` namespace —
    # confirm that `provisioning addon install` creates it.
    - name: 'Setup Monitoring Stack'
      description: 'Deploy Prometheus, Grafana, and Loki'
      retryable: true
      parallel: true
      steps:
        - name: 'Deploy Prometheus'
          command: 'provisioning addon install prometheus --namespace monitoring'
          timeout: 300s
          dependencies:
            - 'Deploy Frontend'

        - name: 'Deploy Grafana'
          command: 'provisioning addon install grafana --namespace monitoring'
          timeout: 300s
          dependencies:
            - 'Deploy Frontend'

        - name: 'Deploy Loki (log aggregation)'
          command: 'provisioning addon install loki --namespace monitoring'
          timeout: 300s
          dependencies:
            - 'Deploy Frontend'

    # Phase 8: ingress and routing.
    # Applies an Istio Gateway for vapora.example.com (HTTP on 80, HTTPS on
    # 443 using the `vapora-tls` credential), then applies routing through
    # the provisioning CLI.
    # NOTE(review): nothing visible in this workflow creates `vapora-tls` —
    # confirm it is provisioned elsewhere.
    - name: 'Configure Ingress and Networking'
      description: 'Setup Istio gateway and ingress rules'
      retryable: true
      steps:
        # Inline manifest piped to kubectl through a shell heredoc.
        - name: 'Apply Istio gateway configuration'
          command: |-
            kubectl apply -f - <<EOF
            apiVersion: networking.istio.io/v1beta1
            kind: Gateway
            metadata:
              name: vapora-gateway
              namespace: vapora-system
            spec:
              selector:
                istio: ingressgateway
              servers:
              - port:
                  number: 80
                  name: http
                  protocol: HTTP
                hosts:
                - "vapora.example.com"
              - port:
                  number: 443
                  name: https
                  protocol: HTTPS
                tls:
                  mode: SIMPLE
                  credentialName: vapora-tls
                hosts:
                - "vapora.example.com"
            EOF
          timeout: 60s
          dependencies:
            - 'Deploy Frontend'

        - name: 'Apply VirtualService routing'
          command: 'provisioning istio apply --config kcl/cluster.k --namespace vapora-system'
          timeout: 120s
          dependencies:
            - 'Apply Istio gateway configuration'

    # Phase 9: Post-deployment verification.
    # Not retryable. A service-wide health check runs first; the smoke
    # tests, database check, and agent check each depend on it.
    - name: "Verify Full Stack"
      description: "Run comprehensive health checks and smoke tests"
      retryable: false
      steps:
        # Fails the step with an explicit message when the health check
        # exits non-zero.
        - name: "Check all services are running"
          command: |
            if ! provisioning health-check --services all; then
              echo "ERROR: Some services are not healthy"
              exit 1
            fi
          timeout: 300s
          dependencies: ["Configure Ingress and Networking"]

        # `set -e` aborts on the first failing smoke test; without it the
        # script's exit status is that of the last command only, so a failed
        # API test would be masked by a passing frontend test.
        - name: "Run smoke tests"
          command: |
            set -e
            provisioning test smoke --api http://vapora-backend.vapora-system:8080
            provisioning test smoke --frontend http://vapora-frontend.vapora-system:3000
          timeout: 180s
          dependencies: ["Check all services are running"]

        - name: "Verify database connectivity"
          command: "provisioning db test-connection --database surrealdb"
          timeout: 60s
          dependencies: ["Check all services are running"]

        - name: "Verify agent communication"
          command: "provisioning agents health-check --nats nats://nats-0.vapora-system:4222"
          timeout: 120s
          dependencies: ["Check all services are running"]

  # Named output values declared by this workflow.
  # NOTE(review): prometheus_url points at port 9090, while the Gateway in
  # this workflow only lists ports 80 and 443 — confirm it is reachable.
  # NOTE(review): cluster_info holds a command string rather than a URL —
  # confirm the consumer evaluates it.
  outputs:
    - name: frontend_url
      value: 'https://vapora.example.com'
    - name: grafana_url
      value: 'https://vapora.example.com/grafana'
    - name: prometheus_url
      value: 'https://vapora.example.com:9090'
    - name: cluster_info
      value: 'kubectl cluster-info'

  # Rollback behaviour for this workflow.
  rollback:
    # One of: manual | automatic | rollback-to-previous
    onFailure: manual
    # Entries are phase names from this workflow.
    allowedSteps:
      - 'Create K8s Cluster'
      - 'Deploy Storage Infrastructure'
      - 'Deploy Backend Services'
    # One of: cascade | parallel
    strategy: cascade

  # Notification entries per lifecycle event. Each entry is a single string
  # of the form "<key>: <value>", so it must stay quoted to avoid being
  # parsed as a mapping (and to keep "#deployment" from becoming a comment).
  notifications:
    onStart:
      - 'email: devops@example.com'
      - 'slack: #deployment'
    onSuccess:
      - 'email: devops@example.com'
      - 'slack: #deployment'
      - 'action: update-dns'
    onFailure:
      - 'email: devops@example.com'
      - 'slack: #deployment'
      - 'severity: critical'

  # Actions listed to run after deployment, each with its own schedule.
  postDeployment:
    # Backup of the cluster named vapora-cluster, scheduled daily.
    - name: 'Create backup'
      command: 'provisioning backup create --cluster vapora-cluster'
      schedule: daily

    # Markdown report, scheduled once; the shell substitution $(date +%s)
    # puts a Unix timestamp in the output file name.
    - name: 'Generate deployment report'
      command: 'provisioning report generate --format markdown --output deployment-report-$(date +%s).md'
      schedule: once

---

# Rollback workflow (automatically created)
# Three phases: drain the agent queue, restore the latest backup, then
# re-run the cluster health check.
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: deploy-full-stack-rollback
  description: Rollback VAPORA deployment to previous stable state
spec:
  version: '0.2.0'
  phases:
    # Drains the agent queue with a 300s wait timeout.
    - name: 'Stop new operations'
      steps:
        - name: 'Drain agent queue'
          command: 'provisioning agents drain --wait-timeout 300s'

    # Lists the available backups, then restores the one selected by
    # `--backup latest`; `--confirm` is passed on the command line.
    - name: 'Restore from backup'
      steps:
        - name: 'List available backups'
          command: 'provisioning backup list'

        - name: 'Restore cluster state'
          command: 'provisioning backup restore --backup latest --confirm'

    # Same cluster health check command the deployment workflow uses.
    - name: 'Verify rollback'
      steps:
        - name: 'Run health checks'
          command: 'provisioning health-check --cluster'