---
# VAPORA disaster-recovery workflow (provisioning.vapora.io/v1 Workflow).
# Restores a VAPORA deployment from a named backup in nine phases:
# assess -> drain -> restore DB -> restore config -> restore services
# -> restore agents -> verify -> resume -> report.
apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: disaster-recovery
  description: VAPORA disaster recovery and restoration from backups
spec:
  version: "0.2.0"
  namespace: vapora-system
  timeout: 3600s  # 1 hour max

  inputs:
    - name: backup_source
      type: string
      required: true
      description: "Backup identifier or timestamp to restore from"
    - name: partial_restore
      type: boolean
      required: false
      default: false
      description: "Restore only specific components instead of full system"
    - name: components_to_restore
      type: array
      required: false
      description: "List of components to restore: database, services, configuration, data"
    - name: verify_only
      type: boolean
      required: false
      default: false
      description: "Verify backup integrity without restoring"

  phases:
    # Phase 1: Pre-recovery assessment
    - name: "Assess Damage and Backup Status"
      description: "Evaluate current cluster state and available backups"
      retryable: true
      steps:
        - name: "Check cluster connectivity"
          command: "kubectl cluster-info"
          timeout: 30s
          continueOnError: true
        - name: "List available backups"
          command: |
            provisioning backup list --detailed \
              | grep -E "id|timestamp|size|status"
          timeout: 60s
        - name: "Verify backup integrity"
          command: |
            provisioning backup verify --id $BACKUP_SOURCE
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"
        - name: "Estimate recovery time"
          command: |
            provisioning backup estimate-restore-time --id $BACKUP_SOURCE \
              | tee /tmp/restore_estimate.txt
          timeout: 60s

    # Phase 2: Drain system
    - name: "Prepare System for Recovery"
      description: "Stop services and prepare for restoration"
      retryable: false
      steps:
        - name: "Stop accepting new tasks"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"recovery_mode":"true","accept_requests":"false"}}'
          timeout: 60s
          continueOnError: true
        - name: "Drain task queue"
          command: "provisioning agents drain --timeout 300s"
          timeout: 320s
          continueOnError: true
        - name: "Stop agent runtime"
          command: |
            kubectl scale deployment vapora-agents \
              -n vapora-system \
              --replicas 0
          timeout: 120s
          continueOnError: true
        - name: "Stop backend services"
          command: |
            kubectl scale deployment vapora-backend \
              -n vapora-system \
              --replicas 0
            kubectl scale deployment vapora-llm-router \
              -n vapora-system \
              --replicas 0
            kubectl scale deployment vapora-mcp-gateway \
              -n vapora-system \
              --replicas 0
          timeout: 180s
          continueOnError: true
        - name: "Stop frontend"
          command: |
            kubectl scale deployment vapora-frontend \
              -n vapora-system \
              --replicas 0
          timeout: 120s
          continueOnError: true

    # Phase 3: Restore database
    - name: "Restore Database"
      description: "Restore SurrealDB from backup"
      retryable: false
      steps:
        # Restore against a single replica, then scale back out.
        - name: "Scale down SurrealDB replicas"
          command: |
            kubectl scale statefulset surrealdb \
              -n vapora-system \
              --replicas 1
          timeout: 300s
        - name: "Wait for single replica"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=surrealdb \
              -n vapora-system \
              --timeout=300s
          timeout: 320s
        - name: "Restore database data"
          command: |
            provisioning db restore \
              --database surrealdb \
              --backup $BACKUP_SOURCE \
              --wait-for-completion
          timeout: 1200s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"
        - name: "Verify database integrity"
          command: |
            provisioning db verify --database surrealdb \
              --check-tables \
              --check-indexes \
              --check-constraints
          timeout: 300s
        - name: "Scale up SurrealDB replicas"
          command: |
            kubectl scale statefulset surrealdb \
              -n vapora-system \
              --replicas 3
          timeout: 300s
        - name: "Wait for all replicas"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=surrealdb \
              -n vapora-system \
              --timeout=600s
          timeout: 620s

    # Phase 4: Restore configuration and secrets
    - name: "Restore Configuration"
      description: "Restore ConfigMaps, Secrets, and application configuration"
      retryable: true
      steps:
        - name: "Restore ConfigMaps"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type ConfigMap \
              --namespace vapora-system
          timeout: 180s
        - name: "Restore Secrets"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Secret \
              --namespace vapora-system
          timeout: 180s
        - name: "Verify configuration"
          command: |
            kubectl get configmap -n vapora-system
            kubectl get secrets -n vapora-system
          timeout: 60s

    # Phase 5: Restore service configurations
    - name: "Restore Services"
      description: "Restore service deployments and Istio configurations"
      retryable: true
      steps:
        - name: "Restore backend deployment"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-backend
          timeout: 300s
        - name: "Restore LLM Router"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-llm-router
          timeout: 300s
        - name: "Restore MCP Gateway"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-mcp-gateway
          timeout: 300s
        - name: "Restore frontend"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-frontend
          timeout: 300s
        - name: "Restore Istio configuration"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Gateway,VirtualService \
              --namespace vapora-system
          timeout: 180s

    # Phase 6: Restore agents
    - name: "Restore Agent Runtime"
      description: "Restore agent deployment and state"
      retryable: true
      steps:
        - name: "Restore agent deployment"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-agents
          timeout: 300s
        - name: "Wait for agents to be ready"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=vapora-agents \
              -n vapora-system \
              --timeout=600s
          timeout: 620s
        - name: "Verify agent communication"
          command: "provisioning agents health-check --nats nats://nats-0.vapora-system:4222"
          timeout: 120s

    # Phase 7: Post-recovery verification
    - name: "Verify Recovery"
      description: "Comprehensive verification of recovered system"
      retryable: false
      steps:
        - name: "Health check cluster"
          command: "provisioning health-check --cluster"
          timeout: 300s
        - name: "Health check all services"
          command: "provisioning health-check --services all --strict"
          timeout: 300s
        - name: "Test database connectivity"
          command: "provisioning db test-connection --database surrealdb"
          timeout: 120s
        - name: "Verify data consistency"
          command: |
            provisioning db verify --database surrealdb \
              --check-integrity \
              --sample-size 1000
          timeout: 300s
        - name: "Run smoke tests"
          command: |
            provisioning test smoke \
              --api http://vapora-backend.vapora-system:8080 \
              --frontend http://vapora-frontend.vapora-system:3000 \
              --timeout 600s
          timeout: 620s
        - name: "Test agent communication"
          command: |
            provisioning agents test \
              --send-test-message \
              --verify-delivery \
              --timeout 120s
          timeout: 140s

    # Phase 8: Re-enable system
    - name: "Resume Operations"
      description: "Re-enable system for normal operation"
      retryable: false
      steps:
        - name: "Disable recovery mode"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"recovery_mode":"false","accept_requests":"true"}}'
          timeout: 60s
        - name: "Scale up services to previous state"
          command: |
            provisioning taskserv scale-to-previous \
              --namespace vapora-system
          timeout: 300s
        - name: "Resume agent work"
          command: "provisioning agents drain --disable"
          timeout: 60s
        - name: "Final health check"
          command: "provisioning health-check --cluster"
          timeout: 300s

    # Phase 9: Documentation and reporting
    - name: "Generate Recovery Report"
      description: "Document recovery operation"
      retryable: false
      steps:
        - name: "Create recovery report"
          command: |
            provisioning report generate \
              --type disaster-recovery \
              --backup-id $BACKUP_SOURCE \
              --output "recovery-report-$(date +%Y%m%d-%H%M%S).md"
          timeout: 120s
        - name: "Create git commit for recovery"
          command: |
            git add -A
            git commit -m "Disaster recovery: Restored from backup $BACKUP_SOURCE at $(date)"
          timeout: 60s
          continueOnError: true
        - name: "Log recovery event"
          command: |
            kubectl logs -n vapora-system -l app=vapora-backend \
              | grep -E "recovery|restore|initialize" \
              | tail -20
          timeout: 60s

  outputs:
    - name: recovery_status
      # NOTE(review): sibling outputs use `command:`; this one stores a shell
      # command string under `value:` — confirm against the Workflow schema
      # whether `value` is executed or taken literally.
      value: "echo 'Disaster recovery completed'"
    - name: restored_services
      command: "kubectl get deployment -n vapora-system -o wide"
    - name: database_status
      command: "provisioning db status --database surrealdb"

  # Error handling
  onFailure:
    procedure:
      - name: "Gather diagnostic information"
        command: |
          provisioning debug collect \
            --output "debug-logs-$(date +%s).tar.gz"
      - name: "Alert operations team"
        command: "slack: #alerts"

  notifications:
    onStart:
      - "slack: #deployment"
      - "email: devops@example.com"
      - "severity: critical"
    onSuccess:
      - "slack: #deployment"
      - "slack: notify: Disaster recovery successful"
      - "email: devops@example.com"
    onFailure:
      - "slack: #deployment"
      - "slack: #alerts"
      - "email: devops@example.com"
      - "severity: critical"
      - "page: on-call"