apiVersion: provisioning.vapora.io/v1
kind: Workflow
metadata:
  name: disaster-recovery
  description: VAPORA disaster recovery and restoration from backups
spec:
  version: "0.2.0"
  namespace: vapora-system
  timeout: 3600s  # 1 hour max

  inputs:
    - name: backup_source
      type: string
      required: true
      description: "Backup identifier or timestamp to restore from"
    - name: partial_restore
      type: boolean
      required: false
      default: false
      description: "Restore only specific components instead of the full system"
    - name: components_to_restore
      type: array
      required: false
      description: "List of components to restore: database, services, configuration, data"
    - name: verify_only
      type: boolean
      required: false
      default: false
      description: "Verify backup integrity without restoring"
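
  # Hypothetical invocation sketch (the --input flags are an assumption about
  # the workflow runner, not part of this spec; the backup id is an example):
  #
  #   provisioning workflow run disaster-recovery \
  #     --input backup_source=backup-20250110-0300 \
  #     --input verify_only=true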
  phases:

    # Phase 1: Pre-recovery assessment
    - name: "Assess Damage and Backup Status"
      description: "Evaluate current cluster state and available backups"
      retryable: true
      steps:
        - name: "Check cluster connectivity"
          command: "kubectl cluster-info"
          timeout: 30s
          continueOnError: true

        - name: "List available backups"
          command: |
            provisioning backup list --detailed \
              | grep -E "id|timestamp|size|status"
          timeout: 60s

        - name: "Verify backup integrity"
          command: |
            provisioning backup verify --id $BACKUP_SOURCE
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Estimate recovery time"
          command: |
            provisioning backup estimate-restore-time --id $BACKUP_SOURCE \
              | tee /tmp/restore_estimate.txt
          timeout: 60s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"
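
    # Every drain step below runs with continueOnError: in a disaster scenario
    # parts of the control plane may already be down, and a failed drain must
    # not block the restore that follows.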

    # Phase 2: Drain system
    - name: "Prepare System for Recovery"
      description: "Stop services and prepare for restoration"
      retryable: false
      steps:
        - name: "Stop accepting new tasks"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"recovery_mode":"true","accept_requests":"false"}}'
          timeout: 60s
          continueOnError: true

        - name: "Drain task queue"
          command: "provisioning agents drain --timeout 300s"
          timeout: 320s
          continueOnError: true

        - name: "Stop agent runtime"
          command: |
            kubectl scale deployment vapora-agents \
              -n vapora-system \
              --replicas 0
          timeout: 120s
          continueOnError: true

        - name: "Stop backend services"
          command: |
            kubectl scale deployment vapora-backend \
              -n vapora-system \
              --replicas 0
            kubectl scale deployment vapora-llm-router \
              -n vapora-system \
              --replicas 0
            kubectl scale deployment vapora-mcp-gateway \
              -n vapora-system \
              --replicas 0
          timeout: 180s
          continueOnError: true

        - name: "Stop frontend"
          command: |
            kubectl scale deployment vapora-frontend \
              -n vapora-system \
              --replicas 0
          timeout: 120s
          continueOnError: true
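
    # Restore order sketch: collapse SurrealDB to a single replica so the
    # restore writes to one node, verify, then scale back out (assumes the
    # remaining replicas resync from the restored node on scale-up).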

    # Phase 3: Restore database
    - name: "Restore Database"
      description: "Restore SurrealDB from backup"
      retryable: false
      steps:
        - name: "Scale down SurrealDB replicas"
          command: |
            kubectl scale statefulset surrealdb \
              -n vapora-system \
              --replicas 1
          timeout: 300s

        - name: "Wait for single replica"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=surrealdb \
              -n vapora-system \
              --timeout=300s
          timeout: 320s

        - name: "Restore database data"
          command: |
            provisioning db restore \
              --database surrealdb \
              --backup $BACKUP_SOURCE \
              --wait-for-completion
          timeout: 1200s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Verify database integrity"
          command: |
            provisioning db verify --database surrealdb \
              --check-tables \
              --check-indexes \
              --check-constraints
          timeout: 300s

        - name: "Scale up SurrealDB replicas"
          command: |
            kubectl scale statefulset surrealdb \
              -n vapora-system \
              --replicas 3
          timeout: 300s

        - name: "Wait for all replicas"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=surrealdb \
              -n vapora-system \
              --timeout=600s
          timeout: 620s
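
    # ConfigMaps and Secrets are restored before any Deployments so that the
    # pods created in Phases 5-6 mount the recovered configuration.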

    # Phase 4: Restore configuration and secrets
    - name: "Restore Configuration"
      description: "Restore ConfigMaps, Secrets, and application configuration"
      retryable: true
      steps:
        - name: "Restore ConfigMaps"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type ConfigMap \
              --namespace vapora-system
          timeout: 180s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Restore Secrets"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Secret \
              --namespace vapora-system
          timeout: 180s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Verify configuration"
          command: |
            kubectl get configmap -n vapora-system
            kubectl get secrets -n vapora-system
          timeout: 60s
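
    # The configuration check above only lists resources. A stricter spot
    # check could read back a key known to exist (sketch; assumes the restored
    # vapora-config carries the keys patched in Phase 2):
    #   kubectl get configmap vapora-config -n vapora-system \
    #     -o jsonpath='{.data.recovery_mode}'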

    # Phase 5: Restore service configurations
    - name: "Restore Services"
      description: "Restore service deployments and Istio configurations"
      retryable: true
      steps:
        - name: "Restore backend deployment"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-backend
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Restore LLM Router"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-llm-router
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Restore MCP Gateway"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-mcp-gateway
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Restore frontend"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-frontend
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Restore Istio configuration"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Gateway,VirtualService \
              --namespace vapora-system
          timeout: 180s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"
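
    # Agents are restored after the backend, LLM router, and MCP gateway so
    # they have live services to reconnect to.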

    # Phase 6: Restore agents
    - name: "Restore Agent Runtime"
      description: "Restore agent deployment and state"
      retryable: true
      steps:
        - name: "Restore agent deployment"
          command: |
            provisioning backup restore \
              --id $BACKUP_SOURCE \
              --resource-type Deployment \
              --name vapora-agents
          timeout: 300s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Wait for agents to be ready"
          command: |
            kubectl wait --for=condition=Ready pod \
              -l app=vapora-agents \
              -n vapora-system \
              --timeout=600s
          timeout: 620s

        - name: "Verify agent communication"
          command: "provisioning agents health-check --nats nats://nats-0.vapora-system:4222"
          timeout: 120s
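
    # Verification runs while the system is still in recovery mode, before
    # Phase 8 re-enables traffic.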

    # Phase 7: Post-recovery verification
    - name: "Verify Recovery"
      description: "Comprehensive verification of recovered system"
      retryable: false
      steps:
        - name: "Health check cluster"
          command: "provisioning health-check --cluster"
          timeout: 300s

        - name: "Health check all services"
          command: "provisioning health-check --services all --strict"
          timeout: 300s

        - name: "Test database connectivity"
          command: "provisioning db test-connection --database surrealdb"
          timeout: 120s

        - name: "Verify data consistency"
          command: |
            provisioning db verify --database surrealdb \
              --check-integrity \
              --sample-size 1000
          timeout: 300s

        - name: "Run smoke tests"
          command: |
            provisioning test smoke \
              --api http://vapora-backend.vapora-system:8080 \
              --frontend http://vapora-frontend.vapora-system:3000 \
              --timeout 600s
          timeout: 620s

        - name: "Test agent communication"
          command: |
            provisioning agents test \
              --send-test-message \
              --verify-delivery \
              --timeout 120s
          timeout: 140s
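
    # Phase 8 reverses the Phase 2 drain: clear the recovery flags, restore
    # replica counts, and let agents accept work again.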

    # Phase 8: Re-enable system
    - name: "Resume Operations"
      description: "Re-enable system for normal operation"
      retryable: false
      steps:
        - name: "Disable recovery mode"
          command: |
            kubectl patch configmap vapora-config \
              -n vapora-system \
              -p '{"data":{"recovery_mode":"false","accept_requests":"true"}}'
          timeout: 60s
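
        # Fallback sketch if scale-to-previous is unavailable (assumes replica
        # counts were recorded before the Phase 2 drain, e.g. with:
        #   kubectl get deploy -n vapora-system \
        #     -o jsonpath='{range .items[*]}{.metadata.name}={.spec.replicas}{"\n"}{end}'
        # and are replayed here via kubectl scale).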
        - name: "Scale up services to previous state"
          command: |
            provisioning taskserv scale-to-previous \
              --namespace vapora-system
          timeout: 300s

        - name: "Resume agent work"
          command: "provisioning agents drain --disable"
          timeout: 60s

        - name: "Final health check"
          command: "provisioning health-check --cluster"
          timeout: 300s
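
    # Reporting runs only after the final health check has passed; the git
    # commit step tolerates failure (continueOnError) since the runner may not
    # have a writable git working tree.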

    # Phase 9: Documentation and reporting
    - name: "Generate Recovery Report"
      description: "Document recovery operation"
      retryable: false
      steps:
        - name: "Create recovery report"
          command: |
            provisioning report generate \
              --type disaster-recovery \
              --backup-id $BACKUP_SOURCE \
              --output "recovery-report-$(date +%Y%m%d-%H%M%S).md"
          timeout: 120s
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Create git commit for recovery"
          command: |
            git add -A
            git commit -m "Disaster recovery: Restored from backup $BACKUP_SOURCE at $(date)"
          timeout: 60s
          continueOnError: true
          env:
            - name: BACKUP_SOURCE
              value: "${backup_source}"

        - name: "Collect recovery log entries"
          command: |
            kubectl logs -n vapora-system -l app=vapora-backend \
              | grep -E "recovery|restore|initialize" \
              | tail -20
          timeout: 60s

  outputs:
    - name: recovery_status
      command: "echo 'Disaster recovery completed'"
    - name: restored_services
      command: "kubectl get deployment -n vapora-system -o wide"
    - name: database_status
      command: "provisioning db status --database surrealdb"
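
  # Assumed runner semantics (not specified in this file): the onFailure
  # procedure runs when any step without continueOnError fails, after which
  # the notifications.onFailure list below fires.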

  # Error handling
  onFailure:
    procedure:
      - name: "Gather diagnostic information"
        command: |
          provisioning debug collect \
            --output "debug-logs-$(date +%s).tar.gz"
      - name: "Alert operations team"
        command: "slack: #alerts"

  notifications:
    onStart:
      - "slack: #deployment"
      - "email: devops@example.com"
      - "severity: critical"
    onSuccess:
      - "slack: #deployment"
      - "slack: notify: Disaster recovery successful"
      - "email: devops@example.com"
    onFailure:
      - "slack: #deployment"
      - "slack: #alerts"
      - "email: devops@example.com"
      - "severity: critical"
      - "page: on-call"