Vapora/docs/disaster-recovery/disaster-recovery-runbook.html

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
    <head>
        <!-- Book generated using mdBook -->
        <meta charset="UTF-8">
        <title>Disaster Recovery Runbook - VAPORA Platform Documentation</title>


        <!-- Custom HTML head -->

        <meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="theme-color" content="#ffffff">

        <link rel="icon" href="../favicon.svg">
        <link rel="shortcut icon" href="../favicon.png">
        <link rel="stylesheet" href="../css/variables.css">
        <link rel="stylesheet" href="../css/general.css">
        <link rel="stylesheet" href="../css/chrome.css">
        <link rel="stylesheet" href="../css/print.css" media="print">

        <!-- Fonts -->
        <link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
        <link rel="stylesheet" href="../fonts/fonts.css">

        <!-- Highlight.js Stylesheets -->
        <link rel="stylesheet" id="highlight-css" href="../highlight.css">
        <link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
        <link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">

        <!-- Custom theme stylesheets -->


        <!-- Provide site root and default themes to javascript -->
        <script>
            // Relative prefix from this page to the book root; the stylesheet and script
            // URLs on this page use the same "../" prefix.
            const path_to_root = "../";
            // Theme names used when no theme is stored: the inline theme script further down
            // picks the dark one when the OS reports prefers-color-scheme: dark, else the light one.
            const default_light_theme = "light";
            const default_dark_theme = "dark";
        </script>
        <!-- Start loading toc.js asap -->
        <script src="../toc.js"></script>
    </head>
    <body>
    <div id="mdbook-help-container">
        <div id="mdbook-help-popup">
            <h2 class="mdbook-help-title">Keyboard shortcuts</h2>
            <div>
                <p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
                <p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
                <p>Press <kbd>?</kbd> to show this help</p>
                <p>Press <kbd>Esc</kbd> to hide this help</p>
            </div>
        </div>
    </div>
    <div id="body-container">
        <!-- Work around some values being stored in localStorage wrapped in quotes -->
        <script>
            // Strip one pair of surrounding double quotes from the stored theme and
            // sidebar values, so later reads see the bare value.
            try {
                // Each key is handled independently: a missing value (null) for one key
                // must not stop the other key from being normalised.
                for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
                    const stored = localStorage.getItem(key);
                    // Require at least two characters so a lone '"' is not rewritten to an
                    // empty string (an empty theme name cannot be added to classList).
                    if (stored !== null && stored.length >= 2 && stored.startsWith('"') && stored.endsWith('"')) {
                        localStorage.setItem(key, stored.slice(1, -1));
                    }
                }
            } catch (e) { /* localStorage unavailable: nothing to normalise */ }
        </script>

        <!-- Set the theme before any content is loaded, prevents flash -->
        <script>
            // Fallback theme follows the OS colour-scheme preference.
            const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
                ? default_dark_theme
                : default_light_theme;

            // A stored theme wins; reading localStorage may throw, in which case the
            // fallback is used.
            let theme;
            try {
                theme = localStorage.getItem('mdbook-theme');
            } catch (e) { }
            if (theme == null) {
                theme = default_theme;
            }

            // Swap the static "light" class on <html> for the resolved theme and mark
            // the document as script-enabled.
            const html = document.documentElement;
            html.classList.remove('light');
            html.classList.add(theme, "js");
        </script>

        <input type="checkbox" id="sidebar-toggle-anchor" class="hidden">

        <!-- Hide / unhide sidebar before it is displayed -->
        <script>
            const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");

            // Below 1080px the sidebar always starts hidden; at or above it, the stored
            // state is used and defaults to visible.
            let sidebar = 'hidden';
            if (document.body.clientWidth >= 1080) {
                let stored = null;
                try {
                    stored = localStorage.getItem('mdbook-sidebar');
                } catch (e) { }
                sidebar = stored || 'visible';
            }

            // Mirror the resolved state on the checkbox and on the <html> class list.
            sidebar_toggle.checked = (sidebar === 'visible');
            html.classList.remove('sidebar-visible');
            html.classList.add(`sidebar-${sidebar}`);
        </script>

        <!-- Sidebar navigation: the scrollbox is populated by script; the noscript iframe
             loads the static table of contents when JavaScript is disabled. -->
        <nav id="sidebar" class="sidebar" aria-label="Table of contents">
            <!-- populated by js -->
            <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
            <noscript>
                <!-- title gives the frame an accessible name for assistive technology -->
                <iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
            </noscript>
            <div id="sidebar-resize-handle" class="sidebar-resize-handle">
                <div class="sidebar-resize-indicator"></div>
            </div>
        </nav>

        <div id="page-wrapper" class="page-wrapper">

            <div class="page">
                <div id="menu-bar-hover-placeholder"></div>
                <div id="menu-bar" class="menu-bar sticky">
                    <div class="left-buttons">
                        <label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
                            <i class="fa fa-bars"></i>
                        </label>
                        <button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
                            <i class="fa fa-paint-brush"></i>
                        </button>
                        <ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
                            <li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
                        </ul>
                        <button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
                            <i class="fa fa-search"></i>
                        </button>
                    </div>

                    <h1 class="menu-title">VAPORA Platform Documentation</h1>

                    <div class="right-buttons">
                        <a href="../print.html" title="Print this book" aria-label="Print this book">
                            <i id="print-button" class="fa fa-print"></i>
                        </a>
                        <a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
                            <i id="git-repository-button" class="fa fa-github"></i>
                        </a>
                        <a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../disaster-recovery/disaster-recovery-runbook.md" title="Suggest an edit" aria-label="Suggest an edit">
                            <i id="git-edit-button" class="fa fa-edit"></i>
                        </a>

                    </div>
                </div>

                <div id="search-wrapper" class="hidden">
                    <form id="searchbar-outer" class="searchbar-outer">
                        <input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
                    </form>
                    <div id="searchresults-outer" class="searchresults-outer hidden">
                        <div id="searchresults-header" class="searchresults-header"></div>
                        <ul id="searchresults">
                        </ul>
                    </div>
                </div>

                <!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
                <script>
                    {
                        // Block scope keeps the helper constant out of the global scope.
                        const isVisible = sidebar === 'visible';

                        // Expose the sidebar state on the toggle and on the sidebar itself.
                        document.getElementById('sidebar-toggle').setAttribute('aria-expanded', isVisible);
                        document.getElementById('sidebar').setAttribute('aria-hidden', !isVisible);

                        // Sidebar links are tabbable only while the sidebar is visible.
                        for (const link of document.querySelectorAll('#sidebar a')) {
                            link.setAttribute('tabIndex', isVisible ? 0 : -1);
                        }
                    }
                </script>

                <div id="content" class="content">
                    <main>
                        <h1 id="disaster-recovery-runbook"><a class="header" href="#disaster-recovery-runbook">Disaster Recovery Runbook</a></h1>
<p>Step-by-step procedures for recovering VAPORA from various disaster scenarios.</p>
<hr />
<h2 id="disaster-severity-levels"><a class="header" href="#disaster-severity-levels">Disaster Severity Levels</a></h2>
<h3 id="level-1-critical-"><a class="header" href="#level-1-critical-">Level 1: Critical 🔴</a></h3>
<p><strong>Complete Service Loss</strong> - Entire VAPORA unavailable</p>
<p>Examples:</p>
<ul>
<li>Complete cluster failure</li>
<li>Complete data center outage</li>
<li>Database completely corrupted</li>
<li>All backups inaccessible</li>
</ul>
<p>RTO: 2-4 hours
RPO: Up to 1 hour of data loss possible</p>
<h3 id="level-2-major-"><a class="header" href="#level-2-major-">Level 2: Major 🟠</a></h3>
<p><strong>Partial Service Loss</strong> - Some services unavailable</p>
<p>Examples:</p>
<ul>
<li>Single region down</li>
<li>Database corrupted but backups available</li>
<li>One service completely failed</li>
<li>Primary storage unavailable</li>
</ul>
<p>RTO: 30 minutes - 2 hours
RPO: Minimal data loss</p>
<h3 id="level-3-minor-"><a class="header" href="#level-3-minor-">Level 3: Minor 🟡</a></h3>
<p><strong>Degraded Service</strong> - Service running but with issues</p>
<p>Examples:</p>
<ul>
<li>Performance issues</li>
<li>One pod crashed</li>
<li>Database connection issues</li>
<li>High error rate</li>
</ul>
<p>RTO: 5-15 minutes
RPO: No data loss</p>
<hr />
<h2 id="disaster-assessment-first-5-minutes"><a class="header" href="#disaster-assessment-first-5-minutes">Disaster Assessment (First 5 Minutes)</a></h2>
<h3 id="step-1-declare-disaster-state"><a class="header" href="#step-1-declare-disaster-state">Step 1: Declare Disaster State</a></h3>
<p>Run the following checks to decide whether a disaster must be declared:</p>
<pre><code class="language-bash"># Q1: Is the service accessible?
curl -v https://api.vapora.com/health

# Q2: How many pods are running?
kubectl get pods -n vapora

# Q3: Can we access the database?
kubectl exec -n vapora pod/&lt;name&gt; -- \
  surreal query "SELECT * FROM projects LIMIT 1"

# Q4: Are backups available?
aws s3 ls s3://vapora-backups/
</code></pre>
<p><strong>Decision Tree</strong>:</p>
<pre><code>Can access service normally?
  YES → No disaster, escalate to incident response
  NO → Continue

Can reach any pods?
  YES → Partial disaster (Level 2-3)
  NO → Likely total disaster (Level 1)

Can reach database?
  YES → Application issue, not data issue
  NO → Database issue, need restoration

Are backups accessible?
  YES → Recovery likely possible
  NO → Critical situation, activate backup locations
</code></pre>
<h3 id="step-2-severity-assignment"><a class="header" href="#step-2-severity-assignment">Step 2: Severity Assignment</a></h3>
<p>Based on assessment:</p>
<pre><code class="language-bash"># Level 1 Criteria (Critical)
- 0 pods running in vapora namespace
- Database completely unreachable
- All backup locations inaccessible
- Service down &gt;30 minutes

# Level 2 Criteria (Major)
- &lt;50% pods running
- Database reachable but degraded
- Primary backups inaccessible but secondary available
- Service down 5-30 minutes

# Level 3 Criteria (Minor)
- &gt;75% pods running
- Database responsive but with errors
- Backups accessible
- Service down &lt;5 minutes

Assignment: Level ___

If Level 1: Activate full DR plan
If Level 2: Activate partial DR plan
If Level 3: Use normal incident response
</code></pre>
<h3 id="step-3-notify-key-personnel"><a class="header" href="#step-3-notify-key-personnel">Step 3: Notify Key Personnel</a></h3>
<pre><code class="language-bash"># For Level 1 (Critical) DR
send_message_to = [
  "@cto",
  "@ops-manager",
  "@database-team",
  "@infrastructure-team",
  "@product-manager"
]

message = """
🔴 DISASTER DECLARED - LEVEL 1 CRITICAL

Service: VAPORA (Complete Outage)
Severity: Critical
Time Declared: [UTC]
Status: Assessing

Actions underway:
1. Activating disaster recovery procedures
2. Notifying stakeholders
3. Engaging full team

Next update: [+5 min]

/cc @all-involved
"""

post_to_slack("#incident-critical")
page_on_call_manager(urgent=true)
</code></pre>
<hr />
<h2 id="disaster-scenario-procedures"><a class="header" href="#disaster-scenario-procedures">Disaster Scenario Procedures</a></h2>
<h3 id="scenario-1-complete-cluster-failure"><a class="header" href="#scenario-1-complete-cluster-failure">Scenario 1: Complete Cluster Failure</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>kubectl commands time out or fail</li>
<li>No pods running in any namespace</li>
<li>Nodes unreachable</li>
<li>All services down</li>
</ul>
<p><strong>Recovery Steps</strong>:</p>
<h4 id="step-1-assess-infrastructure-5-min"><a class="header" href="#step-1-assess-infrastructure-5-min">Step 1: Assess Infrastructure (5 min)</a></h4>
<pre><code class="language-bash"># Try basic cluster operations
kubectl cluster-info
# If this fails with "Unable to connect to the server", the cluster API is unreachable

# Check cloud provider status
# AWS: Check AWS status page, check EC2 instances
# GKE: Check Google Cloud console
# On-prem: Check infrastructure team

# Determine: Is infrastructure failed or just connectivity?
</code></pre>
<h4 id="step-2-if-infrastructure-failed"><a class="header" href="#step-2-if-infrastructure-failed">Step 2: If Infrastructure Failed</a></h4>
<p><strong>Activate Secondary Infrastructure</strong> (if available):</p>
<pre><code class="language-bash"># 1. Access backup/secondary infrastructure
export KUBECONFIG=/path/to/backup/kubeconfig

# 2. Verify it's operational
kubectl cluster-info
kubectl get nodes

# 3. Prepare for database restore
# (See: Scenario 2 - Database Recovery)
</code></pre>
<p><strong>If No Secondary</strong>: Activate failover to alternate region</p>
<pre><code class="language-bash"># 1. Contact cloud provider
# AWS: Open support case - request emergency instance launch
# GKE: Request cluster creation in different region

# 2. While infrastructure rebuilds:
# - Retrieve backups
# - Prepare restore scripts
# - Brief team on ETA
</code></pre>
<h4 id="step-3-restore-database-see-scenario-2"><a class="header" href="#step-3-restore-database-see-scenario-2">Step 3: Restore Database (See Scenario 2)</a></h4>
<h4 id="step-4-deploy-services"><a class="header" href="#step-4-deploy-services">Step 4: Deploy Services</a></h4>
<pre><code class="language-bash"># Once infrastructure ready and database restored

# 1. Apply ConfigMaps
kubectl apply -f vapora-configmap.yaml

# 2. Apply Secrets
kubectl apply -f vapora-secrets.yaml

# 3. Deploy services
kubectl apply -f vapora-deployments.yaml

# 4. Wait for pods to start
kubectl rollout status deployment/vapora-backend -n vapora --timeout=10m

# 5. Verify health
curl http://localhost:8001/health
</code></pre>
<h4 id="step-5-verification"><a class="header" href="#step-5-verification">Step 5: Verification</a></h4>
<pre><code class="language-bash"># 1. Check all pods running
kubectl get pods -n vapora
# All should show: Running, 1/1 Ready

# 2. Verify database connectivity
kubectl logs deployment/vapora-backend -n vapora | tail -20
# Should show: "Successfully connected to database"

# 3. Test API
curl http://localhost:8001/api/projects
# Should return project list

# 4. Check data integrity
# Run validation queries:
SELECT COUNT(*) FROM projects;          # Should be &gt; 0
SELECT COUNT(*) FROM users;             # Should be &gt; 0
SELECT COUNT(*) FROM tasks;             # Should be &gt; 0
</code></pre>
<hr />
<h3 id="scenario-2-database-corruptionloss"><a class="header" href="#scenario-2-database-corruptionloss">Scenario 2: Database Corruption/Loss</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>Database queries return errors</li>
<li>Data integrity issues</li>
<li>Corruption detected in logs</li>
</ul>
<p><strong>Recovery Steps</strong>:</p>
<h4 id="step-1-assess-database-state-10-min"><a class="header" href="#step-1-assess-database-state-10-min">Step 1: Assess Database State (10 min)</a></h4>
<pre><code class="language-bash"># 1. Try to connect
kubectl exec -n vapora pod/surrealdb-0 -- \
  surreal sql --conn ws://localhost:8000 \
  --user root --pass "$DB_PASSWORD" \
  "SELECT COUNT(*) FROM projects"

# 2. Check for error messages
kubectl logs -n vapora pod/surrealdb-0 | tail -50 | grep -i error

# 3. Assess damage
# Is it:
# - Connection issue (might recover)
# - Data corruption (need restore)
# - Complete loss (restore from backup)
</code></pre>
<h4 id="step-2-backup-current-state-for-forensics"><a class="header" href="#step-2-backup-current-state-for-forensics">Step 2: Backup Current State (for forensics)</a></h4>
<pre><code class="language-bash"># Before attempting recovery, save current state

# Export what's remaining
kubectl exec -n vapora pod/surrealdb-0 -- \
  surreal export --conn ws://localhost:8000 \
  --user root --pass "$DB_PASSWORD" \
  --output /tmp/corrupted-export.sql

# Download for analysis
kubectl cp vapora/surrealdb-0:/tmp/corrupted-export.sql \
  ./corrupted-export-$(date +%Y%m%d-%H%M%S).sql
</code></pre>
<h4 id="step-3-identify-latest-good-backup"><a class="header" href="#step-3-identify-latest-good-backup">Step 3: Identify Latest Good Backup</a></h4>
<pre><code class="language-bash"># Find most recent backup before corruption
aws s3 ls s3://vapora-backups/database/ --recursive | sort

# Latest backup timestamp
# Should be within last hour

# Download backup
aws s3 cp s3://vapora-backups/database/2026-01-12/vapora-db-010000.sql.gz \
  ./vapora-db-restore.sql.gz

gunzip vapora-db-restore.sql.gz
</code></pre>
<h4 id="step-4-restore-database"><a class="header" href="#step-4-restore-database">Step 4: Restore Database</a></h4>
<pre><code class="language-bash"># Option A: Restore to same database (destructive)
# WARNING: This will overwrite current database

kubectl exec -n vapora pod/surrealdb-0 -- \
  rm -rf /var/lib/surrealdb/data.db

# Restart pod to reinitialize
kubectl delete pod -n vapora surrealdb-0
# Pod will restart with clean database

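# Copy the backup downloaded above into the restarted pod (wait until it is Running)
kubectl cp ./vapora-db-restore.sql vapora/surrealdb-0:/tmp/

# Import backup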
kubectl exec -n vapora pod/surrealdb-0 -- \
  surreal import --conn ws://localhost:8000 \
  --user root --pass "$DB_PASSWORD" \
  --input /tmp/vapora-db-restore.sql

# Wait for import to complete (5-15 minutes)
</code></pre>
<p><strong>Option B: Restore to temporary database (safer)</strong></p>
<pre><code class="language-bash"># 1. Create temporary database pod
kubectl run -n vapora restore-test --image=surrealdb/surrealdb:latest \
  -- start file:///tmp/restore-test

# 2. Restore to temporary
kubectl cp ./vapora-db-restore.sql vapora/restore-test:/tmp/
kubectl exec -n vapora restore-test -- \
  surreal import --conn ws://localhost:8000 \
  --user root --pass "$DB_PASSWORD" \
  --input /tmp/vapora-db-restore.sql

# 3. Verify restored data
kubectl exec -n vapora restore-test -- \
  surreal sql "SELECT COUNT(*) FROM projects"

# 4. If good: Restore production
kubectl delete pod -n vapora surrealdb-0
# Wait for pod restart
kubectl cp ./vapora-db-restore.sql vapora/surrealdb-0:/tmp/
kubectl exec -n vapora surrealdb-0 -- \
  surreal import --conn ws://localhost:8000 \
  --user root --pass "$DB_PASSWORD" \
  --input /tmp/vapora-db-restore.sql

# 5. Cleanup test pod
kubectl delete pod -n vapora restore-test
</code></pre>
<h4 id="step-5-verify-recovery"><a class="header" href="#step-5-verify-recovery">Step 5: Verify Recovery</a></h4>
<pre><code class="language-bash"># 1. Database responsive
kubectl exec -n vapora pod/surrealdb-0 -- \
  surreal sql "SELECT COUNT(*) FROM projects"

# 2. Application can connect
kubectl logs deployment/vapora-backend -n vapora | tail -5
# Should show successful connection

# 3. API working
curl http://localhost:8001/api/projects

# 4. Data valid
# Check record counts match pre-backup
# Check no corruption in key records
</code></pre>
<hr />
<h3 id="scenario-3-configuration-corruption"><a class="header" href="#scenario-3-configuration-corruption">Scenario 3: Configuration Corruption</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>Application misconfigured</li>
<li>Pods failing to start</li>
<li>Wrong values in environment</li>
</ul>
<p><strong>Recovery Steps</strong>:</p>
<h4 id="step-1-identify-bad-configuration"><a class="header" href="#step-1-identify-bad-configuration">Step 1: Identify Bad Configuration</a></h4>
<pre><code class="language-bash"># 1. Get current ConfigMap
kubectl get configmap -n vapora vapora-config -o yaml &gt; current-config.yaml

# 2. Compare with known-good backup
aws s3 cp s3://vapora-backups/configs/2026-01-12/configmaps.yaml .

# 3. Diff to find issues
diff configmaps.yaml current-config.yaml
</code></pre>
<h4 id="step-2-restore-previous-configuration"><a class="header" href="#step-2-restore-previous-configuration">Step 2: Restore Previous Configuration</a></h4>
<pre><code class="language-bash"># 1. Get previous ConfigMap from backup
aws s3 cp s3://vapora-backups/configs/2026-01-11/configmaps.yaml ./good-config.yaml

# 2. Apply previous configuration
kubectl apply -f good-config.yaml

# 3. Restart pods to pick up new config
kubectl rollout restart deployment/vapora-backend -n vapora
kubectl rollout restart deployment/vapora-agents -n vapora

# 4. Monitor restart
kubectl get pods -n vapora -w
</code></pre>
<h4 id="step-3-verify-configuration"><a class="header" href="#step-3-verify-configuration">Step 3: Verify Configuration</a></h4>
<pre><code class="language-bash"># 1. Pods should restart and become Running
kubectl get pods -n vapora
# All should show: Running, 1/1 Ready

# 2. Check pod logs
kubectl logs deployment/vapora-backend -n vapora | tail -10
# Should show successful startup

# 3. API operational
curl http://localhost:8001/health
</code></pre>
<hr />
<h3 id="scenario-4-data-centerregion-outage"><a class="header" href="#scenario-4-data-centerregion-outage">Scenario 4: Data Center/Region Outage</a></h3>
<p><strong>Symptoms</strong>:</p>
<ul>
<li>Entire region unreachable</li>
<li>Multiple infrastructure components down</li>
<li>Network connectivity issues</li>
</ul>
<p><strong>Recovery Steps</strong>:</p>
<h4 id="step-1-declare-regional-failover"><a class="header" href="#step-1-declare-regional-failover">Step 1: Declare Regional Failover</a></h4>
<pre><code class="language-bash"># 1. Confirm region is down
ping production.vapora.com
# Should fail

# Check status page
# Cloud provider should report outage

# 2. Declare failover
declare_failover_to_region("us-west-2")
</code></pre>
<h4 id="step-2-activate-alternate-region"><a class="header" href="#step-2-activate-alternate-region">Step 2: Activate Alternate Region</a></h4>
<pre><code class="language-bash"># 1. Switch kubeconfig to alternate region
export KUBECONFIG=/path/to/backup-region/kubeconfig

# 2. Verify alternate region up
kubectl cluster-info

# 3. Download and restore database
aws s3 cp s3://vapora-backups/database/latest/ . --recursive

# 4. Restore services (as in Scenario 1, Step 4)
</code></pre>
<h4 id="step-3-update-dnsrouting"><a class="header" href="#step-3-update-dnsrouting">Step 3: Update DNS/Routing</a></h4>
<pre><code class="language-bash"># Update DNS to point to alternate region
aws route53 change-resource-record-sets \
  --hosted-zone-id Z123456 \
  --change-batch '{
    "Changes": [{
      "Action": "UPSERT",
      "ResourceRecordSet": {
        "Name": "api.vapora.com",
        "Type": "A",
        "AliasTarget": {
          "HostedZoneId": "Z987654",
          "DNSName": "backup-region-lb.elb.amazonaws.com",
          "EvaluateTargetHealth": false
        }
      }
    }]
  }'

# Wait for DNS propagation (5-10 minutes)
</code></pre>
<h4 id="step-4-verify-failover"><a class="header" href="#step-4-verify-failover">Step 4: Verify Failover</a></h4>
<pre><code class="language-bash"># 1. DNS resolves to new region
nslookup api.vapora.com

# 2. Services accessible
curl https://api.vapora.com/health

# 3. Data intact
curl https://api.vapora.com/api/projects
</code></pre>
<h4 id="step-5-communicate-failover"><a class="header" href="#step-5-communicate-failover">Step 5: Communicate Failover</a></h4>
<pre><code>Post to #incident-critical:

✅ FAILOVER TO ALTERNATE REGION COMPLETE

Primary Region: us-east-1 (Down)
Active Region: us-west-2 (Restored)

Status:
- All services running: ✓
- Database restored: ✓
- Data integrity verified: ✓
- Partial data loss: ~30 minutes of transactions

Estimated Data Loss: 30 minutes (11:30-12:00 UTC)
Current Time: 12:05 UTC

Next steps:
- Monitor alternate region closely
- Begin investigation of primary region
- Plan failback when primary recovered

Questions? /cc @ops-team
</code></pre>
<hr />
<h2 id="post-disaster-recovery"><a class="header" href="#post-disaster-recovery">Post-Disaster Recovery</a></h2>
<h3 id="phase-1-stabilization-ongoing"><a class="header" href="#phase-1-stabilization-ongoing">Phase 1: Stabilization (Ongoing)</a></h3>
<pre><code class="language-bash"># Continue monitoring for 4 hours minimum

# Checks every 15 minutes:
✓ All pods Running
✓ API responding
✓ Database queries working
✓ Error rates normal
✓ Performance baseline
</code></pre>
<h3 id="phase-2-root-cause-analysis"><a class="header" href="#phase-2-root-cause-analysis">Phase 2: Root Cause Analysis</a></h3>
<p><strong>Start within 1 hour of service recovery</strong>:</p>
<pre><code>Questions to answer:

1. What caused the disaster?
   - Hardware failure
   - Software bug
   - Configuration error
   - External attack
   - Human error

2. Why wasn't it detected earlier?
   - Monitoring gap
   - Alert misconfiguration
   - Alert fatigue

3. How did backups perform?
   - Were they accessible?
   - Restore time as expected?
   - Data loss acceptable?

4. What took longest in recovery?
   - Finding backups
   - Restoring database
   - Redeploying services
   - Verifying integrity

5. What can be improved?
   - Faster detection
   - Faster recovery
   - Better documentation
   - More automated recovery
</code></pre>
<h3 id="phase-3-recovery-documentation"><a class="header" href="#phase-3-recovery-documentation">Phase 3: Recovery Documentation</a></h3>
<pre><code>Create post-disaster report:

Timeline:
- 11:30 UTC: Disaster detected
- 11:35 UTC: Database restore started
- 11:50 UTC: Services redeployed
- 12:00 UTC: All systems operational
- Duration: 30 minutes

Impact:
- Users affected: [X]
- Data lost: [X] transactions
- Revenue impact: $[X]

Root cause: [Description]

Contributing factors:
1. [Factor 1]
2. [Factor 2]

Preventive measures:
1. [Action] by [Owner] by [Date]
2. [Action] by [Owner] by [Date]

Lessons learned:
1. [Lesson 1]
2. [Lesson 2]
</code></pre>
<h3 id="phase-4-improvements-implementation"><a class="header" href="#phase-4-improvements-implementation">Phase 4: Improvements Implementation</a></h3>
<p><strong>Due date: Within 2 weeks</strong></p>
<pre><code>Checklist for improvements:

□ Update backup strategy (if needed)
□ Improve monitoring/alerting
□ Automate more recovery steps
□ Update runbooks with learnings
□ Train team on new procedures
□ Test improved procedures
□ Document for future reference
□ Incident retrospective meeting
</code></pre>
<hr />
<h2 id="disaster-recovery-drill"><a class="header" href="#disaster-recovery-drill">Disaster Recovery Drill</a></h2>
<h3 id="quarterly-dr-drill"><a class="header" href="#quarterly-dr-drill">Quarterly DR Drill</a></h3>
<p><strong>Purpose</strong>: Test DR procedures before real disaster</p>
<p><strong>Schedule</strong>: Last Friday of each quarter at 02:00 UTC</p>
<pre><code class="language-bash"># Quarterly DR drill skeleton (Nushell). Prints a banner and one line per drill
# phase, and reports the elapsed time around the restore placeholder.
# Takes no arguments; the commented placeholders mark where the real steps go.
def quarterly_dr_drill [] {
  print "=== QUARTERLY DISASTER RECOVERY DRILL ==="
  # Convert to UTC before formatting so the "UTC" label is accurate; the format
  # pattern is quoted so it is passed to `format date` as a single argument.
  print $"Date: (date now | date to-timezone UTC | format date '%Y-%m-%d %H:%M:%S UTC')"
  print ""

  # 1. Simulate database corruption
  print "1. Simulating database corruption..."
  # Create test database, introduce corruption

  # 2. Test restore procedure
  print "2. Testing restore from backup..."
  # Download backup, restore to test database

  # 3. Measure restore time
  print "3. Measuring restore time..."
  let start_time = (date now)
  # ... restore process ...
  let end_time = (date now)
  let duration = $end_time - $start_time
  print $"Restore time: ($duration)"

  # 4. Verify data integrity
  print "4. Verifying data integrity..."
  # Check restored data matches pre-backup

  # 5. Document results
  print "5. Documenting results..."
  # Record in DR drill log

  print ""
  print "Drill complete"
}
</code></pre>
<h3 id="drill-checklist"><a class="header" href="#drill-checklist">Drill Checklist</a></h3>
<pre><code>Pre-Drill (1 week before):
□ Notify team of scheduled drill
□ Plan specific scenario to test
□ Prepare test environment
□ Have runbooks available

During Drill:
□ Execute scenario as planned
□ Record actual timings
□ Document any issues
□ Note what went well
□ Note what could improve

Post-Drill (within 1 day):
□ Debrief meeting
□ Review recorded times vs. targets
□ Discuss improvements
□ Update runbooks if needed
□ Thank team for participation
□ Document lessons learned

Post-Drill (within 1 week):
□ Implement identified improvements
□ Test improvements
□ Verify procedures updated
□ Archive drill documentation
</code></pre>
<hr />
<h2 id="disaster-recovery-readiness"><a class="header" href="#disaster-recovery-readiness">Disaster Recovery Readiness</a></h2>
<h3 id="recovery-readiness-checklist"><a class="header" href="#recovery-readiness-checklist">Recovery Readiness Checklist</a></h3>
<pre><code>Infrastructure:
□ Primary region configured
□ Backup region prepared
□ Load balancing configured
□ DNS failover configured

Data:
□ Hourly database backups
□ Backups encrypted
□ Backups tested (monthly)
□ Multiple backup locations

Configuration:
□ ConfigMaps backed up (daily)
□ Secrets encrypted and backed up
□ Infrastructure code in Git
□ Deployment manifests versioned

Documentation:
□ Disaster procedures documented
□ Runbooks current and tested
□ Team trained on procedures
□ Escalation paths clear

Testing:
□ Monthly restore test passes
□ Quarterly DR drill scheduled
□ Recovery times meet RTO/RPO

Monitoring:
□ Alerts for backup failures
□ Backup health checks running
□ Recovery procedures monitored
</code></pre>
<h3 id="rtorpa-targets"><a class="header" href="#rtorpa-targets">RTO/RPO Targets</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Scenario</th><th>RTO</th><th>RPO</th></tr></thead><tbody>
<tr><td><strong>Single pod failure</strong></td><td>5 min</td><td>0 min</td></tr>
<tr><td><strong>Database corruption</strong></td><td>1 hour</td><td>1 hour</td></tr>
<tr><td><strong>Node failure</strong></td><td>15 min</td><td>0 min</td></tr>
<tr><td><strong>Region outage</strong></td><td>2 hours</td><td>15 min</td></tr>
<tr><td><strong>Complete cluster loss</strong></td><td>4 hours</td><td>1 hour</td></tr>
</tbody></table>
</div>
<hr />
<h2 id="disaster-recovery-contacts"><a class="header" href="#disaster-recovery-contacts">Disaster Recovery Contacts</a></h2>
<pre><code>Role: Contact: Phone: Slack:
Primary DBA: [Name] [Phone] @[slack]
Backup DBA: [Name] [Phone] @[slack]
Infra Lead: [Name] [Phone] @[slack]
Backup Infra: [Name] [Phone] @[slack]
CTO: [Name] [Phone] @[slack]
Ops Manager: [Name] [Phone] @[slack]

Escalation:
Level 1: [Name] - notify immediately
Level 2: [Name] - notify within 5 min
Level 3: [Name] - notify within 15 min
</code></pre>
<hr />
<h2 id="quick-reference-disaster-steps"><a class="header" href="#quick-reference-disaster-steps">Quick Reference: Disaster Steps</a></h2>
<pre><code>1. ASSESS (First 5 min)
   - Determine disaster severity
   - Assess damage scope
   - Get backup location access

2. COMMUNICATE (Immediately)
   - Declare disaster
   - Notify key personnel
   - Start status updates (every 5 min)

3. RECOVER (Next 30-120 min)
   - Activate backup infrastructure if needed
   - Restore database from latest backup
   - Redeploy applications
   - Verify all systems operational

4. VERIFY (Continuous)
   - Check pod health
   - Verify database connectivity
   - Test API endpoints
   - Monitor error rates

5. STABILIZE (Next 4 hours)
   - Monitor closely
   - Watch for anomalies
   - Verify performance normal
   - Check data integrity

6. INVESTIGATE (Within 1 hour)
   - Root cause analysis
   - Document what happened
   - Plan improvements
   - Update procedures

7. IMPROVE (Within 2 weeks)
   - Implement improvements
   - Test improvements
   - Update documentation
   - Train team
</code></pre>

                    </main>

                    <!-- Mobile previous/next chapter links. The neighbouring chapters are in the
                         same directory as this page, so they are linked by file name. -->
                    <nav class="nav-wrapper" aria-label="Page navigation">
                        <!-- Mobile navigation buttons -->
                            <a rel="prev" href="index.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                                <i class="fa fa-angle-left"></i>
                            </a>

                            <a rel="next prefetch" href="backup-strategy.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
                                <i class="fa fa-angle-right"></i>
                            </a>

                        <div style="clear: both"></div>
                    </nav>
                </div>
            </div>

            <!-- Wide-layout previous/next chapter links. The neighbouring chapters are in the
                 same directory as this page, so they are linked by file name. -->
            <nav class="nav-wide-wrapper" aria-label="Page navigation">
                    <a rel="prev" href="index.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                        <i class="fa fa-angle-left"></i>
                    </a>

                    <a rel="next prefetch" href="backup-strategy.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
                        <i class="fa fa-angle-right"></i>
                    </a>
            </nav>

        </div>


        <script>
            // Sets a global flag on window. NOTE(review): presumably read by book.js
            // (loaded below) to enable copy buttons on code blocks — confirm.
            window.playground_copyable = true;
        </script>


        <script src="../elasticlunr.min.js"></script>
        <script src="../mark.min.js"></script>
        <script src="../searcher.js"></script>

        <script src="../clipboard.min.js"></script>
        <script src="../highlight.js"></script>
        <script src="../book.js"></script>

        <!-- Custom JS scripts -->


    </div>
    </body>
</html>