<!DOCTYPE html>
|
|
<html lang="en" class="light sidebar-visible" dir="ltr">
|
|
<head>
|
|
<!-- Book generated using mdBook -->
|
|
<meta charset="UTF-8">
|
|
<title>Rollback Runbook - VAPORA Platform Documentation</title>
|
|
|
|
|
|
<!-- Custom HTML head -->
|
|
|
|
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
|
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
|
<meta name="theme-color" content="#ffffff">
|
|
|
|
<link rel="icon" href="../favicon.svg">
|
|
<link rel="shortcut icon" href="../favicon.png">
|
|
<link rel="stylesheet" href="../css/variables.css">
|
|
<link rel="stylesheet" href="../css/general.css">
|
|
<link rel="stylesheet" href="../css/chrome.css">
|
|
<link rel="stylesheet" href="../css/print.css" media="print">
|
|
|
|
<!-- Fonts -->
|
|
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
|
|
<link rel="stylesheet" href="../fonts/fonts.css">
|
|
|
|
<!-- Highlight.js Stylesheets -->
|
|
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
|
|
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
|
|
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
|
|
|
|
<!-- Custom theme stylesheets -->
|
|
|
|
|
|
<!-- Provide site root and default themes to javascript -->
|
|
<script>
|
|
// Site root prefix and default theme names, declared as globals so the
// scripts that follow on this page can read them.
const path_to_root = "../";
const default_light_theme = "light";
const default_dark_theme = "dark";
|
|
</script>
|
|
<!-- Start loading toc.js asap -->
|
|
<script src="../toc.js"></script>
|
|
</head>
|
|
<body>
|
|
<div id="mdbook-help-container">
|
|
<div id="mdbook-help-popup">
|
|
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
|
|
<div>
|
|
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
|
|
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
|
|
<p>Press <kbd>?</kbd> to show this help</p>
|
|
<p>Press <kbd>Esc</kbd> to hide this help</p>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
<div id="body-container">
|
|
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
|
<script>
|
|
try {
    // Strip one pair of wrapping double quotes from each stored setting.
    // Each key is handled on its own: getItem() returns null for a key that
    // was never set, and a missing theme must not prevent the sidebar value
    // from being repaired (or vice versa).
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        // length >= 2 ensures there really is an opening AND a closing quote,
        // so a lone '"' is left alone instead of becoming an empty string.
        if (stored !== null && stored.length >= 2 && stored.startsWith('"') && stored.endsWith('"')) {
            localStorage.setItem(key, stored.slice(1, -1));
        }
    }
} catch (e) {
    // Best effort only: if storage cannot be accessed the page must still load.
}
|
|
</script>
|
|
|
|
<!-- Set the theme before any content is loaded, prevents flash -->
|
|
<script>
|
|
// Pick the fallback theme from the OS colour-scheme preference.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
    ? default_dark_theme
    : default_light_theme;

// A stored theme wins over the fallback; reading storage is best effort.
let theme;
try {
    theme = localStorage.getItem('mdbook-theme');
} catch (e) { }
// `== null` covers both null (key not set) and undefined (the read threw).
if (theme == null) {
    theme = default_theme;
}

// Swap the class that is hard-coded on <html> for the chosen theme and
// add the "js" class to the root element.
const html = document.documentElement;
html.classList.remove('light');
html.classList.add(theme);
html.classList.add("js");
|
|
</script>
|
|
|
|
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
|
|
|
|
<!-- Hide / unhide sidebar before it is displayed -->
|
|
<script>
|
|
// Decide the initial sidebar state before the sidebar markup is parsed.
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");

// Below 1080px the sidebar always starts hidden; at or above it, the stored
// preference is used and defaults to visible.
let sidebar = 'hidden';
if (document.body.clientWidth >= 1080) {
    let stored = null;
    try { stored = localStorage.getItem('mdbook-sidebar'); } catch (e) { }
    // Accept only the two known states. Any other stored value falls back to
    // 'visible' rather than being concatenated into a class name below.
    sidebar = stored === 'hidden' ? 'hidden' : 'visible';
}

// Mirror the state onto the toggle checkbox and the root element class.
sidebar_toggle.checked = sidebar === 'visible';
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
|
|
</script>
|
|
|
|
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
|
|
<!-- populated by js -->
|
|
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
|
|
<noscript>
|
|
<iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
|
|
</noscript>
|
|
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
|
|
<div class="sidebar-resize-indicator"></div>
|
|
</div>
|
|
</nav>
|
|
|
|
<div id="page-wrapper" class="page-wrapper">
|
|
|
|
<div class="page">
|
|
<div id="menu-bar-hover-placeholder"></div>
|
|
<div id="menu-bar" class="menu-bar sticky">
|
|
<div class="left-buttons">
|
|
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
|
|
<i class="fa fa-bars"></i>
|
|
</label>
|
|
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
|
|
<i class="fa fa-paint-brush"></i>
|
|
</button>
|
|
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
|
|
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
|
|
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
|
|
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
|
|
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
|
|
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
|
|
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
|
|
</ul>
|
|
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
|
|
<i class="fa fa-search"></i>
|
|
</button>
|
|
</div>
|
|
|
|
<h1 class="menu-title">VAPORA Platform Documentation</h1>
|
|
|
|
<div class="right-buttons">
|
|
<a href="../print.html" title="Print this book" aria-label="Print this book">
|
|
<i id="print-button" class="fa fa-print"></i>
|
|
</a>
|
|
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
|
|
<i id="git-repository-button" class="fa fa-github"></i>
|
|
</a>
|
|
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/rollback-runbook.md" title="Suggest an edit" aria-label="Suggest an edit">
|
|
<i id="git-edit-button" class="fa fa-edit"></i>
|
|
</a>
|
|
|
|
</div>
|
|
</div>
|
|
|
|
<div id="search-wrapper" class="hidden">
|
|
<form id="searchbar-outer" class="searchbar-outer">
|
|
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
|
|
</form>
|
|
<div id="searchresults-outer" class="searchresults-outer hidden">
|
|
<div id="searchresults-header" class="searchresults-header"></div>
|
|
<ul id="searchresults">
|
|
</ul>
|
|
</div>
|
|
</div>
|
|
|
|
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
|
<script>
|
|
// Sync ARIA state with the `sidebar` state string set by the earlier inline
// script. Wrapped in a block so the helper constants do not become globals.
{
    const sidebarShown = sidebar === 'visible';
    document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
    document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);

    // Links inside a hidden sidebar are taken out of the tab order.
    const linkTabIndex = sidebarShown ? 0 : -1;
    document.querySelectorAll('#sidebar a').forEach(function (link) {
        link.setAttribute('tabIndex', linkTabIndex);
    });
}
|
|
</script>
|
|
|
|
<div id="content" class="content">
|
|
<main>
|
|
<h1 id="rollback-runbook"><a class="header" href="#rollback-runbook">Rollback Runbook</a></h1>
|
|
<p>Procedures for safely rolling back VAPORA deployments when issues are detected.</p>
|
|
<hr />
|
|
<h2 id="when-to-rollback"><a class="header" href="#when-to-rollback">When to Rollback</a></h2>
|
|
<p>Immediately trigger rollback if any of these occur within 5 minutes of deployment:</p>
|
|
<p>❌ <strong>Critical Issues</strong> (rollback within 1 minute):</p>
|
|
<ul>
|
|
<li>Pod in <code>CrashLoopBackOff</code> (repeatedly restarting)</li>
|
|
<li>All pods unable to start</li>
|
|
<li>Service completely unreachable (0 endpoints)</li>
|
|
<li>Database connection completely broken</li>
|
|
<li>All requests returning 5xx errors</li>
|
|
<li>Service consuming all available memory/CPU</li>
|
|
</ul>
|
|
<p>⚠️ <strong>Serious Issues</strong> (rollback within 5 minutes):</p>
|
|
<ul>
|
|
<li>High error rate (>10% 5xx errors)</li>
|
|
<li>Significant performance degradation (2x+ latency)</li>
|
|
<li>Deployment not completing (stuck pods)</li>
|
|
<li>Unexpected dependency failures</li>
|
|
<li>Data corruption or loss</li>
|
|
</ul>
|
|
<p>✓ <strong>Monitor & Investigate</strong> (don't rollback immediately):</p>
|
|
<ul>
|
|
<li>Single pod failing (might be node issue)</li>
|
|
<li>Transient network errors</li>
|
|
<li>Gradual performance degradation (might be load)</li>
|
|
<li>Expected warnings in logs</li>
|
|
</ul>
|
|
<hr />
|
|
<h2 id="kubernetes-rollback-automatic"><a class="header" href="#kubernetes-rollback-automatic">Kubernetes Rollback (Automatic)</a></h2>
|
|
<h3 id="step-1-assess-situation-30-seconds"><a class="header" href="#step-1-assess-situation-30-seconds">Step 1: Assess Situation (30 seconds)</a></h3>
|
|
<pre><code class="language-bash"># Set up environment
|
|
export NAMESPACE=vapora
|
|
export CLUSTER=production # or staging
|
|
|
|
# Verify you're on correct cluster
|
|
kubectl config current-context
kubectl cluster-info | head -n 1
|
|
|
|
# STOP if you're on wrong cluster!
|
|
# Correct cluster should be production URL
|
|
</code></pre>
|
|
<h3 id="step-2-check-current-status"><a class="header" href="#step-2-check-current-status">Step 2: Check Current Status</a></h3>
|
|
<pre><code class="language-bash"># See what's happening right now
|
|
kubectl get deployments -n $NAMESPACE
|
|
kubectl get pods -n $NAMESPACE
|
|
|
|
# Output should show the broken state that triggered rollback
|
|
</code></pre>
|
|
<p><strong>Critical check:</strong></p>
|
|
<pre><code class="language-bash"># How many pods are actually running?
|
|
RUNNING=$(kubectl get pods -n $NAMESPACE --field-selector=status.phase=Running --no-headers | wc -l)
|
|
TOTAL=$(kubectl get pods -n $NAMESPACE --no-headers | wc -l)
|
|
|
|
echo "Pods running: $RUNNING / $TOTAL"
|
|
|
|
# If 0/X: Critical, rollback immediately
|
|
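# If X/X: Investigate before rollback (might not need to)
# Note: pods in CrashLoopBackOff still report phase Running, so also check the READY column of 'kubectl get pods'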
|
|
</code></pre>
|
|
<h3 id="step-3-identify-which-deployment-failed"><a class="header" href="#step-3-identify-which-deployment-failed">Step 3: Identify Which Deployment Failed</a></h3>
|
|
<pre><code class="language-bash"># Check which deployment has issues
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
|
|
echo "=== $deployment ==="
|
|
kubectl get deployment $deployment -n $NAMESPACE -o wide
|
|
kubectl get pods -n $NAMESPACE -l app=$deployment
|
|
done
|
|
|
|
# Example: backend has ReplicaSet mismatch
|
|
# DESIRED CURRENT UPDATED AVAILABLE
|
|
# 3 3 3 0 ← Problem: no pods available
|
|
</code></pre>
|
|
<p><strong>Decide</strong>: Rollback all or specific deployment?</p>
|
|
<ul>
|
|
<li>If all services down: Rollback all</li>
|
|
<li>If only backend issues: Rollback backend only</li>
|
|
</ul>
|
|
<h3 id="step-4-get-rollout-history"><a class="header" href="#step-4-get-rollout-history">Step 4: Get Rollout History</a></h3>
|
|
<pre><code class="language-bash"># Show deployment revisions to see what to rollback to
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
|
|
echo "=== $deployment ==="
|
|
kubectl rollout history deployment/$deployment -n $NAMESPACE | tail -5
|
|
done
|
|
|
|
# Output:
|
|
# REVISION CHANGE-CAUSE
|
|
# 42 Deployment rolled out
|
|
# 43 Deployment rolled out
|
|
# 44 (current - the one with issues)
|
|
</code></pre>
|
|
<p><strong>Key</strong>: Revision numbers increase with each deployment</p>
|
|
<h3 id="step-5-execute-rollback"><a class="header" href="#step-5-execute-rollback">Step 5: Execute Rollback</a></h3>
|
|
<pre><code class="language-bash"># Option A: Rollback all three services
|
|
echo "🔙 Rolling back all services..."
|
|
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
|
|
echo "Rolling back $deployment..."
|
|
kubectl rollout undo deployment/$deployment -n $NAMESPACE
|
|
echo "✓ $deployment undo initiated"
|
|
done
|
|
|
|
# Wait for all rollbacks
|
|
echo "⏳ Waiting for rollback to complete..."
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
|
|
kubectl rollout status deployment/$deployment -n $NAMESPACE --timeout=5m
|
|
done
|
|
|
|
echo "✓ All services rolled back"
|
|
</code></pre>
|
|
<p><strong>Option B: Rollback specific deployment</strong></p>
|
|
<pre><code class="language-bash"># If only backend has issues
|
|
kubectl rollout undo deployment/vapora-backend -n $NAMESPACE
|
|
|
|
# Monitor rollback
|
|
kubectl rollout status deployment/vapora-backend -n $NAMESPACE --timeout=5m
|
|
</code></pre>
|
|
<p><strong>Option C: Rollback to specific revision</strong></p>
|
|
<pre><code class="language-bash"># If you need to skip the immediate previous version
|
|
# Find the working revision number from history
|
|
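TARGET_REVISION=42 # Example only: revision numbers are tracked per deployment, so confirm this revision exists in each deployment's history (Step 4)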
|
|
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
|
|
echo "Rolling back $deployment to revision $TARGET_REVISION..."
|
|
kubectl rollout undo deployment/$deployment -n $NAMESPACE \
|
|
--to-revision=$TARGET_REVISION
|
|
done
|
|
|
|
# Verify rollback
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
  kubectl rollout status deployment/$deployment -n $NAMESPACE --timeout=5m
done
|
|
</code></pre>
|
|
<h3 id="step-6-monitor-rollback-progress"><a class="header" href="#step-6-monitor-rollback-progress">Step 6: Monitor Rollback Progress</a></h3>
|
|
<p>In a <strong>separate terminal</strong>, watch the rollback happening:</p>
|
|
<pre><code class="language-bash"># Watch pods being recreated with old version
|
|
kubectl get pods -n $NAMESPACE -w
|
|
|
|
# Output shows:
|
|
# vapora-backend-abc123-newhash 1/1 Terminating ← failed (newly deployed) pods being removed
|
|
# vapora-backend-def456-oldhash 0/1 Pending ← previous pods restarting
|
|
# vapora-backend-def456-oldhash 1/1 Running ← previous pods ready
|
|
</code></pre>
|
|
<p><strong>Expected timeline:</strong></p>
|
|
<ul>
|
|
<li>0-30 seconds: Old pods terminating, new pods starting</li>
|
|
<li>30-90 seconds: New pods starting up (ContainerCreating)</li>
|
|
<li>90-180 seconds: New pods reaching Running state</li>
|
|
</ul>
|
|
<h3 id="step-7-verify-rollback-complete"><a class="header" href="#step-7-verify-rollback-complete">Step 7: Verify Rollback Complete</a></h3>
|
|
<pre><code class="language-bash"># After rollout status shows "successfully rolled out"
|
|
|
|
# Verify all pods are running
|
|
kubectl get pods -n $NAMESPACE
|
|
|
|
# All should show:
|
|
# STATUS: Running
|
|
# READY: 1/1
|
|
|
|
# Verify service endpoints exist
|
|
kubectl get endpoints -n $NAMESPACE
|
|
|
|
# All services should have endpoints like:
|
|
# NAME ENDPOINTS
|
|
# vapora-backend 10.x.x.x:8001,10.x.x.x:8001,10.x.x.x:8001
|
|
</code></pre>
|
|
<h3 id="step-8-health-check"><a class="header" href="#step-8-health-check">Step 8: Health Check</a></h3>
|
|
<pre><code class="language-bash"># Port-forward to test services
|
|
kubectl port-forward -n $NAMESPACE svc/vapora-backend 8001:8001 &
|
|
sleep 2
|
|
|
|
# Test health endpoint
|
|
curl -v http://localhost:8001/health
|
|
|
|
# Expected: HTTP 200 OK with health data
|
|
</code></pre>
|
|
<p><strong>If health check fails:</strong></p>
|
|
<pre><code class="language-bash"># Check pod logs for errors
|
|
kubectl logs deployment/vapora-backend -n $NAMESPACE --tail=50
|
|
|
|
# See what's wrong, might need further investigation
|
|
# Possibly need to rollback to earlier version
|
|
</code></pre>
|
|
<h3 id="step-9-check-logs-for-success"><a class="header" href="#step-9-check-logs-for-success">Step 9: Check Logs for Success</a></h3>
|
|
<pre><code class="language-bash"># Verify no errors in the last 2 minutes of logs from the rolled-back pods
|
|
kubectl logs deployment/vapora-backend -n $NAMESPACE --since=2m | \
|
|
grep -i "error\|exception\|failed" | head -10
|
|
|
|
# Should return no (or very few) errors
|
|
</code></pre>
|
|
<h3 id="step-10-verify-version-reverted"><a class="header" href="#step-10-verify-version-reverted">Step 10: Verify Version Reverted</a></h3>
|
|
<pre><code class="language-bash"># Confirm we're back to previous version
|
|
kubectl get deployments -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.template.spec.containers[0].image}{"\n"}{end}'
|
|
|
|
# Output should show previous image versions:
|
|
# vapora-backend vapora/backend:v1.2.0 (not v1.2.1)
|
|
# vapora-agents vapora/agents:v1.2.0
|
|
# vapora-llm-router vapora/llm-router:v1.2.0
|
|
</code></pre>
|
|
<hr />
|
|
<h2 id="docker-rollback-manual"><a class="header" href="#docker-rollback-manual">Docker Rollback (Manual)</a></h2>
|
|
<p>For Docker Compose deployments (not Kubernetes):</p>
|
|
<h3 id="step-1-assess-current-state"><a class="header" href="#step-1-assess-current-state">Step 1: Assess Current State</a></h3>
|
|
<pre><code class="language-bash"># Check running containers
|
|
docker compose ps
|
|
|
|
# Check logs for errors
|
|
docker compose logs --tail=50 backend
|
|
</code></pre>
|
|
<h3 id="step-2-stop-services"><a class="header" href="#step-2-stop-services">Step 2: Stop Services</a></h3>
|
|
<pre><code class="language-bash"># Stop all services gracefully
|
|
docker compose down
|
|
|
|
# Verify stopped
|
|
docker ps | grep vapora
|
|
# Should return nothing
|
|
|
|
# Wait a moment for graceful shutdown
|
|
sleep 5
|
|
</code></pre>
|
|
<h3 id="step-3-restore-previous-configuration"><a class="header" href="#step-3-restore-previous-configuration">Step 3: Restore Previous Configuration</a></h3>
|
|
<pre><code class="language-bash"># Option A: Git history
|
|
cd deploy/docker
|
|
git log docker-compose.yml | head -5
|
|
git checkout HEAD~1 docker-compose.yml
|
|
|
|
# Option B: Backup file
|
|
cp docker-compose.yml docker-compose.yml.broken
|
|
cp docker-compose.yml.backup docker-compose.yml
|
|
|
|
# Option C: Manual
|
|
# Edit docker-compose.yml to use previous image versions
|
|
# Example: change backend service image from v1.2.1 to v1.2.0
|
|
</code></pre>
|
|
<h3 id="step-4-restart-services"><a class="header" href="#step-4-restart-services">Step 4: Restart Services</a></h3>
|
|
<pre><code class="language-bash"># Start services with previous configuration
|
|
docker compose up -d
|
|
|
|
# Wait for startup
|
|
sleep 5
|
|
|
|
# Verify services running
|
|
docker compose ps
|
|
|
|
# Should show all services with status "Up"
|
|
</code></pre>
|
|
<h3 id="step-5-verify-health"><a class="header" href="#step-5-verify-health">Step 5: Verify Health</a></h3>
|
|
<pre><code class="language-bash"># Check container logs
|
|
docker compose logs backend | tail -20
|
|
|
|
# Test health endpoint
|
|
curl -v http://localhost:8001/health
|
|
|
|
# Expected: HTTP 200 OK
|
|
</code></pre>
|
|
<h3 id="step-6-check-services"><a class="header" href="#step-6-check-services">Step 6: Check Services</a></h3>
|
|
<pre><code class="language-bash"># Verify all services responding
|
|
docker compose exec backend curl http://localhost:8001/health
|
|
docker compose exec frontend curl http://localhost:3000 --head
|
|
|
|
# All should return successful responses
|
|
</code></pre>
|
|
<hr />
|
|
<h2 id="post-rollback-procedures"><a class="header" href="#post-rollback-procedures">Post-Rollback Procedures</a></h2>
|
|
<h3 id="immediate-within-5-minutes"><a class="header" href="#immediate-within-5-minutes">Immediate (Within 5 minutes)</a></h3>
|
|
<pre><code class="language-bash"># 1. Verify all services healthy
|
|
✓ All pods running
|
|
✓ Health endpoints responding
|
|
✓ No error logs
|
|
✓ Service endpoints populated
|
|
|
|
# 2. Communicate to team
|
|
</code></pre>
|
|
<h3 id="communication"><a class="header" href="#communication">Communication</a></h3>
|
|
<pre><code>Post to #deployments:
|
|
|
|
🔙 ROLLBACK EXECUTED
|
|
|
|
Issue detected in deployment v1.2.1
|
|
All services rolled back to v1.2.0
|
|
|
|
Status: ✅ Services recovering
|
|
- All pods: Running
|
|
- Health checks: Passing
|
|
- Endpoints: Responding
|
|
|
|
Timeline:
|
|
- Issue detected: HH:MM UTC
|
|
- Rollback initiated: HH:MM UTC
|
|
- Services recovered: HH:MM UTC (5 minutes)
|
|
|
|
Next:
|
|
- Investigate root cause
|
|
- Fix issue
|
|
- Prepare corrected deployment
|
|
|
|
Questions? @on-call-engineer
|
|
</code></pre>
|
|
<h3 id="investigation--root-cause"><a class="header" href="#investigation--root-cause">Investigation & Root Cause</a></h3>
|
|
<pre><code class="language-bash"># While services are recovered, investigate what went wrong
|
|
|
|
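# 1. Save logs from failed deployment
#    (capture these before rolling back where possible: once the failed pods
#    are gone, this command returns logs from the restored pods instead)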
|
|
kubectl logs deployment/vapora-backend -n $NAMESPACE \
|
|
--timestamps=true \
|
|
> failed-deployment-backend.log
|
|
|
|
# 2. Save pod events
|
|
kubectl describe pod $(kubectl get pods -n $NAMESPACE \
|
|
-l app=vapora-backend --sort-by=.metadata.creationTimestamp \
|
|
| tail -1 | awk '{print $1}') \
|
|
-n $NAMESPACE > failed-pod-events.log
|
|
|
|
# 3. Archive ConfigMap from failed deployment (if changed)
|
|
kubectl get configmap -n $NAMESPACE vapora-config -o yaml > configmap-failed.yaml
|
|
|
|
# 4. Compare with previous good state
|
|
diff configmap-previous.yaml configmap-failed.yaml
|
|
|
|
# 5. Check what changed in code
|
|
git diff HEAD~1 HEAD provisioning/
|
|
</code></pre>
|
|
<h3 id="decision-what-went-wrong"><a class="header" href="#decision-what-went-wrong">Decision: What Went Wrong?</a></h3>
|
|
<p>Common issues and investigation paths:</p>
|
|
<div class="table-wrapper"><table><thead><tr><th>Issue</th><th>Investigation</th><th>Action</th></tr></thead><tbody>
|
|
<tr><td><strong>Config syntax error</strong></td><td>Check ConfigMap YAML</td><td>Fix YAML, test locally with yq</td></tr>
|
|
<tr><td><strong>Missing environment variable</strong></td><td>Check pod logs for "not found"</td><td>Update ConfigMap with value</td></tr>
|
|
<tr><td><strong>Database connection</strong></td><td>Check database connectivity</td><td>Verify DB URL in ConfigMap</td></tr>
|
|
<tr><td><strong>Resource exhaustion</strong></td><td>Check kubectl top, pod events</td><td>Increase resources or reduce replicas</td></tr>
|
|
<tr><td><strong>Image missing</strong></td><td>Check ImagePullBackOff event</td><td>Verify image pushed to registry</td></tr>
|
|
<tr><td><strong>Permission issue</strong></td><td>Check RBAC, logs for "forbidden"</td><td>Update service account permissions</td></tr>
|
|
</tbody></table>
|
|
</div>
|
|
<h3 id="post-rollback-review"><a class="header" href="#post-rollback-review">Post-Rollback Review</a></h3>
|
|
<p>Schedule within 24 hours:</p>
|
|
<pre><code>DEPLOYMENT POST-MORTEM
|
|
|
|
Deployment: v1.2.1
|
|
Outcome: ❌ Rolled back
|
|
|
|
Timeline:
|
|
- Deployed: 2026-01-12 14:00 UTC
|
|
- Issue detected: 14:05 UTC
|
|
- Rollback completed: 14:10 UTC
|
|
- Impact duration: 5 minutes
|
|
|
|
Root Cause: [describe what went wrong]
|
|
|
|
Why not caught before:
|
|
- [ ] Testing incomplete
|
|
- [ ] Config not validated
|
|
- [ ] Monitoring missed issue
|
|
- [ ] Other: [describe]
|
|
|
|
Prevention for next time:
|
|
1. [action item]
|
|
2. [action item]
|
|
3. [action item]
|
|
|
|
Owner: [person responsible for follow-up]
|
|
Deadline: [date]
|
|
</code></pre>
|
|
<hr />
|
|
<h2 id="rollback-emergency-procedures"><a class="header" href="#rollback-emergency-procedures">Rollback Emergency Procedures</a></h2>
|
|
<h3 id="if-services-still-down-after-rollback"><a class="header" href="#if-services-still-down-after-rollback">If Services Still Down After Rollback</a></h3>
|
|
<pre><code class="language-bash"># Services not recovering - emergency procedures
|
|
|
|
# 1. Check if rollback actually happened
|
|
kubectl get deployments -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.template.spec.containers[0].image}{"\n"}{end}'
|
|
|
|
# If image is still new version:
|
|
# - Rollback might have failed
|
|
# - Try manual version specification
|
|
|
|
# 2. Force rollback to specific revision
|
|
# (41 is an example: use a known-good revision from 'kubectl rollout history')
kubectl rollout undo deployment/vapora-backend -n $NAMESPACE --to-revision=41
|
|
|
|
# 3. If still failing, delete and recreate pods
|
|
kubectl delete pods -n $NAMESPACE -l app=vapora-backend
|
|
# Pods will restart via deployment
|
|
|
|
# 4. Last resort: Scale down and up
|
|
kubectl scale deployment/vapora-backend --replicas=0 -n $NAMESPACE
|
|
sleep 10
|
|
kubectl scale deployment/vapora-backend --replicas=3 -n $NAMESPACE
|
|
|
|
# 5. Monitor restart
|
|
kubectl get pods -n $NAMESPACE -w
|
|
</code></pre>
|
|
<h3 id="if-database-corrupted"><a class="header" href="#if-database-corrupted">If Database Corrupted</a></h3>
|
|
<pre><code class="language-bash"># Only do this if you have recent backups
|
|
|
|
# 1. Identify corruption
|
|
kubectl logs deployment/vapora-backend -n $NAMESPACE | grep -i "corruption\|data"
|
|
|
|
# 2. Restore from backup (requires DBA support)
|
|
# Contact database team
|
|
|
|
# 3. Verify data integrity
|
|
# Run validation queries/commands
|
|
|
|
# 4. Notify stakeholders immediately
|
|
</code></pre>
|
|
<h3 id="if-all-else-fails"><a class="header" href="#if-all-else-fails">If All Else Fails</a></h3>
|
|
<pre><code class="language-bash"># Complete infrastructure recovery
|
|
|
|
# 1. Escalate to Infrastructure team
|
|
# 2. Activate Disaster Recovery procedures
|
|
# 3. Failover to backup environment if available
|
|
# 4. Engage senior engineers for investigation
|
|
</code></pre>
|
|
<hr />
|
|
<h2 id="prevention--lessons-learned"><a class="header" href="#prevention--lessons-learned">Prevention & Lessons Learned</a></h2>
|
|
<p>After every rollback:</p>
|
|
<ol>
|
|
<li>
|
|
<p><strong>Root Cause Analysis</strong></p>
|
|
<ul>
|
|
<li>What actually went wrong?</li>
|
|
<li>Why wasn't it caught before deployment?</li>
|
|
<li>What can prevent this in the future?</li>
|
|
</ul>
|
|
</li>
|
|
<li>
|
|
<p><strong>Testing Improvements</strong></p>
|
|
<ul>
|
|
<li>Add test case for failure scenario</li>
|
|
<li>Update pre-deployment checklist</li>
|
|
<li>Improve staging validation</li>
|
|
</ul>
|
|
</li>
|
|
<li>
|
|
<p><strong>Monitoring Improvements</strong></p>
|
|
<ul>
|
|
<li>Add alert for this failure mode</li>
|
|
<li>Improve alerting sensitivity</li>
|
|
<li>Document expected vs abnormal logs</li>
|
|
</ul>
|
|
</li>
|
|
<li>
|
|
<p><strong>Documentation</strong></p>
|
|
<ul>
|
|
<li>Update runbooks with new learnings</li>
|
|
<li>Document this specific failure scenario</li>
|
|
<li>Share with team</li>
|
|
</ul>
|
|
</li>
|
|
</ol>
|
|
<hr />
|
|
<h2 id="rollback-checklist"><a class="header" href="#rollback-checklist">Rollback Checklist</a></h2>
|
|
<pre><code>☐ Confirmed critical issue requiring rollback
|
|
☐ Verified correct cluster and namespace
|
|
☐ Checked rollout history
|
|
☐ Executed rollback command (all services or specific)
|
|
☐ Monitored rollback progress (5-10 min wait)
|
|
☐ Verified all pods running
|
|
☐ Verified health endpoints responding
|
|
☐ Confirmed version reverted
|
|
☐ Posted communication to #deployments
|
|
☐ Notified on-call engineer: "rollback complete"
|
|
☐ Scheduled root cause analysis
|
|
☐ Saved logs for investigation
|
|
☐ Started post-mortem process
|
|
</code></pre>
|
|
<hr />
|
|
<h2 id="reference-quick-rollback-commands"><a class="header" href="#reference-quick-rollback-commands">Reference: Quick Rollback Commands</a></h2>
|
|
<p>For experienced operators:</p>
|
|
<pre><code class="language-bash"># One-liner: Rollback all services
|
|
export NS=vapora; for d in vapora-backend vapora-agents vapora-llm-router; do kubectl rollout undo deployment/$d -n $NS & done; wait
|
|
|
|
# Quick verification
|
|
kubectl get pods -n $NS && kubectl get endpoints -n $NS
|
|
|
|
# Health check
|
|
kubectl port-forward -n $NS svc/vapora-backend 8001:8001 &
|
|
sleep 2 && curl http://localhost:8001/health
|
|
</code></pre>
|
|
|
|
</main>
|
|
|
|
<nav class="nav-wrapper" aria-label="Page navigation">
|
|
<!-- Mobile navigation buttons -->
|
|
<a rel="prev" href="incident-response-runbook.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
|
<i class="fa fa-angle-left"></i>
|
|
</a>
|
|
|
|
<a rel="next prefetch" href="backup-recovery-automation.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
|
<i class="fa fa-angle-right"></i>
|
|
</a>
|
|
|
|
<div style="clear: both"></div>
|
|
</nav>
|
|
</div>
|
|
</div>
|
|
|
|
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
|
<a rel="prev" href="incident-response-runbook.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
|
<i class="fa fa-angle-left"></i>
|
|
</a>
|
|
|
|
<a rel="next prefetch" href="backup-recovery-automation.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
|
<i class="fa fa-angle-right"></i>
|
|
</a>
|
|
</nav>
|
|
|
|
</div>
|
|
|
|
|
|
|
|
|
|
<script>
|
|
// Flag set on `window` for the book scripts loaded below. Presumably enables
// copy buttons on playground code blocks; TODO confirm against book.js.
window.playground_copyable = true;
|
|
</script>
|
|
|
|
|
|
<script src="../elasticlunr.min.js"></script>
|
|
<script src="../mark.min.js"></script>
|
|
<script src="../searcher.js"></script>
|
|
|
|
<script src="../clipboard.min.js"></script>
|
|
<script src="../highlight.js"></script>
|
|
<script src="../book.js"></script>
|
|
|
|
<!-- Custom JS scripts -->
|
|
|
|
|
|
</div>
|
|
</body>
|
|
</html>
|