Vapora/docs/operations/deployment-runbook.html

807 lines
36 KiB
HTML
Raw Normal View History

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>Deployment Runbook - VAPORA Platform Documentation</title>
<!-- Custom HTML head -->
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../favicon.svg">
<link rel="shortcut icon" href="../favicon.png">
<link rel="stylesheet" href="../css/variables.css">
<link rel="stylesheet" href="../css/general.css">
<link rel="stylesheet" href="../css/chrome.css">
<link rel="stylesheet" href="../css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="../fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
// Relative path from this page to the book root (the asset links in <head>
// use the same "../" prefix).
// NOTE(review): presumably read by the external scripts (toc.js, book.js) — confirm.
const path_to_root = "../";
// Theme names used by the theme-bootstrap script further down this page,
// which picks one of them based on the prefers-color-scheme media query.
const default_light_theme = "light";
const default_dark_theme = "dark";
</script>
<!-- Start loading toc.js asap -->
<script src="../toc.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
// Strip the surrounding double quotes from values that were stored in
// localStorage wrapped in quotes.
//
// Each key is processed independently: getItem() returns null for a key that
// has never been set, and a missing 'mdbook-theme' must not prevent the
// 'mdbook-sidebar' value from being repaired (previously the null dereference
// threw and skipped the second key).
//
// Values shorter than two characters are left alone so that a lone '"' is not
// rewritten to an empty string, which classList.add() would later reject.
try {
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        const isQuoted = stored !== null
            && stored.length >= 2
            && stored.startsWith('"')
            && stored.endsWith('"');
        if (isQuoted) {
            localStorage.setItem(key, stored.slice(1, -1));
        }
    }
} catch (e) {
    // Best effort, as before: any failure while accessing localStorage is
    // ignored and the stored values are left untouched.
}
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
// Resolve the theme to apply: a value saved under 'mdbook-theme' in
// localStorage wins; otherwise follow the prefers-color-scheme media query.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
    ? default_dark_theme
    : default_light_theme;
let theme;
try {
    theme = localStorage.getItem('mdbook-theme');
} catch (e) {
    // Reading storage failed; theme stays undefined and the default is used.
}
if (theme == null) {
    // Loose equality covers both null (key absent) and undefined (read failed).
    theme = default_theme;
}
// Replace the 'light' class hard-coded on <html> with the resolved theme and
// mark the document as running with JavaScript.
const html = document.documentElement;
html.classList.remove('light');
html.classList.add(theme);
html.classList.add("js");
</script>
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
// Decide the initial sidebar state. Viewports narrower than 1080px always
// start hidden; wider ones use the value saved under 'mdbook-sidebar' in
// localStorage and fall back to 'visible'.
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
let sidebar = null;
if (document.body.clientWidth < 1080) {
    sidebar = 'hidden';
} else {
    try {
        sidebar = localStorage.getItem('mdbook-sidebar');
    } catch (e) {
        // Reading storage failed; fall through to the default below.
    }
    if (!sidebar) {
        sidebar = 'visible';
    }
}
// Mirror the state onto the toggle checkbox and replace the 'sidebar-visible'
// class hard-coded on <html> with the resolved one.
sidebar_toggle.checked = (sidebar === 'visible');
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
</script>
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
</noscript>
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
</label>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
</button>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
</ul>
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
<i class="fa fa-search"></i>
</button>
</div>
<h1 class="menu-title">VAPORA Platform Documentation</h1>
<div class="right-buttons">
<a href="../print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
</a>
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
<i id="git-repository-button" class="fa fa-github"></i>
</a>
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/deployment-runbook.md" title="Suggest an edit" aria-label="Suggest an edit">
<i id="git-edit-button" class="fa fa-edit"></i>
</a>
</div>
</div>
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
</form>
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
// Expose the initial sidebar state to assistive technology and keep links in
// a hidden sidebar out of the tab order. The braces keep the helper constant
// block-scoped so no additional global name is introduced.
{
    const sidebarShown = sidebar === 'visible';
    const toggleLabel = document.getElementById('sidebar-toggle');
    const sidebarNav = document.getElementById('sidebar');
    toggleLabel.setAttribute('aria-expanded', sidebarShown);
    sidebarNav.setAttribute('aria-hidden', !sidebarShown);
    for (const link of Array.from(document.querySelectorAll('#sidebar a'))) {
        link.setAttribute('tabIndex', sidebarShown ? 0 : -1);
    }
}
</script>
<div id="content" class="content">
<main>
<h1 id="deployment-runbook"><a class="header" href="#deployment-runbook">Deployment Runbook</a></h1>
<p>Step-by-step procedures for deploying VAPORA to staging and production environments.</p>
<hr />
<h2 id="quick-start"><a class="header" href="#quick-start">Quick Start</a></h2>
<p>For experienced operators:</p>
<pre><code class="language-bash"># Validate in CI/CD
# Download artifacts
# Review dry-run
# Apply: kubectl apply -f configmap.yaml -f deployment.yaml -n vapora
# Monitor: kubectl logs -f deployment/vapora-backend -n vapora
# Verify: curl http://localhost:8001/health
</code></pre>
<p>For complete steps, continue reading.</p>
<hr />
<h2 id="before-starting"><a class="header" href="#before-starting">Before Starting</a></h2>
<p><strong>Prerequisites Completed</strong>:</p>
<ul>
<li><input disabled="" type="checkbox"/>
Pre-deployment checklist completed</li>
<li><input disabled="" type="checkbox"/>
Artifacts generated and validated</li>
<li><input disabled="" type="checkbox"/>
Staging deployment verified</li>
<li><input disabled="" type="checkbox"/>
Team ready and monitoring</li>
<li><input disabled="" type="checkbox"/>
Maintenance window announced</li>
</ul>
<p><strong>Access Verified</strong>:</p>
<ul>
<li><input disabled="" type="checkbox"/>
kubectl configured for target cluster</li>
<li><input disabled="" type="checkbox"/>
Can list nodes: <code>kubectl get nodes</code></li>
<li><input disabled="" type="checkbox"/>
Can access namespace: <code>kubectl get namespace vapora</code></li>
</ul>
<p><strong>If any prerequisite is missing</strong>: Go back to the <a href="./pre-deployment-checklist.html">pre-deployment checklist</a>.</p>
<hr />
<h2 id="phase-1-pre-flight-5-minutes"><a class="header" href="#phase-1-pre-flight-5-minutes">Phase 1: Pre-Flight (5 minutes)</a></h2>
<h3 id="11-verify-current-state"><a class="header" href="#11-verify-current-state">1.1 Verify Current State</a></h3>
<pre><code class="language-bash"># Set context
export CLUSTER=production # or staging
export NAMESPACE=vapora
# Verify cluster access
kubectl cluster-info
kubectl get nodes
# Output should show:
# NAME STATUS ROLES AGE
# node-1 Ready worker 30d
# node-2 Ready worker 25d
</code></pre>
<p><strong>What to look for:</strong></p>
<ul>
<li>✓ All nodes in "Ready" state</li>
<li>✓ No "NotReady" or "Unknown" nodes</li>
<li>If issues: Don't proceed, investigate node health</li>
</ul>
<h3 id="12-check-current-deployments"><a class="header" href="#12-check-current-deployments">1.2 Check Current Deployments</a></h3>
<pre><code class="language-bash"># Get current deployment status
kubectl get deployments -n $NAMESPACE -o wide
kubectl get pods -n $NAMESPACE
# Output example:
# NAME READY UP-TO-DATE AVAILABLE
# vapora-backend 3/3 3 3
# vapora-agents 2/2 2 2
# vapora-llm-router 2/2 2 2
</code></pre>
<p><strong>What to look for:</strong></p>
<ul>
<li>✓ All deployments showing correct replica count</li>
<li>✓ All pods in "Running" state</li>
<li>❌ If pods in "CrashLoopBackOff" or "Pending": Investigate before proceeding</li>
</ul>
<h3 id="13-record-current-versions"><a class="header" href="#13-record-current-versions">1.3 Record Current Versions</a></h3>
<pre><code class="language-bash"># Get current image versions (baseline for rollback)
kubectl get deployments -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.template.spec.containers[0].image}{"\n"}{end}'
# Expected output:
# vapora-backend vapora/backend:v1.2.0
# vapora-agents vapora/agents:v1.2.0
# vapora-llm-router vapora/llm-router:v1.2.0
</code></pre>
<p><strong>Record these for rollback</strong>: Keep this output visible</p>
<h3 id="14-get-current-revision-numbers"><a class="header" href="#14-get-current-revision-numbers">1.4 Get Current Revision Numbers</a></h3>
<pre><code class="language-bash"># For each deployment, get rollout history
for deployment in vapora-backend vapora-agents vapora-llm-router; do
echo "=== $deployment ==="
kubectl rollout history deployment/$deployment -n $NAMESPACE | tail -5
done
# Output example:
# REVISION CHANGE-CAUSE
# 42 Deployment rolled out
# 43 Deployment rolled out
# 44 (current)
</code></pre>
<p><strong>Record the highest revision number for each</strong> - this is your rollback reference</p>
<h3 id="15-check-cluster-resources"><a class="header" href="#15-check-cluster-resources">1.5 Check Cluster Resources</a></h3>
<pre><code class="language-bash"># Verify cluster has capacity for new deployment
kubectl top nodes
kubectl describe nodes | grep -A 5 "Allocated resources"
# Example - check memory/CPU availability
# Requested: 8200m (41%)
# Limits: 16400m (82%)
</code></pre>
<p><strong>What to look for:</strong></p>
<ul>
<li>✓ Less than 80% resource utilization</li>
<li>❌ If above 85%: Insufficient capacity, don't proceed</li>
</ul>
<hr />
<h2 id="phase-2-configuration-deployment-3-minutes"><a class="header" href="#phase-2-configuration-deployment-3-minutes">Phase 2: Configuration Deployment (3 minutes)</a></h2>
<h3 id="21-apply-configmap"><a class="header" href="#21-apply-configmap">2.1 Apply ConfigMap</a></h3>
<p>The ConfigMap contains all application configuration.</p>
<pre><code class="language-bash"># First: Dry-run to verify no syntax errors
kubectl apply -f configmap.yaml --dry-run=server -n $NAMESPACE
# Should output:
# configmap/vapora-config configured (server dry run)
# Check for any warnings or errors in output
# If errors, stop and fix the YAML before proceeding
</code></pre>
<p><strong>Troubleshooting</strong>:</p>
<ul>
<li>"error validating": YAML syntax error - fix and retry</li>
<li>"field is immutable": Can't change certain ConfigMap fields - delete and recreate</li>
<li>"resourceQuotaExceeded": Namespace quota exceeded - contact cluster admin</li>
</ul>
<h3 id="22-apply-configmap-for-real"><a class="header" href="#22-apply-configmap-for-real">2.2 Apply ConfigMap for Real</a></h3>
<pre><code class="language-bash"># Apply the actual ConfigMap
kubectl apply -f configmap.yaml -n $NAMESPACE
# Output:
# configmap/vapora-config configured
# Verify it was applied
kubectl get configmap -n $NAMESPACE vapora-config -o yaml | head -20
# Check for your new values in the output
</code></pre>
<p><strong>Verify ConfigMap is correct</strong>:</p>
<pre><code class="language-bash"># Extract specific values to verify
kubectl get configmap vapora-config -n $NAMESPACE -o jsonpath='{.data.vapora\.toml}' | grep "database_url" | head -1
# Should show the correct database URL
</code></pre>
<h3 id="23-annotate-configmap"><a class="header" href="#23-annotate-configmap">2.3 Annotate ConfigMap</a></h3>
<p>Record when this config was deployed for audit trail:</p>
<pre><code class="language-bash">kubectl annotate configmap vapora-config \
-n $NAMESPACE \
deployment.timestamp="$(date -u +'%Y-%m-%dT%H:%M:%SZ')" \
deployment.commit="$(git rev-parse HEAD | cut -c1-8)" \
deployment.branch="$(git rev-parse --abbrev-ref HEAD)" \
--overwrite
# Verify annotation was added
kubectl get configmap vapora-config -n $NAMESPACE -o yaml | grep "deployment\."
</code></pre>
<hr />
<h2 id="phase-3-deployment-update-5-minutes"><a class="header" href="#phase-3-deployment-update-5-minutes">Phase 3: Deployment Update (5 minutes)</a></h2>
<h3 id="31-dry-run-deployment"><a class="header" href="#31-dry-run-deployment">3.1 Dry-Run Deployment</a></h3>
<p>Always dry-run first to catch issues:</p>
<pre><code class="language-bash"># Run deployment dry-run
kubectl apply -f deployment.yaml --dry-run=server -n $NAMESPACE
# Output should show what will be updated:
# deployment.apps/vapora-backend configured (server dry run)
# deployment.apps/vapora-agents configured (server dry run)
# deployment.apps/vapora-llm-router configured (server dry run)
</code></pre>
<p><strong>Check for warnings</strong>:</p>
<ul>
<li>"imagePullBackOff": Docker image doesn't exist</li>
<li>"insufficient quota": Resource limits exceeded</li>
<li>"nodeAffinity": Pod can't be placed on any node</li>
</ul>
<h3 id="32-apply-deployments"><a class="header" href="#32-apply-deployments">3.2 Apply Deployments</a></h3>
<pre><code class="language-bash"># Apply the actual deployments
kubectl apply -f deployment.yaml -n $NAMESPACE
# Output:
# deployment.apps/vapora-backend configured
# deployment.apps/vapora-agents configured
# deployment.apps/vapora-llm-router configured
</code></pre>
<p><strong>Verify deployments updated</strong>:</p>
<pre><code class="language-bash"># Check that new rollout was initiated
kubectl get deployments -n $NAMESPACE -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.observedGeneration}{"\n"}{end}'
# Compare with recorded versions - should be incremented
</code></pre>
<h3 id="33-monitor-rollout-progress"><a class="header" href="#33-monitor-rollout-progress">3.3 Monitor Rollout Progress</a></h3>
<p>Watch the deployment rollout status:</p>
<pre><code class="language-bash"># For each deployment, monitor the rollout
for deployment in vapora-backend vapora-agents vapora-llm-router; do
echo "Waiting for $deployment..."
kubectl rollout status deployment/$deployment \
-n $NAMESPACE \
--timeout=5m
echo "$deployment ready"
done
</code></pre>
<p><strong>What to look for</strong> (per pod update):</p>
<pre><code>Waiting for rollout to finish: 2 of 3 updated replicas are available...
Waiting for rollout to finish: 2 of 3 updated replicas are available...
Waiting for rollout to finish: 3 of 3 updated replicas are available...
deployment "vapora-backend" successfully rolled out
</code></pre>
<p><strong>Expected time: 2-3 minutes per deployment</strong></p>
<h3 id="34-watch-pod-updates-in-separate-terminal"><a class="header" href="#34-watch-pod-updates-in-separate-terminal">3.4 Watch Pod Updates (in separate terminal)</a></h3>
<p>While rollout completes, monitor pods:</p>
<pre><code class="language-bash"># Watch pods being updated in real-time
kubectl get pods -n $NAMESPACE -w
# Output shows updates like:
# NAME READY STATUS
# vapora-backend-abc123-def45 1/1 Running
# vapora-backend-xyz789-old-pod 1/1 Running ← old pod still running
# vapora-backend-abc123-new-pod 0/1 Pending ← new pod starting
# vapora-backend-abc123-new-pod 0/1 ContainerCreating
# vapora-backend-abc123-new-pod 1/1 Running ← new pod ready
# vapora-backend-xyz789-old-pod 1/1 Terminating ← old pod being removed
</code></pre>
<p><strong>What to look for:</strong></p>
<ul>
<li>✓ New pods starting (Pending → ContainerCreating → Running)</li>
<li>✓ Each new pod reaches Running state</li>
<li>✓ Old pods gradually terminating</li>
<li>❌ Pod stuck in "CrashLoopBackOff": Stop, check logs, might need rollback</li>
</ul>
<hr />
<h2 id="phase-4-verification-5-minutes"><a class="header" href="#phase-4-verification-5-minutes">Phase 4: Verification (5 minutes)</a></h2>
<h3 id="41-verify-all-pods-running"><a class="header" href="#41-verify-all-pods-running">4.1 Verify All Pods Running</a></h3>
<pre><code class="language-bash"># Check all pods are ready
kubectl get pods -n $NAMESPACE
# Expected output:
# NAME READY STATUS
# vapora-backend-&lt;hash&gt;-1 1/1 Running
# vapora-backend-&lt;hash&gt;-2 1/1 Running
# vapora-backend-&lt;hash&gt;-3 1/1 Running
# vapora-agents-&lt;hash&gt;-1 1/1 Running
# vapora-agents-&lt;hash&gt;-2 1/1 Running
# vapora-llm-router-&lt;hash&gt;-1 1/1 Running
# vapora-llm-router-&lt;hash&gt;-2 1/1 Running
</code></pre>
<p><strong>Verification</strong>:</p>
<pre><code class="language-bash"># All pods should show READY=1/1
# All pods should show STATUS=Running
# No pods should be in Pending, CrashLoopBackOff, or Error state
# Quick check:
READY=$(kubectl get pods -n $NAMESPACE -o jsonpath='{range .items[*]}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}' | grep -c "True")
TOTAL=$(kubectl get pods -n $NAMESPACE --no-headers | wc -l)
echo "Ready pods: $READY / $TOTAL"
# Should show: Ready pods: 7 / 7 (or your expected pod count)
</code></pre>
<h3 id="42-check-pod-logs-for-errors"><a class="header" href="#42-check-pod-logs-for-errors">4.2 Check Pod Logs for Errors</a></h3>
<pre><code class="language-bash"># Check logs from the last minute for errors
for pod in $(kubectl get pods -n $NAMESPACE -o name); do
echo "=== $pod ==="
kubectl logs $pod -n $NAMESPACE --since=1m 2&gt;&amp;1 | grep -i "error\|exception\|fatal" | head -3
done
# If errors found:
# 1. Note which pods have errors
# 2. Get full log: kubectl logs &lt;pod&gt; -n $NAMESPACE
# 3. Decide: can proceed or need to rollback
</code></pre>
<h3 id="43-verify-service-endpoints"><a class="header" href="#43-verify-service-endpoints">4.3 Verify Service Endpoints</a></h3>
<pre><code class="language-bash"># Check services are exposing pods correctly
kubectl get endpoints -n $NAMESPACE
# Expected output:
# NAME ENDPOINTS
# vapora-backend 10.1.2.3:8001,10.1.2.4:8001,10.1.2.5:8001
# vapora-agents 10.1.2.6:8002,10.1.2.7:8002
# vapora-llm-router 10.1.2.8:8003,10.1.2.9:8003
</code></pre>
<p><strong>Verification</strong>:</p>
<ul>
<li>✓ Each service has multiple endpoints (not empty)</li>
<li>✓ Endpoints match running pods</li>
<li>❌ If empty endpoints: Service can't route traffic</li>
</ul>
<h3 id="44-health-check-endpoints"><a class="header" href="#44-health-check-endpoints">4.4 Health Check Endpoints</a></h3>
<pre><code class="language-bash"># Port-forward to access services locally
kubectl port-forward -n $NAMESPACE svc/vapora-backend 8001:8001 &amp;
# Wait a moment for port-forward to establish
sleep 2
# Check backend health
curl -v http://localhost:8001/health
# Expected response:
# HTTP/1.1 200 OK
# {...healthy response...}
# Check other endpoints
curl http://localhost:8001/api/projects -H "Authorization: Bearer test-token"
</code></pre>
<p><strong>Expected responses</strong>:</p>
<ul>
<li><code>/health</code>: 200 OK with health data</li>
<li><code>/api/projects</code>: 200 OK with projects list</li>
<li><code>/metrics</code>: 200 OK with Prometheus metrics</li>
</ul>
<p><strong>If connection refused</strong>:</p>
<pre><code class="language-bash"># Check if port-forward working
ps aux | grep "port-forward"
# Restart port-forward
pkill -f "port-forward.*svc/vapora-backend"
kubectl port-forward -n $NAMESPACE svc/vapora-backend 8001:8001 &amp;
</code></pre>
<h3 id="45-check-metrics"><a class="header" href="#45-check-metrics">4.5 Check Metrics</a></h3>
<pre><code class="language-bash"># Monitor resource usage of deployed pods
kubectl top pods -n $NAMESPACE
# Expected output:
# NAME CPU(cores) MEMORY(Mi)
# vapora-backend-abc123 250m 512Mi
# vapora-backend-def456 280m 498Mi
# vapora-agents-ghi789 300m 256Mi
</code></pre>
<p><strong>Verification</strong>:</p>
<ul>
<li>✓ CPU usage within expected range (typically 100-500m per pod)</li>
<li>✓ Memory usage within expected range (typically 200-512Mi)</li>
<li>❌ If any pod at 100% CPU/Memory: Performance issue, monitor closely</li>
</ul>
<hr />
<h2 id="phase-5-validation-3-minutes"><a class="header" href="#phase-5-validation-3-minutes">Phase 5: Validation (3 minutes)</a></h2>
<h3 id="51-run-smoke-tests-if-available"><a class="header" href="#51-run-smoke-tests-if-available">5.1 Run Smoke Tests (if available)</a></h3>
<pre><code class="language-bash"># If your project has smoke tests:
kubectl exec -it deployment/vapora-backend -n $NAMESPACE -- \
sh -c "curl http://localhost:8001/health &amp;&amp; echo 'Health check passed'"
# Or run from your local machine:
./scripts/smoke-tests.sh --endpoint http://localhost:8001
</code></pre>
<h3 id="52-check-for-errors-in-logs"><a class="header" href="#52-check-for-errors-in-logs">5.2 Check for Errors in Logs</a></h3>
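<pre><code class="language-bash"># Count error lines logged since deployment started
# Note: "kubectl logs deployment/NAME" reads from only one pod of the deployment;
# use the per-pod loop from step 4.2 to cover every replica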
for deployment in vapora-backend vapora-agents vapora-llm-router; do
echo "=== Checking $deployment ==="
kubectl logs deployment/$deployment -n $NAMESPACE --since=5m 2&gt;&amp;1 | \
grep -i "error\|exception\|failed" | wc -l
done
# If any errors found:
# 1. Get detailed logs
# 2. Determine if critical or expected errors
# 3. Decide to proceed or rollback
</code></pre>
<h3 id="53-compare-against-baseline-metrics"><a class="header" href="#53-compare-against-baseline-metrics">5.3 Compare Against Baseline Metrics</a></h3>
<p>Compare current metrics with pre-deployment baseline:</p>
<pre><code class="language-bash"># Current metrics
echo "=== Current ==="
kubectl top nodes
kubectl top pods -n $NAMESPACE | head -5
# Compare with recorded baseline
# If similar: ✓ Good
# If significantly higher: ⚠️ Watch for issues
# If error rates high: ❌ Consider rollback
</code></pre>
<h3 id="54-check-for-recent-eventswarnings"><a class="header" href="#54-check-for-recent-eventswarnings">5.4 Check for Recent Events/Warnings</a></h3>
<pre><code class="language-bash"># Show the 20 most recent events in the namespace (oldest first)
kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -20
# Watch for:
# - Warning: FailedScheduling (pod won't fit)
# - Warning: PullImageError (image doesn't exist)
# - Warning: ImagePullBackOff (can't download image)
# - Error: ExceededQuota (resource limits)
</code></pre>
<hr />
<h2 id="phase-6-communication-1-minute"><a class="header" href="#phase-6-communication-1-minute">Phase 6: Communication (1 minute)</a></h2>
<h3 id="61-post-deployment-complete"><a class="header" href="#61-post-deployment-complete">6.1 Post Deployment Complete</a></h3>
<pre><code>Post message to #deployments:
🚀 DEPLOYMENT COMPLETE
Deployment: VAPORA Core Services
Mode: Enterprise
Duration: 8 minutes
Status: ✅ Successful
Deployed:
- vapora-backend (v1.2.1)
- vapora-agents (v1.2.1)
- vapora-llm-router (v1.2.1)
Verification:
✓ All pods running
✓ Health checks passing
✓ No error logs
✓ Metrics normal
Next steps:
- Monitor #alerts for any issues
- Check dashboards every 5 minutes for 30 min
- Review logs if any issues detected
Questions? @on-call-engineer
</code></pre>
<h3 id="62-update-status-page"><a class="header" href="#62-update-status-page">6.2 Update Status Page</a></h3>
<pre><code>If using public status page:
UPDATE: Maintenance Complete
VAPORA services have been successfully updated
and are now operating normally.
All systems monitoring nominal.
</code></pre>
<h3 id="63-notify-stakeholders"><a class="header" href="#63-notify-stakeholders">6.3 Notify Stakeholders</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Send message to support team: "Deployment complete, all systems normal"</li>
<li><input disabled="" type="checkbox"/>
Post in #product: "Backend updated to v1.2.1, new features available"</li>
<li><input disabled="" type="checkbox"/>
Update ticket/issue with deployment completion time and status</li>
</ul>
<hr />
<h2 id="phase-7-post-deployment-monitoring-ongoing"><a class="header" href="#phase-7-post-deployment-monitoring-ongoing">Phase 7: Post-Deployment Monitoring (Ongoing)</a></h2>
<h3 id="71-first-5-minutes-watch-closely"><a class="header" href="#71-first-5-minutes-watch-closely">7.1 First 5 Minutes: Watch Closely</a></h3>
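<pre><code class="language-bash"># Keep watching for any issues (run each command in its own terminal)
watch kubectl get pods -n $NAMESPACE
watch kubectl top pods -n $NAMESPACE
# "logs -f" already streams continuously, so it must not be wrapped in watch
kubectl logs -f deployment/vapora-backend -n $NAMESPACE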
</code></pre>
<p><strong>Watch for:</strong></p>
<ul>
<li>Pod restarts (RESTARTS counter increasing)</li>
<li>Increased error logs</li>
<li>Resource usage spikes</li>
<li>Service unreachability</li>
</ul>
<h3 id="72-first-30-minutes-monitor-dashboard"><a class="header" href="#72-first-30-minutes-monitor-dashboard">7.2 First 30 Minutes: Monitor Dashboard</a></h3>
<p>Keep dashboard visible showing:</p>
<ul>
<li>Pod health status</li>
<li>CPU/Memory usage per pod</li>
<li>Request latency (if available)</li>
<li>Error rate</li>
<li>Recent logs</li>
</ul>
<p><strong>Alert triggers for immediate action:</strong></p>
<ul>
<li>Any pod restarting repeatedly</li>
<li>Error rate above 5%</li>
<li>Latency above 2x normal</li>
<li>Pod stuck in Pending state</li>
</ul>
<h3 id="73-first-2-hours-regular-checks"><a class="header" href="#73-first-2-hours-regular-checks">7.3 First 2 Hours: Regular Checks</a></h3>
<pre><code class="language-bash"># Every 10 minutes:
# 1. Pod status:
kubectl get pods -n $NAMESPACE
# 2. Resource usage:
kubectl top pods -n $NAMESPACE
# 3. Check error logs: grep -i error from recent logs
# 4. Check alerts dashboard
</code></pre>
<p><strong>If issues detected</strong>, proceed to Incident Response Runbook</p>
<h3 id="74-after-2-hours-normal-monitoring"><a class="header" href="#74-after-2-hours-normal-monitoring">7.4 After 2 Hours: Normal Monitoring</a></h3>
<p>Return to standard monitoring procedures. Deployment complete.</p>
<hr />
<h2 id="if-issues-detected-quick-rollback"><a class="header" href="#if-issues-detected-quick-rollback">If Issues Detected: Quick Rollback</a></h2>
<p>If problems occur at any point:</p>
<pre><code class="language-bash"># IMMEDIATE: Rollback (1 minute)
for deployment in vapora-backend vapora-agents vapora-llm-router; do
kubectl rollout undo deployment/$deployment -n $NAMESPACE &amp;
done
wait
# Verify rollback completing:
kubectl rollout status deployment/vapora-backend -n $NAMESPACE --timeout=5m
# Confirm services recovering:
curl http://localhost:8001/health
# Post to #deployments:
# 🔙 ROLLBACK EXECUTED
# Issue detected, services rolled back to previous version
# All pods should be recovering now
</code></pre>
<p>See <a href="./rollback-runbook.html">Rollback Runbook</a> for detailed procedures.</p>
<hr />
<h2 id="common-issues--solutions"><a class="header" href="#common-issues--solutions">Common Issues &amp; Solutions</a></h2>
<h3 id="issue-pod-stuck-in-imagepullbackoff"><a class="header" href="#issue-pod-stuck-in-imagepullbackoff">Issue: Pod stuck in ImagePullBackOff</a></h3>
<p><strong>Cause</strong>: Docker image doesn't exist or can't be downloaded</p>
<p><strong>Solution</strong>:</p>
<pre><code class="language-bash"># Check pod events
kubectl describe pod &lt;pod-name&gt; -n $NAMESPACE
# Check image registry access
kubectl get secret -n $NAMESPACE
# Either:
# 1. Verify image name is correct in deployment.yaml
# 2. Push missing image to registry
# 3. Rollback deployment
</code></pre>
<h3 id="issue-pod-stuck-in-crashloopbackoff"><a class="header" href="#issue-pod-stuck-in-crashloopbackoff">Issue: Pod stuck in CrashLoopBackOff</a></h3>
<p><strong>Cause</strong>: Application crashing on startup</p>
<p><strong>Solution</strong>:</p>
<pre><code class="language-bash"># Get pod logs
kubectl logs &lt;pod-name&gt; -n $NAMESPACE --previous
# Fix typically requires config change:
# 1. Fix ConfigMap issue
# 2. Re-apply ConfigMap: kubectl apply -f configmap.yaml -n $NAMESPACE
# 3. Trigger pod restart: kubectl rollout restart deployment/&lt;name&gt; -n $NAMESPACE
# Or rollback if unclear
</code></pre>
<h3 id="issue-pod-in-pending-state"><a class="header" href="#issue-pod-in-pending-state">Issue: Pod in Pending state</a></h3>
<p><strong>Cause</strong>: Node doesn't have capacity or resources</p>
<p><strong>Solution</strong>:</p>
<pre><code class="language-bash"># Describe pod to see why
kubectl describe pod &lt;pod-name&gt; -n $NAMESPACE
# Check for "Insufficient cpu", "Insufficient memory"
kubectl top nodes
# Either:
# 1. Scale down other workloads
# 2. Increase node count
# 3. Reduce resource requirements in deployment.yaml and redeploy
</code></pre>
<h3 id="issue-service-endpoints-empty"><a class="header" href="#issue-service-endpoints-empty">Issue: Service endpoints empty</a></h3>
<p><strong>Cause</strong>: Pods not passing health checks</p>
<p><strong>Solution</strong>:</p>
<pre><code class="language-bash"># Check pod logs for errors
kubectl logs &lt;pod-name&gt; -n $NAMESPACE
# Check pod readiness probe failures
kubectl describe pod &lt;pod-name&gt; -n $NAMESPACE | grep -A 5 "Readiness"
# Fix configuration or rollback
</code></pre>
<hr />
<h2 id="completion-checklist"><a class="header" href="#completion-checklist">Completion Checklist</a></h2>
<ul>
<li><input disabled="" type="checkbox"/>
All pods running and ready</li>
<li><input disabled="" type="checkbox"/>
Health endpoints responding</li>
<li><input disabled="" type="checkbox"/>
No error logs</li>
<li><input disabled="" type="checkbox"/>
Metrics normal</li>
<li><input disabled="" type="checkbox"/>
Deployment communication posted</li>
<li><input disabled="" type="checkbox"/>
Status page updated</li>
<li><input disabled="" type="checkbox"/>
Stakeholders notified</li>
<li><input disabled="" type="checkbox"/>
Monitoring enabled for next 2 hours</li>
<li><input disabled="" type="checkbox"/>
Ticket/issue updated with completion details</li>
</ul>
<hr />
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
<ul>
<li>Continue monitoring per <a href="./monitoring-runbook.html">Monitoring Runbook</a></li>
<li>If issues arise, follow <a href="./incident-response-runbook.html">Incident Response Runbook</a></li>
<li>Document lessons learned</li>
<li>Update runbooks if procedures need improvement</li>
</ul>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="./index.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="./pre-deployment-checklist.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
<div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="./index.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="./pre-deployment-checklist.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
</nav>
</div>
<script>
// Sets a global flag on window before the scripts below are loaded.
// NOTE(review): presumably read by book.js to enable copy buttons on code blocks — confirm.
window.playground_copyable = true;
</script>
<script src="../elasticlunr.min.js"></script>
<script src="../mark.min.js"></script>
<script src="../searcher.js"></script>
<script src="../clipboard.min.js"></script>
<script src="../highlight.js"></script>
<script src="../book.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>