776 lines
32 KiB
HTML
776 lines
32 KiB
HTML
|
|
<!DOCTYPE HTML>
|
||
|
|
<html lang="en" class="light sidebar-visible" dir="ltr">
|
||
|
|
<head>
|
||
|
|
<!-- Book generated using mdBook -->
|
||
|
|
<meta charset="UTF-8">
|
||
|
|
<title>Incident Response Runbook - VAPORA Platform Documentation</title>
|
||
|
|
|
||
|
|
|
||
|
|
<!-- Custom HTML head -->
|
||
|
|
|
||
|
|
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
|
||
|
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||
|
|
<meta name="theme-color" content="#ffffff">
|
||
|
|
|
||
|
|
<link rel="icon" href="../favicon.svg">
|
||
|
|
<link rel="shortcut icon" href="../favicon.png">
|
||
|
|
<link rel="stylesheet" href="../css/variables.css">
|
||
|
|
<link rel="stylesheet" href="../css/general.css">
|
||
|
|
<link rel="stylesheet" href="../css/chrome.css">
|
||
|
|
<link rel="stylesheet" href="../css/print.css" media="print">
|
||
|
|
|
||
|
|
<!-- Fonts -->
|
||
|
|
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
|
||
|
|
<link rel="stylesheet" href="../fonts/fonts.css">
|
||
|
|
|
||
|
|
<!-- Highlight.js Stylesheets -->
|
||
|
|
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
|
||
|
|
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
|
||
|
|
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
|
||
|
|
|
||
|
|
<!-- Custom theme stylesheets -->
|
||
|
|
|
||
|
|
|
||
|
|
<!-- Provide site root and default themes to javascript -->
|
||
|
|
<script>
|
||
|
|
// Relative path from this page back to the book's root directory.
const path_to_root = "../";
|
||
|
|
// Theme name used as the default when the browser does not report a dark color-scheme preference.
const default_light_theme = "light";
|
||
|
|
// Theme name used as the default when the browser reports prefers-color-scheme: dark.
const default_dark_theme = "dark";
|
||
|
|
</script>
|
||
|
|
<!-- Start loading toc.js asap -->
|
||
|
|
<script src="../toc.js"></script>
|
||
|
|
</head>
|
||
|
|
<body>
|
||
|
|
<div id="mdbook-help-container">
|
||
|
|
<div id="mdbook-help-popup">
|
||
|
|
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
|
||
|
|
<div>
|
||
|
|
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
|
||
|
|
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
|
||
|
|
<p>Press <kbd>?</kbd> to show this help</p>
|
||
|
|
<p>Press <kbd>Esc</kbd> to hide this help</p>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
<div id="body-container">
|
||
|
|
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
||
|
|
<script>
|
||
|
|
// Normalize persisted settings that were stored wrapped in double quotes
// (e.g. '"navy"' instead of 'navy') by stripping one surrounding pair.
// Best effort only: any failure while touching localStorage is ignored.
try {
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        // getItem() returns null for a key that was never set. Skip it instead
        // of calling string methods on null, so that a missing theme entry
        // cannot abort the clean-up of the sidebar entry (and vice versa).
        if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
            localStorage.setItem(key, stored.slice(1, stored.length - 1));
        }
    }
} catch (e) { }
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<!-- Set the theme before any content is loaded, prevents flash -->
|
||
|
|
<script>
|
||
|
|
// Choose the fallback theme from the browser's color-scheme preference.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
    ? default_dark_theme
    : default_light_theme;

// Prefer the stored theme; if the read fails, `theme` stays undefined
// and the fallback below is used.
let theme;
try {
    theme = localStorage.getItem('mdbook-theme');
} catch (e) { }
if (theme == null) {
    // Covers both null (nothing stored) and undefined (read failed).
    theme = default_theme;
}

// Swap the static 'light' class on <html> for the resolved theme and
// mark the document as script-enabled.
const html = document.documentElement;
html.classList.remove('light');
html.classList.add(theme);
html.classList.add("js");
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
|
||
|
|
|
||
|
|
<!-- Hide / unhide sidebar before it is displayed -->
|
||
|
|
<script>
|
||
|
|
// Decide the initial sidebar state ('visible' or 'hidden') and reflect it
// on the toggle checkbox and on the <html> class list.
let sidebar = null;
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
if (document.body.clientWidth < 1080) {
    // Narrow viewport: always start with the sidebar hidden.
    sidebar = 'hidden';
} else {
    // Wide viewport: use the stored state, falling back to 'visible' when
    // nothing usable is stored or the read fails.
    try {
        sidebar = localStorage.getItem('mdbook-sidebar');
    } catch (e) { }
    if (!sidebar) {
        sidebar = 'visible';
    }
}
sidebar_toggle.checked = (sidebar === 'visible');
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
|
||
|
|
<!-- populated by js -->
|
||
|
|
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
|
||
|
|
<noscript>
|
||
|
|
<iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
|
||
|
|
</noscript>
|
||
|
|
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
|
||
|
|
<div class="sidebar-resize-indicator"></div>
|
||
|
|
</div>
|
||
|
|
</nav>
|
||
|
|
|
||
|
|
<div id="page-wrapper" class="page-wrapper">
|
||
|
|
|
||
|
|
<div class="page">
|
||
|
|
<div id="menu-bar-hover-placeholder"></div>
|
||
|
|
<div id="menu-bar" class="menu-bar sticky">
|
||
|
|
<div class="left-buttons">
|
||
|
|
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
|
||
|
|
<i class="fa fa-bars"></i>
|
||
|
|
</label>
|
||
|
|
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
|
||
|
|
<i class="fa fa-paint-brush"></i>
|
||
|
|
</button>
|
||
|
|
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
|
||
|
|
</ul>
|
||
|
|
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
|
||
|
|
<i class="fa fa-search"></i>
|
||
|
|
</button>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<h1 class="menu-title">VAPORA Platform Documentation</h1>
|
||
|
|
|
||
|
|
<div class="right-buttons">
|
||
|
|
<a href="../print.html" title="Print this book" aria-label="Print this book">
|
||
|
|
<i id="print-button" class="fa fa-print"></i>
|
||
|
|
</a>
|
||
|
|
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
|
||
|
|
<i id="git-repository-button" class="fa fa-github"></i>
|
||
|
|
</a>
|
||
|
|
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/incident-response-runbook.md" title="Suggest an edit" aria-label="Suggest an edit">
|
||
|
|
<i id="git-edit-button" class="fa fa-edit"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div id="search-wrapper" class="hidden">
|
||
|
|
<form id="searchbar-outer" class="searchbar-outer">
|
||
|
|
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
|
||
|
|
</form>
|
||
|
|
<div id="searchresults-outer" class="searchresults-outer hidden">
|
||
|
|
<div id="searchresults-header" class="searchresults-header"></div>
|
||
|
|
<ul id="searchresults">
|
||
|
|
</ul>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
||
|
|
<script>
|
||
|
|
// Mirror the sidebar state (computed by the earlier sidebar script) into
// ARIA attributes and into the tab order of any links inside the sidebar.
{
    const shown = sidebar === 'visible';
    const toggle = document.getElementById('sidebar-toggle');
    const panel = document.getElementById('sidebar');
    toggle.setAttribute('aria-expanded', shown);
    panel.setAttribute('aria-hidden', !shown);
    // Links in a hidden sidebar are taken out of the tab order.
    const tabValue = shown ? 0 : -1;
    for (const link of Array.from(document.querySelectorAll('#sidebar a'))) {
        link.setAttribute('tabIndex', tabValue);
    }
}
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<div id="content" class="content">
|
||
|
|
<main>
|
||
|
|
<h1 id="incident-response-runbook"><a class="header" href="#incident-response-runbook">Incident Response Runbook</a></h1>
|
||
|
|
<p>Procedures for responding to and resolving VAPORA production incidents.</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="incident-severity-levels"><a class="header" href="#incident-severity-levels">Incident Severity Levels</a></h2>
|
||
|
|
<h3 id="severity-1-critical-"><a class="header" href="#severity-1-critical-">Severity 1: Critical 🔴</a></h3>
|
||
|
|
<p><strong>Definition</strong>: Service completely down or severely degraded affecting all users</p>
|
||
|
|
<p><strong>Examples</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>All backend pods crashed</li>
|
||
|
|
<li>Database completely unreachable</li>
|
||
|
|
<li>API returning 100% errors</li>
|
||
|
|
<li>Frontend completely inaccessible</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Response Time</strong>: Immediate (&lt; 2 minutes)
|
||
|
|
<strong>On-Call</strong>: Page immediately (not optional)
|
||
|
|
<strong>Communication</strong>: Update status page every 2 minutes</p>
|
||
|
|
<h3 id="severity-2-major-"><a class="header" href="#severity-2-major-">Severity 2: Major 🟠</a></h3>
|
||
|
|
<p><strong>Definition</strong>: Service partially down or significantly degraded</p>
|
||
|
|
<p><strong>Examples</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>50% of requests returning errors</li>
|
||
|
|
<li>Latency 10x normal</li>
|
||
|
|
<li>Some services down but others working</li>
|
||
|
|
<li>Intermittent connectivity issues</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Response Time</strong>: 5 minutes
|
||
|
|
<strong>On-Call</strong>: Alert on-call engineer
|
||
|
|
<strong>Communication</strong>: Internal updates every 5 minutes</p>
|
||
|
|
<h3 id="severity-3-minor-"><a class="header" href="#severity-3-minor-">Severity 3: Minor 🟡</a></h3>
|
||
|
|
<p><strong>Definition</strong>: Service slow or minor issues affecting some users</p>
|
||
|
|
<p><strong>Examples</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>5-10% error rate</li>
|
||
|
|
<li>Elevated latency (2x normal)</li>
|
||
|
|
<li>One pod having issues, others recovering</li>
|
||
|
|
<li>Non-critical features unavailable</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Response Time</strong>: 15 minutes
|
||
|
|
<strong>On-Call</strong>: Alert team, not necessarily emergency page
|
||
|
|
<strong>Communication</strong>: Post-incident update</p>
|
||
|
|
<h3 id="severity-4-informational-"><a class="header" href="#severity-4-informational-">Severity 4: Informational 🟢</a></h3>
|
||
|
|
<p><strong>Definition</strong>: No user impact, system anomalies or preventive issues</p>
|
||
|
|
<p><strong>Examples</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Disk usage trending high</li>
|
||
|
|
<li>SSL cert expiring in 30 days</li>
|
||
|
|
<li>Deployment taking longer than normal</li>
|
||
|
|
<li>Non-critical service warnings</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Response Time</strong>: During business hours
|
||
|
|
<strong>On-Call</strong>: No alert needed
|
||
|
|
<strong>Communication</strong>: Team Slack message</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="incident-response-process"><a class="header" href="#incident-response-process">Incident Response Process</a></h2>
|
||
|
|
<h3 id="step-1-report--assess-immediately"><a class="header" href="#step-1-report--assess-immediately">Step 1: Report & Assess (Immediately)</a></h3>
|
||
|
|
<p>When incident reported (via alert, user report, or discovery):</p>
|
||
|
|
<pre><code class="language-bash"># 1. Create incident ticket
|
||
|
|
# Title: "INCIDENT: [Service] - [Brief description]"
|
||
|
|
# Example: "INCIDENT: API - 50% error rate since 14:30 UTC"
|
||
|
|
# Severity: [1-4]
|
||
|
|
# Reporter: [Your name]
|
||
|
|
# Time Detected: [UTC time]
|
||
|
|
|
||
|
|
# 2. Open dedicated Slack channel
|
||
|
|
#slack /create #incident-20260112-backend
|
||
|
|
# Then: /invite @on-call-engineer
|
||
|
|
|
||
|
|
# 3. Post initial message
|
||
|
|
# "🔴 INCIDENT DECLARED
|
||
|
|
# Service: VAPORA Backend
|
||
|
|
# Severity: 1 (Critical)
|
||
|
|
# Time Detected: 14:32 UTC
|
||
|
|
# Current Status: Unknown
|
||
|
|
# Next Update: 14:34 UTC"
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-2-quick-diagnosis-first-2-minutes"><a class="header" href="#step-2-quick-diagnosis-first-2-minutes">Step 2: Quick Diagnosis (First 2 minutes)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Establish facts quickly
|
||
|
|
export NAMESPACE=vapora
|
||
|
|
|
||
|
|
# Q1: Is the service actually down?
|
||
|
|
curl -v http://api.vapora.com/health
|
||
|
|
# If: Connection refused → Service down
|
||
|
|
# If: 500 errors → Service crashed
|
||
|
|
# If: Timeout → Service hung
|
||
|
|
|
||
|
|
# Q2: What's the scope?
|
||
|
|
kubectl get pods -n $NAMESPACE
|
||
|
|
# Count Running vs non-Running pods
|
||
|
|
# All down → Complete outage
|
||
|
|
# Some down → Partial outage
|
||
|
|
|
||
|
|
# Q3: What's happening right now?
|
||
|
|
for deployment in vapora-backend vapora-agents vapora-llm-router; do
|
||
|
|
echo "=== $deployment ==="
|
||
|
|
kubectl get deployment $deployment -n $NAMESPACE
|
||
|
|
done
|
||
|
|
# Shows: DESIRED vs CURRENT vs AVAILABLE
|
||
|
|
# Example: 3 DESIRED, 0 CURRENT, 0 AVAILABLE → Pod startup failure
|
||
|
|
|
||
|
|
# Q4: Any obvious errors?
|
||
|
|
kubectl logs deployment/vapora-backend -n $NAMESPACE --tail=20 | grep -i "error\|fatal"
|
||
|
|
# Shows: What's in the logs right now
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-3-escalate-decision"><a class="header" href="#step-3-escalate-decision">Step 3: Escalate Decision</a></h3>
|
||
|
|
<p>Based on quick diagnosis, decide next action:</p>
|
||
|
|
<pre><code>IF pods not starting (CrashLoopBackOff):
|
||
|
|
→ Likely config issue
|
||
|
|
→ Check ConfigMap values
|
||
|
|
→ Likely recent deployment
|
||
|
|
→ DECISION: Possible rollback
|
||
|
|
|
||
|
|
IF pods pending (not scheduled):
|
||
|
|
→ Likely resource issue
|
||
|
|
→ Check node capacity
|
||
|
|
→ DECISION: Scale down workloads or investigate nodes
|
||
|
|
|
||
|
|
IF pods running but unresponsive:
|
||
|
|
→ Likely application issue
|
||
|
|
→ Check application logs
|
||
|
|
→ DECISION: Investigate app logic
|
||
|
|
|
||
|
|
IF network/database issues:
|
||
|
|
→ Check connectivity
|
||
|
|
→ Check credentials
|
||
|
|
→ DECISION: Infrastructure escalation
|
||
|
|
|
||
|
|
IF unknown:
|
||
|
|
→ Ask: "What changed recently?"
|
||
|
|
→ Check deployment history
|
||
|
|
→ Check infrastructure changes
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-4-initial-response-actions"><a class="header" href="#step-4-initial-response-actions">Step 4: Initial Response Actions</a></h3>
|
||
|
|
<p><strong>For Severity 1 (Critical)</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># A. Escalate immediately
|
||
|
|
- Page senior engineer if not already responding
|
||
|
|
- Contact infrastructure team
|
||
|
|
- Notify product/support managers
|
||
|
|
|
||
|
|
# B. Buy time with failover if available
|
||
|
|
- Switch to backup environment if configured
|
||
|
|
- Scale to different region if multi-region
|
||
|
|
|
||
|
|
# C. Gather data for debugging
|
||
|
|
- Save current logs
|
||
|
|
- Save pod events
|
||
|
|
- Record current metrics
|
||
|
|
- Take screenshot of dashboards
|
||
|
|
|
||
|
|
# D. Keep team updated
|
||
|
|
# Update #incident-* channel every 2 minutes
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>For Severity 2 (Major)</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># A. Alert on-call team
|
||
|
|
# B. Gather same diagnostics
|
||
|
|
# C. Start investigation
|
||
|
|
# D. Update every 5 minutes
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>For Severity 3 (Minor)</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># A. Create ticket for later investigation
|
||
|
|
# B. Monitor closely
|
||
|
|
# C. Gather diagnostics
|
||
|
|
# D. Plan fix during normal hours if not urgent
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-5-detailed-diagnosis"><a class="header" href="#step-5-detailed-diagnosis">Step 5: Detailed Diagnosis</a></h3>
|
||
|
|
<p>Once immediate actions taken:</p>
|
||
|
|
<pre><code class="language-bash"># Get comprehensive view of system state
|
||
|
|
kubectl describe node &lt;nodename&gt; # Hardware/capacity issues
|
||
|
|
kubectl describe pod &lt;podname&gt; -n $NAMESPACE # Pod-specific issues
|
||
|
|
kubectl events -n $NAMESPACE # What happened recently
|
||
|
|
kubectl top nodes # CPU/memory usage
|
||
|
|
kubectl top pods -n $NAMESPACE # Per-pod resource usage
|
||
|
|
|
||
|
|
# Check recent changes
|
||
|
|
git log -5 --oneline
|
||
|
|
git diff HEAD~1 HEAD provisioning/
|
||
|
|
|
||
|
|
# Check deployment history
|
||
|
|
kubectl rollout history deployment/vapora-backend -n $NAMESPACE | tail -5
|
||
|
|
|
||
|
|
# Timeline analysis
|
||
|
|
# What happened at 14:30 UTC? (incident time)
|
||
|
|
# Was there a deployment?
|
||
|
|
# Did metrics change suddenly?
|
||
|
|
# Any alerts triggered?
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-6-implement-fix"><a class="header" href="#step-6-implement-fix">Step 6: Implement Fix</a></h3>
|
||
|
|
<p>Depending on root cause:</p>
|
||
|
|
<h4 id="root-cause-recent-bad-deployment"><a class="header" href="#root-cause-recent-bad-deployment">Root Cause: Recent Bad Deployment</a></h4>
|
||
|
|
<pre><code class="language-bash"># Solution: Rollback
|
||
|
|
# See: Rollback Runbook
|
||
|
|
kubectl rollout undo deployment/vapora-backend -n $NAMESPACE
|
||
|
|
kubectl rollout status deployment/vapora-backend -n $NAMESPACE --timeout=5m
|
||
|
|
|
||
|
|
# Verify
|
||
|
|
curl http://localhost:8001/health
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="root-cause-insufficient-resources"><a class="header" href="#root-cause-insufficient-resources">Root Cause: Insufficient Resources</a></h4>
|
||
|
|
<pre><code class="language-bash"># Solution: Either scale out or reduce load
|
||
|
|
|
||
|
|
# Option A: Add more nodes
|
||
|
|
# kubectl cannot add nodes: resize the node pool with your cloud provider's tooling or let the cluster autoscaler add capacity
|
||
|
|
# (Requires infrastructure access)
|
||
|
|
|
||
|
|
# Option B: Scale down non-critical services
|
||
|
|
kubectl scale deployment/vapora-agents --replicas=1 -n $NAMESPACE
|
||
|
|
# Then scale back up when resolved
|
||
|
|
|
||
|
|
# Option C: Temporarily scale down pod replicas
|
||
|
|
kubectl scale deployment/vapora-backend --replicas=2 -n $NAMESPACE
|
||
|
|
# (Trade: Reduced capacity but faster recovery)
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="root-cause-configuration-error"><a class="header" href="#root-cause-configuration-error">Root Cause: Configuration Error</a></h4>
|
||
|
|
<pre><code class="language-bash"># Solution: Fix ConfigMap
|
||
|
|
|
||
|
|
# 1. Identify wrong value
|
||
|
|
kubectl get configmap -n $NAMESPACE vapora-config -o yaml | grep -A 2 &lt;suspicious-key&gt;
|
||
|
|
|
||
|
|
# 2. Fix value
|
||
|
|
# Edit configmap in external editor or via kubectl patch:
|
||
|
|
kubectl patch configmap vapora-config -n $NAMESPACE \
|
||
|
|
--type merge \
|
||
|
|
-p '{"data":{"vapora.toml":"[corrected content]"}}'
|
||
|
|
|
||
|
|
# 3. Restart pods to pick up new config
|
||
|
|
kubectl rollout restart deployment/vapora-backend -n $NAMESPACE
|
||
|
|
kubectl rollout status deployment/vapora-backend -n $NAMESPACE --timeout=5m
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="root-cause-database-issues"><a class="header" href="#root-cause-database-issues">Root Cause: Database Issues</a></h4>
|
||
|
|
<pre><code class="language-bash"># Solution: Depends on specific issue
|
||
|
|
|
||
|
|
# If database down:
|
||
|
|
- Contact DBA or database team
|
||
|
|
- Check database status: kubectl exec &lt;pod&gt; -- curl localhost:8000
|
||
|
|
|
||
|
|
# If credentials wrong:
|
||
|
|
kubectl patch configmap vapora-config -n $NAMESPACE \
|
||
|
|
--type merge \
|
||
|
|
-p '{"data":{"DB_PASSWORD":"[correct-password]"}}'
|
||
|
|
|
||
|
|
# If database full:
|
||
|
|
- Contact DBA for cleanup
|
||
|
|
- Free up space on database volume
|
||
|
|
|
||
|
|
# If connection pool exhausted:
|
||
|
|
- Scale down services to reduce connections
|
||
|
|
- Increase connection pool size if possible
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="root-cause-external-service-down"><a class="header" href="#root-cause-external-service-down">Root Cause: External Service Down</a></h4>
|
||
|
|
<pre><code class="language-bash"># Examples: Third-party API, external database
|
||
|
|
|
||
|
|
# Solution: Depends on severity
|
||
|
|
|
||
|
|
# If critical: Failover
|
||
|
|
- Switch to backup provider if available
|
||
|
|
- Route traffic differently
|
||
|
|
|
||
|
|
# If non-critical: Degrade gracefully
|
||
|
|
- Disable feature temporarily
|
||
|
|
- Use cache if available
|
||
|
|
- Return cached data
|
||
|
|
|
||
|
|
# Communicate
|
||
|
|
- Notify users of reduced functionality
|
||
|
|
- Provide ETA for restoration
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-7-verify-recovery"><a class="header" href="#step-7-verify-recovery">Step 7: Verify Recovery</a></h3>
|
||
|
|
<pre><code class="language-bash"># Once fix applied, verify systematically
|
||
|
|
|
||
|
|
# 1. Pod health
|
||
|
|
kubectl get pods -n $NAMESPACE
|
||
|
|
# All should show: Running, 1/1 Ready
|
||
|
|
|
||
|
|
# 2. Service endpoints
|
||
|
|
kubectl get endpoints -n $NAMESPACE
|
||
|
|
# All should have IP addresses
|
||
|
|
|
||
|
|
# 3. Health endpoints
|
||
|
|
curl http://localhost:8001/health
|
||
|
|
# Should return: 200 OK
|
||
|
|
|
||
|
|
# 4. Check errors
|
||
|
|
kubectl logs deployment/vapora-backend -n $NAMESPACE --since=2m | grep -i error
|
||
|
|
# Should return: few or no errors
|
||
|
|
|
||
|
|
# 5. Monitor metrics
|
||
|
|
kubectl top pods -n $NAMESPACE
|
||
|
|
# CPU/Memory should be normal (not spiking)
|
||
|
|
|
||
|
|
# 6. Check for new issues
|
||
|
|
kubectl get events -n $NAMESPACE
|
||
|
|
# Should show normal state, no warnings
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="step-8-incident-closure"><a class="header" href="#step-8-incident-closure">Step 8: Incident Closure</a></h3>
|
||
|
|
<pre><code class="language-bash"># When everything verified healthy:
|
||
|
|
|
||
|
|
# 1. Document resolution
|
||
|
|
# Update incident ticket with:
|
||
|
|
# - Root cause
|
||
|
|
# - Fix applied
|
||
|
|
# - Verification steps
|
||
|
|
# - Resolution time
|
||
|
|
# - Impact (how many users, how long)
|
||
|
|
|
||
|
|
# 2. Post final update
|
||
|
|
# "#incident channel:
|
||
|
|
# ✅ INCIDENT RESOLVED
|
||
|
|
#
|
||
|
|
# Duration: [start] to [end] = [X minutes]
|
||
|
|
# Root Cause: [brief description]
|
||
|
|
# Fix Applied: [brief description]
|
||
|
|
# Impact: ~X users affected for X minutes
|
||
|
|
#
|
||
|
|
# Status: All services healthy
|
||
|
|
# Monitoring: Continuing for 1 hour
|
||
|
|
# Post-mortem: Scheduled for [date]"
|
||
|
|
|
||
|
|
# 3. Schedule post-mortem
|
||
|
|
# Within 24 hours: review what happened and why
|
||
|
|
# Document lessons learned
|
||
|
|
|
||
|
|
# 4. Update dashboards
|
||
|
|
# Document incident on status page history
|
||
|
|
# If public incident: close status page incident
|
||
|
|
|
||
|
|
# 5. Send all-clear message
|
||
|
|
# Notify: support team, product team, key stakeholders
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="incident-response-roles--responsibilities"><a class="header" href="#incident-response-roles--responsibilities">Incident Response Roles & Responsibilities</a></h2>
|
||
|
|
<h3 id="incident-commander"><a class="header" href="#incident-commander">Incident Commander</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li>Overall control of incident response</li>
|
||
|
|
<li>Makes critical decisions</li>
|
||
|
|
<li>Drives decision-making speed</li>
|
||
|
|
<li>Communicates status updates</li>
|
||
|
|
<li>Calls when to escalate</li>
|
||
|
|
<li><strong>You</strong>, if you discovered the incident and understand it best</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="technical-responders"><a class="header" href="#technical-responders">Technical Responders</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li>Investigate specific systems</li>
|
||
|
|
<li>Implement fixes</li>
|
||
|
|
<li>Report findings to commander</li>
|
||
|
|
<li>Execute verified solutions</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="communication-lead-if-severity-1"><a class="header" href="#communication-lead-if-severity-1">Communication Lead (if Severity 1)</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li>Updates #incident channel every 2 minutes</li>
|
||
|
|
<li>Updates status page every 2 minutes</li>
|
||
|
|
<li>Fields questions from support/product</li>
|
||
|
|
<li>Notifies key stakeholders</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="on-call-manager-if-severity-1"><a class="header" href="#on-call-manager-if-severity-1">On-Call Manager (if Severity 1)</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li>Pages additional resources if needed</li>
|
||
|
|
<li>Escalates to senior engineers</li>
|
||
|
|
<li>Engages infrastructure/DBA teams</li>
|
||
|
|
<li>Tracks response timeline</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="common-incidents--responses"><a class="header" href="#common-incidents--responses">Common Incidents & Responses</a></h2>
|
||
|
|
<h3 id="incident-type-service-unresponsive"><a class="header" href="#incident-type-service-unresponsive">Incident Type: Service Unresponsive</a></h3>
|
||
|
|
<pre><code>Detection: curl returns "Connection refused"
|
||
|
|
Diagnosis Time: 1 minute
|
||
|
|
Response:
|
||
|
|
1. Check if pods are running: kubectl get pods
|
||
|
|
2. If not running: likely crash → check logs
|
||
|
|
3. If running but unresponsive: likely port/network issue
|
||
|
|
4. Verify service exists: kubectl get service vapora-backend
|
||
|
|
|
||
|
|
Solution:
|
||
|
|
- If pods crashed: check logs, likely config or deployment issue
|
||
|
|
- If pods hanging: restart pods: kubectl delete pods -l app=vapora-backend -n vapora
|
||
|
|
- If service/endpoints missing: apply service manifest
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="incident-type-high-error-rate"><a class="header" href="#incident-type-high-error-rate">Incident Type: High Error Rate</a></h3>
|
||
|
|
<pre><code>Detection: Dashboard shows >10% 5xx errors
|
||
|
|
Diagnosis Time: 2 minutes
|
||
|
|
Response:
|
||
|
|
1. Check which endpoint is failing
|
||
|
|
2. Check logs for error pattern
|
||
|
|
3. Identify affected service (backend, agents, router)
|
||
|
|
4. Compare with baseline (worked X minutes ago)
|
||
|
|
|
||
|
|
Solution:
|
||
|
|
- If recent deployment: rollback
|
||
|
|
- If config change: revert config
|
||
|
|
- If database issue: contact DBA
|
||
|
|
- If third-party down: implement fallback
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="incident-type-high-latency"><a class="header" href="#incident-type-high-latency">Incident Type: High Latency</a></h3>
|
||
|
|
<pre><code>Detection: Dashboard shows p99 latency >2 seconds
|
||
|
|
Diagnosis Time: 2 minutes
|
||
|
|
Response:
|
||
|
|
1. Check if requests still succeeding (is it slow or failing?)
|
||
|
|
2. Check CPU/memory usage: kubectl top pods
|
||
|
|
3. Check if database slow: run query diagnostics
|
||
|
|
4. Check network: are there packet losses?
|
||
|
|
|
||
|
|
Solution:
|
||
|
|
- If resource exhausted: scale up or reduce load
|
||
|
|
- If database slow: DBA investigation
|
||
|
|
- If network issue: infrastructure team
|
||
|
|
- If legitimate increased load: no action needed (expected)
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="incident-type-pod-restarting-repeatedly"><a class="header" href="#incident-type-pod-restarting-repeatedly">Incident Type: Pod Restarting Repeatedly</a></h3>
|
||
|
|
<pre><code>Detection: kubectl get pods shows high RESTARTS count
|
||
|
|
Diagnosis Time: 1 minute
|
||
|
|
Response:
|
||
|
|
1. Check restart count: kubectl get pods -n vapora
|
||
|
|
2. Get pod logs: kubectl logs &lt;pod-name&gt; -n vapora --previous
|
||
|
|
3. Get pod events: kubectl describe pod &lt;pod-name&gt; -n vapora
|
||
|
|
|
||
|
|
Solution:
|
||
|
|
- Application error: check logs, fix issue, redeploy
|
||
|
|
- Config issue: fix ConfigMap, restart pods
|
||
|
|
- Resource issue: increase limits or scale out
|
||
|
|
- Liveness probe failing: adjust probe timing or fix health check
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="incident-type-database-connectivity"><a class="header" href="#incident-type-database-connectivity">Incident Type: Database Connectivity</a></h3>
|
||
|
|
<pre><code>Detection: Logs show "database connection refused"
|
||
|
|
Diagnosis Time: 2 minutes
|
||
|
|
Response:
|
||
|
|
1. Check database service running: kubectl get pod -n &lt;db-namespace&gt;
|
||
|
|
2. Check database credentials in ConfigMap
|
||
|
|
3. Test connectivity: kubectl exec &lt;pod&gt; -- psql $DB_URL
|
||
|
|
4. Check firewall/network policy
|
||
|
|
|
||
|
|
Solution:
|
||
|
|
- If DB down: escalate to DBA, possibly restore from backup
|
||
|
|
- If credentials wrong: fix ConfigMap, restart app pods
|
||
|
|
- If network issue: network team investigation
|
||
|
|
- If no space: DBA cleanup
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="communication-during-incident"><a class="header" href="#communication-during-incident">Communication During Incident</a></h2>
|
||
|
|
<h3 id="every-2-minutes-severity-1-or-5-minutes-severity-2"><a class="header" href="#every-2-minutes-severity-1-or-5-minutes-severity-2">Every 2 Minutes (Severity 1) or 5 Minutes (Severity 2)</a></h3>
|
||
|
|
<p>Post update to #incident channel:</p>
|
||
|
|
<pre><code>⏱️ 14:35 UTC UPDATE
|
||
|
|
|
||
|
|
Status: Investigating
|
||
|
|
Current Action: Checking pod logs
|
||
|
|
Findings: Backend pods in CrashLoopBackOff
|
||
|
|
Next Step: Review recent deployment
|
||
|
|
ETA for Update: 14:37 UTC
|
||
|
|
|
||
|
|
/cc @on-call-engineer
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="status-page-updates-if-public"><a class="header" href="#status-page-updates-if-public">Status Page Updates (If Public)</a></h3>
|
||
|
|
<pre><code>INCIDENT: VAPORA API Partially Degraded
|
||
|
|
|
||
|
|
Investigating: Our team is investigating elevated error rates
|
||
|
|
Duration: 5 minutes
|
||
|
|
Impact: ~30% of API requests failing
|
||
|
|
|
||
|
|
Last Updated: 14:35 UTC
|
||
|
|
Next Update: 14:37 UTC
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="escalation-communication"><a class="header" href="#escalation-communication">Escalation Communication</a></h3>
|
||
|
|
<pre><code>If Severity 1 and unable to identify cause in 5 minutes:
|
||
|
|
|
||
|
|
"Escalating to senior engineering team.
|
||
|
|
Page @senior-engineer-on-call immediately.
|
||
|
|
Activating Incident War Room."
|
||
|
|
|
||
|
|
Include:
|
||
|
|
- Service name
|
||
|
|
- Duration so far
|
||
|
|
- What's been tried
|
||
|
|
- Current symptoms
|
||
|
|
- Why stuck
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="incident-severity-decision-tree"><a class="header" href="#incident-severity-decision-tree">Incident Severity Decision Tree</a></h2>
|
||
|
|
<pre><code>Question 1: Can any users access the service?
|
||
|
|
NO → Severity 1 (Critical - complete outage)
|
||
|
|
YES → Question 2
|
||
|
|
|
||
|
|
Question 2: What percentage of requests are failing?
|
||
|
|
>50% → Severity 1 (Critical)
|
||
|
|
10-50% → Severity 2 (Major)
|
||
|
|
5-10% → Severity 3 (Minor)
|
||
|
|
&lt;5% → Question 3
|
||
|
|
|
||
|
|
Question 3: Is the service recovering on its own?
|
||
|
|
NO (staying broken) → Severity 2
|
||
|
|
YES (automatically recovering) → Question 4
|
||
|
|
|
||
|
|
Question 4: Does it require any user action/data loss?
|
||
|
|
YES → Severity 2
|
||
|
|
NO → Severity 3
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="post-incident-procedures"><a class="header" href="#post-incident-procedures">Post-Incident Procedures</a></h2>
|
||
|
|
<h3 id="immediate-within-30-minutes"><a class="header" href="#immediate-within-30-minutes">Immediate (Within 30 minutes)</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Close incident ticket</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Post final update to #incident channel</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Save all logs and diagnostics</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Create post-mortem ticket</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Notify team: "incident resolved"</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="follow-up-within-24-hours"><a class="header" href="#follow-up-within-24-hours">Follow-Up (Within 24 hours)</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Schedule post-mortem meeting</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Identify root cause</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Document preventive measures</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Identify owner for each action item</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Create tickets for improvements</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="prevention-within-1-week"><a class="header" href="#prevention-within-1-week">Prevention (Within 1 week)</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Implement identified fixes</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Update monitoring/alerting</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Update runbooks with findings</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Conduct team training if needed</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Close post-mortem ticket</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="incident-checklist"><a class="header" href="#incident-checklist">Incident Checklist</a></h2>
|
||
|
|
<pre><code>☐ Incident severity determined
☐ Ticket created and updated
☐ #incident channel created
☐ On-call team alerted
☐ Initial diagnosis completed
☐ Fix identified and implemented
☐ Fix verified working
☐ Incident closed and communicated
☐ Post-mortem scheduled
☐ Team debriefed
☐ Root cause documented
☐ Prevention measures identified
☐ Tickets created for follow-up
</code></pre>
|
||
|
|
|
||
|
|
</main>
|
||
|
|
|
||
|
|
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
|
|
<!-- Mobile navigation buttons -->
|
||
|
|
<a rel="prev" href="../../operations/on-call-procedures.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/rollback-runbook.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<div style="clear: both"></div>
|
||
|
|
</nav>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
|
<a rel="prev" href="../../operations/on-call-procedures.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/rollback-runbook.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
</nav>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
<script>
|
||
|
|
window.playground_copyable = true;
|
||
|
|
</script>
|
||
|
|
|
||
|
|
|
||
|
|
<script src="../elasticlunr.min.js"></script>
|
||
|
|
<script src="../mark.min.js"></script>
|
||
|
|
<script src="../searcher.js"></script>
|
||
|
|
|
||
|
|
<script src="../clipboard.min.js"></script>
|
||
|
|
<script src="../highlight.js"></script>
|
||
|
|
<script src="../book.js"></script>
|
||
|
|
|
||
|
|
<!-- Custom JS scripts -->
|
||
|
|
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</body>
|
||
|
|
</html>
|