<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
    <!-- Book generated using mdBook -->
    <meta charset="UTF-8">
    <title>Operations Overview - VAPORA Platform Documentation</title>

    <!-- Custom HTML head -->

    <meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="theme-color" content="#ffffff">

    <link rel="icon" href="../favicon.svg">
    <link rel="shortcut icon" href="../favicon.png">
    <link rel="stylesheet" href="../css/variables.css">
    <link rel="stylesheet" href="../css/general.css">
    <link rel="stylesheet" href="../css/chrome.css">
    <link rel="stylesheet" href="../css/print.css" media="print">

    <!-- Fonts -->
    <link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
    <link rel="stylesheet" href="../fonts/fonts.css">

    <!-- Highlight.js Stylesheets -->
    <link rel="stylesheet" id="highlight-css" href="../highlight.css">
    <link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
    <link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">

    <!-- Custom theme stylesheets -->

    <!-- Provide site root and default themes to javascript -->
    <script>
        // Relative URL prefix from this page to the book root; the stylesheet
        // and script URLs in this <head> use the same "../" prefix.
        const path_to_root = "../";
        // Theme names the inline theme script in <body> chooses between
        // (via the prefers-color-scheme media query) when no theme is stored.
        const default_light_theme = "light";
        const default_dark_theme = "dark";
    </script>
    <!-- Start loading toc.js asap -->
    <script src="../toc.js"></script>
</head>
|
||
|
|
<body>
|
||
|
|
<div id="mdbook-help-container">
|
||
|
|
<div id="mdbook-help-popup">
|
||
|
|
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
|
||
|
|
<div>
|
||
|
|
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
|
||
|
|
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
|
||
|
|
<p>Press <kbd>?</kbd> to show this help</p>
|
||
|
|
<p>Press <kbd>Esc</kbd> to hide this help</p>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
<div id="body-container">
|
||
|
|
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
||
|
|
<script>
    // Strip one pair of surrounding double quotes from each persisted setting.
    // Every key is processed in its own try/catch with an explicit null check:
    // localStorage.getItem() returns null for a key that was never written, and
    // a failure on one key must not stop the other key from being repaired.
    ['mdbook-theme', 'mdbook-sidebar'].forEach(function (key) {
        try {
            const stored = localStorage.getItem(key);
            if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
                localStorage.setItem(key, stored.slice(1, stored.length - 1));
            }
        } catch (e) {
            // Best effort only: accessing localStorage can throw; leave the
            // value untouched in that case.
        }
    });
</script>
|
||
|
|
|
||
|
|
<!-- Set the theme before any content is loaded, prevents flash -->
|
||
|
|
<script>
    // Choose the theme class for <html> before any content is rendered.
    // `default_theme`, `theme` and `html` are declared at script top level,
    // so they remain visible to the scripts that follow on this page.
    const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
    let theme;
    // Reading localStorage can throw; `theme` then stays undefined.
    try { theme = localStorage.getItem('mdbook-theme'); } catch (e) { }
    // Fall back to the default when nothing usable is stored. An empty token
    // or one containing whitespace would make classList.add() throw, which
    // would also skip adding the "js" class below.
    if (theme === null || theme === undefined || theme === '' || /\s/.test(theme)) { theme = default_theme; }
    const html = document.documentElement;
    // The static markup ships with class="light"; replace it with the choice.
    html.classList.remove('light');
    html.classList.add(theme);
    // Marks the document as script-enabled for any CSS keyed on this class.
    html.classList.add("js");
</script>
|
||
|
|
|
||
|
|
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
|
||
|
|
|
||
|
|
<!-- Hide / unhide sidebar before it is displayed -->
|
||
|
|
<script>
    // Decide the initial sidebar state before the sidebar is displayed.
    // `sidebar` is declared at script top level because the later inline
    // script that sets the ARIA attributes reads it.
    let sidebar = null;
    const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
    if (document.body.clientWidth >= 1080) {
        // Wide viewport: honour the stored preference when it can be read.
        try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch (e) { }
        // Missing, unreadable or unrecognised values default to 'visible', so
        // the class added below is always sidebar-visible or sidebar-hidden.
        if (sidebar !== 'visible' && sidebar !== 'hidden') { sidebar = 'visible'; }
    } else {
        // Narrow viewport: always start hidden, ignoring any stored value.
        sidebar = 'hidden';
    }
    sidebar_toggle.checked = sidebar === 'visible';
    // `html` is the documentElement reference declared by the preceding theme
    // script; swap the static sidebar-visible class for the computed state.
    html.classList.remove('sidebar-visible');
    html.classList.add("sidebar-" + sidebar);
</script>
|
||
|
|
|
||
|
|
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
|
||
|
|
<!-- populated by js -->
|
||
|
|
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
|
||
|
|
<noscript>
|
||
|
|
<iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
|
||
|
|
</noscript>
|
||
|
|
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
|
||
|
|
<div class="sidebar-resize-indicator"></div>
|
||
|
|
</div>
|
||
|
|
</nav>
|
||
|
|
|
||
|
|
<div id="page-wrapper" class="page-wrapper">
|
||
|
|
|
||
|
|
<div class="page">
|
||
|
|
<div id="menu-bar-hover-placeholder"></div>
|
||
|
|
<div id="menu-bar" class="menu-bar sticky">
|
||
|
|
<div class="left-buttons">
|
||
|
|
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
|
||
|
|
<i class="fa fa-bars"></i>
|
||
|
|
</label>
|
||
|
|
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
|
||
|
|
<i class="fa fa-paint-brush"></i>
|
||
|
|
</button>
|
||
|
|
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
|
||
|
|
</ul>
|
||
|
|
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
|
||
|
|
<i class="fa fa-search"></i>
|
||
|
|
</button>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<h1 class="menu-title">VAPORA Platform Documentation</h1>
|
||
|
|
|
||
|
|
<div class="right-buttons">
|
||
|
|
<a href="../print.html" title="Print this book" aria-label="Print this book">
|
||
|
|
<i id="print-button" class="fa fa-print"></i>
|
||
|
|
</a>
|
||
|
|
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
|
||
|
|
<i id="git-repository-button" class="fa fa-github"></i>
|
||
|
|
</a>
|
||
|
|
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/README.md" title="Suggest an edit" aria-label="Suggest an edit">
|
||
|
|
<i id="git-edit-button" class="fa fa-edit"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div id="search-wrapper" class="hidden">
|
||
|
|
<form id="searchbar-outer" class="searchbar-outer">
|
||
|
|
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
|
||
|
|
</form>
|
||
|
|
<div id="searchresults-outer" class="searchresults-outer hidden">
|
||
|
|
<div id="searchresults-header" class="searchresults-header"></div>
|
||
|
|
<ul id="searchresults">
|
||
|
|
</ul>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
||
|
|
<script>
    // Mirror the sidebar state computed earlier (the global `sidebar`) onto
    // the toggle label, the sidebar <nav>, and any links already inside it.
    // Wrapped in a function so the helper constant does not become a global.
    (function () {
        const sidebarShown = sidebar === 'visible';
        document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
        document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);
        // Links in a hidden sidebar get tabindex -1 so they leave the tab order.
        Array.from(document.querySelectorAll('#sidebar a')).forEach(function (link) {
            link.setAttribute('tabIndex', sidebarShown ? 0 : -1);
        });
    })();
</script>
|
||
|
|
|
||
|
|
<div id="content" class="content">
|
||
|
|
<main>
|
||
|
|
<h1 id="vapora-operations-runbooks"><a class="header" href="#vapora-operations-runbooks">VAPORA Operations Runbooks</a></h1>
|
||
|
|
<p>Complete set of runbooks and procedures for deploying, monitoring, and operating VAPORA in production environments.</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="quick-navigation"><a class="header" href="#quick-navigation">Quick Navigation</a></h2>
|
||
|
|
<p><strong>I need to...</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li><strong>Deploy to production</strong>: See <a href="./deployment-runbook.html">Deployment Runbook</a> or <a href="./pre-deployment-checklist.html">Pre-Deployment Checklist</a></li>
|
||
|
|
<li><strong>Respond to an incident</strong>: See <a href="./incident-response-runbook.html">Incident Response Runbook</a></li>
|
||
|
|
<li><strong>Rollback a deployment</strong>: See <a href="./rollback-runbook.html">Rollback Runbook</a></li>
|
||
|
|
<li><strong>Go on-call</strong>: See <a href="./on-call-procedures.html">On-Call Procedures</a></li>
|
||
|
|
<li><strong>Monitor services</strong>: See <a href="#monitoring--alerting">Monitoring Runbook</a></li>
|
||
|
|
<li><strong>Understand common failures</strong>: See <a href="#common-failure-scenarios">Common Failure Scenarios</a></li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="runbook-overview"><a class="header" href="#runbook-overview">Runbook Overview</a></h2>
|
||
|
|
<h3 id="1-pre-deployment-checklist"><a class="header" href="#1-pre-deployment-checklist">1. Pre-Deployment Checklist</a></h3>
|
||
|
|
<p><strong>When</strong>: 24 hours before any production deployment</p>
|
||
|
|
<p><strong>Content</strong>: Comprehensive checklist for deployment preparation including:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Communication & scheduling</li>
|
||
|
|
<li>Code review & validation</li>
|
||
|
|
<li>Environment verification</li>
|
||
|
|
<li>Health baseline recording</li>
|
||
|
|
<li>Artifact preparation</li>
|
||
|
|
<li>Rollback plan verification</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Time</strong>: 1-2 hours</p>
|
||
|
|
<p><strong>File</strong>: <a href="./pre-deployment-checklist.html"><code>pre-deployment-checklist.md</code></a></p>
|
||
|
|
<h3 id="2-deployment-runbook"><a class="header" href="#2-deployment-runbook">2. Deployment Runbook</a></h3>
|
||
|
|
<p><strong>When</strong>: Executing actual production deployment</p>
|
||
|
|
<p><strong>Content</strong>: Step-by-step deployment procedures including:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Pre-flight checks (5 min)</li>
|
||
|
|
<li>Configuration deployment (3 min)</li>
|
||
|
|
<li>Deployment update (5 min)</li>
|
||
|
|
<li>Verification (5 min)</li>
|
||
|
|
<li>Validation (3 min)</li>
|
||
|
|
<li>Communication & monitoring</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Time</strong>: 15-20 minutes total</p>
|
||
|
|
<p><strong>File</strong>: <a href="./deployment-runbook.html"><code>deployment-runbook.md</code></a></p>
|
||
|
|
<h3 id="3-rollback-runbook"><a class="header" href="#3-rollback-runbook">3. Rollback Runbook</a></h3>
|
||
|
|
<p><strong>When</strong>: Issues detected after deployment requiring immediate rollback</p>
|
||
|
|
<p><strong>Content</strong>: Safe rollback procedures including:</p>
|
||
|
|
<ul>
|
||
|
|
<li>When to rollback (decision criteria)</li>
|
||
|
|
<li>Kubernetes automatic rollback (step-by-step)</li>
|
||
|
|
<li>Docker manual rollback (guided)</li>
|
||
|
|
<li>Post-rollback verification</li>
|
||
|
|
<li>Emergency procedures</li>
|
||
|
|
<li>Prevention & lessons learned</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Time</strong>: 5-10 minutes (depending on issues)</p>
|
||
|
|
<p><strong>File</strong>: <a href="./rollback-runbook.html"><code>rollback-runbook.md</code></a></p>
|
||
|
|
<h3 id="4-incident-response-runbook"><a class="header" href="#4-incident-response-runbook">4. Incident Response Runbook</a></h3>
|
||
|
|
<p><strong>When</strong>: Production incident declared</p>
|
||
|
|
<p><strong>Content</strong>: Full incident response procedures including:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Severity levels (1-4) with examples</li>
|
||
|
|
<li>Report & assess procedures</li>
|
||
|
|
<li>Diagnosis & escalation</li>
|
||
|
|
<li>Fix implementation</li>
|
||
|
|
<li>Recovery verification</li>
|
||
|
|
<li>Communication templates</li>
|
||
|
|
<li>Role definitions</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Time</strong>: Varies by severity (2 min to 1+ hour)</p>
|
||
|
|
<p><strong>File</strong>: <a href="./incident-response-runbook.html"><code>incident-response-runbook.md</code></a></p>
|
||
|
|
<h3 id="5-on-call-procedures"><a class="header" href="#5-on-call-procedures">5. On-Call Procedures</a></h3>
|
||
|
|
<p><strong>When</strong>: During assigned on-call shift</p>
|
||
|
|
<p><strong>Content</strong>: Full on-call guide including:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Before shift starts (setup & verification)</li>
|
||
|
|
<li>Daily tasks & check-ins</li>
|
||
|
|
<li>Responding to alerts</li>
|
||
|
|
<li>Monitoring dashboard setup</li>
|
||
|
|
<li>Escalation decision tree</li>
|
||
|
|
<li>Shift handoff procedures</li>
|
||
|
|
<li>Common questions & answers</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Time</strong>: Read thoroughly before first on-call shift (~30 min)</p>
|
||
|
|
<p><strong>File</strong>: <a href="./on-call-procedures.html"><code>on-call-procedures.md</code></a></p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="deployment-workflow"><a class="header" href="#deployment-workflow">Deployment Workflow</a></h2>
|
||
|
|
<h3 id="standard-deployment-process"><a class="header" href="#standard-deployment-process">Standard Deployment Process</a></h3>
|
||
|
|
<pre><code>DAY 1 (Planning)
|
||
|
|
↓
|
||
|
|
- Create GitHub issue/ticket
|
||
|
|
- Identify deployment window
|
||
|
|
- Notify stakeholders
|
||
|
|
|
||
|
|
24 HOURS BEFORE
|
||
|
|
↓
|
||
|
|
- Complete pre-deployment checklist
|
||
|
|
(pre-deployment-checklist.md)
|
||
|
|
- Verify all prerequisites
|
||
|
|
- Stage artifacts
|
||
|
|
- Test in staging
|
||
|
|
|
||
|
|
DEPLOYMENT DAY
|
||
|
|
↓
|
||
|
|
- Final go/no-go decision
|
||
|
|
- Execute deployment runbook
|
||
|
|
(deployment-runbook.md)
|
||
|
|
- Pre-flight checks
|
||
|
|
- ConfigMap deployment
|
||
|
|
- Service deployment
|
||
|
|
- Verification
|
||
|
|
- Communication
|
||
|
|
|
||
|
|
POST-DEPLOYMENT (2 hours)
|
||
|
|
↓
|
||
|
|
- Monitor closely (every 10 minutes)
|
||
|
|
- Watch for issues
|
||
|
|
- If problems → execute rollback runbook
|
||
|
|
(rollback-runbook.md)
|
||
|
|
- Document results
|
||
|
|
|
||
|
|
24 HOURS LATER
|
||
|
|
↓
|
||
|
|
- Declare deployment stable
|
||
|
|
- Schedule post-mortem (if issues)
|
||
|
|
- Update documentation
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="if-issues-during-deployment"><a class="header" href="#if-issues-during-deployment">If Issues During Deployment</a></h3>
|
||
|
|
<pre><code>Issue Detected
|
||
|
|
↓
|
||
|
|
Severity Assessment
|
||
|
|
↓
|
||
|
|
Severity 1-2:
|
||
|
|
├─ Immediate rollback
|
||
|
|
│ (rollback-runbook.md)
|
||
|
|
│
|
||
|
|
└─ Post-rollback investigation
|
||
|
|
(incident-response-runbook.md)
|
||
|
|
|
||
|
|
Severity 3-4:
|
||
|
|
├─ Monitor and investigate
|
||
|
|
│ (incident-response-runbook.md)
|
||
|
|
│
|
||
|
|
└─ Fix in place if quick
|
||
|
|
OR
|
||
|
|
Schedule rollback
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="monitoring--alerting"><a class="header" href="#monitoring--alerting">Monitoring & Alerting</a></h2>
|
||
|
|
<h3 id="essential-dashboards"><a class="header" href="#essential-dashboards">Essential Dashboards</a></h3>
|
||
|
|
<p>Keep these dashboards visible during deployments and throughout every on-call shift:</p>
|
||
|
|
<ol>
|
||
|
|
<li>
|
||
|
|
<p><strong>Kubernetes Dashboard</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Pod status</li>
|
||
|
|
<li>Node health</li>
|
||
|
|
<li>Event logs</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Grafana Dashboards</strong> (if available)</p>
|
||
|
|
<ul>
|
||
|
|
<li>Request rate and latency</li>
|
||
|
|
<li>Error rate</li>
|
||
|
|
<li>CPU/Memory usage</li>
|
||
|
|
<li>Pod restart counts</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Application Logs</strong> (Elasticsearch, CloudWatch, etc.)</p>
|
||
|
|
<ul>
|
||
|
|
<li>Error messages</li>
|
||
|
|
<li>Stack traces</li>
|
||
|
|
<li>Performance logs</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="alert-triggers--responses"><a class="header" href="#alert-triggers--responses">Alert Triggers & Responses</a></h3>
|
||
|
|
<div class="table-wrapper"><table><thead><tr><th>Alert</th><th>Severity</th><th>Response</th></tr></thead><tbody>
|
||
|
|
<tr><td>Pod CrashLoopBackOff</td><td>1</td><td>Check logs, likely config issue</td></tr>
|
||
|
|
<tr><td>Error rate >10%</td><td>1</td><td>Check recent deployment, consider rollback</td></tr>
|
||
|
|
<tr><td>All pods pending</td><td>1</td><td>Node issue or resource exhausted</td></tr>
|
||
|
|
<tr><td>High memory usage >90%</td><td>2</td><td>Check for memory leak or scale up</td></tr>
|
||
|
|
<tr><td>High latency (2x normal)</td><td>2</td><td>Check database, external services</td></tr>
|
||
|
|
<tr><td>Single pod failed</td><td>3</td><td>Monitor, likely transient</td></tr>
|
||
|
|
</tbody></table>
|
||
|
|
</div>
|
||
|
|
<h3 id="health-check-commands"><a class="header" href="#health-check-commands">Health Check Commands</a></h3>
|
||
|
|
<p>Quick commands to verify everything is working:</p>
|
||
|
|
<pre><code class="language-bash"># Cluster health
|
||
|
|
kubectl cluster-info
|
||
|
|
kubectl get nodes # All should be Ready
|
||
|
|
|
||
|
|
# Service health
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
# All should be Running, 1/1 Ready
|
||
|
|
|
||
|
|
# Quick endpoints test
|
||
|
|
curl http://localhost:8001/health
|
||
|
|
curl http://localhost:3000
|
||
|
|
|
||
|
|
# Pod resources
|
||
|
|
kubectl top pods -n vapora
|
||
|
|
|
||
|
|
# Recent issues
|
||
|
|
kubectl get events -n vapora | grep Warning
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora --tail=20
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="common-failure-scenarios"><a class="header" href="#common-failure-scenarios">Common Failure Scenarios</a></h2>
|
||
|
|
<h3 id="pod-crashloopbackoff"><a class="header" href="#pod-crashloopbackoff">Pod CrashLoopBackOff</a></h3>
|
||
|
|
<p><strong>Symptoms</strong>: Pod keeps restarting repeatedly</p>
|
||
|
|
<p><strong>Diagnosis</strong>:</p>
|
||
|
|
<pre><code class="language-bash">kubectl logs &lt;pod&gt; -n vapora --previous # See what crashed
|
||
|
|
kubectl describe pod &lt;pod&gt; -n vapora # Check events
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Solutions</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>If config error: Fix ConfigMap, restart pod</li>
|
||
|
|
<li>If code error: Rollback deployment</li>
|
||
|
|
<li>If resource issue: Increase limits or scale out</li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Runbook</strong>: <a href="./rollback-runbook.html">Rollback Runbook</a> or <a href="./incident-response-runbook.html">Incident Response</a></p>
|
||
|
|
<h3 id="pod-stuck-in-pending"><a class="header" href="#pod-stuck-in-pending">Pod Stuck in Pending</a></h3>
|
||
|
|
<p><strong>Symptoms</strong>: Pod won't start, stuck in "Pending" state</p>
|
||
|
|
<p><strong>Diagnosis</strong>:</p>
|
||
|
|
<pre><code class="language-bash">kubectl describe pod &lt;pod&gt; -n vapora # Check "Events" section
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Common causes</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Insufficient CPU/memory on nodes</li>
|
||
|
|
<li>Node disk full</li>
|
||
|
|
<li>Pod can't be scheduled</li>
|
||
|
|
<li>Persistent volume not available</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Solutions</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Scale down other workloads</li>
|
||
|
|
<li>Add more nodes</li>
|
||
|
|
<li>Fix persistent volume issues</li>
|
||
|
|
<li>Check node disk space</li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Runbook</strong>: <a href="./on-call-procedures.html">On-Call Procedures</a> → "Common Questions"</p>
|
||
|
|
<h3 id="service-unresponsive-connection-refused"><a class="header" href="#service-unresponsive-connection-refused">Service Unresponsive (Connection Refused)</a></h3>
|
||
|
|
<p><strong>Symptoms</strong>: <code>curl: (7) Failed to connect to localhost port 8001</code></p>
|
||
|
|
<p><strong>Diagnosis</strong>:</p>
|
||
|
|
<pre><code class="language-bash">kubectl get pods -n vapora # Are pods even running?
|
||
|
|
kubectl get service vapora-backend -n vapora # Does service exist?
|
||
|
|
kubectl get endpoints -n vapora # Do endpoints exist?
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Common causes</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Pods not running (restart loops)</li>
|
||
|
|
<li>Service missing or misconfigured</li>
|
||
|
|
<li>Port incorrect</li>
|
||
|
|
<li>Network policy blocking traffic</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Solutions</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Verify pods running: <code>kubectl get pods -n vapora</code></li>
|
||
|
|
<li>Verify service exists: <code>kubectl get svc -n vapora</code></li>
|
||
|
|
<li>Check endpoints: <code>kubectl get endpoints -n vapora</code></li>
|
||
|
|
<li>Port-forward if the issue is with routing: <code>kubectl port-forward svc/vapora-backend 8001:8001 -n vapora</code></li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Runbook</strong>: <a href="./incident-response-runbook.html">Incident Response</a></p>
|
||
|
|
<h3 id="high-error-rate"><a class="header" href="#high-error-rate">High Error Rate</a></h3>
|
||
|
|
<p><strong>Symptoms</strong>: Dashboard shows >5% 5xx errors</p>
|
||
|
|
<p><strong>Diagnosis</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># Check which endpoint
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora | grep "ERROR\|500"
|
||
|
|
|
||
|
|
# Check recent deployment
|
||
|
|
git log -1 --oneline provisioning/
|
||
|
|
|
||
|
|
# Check dependencies
|
||
|
|
curl http://localhost:8001/health # is it healthy?
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Common causes</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Recent bad deployment</li>
|
||
|
|
<li>Database connectivity issue</li>
|
||
|
|
<li>Configuration error</li>
|
||
|
|
<li>Dependency service down</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Solutions</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>If recent deployment: Consider rollback</li>
|
||
|
|
<li>Check ConfigMap for typos</li>
|
||
|
|
<li>Check database connectivity</li>
|
||
|
|
<li>Check external service health</li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Runbook</strong>: <a href="./rollback-runbook.html">Rollback Runbook</a> or <a href="./incident-response-runbook.html">Incident Response</a></p>
|
||
|
|
<h3 id="resource-exhaustion-cpumemory"><a class="header" href="#resource-exhaustion-cpumemory">Resource Exhaustion (CPU/Memory)</a></h3>
|
||
|
|
<p><strong>Symptoms</strong>: <code>kubectl top pods</code> shows pod at 100% usage or "limits exceeded"</p>
|
||
|
|
<p><strong>Diagnosis</strong>:</p>
|
||
|
|
<pre><code class="language-bash">kubectl top nodes # Overall node usage
|
||
|
|
kubectl top pods -n vapora # Per-pod usage
|
||
|
|
kubectl get pod &lt;pod&gt; -n vapora -o yaml | grep limits -A 10 # Check limits
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Solutions</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Increase pod resource limits (requires redeployment)</li>
|
||
|
|
<li>Scale out (add more replicas)</li>
|
||
|
|
<li>Scale down other workloads</li>
|
||
|
|
<li>Investigate memory leak if growing</li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Runbook</strong>: <a href="./deployment-runbook.html">Deployment Runbook</a> → Phase 4 (Verification)</p>
|
||
|
|
<h3 id="database-connection-errors"><a class="header" href="#database-connection-errors">Database Connection Errors</a></h3>
|
||
|
|
<p><strong>Symptoms</strong>: <code>ERROR: could not connect to database</code></p>
|
||
|
|
<p><strong>Diagnosis</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># Check database is running
|
||
|
|
kubectl get pods -n &lt;database-namespace&gt;
|
||
|
|
|
||
|
|
# Check credentials in ConfigMap
|
||
|
|
kubectl get configmap vapora-config -n vapora -o yaml | grep -i "database\|password"
|
||
|
|
|
||
|
|
# Test connectivity
|
||
|
|
kubectl exec &lt;pod&gt; -n vapora -- sh -c 'psql "$DATABASE_URL"'
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Solutions</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>If credentials wrong: Fix in ConfigMap, restart pods</li>
|
||
|
|
<li>If database down: Escalate to DBA</li>
|
||
|
|
<li>If network issue: Network team investigation</li>
|
||
|
|
<li>If permissions: Update database user</li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Runbook</strong>: <a href="./incident-response-runbook.html">Incident Response</a> → "Root Cause: Database Issues"</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="communication-templates"><a class="header" href="#communication-templates">Communication Templates</a></h2>
|
||
|
|
<h3 id="deployment-start"><a class="header" href="#deployment-start">Deployment Start</a></h3>
|
||
|
|
<pre><code>🚀 Deployment starting
|
||
|
|
|
||
|
|
Service: VAPORA
|
||
|
|
Version: v1.2.1
|
||
|
|
Mode: Enterprise
|
||
|
|
Expected duration: 10-15 minutes
|
||
|
|
|
||
|
|
Will update every 2 minutes. Questions? Ask in #deployments
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="deployment-complete"><a class="header" href="#deployment-complete">Deployment Complete</a></h3>
|
||
|
|
<pre><code>✅ Deployment complete
|
||
|
|
|
||
|
|
Duration: 12 minutes
|
||
|
|
Status: All services healthy
|
||
|
|
Pods: All running
|
||
|
|
|
||
|
|
Health check results:
|
||
|
|
✓ Backend: responding
|
||
|
|
✓ Frontend: accessible
|
||
|
|
✓ API: normal latency
|
||
|
|
✓ No errors in logs
|
||
|
|
|
||
|
|
Next step: Monitor for 2 hours
|
||
|
|
Contact: @on-call-engineer
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="incident-declared"><a class="header" href="#incident-declared">Incident Declared</a></h3>
|
||
|
|
<pre><code>🔴 INCIDENT DECLARED
|
||
|
|
|
||
|
|
Service: VAPORA Backend
|
||
|
|
Severity: 1 (Critical)
|
||
|
|
Time detected: HH:MM UTC
|
||
|
|
Current status: Investigating
|
||
|
|
|
||
|
|
Updates every 2 minutes
|
||
|
|
/cc @on-call-engineer @senior-engineer
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="incident-resolved"><a class="header" href="#incident-resolved">Incident Resolved</a></h3>
|
||
|
|
<pre><code>✅ Incident resolved
|
||
|
|
|
||
|
|
Duration: 8 minutes
|
||
|
|
Root cause: [description]
|
||
|
|
Fix: [what was done]
|
||
|
|
|
||
|
|
All services healthy, monitoring for 1 hour
|
||
|
|
Post-mortem scheduled for [date]
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="rollback-executed"><a class="header" href="#rollback-executed">Rollback Executed</a></h3>
|
||
|
|
<pre><code>🔙 Rollback executed
|
||
|
|
|
||
|
|
Issue detected in v1.2.1
|
||
|
|
Rolled back to v1.2.0
|
||
|
|
|
||
|
|
Status: Services recovering
|
||
|
|
Timeline: Issue 14:30 → Rollback 14:32 → Recovered 14:35
|
||
|
|
|
||
|
|
Investigating root cause
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="escalation-matrix"><a class="header" href="#escalation-matrix">Escalation Matrix</a></h2>
|
||
|
|
<p>When unsure who to contact:</p>
|
||
|
|
<div class="table-wrapper"><table><thead><tr><th>Issue Type</th><th>First Contact</th><th>Escalation</th><th>Emergency</th></tr></thead><tbody>
|
||
|
|
<tr><td><strong>Deployment issue</strong></td><td>Deployment lead</td><td>Ops team</td><td>Ops manager</td></tr>
|
||
|
|
<tr><td><strong>Pod/Container</strong></td><td>On-call engineer</td><td>Senior engineer</td><td>Director of Eng</td></tr>
|
||
|
|
<tr><td><strong>Database</strong></td><td>DBA team</td><td>Ops manager</td><td>CTO</td></tr>
|
||
|
|
<tr><td><strong>Infrastructure</strong></td><td>Infra team</td><td>Ops manager</td><td>VP Ops</td></tr>
|
||
|
|
<tr><td><strong>Security issue</strong></td><td>Security team</td><td>CISO</td><td>CEO</td></tr>
|
||
|
|
<tr><td><strong>Networking</strong></td><td>Network team</td><td>Ops manager</td><td>CTO</td></tr>
|
||
|
|
</tbody></table>
|
||
|
|
</div>
|
||
|
|
<hr />
|
||
|
|
<h2 id="tools--commands-quick-reference"><a class="header" href="#tools--commands-quick-reference">Tools & Commands Quick Reference</a></h2>
|
||
|
|
<h3 id="essential-kubectl-commands"><a class="header" href="#essential-kubectl-commands">Essential kubectl Commands</a></h3>
|
||
|
|
<pre><code class="language-bash"># Get status
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
kubectl get deployments -n vapora
|
||
|
|
kubectl get services -n vapora
|
||
|
|
|
||
|
|
# Logs
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora
|
||
|
|
kubectl logs &lt;pod&gt; -n vapora --previous # Previous crash
|
||
|
|
kubectl logs &lt;pod&gt; -n vapora -f # Follow/tail
|
||
|
|
|
||
|
|
# Execute commands
|
||
|
|
kubectl exec -it &lt;pod&gt; -n vapora -- bash
|
||
|
|
kubectl exec &lt;pod&gt; -n vapora -- curl http://localhost:8001/health
|
||
|
|
|
||
|
|
# Describe (detailed info)
|
||
|
|
kubectl describe pod &lt;pod&gt; -n vapora
|
||
|
|
kubectl describe node &lt;node&gt;
|
||
|
|
|
||
|
|
# Port forward (local access)
|
||
|
|
kubectl port-forward svc/vapora-backend 8001:8001 -n vapora
|
||
|
|
|
||
|
|
# Restart pods
|
||
|
|
kubectl rollout restart deployment/vapora-backend -n vapora
|
||
|
|
|
||
|
|
# Rollback
|
||
|
|
kubectl rollout undo deployment/vapora-backend -n vapora
|
||
|
|
|
||
|
|
# Scale
|
||
|
|
kubectl scale deployment/vapora-backend --replicas=5 -n vapora
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="useful-aliases"><a class="header" href="#useful-aliases">Useful Aliases</a></h3>
|
||
|
|
<pre><code class="language-bash">alias k='kubectl'
|
||
|
|
alias kgp='kubectl get pods'
|
||
|
|
alias kgd='kubectl get deployments'
|
||
|
|
alias kgs='kubectl get services'
|
||
|
|
alias klogs='kubectl logs'
|
||
|
|
alias kexec='kubectl exec'
|
||
|
|
alias kdesc='kubectl describe'
|
||
|
|
alias ktop='kubectl top'
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="before-your-first-deployment"><a class="header" href="#before-your-first-deployment">Before Your First Deployment</a></h2>
|
||
|
|
<ol>
|
||
|
|
<li><strong>Read all runbooks</strong>: Thoroughly review all procedures</li>
|
||
|
|
<li><strong>Practice in staging</strong>: Do a test deployment to staging first</li>
|
||
|
|
<li><strong>Understand rollback</strong>: Know how to rollback before deploying</li>
|
||
|
|
<li><strong>Get trained</strong>: Have senior engineer walk through procedures</li>
|
||
|
|
<li><strong>Test tools</strong>: Verify kubectl and other tools work</li>
|
||
|
|
<li><strong>Verify access</strong>: Confirm you have cluster access</li>
|
||
|
|
<li><strong>Know contacts</strong>: Have escalation contacts readily available</li>
|
||
|
|
<li><strong>Review history</strong>: Look at past deployments to understand patterns</li>
|
||
|
|
</ol>
|
||
|
|
<hr />
|
||
|
|
<h2 id="continuous-improvement"><a class="header" href="#continuous-improvement">Continuous Improvement</a></h2>
|
||
|
|
<h3 id="after-each-deployment"><a class="header" href="#after-each-deployment">After Each Deployment</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Were all runbooks clear?</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Any steps missing or unclear?</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Any issues that could be prevented?</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Update documentation with learnings</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="monthly-review"><a class="header" href="#monthly-review">Monthly Review</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Review all incidents from past month</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Update procedures based on patterns</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Refresh team on any changes</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Update escalation contacts</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Review and improve alerting</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="key-principles"><a class="header" href="#key-principles">Key Principles</a></h2>
|
||
|
|
<p>✅ <strong>Safety First</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Always dry-run before applying</li>
|
||
|
|
<li>Roll back quickly if issues are detected</li>
|
||
|
|
<li>Better to be conservative</li>
|
||
|
|
</ul>
|
||
|
|
<p>✅ <strong>Communication</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Communicate early and often</li>
|
||
|
|
<li>Update every 2-5 minutes during incidents</li>
|
||
|
|
<li>Notify stakeholders proactively</li>
|
||
|
|
</ul>
|
||
|
|
<p>✅ <strong>Documentation</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Document everything you do</li>
|
||
|
|
<li>Update runbooks with learnings</li>
|
||
|
|
<li>Share knowledge with team</li>
|
||
|
|
</ul>
|
||
|
|
<p>✅ <strong>Preparation</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Plan deployments thoroughly</li>
|
||
|
|
<li>Test before going live</li>
|
||
|
|
<li>Have rollback plan ready</li>
|
||
|
|
</ul>
|
||
|
|
<p>✅ <strong>Quick Response</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Detect issues quickly</li>
|
||
|
|
<li>Diagnose systematically</li>
|
||
|
|
<li>Execute fixes decisively</li>
|
||
|
|
</ul>
|
||
|
|
<p>❌ <strong>Avoid</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Guessing without verifying</li>
|
||
|
|
<li>Skipping steps to save time</li>
|
||
|
|
<li>Assuming systems are working</li>
|
||
|
|
<li>Not communicating with team</li>
|
||
|
|
<li>Making multiple changes at once</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="support--questions"><a class="header" href="#support--questions">Support &amp; Questions</a></h2>
|
||
|
|
<ul>
|
||
|
|
<li><strong>Questions about procedures?</strong> Ask a senior engineer or the operations team</li>
|
||
|
|
<li><strong>Found a runbook gap?</strong> Create an issue/PR to update the documentation</li>
|
||
|
|
<li><strong>Unclear instructions?</strong> Clarify before executing critical operations</li>
|
||
|
|
<li><strong>Ideas for improvement?</strong> Share in team meetings or documentation repo</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="quick-start-your-first-deployment"><a class="header" href="#quick-start-your-first-deployment">Quick Start: Your First Deployment</a></h2>
|
||
|
|
<h3 id="day-0-preparation"><a class="header" href="#day-0-preparation">Day 0: Preparation</a></h3>
|
||
|
|
<ol>
|
||
|
|
<li>Read: <code>pre-deployment-checklist.md</code> (30 min)</li>
|
||
|
|
<li>Read: <code>deployment-runbook.md</code> (30 min)</li>
|
||
|
|
<li>Read: <code>rollback-runbook.md</code> (20 min)</li>
|
||
|
|
<li>Schedule walkthrough with senior engineer (1 hour)</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="day-1-execute-with-mentorship"><a class="header" href="#day-1-execute-with-mentorship">Day 1: Execute with Mentorship</a></h3>
|
||
|
|
<ol>
|
||
|
|
<li>Complete pre-deployment checklist with senior engineer</li>
|
||
|
|
<li>Execute deployment runbook with senior observing</li>
|
||
|
|
<li>Monitor for 2 hours with senior available</li>
|
||
|
|
<li>Debrief: what went well, what to improve</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="day-2-independent-deployments"><a class="header" href="#day-2-independent-deployments">Day 2+: Independent Deployments</a></h3>
|
||
|
|
<ol>
|
||
|
|
<li>Complete checklist independently</li>
|
||
|
|
<li>Execute runbook</li>
|
||
|
|
<li>Document and communicate</li>
|
||
|
|
<li>Ask for help if anything is unclear</li>
|
||
|
|
</ol>
|
||
|
|
<hr />
|
||
|
|
<p><strong>Generated</strong>: 2026-01-12<br />
<strong>Status</strong>: Production-ready<br />
<strong>Last Updated</strong>: 2026-01-12</p>
|
||
|
|
|
||
|
|
</main>
|
||
|
|
|
||
|
|
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
|
|
<!-- Mobile navigation buttons -->
|
||
|
|
<a rel="prev" href="../../tutorials/03-llm-routing.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/deployment-runbook.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<div style="clear: both"></div>
|
||
|
|
</nav>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
|
<a rel="prev" href="../../tutorials/03-llm-routing.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/deployment-runbook.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
</nav>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
<script>
|
||
|
|
window.playground_copyable = true;
|
||
|
|
</script>
|
||
|
|
|
||
|
|
|
||
|
|
<script src="../elasticlunr.min.js"></script>
|
||
|
|
<script src="../mark.min.js"></script>
|
||
|
|
<script src="../searcher.js"></script>
|
||
|
|
|
||
|
|
<script src="../clipboard.min.js"></script>
|
||
|
|
<script src="../highlight.js"></script>
|
||
|
|
<script src="../book.js"></script>
|
||
|
|
|
||
|
|
<!-- Custom JS scripts -->
|
||
|
|
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</body>
|
||
|
|
</html>
|