Vapora/docs/operations/on-call-procedures.html
Jesús Pérez 7110ffeea2
Some checks failed
Rust CI / Security Audit (push) Has been cancelled
Rust CI / Check + Test + Lint (nightly) (push) Has been cancelled
Rust CI / Check + Test + Lint (stable) (push) Has been cancelled
chore: extend doc: adr, tutorials, operations, etc
2026-01-12 03:32:47 +00:00

805 lines
32 KiB
HTML

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>On-Call Procedures - VAPORA Platform Documentation</title>
<!-- Custom HTML head -->
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../favicon.svg">
<link rel="shortcut icon" href="../favicon.png">
<link rel="stylesheet" href="../css/variables.css">
<link rel="stylesheet" href="../css/general.css">
<link rel="stylesheet" href="../css/chrome.css">
<link rel="stylesheet" href="../css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="../fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
// Relative path from this page to the book root; the stylesheet and script
// URLs on this page use the same "../" prefix.
const path_to_root = "../";
// Theme names the theme-selection script further down chooses between, based
// on the (prefers-color-scheme: dark) media query, when no theme is stored.
const default_light_theme = "light";
const default_dark_theme = "dark";
</script>
<!-- Start loading toc.js asap -->
<script src="../toc.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
    // Strip one pair of wrapping double quotes from each stored preference.
    // Every key is handled independently: localStorage.getItem returns null for
    // a key that was never set, and a missing theme must not stop the sidebar
    // value from being repaired (or vice versa).
    try {
        for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
            const stored = localStorage.getItem(key);
            // length > 1 ensures the opening and closing quote are two distinct
            // characters, so a lone '"' is not rewritten to an empty string.
            if (stored !== null && stored.length > 1 && stored.startsWith('"') && stored.endsWith('"')) {
                localStorage.setItem(key, stored.slice(1, -1));
            }
        }
    } catch (e) {
        // Best effort: accessing localStorage can throw; the scripts that read
        // these keys later on this page tolerate that as well.
    }
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
    // Fallback theme follows the (prefers-color-scheme: dark) media query.
    const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
        ? default_dark_theme
        : default_light_theme;

    // A stored preference wins; reading localStorage may throw, in which case
    // `theme` stays undefined and the fallback is used.
    let theme;
    try {
        theme = localStorage.getItem('mdbook-theme');
    } catch (e) { }
    // Loose comparison with null matches exactly null and undefined.
    if (theme == null) {
        theme = default_theme;
    }

    // Replace the 'light' class hard-coded on <html> with the chosen theme and
    // add the "js" class.
    const html = document.documentElement;
    html.classList.remove('light');
    html.classList.add(theme);
    html.classList.add("js");
</script>
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
    let sidebar = null;
    const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");

    if (document.body.clientWidth < 1080) {
        // Narrow viewport: always start hidden, without consulting storage.
        sidebar = 'hidden';
    } else {
        // Wide viewport: use the stored state, defaulting to visible when
        // nothing usable is stored or storage cannot be read.
        try {
            sidebar = localStorage.getItem('mdbook-sidebar');
        } catch (e) { }
        if (!sidebar) {
            sidebar = 'visible';
        }
    }

    // Mirror the state onto the checkbox and replace the 'sidebar-visible'
    // class hard-coded on <html> with the computed one.
    sidebar_toggle.checked = (sidebar === 'visible');
    html.classList.remove('sidebar-visible');
    html.classList.add("sidebar-" + sidebar);
</script>
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
    <!-- populated by js -->
    <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
    <!-- No-JavaScript fallback: embed the static table of contents page.
         The iframe carries a title so assistive technology can announce it. -->
    <noscript>
        <iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
    </noscript>
    <div id="sidebar-resize-handle" class="sidebar-resize-handle">
        <div class="sidebar-resize-indicator"></div>
    </div>
</nav>
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
</label>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
</button>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
</ul>
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
<i class="fa fa-search"></i>
</button>
</div>
<h1 class="menu-title">VAPORA Platform Documentation</h1>
<div class="right-buttons">
<a href="../print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
</a>
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
<i id="git-repository-button" class="fa fa-github"></i>
</a>
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/on-call-procedures.md" title="Suggest an edit" aria-label="Suggest an edit">
<i id="git-edit-button" class="fa fa-edit"></i>
</a>
</div>
</div>
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
</form>
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
    {
        // Block scope keeps these helpers out of the global scope shared by
        // the page's other scripts.
        const sidebarShown = sidebar === 'visible';
        const linkTabIndex = sidebarShown ? 0 : -1;

        document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
        document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);

        // Links inside a hidden sidebar are taken out of the tab order.
        for (const link of Array.from(document.querySelectorAll('#sidebar a'))) {
            link.setAttribute('tabIndex', linkTabIndex);
        }
    }
</script>
<div id="content" class="content">
<main>
<h1 id="on-call-procedures"><a class="header" href="#on-call-procedures">On-Call Procedures</a></h1>
<p>Guide for on-call engineers managing VAPORA production operations.</p>
<hr />
<h2 id="overview"><a class="header" href="#overview">Overview</a></h2>
<p><strong>On-Call Responsibility</strong>: Monitor VAPORA production and respond to incidents during assigned shift</p>
<p><strong>Time Commitment</strong>:</p>
<ul>
<li>During business hours: ~5-10 minutes daily check-ins</li>
<li>During off-hours: Available for emergencies (paged for critical issues)</li>
</ul>
<p><strong>Expected Availability</strong>:</p>
<ul>
<li>Severity 1: Respond within 2 minutes</li>
<li>Severity 2: Respond within 15 minutes</li>
<li>Severity 3: Respond within 1 hour</li>
</ul>
<hr />
<h2 id="before-your-shift-starts"><a class="header" href="#before-your-shift-starts">Before Your Shift Starts</a></h2>
<h3 id="24-hours-before-on-call"><a class="header" href="#24-hours-before-on-call">24 Hours Before On-Call</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Verify schedule: "I'm on-call starting [date] [time]"</li>
<li><input disabled="" type="checkbox"/>
Update your calendar with shift times</li>
<li><input disabled="" type="checkbox"/>
Notify team: "I'll be on-call [dates]"</li>
<li><input disabled="" type="checkbox"/>
Share personal contact info if not already shared</li>
<li><input disabled="" type="checkbox"/>
Download necessary tools/credentials</li>
</ul>
<h3 id="1-hour-before-shift"><a class="header" href="#1-hour-before-shift">1 Hour Before Shift</a></h3>
<ul>
<li>
<p><input disabled="" type="checkbox"/>
Test pager notification system</p>
<pre><code class="language-bash"># Verify Slack notifications working
# Ask previous on-call to send test alert: "/test-alert-to-[yourname]"
</code></pre>
</li>
<li>
<p><input disabled="" type="checkbox"/>
Verify access to necessary systems</p>
<pre><code class="language-bash"># Test each required access:
✓ SSH to bastion host: ssh bastion.vapora.com
✓ kubectl to production: kubectl cluster-info
✓ Slack channels: /join #deployments #alerts
✓ Incident tracking: open Jira/GitHub
✓ Monitoring dashboards: access Grafana
✓ Status page: access status page admin
</code></pre>
</li>
<li>
<p><input disabled="" type="checkbox"/>
Review current system status</p>
<pre><code class="language-bash"># Quick health check
kubectl cluster-info
kubectl get pods -n vapora
kubectl get events -n vapora | head -10
# Should show: All pods Running, no recent errors
</code></pre>
</li>
<li>
<p><input disabled="" type="checkbox"/>
Read recent incident reports</p>
<ul>
<li>Check previous on-call handoff notes</li>
<li>Review any incidents from past week</li>
<li>Note any known issues or monitoring gaps</li>
</ul>
</li>
<li>
<p><input disabled="" type="checkbox"/>
Receive handoff from previous on-call</p>
<pre><code>Ask: "Anything I should know?"
- Any ongoing issues?
- Any deployments planned?
- Any flaky services or known alerts?
- Any customer complaints?
</code></pre>
</li>
</ul>
<hr />
<h2 id="daily-on-call-tasks"><a class="header" href="#daily-on-call-tasks">Daily On-Call Tasks</a></h2>
<h3 id="morning-check-in-after-shift-starts"><a class="header" href="#morning-check-in-after-shift-starts">Morning Check-In (After shift starts)</a></h3>
<pre><code class="language-bash"># Automated check - run this first thing
export NAMESPACE=vapora
echo "=== Cluster Health ==="
kubectl cluster-info
kubectl get nodes
echo "=== Pod Status ==="
kubectl get pods -n $NAMESPACE
kubectl get pods -n $NAMESPACE | grep -v Running
echo "=== Recent Events ==="
kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -10
echo "=== Resource Usage ==="
kubectl top nodes
kubectl top pods -n $NAMESPACE
# If any anomalies: investigate before declaring "all clear"
</code></pre>
<h3 id="mid-shift-check-every-4-hours"><a class="header" href="#mid-shift-check-every-4-hours">Mid-Shift Check (Every 4 hours)</a></h3>
<pre><code class="language-bash"># Quick sanity check
curl https://api.vapora.com/health
curl https://vapora.app/
# Should both return 200 OK
# Check dashboards
# Grafana: any alerts? any trending issues?
# Check Slack #alerts channel
# Any warnings or anomalies posted?
</code></pre>
<h3 id="end-of-shift-handoff-before-shift-ends"><a class="header" href="#end-of-shift-handoff-before-shift-ends">End-of-Shift Handoff (Before shift ends)</a></h3>
<pre><code class="language-bash"># Prepare handoff for next on-call
# 1. Document current state
kubectl get pods -n vapora
kubectl get nodes
kubectl top pods -n vapora
# 2. Check for known issues
kubectl get events -n vapora | grep Warning
# Any persistent warnings?
# 3. Check deployment status
git log -1 --oneline provisioning/
# Any recent changes?
# 4. Document in handoff notes:
echo "HANDOFF NOTES - $(date)
Duration: [start time] to [end time]
Status: All normal / Issues: [list]
Alerts: [any]
Deployments: [any planned]
Known issues: [any]
Recommendations: [any]
" &gt; on-call-handoff.txt
# 5. Pass notes to next on-call
# Send message to @next-on-call with notes
</code></pre>
<hr />
<h2 id="responding-to-alerts"><a class="header" href="#responding-to-alerts">Responding to Alerts</a></h2>
<h3 id="alert-received"><a class="header" href="#alert-received">Alert Received</a></h3>
<p><strong>Step 1: Verify it's real</strong></p>
<pre><code class="language-bash"># Don't panic - verify the alert is legitimate
1. Check the source: is it from our system?
2. Check current status manually: curl endpoints
3. Check dashboard: see if issue visible there
4. Check cluster: kubectl get pods
# False alarms happen - verify before escalating
</code></pre>
<p><strong>Step 2: Assess severity</strong></p>
<ul>
<li>Is service completely down? → Severity 1</li>
<li>Is service partially down? → Severity 2</li>
<li>Is there a warning/anomaly? → Severity 3</li>
</ul>
<p><strong>Step 3: Declare incident</strong></p>
<pre><code class="language-bash"># Create ticket (Severity 1 is emergency)
# If Severity 1:
# - Alert team immediately
# - Create #incident-[date] channel
# - Start 2-minute update cycle
# See: Incident Response Runbook
</code></pre>
<h3 id="during-incident"><a class="header" href="#during-incident">During Incident</a></h3>
<p><strong>Your role as on-call</strong>:</p>
<ol>
<li><strong>Respond quickly</strong> - First 2 minutes are critical</li>
<li><strong>Communicate</strong> - Update team/status page</li>
<li><strong>Investigate</strong> - Follow diagnostics in runbooks</li>
<li><strong>Escalate if needed</strong> - Page senior engineer if stuck</li>
<li><strong>Execute fix</strong> - Follow approved procedures</li>
<li><strong>Verify recovery</strong> - Confirm service healthy</li>
<li><strong>Document</strong> - Record what happened</li>
</ol>
<p><strong>Key communication</strong>:</p>
<ul>
<li>Initial response time: &lt; 2 min (post "investigating")</li>
<li>Status update: every 2-5 minutes</li>
<li>Escalation: if not clear after 5 minutes</li>
<li>Resolution: post "incident resolved"</li>
</ul>
<h3 id="alert-examples--responses"><a class="header" href="#alert-examples--responses">Alert Examples &amp; Responses</a></h3>
<h4 id="alert-pod-crashloopbackoff"><a class="header" href="#alert-pod-crashloopbackoff">Alert: "Pod CrashLoopBackOff"</a></h4>
<pre><code>1. Get pod logs: kubectl logs &lt;pod&gt; --previous
2. Check for config issues: kubectl get configmap
3. Check for resource limits: kubectl describe pod &lt;pod&gt;
4. Decide: rollback or fix config
</code></pre>
<h4 id="alert-high-error-rate-5-5xx"><a class="header" href="#alert-high-error-rate-5-5xx">Alert: "High Error Rate (&gt;5% 5xx)"</a></h4>
<pre><code>1. Check which endpoint: tail application logs
2. Check dependencies: database, cache, external APIs
3. Check recent deployment: git log
4. Decide: rollback or investigate further
</code></pre>
<h4 id="alert-pod-memory--90"><a class="header" href="#alert-pod-memory--90">Alert: "Pod Memory &gt; 90%"</a></h4>
<pre><code>1. Check actual usage: kubectl top pod &lt;pod&gt;
2. Check limits: kubectl get pod &lt;pod&gt; -o yaml | grep memory
3. Decide: scale up or investigate memory leak
</code></pre>
<h4 id="alert-node-notready"><a class="header" href="#alert-node-notready">Alert: "Node NotReady"</a></h4>
<pre><code>1. Check node: kubectl describe node &lt;node&gt;
2. Check kubelet: ssh node-x 'systemctl status kubelet'
3. Contact infrastructure team for hardware issues
4. Possibly: drain node and reschedule pods
</code></pre>
<hr />
<h2 id="monitoring-dashboard-setup"><a class="header" href="#monitoring-dashboard-setup">Monitoring Dashboard Setup</a></h2>
<p>When you start shift, have these visible:</p>
<h3 id="browser-tabs-keep-open"><a class="header" href="#browser-tabs-keep-open">Browser Tabs (Keep Open)</a></h3>
<ol>
<li>
<p><strong>Grafana Dashboard</strong> - VAPORA Cluster Overview</p>
<ul>
<li>Pod CPU/Memory usage</li>
<li>Request rate and latency</li>
<li>Error rate</li>
<li>Deployment status</li>
</ul>
</li>
<li>
<p><strong>Kubernetes Dashboard</strong></p>
<ul>
<li>kubectl port-forward -n kube-system svc/kubernetes-dashboard 8443:443</li>
<li>Or use K9s terminal UI: <code>k9s</code></li>
</ul>
</li>
<li>
<p><strong>Alert Dashboard</strong> (if available)</p>
<ul>
<li>Prometheus Alerts</li>
<li>Or monitoring system of choice</li>
</ul>
</li>
<li>
<p><strong>Status Page</strong> (if public-facing)</p>
<ul>
<li>Check for ongoing incidents</li>
<li>Prepare to update</li>
</ul>
</li>
</ol>
<h3 id="terminal-windows-keep-ready"><a class="header" href="#terminal-windows-keep-ready">Terminal Windows (Keep Ready)</a></h3>
<pre><code class="language-bash"># Terminal 1: Watch pods
watch kubectl get pods -n vapora
# Terminal 2: Tail logs
kubectl logs -f deployment/vapora-backend -n vapora
# Terminal 3: General kubectl commands
kubectl -n vapora get events --watch
# Terminal 4: Ad-hoc commands and troubleshooting
# (leave empty for ad-hoc use)
</code></pre>
<hr />
<h2 id="common-questions-during-on-call"><a class="header" href="#common-questions-during-on-call">Common Questions During On-Call</a></h2>
<h3 id="q-i-think-i-found-an-issue-but-im-not-sure-its-a-problem"><a class="header" href="#q-i-think-i-found-an-issue-but-im-not-sure-its-a-problem">Q: I think I found an issue, but I'm not sure it's a problem</a></h3>
<p><strong>A</strong>: When in doubt, escalate:</p>
<ol>
<li>Post in #deployments channel with observation</li>
<li>Ask: "Does this look normal?"</li>
<li>If others also find it unusual, treat it as a potential issue</li>
<li>Better safe than sorry (on production)</li>
</ol>
<h3 id="q-do-i-need-to-respond-to-every-alert"><a class="header" href="#q-do-i-need-to-respond-to-every-alert">Q: Do I need to respond to every alert?</a></h3>
<p><strong>A</strong>: Yes. Even false alarms need verification:</p>
<ol>
<li>Confirm it's a false alarm (don't just assume)</li>
<li>Update alert if it's misconfigured</li>
<li>Never ignore alerts - fix the alerting</li>
</ol>
<h3 id="q-service-looks-broken-but-dashboard-looks-normal"><a class="header" href="#q-service-looks-broken-but-dashboard-looks-normal">Q: Service looks broken but dashboard looks normal</a></h3>
<p><strong>A</strong>:</p>
<ol>
<li>Check whether the dashboard is delayed (refresh is sometimes slow)</li>
<li>Test manually: curl endpoints</li>
<li>Check pod logs directly: kubectl logs</li>
<li>Trust actual service health over dashboard</li>
</ol>
<h3 id="q-can-i-deploy-changes-while-on-call"><a class="header" href="#q-can-i-deploy-changes-while-on-call">Q: Can I deploy changes while on-call?</a></h3>
<p><strong>A</strong>:</p>
<ul>
<li><strong>Yes</strong> if it's an emergency fix for an active incident</li>
<li><strong>No</strong> for normal features/changes (schedule for dedicated deployment window)</li>
<li><strong>Escalate</strong> if unsure</li>
</ul>
<h3 id="q-something-looks-weird-but-i-cant-reproduce-it"><a class="header" href="#q-something-looks-weird-but-i-cant-reproduce-it">Q: Something looks weird but I can't reproduce it</a></h3>
<p><strong>A</strong>:</p>
<ol>
<li>Save any evidence: logs, metrics, events</li>
<li>Monitor more closely for pattern</li>
<li>Document in ticket for later investigation</li>
<li>Escalate if behavior continues</li>
</ol>
<h3 id="q-an-alert-keeps-firing-but-service-is-fine"><a class="header" href="#q-an-alert-keeps-firing-but-service-is-fine">Q: An alert keeps firing but service is fine</a></h3>
<p><strong>A</strong>:</p>
<ol>
<li>Investigate why alert is false</li>
<li>Check alert thresholds (might be too sensitive)</li>
<li>Fix the alert configuration</li>
<li>Update alert runbook with details</li>
</ol>
<hr />
<h2 id="escalation-decision-tree"><a class="header" href="#escalation-decision-tree">Escalation Decision Tree</a></h2>
<p>When should you escalate?</p>
<pre><code>START: Issue detected
Is it Severity 1 (complete outage)?
YES → Escalate immediately to senior engineer
NO → Continue
Have you diagnosed root cause in 5 minutes?
YES → Continue with fix
NO → Page senior engineer or escalate
Does fix require infrastructure/database changes?
YES → Contact infrastructure/DBA team
NO → Continue with fix
Is this outside your authority (company policy)?
YES → Escalate to manager
NO → Proceed with fix
Implemented fix, service still broken?
YES → Page senior engineer immediately
NO → Verify and close incident
Result: Uncertain?
→ Ask senior engineer or manager
→ Always better to escalate early
</code></pre>
<hr />
<h2 id="when-to-page-senior-engineer"><a class="header" href="#when-to-page-senior-engineer">When to Page Senior Engineer</a></h2>
<p><strong>Page immediately if</strong>:</p>
<ul>
<li>Service completely down (Severity 1)</li>
<li>Database appears corrupted</li>
<li>You're stuck for &gt;5 minutes</li>
<li>Rollback didn't work</li>
<li>Need infrastructure changes urgently</li>
<li>Something affecting &gt;50% of users</li>
</ul>
<p><strong>Don't page just because</strong>:</p>
<ul>
<li>Single pod restarting (monitor first)</li>
<li>Transient network errors</li>
<li>You're slightly unsure (ask in #deployments first)</li>
<li>It's 3 AM and not critical (use tickets for morning)</li>
</ul>
<hr />
<h2 id="end-of-shift-handoff"><a class="header" href="#end-of-shift-handoff">End of Shift Handoff</a></h2>
<h3 id="create-handoff-report"><a class="header" href="#create-handoff-report">Create Handoff Report</a></h3>
<pre><code>SHIFT HANDOFF - [Your Name]
Dates: [Start] to [End] UTC
Duration: [X hours]
STATUS: ✅ All normal / ⚠️ Issues ongoing / ❌ Critical
INCIDENTS: [Number]
- Incident 1: [description, resolved or ongoing]
- Incident 2: [description]
ALERTS: [Any unusual alerts]
- Alert 1: [description, action taken]
DEPLOYMENTS: [Any scheduled or completed]
- Deployment 1: [status]
KNOWN ISSUES:
- Issue 1: [description, workaround]
- Issue 2: [description]
MONITORING NOTES:
- [Any trending issues]
- [Any monitoring gaps]
- [Any recommended actions]
RECOMMENDATIONS FOR NEXT ON-CALL:
1. [Action item]
2. [Action item]
3. [Action item]
NEXT ON-CALL: @[name]
</code></pre>
<h3 id="send-to-next-on-call"><a class="header" href="#send-to-next-on-call">Send to Next On-Call</a></h3>
<pre><code>@next-on-call - Handoff notes attached:
[paste report above]
Key points:
- [Most important item]
- [Second important]
- [Any urgent follow-ups]
Questions? I'm available for 30 min
</code></pre>
<hr />
<h2 id="tools--commands-reference"><a class="header" href="#tools--commands-reference">Tools &amp; Commands Reference</a></h2>
<h3 id="essential-commands"><a class="header" href="#essential-commands">Essential Commands</a></h3>
<pre><code class="language-bash"># Pod management
kubectl get pods -n vapora
kubectl logs pod-name -n vapora
kubectl exec pod-name -n vapora -- bash
kubectl describe pod pod-name -n vapora
kubectl delete pod pod-name -n vapora # (recreates via deployment)
# Deployment management
kubectl get deployments -n vapora
kubectl rollout status deployment/vapora-backend -n vapora
kubectl rollout undo deployment/vapora-backend -n vapora
kubectl scale deployment/vapora-backend --replicas=5 -n vapora
# Service health
curl http://localhost:8001/health
kubectl get events -n vapora
kubectl top pods -n vapora
kubectl get endpoints -n vapora
# Quick diagnostics
kubectl describe nodes
kubectl cluster-info
kubectl get persistentvolumes
</code></pre>
<h3 id="useful-tools"><a class="header" href="#useful-tools">Useful Tools</a></h3>
<pre><code class="language-bash"># Install these on your workstation
brew install kubectl # Kubernetes CLI
brew install k9s # Terminal UI for K8s
brew install watch # Monitor command output
brew install jq # JSON processing
brew install yq # YAML processing
brew install grpcurl # gRPC debugging
# Aliases to save time
alias k='kubectl'
alias kgp='kubectl get pods'
alias klogs='kubectl logs'
alias kexec='kubectl exec'
</code></pre>
<h3 id="dashboards--links"><a class="header" href="#dashboards--links">Dashboards &amp; Links</a></h3>
<p>Bookmark these:</p>
<ul>
<li>Grafana: <code>https://grafana.vapora.com</code></li>
<li>Status Page: <code>https://status.vapora.com</code></li>
<li>Incident Tracker: <code>https://github.com/vapora-platform/vapora/issues</code></li>
<li>Runbooks: <code>https://github.com/vapora-platform/vapora/tree/main/docs/operations</code></li>
<li>Kubernetes Dashboard: Run <code>kubectl proxy</code> then <code>http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/</code></li>
</ul>
<hr />
<h2 id="on-call-checklist"><a class="header" href="#on-call-checklist">On-Call Checklist</a></h2>
<h3 id="starting-shift"><a class="header" href="#starting-shift">Starting Shift</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Verified pager notifications working</li>
<li><input disabled="" type="checkbox"/>
Tested access to all systems</li>
<li><input disabled="" type="checkbox"/>
Reviewed current system status</li>
<li><input disabled="" type="checkbox"/>
Read recent incidents</li>
<li><input disabled="" type="checkbox"/>
Received handoff from previous on-call</li>
<li><input disabled="" type="checkbox"/>
Set up monitoring dashboards</li>
<li><input disabled="" type="checkbox"/>
Opened necessary terminal windows</li>
<li><input disabled="" type="checkbox"/>
Posted "on-call" status in #deployments</li>
</ul>
<h3 id="during-shift"><a class="header" href="#during-shift">During Shift</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Responded to all alerts within SLA</li>
<li><input disabled="" type="checkbox"/>
Updated incident status regularly</li>
<li><input disabled="" type="checkbox"/>
Escalated when appropriate</li>
<li><input disabled="" type="checkbox"/>
Documented actions in tickets</li>
<li><input disabled="" type="checkbox"/>
Verified fixes before closing</li>
<li><input disabled="" type="checkbox"/>
Communicated clearly with team</li>
</ul>
<h3 id="ending-shift"><a class="header" href="#ending-shift">Ending Shift</a></h3>
<ul>
<li><input disabled="" type="checkbox"/>
Created handoff report</li>
<li><input disabled="" type="checkbox"/>
Resolved or escalated open issues</li>
<li><input disabled="" type="checkbox"/>
Updated monitoring for anomalies</li>
<li><input disabled="" type="checkbox"/>
Passed report to next on-call</li>
<li><input disabled="" type="checkbox"/>
Closed out incident tickets</li>
<li><input disabled="" type="checkbox"/>
Verified next on-call is ready</li>
<li><input disabled="" type="checkbox"/>
Posted "handing off to [next on-call]" in #deployments</li>
</ul>
<hr />
<h2 id="post-on-call-follow-up"><a class="header" href="#post-on-call-follow-up">Post-On-Call Follow-Up</a></h2>
<p>After your shift:</p>
<ol>
<li>
<p><strong>Document lessons learned</strong></p>
<ul>
<li>Did you learn something new?</li>
<li>Did any procedure need updating?</li>
<li>Were any runbooks unclear?</li>
</ul>
</li>
<li>
<p><strong>Update runbooks</strong></p>
<ul>
<li>If you found gaps, update procedures</li>
<li>If you had questions, update docs</li>
<li>Share improvements with team</li>
</ul>
</li>
<li>
<p><strong>Communicate findings</strong></p>
<ul>
<li>Anything the team should know?</li>
<li>Any recommendations?</li>
<li>Trends to watch?</li>
</ul>
</li>
<li>
<p><strong>Celebrate successes</strong></p>
<ul>
<li>Any incidents quickly resolved?</li>
<li>Any new insights?</li>
<li>Recognize good practices</li>
</ul>
</li>
</ol>
<hr />
<h2 id="emergency-contacts"><a class="header" href="#emergency-contacts">Emergency Contacts</a></h2>
<p>Keep these accessible:</p>
<pre><code>ESCALATION CONTACTS:
Primary Escalation: [Name] [Phone] [Slack]
Backup Escalation: [Name] [Phone] [Slack]
Infrastructure: [Name] [Phone] [Slack]
Database Team: [Name] [Phone] [Slack]
Manager: [Name] [Phone] [Slack]
External Contacts:
AWS Support: [Account ID] [Contact]
CDN Provider: [Account] [Contact]
DNS Provider: [Account] [Contact]
EMERGENCY PROCEDURES:
- Complete AWS outage: Contact AWS support immediately
- Database failure: Contact DBA, activate backups
- Security incident: Contact security team immediately
- Major data loss: Activate disaster recovery
</code></pre>
<hr />
<h2 id="remember"><a class="header" href="#remember">Remember</a></h2>
<p><strong>You are the guardian of production</strong> - Your vigilance keeps services running</p>
<p><strong>Better safe than sorry</strong> - Escalate early and often</p>
<p><strong>Communication is key</strong> - Keep team informed</p>
<p><strong>Document everything</strong> - Future you and team will thank you</p>
<p><strong>Ask for help</strong> - No shame in escalating</p>
<p><strong>Don't guess</strong> - Verify before taking action</p>
<p><strong>Don't stay silent</strong> - Alert team to any issues</p>
<p><strong>Don't ignore alerts</strong> - Even false ones need investigation</p>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
    <!-- Mobile navigation buttons -->
    <!-- Chapter links resolve against this page's directory; the book root is
         one level up ("../", the same prefix the asset URLs use), so sibling
         chapters are reached via "../operations/". -->
    <a rel="prev" href="../operations/monitoring-operations.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
        <i class="fa fa-angle-left"></i>
    </a>
    <a rel="next prefetch" href="../operations/incident-response-runbook.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
        <i class="fa fa-angle-right"></i>
    </a>
    <div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
    <!-- Wide-screen previous/next chapter links. They resolve against this
         page's directory; the book root is one level up ("../", the same
         prefix the asset URLs use), so sibling chapters are reached via
         "../operations/". -->
    <a rel="prev" href="../operations/monitoring-operations.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
        <i class="fa fa-angle-left"></i>
    </a>
    <a rel="next prefetch" href="../operations/incident-response-runbook.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
        <i class="fa fa-angle-right"></i>
    </a>
</nav>
</div>
<script>
// Set on window before the scripts that follow on this page are loaded.
// NOTE(review): presumably read by book.js to enable copy buttons on code
// blocks — confirm against book.js.
window.playground_copyable = true;
</script>
<script src="../elasticlunr.min.js"></script>
<script src="../mark.min.js"></script>
<script src="../searcher.js"></script>
<script src="../clipboard.min.js"></script>
<script src="../highlight.js"></script>
<script src="../book.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>