805 lines
32 KiB
HTML
805 lines
32 KiB
HTML
|
|
<!DOCTYPE HTML>
|
||
|
|
<html lang="en" class="light sidebar-visible" dir="ltr">
|
||
|
|
<head>
|
||
|
|
<!-- Book generated using mdBook -->
|
||
|
|
<meta charset="UTF-8">
|
||
|
|
<title>On-Call Procedures - VAPORA Platform Documentation</title>
|
||
|
|
|
||
|
|
|
||
|
|
<!-- Custom HTML head -->
|
||
|
|
|
||
|
|
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
|
||
|
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||
|
|
<meta name="theme-color" content="#ffffff">
|
||
|
|
|
||
|
|
<link rel="icon" href="../favicon.svg">
|
||
|
|
<link rel="shortcut icon" href="../favicon.png">
|
||
|
|
<link rel="stylesheet" href="../css/variables.css">
|
||
|
|
<link rel="stylesheet" href="../css/general.css">
|
||
|
|
<link rel="stylesheet" href="../css/chrome.css">
|
||
|
|
<link rel="stylesheet" href="../css/print.css" media="print">
|
||
|
|
|
||
|
|
<!-- Fonts -->
|
||
|
|
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
|
||
|
|
<link rel="stylesheet" href="../fonts/fonts.css">
|
||
|
|
|
||
|
|
<!-- Highlight.js Stylesheets -->
|
||
|
|
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
|
||
|
|
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
|
||
|
|
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
|
||
|
|
|
||
|
|
<!-- Custom theme stylesheets -->
|
||
|
|
|
||
|
|
|
||
|
|
<!-- Provide site root and default themes to javascript -->
|
||
|
|
<script>
    // Site root and default theme names, exposed to the page's JavaScript.
    const path_to_root = "../",
          default_light_theme = "light",
          default_dark_theme = "dark";
</script>
|
||
|
|
<!-- Start loading toc.js asap -->
|
||
|
|
<script src="../toc.js"></script>
|
||
|
|
</head>
|
||
|
|
<body>
|
||
|
|
<div id="mdbook-help-container">
|
||
|
|
<div id="mdbook-help-popup">
|
||
|
|
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
|
||
|
|
<div>
|
||
|
|
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
|
||
|
|
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
|
||
|
|
<p>Press <kbd>?</kbd> to show this help</p>
|
||
|
|
<p>Press <kbd>Esc</kbd> to hide this help</p>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
<div id="body-container">
|
||
|
|
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
||
|
|
<script>
    // Strip a wrapping pair of double quotes from the stored theme and sidebar
    // values so later reads see the bare value.
    //
    // Each key is handled on its own: getItem() returns null for a key that was
    // never stored, and a missing theme must not prevent the sidebar value from
    // being cleaned up.
    try {
        for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
            const stored = localStorage.getItem(key);
            if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
                localStorage.setItem(key, stored.slice(1, stored.length - 1));
            }
        }
    } catch (e) {
        // Best-effort clean-up: if storage access throws, leave things as they are.
    }
</script>
|
||
|
|
|
||
|
|
<!-- Set the theme before any content is loaded, prevents flash -->
|
||
|
|
<script>
    // Choose the theme class for <html>: the stored 'mdbook-theme' value when there
    // is one, otherwise the default matching the OS colour-scheme preference.
    const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
    let theme;
    try { theme = localStorage.getItem('mdbook-theme'); } catch (e) { }
    // Fall back for null/undefined and also for an empty string: classList.add("")
    // throws, which would abort this script before the "js" class is added.
    if (!theme) { theme = default_theme; }
    const html = document.documentElement;
    // The static markup ships with class="light"; swap it for the chosen theme.
    html.classList.remove('light');
    html.classList.add(theme);
    // Marks the document as script-enabled.
    html.classList.add("js");
</script>
|
||
|
|
|
||
|
|
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
|
||
|
|
|
||
|
|
<!-- Hide / unhide sidebar before it is displayed -->
|
||
|
|
<script>
    // Decide the initial sidebar state ('visible' or 'hidden') and reflect it on
    // the toggle checkbox and on the <html> class list.
    let sidebar = null;
    const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
    if (document.body.clientWidth < 1080) {
        // Narrow viewport: always start with the sidebar hidden.
        sidebar = 'hidden';
    } else {
        // Wide viewport: use the stored value, defaulting to visible.
        try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
        if (!sidebar) { sidebar = 'visible'; }
    }
    sidebar_toggle.checked = (sidebar === 'visible');
    // The static markup ships with 'sidebar-visible'; replace it with the real state.
    html.classList.remove('sidebar-visible');
    html.classList.add(`sidebar-${sidebar}`);
</script>
|
||
|
|
|
||
|
|
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
    <!-- The scrollbox is populated by JavaScript -->
    <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
    <noscript>
        <!-- Without scripting, embed the static table of contents; the title gives the frame an accessible name -->
        <iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
    </noscript>
    <div id="sidebar-resize-handle" class="sidebar-resize-handle">
        <div class="sidebar-resize-indicator"></div>
    </div>
</nav>
|
||
|
|
|
||
|
|
<div id="page-wrapper" class="page-wrapper">
|
||
|
|
|
||
|
|
<div class="page">
|
||
|
|
<div id="menu-bar-hover-placeholder"></div>
|
||
|
|
<div id="menu-bar" class="menu-bar sticky">
|
||
|
|
<div class="left-buttons">
|
||
|
|
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
|
||
|
|
<i class="fa fa-bars"></i>
|
||
|
|
</label>
|
||
|
|
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
|
||
|
|
<i class="fa fa-paint-brush"></i>
|
||
|
|
</button>
|
||
|
|
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
|
||
|
|
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
|
||
|
|
</ul>
|
||
|
|
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
|
||
|
|
<i class="fa fa-search"></i>
|
||
|
|
</button>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<h1 class="menu-title">VAPORA Platform Documentation</h1>
|
||
|
|
|
||
|
|
<div class="right-buttons">
|
||
|
|
<a href="../print.html" title="Print this book" aria-label="Print this book">
|
||
|
|
<i id="print-button" class="fa fa-print"></i>
|
||
|
|
</a>
|
||
|
|
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
|
||
|
|
<i id="git-repository-button" class="fa fa-github"></i>
|
||
|
|
</a>
|
||
|
|
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/operations/on-call-procedures.md" title="Suggest an edit" aria-label="Suggest an edit">
|
||
|
|
<i id="git-edit-button" class="fa fa-edit"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<div id="search-wrapper" class="hidden">
|
||
|
|
<form id="searchbar-outer" class="searchbar-outer">
|
||
|
|
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
|
||
|
|
</form>
|
||
|
|
<div id="searchresults-outer" class="searchresults-outer hidden">
|
||
|
|
<div id="searchresults-header" class="searchresults-header"></div>
|
||
|
|
<ul id="searchresults">
|
||
|
|
</ul>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
||
|
|
<script>
    {
        // Mirror the sidebar state computed earlier onto the ARIA attributes of the
        // toggle and the sidebar, and set the tabIndex attribute of the sidebar
        // links to 0 when it is visible and -1 when it is not.
        const sidebarShown = sidebar === 'visible';
        document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
        document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);
        const linkTabIndex = sidebarShown ? 0 : -1;
        for (const link of document.querySelectorAll('#sidebar a')) {
            link.setAttribute('tabIndex', linkTabIndex);
        }
    }
</script>
|
||
|
|
|
||
|
|
<div id="content" class="content">
|
||
|
|
<main>
|
||
|
|
<h1 id="on-call-procedures"><a class="header" href="#on-call-procedures">On-Call Procedures</a></h1>
|
||
|
|
<p>Guide for on-call engineers managing VAPORA production operations.</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="overview"><a class="header" href="#overview">Overview</a></h2>
|
||
|
|
<p><strong>On-Call Responsibility</strong>: Monitor VAPORA production and respond to incidents during assigned shift</p>
|
||
|
|
<p><strong>Time Commitment</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>During business hours: ~5-10 minutes daily check-ins</li>
|
||
|
|
<li>During off-hours: Available for emergencies (paged for critical issues)</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Expected Availability</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Severity 1: Respond within 2 minutes</li>
|
||
|
|
<li>Severity 2: Respond within 15 minutes</li>
|
||
|
|
<li>Severity 3: Respond within 1 hour</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="before-your-shift-starts"><a class="header" href="#before-your-shift-starts">Before Your Shift Starts</a></h2>
|
||
|
|
<h3 id="24-hours-before-on-call"><a class="header" href="#24-hours-before-on-call">24 Hours Before On-Call</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Verify schedule: "I'm on-call starting [date] [time]"</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Update your calendar with shift times</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Notify team: "I'll be on-call [dates]"</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Share personal contact info if not already shared</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Download necessary tools/credentials</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="1-hour-before-shift"><a class="header" href="#1-hour-before-shift">1 Hour Before Shift</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li>
|
||
|
|
<p><input disabled="" type="checkbox"/>
|
||
|
|
Test pager notification system</p>
|
||
|
|
<pre><code class="language-bash"># Verify Slack notifications working
|
||
|
|
# Ask previous on-call to send test alert: "/test-alert-to-[yourname]"
|
||
|
|
</code></pre>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><input disabled="" type="checkbox"/>
|
||
|
|
Verify access to necessary systems</p>
|
||
|
|
<pre><code class="language-bash"># Test each required access:
|
||
|
|
✓ SSH to bastion host: ssh bastion.vapora.com
|
||
|
|
✓ kubectl to production: kubectl cluster-info
|
||
|
|
✓ Slack channels: /join #deployments #alerts
|
||
|
|
✓ Incident tracking: open Jira/GitHub
|
||
|
|
✓ Monitoring dashboards: access Grafana
|
||
|
|
✓ Status page: access status page admin
|
||
|
|
</code></pre>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><input disabled="" type="checkbox"/>
|
||
|
|
Review current system status</p>
|
||
|
|
<pre><code class="language-bash"># Quick health check
|
||
|
|
kubectl cluster-info
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
kubectl get events -n vapora | head -10
|
||
|
|
|
||
|
|
# Should show: All pods Running, no recent errors
|
||
|
|
</code></pre>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><input disabled="" type="checkbox"/>
|
||
|
|
Read recent incident reports</p>
|
||
|
|
<ul>
|
||
|
|
<li>Check previous on-call handoff notes</li>
|
||
|
|
<li>Review any incidents from past week</li>
|
||
|
|
<li>Note any known issues or monitoring gaps</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><input disabled="" type="checkbox"/>
|
||
|
|
Receive handoff from previous on-call</p>
|
||
|
|
<pre><code>Ask: "Anything I should know?"
|
||
|
|
- Any ongoing issues?
|
||
|
|
- Any deployments planned?
|
||
|
|
- Any flaky services or known alerts?
|
||
|
|
- Any customer complaints?
|
||
|
|
</code></pre>
|
||
|
|
</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="daily-on-call-tasks"><a class="header" href="#daily-on-call-tasks">Daily On-Call Tasks</a></h2>
|
||
|
|
<h3 id="morning-check-in-after-shift-starts"><a class="header" href="#morning-check-in-after-shift-starts">Morning Check-In (After shift starts)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Automated check - run this first thing
|
||
|
|
export NAMESPACE=vapora
|
||
|
|
|
||
|
|
echo "=== Cluster Health ==="
|
||
|
|
kubectl cluster-info
|
||
|
|
kubectl get nodes
|
||
|
|
|
||
|
|
echo "=== Pod Status ==="
|
||
|
|
kubectl get pods -n $NAMESPACE
|
||
|
|
kubectl get pods -n $NAMESPACE | grep -v Running
|
||
|
|
|
||
|
|
echo "=== Recent Events ==="
|
||
|
|
kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -10
|
||
|
|
|
||
|
|
echo "=== Resource Usage ==="
|
||
|
|
kubectl top nodes
|
||
|
|
kubectl top pods -n $NAMESPACE
|
||
|
|
|
||
|
|
# If any anomalies: investigate before declaring "all clear"
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="mid-shift-check-every-4-hours"><a class="header" href="#mid-shift-check-every-4-hours">Mid-Shift Check (Every 4 hours)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Quick sanity check
|
||
|
|
curl https://api.vapora.com/health
|
||
|
|
curl https://vapora.app/
|
||
|
|
# Should both return 200 OK
|
||
|
|
|
||
|
|
# Check dashboards
|
||
|
|
# Grafana: any alerts? any trending issues?
|
||
|
|
|
||
|
|
# Check Slack #alerts channel
|
||
|
|
# Any warnings or anomalies posted?
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="end-of-shift-handoff-before-shift-ends"><a class="header" href="#end-of-shift-handoff-before-shift-ends">End-of-Shift Handoff (Before shift ends)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Prepare handoff for next on-call
|
||
|
|
|
||
|
|
# 1. Document current state
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
kubectl get nodes
|
||
|
|
kubectl top pods -n vapora
|
||
|
|
|
||
|
|
# 2. Check for known issues
|
||
|
|
kubectl get events -n vapora | grep Warning
|
||
|
|
# Any persistent warnings?
|
||
|
|
|
||
|
|
# 3. Check deployment status
|
||
|
|
git log -1 --oneline provisioning/
|
||
|
|
# Any recent changes?
|
||
|
|
|
||
|
|
# 4. Document in handoff notes:
|
||
|
|
echo "HANDOFF NOTES - $(date)
|
||
|
|
Duration: [start time] to [end time]
|
||
|
|
Status: All normal / Issues: [list]
|
||
|
|
Alerts: [any]
|
||
|
|
Deployments: [any planned]
|
||
|
|
Known issues: [any]
|
||
|
|
Recommendations: [any]
|
||
|
|
" > on-call-handoff.txt
|
||
|
|
|
||
|
|
# 5. Pass notes to next on-call
|
||
|
|
# Send message to @next-on-call with notes
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="responding-to-alerts"><a class="header" href="#responding-to-alerts">Responding to Alerts</a></h2>
|
||
|
|
<h3 id="alert-received"><a class="header" href="#alert-received">Alert Received</a></h3>
|
||
|
|
<p><strong>Step 1: Verify it's real</strong></p>
|
||
|
|
<pre><code class="language-bash"># Don't panic - verify the alert is legitimate
|
||
|
|
1. Check the source: is it from our system?
|
||
|
|
2. Check current status manually: curl endpoints
|
||
|
|
3. Check dashboard: see if issue visible there
|
||
|
|
4. Check cluster: kubectl get pods
|
||
|
|
|
||
|
|
# False alarms happen - verify before escalating
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Step 2: Assess severity</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Is service completely down? → Severity 1</li>
|
||
|
|
<li>Is service partially down? → Severity 2</li>
|
||
|
|
<li>Is there a warning/anomaly? → Severity 3</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Step 3: Declare incident</strong></p>
|
||
|
|
<pre><code class="language-bash"># Create ticket (Severity 1 is emergency)
|
||
|
|
# If Severity 1:
|
||
|
|
# - Alert team immediately
|
||
|
|
# - Create #incident-[date] channel
|
||
|
|
# - Start 2-minute update cycle
|
||
|
|
# See: Incident Response Runbook
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="during-incident"><a class="header" href="#during-incident">During Incident</a></h3>
|
||
|
|
<p><strong>Your role as on-call</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li><strong>Respond quickly</strong> - First 2 minutes are critical</li>
|
||
|
|
<li><strong>Communicate</strong> - Update team/status page</li>
|
||
|
|
<li><strong>Investigate</strong> - Follow diagnostics in runbooks</li>
|
||
|
|
<li><strong>Escalate if needed</strong> - Page senior engineer if stuck</li>
|
||
|
|
<li><strong>Execute fix</strong> - Follow approved procedures</li>
|
||
|
|
<li><strong>Verify recovery</strong> - Confirm service healthy</li>
|
||
|
|
<li><strong>Document</strong> - Record what happened</li>
|
||
|
|
</ol>
|
||
|
|
<p><strong>Key communication</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Initial response time: &lt; 2 min (post "investigating")</li>
|
||
|
|
<li>Status update: every 2-5 minutes</li>
|
||
|
|
<li>Escalation: if not clear after 5 minutes</li>
|
||
|
|
<li>Resolution: post "incident resolved"</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="alert-examples--responses"><a class="header" href="#alert-examples--responses">Alert Examples & Responses</a></h3>
|
||
|
|
<h4 id="alert-pod-crashloopbackoff"><a class="header" href="#alert-pod-crashloopbackoff">Alert: "Pod CrashLoopBackOff"</a></h4>
|
||
|
|
<pre><code>1. Get pod logs: kubectl logs &lt;pod&gt; --previous
|
||
|
|
2. Check for config issues: kubectl get configmap
|
||
|
|
3. Check for resource limits: kubectl describe pod &lt;pod&gt;
|
||
|
|
4. Decide: rollback or fix config
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="alert-high-error-rate-5-5xx"><a class="header" href="#alert-high-error-rate-5-5xx">Alert: "High Error Rate (>5% 5xx)"</a></h4>
|
||
|
|
<pre><code>1. Check which endpoint: tail application logs
|
||
|
|
2. Check dependencies: database, cache, external APIs
|
||
|
|
3. Check recent deployment: git log
|
||
|
|
4. Decide: rollback or investigate further
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="alert-pod-memory--90"><a class="header" href="#alert-pod-memory--90">Alert: "Pod Memory > 90%"</a></h4>
|
||
|
|
<pre><code>1. Check actual usage: kubectl top pod &lt;pod&gt;
|
||
|
|
2. Check limits: kubectl get pod &lt;pod&gt; -o yaml | grep memory
|
||
|
|
3. Decide: scale up or investigate memory leak
|
||
|
|
</code></pre>
|
||
|
|
<h4 id="alert-node-notready"><a class="header" href="#alert-node-notready">Alert: "Node NotReady"</a></h4>
|
||
|
|
<pre><code>1. Check node: kubectl describe node &lt;node&gt;
|
||
|
|
2. Check kubelet: ssh &lt;node&gt; 'systemctl status kubelet'
|
||
|
|
3. Contact infrastructure team for hardware issues
|
||
|
|
4. Possibly: drain node and reschedule pods
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="monitoring-dashboard-setup"><a class="header" href="#monitoring-dashboard-setup">Monitoring Dashboard Setup</a></h2>
|
||
|
|
<p>When you start shift, have these visible:</p>
|
||
|
|
<h3 id="browser-tabs-keep-open"><a class="header" href="#browser-tabs-keep-open">Browser Tabs (Keep Open)</a></h3>
|
||
|
|
<ol>
|
||
|
|
<li>
|
||
|
|
<p><strong>Grafana Dashboard</strong> - VAPORA Cluster Overview</p>
|
||
|
|
<ul>
|
||
|
|
<li>Pod CPU/Memory usage</li>
|
||
|
|
<li>Request rate and latency</li>
|
||
|
|
<li>Error rate</li>
|
||
|
|
<li>Deployment status</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Kubernetes Dashboard</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li><code>kubectl port-forward -n kube-system svc/kubernetes-dashboard 8443:443</code></li>
|
||
|
|
<li>Or use K9s terminal UI: <code>k9s</code></li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Alert Dashboard</strong> (if available)</p>
|
||
|
|
<ul>
|
||
|
|
<li>Prometheus Alerts</li>
|
||
|
|
<li>Or monitoring system of choice</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Status Page</strong> (if public-facing)</p>
|
||
|
|
<ul>
|
||
|
|
<li>Check for ongoing incidents</li>
|
||
|
|
<li>Prepare to update</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="terminal-windows-keep-ready"><a class="header" href="#terminal-windows-keep-ready">Terminal Windows (Keep Ready)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Terminal 1: Watch pods
|
||
|
|
watch kubectl get pods -n vapora
|
||
|
|
|
||
|
|
# Terminal 2: Tail logs
|
||
|
|
kubectl logs -f deployment/vapora-backend -n vapora
|
||
|
|
|
||
|
|
# Terminal 3: General kubectl commands
|
||
|
|
kubectl -n vapora get events --watch
|
||
|
|
|
||
|
|
# Terminal 4: Ad-hoc commands and troubleshooting
|
||
|
|
# (leave empty for ad-hoc use)
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="common-questions-during-on-call"><a class="header" href="#common-questions-during-on-call">Common Questions During On-Call</a></h2>
|
||
|
|
<h3 id="q-i-think-i-found-an-issue-but-im-not-sure-its-a-problem"><a class="header" href="#q-i-think-i-found-an-issue-but-im-not-sure-its-a-problem">Q: I think I found an issue, but I'm not sure it's a problem</a></h3>
|
||
|
|
<p><strong>A</strong>: When in doubt, escalate:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Post in #deployments channel with observation</li>
|
||
|
|
<li>Ask: "Does this look normal?"</li>
|
||
|
|
<li>If others confirm it looks abnormal, treat it as a potential issue</li>
|
||
|
|
<li>Better safe than sorry (on production)</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="q-do-i-need-to-respond-to-every-alert"><a class="header" href="#q-do-i-need-to-respond-to-every-alert">Q: Do I need to respond to every alert?</a></h3>
|
||
|
|
<p><strong>A</strong>: Yes. Even false alarms need verification:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Confirm it's a false alarm (don't just assume)</li>
|
||
|
|
<li>Update alert if it's misconfigured</li>
|
||
|
|
<li>Never ignore alerts - fix the alerting</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="q-service-looks-broken-but-dashboard-looks-normal"><a class="header" href="#q-service-looks-broken-but-dashboard-looks-normal">Q: Service looks broken but dashboard looks normal</a></h3>
|
||
|
|
<p><strong>A</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Check if dashboard might be delayed (sometimes refresh slow)</li>
|
||
|
|
<li>Test manually: curl endpoints</li>
|
||
|
|
<li>Check pod logs directly: kubectl logs</li>
|
||
|
|
<li>Trust actual service health over dashboard</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="q-can-i-deploy-changes-while-on-call"><a class="header" href="#q-can-i-deploy-changes-while-on-call">Q: Can I deploy changes while on-call?</a></h3>
|
||
|
|
<p><strong>A</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li><strong>Yes</strong> if it's an emergency fix for an active incident</li>
|
||
|
|
<li><strong>No</strong> for normal features/changes (schedule for dedicated deployment window)</li>
|
||
|
|
<li><strong>Escalate</strong> if unsure</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="q-something-looks-weird-but-i-cant-reproduce-it"><a class="header" href="#q-something-looks-weird-but-i-cant-reproduce-it">Q: Something looks weird but I can't reproduce it</a></h3>
|
||
|
|
<p><strong>A</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Save any evidence: logs, metrics, events</li>
|
||
|
|
<li>Monitor more closely for pattern</li>
|
||
|
|
<li>Document in ticket for later investigation</li>
|
||
|
|
<li>Escalate if behavior continues</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="q-an-alert-keeps-firing-but-service-is-fine"><a class="header" href="#q-an-alert-keeps-firing-but-service-is-fine">Q: An alert keeps firing but service is fine</a></h3>
|
||
|
|
<p><strong>A</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Investigate why alert is false</li>
|
||
|
|
<li>Check alert thresholds (might be too sensitive)</li>
|
||
|
|
<li>Fix the alert configuration</li>
|
||
|
|
<li>Update alert runbook with details</li>
|
||
|
|
</ol>
|
||
|
|
<hr />
|
||
|
|
<h2 id="escalation-decision-tree"><a class="header" href="#escalation-decision-tree">Escalation Decision Tree</a></h2>
|
||
|
|
<p>When should you escalate?</p>
|
||
|
|
<pre><code>START: Issue detected
|
||
|
|
|
||
|
|
Is it Severity 1 (complete outage)?
|
||
|
|
YES → Escalate immediately to senior engineer
|
||
|
|
NO → Continue
|
||
|
|
|
||
|
|
Have you diagnosed root cause in 5 minutes?
|
||
|
|
YES → Continue with fix
|
||
|
|
NO → Page senior engineer or escalate
|
||
|
|
|
||
|
|
Does fix require infrastructure/database changes?
|
||
|
|
YES → Contact infrastructure/DBA team
|
||
|
|
NO → Continue with fix
|
||
|
|
|
||
|
|
Is this outside your authority (company policy)?
|
||
|
|
YES → Escalate to manager
|
||
|
|
NO → Proceed with fix
|
||
|
|
|
||
|
|
Implemented fix, service still broken?
|
||
|
|
YES → Page senior engineer immediately
|
||
|
|
NO → Verify and close incident
|
||
|
|
|
||
|
|
Result: Uncertain?
|
||
|
|
→ Ask senior engineer or manager
|
||
|
|
→ Always better to escalate early
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="when-to-page-senior-engineer"><a class="header" href="#when-to-page-senior-engineer">When to Page Senior Engineer</a></h2>
|
||
|
|
<p><strong>Page immediately if</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Service completely down (Severity 1)</li>
|
||
|
|
<li>Database appears corrupted</li>
|
||
|
|
<li>You're stuck for >5 minutes</li>
|
||
|
|
<li>Rollback didn't work</li>
|
||
|
|
<li>Need infrastructure changes urgently</li>
|
||
|
|
<li>Something affecting >50% of users</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Don't page just because</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Single pod restarting (monitor first)</li>
|
||
|
|
<li>Transient network errors</li>
|
||
|
|
<li>You're slightly unsure (ask in #deployments first)</li>
|
||
|
|
<li>It's 3 AM and not critical (use tickets for morning)</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="end-of-shift-handoff"><a class="header" href="#end-of-shift-handoff">End of Shift Handoff</a></h2>
|
||
|
|
<h3 id="create-handoff-report"><a class="header" href="#create-handoff-report">Create Handoff Report</a></h3>
|
||
|
|
<pre><code>SHIFT HANDOFF - [Your Name]
|
||
|
|
Dates: [Start] to [End] UTC
|
||
|
|
Duration: [X hours]
|
||
|
|
|
||
|
|
STATUS: ✅ All normal / ⚠️ Issues ongoing / ❌ Critical
|
||
|
|
|
||
|
|
INCIDENTS: [Number]
|
||
|
|
- Incident 1: [description, resolved or ongoing]
|
||
|
|
- Incident 2: [description]
|
||
|
|
|
||
|
|
ALERTS: [Any unusual alerts]
|
||
|
|
- Alert 1: [description, action taken]
|
||
|
|
|
||
|
|
DEPLOYMENTS: [Any scheduled or happened]
|
||
|
|
- Deployment 1: [status]
|
||
|
|
|
||
|
|
KNOWN ISSUES:
|
||
|
|
- Issue 1: [description, workaround]
|
||
|
|
- Issue 2: [description]
|
||
|
|
|
||
|
|
MONITORING NOTES:
|
||
|
|
- [Any trending issues]
|
||
|
|
- [Any monitoring gaps]
|
||
|
|
- [Any recommended actions]
|
||
|
|
|
||
|
|
RECOMMENDATIONS FOR NEXT ON-CALL:
|
||
|
|
1. [Action item]
|
||
|
|
2. [Action item]
|
||
|
|
3. [Action item]
|
||
|
|
|
||
|
|
NEXT ON-CALL: @[name]
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="send-to-next-on-call"><a class="header" href="#send-to-next-on-call">Send to Next On-Call</a></h3>
|
||
|
|
<pre><code>@next-on-call - Handoff notes attached:
|
||
|
|
[paste report above]
|
||
|
|
|
||
|
|
Key points:
|
||
|
|
- [Most important item]
|
||
|
|
- [Second important]
|
||
|
|
- [Any urgent follow-ups]
|
||
|
|
|
||
|
|
Questions? I'm available for 30 min
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="tools--commands-reference"><a class="header" href="#tools--commands-reference">Tools & Commands Reference</a></h2>
|
||
|
|
<h3 id="essential-commands"><a class="header" href="#essential-commands">Essential Commands</a></h3>
|
||
|
|
<pre><code class="language-bash"># Pod management
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
kubectl logs pod-name -n vapora
|
||
|
|
kubectl exec -it pod-name -n vapora -- bash
|
||
|
|
kubectl describe pod pod-name -n vapora
|
||
|
|
kubectl delete pod pod-name -n vapora # (recreates via deployment)
|
||
|
|
|
||
|
|
# Deployment management
|
||
|
|
kubectl get deployments -n vapora
|
||
|
|
kubectl rollout status deployment/vapora-backend -n vapora
|
||
|
|
kubectl rollout undo deployment/vapora-backend -n vapora
|
||
|
|
kubectl scale deployment/vapora-backend --replicas=5 -n vapora
|
||
|
|
|
||
|
|
# Service health
|
||
|
|
curl http://localhost:8001/health
|
||
|
|
kubectl get events -n vapora
|
||
|
|
kubectl top pods -n vapora
|
||
|
|
kubectl get endpoints -n vapora
|
||
|
|
|
||
|
|
# Quick diagnostics
|
||
|
|
kubectl describe nodes
|
||
|
|
kubectl cluster-info
|
||
|
|
kubectl get persistentvolumes
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="useful-tools"><a class="header" href="#useful-tools">Useful Tools</a></h3>
|
||
|
|
<pre><code class="language-bash"># Install these on your workstation
|
||
|
|
brew install kubectl # Kubernetes CLI
|
||
|
|
brew install k9s # Terminal UI for K8s
|
||
|
|
brew install watch # Monitor command output
|
||
|
|
brew install jq # JSON processing
|
||
|
|
brew install yq # YAML processing
|
||
|
|
brew install grpcurl # gRPC debugging
|
||
|
|
|
||
|
|
# Aliases to save time
|
||
|
|
alias k='kubectl'
|
||
|
|
alias kgp='kubectl get pods'
|
||
|
|
alias klogs='kubectl logs'
|
||
|
|
alias kexec='kubectl exec'
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="dashboards--links"><a class="header" href="#dashboards--links">Dashboards & Links</a></h3>
|
||
|
|
<p>Bookmark these:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Grafana: <code>https://grafana.vapora.com</code></li>
|
||
|
|
<li>Status Page: <code>https://status.vapora.com</code></li>
|
||
|
|
<li>Incident Tracker: <code>https://github.com/vapora-platform/vapora/issues</code></li>
|
||
|
|
<li>Runbooks: <code>https://github.com/vapora-platform/vapora/tree/main/docs/operations</code></li>
|
||
|
|
<li>Kubernetes Dashboard: Run <code>kubectl proxy</code> then <code>http://localhost:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/</code></li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="on-call-checklist"><a class="header" href="#on-call-checklist">On-Call Checklist</a></h2>
|
||
|
|
<h3 id="starting-shift"><a class="header" href="#starting-shift">Starting Shift</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Verified pager notifications working</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Tested access to all systems</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Reviewed current system status</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Read recent incidents</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Received handoff from previous on-call</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Set up monitoring dashboards</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Opened necessary terminal windows</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Posted "on-call" status in #deployments</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="during-shift"><a class="header" href="#during-shift">During Shift</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Responded to all alerts within SLA</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Updated incident status regularly</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Escalated when appropriate</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Documented actions in tickets</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Verified fixes before closing</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Communicated clearly with team</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="ending-shift"><a class="header" href="#ending-shift">Ending Shift</a></h3>
|
||
|
|
<ul>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Created handoff report</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Resolved or escalated open issues</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Updated monitoring for anomalies</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Passed report to next on-call</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Closed out incident tickets</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Verified next on-call is ready</li>
|
||
|
|
<li><input disabled="" type="checkbox"/>
|
||
|
|
Posted "handing off to [next on-call]" in #deployments</li>
|
||
|
|
</ul>
|
||
|
|
<hr />
|
||
|
|
<h2 id="post-on-call-follow-up"><a class="header" href="#post-on-call-follow-up">Post-On-Call Follow-Up</a></h2>
|
||
|
|
<p>After your shift:</p>
|
||
|
|
<ol>
|
||
|
|
<li>
|
||
|
|
<p><strong>Document lessons learned</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Did you learn something new?</li>
|
||
|
|
<li>Did any procedure need updating?</li>
|
||
|
|
<li>Were any runbooks unclear?</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Update runbooks</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>If you found gaps, update procedures</li>
|
||
|
|
<li>If you had questions, update docs</li>
|
||
|
|
<li>Share improvements with team</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Communicate findings</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Anything the team should know?</li>
|
||
|
|
<li>Any recommendations?</li>
|
||
|
|
<li>Trends to watch?</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
<li>
|
||
|
|
<p><strong>Celebrate successes</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Any incidents quickly resolved?</li>
|
||
|
|
<li>Any new insights?</li>
|
||
|
|
<li>Recognize good practices</li>
|
||
|
|
</ul>
|
||
|
|
</li>
|
||
|
|
</ol>
|
||
|
|
<hr />
|
||
|
|
<h2 id="emergency-contacts"><a class="header" href="#emergency-contacts">Emergency Contacts</a></h2>
|
||
|
|
<p>Keep these accessible:</p>
|
||
|
|
<pre><code>ESCALATION CONTACTS:
|
||
|
|
|
||
|
|
Primary Escalation: [Name] [Phone] [Slack]
|
||
|
|
Backup Escalation: [Name] [Phone] [Slack]
|
||
|
|
Infrastructure: [Name] [Phone] [Slack]
|
||
|
|
Database Team: [Name] [Phone] [Slack]
|
||
|
|
Manager: [Name] [Phone] [Slack]
|
||
|
|
|
||
|
|
EXTERNAL CONTACTS:
|
||
|
|
AWS Support: [Account ID] [Contact]
|
||
|
|
CDN Provider: [Account] [Contact]
|
||
|
|
DNS Provider: [Account] [Contact]
|
||
|
|
|
||
|
|
EMERGENCY PROCEDURES:
|
||
|
|
- Complete AWS outage: Contact AWS support immediately
|
||
|
|
- Database failure: Contact DBA, activate backups
|
||
|
|
- Security incident: Contact security team immediately
|
||
|
|
- Major data loss: Activate disaster recovery
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="remember"><a class="header" href="#remember">Remember</a></h2>
|
||
|
|
<p>✅ <strong>You are the guardian of production</strong> - Your vigilance keeps services running</p>
|
||
|
|
<p>✅ <strong>Better safe than sorry</strong> - Escalate early and often</p>
|
||
|
|
<p>✅ <strong>Communication is key</strong> - Keep the team informed</p>
|
||
|
|
<p>✅ <strong>Document everything</strong> - Future you and your team will thank you</p>
|
||
|
|
<p>✅ <strong>Ask for help</strong> - No shame in escalating</p>
|
||
|
|
<p>❌ <strong>Don't guess</strong> - Verify before taking action</p>
|
||
|
|
<p>❌ <strong>Don't stay silent</strong> - Alert the team to any issues</p>
|
||
|
|
<p>❌ <strong>Don't ignore alerts</strong> - Even false alarms need investigation</p>
|
||
|
|
|
||
|
|
</main>
|
||
|
|
|
||
|
|
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
|
|
<!-- Mobile navigation buttons -->
|
||
|
|
<a rel="prev" href="../../operations/monitoring-operations.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/incident-response-runbook.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<div style="clear: both"></div>
|
||
|
|
</nav>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
|
<a rel="prev" href="../../operations/monitoring-operations.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/incident-response-runbook.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
</nav>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
<script>
|
||
|
|
window.playground_copyable = true;
|
||
|
|
</script>
|
||
|
|
|
||
|
|
|
||
|
|
<script src="../elasticlunr.min.js"></script>
|
||
|
|
<script src="../mark.min.js"></script>
|
||
|
|
<script src="../searcher.js"></script>
|
||
|
|
|
||
|
|
<script src="../clipboard.min.js"></script>
|
||
|
|
<script src="../highlight.js"></script>
|
||
|
|
<script src="../book.js"></script>
|
||
|
|
|
||
|
|
<!-- Custom JS scripts -->
|
||
|
|
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</body>
|
||
|
|
</html>
|