787 lines
32 KiB
HTML
787 lines
32 KiB
HTML
|
|
<!DOCTYPE HTML>
|
||
|
|
<html lang="en" class="light sidebar-visible" dir="ltr">
|
||
|
|
<head>
|
||
|
|
<!-- Book generated using mdBook -->
|
||
|
|
<meta charset="UTF-8">
|
||
|
|
<title>Monitoring &amp; Operations - VAPORA Platform Documentation</title>
|
||
|
|
|
||
|
|
|
||
|
|
<!-- Custom HTML head -->
|
||
|
|
|
||
|
|
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
|
||
|
|
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||
|
|
<meta name="theme-color" content="#ffffff">
|
||
|
|
|
||
|
|
<link rel="icon" href="../favicon.svg">
|
||
|
|
<link rel="shortcut icon" href="../favicon.png">
|
||
|
|
<link rel="stylesheet" href="../css/variables.css">
|
||
|
|
<link rel="stylesheet" href="../css/general.css">
|
||
|
|
<link rel="stylesheet" href="../css/chrome.css">
|
||
|
|
<link rel="stylesheet" href="../css/print.css" media="print">
|
||
|
|
|
||
|
|
<!-- Fonts -->
|
||
|
|
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
|
||
|
|
<link rel="stylesheet" href="../fonts/fonts.css">
|
||
|
|
|
||
|
|
<!-- Highlight.js Stylesheets -->
|
||
|
|
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
|
||
|
|
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
|
||
|
|
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
|
||
|
|
|
||
|
|
<!-- Custom theme stylesheets -->
|
||
|
|
|
||
|
|
|
||
|
|
<!-- Provide site root and default themes to javascript -->
|
||
|
|
<script>
|
||
|
|
// Site-root prefix and the default light/dark theme names, declared as
// globals so that scripts later in the page can read them.
const path_to_root = "../",
    default_light_theme = "light",
    default_dark_theme = "dark";
|
||
|
|
</script>
|
||
|
|
<!-- Start loading toc.js asap -->
|
||
|
|
<script src="../toc.js"></script>
|
||
|
|
</head>
|
||
|
|
<body>
|
||
|
|
<div id="mdbook-help-container">
|
||
|
|
<div id="mdbook-help-popup">
|
||
|
|
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
|
||
|
|
<div>
|
||
|
|
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
|
||
|
|
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
|
||
|
|
<p>Press <kbd>?</kbd> to show this help</p>
|
||
|
|
<p>Press <kbd>Esc</kbd> to hide this help</p>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
<div id="body-container">
|
||
|
|
<!-- Work around some values being stored in localStorage wrapped in quotes -->
|
||
|
|
<script>
|
||
|
|
// Strip the wrapping double quotes from the stored theme and sidebar
// preferences when they were saved in quoted form.
//
// Each key is handled in its own try/catch so that a missing entry or a
// storage error for one key cannot prevent the other from being normalized.
for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
    try {
        const stored = localStorage.getItem(key);
        // getItem() returns null for an unset key; skip it instead of throwing.
        // Require something between the quotes so that a bare `""` is not
        // rewritten to an empty string.
        if (stored !== null && stored.length > 2 && stored.startsWith('"') && stored.endsWith('"')) {
            localStorage.setItem(key, stored.slice(1, -1));
        }
    } catch (e) {
        // Best effort only: localStorage may be unavailable or read-only.
    }
}
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<!-- Set the theme before any content is loaded, prevents flash -->
|
||
|
|
<script>
|
||
|
|
// Pick the theme class for <html> before any content renders.
// The default follows the OS colour-scheme preference; a stored
// 'mdbook-theme' value overrides it.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
let theme;
try { theme = localStorage.getItem('mdbook-theme'); } catch (e) { }
// Fall back to the default when nothing usable is stored. classList.add()
// throws on an empty token or one containing whitespace, which would abort
// this script before the "js" class is applied.
if (!theme || /\s/.test(theme)) { theme = default_theme; }
const html = document.documentElement;
// Swap the statically rendered 'light' class for the resolved theme.
html.classList.remove('light');
html.classList.add(theme);
// Mark the document as script-enabled.
html.classList.add("js");
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
|
||
|
|
|
||
|
|
<!-- Hide / unhide sidebar before it is displayed -->
|
||
|
|
<script>
|
||
|
|
// Work out the initial sidebar state and apply it to the toggle checkbox
// and the <html> class list.
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
let sidebar = 'hidden';
if (document.body.clientWidth >= 1080) {
    // Wide viewport: use the stored preference, defaulting to visible when
    // nothing is stored or storage cannot be read.
    let stored = null;
    try { stored = localStorage.getItem('mdbook-sidebar'); } catch (e) { }
    sidebar = stored || 'visible';
}
sidebar_toggle.checked = (sidebar === 'visible');
// Replace the statically rendered class with the resolved state.
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
  <!-- populated by js -->
  <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
  <noscript>
    <!-- No-JS fallback; the title gives the frame an accessible name. -->
    <iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
  </noscript>
  <div id="sidebar-resize-handle" class="sidebar-resize-handle">
    <div class="sidebar-resize-indicator"></div>
  </div>
</nav>
|
||
|
|
|
||
|
|
<div id="page-wrapper" class="page-wrapper">
|
||
|
|
|
||
|
|
<div class="page">
|
||
|
|
<div id="menu-bar-hover-placeholder"></div>
|
||
|
|
<!-- Top menu bar. Each control carries its own aria-label, so the icon
     glyphs are decorative and hidden from assistive technology. -->
<div id="menu-bar" class="menu-bar sticky">
  <div class="left-buttons">
    <label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
      <i class="fa fa-bars" aria-hidden="true"></i>
    </label>
    <button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
      <i class="fa fa-paint-brush" aria-hidden="true"></i>
    </button>
    <ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
      <li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
      <li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
      <li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
      <li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
      <li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
      <li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
    </ul>
    <button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
      <i class="fa fa-search" aria-hidden="true"></i>
    </button>
  </div>

  <h1 class="menu-title">VAPORA Platform Documentation</h1>

  <div class="right-buttons">
    <a href="../print.html" title="Print this book" aria-label="Print this book">
      <i id="print-button" class="fa fa-print" aria-hidden="true"></i>
    </a>
    <a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
      <i id="git-repository-button" class="fa fa-github" aria-hidden="true"></i>
    </a>
    <a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/monitoring-operations.md" title="Suggest an edit" aria-label="Suggest an edit">
      <i id="git-edit-button" class="fa fa-edit" aria-hidden="true"></i>
    </a>

  </div>
</div>
|
||
|
|
|
||
|
|
<div id="search-wrapper" class="hidden">
  <form id="searchbar-outer" class="searchbar-outer">
    <!-- A placeholder is not an accessible name, so the field is labelled explicitly. -->
    <input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-label="Search this book" aria-controls="searchresults-outer" aria-describedby="searchresults-header">
  </form>
  <div id="searchresults-outer" class="searchresults-outer hidden">
    <div id="searchresults-header" class="searchresults-header"></div>
    <ul id="searchresults">
    </ul>
  </div>
</div>
|
||
|
|
|
||
|
|
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
|
||
|
|
<script>
|
||
|
|
// Mirror the resolved sidebar state into ARIA attributes on the toggle and
// the sidebar, and set the tab index of any links already inside the sidebar.
// Wrapped in a block so the helper names do not become globals.
{
    const sidebarShown = sidebar === 'visible';
    document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
    document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);
    const sidebarLinks = document.querySelectorAll('#sidebar a');
    for (let i = 0; i < sidebarLinks.length; i++) {
        // Tab index 0 when the sidebar is visible, -1 when it is hidden.
        sidebarLinks[i].setAttribute('tabIndex', sidebarShown ? 0 : -1);
    }
}
|
||
|
|
</script>
|
||
|
|
|
||
|
|
<div id="content" class="content">
|
||
|
|
<main>
|
||
|
|
<h1 id="monitoring--health-check-operations"><a class="header" href="#monitoring--health-check-operations">Monitoring &amp; Health Check Operations</a></h1>
|
||
|
|
<p>Guide for continuous monitoring and health checks of VAPORA in production.</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="overview"><a class="header" href="#overview">Overview</a></h2>
|
||
|
|
<p><strong>Responsibility</strong>: Maintain visibility into VAPORA service health through monitoring, logging, and alerting</p>
|
||
|
|
<p><strong>Key Activities</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Regular health checks (automated and manual)</li>
|
||
|
|
<li>Alert response and investigation</li>
|
||
|
|
<li>Trend analysis and capacity planning</li>
|
||
|
|
<li>Incident prevention through early detection</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Success Metric</strong>: Detect and respond to issues before users are significantly impacted</p>
|
||
|
|
<hr />
|
||
|
|
<h2 id="automated-health-checks"><a class="header" href="#automated-health-checks">Automated Health Checks</a></h2>
|
||
|
|
<h3 id="kubernetes-health-check-pipeline"><a class="header" href="#kubernetes-health-check-pipeline">Kubernetes Health Check Pipeline</a></h3>
|
||
|
|
<p>If using CI/CD, leverage automatic health monitoring:</p>
|
||
|
|
<p><strong>GitHub Actions</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># Runs every 15 minutes (quick check)
|
||
|
|
# Runs every 6 hours (comprehensive diagnostics)
|
||
|
|
# See: .github/workflows/health-check.yml
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Woodpecker</strong>:</p>
|
||
|
|
<pre><code class="language-bash"># Runs every 15 minutes (quick check)
|
||
|
|
# Runs every 6 hours (comprehensive diagnostics)
|
||
|
|
# See: .woodpecker/health-check.yml
|
||
|
|
</code></pre>
|
||
|
|
<p><strong>Artifacts Generated</strong>:</p>
|
||
|
|
<ul>
|
||
|
|
<li><code>docker-health.log</code> - Docker container status</li>
|
||
|
|
<li><code>k8s-health.log</code> - Kubernetes deployments status</li>
|
||
|
|
<li><code>k8s-diagnostics.log</code> - Full system diagnostics</li>
|
||
|
|
<li><code>docker-diagnostics.log</code> - Docker system info</li>
|
||
|
|
<li><code>HEALTH_REPORT.md</code> - Summary report</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="quick-manual-health-check"><a class="header" href="#quick-manual-health-check">Quick Manual Health Check</a></h3>
|
||
|
|
<pre><code class="language-bash"># Run this command to get instant health status
|
||
|
|
export NAMESPACE=vapora
|
||
|
|
|
||
|
|
echo "=== Pod Status ==="
|
||
|
|
kubectl get pods -n $NAMESPACE
|
||
|
|
echo ""
|
||
|
|
|
||
|
|
echo "=== Service Health ==="
|
||
|
|
kubectl get endpoints -n $NAMESPACE
|
||
|
|
echo ""
|
||
|
|
|
||
|
|
echo "=== Recent Events ==="
|
||
|
|
kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -10
|
||
|
|
echo ""
|
||
|
|
|
||
|
|
echo "=== Resource Usage ==="
|
||
|
|
kubectl top pods -n $NAMESPACE
|
||
|
|
echo ""
|
||
|
|
|
||
|
|
echo "=== API Health ==="
|
||
|
|
curl -s http://localhost:8001/health | jq .
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="manual-daily-monitoring"><a class="header" href="#manual-daily-monitoring">Manual Daily Monitoring</a></h2>
|
||
|
|
<h3 id="morning-check-start-of-business-day"><a class="header" href="#morning-check-start-of-business-day">Morning Check (Start of Business Day)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Run at start of business day (or when starting shift)
|
||
|
|
|
||
|
|
echo "=== MORNING HEALTH CHECK ==="
|
||
|
|
echo "Date: $(date -u)"
|
||
|
|
|
||
|
|
# 1. Cluster Status
|
||
|
|
echo "Cluster Status:"
|
||
|
|
kubectl cluster-info | grep server
|
||
|
|
|
||
|
|
# 2. Node Status
|
||
|
|
echo ""
|
||
|
|
echo "Node Status:"
|
||
|
|
kubectl get nodes
|
||
|
|
# Should show: All nodes Ready
|
||
|
|
|
||
|
|
# 3. Pod Status
|
||
|
|
echo ""
|
||
|
|
echo "Pod Status:"
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
# Should show: All Running, 1/1 Ready
|
||
|
|
|
||
|
|
# 4. Service Endpoints
|
||
|
|
echo ""
|
||
|
|
echo "Service Endpoints:"
|
||
|
|
kubectl get endpoints -n vapora
|
||
|
|
# Should show: All services have endpoints (not empty)
|
||
|
|
|
||
|
|
# 5. Resource Usage
|
||
|
|
echo ""
|
||
|
|
echo "Resource Usage:"
|
||
|
|
kubectl top nodes
|
||
|
|
kubectl top pods -n vapora | head -10
|
||
|
|
|
||
|
|
# 6. Recent Errors
|
||
|
|
echo ""
|
||
|
|
echo "Recent Errors (last 1 hour):"
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora --since=1h | grep -i error | wc -l
|
||
|
|
# Should show: 0 or very few errors
|
||
|
|
|
||
|
|
# 7. Overall Status
|
||
|
|
echo ""
|
||
|
|
echo "Overall Status: ✅ Healthy"
|
||
|
|
# If any issues found: Document and investigate
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="mid-day-check-every-4-6-hours"><a class="header" href="#mid-day-check-every-4-6-hours">Mid-Day Check (Every 4-6 hours)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Quick sanity check during business hours
|
||
|
|
|
||
|
|
# 1. Service Responsiveness
|
||
|
|
curl -s http://localhost:8001/health | jq '.status'
|
||
|
|
# Should return: "healthy"
|
||
|
|
|
||
|
|
# 2. Pod Restart Tracking
|
||
|
|
kubectl get pods -n vapora -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[0].restartCount}{"\n"}{end}'
|
||
|
|
# Restart count should not be increasing rapidly
|
||
|
|
|
||
|
|
# 3. Error Log Check
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora --since=4h --timestamps | grep ERROR | tail -5
|
||
|
|
# Should show: Few to no errors
|
||
|
|
|
||
|
|
# 4. Performance Check
|
||
|
|
kubectl top pods -n vapora | tail -5
|
||
|
|
# CPU/Memory should be in normal range
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="end-of-day-check-before-shift-end"><a class="header" href="#end-of-day-check-before-shift-end">End-of-Day Check (Before Shift End)</a></h3>
|
||
|
|
<pre><code class="language-bash"># Summary check before handing off to on-call
|
||
|
|
|
||
|
|
echo "=== END OF DAY SUMMARY ==="
|
||
|
|
|
||
|
|
# Current status
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
kubectl top pods -n vapora
|
||
|
|
|
||
|
|
# Any concerning trends?
|
||
|
|
echo ""
|
||
|
|
echo "Checking for concerning events..."
|
||
|
|
kubectl get events -n vapora --sort-by='.lastTimestamp' | grep -i warning
|
||
|
|
|
||
|
|
# Any pod restarts?
|
||
|
|
echo ""
|
||
|
|
echo "Pod restart status:"
|
||
|
|
kubectl get pods -n vapora -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.status.containerStatuses[0].restartCount}{"\n"}{end}' | grep -v ": 0"
|
||
|
|
|
||
|
|
# Document for next shift
|
||
|
|
echo ""
|
||
|
|
echo "Status for on-call: All normal / Issues detected"
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="dashboard-setup--monitoring"><a class="header" href="#dashboard-setup--monitoring">Dashboard Setup &amp; Monitoring</a></h2>
|
||
|
|
<h3 id="essential-dashboards-to-monitor"><a class="header" href="#essential-dashboards-to-monitor">Essential Dashboards to Monitor</a></h3>
|
||
|
|
<p>If you have Grafana/Prometheus, create these dashboards:</p>
|
||
|
|
<h4 id="1-service-health-dashboard"><a class="header" href="#1-service-health-dashboard">1. Service Health Dashboard</a></h4>
|
||
|
|
<p>Monitor:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Pod running count (should be stable at expected count)</li>
|
||
|
|
<li>Pod restart count (should not increase rapidly)</li>
|
||
|
|
<li>Service endpoint availability (should be >99%)</li>
|
||
|
|
<li>API response time (p99, track trends)</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Alert if:</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Pod count drops below expected</li>
|
||
|
|
<li>Restart count increasing</li>
|
||
|
|
<li>Endpoints empty</li>
|
||
|
|
<li>Response time >2s</li>
|
||
|
|
</ul>
|
||
|
|
<h4 id="2-resource-utilization-dashboard"><a class="header" href="#2-resource-utilization-dashboard">2. Resource Utilization Dashboard</a></h4>
|
||
|
|
<p>Monitor:</p>
|
||
|
|
<ul>
|
||
|
|
<li>CPU usage per pod</li>
|
||
|
|
<li>Memory usage per pod</li>
|
||
|
|
<li>Node capacity (CPU, memory, disk)</li>
|
||
|
|
<li>Network I/O</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Alert if:</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Any pod >80% CPU/Memory</li>
|
||
|
|
<li>Any node >85% capacity</li>
|
||
|
|
<li>Memory trending upward consistently</li>
|
||
|
|
</ul>
|
||
|
|
<h4 id="3-error-rate-dashboard"><a class="header" href="#3-error-rate-dashboard">3. Error Rate Dashboard</a></h4>
|
||
|
|
<p>Monitor:</p>
|
||
|
|
<ul>
|
||
|
|
<li>4xx error rate (should be low)</li>
|
||
|
|
<li>5xx error rate (should be minimal)</li>
|
||
|
|
<li>Error rate by endpoint</li>
|
||
|
|
<li>Error rate by service</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Alert if:</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>5xx error rate >5%</li>
|
||
|
|
<li>4xx error rate >10%</li>
|
||
|
|
<li>Sudden spike in errors</li>
|
||
|
|
</ul>
|
||
|
|
<h4 id="4-application-metrics-dashboard"><a class="header" href="#4-application-metrics-dashboard">4. Application Metrics Dashboard</a></h4>
|
||
|
|
<p>Monitor:</p>
|
||
|
|
<ul>
|
||
|
|
<li>Request rate (RPS)</li>
|
||
|
|
<li>Request latency (p50, p95, p99)</li>
|
||
|
|
<li>Active connections</li>
|
||
|
|
<li>Database query time</li>
|
||
|
|
</ul>
|
||
|
|
<p><strong>Alert if:</strong></p>
|
||
|
|
<ul>
|
||
|
|
<li>Request rate suddenly drops (might indicate outage)</li>
|
||
|
|
<li>Latency spikes above baseline</li>
|
||
|
|
<li>Database queries slow</li>
|
||
|
|
</ul>
|
||
|
|
<h3 id="grafana-setup-example"><a class="header" href="#grafana-setup-example">Grafana Setup Example</a></h3>
|
||
|
|
<pre><code class="language-bash"># If setting up Grafana monitoring
|
||
|
|
1. Deploy Prometheus scraping Kubernetes metrics
|
||
|
|
2. Create dashboard with above panels
|
||
|
|
3. Set alert rules:
|
||
|
|
- CPU >80%: Warning
|
||
|
|
- Memory >85%: Warning
|
||
|
|
- Error rate >5%: Critical
|
||
|
|
- Pod crashed: Critical
|
||
|
|
- Response time >2s: Warning
|
||
|
|
|
||
|
|
4. Configure notifications to Slack/email
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="alert-response-procedures"><a class="header" href="#alert-response-procedures">Alert Response Procedures</a></h2>
|
||
|
|
<h3 id="when-alert-fires"><a class="header" href="#when-alert-fires">When Alert Fires</a></h3>
|
||
|
|
<pre><code>Alert Received
|
||
|
|
↓
|
||
|
|
Step 1: Verify it's real (not false alarm)
|
||
|
|
- Check dashboard
|
||
|
|
- Check manually (curl endpoints, kubectl get pods)
|
||
|
|
- Ask in #deployments if unsure
|
||
|
|
|
||
|
|
Step 2: Assess severity
|
||
|
|
- Service completely down? Severity 1
|
||
|
|
- Service partially degraded? Severity 2
|
||
|
|
- Warning/trending issue? Severity 3
|
||
|
|
|
||
|
|
Step 3: Declare incident (if Severity 1-2)
|
||
|
|
- Create #incident channel
|
||
|
|
- Follow Incident Response Runbook
|
||
|
|
- See: incident-response-runbook.md
|
||
|
|
|
||
|
|
Step 4: Investigate (if Severity 3)
|
||
|
|
- Document in ticket
|
||
|
|
- Schedule investigation
|
||
|
|
- Monitor for escalation
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="common-alerts--actions"><a class="header" href="#common-alerts--actions">Common Alerts &amp; Actions</a></h3>
|
||
|
|
<div class="table-wrapper"><table><thead><tr><th>Alert</th><th>Cause</th><th>Response</th></tr></thead><tbody>
|
||
|
|
<tr><td><strong>Pod CrashLoopBackOff</strong></td><td>App crashing</td><td>Get logs, fix, restart</td></tr>
|
||
|
|
<tr><td><strong>High CPU >80%</strong></td><td>Resource exhausted</td><td>Scale up or reduce load</td></tr>
|
||
|
|
<tr><td><strong>High Memory >85%</strong></td><td>Memory leak or surge</td><td>Investigate or restart</td></tr>
|
||
|
|
<tr><td><strong>Error rate spike</strong></td><td>App issue</td><td>Check logs, might rollback</td></tr>
|
||
|
|
<tr><td><strong>Response time spike</strong></td><td>Slow queries/I/O</td><td>Check database, might restart</td></tr>
|
||
|
|
<tr><td><strong>Pod pending</strong></td><td>Can't schedule</td><td>Check node resources</td></tr>
|
||
|
|
<tr><td><strong>Endpoints empty</strong></td><td>Service down</td><td>Verify service exists</td></tr>
|
||
|
|
<tr><td><strong>Disk full</strong></td><td>Storage exhausted</td><td>Clean up or expand</td></tr>
|
||
|
|
</tbody></table>
|
||
|
|
</div>
|
||
|
|
<hr />
|
||
|
|
<h2 id="metric-baselines--trends"><a class="header" href="#metric-baselines--trends">Metric Baselines &amp; Trends</a></h2>
|
||
|
|
<h3 id="establishing-baselines"><a class="header" href="#establishing-baselines">Establishing Baselines</a></h3>
|
||
|
|
<p>Record these metrics during normal operation:</p>
|
||
|
|
<pre><code class="language-bash"># CPU per pod (typical)
|
||
|
|
Backend: 200-400m per pod
|
||
|
|
Agents: 300-500m per pod
|
||
|
|
LLM Router: 100-200m per pod
|
||
|
|
|
||
|
|
# Memory per pod (typical)
|
||
|
|
Backend: 256-512Mi per pod
|
||
|
|
Agents: 128-256Mi per pod
|
||
|
|
LLM Router: 64-128Mi per pod
|
||
|
|
|
||
|
|
# Response time (typical)
|
||
|
|
Backend: p50: 50ms, p95: 200ms, p99: 500ms
|
||
|
|
Frontend: Load time: 2-3 seconds
|
||
|
|
|
||
|
|
# Error rate (typical)
|
||
|
|
Backend: 4xx: &lt;1%, 5xx: &lt;0.1%
|
||
|
|
Frontend: &lt;5% user-visible errors
|
||
|
|
|
||
|
|
# Pod restart count
|
||
|
|
Should remain 0 (no restarts expected in normal operation)
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="detecting-anomalies"><a class="header" href="#detecting-anomalies">Detecting Anomalies</a></h3>
|
||
|
|
<p>Compare current metrics to baseline:</p>
|
||
|
|
<pre><code class="language-bash"># If CPU 2x normal:
|
||
|
|
- Check if load increased
|
||
|
|
- Check for resource leak
|
||
|
|
- Monitor for further increase
|
||
|
|
|
||
|
|
# If Memory increasing:
|
||
|
|
- Might indicate memory leak
|
||
|
|
- Monitor over time (1-2 hours)
|
||
|
|
- Restart if clearly trending up
|
||
|
|
|
||
|
|
# If Error rate 10x:
|
||
|
|
- Something broke recently
|
||
|
|
- Check recent deployment
|
||
|
|
- Consider rollback
|
||
|
|
|
||
|
|
# If new process consuming resources:
|
||
|
|
- Identify the new resource consumer
|
||
|
|
- Investigate purpose
|
||
|
|
- Kill if unintended
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="capacity-planning"><a class="header" href="#capacity-planning">Capacity Planning</a></h2>
|
||
|
|
<h3 id="when-to-scale"><a class="header" href="#when-to-scale">When to Scale</a></h3>
|
||
|
|
<p>Monitor trends and plan ahead:</p>
|
||
|
|
<pre><code class="language-bash"># Trigger capacity planning if:
|
||
|
|
- Average CPU >60%
|
||
|
|
- Average Memory >60%
|
||
|
|
- Peak usage trending upward
|
||
|
|
- Disk usage >80%
|
||
|
|
|
||
|
|
# Questions to ask:
|
||
|
|
- Is traffic increasing? Seasonal spike?
|
||
|
|
- Did we add features? New workload?
|
||
|
|
- Do we have capacity for growth?
|
||
|
|
- Should we scale now or wait?
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="scaling-actions"><a class="header" href="#scaling-actions">Scaling Actions</a></h3>
|
||
|
|
<pre><code class="language-bash"># Quick scale (temporary):
|
||
|
|
kubectl scale deployment/vapora-backend --replicas=5 -n vapora
|
||
|
|
|
||
|
|
# Permanent scale (update deployment.yaml):
|
||
|
|
# Edit: replicas: 5
|
||
|
|
# Apply: kubectl apply -f deployment.yaml
|
||
|
|
|
||
|
|
# Add nodes (infrastructure):
|
||
|
|
# Contact infrastructure team
|
||
|
|
|
||
|
|
# Reduce resource consumption:
|
||
|
|
# Investigate slow queries, memory leaks, etc.
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="log-analysis--troubleshooting"><a class="header" href="#log-analysis--troubleshooting">Log Analysis &amp; Troubleshooting</a></h2>
|
||
|
|
<h3 id="checking-logs"><a class="header" href="#checking-logs">Checking Logs</a></h3>
|
||
|
|
<pre><code class="language-bash"># Most recent logs
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora
|
||
|
|
|
||
|
|
# Last N lines
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora --tail=100
|
||
|
|
|
||
|
|
# From specific time
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora --since=1h
|
||
|
|
|
||
|
|
# Follow/tail logs
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora -f
|
||
|
|
|
||
|
|
# From specific pod
|
||
|
|
kubectl logs pod-name -n vapora
|
||
|
|
|
||
|
|
# Previous pod (if crashed)
|
||
|
|
kubectl logs pod-name -n vapora --previous
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="log-patterns-to-watch-for"><a class="header" href="#log-patterns-to-watch-for">Log Patterns to Watch For</a></h3>
|
||
|
|
<pre><code class="language-bash"># Error patterns
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora | grep -i "error\|exception\|fatal"
|
||
|
|
|
||
|
|
# Database issues
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora | grep -i "database\|connection\|sql"
|
||
|
|
|
||
|
|
# Authentication issues
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora | grep -i "auth\|permission\|forbidden"
|
||
|
|
|
||
|
|
# Resource issues
|
||
|
|
kubectl logs deployment/vapora-backend -n vapora | grep -i "memory\|cpu\|timeout"
|
||
|
|
|
||
|
|
# Startup issues (if pod restarting)
|
||
|
|
kubectl logs pod-name -n vapora --previous | head -50
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="common-log-messages--meaning"><a class="header" href="#common-log-messages--meaning">Common Log Messages &amp; Meaning</a></h3>
|
||
|
|
<div class="table-wrapper"><table><thead><tr><th>Log Message</th><th>Meaning</th><th>Action</th></tr></thead><tbody>
|
||
|
|
<tr><td><code>Connection refused</code></td><td>Service not listening</td><td>Check if service started</td></tr>
|
||
|
|
<tr><td><code>Out of memory</code></td><td>Memory exhausted</td><td>Increase limits or scale</td></tr>
|
||
|
|
<tr><td><code>Unauthorized</code></td><td>Auth failed</td><td>Check credentials/tokens</td></tr>
|
||
|
|
<tr><td><code>Database connection timeout</code></td><td>Database unreachable</td><td>Check DB health</td></tr>
|
||
|
|
<tr><td><code>404 Not Found</code></td><td>Endpoint doesn't exist</td><td>Check API routes</td></tr>
|
||
|
|
<tr><td><code>Slow query</code></td><td>Database query taking time</td><td>Optimize query or check DB</td></tr>
|
||
|
|
</tbody></table>
|
||
|
|
</div>
|
||
|
|
<hr />
|
||
|
|
<h2 id="proactive-monitoring-practices"><a class="header" href="#proactive-monitoring-practices">Proactive Monitoring Practices</a></h2>
|
||
|
|
<h3 id="weekly-review"><a class="header" href="#weekly-review">Weekly Review</a></h3>
|
||
|
|
<pre><code class="language-bash"># Every Monday (or your weekly cadence):
|
||
|
|
|
||
|
|
1. Review incidents from past week
|
||
|
|
- Were any preventable?
|
||
|
|
- Any patterns?
|
||
|
|
|
||
|
|
2. Check alert tuning
|
||
|
|
- False alarms?
|
||
|
|
- Missed issues?
|
||
|
|
- Adjust thresholds if needed
|
||
|
|
|
||
|
|
3. Capacity check
|
||
|
|
- How much headroom remaining?
|
||
|
|
- Plan for growth?
|
||
|
|
|
||
|
|
4. Log analysis
|
||
|
|
- Any concerning patterns?
|
||
|
|
- Warnings that should be errors?
|
||
|
|
|
||
|
|
5. Update runbooks if needed
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="monthly-review"><a class="header" href="#monthly-review">Monthly Review</a></h3>
|
||
|
|
<pre><code class="language-bash"># First of each month:
|
||
|
|
|
||
|
|
1. Performance trends
|
||
|
|
- Response time trending up/down?
|
||
|
|
- Error rate changing?
|
||
|
|
- Resource usage changing?
|
||
|
|
|
||
|
|
2. Capacity forecast
|
||
|
|
- Extrapolate current trends
|
||
|
|
- Plan for growth
|
||
|
|
- Schedule scaling if needed
|
||
|
|
|
||
|
|
3. Incident review
|
||
|
|
- MTBF (Mean Time Between Failures)
|
||
|
|
- MTTR (Mean Time To Resolve)
|
||
|
|
- MTTI (Mean Time To Identify)
|
||
|
|
- Are we improving?
|
||
|
|
|
||
|
|
4. Tool/alert improvements
|
||
|
|
- New monitoring needs?
|
||
|
|
- Alert fatigue issues?
|
||
|
|
- Better ways to visualize data?
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="health-check-checklist"><a class="header" href="#health-check-checklist">Health Check Checklist</a></h2>
|
||
|
|
<h3 id="pre-deployment-health-check"><a class="header" href="#pre-deployment-health-check">Pre-Deployment Health Check</a></h3>
|
||
|
|
<pre><code>Before any deployment, verify:
|
||
|
|
☐ All pods running: kubectl get pods
|
||
|
|
☐ No recent errors: kubectl logs --since=1h
|
||
|
|
☐ Resource usage normal: kubectl top pods
|
||
|
|
☐ Services healthy: curl /health
|
||
|
|
☐ Recent events normal: kubectl get events
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="post-deployment-health-check"><a class="header" href="#post-deployment-health-check">Post-Deployment Health Check</a></h3>
|
||
|
|
<pre><code>After deployment, verify for 2 hours:
|
||
|
|
☐ All new pods running
|
||
|
|
☐ Old pods terminated
|
||
|
|
☐ Health endpoints responding
|
||
|
|
☐ No spike in error logs
|
||
|
|
☐ Resource usage within expected range
|
||
|
|
☐ Response time normal
|
||
|
|
☐ No pod restarts
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="daily-health-check"><a class="header" href="#daily-health-check">Daily Health Check</a></h3>
|
||
|
|
<pre><code>Once per business day:
|
||
|
|
☐ kubectl get pods (all Running, 1/1 Ready)
|
||
|
|
☐ curl http://localhost:8001/health (200 OK)
|
||
|
|
☐ kubectl logs --since=24h | grep ERROR (few to none)
|
||
|
|
☐ kubectl top pods (normal usage)
|
||
|
|
☐ kubectl get events (no warnings)
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="monitoring-runbook-checklist"><a class="header" href="#monitoring-runbook-checklist">Monitoring Runbook Checklist</a></h2>
|
||
|
|
<pre><code>☐ Verified automated health checks running
|
||
|
|
☐ Manual health checks performed (daily)
|
||
|
|
☐ Dashboards set up and visible
|
||
|
|
☐ Alert thresholds tuned
|
||
|
|
☐ Log patterns identified
|
||
|
|
☐ Baselines recorded
|
||
|
|
☐ Escalation procedures understood
|
||
|
|
☐ Team trained on monitoring
|
||
|
|
☐ Alert responses tested
|
||
|
|
☐ Runbooks up to date
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="common-monitoring-issues"><a class="header" href="#common-monitoring-issues">Common Monitoring Issues</a></h2>
|
||
|
|
<h3 id="false-alerts"><a class="header" href="#false-alerts">False Alerts</a></h3>
|
||
|
|
<p><strong>Problem</strong>: Alert fires but service is actually fine</p>
|
||
|
|
<p><strong>Solution</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Verify manually (don't just assume false)</li>
|
||
|
|
<li>Check alert threshold (might be too sensitive)</li>
|
||
|
|
<li>Adjust threshold if consistently false</li>
|
||
|
|
<li>Document the change</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="alert-fatigue"><a class="header" href="#alert-fatigue">Alert Fatigue</a></h3>
|
||
|
|
<p><strong>Problem</strong>: Too many alerts, getting ignored</p>
|
||
|
|
<p><strong>Solution</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Review all alerts</li>
|
||
|
|
<li>Disable/adjust non-actionable ones</li>
|
||
|
|
<li>Consolidate related alerts</li>
|
||
|
|
<li>Focus on critical-only alerts</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="missing-alerts"><a class="header" href="#missing-alerts">Missing Alerts</a></h3>
|
||
|
|
<p><strong>Problem</strong>: Issue happens but no alert fired</p>
|
||
|
|
<p><strong>Solution</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Investigate why alert didn't fire</li>
|
||
|
|
<li>Check alert condition</li>
|
||
|
|
<li>Add new alert for this issue</li>
|
||
|
|
<li>Test the new alert</li>
|
||
|
|
</ol>
|
||
|
|
<h3 id="lag-in-monitoring"><a class="header" href="#lag-in-monitoring">Lag in Monitoring</a></h3>
|
||
|
|
<p><strong>Problem</strong>: Dashboard/alerts slow to update</p>
|
||
|
|
<p><strong>Solution</strong>:</p>
|
||
|
|
<ol>
|
||
|
|
<li>Check monitoring system performance</li>
|
||
|
|
<li>Increase scrape frequency if appropriate</li>
|
||
|
|
<li>Reduce data retention if storage issue</li>
|
||
|
|
<li>Investigate database performance</li>
|
||
|
|
</ol>
|
||
|
|
<hr />
|
||
|
|
<h2 id="monitoring-tools--commands"><a class="header" href="#monitoring-tools--commands">Monitoring Tools &amp; Commands</a></h2>
|
||
|
|
<h3 id="kubectl-commands"><a class="header" href="#kubectl-commands">kubectl Commands</a></h3>
|
||
|
|
<pre><code class="language-bash"># Pod monitoring
|
||
|
|
kubectl get pods -n vapora
|
||
|
|
kubectl get pods -n vapora -w # Watch mode
|
||
|
|
kubectl describe pod &lt;pod&gt; -n vapora
|
||
|
|
kubectl logs &lt;pod&gt; -n vapora -f
|
||
|
|
|
||
|
|
# Resource monitoring
|
||
|
|
kubectl top nodes
|
||
|
|
kubectl top pods -n vapora
|
||
|
|
kubectl describe nodes
|
||
|
|
|
||
|
|
# Event monitoring
|
||
|
|
kubectl get events -n vapora --sort-by='.lastTimestamp'
|
||
|
|
kubectl get events -n vapora --watch
|
||
|
|
|
||
|
|
# Health checks
|
||
|
|
kubectl get --raw /healthz # API health
|
||
|
|
</code></pre>
|
||
|
|
<h3 id="useful-commands"><a class="header" href="#useful-commands">Useful Commands</a></h3>
|
||
|
|
<pre><code class="language-bash"># Check API responsiveness
|
||
|
|
curl -v http://localhost:8001/health
|
||
|
|
|
||
|
|
# Check all endpoints have pods
|
||
|
|
for svc in backend agents llm-router; do
|
||
|
|
echo "$svc endpoints:"
|
||
|
|
kubectl get endpoints vapora-$svc -n vapora
|
||
|
|
done
|
||
|
|
|
||
|
|
# Monitor pod restarts
|
||
|
|
watch 'kubectl get pods -n vapora -o jsonpath="{range .items[*]}{.metadata.name}{\" \"}{.status.containerStatuses[0].restartCount}{\"\\n\"}{end}"'
|
||
|
|
|
||
|
|
# Find pods with high restarts
|
||
|
|
kubectl get pods -n vapora -o json | jq '.items[] | select(.status.containerStatuses[0].restartCount &gt; 5) | .metadata.name'
|
||
|
|
</code></pre>
|
||
|
|
<hr />
|
||
|
|
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
|
||
|
|
<ol>
|
||
|
|
<li><strong>Set up dashboards</strong> - Create Grafana/Prometheus dashboards if not available</li>
|
||
|
|
<li><strong>Configure alerts</strong> - Set thresholds based on baselines</li>
|
||
|
|
<li><strong>Test alerting</strong> - Verify Slack/email notifications work</li>
|
||
|
|
<li><strong>Train team</strong> - Ensure everyone knows how to read dashboards</li>
|
||
|
|
<li><strong>Document baselines</strong> - Record normal metrics for comparison</li>
|
||
|
|
<li><strong>Automate checks</strong> - Use CI/CD health check pipelines</li>
|
||
|
|
<li><strong>Review regularly</strong> - Weekly/monthly health check reviews</li>
|
||
|
|
</ol>
|
||
|
|
<hr />
|
||
|
|
<p><strong>Last Updated</strong>: 2026-01-12<br />
|
||
|
|
<strong>Status</strong>: Production-ready</p>
|
||
|
|
|
||
|
|
</main>
|
||
|
|
|
||
|
|
<nav class="nav-wrapper" aria-label="Page navigation">
|
||
|
|
<!-- Mobile navigation buttons -->
|
||
|
|
<a rel="prev" href="../../operations/pre-deployment-checklist.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/on-call-procedures.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<div style="clear: both"></div>
|
||
|
|
</nav>
|
||
|
|
</div>
|
||
|
|
</div>
|
||
|
|
|
||
|
|
<nav class="nav-wide-wrapper" aria-label="Page navigation">
|
||
|
|
<a rel="prev" href="../../operations/pre-deployment-checklist.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
|
||
|
|
<i class="fa fa-angle-left"></i>
|
||
|
|
</a>
|
||
|
|
|
||
|
|
<a rel="next prefetch" href="../../operations/on-call-procedures.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
|
||
|
|
<i class="fa fa-angle-right"></i>
|
||
|
|
</a>
|
||
|
|
</nav>
|
||
|
|
|
||
|
|
</div>
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
|
||
|
|
<script>
|
||
|
|
window.playground_copyable = true;
|
||
|
|
</script>
|
||
|
|
|
||
|
|
|
||
|
|
<script src="../elasticlunr.min.js"></script>
|
||
|
|
<script src="../mark.min.js"></script>
|
||
|
|
<script src="../searcher.js"></script>
|
||
|
|
|
||
|
|
<script src="../clipboard.min.js"></script>
|
||
|
|
<script src="../highlight.js"></script>
|
||
|
|
<script src="../book.js"></script>
|
||
|
|
|
||
|
|
<!-- Custom JS scripts -->
|
||
|
|
|
||
|
|
|
||
|
|
</div>
|
||
|
|
</body>
|
||
|
|
</html>
|