Vapora/docs/operations/monitoring-operations.html

787 lines
32 KiB
HTML
Raw Normal View History

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>Monitoring &amp; Operations - VAPORA Platform Documentation</title>
<!-- Custom HTML head -->
<meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../favicon.svg">
<link rel="shortcut icon" href="../favicon.png">
<link rel="stylesheet" href="../css/variables.css">
<link rel="stylesheet" href="../css/general.css">
<link rel="stylesheet" href="../css/chrome.css">
<link rel="stylesheet" href="../css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="../fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
const path_to_root = "../";
const default_light_theme = "light";
const default_dark_theme = "dark";
</script>
<!-- Start loading toc.js asap -->
<script src="../toc.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
// Strip one pair of wrapping double quotes from the stored theme and sidebar
// preferences, rewriting the value in place when it is quoted.
try {
    // Each key is checked on its own: getItem() returns null for a key that
    // was never stored, and a missing theme must not stop the sidebar value
    // from being normalised (calling startsWith on null would throw).
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
            localStorage.setItem(key, stored.slice(1, stored.length - 1));
        }
    }
} catch (e) {
    // Best effort: any failure while accessing storage is deliberately ignored.
}
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
// Pick the fallback theme from the colour-scheme media query; a stored
// preference, when one can be read, takes precedence over it.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
    ? default_dark_theme
    : default_light_theme;
let theme;
try {
    theme = localStorage.getItem('mdbook-theme');
} catch (e) {
    // Reading storage failed: theme stays undefined and the fallback applies.
}
if (theme == null) {
    // Covers both null (nothing stored) and undefined (read failed).
    theme = default_theme;
}
// Replace the static 'light' class with the chosen theme and add the "js" class.
const html = document.documentElement;
html.classList.remove('light');
html.classList.add(theme, "js");
</script>
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
// Work out whether the sidebar starts 'visible' or 'hidden', then reflect that
// in the toggle checkbox and in the class list of the root element.
let sidebar = null;
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
if (document.body.clientWidth < 1080) {
    // Below 1080px the stored preference is not consulted at all.
    sidebar = 'hidden';
} else {
    try {
        sidebar = localStorage.getItem('mdbook-sidebar');
    } catch (e) { }
    if (!sidebar) {
        // Nothing usable was read: default to showing the sidebar.
        sidebar = 'visible';
    }
}
sidebar_toggle.checked = (sidebar === 'visible');
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
</script>
<!-- Table-of-contents sidebar: the scrollbox is filled in by script, with an iframe fallback inside noscript -->
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<!-- The title gives the fallback frame an accessible name -->
<iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
</noscript>
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
</label>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
</button>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
</ul>
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
<i class="fa fa-search"></i>
</button>
</div>
<h1 class="menu-title">VAPORA Platform Documentation</h1>
<div class="right-buttons">
<a href="../print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
</a>
<a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
<i id="git-repository-button" class="fa fa-github"></i>
</a>
<a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/monitoring-operations.md" title="Suggest an edit" aria-label="Suggest an edit">
<i id="git-edit-button" class="fa fa-edit"></i>
</a>
</div>
</div>
<!-- Search UI: hidden by default; holds the search input and the results list -->
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<!-- aria-label names the field; the placeholder alone is not a reliable label -->
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-label="Search this book" aria-controls="searchresults-outer" aria-describedby="searchresults-header">
</form>
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
// Copy the initial sidebar state onto the toggle, the sidebar element and any
// links currently inside the sidebar.
{
    const sidebarShown = sidebar === 'visible';
    document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
    document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);
    // Links are tabbable (0) only while the sidebar is shown, otherwise -1.
    const linkTabIndex = sidebarShown ? 0 : -1;
    for (const link of Array.from(document.querySelectorAll('#sidebar a'))) {
        link.setAttribute('tabIndex', linkTabIndex);
    }
}
</script>
<div id="content" class="content">
<main>
<h1 id="monitoring--health-check-operations"><a class="header" href="#monitoring--health-check-operations">Monitoring &amp; Health Check Operations</a></h1>
<p>Guide for continuous monitoring and health checks of VAPORA in production.</p>
<hr />
<h2 id="overview"><a class="header" href="#overview">Overview</a></h2>
<p><strong>Responsibility</strong>: Maintain visibility into VAPORA service health through monitoring, logging, and alerting</p>
<p><strong>Key Activities</strong>:</p>
<ul>
<li>Regular health checks (automated and manual)</li>
<li>Alert response and investigation</li>
<li>Trend analysis and capacity planning</li>
<li>Incident prevention through early detection</li>
</ul>
<p><strong>Success Metric</strong>: Detect and respond to issues before users are significantly impacted</p>
<hr />
<h2 id="automated-health-checks"><a class="header" href="#automated-health-checks">Automated Health Checks</a></h2>
<h3 id="kubernetes-health-check-pipeline"><a class="header" href="#kubernetes-health-check-pipeline">Kubernetes Health Check Pipeline</a></h3>
<p>If using CI/CD, leverage automatic health monitoring:</p>
<p><strong>GitHub Actions</strong>:</p>
<pre><code class="language-bash"># Runs every 15 minutes (quick check)
# Runs every 6 hours (comprehensive diagnostics)
# See: .github/workflows/health-check.yml
</code></pre>
<p><strong>Woodpecker</strong>:</p>
<pre><code class="language-bash"># Runs every 15 minutes (quick check)
# Runs every 6 hours (comprehensive diagnostics)
# See: .woodpecker/health-check.yml
</code></pre>
<p><strong>Artifacts Generated</strong>:</p>
<ul>
<li><code>docker-health.log</code> - Docker container status</li>
<li><code>k8s-health.log</code> - Kubernetes deployment status</li>
<li><code>k8s-diagnostics.log</code> - Full system diagnostics</li>
<li><code>docker-diagnostics.log</code> - Docker system info</li>
<li><code>HEALTH_REPORT.md</code> - Summary report</li>
</ul>
<h3 id="quick-manual-health-check"><a class="header" href="#quick-manual-health-check">Quick Manual Health Check</a></h3>
<pre><code class="language-bash"># Run this command to get instant health status
export NAMESPACE=vapora
echo "=== Pod Status ==="
kubectl get pods -n $NAMESPACE
echo ""
echo "=== Service Health ==="
kubectl get endpoints -n $NAMESPACE
echo ""
echo "=== Recent Events ==="
kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -10
echo ""
echo "=== Resource Usage ==="
kubectl top pods -n $NAMESPACE
echo ""
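echo "=== API Health ==="
# Assumes the backend API is reachable on localhost:8001 (for example through kubectl port-forward)
curl -s http://localhost:8001/health | jq .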
</code></pre>
<hr />
<h2 id="manual-daily-monitoring"><a class="header" href="#manual-daily-monitoring">Manual Daily Monitoring</a></h2>
<h3 id="morning-check-start-of-business-day"><a class="header" href="#morning-check-start-of-business-day">Morning Check (Start of Business Day)</a></h3>
<pre><code class="language-bash"># Run at start of business day (or when starting shift)
echo "=== MORNING HEALTH CHECK ==="
echo "Date: $(date -u)"
# 1. Cluster Status
echo "Cluster Status:"
kubectl cluster-info | grep server
# 2. Node Status
echo ""
echo "Node Status:"
kubectl get nodes
# Should show: All nodes Ready
# 3. Pod Status
echo ""
echo "Pod Status:"
kubectl get pods -n vapora
# Should show: All Running, 1/1 Ready
# 4. Service Endpoints
echo ""
echo "Service Endpoints:"
kubectl get endpoints -n vapora
# Should show: All services have endpoints (not empty)
# 5. Resource Usage
echo ""
echo "Resource Usage:"
kubectl top nodes
kubectl top pods -n vapora | head -10
# 6. Recent Errors
echo ""
echo "Recent Errors (last 1 hour):"
kubectl logs deployment/vapora-backend -n vapora --since=1h | grep -i error | wc -l
# Should show: 0 or very few errors
# 7. Overall Status
echo ""
echo "Overall Status: ✅ Healthy"
# If any issues found: Document and investigate
</code></pre>
<h3 id="mid-day-check-every-4-6-hours"><a class="header" href="#mid-day-check-every-4-6-hours">Mid-Day Check (Every 4-6 hours)</a></h3>
<pre><code class="language-bash"># Quick sanity check during business hours
# 1. Service Responsiveness
curl -s http://localhost:8001/health | jq '.status'
# Should return: "healthy"
# 2. Pod Restart Tracking
kubectl get pods -n vapora -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[0].restartCount}{"\n"}{end}'
# Restart count should not be increasing rapidly
# 3. Error Log Check
kubectl logs deployment/vapora-backend -n vapora --since=4h --timestamps | grep ERROR | tail -5
# Should show: Few to no errors
# 4. Performance Check
kubectl top pods -n vapora | tail -5
# CPU/Memory should be in normal range
</code></pre>
<h3 id="end-of-day-check-before-shift-end"><a class="header" href="#end-of-day-check-before-shift-end">End-of-Day Check (Before Shift End)</a></h3>
<pre><code class="language-bash"># Summary check before handing off to on-call
echo "=== END OF DAY SUMMARY ==="
# Current status
kubectl get pods -n vapora
kubectl top pods -n vapora
# Any concerning trends?
echo ""
echo "Checking for concerning events..."
kubectl get events -n vapora --sort-by='.lastTimestamp' | grep -i warning
# Any pod restarts?
echo ""
echo "Pod restart status:"
kubectl get pods -n vapora -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.status.containerStatuses[0].restartCount}{"\n"}{end}' | grep -v ": 0"
# Document for next shift
echo ""
echo "Status for on-call: All normal / Issues detected"
</code></pre>
<hr />
<h2 id="dashboard-setup--monitoring"><a class="header" href="#dashboard-setup--monitoring">Dashboard Setup &amp; Monitoring</a></h2>
<h3 id="essential-dashboards-to-monitor"><a class="header" href="#essential-dashboards-to-monitor">Essential Dashboards to Monitor</a></h3>
<p>If you have Grafana/Prometheus, create these dashboards:</p>
<h4 id="1-service-health-dashboard"><a class="header" href="#1-service-health-dashboard">1. Service Health Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>Pod running count (should be stable at expected count)</li>
<li>Pod restart count (should not increase rapidly)</li>
<li>Service endpoint availability (should be &gt;99%)</li>
<li>API response time (p99, track trends)</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>Pod count drops below expected</li>
<li>Restart count increasing</li>
<li>Endpoints empty</li>
<li>Response time &gt;2s</li>
</ul>
<h4 id="2-resource-utilization-dashboard"><a class="header" href="#2-resource-utilization-dashboard">2. Resource Utilization Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>CPU usage per pod</li>
<li>Memory usage per pod</li>
<li>Node capacity (CPU, memory, disk)</li>
<li>Network I/O</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>Any pod &gt;80% CPU/Memory</li>
<li>Any node &gt;85% capacity</li>
<li>Memory trending upward consistently</li>
</ul>
<h4 id="3-error-rate-dashboard"><a class="header" href="#3-error-rate-dashboard">3. Error Rate Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>4xx error rate (should be low)</li>
<li>5xx error rate (should be minimal)</li>
<li>Error rate by endpoint</li>
<li>Error rate by service</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>5xx error rate &gt;5%</li>
<li>4xx error rate &gt;10%</li>
<li>Sudden spike in errors</li>
</ul>
<h4 id="4-application-metrics-dashboard"><a class="header" href="#4-application-metrics-dashboard">4. Application Metrics Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>Request rate (RPS)</li>
<li>Request latency (p50, p95, p99)</li>
<li>Active connections</li>
<li>Database query time</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>Request rate suddenly drops (might indicate outage)</li>
<li>Latency spikes above baseline</li>
<li>Database queries slow</li>
</ul>
<h3 id="grafana-setup-example"><a class="header" href="#grafana-setup-example">Grafana Setup Example</a></h3>
<pre><code class="language-bash"># If setting up Grafana monitoring
1. Deploy Prometheus scraping Kubernetes metrics
2. Create dashboard with above panels
3. Set alert rules:
- CPU &gt;80%: Warning
- Memory &gt;85%: Warning
- Error rate &gt;5%: Critical
- Pod crashed: Critical
- Response time &gt;2s: Warning
4. Configure notifications to Slack/email
</code></pre>
<hr />
<h2 id="alert-response-procedures"><a class="header" href="#alert-response-procedures">Alert Response Procedures</a></h2>
<h3 id="when-alert-fires"><a class="header" href="#when-alert-fires">When Alert Fires</a></h3>
<pre><code>Alert Received
Step 1: Verify it's real (not false alarm)
- Check dashboard
- Check manually (curl endpoints, kubectl get pods)
- Ask in #deployments if unsure
Step 2: Assess severity
- Service completely down? Severity 1
- Service partially degraded? Severity 2
- Warning/trending issue? Severity 3
Step 3: Declare incident (if Severity 1-2)
- Create #incident channel
- Follow Incident Response Runbook
- See: incident-response-runbook.md
Step 4: Investigate (if Severity 3)
- Document in ticket
- Schedule investigation
- Monitor for escalation
</code></pre>
<h3 id="common-alerts--actions"><a class="header" href="#common-alerts--actions">Common Alerts &amp; Actions</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Alert</th><th>Cause</th><th>Response</th></tr></thead><tbody>
<tr><td><strong>Pod CrashLoopBackOff</strong></td><td>App crashing</td><td>Get logs, fix, restart</td></tr>
<tr><td><strong>High CPU &gt;80%</strong></td><td>Resource exhausted</td><td>Scale up or reduce load</td></tr>
<tr><td><strong>High Memory &gt;85%</strong></td><td>Memory leak or surge</td><td>Investigate or restart</td></tr>
<tr><td><strong>Error rate spike</strong></td><td>App issue</td><td>Check logs, might rollback</td></tr>
<tr><td><strong>Response time spike</strong></td><td>Slow queries/I/O</td><td>Check database, might restart</td></tr>
<tr><td><strong>Pod pending</strong></td><td>Can't schedule</td><td>Check node resources</td></tr>
<tr><td><strong>Endpoints empty</strong></td><td>Service down</td><td>Verify service exists</td></tr>
<tr><td><strong>Disk full</strong></td><td>Storage exhausted</td><td>Clean up or expand</td></tr>
</tbody></table>
</div>
<hr />
<h2 id="metric-baselines--trends"><a class="header" href="#metric-baselines--trends">Metric Baselines &amp; Trends</a></h2>
<h3 id="establishing-baselines"><a class="header" href="#establishing-baselines">Establishing Baselines</a></h3>
<p>Record these metrics during normal operation:</p>
<pre><code class="language-bash"># CPU per pod (typical)
Backend: 200-400m per pod
Agents: 300-500m per pod
LLM Router: 100-200m per pod
# Memory per pod (typical)
Backend: 256-512Mi per pod
Agents: 128-256Mi per pod
LLM Router: 64-128Mi per pod
# Response time (typical)
Backend: p50: 50ms, p95: 200ms, p99: 500ms
Frontend: Load time: 2-3 seconds
# Error rate (typical)
Backend: 4xx: &lt;1%, 5xx: &lt;0.1%
Frontend: &lt;5% user-visible errors
# Pod restart count
Should remain 0 (no restarts expected in normal operation)
</code></pre>
<h3 id="detecting-anomalies"><a class="header" href="#detecting-anomalies">Detecting Anomalies</a></h3>
<p>Compare current metrics to baseline:</p>
<pre><code class="language-bash"># If CPU 2x normal:
- Check if load increased
- Check for resource leak
- Monitor for further increase
# If Memory increasing:
- Might indicate memory leak
- Monitor over time (1-2 hours)
- Restart if clearly trending up
# If Error rate 10x:
- Something broke recently
- Check recent deployment
- Consider rollback
# If new process consuming resources:
- Identify the new resource consumer
- Investigate purpose
- Kill if unintended
</code></pre>
<hr />
<h2 id="capacity-planning"><a class="header" href="#capacity-planning">Capacity Planning</a></h2>
<h3 id="when-to-scale"><a class="header" href="#when-to-scale">When to Scale</a></h3>
<p>Monitor trends and plan ahead:</p>
<pre><code class="language-bash"># Trigger capacity planning if:
- Average CPU &gt;60%
- Average Memory &gt;60%
- Peak usage trending upward
- Disk usage &gt;80%
# Questions to ask:
- Is traffic increasing? Seasonal spike?
- Did we add features? New workload?
- Do we have capacity for growth?
- Should we scale now or wait?
</code></pre>
<h3 id="scaling-actions"><a class="header" href="#scaling-actions">Scaling Actions</a></h3>
<pre><code class="language-bash"># Quick scale (temporary):
kubectl scale deployment/vapora-backend --replicas=5 -n vapora
# Permanent scale (update deployment.yaml):
# Edit: replicas: 5
# Apply: kubectl apply -f deployment.yaml
# Add nodes (infrastructure):
# Contact infrastructure team
# Reduce resource consumption:
# Investigate slow queries, memory leaks, etc.
</code></pre>
<hr />
<h2 id="log-analysis--troubleshooting"><a class="header" href="#log-analysis--troubleshooting">Log Analysis &amp; Troubleshooting</a></h2>
<h3 id="checking-logs"><a class="header" href="#checking-logs">Checking Logs</a></h3>
<pre><code class="language-bash"># Most recent logs
kubectl logs deployment/vapora-backend -n vapora
# Last N lines
kubectl logs deployment/vapora-backend -n vapora --tail=100
# From specific time
kubectl logs deployment/vapora-backend -n vapora --since=1h
# Follow/tail logs
kubectl logs deployment/vapora-backend -n vapora -f
# From specific pod
kubectl logs pod-name -n vapora
# Previous pod (if crashed)
kubectl logs pod-name -n vapora --previous
</code></pre>
<h3 id="log-patterns-to-watch-for"><a class="header" href="#log-patterns-to-watch-for">Log Patterns to Watch For</a></h3>
<pre><code class="language-bash"># Error patterns
kubectl logs deployment/vapora-backend -n vapora | grep -i "error\|exception\|fatal"
# Database issues
kubectl logs deployment/vapora-backend -n vapora | grep -i "database\|connection\|sql"
# Authentication issues
kubectl logs deployment/vapora-backend -n vapora | grep -i "auth\|permission\|forbidden"
# Resource issues
kubectl logs deployment/vapora-backend -n vapora | grep -i "memory\|cpu\|timeout"
# Startup issues (if pod restarting)
kubectl logs pod-name -n vapora --previous | head -50
</code></pre>
<h3 id="common-log-messages--meaning"><a class="header" href="#common-log-messages--meaning">Common Log Messages &amp; Meaning</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Log Message</th><th>Meaning</th><th>Action</th></tr></thead><tbody>
<tr><td><code>Connection refused</code></td><td>Service not listening</td><td>Check if service started</td></tr>
<tr><td><code>Out of memory</code></td><td>Memory exhausted</td><td>Increase limits or scale</td></tr>
<tr><td><code>Unauthorized</code></td><td>Auth failed</td><td>Check credentials/tokens</td></tr>
<tr><td><code>Database connection timeout</code></td><td>Database unreachable</td><td>Check DB health</td></tr>
<tr><td><code>404 Not Found</code></td><td>Endpoint doesn't exist</td><td>Check API routes</td></tr>
<tr><td><code>Slow query</code></td><td>Database query taking time</td><td>Optimize query or check DB</td></tr>
</tbody></table>
</div>
<hr />
<h2 id="proactive-monitoring-practices"><a class="header" href="#proactive-monitoring-practices">Proactive Monitoring Practices</a></h2>
<h3 id="weekly-review"><a class="header" href="#weekly-review">Weekly Review</a></h3>
<pre><code class="language-bash"># Every Monday (or your weekly cadence):
1. Review incidents from past week
- Were any preventable?
- Any patterns?
2. Check alert tuning
- False alarms?
- Missed issues?
- Adjust thresholds if needed
3. Capacity check
- How much headroom remaining?
- Plan for growth?
4. Log analysis
- Any concerning patterns?
- Warnings that should be errors?
5. Update runbooks if needed
</code></pre>
<h3 id="monthly-review"><a class="header" href="#monthly-review">Monthly Review</a></h3>
<pre><code class="language-bash"># First of each month:
1. Performance trends
- Response time trending up/down?
- Error rate changing?
- Resource usage changing?
2. Capacity forecast
- Extrapolate current trends
- Plan for growth
- Schedule scaling if needed
3. Incident review
- MTBF (Mean Time Between Failures)
- MTTR (Mean Time To Resolve)
- MTTI (Mean Time To Identify)
- Are we improving?
4. Tool/alert improvements
- New monitoring needs?
- Alert fatigue issues?
- Better ways to visualize data?
</code></pre>
<hr />
<h2 id="health-check-checklist"><a class="header" href="#health-check-checklist">Health Check Checklist</a></h2>
<h3 id="pre-deployment-health-check"><a class="header" href="#pre-deployment-health-check">Pre-Deployment Health Check</a></h3>
<pre><code>Before any deployment, verify:
☐ All pods running: kubectl get pods
☐ No recent errors: kubectl logs --since=1h
☐ Resource usage normal: kubectl top pods
☐ Services healthy: curl /health
☐ Recent events normal: kubectl get events
</code></pre>
<h3 id="post-deployment-health-check"><a class="header" href="#post-deployment-health-check">Post-Deployment Health Check</a></h3>
<pre><code>After deployment, verify for 2 hours:
☐ All new pods running
☐ Old pods terminated
☐ Health endpoints responding
☐ No spike in error logs
☐ Resource usage within expected range
☐ Response time normal
☐ No pod restarts
</code></pre>
<h3 id="daily-health-check"><a class="header" href="#daily-health-check">Daily Health Check</a></h3>
<pre><code>Once per business day:
☐ kubectl get pods (all Running, 1/1 Ready)
☐ curl http://localhost:8001/health (200 OK)
☐ kubectl logs --since=24h | grep ERROR (few to none)
☐ kubectl top pods (normal usage)
☐ kubectl get events (no warnings)
</code></pre>
<hr />
<h2 id="monitoring-runbook-checklist"><a class="header" href="#monitoring-runbook-checklist">Monitoring Runbook Checklist</a></h2>
<pre><code>☐ Verified automated health checks running
☐ Manual health checks performed (daily)
☐ Dashboards set up and visible
☐ Alert thresholds tuned
☐ Log patterns identified
☐ Baselines recorded
☐ Escalation procedures understood
☐ Team trained on monitoring
☐ Alert responses tested
☐ Runbooks up to date
</code></pre>
<hr />
<h2 id="common-monitoring-issues"><a class="header" href="#common-monitoring-issues">Common Monitoring Issues</a></h2>
<h3 id="false-alerts"><a class="header" href="#false-alerts">False Alerts</a></h3>
<p><strong>Problem</strong>: Alert fires but service is actually fine</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Verify manually (don't just assume it is a false alarm)</li>
<li>Check alert threshold (might be too sensitive)</li>
<li>Adjust threshold if consistently false</li>
<li>Document the change</li>
</ol>
<h3 id="alert-fatigue"><a class="header" href="#alert-fatigue">Alert Fatigue</a></h3>
<p><strong>Problem</strong>: Too many alerts, getting ignored</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Review all alerts</li>
<li>Disable/adjust non-actionable ones</li>
<li>Consolidate related alerts</li>
<li>Focus on critical-only alerts</li>
</ol>
<h3 id="missing-alerts"><a class="header" href="#missing-alerts">Missing Alerts</a></h3>
<p><strong>Problem</strong>: Issue happens but no alert fired</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Investigate why alert didn't fire</li>
<li>Check alert condition</li>
<li>Add new alert for this issue</li>
<li>Test the new alert</li>
</ol>
<h3 id="lag-in-monitoring"><a class="header" href="#lag-in-monitoring">Lag in Monitoring</a></h3>
<p><strong>Problem</strong>: Dashboard/alerts slow to update</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Check monitoring system performance</li>
<li>Increase scrape frequency if appropriate</li>
<li>Reduce data retention if storage issue</li>
<li>Investigate database performance</li>
</ol>
<hr />
<h2 id="monitoring-tools--commands"><a class="header" href="#monitoring-tools--commands">Monitoring Tools &amp; Commands</a></h2>
<h3 id="kubectl-commands"><a class="header" href="#kubectl-commands">kubectl Commands</a></h3>
<pre><code class="language-bash"># Pod monitoring
kubectl get pods -n vapora
kubectl get pods -n vapora -w # Watch mode
kubectl describe pod &lt;pod&gt; -n vapora
kubectl logs &lt;pod&gt; -n vapora -f
# Resource monitoring
kubectl top nodes
kubectl top pods -n vapora
kubectl describe nodes
# Event monitoring
kubectl get events -n vapora --sort-by='.lastTimestamp'
kubectl get events -n vapora --watch
# Health checks
kubectl get --raw /healthz # API health
</code></pre>
<h3 id="useful-commands"><a class="header" href="#useful-commands">Useful Commands</a></h3>
<pre><code class="language-bash"># Check API responsiveness
curl -v http://localhost:8001/health
# Check all endpoints have pods
for svc in backend agents llm-router; do
echo "$svc endpoints:"
kubectl get endpoints vapora-$svc -n vapora
done
# Monitor pod restarts
watch 'kubectl get pods -n vapora -o jsonpath="{range .items[*]}{.metadata.name}{\" \"}{.status.containerStatuses[0].restartCount}{\"\\n\"}{end}"'
# Find pods with high restarts
kubectl get pods -n vapora -o json | jq '.items[] | select(.status.containerStatuses[0].restartCount &gt; 5) | .metadata.name'
</code></pre>
<hr />
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
<ol>
<li><strong>Set up dashboards</strong> - Create Grafana/Prometheus dashboards if not available</li>
<li><strong>Configure alerts</strong> - Set thresholds based on baselines</li>
<li><strong>Test alerting</strong> - Verify Slack/email notifications work</li>
<li><strong>Train team</strong> - Ensure everyone knows how to read dashboards</li>
<li><strong>Document baselines</strong> - Record normal metrics for comparison</li>
<li><strong>Automate checks</strong> - Use CI/CD health check pipelines</li>
<li><strong>Review regularly</strong> - Weekly/monthly health check reviews</li>
</ol>
<hr />
<p><strong>Last Updated</strong>: 2026-01-12
<strong>Status</strong>: Production-ready</p>
</main>
<!-- Previous/next chapter links; hrefs go up one level to the site root like the other "../" links on this page, then into operations/ -->
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="../operations/pre-deployment-checklist.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="../operations/on-call-procedures.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
<div style="clear: both"></div>
</nav>
</div>
</div>
<!-- Wide-layout previous/next chapter links; hrefs go up one level to the site root like the other "../" links on this page, then into operations/ -->
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="../operations/pre-deployment-checklist.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="../operations/on-call-procedures.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
</nav>
</div>
<!-- Sets the global flag window.playground_copyable to true ahead of the script files loaded below. NOTE(review): presumably read by book.js; confirm -->
<script>
window.playground_copyable = true;
</script>
<script src="../elasticlunr.min.js"></script>
<script src="../mark.min.js"></script>
<script src="../searcher.js"></script>
<script src="../clipboard.min.js"></script>
<script src="../highlight.js"></script>
<script src="../book.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>