<!-- Vapora/docs/operations/monitoring-operations.html -->

<!DOCTYPE HTML>
<html lang="en" class="light sidebar-visible" dir="ltr">
    <head>
        <!-- Book generated using mdBook -->
        <meta charset="UTF-8">
        <title>Monitoring &amp; Operations - VAPORA Platform Documentation</title>


        <!-- Custom HTML head -->

        <meta name="description" content="Comprehensive documentation for VAPORA, an intelligent development orchestration platform built entirely in Rust.">
        <meta name="viewport" content="width=device-width, initial-scale=1">
        <meta name="theme-color" content="#ffffff">

        <link rel="icon" href="../favicon.svg">
        <link rel="shortcut icon" href="../favicon.png">
        <link rel="stylesheet" href="../css/variables.css">
        <link rel="stylesheet" href="../css/general.css">
        <link rel="stylesheet" href="../css/chrome.css">
        <link rel="stylesheet" href="../css/print.css" media="print">

        <!-- Fonts -->
        <link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
        <link rel="stylesheet" href="../fonts/fonts.css">

        <!-- Highlight.js Stylesheets -->
        <link rel="stylesheet" id="highlight-css" href="../highlight.css">
        <link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
        <link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">

        <!-- Custom theme stylesheets -->


        <!-- Provide site root and default themes to javascript -->
        <script>
            // Relative path from this page to the book's root directory.
            const path_to_root = "../";
            // Theme names used when no theme has been saved; the inline theme script
            // further down picks one of them based on the prefers-color-scheme query.
            const default_light_theme = "light";
            const default_dark_theme = "dark";
        </script>
        <!-- Start loading toc.js asap -->
        <script src="../toc.js"></script>
    </head>
    <body>
    <div id="mdbook-help-container">
        <div id="mdbook-help-popup">
            <h2 class="mdbook-help-title">Keyboard shortcuts</h2>
            <div>
                <p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
                <p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
                <p>Press <kbd>?</kbd> to show this help</p>
                <p>Press <kbd>Esc</kbd> to hide this help</p>
            </div>
        </div>
    </div>
    <div id="body-container">
        <!-- Work around some values being stored in localStorage wrapped in quotes -->
        <script>
            // Some values were stored in localStorage wrapped in double quotes.
            // Rewrite each affected key once so later reads see the bare value.
            try {
                ['mdbook-theme', 'mdbook-sidebar'].forEach(function (key) {
                    const stored = localStorage.getItem(key);
                    // getItem() returns null for a key that was never set. Skip it
                    // instead of calling startsWith() on null, so a missing theme
                    // no longer prevents the sidebar value from being fixed.
                    if (stored !== null && stored.startsWith('"') && stored.endsWith('"')) {
                        localStorage.setItem(key, stored.slice(1, -1));
                    }
                });
            } catch (e) {
                // Accessing localStorage can throw (e.g. storage blocked by the
                // browser); this clean-up is best-effort, so ignore the failure.
            }
        </script>

        <!-- Set the theme before any content is loaded, prevents flash -->
        <script>
            // With no saved theme, follow the OS colour-scheme preference.
            const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches
                ? default_dark_theme
                : default_light_theme;
            let theme;
            try {
                theme = localStorage.getItem('mdbook-theme');
            } catch (e) {
                // Storage unavailable: theme stays undefined and the default applies.
            }
            if (theme == null) {
                // Covers both null (key absent) and undefined (read failed).
                theme = default_theme;
            }
            // Replace the static "light" class on <html> with the chosen theme,
            // then add the "js" class.
            const html = document.documentElement;
            html.classList.remove('light');
            html.classList.add(theme);
            html.classList.add("js");
        </script>

        <input type="checkbox" id="sidebar-toggle-anchor" class="hidden">

        <!-- Hide / unhide sidebar before it is displayed -->
        <script>
            // Start from "hidden"; only viewports at least 1080px wide consult the
            // saved preference, defaulting to "visible" when nothing usable is stored.
            let sidebar = 'hidden';
            const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
            if (document.body.clientWidth >= 1080) {
                let saved = null;
                try {
                    saved = localStorage.getItem('mdbook-sidebar');
                } catch (e) {
                    // Storage unavailable: fall through to the 'visible' default.
                }
                sidebar = saved || 'visible';
            }
            // Mirror the decision onto the toggle checkbox and the <html> class list.
            sidebar_toggle.checked = sidebar === 'visible';
            html.classList.remove('sidebar-visible');
            html.classList.add("sidebar-" + sidebar);
        </script>

        <nav id="sidebar" class="sidebar" aria-label="Table of contents">
            <!-- populated by js -->
            <mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
            <noscript>
                <iframe class="sidebar-iframe-outer" src="../toc.html" title="Table of contents"></iframe>
            </noscript>
            <div id="sidebar-resize-handle" class="sidebar-resize-handle">
                <div class="sidebar-resize-indicator"></div>
            </div>
        </nav>

        <div id="page-wrapper" class="page-wrapper">

            <div class="page">
                <div id="menu-bar-hover-placeholder"></div>
                <div id="menu-bar" class="menu-bar sticky">
                    <div class="left-buttons">
                        <label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
                            <i class="fa fa-bars"></i>
                        </label>
                        <button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
                            <i class="fa fa-paint-brush"></i>
                        </button>
                        <ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
                            <li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
                            <li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
                        </ul>
                        <button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
                            <i class="fa fa-search"></i>
                        </button>
                    </div>

                    <h1 class="menu-title">VAPORA Platform Documentation</h1>

                    <div class="right-buttons">
                        <a href="../print.html" title="Print this book" aria-label="Print this book">
                            <i id="print-button" class="fa fa-print"></i>
                        </a>
                        <a href="https://github.com/vapora-platform/vapora" title="Git repository" aria-label="Git repository">
                            <i id="git-repository-button" class="fa fa-github"></i>
                        </a>
                        <a href="https://github.com/vapora-platform/vapora/edit/main/docs/src/../operations/monitoring-operations.md" title="Suggest an edit" aria-label="Suggest an edit">
                            <i id="git-edit-button" class="fa fa-edit"></i>
                        </a>

                    </div>
                </div>

                <div id="search-wrapper" class="hidden">
                    <form id="searchbar-outer" class="searchbar-outer">
                        <input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-label="Search this book" aria-controls="searchresults-outer" aria-describedby="searchresults-header">
                    </form>
                    <div id="searchresults-outer" class="searchresults-outer hidden">
                        <div id="searchresults-header" class="searchresults-header"></div>
                        <ul id="searchresults">
                        </ul>
                    </div>
                </div>

                <!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
                <script>
                    {
                        // Block scope keeps the helper below out of the global namespace.
                        const sidebarShown = sidebar === 'visible';
                        // Expose the sidebar state to assistive technology.
                        document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebarShown);
                        document.getElementById('sidebar').setAttribute('aria-hidden', !sidebarShown);
                        // Links inside a hidden sidebar are taken out of the tab order.
                        for (const link of document.querySelectorAll('#sidebar a')) {
                            link.setAttribute('tabIndex', sidebarShown ? 0 : -1);
                        }
                    }
                </script>

                <div id="content" class="content">
                    <main>
                        <h1 id="monitoring--health-check-operations"><a class="header" href="#monitoring--health-check-operations">Monitoring &amp; Health Check Operations</a></h1>
<p>Guide for continuous monitoring and health checks of VAPORA in production.</p>
<hr />
<h2 id="overview"><a class="header" href="#overview">Overview</a></h2>
<p><strong>Responsibility</strong>: Maintain visibility into VAPORA service health through monitoring, logging, and alerting</p>
<p><strong>Key Activities</strong>:</p>
<ul>
<li>Regular health checks (automated and manual)</li>
<li>Alert response and investigation</li>
<li>Trend analysis and capacity planning</li>
<li>Incident prevention through early detection</li>
</ul>
<p><strong>Success Metric</strong>: Detect and respond to issues before users are significantly impacted</p>
<hr />
<h2 id="automated-health-checks"><a class="header" href="#automated-health-checks">Automated Health Checks</a></h2>
<h3 id="kubernetes-health-check-pipeline"><a class="header" href="#kubernetes-health-check-pipeline">Kubernetes Health Check Pipeline</a></h3>
<p>If using CI/CD, leverage automatic health monitoring:</p>
<p><strong>GitHub Actions</strong>:</p>
<pre><code class="language-bash"># Runs every 15 minutes (quick check)
# Runs every 6 hours (comprehensive diagnostics)
# See: .github/workflows/health-check.yml
</code></pre>
<p><strong>Woodpecker</strong>:</p>
<pre><code class="language-bash"># Runs every 15 minutes (quick check)
# Runs every 6 hours (comprehensive diagnostics)
# See: .woodpecker/health-check.yml
</code></pre>
<p><strong>Artifacts Generated</strong>:</p>
<ul>
<li><code>docker-health.log</code> - Docker container status</li>
<li><code>k8s-health.log</code> - Kubernetes deployments status</li>
<li><code>k8s-diagnostics.log</code> - Full system diagnostics</li>
<li><code>docker-diagnostics.log</code> - Docker system info</li>
<li><code>HEALTH_REPORT.md</code> - Summary report</li>
</ul>
<h3 id="quick-manual-health-check"><a class="header" href="#quick-manual-health-check">Quick Manual Health Check</a></h3>
<pre><code class="language-bash"># Run this command to get instant health status
export NAMESPACE=vapora

echo "=== Pod Status ==="
kubectl get pods -n $NAMESPACE
echo ""

echo "=== Service Health ==="
kubectl get endpoints -n $NAMESPACE
echo ""

echo "=== Recent Events ==="
kubectl get events -n $NAMESPACE --sort-by='.lastTimestamp' | tail -10
echo ""

echo "=== Resource Usage ==="
kubectl top pods -n $NAMESPACE
echo ""

echo "=== API Health ==="
curl -s http://localhost:8001/health | jq .
</code></pre>
<hr />
<h2 id="manual-daily-monitoring"><a class="header" href="#manual-daily-monitoring">Manual Daily Monitoring</a></h2>
<h3 id="morning-check-start-of-business-day"><a class="header" href="#morning-check-start-of-business-day">Morning Check (Start of Business Day)</a></h3>
<pre><code class="language-bash"># Run at start of business day (or when starting shift)

echo "=== MORNING HEALTH CHECK ==="
echo "Date: $(date -u)"

# 1. Cluster Status
echo "Cluster Status:"
kubectl cluster-info | grep -i "running at"

# 2. Node Status
echo ""
echo "Node Status:"
kubectl get nodes
# Should show: All nodes Ready

# 3. Pod Status
echo ""
echo "Pod Status:"
kubectl get pods -n vapora
# Should show: All Running, 1/1 Ready

# 4. Service Endpoints
echo ""
echo "Service Endpoints:"
kubectl get endpoints -n vapora
# Should show: All services have endpoints (not empty)

# 5. Resource Usage
echo ""
echo "Resource Usage:"
kubectl top nodes
kubectl top pods -n vapora | head -10

# 6. Recent Errors
echo ""
echo "Recent Errors (last 1 hour):"
kubectl logs deployment/vapora-backend -n vapora --since=1h | grep -i error | wc -l
# Should show: 0 or very few errors

# 7. Overall Status
echo ""
echo "Overall Status: healthy only if every check above passed"
# If any issues found: Document and investigate
</code></pre>
<h3 id="mid-day-check-every-4-6-hours"><a class="header" href="#mid-day-check-every-4-6-hours">Mid-Day Check (Every 4-6 hours)</a></h3>
<pre><code class="language-bash"># Quick sanity check during business hours

# 1. Service Responsiveness
curl -s http://localhost:8001/health | jq '.status'
# Should return: "healthy"

# 2. Pod Restart Tracking
kubectl get pods -n vapora -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.containerStatuses[0].restartCount}{"\n"}{end}'
# Restart count should not be increasing rapidly

# 3. Error Log Check
kubectl logs deployment/vapora-backend -n vapora --since=4h --timestamps | grep ERROR | tail -5
# Should show: Few to no errors

# 4. Performance Check
kubectl top pods -n vapora | tail -5
# CPU/Memory should be in normal range
</code></pre>
<h3 id="end-of-day-check-before-shift-end"><a class="header" href="#end-of-day-check-before-shift-end">End-of-Day Check (Before Shift End)</a></h3>
<pre><code class="language-bash"># Summary check before handing off to on-call

echo "=== END OF DAY SUMMARY ==="

# Current status
kubectl get pods -n vapora
kubectl top pods -n vapora

# Any concerning trends?
echo ""
echo "Checking for concerning events..."
kubectl get events -n vapora --sort-by='.lastTimestamp' | grep -i warning

# Any pod restarts?
echo ""
echo "Pod restart status:"
kubectl get pods -n vapora -o jsonpath='{range .items[*]}{.metadata.name}{": "}{.status.containerStatuses[0].restartCount}{"\n"}{end}' | grep -v ": 0"

# Document for next shift
echo ""
echo "Status for on-call: All normal / Issues detected"
</code></pre>
<hr />
<h2 id="dashboard-setup--monitoring"><a class="header" href="#dashboard-setup--monitoring">Dashboard Setup &amp; Monitoring</a></h2>
<h3 id="essential-dashboards-to-monitor"><a class="header" href="#essential-dashboards-to-monitor">Essential Dashboards to Monitor</a></h3>
<p>If you have Grafana/Prometheus, create these dashboards:</p>
<h4 id="1-service-health-dashboard"><a class="header" href="#1-service-health-dashboard">1. Service Health Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>Pod running count (should be stable at expected count)</li>
<li>Pod restart count (should not increase rapidly)</li>
<li>Service endpoint availability (should be &gt;99%)</li>
<li>API response time (p99, track trends)</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>Pod count drops below expected</li>
<li>Restart count increasing</li>
<li>Endpoints empty</li>
<li>Response time &gt;2s</li>
</ul>
<h4 id="2-resource-utilization-dashboard"><a class="header" href="#2-resource-utilization-dashboard">2. Resource Utilization Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>CPU usage per pod</li>
<li>Memory usage per pod</li>
<li>Node capacity (CPU, memory, disk)</li>
<li>Network I/O</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>Any pod &gt;80% CPU or &gt;85% memory</li>
<li>Any node &gt;85% capacity</li>
<li>Memory trending upward consistently</li>
</ul>
<h4 id="3-error-rate-dashboard"><a class="header" href="#3-error-rate-dashboard">3. Error Rate Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>4xx error rate (should be low)</li>
<li>5xx error rate (should be minimal)</li>
<li>Error rate by endpoint</li>
<li>Error rate by service</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>5xx error rate &gt;5%</li>
<li>4xx error rate &gt;10%</li>
<li>Sudden spike in errors</li>
</ul>
<h4 id="4-application-metrics-dashboard"><a class="header" href="#4-application-metrics-dashboard">4. Application Metrics Dashboard</a></h4>
<p>Monitor:</p>
<ul>
<li>Request rate (RPS)</li>
<li>Request latency (p50, p95, p99)</li>
<li>Active connections</li>
<li>Database query time</li>
</ul>
<p><strong>Alert if:</strong></p>
<ul>
<li>Request rate suddenly drops (might indicate outage)</li>
<li>Latency spikes above baseline</li>
<li>Database queries slow</li>
</ul>
<h3 id="grafana-setup-example"><a class="header" href="#grafana-setup-example">Grafana Setup Example</a></h3>
<pre><code># If setting up Grafana monitoring
1. Deploy Prometheus scraping Kubernetes metrics
2. Create dashboard with above panels
3. Set alert rules:
   - CPU &gt;80%: Warning
   - Memory &gt;85%: Warning
   - Error rate &gt;5%: Critical
   - Pod crashed: Critical
   - Response time &gt;2s: Warning

4. Configure notifications to Slack/email
</code></pre>
<hr />
<h2 id="alert-response-procedures"><a class="header" href="#alert-response-procedures">Alert Response Procedures</a></h2>
<h3 id="when-alert-fires"><a class="header" href="#when-alert-fires">When Alert Fires</a></h3>
<pre><code>Alert Received
    ↓
Step 1: Verify it's real (not false alarm)
  - Check dashboard
  - Check manually (curl endpoints, kubectl get pods)
  - Ask in #deployments if unsure

Step 2: Assess severity
  - Service completely down? Severity 1
  - Service partially degraded? Severity 2
  - Warning/trending issue? Severity 3

Step 3: Declare incident (if Severity 1-2)
  - Create #incident channel
  - Follow Incident Response Runbook
  - See: incident-response-runbook.md

Step 4: Investigate (if Severity 3)
  - Document in ticket
  - Schedule investigation
  - Monitor for escalation
</code></pre>
<h3 id="common-alerts--actions"><a class="header" href="#common-alerts--actions">Common Alerts &amp; Actions</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Alert</th><th>Cause</th><th>Response</th></tr></thead><tbody>
<tr><td><strong>Pod CrashLoopBackOff</strong></td><td>App crashing</td><td>Get logs, fix, restart</td></tr>
<tr><td><strong>High CPU &gt;80%</strong></td><td>Resource exhausted</td><td>Scale up or reduce load</td></tr>
<tr><td><strong>High Memory &gt;85%</strong></td><td>Memory leak or surge</td><td>Investigate or restart</td></tr>
<tr><td><strong>Error rate spike</strong></td><td>App issue</td><td>Check logs, might rollback</td></tr>
<tr><td><strong>Response time spike</strong></td><td>Slow queries/I/O</td><td>Check database, might restart</td></tr>
<tr><td><strong>Pod pending</strong></td><td>Can't schedule</td><td>Check node resources</td></tr>
<tr><td><strong>Endpoints empty</strong></td><td>Service down</td><td>Verify service exists; check its selector and pod readiness</td></tr>
<tr><td><strong>Disk full</strong></td><td>Storage exhausted</td><td>Clean up or expand</td></tr>
</tbody></table>
</div>
<hr />
<h2 id="metric-baselines--trends"><a class="header" href="#metric-baselines--trends">Metric Baselines &amp; Trends</a></h2>
<h3 id="establishing-baselines"><a class="header" href="#establishing-baselines">Establishing Baselines</a></h3>
<p>Record these metrics during normal operation:</p>
<pre><code># CPU per pod (typical)
Backend:    200-400m per pod
Agents:     300-500m per pod
LLM Router: 100-200m per pod

# Memory per pod (typical)
Backend:    256-512Mi per pod
Agents:     128-256Mi per pod
LLM Router: 64-128Mi per pod

# Response time (typical)
Backend:    p50: 50ms, p95: 200ms, p99: 500ms
Frontend:   Load time: 2-3 seconds

# Error rate (typical)
Backend:    4xx: &lt;1%, 5xx: &lt;0.1%
Frontend:   &lt;5% user-visible errors

# Pod restart count
Should remain 0 (no restarts expected in normal operation)
</code></pre>
<h3 id="detecting-anomalies"><a class="header" href="#detecting-anomalies">Detecting Anomalies</a></h3>
<p>Compare current metrics to baseline:</p>
<pre><code># If CPU 2x normal:
- Check if load increased
- Check for resource leak
- Monitor for further increase

# If Memory increasing:
- Might indicate memory leak
- Monitor over time (1-2 hours)
- Restart if clearly trending up

# If Error rate 10x:
- Something broke recently
- Check recent deployment
- Consider rollback

# If new process consuming resources:
- Identify the new resource consumer
- Investigate purpose
- Kill if unintended
</code></pre>
<hr />
<h2 id="capacity-planning"><a class="header" href="#capacity-planning">Capacity Planning</a></h2>
<h3 id="when-to-scale"><a class="header" href="#when-to-scale">When to Scale</a></h3>
<p>Monitor trends and plan ahead:</p>
<pre><code># Trigger capacity planning if:
- Average CPU &gt;60%
- Average Memory &gt;60%
- Peak usage trending upward
- Disk usage &gt;80%

# Questions to ask:
- Is traffic increasing? Seasonal spike?
- Did we add features? New workload?
- Do we have capacity for growth?
- Should we scale now or wait?
</code></pre>
<h3 id="scaling-actions"><a class="header" href="#scaling-actions">Scaling Actions</a></h3>
<pre><code class="language-bash"># Quick scale (temporary):
kubectl scale deployment/vapora-backend --replicas=5 -n vapora

# Permanent scale (update deployment.yaml):
# Edit: replicas: 5
# Apply: kubectl apply -f deployment.yaml

# Add nodes (infrastructure):
# Contact infrastructure team

# Reduce resource consumption:
# Investigate slow queries, memory leaks, etc.
</code></pre>
<hr />
<h2 id="log-analysis--troubleshooting"><a class="header" href="#log-analysis--troubleshooting">Log Analysis &amp; Troubleshooting</a></h2>
<h3 id="checking-logs"><a class="header" href="#checking-logs">Checking Logs</a></h3>
<pre><code class="language-bash"># Most recent logs
kubectl logs deployment/vapora-backend -n vapora

# Last N lines
kubectl logs deployment/vapora-backend -n vapora --tail=100

# From specific time
kubectl logs deployment/vapora-backend -n vapora --since=1h

# Follow/tail logs
kubectl logs deployment/vapora-backend -n vapora -f

# From specific pod
kubectl logs pod-name -n vapora

# Previous pod (if crashed)
kubectl logs pod-name -n vapora --previous
</code></pre>
<h3 id="log-patterns-to-watch-for"><a class="header" href="#log-patterns-to-watch-for">Log Patterns to Watch For</a></h3>
<pre><code class="language-bash"># Error patterns
kubectl logs deployment/vapora-backend -n vapora | grep -i "error\|exception\|fatal"

# Database issues
kubectl logs deployment/vapora-backend -n vapora | grep -i "database\|connection\|sql"

# Authentication issues
kubectl logs deployment/vapora-backend -n vapora | grep -i "auth\|permission\|forbidden"

# Resource issues
kubectl logs deployment/vapora-backend -n vapora | grep -i "memory\|cpu\|timeout"

# Startup issues (if pod restarting)
kubectl logs pod-name -n vapora --previous | head -50
</code></pre>
<h3 id="common-log-messages--meaning"><a class="header" href="#common-log-messages--meaning">Common Log Messages &amp; Meaning</a></h3>
<div class="table-wrapper"><table><thead><tr><th>Log Message</th><th>Meaning</th><th>Action</th></tr></thead><tbody>
<tr><td><code>Connection refused</code></td><td>Service not listening</td><td>Check if service started</td></tr>
<tr><td><code>Out of memory</code></td><td>Memory exhausted</td><td>Increase limits or scale</td></tr>
<tr><td><code>Unauthorized</code></td><td>Auth failed</td><td>Check credentials/tokens</td></tr>
<tr><td><code>Database connection timeout</code></td><td>Database unreachable</td><td>Check DB health</td></tr>
<tr><td><code>404 Not Found</code></td><td>Endpoint doesn't exist</td><td>Check API routes</td></tr>
<tr><td><code>Slow query</code></td><td>Database query taking time</td><td>Optimize query or check DB</td></tr>
</tbody></table>
</div>
<hr />
<h2 id="proactive-monitoring-practices"><a class="header" href="#proactive-monitoring-practices">Proactive Monitoring Practices</a></h2>
<h3 id="weekly-review"><a class="header" href="#weekly-review">Weekly Review</a></h3>
<pre><code># Every Monday (or your weekly cadence):

1. Review incidents from past week
   - Were any preventable?
   - Any patterns?

2. Check alert tuning
   - False alarms?
   - Missed issues?
   - Adjust thresholds if needed

3. Capacity check
   - How much headroom remaining?
   - Plan for growth?

4. Log analysis
   - Any concerning patterns?
   - Warnings that should be errors?

5. Update runbooks if needed
</code></pre>
<h3 id="monthly-review"><a class="header" href="#monthly-review">Monthly Review</a></h3>
<pre><code># First of each month:

1. Performance trends
   - Response time trending up/down?
   - Error rate changing?
   - Resource usage changing?

2. Capacity forecast
   - Extrapolate current trends
   - Plan for growth
   - Schedule scaling if needed

3. Incident review
   - MTBF (Mean Time Between Failures)
   - MTTR (Mean Time To Resolve)
   - MTTI (Mean Time To Identify)
   - Are we improving?

4. Tool/alert improvements
   - New monitoring needs?
   - Alert fatigue issues?
   - Better ways to visualize data?
</code></pre>
<hr />
<h2 id="health-check-checklist"><a class="header" href="#health-check-checklist">Health Check Checklist</a></h2>
<h3 id="pre-deployment-health-check"><a class="header" href="#pre-deployment-health-check">Pre-Deployment Health Check</a></h3>
<pre><code>Before any deployment, verify:
☐ All pods running: kubectl get pods
☐ No recent errors: kubectl logs --since=1h
☐ Resource usage normal: kubectl top pods
☐ Services healthy: curl /health
☐ Recent events normal: kubectl get events
</code></pre>
<h3 id="post-deployment-health-check"><a class="header" href="#post-deployment-health-check">Post-Deployment Health Check</a></h3>
<pre><code>After deployment, verify for 2 hours:
☐ All new pods running
☐ Old pods terminated
☐ Health endpoints responding
☐ No spike in error logs
☐ Resource usage within expected range
☐ Response time normal
☐ No pod restarts
</code></pre>
<h3 id="daily-health-check"><a class="header" href="#daily-health-check">Daily Health Check</a></h3>
<pre><code>Once per business day:
☐ kubectl get pods (all Running, 1/1 Ready)
☐ curl http://localhost:8001/health (200 OK)
☐ kubectl logs --since=24h | grep ERROR (few to none)
☐ kubectl top pods (normal usage)
☐ kubectl get events (no warnings)
</code></pre>
<hr />
<h2 id="monitoring-runbook-checklist"><a class="header" href="#monitoring-runbook-checklist">Monitoring Runbook Checklist</a></h2>
<pre><code>☐ Verified automated health checks running
☐ Manual health checks performed (daily)
☐ Dashboards set up and visible
☐ Alert thresholds tuned
☐ Log patterns identified
☐ Baselines recorded
☐ Escalation procedures understood
☐ Team trained on monitoring
☐ Alert responses tested
☐ Runbooks up to date
</code></pre>
<hr />
<h2 id="common-monitoring-issues"><a class="header" href="#common-monitoring-issues">Common Monitoring Issues</a></h2>
<h3 id="false-alerts"><a class="header" href="#false-alerts">False Alerts</a></h3>
<p><strong>Problem</strong>: Alert fires but service is actually fine</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Verify manually (don't just assume false)</li>
<li>Check alert threshold (might be too sensitive)</li>
<li>Adjust threshold if consistently false</li>
<li>Document the change</li>
</ol>
<h3 id="alert-fatigue"><a class="header" href="#alert-fatigue">Alert Fatigue</a></h3>
<p><strong>Problem</strong>: Too many alerts, getting ignored</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Review all alerts</li>
<li>Disable/adjust non-actionable ones</li>
<li>Consolidate related alerts</li>
<li>Focus on critical-only alerts</li>
</ol>
<h3 id="missing-alerts"><a class="header" href="#missing-alerts">Missing Alerts</a></h3>
<p><strong>Problem</strong>: Issue happens but no alert fired</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Investigate why alert didn't fire</li>
<li>Check alert condition</li>
<li>Add new alert for this issue</li>
<li>Test the new alert</li>
</ol>
<h3 id="lag-in-monitoring"><a class="header" href="#lag-in-monitoring">Lag in Monitoring</a></h3>
<p><strong>Problem</strong>: Dashboard/alerts slow to update</p>
<p><strong>Solution</strong>:</p>
<ol>
<li>Check monitoring system performance</li>
<li>Increase scrape frequency if appropriate</li>
<li>Reduce data retention if storage issue</li>
<li>Investigate database performance</li>
</ol>
<hr />
<h2 id="monitoring-tools--commands"><a class="header" href="#monitoring-tools--commands">Monitoring Tools &amp; Commands</a></h2>
<h3 id="kubectl-commands"><a class="header" href="#kubectl-commands">kubectl Commands</a></h3>
<pre><code class="language-bash"># Pod monitoring
kubectl get pods -n vapora
kubectl get pods -n vapora -w        # Watch mode
kubectl describe pod &lt;pod&gt; -n vapora
kubectl logs &lt;pod&gt; -n vapora -f

# Resource monitoring
kubectl top nodes
kubectl top pods -n vapora
kubectl describe nodes

# Event monitoring
kubectl get events -n vapora --sort-by='.lastTimestamp'
kubectl get events -n vapora --watch

# Health checks
kubectl get --raw='/readyz?verbose' # Kubernetes API server health (/healthz is deprecated)
</code></pre>
<h3 id="useful-commands"><a class="header" href="#useful-commands">Useful Commands</a></h3>
<pre><code class="language-bash"># Check API responsiveness
curl -v http://localhost:8001/health

# Check all endpoints have pods
for svc in backend agents llm-router; do
  echo "$svc endpoints:"
  kubectl get endpoints vapora-$svc -n vapora
done

# Monitor pod restarts
watch 'kubectl get pods -n vapora -o jsonpath="{range .items[*]}{.metadata.name}{\" \"}{.status.containerStatuses[0].restartCount}{\"\\n\"}{end}"'

# Find pods with high restarts
kubectl get pods -n vapora -o json | jq '.items[] | select(.status.containerStatuses[0].restartCount &gt; 5) | .metadata.name'
</code></pre>
<hr />
<h2 id="next-steps"><a class="header" href="#next-steps">Next Steps</a></h2>
<ol>
<li><strong>Set up dashboards</strong> - Create Grafana/Prometheus dashboards if not available</li>
<li><strong>Configure alerts</strong> - Set thresholds based on baselines</li>
<li><strong>Test alerting</strong> - Verify Slack/email notifications work</li>
<li><strong>Train team</strong> - Ensure everyone knows how to read dashboards</li>
<li><strong>Document baselines</strong> - Record normal metrics for comparison</li>
<li><strong>Automate checks</strong> - Use CI/CD health check pipelines</li>
<li><strong>Review regularly</strong> - Weekly/monthly health check reviews</li>
</ol>
<hr />
<p><strong>Last Updated</strong>: 2026-01-12</p>
<p><strong>Status</strong>: Production-ready</p>

                    </main>

                    <nav class="nav-wrapper" aria-label="Page navigation">
                        <!-- Mobile navigation buttons -->
                            <a rel="prev" href="../../operations/pre-deployment-checklist.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                                <i class="fa fa-angle-left"></i>
                            </a>

                            <a rel="next prefetch" href="../../operations/on-call-procedures.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
                                <i class="fa fa-angle-right"></i>
                            </a>

                        <div style="clear: both"></div>
                    </nav>
                </div>
            </div>

            <nav class="nav-wide-wrapper" aria-label="Page navigation">
                    <a rel="prev" href="../../operations/pre-deployment-checklist.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
                        <i class="fa fa-angle-left"></i>
                    </a>

                    <a rel="next prefetch" href="../../operations/on-call-procedures.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
                        <i class="fa fa-angle-right"></i>
                    </a>
            </nav>

        </div>


        <script>
            // NOTE(review): global flag presumably read by the scripts loaded below
            // (e.g. book.js) to enable copy buttons on code blocks; confirm there.
            window.playground_copyable = true;
        </script>


        <script src="../elasticlunr.min.js"></script>
        <script src="../mark.min.js"></script>
        <script src="../searcher.js"></script>

        <script src="../clipboard.min.js"></script>
        <script src="../highlight.js"></script>
        <script src="../book.js"></script>

        <!-- Custom JS scripts -->


    </div>
    </body>
</html>