851 lines
26 KiB
Bash
851 lines
26 KiB
Bash
![]() |
#!/bin/bash

# Monitoring and Observability Script
# Comprehensive monitoring, logging, and alerting tools
#
# NOTE: the stray `|` / `||` lines that were interleaved with the code
# (table-extraction debris) have been removed; they made the script
# unparseable.

# Abort on the first unhandled command failure.
set -e

# Colors for output (ANSI escape sequences, rendered by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# Script directory (absolute), and the project root two levels above it
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Change to project root so relative paths (logs/, monitoring/, output dir)
# resolve against it
cd "$PROJECT_ROOT"
|
||
|
|
||
|
# Logging functions
|
||
|
# Print an informational message to stdout, prefixed with a green [INFO] tag.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log() {
    echo -e "${GREEN}[INFO]${NC} $1"
}
|
||
|
|
||
|
# Print a warning message to stdout, prefixed with a yellow [WARN] tag.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_warn() {
    echo -e "${YELLOW}[WARN]${NC} $1"
}
|
||
|
|
||
|
# Print an error message, prefixed with a red [ERROR] tag.
# Output goes to stdout (not stderr), as in the rest of this script.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_error() {
    echo -e "${RED}[ERROR]${NC} $1"
}
|
||
|
|
||
|
# Print a success message to stdout, prefixed with a green [SUCCESS] tag.
# Arguments: $1 - message text (backslash escapes are interpreted by echo -e)
log_success() {
    echo -e "${GREEN}[SUCCESS]${NC} $1"
}
|
||
|
|
||
|
# Print a bold blue section header of the form "=== title ===".
# Arguments: $1 - header title
print_header() {
    echo -e "${BLUE}${BOLD}=== $1 ===${NC}"
}
|
||
|
|
||
|
# Print a cyan sub-section header of the form "--- title ---".
# Arguments: $1 - sub-header title
print_subheader() {
    echo -e "${CYAN}--- $1 ---${NC}"
}
|
||
|
|
||
|
# Default values
|
||
|
# Where monitoring data and reports are written (relative to project root)
OUTPUT_DIR="monitoring_data"

# Target application endpoint
HOST="localhost"
PORT="3030"
PROTOCOL="http"

# Ports of the metrics endpoint and the monitoring stack
METRICS_PORT="3030"
GRAFANA_PORT="3000"
PROMETHEUS_PORT="9090"

# Sampling interval and total monitoring duration, in seconds
INTERVAL=5
DURATION=300

# Output verbosity flags ("true" / "false")
QUIET=false
VERBOSE=false

# Alert thresholds: CPU / memory / disk are compared against usage
# percentages in monitor_resources.
ALERT_THRESHOLD_CPU=80
ALERT_THRESHOLD_MEMORY=85
ALERT_THRESHOLD_DISK=90
# NOTE(review): unit presumably milliseconds; not used in the visible code.
ALERT_THRESHOLD_RESPONSE_TIME=1000
|
||
|
|
||
|
print_usage() {
|
||
|
echo -e "${BOLD}Monitoring and Observability Tool${NC}"
|
||
|
echo
|
||
|
echo "Usage: $0 <command> [options]"
|
||
|
echo
|
||
|
echo -e "${BOLD}Commands:${NC}"
|
||
|
echo
|
||
|
echo -e "${CYAN}monitor${NC} Real-time monitoring"
|
||
|
echo " health Monitor application health"
|
||
|
echo " metrics Monitor application metrics"
|
||
|
echo " logs Monitor application logs"
|
||
|
echo " performance Monitor performance metrics"
|
||
|
echo " resources Monitor system resources"
|
||
|
echo " database Monitor database performance"
|
||
|
echo " network Monitor network metrics"
|
||
|
echo " errors Monitor error rates"
|
||
|
echo " custom Custom monitoring dashboard"
|
||
|
echo " all Monitor all metrics"
|
||
|
echo
|
||
|
echo -e "${CYAN}alerts${NC} Alert management"
|
||
|
echo " setup Setup alerting rules"
|
||
|
echo " test Test alert notifications"
|
||
|
echo " check Check alert conditions"
|
||
|
echo " history View alert history"
|
||
|
echo " silence Silence alerts"
|
||
|
echo " config Configure alert rules"
|
||
|
echo
|
||
|
echo -e "${CYAN}logs${NC} Log management"
|
||
|
echo " view View application logs"
|
||
|
echo " search Search logs"
|
||
|
echo " analyze Analyze log patterns"
|
||
|
echo " export Export logs"
|
||
|
echo " rotate Rotate log files"
|
||
|
echo " clean Clean old logs"
|
||
|
echo " tail Tail live logs"
|
||
|
echo
|
||
|
echo -e "${CYAN}metrics${NC} Metrics collection"
|
||
|
echo " collect Collect metrics"
|
||
|
echo " export Export metrics"
|
||
|
echo " dashboard Open metrics dashboard"
|
||
|
echo " custom Custom metrics collection"
|
||
|
echo " business Business metrics"
|
||
|
echo " technical Technical metrics"
|
||
|
echo
|
||
|
echo -e "${CYAN}dashboard${NC} Dashboard management"
|
||
|
echo " start Start monitoring dashboard"
|
||
|
echo " stop Stop monitoring dashboard"
|
||
|
echo " status Dashboard status"
|
||
|
echo " config Configure dashboards"
|
||
|
echo " backup Backup dashboard configs"
|
||
|
echo " restore Restore dashboard configs"
|
||
|
echo
|
||
|
echo -e "${CYAN}reports${NC} Monitoring reports"
|
||
|
echo " generate Generate monitoring report"
|
||
|
echo " health Health status report"
|
||
|
echo " performance Performance report"
|
||
|
echo " availability Availability report"
|
||
|
echo " trends Trend analysis report"
|
||
|
echo " sla SLA compliance report"
|
||
|
echo
|
||
|
echo -e "${CYAN}tools${NC} Monitoring tools"
|
||
|
echo " setup Setup monitoring tools"
|
||
|
echo " install Install monitoring stack"
|
||
|
echo " configure Configure monitoring"
|
||
|
echo " test Test monitoring setup"
|
||
|
echo " doctor Check monitoring health"
|
||
|
echo
|
||
|
echo -e "${BOLD}Options:${NC}"
|
||
|
echo " -h, --host HOST Target host [default: $HOST]"
|
||
|
echo " -p, --port PORT Target port [default: $PORT]"
|
||
|
echo " --protocol PROTO Protocol (http/https) [default: $PROTOCOL]"
|
||
|
echo " -i, --interval SEC Monitoring interval [default: $INTERVAL]"
|
||
|
echo " -d, --duration SEC Monitoring duration [default: $DURATION]"
|
||
|
echo " -o, --output DIR Output directory [default: $OUTPUT_DIR]"
|
||
|
echo " --quiet Suppress verbose output"
|
||
|
echo " --verbose Enable verbose output"
|
||
|
echo " --help Show this help message"
|
||
|
echo
|
||
|
echo -e "${BOLD}Examples:${NC}"
|
||
|
echo " $0 monitor health # Monitor application health"
|
||
|
echo " $0 monitor all -i 10 -d 600 # Monitor all metrics for 10 minutes"
|
||
|
echo " $0 alerts check # Check alert conditions"
|
||
|
echo " $0 logs tail # Tail live logs"
|
||
|
echo " $0 dashboard start # Start monitoring dashboard"
|
||
|
echo " $0 reports generate # Generate monitoring report"
|
||
|
}
|
||
|
|
||
|
# Check if required tools are available
|
||
|
# Verify that the external tools this script depends on are installed.
# Globals:  none
# Outputs:  on failure, an error line listing the missing tools plus a hint
# Returns:  0 when curl, jq and bc are all on PATH; otherwise EXITS the
#           script with status 1.
check_tools() {
    local tool
    local missing_tools=()

    # One loop instead of three copy-pasted checks; order is preserved so
    # the error message lists tools as before.
    for tool in curl jq bc; do
        if ! command -v "$tool" >/dev/null 2>&1; then
            missing_tools+=("$tool")
        fi
    done

    if [ ${#missing_tools[@]} -gt 0 ]; then
        log_error "Missing required tools: ${missing_tools[*]}"
        echo "Please install the missing tools before running monitoring."
        exit 1
    fi
}
|
||
|
|
||
|
# Setup output directory
|
||
|
# Create the output directory if it does not exist yet.
# Globals:  OUTPUT_DIR (read)
# Outputs:  an info line when the directory had to be created
setup_output_dir() {
    if [ ! -d "$OUTPUT_DIR" ]; then
        mkdir -p "$OUTPUT_DIR"
        log "Created output directory: $OUTPUT_DIR"
    fi
}
|
||
|
|
||
|
# Get current timestamp
|
||
|
# Print the current local time as YYYYMMDD_HHMMSS (used in output file names).
get_timestamp() {
    date +%Y%m%d_%H%M%S
}
|
||
|
|
||
|
# Check if application is running
|
||
|
# Probe the application's /health endpoint once.
# Globals:  PROTOCOL, HOST, PORT (read)
# Outputs:  an error line when the probe fails
# Returns:  0 when curl gets a successful (non-error HTTP) response,
#           1 otherwise
check_application() {
    local url="${PROTOCOL}://${HOST}:${PORT}/health"

    # -f makes curl fail on HTTP error statuses; -s keeps it silent.
    if ! curl -f -s "$url" >/dev/null 2>&1; then
        log_error "Application is not running at $url"
        return 1
    fi

    return 0
}
|
||
|
|
||
|
# Monitor application health
|
||
|
# Poll the application's /health endpoint for DURATION seconds, every
# INTERVAL seconds, and record each probe as an object in a JSON array file.
#
# Globals (read): OUTPUT_DIR, PROTOCOL, HOST, PORT, INTERVAL, DURATION, QUIET
# Outputs: progress line (unless QUIET is "true"), a summary, and
#          $OUTPUT_DIR/health_monitor_<timestamp>.json
#
# Fixes relative to the previous version:
#   * Uptime is computed with integer arithmetic (hundredths of a percent).
#     The old code compared a bc decimal such as "99.50" with `[ ... -ge 99 ]`,
#     which is an integer-only test, so it always fell through to "Poor".
#   * No division by zero when the loop performs no checks (e.g. DURATION=0).
#   * Response time comes from curl's own %{time_total} instead of
#     `date +%s.%N` + bc; bc printed sub-second values as ".05", which is not
#     a valid JSON number, so jq rejected the entry and it was silently lost.
#   * The response body is only embedded when it parses as JSON and is
#     non-empty; entries are passed to jq with --argjson rather than being
#     interpolated into the jq program text.
#   * `local` declarations are separated from command substitutions.
monitor_health() {
    print_header "Health Monitoring"

    local timestamp output_file url
    timestamp=$(get_timestamp)
    output_file="$OUTPUT_DIR/health_monitor_$timestamp.json"
    url="${PROTOCOL}://${HOST}:${PORT}/health"

    log "Starting health monitoring..."
    log "URL: $url"
    log "Interval: ${INTERVAL}s"
    log "Duration: ${DURATION}s"

    local start_time end_time
    start_time=$(date +%s)
    end_time=$((start_time + DURATION))

    local health_checks=0 healthy_checks=0 unhealthy_checks=0
    local check_time raw meta http_code response_time response_body
    local status parsed_response new_entry
    local uptime_x100 uptime_percentage
    local number_re='^[0-9]+(\.[0-9]+)?$'

    echo "[]" > "$output_file"

    while [ "$(date +%s)" -lt "$end_time" ]; do
        check_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

        # curl appends "<newline><http_code> <time_total>" after the body.
        if raw=$(curl -f -s -w '\n%{http_code} %{time_total}' "$url" 2>/dev/null); then
            meta=${raw##*$'\n'}
            response_body=${raw%$'\n'*}
            http_code=${meta%% *}
            response_time=${meta##* }

            # Keep only values that are safe to emit as JSON numbers.
            if [[ "$http_code" =~ ^[0-9]+$ ]]; then
                http_code=$((10#$http_code))
            else
                http_code=0
            fi
            [[ "$response_time" =~ $number_re ]] || response_time=0

            if [ "$http_code" = "200" ]; then
                healthy_checks=$((healthy_checks + 1))
                status="healthy"
            else
                unhealthy_checks=$((unhealthy_checks + 1))
                status="unhealthy"
            fi

            # Embed the body only if it is valid JSON; otherwise store null.
            # Slurping yields null for an empty body instead of empty output.
            parsed_response=$(jq -c -s '.[0]' <<<"$response_body" 2>/dev/null) \
                || parsed_response="null"
            [ -n "$parsed_response" ] || parsed_response="null"

            new_entry=$(printf '{"timestamp": "%s", "status": "%s", "http_code": %s, "response_time": %s, "response": %s}' \
                "$check_time" "$status" "$http_code" "$response_time" "$parsed_response")
        else
            unhealthy_checks=$((unhealthy_checks + 1))
            new_entry=$(printf '{"timestamp": "%s", "status": "unhealthy", "http_code": 0, "response_time": 0, "response": null, "error": "Connection failed"}' \
                "$check_time")
        fi

        # Append the entry; write via a temp file so a jq failure cannot
        # truncate the existing log.
        if jq --argjson entry "$new_entry" '. += [$entry]' "$output_file" > "${output_file}.tmp"; then
            mv "${output_file}.tmp" "$output_file"
        else
            rm -f "${output_file}.tmp"
            log_warn "Failed to record health check in $output_file"
        fi

        health_checks=$((health_checks + 1))

        if [[ "$QUIET" != true ]]; then
            uptime_x100=$((healthy_checks * 10000 / health_checks))
            printf -v uptime_percentage '%d.%02d' $((uptime_x100 / 100)) $((uptime_x100 % 100))
            echo -ne "\rHealth checks: $health_checks | Healthy: $healthy_checks | Unhealthy: $unhealthy_checks | Uptime: ${uptime_percentage}%"
        fi

        sleep "$INTERVAL"
    done

    echo # New line after progress

    # Uptime in hundredths of a percent; 0 when nothing was measured.
    local final_uptime_x100=0 final_uptime
    if [ "$health_checks" -gt 0 ]; then
        final_uptime_x100=$((healthy_checks * 10000 / health_checks))
    fi
    printf -v final_uptime '%d.%02d' $((final_uptime_x100 / 100)) $((final_uptime_x100 % 100))

    print_subheader "Health Monitoring Results"
    echo "Total checks: $health_checks"
    echo "Healthy checks: $healthy_checks"
    echo "Unhealthy checks: $unhealthy_checks"
    echo "Uptime: ${final_uptime}%"
    echo "Report saved to: $output_file"

    if [ "$health_checks" -eq 0 ]; then
        log_warn "No health checks were performed (duration: ${DURATION}s)"
    elif [ "$final_uptime_x100" -ge 9900 ]; then
        log_success "Excellent health status (${final_uptime}% uptime)"
    elif [ "$final_uptime_x100" -ge 9500 ]; then
        log_warn "Good health status (${final_uptime}% uptime)"
    else
        log_error "Poor health status (${final_uptime}% uptime)"
    fi
}
|
||
|
|
||
|
# Monitor application metrics
|
||
|
# Scrape the Prometheus-format /metrics endpoint for DURATION seconds, every
# INTERVAL seconds, and record three values per sample in a JSON array file.
#
# Globals (read): OUTPUT_DIR, PROTOCOL, HOST, METRICS_PORT, INTERVAL,
#                 DURATION, QUIET
# Outputs: progress line (unless QUIET is "true") and
#          $OUTPUT_DIR/metrics_monitor_<timestamp>.json
#
# For each metric the value is the second whitespace-separated field of the
# FIRST line starting with the metric name.
#
# Fix: the old `grep | head | awk || echo "0"` fallback never fired, because
# the pipeline's status is awk's. A missing metric therefore produced
# `"name": ,` (invalid JSON) and the whole sample was silently dropped by jq.
# Values are now validated and default to 0, and the entry is passed to jq
# with --argjson.
monitor_metrics() {
    print_header "Metrics Monitoring"

    local timestamp output_file url
    timestamp=$(get_timestamp)
    output_file="$OUTPUT_DIR/metrics_monitor_$timestamp.json"
    url="${PROTOCOL}://${HOST}:${METRICS_PORT}/metrics"

    log "Starting metrics monitoring..."
    log "URL: $url"
    log "Interval: ${INTERVAL}s"
    log "Duration: ${DURATION}s"

    local start_time end_time
    start_time=$(date +%s)
    end_time=$((start_time + DURATION))

    local check_time metrics_response new_entry
    local http_requests response_time active_connections
    # Matches exactly what JSON accepts as a number.
    local json_number_re='^-?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][-+]?[0-9]+)?$'

    echo "[]" > "$output_file"

    while [ "$(date +%s)" -lt "$end_time" ]; do
        check_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

        if metrics_response=$(curl -f -s "$url" 2>/dev/null); then
            # Parse Prometheus metrics (first matching line, second field)
            http_requests=$(awk '/^http_requests_total/ { print $2; exit }' <<<"$metrics_response")
            response_time=$(awk '/^http_request_duration_seconds/ { print $2; exit }' <<<"$metrics_response")
            active_connections=$(awk '/^active_connections/ { print $2; exit }' <<<"$metrics_response")

            # Missing or non-numeric (e.g. NaN, +Inf) values become 0.
            [[ "$http_requests" =~ $json_number_re ]] || http_requests=0
            [[ "$response_time" =~ $json_number_re ]] || response_time=0
            [[ "$active_connections" =~ $json_number_re ]] || active_connections=0

            new_entry=$(printf '{"timestamp": "%s", "http_requests_total": %s, "response_time": %s, "active_connections": %s}' \
                "$check_time" "$http_requests" "$response_time" "$active_connections")

            if jq --argjson entry "$new_entry" '. += [$entry]' "$output_file" > "${output_file}.tmp"; then
                mv "${output_file}.tmp" "$output_file"
            else
                rm -f "${output_file}.tmp"
                log_warn "Failed to record metrics sample in $output_file"
            fi

            if [[ "$QUIET" != true ]]; then
                echo -ne "\rHTTP Requests: $http_requests | Response Time: ${response_time}s | Connections: $active_connections"
            fi
        else
            log_warn "Failed to fetch metrics at $(date)"
        fi

        sleep "$INTERVAL"
    done

    echo # New line after progress

    log_success "Metrics monitoring completed. Report saved to: $output_file"
}
|
||
|
|
||
|
# Monitor application logs
|
||
|
# Analyze logs/app.log (relative to the current directory): count lines
# containing ERROR / WARN / INFO and save the counts plus the last 10 error
# and warning lines to a text report.
#
# Globals (read): OUTPUT_DIR
# Outputs: summary on stdout and $OUTPUT_DIR/log_analysis_<timestamp>.txt
# Returns: 1 when the log file does not exist, 0 otherwise
#
# Fixes:
#   * `grep -c PATTERN file || echo "0"` produced "0<newline>0" when nothing
#     matched, because grep -c prints 0 AND exits 1. The later
#     `[ "$count" -gt 0 ]` tests then failed with "integer expression
#     expected".
#   * `grep | tail >> file || echo "No errors found"` never wrote the
#     fallback, because the pipeline status is tail's. The fallback is now
#     driven by the counts.
monitor_logs() {
    print_header "Log Monitoring"

    local log_file="logs/app.log"
    local timestamp output_file
    timestamp=$(get_timestamp)
    output_file="$OUTPUT_DIR/log_analysis_$timestamp.txt"

    if [ ! -f "$log_file" ]; then
        log_error "Log file not found: $log_file"
        return 1
    fi

    log "Monitoring logs from: $log_file"
    log "Analysis will be saved to: $output_file"

    # Analyze log patterns
    log "Analyzing log patterns..."

    # Count error levels. `|| true` only absorbs grep's "no match" status;
    # the count itself is whatever grep printed (defaulting to 0 if nothing).
    local error_count warn_count info_count
    error_count=$(grep -c "ERROR" "$log_file" 2>/dev/null) || true
    warn_count=$(grep -c "WARN" "$log_file" 2>/dev/null) || true
    info_count=$(grep -c "INFO" "$log_file" 2>/dev/null) || true
    error_count=${error_count:-0}
    warn_count=${warn_count:-0}
    info_count=${info_count:-0}

    {
        cat << EOF
Log Analysis Report
Generated: $(date)
Log File: $log_file

=== ERROR ANALYSIS ===
Error Count: $error_count
Warning Count: $warn_count
Info Count: $info_count

=== RECENT ERRORS ===
EOF

        # Show recent errors
        if [ "$error_count" -gt 0 ]; then
            grep "ERROR" "$log_file" 2>/dev/null | tail -10
        else
            echo "No errors found"
        fi

        cat << EOF

=== RECENT WARNINGS ===
EOF

        # Show recent warnings
        if [ "$warn_count" -gt 0 ]; then
            grep "WARN" "$log_file" 2>/dev/null | tail -10
        else
            echo "No warnings found"
        fi
    } > "$output_file"

    print_subheader "Log Analysis Results"
    echo "Errors: $error_count"
    echo "Warnings: $warn_count"
    echo "Info messages: $info_count"
    echo "Full analysis saved to: $output_file"

    if [ "$error_count" -gt 0 ]; then
        log_error "Found $error_count errors in logs"
    elif [ "$warn_count" -gt 0 ]; then
        log_warn "Found $warn_count warnings in logs"
    else
        log_success "No errors or warnings found in logs"
    fi
}
|
||
|
|
||
|
# Monitor system resources
|
||
|
# Sample CPU, memory, root-disk usage and 1-minute load average for DURATION
# seconds, every INTERVAL seconds, append each sample to a JSON array file
# and warn when a usage value exceeds its alert threshold.
#
# Globals (read): OUTPUT_DIR, INTERVAL, DURATION, QUIET,
#                 ALERT_THRESHOLD_CPU, ALERT_THRESHOLD_MEMORY,
#                 ALERT_THRESHOLD_DISK
# Outputs: progress line (unless QUIET is "true"), threshold warnings, and
#          $OUTPUT_DIR/resources_monitor_<timestamp>.json
#
# Fixes:
#   * The `... | awk ... 2>/dev/null || echo "0"` fallbacks never fired (the
#     pipeline status is the last command's). When top/free were missing or
#     printed an unexpected format, the value was empty, the JSON entry was
#     invalid and was silently dropped, and bc reported syntax errors.
#     Every value is now validated and defaults to 0.
#   * stderr is silenced on the producing command, not on the last filter.
#   * Threshold comparisons use awk (already required here) instead of bc.
#   * "load averages:" (BSD/macOS spelling, space separated) is accepted.
monitor_resources() {
    print_header "System Resource Monitoring"

    local timestamp output_file
    timestamp=$(get_timestamp)
    output_file="$OUTPUT_DIR/resources_monitor_$timestamp.json"

    log "Starting system resource monitoring..."
    log "Interval: ${INTERVAL}s"
    log "Duration: ${DURATION}s"

    local start_time end_time
    start_time=$(date +%s)
    end_time=$((start_time + DURATION))

    local check_time new_entry
    local cpu_usage memory_usage disk_usage load_average
    # Non-negative numbers that are also valid JSON numbers.
    local json_number_re='^(0|[1-9][0-9]*)(\.[0-9]+)?$'
    # awk program: exit 0 when value v is strictly greater than threshold t.
    local exceeds='BEGIN { exit !(v + 0 > t + 0) }'

    echo "[]" > "$output_file"

    while [ "$(date +%s)" -lt "$end_time" ]; do
        check_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ")

        # Get system metrics
        # CPU: 100 minus the idle percentage reported by top.
        cpu_usage=$(top -bn1 2>/dev/null \
            | grep "Cpu(s)" \
            | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" \
            | awk '{print 100 - $1}') || true
        # Memory: used / total from the "Mem" row of free.
        memory_usage=$(free 2>/dev/null \
            | awk '/Mem/ && $2 > 0 { printf "%.1f", $3 / $2 * 100.0; exit }') || true
        # Disk: Use% column of the root filesystem (-P keeps it on one line).
        disk_usage=$(df -P / 2>/dev/null | tail -1 | awk '{print $5}' | sed 's/%//') || true
        # Load: first value after "load average(s):".
        load_average=$(uptime 2>/dev/null \
            | awk -F'load averages?: *' '{ split($2, parts, /[, ]+/); print parts[1] }') || true

        [[ "$cpu_usage" =~ $json_number_re ]] || cpu_usage=0
        [[ "$memory_usage" =~ $json_number_re ]] || memory_usage=0
        [[ "$disk_usage" =~ $json_number_re ]] || disk_usage=0
        [[ "$load_average" =~ $json_number_re ]] || load_average=0

        new_entry=$(printf '{"timestamp": "%s", "cpu_usage": %s, "memory_usage": %s, "disk_usage": %s, "load_average": %s}' \
            "$check_time" "$cpu_usage" "$memory_usage" "$disk_usage" "$load_average")

        if jq --argjson entry "$new_entry" '. += [$entry]' "$output_file" > "${output_file}.tmp"; then
            mv "${output_file}.tmp" "$output_file"
        else
            rm -f "${output_file}.tmp"
            log_warn "Failed to record resource sample in $output_file"
        fi

        if [[ "$QUIET" != true ]]; then
            echo -ne "\rCPU: ${cpu_usage}% | Memory: ${memory_usage}% | Disk: ${disk_usage}% | Load: $load_average"
        fi

        # Check alert thresholds
        if awk -v v="$cpu_usage" -v t="$ALERT_THRESHOLD_CPU" "$exceeds"; then
            log_warn "High CPU usage: ${cpu_usage}%"
        fi

        if awk -v v="$memory_usage" -v t="$ALERT_THRESHOLD_MEMORY" "$exceeds"; then
            log_warn "High memory usage: ${memory_usage}%"
        fi

        if awk -v v="$disk_usage" -v t="$ALERT_THRESHOLD_DISK" "$exceeds"; then
            log_warn "High disk usage: ${disk_usage}%"
        fi

        sleep "$INTERVAL"
    done

    echo # New line after progress

    log_success "Resource monitoring completed. Report saved to: $output_file"
}
|
||
|
|
||
|
# Generate monitoring report
|
||
|
# Write a static HTML monitoring report and, when a desktop opener
# (`open` or `xdg-open`) is available, open it.
#
# Globals (read): OUTPUT_DIR
# Outputs: $OUTPUT_DIR/monitoring_report_<timestamp>.html
#
# NOTE: apart from the generation time, every figure in the report is
# hard-coded sample content; nothing is read from collected monitoring data.
#
# Fixes:
#   * "Generated: $(date)" sat inside a quoted heredoc (<< 'EOF'), so the
#     report contained the literal text "$(date)". The date line is now
#     emitted with printf between two literal heredoc sections.
#   * Bare "<" characters in text content are written as "&lt;" so the
#     document is valid HTML.
generate_report() {
    print_header "Monitoring Report Generation"

    local timestamp report_file
    timestamp=$(get_timestamp)
    report_file="$OUTPUT_DIR/monitoring_report_$timestamp.html"

    log "Generating comprehensive monitoring report..."

    {
        cat << 'EOF'
<!DOCTYPE html>
<html>
<head>
    <title>Monitoring Report</title>
    <style>
        body { font-family: Arial, sans-serif; margin: 20px; }
        .header { background: #f0f0f0; padding: 20px; border-radius: 5px; }
        .metric { margin: 10px 0; padding: 10px; border-left: 4px solid #007acc; }
        .good { border-left-color: #28a745; background: #d4edda; }
        .warning { border-left-color: #ffc107; background: #fff3cd; }
        .error { border-left-color: #dc3545; background: #f8d7da; }
        table { border-collapse: collapse; width: 100%; }
        th, td { border: 1px solid #ddd; padding: 8px; text-align: left; }
        th { background-color: #f2f2f2; }
        .dashboard { display: flex; justify-content: space-around; margin: 20px 0; }
        .dashboard-item { text-align: center; padding: 20px; border-radius: 5px; }
        .dashboard-good { background: #d4edda; color: #155724; }
        .dashboard-warning { background: #fff3cd; color: #856404; }
        .dashboard-error { background: #f8d7da; color: #721c24; }
        .chart { height: 200px; background: #f8f9fa; border: 1px solid #dee2e6; margin: 10px 0; display: flex; align-items: center; justify-content: center; }
    </style>
</head>
<body>
    <div class="header">
        <h1>📊 Monitoring Report</h1>
EOF
        # The only dynamic line of the report.
        printf '        <p>Generated: %s</p>\n' "$(date)"
        cat << 'EOF'
        <p>Application: Rustelo</p>
        <p>Environment: Production</p>
    </div>

    <div class="dashboard">
        <div class="dashboard-item dashboard-good">
            <h3>✅ Health</h3>
            <p>99.9% Uptime</p>
        </div>
        <div class="dashboard-item dashboard-good">
            <h3>⚡ Performance</h3>
            <p>&lt; 100ms Response</p>
        </div>
        <div class="dashboard-item dashboard-warning">
            <h3>⚠️ Resources</h3>
            <p>Memory: 75%</p>
        </div>
        <div class="dashboard-item dashboard-good">
            <h3>🔒 Security</h3>
            <p>No Incidents</p>
        </div>
    </div>

    <h2>System Overview</h2>

    <div class="metric good">
        <h3>✅ Application Health</h3>
        <p>Application is running smoothly with 99.9% uptime over the monitoring period.</p>
    </div>

    <div class="metric good">
        <h3>⚡ Performance Metrics</h3>
        <p>Average response time: 85ms | 95th percentile: 150ms | Request rate: 450 req/min</p>
    </div>

    <div class="metric warning">
        <h3>⚠️ Resource Usage</h3>
        <p>Memory usage is at 75% - consider monitoring for potential memory leaks.</p>
    </div>

    <div class="metric good">
        <h3>🗄️ Database Performance</h3>
        <p>Database queries are performing well with average response time of 12ms.</p>
    </div>

    <h2>Performance Charts</h2>

    <div class="chart">
        <p>Response Time Chart (Integration with Grafana/Prometheus would show real charts here)</p>
    </div>

    <div class="chart">
        <p>Resource Usage Chart (CPU, Memory, Disk usage over time)</p>
    </div>

    <h2>Detailed Metrics</h2>
    <table>
        <tr><th>Metric</th><th>Current</th><th>Average</th><th>Threshold</th><th>Status</th></tr>
        <tr><td>CPU Usage</td><td>45%</td><td>38%</td><td>&lt; 80%</td><td>✅ Good</td></tr>
        <tr><td>Memory Usage</td><td>75%</td><td>72%</td><td>&lt; 85%</td><td>⚠️ Warning</td></tr>
        <tr><td>Disk Usage</td><td>65%</td><td>63%</td><td>&lt; 90%</td><td>✅ Good</td></tr>
        <tr><td>Response Time</td><td>85ms</td><td>92ms</td><td>&lt; 500ms</td><td>✅ Good</td></tr>
        <tr><td>Error Rate</td><td>0.1%</td><td>0.2%</td><td>&lt; 1%</td><td>✅ Good</td></tr>
    </table>

    <h2>Alerts and Incidents</h2>
    <ul>
        <li><strong>Warning:</strong> Memory usage approaching threshold (75%)</li>
        <li><strong>Resolved:</strong> Brief CPU spike resolved at 14:30</li>
        <li><strong>Info:</strong> Database maintenance window scheduled for next week</li>
    </ul>

    <h2>Recommendations</h2>
    <ul>
        <li><strong>High Priority:</strong> Monitor memory usage trend and investigate potential leaks</li>
        <li><strong>Medium Priority:</strong> Set up automated scaling for CPU spikes</li>
        <li><strong>Low Priority:</strong> Optimize database queries to reduce response times further</li>
        <li><strong>Ongoing:</strong> Continue monitoring and maintain current alert thresholds</li>
    </ul>

    <h2>Next Steps</h2>
    <ol>
        <li>Investigate memory usage patterns</li>
        <li>Set up automated alerts for memory threshold breaches</li>
        <li>Review application logs for memory-related issues</li>
        <li>Consider implementing memory profiling</li>
    </ol>

    <footer style="margin-top: 40px; padding: 20px; background: #f8f9fa; border-radius: 5px;">
        <p><small>This report was generated by the Rustelo Monitoring System. For real-time monitoring, visit the Grafana dashboard.</small></p>
    </footer>
</body>
</html>
EOF
    } > "$report_file"

    log_success "Monitoring report generated: $report_file"

    # Open the report with the platform's opener when one exists
    # (`open` on macOS, `xdg-open` on most Linux desktops).
    if command -v open >/dev/null 2>&1; then
        log "Opening report in browser..."
        open "$report_file"
    elif command -v xdg-open >/dev/null 2>&1; then
        log "Opening report in browser..."
        xdg-open "$report_file"
    fi
}
|
||
|
|
||
|
# Setup monitoring tools
|
||
|
# Scaffold a local Prometheus + Grafana monitoring stack under ./monitoring
# and create the output and logs directories.
#
# Globals (read): OUTPUT_DIR
# Files written (relative to the current directory):
#   monitoring/prometheus/prometheus.yml
#   monitoring/grafana/dashboard.json
#   monitoring/docker-compose.yml
#
# Fix: the YAML/JSON heredoc bodies had lost all indentation. YAML structure
# is defined by indentation, so the flat files were not valid Prometheus or
# docker-compose configuration. The nesting is restored here.
#
# NOTE: the generated compose file sets the Grafana admin password to the
# well-known default "admin"; change it before exposing the stack.
setup_monitoring() {
    print_header "Setting up Monitoring Tools"

    log "Setting up monitoring infrastructure..."

    # Create monitoring directories
    mkdir -p "$OUTPUT_DIR"
    mkdir -p "logs"
    mkdir -p "monitoring/prometheus"
    mkdir -p "monitoring/grafana"

    # Create basic Prometheus configuration
    cat > "monitoring/prometheus/prometheus.yml" << 'EOF'
global:
  scrape_interval: 15s

scrape_configs:
  - job_name: 'rustelo'
    static_configs:
      - targets: ['localhost:3030']
    metrics_path: '/metrics'
    scrape_interval: 5s

  - job_name: 'node'
    static_configs:
      - targets: ['localhost:9100']
    scrape_interval: 5s
EOF

    # Create basic Grafana dashboard configuration
    cat > "monitoring/grafana/dashboard.json" << 'EOF'
{
  "dashboard": {
    "title": "Rustelo Monitoring",
    "panels": [
      {
        "title": "Request Rate",
        "type": "graph",
        "targets": [
          {
            "expr": "rate(http_requests_total[5m])",
            "legendFormat": "Requests/sec"
          }
        ]
      },
      {
        "title": "Response Time",
        "type": "graph",
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))",
            "legendFormat": "95th percentile"
          }
        ]
      }
    ]
  }
}
EOF

    # Create docker-compose for monitoring stack
    cat > "monitoring/docker-compose.yml" << 'EOF'
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--web.enable-lifecycle'

  grafana:
    image: grafana/grafana:latest
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana-storage:/var/lib/grafana

volumes:
  grafana-storage:
EOF

    log_success "Monitoring setup completed"
    log "Prometheus config: monitoring/prometheus/prometheus.yml"
    log "Grafana dashboard: monitoring/grafana/dashboard.json"
    log "Docker compose: monitoring/docker-compose.yml"
    log ""
    log "To start monitoring stack:"
    log " cd monitoring && docker-compose up -d"
    log ""
    log "Access points:"
    log " Prometheus: http://localhost:9090"
    log " Grafana: http://localhost:3000 (admin/admin)"
}
|
||
|
|
||
|
# Parse command line arguments
|
||
|
# Parse command-line options into the global configuration variables.
#
# Arguments: the script arguments that follow the top-level command, e.g.
#            `all -i 10 -d 600`
# Globals (written): HOST, PORT, PROTOCOL, INTERVAL, DURATION, OUTPUT_DIR,
#                    QUIET, VERBOSE
# Exits: 0 after printing usage for --help; 1 when an option that needs a
#        value is the last argument.
#
# Fixes:
#   * An option given without its value (e.g. a trailing `-i`) made
#     `shift 2` fail: a silent exit under `set -e`, an endless loop without
#     it. It is now reported explicitly.
#   * Parsing used to stop at the first non-option word. Because the
#     sub-command comes first (`monitor all -i 10 -d 600`, as in the usage
#     examples), no option was ever applied. Non-option words are now
#     skipped; the caller still reads them from its own argument list, as
#     this function only shifts its private copy.
parse_arguments() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            -h|--host|-p|--port|--protocol|-i|--interval|-d|--duration|-o|--output)
                if [[ $# -lt 2 ]]; then
                    log_error "Option $1 requires a value"
                    exit 1
                fi
                case "$1" in
                    -h|--host)     HOST="$2" ;;
                    -p|--port)     PORT="$2" ;;
                    --protocol)    PROTOCOL="$2" ;;
                    -i|--interval) INTERVAL="$2" ;;
                    -d|--duration) DURATION="$2" ;;
                    -o|--output)   OUTPUT_DIR="$2" ;;
                esac
                shift 2
                ;;
            --quiet)
                QUIET=true
                shift
                ;;
            --verbose)
                VERBOSE=true
                shift
                ;;
            --help)
                print_usage
                exit 0
                ;;
            *)
                # Sub-command or other positional word: not ours to handle.
                shift
                ;;
        esac
    done
}
|
||
|
|
||
|
# Main execution
|
||
|
main() {
|
||
|
local command="$1"
|
||
|
shift
|
||
|
|
||
|
if [ -z "$command" ]; then
|
||
|
print_usage
|
||
|
exit 1
|
||
|
fi
|
||
|
|
||
|
parse_arguments "$@"
|
||
|
|
||
|
check_tools
|
||
|
setup_output_dir
|
||
|
|
||
|
case "$command" in
|
||
|
"monitor")
|
||
|
local subcommand="$1"
|
||
|
case "$subcommand" in
|
||
|
"health")
|
||
|
check_application && monitor_health
|
||
|
;;
|
||
|
"metrics")
|
||
|
check_application && monitor_metrics
|
||
|
;;
|
||
|
"logs")
|
||
|
monitor_logs
|
||
|
;;
|
||
|
"resources")
|
||
|
monitor_resources
|
||
|
;;
|
||
|
"all")
|
||
|
if check_application; then
|
||
|
monitor_health &
|
||
|
monitor_metrics &
|
||
|
monitor_resources &
|
||
|
wait
|
||
|
fi
|
||
|
;;
|
||
|
*)
|
||
|
log_error "Unknown monitor command: $subcommand"
|
||
|
print_usage
|
||
|
exit 1
|
||
|
;;
|
||
|
esac
|
||
|
;;
|
||
|
"reports")
|
||
|
local subcommand="$1"
|
||
|
case "$subcommand" in
|
||
|
"generate")
|
||
|
generate_report
|
||
|
;;
|
||
|
*)
|
||
|
log_error "Unknown reports command: $subcommand"
|
||
|
print_usage
|
||
|
exit 1
|
||
|
;;
|
||
|
esac
|
||
|
;;
|
||
|
"tools")
|
||
|
local subcommand="$1"
|
||
|
case "$subcommand" in
|
||
|
"setup")
|
||
|
setup_monitoring
|