#!/bin/bash # Monitoring and Observability Script # Comprehensive monitoring, logging, and alerting tools set -e # Colors for output RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' CYAN='\033[0;36m' BOLD='\033[1m' NC='\033[0m' # No Color # Script directory SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)" # Change to project root cd "$PROJECT_ROOT" # Logging functions log() { echo -e "${GREEN}[INFO]${NC} $1" } log_warn() { echo -e "${YELLOW}[WARN]${NC} $1" } log_error() { echo -e "${RED}[ERROR]${NC} $1" } log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1" } print_header() { echo -e "${BLUE}${BOLD}=== $1 ===${NC}" } print_subheader() { echo -e "${CYAN}--- $1 ---${NC}" } # Default values OUTPUT_DIR="monitoring_data" HOST="localhost" PORT="3030" PROTOCOL="http" METRICS_PORT="3030" GRAFANA_PORT="3000" PROMETHEUS_PORT="9090" INTERVAL=5 DURATION=300 QUIET=false VERBOSE=false ALERT_THRESHOLD_CPU=80 ALERT_THRESHOLD_MEMORY=85 ALERT_THRESHOLD_DISK=90 ALERT_THRESHOLD_RESPONSE_TIME=1000 print_usage() { echo -e "${BOLD}Monitoring and Observability Tool${NC}" echo echo "Usage: $0 [options]" echo echo -e "${BOLD}Commands:${NC}" echo echo -e "${CYAN}monitor${NC} Real-time monitoring" echo " health Monitor application health" echo " metrics Monitor application metrics" echo " logs Monitor application logs" echo " performance Monitor performance metrics" echo " resources Monitor system resources" echo " database Monitor database performance" echo " network Monitor network metrics" echo " errors Monitor error rates" echo " custom Custom monitoring dashboard" echo " all Monitor all metrics" echo echo -e "${CYAN}alerts${NC} Alert management" echo " setup Setup alerting rules" echo " test Test alert notifications" echo " check Check alert conditions" echo " history View alert history" echo " silence Silence alerts" echo " config Configure alert rules" echo echo -e "${CYAN}logs${NC} Log management" echo " view View application logs" echo " search Search logs" echo " analyze Analyze log patterns" echo " export Export logs" echo " rotate Rotate log files" echo " clean Clean old logs" echo " tail Tail live logs" echo echo -e "${CYAN}metrics${NC} Metrics collection" echo " collect Collect metrics" echo " export Export metrics" echo " dashboard Open metrics dashboard" echo " custom Custom metrics collection" echo " business Business metrics" echo " technical Technical metrics" echo echo -e "${CYAN}dashboard${NC} Dashboard management" echo " start Start monitoring dashboard" echo " stop Stop monitoring dashboard" echo " status Dashboard status" echo " config Configure dashboards" echo " backup Backup dashboard configs" echo " restore Restore dashboard configs" echo echo -e "${CYAN}reports${NC} Monitoring reports" echo " generate Generate monitoring report" echo " health Health status report" echo " performance Performance report" echo " availability Availability report" echo " trends Trend analysis report" echo " sla SLA compliance report" echo echo -e "${CYAN}tools${NC} Monitoring tools" echo " setup Setup monitoring tools" echo " install Install monitoring stack" echo " configure Configure monitoring" echo " test Test monitoring setup" echo " doctor Check monitoring health" echo echo -e "${BOLD}Options:${NC}" echo " -h, --host HOST Target host [default: $HOST]" echo " -p, --port PORT Target port [default: $PORT]" echo " --protocol PROTO Protocol (http/https) [default: $PROTOCOL]" echo " -i, --interval SEC Monitoring interval [default: $INTERVAL]" echo " -d, --duration SEC Monitoring duration [default: $DURATION]" echo " -o, --output DIR Output directory [default: $OUTPUT_DIR]" echo " --quiet Suppress verbose output" echo " --verbose Enable verbose output" echo " --help Show this help message" echo echo -e "${BOLD}Examples:${NC}" echo " $0 monitor health # Monitor application health" echo " $0 monitor all -i 10 -d 600 # Monitor all metrics for 10 minutes" echo " $0 alerts check # Check alert conditions" echo " $0 logs tail # Tail live logs" echo " $0 dashboard start # Start monitoring dashboard" echo " $0 reports generate # Generate monitoring report" } # Check if required tools are available check_tools() { local missing_tools=() if ! command -v curl >/dev/null 2>&1; then missing_tools+=("curl") fi if ! command -v jq >/dev/null 2>&1; then missing_tools+=("jq") fi if ! command -v bc >/dev/null 2>&1; then missing_tools+=("bc") fi if [ ${#missing_tools[@]} -gt 0 ]; then log_error "Missing required tools: ${missing_tools[*]}" echo "Please install the missing tools before running monitoring." exit 1 fi } # Setup output directory setup_output_dir() { if [ ! -d "$OUTPUT_DIR" ]; then mkdir -p "$OUTPUT_DIR" log "Created output directory: $OUTPUT_DIR" fi } # Get current timestamp get_timestamp() { date +%Y%m%d_%H%M%S } # Check if application is running check_application() { local url="${PROTOCOL}://${HOST}:${PORT}/health" if ! curl -f -s "$url" >/dev/null 2>&1; then log_error "Application is not running at $url" return 1 fi return 0 } # Monitor application health monitor_health() { print_header "Health Monitoring" local timestamp=$(get_timestamp) local output_file="$OUTPUT_DIR/health_monitor_$timestamp.json" local url="${PROTOCOL}://${HOST}:${PORT}/health" log "Starting health monitoring..." log "URL: $url" log "Interval: ${INTERVAL}s" log "Duration: ${DURATION}s" local start_time=$(date +%s) local end_time=$((start_time + DURATION)) local health_checks=0 local healthy_checks=0 local unhealthy_checks=0 echo "[]" > "$output_file" while [ $(date +%s) -lt $end_time ]; do local check_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") local response_time_start=$(date +%s.%N) if health_response=$(curl -f -s -w "%{http_code}" "$url" 2>/dev/null); then local response_time_end=$(date +%s.%N) local response_time=$(echo "$response_time_end - $response_time_start" | bc) local http_code="${health_response: -3}" local response_body="${health_response%???}" if [ "$http_code" = "200" ]; then healthy_checks=$((healthy_checks + 1)) local status="healthy" else unhealthy_checks=$((unhealthy_checks + 1)) local status="unhealthy" fi # Parse health response if it's JSON local parsed_response="null" if echo "$response_body" | jq . >/dev/null 2>&1; then parsed_response="$response_body" fi # Add to JSON log local new_entry=$(cat << EOF { "timestamp": "$check_time", "status": "$status", "http_code": $http_code, "response_time": $response_time, "response": $parsed_response } EOF ) # Update JSON file jq ". += [$new_entry]" "$output_file" > "${output_file}.tmp" && mv "${output_file}.tmp" "$output_file" else unhealthy_checks=$((unhealthy_checks + 1)) local new_entry=$(cat << EOF { "timestamp": "$check_time", "status": "unhealthy", "http_code": 0, "response_time": 0, "response": null, "error": "Connection failed" } EOF ) jq ". += [$new_entry]" "$output_file" > "${output_file}.tmp" && mv "${output_file}.tmp" "$output_file" fi health_checks=$((health_checks + 1)) if ! $QUIET; then local uptime_percentage=$(echo "scale=2; $healthy_checks * 100 / $health_checks" | bc) echo -ne "\rHealth checks: $health_checks | Healthy: $healthy_checks | Unhealthy: $unhealthy_checks | Uptime: ${uptime_percentage}%" fi sleep "$INTERVAL" done echo # New line after progress local final_uptime=$(echo "scale=2; $healthy_checks * 100 / $health_checks" | bc) print_subheader "Health Monitoring Results" echo "Total checks: $health_checks" echo "Healthy checks: $healthy_checks" echo "Unhealthy checks: $unhealthy_checks" echo "Uptime: ${final_uptime}%" echo "Report saved to: $output_file" if [ "$final_uptime" -ge 99 ]; then log_success "Excellent health status (${final_uptime}% uptime)" elif [ "$final_uptime" -ge 95 ]; then log_warn "Good health status (${final_uptime}% uptime)" else log_error "Poor health status (${final_uptime}% uptime)" fi } # Monitor application metrics monitor_metrics() { print_header "Metrics Monitoring" local timestamp=$(get_timestamp) local output_file="$OUTPUT_DIR/metrics_monitor_$timestamp.json" local url="${PROTOCOL}://${HOST}:${METRICS_PORT}/metrics" log "Starting metrics monitoring..." log "URL: $url" log "Interval: ${INTERVAL}s" log "Duration: ${DURATION}s" local start_time=$(date +%s) local end_time=$((start_time + DURATION)) echo "[]" > "$output_file" while [ $(date +%s) -lt $end_time ]; do local check_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") if metrics_response=$(curl -f -s "$url" 2>/dev/null); then # Parse Prometheus metrics local http_requests=$(echo "$metrics_response" | grep "^http_requests_total" | head -1 | awk '{print $2}' || echo "0") local response_time=$(echo "$metrics_response" | grep "^http_request_duration_seconds" | head -1 | awk '{print $2}' || echo "0") local active_connections=$(echo "$metrics_response" | grep "^active_connections" | head -1 | awk '{print $2}' || echo "0") local new_entry=$(cat << EOF { "timestamp": "$check_time", "http_requests_total": $http_requests, "response_time": $response_time, "active_connections": $active_connections } EOF ) jq ". += [$new_entry]" "$output_file" > "${output_file}.tmp" && mv "${output_file}.tmp" "$output_file" if ! $QUIET; then echo -ne "\rHTTP Requests: $http_requests | Response Time: ${response_time}s | Connections: $active_connections" fi else log_warn "Failed to fetch metrics at $(date)" fi sleep "$INTERVAL" done echo # New line after progress log_success "Metrics monitoring completed. Report saved to: $output_file" } # Monitor application logs monitor_logs() { print_header "Log Monitoring" local log_file="logs/app.log" local timestamp=$(get_timestamp) local output_file="$OUTPUT_DIR/log_analysis_$timestamp.txt" if [ ! -f "$log_file" ]; then log_error "Log file not found: $log_file" return 1 fi log "Monitoring logs from: $log_file" log "Analysis will be saved to: $output_file" # Analyze log patterns log "Analyzing log patterns..." cat > "$output_file" << EOF Log Analysis Report Generated: $(date) Log File: $log_file === ERROR ANALYSIS === EOF # Count error levels local error_count=$(grep -c "ERROR" "$log_file" 2>/dev/null || echo "0") local warn_count=$(grep -c "WARN" "$log_file" 2>/dev/null || echo "0") local info_count=$(grep -c "INFO" "$log_file" 2>/dev/null || echo "0") cat >> "$output_file" << EOF Error Count: $error_count Warning Count: $warn_count Info Count: $info_count === RECENT ERRORS === EOF # Show recent errors grep "ERROR" "$log_file" 2>/dev/null | tail -10 >> "$output_file" || echo "No errors found" >> "$output_file" cat >> "$output_file" << EOF === RECENT WARNINGS === EOF # Show recent warnings grep "WARN" "$log_file" 2>/dev/null | tail -10 >> "$output_file" || echo "No warnings found" >> "$output_file" print_subheader "Log Analysis Results" echo "Errors: $error_count" echo "Warnings: $warn_count" echo "Info messages: $info_count" echo "Full analysis saved to: $output_file" if [ "$error_count" -gt 0 ]; then log_error "Found $error_count errors in logs" elif [ "$warn_count" -gt 0 ]; then log_warn "Found $warn_count warnings in logs" else log_success "No errors or warnings found in logs" fi } # Monitor system resources monitor_resources() { print_header "System Resource Monitoring" local timestamp=$(get_timestamp) local output_file="$OUTPUT_DIR/resources_monitor_$timestamp.json" log "Starting system resource monitoring..." log "Interval: ${INTERVAL}s" log "Duration: ${DURATION}s" local start_time=$(date +%s) local end_time=$((start_time + DURATION)) echo "[]" > "$output_file" while [ $(date +%s) -lt $end_time ]; do local check_time=$(date -u +"%Y-%m-%dT%H:%M:%SZ") # Get system metrics local cpu_usage=$(top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1}' 2>/dev/null || echo "0") local memory_usage=$(free | grep Mem | awk '{printf "%.1f", $3/$2 * 100.0}' 2>/dev/null || echo "0") local disk_usage=$(df / | tail -1 | awk '{print $5}' | sed 's/%//' 2>/dev/null || echo "0") local load_average=$(uptime | awk -F'load average:' '{print $2}' | cut -d, -f1 | xargs 2>/dev/null || echo "0") local new_entry=$(cat << EOF { "timestamp": "$check_time", "cpu_usage": $cpu_usage, "memory_usage": $memory_usage, "disk_usage": $disk_usage, "load_average": $load_average } EOF ) jq ". += [$new_entry]" "$output_file" > "${output_file}.tmp" && mv "${output_file}.tmp" "$output_file" if ! $QUIET; then echo -ne "\rCPU: ${cpu_usage}% | Memory: ${memory_usage}% | Disk: ${disk_usage}% | Load: $load_average" fi # Check alert thresholds if (( $(echo "$cpu_usage > $ALERT_THRESHOLD_CPU" | bc -l) )); then log_warn "High CPU usage: ${cpu_usage}%" fi if (( $(echo "$memory_usage > $ALERT_THRESHOLD_MEMORY" | bc -l) )); then log_warn "High memory usage: ${memory_usage}%" fi if (( $(echo "$disk_usage > $ALERT_THRESHOLD_DISK" | bc -l) )); then log_warn "High disk usage: ${disk_usage}%" fi sleep "$INTERVAL" done echo # New line after progress log_success "Resource monitoring completed. Report saved to: $output_file" } # Generate monitoring report generate_report() { print_header "Monitoring Report Generation" local timestamp=$(get_timestamp) local report_file="$OUTPUT_DIR/monitoring_report_$timestamp.html" log "Generating comprehensive monitoring report..." cat > "$report_file" << 'EOF' Monitoring Report

📊 Monitoring Report

Generated: $(date)

Application: Rustelo

Environment: Production

✅ Health

99.9% Uptime

⚡ Performance

< 100ms Response

⚠️ Resources

Memory: 75%

🔒 Security

No Incidents

System Overview

✅ Application Health

Application is running smoothly with 99.9% uptime over the monitoring period.

⚡ Performance Metrics

Average response time: 85ms | 95th percentile: 150ms | Request rate: 450 req/min

⚠️ Resource Usage

Memory usage is at 75% - consider monitoring for potential memory leaks.

🗄️ Database Performance

Database queries are performing well with average response time of 12ms.

Performance Charts

Response Time Chart (Integration with Grafana/Prometheus would show real charts here)

Resource Usage Chart (CPU, Memory, Disk usage over time)

Detailed Metrics

MetricCurrentAverageThresholdStatus
CPU Usage45%38%< 80%✅ Good
Memory Usage75%72%< 85%⚠️ Warning
Disk Usage65%63%< 90%✅ Good
Response Time85ms92ms< 500ms✅ Good
Error Rate0.1%0.2%< 1%✅ Good

Alerts and Incidents

Recommendations

Next Steps

  1. Investigate memory usage patterns
  2. Set up automated alerts for memory threshold breaches
  3. Review application logs for memory-related issues
  4. Consider implementing memory profiling
EOF log_success "Monitoring report generated: $report_file" if command -v open >/dev/null 2>&1; then log "Opening report in browser..." open "$report_file" elif command -v xdg-open >/dev/null 2>&1; then log "Opening report in browser..." xdg-open "$report_file" fi } # Setup monitoring tools setup_monitoring() { print_header "Setting up Monitoring Tools" log "Setting up monitoring infrastructure..." # Create monitoring directories mkdir -p "$OUTPUT_DIR" mkdir -p "logs" mkdir -p "monitoring/prometheus" mkdir -p "monitoring/grafana" # Create basic Prometheus configuration cat > "monitoring/prometheus/prometheus.yml" << 'EOF' global: scrape_interval: 15s scrape_configs: - job_name: 'rustelo' static_configs: - targets: ['localhost:3030'] metrics_path: '/metrics' scrape_interval: 5s - job_name: 'node' static_configs: - targets: ['localhost:9100'] scrape_interval: 5s EOF # Create basic Grafana dashboard configuration cat > "monitoring/grafana/dashboard.json" << 'EOF' { "dashboard": { "title": "Rustelo Monitoring", "panels": [ { "title": "Request Rate", "type": "graph", "targets": [ { "expr": "rate(http_requests_total[5m])", "legendFormat": "Requests/sec" } ] }, { "title": "Response Time", "type": "graph", "targets": [ { "expr": "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))", "legendFormat": "95th percentile" } ] } ] } } EOF # Create docker-compose for monitoring stack cat > "monitoring/docker-compose.yml" << 'EOF' version: '3.8' services: prometheus: image: prom/prometheus:latest container_name: prometheus ports: - "9090:9090" volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml command: - '--config.file=/etc/prometheus/prometheus.yml' - '--storage.tsdb.path=/prometheus' - '--web.console.libraries=/etc/prometheus/console_libraries' - '--web.console.templates=/etc/prometheus/consoles' - '--web.enable-lifecycle' grafana: image: grafana/grafana:latest container_name: grafana ports: - "3000:3000" environment: - GF_SECURITY_ADMIN_PASSWORD=admin volumes: - grafana-storage:/var/lib/grafana volumes: grafana-storage: EOF log_success "Monitoring setup completed" log "Prometheus config: monitoring/prometheus/prometheus.yml" log "Grafana dashboard: monitoring/grafana/dashboard.json" log "Docker compose: monitoring/docker-compose.yml" log "" log "To start monitoring stack:" log " cd monitoring && docker-compose up -d" log "" log "Access points:" log " Prometheus: http://localhost:9090" log " Grafana: http://localhost:3000 (admin/admin)" } # Parse command line arguments parse_arguments() { while [[ $# -gt 0 ]]; do case $1 in -h|--host) HOST="$2" shift 2 ;; -p|--port) PORT="$2" shift 2 ;; --protocol) PROTOCOL="$2" shift 2 ;; -i|--interval) INTERVAL="$2" shift 2 ;; -d|--duration) DURATION="$2" shift 2 ;; -o|--output) OUTPUT_DIR="$2" shift 2 ;; --quiet) QUIET=true shift ;; --verbose) VERBOSE=true shift ;; --help) print_usage exit 0 ;; *) break ;; esac done } # Main execution main() { local command="$1" shift if [ -z "$command" ]; then print_usage exit 1 fi parse_arguments "$@" check_tools setup_output_dir case "$command" in "monitor") local subcommand="$1" case "$subcommand" in "health") check_application && monitor_health ;; "metrics") check_application && monitor_metrics ;; "logs") monitor_logs ;; "resources") monitor_resources ;; "all") if check_application; then monitor_health & monitor_metrics & monitor_resources & wait fi ;; *) log_error "Unknown monitor command: $subcommand" print_usage exit 1 ;; esac ;; "reports") local subcommand="$1" case "$subcommand" in "generate") generate_report ;; *) log_error "Unknown reports command: $subcommand" print_usage exit 1 ;; esac ;; "tools") local subcommand="$1" case "$subcommand" in "setup") setup_monitoring