#!/bin/bash
# Database Monitoring and Health Check Script
# Provides comprehensive database monitoring, performance metrics, and health checks.
#
# Supports PostgreSQL (via psql/pg_isready) and SQLite (via sqlite3).
# Configuration comes from a project-root `.env` file containing DATABASE_URL.

set -e

# Colors for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color

# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"

# Change to project root
cd "$PROJECT_ROOT"

# Default monitoring configuration.
# NOTE(review): ALERT_THRESHOLD_MEMORY_USAGE, ALERT_THRESHOLD_QUERY_TIME and
# LOG_FILE are parsed from the CLI but not yet consumed by any check below —
# kept for forward compatibility.
MONITOR_INTERVAL=60
ALERT_THRESHOLD_CONNECTIONS=80
ALERT_THRESHOLD_DISK_USAGE=85
ALERT_THRESHOLD_MEMORY_USAGE=90
ALERT_THRESHOLD_QUERY_TIME=5000
LOG_FILE="monitoring.log"

# Logging helpers — colorized, single-line messages.
log() { echo -e "${GREEN}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }
log_metric() { echo -e "${CYAN}[METRIC]${NC} $1"; }
print_header() { echo -e "${BLUE}${BOLD}=== $1 ===${NC}"; }
print_subheader() { echo -e "${CYAN}--- $1 ---${NC}"; }

# Print CLI usage text.
print_usage() {
  echo "Database Monitoring and Health Check Script"
  echo
  echo "Usage: $0 [options]"
  echo
  echo "Commands:"
  echo "  health              Complete health check"
  echo "  status              Quick status check"
  echo "  connections         Show active connections"
  echo "  performance         Show performance metrics"
  echo "  slow-queries        Show slow queries"
  echo "  locks               Show database locks"
  echo "  disk-usage          Show disk usage"
  echo "  memory-usage        Show memory usage"
  echo "  backup-status       Check backup status"
  echo "  replication         Check replication status"
  echo "  monitor             Start continuous monitoring"
  echo "  alerts              Check for alerts"
  echo "  vacuum              Perform database maintenance"
  echo "  analyze             Update database statistics"
  echo "  report              Generate comprehensive report"
  echo
  echo "Options:"
  echo "  --env ENV           Environment (dev/prod) [default: dev]"
  echo "  --interval SECS     Monitoring interval in seconds [default: 60]"
  echo "  --log-file FILE     Log file path [default: monitoring.log]"
  echo "  --threshold-conn N  Connection alert threshold [default: 80]"
  echo "  --threshold-disk N  Disk usage alert threshold [default: 85]"
  echo "  --threshold-mem N   Memory usage alert threshold [default: 90]"
  echo "  --threshold-query N Query time alert threshold in ms [default: 5000]"
  echo "  --format FORMAT     Output format (table/json/csv) [default: table]"
  echo "  --quiet             Suppress verbose output"
  echo "  --continuous        Run continuously (for monitor command)"
  echo
  echo "Examples:"
  echo "  $0 health                    # Complete health check"
  echo "  $0 status                    # Quick status"
  echo "  $0 performance               # Performance metrics"
  echo "  $0 monitor --interval 30     # Monitor every 30 seconds"
  echo "  $0 slow-queries              # Show slow queries"
  echo "  $0 report --format json      # JSON report"
  echo "  $0 vacuum                    # Perform maintenance"
}

# Check that the .env file exists and load it into the environment.
load_env() {
  if [ ! -f ".env" ]; then
    log_error ".env file not found"
    echo "Please run the database setup script first:"
    echo "  ./scripts/db-setup.sh setup"
    exit 1
  fi

  # Source the file with auto-export enabled.  This handles quoted values
  # and values containing spaces, which the previous
  # `export $(grep -v '^#' .env | xargs)` approach silently mangled.
  set -a
  # shellcheck disable=SC1091
  . ./.env
  set +a

  if [ -z "${DATABASE_URL:-}" ]; then
    log_error "DATABASE_URL is not set in .env"
    exit 1
  fi
}

# Parse DATABASE_URL into DB_TYPE plus connection parameters.
parse_database_url() {
  if [[ "$DATABASE_URL" == postgresql://* ]] || [[ "$DATABASE_URL" == postgres://* ]]; then
    DB_TYPE="postgresql"
    # NOTE(review): this sed-based parsing assumes the URL always carries
    # user:password@host:port/dbname; passwordless or socket-based URLs
    # will not parse correctly — confirm against how .env is generated.
    DB_HOST=$(echo "$DATABASE_URL" | sed -n 's/.*@\([^:]*\):.*/\1/p')
    DB_PORT=$(echo "$DATABASE_URL" | sed -n 's/.*:\([0-9]*\)\/.*/\1/p')
    DB_NAME=$(echo "$DATABASE_URL" | sed -n 's/.*\/\([^?]*\).*/\1/p')
    DB_USER=$(echo "$DATABASE_URL" | sed -n 's/.*\/\/\([^:]*\):.*/\1/p')
    DB_PASS=$(echo "$DATABASE_URL" | sed -n 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/p')
  elif [[ "$DATABASE_URL" == sqlite://* ]]; then
    DB_TYPE="sqlite"
    DB_FILE="${DATABASE_URL#sqlite://}"
  else
    log_error "Unsupported database URL format: $DATABASE_URL"
    exit 1
  fi
}

# Execute a SQL query against the configured database and print the result.
# $1 - SQL text; $2 - output format (reserved, currently unused).
execute_sql() {
  local query="$1"
  local format="${2:-tuples-only}"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # One-shot environment assignment: PGPASSWORD never lingers in the
    # script's environment (the old export/unset pair could leak it).
    PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
      -t -A -c "$query" 2>/dev/null
  elif [ "$DB_TYPE" = "sqlite" ]; then
    sqlite3 "$DB_FILE" "$query" 2>/dev/null
  fi
}

# Verify server reachability and an actual database-level connection.
# Returns 0 on success, 1 on failure.
check_connectivity() {
  print_subheader "Database Connectivity"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # One-shot PGPASSWORD (the previous `unset PGPASSWORD` sat after the
    # return statements and was unreachable, leaving the password exported).
    if PGPASSWORD="$DB_PASS" pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" >/dev/null 2>&1; then
      log_success "PostgreSQL server is accepting connections"
      # Test actual connection
      if PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" \
          -c "SELECT 1;" >/dev/null 2>&1; then
        log_success "Database connection successful"
        return 0
      else
        log_error "Database connection failed"
        return 1
      fi
    else
      log_error "PostgreSQL server is not accepting connections"
      return 1
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    if [ -f "$DB_FILE" ]; then
      if sqlite3 "$DB_FILE" "SELECT 1;" >/dev/null 2>&1; then
        log_success "SQLite database accessible"
        return 0
      else
        log_error "SQLite database access failed"
        return 1
      fi
    else
      log_error "SQLite database file not found: $DB_FILE"
      return 1
    fi
  fi
}

# Report the server/library version.
check_version() {
  print_subheader "Database Version"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # shellcheck disable=SC2155 — `local` deliberately masks query failures
    # so an unreachable database does not abort the whole report under set -e.
    local version=$(execute_sql "SELECT version();")
    log_metric "PostgreSQL Version: $version"
  elif [ "$DB_TYPE" = "sqlite" ]; then
    local version=$(sqlite3 --version | cut -d' ' -f1)
    log_metric "SQLite Version: $version"
  fi
}

# Report total database size and (PostgreSQL) the ten largest tables.
check_database_size() {
  print_subheader "Database Size"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # shellcheck disable=SC2155
    local size=$(execute_sql "SELECT pg_size_pretty(pg_database_size('$DB_NAME'));")
    log_metric "Database Size: $size"

    # Table sizes
    echo "Top 10 largest tables:"
    execute_sql "
      SELECT schemaname, tablename,
             pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size
      FROM pg_tables
      WHERE schemaname NOT IN ('information_schema', 'pg_catalog')
      ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
      LIMIT 10;
    " | while read -r line; do
      log_metric "  $line"
    done
  elif [ "$DB_TYPE" = "sqlite" ]; then
    if [ -f "$DB_FILE" ]; then
      local size=$(du -h "$DB_FILE" | cut -f1)
      log_metric "Database Size: $size"
    fi
  fi
}

# Report connection counts and warn when usage exceeds the alert threshold.
check_connections() {
  print_subheader "Database Connections"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # shellcheck disable=SC2155
    local active_connections=$(execute_sql "SELECT count(*) FROM pg_stat_activity WHERE state = 'active';")
    local total_connections=$(execute_sql "SELECT count(*) FROM pg_stat_activity;")
    local max_connections=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'max_connections';")

    log_metric "Active Connections: $active_connections"
    log_metric "Total Connections: $total_connections"
    log_metric "Max Connections: $max_connections"

    # Guard against empty query results so the arithmetic below can never
    # divide by zero or abort on a non-numeric operand.
    if [ -n "$max_connections" ] && [ "$max_connections" -gt 0 ] 2>/dev/null; then
      local connection_percentage=$((total_connections * 100 / max_connections))
      log_metric "Connection Usage: ${connection_percentage}%"

      if [ "$connection_percentage" -gt "$ALERT_THRESHOLD_CONNECTIONS" ]; then
        log_warn "Connection usage is above ${ALERT_THRESHOLD_CONNECTIONS}%"
      fi
    fi

    # Show connection details
    echo "Active connections by user:"
    execute_sql "
      SELECT usename, count(*) as connections, state
      FROM pg_stat_activity
      GROUP BY usename, state
      ORDER BY connections DESC;
    " | while read -r line; do
      log_metric "  $line"
    done
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log_metric "SQLite connections: Single connection (file-based)"
  fi
}

# Report cache/index hit ratios, transaction and deadlock counters
# (PostgreSQL) or pager statistics (SQLite).
check_performance() {
  print_subheader "Performance Metrics"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # Cache hit ratio
    # shellcheck disable=SC2155
    local cache_hit_ratio=$(execute_sql "
      SELECT round(
        (sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read))) * 100, 2
      ) as cache_hit_ratio
      FROM pg_statio_user_tables;
    ")
    log_metric "Cache Hit Ratio: ${cache_hit_ratio}%"

    # Index usage
    local index_usage=$(execute_sql "
      SELECT round(
        (sum(idx_blks_hit) / (sum(idx_blks_hit) + sum(idx_blks_read))) * 100, 2
      ) as index_hit_ratio
      FROM pg_statio_user_indexes;
    ")
    log_metric "Index Hit Ratio: ${index_usage}%"

    # Transaction stats
    local commits=$(execute_sql "SELECT xact_commit FROM pg_stat_database WHERE datname = '$DB_NAME';")
    local rollbacks=$(execute_sql "SELECT xact_rollback FROM pg_stat_database WHERE datname = '$DB_NAME';")
    log_metric "Commits: $commits"
    log_metric "Rollbacks: $rollbacks"

    # Deadlocks
    local deadlocks=$(execute_sql "SELECT deadlocks FROM pg_stat_database WHERE datname = '$DB_NAME';")
    log_metric "Deadlocks: $deadlocks"
  elif [ "$DB_TYPE" = "sqlite" ]; then
    # SQLite-specific metrics
    local page_count=$(execute_sql "PRAGMA page_count;")
    local page_size=$(execute_sql "PRAGMA page_size;")
    local cache_size=$(execute_sql "PRAGMA cache_size;")

    log_metric "Page Count: $page_count"
    log_metric "Page Size: $page_size bytes"
    log_metric "Cache Size: $cache_size pages"
  fi
}

# Show the slowest queries recorded by pg_stat_statements, if installed.
check_slow_queries() {
  print_subheader "Slow Queries"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # Check pg_extension (extension actually CREATEd in this database),
    # not pg_available_extensions, which only says it is installable.
    # shellcheck disable=SC2155
    local extension_exists=$(execute_sql "SELECT count(*) FROM pg_extension WHERE extname = 'pg_stat_statements';")

    if [ "${extension_exists:-0}" -eq 1 ] 2>/dev/null; then
      echo "Top 10 slowest queries:"
      execute_sql "
        SELECT round(mean_exec_time::numeric, 2) as avg_time_ms,
               calls,
               round(total_exec_time::numeric, 2) as total_time_ms,
               left(query, 100) as query_preview
        FROM pg_stat_statements
        ORDER BY mean_exec_time DESC
        LIMIT 10;
      " | while read -r line; do
        log_metric "  $line"
      done
    else
      log_warn "pg_stat_statements extension not available"
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log_metric "SQLite slow query monitoring requires application-level logging"
  fi
}

# Report lock counts and identify blocked/blocking query pairs.
check_locks() {
  print_subheader "Database Locks"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # shellcheck disable=SC2155
    local lock_count=$(execute_sql "SELECT count(*) FROM pg_locks;")
    log_metric "Active Locks: $lock_count"

    # Check for blocking queries
    local blocking_queries=$(execute_sql "
      SELECT count(*)
      FROM pg_stat_activity
      WHERE wait_event_type = 'Lock';
    ")

    if [ "${blocking_queries:-0}" -gt 0 ] 2>/dev/null; then
      log_warn "Found $blocking_queries queries waiting for locks"
      execute_sql "
        SELECT blocked_locks.pid AS blocked_pid,
               blocked_activity.usename AS blocked_user,
               blocking_locks.pid AS blocking_pid,
               blocking_activity.usename AS blocking_user,
               blocked_activity.query AS blocked_statement,
               blocking_activity.query AS current_statement_in_blocking_process
        FROM pg_catalog.pg_locks blocked_locks
        JOIN pg_catalog.pg_stat_activity blocked_activity
          ON blocked_activity.pid = blocked_locks.pid
        JOIN pg_catalog.pg_locks blocking_locks
          ON blocking_locks.locktype = blocked_locks.locktype
         AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
         AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
         AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
         AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
         AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
         AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
         AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
         AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
         AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
         AND blocking_locks.pid != blocked_locks.pid
        JOIN pg_catalog.pg_stat_activity blocking_activity
          ON blocking_activity.pid = blocking_locks.pid
        WHERE NOT blocked_locks.granted;
      " | while read -r line; do
        log_warn "  $line"
      done
    else
      log_success "No blocking queries found"
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log_metric "SQLite uses file-level locking"
  fi
}

# Report disk usage of the data directory and warn above the threshold.
check_disk_usage() {
  print_subheader "Disk Usage"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # Get PostgreSQL data directory
    # shellcheck disable=SC2155
    local data_dir=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'data_directory';")

    if [ -n "$data_dir" ] && [ -d "$data_dir" ]; then
      local disk_usage=$(df -h "$data_dir" | awk 'NR==2 {print $5}' | sed 's/%//')
      log_metric "Data Directory Disk Usage: ${disk_usage}%"

      if [ "${disk_usage:-0}" -gt "$ALERT_THRESHOLD_DISK_USAGE" ] 2>/dev/null; then
        log_warn "Disk usage is above ${ALERT_THRESHOLD_DISK_USAGE}%"
      fi
    else
      log_warn "Could not determine PostgreSQL data directory"
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    local db_dir=$(dirname "$DB_FILE")
    local disk_usage=$(df -h "$db_dir" | awk 'NR==2 {print $5}' | sed 's/%//')
    log_metric "Database Directory Disk Usage: ${disk_usage}%"

    if [ "${disk_usage:-0}" -gt "$ALERT_THRESHOLD_DISK_USAGE" ] 2>/dev/null; then
      log_warn "Disk usage is above ${ALERT_THRESHOLD_DISK_USAGE}%"
    fi
  fi
}

# Report configured memory settings and (best-effort) process memory.
check_memory_usage() {
  print_subheader "Memory Usage"

  if [ "$DB_TYPE" = "postgresql" ]; then
    # Check shared buffers and other memory settings
    # shellcheck disable=SC2155
    local shared_buffers=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'shared_buffers';")
    local work_mem=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'work_mem';")
    local maintenance_work_mem=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'maintenance_work_mem';")

    log_metric "Shared Buffers: $shared_buffers"
    log_metric "Work Mem: $work_mem"
    log_metric "Maintenance Work Mem: $maintenance_work_mem"

    # Check actual memory usage if available.
    # NOTE(review): `ps -C` is GNU procps-specific; this silently yields
    # nothing on BSD/macOS ps.
    if command -v ps >/dev/null 2>&1; then
      local postgres_memory=$(ps -o pid,vsz,rss,comm -C postgres --no-headers 2>/dev/null \
        | awk '{rss_total += $3} END {print rss_total/1024 " MB"}')
      if [ -n "$postgres_memory" ]; then
        log_metric "PostgreSQL Memory Usage: $postgres_memory"
      fi
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    local cache_size=$(execute_sql "PRAGMA cache_size;")
    local page_size=$(execute_sql "PRAGMA page_size;")
    local memory_usage_kb=$((cache_size * page_size / 1024))
    log_metric "SQLite Cache Memory: ${memory_usage_kb} KB"
  fi
}

# Report how many backups exist and whether the newest is older than 24h.
check_backup_status() {
  print_subheader "Backup Status"

  local backup_dir="backups"
  if [ -d "$backup_dir" ]; then
    # shellcheck disable=SC2155
    local backup_count=$(find "$backup_dir" -name "*.sql*" -o -name "*.dump*" -o -name "*.tar*" 2>/dev/null | wc -l)
    log_metric "Available Backups: $backup_count"

    if [ "$backup_count" -gt 0 ]; then
      local latest_backup=$(find "$backup_dir" -name "*.sql*" -o -name "*.dump*" -o -name "*.tar*" 2>/dev/null | sort | tail -1)
      if [ -n "$latest_backup" ]; then
        # `-mtime +1` prints the path only when the file is >24h old, so the
        # line count doubles as a boolean "too old" flag.
        local backup_age=$(find "$latest_backup" -mtime +1 2>/dev/null | wc -l)
        local backup_date=$(date -r "$latest_backup" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "Unknown")
        log_metric "Latest Backup: $(basename "$latest_backup") ($backup_date)"

        if [ "$backup_age" -gt 0 ]; then
          log_warn "Latest backup is older than 24 hours"
        fi
      fi
    else
      log_warn "No backups found"
    fi
  else
    log_warn "Backup directory not found: $backup_dir"
  fi
}

# Reclaim dead space and refresh planner statistics.
perform_vacuum() {
  print_subheader "Database Maintenance (VACUUM)"

  if [ "$DB_TYPE" = "postgresql" ]; then
    log "Running VACUUM ANALYZE on all tables..."
    execute_sql "VACUUM ANALYZE;" >/dev/null 2>&1
    log_success "VACUUM ANALYZE completed"
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log "Running VACUUM on SQLite database..."
    execute_sql "VACUUM;" >/dev/null 2>&1
    log_success "VACUUM completed"
  fi
}

# Refresh planner statistics without reclaiming space.
update_statistics() {
  print_subheader "Update Database Statistics"

  if [ "$DB_TYPE" = "postgresql" ]; then
    log "Running ANALYZE on all tables..."
    execute_sql "ANALYZE;" >/dev/null 2>&1
    log_success "ANALYZE completed"
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log "Running ANALYZE on SQLite database..."
    execute_sql "ANALYZE;" >/dev/null 2>&1
    log_success "ANALYZE completed"
  fi
}

# Run every check in sequence and print a full report.
generate_report() {
  print_header "Database Health Report"
  echo "Report generated on: $(date)"
  echo "Database Type: $DB_TYPE"
  echo "Database Name: $DB_NAME"
  echo "Environment: $ENVIRONMENT"
  echo

  # Run all checks
  check_connectivity
  echo
  check_version
  echo
  check_database_size
  echo
  check_connections
  echo
  check_performance
  echo
  check_slow_queries
  echo
  check_locks
  echo
  check_disk_usage
  echo
  check_memory_usage
  echo
  check_backup_status
  echo

  print_header "Report Complete"
}

# Loop showing a compact dashboard; runs once unless --continuous was given.
start_monitoring() {
  print_header "Starting Database Monitoring"
  log "Monitoring interval: ${MONITOR_INTERVAL} seconds"
  log "Press Ctrl+C to stop monitoring"

  while true; do
    clear
    echo "=== Database Monitor - $(date) ==="
    echo

    # Quick health checks
    if check_connectivity >/dev/null 2>&1; then
      echo "✅ Database connectivity: OK"
    else
      echo "❌ Database connectivity: FAILED"
    fi

    check_connections
    echo
    check_performance
    echo

    if [ "$CONTINUOUS" = "true" ]; then
      sleep "$MONITOR_INTERVAL"
    else
      break
    fi
  done
}

# Parse command line arguments
COMMAND=""
ENVIRONMENT="dev"
FORMAT="table"
CONTINUOUS="false"
QUIET="false"

while [[ $# -gt 0 ]]; do
  case "$1" in
    --env)
      ENVIRONMENT="$2"
      shift 2
      ;;
    --interval)
      MONITOR_INTERVAL="$2"
      shift 2
      ;;
    --log-file)
      LOG_FILE="$2"
      shift 2
      ;;
    --threshold-conn)
      ALERT_THRESHOLD_CONNECTIONS="$2"
      shift 2
      ;;
    --threshold-disk)
      ALERT_THRESHOLD_DISK_USAGE="$2"
      shift 2
      ;;
    --threshold-mem)
      ALERT_THRESHOLD_MEMORY_USAGE="$2"
      shift 2
      ;;
    --threshold-query)
      ALERT_THRESHOLD_QUERY_TIME="$2"
      shift 2
      ;;
    --format)
      FORMAT="$2"
      shift 2
      ;;
    --continuous)
      CONTINUOUS="true"
      shift
      ;;
    --quiet)
      QUIET="true"
      shift
      ;;
    -h|--help)
      print_usage
      exit 0
      ;;
    *)
      # First bare word is the command; any further bare word is an error.
      if [ -z "$COMMAND" ]; then
        COMMAND="$1"
      else
        log_error "Unknown option: $1"
        print_usage
        exit 1
      fi
      shift
      ;;
  esac
done

# Set environment variable
export ENVIRONMENT="$ENVIRONMENT"

# Validate command
if [ -z "$COMMAND" ]; then
  print_usage
  exit 1
fi
# Guard: the script must run from the project root (where Cargo.toml lives).
[ -f "Cargo.toml" ] || {
  log_error "Please run this script from the project root directory"
  exit 1
}

# Load environment and parse database URL
load_env
parse_database_url

# Dispatch the requested command to its handler.
case "$COMMAND" in
  health)
    print_header "Complete Health Check"
    generate_report
    ;;
  status)
    print_header "Quick Status Check"
    check_connectivity
    check_connections
    ;;
  connections)   check_connections ;;
  performance)   check_performance ;;
  slow-queries)  check_slow_queries ;;
  locks)         check_locks ;;
  disk-usage)    check_disk_usage ;;
  memory-usage)  check_memory_usage ;;
  backup-status) check_backup_status ;;
  replication)   log_warn "Replication monitoring not yet implemented" ;;
  monitor)       start_monitoring ;;
  alerts)        log_warn "Alert system not yet implemented" ;;
  vacuum)        perform_vacuum ;;
  analyze)       update_statistics ;;
  report)        generate_report ;;
  *)
    log_error "Unknown command: $COMMAND"
    print_usage
    exit 1
    ;;
esac