Rustelo/scripts/databases/db-monitor.sh

721 lines
23 KiB
Bash
Raw Normal View History

2025-07-07 23:53:50 +01:00
#!/bin/bash
# Database Monitoring and Health Check Script
# Provides comprehensive database monitoring, performance metrics, and health checks
# Abort on the first unhandled command failure.
set -e
# Colors for output (ANSI escape sequences, expanded later by `echo -e`)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
BOLD='\033[1m'
NC='\033[0m' # No Color
# Script directory
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# NOTE(review): this assumes the script sits exactly ONE directory below the
# project root (e.g. scripts/). If it actually lives deeper (e.g.
# scripts/databases/), PROJECT_ROOT resolves to the wrong directory and the
# Cargo.toml guard near the bottom of the file will reject it -- confirm the
# on-disk location.
PROJECT_ROOT="$(dirname "$SCRIPT_DIR")"
# Change to project root
cd "$PROJECT_ROOT"
# Default monitoring configuration
MONITOR_INTERVAL=60 # seconds between refreshes in `monitor` mode
ALERT_THRESHOLD_CONNECTIONS=80 # percent of max_connections
ALERT_THRESHOLD_DISK_USAGE=85 # percent of filesystem capacity
ALERT_THRESHOLD_MEMORY_USAGE=90 # percent; NOTE(review): not referenced in the visible code
ALERT_THRESHOLD_QUERY_TIME=5000 # milliseconds; NOTE(review): not referenced in the visible code
LOG_FILE="monitoring.log" # NOTE(review): settable via --log-file but never written to in the visible code
# Logging helpers: each prints a colored, tagged line to stdout.
# printf '%b' interprets the backslash escapes stored in the color
# variables exactly as `echo -e` would.
log() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
log_success() {
  printf '%b\n' "${GREEN}[SUCCESS]${NC} $1"
}
log_metric() {
  printf '%b\n' "${CYAN}[METRIC]${NC} $1"
}
# Section banners used to structure the report output.
print_header() {
  printf '%b\n' "${BLUE}${BOLD}=== $1 ===${NC}"
}
print_subheader() {
  printf '%b\n' "${CYAN}--- $1 ---${NC}"
}
print_usage() {
  # Print the full command/option/example reference to stdout.
  # A single here-doc replaces the echo-per-line form; $0 still expands.
  cat <<EOF
Database Monitoring and Health Check Script

Usage: $0 <command> [options]

Commands:
 health Complete health check
 status Quick status check
 connections Show active connections
 performance Show performance metrics
 slow-queries Show slow queries
 locks Show database locks
 disk-usage Show disk usage
 memory-usage Show memory usage
 backup-status Check backup status
 replication Check replication status
 monitor Start continuous monitoring
 alerts Check for alerts
 vacuum Perform database maintenance
 analyze Update database statistics
 report Generate comprehensive report

Options:
 --env ENV Environment (dev/prod) [default: dev]
 --interval SECS Monitoring interval in seconds [default: 60]
 --log-file FILE Log file path [default: monitoring.log]
 --threshold-conn N Connection alert threshold [default: 80]
 --threshold-disk N Disk usage alert threshold [default: 85]
 --threshold-mem N Memory usage alert threshold [default: 90]
 --threshold-query N Query time alert threshold in ms [default: 5000]
 --format FORMAT Output format (table/json/csv) [default: table]
 --quiet Suppress verbose output
 --continuous Run continuously (for monitor command)

Examples:
 $0 health # Complete health check
 $0 status # Quick status
 $0 performance # Performance metrics
 $0 monitor --interval 30 # Monitor every 30 seconds
 $0 slow-queries # Show slow queries
 $0 report --format json # JSON report
 $0 vacuum # Perform maintenance
EOF
}
# Check if .env file exists in the current directory and load it,
# exporting every variable it defines.
# Exits 1 with guidance when the file is missing.
load_env() {
  if [ ! -f ".env" ]; then
    log_error ".env file not found"
    echo "Please run the database setup script first:"
    echo " ./scripts/db-setup.sh setup"
    exit 1
  fi
  # Bug fix: the old `export $(grep -v '^#' .env | xargs)` split values on
  # whitespace and stripped quotes, so FOO="bar baz" exported FOO=bar and
  # tried to export a bogus `baz` variable. Sourcing with allexport
  # preserves quoting and still honors '#' comment lines.
  set -a
  # shellcheck disable=SC1091
  . ./.env
  set +a
}
# Parse database URL
# Splits DATABASE_URL into the DB_* globals the rest of the script uses.
#   postgres(ql)://user:pass@host:port/dbname[?params] -> DB_TYPE/HOST/PORT/NAME/USER/PASS
#   sqlite://path                                      -> DB_TYPE/DB_FILE
# Exits 1 on any other scheme.
# Bug fix: every $DATABASE_URL expansion is now quoted; the original left
# them unquoted, so credentials containing spaces or glob characters were
# word-split / glob-expanded by the shell before reaching sed.
parse_database_url() {
  if [[ "$DATABASE_URL" == postgresql://* ]] || [[ "$DATABASE_URL" == postgres://* ]]; then
    DB_TYPE="postgresql"
    DB_HOST=$(printf '%s' "$DATABASE_URL" | sed -n 's/.*@\([^:]*\):.*/\1/p')
    DB_PORT=$(printf '%s' "$DATABASE_URL" | sed -n 's/.*:\([0-9]*\)\/.*/\1/p')
    DB_NAME=$(printf '%s' "$DATABASE_URL" | sed -n 's/.*\/\([^?]*\).*/\1/p')
    DB_USER=$(printf '%s' "$DATABASE_URL" | sed -n 's/.*\/\/\([^:]*\):.*/\1/p')
    DB_PASS=$(printf '%s' "$DATABASE_URL" | sed -n 's/.*:\/\/[^:]*:\([^@]*\)@.*/\1/p')
  elif [[ "$DATABASE_URL" == sqlite://* ]]; then
    DB_TYPE="sqlite"
    # Plain prefix strip; no need to spawn sed for this.
    DB_FILE="${DATABASE_URL#sqlite://}"
  else
    log_error "Unsupported database URL format: $DATABASE_URL"
    exit 1
  fi
}
# Execute SQL query
# Runs one SQL statement against the configured database and prints the
# result rows to stdout (PostgreSQL: unaligned, tuples-only via -t -A).
# Arguments:
#   $1 - SQL text to execute
#   $2 - output format hint; NOTE(review): accepted but never used below
# NOTE(review): stderr is discarded, so callers cannot tell "no rows"
# apart from a failed query.
execute_sql() {
local query="$1"
local format="${2:-tuples-only}"
if [ "$DB_TYPE" = "postgresql" ]; then
# psql reads the password from the environment.
export PGPASSWORD="$DB_PASS"
psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -t -A -c "$query" 2>/dev/null
# NOTE(review): because `unset` runs after psql, the function's exit
# status is always 0 on the PostgreSQL path -- query failures are
# invisible to callers (and to `set -e`).
unset PGPASSWORD
elif [ "$DB_TYPE" = "sqlite" ]; then
sqlite3 "$DB_FILE" "$query" 2>/dev/null
fi
}
# Check database connectivity
# Verifies the configured database is reachable; returns 0 when a real
# test query succeeds, 1 otherwise.
check_connectivity() {
  print_subheader "Database Connectivity"
  if [ "$DB_TYPE" = "postgresql" ]; then
    # Bug fix: the password is passed per-command instead of exported.
    # The original's `unset PGPASSWORD` sat AFTER the return statements of
    # both branches, so it never executed and the password stayed in the
    # script's environment for the rest of the run.
    if PGPASSWORD="$DB_PASS" pg_isready -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" >/dev/null 2>&1; then
      log_success "PostgreSQL server is accepting connections"
      # pg_isready only probes the listener; also run a real query.
      if PGPASSWORD="$DB_PASS" psql -h "$DB_HOST" -p "$DB_PORT" -U "$DB_USER" -d "$DB_NAME" -c "SELECT 1;" >/dev/null 2>&1; then
        log_success "Database connection successful"
        return 0
      else
        log_error "Database connection failed"
        return 1
      fi
    else
      log_error "PostgreSQL server is not accepting connections"
      return 1
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    if [ -f "$DB_FILE" ]; then
      if sqlite3 "$DB_FILE" "SELECT 1;" >/dev/null 2>&1; then
        log_success "SQLite database accessible"
        return 0
      else
        log_error "SQLite database access failed"
        return 1
      fi
    else
      log_error "SQLite database file not found: $DB_FILE"
      return 1
    fi
  fi
}
# Report the version of the active database engine as a metric line.
check_version() {
  print_subheader "Database Version"
  local detected_version
  case "$DB_TYPE" in
    postgresql)
      detected_version=$(execute_sql "SELECT version();")
      log_metric "PostgreSQL Version: $detected_version"
      ;;
    sqlite)
      # sqlite3 --version prints "<version> <date> <hash>"; keep field 1.
      detected_version=$(sqlite3 --version | cut -d' ' -f1)
      log_metric "SQLite Version: $detected_version"
      ;;
  esac
}
# Check database size
# Reports the total database size and the ten largest tables (PostgreSQL),
# or the database file size (SQLite).
check_database_size() {
print_subheader "Database Size"
if [ "$DB_TYPE" = "postgresql" ]; then
local size=$(execute_sql "SELECT pg_size_pretty(pg_database_size('$DB_NAME'));")
log_metric "Database Size: $size"
# Table sizes
echo "Top 10 largest tables:"
# Largest user tables by total relation size (heap + indexes + TOAST);
# system schemas are excluded.
execute_sql "
SELECT
schemaname,
tablename,
pg_size_pretty(pg_total_relation_size(schemaname||'.'||tablename)) as size
FROM pg_tables
WHERE schemaname NOT IN ('information_schema', 'pg_catalog')
ORDER BY pg_total_relation_size(schemaname||'.'||tablename) DESC
LIMIT 10;
" | while read line; do
log_metric " $line"
done
elif [ "$DB_TYPE" = "sqlite" ]; then
if [ -f "$DB_FILE" ]; then
# Human-readable on-disk size of the single database file.
local size=$(du -h "$DB_FILE" | cut -f1)
log_metric "Database Size: $size"
fi
fi
}
# Check active connections
# Summarizes session usage and warns when the connection count approaches
# max_connections (threshold: ALERT_THRESHOLD_CONNECTIONS percent).
check_connections() {
  print_subheader "Database Connections"
  if [ "$DB_TYPE" = "postgresql" ]; then
    local active_connections total_connections max_connections
    active_connections=$(execute_sql "SELECT count(*) FROM pg_stat_activity WHERE state = 'active';")
    total_connections=$(execute_sql "SELECT count(*) FROM pg_stat_activity;")
    max_connections=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'max_connections';")
    log_metric "Active Connections: $active_connections"
    log_metric "Total Connections: $total_connections"
    log_metric "Max Connections: $max_connections"
    # Bug fix: the original computed $((total * 100 / max)) unconditionally.
    # If the queries failed (empty strings) or max was 0, the arithmetic
    # expansion raised an error and `set -e` killed the whole script.
    if [[ "$total_connections" =~ ^[0-9]+$ ]] && [[ "$max_connections" =~ ^[0-9]+$ ]] && [ "$max_connections" -gt 0 ]; then
      local connection_percentage=$((total_connections * 100 / max_connections))
      log_metric "Connection Usage: ${connection_percentage}%"
      if [ "$connection_percentage" -gt "$ALERT_THRESHOLD_CONNECTIONS" ]; then
        log_warn "Connection usage is above ${ALERT_THRESHOLD_CONNECTIONS}%"
      fi
    else
      log_warn "Could not compute connection usage (connection counts unavailable)"
    fi
    # Per-user/per-state breakdown of current sessions.
    echo "Active connections by user:"
    execute_sql "
SELECT
usename,
count(*) as connections,
state
FROM pg_stat_activity
GROUP BY usename, state
ORDER BY connections DESC;
" | while read -r line; do
      log_metric " $line"
    done
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log_metric "SQLite connections: Single connection (file-based)"
  fi
}
# Check performance metrics
# PostgreSQL: cache/index hit ratios, transaction counts, deadlocks.
# SQLite: page and cache PRAGMA settings.
check_performance() {
print_subheader "Performance Metrics"
if [ "$DB_TYPE" = "postgresql" ]; then
# Cache hit ratio
# NOTE(review): if no heap blocks have been read yet, the divisor is
# zero/NULL, the query yields an empty string, and the metric prints "%".
local cache_hit_ratio=$(execute_sql "
SELECT
round(
(sum(heap_blks_hit) / (sum(heap_blks_hit) + sum(heap_blks_read))) * 100, 2
) as cache_hit_ratio
FROM pg_statio_user_tables;
")
log_metric "Cache Hit Ratio: ${cache_hit_ratio}%"
# Index usage
# NOTE(review): same zero-divisor caveat as above.
local index_usage=$(execute_sql "
SELECT
round(
(sum(idx_blks_hit) / (sum(idx_blks_hit) + sum(idx_blks_read))) * 100, 2
) as index_hit_ratio
FROM pg_statio_user_indexes;
")
log_metric "Index Hit Ratio: ${index_usage}%"
# Transaction stats
local commits=$(execute_sql "SELECT xact_commit FROM pg_stat_database WHERE datname = '$DB_NAME';")
local rollbacks=$(execute_sql "SELECT xact_rollback FROM pg_stat_database WHERE datname = '$DB_NAME';")
log_metric "Commits: $commits"
log_metric "Rollbacks: $rollbacks"
# Deadlocks
local deadlocks=$(execute_sql "SELECT deadlocks FROM pg_stat_database WHERE datname = '$DB_NAME';")
log_metric "Deadlocks: $deadlocks"
elif [ "$DB_TYPE" = "sqlite" ]; then
# SQLite-specific metrics
local page_count=$(execute_sql "PRAGMA page_count;")
local page_size=$(execute_sql "PRAGMA page_size;")
local cache_size=$(execute_sql "PRAGMA cache_size;")
log_metric "Page Count: $page_count"
log_metric "Page Size: $page_size bytes"
log_metric "Cache Size: $cache_size pages"
fi
}
# Check slow queries
# Lists the ten statements with the highest mean execution time from
# pg_stat_statements (PostgreSQL only).
check_slow_queries() {
  print_subheader "Slow Queries"
  if [ "$DB_TYPE" = "postgresql" ]; then
    # Bug fix: the original consulted pg_available_extensions, which lists
    # extensions that COULD be installed. The pg_stat_statements view only
    # exists after CREATE EXTENSION, so check the pg_extension catalog for
    # the installed extension instead; otherwise the query below failed
    # silently (stderr is discarded by execute_sql).
    local extension_installed
    extension_installed=$(execute_sql "SELECT count(*) FROM pg_extension WHERE extname = 'pg_stat_statements';")
    # String compare: unlike `[ -eq ]`, this does not error when the
    # catalog query itself failed and returned an empty string.
    if [ "$extension_installed" = "1" ]; then
      echo "Top 10 slowest queries:"
      execute_sql "
SELECT
round(mean_exec_time::numeric, 2) as avg_time_ms,
calls,
round(total_exec_time::numeric, 2) as total_time_ms,
left(query, 100) as query_preview
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;
" | while read -r row; do
        log_metric " $row"
      done
    else
      log_warn "pg_stat_statements extension not available"
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    log_metric "SQLite slow query monitoring requires application-level logging"
  fi
}
# Check database locks
# Reports the current lock count and, when sessions are waiting on locks,
# prints blocked/blocking session pairs (PostgreSQL only).
check_locks() {
print_subheader "Database Locks"
if [ "$DB_TYPE" = "postgresql" ]; then
local lock_count=$(execute_sql "SELECT count(*) FROM pg_locks;")
log_metric "Active Locks: $lock_count"
# Check for blocking queries
local blocking_queries=$(execute_sql "
SELECT count(*)
FROM pg_stat_activity
WHERE wait_event_type = 'Lock';
")
# NOTE(review): if the count query fails, $blocking_queries is empty and
# this numeric test errors, killing the script under `set -e`.
if [ "$blocking_queries" -gt "0" ]; then
log_warn "Found $blocking_queries queries waiting for locks"
# Blocked/blocking pair query: self-join of pg_locks on every lockable
# object column (IS NOT DISTINCT FROM treats NULLs as equal), keeping
# only ungranted waiters and excluding a lock matched against itself.
execute_sql "
SELECT
blocked_locks.pid AS blocked_pid,
blocked_activity.usename AS blocked_user,
blocking_locks.pid AS blocking_pid,
blocking_activity.usename AS blocking_user,
blocked_activity.query AS blocked_statement,
blocking_activity.query AS current_statement_in_blocking_process
FROM pg_catalog.pg_locks blocked_locks
JOIN pg_catalog.pg_stat_activity blocked_activity ON blocked_activity.pid = blocked_locks.pid
JOIN pg_catalog.pg_locks blocking_locks ON blocking_locks.locktype = blocked_locks.locktype
AND blocking_locks.database IS NOT DISTINCT FROM blocked_locks.database
AND blocking_locks.relation IS NOT DISTINCT FROM blocked_locks.relation
AND blocking_locks.page IS NOT DISTINCT FROM blocked_locks.page
AND blocking_locks.tuple IS NOT DISTINCT FROM blocked_locks.tuple
AND blocking_locks.virtualxid IS NOT DISTINCT FROM blocked_locks.virtualxid
AND blocking_locks.transactionid IS NOT DISTINCT FROM blocked_locks.transactionid
AND blocking_locks.classid IS NOT DISTINCT FROM blocked_locks.classid
AND blocking_locks.objid IS NOT DISTINCT FROM blocked_locks.objid
AND blocking_locks.objsubid IS NOT DISTINCT FROM blocked_locks.objsubid
AND blocking_locks.pid != blocked_locks.pid
JOIN pg_catalog.pg_stat_activity blocking_activity ON blocking_activity.pid = blocking_locks.pid
WHERE NOT blocked_locks.granted;
" | while read line; do
log_warn " $line"
done
else
log_success "No blocking queries found"
fi
elif [ "$DB_TYPE" = "sqlite" ]; then
log_metric "SQLite uses file-level locking"
fi
}
# Report filesystem usage for the volume holding the database files and
# warn when it exceeds ALERT_THRESHOLD_DISK_USAGE percent.
check_disk_usage() {
  print_subheader "Disk Usage"
  local target_dir usage_pct
  case "$DB_TYPE" in
    postgresql)
      # Ask the server where its data directory lives.
      target_dir=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'data_directory';")
      if [ -z "$target_dir" ] || [ ! -d "$target_dir" ]; then
        log_warn "Could not determine PostgreSQL data directory"
        return
      fi
      usage_pct=$(df -h "$target_dir" | awk 'NR==2 {print $5}' | sed 's/%//')
      log_metric "Data Directory Disk Usage: ${usage_pct}%"
      if [ "$usage_pct" -gt "$ALERT_THRESHOLD_DISK_USAGE" ]; then
        log_warn "Disk usage is above ${ALERT_THRESHOLD_DISK_USAGE}%"
      fi
      ;;
    sqlite)
      target_dir=$(dirname "$DB_FILE")
      usage_pct=$(df -h "$target_dir" | awk 'NR==2 {print $5}' | sed 's/%//')
      log_metric "Database Directory Disk Usage: ${usage_pct}%"
      if [ "$usage_pct" -gt "$ALERT_THRESHOLD_DISK_USAGE" ]; then
        log_warn "Disk usage is above ${ALERT_THRESHOLD_DISK_USAGE}%"
      fi
      ;;
  esac
}
# Check memory usage
# Reports memory-related settings and, where possible, actual usage.
check_memory_usage() {
  print_subheader "Memory Usage"
  if [ "$DB_TYPE" = "postgresql" ]; then
    local shared_buffers work_mem maintenance_work_mem
    shared_buffers=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'shared_buffers';")
    work_mem=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'work_mem';")
    maintenance_work_mem=$(execute_sql "SELECT setting FROM pg_settings WHERE name = 'maintenance_work_mem';")
    log_metric "Shared Buffers: $shared_buffers"
    log_metric "Work Mem: $work_mem"
    log_metric "Maintenance Work Mem: $maintenance_work_mem"
    # Best-effort RSS total of postgres processes. `ps -C` is GNU-specific;
    # when it fails (or no postgres process exists) the original still
    # printed a bogus "0 MB", so only report when something was summed.
    if command -v ps >/dev/null 2>&1; then
      local postgres_memory
      postgres_memory=$(ps -o pid,vsz,rss,comm -C postgres --no-headers 2>/dev/null | awk '{rss_total += $3} END {if (rss_total > 0) print rss_total/1024 " MB"}')
      if [ -n "$postgres_memory" ]; then
        log_metric "PostgreSQL Memory Usage: $postgres_memory"
      fi
    fi
  elif [ "$DB_TYPE" = "sqlite" ]; then
    local cache_size page_size memory_usage_kb
    cache_size=$(execute_sql "PRAGMA cache_size;")
    page_size=$(execute_sql "PRAGMA page_size;")
    # Bug fix: PRAGMA cache_size is NEGATIVE when the cache is configured
    # in KiB (SQLite's default is -2000, i.e. 2000 KiB); the original
    # arithmetic then reported a negative "KB" figure.
    if [ "${cache_size:-0}" -lt 0 ]; then
      memory_usage_kb=$(( -cache_size ))
    else
      memory_usage_kb=$((cache_size * page_size / 1024))
    fi
    log_metric "SQLite Cache Memory: ${memory_usage_kb} KB"
  fi
}
# Inspect the local backups/ directory (relative to the cwd) and flag
# missing or stale (>24h old) backups.
check_backup_status() {
  print_subheader "Backup Status"
  local backup_dir="backups"
  if [ ! -d "$backup_dir" ]; then
    log_warn "Backup directory not found: $backup_dir"
    return
  fi
  local backup_count
  backup_count=$(find "$backup_dir" -name "*.sql*" -o -name "*.dump*" -o -name "*.tar*" 2>/dev/null | wc -l)
  log_metric "Available Backups: $backup_count"
  if [ "$backup_count" -eq 0 ]; then
    log_warn "No backups found"
    return
  fi
  # Newest backup by lexicographic sort of paths (works for timestamped names).
  local newest
  newest=$(find "$backup_dir" -name "*.sql*" -o -name "*.dump*" -o -name "*.tar*" 2>/dev/null | sort | tail -1)
  if [ -n "$newest" ]; then
    local stale_flag backup_date
    # find -mtime +1 prints the file only when it is older than a day.
    stale_flag=$(find "$newest" -mtime +1 2>/dev/null | wc -l)
    backup_date=$(date -r "$newest" '+%Y-%m-%d %H:%M:%S' 2>/dev/null || echo "Unknown")
    log_metric "Latest Backup: $(basename "$newest") ($backup_date)"
    if [ "$stale_flag" -gt 0 ]; then
      log_warn "Latest backup is older than 24 hours"
    fi
  fi
}
# Reclaim dead space (and, on PostgreSQL, refresh planner statistics).
perform_vacuum() {
  print_subheader "Database Maintenance (VACUUM)"
  case "$DB_TYPE" in
    postgresql)
      log "Running VACUUM ANALYZE on all tables..."
      execute_sql "VACUUM ANALYZE;" >/dev/null 2>&1
      log_success "VACUUM ANALYZE completed"
      ;;
    sqlite)
      log "Running VACUUM on SQLite database..."
      execute_sql "VACUUM;" >/dev/null 2>&1
      log_success "VACUUM completed"
      ;;
  esac
}
# Refresh the query planner's table statistics via ANALYZE.
update_statistics() {
  print_subheader "Update Database Statistics"
  case "$DB_TYPE" in
    postgresql)
      log "Running ANALYZE on all tables..."
      execute_sql "ANALYZE;" >/dev/null 2>&1
      log_success "ANALYZE completed"
      ;;
    sqlite)
      log "Running ANALYZE on SQLite database..."
      execute_sql "ANALYZE;" >/dev/null 2>&1
      log_success "ANALYZE completed"
      ;;
  esac
}
# Emit a full health report: banner, report metadata, then every check
# section in a fixed order, each followed by a blank line.
generate_report() {
  print_header "Database Health Report"
  echo "Report generated on: $(date)"
  echo "Database Type: $DB_TYPE"
  echo "Database Name: $DB_NAME"
  echo "Environment: $ENVIRONMENT"
  echo
  local section
  for section in \
    check_connectivity \
    check_version \
    check_database_size \
    check_connections \
    check_performance \
    check_slow_queries \
    check_locks \
    check_disk_usage \
    check_memory_usage \
    check_backup_status; do
    "$section"
    echo
  done
  print_header "Report Complete"
}
# Redraw a compact status dashboard; repeats every MONITOR_INTERVAL
# seconds while CONTINUOUS=true, otherwise renders a single frame.
start_monitoring() {
  print_header "Starting Database Monitoring"
  log "Monitoring interval: ${MONITOR_INTERVAL} seconds"
  log "Press Ctrl+C to stop monitoring"
  local keep_running=true
  while [ "$keep_running" = true ]; do
    clear
    echo "=== Database Monitor - $(date) ==="
    echo
    # Connectivity shown as a one-line status; detailed output suppressed.
    if check_connectivity >/dev/null 2>&1; then
      echo "✅ Database connectivity: OK"
    else
      echo "❌ Database connectivity: FAILED"
    fi
    check_connections
    echo
    check_performance
    echo
    if [ "$CONTINUOUS" = "true" ]; then
      sleep "$MONITOR_INTERVAL"
    else
      keep_running=false
    fi
  done
}
# Parse command line arguments
# The first bare word becomes COMMAND; options may appear before or after it.
# NOTE(review): FORMAT and QUIET are parsed but not referenced in the visible
# code; `shift 2` with a missing option argument aborts under `set -e` with
# no error message.
COMMAND=""
ENVIRONMENT="dev"
FORMAT="table"
CONTINUOUS="false"
QUIET="false"
while [[ $# -gt 0 ]]; do
case $1 in
--env)
ENVIRONMENT="$2"
shift 2
;;
--interval)
MONITOR_INTERVAL="$2"
shift 2
;;
--log-file)
LOG_FILE="$2"
shift 2
;;
--threshold-conn)
ALERT_THRESHOLD_CONNECTIONS="$2"
shift 2
;;
--threshold-disk)
ALERT_THRESHOLD_DISK_USAGE="$2"
shift 2
;;
--threshold-mem)
ALERT_THRESHOLD_MEMORY_USAGE="$2"
shift 2
;;
--threshold-query)
ALERT_THRESHOLD_QUERY_TIME="$2"
shift 2
;;
--format)
FORMAT="$2"
shift 2
;;
--continuous)
CONTINUOUS="true"
shift
;;
--quiet)
QUIET="true"
shift
;;
-h|--help)
print_usage
exit 0
;;
*)
# First bare word is the command; any further bare word is rejected.
if [ -z "$COMMAND" ]; then
COMMAND="$1"
else
log_error "Unknown option: $1"
print_usage
exit 1
fi
shift
;;
esac
done
# Set environment variable
export ENVIRONMENT="$ENVIRONMENT"
# Validate command
if [ -z "$COMMAND" ]; then
print_usage
exit 1
fi
# Check if we're in the right directory
# Guard: after the `cd "$PROJECT_ROOT"` near the top, the cwd must be the
# Rust project root for relative paths (.env, backups/) to resolve.
if [ ! -f "Cargo.toml" ]; then
log_error "Please run this script from the project root directory"
exit 1
fi
# Load environment and parse database URL
load_env
parse_database_url
# Execute command
# Dispatch: each command maps onto one or more check functions above.
# NOTE(review): "health" prints a header and then generate_report prints
# its own header, so the health output carries two banners.
case "$COMMAND" in
"health")
print_header "Complete Health Check"
generate_report
;;
"status")
print_header "Quick Status Check"
check_connectivity
check_connections
;;
"connections")
check_connections
;;
"performance")
check_performance
;;
"slow-queries")
check_slow_queries
;;
"locks")
check_locks
;;
"disk-usage")
check_disk_usage
;;
"memory-usage")
check_memory_usage
;;
"backup-status")
check_backup_status
;;
"replication")
log_warn "Replication monitoring not yet implemented"
;;
"monitor")
start_monitoring
;;
"alerts")
log_warn "Alert system not yet implemented"
;;
"vacuum")
perform_vacuum
;;
"analyze")
update_statistics
;;
"report")
generate_report
;;
*)
log_error "Unknown command: $COMMAND"
print_usage
exit 1
;;
esac