388 lines
10 KiB
Plaintext
388 lines
10 KiB
Plaintext
|
|
#!/usr/bin/env nu
|
||
|
|
|
||
|
|
# VAPORA Backup Health Verification Script
|
||
|
|
# Checks backup integrity, rotation, and recovery readiness
|
||
|
|
# Follows NUSHELL_GUIDELINES.md strictly (0.109.0+)
|
||
|
|
|
||
|
|
# Return the current time formatted as `YYYYMMDD-HHMMSS`.
#
# The value comes from `date now`, so it reflects whatever clock and time zone
# the running shell reports.
def get-timestamp []: nothing -> string {
    let now = (date now)
    $now | format date "%Y%m%d-%H%M%S"
}
|
||
|
|
|
||
|
|
# Check that the S3 backup location can be listed and actually contains objects.
#
# Runs `aws s3 ls --recursive --human-readable` on s3://<bucket>/<prefix>/ and
# returns a record with these keys:
#   success        - true only when the listing succeeded AND is non-empty
#   count          - number of non-blank listing lines
#   latest_backup  - last listing line (trimmed), or null on failure
#   error          - failure description, or null on success
def verify-s3-backup [
    s3_bucket: string  # Bucket name, without the s3:// scheme
    s3_prefix: string  # Key prefix under which the backups live
]: nothing -> record {
    # Nushell only interpolates parenthesised expressions inside $"...";
    # the previous `[$var]` form printed the variable names literally.
    print $"Checking S3 backups in ($s3_bucket)/($s3_prefix)..."

    let result = do {
        ^aws s3 ls $"s3://($s3_bucket)/($s3_prefix)/" --recursive --human-readable
    } | complete

    if $result.exit_code != 0 {
        return {
            success: false
            count: 0
            latest_backup: null
            error: ($result.stderr | str trim)
        }
    }

    let backups = (
        $result.stdout
        | lines
        | where {|line| $line | str trim | is-not-empty }
    )

    # A listing that succeeds but is empty means there is nothing to restore
    # from; report it as a failure rather than passing nothing to `last`.
    if ($backups | is-empty) {
        return {
            success: false
            count: 0
            latest_backup: null
            error: "No backups found"
        }
    }

    {
        success: true
        count: ($backups | length)
        # NOTE(review): this is the last line of the listing, which is the
        # newest backup only if object keys sort chronologically - confirm.
        latest_backup: ($backups | last | str trim)
        error: null
    }
}
|
||
|
|
|
||
|
|
# Check Restic repository health by querying its stats and snapshot list.
#
# Returns a record with these keys:
#   success         - true when both `restic stats` and `restic list snapshots`
#                     exit with status 0
#   repo_size       - trimmed stdout of `restic stats --mode raw`, or null
#   snapshot_count  - number of non-blank lines printed by `list snapshots`
#   error           - failure description, or null on success
def verify-restic-repo [
    repo_path: string  # Value passed to `restic -r`
    password: string   # Repository password, exported as RESTIC_PASSWORD
]: nothing -> record {
    print $"Checking Restic repository ($repo_path)..."

    # The password is handed over through the environment and the repo path as
    # a discrete argument. The previous `bash -c "RESTIC_PASSWORD=... restic -r ..."`
    # string let shell metacharacters in either value be interpreted by bash,
    # and its `2>&1` left stderr empty so failures were reported with no text.
    let stats_result = (
        with-env { RESTIC_PASSWORD: $password } {
            ^restic -r $repo_path stats --mode raw | complete
        }
    )

    if $stats_result.exit_code != 0 {
        return {
            success: false
            repo_size: null
            snapshot_count: 0
            error: ($stats_result.stderr | str trim)
        }
    }

    let snapshots_result = (
        with-env { RESTIC_PASSWORD: $password } {
            ^restic -r $repo_path list snapshots | complete
        }
    )

    if $snapshots_result.exit_code != 0 {
        let detail = ($snapshots_result.stderr | str trim)
        return {
            success: false
            repo_size: null
            snapshot_count: 0
            error: (
                if ($detail | is-empty) {
                    "Failed to list snapshots"
                } else {
                    $"Failed to list snapshots: ($detail)"
                }
            )
        }
    }

    # Count only non-blank lines of stdout.
    # NOTE(review): assumes `restic list snapshots` prints one ID per line - confirm.
    let snapshot_count = (
        $snapshots_result.stdout
        | lines
        | where {|line| $line | str trim | is-not-empty }
        | length
    )

    {
        success: true
        repo_size: ($stats_result.stdout | str trim)
        snapshot_count: $snapshot_count
        error: null
    }
}
|
||
|
|
|
||
|
|
# Verify database connectivity by asking the `surreal` CLI for its namespaces.
#
# Returns a record with these keys:
#   success     - true when the CLI exits with status 0
#   namespaces  - number of stdout lines returned
#   databases   - the same stdout lines joined with ", " (null on failure)
#   error       - trimmed stderr on failure, null on success
def verify-database [
    surreal_url: string   # Connection URL passed to --conn
    surreal_user: string  # User passed to --user
    surreal_pass: string  # Password passed to --pass
]: nothing -> record {
    print $"Checking database connectivity ($surreal_url)..."

    # Kept on one line: Nushell has no backslash line continuation, so the
    # previous trailing `\` was passed as an argument and `--user ...` on the
    # following line was parsed as a separate command.
    # NOTE(review): confirm that the installed `surreal` CLI really provides a
    # `list namespaces` subcommand with these flags.
    # NOTE(review): the password is visible in the process argument list.
    let result = do {
        ^surreal list namespaces --conn $surreal_url --user $surreal_user --pass $surreal_pass
    } | complete

    if $result.exit_code != 0 {
        return {
            success: false
            namespaces: 0
            databases: null
            error: ($result.stderr | str trim)
        }
    }

    let namespaces = ($result.stdout | lines)

    {
        success: true
        namespaces: ($namespaces | length)
        databases: ($namespaces | str join ", ")
        error: null
    }
}
|
||
|
|
|
||
|
|
# Check backup freshness: how many hours old is the newest object in S3?
#
# Lists s3://<bucket>/<prefix>/ recursively, parses the date and time columns
# of each listing line, and compares the newest timestamp with the current time.
#
# Returns a record with these keys:
#   success                  - false when the listing failed or no timestamp
#                              could be parsed
#   latest_backup_age_hours  - whole hours since the newest object
#                              (-1 when unknown, 999 when there are no backups)
#   is_fresh                 - true when the age is below max_age_hours
#   latest_backup            - listing line of the newest object, or null
#   error                    - failure description, or null
def check-backup-age [
    s3_bucket: string   # Bucket name, without the s3:// scheme
    s3_prefix: string   # Key prefix under which the backups live
    max_age_hours: int  # Backups this old (in hours) or older count as stale
]: nothing -> record {
    # Literal parentheses must be escaped inside $"..."; unescaped they are
    # parsed as a subexpression.
    print $"Checking backup freshness \(max age: ($max_age_hours) hours\)..."

    let result = do {
        ^aws s3 ls $"s3://($s3_bucket)/($s3_prefix)/" --recursive --human-readable
    } | complete

    if $result.exit_code != 0 {
        return {
            success: false
            latest_backup_age_hours: -1
            is_fresh: false
            latest_backup: null
            error: ($result.stderr | str trim)
        }
    }

    let backups = (
        $result.stdout
        | lines
        | where {|line| $line | str trim | is-not-empty }
    )

    if ($backups | is-empty) {
        return {
            success: true
            latest_backup_age_hours: 999
            is_fresh: false
            latest_backup: null
            error: "No backups found"
        }
    }

    # Pair every listing line with its parsed timestamp; lines whose first two
    # columns are not a date and time are dropped.
    # NOTE(review): assumes each line starts with `YYYY-MM-DD HH:MM:SS` and that
    # `into datetime` reads it in the same time zone the aws CLI printed it in
    # - confirm.
    let dated = (
        $backups
        | each {|line|
            let trimmed = ($line | str trim)
            let fields = ($trimmed | split row --regex '\s+')
            let stamp = if ($fields | length) >= 2 {
                try { $"($fields.0) ($fields.1)" | into datetime } catch { null }
            } else {
                null
            }
            if $stamp != null {
                { stamp: $stamp, line: $trimmed }
            }
        }
        | compact
    )

    if ($dated | is-empty) {
        return {
            success: false
            latest_backup_age_hours: -1
            is_fresh: false
            latest_backup: null
            error: "Could not parse any backup timestamps"
        }
    }

    # Pick the newest by timestamp rather than by position in the listing.
    let newest = ($dated | sort-by stamp | last)

    # `into int` on a duration yields nanoseconds; floor-divide to whole hours.
    let age_hours = ((((date now) - $newest.stamp) | into int) // 3_600_000_000_000)

    {
        success: true
        latest_backup_age_hours: $age_hours
        is_fresh: ($age_hours < $max_age_hours)
        latest_backup: $newest.line
        error: null
    }
}
|
||
|
|
|
||
|
|
# Check backup rotation by counting listing lines per retention tier.
#
# A listing line counts towards a tier when it contains the substring "daily",
# "weekly" or "monthly" anywhere in the line.
#
# Returns a record with the same keys on success and on failure:
#   success, daily_count, weekly_count, monthly_count, total_backups, error
def check-backup-rotation [
    s3_bucket: string  # Bucket name, without the s3:// scheme
    s3_prefix: string  # Key prefix under which the backups live
]: nothing -> record {
    print "Checking backup rotation policy..."

    let result = do {
        ^aws s3 ls $"s3://($s3_bucket)/($s3_prefix)/" --recursive --human-readable
    } | complete

    if $result.exit_code != 0 {
        return {
            success: false
            daily_count: 0
            weekly_count: 0
            monthly_count: 0
            # Present so that both outcomes share one record shape.
            total_backups: 0
            error: ($result.stderr | str trim)
        }
    }

    let backups = ($result.stdout | lines)

    # Number of listing lines containing the given tier name.
    let count_tier = {|tier: string|
        $backups | where {|entry| $entry | str contains $tier } | length
    }

    {
        success: true
        daily_count: (do $count_tier "daily")
        weekly_count: (do $count_tier "weekly")
        monthly_count: (do $count_tier "monthly")
        total_backups: ($backups | length)
        error: null
    }
}
|
||
|
|
|
||
|
|
# Exercise the restore path against a temporary scratch directory.
#
# This is a simplified smoke test: it creates a scratch directory under
# work_dir, confirms the backup location can be listed, and removes the scratch
# directory again. No backup is downloaded or decrypted; encryption_key is
# accepted but not used.
#
# Returns a record with these keys:
#   success        - true when every step, including cleanup, succeeded
#   test_result    - short human-readable outcome
#   duration_secs  - whole seconds elapsed (0 when the directory could not be
#                    created)
#   error          - failure description, or null on success
def test-restore-procedure [
    s3_bucket: string       # Bucket name, without the s3:// scheme
    s3_prefix: string       # Key prefix under which the backups live
    encryption_key: string  # Currently unused
    work_dir: string        # Parent directory for the scratch directory
]: nothing -> record {
    print "Testing restore procedure..."

    let started = (date now)

    # `(get-timestamp)` is the interpolation; the previous `$(get-timestamp)`
    # left a literal `$` in the directory name.
    let test_path = $"($work_dir)/test-restore-(get-timestamp)"

    let create = do {
        ^mkdir -p $test_path
    } | complete

    if $create.exit_code != 0 {
        return {
            success: false
            test_result: "Failed to create test directory"
            duration_secs: 0
            error: "Mkdir failed"
        }
    }

    # Simulate locating the latest backup (simplified: listing only).
    let list_result = do {
        ^aws s3 ls $"s3://($s3_bucket)/($s3_prefix)/" --recursive --human-readable
    } | complete

    # Remove the scratch directory before looking at the listing result so it
    # is not left behind when the listing failed.
    let cleanup = do {
        ^rm -rf $test_path
    } | complete

    # `into int` on a duration yields nanoseconds; floor-divide to whole seconds.
    let duration_secs = ((((date now) - $started) | into int) // 1_000_000_000)

    if $list_result.exit_code != 0 {
        return {
            success: false
            test_result: "No backups found to test"
            duration_secs: $duration_secs
            error: ($list_result.stderr | str trim)
        }
    }

    if $cleanup.exit_code != 0 {
        return {
            success: false
            test_result: "Restore test completed"
            duration_secs: $duration_secs
            error: $"Cleanup failed: ($cleanup.stderr | str trim)"
        }
    }

    {
        success: true
        test_result: "Restore test completed"
        duration_secs: $duration_secs
        error: null
    }
}
|
||
|
|
|
||
|
|
# Gather the individual health check results into one list.
#
# Each element of `items` is appended, in order, to an initially empty list.
def collect-checks [items: list]: nothing -> list {
    let gathered = (
        $items
        | reduce --fold [] {|check, collected|
            $collected | append $check
        }
    )
    $gathered
}
|
||
|
|
|
||
|
|
# Main health check: run every configured verification and print a summary.
#
# A check whose prerequisite option was not supplied is marked as skipped and
# is excluded from the success/failure tally. The script exits with status 1
# when at least one check that actually ran has failed.
def main [
    --s3-bucket: string = ""                    # Bucket holding the backups; empty skips all S3 checks
    --s3-prefix: string = "backups/database"    # Key prefix of the backups inside the bucket
    --restic-repo: string = ""                  # Restic repository; empty skips the Restic check
    --restic-password: string = ""              # Restic repository password
    --surreal-url: string = "ws://localhost:8000"  # Database connection URL
    --surreal-user: string = "root"             # Database user
    --surreal-pass: string = ""                 # Database password; empty skips the database check
    --max-age-hours: int = 25                   # Backups this old (hours) or older are stale
    --work-dir: string = "/tmp/vapora-verify"   # Parent directory for restore-test scratch data
    --full-test                                 # Also run the restore smoke test
]: nothing -> nothing {
    # Placeholder result for checks that did not run.
    let skipped = { success: false, skipped: true, error: "skipped" }

    # Nushell interpolates only parenthesised expressions inside $"...", and
    # literal parentheses have to be written as \( and \).
    print "=== VAPORA Backup Health Verification ==="
    print $"Timestamp: (get-timestamp)"
    print ""

    # S3 backup check
    let s3_check = if ($s3_bucket != "") {
        let result = (verify-s3-backup $s3_bucket $s3_prefix)
        if ($result.success) {
            print $"✓ S3 Backups: ($result.count) found"
            print $"  Latest: ($result.latest_backup)"
        } else {
            print $"✗ S3 Check failed: ($result.error)"
        }
        $result
    } else {
        print "⊘ S3 check skipped (no --s3-bucket)"
        $skipped
    }

    # Restic repository check
    let restic_check = if ($restic_repo != "") {
        let result = (verify-restic-repo $restic_repo $restic_password)
        if ($result.success) {
            print $"✓ Restic Repository: ($result.snapshot_count) snapshots"
            print $"  Repository size: ($result.repo_size)"
        } else {
            print $"✗ Restic check failed: ($result.error)"
        }
        $result
    } else {
        print "⊘ Restic check skipped (no --restic-repo)"
        $skipped
    }

    # Database check
    let db_check = if ($surreal_pass != "") {
        let result = (verify-database $surreal_url $surreal_user $surreal_pass)
        if ($result.success) {
            print $"✓ Database: Connected \(($result.namespaces) namespaces\)"
        } else {
            print $"✗ Database check failed: ($result.error)"
        }
        $result
    } else {
        print "⊘ Database check skipped (no --surreal-pass)"
        $skipped
    }

    # Backup freshness check
    let age_check = if ($s3_bucket != "") {
        let result = (check-backup-age $s3_bucket $s3_prefix $max_age_hours)
        if ($result.success) {
            if ($result.is_fresh) {
                print $"✓ Backup Freshness: Fresh \(age: ($result.latest_backup_age_hours)h\)"
            } else {
                print $"✗ Backup Freshness: STALE \(age: ($result.latest_backup_age_hours)h\)"
            }
        } else {
            print $"⚠ Backup freshness unknown: ($result.error)"
        }
        $result
    } else {
        $skipped
    }

    # Backup rotation check
    let rotation_check = if ($s3_bucket != "") {
        let result = (check-backup-rotation $s3_bucket $s3_prefix)
        if ($result.success) {
            print $"✓ Backup Rotation: Daily: ($result.daily_count), Weekly: ($result.weekly_count), Monthly: ($result.monthly_count)"
        } else {
            print $"✗ Rotation check failed: ($result.error)"
        }
        $result
    } else {
        $skipped
    }

    # Full restore test (if requested); it needs a bucket to list.
    let restore_check = if $full_test {
        print ""
        if ($s3_bucket != "") {
            print "Running full restore test..."
            let result = (test-restore-procedure $s3_bucket $s3_prefix "" $work_dir)
            if ($result.success) {
                print $"✓ Restore test passed \(($result.duration_secs)s\)"
            } else {
                print $"✗ Restore test failed: ($result.error)"
            }
            $result
        } else {
            print "⊘ Restore test skipped (no --s3-bucket)"
            $skipped
        }
    } else {
        $skipped
    }

    # Summary
    print ""
    print "=== Health Check Summary ==="
    let all_checks = (collect-checks [
        $s3_check
        $restic_check
        $db_check
        $age_check
        $rotation_check
        $restore_check
    ])

    # Only checks that actually ran take part in the tally; results coming from
    # the check functions carry no `skipped` key, hence the optional access.
    let executed = ($all_checks | where {|c| not ($c.skipped? | default false) })
    let successful = ($executed | where {|c| $c.success } | length)
    let failed = ($executed | where {|c| not $c.success } | length)
    let skipped_count = (($all_checks | length) - ($executed | length))

    print $"Successful checks: ($successful)"
    print $"Failed checks: ($failed)"
    print $"Skipped checks: ($skipped_count)"
    print $"Timestamp: (get-timestamp)"

    if ($failed > 0) {
        print ""
        print "⚠ Some health checks failed. Review log above."
        exit 1
    }
}
|