# prvng_core/nulib/servers/upgrade.nu

# Selective imports replacing `use lib_provisioning *` (ADR-025 Phase 4).
use lib_provisioning/result.nu [ok]
use lib_provisioning/utils/interface.nu [_print]
use lib_provisioning/utils/logging.nu [set-debug-enabled]
use lib_provisioning/utils/settings.nu [find_get_settings load]
use lib_provisioning/utils/ssh.nu [ssh_cmd]
use utils.nu *
# > Server upgrade — detect server_type drift and apply changes via provider API.
#
# Compares servers.ncl (desired server_type) against the live provider state.
# If a mismatch is found, executes: shutdown → change_type → start.
#
# Usage:
# provisioning server upgrade sgoyol-cp -i sgoyol # upgrade one server
# provisioning server upgrade -i sgoyol # check all, upgrade drifted
# provisioning server upgrade sgoyol-cp -i sgoyol --check # dry-run, show drift only
export def "main upgrade" [
    name?: string            # Server hostname (optional, all servers if omitted)
    --infra (-i): string     # Infra directory
    --settings (-s): string  # Settings path
    --check (-c)             # Dry-run: show drift without applying
    --yes (-y)               # Skip confirmation prompt
    --debug (-x)             # Debug mode
    --helpinfo (-h)          # Help
] {
    # Flow: resolve infras → collect per-server drift (desired vs. live
    # server_type) → print the drift table → unless --check, confirm and run
    # shutdown → change-type → poweron for every drifted server.
    # Returns nothing; all results are reported through _print/print.
    if $helpinfo {
        _print "Usage: provisioning server upgrade [hostname] -i <infra> [--check] [--yes]"
        _print ""
        _print " Detects server_type drift between servers.ncl and provider."
        _print " If drift found: shutdown → change_type → start."
        _print ""
        _print " --check Show drift without applying"
        _print " --yes Skip confirmation"
        return
    }
    if $debug { set-debug-enabled true }

    # Discover infras: explicit -i, or scan all infra dirs with settings.ncl
    let infra_list = if ($infra | is-not-empty) {
        [$infra]
    } else {
        let ws_path = ($env.PROVISIONING_WORKSPACE_PATH? | default $env.PWD)
        let infra_dir = ($ws_path | path join "infra")
        if not ($infra_dir | path exists) {
            _print "No infra/ directory found. Use -i <infra> or run from a workspace."
            return
        }
        ls $infra_dir
        | where type == "dir"
        | where { ($in.name | path join "settings.ncl" | path exists) }
        | each {|d| $d.name | path basename }
    }
    if ($infra_list | is-empty) {
        _print "No infras with settings.ncl found."
        return
    }

    # Collect drift across all infras
    mut all_drift = []
    mut all_settings = []
    for infra_name in $infra_list {
        # `try` (not `do`) is required for the `catch` branch to actually run:
        # a failing settings load must skip this infra, not abort the command.
        let curr_settings = (try { find_get_settings --infra $infra_name --settings $settings } catch { null })
        if ($curr_settings == null) {
            _print $"⚠ ($infra_name): cannot load settings — skipping"
            continue
        }
        let servers = $curr_settings.data.servers
        let live_data = (do { mw_query_servers $curr_settings "" "" } | default [])
        let drift = ($servers | each {|srv|
            # When a hostname was given, ignore every other server.
            if ($name | is-not-empty) and $srv.hostname != $name { return null }
            let desired_type = ($srv.server_type? | default "")
            let live = ($live_data | where {|l| $l.name == $srv.hostname } | get 0? | default null)
            let actual_type = if $live != null { $live.server_type?.name? | default "unknown" } else { "not found" }
            let status = if $live != null { $live.status? | default "unknown" } else { "not found" }
            # Drift only counts when both sides are known. An empty desired
            # type (server_type missing in settings) must never trigger a
            # change-type call with an empty argument.
            let needs_upgrade = (($desired_type | is-not-empty) and $desired_type != $actual_type and $actual_type != "not found" and $actual_type != "unknown")
            {
                infra: $infra_name,
                hostname: $srv.hostname,
                desired_type: $desired_type,
                actual_type: $actual_type,
                status: $status,
                drift: (if $needs_upgrade { "upgrade" } else { "ok" }),
                provider: ($srv.provider? | default "hetzner"),
            }
        } | where {|it| $it != null })
        $all_drift = ($all_drift | append $drift)
        $all_settings = ($all_settings | append { infra: $infra_name, settings: $curr_settings })
    }

    print ($all_drift | select infra hostname desired_type actual_type status drift | table)
    let to_upgrade = ($all_drift | where drift == "upgrade")
    if ($to_upgrade | is-empty) {
        _print "\n✅ No server type drift — all servers match settings"
        return
    }
    _print $"\n($to_upgrade | length) server\(s\) need upgrade:"
    for srv in $to_upgrade {
        _print $" ($srv.infra)/($srv.hostname): ($srv.actual_type) → ($srv.desired_type)"
    }
    if $check {
        _print "\n(--check: no changes applied)"
        return
    }
    if not $yes {
        _print $"\nUpgrade requires shutdown → change_type → start. Continue? Type yes: "
        let input = (input --numchar 3)
        if $input != "yes" and $input != "YES" {
            _print "Aborted."
            return
        }
    }

    # Execute upgrades.
    # NOTE(review): `provider` is recorded in the drift rows but the steps below
    # always shell out to `hcloud` — confirm non-Hetzner servers never get here.
    # Hostnames that failed a step or did not come back up, for the final summary.
    mut needs_attention = []
    for srv_drift in $to_upgrade {
        let infra_settings = ($all_settings | where infra == $srv_drift.infra | get 0?).settings
        let srv = ($infra_settings.data.servers | where hostname == $srv_drift.hostname | get 0?)
        if ($srv | is-empty) { continue }
        let hn = $srv_drift.hostname
        _print $"\n── ($srv_drift.infra)/($hn): ($srv_drift.actual_type) → ($srv_drift.desired_type) ──"

        # 1. Shutdown
        _print " ⏹ Shutting down ..."
        let res_shutdown = (do { ^hcloud server shutdown $hn } | complete)
        if $res_shutdown.exit_code != 0 {
            _print $" 🛑 shutdown failed: ($res_shutdown.stderr)"
            $needs_attention = ($needs_attention | append $hn)
            continue
        }

        # 2. Wait for server to be off (up to 30 polls, 5s apart)
        _print " ⏳ Waiting for server to stop ..."
        mut is_off = false
        for _ in 1..30 {
            # A failed/garbled describe call counts as "unknown" and is retried
            # instead of aborting the run while the server is shutting down.
            let status = (try { ^hcloud server describe $hn -o json | from json | get status } catch { "unknown" })
            if $status == "off" {
                $is_off = true
                break
            }
            sleep 5sec
        }
        if not $is_off {
            _print $" 🛑 ($hn) did not stop — skipping"
            $needs_attention = ($needs_attention | append $hn)
            continue
        }

        # 3. Change type
        _print $" 🔄 Changing type to ($srv_drift.desired_type) ..."
        let res_change = (do { ^hcloud server change-type $hn $srv_drift.desired_type } | complete)
        if $res_change.exit_code != 0 {
            _print $" 🛑 change-type failed: ($res_change.stderr)"
            _print " ▶ Restarting server with original type ..."
            # Best-effort rollback: capture the result so a failing poweron is
            # reported and does not abort the remaining upgrades.
            let res_rollback = (do { ^hcloud server poweron $hn } | complete)
            if $res_rollback.exit_code != 0 {
                _print $" 🛑 poweron failed: ($res_rollback.stderr)"
            }
            $needs_attention = ($needs_attention | append $hn)
            continue
        }

        # 4. Start
        _print " ▶ Starting ..."
        let res_start = (do { ^hcloud server poweron $hn } | complete)
        if $res_start.exit_code != 0 {
            _print $" 🛑 poweron failed: ($res_start.stderr)"
            $needs_attention = ($needs_attention | append $hn)
            continue
        }

        # 5. Wait for running (up to 30 polls, 5s apart)
        _print " ⏳ Waiting for server to start ..."
        mut is_running = false
        for _ in 1..30 {
            let status = (try { ^hcloud server describe $hn -o json | from json | get status } catch { "unknown" })
            if $status == "running" {
                $is_running = true
                break
            }
            sleep 5sec
        }
        if $is_running {
            # Post-upgrade: ensure critical services are running after reboot.
            # The shutdown → change-type → poweron cycle can leave services in
            # bad/inactive state if systemd symlinks were disrupted.
            _print " 🔧 Ensuring services are active ..."
            # IP lookup is best-effort: on failure the service check is skipped.
            let ip = (try { mw_get_ip $infra_settings $srv "public" false } catch { "" })
            if ($ip | is-not-empty) {
                let svc_cmd = "for svc in containerd kubelet etcd coredns; do systemctl is-enabled $svc 2>/dev/null | grep -q enabled && systemctl start $svc 2>/dev/null; done; sleep 2; systemctl is-active containerd kubelet 2>&1"
                ssh_cmd $infra_settings $srv false $svc_cmd $ip
            }
            _print $" ✅ ($hn) upgraded to ($srv_drift.desired_type)"
        } else {
            _print $" ⚠ ($hn) changed but not yet running — check manually"
            $needs_attention = ($needs_attention | append $hn)
        }
    }

    # Only claim success when every server went through cleanly.
    if ($needs_attention | is-empty) {
        _print $"\n✅ Upgrade complete"
    } else {
        let attention_list = ($needs_attention | str join ", ")
        _print $"\n⚠ Upgrade finished — servers needing attention: ($attention_list)"
    }
}