prvng_core/nulib/workspace/sync.nu

150 lines
7 KiB
Text

# provisioning sync — reconcile .provisioning-state.ncl against external APIs.
# Sources: Hetzner API (server existence/status), K8s API (pod/deploy readiness), SSH probe.
# Never marks a taskserv 'completed without positive confirmation.
# Ambiguous or timed-out probes write 'unknown.
use state.nu *
# Selective imports replacing `use ../lib_provisioning *` (ADR-025 Phase 4).
use lib_provisioning/utils/interface.nu [_print]
use lib_provisioning/result.nu [err]
# ─── Provider probe ───────────────────────────────────────────────────────────
# Query the provider for a server and summarise it as { provider_id, provider_state }.
#
# Runs `mw_server_info $server true` and parses its stdout as JSON, reading the
# optional `status` and `id` fields of the resulting object.
#
# Parameters:
#   settings - accepted for signature compatibility; not read by this probe.
#   server   - server record forwarded to `mw_server_info`.
#
# Returns:
#   { provider_id: <string>, provider_state: "running" | "off" | "unknown" }.
#   { provider_id: "", provider_state: "unknown" } is returned when the call
#   fails, prints nothing, prints malformed JSON, or prints JSON that is not an
#   object.
def probe-hetzner [settings: record, server: record]: nothing -> record {
    let unknown = { provider_id: "", provider_state: "unknown" }

    let info = (do { mw_server_info $server true } | complete)
    if $info.exit_code != 0 or ($info.stdout | is-empty) {
        return $unknown
    }

    # `try`, not `do`: only `try { } catch { }` intercepts the error that
    # `from json` raises on malformed output.
    let parsed = (try { $info.stdout | from json } catch { null })

    # A bare JSON scalar or array cannot carry the status/id fields read below.
    let is_object = ($parsed | describe | str starts-with "record")
    if ($parsed | is-empty) or (not $is_object) {
        return $unknown
    }

    # Only states the sync logic knows how to act on are passed through;
    # every other provider status collapses to "unknown".
    let raw_state = ($parsed.status? | default "unknown" | str downcase)
    let mapped = match $raw_state {
        "running" => "running",
        "off" => "off",
        _ => "unknown",
    }

    {
        provider_id: ($parsed.id? | default "" | into string),
        provider_state: $mapped,
    }
}
# ─── K8s probe ────────────────────────────────────────────────────────────────
# Check whether a K8s deployment or daemonset reports at least one ready pod.
#
# Queries the workload's status through `kubectl ... -o jsonpath=...` and
# returns true only when the reported ready count parses as an integer > 0.
# A non-zero kubectl exit code, empty output, or unparsable output all yield
# false.
#
# Parameters:
#   kubeconfig    - path passed to `kubectl --kubeconfig`.
#   resource_type - deployment | daemonset.
#   name          - resource name.
#   namespace     - namespace to query (default "kube-system").
def probe-k8s-ready [
    kubeconfig: string
    resource_type: string  # deployment | daemonset
    name: string
    namespace: string = "kube-system"
]: nothing -> bool {
    # DaemonSet status has no `readyReplicas`; its ready count is `numberReady`.
    let is_daemonset = (($resource_type | str downcase) in ["daemonset", "daemonsets", "ds"])
    let ready_field = if $is_daemonset { "numberReady" } else { "readyReplicas" }
    let output_spec = $"jsonpath={.status.($ready_field)}"

    let result = (do {
        ^kubectl --kubeconfig $kubeconfig -n $namespace get $resource_type $name -o $output_spec err> /dev/null
    } | complete)
    if $result.exit_code != 0 { return false }

    # `readyReplicas` is omitted from a Deployment's status while it is zero, so
    # kubectl prints nothing; `into int` raises on that, hence try/catch rather
    # than `default`.
    let ready = (try { $result.stdout | str trim | into int } catch { 0 })
    $ready > 0
}
# Translate a taskserv name into the K8s workload used to probe its readiness.
# Yields { type, name, ns } for known taskservs and null for every other name.
def taskserv-k8s-resource [taskserv: string]: nothing -> record {
    let kube_system = "kube-system"
    let deployment = "deployment"

    match $taskserv {
        "coredns" => { type: $deployment, name: "coredns", ns: $kube_system },
        "hetzner_csi" => { type: $deployment, name: "hcloud-csi-controller", ns: $kube_system },
        "democratic_csi" => { type: $deployment, name: "democratic-csi-controller", ns: "democratic-csi" },
        "cilium" => { type: "daemonset", name: "cilium", ns: $kube_system },
        # No K8s workload is associated with this taskserv.
        _ => null,
    }
}
# ─── SSH probe ────────────────────────────────────────────────────────────────
# TCP liveness check for SSH: true when `nc` connects to port 22 on the given
# address within its 5-second timeout, false on any non-zero exit.
def probe-ssh [ip: string]: nothing -> bool {
    let ssh_port = 22
    let timeout_secs = 5
    let outcome = (do {
        ^nc -z -w $timeout_secs $ip $ssh_port err> /dev/null
    } | complete)
    ($outcome | get exit_code) == 0
}
# ─── Main sync ────────────────────────────────────────────────────────────────
# Reconcile the recorded provisioning state with what external systems report.
#
# For every server in `settings.data.servers`:
#   1. probe the provider and record the provider id/state with
#      `state-server-sync`; the server is skipped if the state is "unknown";
#   2. unless --skip-ssh is set, resolve the public IP and probe SSH; the
#      server is skipped if an IP was resolved but SSH is unreachable;
#   3. when --kubeconfig names an existing file and the provider state is
#      "running", probe every recorded taskserv that maps to a K8s resource and
#      write state "completed" (ready) or "unknown" (not ready) with
#      `state-node-set`, appending a log entry for the sync.
#
# NOTE(review): --infra is accepted but not consulted anywhere in this command;
# confirm whether the filter is still to be implemented.
export def state-sync [
    workspace_path: string
    settings: record
    --kubeconfig: string = ""  # Path to kubeconfig for K8s probes (skipped if empty)
    --skip-ssh                 # Skip SSH liveness probes
    --infra: string = ""       # Filter to specific infra name
]: nothing -> nothing {
    _print "Syncing provisioning state against external APIs ..."

    # The literal "Z" suffix denotes UTC, so convert before formatting;
    # `date now` alone yields local time.
    let ts = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")

    # Loop-invariant: K8s probes need a non-empty path to an existing kubeconfig.
    let k8s_enabled = (($kubeconfig | is-not-empty) and ($kubeconfig | path exists))

    for srv in ($settings.data?.servers? | default []) {
        let hostname = $srv.hostname
        _print $" → ($hostname)"

        # 1. Provider API — existence and state
        let htz = (probe-hetzner $settings $srv)
        state-server-sync $workspace_path $hostname --provider-id $htz.provider_id --provider-state $htz.provider_state
        if $htz.provider_state == "unknown" {
            _print $" provider: unknown (API timeout or server not found)"
            continue
        }
        _print $" provider: ($htz.provider_state) id=($htz.provider_id)"

        # 2. SSH liveness
        if not $skip_ssh {
            # `try`, not `do`: a failing IP lookup must degrade to "no IP",
            # which skips the SSH probe without aborting the whole sync.
            let ip_raw = (try { mw_get_ip $settings $srv "public" false } catch { "" })
            let ip = ($ip_raw | default "" | str trim)
            if ($ip | is-not-empty) {
                let ssh_ok = (probe-ssh $ip)
                let verdict = if $ssh_ok { "reachable" } else { "unreachable" }
                _print $" ssh ($ip): ($verdict)"
                if not $ssh_ok {
                    _print $" skipping K8s probes — node unreachable"
                    continue
                }
            }
        }

        # 3. K8s readiness probes (only when kubeconfig provided and server is running)
        if (not $k8s_enabled) or ($htz.provider_state != "running") { continue }

        # Re-read on every iteration: step 1 has just written to the state.
        let st = (state-read $workspace_path)
        let taskservs = ($st.servers? | default {} | get -o $hostname | default {} | get -o taskservs | default {})

        for ts_entry in ($taskservs | transpose taskserv node) {
            let res = (taskserv-k8s-resource $ts_entry.taskserv)
            if ($res | is-empty) { continue }

            let ready = (probe-k8s-ready $kubeconfig $res.type $res.name $res.ns)

            # "completed" is written only on positive confirmation; anything
            # else is recorded as "unknown".
            let outcome = if $ready {
                { state: "completed", event: "sync-confirmed", note: "K8s ready → confirmed completed" }
            } else {
                { state: "unknown", event: "sync-unknown", note: "K8s not ready → unknown" }
            }

            _print $" ($ts_entry.taskserv): ($outcome.note)"
            state-node-set $workspace_path $hostname $ts_entry.taskserv {
                state: $outcome.state,
                actor: { identity: "system", source: "sync" },
                log: (log-trim ($ts_entry.node.log? | default [] | append {
                    ts: $ts,
                    event: $outcome.event,
                    source: "sync",
                })),
            }
        }
    }

    _print "Sync complete."
}