- DAG architecture: `dag show/validate/export` (nulib/main_provisioning/dag.nu),
config loader (lib_provisioning/config/loader/dag.nu), taskserv dag-executor.
Backed by schemas/lib/dag/*.ncl; orchestrator emits NATS events via
WorkspaceComposition::into_workflow. See ADR-020, ADR-021.
- Unified Component Architecture: components/mod.nu, main_provisioning/
{components,workflow,extensions,ontoref-queries}.nu. Full workflow engine with
topological sort and NATS subject emission. Blocks A-H complete (libre-daoshi).
- Commands-registry: nulib/commands-registry.ncl (Nickel source, 314 lines) +
JSON cache at ~/.cache/provisioning/commands-registry.json rebuilt on source
change. cli/provisioning fast-path alias expansion avoids cold Nu startup.
ADDING_COMMANDS.md documents new-command workflow.
- Platform service manager: service-manager.nu (+573), startup.nu (+611),
service-check.nu (+255); autostart/bootstrap/health/target refactored.
- Nushell 0.112.2 migration: removed all try/catch and bash redirections;
external commands prefixed with ^; type signatures enforced. Driven by
scripts/refactor-try-catch{,-simplified}.nu.
- TTY stack: removed shlib/*-tty.sh; replaced by cli/tty-dispatch.sh,
tty-filter.sh, tty-commands.conf.
- New domain modules: images/ (golden image lifecycle), workspace/{state,sync}.nu,
main_provisioning/{bootstrap,cluster-deploy,fip,state}.nu, commands/{state,
build,integrations/auth,utilities/alias}.nu, platform.nu expanded (+874).
- Config loader overhaul: loader/core.nu slimmed (-759), cache/core.nu
refactored (-454), removed legacy loaders/file_loader.nu (-330).
- Thirteen new provisioning-<domain>.nu top-level modules for bash dispatcher.
- Tests: test_workspace_state.nu (+351); updates to test_oci_registry,
test_services.
- README + CHANGELOG updated.
148 lines
6.8 KiB
Text
148 lines
6.8 KiB
Text
# provisioning sync — reconcile .provisioning-state.ncl against external APIs.
|
|
# Sources: Hetzner API (server existence/status), K8s API (pod/deploy readiness), SSH probe.
|
|
# Never marks a taskserv 'completed without positive confirmation.
|
|
# Ambiguous or timed-out probes write 'unknown.
|
|
|
|
use state.nu *
|
|
use ../lib_provisioning *
|
|
|
|
# ─── Provider probe ───────────────────────────────────────────────────────────
|
|
|
|
# Query the provider for a server and return { provider_id, provider_state }.
#
# Parameters:
#   settings — not read by this probe (kept so existing callers keep working).
#   server   — server record forwarded to `mw_server_info`.
#
# Returns { provider_id: "", provider_state: "unknown" } when the lookup exits
# non-zero, prints nothing, or prints something that is not a JSON object.
# Only the statuses "running" and "off" are passed through (case-insensitive);
# every other status is reported as "unknown".
def probe-hetzner [settings: record, server: record]: nothing -> record {
    let unknown = { provider_id: "", provider_state: "unknown" }

    let info = (do { mw_server_info $server true } | complete)
    if $info.exit_code != 0 or ($info.stdout | is-empty) {
        return $unknown
    }

    # `catch` only pairs with `try`, so the former `do { ... } catch { null }`
    # never produced the fallback. `do --ignore-errors` yields null when
    # `from json` fails, which is the behaviour the fallback was meant to give.
    let parsed = (do --ignore-errors { $info.stdout | from json })

    # Valid JSON that is not an object (list, string, number) has no
    # `status`/`id` fields to read, so treat it like a failed lookup.
    if ($parsed | is-empty) or (($parsed | describe) !~ '^record') {
        return $unknown
    }

    let raw_state = ($parsed.status? | default "unknown" | str downcase)
    let mapped = match $raw_state {
        "running" => "running",
        "off" => "off",
        _ => "unknown",
    }

    {
        provider_id: ($parsed.id? | default "" | into string),
        provider_state: $mapped,
    }
}
|
|
|
|
# ─── K8s probe ────────────────────────────────────────────────────────────────
|
|
|
|
# Check whether a K8s deployment or daemonset has at least one ready pod,
# using `kubectl get -o jsonpath`.
#
# Parameters:
#   kubeconfig    — path passed to `kubectl --kubeconfig`.
#   resource_type — deployment | daemonset (also accepts "daemonsets" / "ds").
#   name          — resource name.
#   namespace     — namespace to query; defaults to "kube-system".
#
# Returns true only when kubectl exits 0 and prints a ready count greater than
# zero. A failed call, empty output, or non-numeric output returns false.
def probe-k8s-ready [
    kubeconfig: string
    resource_type: string   # deployment | daemonset
    name: string
    namespace: string = "kube-system"
]: nothing -> bool {
    # DaemonSets publish their ready count as `.status.numberReady`;
    # `.status.readyReplicas` exists only on replica-based workloads, so
    # querying it for a daemonset always comes back empty.
    let kind = ($resource_type | str downcase)
    let ready_field = if $kind in ["daemonset" "daemonsets" "ds"] {
        "numberReady"
    } else {
        "readyReplicas"
    }
    let output_spec = $"jsonpath={.status.($ready_field)}"

    let result = (do {
        ^kubectl --kubeconfig $kubeconfig -n $namespace get $resource_type $name -o $output_spec err> /dev/null
    } | complete)
    if $result.exit_code != 0 { return false }

    # The field is omitted when nothing is ready, so kubectl prints an empty
    # string. Validate before converting: `into int` is expected to raise on
    # empty or non-numeric text instead of returning a default.
    let raw = ($result.stdout | str trim)
    if $raw !~ '^\d+$' { return false }

    ($raw | into int) > 0
}
|
|
|
|
# Look up the K8s workload that represents a taskserv for readiness probing.
#
# Returns a record { type, name, ns } for the known taskservs (cilium,
# hetzner_csi, democratic_csi, coredns) and null for every other name.
# NOTE(review): the declared output type is `record`, yet the fallback is
# null — callers must check for an empty result before reading fields.
def taskserv-k8s-resource [taskserv: string]: nothing -> record {
    if $taskserv == "cilium" {
        { type: "daemonset", name: "cilium", ns: "kube-system" }
    } else if $taskserv == "hetzner_csi" {
        { type: "deployment", name: "hcloud-csi-controller", ns: "kube-system" }
    } else if $taskserv == "democratic_csi" {
        { type: "deployment", name: "democratic-csi-controller", ns: "democratic-csi" }
    } else if $taskserv == "coredns" {
        { type: "deployment", name: "coredns", ns: "kube-system" }
    } else {
        null
    }
}
|
|
|
|
# ─── SSH probe ────────────────────────────────────────────────────────────────
|
|
|
|
# TCP reachability check for the SSH port.
#
# Runs `nc -z -w 5 <ip> 22` with stderr discarded and returns true exactly
# when nc exits with status 0; any other exit status yields false.
def probe-ssh [ip: string]: nothing -> bool {
    let outcome = (do { ^nc -z -w 5 $ip 22 err> /dev/null } | complete)
    ($outcome | get exit_code) == 0
}
|
|
|
|
# ─── Main sync ────────────────────────────────────────────────────────────────
|
|
|
|
# Reconcile the recorded provisioning state of every server in
# `$settings.data.servers` against live probes.
#
# Per server, in order:
#   1. Provider probe — the result is always written via `state-server-sync`.
#      An "unknown" provider state stops further probing of that server.
#   2. SSH probe (unless --skip-ssh) — an unreachable node stops further
#      probing of that server. A server with no resolvable public IP skips
#      this step and continues.
#   3. K8s readiness — only when --kubeconfig names an existing file and the
#      provider reports "running". Each taskserv with a known K8s resource is
#      written as "completed" when ready, otherwise as "unknown".
#
# Parameters:
#   workspace_path — workspace whose state is read and updated.
#   settings       — settings record; servers are read from `data.servers`.
#
# NOTE(review): --infra is accepted but not applied anywhere in this
# function — confirm whether filtering is expected to happen here.
export def state-sync [
    workspace_path: string
    settings: record
    --kubeconfig: string = ""   # Path to kubeconfig for K8s probes (skipped if empty)
    --skip-ssh                  # Skip SSH liveness probes
    --infra: string = ""        # Filter to specific infra name
]: nothing -> nothing {
    _print "Syncing provisioning state against external APIs ..."

    # The format string ends in a literal "Z", so convert to UTC first;
    # `date now` alone is local time and would be mislabelled.
    let ts = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")

    let k8s_enabled = ($kubeconfig | is-not-empty) and ($kubeconfig | path exists)

    for srv in ($settings.data.servers? | default []) {
        let hostname = $srv.hostname
        _print $" → ($hostname)"

        # 1. Provider API — existence and state
        let htz = (probe-hetzner $settings $srv)
        state-server-sync $workspace_path $hostname --provider-id $htz.provider_id --provider-state $htz.provider_state

        if $htz.provider_state == "unknown" {
            # Plain string on purpose: inside $"..." the parenthesised text
            # would be evaluated as a subexpression instead of printed.
            _print " provider: unknown (API timeout or server not found)"
            continue
        }
        _print $" provider: ($htz.provider_state) id=($htz.provider_id)"

        # 2. SSH liveness
        if not $skip_ssh {
            # `catch` only pairs with `try`; `do --ignore-errors` gives the
            # intended empty-string fallback when the IP lookup fails.
            let ip = (do --ignore-errors { mw_get_ip $settings $srv "public" false } | default "" | str trim)
            if ($ip | is-not-empty) {
                let ssh_ok = (probe-ssh $ip)
                let reach = if $ssh_ok { "reachable" } else { "unreachable" }
                _print $" ssh ($ip): ($reach)"
                if not $ssh_ok {
                    _print " skipping K8s probes — node unreachable"
                    continue
                }
            }
        }

        # 3. K8s readiness probes (only when kubeconfig provided and server is running)
        if $k8s_enabled and $htz.provider_state == "running" {
            let st = (state-read $workspace_path)
            let taskservs = ($st.servers | get -o $hostname | default {} | get -o taskservs | default {})

            for ts_entry in ($taskservs | transpose taskserv node) {
                let res = (taskserv-k8s-resource $ts_entry.taskserv)
                # Taskservs without a mapped K8s resource are left untouched.
                if ($res | is-empty) { continue }

                let ready = (probe-k8s-ready $kubeconfig $res.type $res.name $res.ns)

                # Both outcomes write the same record shape; only the state,
                # the log event and the console note differ.
                let outcome = if $ready {
                    { state: "completed", event: "sync-confirmed", note: "K8s ready → confirmed completed" }
                } else {
                    { state: "unknown", event: "sync-unknown", note: "K8s not ready → unknown" }
                }

                _print $" ($ts_entry.taskserv): ($outcome.note)"
                state-node-set $workspace_path $hostname $ts_entry.taskserv {
                    state: $outcome.state,
                    actor: { identity: "system", source: "sync" },
                    log: (log-trim ($ts_entry.node.log? | default [] | append {
                        ts: $ts,
                        event: $outcome.event,
                        source: "sync",
                    })),
                }
            }
        }
    }

    _print "Sync complete."
}
|