prvng_core/nulib/workspace/sync.nu
Jesús Pérez 894046ef5a
feat(core): three-layer DAG, unified component arch, commands-registry cache, Nushell 0.112.2 migration
- DAG architecture: `dag show/validate/export` (nulib/main_provisioning/dag.nu),
    config loader (lib_provisioning/config/loader/dag.nu), taskserv dag-executor.
    Backed by schemas/lib/dag/*.ncl; orchestrator emits NATS events via
    WorkspaceComposition::into_workflow. See ADR-020, ADR-021.
  - Unified Component Architecture: components/mod.nu, main_provisioning/
    {components,workflow,extensions,ontoref-queries}.nu. Full workflow engine with
    topological sort and NATS subject emission. Blocks A-H complete (libre-daoshi).
  - Commands-registry: nulib/commands-registry.ncl (Nickel source, 314 lines) +
    JSON cache at ~/.cache/provisioning/commands-registry.json rebuilt on source
    change. cli/provisioning fast-path alias expansion avoids cold Nu startup.
    ADDING_COMMANDS.md documents new-command workflow.
  - Platform service manager: service-manager.nu (+573), startup.nu (+611),
    service-check.nu (+255); autostart/bootstrap/health/target refactored.
  - Nushell 0.112.2 migration: removed all try/catch and bash redirections;
    external commands prefixed with ^; type signatures enforced. Driven by
    scripts/refactor-try-catch{,-simplified}.nu.
  - TTY stack: removed shlib/*-tty.sh; replaced by cli/tty-dispatch.sh,
    tty-filter.sh, tty-commands.conf.
  - New domain modules: images/ (golden image lifecycle), workspace/{state,sync}.nu,
    main_provisioning/{bootstrap,cluster-deploy,fip,state}.nu, commands/{state,
    build,integrations/auth,utilities/alias}.nu, platform.nu expanded (+874).
  - Config loader overhaul: loader/core.nu slimmed (-759), cache/core.nu
    refactored (-454), removed legacy loaders/file_loader.nu (-330).
  - Thirteen new provisioning-<domain>.nu top-level modules for bash dispatcher.
  - Tests: test_workspace_state.nu (+351); updates to test_oci_registry,
    test_services.
  - README + CHANGELOG updated.
2026-04-17 04:27:33 +01:00

148 lines
6.8 KiB
Text

# provisioning sync — reconcile .provisioning-state.ncl against external APIs.
# Sources: Hetzner API (server existence/status), K8s API (pod/deploy readiness), SSH probe.
# Never marks a taskserv 'completed without positive confirmation.
# Ambiguous or timed-out probes write 'unknown.
use state.nu *
use ../lib_provisioning *
# ─── Provider probe ───────────────────────────────────────────────────────────
# Query the Hetzner API for a server and summarise what the provider reports.
#
# Parameters:
#   settings - workspace settings record (kept for signature parity; not read here)
#   server   - server record handed to `mw_server_info`
#
# Returns { provider_id, provider_state } with provider_state being one of
# "running", "off" or "unknown". Any failure (non-zero exit, empty output,
# unparseable or non-record JSON) yields { provider_id: "", provider_state: "unknown" }.
def probe-hetzner [settings: record, server: record]: nothing -> record {
    let unknown = { provider_id: "", provider_state: "unknown" }

    # NOTE(review): `complete` captures external-command output; confirm that
    # `mw_server_info` ends in an external call that emits JSON on stdout.
    let info = (do { mw_server_info $server true } | complete)
    if $info.exit_code != 0 or ($info.stdout | is-empty) {
        return $unknown
    }

    # `catch` only pairs with `try`, so `do { ... } catch { ... }` traps nothing.
    # `--ignore-errors` turns a JSON parse failure into null instead of
    # aborting the whole sync.
    let parsed = (do --ignore-errors { $info.stdout | from json })
    if ($parsed | is-empty) {
        return $unknown
    }
    # Valid JSON that is not an object (list, number, string) has no status/id fields.
    if not ($parsed | describe | str starts-with "record") {
        return $unknown
    }

    # Only states the provider positively reports are passed through.
    let raw_state = ($parsed.status? | default "unknown" | str downcase)
    let mapped = match $raw_state {
        "running" => "running",
        "off" => "off",
        _ => "unknown",
    }

    {
        provider_id: ($parsed.id? | default "" | into string),
        provider_state: $mapped,
    }
}
# ─── K8s probe ────────────────────────────────────────────────────────────────
# Check via kubectl whether a K8s workload reports at least one ready replica/pod.
#
# Parameters:
#   kubeconfig    - path passed to `kubectl --kubeconfig`
#   resource_type - deployment | daemonset
#   name          - resource name
#   namespace     - namespace to query (default "kube-system")
#
# Returns true only when kubectl exits 0 and prints a ready count greater than
# zero. Every other outcome (kubectl failure, absent status field, non-numeric
# output) is false.
def probe-k8s-ready [
    kubeconfig: string
    resource_type: string    # deployment | daemonset
    name: string
    namespace: string = "kube-system"
]: nothing -> bool {
    # DaemonSet status exposes `numberReady`; Deployment status uses `readyReplicas`.
    let ready_field = match ($resource_type | str downcase) {
        "daemonset" | "daemonsets" | "ds" => "numberReady",
        _ => "readyReplicas",
    }
    # Built by concatenation so the braces reach kubectl verbatim.
    let output_spec = ("jsonpath={.status." + $ready_field + "}")

    let result = (do {
        ^kubectl --kubeconfig $kubeconfig -n $namespace get $resource_type $name -o $output_spec err> /dev/null
    } | complete)
    if $result.exit_code != 0 { return false }

    # kubectl prints an empty string when the field is absent (nothing ready),
    # and `into int` raises on that, so validate before converting.
    let raw = ($result.stdout | str trim)
    if $raw !~ '^[0-9]+$' { return false }
    ($raw | into int) > 0
}
# Look up the K8s workload that backs a taskserv, for readiness probing.
# Yields { type, name, ns } for a known taskserv and null for anything else.
def taskserv-k8s-resource [taskserv: string]: nothing -> record {
    # Dispatch table keyed by taskserv name.
    let known = {
        cilium: { type: "daemonset", name: "cilium", ns: "kube-system" },
        hetzner_csi: { type: "deployment", name: "hcloud-csi-controller", ns: "kube-system" },
        democratic_csi: { type: "deployment", name: "democratic-csi-controller", ns: "democratic-csi" },
        coredns: { type: "deployment", name: "coredns", ns: "kube-system" },
    }
    # Optional access: an unmapped taskserv resolves to null instead of an error.
    $known | get -o $taskserv
}
# ─── SSH probe ────────────────────────────────────────────────────────────────
# TCP-probe port 22 on `ip` with netcat, using a 5-second timeout.
# True when `nc` exits 0, false otherwise.
def probe-ssh [ip: string]: nothing -> bool {
    let nc_exit = (
        do { ^nc -z -w 5 $ip 22 err> /dev/null }
        | complete
        | get exit_code
    )
    $nc_exit == 0
}
# ─── Main sync ────────────────────────────────────────────────────────────────
# Reconcile the workspace provisioning state against what external systems report.
#
# For every server in `$settings.data.servers` this:
#   1. probes the provider with `probe-hetzner` and records id/state through
#      `state-server-sync`; an "unknown" provider state skips the server;
#   2. unless --skip-ssh is set, checks SSH reachability of the public IP and
#      skips the K8s probes when the node is unreachable;
#   3. when a kubeconfig is given, exists on disk, and the server is running,
#      probes every taskserv that maps to a K8s resource and writes
#      "completed" (ready) or "unknown" (not ready) through `state-node-set`.
#
# Parameters:
#   workspace_path - workspace root handed to the state-* helpers
#   settings       - settings record; servers are read from `.data.servers`
#
# NOTE(review): --infra is accepted but not applied anywhere in this function.
export def state-sync [
    workspace_path: string
    settings: record
    --kubeconfig: string = ""    # Path to kubeconfig for K8s probes (skipped if empty)
    --skip-ssh                   # Skip SSH liveness probes
    --infra: string = ""         # Filter to specific infra name
]: nothing -> nothing {
    _print "Syncing provisioning state against external APIs ..."

    # The format ends in a literal "Z", so the instant must be converted to UTC first.
    let ts = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")

    # Loop-invariant: K8s probes are only possible with an existing kubeconfig.
    let k8s_enabled = ($kubeconfig | is-not-empty) and ($kubeconfig | path exists)

    for srv in ($settings.data.servers? | default []) {
        let hostname = $srv.hostname
        _print $" → ($hostname)"

        # 1. Hetzner API — provider existence and state
        let htz = (probe-hetzner $settings $srv)
        state-server-sync $workspace_path $hostname --provider-id $htz.provider_id --provider-state $htz.provider_state
        if $htz.provider_state == "unknown" {
            _print $" provider: unknown (API timeout or server not found)"
            continue
        }
        _print $" provider: ($htz.provider_state) id=($htz.provider_id)"

        # 2. SSH liveness
        if not $skip_ssh {
            # `--ignore-errors` makes a failed IP lookup yield null (then ""),
            # so the SSH probe is skipped instead of the sync aborting.
            let ip = (do --ignore-errors { mw_get_ip $settings $srv "public" false } | default "" | str trim)
            if ($ip | is-not-empty) {
                let ssh_ok = (probe-ssh $ip)
                let ssh_label = if $ssh_ok { "reachable" } else { "unreachable" }
                _print $" ssh ($ip): ($ssh_label)"
                if not $ssh_ok {
                    _print $" skipping K8s probes — node unreachable"
                    continue
                }
            }
        }

        # 3. K8s readiness probes (only when kubeconfig provided and server is running)
        if not ($k8s_enabled and $htz.provider_state == "running") { continue }

        let st = (state-read $workspace_path)
        let taskservs = ($st.servers? | default {} | get -o $hostname | default {} | get -o taskservs | default {})
        for entry in ($taskservs | transpose taskserv node) {
            let res = (taskserv-k8s-resource $entry.taskserv)
            if ($res | is-empty) { continue }

            # Only a positive readiness answer confirms completion; anything else is unknown.
            let outcome = if (probe-k8s-ready $kubeconfig $res.type $res.name $res.ns) {
                { state: "completed", event: "sync-confirmed", note: "K8s ready → confirmed completed" }
            } else {
                { state: "unknown", event: "sync-unknown", note: "K8s not ready → unknown" }
            }
            _print $" ($entry.taskserv): ($outcome.note)"

            let log_entry = { ts: $ts, event: $outcome.event, source: "sync" }
            state-node-set $workspace_path $hostname $entry.taskserv {
                state: $outcome.state,
                actor: { identity: "system", source: "sync" },
                log: (log-trim ($entry.node.log? | default [] | append $log_entry)),
            }
        }
    }

    _print "Sync complete."
}