prvng_core/nulib/main_provisioning/cluster-deploy.nu
Jesús Pérez 894046ef5a
feat(core): three-layer DAG, unified component arch, commands-registry cache, Nushell 0.112.2 migration
- DAG architecture: `dag show/validate/export` (nulib/main_provisioning/dag.nu),
    config loader (lib_provisioning/config/loader/dag.nu), taskserv dag-executor.
    Backed by schemas/lib/dag/*.ncl; orchestrator emits NATS events via
    WorkspaceComposition::into_workflow. See ADR-020, ADR-021.
  - Unified Component Architecture: components/mod.nu, main_provisioning/
    {components,workflow,extensions,ontoref-queries}.nu. Full workflow engine with
    topological sort and NATS subject emission. Blocks A-H complete (libre-daoshi).
  - Commands-registry: nulib/commands-registry.ncl (Nickel source, 314 lines) +
    JSON cache at ~/.cache/provisioning/commands-registry.json rebuilt on source
    change. cli/provisioning fast-path alias expansion avoids cold Nu startup.
    ADDING_COMMANDS.md documents new-command workflow.
  - Platform service manager: service-manager.nu (+573), startup.nu (+611),
    service-check.nu (+255); autostart/bootstrap/health/target refactored.
  - Nushell 0.112.2 migration: removed all try/catch and bash redirections;
    external commands prefixed with ^; type signatures enforced. Driven by
    scripts/refactor-try-catch{,-simplified}.nu.
  - TTY stack: removed shlib/*-tty.sh; replaced by cli/tty-dispatch.sh,
    tty-filter.sh, tty-commands.conf.
  - New domain modules: images/ (golden image lifecycle), workspace/{state,sync}.nu,
    main_provisioning/{bootstrap,cluster-deploy,fip,state}.nu, commands/{state,
    build,integrations/auth,utilities/alias}.nu, platform.nu expanded (+874).
  - Config loader overhaul: loader/core.nu slimmed (-759), cache/core.nu
    refactored (-454), removed legacy loaders/file_loader.nu (-330).
  - Thirteen new provisioning-<domain>.nu top-level modules for bash dispatcher.
  - Tests: test_workspace_state.nu (+351); updates to test_oci_registry,
    test_services.
  - README + CHANGELOG updated.
2026-04-17 04:27:33 +01:00

357 lines
16 KiB
Text
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use ../lib_provisioning/workspace *
use ../lib_provisioning/user/config.nu [get-workspace-path, get-active-workspace-details]
use ../lib_provisioning/utils/nickel_processor.nu [ncl-eval]
# Decrypt a SOPS-encrypted dotenv file and return its contents as a record.
#
# The file must be in dotenv format (KEY=VALUE lines). SOPS is called with
# --output-type=dotenv so the decrypted output is in the same format.
# Comment lines (first non-blank character is #), blank lines and lines without
# "=" are ignored. Keys are trimmed of surrounding whitespace.
#
# When a key appears more than once the last occurrence wins, instead of
# aborting the whole deploy with a "column already exists" error.
#
# This function performs no auto-discovery: `secrets_path` must name an existing
# file. Callers that want an optional secrets file must check for it themselves.
#
# Errors:
#   - "Secrets file not found" when `secrets_path` does not exist.
#   - "SOPS decrypt failed" (with sops stderr) when sops exits non-zero.
def cd-load-secrets [secrets_path: string]: nothing -> record {
    if not ($secrets_path | path exists) {
        error make { msg: $"Secrets file not found: ($secrets_path)" }
    }
    # `complete` captures the exit code and stderr so the failure can be reported with context.
    let result = (do { ^sops --decrypt --output-type=dotenv $secrets_path } | complete)
    if $result.exit_code != 0 {
        error make { msg: $"SOPS decrypt failed for ($secrets_path):\n($result.stderr)" }
    }
    $result.stdout
    | lines
    # Trim before testing so indented comment lines are skipped as well.
    | where { ($in | str trim | str starts-with "#") == false }
    | where { ($in | str contains "=") }
    | parse "{key}={value}"
    # upsert (not insert): a repeated key overrides the earlier value rather than erroring.
    | reduce --fold {} {|row, acc| $acc | upsert ($row.key | str trim) $row.value }
}
# Evaluate a Nickel file that lives under the workspace root.
#
# `rel_path` is joined onto `ws_root` to form the file to evaluate. The workspace
# root and the provisioning root ($env.PROVISIONING, falling back to
# /usr/local/provisioning) are passed to `ncl-eval` as the import paths.
# Returns whatever `ncl-eval` produces for that file.
def cd-ncl-export [ws_root: string, rel_path: string]: nothing -> record {
    let provisioning_dir = ($env.PROVISIONING? | default "/usr/local/provisioning")
    let import_paths = [$ws_root $provisioning_dir]
    ncl-eval ($ws_root | path join $rel_path) $import_paths
}
# Build the floating-IP env vars from <ws_root>/.provisioning-state.json.
#
# Reads `bootstrap.floating_ips` from the state file and maps three entries to
# env-var pairs. The original comment attributes the key names to bootstrap.nu's
# naming convention (strip the "librecloud-fip-" prefix, dashes to underscores):
#   smtp            → FIP_A_IP / FIP_A_ID  (Stalwart SMTP, sgoyol-1)
#   sgoyol_ingress  → FIP_B_IP / FIP_B_ID  (sgoyol Cilium ingress)
#   wuji            → FIP_C_IP / FIP_C_ID  (wuji K8s API + ingress)
#
# A missing entry, or a missing `ip`/`id` field inside an entry, yields "".
# Errors when the state file does not exist.
def cd-load-fip-env [ws_root: string]: nothing -> record {
    let state_file = ($ws_root | path join ".provisioning-state.json")
    if not ($state_file | path exists) {
        error make { msg: ".provisioning-state.json not found — run: provisioning bootstrap first" }
    }
    let floating = (open --raw $state_file | from json | get bootstrap | get floating_ips)
    # Resolve one field of one floating-IP entry, tolerating absence at either level.
    let lookup = {|fip_key: string, field: string|
        $floating | get -o $fip_key | default {} | get -o $field | default ""
    }
    {
        FIP_A_IP: (do $lookup "smtp" "ip"),
        FIP_A_ID: (do $lookup "smtp" "id"),
        FIP_B_IP: (do $lookup "sgoyol_ingress" "ip"),
        FIP_B_ID: (do $lookup "sgoyol_ingress" "id"),
        FIP_C_IP: (do $lookup "wuji" "ip"),
        FIP_C_ID: (do $lookup "wuji" "id"),
    }
}
# Build env var record for an extension install script.
#
# Protocol: scalar fields → `<PREFIX>_<FIELD>`, lists/records → `<PREFIX>_<FIELD>_JSON`.
# Full config also available as `<PREFIX>_CONFIG_JSON`. FIP vars and KUBECONFIG always set.
#
# <PREFIX> and <FIELD> are the extension name / field name upper-cased with "-" and "."
# replaced by "_". Scalars are string, int, float and bool; every other value
# is serialised with `to json --raw`. When `cfg` is not a record no
# per-field variables are produced, only `<PREFIX>_CONFIG_JSON`.
#
# Reserved keys: `<PREFIX>_CONFIG_JSON` and KUBECONFIG are written with `upsert`.
# A non-scalar config field literally named `config` also maps to `<PREFIX>_CONFIG_JSON`;
# previously that collision aborted with "column already exists". Now the reserved
# key always carries the full config (which still contains the `config` field).
#
# Precedence, lowest to highest: per-field vars, `<PREFIX>_CONFIG_JSON`, `fip_env`, KUBECONFIG.
def cd-ext-env [ext_name: string, cfg: any, fip_env: record, kubeconfig: string]: nothing -> record {
    let prefix = ($ext_name | str upcase | str replace --all "-" "_" | str replace --all "." "_")
    let flat = if ($cfg | describe | str starts-with "record") {
        $cfg | transpose key val | reduce --fold {} {|entry, acc|
            let raw_key = ($entry.key | str upcase | str replace --all "-" "_" | str replace --all "." "_")
            let type_desc = ($entry.val | describe)
            let is_scalar = ($type_desc in ["string", "int", "float", "bool"])
            let env_key = if $is_scalar { $"($prefix)_($raw_key)" } else { $"($prefix)_($raw_key)_JSON" }
            let env_val = if $type_desc == "string" {
                # Strings pass through untouched (no quoting).
                $entry.val
            } else if $is_scalar {
                $entry.val | into string
            } else {
                $entry.val | to json --raw
            }
            $acc | insert $env_key $env_val
        }
    } else {
        {}
    }
    $flat
    | upsert $"($prefix)_CONFIG_JSON" ($cfg | to json --raw)
    | merge $fip_env
    | upsert KUBECONFIG $kubeconfig
}
# Resolve the install script of a cluster extension.
#
# Layout searched: <prov_root>/extensions/clusters/<dir>/default/install-<script>.sh
#
# Directory and script names are not spelled consistently across extensions: some
# directories use underscores while their scripts use dashes (cert_manager /
# install-cert-manager.sh), others are all-dash (oci-reg) or contain neither.
# All four (dir, script) spellings are therefore probed, in this order:
#   (_, _), (_, -), (-, -), (-, _)
# and the first existing path is returned. Errors when none exists.
def cd-find-script [prov_root: string, ext_name: string]: nothing -> string {
    let dashed = ($ext_name | str replace --all "_" "-")
    let underscored = ($ext_name | str replace --all "-" "_")
    let hits = (
        [
            [$underscored, $underscored],
            [$underscored, $dashed],
            [$dashed, $dashed],
            [$dashed, $underscored],
        ]
        | each {|combo|
            $prov_root | path join "extensions/clusters" $combo.0 "default" $"install-($combo.1).sh"
        }
        | where {|candidate| $candidate | path exists }
    )
    if ($hits | is-not-empty) {
        return ($hits | first)
    }
    error make { msg: $"No install script for extension '($ext_name)' in ($prov_root)/extensions/clusters/ (tried all _/- variants)" }
}
# Resolve the install script of a component.
#
# Layout searched: <prov_root>/extensions/components/<dir>/<mode>/install-<script>.sh
#
# <dir> and <script> are each tried in underscore and dash spelling, in the order
# (_, _), (_, -), (-, -), (-, _); the first existing path is returned.
# Errors when no candidate exists.
def cd-find-component-script [prov_root: string, comp_name: string, mode: string]: nothing -> string {
    let dashed = ($comp_name | str replace --all "_" "-")
    let underscored = ($comp_name | str replace --all "-" "_")
    let hits = (
        [
            [$underscored, $underscored],
            [$underscored, $dashed],
            [$dashed, $dashed],
            [$dashed, $underscored],
        ]
        | each {|combo|
            $prov_root | path join "extensions/components" $combo.0 $mode $"install-($combo.1).sh"
        }
        | where {|candidate| $candidate | path exists }
    )
    if ($hits | is-not-empty) {
        return ($hits | first)
    }
    error make { msg: $"No install script for component '($comp_name)' mode '($mode)' in ($prov_root)/extensions/components/ (tried all _/- variants)" }
}
# Dry-run companion of the component script lookup: never errors.
#
# Probes <prov_root>/extensions/components/<dir>/<mode>/install-<script>.sh for the
# four underscore/dash spellings of (dir, script), in the order
# (_, _), (_, -), (-, -), (-, _). Returns the first existing path, or the literal
# string "<not found>" when none exists.
def cd-find-component-script-opt [prov_root: string, comp_name: string, mode: string]: nothing -> string {
    let dashed = ($comp_name | str replace --all "_" "-")
    let underscored = ($comp_name | str replace --all "-" "_")
    [
        [$underscored, $underscored],
        [$underscored, $dashed],
        [$dashed, $dashed],
        [$dashed, $underscored],
    ]
    | each {|combo|
        $prov_root | path join "extensions/components" $combo.0 $mode $"install-($combo.1).sh"
    }
    | where {|candidate| $candidate | path exists }
    | get -o 0
    | default "<not found>"
}
# Dry-run companion of the cluster-extension script lookup: never errors.
#
# Probes <prov_root>/extensions/clusters/<dir>/default/install-<script>.sh for the
# four underscore/dash spellings of (dir, script), in the order
# (_, _), (_, -), (-, -), (-, _). Returns the first existing path, or the literal
# string "<not found>" when none exists.
def cd-find-script-opt [prov_root: string, ext_name: string]: nothing -> string {
    let dashed = ($ext_name | str replace --all "_" "-")
    let underscored = ($ext_name | str replace --all "-" "_")
    [
        [$underscored, $underscored],
        [$underscored, $dashed],
        [$dashed, $dashed],
        [$dashed, $underscored],
    ]
    | each {|combo|
        $prov_root | path join "extensions/clusters" $combo.0 "default" $"install-($combo.1).sh"
    }
    | where {|candidate| $candidate | path exists }
    | get -o 0
    | default "<not found>"
}
# Execute the health gate for an extension, retrying on transient failures.
#
# `gate` fields read here:
#   retries   — maximum number of attempts (int)
#   check_cmd — shell snippet run with `bash -c`; exit code 0 means healthy
#
# The check is attempted up to `retries` times with a fixed 10 second pause between
# attempts (no pause after the last one). Returns nothing on success.
#
# Errors when every attempt fails. The error carries the command and the combined
# stdout/stderr of the last attempt, so the failure can be diagnosed without
# re-running the check by hand.
def cd-health-gate [ext_id: string, gate: record]: nothing -> nothing {
    mut remaining = $gate.retries
    mut passed = false
    mut last_output = ""
    while ($remaining > 0) and (not $passed) {
        # `complete` captures the exit code so a failing check does not abort the loop.
        let res = (do { ^bash -c $gate.check_cmd } | complete)
        if $res.exit_code == 0 {
            $passed = true
            print $" [($ext_id)] health gate OK"
        } else {
            $remaining -= 1
            $last_output = ([$res.stdout $res.stderr] | str join "\n" | str trim)
            if $remaining > 0 {
                let attempt = ($gate.retries - $remaining)
                print $" [($ext_id)] gate ($attempt)/($gate.retries) failed — retry in 10s"
                # Built-in sleep: no dependency on an external `sleep` binary.
                sleep 10sec
            }
        }
    }
    if not $passed {
        error make { msg: $"[($ext_id)] health gate failed after ($gate.retries) attempts.\nCmd: ($gate.check_cmd)\nLast output: ($last_output)" }
    }
}
# Deploy cluster extensions — L3 platform or L4 application services.
#
# Reads the deployment DAG from cluster/<cluster>/<layer>-dag.ncl and extension configs
# from cluster/<cluster>/<layer>.ncl. Entries of the DAG `extensions` array are processed
# strictly in array order; `depends_on` is only verified (every dependency must already
# have been processed), the array is not re-sorted. FIP IPs and IDs come from
# .provisioning-state.json written by `provisioning bootstrap`.
#
# A DAG entry with a non-null `component` field is installed from extensions/components/
# (config looked up under `components.<id>`); any other entry is installed from
# extensions/clusters/ using its `extension` field (config looked up under `extensions.<id>`).
# After a successful install the entry's optional `health_gate` is run.
#
# Each install script receives:
# <EXT>_<FIELD> — scalar config values (namespace, version, host, …)
# <EXT>_<FIELD>_JSON — complex config values (ip_pools, node_selector, …)
# <EXT>_CONFIG_JSON — full extension config as JSON
# FIP_A_IP / FIP_A_ID — FIP-A (Stalwart SMTP)
# FIP_B_IP / FIP_B_ID — FIP-B (sgoyol ingress)
# FIP_C_IP / FIP_C_ID — FIP-C (wuji)
# KUBECONFIG — path to kubeconfig
# plus every key of the decrypted secrets file, merged last so it overrides any of the above.
#
# With --dry-run no script or health gate is executed; the resolved script path, the
# env var names (not values) and a truncated gate command are printed instead.
#
# Errors: invalid layer, no active workspace, missing state file, secrets file problems,
# unsatisfied `depends_on`, missing install script, failed install, failed health gate.
#
# Usage:
# provisioning cluster deploy platform sgoyol --workspace librecloud_renew
# provisioning cluster deploy apps sgoyol --workspace librecloud_renew
export def "main cluster deploy" [
layer: string # Deployment layer: platform | apps
cluster: string # Cluster name (e.g. sgoyol, wuji)
--workspace (-w): string # Workspace name (default: active workspace)
--dry-run (-n) # Print the execution plan without running install scripts
--kubeconfig (-k): string # Override KUBECONFIG path for kubectl calls
--secrets-file (-s): string # SOPS-encrypted dotenv file with install script secrets.
# Auto-discovered at cluster/<cluster>/secrets.sops.env if omitted.
] : nothing -> nothing {
# Validate the layer up front: it is interpolated into the .ncl file names below.
if not ($layer in ["platform", "apps"]) {
error make { msg: $"layer must be 'platform' or 'apps', got: ($layer)" }
}
# Workspace: explicit flag wins, otherwise fall back to the active workspace.
let ws_name = if ($workspace | is-not-empty) {
$workspace
} else {
let details = (get-active-workspace-details)
if ($details == null) {
error make { msg: "No active workspace — pass --workspace or activate one first" }
}
$details.name
}
let ws_root = (get-workspace-path $ws_name)
let prov_root = ($env.PROVISIONING? | default "/usr/local/provisioning")
# Both paths are relative to the workspace root.
let dag_rel = $"cluster/($cluster)/($layer)-dag.ncl"
let cfg_rel = $"cluster/($cluster)/($layer).ncl"
# Kubeconfig precedence: --kubeconfig flag > $env.KUBECONFIG > /etc/kubernetes/admin.conf.
let kube_cfg = if ($kubeconfig | is-not-empty) {
$kubeconfig
} else {
$env.KUBECONFIG? | default "/etc/kubernetes/admin.conf"
}
print $"Cluster deploy | workspace: ($ws_name) | cluster: ($cluster) | layer: ($layer)"
if $dry_run { print "DRY RUN — install scripts will not execute" }
if ($secrets_file | is-not-empty) { print $" secrets: ($secrets_file)" }
print ""
let dag = (cd-ncl-export $ws_root $dag_rel)
let cfg = (cd-ncl-export $ws_root $cfg_rel)
# The state file is required even for --dry-run (cd-load-fip-env errors when it is missing).
let fip_env = (cd-load-fip-env $ws_root)
# Mandatory key: `get` without -o errors when the config has no `extensions` field.
let ext_cfgs = ($cfg | get extensions)
# SOPS secrets: explicit path > auto-discovered cluster/<cluster>/secrets.sops.env > empty.
# Secrets are merged AFTER NCL env vars — they override any overlapping computed values.
# Secrets are decrypted in --dry-run mode too; only their key names are printed later.
let secrets_env = if ($secrets_file | is-not-empty) {
cd-load-secrets $secrets_file
} else {
let auto_path = ($ws_root | path join $"cluster/($cluster)/secrets.sops.env")
if ($auto_path | path exists) {
print $" secrets: ($auto_path)"
cd-load-secrets $auto_path
} else {
{}
}
}
# Walk extensions in array order; verify depends_on are satisfied, then install + gate.
# The fold accumulator is the list of ids processed so far; its final value is unused.
let _completed = ($dag.extensions | reduce --fold [] {|entry, completed|
let ext_id = $entry.id
# Dependency guard — catches DAG authoring errors.
# `depends_on` is accessed directly, so every DAG entry must define it.
let unsatisfied = ($entry.depends_on | where {|dep|
($completed | any {|c| $c == $dep }) == false
})
if ($unsatisfied | is-not-empty) {
error make { msg: $"[($ext_id)] depends on [($unsatisfied | str join ', ')] not yet deployed — fix DAG ordering in ($dag_rel)" }
}
# Dispatch: component nodes use extensions/components/ path; extension nodes use extensions/clusters/.
let is_component = ("component" in $entry) and ($entry | get -o component | default null) != null
if $is_component {
let comp = ($entry.component)
let comp_name = $comp.name
# Mode selects the sub-directory holding the script; defaults to "cluster".
let mode = ($comp | get -o mode | default "cluster")
# Component config is optional: a missing `components` section or id yields {}.
let comp_cfg = ($cfg | get -o components | default {} | get -o $ext_id | default {})
# Env var prefix derives from the component name, not the DAG node id.
let env_vars = (cd-ext-env $comp_name $comp_cfg $fip_env $kube_cfg | merge $secrets_env)
print $"[($ext_id)] component: ($comp_name) mode=($mode)"
# `parallel` is only reported; execution in this loop is always sequential.
if ($entry | get -o parallel | default false) { print " note: parallel=true (sequential execution)" }
if $dry_run {
let script_display = (cd-find-component-script-opt $prov_root $comp_name $mode)
print $" script: ($script_display)"
# Only key names are printed, never values.
print $" env keys: ($env_vars | columns | sort | str join ', ')"
if ($entry | get -o health_gate | default null) != null {
# The command is cut with `str substring 0..80`; "..." is appended unconditionally.
print $" gate: ($entry.health_gate.check_cmd | str substring 0..80)..."
}
} else {
let script = (cd-find-component-script $prov_root $comp_name $mode)
print $" script: ($script)"
print ""
with-env $env_vars { ^bash $script }
# NOTE(review): LAST_EXIT_CODE is read after the with-env block has returned. Confirm on the
# targeted Nushell version that it reflects the script's status (env changes made inside
# with-env may be scoped to it) and that a non-zero exit has not already raised — TODO confirm.
let exit_code = $env.LAST_EXIT_CODE
if $exit_code != 0 {
error make { msg: $"[($ext_id)] component install script exited ($exit_code)" }
}
if ($entry | get -o health_gate | default null) != null {
cd-health-gate $ext_id $entry.health_gate
}
}
} else {
# Extension node: `extension` names the directory/script, config is keyed by node id.
let ext_name = $entry.extension
let ext_cfg = ($ext_cfgs | get -o $ext_id | default {})
# secrets_env is merged last — its values win over any NCL-derived env var with the same key.
let env_vars = (cd-ext-env $ext_name $ext_cfg $fip_env $kube_cfg | merge $secrets_env)
print $"[($ext_id)] extension: ($ext_name)"
# `parallel` is only reported; execution in this loop is always sequential.
if ($entry | get -o parallel | default false) { print " note: parallel=true (sequential execution)" }
if $dry_run {
let script_display = (cd-find-script-opt $prov_root $ext_name)
print $" script: ($script_display)"
# Only key names are printed, never values.
print $" env keys: ($env_vars | columns | sort | str join ', ')"
if ($entry | get -o health_gate | default null) != null {
# The command is cut with `str substring 0..80`; "..." is appended unconditionally.
print $" gate: ($entry.health_gate.check_cmd | str substring 0..80)..."
}
} else {
let script = (cd-find-script $prov_root $ext_name)
print $" script: ($script)"
print ""
with-env $env_vars { ^bash $script }
# NOTE(review): same caveat as the component branch — verify that LAST_EXIT_CODE read here
# is the script's exit status on the targeted Nushell version — TODO confirm.
let exit_code = $env.LAST_EXIT_CODE
if $exit_code != 0 {
error make { msg: $"[($ext_id)] install script exited ($exit_code)" }
}
if ($entry | get -o health_gate | default null) != null {
cd-health-gate $ext_id $entry.health_gate
}
}
}
print ""
# Record this node as done (in dry-run too) so later depends_on checks pass.
$completed | append $ext_id
})
print $"Cluster deploy complete: ($layer) on ($cluster)"
}