# prvng_core/nulib/main_provisioning/cluster-deploy.nu

use ../lib_provisioning/workspace *
use ../lib_provisioning/user/config.nu [get-workspace-path, get-active-workspace-details]
use ../lib_provisioning/utils/nickel_processor.nu [ncl-eval]

# Decrypt a SOPS-encrypted dotenv file and return its contents as a record.
#
# The file must be in dotenv format (KEY=VALUE lines). SOPS is called with
# --output-type=dotenv so the decrypted output is in the same format.
# Comment lines (optionally indented, starting with #), blank lines and lines
# without "=" are ignored. Everything after the first "=" is the value.
# If a key appears more than once, the last occurrence wins.
#
# This function performs no auto-discovery: the caller must pass an existing path.
# Errors if `secrets_path` does not exist or if `sops` exits non-zero.
def cd-load-secrets [secrets_path: string]: nothing -> record {
    if not ($secrets_path | path exists) {
        error make { msg: $"Secrets file not found: ($secrets_path)" }
    }
    # `complete` captures exit code and stderr so a decrypt failure can be reported verbatim.
    let result = (do { ^sops --decrypt --output-type=dotenv $secrets_path } | complete)
    if $result.exit_code != 0 {
        error make { msg: $"SOPS decrypt failed for ($secrets_path):\n($result.stderr)" }
    }
    $result.stdout
    | lines
    # Trim leading whitespace before the comment test so indented comments are skipped too.
    | where {|line| not ($line | str trim --left | str starts-with "#") }
    | where {|line| $line | str contains "=" }
    | parse "{key}={value}"
    # upsert (not insert): a duplicated key must not abort the whole deploy.
    | reduce --fold {} {|row, acc| $acc | upsert $row.key $row.value }
}

# Evaluate a Nickel file given relative to the workspace root.
#
# The workspace root and the provisioning root ($env.PROVISIONING, falling back to
# /usr/local/provisioning) are handed to ncl-eval as its second argument, in that order.
def cd-ncl-export [ws_root: string, rel_path: string]: nothing -> record {
    let import_paths = [
        $ws_root
        ($env.PROVISIONING? | default "/usr/local/provisioning")
    ]
    ncl-eval ($ws_root | path join $rel_path) $import_paths
}

# Build the FIP_* environment record from <ws_root>/.provisioning-state.json.
#
# Reads `bootstrap.floating_ips` from the state file and maps three of its keys to
# lettered slots (key naming per bootstrap.nu: "librecloud-fip-" prefix stripped,
# dashes replaced with underscores):
#   smtp           → FIP_A  (Stalwart SMTP, sgoyol-1)
#   sgoyol_ingress → FIP_B  (sgoyol Cilium ingress)
#   wuji           → FIP_C  (wuji K8s API + ingress)
#
# Each slot yields FIP_<X>_IP and FIP_<X>_ID; a missing entry or field becomes "".
# Errors if the state file does not exist.
def cd-load-fip-env [ws_root: string]: nothing -> record {
    let state_file = ($ws_root | path join ".provisioning-state.json")
    if not ($state_file | path exists) {
        error make { msg: ".provisioning-state.json not found — run: provisioning bootstrap first" }
    }
    let floating = (open --raw $state_file | from json | get bootstrap | get floating_ips)
    let slots = [
        { letter: "A", key: "smtp" }
        { letter: "B", key: "sgoyol_ingress" }
        { letter: "C", key: "wuji" }
    ]
    $slots | reduce --fold {} {|slot, acc|
        let key = $slot.key
        let fip = ($floating | get -o $key | default {})
        $acc
        | insert $"FIP_($slot.letter)_IP" ($fip | get -o ip | default "")
        | insert $"FIP_($slot.letter)_ID" ($fip | get -o id | default "")
    }
}

# Build env var record for an extension install script.
#
# Protocol: scalar fields → `<PREFIX>_<FIELD>`, lists/records → `<PREFIX>_<FIELD>_JSON`.
# Full config also available as `<PREFIX>_CONFIG_JSON`. FIP vars and KUBECONFIG always set.
#
# <PREFIX> and <FIELD> are the upper-cased names with "-" and "." replaced by "_".
# A `cfg` that is not a record contributes no per-field variables, only `<PREFIX>_CONFIG_JSON`.
#
# Precedence on name clashes (later wins): per-field vars (a later field overrides an
# earlier one that normalizes to the same name) < `<PREFIX>_CONFIG_JSON` < `fip_env` < KUBECONFIG.
# E.g. a non-scalar field named `config` would otherwise collide with `<PREFIX>_CONFIG_JSON`.
def cd-ext-env [ext_name: string, cfg: any, fip_env: record, kubeconfig: string]: nothing -> record {
    # Shared normalization for the extension name and for each field name.
    let to_env_name = {|name| $name | str upcase | str replace --all "-" "_" | str replace --all "." "_" }
    let prefix = (do $to_env_name $ext_name)
    let flat = if ($cfg | describe | str starts-with "record") {
        $cfg | transpose key val | reduce --fold {} {|entry, acc|
            let raw_key   = (do $to_env_name $entry.key)
            let type_desc = ($entry.val | describe)
            let is_scalar = ($type_desc in ["string", "int", "float", "bool"])
            let env_key   = if $is_scalar { $"($prefix)_($raw_key)" } else { $"($prefix)_($raw_key)_JSON" }
            let env_val   = if $type_desc == "string" {
                $entry.val
            } else if $is_scalar {
                $entry.val | into string
            } else {
                $entry.val | to json --raw
            }
            # upsert: two fields normalizing to the same name must not abort the deploy.
            $acc | upsert $env_key $env_val
        }
    } else {
        {}
    }
    $flat
    # upsert: the documented always-present values win over any clashing field-derived key.
    | upsert $"($prefix)_CONFIG_JSON" ($cfg | to json --raw)
    | merge $fip_env
    | upsert KUBECONFIG $kubeconfig
}

# Locate the install script for an extension under extensions/clusters/.
#
# Extensions have inconsistent naming: some dirs use underscores (cert_manager, hcloud_floater)
# while scripts use dashes (install-cert-manager.sh, install-hcloud-floater.sh). Others are
# all-dash (oci-reg) or all-same (metallb, git, woodpecker, stalwart).
#
# The search itself (all 4 combinations of dir/script name with `_` or `-`) is done by
# cd-find-script-opt; this wrapper turns its "<not found>" sentinel into an error so the
# lookup order is defined in exactly one place.
def cd-find-script [prov_root: string, ext_name: string]: nothing -> string {
    let found = (cd-find-script-opt $prov_root $ext_name)
    if $found == "<not found>" {
        error make { msg: $"No install script for extension '($ext_name)' in ($prov_root)/extensions/clusters/ (tried all _/- variants)" }
    }
    $found
}

# Locate the install script for a component under extensions/components/.
#
# Components are structured as extensions/components/{comp_name}/{mode}/install-{comp_name}.sh.
#
# The search itself (all 4 combinations of dir/script name with dashes and underscores) is
# done by cd-find-component-script-opt; this wrapper turns its "<not found>" sentinel into
# an error so the lookup order is defined in exactly one place.
def cd-find-component-script [prov_root: string, comp_name: string, mode: string]: nothing -> string {
    let found = (cd-find-component-script-opt $prov_root $comp_name $mode)
    if $found == "<not found>" {
        error make { msg: $"No install script for component '($comp_name)' mode '($mode)' in ($prov_root)/extensions/components/ (tried all _/- variants)" }
    }
    $found
}

# Dry-run friendly component script lookup: never errors.
#
# Probes extensions/components/<dir>/<mode>/install-<script>.sh for the four dir/script
# spellings (underscore/underscore, underscore/dash, dash/dash, dash/underscore), in that
# order, and returns the first existing path — or the literal "<not found>".
def cd-find-component-script-opt [prov_root: string, comp_name: string, mode: string]: nothing -> string {
    let dashed      = ($comp_name | str replace --all "_" "-")
    let underscored = ($comp_name | str replace --all "-" "_")
    let base        = ($prov_root | path join "extensions/components")
    let candidates = ([
        { dir: $underscored, script: $underscored }
        { dir: $underscored, script: $dashed }
        { dir: $dashed,      script: $dashed }
        { dir: $dashed,      script: $underscored }
    ] | each {|c| $base | path join $c.dir $mode $"install-($c.script).sh" })
    let hits = ($candidates | where {|p| $p | path exists })
    if ($hits | is-not-empty) { $hits | first } else { "<not found>" }
}

# Dry-run friendly extension script lookup: never errors.
#
# Probes extensions/clusters/<dir>/default/install-<script>.sh for the four dir/script
# spellings (underscore/underscore, underscore/dash, dash/dash, dash/underscore), in that
# order, and returns the first existing path — or the literal "<not found>".
def cd-find-script-opt [prov_root: string, ext_name: string]: nothing -> string {
    let dashed      = ($ext_name | str replace --all "_" "-")
    let underscored = ($ext_name | str replace --all "-" "_")
    let base        = ($prov_root | path join "extensions/clusters")
    let candidates = ([
        { dir: $underscored, script: $underscored }
        { dir: $underscored, script: $dashed }
        { dir: $dashed,      script: $dashed }
        { dir: $dashed,      script: $underscored }
    ] | each {|c| $base | path join $c.dir "default" $"install-($c.script).sh" })
    let hits = ($candidates | where {|p| $p | path exists })
    if ($hits | is-not-empty) { $hits | first } else { "<not found>" }
}

# Execute the health gate for an extension, retrying on transient failures.
#
# `gate.check_cmd` is run through `bash -c`; exit code 0 means the gate passed.
# `gate.retries` is the total number of attempts. Values below 1 are treated as 1 so the
# check always runs at least once. Waits 10 seconds between attempts (not after the last).
# Errors once every attempt has failed.
def cd-health-gate [ext_id: string, gate: record]: nothing -> nothing {
    let attempts = ([$gate.retries 1] | math max)
    mut passed = false
    for attempt in 1..$attempts {
        # `complete` captures the exit code so a failing check does not abort the loop.
        let res = (do { ^bash -c $gate.check_cmd } | complete)
        if $res.exit_code == 0 {
            $passed = true
            print $"  [($ext_id)] health gate OK"
            break
        }
        if $attempt < $attempts {
            print $"  [($ext_id)] gate ($attempt)/($attempts) failed — retry in 10s"
            # Builtin sleep: no dependency on an external `sleep` binary.
            sleep 10sec
        }
    }
    if not $passed {
        error make { msg: $"[($ext_id)] health gate failed after ($attempts) attempts.\nCmd: ($gate.check_cmd)" }
    }
}

# Deploy cluster extensions — L3 platform or L4 application services.
#
# Reads the deployment DAG from cluster/<cluster>/<layer>-dag.ncl and extension configs
# from cluster/<cluster>/<layer>.ncl. Nodes are executed in the array order of the DAG's
# `extensions` list; each node's `depends_on` (optional, defaults to none) is verified
# against the nodes already completed and a violation aborts the run. FIP IPs and IDs come
# from .provisioning-state.json written by `provisioning bootstrap`.
#
# A node with a non-null `component` field is installed from extensions/components/
# (config looked up under `components.<id>`); any other node is installed from
# extensions/clusters/ using its `extension` field (config under `extensions.<id>`).
#
# Each install script receives:
#   <EXT>_<FIELD>        — scalar config values (namespace, version, host, …)
#   <EXT>_<FIELD>_JSON   — complex config values (ip_pools, node_selector, …)
#   <EXT>_CONFIG_JSON    — full extension config as JSON
#   FIP_A_IP / FIP_A_ID  — FIP-A (Stalwart SMTP)
#   FIP_B_IP / FIP_B_ID  — FIP-B (sgoyol ingress)
#   FIP_C_IP / FIP_C_ID  — FIP-C (wuji)
#   KUBECONFIG           — path to kubeconfig
#   plus every key of the decrypted secrets file, which overrides any of the above.
#
# Usage:
#   provisioning cluster deploy platform sgoyol --workspace librecloud_renew
#   provisioning cluster deploy apps    sgoyol --workspace librecloud_renew
export def "main cluster deploy" [
    layer: string              # Deployment layer: platform | apps
    cluster: string            # Cluster name (e.g. sgoyol, wuji)
    --workspace (-w): string   # Workspace name (default: active workspace)
    --dry-run (-n)              # Print the execution plan without running install scripts
    --kubeconfig (-k): string   # Override KUBECONFIG path for kubectl calls
    --secrets-file (-s): string # SOPS-encrypted dotenv file with install script secrets.
                                # Auto-discovered at cluster/<cluster>/secrets.sops.env if omitted.
] : nothing -> nothing {
    if not ($layer in ["platform", "apps"]) {
        error make { msg: $"layer must be 'platform' or 'apps', got: ($layer)" }
    }

    let ws_name = if ($workspace | is-not-empty) {
        $workspace
    } else {
        let details = (get-active-workspace-details)
        if ($details == null) {
            error make { msg: "No active workspace — pass --workspace or activate one first" }
        }
        $details.name
    }

    let ws_root   = (get-workspace-path $ws_name)
    let prov_root = ($env.PROVISIONING? | default "/usr/local/provisioning")
    let dag_rel   = $"cluster/($cluster)/($layer)-dag.ncl"
    let cfg_rel   = $"cluster/($cluster)/($layer).ncl"
    let kube_cfg  = if ($kubeconfig | is-not-empty) {
        $kubeconfig
    } else {
        $env.KUBECONFIG? | default "/etc/kubernetes/admin.conf"
    }

    print $"Cluster deploy | workspace: ($ws_name) | cluster: ($cluster) | layer: ($layer)"
    if $dry_run { print "DRY RUN — install scripts will not execute" }
    if ($secrets_file | is-not-empty) { print $"  secrets: ($secrets_file)" }
    print ""

    let dag      = (cd-ncl-export $ws_root $dag_rel)
    let cfg      = (cd-ncl-export $ws_root $cfg_rel)
    let fip_env  = (cd-load-fip-env $ws_root)
    let ext_cfgs = ($cfg | get extensions)

    # SOPS secrets: explicit path > auto-discovered cluster/<cluster>/secrets.sops.env > empty.
    # Secrets are merged AFTER NCL env vars — they override any overlapping computed values.
    let secrets_env = if ($secrets_file | is-not-empty) {
        cd-load-secrets $secrets_file
    } else {
        let auto_path = ($ws_root | path join $"cluster/($cluster)/secrets.sops.env")
        if ($auto_path | path exists) {
            print $"  secrets: ($auto_path)"
            cd-load-secrets $auto_path
        } else {
            {}
        }
    }

    # Walk nodes in array order; verify depends_on are satisfied, then install + gate.
    # The fold accumulator is the list of node ids completed so far.
    let _completed = ($dag.extensions | reduce --fold [] {|entry, completed|
        let ext_id = $entry.id

        # Dependency guard — catches DAG authoring errors. A node without `depends_on`
        # is treated as having no dependencies.
        let deps = ($entry | get -o depends_on | default [])
        let unsatisfied = ($deps | where {|dep| not ($dep in $completed) })
        if ($unsatisfied | is-not-empty) {
            error make { msg: $"[($ext_id)] depends on [($unsatisfied | str join ', ')] not yet deployed — fix DAG ordering in ($dag_rel)" }
        }

        # Dispatch: component nodes use extensions/components/ path; extension nodes use extensions/clusters/.
        let component    = ($entry | get -o component)
        let is_component = ($component != null)

        # Normalize both node kinds into one shape so the install/gate logic below is shared.
        let node = if $is_component {
            {
                name: $component.name
                mode: ($component | get -o mode | default "cluster")
                cfg:  ($cfg | get -o components | default {} | get -o $ext_id | default {})
            }
        } else {
            {
                name: $entry.extension
                mode: "default"
                cfg:  ($ext_cfgs | get -o $ext_id | default {})
            }
        }

        # secrets_env is merged last — its values win over any NCL-derived env var with the same key.
        let env_vars = (cd-ext-env $node.name $node.cfg $fip_env $kube_cfg | merge $secrets_env)
        let gate     = ($entry | get -o health_gate)

        if $is_component {
            print $"[($ext_id)] component: ($node.name) mode=($node.mode)"
        } else {
            print $"[($ext_id)] extension: ($node.name)"
        }
        if ($entry | get -o parallel | default false) { print "  note: parallel=true (sequential execution)" }

        if $dry_run {
            # Non-erroring lookup: a missing script is shown as "<not found>" in the plan.
            let script_display = if $is_component {
                cd-find-component-script-opt $prov_root $node.name $node.mode
            } else {
                cd-find-script-opt $prov_root $node.name
            }
            print $"  script:   ($script_display)"
            print $"  env keys: ($env_vars | columns | sort | str join ', ')"
            if $gate != null {
                # Only mark the preview as truncated when something was actually cut off
                # (substring 0..80 keeps 81 characters).
                let cmd     = $gate.check_cmd
                let preview = if ($cmd | str length) > 81 { $"($cmd | str substring 0..80)..." } else { $cmd }
                print $"  gate:     ($preview)"
            }
        } else {
            let script = if $is_component {
                cd-find-component-script $prov_root $node.name $node.mode
            } else {
                cd-find-script $prov_root $node.name
            }
            print $"  script: ($script)"
            print ""
            with-env $env_vars { ^bash $script }
            # NOTE(review): on newer Nushell releases a non-zero external exit may already
            # raise before this check is reached — confirm against the targeted version.
            let exit_code = $env.LAST_EXIT_CODE
            if $exit_code != 0 {
                let what = if $is_component { "component install script" } else { "install script" }
                error make { msg: $"[($ext_id)] ($what) exited ($exit_code)" }
            }
            if $gate != null {
                cd-health-gate $ext_id $gate
            }
        }

        print ""
        $completed | append $ext_id
    })

    print $"Cluster deploy complete: ($layer) on ($cluster)"
}