# ADR-032: Node Role and Scale Constraints.
#
# The record below is passed to `make_adr`, which is imported from
# `adr-defaults.ncl`. Fields are laid out one per line; all string values are
# kept verbatim.
let adr_defaults = import "adr-defaults.ncl" in
adr_defaults.make_adr {
  # --- Identity -------------------------------------------------------------
  id = "adr-032",
  title = "Node Role and Scale Constraints: ControlPlane Immutability and Worker Lifecycle Gates",
  status = 'Accepted,
  date = "2026-04-20",

  # --- Context: the problem this decision responds to -------------------------
  context = "The provisioning system gained a formal `NodeRole` enum (`ControlPlane | Worker | LoadBalancer`) declared in `schemas/infrastructure/compute/scaling.ncl` alongside a `ScalePolicy` contract that captures min/max bounds and a hardware template for spawning new nodes. Without explicit lifecycle gates, any operator with `can_operate` permission could call `server_delete` on a ControlPlane node — destroying the k0s controller, etcd state, and all cluster API endpoints in one call. A secondary risk exists in the opposite direction: deleting the last Worker node while the ControlPlane still serves its API violates the `scale.min` bound declared in the NCL, leaving the cluster in a partially healthy state with no execution capacity.",

  # --- Decision ---------------------------------------------------------------
  # NOTE: this string contains a literal line break after "teardown path. ";
  # the continuation line therefore starts at column 0 and must stay there to
  # keep the value unchanged.
  decision = "Enforce role-aware lifecycle gates at three layers: (1) Schema — `delete_lock` is implicitly true for ControlPlane nodes via the `make_server` helper in each workspace's `servers.ncl`; hcloud protection mirrors the schema intent. (2) Daemon UI — a dedicated POST `/ui/workspaces/{ws}/servers/{srv}/scale-down` handler runs two sequential gates before calling `server_delete`: Gate-1 rejects any request where `role == ControlPlane` with HTTP 422; Gate-2 counts live hcloud servers whose names match the `scale.template.hostname_pattern` prefix — if removing this node would bring the count below `scale.min`, the request is rejected. (3) Teardown order — ControlPlane nodes can only be targeted for deletion through a dedicated `teardown` workflow (future) that first deprovisions all Workers; the scale-down endpoint is not the teardown path. 
The scale-down endpoint is the only UI-exposed deletion path for Worker/LB nodes — the raw `server_delete` tool remains available to admin-role CLI operators only.",

  # --- Rationale: one { claim, detail } entry per argument ---------------------
  rationale = [
    {
      claim = "Gate at the daemon layer, not only at the schema layer",
      detail = "hcloud `protection.delete = true` prevents accidental UI clicks on the hcloud console but does not fire when the provisioning daemon calls the hcloud CLI with `--force`. The daemon gate is the authoritative enforcement point because it understands role semantics. Schema-level `delete_lock` is a documentation and default-setting mechanism, not a runtime gate.",
    },
    {
      claim = "Separate scale-down endpoint instead of adding guards to the existing server_delete tool",
      detail = "The `server_delete` tool is a low-level destructive primitive registered in provisioning-core. Adding role-awareness to it would couple infrastructure topology semantics into the core tool layer, which is designed to be workspace-agnostic. The scale-down UI handler is workspace-scoped — it loads `servers.ncl` for the active workspace to read the role and scale policy, then calls the primitive only after gates pass.",
    },
    {
      claim = "ScalePolicy.min is the authoritative lower bound, not a hardcoded value",
      detail = "Different infra environments have different operational minimums. A dev workspace may tolerate 0 workers; a production cluster requires at least 2 for HA. Encoding min in the NCL `ScalePolicy` means the gate is always consistent with the declared intent, with no magic constants in daemon code.",
    },
    {
      claim = "Teardown order (Workers before ControlPlane) is not enforced by scale-down",
      # NOTE: this string contains a literal line break after
      # "workflow endpoint. "; the continuation line starts at column 0.
      detail = "The scale-down endpoint enforces min-bound and CP-immutability but does not implement full teardown sequencing. A full teardown (destroy entire infra env) is a DAG-inverted workflow: reverse the provisioning DAG, deprovision Workers first, then ControlPlane. This is a separate concern handled by a future `teardown` workflow endpoint. 
Mixing teardown logic into scale-down would conflate two distinct operations.",
    },
  ],

  # --- Consequences -----------------------------------------------------------
  consequences = {
    positive = [
      "ControlPlane nodes cannot be deleted via the UI regardless of operator permission level",
      "Worker deletion is gated on the declared scale.min — under-provision accidents are caught before hcloud API call",
      "The daemon UI gate is the single authoritative enforcement point — no duplication across CLI, MCP, and HTTP handlers",
      "ScalePolicy.min can be changed in NCL without touching daemon code",
    ],
    negative = [
      "Admin operators who intentionally need to delete a CP node (disaster recovery, full teardown) must use the CLI `server_delete` tool directly — the UI does not expose an override path",
      "The hostname_pattern prefix heuristic for counting live workers is a string-prefix match, not a typed query — it fails if two workspaces share a hostname prefix",
    ],
  },

  # --- Alternatives considered: one { option, why_rejected } entry each --------
  alternatives_considered = [
    {
      option = "Add role check to the existing server_delete tool in provisioning-core",
      why_rejected = "server_delete is a workspace-agnostic primitive. Loading servers.ncl inside a core tool would introduce workspace path coupling into a layer that must remain context-free. The UI handler already has workspace context.",
    },
    {
      option = "Use Cedar policies for role-based node protection",
      why_rejected = "Cedar is configured for principal-level authorization (who can do what), not for resource-level topology constraints (which nodes are protected). The node role is a property of the infrastructure declaration, not of the actor's permissions. Cedar would need to be fed the role data per-request — more complexity than a local gate with no added safety.",
    },
    {
      option = "Block deletion via hcloud protection flag only",
      why_rejected = "hcloud protection fires only when the hcloud CLI is called directly. The provisioning daemon calls the hcloud CLI with privilege — protection can be disabled before deletion in a single compound command. It is a backstop, not a gate.",
    },
  ],
}