# ADR-032 — Node Role and Scale Constraints.
#
# Records the decision to gate server deletion on node role (ControlPlane
# nodes are rejected) and on the declared `scale.min` bound for Workers.
#
# The record below is handed to `make_adr` from `adr-defaults.ncl`; any
# defaults or contracts that helper applies are defined there, not here.
let d = import "adr-defaults.ncl" in

d.make_adr {
  id = "adr-032",
  title = "Node Role and Scale Constraints: ControlPlane Immutability and Worker Lifecycle Gates",
  status = 'Accepted,
  date = "2026-04-20",

  # Problem statement: what can go wrong without role-aware deletion gates.
  context = "The provisioning system gained a formal `NodeRole` enum (`ControlPlane | Worker | LoadBalancer`) declared in `schemas/infrastructure/compute/scaling.ncl` alongside a `ScalePolicy` contract that captures min/max bounds and a hardware template for spawning new nodes. Without explicit lifecycle gates, any operator with `can_operate` permission could call `server_delete` on a ControlPlane node — destroying the k0s controller, etcd state, and all cluster API endpoints in one call. A secondary risk exists in the opposite direction: deleting the last Worker node while the ControlPlane still serves its API violates the `scale.min` bound declared in the NCL, leaving the cluster in a partially healthy state with no execution capacity.",

  # The three enforcement layers: schema default, daemon UI gates, teardown order.
  decision = "Enforce role-aware lifecycle gates at three layers: (1) Schema — `delete_lock` is implicitly true for ControlPlane nodes via the `make_server` helper in each workspace's `servers.ncl`; hcloud protection mirrors the schema intent. (2) Daemon UI — a dedicated POST `/ui/workspaces/{ws}/servers/{srv}/scale-down` handler runs two sequential gates before calling `server_delete`: Gate-1 rejects any request where `role == ControlPlane` with HTTP 422; Gate-2 counts live hcloud servers whose names match the `scale.template.hostname_pattern` prefix — if removing this node would bring the count below `scale.min`, the request is rejected. (3) Teardown order — ControlPlane nodes can only be targeted for deletion through a dedicated `teardown` workflow (future) that first deprovisions all Workers; the scale-down endpoint is not the teardown path. The scale-down endpoint is the only UI-exposed deletion path for Worker/LB nodes — the raw `server_delete` tool remains available to admin-role CLI operators only.",

  # One entry per design choice: `claim` is the choice, `detail` is the reasoning.
  rationale = [
    # NOTE(review): this entry says delete protection is bypassed via `--force`,
    # while the last entry of `alternatives_considered` says protection is
    # disabled first in a compound command. Confirm which mechanism the daemon
    # actually uses and align the two descriptions.
    {
      claim = "Gate at the daemon layer, not only at the schema layer",
      detail = "hcloud `protection.delete = true` prevents accidental UI clicks on the hcloud console but does not fire when the provisioning daemon calls the hcloud CLI with `--force`. The daemon gate is the authoritative enforcement point because it understands role semantics. Schema-level `delete_lock` is a documentation and default-setting mechanism, not a runtime gate.",
    },
    {
      claim = "Separate scale-down endpoint instead of adding guards to the existing server_delete tool",
      detail = "The `server_delete` tool is a low-level destructive primitive registered in provisioning-core. Adding role-awareness to it would couple infrastructure topology semantics into the core tool layer, which is designed to be workspace-agnostic. The scale-down UI handler is workspace-scoped — it loads `servers.ncl` for the active workspace to read the role and scale policy, then calls the primitive only after gates pass.",
    },
    {
      claim = "ScalePolicy.min is the authoritative lower bound, not a hardcoded value",
      detail = "Different infra environments have different operational minimums. A dev workspace may tolerate 0 workers; a production cluster requires at least 2 for HA. Encoding min in the NCL `ScalePolicy` means the gate is always consistent with the declared intent, with no magic constants in daemon code.",
    },
    {
      claim = "Teardown order (Workers before ControlPlane) is not enforced by scale-down",
      detail = "The scale-down endpoint enforces min-bound and CP-immutability but does not implement full teardown sequencing. A full teardown (destroy entire infra env) is a DAG-inverted workflow: reverse the provisioning DAG, deprovision Workers first, then ControlPlane. This is a separate concern handled by a future `teardown` workflow endpoint. Mixing teardown logic into scale-down would conflate two distinct operations.",
    },
  ],

  consequences = {
    positive = [
      "ControlPlane nodes cannot be deleted via the UI regardless of operator permission level",
      "Worker deletion is gated on the declared scale.min — under-provision accidents are caught before hcloud API call",
      # NOTE(review): `decision` keeps the raw `server_delete` tool available to
      # admin-role CLI operators, so this gate appears authoritative for the UI
      # path only. Confirm the "across CLI, MCP, and HTTP handlers" wording.
      "The daemon UI gate is the single authoritative enforcement point — no duplication across CLI, MCP, and HTTP handlers",
      "ScalePolicy.min can be changed in NCL without touching daemon code",
    ],
    negative = [
      "Admin operators who intentionally need to delete a CP node (disaster recovery, full teardown) must use the CLI `server_delete` tool directly — the UI does not expose an override path",
      "The hostname_pattern prefix heuristic for counting live workers is a string-prefix match, not a typed query — it fails if two workspaces share a hostname prefix",
    ],
  },

  # Options that were evaluated and rejected, each with the reason.
  alternatives_considered = [
    {
      option = "Add role check to the existing server_delete tool in provisioning-core",
      why_rejected = "server_delete is a workspace-agnostic primitive. Loading servers.ncl inside a core tool would introduce workspace path coupling into a layer that must remain context-free. The UI handler already has workspace context.",
    },
    {
      option = "Use Cedar policies for role-based node protection",
      why_rejected = "Cedar is configured for principal-level authorization (who can do what), not for resource-level topology constraints (which nodes are protected). The node role is a property of the infrastructure declaration, not of the actor's permissions. Cedar would need to be fed the role data per-request — more complexity than a local gate with no added safety.",
    },
    {
      option = "Block deletion via hcloud protection flag only",
      why_rejected = "hcloud protection fires only when the hcloud CLI is called directly. The provisioning daemon calls the hcloud CLI with privilege — protection can be disabled before deletion in a single compound command. It is a backstop, not a gate.",
    },
  ],
}