# ADR-033 — Cluster Component Extension Pattern.
#
# This file evaluates to the result of calling `make_adr` (exported by
# `adr-defaults.ncl`) on the record below. What `make_adr` adds or validates
# is not visible here — presumably defaults and a contract; confirm in
# `adr-defaults.ncl`.
#
# Formatting note: five string values below contain a literal line break
# (the text continues at column 0 on the next line, after a single space).
# Those breaks are part of the string values and are kept exactly as authored.
let adr_defaults = import "adr-defaults.ncl" in

adr_defaults.make_adr {
  # --- Identity -----------------------------------------------------------
  id = "adr-033",
  title = "Cluster Component Extension Pattern: split-script + manifest plan authoring contract",
  status = 'Accepted,
  date = "2026-04-24",

  # --- Narrative ----------------------------------------------------------
  context = "ADR-031 introduced the unified `prvng component ` CLI with polymorphic mode dispatch. The orchestrator server runs `install-{name}.sh {op}` as the cluster-mode entry point. Before this decision, no authoring contract existed for cluster extensions: credential file naming (`credentials.env` vs `_credentials.env`), method implementations, and the manifest plan structure were conventions known only from reading existing extensions. The postgresql extension was authored with the legacy monolithic pattern — `credentials.env`, all logic in `install-postgresql.sh`, no `{name}-lib.sh`, no `manifest_plan.ncl`. This produced a remote failure (`POSTGRES_PASSWORD is not set`) that was undetectable by the preflight, reached the server, and left the op in failed state.",

  # Contains a literal line break after "do not resolve. ".
  decision = "All cluster-mode extensions must follow the split-script pattern enforced by the preflight structural gate. The contract has four parts: (1) `install-{name}.sh` sources `_credentials.env` (underscore prefix, written by the bundle builder from SOPS decryption) — never `credentials.env`; (2) `{name}-lib.sh` implements `_method_{action}` for every non-builtin action declared in `manifest_plan.ncl`, including `post`/`pre` hook actions; (3) `manifest_plan.ncl` declares the operation DAG via the `ManifestPlan` Nickel contract from `schemas/lib/manifest_plan.ncl` — this contract enforces that `namespace` and `pvc` are never deleted or recreated in `update`/`delete`/`restart` plans; (4) `metadata.ncl requires[].capability` values must exactly match a `provides[].id` declared in another workspace component's `metadata.ncl` — the precondition gate does string-exact matching, generic IDs like `'storage'` do not resolve. 
The preflight gate in `cli/components.nu` checks all four contracts before packaging, surfacing violations as `[preflight] ❌` with the specific cause.",

  # --- Rationale: one { claim, detail } record per argument ---------------
  rationale = [
    {
      claim = "Credential filename mismatch is undetectable without structural inspection",
      detail = "The bundle builder writes `_credentials.env` (prefixed). An install script sourcing `credentials.env` (no prefix) silently skips the source — no error at local preflight, failure only on the remote node mid-plan. The structural gate reads the install script and rejects any `source.*credentials.env` line that does not contain the underscore.",
    },
    {
      claim = "Method coverage check prevents partial manifest plan execution",
      detail = "The plan runner generates `run-init.sh` from `manifest_plan.json` and calls `_method_{action}` for each custom step. A missing method produces `command not found` mid-run, leaving the cluster in a partial state. The preflight exhaustively checks all actions in `init`, `update`, `delete`, `restart` plus their `pre`/`post` hooks.",
    },
    {
      claim = "ManifestPlan Nickel contract encodes data-safety invariants at schema time",
      detail = "The `ManifestPlan` contract rejects any plan that applies `delete` or `recreate` to `namespace` or `pvc` in non-init operations. This is a compile-time safety net: the plan cannot be exported to JSON if it would destroy persistent data during a rolling update or delete operation.",
    },
    {
      claim = "Capability ID exact-match is the only resolution mechanism in the precondition gate",
      # Contains a literal line break after "no category hierarchy. ".
      detail = "The gate iterates workspace component NCL files, reads their `metadata.ncl provides[].id`, and matches against `requires[].capability`. There is no fuzzy matching, no aliasing, no category hierarchy. 
Using `'block-storage-csi'` vs `'storage'` is not a naming convention — it is a hard requirement for the gate to resolve the dependency chain.",
    },
  ],

  # --- Consequences -------------------------------------------------------
  consequences = {
    positive = [
      "Credential filename bug caught at local `--check` — never reaches the remote node",
      "Missing `_method_*` implementations surface as named preflight failures before any SSH",
      "ManifestPlan contract prevents accidental PVC/namespace destruction by type system, not convention",
      "Capability ID mismatch caught at op submission by the precondition gate with a named error",
    ],
    negative = [
      "Legacy monolithic extensions require backfill: add `{name}-lib.sh`, `manifest_plan.ncl`, rename `credentials.env` → `_credentials.env`",
      "Typos in `manifest_plan.ncl` action names (`'wai-ready` vs `'wait-ready`) fail at preflight but not at authoring time — no schema validation of action name strings",
    ],
  },

  # --- Alternatives considered: one { option, why_rejected } per entry ----
  alternatives_considered = [
    {
      option = "Monolithic install-{name}.sh with case/esac per-operation dispatch",
      why_rejected = "No structural contract between plan step declarations and shell method implementations. Credential filename bugs reach the remote node. Tested in postgresql initial authoring: produced a silent `POSTGRES_PASSWORD is not set` on the remote after a successful local preflight.",
    },
    {
      option = "Schema-validate action names in manifest_plan.ncl against a closed enum",
      why_rejected = "Custom actions are component-specific (`'create-credentials'`, `'bootstrap-account'`, `'protect-volume'`). A closed enum would require every extension to register action names centrally — breaks the distributed authoring model of ADR-020. The method-coverage gate achieves the same safety without a registry.",
    },
    {
      option = "Auto-source _credentials.env at run-{op}.sh level (bundle builder injects it)",
      # Contains a literal line break after "visible to any subcommand. ".
      why_rejected = "Credentials would be exported for the entire script lifetime, visible to any subcommand. 
The explicit `source` inside `_method_create-credentials` is the correct scope: credentials are loaded only when the method that needs them runs, and unset after. ADR-018 (secretumvault) requires minimal credential exposure time.",
    },
  ],

  # --- Constraints --------------------------------------------------------
  # Each constraint carries a machine-readable `check` record. Four of them
  # use a `'NuCmd` pipeline ending in `grep` with `expect_exit = 1`; grep
  # exits 1 when no line matches, so the expectation reads as "the preflight
  # output does not contain this message".
  # NOTE(review): the status of such a pipeline is normally grep's own, so a
  # `provisioning` invocation that fails for an unrelated reason would also
  # yield 1 — confirm how the check runner handles that.
  constraints = [
    {
      id = "credential-filename-underscore",
      claim = "install-{name}.sh must source _credentials.env, never credentials.env",
      scope = "provisioning/extensions/components/*/cluster/install-*.sh",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        # `\\.` is an escaped backslash in a Nickel string, so the pattern
        # value is `source.*[^_]credentials\.env`: a `source` line where the
        # character just before `credentials.env` is not an underscore.
        pattern = "source.*[^_]credentials\\.env",
        paths = ["provisioning/extensions/components/"],
        must_be_empty = true,
      },
      rationale = "The bundle builder writes the SOPS-decrypted secret to _credentials.env. Sourcing credentials.env (no underscore) silently skips the file — POSTGRES_PASSWORD (or any credential) is never set, and _require_env fails on the remote node with no local signal.",
    },
    {
      id = "lib-sh-required-for-cluster-components",
      claim = "Every cluster extension must have {name}-lib.sh with all _method_* implementations declared in manifest_plan.ncl",
      scope = "provisioning/extensions/components/*/cluster/",
      severity = 'Hard,
      check = {
        tag = 'NuCmd,
        cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep '_method_.*missing'",
        expect_exit = 1,
      },
      # Contains a literal line break after "not a remote failure. ".
      rationale = "The preflight structural gate exhaustively checks method coverage. A missing _method_X is a preflight failure, not a remote failure. 
Without this constraint, a partial lib.sh reaches the server and produces a bash `command not found` mid-plan, leaving the namespace in an inconsistent state.",
    },
    {
      id = "manifest-plan-ncl-required",
      claim = "Every cluster extension must have manifest_plan.ncl validated by the ManifestPlan Nickel contract",
      scope = "provisioning/extensions/components/*/cluster/manifest_plan.ncl",
      severity = 'Hard,
      check = {
        tag = 'FileExists,
        # NOTE(review): the path holds a literal `{name}` placeholder —
        # confirm the check runner expands it per component.
        path = "provisioning/extensions/components/{name}/cluster/manifest_plan.ncl",
        present = true,
      },
      rationale = "Without manifest_plan.ncl the bundle builder produces an empty plan — no run-*.sh scripts are generated. The ManifestPlan contract is the only enforcement mechanism for the namespace/pvc deletion protection invariant.",
    },
    {
      id = "capability-id-exact-provider-match",
      claim = "metadata.ncl requires[].capability must exactly match a provides[].id declared in a workspace component",
      scope = "provisioning/extensions/components/*/metadata.ncl",
      severity = 'Hard,
      check = {
        tag = 'NuCmd,
        cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep 'no provider found'",
        expect_exit = 1,
      },
      # Contains a literal line break after "with a named error. ".
      rationale = "The orchestrator precondition gate in src/preconditions.rs does string-exact lookup: provides[].id == requires[].capability. Generic terms like 'storage' do not match 'block-storage-csi'. The gate rejects the op at submission time, before any SSH, with a named error. 
Use the exact IDs from the target provider's metadata.ncl.",
    },
    {
      id = "sops-file-required-for-require-env",
      claim = "Every cluster extension that calls _require_env VAR in {name}-lib.sh must have infra/{ws}/secrets/{name}.sops.yaml present",
      scope = "provisioning/extensions/components/*/cluster/*-lib.sh",
      severity = 'Hard,
      check = {
        tag = 'NuCmd,
        cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep 'sops.yaml not found'",
        expect_exit = 1,
      },
      rationale = "The preflight SOPS gate (comp-build-cluster-bundle) checks for the secrets file before attempting bundle build. A missing secrets file means _require_env variables would be unset on the remote node, causing the install script to abort mid-plan. The preflight check surfaces this locally before any SSH occurs.",
    },
    {
      id = "sops-encrypted-regex-covers-require-env-vars",
      claim = "Every VAR referenced via _require_env in {name}-lib.sh must appear in sops.encrypted_regex of {name}.sops.yaml",
      scope = "infra/*/secrets/*.sops.yaml",
      severity = 'Hard,
      check = {
        tag = 'NuCmd,
        cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep 'not in sops.encrypted_regex'",
        expect_exit = 1,
      },
      rationale = "SOPS only encrypts keys matching encrypted_regex. A variable in _require_env that is absent from encrypted_regex is stored in plaintext in the SOPS file and silently passes decryption — it appears to work but leaks secrets in the committed YAML. The preflight checks name coverage explicitly against the regex.",
    },
  ],

  # --- Ontology cross-check and related ADRs ------------------------------
  ontology_check = {
    decision_string = "Cluster extension authoring contract: split-script (install.sh + lib.sh + manifest_plan.ncl) + _credentials.env naming + exact capability IDs — enforced by preflight structural gate before bundle packaging",
    invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
    verdict = 'Safe,
  },
  related_adrs = ["adr-020-extension-capability-declarations", "adr-031-unified-component-cli"],
}