# provisioning/adrs/adr-037-ops-contract-dual-mode.ncl

let d = import "adr-defaults.ncl" in

d.make_adr {
  # Identity metadata for this ADR record, passed to `d.make_adr`.
  id     = "adr-037",
  title  = "Ops contract dual-mode: NATS pending queue, JWT-signed commands, and switchable signer (keeper-VM auto / operator manual) without code changes",
  status = 'Accepted,
  date   = "2026-04-26",

  context = "The provisioning platform needs a coordination contract for runtime workload changes (deploy/scale/restart/secret_update/drain) that satisfies four constraints simultaneously: (1) the workload cluster (libre-wuji class) must be runtime-autonomous — it cannot pull from the CI cluster (libre-daoshi class) at boot or for steady-state operation; (2) operators must be able to drive ops manually from a laptop with a hardware key when the automated signer is offline, with no code changes; (3) multiple emitters (CI pipelines on libre-daoshi, operator laptops, future GitHub Actions) must be able to propose ops concurrently without distributed-lock complexity; (4) every applied op must be auditable with cryptographic provenance independent of any single node remaining online. The naive design — workload cluster pulls a deploy spec from a known git repo on the CI cluster — fails constraint (1); a direct RPC from CI to a signing service on a single VM fails constraint (2) when the signer dies; ad-hoc multi-emitter coordination via filesystem locks or database advisory locks fails constraint (3) under network partitions; storing audit logs only on the workload cluster fails constraint (4) when that cluster is lost. The design needs a single coordination substrate that decouples emitters from signers, serializes concurrent ops, survives signer outages without losing operations, and emits auditable provenance independent of cluster health.",

  decision = "Adopt a NATS JetStream-based ops contract with four subject namespaces and dual-mode signing. (1) Subject layout per workspace: `ops.pending.<workspace>.<op_type>` for unsigned proposals, `ops.cmd.<workspace>.<op_type>` for signed commands ready to apply, `ops.ack.<workspace>.<op_type>` for application result, `ops.audit.<workspace>` for the immutable audit stream. JetStream streams `OPS_PENDING_<workspace>` (WorkQueue retention, 14 days) and `OPS_CMD_<workspace>` (WorkQueue retention, 24 hours) plus `OPS_AUDIT_<workspace>` (Limits retention, 90 days, replicas=3) implement the persistence and ordering guarantees. (2) JWT claims for every signed message: `iss` (signer identity: keeper-vm-primary | operator-<id> | gh-actions-<id>), `sub` (requesting principal: woodpecker-job-<id> | manual-<operator>), `aud` (target workspace), `scopes` (allowed op_type:target tuples), `seq` (per-issuer monotonic counter — anti-replay), `jti` (UUIDv4 idempotency key), `expected_state_version` (optimistic concurrency token), `exp`/`nbf` (validity window). (3) Signer is any subscriber to `ops.pending.*` with a key in the workspace's authorized-signers set. The keeper-daemon (running on the dedicated ops-vm workspace) auto-signs operations matching a declarative policy file (see ADR-XXX keeper policy schema); the keeper-cli running on operator laptops with a YubiKey signs interactively via `keeper pending sign <id>`. Both produce identical JWT-signed messages on `ops.cmd.*` — wuji's ops-controller does not distinguish between automated and manual signers, only the JWT validity. (4) Mode switch is operational, not configurational: stopping the keeper-daemon process on ops-vm degrades the system to operator-only mode without any code or config change in wuji or daoshi. Restarting it restores automated signing. A hybrid mode is supported by tuning the keeper policy to auto-sign only safe operations (e.g., scale and restart on staging targets) while leaving production deploys for manual approval. (5) Multi-emitter coordination is delegated to JetStream: emitters publish independently with their own per-issuer sequence; the stream's total order resolves concurrency; the ops-controller in wuji applies in stream order with `expected_state_version` optimistic concurrency, returning 409 conflict on the second emitter when two ops target the same state version. (6) Wuji's ops-controller is the single subscriber to `OPS_CMD_<workspace>` in WorkQueue mode — there is exactly one applier per workspace, eliminating the need for distributed leader election; if the controller pod restarts, persisted state in SurrealDB allows reconciliation of in-flight ops on resume.",

  rationale = [
    {
      claim  = "JetStream WorkQueue retention with single subscriber gives total order without distributed locks",
      detail = "Multi-emitter coordination is the load-bearing complexity in this design. JetStream's WorkQueue stream type with a single durable consumer (wuji ops-controller) provides exactly-once delivery in stream order. Concurrent emitters from libre-daoshi, operator laptops, and external CI publish proposals to `ops.pending.*` independently; signers republish them to `ops.cmd.*`, and the `OPS_CMD_<workspace>` stream sequences the signed commands by arrival time. No emitter needs to coordinate with another. The controller applies in order; optimistic concurrency on `expected_state_version` rejects ops that read stale state, which manifests to the emitter as a 409 conflict via NATS request-reply on `ops.ack.*`. This shifts coordination from client-side distributed locks (which require failure-mode reasoning across emitter, lock server, and cluster) to the broker, which has well-understood semantics.",
    },
    {
      claim  = "Pending queue between emitters and signers makes mode switching free",
      detail = "If emitters published directly to `ops.cmd.*` (signing inline) the system would couple emitter availability to signer availability. By interposing `ops.pending.*` as a separate subject namespace, emitters publish proposals without knowing or caring about who signs. Any subscriber to `ops.pending.*` with a key in the authorized-signers set can sign and republish to `ops.cmd.*`. Switching from auto-sign (keeper-daemon on ops-vm) to manual-sign (operator laptop with keeper-cli) requires no change to emitters and no change to the consumer (wuji ops-controller) — it requires only enabling or disabling the relevant subscriber. This is the same decoupling pattern as a message queue with multiple consumer groups, applied to a signing-and-republish role.",
    },
    {
      claim  = "Mandatory JWT scope tuples prevent privilege escalation across workspaces",
      detail = "Each signer's JWT is constrained by `scopes` — an array of `op_type:target_pattern` tuples (e.g., `deploy:staging-*`, `scale:vapora`). The ops-controller validates that the requested op falls within at least one scope tuple before applying. A keeper-vm-primary key with scope `deploy:staging-*` cannot sign a deploy to `production-*` even if the policy file permits it locally — the JWT scope is the authoritative declaration. This means a compromised keeper-VM cannot forge production ops if its key was issued with staging-only scopes. Scope rotation (narrowing or widening) is a key-rotation operation, which is auditable.",
    },
    {
      claim  = "ops-controller persists in-flight ops to SurrealDB before ack to survive restart without duplicate apply",
      detail = "The naive controller acks `ops.cmd.*` consumption first then applies, which would mean a crash between ack and apply produces a missed op (not retried by JetStream because acked). The reverse — apply first then ack — produces possible duplicate apply if the controller crashes after applying but before acking. The correct pattern is: read message, persist `(jti, op_payload, state=pending)` to SurrealDB transactionally, ack to JetStream, then apply, then update SurrealDB to `state=applied`. On restart, the controller reads SurrealDB for `state=pending` rows and reconciles each by checking whether the op was actually applied (idempotency key prevents double-apply). This requires the apply layer to be idempotent on `jti`, which is a design requirement on every op handler.",
    },
    {
      claim  = "JWT issuer values are not service identities but key identities — survives signer migration",
      detail = "The `iss` claim names a key, not a service. `keeper-vm-primary` is the key currently held by the keeper-daemon; if the keeper-daemon migrates to a different VM, it still presents the same `iss`. Scope rotation (issuing a new key with different scopes) is a separate operation. This decoupling means we can move the keeper-daemon from ops-vm to a laptop temporarily without rotating keys, and a hardware-key-only operator setup uses a different `iss` (e.g., `operator-jpl-yubikey`) so audit trails remain attributable. A compromised key is revoked by removing its `iss` from the workspace's authorized-signers set, which is itself an op (governance op) signed by the operator quorum.",
    },
  ],

  consequences = {
    positive = [
      "Wuji is runtime-autonomous: it pulls nothing from daoshi at boot or steady state — only consumes signed messages from its own NATS JetStream",
      "Daoshi is replaceable: any system holding a signer key can drive ops; the platform's ops contract is not coupled to one CI provider",
      "Mode switch (auto/manual/hybrid) is operational not architectural — `systemctl stop keeper-daemon` is the entire migration to operator-only",
      "Multi-emitter coordination is a property of the broker (JetStream stream order), not an application concern",
      "Audit trail is on a separate stream with independent retention — applying ops cannot interfere with audit log integrity",
      "Replay protection (jti uniqueness + monotonic seq) prevents reissuing intercepted JWTs",
      "Optimistic concurrency surfaces conflicts as explicit 409s to emitters, not as silent overwrites — emitters decide retry policy",
      "ops-controller restart is safe because in-flight ops are persisted before ack — no missed ops, no duplicate applies",
    ],
    negative = [
      "NATS JetStream is now load-bearing for production ops — its availability constrains deploy throughput; mitigation: replicas=3 within wuji",
      "Idempotency contract on every op handler is a development requirement that must be tested per op_type — adding a new op_type requires verifying double-apply safety",
      "JWT clock skew between signer and verifier requires NTP/chrony on all signing hosts and on wuji nodes — operational requirement not visible from code",
      "JetStream retention windows (14 days pending, 24 hours cmd, 90 days audit) must be sized against the operational rhythm — pending exhaustion in operator-only mode if quorum review takes longer than 14 days will silently drop proposals",
      "Multi-emitter conflicts surface as 409s to emitters, who must implement retry-after-restate logic — emitters that ignore 409 will lose their op silently",
    ],
  },

  alternatives_considered = [
    {
      option       = "Direct HTTP RPC from emitters to a centralized signer service",
      why_rejected = "Couples emitter availability to signer availability and re-introduces the single-VM SPOF. Also requires the signer to be reachable on the network from every emitter, including external CI providers, which is a firewall complication. NATS JetStream as the substrate is already deployed for the orchestrator (ADR-012) and provides the same effect (decoupling, retry, audit) with no new network surface.",
    },
    {
      option       = "Pull-based deploys: wuji pulls deploy specs from a git repo on daoshi at intervals",
      why_rejected = "Violates wuji autonomy — wuji's runtime would depend on daoshi's git server being reachable. Also introduces eventual-consistency uncertainty (when does a push become visible?) without giving emitters a synchronous signal of acceptance. The pending/cmd/ack triple gives emitters a clear lifecycle: proposal accepted, op signed, op applied or rejected.",
    },
    {
      option       = "GitOps via Flux/ArgoCD with workload cluster pulling from a Radicle repo",
      why_rejected = "Solves the autonomy concern (Radicle is decentralized) but inherits GitOps' weaknesses for ops not modeled as state declarations: scale/restart/drain are imperative ops that require sequencing, not state convergence. Modeling them as state-document edits requires an awkward layer of versioned state files and reconciliation loops; pending-and-signed messages on a queue match the ops semantics directly. GitOps may complement this for the workload-config layer (ADR-038 covers Radicle's role in the desired-state ledger), but is not a replacement for ops coordination.",
    },
    {
      option       = "Distributed lock via SurrealDB live queries for multi-emitter coordination",
      why_rejected = "Introduces a write-write coordination problem on the lock document under concurrent emitters, recreating the distributed-lock complexity the JetStream approach avoids. JetStream's stream order is already a globally consistent total order — using it for both the message itself and the coordination semantics is simpler than separating the two concerns.",
    },
  ],

  constraints = [
    {
      # Hard constraint: one ops-controller consumer per workspace on
      # OPS_CMD_<workspace>.
      id        = "ops-controller-single-subscriber",
      claim     = "Exactly one ops-controller consumer subscribes to OPS_CMD_<workspace> in WorkQueue mode per workspace; multiple subscribers would break ordering guarantees",
      scope     = "platform/crates/ops-controller/, infra/.../components/ops_controller.ncl",
      severity  = 'Hard,
      # NOTE(review): `paths` covers only the crate directory, while `scope`
      # and `rationale` also name the component config (single replica).
      # The grep looks for the consumer identifiers in the crate; it does not
      # inspect the replica count — confirm that is verified elsewhere.
      check     = {
        tag         = 'Grep,
        pattern     = "deliver_subject|durable_consumer",
        paths       = ["platform/crates/ops-controller/"],
        must_be_empty = false,
      },
      rationale = "JetStream WorkQueue with multiple consumers distributes messages round-robin across them, which breaks the single-applier invariant that backs the optimistic-concurrency contract. The constraint is enforced by component config (single replica) and runtime check on consumer creation.",
    },
    {
      # Hard constraint: the ops-controller must check JWT scopes against the
      # requested op_type:target before applying an op.
      id        = "jwt-scope-validation-mandatory",
      claim     = "ops-controller MUST validate JWT scopes against the requested op_type:target before applying; missing scope = reject with 403, do not log a 200",
      scope     = "platform/crates/ops-controller/src/auth.rs",
      severity  = 'Hard,
      # Textual presence check only (see `rationale`): the grep searches the
      # whole `src/` tree, which is wider than the single file in `scope`.
      check     = {
        tag         = 'Grep,
        pattern     = "validate_scopes|check_scope_match",
        paths       = ["platform/crates/ops-controller/src/"],
        must_be_empty = false,
      },
      rationale = "Without scope validation, any signer key with valid signature can submit any op type to any target, eliminating the privilege boundary that makes scoped keys useful. The check ensures scope validation is at least textually present; runtime tests verify behavior.",
    },
    {
      # Hard constraint: every op_type handler must be idempotent on `jti`.
      id        = "idempotency-contract-per-op-handler",
      claim     = "Every op_type handler in ops-controller MUST be idempotent on jti — double-apply with same jti must produce the same final state and not duplicate side effects",
      scope     = "platform/crates/ops-controller/src/handlers/",
      severity  = 'Hard,
      # The pattern matches a `fn handle_*` signature whose parenthesised
      # parameter list mentions `jti` (the doubled backslashes yield literal
      # parentheses in the regex).
      # NOTE(review): with `must_be_empty = false` this is presumably satisfied
      # by a single matching handler, so it would not by itself establish that
      # *every* handler takes `jti` — confirm against the check runner.
      check     = {
        tag         = 'Grep,
        pattern     = "fn handle_.*\\(.*jti.*\\)",
        paths       = ["platform/crates/ops-controller/src/handlers/"],
        must_be_empty = false,
      },
      rationale = "The persist-then-ack-then-apply protocol requires handlers to handle restart-induced re-execution. A handler that issues a deploy command twice is allowed by NATS semantics under restart and must produce no observable difference — typically by checking the jti against persisted apply state before issuing side effects.",
    },
    {
      # Soft constraint: depth and oldest-message age of the pending queue
      # must be exported as Prometheus metrics.
      id        = "pending-queue-ttl-monitored",
      claim     = "OPS_PENDING_<workspace> queue depth and oldest-message age MUST be exposed as Prometheus metrics so operator-only mode (where pendings can accumulate) is observable",
      scope     = "platform/crates/ops-controller/, infra/.../components/observability.ncl",
      severity  = 'Soft,
      # Grep for either metric name in the controller sources; laid out like
      # the `check` records of the other constraints in this list.
      check     = {
        tag         = 'Grep,
        pattern     = "ops_pending_queue_depth|ops_pending_oldest_age_seconds",
        paths       = ["platform/crates/ops-controller/src/"],
        must_be_empty = false,
      },
      rationale = "In operator-only mode, pendings accumulate awaiting human signature. Without monitoring, operators may not notice that a pending sat for 13 days and is about to expire. The 14-day retention is generous but finite; observability of queue state is the operational mitigation against silent drop.",
    },
  ],

  # Ontology self-check: one-line decision summary, the invariants this
  # decision touches, and the resulting verdict.
  # NOTE(review): two invariants are listed in `invariants_at_risk`, but the
  # `invariant_justification` field of this record names only
  # "solid-boundaries" — confirm whether "config-driven-always" needs its own
  # justification.
  ontology_check = {
    decision_string    = "Ops contract dual-mode: NATS JetStream with ops.pending/ops.cmd/ops.ack/ops.audit subject namespaces + JWT-signed commands with scopes + replaceable signer (keeper-daemon auto / keeper-cli manual) + ops-controller as single per-workspace WorkQueue consumer with SurrealDB persistence of in-flight ops",
    invariants_at_risk = ["solid-boundaries", "config-driven-always"],
    verdict            = 'Safe,
  },

  # Identifiers of other ADRs linked to this decision (presumably resolved by
  # id-slug against sibling ADR files — confirm against `adr-defaults.ncl`).
  related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-014-solid-enforcement", "adr-038-radicle-decentralized-governance", "adr-039-build-infrastructure-ephemeral"],

  # Justification for the "solid-boundaries" invariant, which is one of the
  # entries in `ontology_check.invariants_at_risk`: states the boundary the
  # ops-controller keeps and how that boundary is enforced.
  invariant_justification = {
    invariant  = "solid-boundaries",
    claim      = "ops-controller is a new service with a new SOLID boundary: it ONLY consumes from ops.cmd, applies via the orchestrator API, and writes to ops.audit and SurrealDB — it does not call provider APIs or auth services directly",
    mitigation = "Cedar policy enforces that ops-controller's service identity has no permissions to call hcloud, aws, or vault directly; orchestrator interface is the only allowed dependency. Compile-time check in the ops-controller crate forbids hcloud-rs and aws-sdk-rust as dependencies.",
  },
}