# provisioning/adrs/adr-014-solid-enforcement.ncl

let d = import "adr-defaults.ncl" in

# ADR-014: SOLID architecture boundaries and the layers that enforce them.
# The record is handed to `make_adr` from the imported adr-defaults.ncl; the
# field meanings below are described only as far as this file shows them.
d.make_adr {
  id     = "adr-014",
  title  = "SOLID Architecture Boundaries with Multi-Layer Enforcement",
  status = 'Accepted,
  date   = "2026-02-17",

  # Problem statement: what went wrong before this decision was taken.
  context = "As the platform expanded from a CLI tool to a multi-service control plane, a critical failure mode emerged: the Nushell CLI directly called cloud provider CLIs (hcloud, aws, doctl). This violated Single Responsibility (the CLI acquired infrastructure execution responsibility) and Dependency Inversion (CLI depended on concrete provider CLIs instead of the Orchestrator abstraction). Consequences: provider credentials leaked into CLI process environment (HCLOUD_TOKEN as env var), no audit trail for provider API calls made outside Orchestrator, SSH operations done by CLI bypassed the task state machine and rollback capability, auth decisions (JWT validation) duplicated across services instead of delegated to Control Center, secret files read directly by multiple services bypassing Vault's lease lifecycle. Documentation alone fails to enforce boundaries: engineers under time pressure skip it. The enforcement must be structural.",

  # The decision itself: six boundaries, six enforcement layers.
  decision = "Six hard boundaries are enforced at six independent layers. Each layer is a fail-safe — any single layer catching a violation is sufficient to prevent it shipping. The six boundaries: (1) Provider API calls only in orchestrator crate, (2) SSH operations only in orchestrator+machines crates, (3) SurrealDB access from CLI forbidden, (4) Secret credentials forbidden in NATS messages, (5) Auth decisions only in control-center crate, (6) Raw secret file/env reads in services forbidden. Enforcement layers: compile-time (pub(crate) visibility), dev-time (Claude PreToolUse hook), pre-commit (git hook), CI (architecture tests), runtime (Cedar policies), continuous audit (NATS audit subject).",

  # Supporting arguments, each as a short claim plus its justification.
  rationale = [
    {
      claim  = "Documentation alone is insufficient — enforcement must be structural",
      detail = "Engineers under time pressure bypass documentation. The six-layer enforcement stack means a violation must simultaneously evade compile-time type checking, the Claude dev hook, the pre-commit grep, the CI architecture test, Cedar policy evaluation, and the NATS audit collector. Any single layer is sufficient to catch it.",
    },
    {
      claim  = "Compile-time is the cheapest enforcement layer",
      detail = "Provider client types are pub(crate) inside orchestrator. Other crates cannot import them — the Rust compiler rejects the build before any test runs. This is O(0) runtime cost.",
    },
    {
      claim  = "AUTH corollaries prevent auth fragmentation",
      detail = "solo_auth_middleware is the only documented auth bypass, gated behind --mode solo. All protected routes are inside route_layer(). UserContext is extracted from request extensions, never from headers directly. Cedar policies are the only authorization mechanism — no ad-hoc role checks.",
    },
    {
      claim  = "NATS audit subject provides continuous violation detection at runtime",
      detail = "provisioning.audit.violation.solid is published on runtime violations. AuditCollector persists these to SurrealDB. Violations discovered after deployment are recorded and queryable.",
    },
  ],

  # Expected outcomes of adopting the decision, split by sign.
  consequences = {
    positive = [
      "All provider credentials are scoped to Orchestrator's process — no credential leakage path to CLI",
      "Task state machine in Orchestrator provides rollback for every provider operation",
      "Auth defects are isolated to Control Center — other services cannot accidentally implement auth",
      "SOLID violations are caught at the earliest possible layer (usually compile-time or dev-time), not in production",
    ],
    negative = [
      "Adding a new cloud provider requires changes to Orchestrator only — correct by design but requires understanding the dispatch model",
      "The pre-commit hook adds ~200ms to commit time for grep scans",
      "CLI cannot query provider state directly — must call Orchestrator API, adding one HTTP hop",
    ],
  },

  # Options that were evaluated and the reason each was turned down.
  alternatives_considered = [
    {
      option       = "Compile-time enforcement only via crate visibility",
      why_rejected = "Insufficient for Nushell code which has no compile-time type system. Pre-commit and Claude hooks are needed to cover .nu files where the Rust compiler cannot help.",
    },
    {
      option       = "Documentation + code review process",
      why_rejected = "The failure mode this ADR addresses (direct provider CLI calls from Nushell) was introduced despite existing documentation. Enforcement must be automatic, not manual.",
    },
  ],

  # Machine-checkable constraints. Each entry pairs a human-readable claim with
  # a `check` record: 'NuCmd entries carry a command line and the exit status
  # it is expected to produce, 'Grep entries carry a pattern and search paths.
  #
  # The 'NuCmd pipelines end in `grep -v …`, which exits 1 when no line
  # survives the filter; `expect_exit = 1` therefore means "no offending line
  # found" (assuming the runner reports the last stage's status — confirm).
  #
  # ripgrep has no `--include` option (that is GNU grep's); its file filter is
  # `-g/--glob`. With `--include`, rg aborts with a usage error and prints
  # nothing, so the trailing `grep -v` sees empty input and exits 1 — the check
  # passes without scanning any file. The commands below use `--glob` instead.
  #
  # NOTE(review): path roots differ between entries (`provisioning/platform/crates/`
  # vs `platform/crates/`). Presumably all checks run from the same directory —
  # confirm which prefix is correct; a non-existent path would also yield a
  # vacuous pass in the 'NuCmd pipelines.
  constraints = [
    {
      id         = "provider-calls-orchestrator-only",
      claim      = "Provider API calls (hcloud, aws, doctl, upctl) must only exist in the orchestrator crate",
      scope      = "provisioning/",
      severity   = 'Hard,
      # NOTE(review): the alternatives are unanchored, so `aws` also matches
      # inside longer words; consider word boundaries if this proves noisy.
      check      = { tag = 'NuCmd, cmd = "rg 'hcloud|aws|doctl|upctl' --glob='*.rs' provisioning/ | grep -v 'orchestrator'", expect_exit = 1 },
      rationale  = "All provider API calls must flow through the Orchestrator dispatch model to maintain audit trail and rollback capability.",
    },
    {
      id         = "ssh-orchestrator-machines-only",
      claim      = "SSH operations (russh, ssh2) must only exist in orchestrator and machines crates",
      scope      = "provisioning/platform/crates/",
      severity   = 'Hard,
      # `\\|` is a Nickel-escaped backslash, i.e. grep receives `orchestrator\|machines`.
      check      = { tag = 'NuCmd, cmd = "rg 'russh|ssh2' --glob='*.rs' provisioning/platform/crates/ | grep -v 'orchestrator\\|machines'", expect_exit = 1 },
      rationale  = "SSH operations that bypass Orchestrator bypass the task state machine and lose rollback capability and audit trail.",
    },
    {
      id         = "solo-auth-middleware-single-bypass",
      claim      = "solo_auth_middleware is the only place in the codebase where auth is bypassed; it must be gated behind --mode solo and never used in production routing",
      scope      = "platform/crates/control-center/src/middleware/",
      severity   = 'Hard,
      # NOTE(review): `paths` contains the `scope` directory where the one
      # permitted bypass lives, and `no.*auth` is a very broad pattern; with
      # `must_be_empty = true` this check may flag the allowed middleware
      # itself or unrelated text — verify against the checker's semantics.
      check      = { tag = 'Grep, pattern = "bypass|skip.*auth|no.*auth", paths = ["platform/crates/"], must_be_empty = true },
      rationale  = "A single documented and tested auth bypass is auditable. Multiple bypass paths create an audit surface that cannot be systematically verified.",
    },
    {
      id         = "cedar-only-authorization",
      claim      = "No ad-hoc role checks (if user.roles.contains) in business logic — Cedar policies are the only authorization mechanism",
      scope      = "platform/crates/",
      severity   = 'Soft,
      check      = { tag = 'NuCmd, cmd = "rg 'roles.contains|role ==' --glob='*.rs' platform/crates/ | grep -v test", expect_exit = 1 },
      rationale  = "Ad-hoc role checks create authorization logic scattered across services that cannot be audited or modified atomically.",
    },
  ],

  # Cross-references to other ADRs by identifier.
  related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-015-solo-mode-architecture"],

  # Summary of the decision together with the invariants it touches and the
  # resulting verdict.
  ontology_check = {
    decision_string    = "Six hard SOLID boundaries enforced at six independent layers; solo_auth_middleware is the only documented auth bypass; Cedar is the only authorization mechanism",
    invariants_at_risk = ["solid-boundaries", "provider-abstraction"],
    verdict            = 'Safe,
  },
}