# ADR-014 — Runtime Service Toggles (AtomicBool flags for MCP and GraphQL).
#
# This file evaluates to the result of `make_adr` (from `adr-defaults.ncl`)
# applied to the record below. The record carries the decision narrative
# (context / decision / rationale / consequences / alternatives) and a list
# of constraints, each with a `check` entry describing a Grep over the
# daemon sources.
#
# NOTE(review): `make_adr` presumably fills in defaults and applies the ADR
# contract — confirm against adr-defaults.ncl.
let d = import "adr-defaults.ncl" in

d.make_adr {
  id = "adr-014",
  title = "Runtime Service Toggles — AtomicBool Flags for MCP and GraphQL",
  status = 'Accepted,
  date = "2026-04-26",

  context = "ontoref-daemon exposes optional services (MCP, GraphQL) compiled in via Cargo feature flags. Once compiled, these services were always active for the lifetime of the process. Two scenarios require disabling them at runtime without restart: (1) security incident — temporarily disable a surface while investigating a compromise; (2) operator choice — enable graphql during a debug session, disable it in production. The alternative of restarting the daemon is disruptive because it drops all in-memory sessions, actors, and notification queues. Compile-time feature flags solve the binary presence problem but cannot address runtime availability.",

  decision = "Introduce ServiceFlags (pub struct in api.rs) holding one AtomicBool per toggleable service, gated by the corresponding feature flag. ServiceFlags::new() initialises all flags to true (enabled). AppState holds Arc<ServiceFlags> shared across all clones. The toggle check lives in a route_layer middleware on the sub-router for each service — not inside the service handlers themselves — so the check is enforced regardless of which handler a request reaches. Two toggle surfaces are provided: (1) REST API: PUT /api/services/:service {\"enabled\": bool} — daemon admin Bearer required; (2) UI: POST /ui/manage/services/:service/toggle — AdminGuard (cookie session). Both surfaces return the new state. The manage page shows each compiled-in service with an HTMX toggle button; the navbar badges reflect runtime state on every page render.",

  rationale = [
    {
      claim = "AtomicBool is the correct primitive for a hot-path toggle",
      detail = "A toggle check runs on every request to /mcp/* and /graphql/*. AtomicBool::load(Relaxed) is a single CPU instruction with no lock, no allocation, and no blocking — identical cost to a null pointer check. RwLock would introduce lock contention under concurrent requests with no benefit, since the only invariant needed is 'consistent within a single request', which Relaxed ordering satisfies.",
    },
    {
      claim = "Middleware layer placement enforces the toggle at the router boundary",
      detail = "Placing the AtomicBool check inside each handler would require every handler to duplicate the guard, and a new handler added to the sub-router would silently bypass the toggle. A route_layer on the sub-router wraps all handlers uniformly — the check cannot be missed regardless of how many handlers the sub-router gains in the future.",
    },
    {
      claim = "Arc<ServiceFlags> (not Arc<AtomicBool> per flag) is the correct sharing primitive",
      detail = "AtomicBool is not Clone. Wrapping the whole ServiceFlags struct in one Arc means: (1) the middleware closure captures one Arc clone instead of N; (2) new flags can be added to ServiceFlags without changing the sharing structure; (3) the UI and REST handlers access all flags via one field on AppState.",
    },
    {
      claim = "Two toggle surfaces (REST + UI) serve distinct operator workflows",
      detail = "The REST endpoint (daemon admin Bearer) is suitable for scripted automation and CLI pipelines (curl -X PUT). The UI endpoint (AdminGuard cookie) enables toggle from the browser manage page without exposing the raw admin token. Both require daemon admin credentials — this is not a project-level operation.",
    },
    {
      claim = "Toggle is volatile — it does not survive daemon restart",
      detail = "ServiceFlags are initialised to enabled=true on every startup regardless of the pre-restart state. This is intentional: a flag disabled for incident response should not silently persist across restarts, which would hide the disabled state from new operators. If persistent toggle state is needed it belongs in config.ncl as a compiled-in default, not in volatile memory.",
    },
  ],

  consequences = {
    positive = [
      "MCP and GraphQL can be disabled in under 1ms without dropping in-memory state",
      "Toggle surfaces require daemon admin credentials — the same identity used for project management",
      "New optional services added to the daemon gain toggle support by adding one AtomicBool field to ServiceFlags",
      "Middleware placement means the toggle cannot be bypassed by adding new handlers to the sub-router",
    ],
    negative = [
      "Toggle state is lost on restart — disabling a service for incident response must be re-applied after restart or captured in config",
      "ServiceFlags is not Clone (AtomicBool is not Clone) — AppState clones the Arc, not the flags; callers that pattern-match on AppState fields must be aware of this",
    ],
  },

  alternatives_considered = [
    {
      option = "Restart daemon with different feature flags or config",
      why_rejected = "Restart drops SessionStore (all active logins), actor registry, notification queue, and NATS subscriptions. A 1-second outage is acceptable for planned maintenance but not for rapid incident response.",
    },
    {
      option = "RwLock per service in AppState",
      why_rejected = "RwLock introduces lock contention on every request. The toggle check does not need mutual exclusion with writes — a store and a load never run concurrently in a way that would corrupt state. Relaxed AtomicBool is sufficient and faster.",
    },
    {
      option = "Dynamic axum Router rebuild — swap out the sub-router entirely",
      why_rejected = "axum Router is not live-rebuildable without replacing the entire tower Service. This would require Arc<RwLock<Router>>, a custom Service wrapper, and would still incur a lock per request. The middleware approach achieves the same result with orders of magnitude less complexity.",
    },
  ],

  related_adrs = ["adr-002", "adr-005"],

  ontology_check = {
    decision_string = "runtime toggle AtomicBool service mcp graphql middleware",
    invariants_at_risk = [],
    verdict = 'Safe,
  },

  constraints = [
    {
      id = "c-014-1",
      claim = "ServiceFlags::new() must initialise all AtomicBool flags to true — a compiled-in service is always enabled at startup",
      scope = "crates/ontoref-daemon/src/api.rs",
      severity = 'Hard,
      rationale = "Starting with flags disabled would silently hide services from operators who have not read this ADR. Explicit runtime toggle (documented in UI and REST API) is the correct mechanism to disable a service.",
      # NOTE(review): the pattern contains unescaped parentheses. If the Grep
      # check interprets `pattern` as a regular expression, they form a group
      # and the literal text `AtomicBool::new(true)` would not match — confirm
      # whether patterns are treated as fixed strings.
      check = {
        tag = "Grep",
        pattern = "AtomicBool::new(true)",
        paths = ["crates/ontoref-daemon/src/api.rs"],
        must_be_empty = false,
      },
    },
    {
      id = "c-014-2",
      claim = "Any new optional service added to the daemon must add an AtomicBool to ServiceFlags and a route_layer toggle middleware on its sub-router",
      scope = "crates/ontoref-daemon/src/api.rs",
      severity = 'Hard,
      rationale = "A service without a toggle violates the operator contract established by this ADR. Future services must be consistently operable.",
      # NOTE(review): this check only looks for the ServiceFlags struct
      # declaration; it does not look for a per-service flag or a route_layer.
      check = {
        tag = "Grep",
        pattern = "pub struct ServiceFlags",
        paths = ["crates/ontoref-daemon/src/api.rs"],
        must_be_empty = false,
      },
    },
    {
      id = "c-014-3",
      claim = "Service toggle endpoints require daemon admin credentials — project-level auth is insufficient",
      scope = "crates/ontoref-daemon/src/api.rs, crates/ontoref-daemon/src/ui/handlers.rs",
      severity = 'Soft,
      rationale = "Service availability is daemon-wide, not per-project. Allowing a project admin to disable MCP or GraphQL for all projects would be a privilege escalation.",
      # NOTE(review): the check greps only ui/handlers.rs, while `scope` also
      # names api.rs (the REST surface) — confirm whether that is intended.
      check = {
        tag = "Grep",
        pattern = "AdminGuard",
        paths = ["crates/ontoref-daemon/src/ui/handlers.rs"],
        must_be_empty = false,
      },
    },
  ],
}