provisioning/adrs/adr-024-ncl-sync-nats-events.ncl

100 lines
7.4 KiB
Text
Raw Normal View History

# ADR-024 — ncl-sync: event-driven cache invalidation via NATS.
#
# This file evaluates to the result of `make_adr` (imported from
# adr-defaults.ncl) applied to the record below.
# NOTE(review): what `make_adr` validates or fills in is defined in
# adr-defaults.ncl and is not visible from this file.
let d = import "adr-defaults.ncl" in

# Source files in the ncl-sync crate that are referenced more than once by the
# constraints at the bottom of this record (as both `scope` and check path).
let main_rs = "provisioning/platform/crates/ncl-sync/src/main.rs" in
let subscriber_rs = "provisioning/platform/crates/ncl-sync/src/nats_subscriber.rs" in

# Builds a 'Grep check record with `must_be_empty = false`.
# The parameters are deliberately not called `pattern`/`paths`: record fields
# are recursive in Nickel, so `pattern = pattern` would refer to itself.
# NOTE(review): presumably `must_be_empty = false` means "the pattern must be
# found at least once" — confirm against the tooling that runs these checks.
let grep_present = fun regex files =>
  {
    tag = 'Grep,
    pattern = regex,
    paths = files,
    must_be_empty = false,
  }
in

d.make_adr {
  # --- Identity -------------------------------------------------------------
  id = "adr-024",
  title = "ncl-sync: Event-driven cache invalidation via NATS",
  status = 'Accepted,
  date = "2026-04-17",

  # --- Narrative ------------------------------------------------------------
  context = "ADR-022 established the ncl-sync daemon with a file watcher (notify) as the automatic invalidation mechanism. ADR-023 added an explicit sync-request sidecar written by Nu processes (state-write). Both mechanisms have limitations: the file watcher has a debounce window (~100ms) where cache can be momentarily stale, and sync-request polling adds 500ms latency. The orchestrator (Rust) writes state files from a separate process — it cannot easily participate in the file-watcher's same-process events, and requiring it to write sync-request sidecars would couple it to ncl-sync's internal protocol. NATS is already used by the orchestrator for DAG events (`provisioning.dag.*`) — extending it for cache invalidation is a natural fit.",

  decision = "ncl-sync gains an optional NATS subscriber behind the `nats` Cargo feature (default-enabled). The subscriber listens on two subjects: `provisioning.workspace.ncl.changed` (file modified) and `provisioning.workspace.ncl.removed` (file deleted). Payload is a JSON object `{workspace, path, import_paths, source}`. On receipt, the subscriber validates that `workspace` matches its watched workspace, then calls `export_ncl` or `evict` directly — bypassing the file-watcher debounce and the sync-request poll. Cache is refreshed in <15ms vs ~100ms (watcher) or ~500ms (sidecar). The mechanism is opt-in via `ncl_sync.nats.enabled = true` in the config — without NATS, the daemon runs identically to before (watcher + sidecar fallback).",

  # Each entry pairs a short claim with the detail that supports it.
  rationale = [
    {
      claim = "NATS subscriber complements rather than replaces the file watcher",
      detail = "Three invalidation mechanisms now exist with different failure characteristics: (1) file watcher — always active, catches any write including manual edits, ~100ms latency; (2) sync-request sidecar — written by Nu state-write, catches Nu-originated writes, ~500ms latency; (3) NATS events — written by any publisher, zero coupling to filesystem, <15ms latency. Each covers a different failure mode: watcher catches untracked writers, sidecar catches Nu writers, NATS catches Rust writers. Redundancy is intentional — duplicate events are idempotent (same cache_key, same content).",
    },
    {
      claim = "Workspace validation prevents cross-daemon interference",
      detail = "Multiple ncl-sync daemons may run (one per workspace). All subscribe to the same subject hierarchy. The subscriber canonicalizes both its watched workspace path and the event's workspace path; only events matching its workspace are processed. This allows NATS events to fan out to all relevant daemons without coordination.",
    },
    {
      claim = "Subject hierarchy matches the workspace event model, not the orchestrator DAG model",
      detail = "`provisioning.dag.*` subjects are about workflow execution. `provisioning.workspace.ncl.*` subjects are about configuration state. Keeping them separate lets ncl-sync subscribe narrowly (two subjects) without parsing unrelated events. Future publishers (installer, backup restore, etc.) use the same namespace.",
    },
    {
      claim = "Cargo feature flag keeps NATS optional",
      detail = "`default = [\"nats\"]` enables NATS in release builds. `cargo build --no-default-features` produces a binary without async-nats linkage — useful for minimal containers, air-gapped environments, or testing. The config field `ncl_sync.nats.enabled` is an additional runtime gate independent of the compile-time feature.",
    },
  ],

  consequences = {
    positive = [
      "Orchestrator-driven state mutations invalidate cache in <15ms (vs ~100ms via file watcher)",
      "Zero coupling between orchestrator and ncl-sync — only the subject contract is shared",
      "Other subscribers (dashboard UI, audit log) can watch the same subjects without touching ncl-sync",
      "Redundant with watcher+sidecar — graceful degradation if NATS is down",
    ],
    negative = [
      "Adds ~6MB to ncl-sync binary size (async-nats + dependencies)",
      "NATS must be running before ncl-sync connects (but failure is non-fatal — falls back to watcher)",
      "Publishers (orchestrator, etc.) must be updated to emit the new subjects — until then, NATS layer has no effect",
    ],
  },

  # Options that were evaluated and the reason each one was turned down.
  alternatives_considered = [
    {
      option = "Single mechanism: file watcher only",
      why_rejected = "Misses the ~100ms debounce window. For interactive CLI this is fine; for rapid orchestrator-driven state changes (deploy with many state updates), the cache can lag.",
    },
    {
      option = "Single mechanism: NATS only",
      why_rejected = "Hard dependency on NATS — ncl-sync fails if NATS isn't running. Manual NCL edits (user opens editor) wouldn't be caught. File watcher must remain as baseline.",
    },
    {
      option = "HTTP endpoint on ncl-sync for invalidation",
      why_rejected = "Requires every publisher to know the daemon's Unix socket or HTTP port. NATS decouples publishers from subscribers.",
    },
    {
      option = "Reuse provisioning.dag.* subjects",
      why_rejected = "DAG events are about workflow state, not config state. Overloading the subject hierarchy would force ncl-sync to filter noisy events it doesn't care about.",
    },
  ],

  # --- Ontology / cross-references -------------------------------------------
  ontology_check = {
    decision_string = "ncl-sync adds opt-in NATS subscriber on provisioning.workspace.ncl.{changed,removed} for event-driven cache invalidation; watcher + sidecar remain as fallback",
    invariants_at_risk = ["config-driven-always"],
    verdict = 'Safe,
  },

  related_adrs = ["adr-022-ncl-sync-daemon", "adr-023-ncl-export-wrapper"],

  # --- Constraints -----------------------------------------------------------
  # Each constraint carries a grep-style check over source paths.
  # NOTE(review): the checks are presumably executed by ADR tooling outside
  # this file; the regex dialect in use is not visible here.
  constraints = [
    {
      id = "ncl-sync-nats-optional",
      claim = "NATS subscriber must be an optional Cargo feature, and runtime-gated by config",
      scope = "provisioning/platform/crates/ncl-sync/",
      severity = 'Hard,
      # NOTE(review): the second alternative contains the first one as a
      # substring, so the alternation looks redundant — confirm against the
      # checker's regex dialect before simplifying.
      check = grep_present
        "cfg\\(feature = \"nats\"\\)|#\\[cfg\\(feature = \"nats\"\\)\\]"
        ["provisioning/platform/crates/ncl-sync/src/"],
      rationale = "Air-gapped environments, minimal containers, and testing scenarios require ncl-sync to build and run without NATS. Removing the feature flag would violate this.",
    },
    {
      id = "ncl-sync-nats-fallback",
      claim = "NATS connection failure must be non-fatal — daemon continues with watcher + sidecar",
      scope = main_rs,
      severity = 'Hard,
      # NOTE(review): this only looks for the text `tracing::warn` in main.rs;
      # by itself it does not tie that call to the NATS failure path.
      check = grep_present "tracing::warn" [main_rs],
      rationale = "Hard dependency on NATS would break the workspace-local, zero-platform-service guarantee from ADR-022.",
    },
    {
      id = "ncl-sync-workspace-scope",
      claim = "Subscriber must filter events by workspace — only process events matching its watched workspace",
      scope = subscriber_rs,
      severity = 'Hard,
      check = grep_present "workspace_matches" [subscriber_rs],
      rationale = "Multiple ncl-sync daemons share the subject namespace. Without filtering, daemon A would process events for workspace B's cache.",
    },
  ],
}