Vapora/adrs/adr-005-nats-jetstream.ncl

91 lines
5.7 KiB
Text
Raw Normal View History

# ADR-005: NATS JetStream as the message broker for agent coordination and
# workflow stage progression.
#
# The record below is handed to `make_adr` from adr-defaults.ncl. That helper's
# schema and defaults are not visible in this file — presumably it validates
# the fields and fills in defaults (TODO confirm against adr-defaults.ncl).
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-005",
title = "NATS JetStream for Agent Coordination and Workflow Progression",
status = 'Accepted,
# `date` is the original decision date; the `context` text below carries a
# later "as of" note about the async-nats version currently in use.
date = "2024-11-01",
context = "Vapora agents are long-running tasks that may complete seconds or minutes after dispatch. The workflow engine needs reliable stage-to-stage progression. The A2A protocol requires async task completion notification. All of these require at-least-once delivery with persistence across restarts. As of 2026-03-27, async-nats 0.46 is in use (the markdown ADR references 0.45, which is stale). NATS is optional — all consumers implement graceful fallback when NATS is unavailable.",
decision = "NATS JetStream (via async-nats) is the message broker for agent task dispatch, heartbeat monitoring, workflow stage progression, and A2A completion notifications. No Redis Pub/Sub, no RabbitMQ, no Kafka. NATS is optional in development — all consumers check NATS availability at startup and degrade gracefully (polling fallback or sync execution) without crashing.",
# Each rationale entry pairs a one-line `claim` with a longer supporting `detail`.
rationale = [
{
claim = "JetStream at-least-once delivery prevents silent task loss",
detail = "Redis Pub/Sub drops messages if no subscriber is listening at publish time. JetStream persists messages to a stream and delivers them when a consumer reconnects. Agent crashes during task execution result in redelivery, not silent loss.",
},
{
claim = "NATS is lightweight with no external dependency beyond the server binary",
detail = "The NATS server is a single Go binary with no external runtime dependencies. RabbitMQ requires Erlang runtime + management plugins. Kafka requires ZooKeeper or KRaft + JVM. For a self-hosted platform, NATS operational burden is an order of magnitude lower.",
},
{
claim = "DashMap<String, oneshot::Sender> bridges NATS async replies to Tokio callers",
detail = "When a caller dispatches a task and needs the result, it inserts a oneshot sender into a DashMap keyed by task_id, then awaits the receiver. A background NATS subscriber resolves the sender on completion. This is the pattern used in vapora-a2a/src/bridge.rs and orchestrator.rs.",
},
],
# Expected outcomes of the decision, split into benefits and costs.
consequences = {
positive = [
"Agent task dispatch is fire-and-forget from the caller's perspective — no blocking while the agent runs",
"Workflow stage progression survives backend restarts — JetStream re-delivers pending stage triggers",
"NATS subject hierarchy (vapora.tasks.*, vapora.agents.*) provides observable message topology",
"Graceful fallback means local development works without a running NATS server",
],
negative = [
"JetStream stream configuration (max_age, max_msgs, storage) must be provisioned before first use",
"The DashMap<String, oneshot::Sender> pattern leaks entries if the NATS completion message is never received — requires TTL cleanup",
],
},
# NOTE(review): `decision` also rules out Kafka, but no Kafka entry appears
# here (Kafka is only mentioned in the second rationale detail) — confirm
# whether an explicit entry is wanted.
alternatives_considered = [
{
option = "Redis Pub/Sub",
why_rejected = "No persistence — messages are dropped if the subscriber is offline when the publisher fires. Not viable for agent task coordination where agent restarts are expected.",
},
{
option = "RabbitMQ",
why_rejected = "Erlang runtime adds 200 MB+ to the container image. AMQP protocol is more complex than NATS. No meaningful capability advantage over JetStream for this use case.",
},
{
option = "Database polling (SurrealDB LIVE queries)",
why_rejected = "SurrealDB LIVE queries provide push notifications but are tightly coupled to the database connection lifecycle. NATS decouples the coordination bus from the persistence layer, allowing both to scale independently.",
},
],
# Constraints derived from this decision. Each carries a `check` record whose
# `tag` selects the kind of check ('Cargo or 'Grep here). How these checks are
# executed is defined outside this file.
constraints = [
{
id = "nats-only-message-broker",
claim = "No crate may import rabbitmq, kafka, or redis client crates for message brokering",
scope = "vapora (all crates)",
severity = 'Hard,
# NOTE(review): `scope` says "all crates" but this check names only
# vapora-backend — confirm whether the other crates are covered elsewhere.
check = { tag = 'Cargo, crate = "vapora-backend", forbidden_deps = ["lapin", "rdkafka", "redis"] },
rationale = "Multiple message brokers would split the coordination bus, requiring consumers to subscribe to multiple systems and introducing message ordering ambiguity.",
},
{
id = "nats-graceful-fallback",
claim = "All NATS consumers must implement graceful fallback — NATS unavailability must not crash the service",
scope = "vapora (vapora-agents, vapora-workflow-engine, vapora-a2a)",
severity = 'Hard,
# `must_be_empty = false` presumably means the check passes when the pattern
# IS found in the listed paths (TODO confirm against the check runner).
# NOTE(review): `scope` lists vapora-workflow-engine, but `paths` covers only
# vapora-agents and vapora-a2a — confirm whether the engine should be scanned.
check = { tag = 'Grep, pattern = "nats.*error|warn.*nats|fallback", paths = ["crates/vapora-agents/src/", "crates/vapora-a2a/src/"], must_be_empty = false },
rationale = "Development environments without a running NATS server should still allow agent execution in degraded mode. A panic on NATS connection failure would block all development.",
},
{
id = "nats-subject-hierarchy",
claim = "All NATS subjects must use the vapora.* namespace prefix",
scope = "vapora (all NATS publishers and subscribers)",
severity = 'Soft,
# After Nickel string unescaping the pattern is `"vapora\.` — a double quote
# followed by `vapora` and an escaped dot.
# NOTE(review): with `must_be_empty = false` this looks like an existence
# check (at least one prefixed subject), not proof that every subject uses
# the prefix — confirm this is the intended strength.
check = { tag = 'Grep, pattern = "\"vapora\\.", paths = ["crates/"], must_be_empty = false },
rationale = "A consistent subject hierarchy prevents collisions with other services sharing the same NATS cluster and enables subject-based access control in multi-tenant deployments.",
},
],
# Cross-references to other ADRs by id.
related_adrs = ["adr-002", "adr-012"],
# Summary record for an ontology check; the meaning of `verdict` and the
# invariant names is defined outside this file.
ontology_check = {
decision_string = "async-nats 0.46 JetStream for agent coordination and workflow progression; graceful fallback mandatory; vapora.* subject prefix",
invariants_at_risk = ["message-based-coordination"],
verdict = 'Safe,
},
}