# ADR-005 — NATS JetStream for Agent Coordination and Workflow Progression (Nickel source)
# Evaluates to the ADR-005 record: the fields below are passed to `make_adr`
# from adr-defaults.ncl.
let d = import "adr-defaults.ncl" in

d.make_adr {
  id = "adr-005",
  title = "NATS JetStream for Agent Coordination and Workflow Progression",
  status = 'Accepted,
  date = "2024-11-01",

  context = "Vapora agents are long-running tasks that may complete seconds or minutes after dispatch. The workflow engine needs reliable stage-to-stage progression. The A2A protocol requires async task completion notification. All of these require at-least-once delivery with persistence across restarts. As of 2026-03-27, async-nats 0.46 is in use (the markdown ADR references 0.45, which is stale). NATS is optional — all consumers implement graceful fallback when NATS is unavailable.",

  decision = "NATS JetStream (via async-nats) is the message broker for agent task dispatch, heartbeat monitoring, workflow stage progression, and A2A completion notifications. No Redis Pub/Sub, no RabbitMQ, no Kafka. NATS is optional in development — all consumers check NATS availability at startup and degrade gracefully (polling fallback or sync execution) without crashing.",

  rationale = [
    {
      claim = "JetStream at-least-once delivery prevents silent task loss",
      detail = "Redis Pub/Sub drops messages if no subscriber is listening at publish time. JetStream persists messages to a stream and delivers them when a consumer reconnects. Agent crashes during task execution result in redelivery, not silent loss.",
    },
    {
      claim = "NATS is lightweight with no external dependency beyond the server binary",
      detail = "The NATS server is a single Go binary with no external runtime dependencies. RabbitMQ requires Erlang runtime + management plugins. Kafka requires ZooKeeper or KRaft + JVM. For a self-hosted platform, NATS operational burden is an order of magnitude lower.",
    },
    {
      claim = "DashMap<String, oneshot::Sender> bridges NATS async replies to Tokio callers",
      detail = "When a caller dispatches a task and needs the result, it inserts a oneshot sender into a DashMap keyed by task_id, then awaits the receiver. A background NATS subscriber resolves the sender on completion. This is the pattern used in vapora-a2a/src/bridge.rs and orchestrator.rs.",
    },
  ],

  consequences = {
    positive = [
      "Agent task dispatch is fire-and-forget from the caller's perspective — no blocking while the agent runs",
      "Workflow stage progression survives backend restarts — JetStream re-delivers pending stage triggers",
      "NATS subject hierarchy (vapora.tasks.*, vapora.agents.*) provides observable message topology",
      "Graceful fallback means local development works without a running NATS server",
    ],
    negative = [
      "JetStream stream configuration (max_age, max_msgs, storage) must be provisioned before first use",
      "The DashMap<String, oneshot::Sender> pattern leaks entries if the NATS completion message is never received — requires TTL cleanup",
    ],
  },

  alternatives_considered = [
    {
      option = "Redis Pub/Sub",
      why_rejected = "No persistence — messages are dropped if the subscriber is offline when the publisher fires. Not viable for agent task coordination where agent restarts are expected.",
    },
    {
      option = "RabbitMQ",
      why_rejected = "Erlang runtime adds 200 MB+ to the container image. AMQP protocol is more complex than NATS. No meaningful capability advantage over JetStream for this use case.",
    },
    {
      option = "Database polling (SurrealDB LIVE queries)",
      why_rejected = "SurrealDB LIVE queries provide push notifications but are tightly coupled to the database connection lifecycle. NATS decouples the coordination bus from the persistence layer, allowing both to scale independently.",
    },
  ],

  constraints = [
    {
      id = "nats-only-message-broker",
      claim = "No crate may import rabbitmq, kafka, or redis client crates for message brokering",
      scope = "vapora (all crates)",
      severity = 'Hard,
      # NOTE(review): `scope` says all crates, but `check` names only
      # "vapora-backend"; and listing `redis` as a forbidden dep is broader
      # than the claim ("for message brokering"). Confirm whether the check
      # schema can cover the whole workspace.
      check = { tag = 'Cargo, crate = "vapora-backend", forbidden_deps = ["lapin", "rdkafka", "redis"] },
      rationale = "Multiple message brokers would split the coordination bus, requiring consumers to subscribe to multiple systems and introducing message ordering ambiguity.",
    },
    {
      id = "nats-graceful-fallback",
      claim = "All NATS consumers must implement graceful fallback — NATS unavailability must not crash the service",
      scope = "vapora (vapora-agents, vapora-workflow-engine, vapora-a2a)",
      severity = 'Hard,
      # `paths` lists one source directory per crate named in `scope`.
      check = {
        tag = 'Grep,
        pattern = "nats.*error|warn.*nats|fallback",
        paths = [
          "crates/vapora-agents/src/",
          "crates/vapora-workflow-engine/src/",
          "crates/vapora-a2a/src/",
        ],
        must_be_empty = false,
      },
      rationale = "Development environments without a running NATS server should still allow agent execution in degraded mode. A panic on NATS connection failure would block all development.",
    },
    {
      id = "nats-subject-hierarchy",
      claim = "All NATS subjects must use the vapora.* namespace prefix",
      scope = "vapora (all NATS publishers and subscribers)",
      severity = 'Soft,
      # NOTE(review): the claim is universal ("All NATS subjects"), while a
      # grep with must_be_empty = false presumably only proves that at least
      # one `"vapora.` literal exists — confirm the checker's semantics.
      check = { tag = 'Grep, pattern = "\"vapora\\.", paths = ["crates/"], must_be_empty = false },
      rationale = "A consistent subject hierarchy prevents collisions with other services sharing the same NATS cluster and enables subject-based access control in multi-tenant deployments.",
    },
  ],

  related_adrs = ["adr-002", "adr-012"],

  ontology_check = {
    decision_string = "async-nats 0.46 JetStream for agent coordination and workflow progression; graceful fallback mandatory; vapora.* subject prefix",
    invariants_at_risk = ["message-based-coordination"],
    verdict = 'Safe,
  },
}