# ADR-011 — A2A protocol implementation (SurrealDB persistence + NATS async coordination).
# The record below is passed to `make_adr`, which comes from the imported adr-defaults.ncl.
let defaults = import "adr-defaults.ncl" in
defaults.make_adr {
  # --- Identity and status ---
  id = "adr-011",
  title = "A2A Protocol Implementation: SurrealDB Persistence + NATS Async Coordination",
  status = 'Accepted,
  date = "2026-02-07",

  # --- Background and the decision itself ---
  context = "Vapora needs to interoperate with external agent ecosystems (Claude Code, Google ADK). The A2A (Agent-to-Agent) protocol provides a standardized interface. The implementation required learning from a prior failed attempt where task state was stored in a HashMap (lost on restart) and task completion was faked with tokio::sleep(5). The remediated implementation uses SurrealDB for persistence and real NATS subscribers for async coordination. As of 2026-03-27, vapora-a2a exposes an HTTP + NATS server and vapora-a2a-client provides the Rust client library with retry/backoff.",
  decision = "A2A is implemented as two crates: vapora-a2a (server) and vapora-a2a-client (client). Task state is persisted to SurrealDB (table: a2a_tasks, SCHEMAFULL, survives restarts). Async task completion uses real NATS subscribers on vapora.tasks.completed and vapora.tasks.failed — no polling, no sleep-based fake completion. The client implements exponential backoff with jitter for 5xx/network errors; 4xx errors are not retried.",

  # --- Rationale: one claim/detail pair per argument ---
  rationale = [
    {
      claim = "SurrealDB persistence is mandatory — in-memory task state is not acceptable",
      detail = "A server restart under an in-memory HashMap would lose all in-flight task state. A2A clients waiting for task completion would hang indefinitely with no recoverable state. SurrealDB tasks survive restarts and remain queryable by their task_id.",
    },
    {
      claim = "Real NATS coordination eliminates the race condition in timeout-based fake completion",
      detail = "tokio::sleep(5) as a task completion mechanism is not async coordination — it is a lie with a timer. Real completion events from NATS subscribers deliver results within milliseconds of actual task completion and handle partial failures correctly.",
    },
    {
      claim = "Smart retry classification prevents infinite loops on client bugs",
      detail = "Retrying 4xx responses forever would mask client bugs (bad request format, missing auth) and cause a thundering herd on config errors. 5xx and network errors are genuinely transient; 4xx errors require caller intervention.",
    },
  ],

  # --- Consequences of adopting the decision ---
  consequences = {
    positive = [
      "A2A task state survives server restarts — clients can poll task_id for completion",
      "NATS-based completion delivers results without polling — O(1) completion latency regardless of task duration",
      "Client retry with backoff handles transient server errors transparently",
      "7 E2E integration tests (marked #[ignore]) verify the full task lifecycle with real SurrealDB + NATS",
    ],
    negative = [
      "Integration tests require live SurrealDB + NATS — they are marked #[ignore] in CI without service dependencies",
      "The DashMap in the NATS bridge leaks entries for tasks that never complete — requires TTL cleanup",
    ],
  },

  # --- Options that were evaluated and rejected ---
  alternatives_considered = [
    {
      option = "gRPC instead of JSON-RPC 2.0",
      why_rejected = "HTTP/2 infrastructure required. More complex than JSON-RPC for the current load profile. A2A specification uses HTTP/1.1 + JSON — gRPC would require a protocol translation layer.",
    },
    {
      option = "PostgreSQL or SQLite for A2A task persistence",
      why_rejected = "SurrealDB already used in vapora. Adding a second database engine doubles operational burden with no architectural benefit for A2A's data model.",
    },
  ],

  # --- Constraints: each carries a 'Grep check (pattern + paths + must_be_empty) ---
  # NOTE(review): presumably must_be_empty = true means the pattern must match nothing and
  # false means at least one match is required — confirm against the checker in adr-defaults.ncl.
  constraints = [
    {
      id = "a2a-tasks-in-surrealdb",
      claim = "A2A task state must be persisted to the SurrealDB a2a_tasks table — no in-memory HashMap storage",
      scope = "vapora-a2a/src/task_manager.rs",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "a2a_tasks",
        paths = ["crates/vapora-a2a/src/"],
        must_be_empty = false
      },
      rationale = "In-memory storage was the root cause of the failed first implementation. This constraint is a hard lesson from a production incident.",
    },
    {
      id = "no-sleep-based-completion",
      claim = "No tokio::sleep call may substitute for real async task completion in vapora-a2a",
      scope = "vapora-a2a/src/bridge.rs",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "sleep.*task|task.*sleep",
        paths = ["crates/vapora-a2a/src/bridge.rs"],
        must_be_empty = true
      },
      rationale = "Sleep-based fake completion was the specific mechanism that made the first implementation fraudulent. It must never return.",
    },
    {
      id = "client-retry-policy",
      claim = "vapora-a2a-client must use RetryPolicy with exponential backoff — no fixed-interval retries or no-retry implementations",
      scope = "vapora-a2a-client/src/",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "RetryPolicy",
        paths = ["crates/vapora-a2a-client/src/"],
        must_be_empty = false
      },
      rationale = "Fixed-interval retries cause thundering herds on server recovery. No-retry clients expose callers to transient failures. Exponential backoff with jitter is the correct policy.",
    },
  ],

  # --- Cross-references and ontology verdict ---
  related_adrs = ["adr-004", "adr-005"],
  ontology_check = {
    decision_string = "A2A implemented in vapora-a2a + vapora-a2a-client; SurrealDB a2a_tasks table for persistence; NATS for async completion; exponential backoff in client",
    invariants_at_risk = ["a2a-protocol", "message-based-coordination"],
    verdict = 'Safe,
  },
}