Vapora/adrs/adr-011-a2a-protocol.ncl

87 lines
5.5 KiB
Text
Raw Normal View History

# ADR-011 — A2A protocol implementation (SurrealDB persistence + NATS coordination).
#
# The record below is passed to `make_adr`, imported from adr-defaults.ncl.
# NOTE(review): presumably `make_adr` applies the shared ADR defaults/contracts —
# confirm in adr-defaults.ncl; nothing in this file shows what it does.
let d = import "adr-defaults.ncl" in
d.make_adr {
  # ---- Identity ------------------------------------------------------------
  id = "adr-011",
  title = "A2A Protocol Implementation: SurrealDB Persistence + NATS Async Coordination",
  status = 'Accepted,
  date = "2026-02-07",

  # ---- Narrative -----------------------------------------------------------
  # NOTE(review): the context text says "As of 2026-03-27", which is later than
  # `date` above — confirm whether `date` is meant to be the original decision date.
  context = "Vapora needs to interoperate with external agent ecosystems (Claude Code, Google ADK). The A2A (Agent-to-Agent) protocol provides a standardized interface. The implementation required learning from a prior failed attempt where task state was stored in a HashMap (lost on restart) and task completion was faked with tokio::sleep(5). The remediated implementation uses SurrealDB for persistence and real NATS subscribers for async coordination. As of 2026-03-27, vapora-a2a exposes an HTTP + NATS server and vapora-a2a-client provides the Rust client library with retry/backoff.",
  decision = "A2A is implemented as two crates: vapora-a2a (server) and vapora-a2a-client (client). Task state is persisted to SurrealDB (table: a2a_tasks, SCHEMAFULL, survives restarts). Async task completion uses real NATS subscribers on vapora.tasks.completed and vapora.tasks.failed — no polling, no sleep-based fake completion. The client implements exponential backoff with jitter for 5xx/network errors; 4xx errors are not retried.",

  # ---- Rationale: one claim/detail pair per argument -----------------------
  rationale = [
    {
      claim = "SurrealDB persistence is mandatory — in-memory task state is not acceptable",
      detail = "A server restart under an in-memory HashMap would lose all in-flight task state. A2A clients waiting for task completion would hang indefinitely with no recoverable state. SurrealDB tasks survive restarts and remain queryable by their task_id.",
    },
    {
      claim = "Real NATS coordination eliminates the race condition in timeout-based fake completion",
      detail = "tokio::sleep(5) as a task completion mechanism is not async coordination — it is a lie with a timer. Real completion events from NATS subscribers deliver results within milliseconds of actual task completion and handle partial failures correctly.",
    },
    {
      claim = "Smart retry classification prevents infinite loops on client bugs",
      detail = "Retrying 4xx responses forever would mask client bugs (bad request format, missing auth) and cause a thundering herd on config errors. 5xx and network errors are genuinely transient; 4xx errors require caller intervention.",
    },
  ],

  # ---- Consequences --------------------------------------------------------
  consequences = {
    positive = [
      "A2A task state survives server restarts — clients can poll task_id for completion",
      "NATS-based completion delivers results without polling — O(1) completion latency regardless of task duration",
      "Client retry with backoff handles transient server errors transparently",
      "7 E2E integration tests (marked #[ignore]) verify the full task lifecycle with real SurrealDB + NATS",
    ],
    negative = [
      "Integration tests require live SurrealDB + NATS — they are marked #[ignore] in CI without service dependencies",
      "The DashMap<task_id, oneshot::Sender> in the NATS bridge leaks entries for tasks that never complete — requires TTL cleanup",
    ],
  },

  # ---- Alternatives that were evaluated and rejected -----------------------
  alternatives_considered = [
    {
      option = "gRPC instead of JSON-RPC 2.0",
      why_rejected = "HTTP/2 infrastructure required. More complex than JSON-RPC for the current load profile. A2A specification uses HTTP/1.1 + JSON — gRPC would require a protocol translation layer.",
    },
    {
      option = "PostgreSQL or SQLite for A2A task persistence",
      why_rejected = "SurrealDB already used in vapora. Adding a second database engine doubles operational burden with no architectural benefit for A2A's data model.",
    },
  ],

  # ---- Constraints ---------------------------------------------------------
  # Each constraint carries a 'Grep check: a pattern, the paths to search, and a
  # `must_be_empty` flag.
  # NOTE(review): presumably `must_be_empty = true` fails the check on any match
  # and `false` requires at least one match — confirm against the checker.
  # NOTE(review): `scope` values omit the "crates/" prefix used in `check.paths` —
  # confirm whether `scope` is free text or a path the tooling resolves.
  constraints = [
    {
      id = "a2a-tasks-in-surrealdb",
      claim = "A2A task state must be persisted to the SurrealDB a2a_tasks table — no in-memory HashMap storage",
      scope = "vapora-a2a/src/task_manager.rs",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "a2a_tasks",
        paths = ["crates/vapora-a2a/src/"],
        must_be_empty = false,
      },
      rationale = "In-memory storage was the root cause of the failed first implementation. This constraint is a hard lesson from a production incident.",
    },
    {
      id = "no-sleep-based-completion",
      claim = "No tokio::sleep call may substitute for real async task completion in vapora-a2a",
      scope = "vapora-a2a/src/bridge.rs",
      severity = 'Hard,
      # NOTE(review): this pattern only matches text containing both "sleep" and
      # "task"; assuming line-based grep, a sleep call on a line without "task"
      # would not be caught — confirm this is the intended strictness.
      check = {
        tag = 'Grep,
        pattern = "sleep.*task|task.*sleep",
        paths = ["crates/vapora-a2a/src/bridge.rs"],
        must_be_empty = true,
      },
      rationale = "Sleep-based fake completion was the specific mechanism that made the first implementation fraudulent. It must never return.",
    },
    {
      id = "client-retry-policy",
      claim = "vapora-a2a-client must use RetryPolicy with exponential backoff — no fixed-interval retries or no-retry implementations",
      scope = "vapora-a2a-client/src/",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "RetryPolicy",
        paths = ["crates/vapora-a2a-client/src/"],
        must_be_empty = false,
      },
      rationale = "Fixed-interval retries cause thundering herds on server recovery. No-retry clients expose callers to transient failures. Exponential backoff with jitter is the correct policy.",
    },
  ],

  # ---- Cross-references ----------------------------------------------------
  related_adrs = ["adr-004", "adr-005"],
  ontology_check = {
    decision_string = "A2A implemented in vapora-a2a + vapora-a2a-client; SurrealDB a2a_tasks table for persistence; NATS for async completion; exponential backoff in client",
    invariants_at_risk = ["a2a-protocol", "message-based-coordination"],
    verdict = 'Safe,
  },
}