# Vapora/adrs/adr-016-agent-hot-reload-stable-identity.ncl

let d = import "adr-defaults.ncl" in

# ADR-016: records the decision to key agent learning state by a stable,
# role-derived identifier and to reload agent configuration without a process
# restart. The record below is handed to `make_adr` from adr-defaults.ncl;
# what that function validates or defaults is not visible from this file.
d.make_adr {
  id     = "adr-016",
  title  = "Agent Hot-Reload — Stable Role Identity and Zero-Downtime Config Reload",
  status = 'Accepted,
  date   = "2026-03-02",

  # Problem statement: what was broken before this decision and why it mattered.
  context = "AgentMetadata::id was Uuid::new_v4() generated at startup. learning_profiles in AgentCoordinator and agent_id in KGPersistence execution records used this UUID as their key. Every process restart or config reload rotated all UUIDs, orphaning all accumulated expertise profiles. An agent that had processed 500 tasks and learned optimal patterns for its role would reset to zero on the next deploy. VAPORA's learning-based agent selection (ADR-009) provides no value if learning state is ephemeral.",

  # The change that was adopted: a stable identity field plus the hot-reload
  # mechanism (drain, re-spawn, and the two triggers that invoke it).
  decision = "Add stable_id: String to AgentMetadata, computed as role.clone() at construction before role is moved. Switch all learning_profiles keys and KG execution records from ephemeral id (UUID) to stable_id_or_role(). Implement hot-reload: drain_role(role) removes agents from registry + drops executor_channels Senders (channel closure causes executor loops to exit cleanly); reload_agents re-spawns capability and config agents; learning_profiles DashMap is untouched throughout. SIGHUP handler and POST /reload endpoint both call reload_agents.",

  # Supporting arguments. Each entry pairs a one-line claim with the detail
  # that justifies it.
  rationale = [
    {
      claim  = "role is the correct stable identity granularity for learning profiles",
      detail = "AgentScoringService ranks agents at the role level: it receives Vec<(agent_id, Option<LearningProfile>)> where multiple agents of the same role compete. The profile that matters is role-level expertise (how well 'developer' handles 'coding' tasks), not per-instance expertise. Using role as stable_id aggregates learning across all instances of the same role, is deterministic across restarts, and requires no UUID→role mapping table.",
    },
    {
      claim  = "Dropping the Sender is the correct shutdown signal for executor loops",
      detail = "Each executor runs while let Some(task) = rx.recv().await. When the Sender is dropped (by removing it from executor_channels), the channel closes and recv() returns None, causing the loop to exit. No explicit shutdown token or cancellation signal is needed. The loop drains buffered messages before exiting — in-flight tasks complete normally.",
    },
    {
      claim  = "BudgetManager and LLMRouter are deliberately excluded from hot-reload scope",
      detail = "BudgetManager holds per-role budget state accumulated since last process start. Reloading it mid-flight would reset budget counters, potentially allowing over-budget LLM calls that were blocked before the reload. LLMRouter routing rules could change provider selection in ways incompatible with ongoing workflow stages. Both require process restart for config changes — this is a documented limitation, not an oversight.",
    },
  ],

  # Trade-offs accepted by this decision, split into benefits and costs.
  consequences = {
    positive = [
      "Learning expertise accumulated over any number of restarts and hot-reloads is preserved",
      "KG execution records are partitioned by role (stable_id) — historical records and new records share the same key space",
      "SIGHUP reload is sub-millisecond for the drain+re-spawn sequence; brief NoAvailableAgent window is documented for callers",
      "POST /reload enables operator-triggered and CI-triggered config updates without process restart",
    ],
    negative = [
      "Brief availability window between drain and re-registration: assign_task returns NoAvailableAgent. Callers must implement retry.",
      "BudgetManager and LLMRouter config changes require process restart — hot-reload does not cover the full config surface.",
      "stable_id = role means two agents of the same role share learning history — per-instance specialization within a role is not supported.",
    ],
  },

  # Options that were evaluated and the reason each one was turned down.
  alternatives_considered = [
    {
      option       = "Persist UUID→stable_id mapping in SurrealDB",
      why_rejected = "Adds a mapping table, a migration, and a read on every profile lookup. role is already available at construction time and is deterministic — no persistence needed.",
    },
    {
      option       = "Include BudgetManager in hot-reload",
      why_rejected = "Resetting budget counters mid-flight allows LLM calls that were correctly blocked (budget exceeded) to proceed after reload. Budget state must be continuous across config changes.",
    },
    {
      option       = "Use a shutdown token (CancellationToken) instead of Sender drop for executor cleanup",
      why_rejected = "CancellationToken requires propagation through all executor spawn sites and cooperative check points in the task loop. Channel closure is implicit and automatic — every recv() point is already a shutdown check point.",
    },
  ],

  # Constraints with an attached check. Every check here is tagged 'Grep and
  # carries a pattern plus the files to search. `must_be_empty = false`
  # presumably means the pattern is required to match at least once — confirm
  # against the checker behind adr-defaults.ncl.
  constraints = [
    {
      id        = "stable-id-is-role",
      claim     = "AgentMetadata::stable_id must be set to role.clone() before role is moved at construction — no UUID, no random suffix",
      scope     = "crates/vapora-agents/src/registry.rs",
      severity  = 'Hard,
      # NOTE(review): after string unescaping the pattern is
      # `stable_id.*role\|role.*clone.*stable`. `\|` is alternation in GNU grep
      # basic regex but a literal pipe in extended/PCRE/Rust-regex engines —
      # confirm which engine evaluates this check.
      check     = { tag = 'Grep, pattern = "stable_id.*role\\|role.*clone.*stable", paths = ["crates/vapora-agents/src/registry.rs"], must_be_empty = false },
      rationale = "Any non-role value for stable_id defeats the learning profile persistence guarantee.",
    },
    {
      id        = "profile-key-uses-stable-id",
      claim     = "learning_profiles lookups and KG execution record agent_id must use stable_id_or_role(), never the ephemeral UUID id",
      # NOTE(review): scope names only coordinator.rs, while the check below
      # also searches executor.rs — confirm whether scope should list both.
      scope     = "crates/vapora-agents/src/coordinator.rs",
      severity  = 'Hard,
      check     = { tag = 'Grep, pattern = "stable_id_or_role", paths = ["crates/vapora-agents/src/coordinator.rs", "crates/vapora-agents/src/executor.rs"], must_be_empty = false },
      rationale = "Using UUID as the profile key is the original bug — learning profiles would be orphaned on every restart.",
    },
    {
      id        = "learning-profiles-survive-drain",
      claim     = "drain_role must not clear learning_profiles — only registry and executor_channels entries are removed",
      scope     = "crates/vapora-agents/src/coordinator.rs",
      severity  = 'Hard,
      # NOTE(review): this pattern only looks for the identifier `drain_role`
      # in coordinator.rs; on its own it does not show that learning_profiles
      # is left untouched, so the claim above still relies on code review.
      check     = { tag = 'Grep, pattern = "drain_role", paths = ["crates/vapora-agents/src/coordinator.rs"], must_be_empty = false },
      rationale = "Clearing learning_profiles in drain_role would silently reset the learning system on every hot-reload, defeating the purpose of this ADR.",
    },
  ],

  # adr-009 is the learning-based agent selection ADR cited in `context`;
  # the relationship to adr-014 is not described in this file.
  related_adrs = ["adr-009", "adr-014"],

  # Condensed restatement of the decision together with the invariant it
  # touches and the recorded verdict. How these fields are consumed is defined
  # outside this file.
  ontology_check = {
    decision_string    = "stable_id = role.clone() on AgentMetadata; profile keys + KG records use stable_id_or_role(); drain_role + re-spawn hot-reload; learning_profiles untouched; SIGHUP + POST /reload endpoints",
    invariants_at_risk = ["learning-based-selection"],
    verdict            = 'Safe,
  },
}