# Vapora/adrs/adr-013-kg-hybrid-search.ncl

let d = import "adr-defaults.ncl" in

# ADR-013: hybrid (HNSW + BM25 + RRF) similarity search for the knowledge graph.
# The record below is passed to `make_adr` from adr-defaults.ncl
# (presumably that function applies the shared ADR schema/defaults — verify there).
d.make_adr {
  # Identity and lifecycle metadata for this ADR.
  id     = "adr-013",
  title  = "Knowledge Graph Hybrid Search — HNSW + BM25 + Reciprocal Rank Fusion",
  status = 'Accepted,
  date   = "2026-02-26",

  # Problem statement: two independent defects (similarity search ignoring its
  # embedding argument; SCHEMAFULL table missing three fields) plus the existing
  # store that was evaluated and rejected.
  context = "find_similar_executions in KGPersistence discarded its embedding argument entirely and returned N most-recent successful executions ordered by timestamp — a correctness bug masquerading as similarity search. Separately, the kg_executions table was declared SCHEMAFULL but three fields (agent_role, provider, cost_cents) used by PersistedExecution were missing from the schema. SurrealDB drops undefined fields on INSERT in SCHEMAFULL tables, causing every SELECT to return records that failed serde deserialization, silently swallowed by filter_map. stratum-embeddings SurrealDbStore was evaluated but rejected: it loads all records into memory and computes cosine similarity in-process — suitable for bounded document chunks, unsuitable for unbounded KG execution history.",

  # The decision itself: a three-stage retrieval pipeline and the migration that
  # both repairs the schema and creates the two indexes.
  decision = "Replace stub similarity functions with a hybrid retrieval pipeline: (1) HNSW (SurrealDB 3 native ANN vector index) over the embedding field for semantic proximity, (2) BM25 (SurrealDB 3 native full-text search) over task_description for exact lexical matches, (3) Reciprocal Rank Fusion (k=60) for scale-invariant score fusion. Add migration 012_kg_hybrid_search.surql: fix the SCHEMAFULL schema gap (add missing fields), define the HNSW index on embedding, define the full-text search index on task_description.",

  # Supporting arguments. Each entry pairs a short `claim` with its `detail`.
  rationale = [
    {
      claim  = "Hybrid retrieval is required because HNSW and BM25 cover disjoint failure modes",
      detail = "HNSW (semantic) misses exact keyword matches: 'cargo clippy warnings' may not find 'clippy deny warnings fix' if the embedding model compresses the phrase differently. BM25 (lexical) misses semantic proximity: a query about error handling may not match a record about exception management if terminology differs. RRF fuses both rank lists without requiring score normalization.",
    },
    {
      claim  = "The schema bug must be fixed before the index can be created",
      detail = "HNSW index creation on a SCHEMAFULL table requires the indexed field to exist in the schema. The missing agent_role/provider/cost_cents fields also caused all SELECT results to fail deserialization — fixing the schema is a prerequisite for any query correctness, not just the new index.",
    },
    {
      claim  = "RRF k=60 is the standard fusion constant and requires no tuning",
      detail = "k=60 was established by Cormack et al. (2009) as a robust default. Score-based fusion alternatives (linear combination, learned weights) require per-corpus calibration. RRF is rank-only and therefore insensitive to score scale differences between HNSW cosine similarity and BM25 TF-IDF.",
    },
  ],

  # Expected outcomes of adopting the decision, split into benefits and costs.
  consequences = {
    positive = [
      "find_similar_executions and find_similar_rlm_tasks now use the embedding argument correctly",
      "SCHEMAFULL schema gap eliminated — all PersistedExecution fields are persisted and deserialized correctly",
      "Hybrid search handles both exact crate/error-code queries (BM25) and semantic task similarity (HNSW)",
      # NOTE(review): "sub-linear" and "does not degrade" are in tension — sub-linear
      # query time still grows with record count. Consider rewording the claim.
      "HNSW ANN search is sub-linear in the number of records — query time does not degrade with accumulation",
    ],
    negative = [
      "SurrealDB 3 native HNSW requires SurrealDB >= 3.0 at runtime; earlier versions will fail the migration",
      "RRF does not expose relevance scores to callers — ranking is ordinal only",
      "Embedding dimension is fixed at creation time; changing the embedding model requires dropping and rebuilding the HNSW index",
    ],
  },

  # Options that were weighed and the reason each was turned down.
  alternatives_considered = [
    {
      option       = "stratum-embeddings SurrealDbStore in-process cosine similarity",
      why_rejected = "Loads all records into memory for every query. Acceptable for bounded document chunk sets; unacceptable for KG execution history that accumulates unbounded records across all agents and tasks over time.",
    },
    {
      option       = "Pure HNSW semantic search",
      why_rejected = "Misses exact keyword matches for crate names, error codes, and specific command strings that are semantically compressed by embedding models.",
    },
    {
      option       = "Pure BM25 lexical search",
      why_rejected = "Misses semantic equivalence for concept-level queries where terminology varies between the query and the stored record.",
    },
  ],

  # Constraints that follow from the decision. Each carries a `check` record
  # tagged 'Grep with a pattern, the paths to search, and a `must_be_empty` flag.
  # Presumably `must_be_empty = false` means the check passes only when the grep
  # finds at least one match — confirm against the checker implementation.
  constraints = [
    {
      id        = "hnsw-index-required-for-kg",
      claim     = "kg_executions must have an HNSW index on the embedding field — brute-force in-process vector search is not permitted for this table",
      scope     = "migrations/012_kg_hybrid_search.surql",
      severity  = 'Hard,
      # The pattern string evaluates to `HNSW\|hnsw`. This is an alternation only
      # under basic-regex (grep BRE) syntax; under ERE/PCRE/ripgrep `\|` is a
      # literal pipe — TODO confirm which dialect the checker uses.
      check     = { tag = 'Grep, pattern = "HNSW\\|hnsw", paths = ["migrations/012_kg_hybrid_search.surql"], must_be_empty = false },
      rationale = "In-process similarity search over an unbounded table is the rejected alternative. The HNSW index must exist before production queries are issued.",
    },
    {
      id        = "hybrid-rrf-fusion",
      claim     = "KGPersistence similarity queries must fuse HNSW and BM25 results via RRF — no single-strategy retrieval",
      scope     = "crates/vapora-knowledge-graph/src/persistence.rs",
      severity  = 'Hard,
      # NOTE(review): if `\|` is alternation, a single occurrence of any one term
      # (e.g. "BM25" in a comment) appears sufficient to satisfy this pattern, so
      # the check does not by itself demonstrate that both strategies are fused
      # via RRF as the claim requires — verify whether a stricter check is wanted.
      check     = { tag = 'Grep, pattern = "rrf\\|reciprocal_rank\\|BM25\\|full_text", paths = ["crates/vapora-knowledge-graph/src/persistence.rs"], must_be_empty = false },
      rationale = "Reverting to single-strategy retrieval silently degrades search quality without any compile-time signal.",
    },
  ],

  # Cross-references to other ADRs by id.
  related_adrs = ["adr-009"],

  # Ontology review summary: a one-line restatement of the decision, the list of
  # invariants considered at risk (none listed here), and the resulting verdict.
  ontology_check = {
    decision_string    = "HNSW + BM25 + RRF hybrid search in KGPersistence; migration 012 fixes SCHEMAFULL gap and creates indexes; stratum-embeddings in-process scan rejected",
    invariants_at_risk = [],
    verdict            = 'Safe,
  },
}