# Vapora/adrs/adr-008-llm-routing-tiers.ncl

# ADR-008: how vapora-llm-router picks a provider and enforces budgets.
# The record literal below is passed to `make_adr` from adr-defaults.ncl.
let d = import "adr-defaults.ncl" in
d.make_adr {
  # ---- Identity -----------------------------------------------------------
  id = "adr-008",
  title = "Three-Tier LLM Routing: Rules → Dynamic → Override with Budget Enforcement",
  status = 'Accepted,
  date = "2024-11-01",

  # ---- Problem statement and the chosen design ----------------------------
  context = "vapora-llm-router must select a provider for every LLM call. The selection must be deterministic for known task patterns (architecture tasks always go to the most capable model), adaptive to runtime conditions (provider outage, budget exhaustion), and overridable for debugging. Budget enforcement is a separate but tightly coupled concern: the router must refuse calls that would exceed per-role budget limits.",
  decision = "The LLMRouter implements three-tier selection: (1) Rules tier — pattern-matched routing rules from llm-router.toml (e.g. architecture tasks → Claude); (2) Dynamic tier — runtime scoring based on availability, latency history, and current load when no rule matches; (3) Override tier — explicit provider specification with audit log entry. BudgetEnforcer runs before the router returns a provider: if the selected provider would breach the per-role budget, it falls back to the cheapest provider in the fallback chain. If all providers are over budget, the call is rejected with a BudgetExceeded error.",

  # ---- Why this design: one { claim, detail } pair per argument -----------
  rationale = [
    {
      claim = "Rules tier provides deterministic routing for known patterns",
      detail = "Architecture tasks that always benefit from the most capable model should not be subject to dynamic scoring variability. Static rules give operators predictable routing behavior for their most important task types.",
    },
    {
      claim = "Dynamic tier enables automatic recovery from provider failures",
      detail = "When a provider has elevated error rates or latency, the dynamic scoring de-ranks it without operator intervention. Static-only routing would require a manual config change to route around an incident.",
    },
    {
      claim = "Budget enforcement at the router layer is the only viable enforcement point",
      detail = "BudgetEnforcer must see every token before it's spent. The LLMRouter is the single chokepoint — all LLM calls go through it (see ADR-006). This makes the router the correct enforcement point, not individual agent implementations.",
    },
  ],

  # ---- Expected outcomes, split into benefits and costs -------------------
  consequences = {
    positive = [
      "Routing rules and budget limits are in llm-router.toml — no code changes for common configuration updates",
      "BudgetEnforcer prevents runaway spending even if an agent is stuck in a loop",
      "Override tier with audit logging enables debugging without disabling enforcement for other roles",
      "Fallback chains ensure graceful degradation: Claude → GPT-4 → Gemini → Ollama",
    ],
    negative = [
      "Three-tier selection adds latency to the provider selection path (~1ms) — acceptable but measurable",
      "Budget limits must be set conservatively to avoid rejecting legitimate calls near period boundaries",
    ],
  },

  # ---- Designs that were evaluated and turned down ------------------------
  alternatives_considered = [
    {
      option = "Static rules only",
      why_rejected = "No adaptation to provider failures. Budget enforcement would require a separate service. Provider outages would surface as errors instead of transparent fallback.",
    },
    {
      option = "Dynamic only (no static rules)",
      why_rejected = "Cold-start problem: no execution history to score providers on. Determinism guarantee lost — debugging routing decisions requires tracing the scoring algorithm.",
    },
  ],

  # ---- Constraints, each carrying a grep-based `check` --------------------
  # NOTE(review): the meaning of `must_be_empty` is defined by the check
  # runner, which is not visible here. Presumably `false` requires at least
  # one match and `true` requires none — confirm before relying on it.
  constraints = [
    {
      id = "budget-enforcer-runs-before-dispatch",
      claim = "BudgetEnforcer must be invoked before any provider receives a token",
      scope = "vapora-llm-router/src/router.rs",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "BudgetEnforcer",
        paths = ["crates/vapora-llm-router/src/router.rs"],
        must_be_empty = false,
      },
      rationale = "Post-dispatch budget checks cannot prevent overspending — tokens are already spent when the response arrives.",
    },
    {
      id = "routing-rules-in-config",
      claim = "All routing rules must be declared in llm-router.toml — no hardcoded provider names in agent code",
      # NOTE(review): `scope` names every crate calling LLMClient, but the
      # check below only lists crates/vapora-agents/src/ — confirm whether
      # other crates should be added to `paths`.
      scope = "vapora (all crates calling LLMClient)",
      severity = 'Hard,
      check = {
        tag = 'Grep,
        pattern = "claude-opus|gpt-4|gemini-pro",
        paths = ["crates/vapora-agents/src/"],
        must_be_empty = true,
      },
      rationale = "Hardcoded provider names in agent code bypass the routing tier entirely and make provider substitution impossible without code changes.",
    },
    {
      id = "override-tier-audit-log",
      claim = "Provider override requests must produce an audit log entry",
      scope = "vapora-llm-router/src/router.rs",
      severity = 'Soft,
      # NOTE(review): the pattern is an alternation, so a file containing
      # only "override" (and no "audit") would presumably still match —
      # confirm this looseness is intended for a Soft constraint.
      check = {
        tag = 'Grep,
        pattern = "override|audit",
        paths = ["crates/vapora-llm-router/src/router.rs"],
        must_be_empty = false,
      },
      rationale = "Override bypasses the rules and dynamic tiers. Without an audit trail, debugging unexpected provider selection is difficult in production.",
    },
  ],

  # ---- Cross-references and ontology review -------------------------------
  related_adrs = ["adr-006", "adr-009"],
  ontology_check = {
    decision_string = "three-tier LLM routing (rules/dynamic/override); BudgetEnforcer runs before dispatch; routing rules in llm-router.toml; no hardcoded provider names in agents",
    invariants_at_risk = ["cost-aware-routing", "provider-abstraction"],
    verdict = 'Safe,
  },
}