LLM Provider Implementation Guide (VAPORA v1.2.0)
Version: 1.0 · Status: Implementation Guide (Current Production Code) · Last Updated: 2026-02-10 · VAPORA Version: 1.2.0 (Production Ready)
Overview
This guide describes how VAPORA implements multi-provider LLM routing with cost management, fallback chains, and budget enforcement in production today.
Key Components
| Component | Purpose | Status |
|---|---|---|
| LLMRouter | Hybrid routing (rules + dynamic) | ✅ Production |
| LLMClient Trait | Provider abstraction | ✅ Production |
| Cost Tracker | Token usage & cost accounting | ✅ Production |
| Budget Enforcer | Three-tier cost limits | ✅ Production |
| Fallback Chain | Automatic provider failover | ✅ Production |
Architecture Layers
┌─────────────────────────────────────────────────────┐
│ Task Request (Backend / Agent) │
├─────────────────────────────────────────────────────┤
│ LLMRouter.route() - Decision Layer │
│ ├─ Override? (manual) │
│ ├─ Mapping? (rules) │
│ ├─ Available? (rate limits) │
│ ├─ Budget? (cost check) │
│ └─ Score? (quality/cost/latency) │
├─────────────────────────────────────────────────────┤
│ LLMClient Implementations │
│ ├─ ClaudeClient (anthropic SDK) │
│ ├─ OpenAIClient (openai API) │
│ ├─ GeminiClient (google-generativeai) │
│ └─ OllamaClient (REST) │
├─────────────────────────────────────────────────────┤
│ CostTracker + BudgetEnforcer │
│ ├─ Track: tokens, cost, provider │
│ ├─ Enforce: daily/monthly/per-task limits │
│ └─ Report: cost breakdown │
├─────────────────────────────────────────────────────┤
│ Fallback Chain Executor │
│ ├─ Try provider 1 │
│ ├─ Fallback to provider 2 │
│ ├─ Fallback to provider 3 │
│ └─ Fallback to Ollama (last resort) │
├─────────────────────────────────────────────────────┤
│ External APIs │
│ ├─ Claude API (https://api.anthropic.com) │
│ ├─ OpenAI API (https://api.openai.com) │
│ ├─ Google AI (https://generativelanguage.googleapis.com) │
│ └─ Ollama Local (http://localhost:11434) │
└─────────────────────────────────────────────────────┘
1. LLMClient Trait (Provider Abstraction)
Location: crates/vapora-llm-router/src/providers.rs
use anyhow::Result;
use async_trait::async_trait;
use futures::stream::BoxStream;
/// Core provider abstraction - all LLMs implement this
#[async_trait]
pub trait LLMClient: Send + Sync {
/// Generate response from prompt
async fn complete(&self, prompt: &str) -> Result<String>;
/// Stream response chunks (for long outputs)
async fn stream(&self, prompt: &str) -> Result<BoxStream<'static, String>>;
/// Cost per 1000 tokens (includes input + output average)
fn cost_per_1k_tokens(&self) -> f64;
/// Latency estimate (milliseconds)
fn latency_ms(&self) -> u32;
/// Is provider currently available (API key, rate limits)
fn available(&self) -> bool;
/// Provider name for logging/metrics
fn provider_name(&self) -> &str;
}
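Because every provider is reached through this trait, calling code can hold a heterogeneous registry behind `Arc<dyn LLMClient>` and never match on concrete types. A minimal sketch of that usage (the helper below is illustrative, not part of the crate, and assumes the `Result` alias is `anyhow::Result`):

```rust
use std::sync::Arc;
use anyhow::{anyhow, Result};

/// Illustrative helper: pick the cheapest provider that is currently available.
/// The production selection logic lives in LLMRouter (section 2).
fn cheapest_available(providers: &[Arc<dyn LLMClient>]) -> Option<Arc<dyn LLMClient>> {
    providers
        .iter()
        .filter(|p| p.available())
        .min_by(|a, b| {
            a.cost_per_1k_tokens()
                .partial_cmp(&b.cost_per_1k_tokens())
                .unwrap_or(std::cmp::Ordering::Equal)
        })
        .cloned()
}

async fn complete_cheaply(providers: &[Arc<dyn LLMClient>], prompt: &str) -> Result<String> {
    let provider = cheapest_available(providers)
        .ok_or_else(|| anyhow!("no LLM provider available"))?;
    provider.complete(prompt).await
}
```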
Claude Implementation
use anthropic::Anthropic;
use futures::StreamExt;
use tokio_stream::wrappers::ReceiverStream;
pub struct ClaudeClient {
client: Anthropic,
model: String,
max_tokens: usize,
}
impl ClaudeClient {
pub fn new(api_key: &str, model: &str) -> Self {
Self {
client: Anthropic::new(api_key.into()),
model: model.to_string(),
max_tokens: 4096,
}
}
}
#[async_trait]
impl LLMClient for ClaudeClient {
async fn complete(&self, prompt: &str) -> Result<String> {
let message = self.client
.messages()
.create(CreateMessageRequest {
model: self.model.clone(),
max_tokens: self.max_tokens,
messages: vec![
MessageParam::User(ContentBlockParam::Text(
TextBlockParam {
text: prompt.into(),
}
)),
],
..Default::default()
})
.await
.map_err(|e| anyhow!("Claude API error: {}", e))?;
extract_text_response(&message)
}
async fn stream(&self, prompt: &str) -> Result<BoxStream<'static, String>> {
let mut stream = self.client
.messages()
.stream(CreateMessageRequest {
model: self.model.clone(),
max_tokens: self.max_tokens,
messages: vec![
MessageParam::User(ContentBlockParam::Text(
TextBlockParam { text: prompt.into() }
)),
],
..Default::default()
})
.await?;
let (tx, rx) = tokio::sync::mpsc::channel(100);
tokio::spawn(async move {
while let Some(event) = stream.next().await {
match event {
Ok(evt) => {
if let Some(text) = extract_text_delta(&evt) {
let _ = tx.send(text).await;
}
}
Err(e) => {
error!("Claude stream error: {}", e);
break;
}
}
}
});
Ok(Box::pin(ReceiverStream::new(rx)))
}
fn cost_per_1k_tokens(&self) -> f64 {
// Approximate list prices per MTok (input/output): Opus $15/$75, Sonnet $3/$15, Haiku 3.5 $0.80/$4
match self.model.as_str() {
"opus-4" | "claude-opus-4-5" => 0.015, // Weighted avg
"sonnet-4" | "claude-sonnet-4-5" => 0.003, // Weighted avg
"haiku-3" | "claude-haiku-3" => 0.0008, // Weighted avg
_ => 0.01,
}
}
fn latency_ms(&self) -> u32 {
800 // Typical Claude latency
}
fn available(&self) -> bool {
!self.client.api_key().is_empty()
}
fn provider_name(&self) -> &str {
"claude"
}
}
OpenAI Implementation
use openai_api::OpenAI;
pub struct OpenAIClient {
client: OpenAI,
model: String,
}
#[async_trait]
impl LLMClient for OpenAIClient {
async fn complete(&self, prompt: &str) -> Result<String> {
let response = self.client
.create_chat_completion(CreateChatCompletionRequest {
model: self.model.clone(),
messages: vec![
ChatCompletionRequestMessage::User(
ChatCompletionRequestUserMessage {
content: ChatCompletionContentPart::Text(
ChatCompletionContentPartText {
text: prompt.into(),
}
),
..Default::default()
}
),
],
temperature: Some(0.7),
max_tokens: Some(2048),
..Default::default()
})
.await?;
Ok(response.choices[0].message.content.clone())
}
async fn stream(&self, prompt: &str) -> Result<BoxStream<'static, String>> {
// Similar implementation using OpenAI streaming
todo!()
}
fn cost_per_1k_tokens(&self) -> f64 {
// GPT-4: $30/$60, GPT-4 Turbo: $10/$30 per MTok (input/output)
match self.model.as_str() {
"gpt-4" => 0.03,
"gpt-4-turbo" => 0.025,
"gpt-3.5-turbo" => 0.002,
_ => 0.01,
}
}
fn latency_ms(&self) -> u32 {
600
}
fn available(&self) -> bool {
!self.client.api_key().is_empty()
}
fn provider_name(&self) -> &str {
"openai"
}
}
Ollama Implementation (Local, Free)
pub struct OllamaClient {
endpoint: String,
model: String,
}
#[async_trait]
impl LLMClient for OllamaClient {
async fn complete(&self, prompt: &str) -> Result<String> {
let client = reqwest::Client::new();
let response = client
.post(format!("{}/api/generate", self.endpoint))
.json(&serde_json::json!({
"model": self.model,
"prompt": prompt,
"stream": false,
}))
.send()
.await?;
let data: serde_json::Value = response.json().await?;
Ok(data["response"].as_str().unwrap_or("").to_string())
}
async fn stream(&self, prompt: &str) -> Result<BoxStream<'static, String>> {
// Stream from Ollama's streaming endpoint
todo!()
}
fn cost_per_1k_tokens(&self) -> f64 {
0.0 // Local, free
}
fn latency_ms(&self) -> u32 {
2000 // Local, slower
}
fn available(&self) -> bool {
// Check if Ollama is running
true // Simplified; real impl checks connectivity
}
fn provider_name(&self) -> &str {
"ollama"
}
}
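Gemini Implementation (Sketch)
The architecture diagram also lists a GeminiClient; its actual implementation is not reproduced in this guide. A minimal REST-based sketch against Google AI's generateContent endpoint would follow the same shape as the Ollama client. The request/response field names below reflect the public v1beta REST API, and the cost/latency figures are illustrative:

```rust
pub struct GeminiClient {
    api_key: String,
    model: String, // e.g. "gemini-2.0-flash"
}

#[async_trait]
impl LLMClient for GeminiClient {
    async fn complete(&self, prompt: &str) -> Result<String> {
        let url = format!(
            "https://generativelanguage.googleapis.com/v1beta/models/{}:generateContent?key={}",
            self.model, self.api_key
        );
        let body = serde_json::json!({
            "contents": [{ "parts": [{ "text": prompt }] }]
        });
        let response = reqwest::Client::new().post(&url).json(&body).send().await?;
        let data: serde_json::Value = response.json().await?;
        // First candidate's first text part; empty string if the shape is unexpected
        Ok(data["candidates"][0]["content"]["parts"][0]["text"]
            .as_str()
            .unwrap_or("")
            .to_string())
    }

    async fn stream(&self, prompt: &str) -> Result<BoxStream<'static, String>> {
        // Would use the streamGenerateContent endpoint; omitted here
        todo!()
    }

    fn cost_per_1k_tokens(&self) -> f64 {
        0.005 // Matches the figure used in config/llm-router.toml
    }

    fn latency_ms(&self) -> u32 {
        500 // Illustrative estimate
    }

    fn available(&self) -> bool {
        !self.api_key.is_empty()
    }

    fn provider_name(&self) -> &str {
        "gemini"
    }
}
```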
2. LLMRouter - Decision Engine
Location: crates/vapora-llm-router/src/router.rs
Routing Decision Flow
pub struct LLMRouter {
providers: DashMap<String, Arc<Box<dyn LLMClient>>>,
mappings: HashMap<TaskType, Vec<String>>, // Task → [Claude, GPT-4, Gemini]
cost_tracker: Arc<CostTracker>,
budget_enforcer: Arc<BudgetEnforcer>,
}
impl LLMRouter {
/// Main routing decision: hybrid (rules + dynamic + manual)
pub async fn route(
&self,
context: TaskContext,
override_provider: Option<String>,
) -> Result<String> {
let task_id = &context.task_id;
// 1. MANUAL OVERRIDE (highest priority)
if let Some(provider_name) = override_provider {
info!("Task {}: Manual override to {}", task_id, provider_name);
return Ok(provider_name);
}
// 2. GET MAPPING (rules-based)
let mut candidates = self
.mappings
.get(&context.task_type)
.cloned()
.unwrap_or_else(|| vec!["claude".into(), "openai".into(), "ollama".into()]);
info!("Task {}: Default mapping candidates: {:?}", task_id, candidates);
// 3. FILTER BY AVAILABILITY (rate limits, API keys)
candidates.retain(|name| {
if let Some(provider) = self.providers.get(name) {
provider.available()
} else {
false
}
});
if candidates.is_empty() {
return Err(anyhow!("No available providers for task {}", task_id));
}
// 4. FILTER BY BUDGET
if let Some(budget_cents) = context.budget_cents {
candidates.retain(|name| {
if let Some(provider) = self.providers.get(name) {
let cost = provider.cost_per_1k_tokens();
cost < (budget_cents as f64 / 100.0)
} else {
false
}
});
if candidates.is_empty() {
warn!(
"Task {}: All candidates exceed budget {} cents",
task_id, budget_cents
);
// Fallback to cheapest option
return Ok("ollama".into());
}
}
// 5. SCORE & SELECT (quality/cost/latency balance)
let selected = self.select_optimal(&candidates, &context)?;
info!(
"Task {}: Selected provider {} (from {:?})",
task_id, selected, candidates
);
// 6. LOG IN COST TRACKER
self.cost_tracker.record_provider_selection(
&context.task_id,
&selected,
&context.task_type,
);
Ok(selected)
}
/// Score each candidate and select best
fn select_optimal(
&self,
candidates: &[String],
context: &TaskContext,
) -> Result<String> {
let best = candidates
.iter()
.max_by(|a, b| {
let score_a = self.score_provider(a, context);
let score_b = self.score_provider(b, context);
score_a.partial_cmp(&score_b).unwrap()
})
.ok_or_else(|| anyhow!("No candidates to score"))?;
Ok(best.clone())
}
/// Scoring formula: quality * 0.4 + cost * 0.3 + latency * 0.3
fn score_provider(&self, provider_name: &str, context: &TaskContext) -> f64 {
if let Some(provider) = self.providers.get(provider_name) {
// Quality score comes from the task's requirement (applied equally to every candidate)
let quality_score = match context.quality_requirement {
Quality::Critical => 1.0,
Quality::High => 0.9,
Quality::Medium => 0.7,
Quality::Low => 0.5,
};
// Cost score (lower cost = higher score)
let cost = provider.cost_per_1k_tokens();
let cost_score = 1.0 / (1.0 + cost); // Inverse function
// Latency score (lower latency = higher score)
let latency = provider.latency_ms() as f64;
let latency_score = 1.0 / (1.0 + latency / 1000.0);
// Final score
quality_score * 0.4 + cost_score * 0.3 + latency_score * 0.3
} else {
0.0
}
}
}
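Because the quality term depends only on the task, the cost and latency terms are what actually separate candidates. A worked example using the figures quoted earlier (Claude: $0.015/1k, 800 ms; Ollama: $0.00/1k, 2000 ms) under a `Quality::High` requirement:

```rust
// Stand-alone restatement of the scoring formula, for illustration only.
fn score(quality: f64, cost_per_1k: f64, latency_ms: f64) -> f64 {
    let cost_score = 1.0 / (1.0 + cost_per_1k);
    let latency_score = 1.0 / (1.0 + latency_ms / 1000.0);
    quality * 0.4 + cost_score * 0.3 + latency_score * 0.3
}

// Quality::High → 0.9 for every candidate:
//   claude: 0.9*0.4 + (1/1.015)*0.3 + (1/1.8)*0.3 ≈ 0.36 + 0.296 + 0.167 ≈ 0.82
//   ollama: 0.9*0.4 + (1/1.0)*0.3   + (1/3.0)*0.3 ≈ 0.36 + 0.300 + 0.100 ≈ 0.76
// → Claude wins on latency despite the higher cost, so it is selected
//   whenever it is available and within budget.
```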
Task Context Definition
#[derive(Clone, Debug, PartialEq, Eq, Hash)] // TaskType is used as a HashMap/DashMap key
pub enum TaskType {
CodeGeneration,
CodeReview,
ArchitectureDesign,
Documentation,
GeneralQuery,
Embeddings,
SecurityAnalysis,
}
#[derive(Clone, Debug)]
pub enum Quality {
Low, // Fast & cheap (Ollama, Gemini Flash)
Medium, // Balanced (GPT-3.5, Gemini Pro)
High, // Good quality (GPT-4, Claude Sonnet)
Critical, // Best possible (Claude Opus)
}
#[derive(Clone, Debug)]
pub struct TaskContext {
pub task_id: String,
pub task_type: TaskType,
pub domain: String, // "backend", "frontend", "infra"
pub complexity: u8, // 0-100 complexity score
pub quality_requirement: Quality,
pub latency_required_ms: u32,
pub budget_cents: Option<u32>, // Max cost in cents
}
Default Mappings
pub fn default_mappings() -> HashMap<TaskType, Vec<String>> {
let mut mappings = HashMap::new();
// Code Generation → Claude (best reasoning)
mappings.insert(TaskType::CodeGeneration, vec![
"claude".into(),
"openai".into(),
"ollama".into(),
]);
// Code Review → Claude Sonnet (balanced)
mappings.insert(TaskType::CodeReview, vec![
"claude".into(),
"openai".into(),
]);
// Architecture Design → Claude Opus (deep reasoning)
mappings.insert(TaskType::ArchitectureDesign, vec![
"claude".into(),
"openai".into(),
]);
// Documentation → GPT-4 (good formatting)
mappings.insert(TaskType::Documentation, vec![
"openai".into(),
"claude".into(),
]);
// Quick Queries → Gemini (fast)
mappings.insert(TaskType::GeneralQuery, vec![
"gemini".into(),
"ollama".into(),
]);
// Embeddings → Ollama (local, free)
mappings.insert(TaskType::Embeddings, vec![
"ollama".into(),
"openai".into(),
]);
mappings
}
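Note that `TaskType::SecurityAnalysis` has no entry here; as step 2 of `route()` shows, unmapped task types fall back to the built-in default candidate list:

```rust
let mappings = default_mappings();

// SecurityAnalysis is not mapped, so route() uses its built-in default:
let candidates = mappings
    .get(&TaskType::SecurityAnalysis)
    .cloned()
    .unwrap_or_else(|| vec!["claude".into(), "openai".into(), "ollama".into()]);

assert_eq!(candidates, vec!["claude", "openai", "ollama"]);
```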
3. Cost Tracking
Location: crates/vapora-llm-router/src/cost_tracker.rs
use std::sync::atomic::AtomicU32;
use chrono::{DateTime, Utc};
use dashmap::DashMap;
pub struct CostTracker {
/// Total cost in cents
total_cost_cents: AtomicU32,
/// Cost by provider
cost_by_provider: DashMap<String, ProviderCostMetric>,
/// Cost by task type
cost_by_task: DashMap<TaskType, TaskCostMetric>,
/// Hourly breakdown (for trending)
hourly_costs: DashMap<String, u32>, // "2026-02-10T14" → cents
}
#[derive(Clone)]
pub struct ProviderCostMetric {
pub total_tokens: u64,
pub total_cost_cents: u32,
pub call_count: u32,
pub avg_cost_per_call: f64,
pub last_call: DateTime<Utc>,
}
pub struct TaskCostMetric {
pub task_type: TaskType,
pub total_cost_cents: u32,
pub call_count: u32,
pub avg_cost_per_call: f64,
}
impl CostTracker {
pub fn new() -> Self {
Self {
total_cost_cents: AtomicU32::new(0),
cost_by_provider: DashMap::new(),
cost_by_task: DashMap::new(),
hourly_costs: DashMap::new(),
}
}
/// Record a provider selection for future cost tracking
pub fn record_provider_selection(
&self,
task_id: &str,
provider: &str,
task_type: &TaskType,
) {
debug!("Task {} using {}", task_id, provider);
// Track which provider was selected (actual token/cost data comes from callbacks)
}
/// Track actual cost after API call
pub fn track_api_call(
&self,
provider: &str,
input_tokens: u32,
output_tokens: u32,
cost_cents: u32,
) {
let total_tokens = (input_tokens + output_tokens) as u64;
// Update total
let old_total = self.total_cost_cents
.fetch_add(cost_cents, std::sync::atomic::Ordering::SeqCst);
info!(
"API call: provider={}, tokens={}, cost={}¢ (total={}¢)",
provider,
total_tokens,
cost_cents,
old_total + cost_cents
);
// Update provider stats; entry() returns a mutable guard into the DashMap
let mut metric = self.cost_by_provider
.entry(provider.to_string())
.or_insert_with(|| ProviderCostMetric {
total_tokens: 0,
total_cost_cents: 0,
call_count: 0,
avg_cost_per_call: 0.0,
last_call: Utc::now(),
});
metric.total_tokens += total_tokens;
metric.total_cost_cents += cost_cents;
metric.call_count += 1;
metric.avg_cost_per_call = metric.total_cost_cents as f64 / metric.call_count as f64;
metric.last_call = Utc::now();
// Update hourly trend
let hour_key = Utc::now().format("%Y-%m-%dT%H").to_string();
*self.hourly_costs
.entry(hour_key)
.or_insert(0) += cost_cents;
}
/// Get full cost report
pub fn report(&self) -> CostReport {
let total = self.total_cost_cents.load(std::sync::atomic::Ordering::SeqCst);
let mut providers = Vec::new();
for entry in self.cost_by_provider.iter() {
providers.push((entry.key().clone(), entry.value().clone()));
}
CostReport {
total_cost_cents: total,
total_cost_dollars: total as f64 / 100.0,
cost_by_provider: providers,
daily_average: total as f64 / 24.0 / 100.0, // dollars per hour, assuming `total` is today's spend
monthly_projection: (total as f64 * 30.0) / 100.0, // dollars, extrapolated from today's spend
}
}
}
pub struct CostReport {
pub total_cost_cents: u32,
pub total_cost_dollars: f64,
pub cost_by_provider: Vec<(String, ProviderCostMetric)>,
pub daily_average: f64,
pub monthly_projection: f64,
}
Cost Tracking in Action
// When a task executes:
let router = LLMRouter::new();
// 1. Route task
let provider_name = router.route(context, None).await?;
// 2. Execute with selected provider
let provider = get_provider(&provider_name)?;
let response = provider.complete(prompt).await?;
// 3. Count tokens (from API response headers or estimation)
let input_tokens = estimate_tokens(prompt);
let output_tokens = estimate_tokens(&response);
// 4. Calculate cost
let cost_per_1k = provider.cost_per_1k_tokens();
let total_tokens = input_tokens + output_tokens;
let cost_cents = ((total_tokens as f64 / 1000.0) * cost_per_1k * 100.0) as u32;
// 5. Track in cost tracker
router.cost_tracker.track_api_call(
&provider_name,
input_tokens,
output_tokens,
cost_cents,
);
// 6. Generate report
let report = router.cost_tracker.report();
println!("Total spent: ${:.2}", report.total_cost_dollars);
println!("Monthly projection: ${:.2}", report.monthly_projection);
4. Budget Enforcement (Three Tiers)
Location: crates/vapora-llm-router/src/budget.rs
pub struct BudgetEnforcer {
daily_limit_cents: u32,
monthly_limit_cents: u32,
per_task_limit_cents: u32,
warn_threshold_percent: f64,
}
#[derive(Debug, Clone, Copy)]
pub enum BudgetTier {
/// Normal operation: cost < 50% of limit
Normal,
/// Caution: cost between 50%-90% of limit
NearThreshold {
percent_used: f64,
},
/// Exceeded: cost > 90% of limit (fallback to cheaper providers)
Exceeded {
percent_used: f64,
},
}
impl BudgetEnforcer {
pub fn new(daily: u32, monthly: u32, per_task: u32) -> Self {
Self {
daily_limit_cents: daily,
monthly_limit_cents: monthly,
per_task_limit_cents: per_task,
warn_threshold_percent: 90.0,
}
}
/// Check budget tier and decide action
pub fn check_budget(&self, current_spend_cents: u32) -> BudgetTier {
let percent_used = (current_spend_cents as f64 / self.daily_limit_cents as f64) * 100.0;
// Tier boundaries: below 50% Normal, 50–90% NearThreshold, above that Exceeded
if percent_used < 50.0 {
BudgetTier::Normal
} else if percent_used < 90.0 {
BudgetTier::NearThreshold { percent_used }
} else {
BudgetTier::Exceeded { percent_used }
}
}
/// Enforce budget by adjusting routing
pub fn enforce(
&self,
tier: BudgetTier,
primary_provider: &str,
) -> String {
match tier {
// Normal: use primary provider
BudgetTier::Normal => primary_provider.into(),
// Near threshold: warn and prefer cheaper option
BudgetTier::NearThreshold { percent_used } => {
warn!("Budget warning: {:.1}% used", percent_used);
match primary_provider {
"claude" | "openai" => "gemini".into(), // Fallback to cheaper
_ => primary_provider.into(),
}
}
// Exceeded: force cheapest option
BudgetTier::Exceeded { percent_used } => {
error!("Budget exceeded: {:.1}% used. Routing to Ollama", percent_used);
"ollama".into()
}
}
}
}
Three-Tier Enforcement in Router
pub async fn route_with_budget(
&self,
context: TaskContext,
) -> Result<String> {
// 1. Route normally
let primary = self.route(context.clone(), None).await?;
// 2. Check budget tier
let current_spend = self.cost_tracker.total_cost_cents();
let tier = self.budget_enforcer.check_budget(current_spend);
// 3. Enforce: may override provider based on budget
let final_provider = self.budget_enforcer.enforce(tier, &primary);
info!(
"Task {}: Primary={}, Tier={:?}, Final={}",
context.task_id, primary, tier, final_provider
);
Ok(final_provider)
}
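With the limits from config/llm-router.toml ($100/day, $2,500/month, $10/task), the tiers and the resulting re-routing look like this (a small illustrative check, not a test from the repository):

```rust
let enforcer = BudgetEnforcer::new(10_000, 250_000, 1_000);

// Daily spend of $30, $75, and $95 against the $100 daily limit:
assert!(matches!(enforcer.check_budget(3_000), BudgetTier::Normal));
assert!(matches!(enforcer.check_budget(7_500), BudgetTier::NearThreshold { .. }));
assert!(matches!(enforcer.check_budget(9_500), BudgetTier::Exceeded { .. }));

// Between 50% and 90%, expensive providers are swapped for Gemini;
// past 90%, everything is forced onto the local Ollama instance.
assert_eq!(enforcer.enforce(enforcer.check_budget(7_500), "claude"), "gemini");
assert_eq!(enforcer.enforce(enforcer.check_budget(9_500), "claude"), "ollama");
```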
5. Fallback Chain
Location: crates/vapora-llm-router/src/fallback.rs
pub struct FallbackChain {
/// Providers in fallback order
chain: Vec<String>,
/// Cost tracker for failure metrics
cost_tracker: Arc<CostTracker>,
}
impl FallbackChain {
pub fn new(chain: Vec<String>, cost_tracker: Arc<CostTracker>) -> Self {
Self { chain, cost_tracker }
}
/// Default fallback chain (by cost/quality)
pub fn default() -> Self {
Self {
chain: vec![
"claude".into(), // Primary (best)
"openai".into(), // First fallback
"gemini".into(), // Second fallback
"ollama".into(), // Last resort (always available)
],
cost_tracker: Arc::new(CostTracker::new()),
}
}
/// Execute with automatic fallback
pub async fn execute(
&self,
router: &LLMRouter,
prompt: &str,
timeout: Duration,
) -> Result<(String, String)> {
let mut last_error = None;
for (idx, provider_name) in self.chain.iter().enumerate() {
info!(
"Fallback: Attempting provider {} ({}/{})",
provider_name,
idx + 1,
self.chain.len()
);
match self.try_provider(router, provider_name, prompt, timeout).await {
Ok(response) => {
info!("✓ Success with {}", provider_name);
return Ok((provider_name.clone(), response));
}
Err(e) => {
warn!("✗ {} failed: {:?}", provider_name, e);
last_error = Some(e);
// Track failure
self.cost_tracker.record_provider_failure(provider_name);
// Continue to next
continue;
}
}
}
Err(last_error.unwrap_or_else(|| anyhow!("All providers failed")))
}
async fn try_provider(
&self,
router: &LLMRouter,
provider_name: &str,
prompt: &str,
timeout: Duration,
) -> Result<String> {
let provider = router.get_provider(provider_name)?;
tokio::time::timeout(timeout, provider.complete(prompt))
.await
.map_err(|_| anyhow!("Timeout after {:?}", timeout))?
}
}
Fallback Chain Example
#[tokio::test]
async fn test_fallback_chain() {
let router = LLMRouter::new();
let fallback = FallbackChain::default();
let (provider_used, response) = fallback.execute(
&router,
"Analyze this code: fn hello() { println!(\"world\"); }",
Duration::from_secs(30),
).await.unwrap();
println!("Used provider: {}", provider_used);
println!("Response: {}", response);
// With Claude: ~3-5 seconds
// With OpenAI: ~2-3 seconds
// With Ollama: ~2-5 seconds (local, no network)
}
6. Configuration
Location: config/llm-router.toml
# Provider definitions
[[providers]]
name = "claude"
api_key_env = "ANTHROPIC_API_KEY"
model = "claude-opus-4-5"
priority = 1
cost_per_1k_tokens = 0.015
timeout_ms = 30000
rate_limit_rpm = 1000000
[[providers]]
name = "openai"
api_key_env = "OPENAI_API_KEY"
model = "gpt-4"
priority = 2
cost_per_1k_tokens = 0.030
timeout_ms = 30000
rate_limit_rpm = 500000
[[providers]]
name = "gemini"
api_key_env = "GOOGLE_API_KEY"
model = "gemini-2.0-flash"
priority = 3
cost_per_1k_tokens = 0.005
timeout_ms = 25000
rate_limit_rpm = 100000
[[providers]]
name = "ollama"
endpoint = "http://localhost:11434"
model = "llama2"
priority = 4
cost_per_1k_tokens = 0.0
timeout_ms = 10000
rate_limit_rpm = 10000000
# Task type mappings
[[routing_rules]]
task_type = "CodeGeneration"
primary_provider = "claude"
fallback_chain = ["openai", "gemini"]
[[routing_rules]]
task_type = "CodeReview"
primary_provider = "claude"
fallback_chain = ["openai"]
[[routing_rules]]
task_type = "Documentation"
primary_provider = "openai"
fallback_chain = ["claude", "gemini"]
[[routing_rules]]
task_type = "GeneralQuery"
primary_provider = "gemini"
fallback_chain = ["ollama", "openai"]
[[routing_rules]]
task_type = "Embeddings"
primary_provider = "ollama"
fallback_chain = ["openai"]
# Budget enforcement
[budget]
daily_limit_cents = 10000 # $100 per day
monthly_limit_cents = 250000 # $2500 per month
per_task_limit_cents = 1000 # $10 per task max
warn_threshold_percent = 90.0 # Warn at 90%
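A sketch of how this file could be deserialized with serde + toml (the struct names below are illustrative; the actual config types live in the llm-router crate and are not reproduced here):

```rust
use serde::Deserialize;

#[derive(Deserialize)]
struct RouterConfig {
    providers: Vec<ProviderConfig>,
    routing_rules: Vec<RoutingRule>,
    budget: BudgetConfig,
}

#[derive(Deserialize)]
struct ProviderConfig {
    name: String,
    model: String,
    priority: u8,
    cost_per_1k_tokens: f64,
    timeout_ms: u64,
    rate_limit_rpm: u64,
    api_key_env: Option<String>, // absent for local providers (Ollama)
    endpoint: Option<String>,    // only set for Ollama
}

#[derive(Deserialize)]
struct RoutingRule {
    task_type: String,
    primary_provider: String,
    fallback_chain: Vec<String>,
}

#[derive(Deserialize)]
struct BudgetConfig {
    daily_limit_cents: u32,
    monthly_limit_cents: u32,
    per_task_limit_cents: u32,
    warn_threshold_percent: f64,
}

fn load_config(path: &str) -> anyhow::Result<RouterConfig> {
    let raw = std::fs::read_to_string(path)?;
    Ok(toml::from_str(&raw)?)
}
```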
7. Integration in Backend
Location: crates/vapora-backend/src/services/
pub struct AgentService {
llm_router: Arc<LLMRouter>,
cost_tracker: Arc<CostTracker>,
}
impl AgentService {
/// Execute agent task with LLM routing
pub async fn execute_agent_task(&self, task: &AgentTask) -> Result<String> {
let context = TaskContext {
task_id: task.id.clone(),
task_type: task.task_type.clone(),
quality_requirement: Quality::High,
budget_cents: task.budget_cents,
// Remaining fields filled with fixed values here; TaskContext has no Default impl
domain: "backend".into(),
complexity: 50,
latency_required_ms: 30_000,
};
// 1. Route to provider
let provider_name = self.llm_router
.route_with_budget(context)
.await?;
info!("Task {}: Using provider {}", task.id, provider_name);
// 2. Get provider
let provider = self.llm_router.get_provider(&provider_name)?;
// 3. Execute
let response = provider
.complete(&task.prompt)
.await?;
// 4. Track cost
let tokens = estimate_tokens(&task.prompt) + estimate_tokens(&response);
let cost = (tokens as f64 / 1000.0) * provider.cost_per_1k_tokens() * 100.0;
self.cost_tracker.track_api_call(
&provider_name,
estimate_tokens(&task.prompt),
estimate_tokens(&response),
cost as u32,
);
Ok(response)
}
/// Get cost report
pub fn cost_report(&self) -> Result<CostReport> {
Ok(self.cost_tracker.report())
}
}
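`estimate_tokens` is referenced above but not shown. When the provider response does not carry exact token counts, a rough character-based heuristic (~4 characters per token for English-like text) is a common stand-in. A minimal sketch, not the crate's actual implementation:

```rust
/// Rough token estimate: ~4 characters per token for English-like text.
/// Prefer the exact counts returned in the provider's API response when available.
fn estimate_tokens(text: &str) -> u32 {
    ((text.chars().count() + 3) / 4) as u32
}
```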
8. Metrics & Monitoring
Location: crates/vapora-backend/src/metrics.rs
lazy_static::lazy_static! {
pub static ref PROVIDER_REQUESTS: IntCounterVec = IntCounterVec::new(
Opts::new("vapora_llm_provider_requests_total", "Total LLM requests"),
&["provider", "task_type", "status"],
).unwrap();
pub static ref PROVIDER_LATENCY: HistogramVec = HistogramVec::new(
HistogramOpts::new("vapora_llm_provider_latency_seconds", "Provider latency"),
&["provider"],
).unwrap();
pub static ref PROVIDER_TOKENS: IntCounterVec = IntCounterVec::new(
Opts::new("vapora_llm_provider_tokens_total", "Tokens used"),
&["provider", "type"], // input/output
).unwrap();
pub static ref ROUTING_DECISIONS: IntCounterVec = IntCounterVec::new(
Opts::new("vapora_llm_routing_decisions_total", "Routing decisions"),
&["selected_provider", "task_type", "reason"], // rules/budget/override
).unwrap();
pub static ref FALLBACK_TRIGGERS: IntCounterVec = IntCounterVec::new(
Opts::new("vapora_llm_fallback_triggers_total", "Fallback chain activations"),
&["from_provider", "to_provider", "reason"],
).unwrap();
pub static ref BUDGET_ENFORCEMENT: IntCounterVec = IntCounterVec::new(
Opts::new("vapora_llm_budget_enforcement_total", "Budget tier changes"),
&["tier", "action"], // Normal/NearThreshold/Exceeded → ProviderChange
).unwrap();
}
pub fn record_provider_call(provider: &str, task_type: &str, status: &str) {
PROVIDER_REQUESTS
.with_label_values(&[provider, task_type, status])
.inc();
}
pub fn record_fallback(from: &str, to: &str, reason: &str) {
FALLBACK_TRIGGERS
.with_label_values(&[from, to, reason])
.inc();
}
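A sketch of how these metrics are recorded around a routed call (the surrounding variables are illustrative). Note that metrics built with `IntCounterVec::new` / `HistogramVec::new` still have to be registered with a `prometheus::Registry` before they appear on the scrape endpoint:

```rust
let start = std::time::Instant::now();
let result = provider.complete(&task.prompt).await;

// Latency is observed whether the call succeeded or not
PROVIDER_LATENCY
    .with_label_values(&["claude"])
    .observe(start.elapsed().as_secs_f64());

match &result {
    Ok(_) => record_provider_call("claude", "CodeGeneration", "ok"),
    Err(_) => {
        record_provider_call("claude", "CodeGeneration", "error");
        // The fallback chain would now try the next provider
        record_fallback("claude", "openai", "api_error");
    }
}
```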
9. Real Example: Code Generation Task
// User requests code generation for a Rust function
// 1. CREATE CONTEXT
let context = TaskContext {
task_id: "task-12345".into(),
task_type: TaskType::CodeGeneration,
domain: "backend".into(),
complexity: 75,
quality_requirement: Quality::High,
latency_required_ms: 30000,
budget_cents: Some(500), // $5 max
};
// 2. ROUTE
let provider = router.route_with_budget(context).await?;
// Decision: "claude" (matches mapping + budget OK)
// 3. EXECUTE
let claude = router.get_provider("claude")?;
let response = claude.complete(
"Write a Rust function that validates email addresses"
).await?;
// 4. TRACK
router.cost_tracker.track_api_call(
"claude",
150, // input tokens
320, // output tokens
1, // cost in cents: 470 tokens × $0.015/1k ≈ $0.007, rounded up to 1¢
);
// 5. REPORT
let report = router.cost_tracker.report();
println!("Today's total: ${:.2}", report.total_cost_dollars);
println!("Monthly projection: ${:.2}", report.monthly_projection);
Summary
| Component | Purpose | Cost handling | Calls real APIs? |
|---|---|---|---|
| LLMRouter | Routing logic | ✅ Tracks | ✅ Yes |
| LLMClient | Provider abstraction | ✅ Records | ✅ Yes |
| CostTracker | Token & cost accounting | ✅ Tracks | ✅ Yes |
| BudgetEnforcer | Three-tier limits | ✅ Enforces | N/A |
| FallbackChain | Automatic failover | ✅ Logs | ✅ Yes |
Key Insight: VAPORA tracks cost and enforces budgets regardless of which pattern (mock/SDK/custom) you're using. The router is provider-agnostic.
See llm-provider-patterns.md for implementation patterns without subscriptions.