
feat: Phase 5.3 - Multi-Agent Learning Infrastructure

Implement intelligent agent learning from Knowledge Graph execution history
with per-task-type expertise tracking, recency bias, and learning curves.

## Phase 5.3 Implementation

### Learning Infrastructure (✅ Complete)

- LearningProfileService with per-task-type expertise metrics
- TaskTypeExpertise model tracking success_rate, confidence, learning curves
- Recency bias weighting: recent 7 days weighted 3x higher (exponential decay)
- Confidence scoring prevents overfitting: min(1.0, executions / 20)
- Learning curves computed from daily execution windows

### Agent Scoring Service (✅ Complete)

- Unified AgentScore combining SwarmCoordinator + learning profiles
- Scoring formula: 0.3*base + 0.5*expertise + 0.2*confidence
- Rank agents by combined score for intelligent assignment
- Support for recency-biased scoring (recent_success_rate)
- Methods: rank_agents, select_best, rank_agents_with_recency
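A minimal sketch of the expertise and scoring math above, assuming the
constants quoted in this message; the real implementations live in
vapora-knowledge-graph/src/learning.rs and vapora-agents/src/scoring.rs, and
the function names here are illustrative:

```rust
/// Recency weight under one consistent reading of "7-day half-life":
/// w = 0.5^(age/7), so a fresh execution counts roughly 3x more than
/// one about 11 days old.
fn recency_weight(age_days: f64) -> f64 {
    0.5_f64.powf(age_days / 7.0)
}

/// Recency-biased success rate over (succeeded, age_days) samples.
fn recent_success_rate(executions: &[(bool, f64)]) -> f64 {
    let (hits, total) = executions.iter().fold((0.0, 0.0), |(h, t), &(ok, age)| {
        let w = recency_weight(age);
        (h + if ok { w } else { 0.0 }, t + w)
    });
    if total > 0.0 { hits / total } else { 0.0 }
}

/// Confidence: min(1.0, executions / 20) prevents overfitting to few samples.
fn confidence(total_executions: u32) -> f64 {
    (f64::from(total_executions) / 20.0).min(1.0)
}

/// Combined score: 30% base load, 50% task-type expertise, 20% confidence.
fn combined_score(base: f64, expertise: f64, confidence: f64) -> f64 {
    0.3 * base + 0.5 * expertise + 0.2 * confidence
}
```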
### KG Integration (✅ Complete)

- KGPersistence::get_executions_for_task_type() - query by agent + task type
- KGPersistence::get_agent_executions() - all executions for an agent
- Coordinator::load_learning_profile_from_kg() - core KG→Learning integration
- Coordinator::load_all_learning_profiles() - batch load for multiple agents
- Convert PersistedExecution → ExecutionData for learning calculations

### Agent Assignment Integration (✅ Complete)

- AgentCoordinator uses learning profiles for task assignment
- extract_task_type() infers task type from title/description (a sketch of
  this inference appears at the end of this message)
- assign_task() scores candidates using AgentScoringService
- Fallback to load-based selection if no learning data is available
- Learning profiles stored in coordinator.learning_profiles RwLock

### Profile Adapter Enhancements (✅ Complete)

- create_learning_profile() - initialize empty profiles
- add_task_type_expertise() - set task-type expertise
- update_profile_with_learning() - update swarm profiles from learning

## Files Modified

### vapora-knowledge-graph/src/persistence.rs (+30 lines)

- get_executions_for_task_type(agent_id, task_type, limit)
- get_agent_executions(agent_id, limit)

### vapora-agents/src/coordinator.rs (+100 lines)

- load_learning_profile_from_kg() - core KG integration method
- load_all_learning_profiles() - batch loading for agents
- assign_task() already uses learning-based scoring via AgentScoringService

### Existing Complete Implementation

- vapora-knowledge-graph/src/learning.rs - calculation functions
- vapora-agents/src/learning_profile.rs - data structures and expertise
- vapora-agents/src/scoring.rs - unified scoring service
- vapora-agents/src/profile_adapter.rs - adapter methods

## Tests Passing

- learning_profile: 7 tests ✅
- scoring: 5 tests ✅
- profile_adapter: 6 tests ✅
- coordinator: learning-specific tests ✅

## Data Flow

1. Task arrives → AgentCoordinator::assign_task()
2. Extract task_type from description
3. Query KG for task-type executions (load_learning_profile_from_kg)
4. Calculate expertise with recency bias
5. Score candidates (SwarmCoordinator + learning)
6. Assign to top-scored agent
7. Execution result → KG → Update learning profiles

## Key Design Decisions

- ✅ Recency bias: 7-day half-life with 3x weight for recent performance
- ✅ Confidence scoring: min(1.0, total_executions / 20) prevents overfitting
- ✅ Hierarchical scoring: 30% base load, 50% expertise, 20% confidence
- ✅ KG query limit: 100 recent executions per task-type for performance
- ✅ Async loading: load_learning_profile_from_kg supports concurrent loads

## Next: Phase 5.4 - Cost Optimization

Ready to implement budget enforcement and cost-aware provider selection.
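A hypothetical sketch of the keyword-based inference behind
extract_task_type(); the actual keyword table lives in
vapora-agents/src/coordinator.rs and may differ:

```rust
/// Infer a coarse task type from a task's title and description.
/// The keyword table here is illustrative only.
fn extract_task_type(title: &str, description: &str) -> &'static str {
    let text = format!("{title} {description}").to_lowercase();
    for (keyword, task_type) in [
        ("refactor", "refactoring"),
        ("test", "testing"),
        ("doc", "documentation"),
        ("bug", "bugfix"),
        ("review", "code_review"),
    ] {
        if text.contains(keyword) {
            return task_type;
        }
    }
    // No keyword matched: fall back to a generic bucket
    "general"
}
```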
2026-01-11 13:03:53 +00:00
use prometheus::{GaugeVec, IntCounterVec, Registry};
use std::sync::Arc;

/// Prometheus metrics for cost tracking and budget enforcement.
/// Exposes budget utilization, spending, and fallback events.
pub struct CostMetrics {
    /// Remaining budget per role in cents (gauge)
    pub budget_remaining_cents: GaugeVec,
    /// Budget utilization per role (0.0-1.0) (gauge)
    pub budget_utilization: GaugeVec,
    /// Cost per provider in cents (counter)
    pub cost_per_provider_cents: IntCounterVec,
    /// Fallback triggered events with reason (counter)
    pub fallback_triggered_total: IntCounterVec,
    /// Total tokens used per provider (counter)
    pub tokens_per_provider: IntCounterVec,
}
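
// Example scrape output once values have been recorded (illustrative values;
// the format is the standard Prometheus text exposition):
//
//   vapora_llm_budget_remaining_cents{role="developer"} 25000
//   vapora_llm_budget_utilization{role="developer"} 0.167
//   vapora_llm_cost_per_provider_cents{provider="claude"} 800
//   vapora_llm_tokens_per_provider{provider="claude",token_type="input"} 5000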

impl CostMetrics {
    /// Create new cost metrics collection (registers with default global registry)
    pub fn new() -> Result<Arc<Self>, prometheus::Error> {
        let registry = prometheus::default_registry();
        Self::with_registry(registry)
    }
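
    // Note: registering the same metric names twice on a registry returns
    // prometheus::Error::AlreadyReg, so `new()` should be called once per
    // process and the returned Arc shared.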

    /// Create metrics with an existing registry
    pub fn with_registry(registry: &Registry) -> Result<Arc<Self>, prometheus::Error> {
        let budget_remaining_cents = GaugeVec::new(
            prometheus::Opts::new(
                "vapora_llm_budget_remaining_cents",
                "Remaining budget for agent role in cents",
            ),
            &["role"],
        )?;
        registry.register(Box::new(budget_remaining_cents.clone()))?;

        let budget_utilization = GaugeVec::new(
            prometheus::Opts::new(
                "vapora_llm_budget_utilization",
                "Budget utilization percentage for agent role (0.0-1.0)",
            ),
            &["role"],
        )?;
        registry.register(Box::new(budget_utilization.clone()))?;

        let cost_per_provider_cents = IntCounterVec::new(
            prometheus::Opts::new(
                "vapora_llm_cost_per_provider_cents",
                "Total cost per provider in cents",
            ),
            &["provider"],
        )?;
        registry.register(Box::new(cost_per_provider_cents.clone()))?;

        let fallback_triggered_total = IntCounterVec::new(
            prometheus::Opts::new(
                "vapora_llm_fallback_triggered_total",
                "Total times fallback provider was triggered",
            ),
            &["role", "reason"],
        )?;
        registry.register(Box::new(fallback_triggered_total.clone()))?;

        let tokens_per_provider = IntCounterVec::new(
            prometheus::Opts::new(
                "vapora_llm_tokens_per_provider",
                "Total tokens processed per provider",
            ),
            &["provider", "token_type"],
        )?;
        registry.register(Box::new(tokens_per_provider.clone()))?;

        Ok(Arc::new(Self {
            budget_remaining_cents,
            budget_utilization,
            cost_per_provider_cents,
            fallback_triggered_total,
            tokens_per_provider,
        }))
    }

    /// Record a budget update for a role
    pub fn record_budget_update(&self, role: &str, remaining_cents: u32, utilization: f64) {
        self.budget_remaining_cents
            .with_label_values(&[role])
            .set(remaining_cents as f64);
        self.budget_utilization
            .with_label_values(&[role])
            .set(utilization);
    }

    /// Record cost for a provider
    pub fn record_provider_cost(&self, provider: &str, cost_cents: u32) {
        self.cost_per_provider_cents
            .with_label_values(&[provider])
            .inc_by(cost_cents as u64);
    }

    /// Record a fallback provider activation
    pub fn record_fallback_triggered(&self, role: &str, reason: &str) {
        self.fallback_triggered_total
            .with_label_values(&[role, reason])
            .inc();
    }

    /// Record tokens used per provider
    pub fn record_tokens(&self, provider: &str, input_tokens: u64, output_tokens: u64) {
        self.tokens_per_provider
            .with_label_values(&[provider, "input"])
            .inc_by(input_tokens);
        self.tokens_per_provider
            .with_label_values(&[provider, "output"])
            .inc_by(output_tokens);
    }
}
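
// A sketch (not part of the original module) of exporting these metrics in
// the Prometheus text format, e.g. from a /metrics HTTP handler. TextEncoder,
// Encoder, and Registry::gather are standard prometheus crate APIs; the
// function itself is illustrative.
#[allow(dead_code)]
fn encode_metrics(registry: &Registry) -> Result<String, prometheus::Error> {
    use prometheus::{Encoder, TextEncoder};

    let mut buffer = Vec::new();
    TextEncoder::new().encode(&registry.gather(), &mut buffer)?;
    Ok(String::from_utf8(buffer).expect("text exposition format is valid UTF-8"))
}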

#[cfg(test)]
mod tests {
    use super::*;

    fn create_test_metrics() -> Arc<CostMetrics> {
        let registry = Registry::new();
        CostMetrics::with_registry(&registry).expect("Failed to create test metrics")
    }

    #[test]
    fn test_cost_metrics_creation() {
        let registry = Registry::new();
        let metrics = CostMetrics::with_registry(&registry);
        assert!(metrics.is_ok());
    }

    #[test]
    fn test_record_budget_update() {
        let metrics = create_test_metrics();
        metrics.record_budget_update("developer", 25000, 0.167);
        // Metric recorded (would verify via Prometheus gather; see the sketch test below)
    }

    #[test]
    fn test_record_provider_cost() {
        let metrics = create_test_metrics();
        metrics.record_provider_cost("claude", 500);
        metrics.record_provider_cost("claude", 300);
        // Counter incremented by 800 total
    }

    #[test]
    fn test_record_fallback_triggered() {
        let metrics = create_test_metrics();
        metrics.record_fallback_triggered("developer", "budget_exceeded");
        metrics.record_fallback_triggered("architect", "budget_exceeded");
        metrics.record_fallback_triggered("developer", "budget_near_threshold");
        // Multiple fallback events recorded
    }

    #[test]
    fn test_record_tokens() {
        let metrics = create_test_metrics();
        metrics.record_tokens("claude", 5000, 1000);
        metrics.record_tokens("gpt4", 3000, 500);
        // Token counts recorded per provider
    }
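
    // Sketch of the gather-based verification the comments above allude to:
    // read the metric families back from the registry and assert on the
    // accumulated counter value (proto accessors are standard prometheus APIs).
    #[test]
    fn test_provider_cost_visible_via_gather() {
        let registry = Registry::new();
        let metrics = CostMetrics::with_registry(&registry).expect("metrics");
        metrics.record_provider_cost("claude", 500);
        metrics.record_provider_cost("claude", 300);

        let family = registry
            .gather()
            .into_iter()
            .find(|mf| mf.get_name() == "vapora_llm_cost_per_provider_cents")
            .expect("metric family should be registered");
        let value = family.get_metric()[0].get_counter().get_value();
        assert_eq!(value as u64, 800);
    }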
}