+
+
+Version: 0.1.0
+Status: Specification (VAPORA v1.0 - Multi-Agent Multi-IA)
+Purpose: Dynamic routing system that selects the optimal LLM for each task context
+
+
+Problem:
+
+- Each task calls for a different LLM (code ≠ embeddings ≠ review)
+- Costs vary enormously (Ollama is free vs. Claude Opus $$$)
+- Availability varies (rate limits, latency)
+- Automatic fallback is required
+
+Solution: an intelligent routing system that decides which LLM to use based on (precedence sketched below):
+
+- Task context (type, domain, complexity)
+- Predefined rules (static mappings)
+- Dynamic decision (availability, cost, load)
+- Manual override (the user specifies the required LLM)
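+
+These inputs are consulted in a fixed priority order: a manual override always wins, then the static rules, then dynamic scoring. A minimal sketch of that precedence (names are for illustration only; the full logic lives in LLMRouter::route() further below):
+
+// Illustrative only: the order in which the router consults its inputs.
+pub enum RoutingSource {
+    ManualOverride,   // the user pinned a specific LLM
+    StaticMapping,    // predefined task-type → provider rules
+    DynamicSelection, // availability / cost / load scoring
+}
+
+pub const DECISION_ORDER: [RoutingSource; 3] = [
+    RoutingSource::ManualOverride,
+    RoutingSource::StaticMapping,
+    RoutingSource::DynamicSelection,
+];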
+
+
+
+
+#[derive(Debug, Clone)] // Clone: the router returns owned copies of the selected provider
+pub enum LLMProvider {
+    Claude {
+        api_key: String,
+        model: String, // "opus-4", "sonnet-4", "haiku-3"
+        max_tokens: usize,
+    },
+    OpenAI {
+        api_key: String,
+        model: String, // "gpt-4", "gpt-4-turbo", "gpt-3.5-turbo"
+        max_tokens: usize,
+    },
+    Gemini {
+        api_key: String,
+        model: String, // "gemini-2.0-pro", "gemini-pro", "gemini-flash"
+        max_tokens: usize,
+    },
+    Ollama {
+        endpoint: String, // "http://localhost:11434"
+        model: String,    // "llama3.2", "mistral", "neural-chat"
+        max_tokens: usize,
+    },
+}
+
+// async fn in a trait used as Box<dyn LLMClient> needs the async_trait crate.
+#[async_trait::async_trait]
+pub trait LLMClient: Send + Sync {
+    async fn complete(
+        &self,
+        prompt: String,
+        context: Option<String>,
+    ) -> anyhow::Result<String>;
+
+    async fn stream(
+        &self,
+        prompt: String,
+    ) -> anyhow::Result<tokio::sync::mpsc::Receiver<String>>;
+
+    fn cost_per_1k_tokens(&self) -> f64;
+    fn latency_ms(&self) -> u32;
+    fn available(&self) -> bool;
+}
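+
+A minimal sketch of how the Ollama variant could implement this trait, assuming the async_trait, reqwest (with its json feature), serde_json, and tokio crates, and Ollama's /api/generate endpoint; the latency figure and channel capacity are placeholders:
+
+pub struct OllamaClient {
+    endpoint: String, // e.g. "http://localhost:11434"
+    model: String,
+}
+
+#[async_trait::async_trait]
+impl LLMClient for OllamaClient {
+    async fn complete(
+        &self,
+        prompt: String,
+        context: Option<String>,
+    ) -> anyhow::Result<String> {
+        // Prepend optional context to the prompt.
+        let full_prompt = match context {
+            Some(ctx) => format!("{ctx}\n\n{prompt}"),
+            None => prompt,
+        };
+        let body = serde_json::json!({
+            "model": self.model,
+            "prompt": full_prompt,
+            "stream": false,
+        });
+        let resp: serde_json::Value = reqwest::Client::new()
+            .post(format!("{}/api/generate", self.endpoint))
+            .json(&body)
+            .send()
+            .await?
+            .json()
+            .await?;
+        Ok(resp["response"].as_str().unwrap_or_default().to_string())
+    }
+
+    async fn stream(
+        &self,
+        prompt: String,
+    ) -> anyhow::Result<tokio::sync::mpsc::Receiver<String>> {
+        // Streaming elided in this sketch: a real client would read Ollama's
+        // newline-delimited JSON stream and forward each chunk on the channel.
+        let (tx, rx) = tokio::sync::mpsc::channel(16);
+        let text = self.complete(prompt, None).await?;
+        let _ = tx.send(text).await;
+        Ok(rx)
+    }
+
+    fn cost_per_1k_tokens(&self) -> f64 { 0.0 } // local inference is free
+    fn latency_ms(&self) -> u32 { 200 }         // placeholder estimate
+    fn available(&self) -> bool { true }        // a real check would ping the endpoint
+}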
+
+// Eq + Hash are required because TaskType is used as a HashMap key below.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub enum TaskType {
+    // Code tasks
+    CodeGeneration,
+    CodeReview,
+    CodeRefactor,
+    UnitTest,
+    IntegrationTest,
+
+    // Analysis tasks
+    ArchitectureDesign,
+    SecurityAnalysis,
+    PerformanceAnalysis,
+
+    // Documentation
+    DocumentGeneration,
+    CodeDocumentation,
+    APIDocumentation,
+
+    // Search/RAG
+    Embeddings,
+    SemanticSearch,
+    ContextRetrieval,
+
+    // General
+    GeneralQuery,
+    Summarization,
+    Translation,
+}
+
+#[derive(Debug, Clone)]
+pub struct TaskContext {
+    pub task_type: TaskType,
+    pub domain: String,               // "backend", "frontend", "infra"
+    pub complexity: Complexity,       // Low, Medium, High, Critical
+    pub quality_requirement: Quality, // Low, Medium, High, Critical
+    pub latency_required_ms: u32,     // 500 = <500ms required
+    pub budget_cents: Option<u32>,    // Cost limit in cents per 1k tokens
+}
+
+#[derive(Debug, Clone, PartialEq, PartialOrd)]
+pub enum Complexity {
+    Low,
+    Medium,
+    High,
+    Critical,
+}
+
+#[derive(Debug, Clone, PartialEq, PartialOrd)]
+pub enum Quality {
+    Low,      // Quick & cheap
+    Medium,   // Balanced
+    High,     // Good quality
+    Critical, // Best possible
+}
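+
+The PartialOrd derives on Complexity and Quality are what let the router compare levels directly; for example (the helper below is illustrative):
+
+// Ordered comparison works because the variants are declared lowest → highest.
+fn meets_quality_floor(offered: Quality, required: Quality) -> bool {
+    offered >= required
+}
+
+// Quality::High satisfies a Medium requirement, but Low does not meet Critical.
+assert!(meets_quality_floor(Quality::High, Quality::Medium));
+assert!(!meets_quality_floor(Quality::Low, Quality::Critical));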
+
+pub struct IAMapping {
+    pub task_type: TaskType,
+    pub primary: LLMProvider,
+    pub fallback_order: Vec<LLMProvider>,
+    pub reasoning: String,
+    pub cost_estimate_per_task: f64,
+}
+
+// A static can't hold String/Vec values built at runtime, so the defaults
+// come from a constructor function (once_cell::sync::Lazy would also work).
+pub fn default_mappings() -> Vec<IAMapping> {
+    vec![
+        // Embeddings → Ollama (local, free)
+        IAMapping {
+            task_type: TaskType::Embeddings,
+            primary: LLMProvider::Ollama {
+                endpoint: "http://localhost:11434".to_string(),
+                model: "nomic-embed-text".to_string(),
+                max_tokens: 8192,
+            },
+            fallback_order: vec![
+                LLMProvider::OpenAI {
+                    api_key: "".to_string(),
+                    model: "text-embedding-3-small".to_string(),
+                    max_tokens: 8192,
+                },
+            ],
+            reasoning: "Local Ollama is free and fast for embeddings. Falls back to OpenAI if Ollama is unavailable".to_string(),
+            cost_estimate_per_task: 0.0, // free when run locally
+        },
+
+        // Code Generation → Claude Opus (highest quality)
+        IAMapping {
+            task_type: TaskType::CodeGeneration,
+            primary: LLMProvider::Claude {
+                api_key: "".to_string(),
+                model: "opus-4".to_string(),
+                max_tokens: 8000,
+            },
+            fallback_order: vec![
+                LLMProvider::OpenAI {
+                    api_key: "".to_string(),
+                    model: "gpt-4".to_string(),
+                    max_tokens: 8000,
+                },
+            ],
+            reasoning: "Claude Opus is strongest for complex code. GPT-4 as fallback".to_string(),
+            cost_estimate_per_task: 0.06, // ~6 cents per 1k tokens
+        },
+
+        // Code Review → Claude Sonnet (quality/cost balance)
+        IAMapping {
+            task_type: TaskType::CodeReview,
+            primary: LLMProvider::Claude {
+                api_key: "".to_string(),
+                model: "sonnet-4".to_string(),
+                max_tokens: 4000,
+            },
+            fallback_order: vec![
+                LLMProvider::Gemini {
+                    api_key: "".to_string(),
+                    model: "gemini-pro".to_string(),
+                    max_tokens: 4000,
+                },
+            ],
+            reasoning: "Sonnet offers the best quality/cost balance. Gemini as fallback".to_string(),
+            cost_estimate_per_task: 0.015,
+        },
+
+        // Documentation → GPT-4 (best formatting)
+        IAMapping {
+            task_type: TaskType::DocumentGeneration,
+            primary: LLMProvider::OpenAI {
+                api_key: "".to_string(),
+                model: "gpt-4".to_string(),
+                max_tokens: 4000,
+            },
+            fallback_order: vec![
+                LLMProvider::Claude {
+                    api_key: "".to_string(),
+                    model: "sonnet-4".to_string(),
+                    max_tokens: 4000,
+                },
+            ],
+            reasoning: "GPT-4 formats docs best. Claude as fallback".to_string(),
+            cost_estimate_per_task: 0.03,
+        },
+
+        // Quick Queries → Gemini Flash (speed)
+        IAMapping {
+            task_type: TaskType::GeneralQuery,
+            primary: LLMProvider::Gemini {
+                api_key: "".to_string(),
+                model: "gemini-flash-2.0".to_string(),
+                max_tokens: 1000,
+            },
+            fallback_order: vec![
+                LLMProvider::Ollama {
+                    endpoint: "http://localhost:11434".to_string(),
+                    model: "llama3.2".to_string(),
+                    max_tokens: 1000,
+                },
+            ],
+            reasoning: "Gemini Flash is very fast. Ollama as fallback".to_string(),
+            cost_estimate_per_task: 0.002,
+        },
+    ]
+}
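+
+The router consumes these defaults as a lookup table keyed by TaskType. A sketch of that conversion (build_mapping_table is an illustrative name), putting the primary first so that iteration order equals preference order:
+
+use std::collections::HashMap;
+
+fn build_mapping_table() -> HashMap<TaskType, Vec<LLMProvider>> {
+    let mut table = HashMap::new();
+    for m in default_mappings() {
+        // Primary first, then fallbacks: iteration order is preference order.
+        let mut chain = vec![m.primary];
+        chain.extend(m.fallback_order);
+        table.insert(m.task_type, chain);
+    }
+    table
+}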
+
+use std::collections::HashMap;
+
+pub struct LLMRouter {
+    pub mappings: HashMap<TaskType, Vec<LLMProvider>>,
+    pub providers: HashMap<String, Box<dyn LLMClient>>,
+    pub cost_tracker: CostTracker, // defined below
+    pub rate_limiter: RateLimiter, // assumed component holding per-provider limits
+}
+
+impl LLMRouter {
+    /// Routing decision: hybrid (rules + dynamic + override)
+    pub async fn route(
+        &mut self,
+        context: TaskContext,
+        override_llm: Option<LLMProvider>,
+    ) -> anyhow::Result<LLMProvider> {
+        // 1. If there is a manual override, use it
+        if let Some(llm) = override_llm {
+            self.cost_tracker.log_usage(&llm, &context);
+            return Ok(llm);
+        }
+
+        // 2. Fetch the predefined mappings
+        let mut candidates = self.get_mapping(&context.task_type)?;
+
+        // 3. Filter by availability (rate limits, latency)
+        candidates = self.filter_by_availability(candidates).await?;
+
+        // 4. Filter by budget, if one is set. cost_per_1k_tokens() is assumed
+        //    to delegate to the provider's registered LLMClient; *100.0
+        //    converts dollars per 1k tokens to cents per 1k tokens.
+        if let Some(budget) = context.budget_cents {
+            candidates = candidates.into_iter()
+                .filter(|llm| llm.cost_per_1k_tokens() * 100.0 <= budget as f64)
+                .collect();
+        }
+
+        // 5. Select by quality/cost/latency balance
+        let selected = self.select_optimal(candidates, &context)?;
+
+        self.cost_tracker.log_usage(&selected, &context);
+        Ok(selected)
+    }
+
+    async fn filter_by_availability(
+        &self,
+        candidates: Vec<LLMProvider>,
+    ) -> anyhow::Result<Vec<LLMProvider>> {
+        let mut available = Vec::new();
+        for llm in &candidates {
+            if self.rate_limiter.can_use(llm).await? {
+                available.push(llm.clone());
+            }
+        }
+        // If nothing passes the rate limiter, return the full candidate list
+        // so the caller can still attempt a degraded fallback.
+        Ok(if available.is_empty() { candidates } else { available })
+    }
+
+    fn select_optimal(
+        &self,
+        candidates: Vec<LLMProvider>,
+        context: &TaskContext,
+    ) -> anyhow::Result<LLMProvider> {
+        // Scoring: quality * 0.4 + cost * 0.3 + latency * 0.3
+        let best = candidates.iter().max_by(|a, b| {
+            let score_a = self.score_llm(a, context);
+            let score_b = self.score_llm(b, context);
+            // total_cmp avoids the panic partial_cmp().unwrap() hits on NaN
+            score_a.total_cmp(&score_b)
+        });
+
+        Ok(best.ok_or_else(|| anyhow::anyhow!("No LLM available"))?.clone())
+    }
+
+    fn score_llm(&self, llm: &LLMProvider, context: &TaskContext) -> f64 {
+        // NOTE: this weight comes from the request alone; a fuller version
+        // would also rate each provider's expected quality for the task.
+        let quality_score = match context.quality_requirement {
+            Quality::Critical => 1.0,
+            Quality::High => 0.9,
+            Quality::Medium => 0.7,
+            Quality::Low => 0.5,
+        };
+
+        let cost = llm.cost_per_1k_tokens();
+        let cost_score = 1.0 / (1.0 + cost); // inverse: lower cost = higher score
+
+        let latency = llm.latency_ms();
+        let latency_score = 1.0 / (1.0 + latency as f64);
+
+        quality_score * 0.4 + cost_score * 0.3 + latency_score * 0.3
+    }
+}
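+
+route() calls a get_mapping() helper that the spec does not define. A plausible sketch, assuming mappings was populated as in build_mapping_table() above:
+
+impl LLMRouter {
+    // Assumed helper: preference-ordered candidates for a task type.
+    fn get_mapping(&self, task_type: &TaskType) -> anyhow::Result<Vec<LLMProvider>> {
+        self.mappings
+            .get(task_type)
+            .cloned()
+            .ok_or_else(|| anyhow::anyhow!("no mapping for task type {:?}", task_type))
+    }
+}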
+
+pub struct CostTracker {
+    pub tasks_completed: HashMap<TaskType, u32>,
+    pub total_tokens_used: u64,
+    pub total_cost_cents: u32,
+    pub cost_by_provider: HashMap<String, u32>,
+    pub cost_by_task_type: HashMap<TaskType, u32>,
+}
+
+impl CostTracker {
+    pub fn log_usage(&mut self, llm: &LLMProvider, context: &TaskContext) {
+        let provider_name = llm.provider_name();
+        // Rough per-task estimate in cents, assuming ~1k tokens per task.
+        let cost = (llm.cost_per_1k_tokens() * 100.0) as u32;
+
+        *self.cost_by_provider.entry(provider_name).or_insert(0) += cost;
+        *self.cost_by_task_type.entry(context.task_type.clone()).or_insert(0) += cost;
+        self.total_cost_cents += cost;
+        *self.tasks_completed.entry(context.task_type.clone()).or_insert(0) += 1;
+    }
+
+    pub fn total_cost_dollars(&self) -> f64 {
+        self.total_cost_cents as f64 / 100.0 // convert cents to dollars
+    }
+
+    pub fn generate_report(&self) -> String {
+        format!(
+            "Cost Report:\n  Total: ${:.2}\n  By Provider: {:?}\n  By Task: {:?}",
+            self.total_cost_dollars(),
+            self.cost_by_provider,
+            self.cost_by_task_type
+        )
+    }
+}
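+
+log_usage() relies on a provider_name() helper that is likewise undefined in the spec; one plausible implementation matches on the provider family:
+
+impl LLMProvider {
+    // Assumed helper: a stable per-family name for cost aggregation.
+    pub fn provider_name(&self) -> String {
+        match self {
+            LLMProvider::Claude { .. } => "claude".to_string(),
+            LLMProvider::OpenAI { .. } => "openai".to_string(),
+            LLMProvider::Gemini { .. } => "gemini".to_string(),
+            LLMProvider::Ollama { .. } => "ollama".to_string(),
+        }
+    }
+}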
+
+
+
+// Automatic: uses the default mappings
+let mut router = LLMRouter::new(); // assumed constructor seeded with default_mappings()
+let llm = router.route(
+    TaskContext {
+        task_type: TaskType::CodeGeneration,
+        domain: "backend".to_string(),
+        complexity: Complexity::High,
+        quality_requirement: Quality::High,
+        latency_required_ms: 5000,
+        budget_cents: None,
+    },
+    None, // no override
+).await?;
+// Result: Claude Opus (predefined rule)
+
+// Dynamic: the router weighs availability, latency, and cost
+let mut router = LLMRouter::with_tracking(); // assumed constructor with cost tracking enabled
+let llm = router.route(
+    TaskContext {
+        task_type: TaskType::CodeReview,
+        domain: "frontend".to_string(),
+        complexity: Complexity::Medium,
+        quality_requirement: Quality::Medium,
+        latency_required_ms: 2000,
+        budget_cents: Some(20), // cap: 20 cents per 1k tokens
+    },
+    None,
+).await?;
+// The router picks Sonnet or Gemini based on availability and budget
+
+// Manual override: the user specifies exactly which LLM to use
+let llm = router.route(
+    context,
+    Some(LLMProvider::Claude {
+        api_key: "sk-...".to_string(),
+        model: "opus-4".to_string(),
+        max_tokens: 8000,
+    }),
+).await?;
+// Uses exactly what was specified and logs it in the cost tracker
+
+
+[llm_router]
+# Custom mappings (override default_mappings())
+[[llm_router.custom_mapping]]
+task_type = "CodeGeneration"
+primary_provider = "claude"
+primary_model = "opus-4"
+fallback_providers = ["openai:gpt-4"]
+
+# Available providers
+[[llm_router.providers]]
+name = "claude"
+api_key = "${ANTHROPIC_API_KEY}"
+model_variants = ["opus-4", "sonnet-4", "haiku-3"]
+rate_limit = { tokens_per_minute = 1000000 }
+
+[[llm_router.providers]]
+name = "openai"
+api_key = "${OPENAI_API_KEY}"
+model_variants = ["gpt-4", "gpt-4-turbo"]
+rate_limit = { tokens_per_minute = 500000 }
+
+[[llm_router.providers]]
+name = "gemini"
+api_key = "${GEMINI_API_KEY}"
+model_variants = ["gemini-pro", "gemini-flash-2.0"]
+
+[[llm_router.providers]]
+name = "ollama"
+endpoint = "http://localhost:11434"
+model_variants = ["llama3.2", "mistral", "neural-chat"]
+rate_limit = { tokens_per_minute = 10000000 } # local, effectively unlimited
+
+# Cost tracking
+[llm_router.cost_tracking]
+enabled = true
+warn_when_exceeds_cents = 1000 # Warn if daily cost > $10
+
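+A sketch of loading this table with serde and the toml crate; struct and field names mirror the config above, and Option fields cover the per-provider differences (api_key vs. endpoint, optional rate_limit). Note that ${...} env-var interpolation is not native to the toml crate, so expansion is assumed to happen separately:
+
+use serde::Deserialize;
+
+#[derive(Debug, Deserialize)]
+struct Config {
+    llm_router: LlmRouterConfig,
+}
+
+#[derive(Debug, Deserialize)]
+struct LlmRouterConfig {
+    #[serde(default)]
+    custom_mapping: Vec<CustomMapping>,
+    providers: Vec<ProviderConfig>,
+    cost_tracking: CostTracking,
+}
+
+#[derive(Debug, Deserialize)]
+struct CustomMapping {
+    task_type: String,
+    primary_provider: String,
+    primary_model: String,
+    fallback_providers: Vec<String>,
+}
+
+#[derive(Debug, Deserialize)]
+struct ProviderConfig {
+    name: String,
+    api_key: Option<String>,  // API providers
+    endpoint: Option<String>, // Ollama
+    model_variants: Vec<String>,
+    rate_limit: Option<RateLimit>,
+}
+
+#[derive(Debug, Deserialize)]
+struct RateLimit {
+    tokens_per_minute: u64,
+}
+
+#[derive(Debug, Deserialize)]
+struct CostTracking {
+    enabled: bool,
+    warn_when_exceeds_cents: u32,
+}
+
+fn load_config(path: &str) -> anyhow::Result<Config> {
+    Ok(toml::from_str(&std::fs::read_to_string(path)?)?)
+}
+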
+
+
+
+
+
+✅ Routing decision < 100ms
+✅ Automatic fallback works
+✅ Accurate cost tracking
+✅ Costs documented per task type
+✅ Manual override always works
+✅ Rate limiting respected
+
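+A hedged test sketch for the latency criterion, assuming a tokio test harness, the LLMRouter::new() constructor from the usage examples, and a hypothetical make_context() fixture that builds a TaskContext like those shown above:
+
+#[tokio::test]
+async fn routing_decision_under_100ms() -> anyhow::Result<()> {
+    let mut router = LLMRouter::new();
+    let start = std::time::Instant::now();
+    // Only the routing decision is timed; the LLM call itself is excluded.
+    let _llm = router.route(make_context(TaskType::CodeGeneration), None).await?;
+    assert!(start.elapsed().as_millis() < 100);
+    Ok(())
+}
+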
+Version: 0.1.0
+Status: ✅ Specification Complete (VAPORA v1.0)
+Purpose: Multi-IA routing system for agent orchestration
+
+