""" VAPORA Multi-IA Router Configuration Defines LLM routing rules, model mappings, cost thresholds, and fallback chains """ import k.api.all as k # ===== LLM PROVIDER DEFINITIONS ===== llm_providers = { "claude": { name = "Anthropic Claude" endpoint = "https://api.anthropic.com/v1" models = [ {name = "claude-opus-4-1", context = 200000, cost_per_mtok = 15.0} {name = "claude-sonnet-4-20250514", context = 200000, cost_per_mtok = 3.0} {name = "claude-haiku-3-5-20241022", context = 200000, cost_per_mtok = 0.80} ] availability = "production" regions = ["us-east-1", "us-west-2", "eu-west-1"] } "openai": { name = "OpenAI" endpoint = "https://api.openai.com/v1" models = [ {name = "gpt-4-turbo", context = 128000, cost_per_mtok = 10.0} {name = "gpt-4o", context = 128000, cost_per_mtok = 5.0} {name = "gpt-3.5-turbo", context = 16384, cost_per_mtok = 0.50} ] availability = "production" regions = ["us-east-1", "us-west-2", "eu-west-1"] } "gemini": { name = "Google Gemini" endpoint = "https://generativelanguage.googleapis.com/v1beta" models = [ {name = "gemini-2.0-pro", context = 1000000, cost_per_mtok = 10.0} {name = "gemini-2.0-flash", context = 1000000, cost_per_mtok = 0.075} {name = "gemini-1.5-pro", context = 1000000, cost_per_mtok = 1.25} ] availability = "production" regions = ["us-central-1"] } "ollama": { name = "Ollama Local" endpoint = "http://ollama.vapora-system:11434" models = [ {name = "llama2", context = 4096, cost_per_mtok = 0.0} {name = "mistral", context = 8192, cost_per_mtok = 0.0} {name = "neural-chat", context = 4096, cost_per_mtok = 0.0} ] availability = "local" regions = ["on-premise"] } } # ===== TASK CONTEXT CLASSIFIERS ===== task_classifiers = { "code_generation": { complexity = "high" latency_sensitive = false context_needs = 32000 quality_critical = true cost_sensitive = false recommended = ["claude-opus-4-1", "gpt-4-turbo", "claude-sonnet-4-20250514"] } "code_review": { complexity = "medium" latency_sensitive = false context_needs = 16000 quality_critical = true cost_sensitive = true recommended = ["claude-sonnet-4-20250514", "gpt-4o", "gemini-2.0-flash"] } "documentation": { complexity = "medium" latency_sensitive = false context_needs = 8000 quality_critical = true cost_sensitive = true recommended = ["gpt-4-turbo", "gemini-1.5-pro", "claude-sonnet-4-20250514"] } "testing": { complexity = "medium" latency_sensitive = false context_needs = 16000 quality_critical = true cost_sensitive = true recommended = ["claude-sonnet-4-20250514", "gpt-4o"] } "quick_query": { complexity = "low" latency_sensitive = true context_needs = 4000 quality_critical = false cost_sensitive = true recommended = ["gemini-2.0-flash", "gpt-3.5-turbo", "llama2"] } "embeddings": { complexity = "low" latency_sensitive = true context_needs = 512 quality_critical = false cost_sensitive = true recommended = ["ollama/neural-chat"] } "summarization": { complexity = "medium" latency_sensitive = false context_needs = 32000 quality_critical = true cost_sensitive = true recommended = ["claude-sonnet-4-20250514", "gemini-2.0-flash"] } "real_time_monitoring": { complexity = "low" latency_sensitive = true context_needs = 2000 quality_critical = false cost_sensitive = true recommended = ["gemini-2.0-flash", "gpt-3.5-turbo"] } } # ===== DEFAULT LLM MAPPINGS ===== default_mappings = [ { agent_role = "Architect" task_type = "*" # All tasks default_llm = "claude-opus-4-1" fallback = ["gpt-4-turbo"] override_allowed = false # Critical decisions } { agent_role = "Developer" task_type = "code_generation" default_llm = 
"claude-sonnet-4-20250514" fallback = ["gpt-4o", "claude-opus-4-1"] override_allowed = true } { agent_role = "CodeReviewer" task_type = "code_review" default_llm = "claude-sonnet-4-20250514" fallback = ["gpt-4o", "gemini-2.0-flash"] override_allowed = true } { agent_role = "Tester" task_type = "testing" default_llm = "claude-sonnet-4-20250514" fallback = ["gpt-4o"] override_allowed = true } { agent_role = "Documenter" task_type = "documentation" default_llm = "gpt-4-turbo" fallback = ["claude-sonnet-4-20250514", "gemini-1.5-pro"] override_allowed = true } { agent_role = "Marketer" task_type = "*" default_llm = "claude-sonnet-4-20250514" fallback = ["gpt-4o"] override_allowed = true } { agent_role = "Monitor" task_type = "real_time_monitoring" default_llm = "gemini-2.0-flash" fallback = ["gpt-3.5-turbo"] override_allowed = false # Must be fast } { agent_role = "Security" task_type = "*" default_llm = "claude-opus-4-1" fallback = ["gpt-4-turbo"] override_allowed = false # Critical security } ] # ===== COST TRACKING CONFIGURATION ===== cost_tracking = { enabled = true daily_warn_threshold = 5000 # Warn if daily cost > $5000 daily_hard_limit = 10000 # Hard stop if daily cost > $10000 monthly_warn_threshold = 100000 monthly_hard_limit = 150000 # Cost allocation by agent role budget_per_agent = { "Architect": {daily = 500, monthly = 10000} "Developer": {daily = 2000, monthly = 40000} "CodeReviewer": {daily = 1000, monthly = 20000} "Tester": {daily = 800, monthly = 16000} "Documenter": {daily = 300, monthly = 6000} "Security": {daily = 500, monthly = 10000} "Monitor": {daily = 100, monthly = 2000} "Other": {daily = 800, monthly = 16000} } # Price tracking pricing = { "claude-opus-4-1": {input = 15.0, output = 75.0} "claude-sonnet-4-20250514": {input = 3.0, output = 15.0} "gpt-4-turbo": {input = 10.0, output = 30.0} "gpt-4o": {input = 5.0, output = 15.0} "gpt-3.5-turbo": {input = 0.50, output = 1.50} "gemini-2.0-pro": {input = 10.0, output = 30.0} "gemini-2.0-flash": {input = 0.075, output = 0.30} } } # ===== LATENCY AND PERFORMANCE TARGETS ===== performance_targets = { "code_generation": {p50 = 5000, p95 = 15000, p99 = 30000} # milliseconds "code_review": {p50 = 3000, p95 = 10000, p99 = 20000} "quick_query": {p50 = 500, p95 = 2000, p99 = 5000} "real_time_monitoring": {p50 = 200, p95 = 1000, p99 = 2000} } # ===== CIRCUIT BREAKER SETTINGS ===== circuit_breakers = { "claude": { failure_threshold = 5 # Fail after 5 consecutive errors timeout_threshold = 60000 # 60s timeout half_open_max_calls = 3 reset_timeout = 30000 } "openai": { failure_threshold = 5 timeout_threshold = 45000 half_open_max_calls = 3 reset_timeout = 30000 } "gemini": { failure_threshold = 5 timeout_threshold = 30000 half_open_max_calls = 3 reset_timeout = 30000 } "ollama": { failure_threshold = 3 # Local failures more critical timeout_threshold = 15000 half_open_max_calls = 5 reset_timeout = 10000 } } # ===== ROUTING RULES ===== routing_rules = [ { condition = "task.complexity == high && cost < 1000" action = "use_claude_opus" } { condition = "task.latency_sensitive == true" action = "use_fastest_available" } { condition = "task.cost_sensitive == true && daily_cost > 4000" action = "use_ollama_or_cheap" } { condition = "provider_status[claude] == down" action = "fallback_to_gpt4" } { condition = "time_of_day == peak_hours && usage_high" action = "load_balance_all_providers" } ] # ===== MONITORING AND ALERTING ===== monitoring = { track_latencies = true track_costs = true track_failures = true track_token_usage = true 
# ===== MONITORING AND ALERTING =====

monitoring = {
    track_latencies = true
    track_costs = true
    track_failures = true
    track_token_usage = true
    metrics_retention = 30  # days

    alerts = [
        {
            name = "high_daily_cost"
            condition = "cost_today > 5000"
            severity = "warning"
            actions = ["notify_ops", "switch_to_cheap_provider"]
        }
        {
            name = "provider_down"
            condition = "provider_status == down"
            severity = "critical"
            actions = ["failover", "notify_ops"]
        }
        {
            name = "high_latency"
            condition = "p95_latency > performance_target * 2"
            severity = "warning"
            actions = ["notify_team", "consider_load_rebalance"]
        }
        {
            name = "budget_exceeded"
            condition = "monthly_cost > monthly_hard_limit"
            severity = "critical"
            actions = ["stop_new_requests", "notify_management"]
        }
    ]
}

# ===== OUTPUT =====

output = {
    providers = llm_providers
    classifiers = task_classifiers
    default_mappings = default_mappings
    cost_tracking = cost_tracking
    performance_targets = performance_targets
    circuit_breakers = circuit_breakers
    routing_rules = routing_rules
    monitoring = monitoring
}
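# Hypothetical per-environment override (commented out; `staging_output` and
# the `|` merge operator are illustrative assumptions about the configuration
# language, not features confirmed by this file): a deployment could take
# `output` as its base and layer a tighter staging budget on top.
# staging_output = output | {
#     cost_tracking = cost_tracking | {daily_hard_limit = 1000}  # tighter staging cap
# }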