"""
VAPORA Multi-IA Router Configuration
Defines LLM routing rules, model mappings, cost thresholds, and fallback chains
"""

import k.api.all as k

# ===== LLM PROVIDER DEFINITIONS =====

# Catalogue of LLM providers, keyed by provider id.
# Every provider entry has the same shape:
#   name          display name of the provider
#   endpoint      base URL of the provider API
#   models        list of {name, context, cost_per_mtok} records
#   availability  "production" for hosted APIs, "local" for the Ollama instance
#   regions       region labels listed for the provider
# NOTE(review): cost_per_mtok is presumably USD per million input tokens (the
# values equal the `input` prices in cost_tracking.pricing) - confirm.
llm_providers = {
    "claude": {
        name = "Anthropic Claude"
        endpoint = "https://api.anthropic.com/v1"
        models = [
            {name = "claude-opus-4-1", context = 200000, cost_per_mtok = 15.0}
            {name = "claude-sonnet-4-20250514", context = 200000, cost_per_mtok = 3.0}
            # NOTE(review): Anthropic publishes this model as
            # "claude-3-5-haiku-20241022"; confirm the id below is accepted
            # by whatever sends it to the API.
            {name = "claude-haiku-3-5-20241022", context = 200000, cost_per_mtok = 0.80}
        ]
        availability = "production"
        regions = ["us-east-1", "us-west-2", "eu-west-1"]
    }
    "openai": {
        name = "OpenAI"
        endpoint = "https://api.openai.com/v1"
        models = [
            {name = "gpt-4-turbo", context = 128000, cost_per_mtok = 10.0}
            {name = "gpt-4o", context = 128000, cost_per_mtok = 5.0}
            {name = "gpt-3.5-turbo", context = 16384, cost_per_mtok = 0.50}
        ]
        availability = "production"
        regions = ["us-east-1", "us-west-2", "eu-west-1"]
    }
    "gemini": {
        name = "Google Gemini"
        endpoint = "https://generativelanguage.googleapis.com/v1beta"
        models = [
            {name = "gemini-2.0-pro", context = 1000000, cost_per_mtok = 10.0}
            {name = "gemini-2.0-flash", context = 1000000, cost_per_mtok = 0.075}
            {name = "gemini-1.5-pro", context = 1000000, cost_per_mtok = 1.25}
        ]
        availability = "production"
        regions = ["us-central-1"]
    }
    "ollama": {
        name = "Ollama Local"
        # In-cluster service address; plain HTTP on the Ollama port.
        endpoint = "http://ollama.vapora-system:11434"
        models = [
            {name = "llama2", context = 4096, cost_per_mtok = 0.0}
            {name = "mistral", context = 8192, cost_per_mtok = 0.0}
            {name = "neural-chat", context = 4096, cost_per_mtok = 0.0}
        ]
        availability = "local"
        regions = ["on-premise"]
    }
}

# ===== TASK CONTEXT CLASSIFIERS =====

# Per-task-type profile used to pick a model, keyed by task type.
# Every entry has the same fields:
#   complexity         "low" | "medium" | "high"
#   latency_sensitive  whether the task is flagged as latency sensitive
#   context_needs      required context size (presumably tokens - confirm)
#   quality_critical   whether the task is flagged as quality critical
#   cost_sensitive     whether the task is flagged as cost sensitive
#   recommended        model names, in the order listed; each name matches a
#                      models[].name entry in llm_providers
# Booleans use the KCL literals True / False (lowercase true / false are not
# KCL keywords and would be read as undefined identifiers).
task_classifiers = {
    "code_generation": {
        complexity = "high"
        latency_sensitive = False
        context_needs = 32000
        quality_critical = True
        cost_sensitive = False
        recommended = ["claude-opus-4-1", "gpt-4-turbo", "claude-sonnet-4-20250514"]
    }
    "code_review": {
        complexity = "medium"
        latency_sensitive = False
        context_needs = 16000
        quality_critical = True
        cost_sensitive = True
        recommended = ["claude-sonnet-4-20250514", "gpt-4o", "gemini-2.0-flash"]
    }
    "documentation": {
        complexity = "medium"
        latency_sensitive = False
        context_needs = 8000
        quality_critical = True
        cost_sensitive = True
        recommended = ["gpt-4-turbo", "gemini-1.5-pro", "claude-sonnet-4-20250514"]
    }
    "testing": {
        complexity = "medium"
        latency_sensitive = False
        context_needs = 16000
        quality_critical = True
        cost_sensitive = True
        recommended = ["claude-sonnet-4-20250514", "gpt-4o"]
    }
    "quick_query": {
        complexity = "low"
        latency_sensitive = True
        context_needs = 4000
        quality_critical = False
        cost_sensitive = True
        recommended = ["gemini-2.0-flash", "gpt-3.5-turbo", "llama2"]
    }
    "embeddings": {
        complexity = "low"
        latency_sensitive = True
        context_needs = 512
        quality_critical = False
        cost_sensitive = True
        # Bare model name, like every other list here ("llama2" above); the
        # provider-prefixed form "ollama/neural-chat" matched no models[].name.
        recommended = ["neural-chat"]
    }
    "summarization": {
        complexity = "medium"
        latency_sensitive = False
        context_needs = 32000
        quality_critical = True
        cost_sensitive = True
        recommended = ["claude-sonnet-4-20250514", "gemini-2.0-flash"]
    }
    "real_time_monitoring": {
        complexity = "low"
        latency_sensitive = True
        context_needs = 2000
        quality_critical = False
        cost_sensitive = True
        recommended = ["gemini-2.0-flash", "gpt-3.5-turbo"]
    }
}

# ===== DEFAULT LLM MAPPINGS =====

# Default model per agent role.
# Every entry has the same fields:
#   agent_role        role the mapping applies to
#   task_type         a task_classifiers key, or "*" for all tasks
#   default_llm       model used by default
#   fallback          alternative models, in the order listed
#   override_allowed  whether the default may be overridden
# Booleans use the KCL literals True / False (lowercase true / false are not
# KCL keywords and would be read as undefined identifiers).
default_mappings = [
    {
        agent_role = "Architect"
        task_type = "*"  # All tasks
        default_llm = "claude-opus-4-1"
        fallback = ["gpt-4-turbo"]
        override_allowed = False  # Critical decisions
    }
    {
        agent_role = "Developer"
        task_type = "code_generation"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o", "claude-opus-4-1"]
        override_allowed = True
    }
    {
        agent_role = "CodeReviewer"
        task_type = "code_review"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o", "gemini-2.0-flash"]
        override_allowed = True
    }
    {
        agent_role = "Tester"
        task_type = "testing"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o"]
        override_allowed = True
    }
    {
        agent_role = "Documenter"
        task_type = "documentation"
        default_llm = "gpt-4-turbo"
        fallback = ["claude-sonnet-4-20250514", "gemini-1.5-pro"]
        override_allowed = True
    }
    {
        agent_role = "Marketer"
        task_type = "*"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o"]
        override_allowed = True
    }
    {
        agent_role = "Monitor"
        task_type = "real_time_monitoring"
        default_llm = "gemini-2.0-flash"
        fallback = ["gpt-3.5-turbo"]
        override_allowed = False  # Must be fast
    }
    {
        agent_role = "Security"
        task_type = "*"
        default_llm = "claude-opus-4-1"
        fallback = ["gpt-4-turbo"]
        override_allowed = False  # Critical security
    }
]

# ===== COST TRACKING CONFIGURATION =====

# Spend limits, per-role budgets and the per-model price table.
# Thresholds are in dollars (see the inline comments on the daily limits).
# `enabled` uses the KCL literal True (lowercase true is not a KCL keyword).
cost_tracking = {
    enabled = True
    daily_warn_threshold = 5000  # Warn if daily cost > $5000
    daily_hard_limit = 10000  # Hard stop if daily cost > $10000
    monthly_warn_threshold = 100000
    monthly_hard_limit = 150000

    # Cost allocation by agent role
    budget_per_agent = {
        "Architect": {daily = 500, monthly = 10000}
        "Developer": {daily = 2000, monthly = 40000}
        "CodeReviewer": {daily = 1000, monthly = 20000}
        "Tester": {daily = 800, monthly = 16000}
        "Documenter": {daily = 300, monthly = 6000}
        "Security": {daily = 500, monthly = 10000}
        "Monitor": {daily = 100, monthly = 2000}
        "Other": {daily = 800, monthly = 16000}
    }

    # Price tracking
    # One entry per model declared in llm_providers, so every model has a
    # price. `input` equals that model's cost_per_mtok.
    pricing = {
        "claude-opus-4-1": {input = 15.0, output = 75.0}
        "claude-sonnet-4-20250514": {input = 3.0, output = 15.0}
        # NOTE(review): output price taken from the vendor price list - confirm.
        "claude-haiku-3-5-20241022": {input = 0.80, output = 4.0}
        "gpt-4-turbo": {input = 10.0, output = 30.0}
        "gpt-4o": {input = 5.0, output = 15.0}
        "gpt-3.5-turbo": {input = 0.50, output = 1.50}
        "gemini-2.0-pro": {input = 10.0, output = 30.0}
        "gemini-2.0-flash": {input = 0.075, output = 0.30}
        # NOTE(review): output price taken from the vendor price list - confirm.
        "gemini-1.5-pro": {input = 1.25, output = 5.0}
        # Local Ollama models are declared with cost_per_mtok = 0.0.
        "llama2": {input = 0.0, output = 0.0}
        "mistral": {input = 0.0, output = 0.0}
        "neural-chat": {input = 0.0, output = 0.0}
    }
}

# ===== LATENCY AND PERFORMANCE TARGETS =====

# Latency targets per task type, as p50 / p95 / p99 values in milliseconds.
# NOTE(review): only four of the eight task_classifiers keys have targets
# here (documentation, testing, embeddings and summarization do not) -
# confirm that is intended.
performance_targets = {
    "code_generation": {p50 = 5000, p95 = 15000, p99 = 30000}  # milliseconds
    "code_review": {p50 = 3000, p95 = 10000, p99 = 20000}
    "quick_query": {p50 = 500, p95 = 2000, p99 = 5000}
    "real_time_monitoring": {p50 = 200, p95 = 1000, p99 = 2000}
}

# ===== CIRCUIT BREAKER SETTINGS =====

# Circuit-breaker parameters per provider, keyed by the llm_providers id.
#   failure_threshold    consecutive errors before the breaker fails
#   timeout_threshold    request timeout in milliseconds (60000 = 60s)
#   half_open_max_calls  call limit for the half-open state
#   reset_timeout        presumably milliseconds as well - confirm
circuit_breakers = {
    "claude": {
        failure_threshold = 5  # Fail after 5 consecutive errors
        timeout_threshold = 60000  # 60s timeout
        half_open_max_calls = 3
        reset_timeout = 30000
    }
    "openai": {
        failure_threshold = 5
        timeout_threshold = 45000
        half_open_max_calls = 3
        reset_timeout = 30000
    }
    "gemini": {
        failure_threshold = 5
        timeout_threshold = 30000
        half_open_max_calls = 3
        reset_timeout = 30000
    }
    "ollama": {
        failure_threshold = 3  # Local failures more critical
        timeout_threshold = 15000
        half_open_max_calls = 5
        reset_timeout = 10000
    }
}

# ===== ROUTING RULES =====

# Routing rules as condition/action pairs. Both fields are plain strings;
# nothing in this file evaluates them.
# NOTE(review): whether list order is significant (first match wins?) is not
# shown here - confirm with the consumer.
routing_rules = [
    {
        condition = "task.complexity == high && cost < 1000"
        action = "use_claude_opus"
    }
    {
        condition = "task.latency_sensitive == true"
        action = "use_fastest_available"
    }
    {
        condition = "task.cost_sensitive == true && daily_cost > 4000"
        action = "use_ollama_or_cheap"
    }
    {
        condition = "provider_status[claude] == down"
        action = "fallback_to_gpt4"
    }
    {
        condition = "time_of_day == peak_hours && usage_high"
        action = "load_balance_all_providers"
    }
]

# ===== MONITORING AND ALERTING =====

# Metric collection switches, retention period and alert definitions.
# Each alert has a name, a condition string, a severity ("warning" or
# "critical") and a list of action names. Conditions and actions are plain
# strings; nothing in this file evaluates them.
# Booleans use the KCL literal True (lowercase true is not a KCL keyword and
# would be read as an undefined identifier).
monitoring = {
    track_latencies = True
    track_costs = True
    track_failures = True
    track_token_usage = True

    metrics_retention = 30  # days

    alerts = [
        {
            name = "high_daily_cost"
            condition = "cost_today > 5000"
            severity = "warning"
            actions = ["notify_ops", "switch_to_cheap_provider"]
        }
        {
            name = "provider_down"
            condition = "provider_status == down"
            severity = "critical"
            actions = ["failover", "notify_ops"]
        }
        {
            name = "high_latency"
            condition = "p95_latency > performance_target * 2"
            severity = "warning"
            actions = ["notify_team", "consider_load_rebalance"]
        }
        {
            name = "budget_exceeded"
            condition = "monthly_cost > monthly_hard_limit"
            severity = "critical"
            actions = ["stop_new_requests", "notify_management"]
        }
    ]
}

# ===== OUTPUT =====

# Single aggregate value that bundles every section defined above under one
# key each. `llm_providers` and `task_classifiers` are exposed under the
# shorter keys `providers` and `classifiers`; the rest keep their names.
output = {
    providers = llm_providers
    classifiers = task_classifiers
    default_mappings = default_mappings
    cost_tracking = cost_tracking
    performance_targets = performance_targets
    circuit_breakers = circuit_breakers
    routing_rules = routing_rules
    monitoring = monitoring
}