Implement intelligent agent learning from Knowledge Graph execution history with per-task-type expertise tracking, recency bias, and learning curves.

## Phase 5.3 Implementation

### Learning Infrastructure (✅ Complete)
- LearningProfileService with per-task-type expertise metrics
- TaskTypeExpertise model tracking success_rate, confidence, learning curves
- Recency bias weighting: recent 7 days weighted 3x higher (exponential decay)
- Confidence scoring prevents overfitting: min(1.0, executions / 20)
- Learning curves computed from daily execution windows

### Agent Scoring Service (✅ Complete)
- Unified AgentScore combining SwarmCoordinator + learning profiles
- Scoring formula: 0.3*base + 0.5*expertise + 0.2*confidence
- Rank agents by combined score for intelligent assignment
- Support for recency-biased scoring (recent_success_rate)
- Methods: rank_agents, select_best, rank_agents_with_recency

### KG Integration (✅ Complete)
- KGPersistence::get_executions_for_task_type() - query by agent + task type
- KGPersistence::get_agent_executions() - all executions for agent
- Coordinator::load_learning_profile_from_kg() - core KG→Learning integration
- Coordinator::load_all_learning_profiles() - batch load for multiple agents
- Convert PersistedExecution → ExecutionData for learning calculations

### Agent Assignment Integration (✅ Complete)
- AgentCoordinator uses learning profiles for task assignment
- extract_task_type() infers task type from title/description
- assign_task() scores candidates using AgentScoringService
- Fallback to load-based selection if no learning data available
- Learning profiles stored in coordinator.learning_profiles RwLock

### Profile Adapter Enhancements (✅ Complete)
- create_learning_profile() - initialize empty profiles
- add_task_type_expertise() - set task-type expertise
- update_profile_with_learning() - update swarm profiles from learning

## Files Modified

### vapora-knowledge-graph/src/persistence.rs (+30 lines)
- get_executions_for_task_type(agent_id, task_type, limit)
- get_agent_executions(agent_id, limit)

### vapora-agents/src/coordinator.rs (+100 lines)
- load_learning_profile_from_kg() - core KG integration method
- load_all_learning_profiles() - batch loading for agents
- assign_task() already uses learning-based scoring via AgentScoringService

### Existing Complete Implementation
- vapora-knowledge-graph/src/learning.rs - calculation functions
- vapora-agents/src/learning_profile.rs - data structures and expertise
- vapora-agents/src/scoring.rs - unified scoring service
- vapora-agents/src/profile_adapter.rs - adapter methods

## Tests Passing
- learning_profile: 7 tests ✅
- scoring: 5 tests ✅
- profile_adapter: 6 tests ✅
- coordinator: learning-specific tests ✅

## Data Flow
1. Task arrives → AgentCoordinator::assign_task()
2. Extract task_type from description
3. Query KG for task-type executions (load_learning_profile_from_kg)
4. Calculate expertise with recency bias
5. Score candidates (SwarmCoordinator + learning)
6. Assign to top-scored agent
7. Execution result → KG → Update learning profiles

## Key Design Decisions
- ✅ Recency bias: 7-day half-life with 3x weight for recent performance
- ✅ Confidence scoring: min(1.0, total_executions / 20) prevents overfitting
- ✅ Hierarchical scoring: 30% base load, 50% expertise, 20% confidence
- ✅ KG query limit: 100 recent executions per task-type for performance
- ✅ Async loading: load_learning_profile_from_kg supports concurrent loads

## Next: Phase 5.4 - Cost Optimization
Ready to implement budget enforcement and cost-aware provider selection.
# Source metadata (from paste artifact): 333 lines, 9.8 KiB
"""
VAPORA Multi-IA Router Configuration

Defines LLM routing rules, model mappings, cost thresholds, and fallback chains
"""

import k.api.all as k
# ===== LLM PROVIDER DEFINITIONS =====

# Registry of LLM backends available to the router.
# Each provider declares its API endpoint, its models with context window
# (tokens) and cost_per_mtok (presumably input price in USD per million
# tokens — matches cost_tracking.pricing.input below; confirm), plus
# deployment availability and serving regions.
llm_providers = {
    "claude": {
        name = "Anthropic Claude"
        endpoint = "https://api.anthropic.com/v1"
        models = [
            {name = "claude-opus-4-1", context = 200000, cost_per_mtok = 15.0}
            {name = "claude-sonnet-4-20250514", context = 200000, cost_per_mtok = 3.0}
            {name = "claude-haiku-3-5-20241022", context = 200000, cost_per_mtok = 0.80}
        ]
        availability = "production"
        regions = ["us-east-1", "us-west-2", "eu-west-1"]
    }
    "openai": {
        name = "OpenAI"
        endpoint = "https://api.openai.com/v1"
        models = [
            {name = "gpt-4-turbo", context = 128000, cost_per_mtok = 10.0}
            {name = "gpt-4o", context = 128000, cost_per_mtok = 5.0}
            {name = "gpt-3.5-turbo", context = 16384, cost_per_mtok = 0.50}
        ]
        availability = "production"
        regions = ["us-east-1", "us-west-2", "eu-west-1"]
    }
    "gemini": {
        name = "Google Gemini"
        endpoint = "https://generativelanguage.googleapis.com/v1beta"
        models = [
            {name = "gemini-2.0-pro", context = 1000000, cost_per_mtok = 10.0}
            {name = "gemini-2.0-flash", context = 1000000, cost_per_mtok = 0.075}
            {name = "gemini-1.5-pro", context = 1000000, cost_per_mtok = 1.25}
        ]
        availability = "production"
        regions = ["us-central-1"]
    }
    "ollama": {
        name = "Ollama Local"
        # In-cluster service endpoint; zero marginal token cost for local models.
        endpoint = "http://ollama.vapora-system:11434"
        models = [
            {name = "llama2", context = 4096, cost_per_mtok = 0.0}
            {name = "mistral", context = 8192, cost_per_mtok = 0.0}
            {name = "neural-chat", context = 4096, cost_per_mtok = 0.0}
        ]
        availability = "local"
        regions = ["on-premise"]
    }
}
# ===== TASK CONTEXT CLASSIFIERS =====

# Per-task-type routing hints: complexity tier, latency/cost sensitivity,
# minimum context window needed (tokens), and an ordered list of
# recommended models (most preferred first).
task_classifiers = {
    "code_generation": {
        complexity = "high"
        latency_sensitive = false
        context_needs = 32000
        quality_critical = true
        cost_sensitive = false
        recommended = ["claude-opus-4-1", "gpt-4-turbo", "claude-sonnet-4-20250514"]
    }
    "code_review": {
        complexity = "medium"
        latency_sensitive = false
        context_needs = 16000
        quality_critical = true
        cost_sensitive = true
        recommended = ["claude-sonnet-4-20250514", "gpt-4o", "gemini-2.0-flash"]
    }
    "documentation": {
        complexity = "medium"
        latency_sensitive = false
        context_needs = 8000
        quality_critical = true
        cost_sensitive = true
        recommended = ["gpt-4-turbo", "gemini-1.5-pro", "claude-sonnet-4-20250514"]
    }
    "testing": {
        complexity = "medium"
        latency_sensitive = false
        context_needs = 16000
        quality_critical = true
        cost_sensitive = true
        recommended = ["claude-sonnet-4-20250514", "gpt-4o"]
    }
    "quick_query": {
        complexity = "low"
        latency_sensitive = true
        context_needs = 4000
        quality_critical = false
        cost_sensitive = true
        recommended = ["gemini-2.0-flash", "gpt-3.5-turbo", "llama2"]
    }
    "embeddings": {
        complexity = "low"
        latency_sensitive = true
        context_needs = 512
        quality_critical = false
        cost_sensitive = true
        # NOTE(review): "ollama/neural-chat" uses provider-qualified naming,
        # while every other recommended list uses bare model names — confirm
        # which form the router resolves.
        recommended = ["ollama/neural-chat"]
    }
    "summarization": {
        complexity = "medium"
        latency_sensitive = false
        context_needs = 32000
        quality_critical = true
        cost_sensitive = true
        recommended = ["claude-sonnet-4-20250514", "gemini-2.0-flash"]
    }
    "real_time_monitoring": {
        complexity = "low"
        latency_sensitive = true
        context_needs = 2000
        quality_critical = false
        cost_sensitive = true
        recommended = ["gemini-2.0-flash", "gpt-3.5-turbo"]
    }
}
# ===== DEFAULT LLM MAPPINGS =====

# Role × task-type defaults: which model each agent role uses by default,
# its ordered fallback chain, and whether the agent may override the choice.
# task_type = "*" applies the mapping to all task types for that role.
default_mappings = [
    {
        agent_role = "Architect"
        task_type = "*"  # All tasks
        default_llm = "claude-opus-4-1"
        fallback = ["gpt-4-turbo"]
        override_allowed = false  # Critical decisions
    }
    {
        agent_role = "Developer"
        task_type = "code_generation"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o", "claude-opus-4-1"]
        override_allowed = true
    }
    {
        agent_role = "CodeReviewer"
        task_type = "code_review"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o", "gemini-2.0-flash"]
        override_allowed = true
    }
    {
        agent_role = "Tester"
        task_type = "testing"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o"]
        override_allowed = true
    }
    {
        agent_role = "Documenter"
        task_type = "documentation"
        default_llm = "gpt-4-turbo"
        fallback = ["claude-sonnet-4-20250514", "gemini-1.5-pro"]
        override_allowed = true
    }
    {
        agent_role = "Marketer"
        task_type = "*"
        default_llm = "claude-sonnet-4-20250514"
        fallback = ["gpt-4o"]
        override_allowed = true
    }
    {
        agent_role = "Monitor"
        task_type = "real_time_monitoring"
        default_llm = "gemini-2.0-flash"
        fallback = ["gpt-3.5-turbo"]
        override_allowed = false  # Must be fast
    }
    {
        agent_role = "Security"
        task_type = "*"
        default_llm = "claude-opus-4-1"
        fallback = ["gpt-4-turbo"]
        override_allowed = false  # Critical security
    }
]
# ===== COST TRACKING CONFIGURATION =====

# Budget limits (presumably USD — confirm currency/unit with the billing
# integration), per-role allocations, and the per-model price table.
cost_tracking = {
    enabled = true
    daily_warn_threshold = 5000  # Warn if daily cost > $5000
    daily_hard_limit = 10000  # Hard stop if daily cost > $10000
    monthly_warn_threshold = 100000
    monthly_hard_limit = 150000

    # Cost allocation by agent role
    budget_per_agent = {
        "Architect": {daily = 500, monthly = 10000}
        "Developer": {daily = 2000, monthly = 40000}
        "CodeReviewer": {daily = 1000, monthly = 20000}
        "Tester": {daily = 800, monthly = 16000}
        "Documenter": {daily = 300, monthly = 6000}
        "Security": {daily = 500, monthly = 10000}
        "Monitor": {daily = 100, monthly = 2000}
        "Other": {daily = 800, monthly = 16000}
    }

    # Price tracking: input/output price per million tokens.
    # NOTE(review): no entries for "gemini-1.5-pro" (recommended for
    # documentation above) or the zero-cost Ollama models — confirm whether
    # missing models fall back to a default price or are treated as free.
    pricing = {
        "claude-opus-4-1": {input = 15.0, output = 75.0}
        "claude-sonnet-4-20250514": {input = 3.0, output = 15.0}
        "gpt-4-turbo": {input = 10.0, output = 30.0}
        "gpt-4o": {input = 5.0, output = 15.0}
        "gpt-3.5-turbo": {input = 0.50, output = 1.50}
        "gemini-2.0-pro": {input = 10.0, output = 30.0}
        "gemini-2.0-flash": {input = 0.075, output = 0.30}
    }
}
# ===== LATENCY AND PERFORMANCE TARGETS =====

# Latency percentile targets per task type, in milliseconds.
# NOTE(review): only 4 of the 8 task types declared in task_classifiers
# have targets here — confirm whether the others are intentionally untracked.
performance_targets = {
    "code_generation": {p50 = 5000, p95 = 15000, p99 = 30000}  # milliseconds
    "code_review": {p50 = 3000, p95 = 10000, p99 = 20000}
    "quick_query": {p50 = 500, p95 = 2000, p99 = 5000}
    "real_time_monitoring": {p50 = 200, p95 = 1000, p99 = 2000}
}
# ===== CIRCUIT BREAKER SETTINGS =====

# Per-provider circuit breaker tuning. Thresholds/timeouts in milliseconds
# except failure_threshold (consecutive error count) and
# half_open_max_calls (probe calls allowed while half-open).
circuit_breakers = {
    "claude": {
        failure_threshold = 5  # Fail after 5 consecutive errors
        timeout_threshold = 60000  # 60s timeout
        half_open_max_calls = 3
        reset_timeout = 30000
    }
    "openai": {
        failure_threshold = 5
        timeout_threshold = 45000
        half_open_max_calls = 3
        reset_timeout = 30000
    }
    "gemini": {
        failure_threshold = 5
        timeout_threshold = 30000
        half_open_max_calls = 3
        reset_timeout = 30000
    }
    "ollama": {
        failure_threshold = 3  # Local failures more critical
        timeout_threshold = 15000
        half_open_max_calls = 5
        reset_timeout = 10000
    }
}
# ===== ROUTING RULES =====

# Ordered condition/action pairs evaluated by the router. Conditions are
# expression strings interpreted by the routing engine (not evaluated here);
# action names must match handlers registered in the router.
routing_rules = [
    {
        condition = "task.complexity == high && cost < 1000"
        action = "use_claude_opus"
    }
    {
        condition = "task.latency_sensitive == true"
        action = "use_fastest_available"
    }
    {
        condition = "task.cost_sensitive == true && daily_cost > 4000"
        action = "use_ollama_or_cheap"
    }
    {
        condition = "provider_status[claude] == down"
        action = "fallback_to_gpt4"
    }
    {
        condition = "time_of_day == peak_hours && usage_high"
        action = "load_balance_all_providers"
    }
]
# ===== MONITORING AND ALERTING =====

# Metric collection toggles, retention window, and alert definitions.
# Alert conditions are expression strings interpreted by the monitoring
# engine; actions must match registered alert handlers.
monitoring = {
    track_latencies = true
    track_costs = true
    track_failures = true
    track_token_usage = true

    metrics_retention = 30  # days

    alerts = [
        {
            name = "high_daily_cost"
            condition = "cost_today > 5000"
            severity = "warning"
            actions = ["notify_ops", "switch_to_cheap_provider"]
        }
        {
            name = "provider_down"
            condition = "provider_status == down"
            severity = "critical"
            actions = ["failover", "notify_ops"]
        }
        {
            name = "high_latency"
            condition = "p95_latency > performance_target * 2"
            severity = "warning"
            actions = ["notify_team", "consider_load_rebalance"]
        }
        {
            name = "budget_exceeded"
            condition = "monthly_cost > monthly_hard_limit"
            severity = "critical"
            actions = ["stop_new_requests", "notify_management"]
        }
    ]
}
# ===== OUTPUT =====

# Aggregate exported configuration: the single value consumed by the router.
output = {
    providers = llm_providers
    classifiers = task_classifiers
    default_mappings = default_mappings
    cost_tracking = cost_tracking
    performance_targets = performance_targets
    circuit_breakers = circuit_breakers
    routing_rules = routing_rules
    monitoring = monitoring
}