# Observability configuration — production mode.
# Tuned for reliability: JSON logging aggregated in Loki, metrics enabled,
# tracing exported over OTLP, and audit logs kept for compliance.
{
  # Overrides applied to the observability settings in production mode.
  observability = {
    logging = {
      # Structured JSON so the logs can be ingested by Loki/Splunk.
      format = "json",

      # "info" keeps production output less verbose.
      level = "info",

      # Logs are sent to Loki for centralized aggregation.
      output = {
        destination = "loki",
        loki_endpoint = "http://loki:3100",
        loki_labels = {
          environment = "production",
          cluster = "prod-cluster",
        },
      },

      # Caller info (and spans) are attached to help troubleshooting.
      fields = {
        caller = true,
        spans = true,
      },

      # Log sampling for high-throughput services.
      # Currently off, so every log record is kept.
      sampling = {
        enabled = false,
        # rate = 0.1,  # Uncomment to sample 10% of logs if needed
      },
    },

    metrics = {
      # Metrics are fully enabled in production.
      enabled = true,

      # More frequent collection than the non-production setting.
      # NOTE(review): unit is not stated here (presumably seconds) — confirm.
      interval = 30,

      # Metrics are pushed to the OpenTelemetry Collector.
      exporter = "otlp",
      otlp_endpoint = "http://otel-collector:4317",
      otlp_interval = 30,
    },

    health = {
      # Health checks run more frequently in production.
      # NOTE(review): interval unit is not stated here — confirm.
      interval = 5,
      failure_threshold = 2,
      success_threshold = 2,
    },

    tracing = {
      # Distributed tracing is enabled to support debugging.
      enabled = true,
      backend = "otlp",
      otlp_endpoint = "http://otel-collector:4317",

      # Only 10% of traces are sampled, to reduce overhead.
      sampler = "parentbased",
      sampling_rate = 0.1,

      environment = "production",
    },

    audit = {
      # Audit logs are written to the filesystem and exported to a SIEM.
      storage = "file",
      log_directory = "/var/log/provisioning/audit",

      # Kept for one year for compliance.
      retention_days = 365,

      # PII tracking is enabled for compliance audits.
      include_pii = true,

      # Exported as both JSONL and Splunk.
      export_formats = ["jsonl", "splunk"],

      # Audit logs are pushed to the Splunk HEC endpoint.
      siem_endpoint = "https://splunk.example.com:8088",

      # Every workspace operation is tracked.
      track_workspace_operations = true,
    },
  },
}