provisioning/schemas/platform/defaults/deployment/observability-production-overrides.ncl

85 lines
2.2 KiB
Text

# Observability configuration overrides for production mode.
# Tuned for reliability: JSON logs shipped to Loki, OTLP metrics and tracing,
# frequent health checks, and file-based audit logging with SIEM export.
{
  # Overrides applied to the observability settings when running in production mode.
  observability = {
    # ---- Logging ---------------------------------------------------------
    logging = {
      # Structured JSON output so Loki/Splunk can ingest the logs
      format = "json",
      # "info" keeps production output less verbose
      level = "info",
      # Ship logs to Loki for centralized aggregation
      output = {
        destination = "loki",
        loki_endpoint = "http://loki:3100",
        loki_labels = {
          environment = "production",
          cluster = "prod-cluster",
        },
      },
      # Extra fields (caller info, spans) attached to help troubleshooting
      fields = {
        caller = true,
        spans = true,
      },
      # Log sampling knob for high-throughput services.
      # Currently disabled so that every log entry is kept.
      sampling = {
        enabled = false,
        # rate = 0.1,  # uncomment to sample 10% of logs if needed
      },
    },

    # ---- Metrics ---------------------------------------------------------
    metrics = {
      # Metrics are fully enabled in production
      enabled = true,
      # More frequent collection
      interval = 30,
      # Push to the OpenTelemetry Collector over OTLP
      exporter = "otlp",
      otlp_endpoint = "http://otel-collector:4317",
      otlp_interval = 30,
    },

    # ---- Health checks ---------------------------------------------------
    health = {
      # Production runs health checks more frequently
      interval = 5,
      failure_threshold = 2,
      success_threshold = 2,
    },

    # ---- Distributed tracing ---------------------------------------------
    tracing = {
      # Tracing stays on in production to support debugging
      enabled = true,
      backend = "otlp",
      otlp_endpoint = "http://otel-collector:4317",
      # Sample 10% of traces to reduce overhead
      sampler = "parentbased",
      sampling_rate = 0.1,
      environment = "production",
    },

    # ---- Audit -----------------------------------------------------------
    audit = {
      # Audit logs are written to the filesystem and also exported to a SIEM
      storage = "file",
      log_directory = "/var/log/provisioning/audit",
      # Keep 1 year of audit history for compliance
      retention_days = 365,
      # PII tracking is enabled for compliance audits.
      # NOTE(review): this places PII in logs retained for a year - confirm
      # access controls on log_directory and the SIEM match policy.
      include_pii = true,
      # Export in both JSONL and Splunk formats
      export_formats = ["jsonl", "splunk"],
      # Push audit logs to Splunk HEC.
      # NOTE(review): "splunk.example.com" looks like a placeholder host -
      # confirm the real endpoint before this override reaches production.
      siem_endpoint = "https://splunk.example.com:8088",
      # Track all workspace operations
      track_workspace_operations = true,
    },
  },
}