# File: provisioning/schemas/platform/common/observability.ncl

# Observability Configuration Schema
# Unified schema for centralized logging, metrics, health checks, tracing, and audit logging
{
  # Observability configuration contract for services.
  #
  # Every section (logging, metrics, health, tracing, audit) is optional.
  # Fields whose comment lists a set of accepted values (log level, format,
  # exporter, sampler, ...) are typed as plain `String`: the listed values are
  # documentation only and are NOT enforced by this contract.
  #
  # NOTE(review): because each section is `optional`, the defaults nested
  # inside a section presumably only materialize when the caller defines that
  # section (even as `{}`) - confirm against how consumers read the config.
  ObservabilityConfig = {
    # Enable/disable observability system-wide
    enabled | Bool | default = true,

    # Logging Configuration
    logging
      | {
        # Enable structured JSON logging
        enabled | Bool | default = true,
        # Log level: debug, info, warn, error
        level | String | default = "info",
        # Log format: json (for Loki ingestion) or pretty (development)
        format | String | default = "json",
        # RUST_LOG environment filter (granular module-level filtering)
        filter | String | optional,

        # Output configuration
        output
          | {
            # Log output destination: stdout, file, loki
            destination | String | default = "stdout",
            # File path for file output
            file_path | String | optional,
            # Loki endpoint (e.g., http://localhost:3100)
            loki_endpoint | String | optional,
            # Labels to attach to all Loki entries (labels become queryable).
            # Dictionary contract: any label name is allowed, every value must
            # be a String. (A bare `{}` record contract is closed and would
            # reject every non-empty label set.)
            loki_labels | { _ | String } | optional,
          }
          | optional,

        # Structured field configuration
        fields
          | {
            # Include service name
            service_name | Bool | default = true,
            # Include timestamp (RFC3339)
            timestamp | Bool | default = true,
            # Include log level
            level | Bool | default = true,
            # Include caller location (file:line)
            caller | Bool | default = false,
            # Include span context (trace IDs, span IDs)
            spans | Bool | default = true,
            # Custom metadata fields.
            # Dictionary contract: arbitrary keys with values of any type.
            # (A bare `{}` record contract is closed and would reject every
            # non-empty metadata record.)
            custom | { _ | Dyn } | optional,
          }
          | optional,

        # Performance optimization
        sampling
          | {
            # Enable log sampling to reduce volume
            enabled | Bool | default = false,
            # Sample 1 in N log entries
            rate | Number | optional,
          }
          | optional,
      }
      | optional,

    # Metrics Configuration (Prometheus)
    metrics
      | {
        # Enable metrics collection
        enabled | Bool | default = true,
        # Exporter backend: prometheus (default), otlp
        exporter | String | default = "prometheus",
        # Prometheus scrape endpoint path
        prometheus_path | String | default = "/metrics",
        # Metrics collection interval (seconds)
        interval | Number | default = 60,
        # Histogram buckets for request latency (milliseconds)
        histogram_buckets | Array Number | default = [1, 5, 10, 50, 100, 500, 1000, 5000],
        # Cardinality limits (prevent unbounded growth)
        max_cardinality | Number | optional,
        # Metric retention period (hours)
        retention_hours | Number | optional,
        # OpenTelemetry push endpoint (if using OTLP)
        otlp_endpoint | String | optional,
        # OTLP push interval (seconds)
        otlp_interval | Number | optional,
      }
      | optional,

    # Health Check Configuration
    health
      | {
        # Enable health check endpoints
        enabled | Bool | default = true,
        # Health check HTTP server port
        port | Number | default = 8081,
        # Liveness probe endpoint
        liveness_path | String | default = "/healthz",
        # Readiness probe endpoint (depends on dependencies)
        readiness_path | String | default = "/ready",
        # Startup probe endpoint
        startup_path | String | default = "/startup",
        # Health check probe interval (seconds)
        interval | Number | default = 10,
        # Probe timeout (milliseconds) - note the unit differs from `interval`
        timeout | Number | default = 5000,
        # Number of consecutive successes to mark as healthy
        success_threshold | Number | default = 1,
        # Number of consecutive failures to mark as unhealthy
        failure_threshold | Number | default = 3,
        # Initial delay before first check (seconds)
        initial_delay | Number | default = 0,
      }
      | optional,

    # Distributed Tracing Configuration (OpenTelemetry)
    tracing
      | {
        # Enable distributed tracing (off by default, unlike the other sections)
        enabled | Bool | default = false,
        # Tracer backend: otlp (OpenTelemetry)
        backend | String | default = "otlp",
        # OpenTelemetry Collector endpoint (gRPC)
        otlp_endpoint | String | optional,
        # Trace sampler: always, never, parentbased
        sampler | String | default = "parentbased",
        # Sampling rate (0.0 to 1.0) for parentbased/probability samplers.
        # The range is not enforced here: any Number is accepted.
        sampling_rate | Number | optional,
        # Service version
        service_version | String | optional,
        # Environment name (dev, staging, production)
        environment | String | optional,
      }
      | optional,

    # Audit Logging Configuration
    audit
      | {
        # Enable workspace operation auditing
        enabled | Bool | default = true,
        # Storage backend: file, siem
        storage | String | default = "file",
        # Audit log file directory
        log_directory | String | optional,
        # Audit retention period (days)
        retention_days | Number | default = 90,
        # Include PII in audit logs (GDPR consideration)
        include_pii | Bool | default = false,
        # Export format(s): jsonl, csv, splunk, elastic
        export_formats | Array String | default = ["jsonl"],
        # SIEM endpoint (e.g., Splunk, Elastic) for real-time export
        siem_endpoint | String | optional,
        # Workspace operation tracking
        track_workspace_operations | Bool | default = true,
        # Tracked operations: create, delete, update, switch, list, sync
        workspace_operations | Array String | default = ["create", "delete", "update", "switch", "list", "sync"],
      }
      | optional,
  },
}