# File: provisioning/schemas/platform/examples/orchestrator-enterprise.ncl

# Example: Orchestrator Configuration - Enterprise Mode (Production HA)
#
# This example shows a production-grade orchestrator setup with:
# - SurrealDB cluster for distributed storage
# - High concurrency and throughput
# - Comprehensive monitoring and observability
# - Full security and audit logging
# - Advanced performance tuning
#
# Usage:
# nickel export --format toml orchestrator-enterprise.ncl > orchestrator.enterprise.toml
# ORCHESTRATOR_CONFIG=orchestrator.enterprise.toml cargo run --bin orchestrator
{
# Workspace Configuration
workspace = {
  name = "production",
  path = "/var/lib/provisioning/orchestrator",
  enabled = true,
  # Production is allowed to host more than one workspace.
  multi_workspace = true,
},
# Server Configuration: High performance
server = {
  # Bind to every network interface.
  host = "0.0.0.0",
  port = 9090,
  # Several workers to sustain high concurrency.
  workers = 16,
  # Deliberately short keep-alive for better connection management
  # (unit is not stated in this file; presumably seconds, confirm against the schema).
  keep_alive = 30,
  # Generous connection ceiling for production traffic.
  max_connections = 4096,
},
# Storage: SurrealDB Cluster (for HA)
storage = {
  backend = "surrealdb_cluster",
  # Two SurrealDB hosts, comma-separated inside a single URL string.
  surrealdb_url = "surrealdb://surrealdb-1.provisioning.svc.cluster.local:8000,surrealdb-2.provisioning.svc.cluster.local:8000",
  surrealdb_namespace = "provisioning",
  surrealdb_database = "orchestrator-prod",
},
# Queue/Task Processing: High throughput
queue = {
  # Ceiling on simultaneously running tasks in production.
  max_concurrent_tasks = 100,
  # Extra retries for reliability.
  retry_attempts = 5,
  # Milliseconds (exponential backoff).
  retry_delay = 2000,
  # 2 hours, to accommodate long-running tasks.
  task_timeout = 7200000,

  deadletter_queue = {
    enabled = true,
    # Large capacity for messages that ended in error.
    max_messages = 10000,
    # 7 days.
    retention_period = 604800,
  },

  priority_levels = ["low", "normal", "high", "critical"],
  default_priority = "normal",
},
# Batch Workflow: Optimized for throughput
batch = {
  # High degree of parallelism.
  parallel_limit = 50,
  # 1 hour.
  operation_timeout = 3600000,

  checkpoint = {
    enabled = true,
    # Checkpoint often, for reliability.
    interval = 1000,
    auto_cleanup = true,
    # Retain a large number of checkpoints.
    max_checkpoints = 100,
  },

  rollback = { strategy = "automatic", retain_logs = true },
},
# Monitoring: Comprehensive production observability
monitoring = {
  enabled = true,

  metrics = {
    enabled = true,
    # Collect metrics frequently.
    interval = 10,
    export_format = "prometheus",
  },

  health_check = { enabled = true, interval = 30, timeout = 5 },

  resources = {
    track_cpu = true,
    track_memory = true,
    track_disk = true,
    # Alert thresholds, in percent: CPU 85, memory 90, disk 95.
    alert_threshold_cpu = 85,
    alert_threshold_memory = 90,
    alert_threshold_disk = 95,
  },

  profiling = {
    enabled = true,
    # Profile 10% of requests.
    sample_rate = 0.1,
  },
},
# Logging: Production-grade with audit trail
logging = {
  # Info level for production.
  level = "info",
  # Structured output so logs can be aggregated.
  format = "json",

  # Three sinks: stdout, the main log file, and a separate audit log file.
  outputs = [
    # stdout receives warnings and above only.
    { destination = "stdout", level = "warn" },
    {
      destination = "file",
      path = "/var/log/provisioning/orchestrator/orchestrator.log",
      level = "info",
      rotation = {
        # Larger files in production.
        max_size = "500MB",
        # Keep many rotated backups.
        max_backups = 30,
        # Keep for 90 days.
        max_age = 90,
      },
    },
    {
      destination = "file",
      path = "/var/log/provisioning/orchestrator/audit.log",
      level = "info",
      rotation = {
        max_size = "200MB",
        # Audit logs are kept for 1 year.
        max_backups = 365,
        max_age = 365,
      },
    },
  ],

  # Fields listed for inclusion in log entries.
  include_fields = [
    "timestamp", "level", "message",
    "task_id", "workflow_id", "user_id",
    "duration", "status", "error", "context",
  ],
},
# Security: Full production security
security = {
  auth = {
    enabled = true,
    method = "jwt",
    # Intended to come from the environment.
    # NOTE(review): Nickel interpolation uses %{...}, so this value is exported
    # as the literal text ${JWT_SECRET}; presumably the orchestrator substitutes
    # the environment variable when it loads the config - confirm.
    jwt_secret = "${JWT_SECRET}",
    jwt_issuer = "provisioning.production",
    jwt_audience = "orchestrator-prod",
    # 1 hour.
    token_expiration = 3600,
  },

  cors = {
    enabled = true,
    allowed_origins = ["https://orchestrator.example.com", "https://control-center.example.com"],
    allowed_methods = ["GET", "POST", "PUT"],
    allowed_headers = ["Content-Type", "Authorization"],
    expose_headers = ["X-Request-ID", "X-Total-Count"],
  },

  tls = {
    # Production requires TLS.
    enabled = true,
    cert_path = "/etc/provisioning/certs/orchestrator.crt",
    key_path = "/etc/provisioning/certs/orchestrator.key",
    min_version = "TLSv1.3",
  },

  rate_limit = {
    enabled = true,
    # High request ceiling for production.
    requests_per_second = 10000,
    burst_size = 1000,
  },
},
# Extensions: Enabled for production capability
extensions = {
  auto_load = true,
  oci_registry_url = "registry.example.com:5000",
  oci_namespace = "provisioning/extensions",
  # Look for updates once a day.
  refresh_interval = 24,
  # Extensions are initialised in parallel, up to this many at once.
  max_concurrent_init = 10,
},
# Database: Connection pooling for performance
database = {
  pool = {
    # Connections allocated up front.
    min_size = 20,
    # Upper bound on pooled connections.
    max_size = 100,
    # Shorter timeout for production.
    connection_timeout = 10,
    # 10 minutes.
    idle_timeout = 600,
    # 1 hour.
    max_lifetime = 3600,
  },

  retry = { max_attempts = 5, initial_backoff = 100, max_backoff = 30000 },
},
# Features: Production-ready with full auditing
features = {
  # Full audit trail.
  enable_audit_logging = true,
  # Task history is retained.
  enable_task_history = true,
  # All metrics are tracked.
  enable_performance_tracking = true,
  # Experimental features stay off in production.
  enable_experimental_features = false,
},
}