# Example: Orchestrator Configuration - Enterprise Mode (Production HA)
#
# This example shows a production-grade orchestrator setup with:
#   - SurrealDB cluster for distributed storage
#   - High concurrency and throughput
#   - Comprehensive monitoring and observability
#   - Full security and audit logging
#   - Advanced performance tuning
#
# Usage:
#   nickel export --format toml orchestrator-enterprise.ncl > orchestrator.enterprise.toml
#   ORCHESTRATOR_CONFIG=orchestrator.enterprise.toml cargo run --bin orchestrator

{
  # Workspace Configuration
  workspace = {
    name = "production",
    path = "/var/lib/provisioning/orchestrator",
    enabled = true,
    multi_workspace = true, # Support multiple workspaces in production
  },

  # Server Configuration: High performance
  server = {
    host = "0.0.0.0", # Listen on all interfaces
    port = 9090,
    workers = 16, # Multiple workers for high concurrency
    keep_alive = 30, # Shorter keep-alive for better connection management (presumably seconds - confirm)
    max_connections = 4096, # High limit for production
  },

  # Storage: SurrealDB Cluster (for HA)
  storage = {
    backend = "surrealdb_cluster",
    # Comma-separated list of cluster nodes inside a single URL string.
    surrealdb_url = "surrealdb://surrealdb-1.provisioning.svc.cluster.local:8000,surrealdb-2.provisioning.svc.cluster.local:8000",
    surrealdb_namespace = "provisioning",
    surrealdb_database = "orchestrator-prod",
  },

  # Queue/Task Processing: High throughput
  queue = {
    max_concurrent_tasks = 100, # Maximum concurrency for production
    retry_attempts = 5, # More retries for reliability
    retry_delay = 2000, # ms (exponential backoff)
    task_timeout = 7200000, # ms; 2 hours for long-running tasks

    deadletter_queue = {
      enabled = true,
      max_messages = 10000, # Large queue for error handling
      retention_period = 604800, # seconds; 7 days
    },

    priority_levels = ["low", "normal", "high", "critical"],
    default_priority = "normal", # Must be one of priority_levels
  },

  # Batch Workflow: Optimized for throughput
  batch = {
    parallel_limit = 50, # High parallelism
    operation_timeout = 3600000, # ms; 1 hour

    checkpoint = {
      enabled = true,
      interval = 1000, # Checkpoint frequently for reliability (unit not stated here - confirm against schema)
      auto_cleanup = true,
      max_checkpoints = 100, # Keep many checkpoints
    },

    rollback = {
      strategy = "automatic",
      retain_logs = true,
    },
  },

  # Monitoring: Comprehensive production observability
  monitoring = {
    enabled = true,

    metrics = {
      enabled = true,
      interval = 10, # Frequent metrics collection (presumably seconds - confirm)
      export_format = "prometheus",
    },

    health_check = {
      enabled = true,
      interval = 30, # presumably seconds - confirm
      timeout = 5, # presumably seconds - confirm
    },

    resources = {
      track_cpu = true,
      track_memory = true,
      track_disk = true,
      alert_threshold_cpu = 85, # Alert at 85% CPU
      alert_threshold_memory = 90, # Alert at 90% memory
      alert_threshold_disk = 95, # Alert at 95% disk
    },

    profiling = {
      enabled = true,
      sample_rate = 0.1, # Profile 10% of requests
    },
  },

  # Logging: Production-grade with audit trail
  logging = {
    level = "info", # Information level for production
    format = "json", # Structured logging for aggregation

    outputs = [
      # Console: warnings and above only
      {
        destination = "stdout",
        level = "warn", # Only warnings and above to stdout
      },
      # Main application log
      {
        destination = "file",
        path = "/var/log/provisioning/orchestrator/orchestrator.log",
        level = "info",
        rotation = {
          max_size = "500MB", # Larger files in production
          max_backups = 30, # Keep many backups
          max_age = 90, # Keep for 90 days
        },
      },
      # Audit log: longer retention than the main log
      {
        destination = "file",
        path = "/var/log/provisioning/orchestrator/audit.log",
        level = "info",
        rotation = {
          max_size = "200MB",
          max_backups = 365, # Keep up to 365 rotated audit files
          max_age = 365, # days; keep audit logs for 1 year
        },
      },
    ],

    # Fields emitted with every structured log record
    include_fields = [
      "timestamp",
      "level",
      "message",
      "task_id",
      "workflow_id",
      "user_id",
      "duration",
      "status",
      "error",
      "context",
    ],
  },

  # Security: Full production security
  security = {
    auth = {
      enabled = true,
      method = "jwt",
      # Placeholder for the JWT_SECRET environment variable.
      # NOTE(review): Nickel string interpolation is written %{...}, so this
      # value is exported verbatim as the text ${JWT_SECRET}; confirm the
      # orchestrator expands it when loading the TOML.
      jwt_secret = "${JWT_SECRET}",
      jwt_issuer = "provisioning.production",
      jwt_audience = "orchestrator-prod",
      token_expiration = 3600, # seconds; 1 hour
    },

    cors = {
      enabled = true,
      allowed_origins = [
        "https://orchestrator.example.com",
        "https://control-center.example.com",
      ],
      allowed_methods = ["GET", "POST", "PUT"],
      allowed_headers = ["Content-Type", "Authorization"],
      expose_headers = ["X-Request-ID", "X-Total-Count"],
    },

    tls = {
      enabled = true, # TLS required for production
      cert_path = "/etc/provisioning/certs/orchestrator.crt",
      key_path = "/etc/provisioning/certs/orchestrator.key",
      min_version = "TLSv1.3",
    },

    rate_limit = {
      enabled = true,
      requests_per_second = 10000, # High limit for production
      burst_size = 1000,
    },
  },

  # Extensions: Enabled for production capability
  extensions = {
    auto_load = true,
    oci_registry_url = "registry.example.com:5000",
    oci_namespace = "provisioning/extensions",
    refresh_interval = 24, # hours; check for updates daily
    max_concurrent_init = 10, # Load extensions in parallel
  },

  # Database: Connection pooling for performance
  database = {
    pool = {
      min_size = 20, # Pre-allocated connections
      max_size = 100, # Maximum connections
      connection_timeout = 10, # Shorter timeout for production (presumably seconds - confirm)
      idle_timeout = 600, # seconds; 10 minutes
      max_lifetime = 3600, # seconds; 1 hour
    },

    retry = {
      max_attempts = 5,
      initial_backoff = 100, # presumably ms - confirm
      max_backoff = 30000, # presumably ms - confirm
    },
  },

  # Features: Production-ready with full auditing
  features = {
    enable_audit_logging = true, # Full audit trail
    enable_task_history = true, # Keep task history
    enable_performance_tracking = true, # Track all metrics
    enable_experimental_features = false, # No experimental features in production
  },
}