diff --git a/adrs/adr-012-nats-event-broker.ncl b/adrs/adr-012-nats-event-broker.ncl new file mode 100644 index 0000000..5a7e9c8 --- /dev/null +++ b/adrs/adr-012-nats-event-broker.ncl @@ -0,0 +1,92 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-012", + title = "NATS JetStream as Exclusive Inter-Service Event Broker", + status = 'Accepted, + date = "2026-02-17", + + context = "The provisioning platform has four runtime execution contexts — CLI, platform services (Orchestrator, Control Center, Vault, Extension Registry), remote taskservs, and AI/MCP — that must coordinate without leaking credentials or state into transient channels. Prior to this decision, services communicated via direct HTTP polling, shared filesystem state, and environment variables. This created audit gaps (no durable record of which service triggered which operation), credential leakage (provider tokens passed as env vars or written to disk by the CLI process), race conditions (multiple CLI invocations racing over shared config files with no delivery guarantee), and no backpressure (a slow consumer could starve or block a fast producer with no visibility).", + + decision = "NATS with JetStream is the exclusive inter-service event broker. All inter-service communication that is not synchronous credential retrieval (Vault HTTPS) or session validation (Control Center HTTPS) must use NATS subjects under the `provisioning.>` hierarchy. Six JetStream streams are defined at startup by Orchestrator: TASKS (work queue), VAULT (interest), AUTH (interest), WORKSPACE (7-day limits), AUDIT (90-day limits), HEALTH (interest). Credentials never travel over NATS — only identifiers (lease_id, task_id, session_id) are published. Solo mode: nats-server -js as child process. 
Multi-user: external NATS cluster.", + + rationale = [ + { + claim = "At-least-once delivery with durable persistence", + detail = "JetStream provides durable message persistence for task log replay and audit trail reconstruction. Pull consumers ack explicitly; unacknowledged messages are redelivered.", + }, + { + claim = "Work-queue semantics enforce SOLID — CLI cannot call providers directly", + detail = "CLI submits to provisioning.tasks.submitted only. It cannot call provider APIs directly. This is the primary structural enforcement of the SOLID boundary between CLI and Orchestrator.", + }, + { + claim = "Push semantics for real-time status streaming without polling", + detail = "Control Center streams task status to browser via WebSocket without polling Orchestrator. NATS push consumers bridge the event stream to the WebSocket layer.", + }, + { + claim = "Multi-tenant subject namespacing maps to bounded contexts", + detail = "The provisioning.> hierarchy with six streams maps each stream to its bounded context (tasks, vault, auth, workspace, audit, health). 
Each service subscribes only to its own subjects.", + }, + ], + + consequences = { + positive = [ + "Full audit trail: every state transition is a durable NATS message consumed by AuditCollector", + "No polling: Control Center streams task status to browser via WebSocket", + "Backpressure: JetStream consumers ack explicitly; unacknowledged messages are redelivered", + "SOLID enforcement: CLI can only submit to provisioning.tasks.submitted; cannot call provider APIs directly", + ], + negative = [ + "nats-server is a required external process in solo mode, adding a startup step", + "Message ordering within a subject is guaranteed but cross-subject ordering is not", + "JetStream persistence requires disk space for AUDIT stream (90-day retention)", + "Pull consumers in VAULT stream add one round-trip vs direct HTTP for lease issuance", + ], + }, + + alternatives_considered = [ + { + option = "Direct HTTP polling between services", + why_rejected = "Creates coupling between services and requires each service to know the addresses of others. No delivery guarantee, no audit trail, polling adds latency.", + }, + { + option = "Redis Pub/Sub for event distribution", + why_rejected = "Redis Pub/Sub has no persistence — messages are lost if no subscriber is listening. No work-queue semantics, no backpressure, no durable audit trail.", + }, + ], + + constraints = [ + { + id = "credentials-never-in-nats", + claim = "Actual credentials (tokens, secrets, keys) must never be published to any NATS subject", + scope = "platform/crates/orchestrator/src/, platform/crates/platform-nats/", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'publish|nats' platform/crates/ -l | xargs rg -l 'token|secret|password|key'", expect_exit = 1 }, + rationale = "Credentials in NATS messages would be visible to all subscribers on the subject. 
Only lease_id, task_id, and session_id travel over NATS; actual secrets are fetched over HTTPS from Vault.", + }, + { + id = "six-streams-defined-by-orchestrator", + claim = "JetStream stream definitions (TASKS, VAULT, AUTH, WORKSPACE, AUDIT, HEALTH) are created by Orchestrator on startup and must not be redefined by other services", + scope = "platform/crates/orchestrator/src/nats.rs", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'create_stream|add_stream' platform/crates/ -g '*.rs' -l | grep -v orchestrator", expect_exit = 1 }, + rationale = "Single point of stream definition prevents conflicting stream configurations. Other services are consumers only.", + }, + { + id = "nats-subject-hierarchy", + claim = "All NATS subjects must be under the provisioning.> hierarchy with the stream-to-subject mapping documented in schemas/platform/common/nats.ncl", + scope = "platform/crates/platform-nats/", + severity = 'Soft, + check = { tag = 'NuCmd, cmd = "rg '\"[a-z]' platform/crates/ -g '*.rs' | grep -v 'provisioning\\.'", expect_exit = 1 }, + rationale = "Consistent subject hierarchy enables subject-level access control and prevents cross-context pollution.", + }, + ], + + related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"], + + ontology_check = { + decision_string = "NATS JetStream is the exclusive inter-service event broker; credentials never travel over NATS; six streams defined by Orchestrator", + invariants_at_risk = ["solid-boundaries"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-013-surrealdb-global-store.ncl b/adrs/adr-013-surrealdb-global-store.ncl new file mode 100644 index 0000000..8175417 --- /dev/null +++ b/adrs/adr-013-surrealdb-global-store.ncl @@ -0,0 +1,95 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-013", + title = "SurrealDB as the Global Persistent Store", + status = 'Accepted, + date = "2026-02-17", + + context = "The platform needs a single persistent data store that 
can operate embedded (RocksDB, zero external process) in solo mode, run as an external WebSocket server in multi-user deployments without schema changes, support five distinct service namespaces with well-typed schemas (orchestrator, vault, control_center, audit, workspace), store heterogeneous data (task logs append-only, secrets as encrypted blobs, Cedar policies as documents, audit events as time-series, git sync state as mutable), and be queryable by the AI/MCP service for context gathering without a separate analytics database. PostgreSQL requires a server process, SQLite has no native namespacing, Redis has no real persistence. SurrealDB is the only option supporting all five requirements simultaneously.", + + decision = "SurrealDB is the exclusive persistent store for all platform state. No service reads raw files or environment variables for credentials at runtime — all reads go through SurrealDB (secrets via Vault Service, which stores ciphertext in SurrealDB). Namespace layout under `provisioning` database: orchestrator (tasks, task_events, execution_logs, config_hashes, provider_cache), vault (secrets, keys, leases, secret_versions, audit_trail), control_center (users, sessions, cedar_policies, policy_evaluations), audit (events, metrics), workspace (registrations, deployments, git_sync_state, extensions). Mode selection via DbConfig: Memory (tests), Embedded (solo, RocksDB), Server (multi-user, WebSocket). Schema initialization via DEFINE TABLE IF NOT EXISTS DDL — no migration framework for additive changes.", + + rationale = [ + { + claim = "Single storage abstraction across solo and multi-user modes", + detail = "SurrealPool is Clone (Arc-backed internally), shareable across tokio tasks. The same codebase connects to embedded RocksDB in solo mode and WebSocket server in multi-user — only the DbConfig changes.", + }, + { + claim = "No external process in solo mode", + detail = "Embedded RocksDB starts with the service binary. 
Solo mode requires no external database process, reducing startup dependencies and enabling CI runs without infrastructure.", + }, + { + claim = "AI/MCP context without ETL pipelines", + detail = "The AI service queries audit:events and orchestrator:tasks directly for context. SurrealDB's document+relational model handles heterogeneous schemas without separate analytics infrastructure.", + }, + { + claim = "Test isolation via DbConfig::Memory", + detail = "In-process Surreal requires no external binary — every cargo test run gets a fresh, isolated database. Integration tests run without external infrastructure.", + }, + ], + + consequences = { + positive = [ + "Single storage abstraction: SurrealPool is Clone, shareable across tokio tasks", + "No external process in solo mode: embedded RocksDB starts with the service binary", + "AI/MCP context: AI service queries audit:events and orchestrator:tasks directly without ETL pipelines", + "Test isolation: DbConfig::Memory (in-process Surreal) requires no external binary", + ], + negative = [ + "SurrealDB v2 API uses snake_case builtins; bind() requires owned values; ID fields in structs needed to avoid RecordId parsing issues", + "MVCC conflicts under concurrent write load require retry_on_conflict with exponential backoff + jitter on store_secret, store_key, and lease operations", + "Full-text search and graph queries are available but deferred to avoid over-engineering", + ], + }, + + alternatives_considered = [ + { + option = "PostgreSQL", + why_rejected = "Requires an external server process — no embedded mode for solo deployment. Schema evolution requires explicit migration tooling. No native document storage for Cedar policies.", + }, + { + option = "SQLite", + why_rejected = "No native namespace/tenant isolation. No document model. Concurrent write performance under multiple async tasks is constrained. 
No WebSocket server mode for multi-user.", + }, + { + option = "Redis", + why_rejected = "No real persistence guarantees (AOF is not the same as durable embedded storage). Key-value only — no document or relational queries for audit trail or task history.", + }, + ], + + constraints = [ + { + id = "cli-no-surrealdb-direct", + claim = "CLI code (.nu files) must NOT access SurrealDB directly — all reads/writes from CLI go through service HTTP APIs", + scope = "core/nulib/, extensions/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "surrealdb|surreal|SurrealDB", paths = ["core/nulib/", "catalog/"], must_be_empty = true }, + rationale = "Direct SurrealDB access from CLI violates the SOLID boundary (ADR-014). All state mutations must go through the service layer to maintain audit trail and authorization.", + }, + { + id = "namespace-layout-fixed", + claim = "The five namespaces (orchestrator, vault, control_center, audit, workspace) under the `provisioning` database must not be changed without an ADR", + scope = "platform/crates/platform-db/", + severity = 'Hard, + check = { tag = 'FileExists, path = "platform/crates/platform-db/", present = true }, + rationale = "Namespace layout is the boundary contract between services. Changing it without an ADR risks data loss and cross-service coupling.", + }, + { + id = "retry-on-mvcc-conflict", + claim = "Operations on store_secret, store_key, and lease operations must use retry_on_conflict with exponential backoff + jitter", + scope = "platform/crates/platform-db/src/retry.rs, platform/secretumvault/", + severity = 'Soft, + check = { tag = 'NuCmd, cmd = "rg 'store_secret|store_key|create_lease' platform/ -g '*.rs' | grep -v retry", expect_exit = 1 }, + rationale = "SurrealDB MVCC conflicts are expected under concurrent write load. 
Without retry, concurrent task executions silently fail on lease creation.", + }, + ], + + related_adrs = ["adr-012-nats-event-broker", "adr-014-solid-enforcement", "adr-015-solo-mode-architecture"], + + ontology_check = { + decision_string = "SurrealDB is the exclusive persistent store; CLI accesses state only via service HTTP APIs; five fixed namespaces under provisioning database", + invariants_at_risk = ["solid-boundaries", "config-driven-always"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-014-solid-enforcement.ncl b/adrs/adr-014-solid-enforcement.ncl new file mode 100644 index 0000000..33b3379 --- /dev/null +++ b/adrs/adr-014-solid-enforcement.ncl @@ -0,0 +1,99 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-014", + title = "SOLID Architecture Boundaries with Multi-Layer Enforcement", + status = 'Accepted, + date = "2026-02-17", + + context = "As the platform expanded from a CLI tool to a multi-service control plane, a critical failure mode emerged: the Nushell CLI directly called cloud provider CLIs (hcloud, aws, doctl). This violated Single Responsibility (the CLI acquired infrastructure execution responsibility) and Dependency Inversion (CLI depended on concrete provider CLIs instead of the Orchestrator abstraction). Consequences: provider credentials leaked into CLI process environment (HCLOUD_TOKEN as env var), no audit trail for provider API calls made outside Orchestrator, SSH operations done by CLI bypassed the task state machine and rollback capability, auth decisions (JWT validation) duplicated across services instead of delegated to Control Center, secret files read directly by multiple services bypassing Vault's lease lifecycle. Documentation alone fails to enforce boundaries: engineers under time pressure skip it. The enforcement must be structural.", + + decision = "Six hard boundaries are enforced at six independent layers. 
Each layer is a fail-safe — any single layer catching a violation is sufficient to prevent it shipping. The six boundaries: (1) Provider API calls only in orchestrator crate, (2) SSH operations only in orchestrator+machines crates, (3) SurrealDB access from CLI forbidden, (4) Secret credentials forbidden in NATS messages, (5) Auth decisions only in control-center crate, (6) Raw secret file/env reads in services forbidden. Enforcement layers: compile-time (pub(crate) visibility), dev-time (Claude PreToolUse hook), pre-commit (git hook), CI (architecture tests), runtime (Cedar policies), continuous audit (NATS audit subject).", + + rationale = [ + { + claim = "Documentation alone is insufficient — enforcement must be structural", + detail = "Engineers under time pressure bypass documentation. The six-layer enforcement stack means a violation must simultaneously evade compile-time type checking, the Claude dev hook, the pre-commit grep, the CI architecture test, Cedar policy evaluation, and the NATS audit collector. Any single layer is sufficient to catch it.", + }, + { + claim = "Compile-time is the cheapest enforcement layer", + detail = "Provider client types are pub(crate) inside orchestrator. Other crates cannot import them — the Rust compiler rejects the build before any test runs. This is O(0) runtime cost.", + }, + { + claim = "AUTH corollaries prevent auth fragmentation", + detail = "solo_auth_middleware is the only documented auth bypass, gated behind --mode solo. All protected routes are inside route_layer(). UserContext is extracted from request extensions, never from headers directly. Cedar policies are the only authorization mechanism — no ad-hoc role checks.", + }, + { + claim = "NATS audit subject provides continuous violation detection at runtime", + detail = "provisioning.audit.violation.solid is published on runtime violations. AuditCollector persists these to SurrealDB. 
Violations discovered after deployment are recorded and queryable.", + }, + ], + + consequences = { + positive = [ + "All provider credentials are scoped to Orchestrator's process — no credential leakage path to CLI", + "Task state machine in Orchestrator provides rollback for every provider operation", + "Auth defects are isolated to Control Center — other services cannot accidentally implement auth", + "SOLID violations are caught at the earliest possible layer (usually compile-time or dev-time), not in production", + ], + negative = [ + "Adding a new cloud provider requires changes to Orchestrator only — correct by design but requires understanding the dispatch model", + "The pre-commit hook adds ~200ms to commit time for grep scans", + "CLI cannot query provider state directly — must call Orchestrator API, adding one HTTP hop", + ], + }, + + alternatives_considered = [ + { + option = "Compile-time enforcement only via crate visibility", + why_rejected = "Insufficient for Nushell code which has no compile-time type system. Pre-commit and Claude hooks are needed to cover .nu files where the Rust compiler cannot help.", + }, + { + option = "Documentation + code review process", + why_rejected = "The failure mode this ADR addresses (direct provider CLI calls from Nushell) was introduced despite existing documentation. 
Enforcement must be automatic, not manual.", + }, + ], + + constraints = [ + { + id = "provider-calls-orchestrator-only", + claim = "Provider API calls (hcloud, aws, doctl, upctl) must only exist in the orchestrator crate", + scope = "provisioning/", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'hcloud|aws|doctl|upctl' -g '*.rs' provisioning/ | grep -v 'orchestrator'", expect_exit = 1 }, + rationale = "All provider API calls must flow through the Orchestrator dispatch model to maintain audit trail and rollback capability.", + }, + { + id = "ssh-orchestrator-machines-only", + claim = "SSH operations (russh, ssh2) must only exist in orchestrator and machines crates", + scope = "provisioning/platform/crates/", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'russh|ssh2' -g '*.rs' provisioning/platform/crates/ | grep -v 'orchestrator\\|machines'", expect_exit = 1 }, + rationale = "SSH operations that bypass Orchestrator bypass the task state machine and lose rollback capability and audit trail.", + }, + { + id = "solo-auth-middleware-single-bypass", + claim = "solo_auth_middleware is the only place in the codebase where auth is bypassed; it must be gated behind --mode solo and never used in production routing", + scope = "platform/crates/control-center/src/middleware/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "bypass|skip.*auth|no.*auth", paths = ["platform/crates/"], must_be_empty = true }, + rationale = "A single documented and tested auth bypass is auditable. 
Multiple bypass paths create an audit surface that cannot be systematically verified.", + }, + { + id = "cedar-only-authorization", + claim = "No ad-hoc role checks (if user.roles.contains) in business logic — Cedar policies are the only authorization mechanism", + scope = "platform/crates/", + severity = 'Soft, + check = { tag = 'NuCmd, cmd = "rg 'roles.contains|role ==' -g '*.rs' platform/crates/ | grep -v test", expect_exit = 1 }, + rationale = "Ad-hoc role checks create authorization logic scattered across services that cannot be audited or modified atomically.", + }, + ], + + related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-015-solo-mode-architecture"], + + ontology_check = { + decision_string = "Six hard SOLID boundaries enforced at six independent layers; solo_auth_middleware is the only documented auth bypass; Cedar is the only authorization mechanism", + invariants_at_risk = ["solid-boundaries", "provider-abstraction"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-015-solo-mode-architecture.ncl b/adrs/adr-015-solo-mode-architecture.ncl new file mode 100644 index 0000000..f1336d2 --- /dev/null +++ b/adrs/adr-015-solo-mode-architecture.ncl @@ -0,0 +1,91 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-015", + title = "Solo Mode — Full Architecture with Relaxed Auth", + status = 'Accepted, + date = "2026-02-17", + + context = "The platform must run on a single operator's laptop for local development, testing, and single-operator production deployments. Two options were available: (1) Simplified mode — stripped-down binary bypassing services, writing directly to disk/files, skipping NATS and SurrealDB; (2) Full architecture with relaxed auth — same services, same NATS subjects, same SurrealDB schema, but auth middleware replaced with a no-op that auto-creates an admin session. Option 1 creates two separate code paths: solo vs multi-user. 
Scripts, integrations, and the CLI behave differently per mode. Testing in solo mode cannot validate multi-user behavior. Option 2 preserves a single code path with auth as the only runtime difference.", + + decision = "Solo mode uses the full architecture with relaxed auth. Every service (Orchestrator, Control Center, Vault, Extension Registry, AI/MCP) runs as the same binary with the same NATS subjects and the same SurrealDB schema. Runtime differences: SurrealDB uses embedded RocksDB in solo vs WebSocket server in multi-user; NATS uses nats-server -js child process in solo vs external cluster; auth middleware is solo_auth_middleware (auto-session, no JWT) in solo vs auth_middleware (JWT + Cedar) in multi-user; Vault auto-unseals with local age key in solo vs Shamir threshold or KMS; Cedar default-permits local user in solo vs full policy evaluation. solo_auth_middleware injects fixed UserContext { roles: [admin], mfa_verified: true, user_id: Uuid::nil() } and is gated behind --mode solo runtime flag.", + + rationale = [ + { + claim = "Single code path eliminates solo/multi-user behavioral divergence", + detail = "Any script or integration written for solo mode works in multi-user without modification — only the connection strings change. This makes solo mode a valid staging environment for multi-user behavior.", + }, + { + claim = "solo_auth_middleware is isolated and auditable", + detail = "The auth bypass is in one function, gated behind a runtime flag, explicitly tested. Auditing solo mode auth is a grep away: rg 'solo_auth_middleware'. This is safer than multiple ad-hoc bypasses scattered across services.", + }, + { + claim = "SurrealDB and NATS data persist across restarts in solo mode", + detail = "RocksDB + JetStream storage persist to disk. 
Solo mode is not ephemeral — state survives service restarts, enabling realistic local testing of long-running task scenarios.", + }, + { + claim = "CI can run integration tests against the solo mode harness without external infrastructure", + detail = "The solo mode harness (embedded RocksDB, child nats-server) runs in CI without network or external service dependencies. Full integration test coverage without infrastructure overhead.", + }, + ], + + consequences = { + positive = [ + "Any script or integration written for solo mode works in multi-user without modification — only connection strings change", + "The auth bypass is isolated to one function (solo_auth_middleware) — auditing solo mode auth is a grep away", + "SurrealDB and NATS data persist across restarts in solo mode (RocksDB + JetStream storage to disk)", + "CI can run the full integration test suite against the solo mode harness without external infrastructure", + ], + negative = [ + "Solo mode requires starting three service binaries (vs one monolith) — managed by service-manager.nu", + "The age key on disk is the only credential that bypasses Vault — its path must be chmod 600", + "nats-server must be in $PATH for solo mode startup", + ], + }, + + alternatives_considered = [ + { + option = "Simplified mono-binary for solo, full services for multi-user", + why_rejected = "Creates two code paths. Testing in solo mode does not validate multi-user behavior. Scripts written for solo mode require adaptation for multi-user. Doubles the maintenance surface.", + }, + { + option = "Feature flags at compile time (cfg(solo)) to disable auth", + why_rejected = "Compile-time flags prevent running the same binary in both modes. Deployment would require two separate builds. 
A runtime flag (--mode solo) is more operationally flexible.", + }, + ], + + constraints = [ + { + id = "solo-mode-runtime-flag-only", + claim = "solo_auth_middleware must only be activated via --mode solo runtime flag, never via environment variable or compile-time feature", + scope = "platform/crates/control-center/src/lib.rs", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'solo_auth_middleware' -g '*.rs' platform/ | grep -v '#\\[cfg(test)'", expect_exit = 0 }, + rationale = "A runtime flag is explicit and auditable in process listings. An environment variable or compile-time flag creates an invisible bypass that cannot be detected without reading code or config.", + }, + { + id = "age-key-file-permissions", + claim = "The age key at ${data_dir}/vault/master.age must be created with mode 0600 and must be the only file-based secret in the platform", + scope = "platform/secretumvault/src/solo.rs", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'master.age|vault.*key' -g '*.rs' platform/ | grep -Ev 'chmod|0o600|0600'", expect_exit = 1 }, + rationale = "The age key is the bootstrap secret — the only credential that bypasses Vault. Strict file permissions are the only protection. Any additional file-based secrets would violate the single-secret constraint.", + }, + { + id = "nats-server-child-lifecycle", + claim = "Orchestrator must start nats-server -js as a managed child process with TCP availability wait (10s timeout) and SIGTERM on shutdown", + scope = "platform/crates/orchestrator/src/nats.rs", + severity = 'Soft, + check = { tag = 'Grep, pattern = "nats-server|nats_server", paths = ["platform/crates/orchestrator/"], must_be_empty = false }, + rationale = "Unmanaged nats-server processes leak across service restarts and leave stale JetStream state. 
The 10s TCP wait prevents race conditions between Orchestrator and the NATS server on startup.", + }, + ], + + related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-014-solid-enforcement"], + + ontology_check = { + decision_string = "Solo mode uses full architecture with solo_auth_middleware as the only auth bypass, gated behind --mode solo runtime flag", + invariants_at_risk = ["solid-boundaries"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-016-workspace-formula-dag.ncl b/adrs/adr-016-workspace-formula-dag.ncl new file mode 100644 index 0000000..0e8a474 --- /dev/null +++ b/adrs/adr-016-workspace-formula-dag.ncl @@ -0,0 +1,104 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-016", + title = "Workspace Taskserv Execution as Typed DAGs (Formula Pattern)", + status = 'Accepted, + date = "2026-03-14", + + context = "Workspace server definitions declare taskservs as positional arrays — e.g. `taskservs = [etcd, kubernetes, containerd, cilium]`. The provisioning platform executes these in strict linear order regardless of actual dependencies between tasks. This model has three problems: (1) It cannot express parallelism — `containerd` and `coredns` are independent of each other but are serialized behind `kubernetes`. (2) It cannot express conditional edges — a failed `etcd` should halt `kubernetes` but a failed `coredns` should not. (3) The execution intent is implicit — there is no machine-readable artifact that declares which tasks depend on which, so no validation is possible at schema time. The Orchestrator already implements a full `DependencyGraph` with topological sort and `max_parallel_tasks` in `workflow.rs`, but `batch.rs` was building a linear chain from the positional array, ignoring the graph entirely.", + + decision = "Workspace infrastructure definitions declare taskserv execution order as typed DAGs via a `Formula` Nickel record exported from `schemas/lib/formula.ncl`. 
Each `FormulaNode` carries: `id`, a `TaskServDef` (name, profile, target_save_path), `depends_on: Array FormulaDep` (referential edges by node_id + DepKind), `parallel: Bool`, `on_error: [| Stop | Continue | Retry |]`, and `max_retries: u8`. The Formula is validated at schema time by a custom Nickel contract that checks: no duplicate node IDs, every `depends_on.node_id` references a declared node, every `edges.{from,to}` references a declared node. At runtime, `Formula::from_json` in `formula.rs` deserializes the JSON export and `Formula::into_workflow(FormulaWorkflowConfig)` converts it into a `WorkflowDefinition` fed directly to `BatchWorkflowEngine::execute_workflow`, which runs the existing `DependencyGraph` topological sort with `max_parallel_tasks`. Positional `taskservs` arrays remain valid — they are the per-server composition definition and are retrocompatible. Formulas are an additive artifact in the same `servers.ncl` file.", + + rationale = [ + { + claim = "Schema-time referential integrity catches broken DAGs before deployment", + detail = "The `_Formula` custom Nickel contract validates all `depends_on.node_id` and edge endpoints against the declared `nodes` array. A missing node ID is a typecheck error, not a runtime panic. This enforces the type-safety-nickel axiom on execution topology.", + }, + { + claim = "Parallelism is now explicit and governed", + detail = "Nodes marked `parallel = true` with no shared dependency run concurrently up to `max_parallel`. The control plane formula runs etcd first, then kubernetes, containerd, and coredns in parallel (3 workers), then cilium after k8s+containerd. This halved the estimated provisioning time for a 5-node cluster compared to the linear chain.", + }, + { + claim = "on_error semantics are declarative, not implicit", + detail = "`on_error = 'Stop` halts the entire workflow on node failure (required for etcd, kubernetes). 
`on_error = 'Continue` allows the workflow to proceed past a non-critical failure (coredns can fail without blocking cilium). `on_error = 'Retry` retries up to max_retries times before propagating. Previously all failures were treated as Stop with no way to express Continue.", + }, + { + claim = "Retrocompatible — zero migration cost for existing servers", + detail = "TaskServDef now has `depends_on`, `on_error`, `max_retries` fields with defaults. Existing `servers.ncl` files typecheck unchanged. Formulas are an opt-in additive array alongside the existing `servers` array. Batch.rs preserves the linear execution path when no formula is supplied.", + }, + { + claim = "Single runtime path — the existing DependencyGraph is reused", + detail = "No new execution engine was written. `Formula::into_workflow` produces a standard `WorkflowDefinition` consumed by the existing `BatchWorkflowEngine::execute_workflow`. The DependencyGraph topological sort and parallel dispatch already existed in workflow.rs and were simply never reached via the batch coordinator.", + }, + ], + + consequences = { + positive = [ + "Parallel taskserv execution is now possible and schema-validated", + "DAG structure is a first-class artifact — diffable, auditable, versionable in git", + "on+re reflection mode `provisioning-validate-formula` provides cross-validation (taskserv existence, ConflictsWith, cycle detection)", + "FormulaWorkflowConfig<'a> groups conversion parameters — batch.rs call sites are explicit and lint-clean", + "Ontology node `formula-dag-execution` registers this pattern for on+re governance", + ], + negative = [ + "Two parallel models exist: positional `taskservs` arrays (per-server composition) and `formulas` (execution DAGs). 
Authors must understand the distinction.", + "Formula node IDs are a new namespace within a server definition — ID collisions across formulas in the same file are not currently detected at parse time (only within a single formula).", + "Nickel's custom contract for referential integrity runs at export time, not at typecheck time — `nickel typecheck` alone is insufficient; `nickel export` is required for full validation.", + ], + }, + + alternatives_considered = [ + { + option = "Positional array with dependency annotations as comments", + why_rejected = "Comments are not machine-readable. Cannot be validated, cannot drive runtime parallelism, cannot be consumed by on+re modes. Violates the type-safety-nickel axiom.", + }, + { + option = "Separate formula file per server (e.g. wuji-formula.ncl)", + why_rejected = "Separates declaration from context. The `servers.ncl` file already owns the server definition including its taskservs — the formula belongs alongside it. Import proliferation adds no structural benefit.", + }, + { + option = "Encode DAG as a TOML/YAML file consumed by the Orchestrator", + why_rejected = "Breaks the type-safety-nickel axiom. TOML/YAML have no contracts, no referential integrity, no schema composition. The Formula pattern allows the Nickel schema to own the execution topology, which is where it belongs.", + }, + { + option = "Extend TaskServDef directly with execution metadata (depends_on, on_error) and derive the DAG implicitly", + why_rejected = "Conflates composition (which taskservs a server needs) with orchestration (in what order and how). 
The Formula is a separate, named artifact that can be versioned, validated, and governed independently from the taskserv list.", + }, + ], + + constraints = [ + { + id = "formula-node-ids-unique-within-formula", + claim = "Node IDs must be unique within a single Formula — the custom Nickel contract enforces this at export time", + scope = "schemas/lib/formula.ncl (_Formula contract), workspaces/*/infra/*/servers.ncl", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "nickel export --format json examples/workspaces/basic/servers.ncl 2>/dev/null | jq '[.formulas[].nodes[].id] | group_by(.) | map(select(length > 1)) | length == 0' | grep -q true", expect_exit = 0 }, + rationale = "Duplicate node IDs produce ambiguous depends_on resolution. The contract catches this before the JSON reaches formula.rs.", + }, + { + id = "formula-depends-on-declared-nodes-only", + claim = "Every depends_on.node_id and edge endpoint must reference a declared node in the same formula", + scope = "schemas/lib/formula.ncl (_Formula contract)", + severity = 'Hard, + check = { tag = 'FileExists, path = "schemas/lib/formula.ncl", present = true }, + rationale = "A reference to a non-existent node_id would silently drop the dependency at runtime, producing an incorrect execution order with no error.", + }, + { + id = "formula-runtime-conversion-via-formula-rs-only", + claim = "All Formula-to-WorkflowDefinition conversion must go through Formula::into_workflow — no ad-hoc JSON parsing in batch.rs or elsewhere", + scope = "platform/crates/orchestrator/src/batch.rs, platform/crates/orchestrator/src/formula.rs", + severity = 'Hard, + check = { tag = 'Grep, pattern = "nickel export", paths = ["platform/crates/"], must_be_empty = false }, + rationale = "The FormulaWorkflowConfig struct and into_workflow carry the semantic mapping (task names, arg construction, metadata injection). 
Bypassing it risks silent divergence between schema intent and runtime behavior.", + }, + ], + + ontology_check = { + decision_string = "Workspace taskserv execution topology as typed DAGs via Formula Nickel pattern, converted to WorkflowDefinition at runtime by formula.rs", + invariants_at_risk = ["type-safety-nickel", "config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"], +} diff --git a/adrs/adr-017-typedialog-web-ui.ncl b/adrs/adr-017-typedialog-web-ui.ncl new file mode 100644 index 0000000..6fbd389 --- /dev/null +++ b/adrs/adr-017-typedialog-web-ui.ncl @@ -0,0 +1,78 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-017", + title = "TypeDialog — Schema-Driven Web Form Backend for Workspace Configuration", + status = 'Accepted, + date = "2026-01-08", + + context = "Workspace configuration requires validated user input across multiple fields with interdependencies (e.g. selecting 'kubernetes' deployment enables K8s-specific options). Nushell's `input` command is single-line text only — no validation, no conditional fields, no multi-user collaboration. Nickel is declarative and cannot prompt users interactively. Two options considered: (1) TUI dialogs — Rust-native terminal forms, keyboard-driven, works over SSH; (2) Web form backend — browser-accessible, schema-generated forms, multi-user workflow support.", + + decision = "TypeDialog implements a schema-driven web form backend. Nickel contracts in `.typedialog/provisioning/schemas/` are the single source of truth; form fields are generated directly from schema types and constraints with zero manual form code. The web UI is embedded in the control center dashboard (accessible from any browser) and supports draft → review → approve workflows for team environments. A TUI fallback is available for SSH-only environments. 
Generated config is written as NCL and validated against the same contracts that drove form generation.", + + rationale = [ + { + claim = "Schema drift between form and config is structurally impossible", + detail = "Form fields are generated from Nickel contracts at render time. If a schema changes, the form changes. No manual sync required — the source of truth is a single file.", + }, + { + claim = "Web UI enables multi-user collaborative workflows impossible with TUI", + detail = "Platform engineers, security teams, and dev leads can each access the form from their own browser without SSH. Draft configs persist across sessions. Approval flows are built into the form lifecycle.", + }, + { + claim = "TypeDialog fragments encode valid configuration combinations", + detail = "Fragments (database-postgres, deployment-k8s, etc.) capture the full parameter space for each configuration pattern. The fragment system prevents invalid combinations (e.g. PostgreSQL-specific options appearing for a SQLite config) without runtime validation logic.", + }, + ], + + consequences = { + positive = [ + "Zero manual form code — schema changes propagate to UI automatically", + "Multi-user collaboration via browser; no SSH required", + "Draft config persistence with audit trail of who configured what", + "TypeDialog fragments prevent invalid config combinations structurally", + ], + negative = [ + "Control center must be running for web UI access — SSH-only environments use TUI fallback", + "Form generation requires contract files to be valid NCL — schema errors surface as broken forms, not compile errors", + ], + }, + + alternatives_considered = [ + { + option = "TUI-only (Ratatui forms, keyboard-driven)", + why_rejected = "Single-user, requires interactive terminal, no multi-user collaboration, no draft persistence, no browser access. 
Adequate for solo mode but insufficient for team deployments.", + }, + { + option = "Custom HTML forms maintained separately from schemas", + why_rejected = "Manual maintenance creates drift between form fields and schema types. Every schema change requires a form update. Two sources of truth.", + }, + ], + + constraints = [ + { + id = "schema-is-form-source-of-truth", + claim = "TypeDialog form fields must be generated from Nickel contracts — no manual form field definitions allowed", + scope = ".typedialog/provisioning/schemas/", + severity = 'Hard, + check = { tag = 'FileExists, path = ".typedialog/provisioning/schemas/provisioning-config.ncl", present = true }, + rationale = "Manual form definitions diverge from schemas. The contract is the form.", + }, + { + id = "generated-config-validates-against-contract", + claim = "Config output by TypeDialog must validate against the same schema contracts used for form generation", + scope = ".typedialog/provisioning/generated/", + severity = 'Hard, + check = { tag = 'FileExists, path = ".typedialog/provisioning/generated", present = true }, + rationale = "A form that produces invalid config provides false confidence. 
Validation must be end-to-end.", + }, + ], + + related_adrs = ["adr-013-surrealdb-global-store", "adr-014-solid-enforcement"], + + ontology_check = { + decision_string = "TypeDialog generates web forms from Nickel contracts; schema contracts are the single source of truth for both validation and UI generation", + invariants_at_risk = ["type-safety-nickel"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-018-secretumvault-integration.ncl b/adrs/adr-018-secretumvault-integration.ncl new file mode 100644 index 0000000..de4d5c8 --- /dev/null +++ b/adrs/adr-018-secretumvault-integration.ncl @@ -0,0 +1,87 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-018", + title = "SecretumVault — Dynamic Secrets Complementary to SOPS+Age", + status = 'Accepted, + date = "2026-01-08", + + context = "The platform manages two distinct classes of secrets: (1) static gitops secrets — API keys, TLS certs, SSH keys committed as encrypted SOPS+Age files; (2) dynamic runtime secrets — temporary database passwords, short-lived tokens, auto-rotating credentials. SOPS+Age handles class 1 well but has no concept of TTL, auto-rotation, or access audit trails. HashiCorp Vault handles class 2 but is Go binary (not Rust-native), uses BSL license (not permissive), and uses HCL policies (incompatible with Cedar authorization model).", + + decision = "SecretumVault provides dynamic runtime secrets management. It is embedded in the platform as a Rust-native library (path dependency `../../../Development/secretumvault`) with Cedar policy enforcement, SurrealDB-backed storage, and filesystem backend for solo mode (age key at `${data_dir}/vault/master.age`). SOPS+Age remains for static gitops secrets. SecretumVault complements, not replaces, SOPS.", + + rationale = [ + { + claim = "Rust-native: zero subprocess overhead, same authorization model as the platform", + detail = "SecretumVault is linked as a Rust library. No subprocess, no gRPC, no network hop for secret retrieval. 
Cedar policies for secrets are evaluated by the same Cedar engine used for infrastructure authorization — one policy model, one audit trail.", + }, + { + claim = "HashiCorp Vault rejected: BSL license and HCL policies incompatible with platform constraints", + detail = "BSL license restricts commercial use without a subscription. HCL policies require maintaining a separate policy language alongside Cedar. Neither aligns with the platform's Rust/Cedar/Nickel stack.", + }, + { + claim = "SOPS+Age remains authoritative for static gitops secrets", + detail = "Static secrets (API keys in config, TLS certs) are already gitops-managed via SOPS. Replacing SOPS would break the gitops workflow and add migration risk. SecretumVault handles only runtime-dynamic secrets.", + }, + ], + + consequences = { + positive = [ + "Dynamic credential TTL and auto-rotation for database passwords and tokens", + "Cedar-gated secret access with audit trail — compliance-ready", + "Solo mode uses filesystem age key — no infrastructure dependency for single-operator deployments", + "Consistent authorization model: Cedar governs both infrastructure operations and secret access", + ], + negative = [ + "HA deployment requires 3-node Raft cluster — additional infrastructure for production", + "Age key at `${data_dir}/vault/master.age` is the only bootstrap credential — requires chmod 600 and secure backup", + "Local path dependency — requires secretumvault repo checkout alongside provisioning", + ], + }, + + alternatives_considered = [ + { + option = "HashiCorp Vault", + why_rejected = "BSL license incompatible with open distribution. Go binary introduces subprocess overhead. HCL policies require a second policy language alongside Cedar. No Rust-native integration.", + }, + { + option = "Extend SOPS+Age to handle dynamic secrets", + why_rejected = "SOPS is a file encryption tool, not a secrets engine. TTL, auto-rotation, and audit trails are not concepts SOPS is designed for. 
Extending it would be a reimplementation of a secrets engine with worse UX.", + }, + ], + + constraints = [ + { + id = "sops-and-vault-complementary", + claim = "SOPS+Age handles static gitops secrets; SecretumVault handles runtime dynamic secrets — no overlap", + scope = "platform/secretumvault/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "secretumvault", paths = ["platform/secretumvault/"], must_be_empty = false }, + rationale = "Mixing the two systems creates ambiguity about which is authoritative for a given secret class. Clear separation prevents accidental migration of static secrets into the vault.", + }, + { + id = "cedar-governs-vault-access", + claim = "All SecretumVault read operations must be authorized via Cedar policies — no bypass path", + scope = "platform/secretumvault/src/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "cedar|authorize", paths = ["platform/secretumvault/"], must_be_empty = false }, + rationale = "Cedar is the single authorization point for the platform. Vault access bypassing Cedar creates an unaudited path to secrets.", + }, + { + id = "vault-age-key-permissions", + claim = "The bootstrap age key at vault/master.age must be chmod 600 and must be the only file-based credential", + scope = "platform/secretumvault/src/solo.rs", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "rg 'master.age' --glob '*.rs' platform/ | grep -v '0o600\\|0600'", expect_exit = 1 }, + rationale = "The age key is the bootstrap trust anchor. 
If it is world-readable, the entire vault is compromised.", + }, + ], + + related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"], + + ontology_check = { + decision_string = "SecretumVault provides dynamic runtime secrets with Cedar authorization; SOPS+Age remains for static gitops secrets; complementary, not competing", + invariants_at_risk = ["solid-boundaries"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-019-ai-rag-integration.ncl b/adrs/adr-019-ai-rag-integration.ncl new file mode 100644 index 0000000..1477e0b --- /dev/null +++ b/adrs/adr-019-ai-rag-integration.ncl @@ -0,0 +1,87 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-019", + title = "Schema-Aware AI and RAG — Nickel Contracts Constrain AI Config Generation", + status = 'Accepted, + date = "2026-01-08", + + context = "Infrastructure configuration generation via LLM is unreliable without grounding: generic AI produces plausible but structurally invalid configs (wrong field names, invalid enum values, incompatible option combinations). Two risks: (1) hallucination — AI generates configs that fail schema validation; (2) security — AI agents with unrestricted access to secrets and deployment operations create unaudited paths. The platform has Nickel schemas for all configuration surfaces and Cedar for authorization — both can be used to constrain AI behavior.", + + decision = "AI config generation is constrained by Nickel schemas at generation time and by Cedar policies at authorization time. The ai-service is the HTTP entry point for all AI operations. RAG indexes Nickel schemas, documentation, and past deployments as retrieval context — AI generates WITH schema context, making hallucination structurally harder. Cedar policy forbids ai-service from accessing any secret and requires `context.human_approved == true` before any deployment operation. 
The mcp-server exposes tool calling (nickel_validate, schema_query, best_practices) to LLM agents.", + + rationale = [ + { + claim = "Schema-constrained generation eliminates invalid config hallucination", + detail = "Generic LLMs generate `engine = 'postgresql'` when the contract says `engine | [| 'postgres, 'mysql |]`. Providing the schema as RAG context gives the model the exact valid values. Post-generation nickel export validates the output against the same contract.", + }, + { + claim = "Cedar is the enforcement layer — not prompt engineering", + detail = "Prompting AI to 'not access secrets' is not a security boundary. Cedar policy `forbid(principal == Service::\"ai-service\", action == Action::\"read\", resource in Secret::\"*\")` is enforced at the platform layer regardless of what the LLM requests.", + }, + { + claim = "RAG over project artifacts is more accurate than generic LLM for project-specific configs", + detail = "Indexing `schemas/`, `docs/`, and past successful deployments means AI answers are grounded in actual project patterns — not generic infrastructure knowledge that may conflict with project constraints.", + }, + ], + + consequences = { + positive = [ + "AI cannot generate configs that fail Nickel schema validation — structural correctness enforced", + "Cedar prevents AI from accessing secrets or deploying without human approval", + "RAG over project artifacts reduces hallucination on project-specific options", + "MCP tool calling (nickel_validate, schema_query) enables LLM agents to self-correct", + ], + negative = [ + "RAG index must be kept current as schemas and docs evolve — stale index degrades answer quality", + "ai-service adds a service dependency for all AI-assisted operations", + "Cost tracking required: rate limiting at 60 req/min, 1M tokens/day, $100/day", + ], + }, + + alternatives_considered = [ + { + option = "Generic LLM without schema grounding (GitHub Copilot style)", + why_rejected = "Generates syntactically valid but 
semantically wrong configs — wrong enum values, missing required fields, invalid option combinations. Schema validation must happen after generation and frequently fails.", + }, + { + option = "Fine-tuned model on project schemas", + why_rejected = "Fine-tuning is expensive, requires retraining on every schema change, and does not generalize across projects. RAG is dynamic and always reflects the current schema state.", + }, + ], + + constraints = [ + { + id = "ai-cannot-access-secrets", + claim = "ai-service must have a Cedar policy explicitly forbidding access to any Secret resource", + scope = "platform/crates/control-center/src/policies/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "ai-service.*Secret|Secret.*ai-service", paths = ["platform/"], must_be_empty = false }, + rationale = "AI agents with secret access create unaudited credential exposure. The constraint must be at the authorization layer, not in the LLM prompt.", + }, + { + id = "ai-deployment-requires-human-approval", + claim = "Any deployment action triggered by ai-service must have context.human_approved == true in the Cedar evaluation context", + scope = "platform/crates/orchestrator/src/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "human_approved", paths = ["platform/"], must_be_empty = false }, + rationale = "Autonomous deployment without human review is an unacceptable risk for production infrastructure. 
The approval gate is enforced by Cedar, not by AI self-restraint.", + }, + { + id = "ai-generation-validates-against-schema", + claim = "All AI-generated Nickel configs must be validated via nickel export before being presented to the user or submitted to the orchestrator", + scope = "platform/crates/ai-service/src/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "nickel.*export|nickel_validate", paths = ["platform/crates/ai-service/"], must_be_empty = false }, + rationale = "Post-generation validation closes the loop — if the LLM generates an invalid config despite schema grounding, the user sees a validation error, not a deployment failure.", + }, + ], + + related_adrs = ["adr-014-solid-enforcement", "adr-017-typedialog-web-ui", "adr-018-secretumvault-integration"], + + ontology_check = { + decision_string = "AI config generation is constrained by Nickel schemas (RAG grounding) and Cedar policies (secret isolation, human approval gate)", + invariants_at_risk = ["solid-boundaries", "type-safety-nickel"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-020-extension-capability-declarations.ncl b/adrs/adr-020-extension-capability-declarations.ncl new file mode 100644 index 0000000..64cdf58 --- /dev/null +++ b/adrs/adr-020-extension-capability-declarations.ncl @@ -0,0 +1,103 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-020", + title = "Extension Capability Declarations: provides/requires/conflicts_with Taxonomy", + status = 'Accepted, + date = "2026-04-03", + + context = "ADR-016 introduced typed Formula DAGs for intra-server taskserv execution order. To enable formula dependency resolution at the workspace composition layer (inter-formula DAGs, ADR-021), the Orchestrator needs a machine-readable declaration of what each taskserv produces and what it depends on. 
Without this, a workspace composition DAG cannot validate that a Formula consuming `kubernetes-api-server` has at least one upstream Formula that provides it — the constraint is implicit and unenforced. Ten built-in taskservs existed with only `name/version/description/supported_providers` metadata — no capability declarations.", + + decision = "Every taskserv `metadata.ncl` file must declare three fields: `provides: Array CapabilityEntry` (capabilities this taskserv makes available after successful execution), `requires: Array CapabilityRequirement` (capabilities this taskserv needs from another provider before it can run), and `conflicts_with: Array String` (taskserv names that are mutually exclusive — installing both would produce an irreconcilable conflict). A `CapabilityEntry` carries `id: String` (dot-namespaced, e.g. `kubernetes.api-server`), `kind: CapabilityKind` (`'Service | 'StorageClass | 'NetworkPolicy | 'Runtime | 'CertManager | 'Monitoring | 'Registry | 'DNS | 'Auth`), and `description: String`. A `CapabilityRequirement` carries `capability: String` (the capability `id`), `kind: RequirementKind` (`'Required | 'Optional`), and `description: String`. These fields are validated by `schemas/lib/extension-metadata.ncl` at Nickel typecheck time and audited at runtime by the `provisioning-dag-integrity` reflection mode.", + + rationale = [ + { + claim = "Machine-readable capability declarations enable schema-time resolution validation", + detail = "The `provisioning-dag-integrity` reflection mode cross-checks every `Required` capability in `requires[]` against the set of `provides[].id` values across all taskservs. An unresolved Required capability is a hard error surfaced before any deployment attempt. 
Without typed declarations, this check requires reading code comments or documentation.", + }, + { + claim = "ConflictsWith enforces mutual exclusion at the registry level", + detail = "A taskserv pair in `conflicts_with` that both appear in a Formula is caught by `provisioning-validate-formula`. The registry-level declaration makes conflicts auditable and tool-enforceable — no runtime failure needed to discover an incompatible combination.", + }, + { + claim = "CapabilityKind enum scopes the semantic surface of each capability", + detail = "Using a typed enum (`'Service | 'StorageClass | ...`) rather than free-form strings prevents capability ID sprawl. The DAG resolution query `provisioning catalog capabilities --type Service` is only possible with a bounded kind set.", + }, + { + claim = "Optional vs Required separation allows partial-graph deployments", + detail = "`'Optional` requirements express soft preferences (e.g. coredns optionally uses an upstream DNS). `'Required` requirements are hard blockers. The distinction enables the orchestrator to warn on unresolved Optional capabilities while failing on unresolved Required ones.", + }, + { + claim = "Dot-namespaced capability IDs provide scoping without a global registry", + detail = "IDs like `kubernetes.api-server`, `storage.ceph-block`, `network.cni` are self-documenting and conflict-resistant without requiring a central registry. 
The namespace prefix is the domain (kubernetes, storage, network, container, tls, dns, monitoring, identity).", + }, + ], + + consequences = { + positive = [ + "All 10 built-in taskservs now have typed capability declarations — `provisioning-dag-integrity` runs clean", + "CLI `provisioning catalog capabilities` and `provisioning extensions graph` are powered by these declarations", + "WorkspaceComposition dependency resolution can validate inter-formula capability chains at dag.ncl export time", + "Capability declarations are a first-class artifact — diffable, auditable, versionable in git", + ], + negative = [ + "New taskservs must populate provides/requires/conflicts_with or fail schema validation — increases authoring burden", + "Capability IDs are not validated against a central registry — a typo in `requires[].capability` fails silently if no provider declares the misspelled ID", + "CapabilityKind enum is closed — adding a new kind requires updating `schemas/lib/dag/contracts.ncl` and re-exporting all metadata files that use it", + ], + }, + + alternatives_considered = [ + { + option = "Free-form capability tags (Array String) instead of typed CapabilityEntry", + why_rejected = "Free strings cannot be validated for kind, cannot be queried by type, and cannot carry descriptions. The typed record is required for `provisioning catalog capabilities --type Service` to function and for the `provisioning-dag-integrity` mode to distinguish Required from Optional resolution failures.", + }, + { + option = "Single `capabilities` array with a `direction` discriminant (provides/requires encoded as field)", + why_rejected = "A flat array conflates semantically different operations — providing a capability and requiring one have different validation rules and different consumers. 
Separate `provides` and `requires` arrays make intent explicit and allow independent schema validation.", + }, + { + option = "Encode conflicts_with as capability-level conflicts (A.provides X conflicts with B.provides X)", + why_rejected = "Capability-level conflicts are more granular but harder to author — taskserv authors must reason about every capability pair. Taskserv-level mutual exclusion (`conflicts_with: [\"containerd\"]`) is the correct granularity for installation-time enforcement and maps directly to the package manager mental model.", + }, + { + option = "Central capability registry file (a single capabilities.ncl across all extensions)", + why_rejected = "A central registry creates a write-contention hotspot when multiple extensions are developed in parallel. Distributed declarations in each metadata.ncl, aggregated by the reflection mode and CLI, achieve the same discoverability with independent authoring.", + }, + ], + + constraints = [ + { + id = "capability-ids-dot-namespaced", + claim = "All capability IDs in provides[].id and requires[].capability must use dot-namespaced format: `<domain>.<capability>` (e.g. `kubernetes.api-server`, `storage.ceph-block`)", + scope = "catalog/taskservs/*/metadata.ncl", + severity = 'Hard, + check = { tag = 'Grep, pattern = "id = \"[^.]+\"", paths = ["catalog/taskservs/"], must_be_empty = true }, + rationale = "Flat IDs (no dot) are ambiguous and collision-prone. 
The dot namespace convention is the only disambiguation mechanism without a central registry.", + }, + { + id = "all-taskservs-must-declare-capability-fields", + claim = "Every taskserv metadata.ncl must declare provides, requires, and conflicts_with — even if as empty arrays", + scope = "catalog/taskservs/*/metadata.ncl, schemas/lib/extension-metadata.ncl", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "provisioning catalog capabilities 2>/dev/null | length | test { $in > 0 }", expect_exit = 0 }, + rationale = "Missing fields are caught by the schema contract but also by `provisioning-dag-integrity`. A taskserv without declarations is invisible to capability resolution — it will never be identified as a provider or dependency.", + }, + { + id = "conflicts-with-holds-taskserv-names-not-capability-ids", + claim = "conflicts_with[] must contain taskserv directory names (e.g. `\"containerd\"`), not capability IDs", + scope = "catalog/taskservs/*/metadata.ncl", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "nu -c 'ls catalog/taskservs/ | get name | each { |d| $d | path basename }'", expect_exit = 0 }, + rationale = "The conflict resolution algorithm in `provisioning-validate-formula` looks up taskserv names in the extensions registry. 
Capability IDs in conflicts_with would never match and silently fail to enforce the constraint.", + }, + ], + + ontology_check = { + decision_string = "Extension capability declarations via provides/requires/conflicts_with typed fields in metadata.ncl, validated by extension-metadata schema and provisioning-dag-integrity reflection mode", + invariants_at_risk = ["type-safety-nickel", "config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-016-workspace-formula-dag"], +} diff --git a/adrs/adr-021-workspace-composition-dag.ncl b/adrs/adr-021-workspace-composition-dag.ncl new file mode 100644 index 0000000..924929d --- /dev/null +++ b/adrs/adr-021-workspace-composition-dag.ncl @@ -0,0 +1,117 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-021", + title = "Workspace Composition DAG: Multi-Formula Orchestration and Task Namespacing", + status = 'Accepted, + date = "2026-04-03", + + context = "ADR-016 established the Formula pattern for intra-server taskserv execution as a typed DAG. A workspace may contain multiple servers, each with a Formula defining its taskserv execution order. When a workspace is provisioned end-to-end, the Orchestrator needs to execute these Formulas in dependency order — the storage cluster Formula must wait for the control plane Formula to reach a stable state before Ceph OSD initialization begins. This inter-formula coordination layer was missing: batch.rs executed Formulas sequentially in declaration order with no dependency semantics and no health-gate mechanism to verify cluster readiness between Formula groups.", + + decision = "Workspace infrastructure definitions declare inter-formula execution order in a `dag.ncl` artifact at `{workspace_root}/infra/{infra}/dag.ncl`. 
Each entry in `dag.ncl.composition.formulas[]` is a `FormulaCompositionEntry` carrying: `formula_id` (matching a formula declared in `servers.ncl`), `depends_on: Array FormulaDep` (edges to other formula_ids with a `condition: String`), `parallel: Bool`, and an optional `health_gate: HealthGateConfig` (a shell command + expected output + timeout). At runtime, `WorkspaceComposition::into_workflow` merges all formula WorkflowDefinitions into a single unified WorkflowDefinition: task names are rewritten as `formula_id::task_name` to prevent collisions across formulas, health gates are injected as synthetic `WorkflowTaskDefinition` records (with `metadata[\"type\"] = \"health-gate\"`), and inter-formula edges wire the terminal tasks of an upstream formula to the health gate (if present) or directly to the root tasks of the downstream formula. The merged WorkflowDefinition is executed by the existing `BatchWorkflowEngine::execute_workflow`. NATS events are emitted on `provisioning.dag.formula.{started,completed,failed}` and `provisioning.dag.healthgate.{checking,passed,failed}` (gated by the `nats` Cargo feature).", + + rationale = [ + { + claim = "Single WorkflowDefinition preserves the existing parallel dispatch and dependency resolution", + detail = "The `BatchWorkflowEngine` already implements topological sort and `max_parallel_tasks` dispatch via `DependencyGraph`. Merging all formulas into one WorkflowDefinition reuses this path without a new execution layer. Alternative: run each formula as a separate workflow, sequenced by the batch coordinator — this loses cross-formula parallelism (a worker-plane formula could start concurrently with a storage formula if neither depends on the other).", + }, + { + claim = "formula_id::task_name namespacing is the minimal collision prevention mechanism", + detail = "Both `wuji-cp-0-formula` and `wuji-strg-0-formula` declare a node named `etcd_create`. 
Without namespacing, merging these into one WorkflowDefinition produces duplicate task names, breaking DependencyGraph.find(). The `::` separator was chosen over `.` (conflicts with Nickel field access in debug output) and `/` (ambiguous with path separators in NATS subjects). Formula_id recovery at runtime is `task_name.split_once(\"::\").map(|(fid, _)| fid)`.", + }, + { + claim = "Health gates as synthetic tasks avoid a separate inter-formula synchronization mechanism", + detail = "A health gate that checks `kubectl get nodes --field-selector=... | wc -l` must block the dependent formula's root tasks until the cluster reports the expected node count. Injecting the gate as a `WorkflowTaskDefinition` with `metadata[\"type\"] = \"health-gate\"` reuses the existing task dependency mechanism — no new scheduling primitive is needed. The health gate command runs as a polling loop (up to timeout_seconds) inside `execute_task_with_retry`.", + }, + { + claim = "dag.ncl at infra/{infra}/dag.ncl separates composition from server topology", + detail = "Composition (which formulas run in what order) is a different concern from server topology (which servers exist and what taskservs they run). Keeping dag.ncl separate from servers.ncl allows the composition DAG to evolve (e.g. adding a monitoring formula) without touching the server definitions, and vice versa. The CLI `provisioning dag show/validate/export` resolves dag.ncl independently of servers.ncl.", + }, + { + claim = "NATS event emission on provisioning.dag.* enables live DAG visualization in ontoref-daemon", + detail = "The `ontoref-daemon` DagGraphProvider plugin subscribes to `provisioning.dag.>` to render live formula execution state. The subject hierarchy `provisioning.dag.{formula,healthgate}.{started,completed,failed}` maps directly to the three-layer DAG node types (formula, health gate, task). 
Gating emission behind `#[cfg(feature = \"nats\")]` means the orchestrator compiles and runs without NATS — the feature is additive.", + }, + ], + + consequences = { + positive = [ + "Inter-formula dependency order is a first-class schema artifact — validated by `provisioning-validate-formula` and `validate-topology` reflection mode", + "Health gates enforce cluster readiness between formula groups without ad-hoc sleep loops in provisioning scripts", + "Cross-formula parallelism: independent formulas (no depends_on edge) run concurrently within the same workspace provisioning run", + "NATS events on `provisioning.dag.*` feed live DAG visualization and audit trail", + "formula_id::task_name convention is recoverable — any consumer can extract formula_id from the task name without additional metadata", + ], + negative = [ + "Two DAG artifacts must be kept consistent: formula_ids in dag.ncl must match formula names in servers.ncl — inconsistency causes runtime panic in WorkspaceComposition::from_json (no schema-time cross-file validation)", + "Health gate commands are shell strings — not typed, not validated at schema time. A broken gate command is discovered at runtime when the gate task fails", + "Task names in NATS events and logs are `formula_id::task_name` — tooling must understand the `::` separator to route events correctly", + "Merging N formulas into one WorkflowDefinition means a single task failure with `fail_fast = true` halts all formulas, even those with no dependency on the failing formula", + ], + }, + + alternatives_considered = [ + { + option = "Sequential formula execution in declaration order (existing batch.rs behavior)", + why_rejected = "No parallelism, no dependency semantics, no health gates. The storage formula runs after the worker formula even if workers could have started in parallel. 
Execution order is implicit (declaration order) not explicit (DAG edges).", + }, + { + option = "Separate WorkflowDefinition per formula, coordinated by a top-level orchestrator loop", + why_rejected = "Requires a new scheduling layer above BatchWorkflowEngine. Loses cross-formula parallelism. The existing DependencyGraph already handles the scheduling — merging into one definition reuses it at zero cost.", + }, + { + option = "Health gate as a special Formula node type (within the formula, not between formulas)", + why_rejected = "A health gate that checks cluster-wide readiness (e.g. `kubectl get nodes`) crosses formula boundaries — it is a property of the composition, not of a single formula. Encoding it within a formula would couple the formula to knowledge of other formulas' outputs.", + }, + { + option = "formula_id.task_name (dot separator) for task namespacing", + why_rejected = "Dot conflicts with Nickel field access notation in debug output and log messages. It also conflicts with NATS subject segment separator conventions where dots are meaningful separators. `::` is idiomatic in Rust and unambiguous in all output contexts.", + }, + { + option = "Encode inter-formula edges in servers.ncl alongside the formula declaration", + why_rejected = "servers.ncl owns the server topology — which servers exist, what they run. Composition topology (which formulas execute in which order) is a workspace-level concern that may vary without changing the server definitions. 
A separate dag.ncl allows composition reuse across workspaces that share server topologies.", + }, + ], + + constraints = [ + { + id = "dag-ncl-formula-ids-must-match-servers-ncl", + claim = "Every formula_id in dag.ncl.composition.formulas[] must correspond to a formula declared in the same infra's servers.ncl", + scope = "workspaces/*/infra/*/dag.ncl, workspaces/*/infra/*/servers.ncl", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "provisioning dag validate 2>/dev/null | grep -q 'CROSS_REF_OK'", expect_exit = 0 }, + rationale = "A dag.ncl formula_id with no matching servers.ncl formula causes WorkspaceComposition::from_json to silently skip the formula, producing an incomplete workflow. The validate-topology reflection mode and `provisioning dag validate` enforce this cross-file constraint.", + }, + { + id = "task-namespacing-via-double-colon-only", + claim = "All task names in a composed WorkflowDefinition must use the `formula_id::task_name` format — no other separator is permitted", + scope = "platform/crates/orchestrator/src/formula.rs (WorkspaceComposition::into_workflow), platform/crates/orchestrator/src/workflow.rs", + severity = 'Hard, + check = { tag = 'Grep, pattern = "split_once\\(\"::\"\\)", paths = ["platform/crates/orchestrator/src/"], must_be_empty = false }, + rationale = "The `::` separator is the runtime contract between WorkspaceComposition::into_workflow (producer) and workflow.rs result processing (consumer). Using a different separator in any new composition code would break formula_id extraction and NATS event routing.", + }, + { + id = "health-gate-requires-nonempty-depends-on", + claim = "A FormulaCompositionEntry with a health_gate set must have at least one entry in depends_on", + scope = "workspaces/*/infra/*/dag.ncl, schemas/lib/dag/contracts.ncl", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "nickel export --format json --import-path . 
workspaces/librecloud_renew/infra/wuji/dag.ncl 2>/dev/null | jq -e '[.composition.formulas[] | select(.health_gate != null and (.depends_on | length) == 0)] | length == 0'", expect_exit = 0 }, + rationale = "A health gate with no upstream formula has no terminal tasks to wire the gate to. The gate task gets empty depends_on and runs immediately, defeating its purpose. The dag.nu DOT exporter crashes on this case (`$entry.depends_on.0` index-out-of-bounds when depends_on is empty).", + }, + { + id = "nats-emission-behind-feature-flag", + claim = "All NATS event emission in workflow.rs must be behind `#[cfg(feature = \"nats\")]` — the orchestrator must compile and function without NATS", + scope = "platform/crates/orchestrator/src/workflow.rs, platform/crates/orchestrator/Cargo.toml", + severity = 'Hard, + check = { tag = 'Grep, pattern = "emit_dag_event", paths = ["platform/crates/orchestrator/src/workflow.rs"], must_be_empty = false }, + rationale = "NATS is an optional external dependency. The orchestrator must run in solo mode (no NATS) and in test environments without a running NATS server. 
Unconditional NATS calls would make all workflow tests depend on NATS availability.", + }, + ], + + ontology_check = { + decision_string = "Workspace composition DAG via dag.ncl with formula_id::task_name namespacing, health gates as synthetic tasks, and NATS event emission on provisioning.dag.* subjects", + invariants_at_risk = ["type-safety-nickel", "config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-016-workspace-formula-dag", "adr-020-extension-capability-declarations", "adr-012-nats-event-broker"], +} diff --git a/adrs/adr-022-ncl-sync-daemon.ncl b/adrs/adr-022-ncl-sync-daemon.ncl new file mode 100644 index 0000000..01bdc27 --- /dev/null +++ b/adrs/adr-022-ncl-sync-daemon.ncl @@ -0,0 +1,95 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-022", + title = "ncl-sync: Nickel Configuration Sync Daemon", + status = 'Accepted, + date = "2026-04-16", + + context = "Every `prvng` CLI invocation that reads configuration runs `nickel export --format json` at least once, often multiple times. There are 124 call sites across the Nu codebase; each export costs 2-5s. The Nu module parse cost (~600-1200ms for 345 files) is a separate problem. This plan targets only the Nickel export cost. `lib_provisioning/config/cache/` already existed with the correct API shape (`cache-lookup`, `lookup-nickel-cache`, etc.) but every function was a no-op — the infrastructure was there but never wired to actual storage. Additionally, `nu_plugin_nickel` already implements file-content-based caching (`nickel-eval` command) but lacked `--import-path` support, which is why all 124 call sites used `^nickel export` directly instead of the plugin.", + + decision = "A Rust daemon (`ncl-sync`) compiles NCL to JSON proactively and maintains a shared cache at `~/.cache/provisioning/config-cache/`. 
The daemon and the `nu_plugin_nickel` plugin share a single cache directory and a single key derivation strategy: `SHA256(file_content + sorted_import_paths_joined_by_colon + format)`. This makes the key content-addressed — identical file content produces the same key regardless of path, and the daemon's pre-warmed entries are immediately visible to `nickel-eval` without any coordination protocol. Nu call sites in the hot path replace `^nickel export --format json ... | from json` with `nickel-eval ... --import-path [...]`. For soft-failure call sites (where export failure is acceptable), a `ncl-eval-soft` wrapper in `lib_provisioning/utils/nickel_processor.nu` isolates the single necessary `try/catch` and exposes clean call sites. The daemon is started by `prvng platform start` via `ncl-sync-start` in `service-manager.nu` and stopped by `prvng platform stop`. Nu processes signal re-export needs after mutations by writing `.sync-.json` sidecar files (atomic rename); the daemon drains these every 500ms.", + + rationale = [ + { + claim = "Shared cache dir + content-based key eliminates the need for a socket or IPC between daemon and Nu processes", + detail = "The plugin's `lookup_cache` reads `~/.cache/provisioning/config-cache/.json` directly from disk. The daemon writes to the same path. There is no runtime coordination — the plugin simply finds the file or falls back to direct `nickel export`. Alternative: daemon exposes a Unix socket for reads (prvng-cli daemon plan) — requires Nu processes to know the socket path, handle connection failures, and adds 10-15ms of socket overhead. The file-based approach gives <5ms reads and zero coupling.", + }, + { + claim = "Content-addressed key (SHA256 of file content) is more correct than path+mtime-based key", + detail = "A path+mtime key would falsely invalidate the cache if a file is touched without content change (e.g. `git checkout`, `touch`). 
A content-based key ensures that identical NCL files share a cache entry regardless of path, and that the cache only misses when the file actually changed. The tradeoff is that the key computation requires reading the file — mitigated by the daemon doing this proactively at warm-up rather than on each Nu invocation.", + }, + { + claim = "Extending nu_plugin_nickel with --import-path is the correct fix for the 124 ^nickel call sites", + detail = "The plugin existed precisely for this purpose but lacked `--import-path` support, forcing all provisioning code to use `^nickel export` directly. Adding `--import-path` to `nickel-eval` and `nickel-export` unblocks the migration. The plugin already converts JSON to Nu values natively (eliminating `| from json`), handles caching, and preserves error semantics via `LabeledError`.", + }, + { + claim = "ncl-sync does not require NATS, SurrealDB, or any platform service", + detail = "The daemon watches the filesystem via `notify`, runs `nickel` as a subprocess, and writes JSON files. It has no network dependencies. If ncl-sync depended on NATS to function, it would have a bootstrap circularity: NATS is a platform service whose configuration is described in NCL. A config cache daemon cannot depend on the services whose configuration it caches.", + }, + { + claim = "Nu processes are never writers to the cache directory", + detail = "Single-writer principle: only ncl-sync writes `.json` files to the cache. Nu processes write `.sync-.json` sidecar files as signals to the daemon, then immediately continue execution. The daemon drains sidecars and writes cache entries. 
This prevents concurrent-write corruption of cache files without requiring locks.", + }, + ], + + consequences = { + positive = [ + "prvng component list, workflow list: ~1.5s (from ~3-7s) — Nu module parse only, no nickel export stall", + "prvng deploy: ~3-5s (from ~15-30s) — multiple nickel exports are cache hits", + "Cache survives across prvng invocations — warm-up on platform start amortizes the cost for the whole session", + "nu_plugin_nickel is now usable for all config reads (--import-path gap closed)", + ], + negative = [ + "Nu startup cost (~1.2s module parse) is unaffected — a separate problem", + "First invocation of the day: cache cold until daemon warm-up completes (~500ms-2s)", + "ncl-sync binary must be installed and in PATH for performance benefits; absence degrades gracefully to direct nickel export", + ], + }, + + alternatives_considered = [ + { + option = "prvng-cli daemon: route read-only CLI commands to a separate Rust HTTP server via Unix socket", + why_rejected = "Solves only specific read commands (<100ms), not the general nickel export cost. Adds a second daemon with socket/PID lifecycle. Nu call sites still need output formatting to match Nu tables. Doesn't help operation-path commands that also call nickel export.", + }, + { + option = "Lazy-load Nu modules (refactor main_provisioning/mod.nu)", + why_rejected = "The dispatcher already lazy-loads commands/ subdirectory. The Nu interpreter startup (~200-400ms) is unavoidable regardless. Module parse cost is ~600-1200ms — a real problem but separate from the nickel export stall. This plan targets nickel export; module parse is a future orthogonal improvement.", + }, + { + option = "Nu-side cache with file-mtime check (no daemon)", + why_rejected = "Nu processes are ephemeral — no proactive warming. First command of each session still pays the nickel export cost. Concurrent Nu processes (Makefile, CI) cause cache stampede: multiple processes miss simultaneously and all run nickel export. 
No file watching — cache becomes stale silently after NCL edits.", + }, + { + option = "Separate cache directories for daemon and plugin", + why_rejected = "Requires a coordination protocol (socket, IPC, or manifest polling) so the plugin can find daemon-written entries. The shared-directory approach eliminates coordination entirely — the key derivation IS the coordination protocol.", + }, + ], + + ontology_check = { + decision_string = "ncl-sync Rust daemon + nu_plugin_nickel shared cache at ~/.cache/provisioning/config-cache/ with content-based key SHA256(content+imports+format)", + invariants_at_risk = ["config-driven-always", "type-safety-nickel"], + verdict = 'Safe, + }, + + related_adrs = ["adr-023-ncl-export-wrapper"], + + constraints = [ + { + id = "ncl-sync-single-writer", + claim = "Nu processes NEVER write .json files to the cache directory directly", + scope = "provisioning/core/nulib/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "save.*config-cache.*\\.json", paths = ["provisioning/core/nulib/"], must_be_empty = true }, + rationale = "Single-writer principle: concurrent Nu processes writing cache files would corrupt manifest state and produce partial JSON. Only ncl-sync daemon writes to the cache directory.", + }, + { + id = "ncl-sync-no-platform-services", + claim = "ncl-sync binary must not depend on platform-nats, platform-db, or surrealdb", + scope = "provisioning/platform/crates/ncl-sync/Cargo.toml", + severity = 'Hard, + check = { tag = 'Grep, pattern = "platform-nats|platform-db|surrealdb", paths = ["provisioning/platform/crates/ncl-sync/"], must_be_empty = true }, + rationale = "Bootstrap circularity: NATS and SurrealDB are platform services whose configuration is managed by ncl-sync. 
The daemon cannot depend on services it configures.", + }, + ], +} diff --git a/adrs/adr-023-ncl-export-wrapper.ncl b/adrs/adr-023-ncl-export-wrapper.ncl new file mode 100644 index 0000000..3883969 --- /dev/null +++ b/adrs/adr-023-ncl-export-wrapper.ncl @@ -0,0 +1,73 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-023", + title = "ncl-eval wrapper: nu_plugin_nickel as the single ^nickel export abstraction in Nu", + status = 'Accepted, + date = "2026-04-16", + + context = "After ADR-022 established the ncl-sync daemon and shared cache, Nu call sites needed to be migrated from `^nickel export --format json ... | from json` to the plugin. Two call patterns exist: hard-failure (export failure should propagate as an error — uses `error make`) and soft-failure (export failure should return a fallback value — uses `if $result.exit_code != 0`). Distributing try/catch across 124 call sites would violate the guideline against widespread use of try/catch for Nu plugin commands.", + + decision = "Two wrapper functions in `lib_provisioning/utils/nickel_processor.nu` serve as the single abstraction layer: `ncl-eval [path import_paths]` for hard-failure call sites (error propagates from the plugin directly — no try/catch needed), and `ncl-eval-soft [path import_paths fallback]` for soft-failure call sites (a single try/catch returns `fallback` on any plugin error). Block C1 migrates the four hot-path call sites: `dispatcher.nu` (commands-registry), `components.nu` (comp-ncl-export helper + servers.ncl), `workflow.nu` (wf-ncl-export helper + settings.ncl + state.ncl), `extensions.nu` (metadata.ncl per taskserv). Block C2/C3 cover the remaining operation-path and validation call sites.", + + rationale = [ + { + claim = "ncl-eval-soft isolates the single try/catch to one location", + detail = "In Nu 0.111.0, try/catch is valid for Nu internal commands (including plugins). 
However, dispersing try/catch across dozens of call sites increases cognitive load and creates inconsistency. Centralizing it in ncl-eval-soft means the pattern is reviewed once and applied uniformly. Callers declare intent via the `fallback` parameter (`{}`, `[]`, or `null`) rather than embedding error-handling logic inline.", + }, + { + claim = "ncl-eval (hard-failure) requires no try/catch — plugin LabeledError propagates naturally", + detail = "When `nickel-eval` fails, it raises a `LabeledError` that Nu surfaces as a structured error. This is identical in behavior to `error make { msg: ... }` in the existing code. The call site is simply `ncl-eval $path [$ws $prov]` — one line instead of four. No error handling is needed because the error propagation is the correct behavior.", + }, + { + claim = "nickel-eval returns Nu values natively, eliminating | from json", + detail = "The plugin converts `serde_json::Value` to `nu_protocol::Value` via `json_value_to_nu_value`. Call sites receive a Nu record or list directly and can use cell path access (`$data.components`, `$data.dimensions`) without an intermediate string parse step. 
This removes a class of parse errors where `from json` would fail on empty stdout from a cached result.", + }, + ], + + consequences = { + positive = [ + "Hot-path call sites (4 files, C1) are now cache-backed via nu_plugin_nickel", + "Single try/catch location for soft-failure pattern — easy to audit", + "| from json eliminated from migrated call sites", + ], + negative = [ + "nu_plugin_nickel must be registered in the Nu session for performance benefits; unregistered sessions fall back to the `^nickel export` path in ncl-eval-soft (via the catch branch)", + "Block C2/C3 (remaining 120 call sites) are not yet migrated — those paths still use ^nickel export directly", + ], + }, + + alternatives_considered = [ + { + option = "Wrap each call site individually with do { } | complete (existing pattern)", + why_rejected = "Works only for external commands, not for Nu plugin commands. Plugin commands raise LabeledError — not catchable via complete. Keeping ^nickel export at call sites means all cache benefits are lost.", + }, + { + option = "Single ncl-export.nu wrapper delegating to ^nickel export with inline cache check", + why_rejected = "Duplicates the cache logic already inside nu_plugin_nickel. Two cache implementations with different key strategies would diverge. The plugin is the correct cache owner — the wrapper should delegate to it.", + }, + { + option = "Migrate all 124 call sites at once", + why_rejected = "Risk surface too large. 
Priority-ordered migration (C1 hot-path first) allows validating cache correctness on the most-exercised paths before touching validation, bootstrap, and diagnostic paths that are harder to test.", + }, + ], + + ontology_check = { + decision_string = "ncl-eval + ncl-eval-soft wrappers in nickel_processor.nu replace ^nickel export at hot-path call sites; single try/catch in ncl-eval-soft", + invariants_at_risk = ["config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-022-ncl-sync-daemon"], + + constraints = [ + { + id = "c1-no-direct-nickel-export", + claim = "Hot-path files (C1) must not contain direct ^nickel export calls after migration", + scope = "dispatcher.nu, components.nu, workflow.nu, extensions.nu", + severity = 'Hard, + check = { tag = 'Grep, pattern = "\\^nickel export", paths = ["provisioning/core/nulib/main_provisioning/dispatcher.nu", "provisioning/core/nulib/main_provisioning/components.nu", "provisioning/core/nulib/main_provisioning/workflow.nu", "provisioning/core/nulib/main_provisioning/extensions.nu"], must_be_empty = true }, + rationale = "Direct ^nickel export in C1 files bypasses the plugin cache, negating the performance benefit of ADR-022. All C1 exports must go through ncl-eval or ncl-eval-soft.", + }, + ], +} diff --git a/adrs/adr-024-ncl-sync-nats-events.ncl b/adrs/adr-024-ncl-sync-nats-events.ncl new file mode 100644 index 0000000..59ebf77 --- /dev/null +++ b/adrs/adr-024-ncl-sync-nats-events.ncl @@ -0,0 +1,99 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-024", + title = "ncl-sync: Event-driven cache invalidation via NATS", + status = 'Accepted, + date = "2026-04-17", + + context = "ADR-022 established the ncl-sync daemon with a file watcher (notify) as the automatic invalidation mechanism. ADR-022 also defined an explicit sync-request sidecar written by Nu processes (state-write). 
Both mechanisms have limitations: the file watcher has a debounce window (~100ms) where cache can be momentarily stale, and sync-request polling adds 500ms latency. The orchestrator (Rust) writes state files from a separate process — it cannot easily participate in the file-watcher's same-process events, and requiring it to write sync-request sidecars would couple it to ncl-sync's internal protocol. NATS is already used by the orchestrator for DAG events (`provisioning.dag.*`) — extending it for cache invalidation is a natural fit.", + + decision = "ncl-sync gains an optional NATS subscriber behind the `nats` Cargo feature (default-enabled). The subscriber listens on two subjects: `provisioning.workspace.ncl.changed` (file modified) and `provisioning.workspace.ncl.removed` (file deleted). Payload is a JSON object `{workspace, path, import_paths, source}`. On receipt, the subscriber validates that `workspace` matches its watched workspace, then calls `export_ncl` or `evict` directly — bypassing the file-watcher debounce and the sync-request poll. Cache is refreshed in <15ms vs ~100ms (watcher) or ~500ms (sidecar). The mechanism is opt-in via `ncl_sync.nats.enabled = true` in the config — without NATS, the daemon runs identically to before (watcher + sidecar fallback).", + + rationale = [ + { + claim = "NATS subscriber complements rather than replaces the file watcher", + detail = "Three invalidation mechanisms now exist with different failure characteristics: (1) file watcher — always active, catches any write including manual edits, ~100ms latency; (2) sync-request sidecar — written by Nu state-write, catches Nu-originated writes, ~500ms latency; (3) NATS events — written by any publisher, zero coupling to filesystem, <15ms latency. Each covers a different failure mode: watcher catches untracked writers, sidecar catches Nu writers, NATS catches Rust writers. 
Redundancy is intentional — duplicate events are idempotent (same cache_key, same content).", + }, + { + claim = "Workspace validation prevents cross-daemon interference", + detail = "Multiple ncl-sync daemons may run (one per workspace). All subscribe to the same subject hierarchy. The subscriber canonicalizes both its watched workspace path and the event's workspace path; only events matching its workspace are processed. This allows NATS events to fan out to all relevant daemons without coordination.", + }, + { + claim = "Subject hierarchy matches the workspace event model, not the orchestrator DAG model", + detail = "`provisioning.dag.*` subjects are about workflow execution. `provisioning.workspace.ncl.*` subjects are about configuration state. Keeping them separate lets ncl-sync subscribe narrowly (two subjects) without parsing unrelated events. Future publishers (installer, backup restore, etc.) use the same namespace.", + }, + { + claim = "Cargo feature flag keeps NATS optional", + detail = "`default = [\"nats\"]` enables NATS in release builds. `cargo build --no-default-features` produces a binary without async-nats linkage — useful for minimal containers, air-gapped environments, or testing. The config field `ncl_sync.nats.enabled` is an additional runtime gate independent of the compile-time feature.", + }, + ], + + consequences = { + positive = [ + "Orchestrator-driven state mutations invalidate cache in <15ms (vs ~100ms via file watcher)", + "Zero coupling between orchestrator and ncl-sync — only the subject contract is shared", + "Other subscribers (dashboard UI, audit log) can watch the same subjects without touching ncl-sync", + "Redundant with watcher+sidecar — graceful degradation if NATS is down", + ], + negative = [ + "Adds ~6MB to ncl-sync binary size (async-nats + dependencies)", + "NATS must be running before ncl-sync connects (but failure is non-fatal — falls back to watcher)", + "Publishers (orchestrator, etc.) 
must be updated to emit the new subjects — until then, NATS layer has no effect", + ], + }, + + alternatives_considered = [ + { + option = "Single mechanism: file watcher only", + why_rejected = "Misses the ~100ms debounce window. For interactive CLI this is fine; for rapid orchestrator-driven state changes (deploy with many state updates), the cache can lag.", + }, + { + option = "Single mechanism: NATS only", + why_rejected = "Hard dependency on NATS — ncl-sync fails if NATS isn't running. Manual NCL edits (user opens editor) wouldn't be caught. File watcher must remain as baseline.", + }, + { + option = "HTTP endpoint on ncl-sync for invalidation", + why_rejected = "Requires every publisher to know the daemon's Unix socket or HTTP port. NATS decouples publishers from subscribers.", + }, + { + option = "Reuse provisioning.dag.* subjects", + why_rejected = "DAG events are about workflow state, not config state. Overloading the subject hierarchy would force ncl-sync to filter noisy events it doesn't care about.", + }, + ], + + ontology_check = { + decision_string = "ncl-sync adds opt-in NATS subscriber on provisioning.workspace.ncl.{changed,removed} for event-driven cache invalidation; watcher + sidecar remain as fallback", + invariants_at_risk = ["config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-022-ncl-sync-daemon", "adr-023-ncl-export-wrapper"], + + constraints = [ + { + id = "ncl-sync-nats-optional", + claim = "NATS subscriber must be an optional Cargo feature, and runtime-gated by config", + scope = "provisioning/platform/crates/ncl-sync/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "cfg\\(feature = \"nats\"\\)|#\\[cfg\\(feature = \"nats\"\\)\\]", paths = ["provisioning/platform/crates/ncl-sync/src/"], must_be_empty = false }, + rationale = "Air-gapped environments, minimal containers, and testing scenarios require ncl-sync to build and run without NATS. 
Removing the feature flag would violate this.", + }, + { + id = "ncl-sync-nats-fallback", + claim = "NATS connection failure must be non-fatal — daemon continues with watcher + sidecar", + scope = "provisioning/platform/crates/ncl-sync/src/main.rs", + severity = 'Hard, + check = { tag = 'Grep, pattern = "tracing::warn", paths = ["provisioning/platform/crates/ncl-sync/src/main.rs"], must_be_empty = false }, + rationale = "Hard dependency on NATS would break the workspace-local, zero-platform-service guarantee from ADR-022.", + }, + { + id = "ncl-sync-workspace-scope", + claim = "Subscriber must filter events by workspace — only process events matching its watched workspace", + scope = "provisioning/platform/crates/ncl-sync/src/nats_subscriber.rs", + severity = 'Hard, + check = { tag = 'Grep, pattern = "workspace_matches", paths = ["provisioning/platform/crates/ncl-sync/src/nats_subscriber.rs"], must_be_empty = false }, + rationale = "Multiple ncl-sync daemons share the subject namespace. Without filtering, daemon A would process events for workspace B's cache.", + }, + ], +} diff --git a/adrs/adr-026-nulib-restructure.ncl b/adrs/adr-026-nulib-restructure.ncl new file mode 100644 index 0000000..35d8412 --- /dev/null +++ b/adrs/adr-026-nulib-restructure.ncl @@ -0,0 +1,83 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-026", + title = "nulib M6 Restructure: 6-Layer ADR-025-Compliant Module Tree", + status = 'Accepted, + date = "2026-04-18", + + context = "ADR-025 mandated empty root mod.nu, selective imports, and bash-direct dispatch but deferred the file layout question: the existing `lib_provisioning/` and `main_provisioning/` directories were flat accretions with no enforced layer contracts. `lib_provisioning/` contained 242 files spanning primitives (string utils, logging), platform concerns (SOPS, KMS, SSH), domain logic (workspace state, orchestrator queries), and CLI command handlers — all in one directory. 
`main_provisioning/` held per-command Nu scripts with mixed dependency depth. Cross-layer violations were undetectable: a primitive utility could `use` a domain module without any structural signal. The ADR-025 pre-commit hook checked for star-imports but could not enforce dependency direction. The result was a codebase where adding a new utility required navigating 242 flat files and guessing import depth.", + + decision = "Reorganize provisioning/core/nulib/ into a strict 6-layer tree with one-directional dependency flow: primitives/ → tools/ → platform/ → domain/ → orchestration/ → cli/. Each layer may only import from layers below it; violations are detectable by grep on import paths. The migration uses the strangler-fig pattern: (1) move real implementation to the new tree; (2) leave a transition shim in the original location (`# Transition shim (ADR-026 M6)` as first line); (3) update external callers to the new path. All 242 lib_provisioning/ files are either moved to the new tree or archived to .wrks/core_nulib/shimmed/ when they have zero callers. The shim layer is a pure re-export façade: `export use new/path.nu *`. config/accessor is placed in platform/ (not domain/) because it already depended on platform/target.nu — layer placement follows actual dependency topology, not naming intuition.", + + rationale = [ + { + claim = "Flat directories are structurally unenforceable — layer violations are invisible at review time", + detail = "In a 242-file flat directory, `use lib_provisioning/utils/settings.nu [fn]` and `use lib_provisioning/domain/workspace/state.nu [fn]` look identical to a reviewer. The first is a primitives import; the second crosses a domain boundary. Without directory structure that encodes layer, pre-commit hooks can only check for star-imports, not dependency direction. 
The 6-layer tree makes violations visible: any `use platform/X` inside `primitives/Y` is a directory-level signal that something is wrong.", + }, + { + claim = "Strangler-fig migration preserves working code during the transition", + detail = "A big-bang migration of 242 files would require updating all callers atomically. With shims, each file moves independently: the shim at the old path keeps callers working until they are individually updated. This decouples the migration from caller updates and allows incremental validation. The shim marker (`# Transition shim (ADR-026 M6)`) enables bulk identification via grep for post-migration cleanup.", + }, + { + claim = "config/accessor belongs in platform/, not domain/ — discovered empirically", + detail = "The original plan placed config/accessor in domain/ because it felt like business-layer logic. During migration, it was found that config/accessor/core.nu already imported platform/target.nu. Placing it in domain/ would have required domain/ → platform/ imports, a layer violation in reverse. Moving it to platform/ eliminated all cross-layer violations from primitives/ and platform/ simultaneously and was the correct structural choice — layer assignment must follow actual dependency topology.", + }, + { + claim = "Archiving zero-caller shims to .wrks/ preserves history without polluting the live tree", + detail = "Of the 242 lib_provisioning/ files, a significant fraction had zero external callers — they were either dead code or superseded by newer implementations. Deleting them would lose their history; keeping them in the live tree would require maintaining shims forever. 
.wrks/core_nulib/shimmed/ is the designated archive for these files: not in git history, not in the live module tree, but recoverable if a caller is discovered later.", + }, + ], + + consequences = { + positive = [ + "Layer violations are detectable by grep on import path prefixes — no AST tooling required", + "New files have an obvious home: a string utility goes in primitives/, a SOPS wrapper in platform/, a workspace query in domain/", + "Shim layer enables incremental caller migration without breaking the working tree at any point", + "242 files reduced to a structured 6-layer tree with clear ownership boundaries", + ], + negative = [ + "Shims must be explicitly removed once all callers migrate — they are migration debt, not permanent architecture", + "The .wrks/core_nulib/shimmed/ archive is outside git tracking; files there are recoverable only from the local filesystem", + "Contributors must learn the 6-layer contract; the ADR-025 pre-commit hook alone does not enforce layer direction", + ], + }, + + alternatives_considered = [ + { + option = "Keep lib_provisioning/ flat and enforce layers via naming convention (lib_primitives_, lib_platform_, etc.)", + why_rejected = "Naming conventions degrade under refactoring pressure. A file renamed from lib_platform_foo to just foo loses the signal. Directory structure is enforced by the filesystem and grep; naming is enforced only by discipline.", + }, + { + option = "Big-bang migration: move all 242 files and update all callers atomically", + why_rejected = "The caller surface spans provisioning/core/nulib/, provisioning/extensions/, provisioning/platform/crates/ (Nushell test scripts), and workspaces/. Updating all callers atomically requires a multi-day coordinated change that cannot be validated incrementally. A single broken caller would fail the entire migration. 
Strangler-fig allows per-file validation.", + }, + ], + + constraints = [ + { + id = "layer-import-direction", + claim = "Files in primitives/ must not import from tools/, platform/, domain/, orchestration/, or cli/. Files in tools/ must not import from platform/ or above. The rule extends transitively up each layer.", + scope = "provisioning/core/nulib/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "^use (tools|platform|domain|orchestration|cli)/", paths = ["provisioning/core/nulib/primitives/"], must_be_empty = true }, + rationale = "One-directional dependency flow is the architectural guarantee of the 6-layer tree. Without it, the tree is a cosmetic rename of the flat directory.", + }, + { + id = "shim-marker-required", + claim = "Every transition shim in lib_provisioning/ or main_provisioning/ must have `# Transition shim (ADR-026 M6)` as its first line", + scope = "provisioning/core/nulib/lib_provisioning/, provisioning/core/nulib/main_provisioning/", + severity = 'Soft, + check = { tag = 'Manual, description = "grep -rL 'Transition shim' provisioning/core/nulib/lib_provisioning/ — must list only empty mod.nu files" }, + rationale = "The marker enables bulk identification of shims for post-migration cleanup. 
Without it, shims are indistinguishable from real implementations by file content alone.", + }, + ], + + ontology_check = { + decision_string = "Reorganize provisioning/core/nulib/ into 6-layer tree (primitives/tools/platform/domain/orchestration/cli/) with strangler-fig migration and shim layer at lib_provisioning/", + invariants_at_risk = ["config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-025-unified-lazy-loading"], +} diff --git a/adrs/adr-027-prvng-cli-daemon.ncl b/adrs/adr-027-prvng-cli-daemon.ncl new file mode 100644 index 0000000..851575b --- /dev/null +++ b/adrs/adr-027-prvng-cli-daemon.ncl @@ -0,0 +1,104 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-027", + title = "prvng-cli: Unix-Socket Registry Daemon Eliminating Nu Startup Cost Per Invocation", + status = 'Accepted, + date = "2026-04-18", + + context = "Every `prvng` invocation runs `_validate_command` in the bash wrapper, which determines whether a command exists in the registry and whether it requires the orchestrator daemon. The baseline implementation had two paths: (1) `nickel export` to rebuild the JSON cache (~2-5s, only on cache miss), and (2) pure bash grep over the JSON cache to extract `found` and `requires_daemon`. The bash grep path has a correctness flaw: `grep -o '\"[a-zA-Z0-9_\\-\\+\\.]*\"'` extracts every quoted string in the JSON file, not just command names, so a description substring matching the query command produces a false positive. Additionally, as the registry grows, a grep+sed window extraction over a multi-kilobyte JSON file for each invocation adds unneeded I/O. ADR-022 (ncl-sync) already established the pattern of using a lightweight Rust daemon for operations that benefit from persistent in-memory state. The registry is a read-heavy, write-rare structure — exactly the right profile for an in-memory cache behind a Unix socket.", + + decision = "Implement `prvng-cli` as a standalone Rust binary in `platform/crates/prvng-cli/`. 
The binary: (1) reads `~/.cache/provisioning/commands-registry.json` at startup into a `HashMap` indexed by both canonical name and all aliases; (2) listens on a Unix domain socket at `~/.local/share/provisioning/cli.sock`; (3) serves JSON-framed lookup requests with newline termination; (4) watches the JSON cache file via the `notify` crate and hot-reloads the index on any `Modify` or `Create` event without restarting; (5) shuts down automatically after 60s of idle — the bash wrapper restarts it on demand. The bash wrapper gains three functions: `_ensure_cli_daemon` (starts the binary if the socket is absent, waits up to 300ms for the socket to appear), `_cli_query` (sends `{\"op\":\"lookup\",\"command\":\"\"}` via `nc -U -w 1`, parses `found` and `requires_daemon` from the response), and a three-tier `_validate_command` (socket → bash grep JSON cache → Nu script fallback). The binary is a workspace member of `platform/Cargo.toml` with no dependency on Nushell, SurrealDB, NATS, or any platform service.", + + rationale = [ + { + claim = "The in-memory HashMap indexed by both canonical name and aliases eliminates the bash grep false-positive problem", + detail = "The HashMap is built at load time by `Registry::into_index()`: for each CommandEntry, the canonical name is inserted as a key, then each alias is inserted as an additional key pointing to the same entry. A lookup on `\"h\"` returns the `help` entry without scanning the description or any other field. The bash grep approach extracted every quoted string from the JSON, meaning a description containing `\"h\"` (e.g., as part of another word) would have matched. The HashMap provides O(1) exact lookup with no false positives.", + }, + { + claim = "Unix socket with newline-framed JSON is simpler and lower-latency than HTTP for same-host IPC", + detail = "HTTP adds header parsing, keep-alive negotiation, and TCP stack overhead. 
For a query that returns ~200 bytes, the round-trip overhead of HTTP is larger than the payload. Unix domain sockets bypass the network stack entirely and are available on all Unix targets. `nc -U -w 1` is universally available on macOS and Linux without additional tooling. The newline frame is parseable by any shell with `grep -o '\"field\":value'` — no JSON parser required in the caller.", + }, + { + claim = "notify-based hot reload eliminates the need to restart the daemon after `nickel export` updates the cache", + detail = "The workflow for a registry change is: edit `commands-registry.ncl` → `nickel export` writes the JSON cache → the file watcher detects the write event → the daemon reloads the HashMap in-place. No socket downtime, no bash logic to detect staleness, no version negotiation. The watcher monitors the parent directory with `RecursiveMode::NonRecursive` to catch atomic writes (where editors write to a temp file then rename into place, which does not trigger a `Modify` on the original path but does trigger `Create` on the canonical path).", + }, + { + claim = "Idle shutdown at 60s keeps resource usage zero between `prvng` invocations", + detail = "The daemon is not a long-running service — it is an on-demand cache server. On a developer workstation, `prvng` may not be invoked for hours. A daemon that runs continuously would hold an open file descriptor on the socket and consume memory permanently. The 60s idle timeout means the daemon self-terminates after a session of commands, and `_ensure_cli_daemon` restarts it on the next invocation. 
The restart cost is ~100ms (binary start + HashMap load for 40 commands); this is amortized across all commands in a session.", + }, + { + claim = "Three-tier fallback in `_validate_command` preserves correctness when the daemon is unavailable", + detail = "The socket path can fail in three ways: the binary is not installed, the daemon is starting (race condition during `_ensure_cli_daemon`), or `nc` returns no output. The fallback chain is: socket (fast, correct) → bash grep on JSON cache (fast, has false-positive risk but handles 99% of cases) → Nu script (slow, always correct). The Nu fallback is the pre-ADR-027 behavior; it is retained as the last resort to ensure `_validate_command` never hard-fails due to daemon absence.", + }, + ], + + consequences = { + positive = [ + "`_validate_command` completes in <5ms when the daemon is running vs ~50-200ms for bash grep+sed", + "Registry lookup correctness: HashMap indexed by exact name/alias, no substring false positives", + "Hot reload: `nickel export` → daemon reloads automatically, no restart needed", + "Zero resource usage between sessions: idle shutdown at 60s", + "No additional system dependencies: `nc` (netcat) is present on all macOS/Linux targets", + ], + negative = [ + "Up to 300ms cold-start latency on first invocation after idle shutdown (the `_ensure_cli_daemon` socket-wait bound; typical restart is ~100ms) — amortized across the session but visible on the very first `prvng` call", + "`nc -U -w 1` behavior differs between GNU netcat (`-q 1`) and BSD netcat (`-w 1`) — the bash wrapper must use `-w 1` for macOS compatibility", + "The binary must be installed to `~/.local/share/provisioning/bin/prvng-cli` before `_ensure_cli_daemon` can start it; the installer script must include this step", + ], + }, + + alternatives_considered = [ + { + option = "Keep pure bash grep over JSON cache as the only validation path", + why_rejected = "False-positive risk: grep extracts every quoted token from the JSON, not just command names. 
For 40 commands with aliases and descriptions, the extracted token list contains ~300 strings. A description containing a common word that matches an input typo would suppress the 'unknown command' error. Correctness requires exact-name matching against the command/alias fields only.", + }, + { + option = "Reuse the existing `provisioning-daemon` (platform/crates/daemon/) for registry queries", + why_rejected = "The provisioning-daemon is a full platform service: SurrealDB, NATS, auth middleware, provider APIs. It requires the orchestrator infrastructure to be running and is not designed for sub-millisecond local queries. Starting it solely for registry lookup is architectural misuse. ADR-022's ncl-sync daemon established the correct pattern: a separate binary scoped to one responsibility.", + }, + { + option = "HTTP server on localhost instead of Unix socket", + why_rejected = "HTTP requires a port allocation, adds TCP stack overhead, and exposes the registry to other processes on the host. Unix sockets are file-permission-controlled, zero-overhead, and already the established IPC pattern for this codebase (the orchestrator uses WebSocket-over-Unix-socket for SurrealDB embedded mode).", + }, + { + option = "Shared memory or mmap for the registry index", + why_rejected = "Requires either a file-format contract for the serialized HashMap or a memory-mapped file with a custom reader. `nc`-over-Unix-socket is implementable in bash with one line; mmap requires a dedicated reader binary or Nushell plugin. 
The complexity gain is negative: the index is 40 entries and fits in a single cache line of JSON.", + }, + ], + + constraints = [ + { + id = "prvng-cli-no-platform-deps", + claim = "platform/crates/prvng-cli/Cargo.toml must not depend on nushell, surrealdb, async-nats, platform-config, service-clients, or any crate that transitively requires them", + scope = "platform/crates/prvng-cli/", + severity = 'Hard, + check = { tag = 'Manual, description = "cargo tree -p prvng-cli | grep -E 'nushell|surrealdb|async-nats' — must be empty" }, + rationale = "prvng-cli is a lightweight daemon with a single responsibility: serve registry lookups. Platform service dependencies would pull in rustls version conflicts (nushell pins rustls=0.23.28; surrealdb requires ^0.23.36) and increase binary size by 10-50x. Keeping it dependency-minimal ensures it builds fast and stays buildable independently of the platform workspace conflicts.", + }, + { + id = "socket-path-via-xdg", + claim = "The socket path must be derived from XDG_DATA_HOME (defaulting to ~/.local/share), never hardcoded", + scope = "platform/crates/prvng-cli/src/main.rs, provisioning/core/cli/provisioning", + severity = 'Hard, + check = { tag = 'Grep, pattern = "\\.local/share/provisioning/cli\\.sock", paths = ["platform/crates/prvng-cli/src/main.rs"], must_be_empty = true }, + rationale = "Hardcoded paths break in NixOS, container environments, and CI runners where HOME may not exist or XDG_DATA_HOME points elsewhere. 
The PRVNG_CLI_SOCKET environment variable allows per-invocation override for testing.", + }, + { + id = "bsd-nc-compatibility", + claim = "All `nc` invocations in provisioning/core/cli/provisioning must use `-w 1` for timeout, never `-q 1`", + scope = "provisioning/core/cli/provisioning", + severity = 'Hard, + check = { tag = 'Grep, pattern = "nc.*-q", paths = ["provisioning/core/cli/provisioning"], must_be_empty = true }, + rationale = "macOS ships BSD netcat which does not implement `-q` (GNU netcat timeout flag). BSD netcat uses `-w` for connection timeout. Using `-q 1` causes nc to exit with an error on macOS, making `_cli_query` always fail and fall through to the bash grep path, silently degrading to the pre-ADR-027 behavior.", + }, + ], + + ontology_check = { + decision_string = "Rust Unix-socket daemon serving in-memory HashMap registry lookups with file-watcher hot-reload and 60s idle shutdown; bash wrapper gains three-tier _validate_command (socket → grep → Nu)", + invariants_at_risk = ["config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-022-ncl-sync-daemon", "adr-025-unified-lazy-loading", "adr-028-daemon-target-registry-field"], +} diff --git a/adrs/adr-028-daemon-target-registry-field.ncl b/adrs/adr-028-daemon-target-registry-field.ncl new file mode 100644 index 0000000..2da2f57 --- /dev/null +++ b/adrs/adr-028-daemon-target-registry-field.ncl @@ -0,0 +1,85 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-028", + title = "daemon_target: Registry Field for CLI Query Routing", + status = 'Accepted, + date = "2026-04-18", + + context = "The commands-registry.ncl schema had `requires_daemon: Bool` — a binary flag indicating whether the orchestrator daemon must be running before the command can execute. This is a runtime precondition check, not a routing directive. 
As prvng-cli (ADR-027) introduces a second daemon (the registry query daemon) and the orchestrator remains a third service, the question of which system should serve a given CLI query becomes a distinct concern from whether that query's command needs the orchestrator running at execution time. A future fourth service (e.g., an AI assistant backend) might need to handle `prvng ai` queries. Without an explicit routing field in the registry, the bash wrapper must embed routing logic as ad-hoc case statements that drift out of sync with the registry. The registry is already the authoritative source of truth for command metadata; routing belongs there.", + + decision = "Add `daemon_target` as an enum field to the CommandEntry schema with three values: `none` (command is handled locally by Nu thin handlers), `cli` (command's query should be routed to the prvng-cli Unix socket), `orchestrator` (command's query should be routed to the orchestrator service on port 9011). Default is `none`. The field is added to: (1) `schemas/commands_registry/schema.ncl` as `daemon_target | std.enum.TagOrString | [| 'none, 'cli, 'orchestrator |]`; (2) `schemas/commands_registry/defaults.ncl` as `daemon_target | default = 'none`; (3) `platform/crates/prvng-cli/src/registry.rs` as `DaemonTarget` enum with serde `rename_all = \"lowercase\"` and `Default = None`; (4) `LookupResult` in registry.rs as `daemon_target: Option<DaemonTarget>` serialized in every socket response. The bash wrapper reads `daemon_target` from the socket response but does not yet act on it — the field is present for forward compatibility, enabling future routing without a schema migration.", + + rationale = [ + { + claim = "Routing intent belongs in the registry, not in bash case statements", + detail = "The current bash wrapper routes commands via hard-coded case branches. Adding a new service requires editing the bash wrapper in two places: the dispatch block and the `_validate_command` daemon check. 
With `daemon_target` in the registry, routing is data: a new service is a new enum variant, and the bash wrapper reads the variant rather than containing the routing logic. This is the configuration-driven principle applied to service dispatch.", + }, + { + claim = "`daemon_target` and `requires_daemon` are orthogonal — both must exist", + detail = "`requires_daemon: Bool` answers 'does this command need the orchestrator running at execution time?' — it is a precondition for command execution. `daemon_target` answers 'which service should handle the CLI query for this command?' — it is a routing directive. A command can have `requires_daemon = true` (needs orchestrator to execute) and `daemon_target = none` (CLI query is handled locally by Nu). A command could have `daemon_target = orchestrator` (the orchestrator itself handles the lookup) with `requires_daemon = false` (but this is currently unused). Conflating them would require complex boolean combinations to express future routing needs.", + }, + { + claim = "Enum over TagOrString allows gradual adoption without schema breakage", + detail = "`std.enum.TagOrString` is Nickel's mechanism for enums that also accept plain strings. This means existing JSON consumers that read `daemon_target` as a string continue to work. Future enum variants (e.g., `'ai-service`) can be added to the schema without forcing all existing entries to be updated — they continue to deserialize as `none` via the Rust `Default` impl.", + }, + { + claim = "Returning `daemon_target` in every LookupResult socket response costs ~20 bytes and adds zero latency", + detail = "The socket response is already a JSON object. Adding `\"daemon_target\":\"none\"` to the 40-command registry adds ~20 bytes per response. At Unix socket speeds (loopback, no copy), this is below measurement threshold. 
Omitting the field from responses would require schema versioning if it is added later; including it from the start avoids that migration.", + }, + ], + + consequences = { + positive = [ + "Future services can be added to the routing table by adding an enum variant — no bash wrapper changes required for the routing data layer", + "Registry is the single source of truth for both command metadata and routing intent", + "`LookupResult` carries routing information, enabling smart clients (IDE plugins, MCP tools) to route queries without duplicating registry logic", + ], + negative = [ + "The bash wrapper currently ignores `daemon_target` from the socket response — it reads `requires_daemon` only. Acting on `daemon_target` requires a future bash wrapper change that maps `daemon_target=orchestrator` to an HTTP/WebSocket call to port 9011 instead of local Nu execution.", + "Adding a new `daemon_target` variant requires: (1) schema.ncl update, (2) Rust enum update + rebuild, (3) re-export of commands-registry.json. The schema and Rust must stay in sync manually — there is no codegen.", + ], + }, + + alternatives_considered = [ + { + option = "Use `requires_daemon` as a proxy for routing — orchestrator-requiring commands route to orchestrator", + why_rejected = "The semantics differ. `requires_daemon = true` means the command cannot execute without the orchestrator — it does not mean the orchestrator should handle the CLI query. A future command might need the orchestrator for data but want its query interface served by prvng-cli (e.g., cached orchestrator state). Overloading `requires_daemon` would require a boolean override field for these cases, which is worse than having a dedicated routing field.", + }, + { + option = "Encode routing in command naming convention (prefix: `orch:workspace`, `cli:help`)", + why_rejected = "Naming conventions require parser logic in every consumer and break when commands are renamed. 
A dedicated schema field is strongly typed, validated by `nickel typecheck`, and queryable by grep without special parsing.", + }, + { + option = "Add routing to a separate registry file (routing-registry.ncl)", + why_rejected = "Two registries for the same command set creates synchronization debt: adding a command requires editing both files, and a mismatch is not detectable without running both through a diff tool. The registry is already the authoritative command list; routing is command metadata and belongs in the same record.", + }, + ], + + constraints = [ + { + id = "daemon-target-rust-enum-in-sync", + claim = "The DaemonTarget enum in platform/crates/prvng-cli/src/registry.rs must contain exactly the variants declared in schemas/commands_registry/schema.ncl: None, Cli, Orchestrator", + scope = "platform/crates/prvng-cli/src/registry.rs, provisioning/schemas/commands_registry/schema.ncl", + severity = 'Hard, + check = { tag = 'Manual, description = "grep -c 'None\\|Cli\\|Orchestrator' platform/crates/prvng-cli/src/registry.rs — must equal 3; grep TagOrString provisioning/schemas/commands_registry/schema.ncl — must find daemon_target line" }, + rationale = "A variant in the schema with no Rust counterpart causes serde deserialization to fail at runtime on any registry entry using the new variant. Manual sync is required until codegen is available.", + }, + { + id = "daemon-target-default-none", + claim = "All commands in commands-registry.ncl that do not explicitly set daemon_target must resolve to `none` via the schema default", + scope = "provisioning/schemas/commands_registry/defaults.ncl", + severity = 'Hard, + check = { tag = 'Grep, pattern = "daemon_target.*default.*none", paths = ["provisioning/schemas/commands_registry/defaults.ncl"], must_be_empty = false }, + rationale = "The default ensures backward compatibility: existing registry entries without daemon_target are valid and route locally. 
A missing default would make the field required and break all existing make_command calls.", + }, + ], + + ontology_check = { + decision_string = "Add daemon_target enum field (none|cli|orchestrator) to CommandEntry schema and LookupResult for forward-compatible CLI query routing without conflating with requires_daemon precondition", + invariants_at_risk = ["config-driven-always", "type-safety-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-027-prvng-cli-daemon"], +} diff --git a/adrs/adr-029-smart-interface-unification.ncl b/adrs/adr-029-smart-interface-unification.ncl new file mode 100644 index 0000000..b3b3ae9 --- /dev/null +++ b/adrs/adr-029-smart-interface-unification.ncl @@ -0,0 +1,127 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-029", + title = "Smart Interface Unification: CLI ↔ HTTP ↔ MCP via Shared Registry", + status = 'Accepted, + date = "2026-04-19", + + context = "Before this decision the provisioning platform exposed three user-facing surfaces — the Nushell CLI (`provisioning ...`), the MCP stdio server (`crates/mcp-server`), and the future admin HTTP UI — as three independent codebases. Each had its own dispatch logic, its own parameter validation, and its own response formatting. A single operation like `workspace list` was implemented once in Nushell for the CLI and once as a `simple_main.rs` MCP tool with separate logic. The admin UI was pending because there was no shared backend it could consume. This divergence was already causing drift: `provision_cluster_create` in MCP accepted a different parameter shape than `provisioning cluster create` in the CLI, and neither agreed with the orchestrator's HTTP POST body. 
The user's irrenunciable requirement was ontoref-style synchronization — one operation, one semantics, three surfaces — without forcing any surface to depend on the others (CLI must work offline; MCP stdio must not require an HTTP daemon; admin UI must not embed the CLI).", + + decision = "Introduce a four-crate layered architecture: (1) `provisioning-core` is a pure library exposing the `Tool` trait and `Registry`; all 37 operations are implemented as `impl Tool` inside it. (2) `provisioning-tool` is a thin CLI binary that instantiates the Registry and exposes `list`/`schema`/`invoke` over stdout JSON. (3) `provisioning-daemon` is an Axum HTTP+NATS server that wraps the Registry with JWT+RBAC middleware, domain-state tracking, a config-file watcher, an embedded admin UI, and Tera ontology templates. (4) `mcp-server` is reimplemented internally as a JSON-RPC 2.0 dispatcher over the same Registry, consumed via `McpServer::handle_request` for in-process tests and via stdin/stdout for the MCP protocol. The Nushell CLI uses a three-tier fallback chain (`platform/clients/fallback.nu::tool-call`): tier 1 is the HTTP daemon if reachable; tier 2 is the `provisioning-tool` child process; tier 3 is the caller-supplied Nushell legacy closure. A G3 contract test (`crates/contract-tests`) asserts that the same tool invoked through all three surfaces produces semantically equivalent payloads after envelope normalisation and validates each tier's output against a shared JSON Schema. 
An `.ontoref/config.ncl` hook (`domain_daemon`) declares provisioning as an external domain so ontoref-daemon can delegate `provisioning.*` ontology queries without provisioning importing any ontoref crates.", + + rationale = [ + { + claim = "A shared Rust library is the only architecture that gives autonomy + sync simultaneously", + detail = "The three surfaces have incompatible runtime models: the CLI can run without any long-running process, MCP stdio cannot share a process with an HTTP server (stdio hijacks stdin/stdout), and the admin UI requires a persistent backend. A shared service (daemon-only) forces the CLI to depend on the daemon — breaks autonomy. A shared protocol (REST-only) forces MCP to wrap HTTP — breaks stdio's contract. A shared library is the only option where each surface instantiates Registry independently and dispatches identically. Autonomy is structural; sync is guaranteed by construction because the dispatch code is literally the same function call.", + }, + { + claim = "The three-tier fallback keeps CLI hardcoded offline-first", + detail = "The user's current workflow is `provisioning workspace list` on a laptop with no daemon running. Tier 3 (Nushell legacy closure) preserves that behavior indefinitely. Tier 1 (HTTP daemon) opportunistically accelerates when the daemon is up — lets multi-developer setups cache Registry state. Tier 2 (provisioning-tool child) is the bridge: it reuses the Rust Registry but spawns a fresh process, so operations don't require a daemon yet also don't reimplement logic in Nushell. The chain is checked at call time, not configuration time, so the user never manages daemon state — it either works faster or it works the same.", + }, + { + claim = "G3 contract test converts 'sync irrenunciable' into a CI invariant", + detail = "Without G3, the three surfaces would drift silently as new tools are added. 
G3 asserts that for each fixture tool, all three tiers produce the same normalised payload and the same error code. This is structural: the test doesn't know which tier is 'right' — it knows they must agree. If a future change to the HTTP envelope breaks parity with MCP, CI fails. If a new error variant is added to ToolError but not mapped in `routes.rs::tool_error_code` or `registry_server.rs::tool_error_to_rpc`, the G3 error-code tests catch it. The contract cost is one integration test crate; the insurance is architectural.", + }, + { + claim = "Ontoref federation via config hook, not crate dependency", + detail = "Earlier plan revisions had provisioning-daemon depending on `ontoref-ontology` and `ontoref-derive` crates. This would force provisioning's release cadence onto ontoref's and vice versa. The `domain_daemon` config hook in `.ontoref/config.ncl` inverts the dependency: provisioning declares its HTTP URL and ontology endpoints; ontoref-daemon reads this config and delegates. provisioning has zero compile-time ontoref deps. The coupling is runtime, one-directional, and can be disabled by setting `domain_daemon.required = false` (the default).", + }, + { + claim = "37 tools, not 45+ as originally planned", + detail = "A0 inventory revealed 37 actual tools in mcp-server (7 provision_*, 5 guidance_*, 7 installer_*, 17 legacy infra, 1 ai_query). The remaining 'tools' counted in early plans were enum values for taskservs (cicd, coredns, grafana…), not operations. 
Renaming to `<resource>_<action>` (workspace_list, server_create, dag_show) preserves the 37 operations under cleaner names since no external MCP consumers exist yet.", + }, + ], + + consequences = { + positive = [ + "Adding a new operation is a single `impl Tool` in provisioning-core — it appears in all three surfaces at once without surface-specific code", + "The admin UI is unblocked: it calls the same HTTP API the CLI uses, consuming the same Registry responses", + "MCP stdio and HTTP daemon can be deployed or disabled independently without affecting the CLI's offline workflow", + "G3 contract test catches silent drift at CI time instead of production", + "Schema is generated once by `Tool::schema()` and consumed by tools/list (MCP), GET /api/v1/tools (HTTP), and `provisioning-tool schema <tool>` (CLI) — no duplicate JSON Schema files", + "`--fmt text|json|yaml|toml|md` and `--clip` global CLI flags replace the scattered `--format`, `--output`, `--json` per-handler options", + ], + negative = [ + "The Nushell legacy branch (tier 3) must be maintained until every handler is migrated to the fallback chain — currently only `workspace list` is wired; the other 36 operations still call Nushell legacy directly", + "Adding a tool now requires Rust compilation — faster iteration is lost versus the previous 'edit a Nushell file, reload' pattern. Mitigated by `cargo watch -x 'build -p provisioning-daemon'` during development", + "The fallback chain incurs up to two failed probes (daemon ping + `which provisioning-tool`) before falling through to tier 3 on cold offline use. Latency measured at ~50ms on macOS — acceptable but not zero", + "G3 can only assert semantic equivalence on payloads it can normalise. Fields not listed in `normalise()` (trace_id/timestamp/etc.) could still mask real divergence if an unknown volatile field is introduced. 
Mitigated by reviewing the normaliser when any new metadata field is added", + "The mcp-server binary `provisioning-mcp-server` still exists alongside `prov-mcp` (the new Registry-backed binary) during migration. Users must be told which to use", + ], + }, + + alternatives_considered = [ + { + option = "Single binary with feature flags for CLI/HTTP/MCP surfaces", + why_rejected = "stdio hijack (MCP) and persistent HTTP server are incompatible runtime modes in one process without complex flag matrices. The feature-flag model also bloats binary size — every CLI user ships the full HTTP server. The separate-binary model with shared library gives the same code-reuse guarantee without the runtime coupling.", + }, + { + option = "Ship only the daemon — CLI becomes a thin HTTP client", + why_rejected = "The user's current workflow is CLI-first and offline-first. Requiring a daemon would regress the unsurprising property that `provisioning workspace list` works with no running services. Autonomy was listed as irrenunciable in the A0 decisions.", + }, + { + option = "Keep mcp-server and CLI as independent codebases, add the daemon as a third", + why_rejected = "Sync irrenunciable fails. Every new operation would need implementation in three places, and divergence was already observable (parameter shape mismatches between MCP tools and CLI handlers). Adding a third surface would multiply drift rather than fix it.", + }, + { + option = "Use MCP stdio as the 'backend' — HTTP daemon and CLI would invoke MCP internally", + why_rejected = "MCP is a client-server protocol designed for stdin/stdout framing. Using it as an internal backend forces the HTTP daemon to spawn and manage an MCP subprocess for every request — adding latency and serialisation overhead — and couples the daemon's availability to MCP protocol versioning. 
A shared library avoids both issues.", + }, + { + option = "Use ontoref-ontology crate as the ontology source for provisioning-daemon", + why_rejected = "Compile-time dependency on ontoref would force coordinated releases and embed ontoref's SurrealDB+schema choices into provisioning's build. The `domain_daemon` config hook achieves delegation with no crate coupling — provisioning owns its domain ontology; ontoref-daemon discovers and delegates at runtime.", + }, + ], + + constraints = [ + { + id = "registry-sole-dispatch-path", + claim = "All three surfaces (CLI via provisioning-tool, HTTP via provisioning-daemon, MCP via mcp-server) must invoke operations through Registry::invoke — no surface may bypass the Registry with direct tool instantiation", + scope = "platform/crates/provisioning-tool, platform/crates/provisioning-daemon, platform/crates/mcp-server", + severity = 'Hard, + check = { tag = 'Grep, pattern = "Tool::invoke|tool\\.invoke\\(", paths = ["platform/crates/provisioning-tool/src", "platform/crates/provisioning-daemon/src", "platform/crates/mcp-server/src"], must_be_empty = true }, + rationale = "A surface that bypasses the Registry makes the G3 contract test meaningless for that operation because the shared dispatch path is not exercised. Enforcing Registry::invoke keeps the three surfaces contractually equivalent.", + }, + { + id = "g3-contract-test-must-pass", + claim = "The contract-tests crate must pass with 5 tests: listing agreement, echo agreement, invalid-param error agreement, failing-tool error agreement, and tools/list count agreement", + scope = "platform/crates/contract-tests", + severity = 'Hard, + check = { tag = 'NuCmd, cmd = "cargo test -p contract-tests --manifest-path platform/Cargo.toml", expect_exit = 0 }, + rationale = "G3 is the mechanism that converts sync-irrenunciable into an architectural invariant. 
A failing G3 means one surface has silently diverged from the others.", + }, + { + id = "nushell-fallback-legacy-closure-required", + claim = "Every call to tool-call / tool-list in Nushell must pass an explicit legacy closure — not a stub, not an error, but a working Nushell-native implementation", + scope = "provisioning/core/nulib/domain", + severity = 'Hard, + check = { tag = 'Grep, pattern = "tool-call|tool-list", paths = ["provisioning/core/nulib/domain"], must_be_empty = false }, + rationale = "Tier 3 is the offline-first guarantee. If the legacy closure errors or is empty, the fallback chain breaks when the daemon is down and provisioning-tool is not installed. This is the retirement gate: tier 3 can only be removed per-operation after G3 passes for that operation.", + }, + { + id = "mcp-dispatch-exposed-via-handle-request", + claim = "McpServer must expose `pub async fn handle_request(Value) -> Value` — the in-process entry point used by G3 contract tests", + scope = "platform/crates/mcp-server/src/registry_server.rs", + severity = 'Hard, + check = { tag = 'Grep, pattern = "pub async fn handle_request", paths = ["platform/crates/mcp-server/src/registry_server.rs"], must_be_empty = false }, + rationale = "Without handle_request the G3 MCP tier would require spawning a subprocess with pipes — brittle under concurrent test execution. 
Keeping handle_request public is a testability contract.", + }, + { + id = "ontoref-zero-crate-dependency", + claim = "provisioning workspace Cargo.toml must not contain ontoref-* path dependencies or the `ai` feature flag enabling them at the workspace level", + scope = "provisioning/platform/Cargo.toml, provisioning/platform/crates/provisioning-core/Cargo.toml, provisioning/platform/crates/provisioning-daemon/Cargo.toml", + severity = 'Soft, + check = { tag = 'Grep, pattern = "ontoref-ontology|ontoref-derive", paths = ["provisioning/platform/crates/provisioning-core", "provisioning/platform/crates/provisioning-daemon"], must_be_empty = true }, + rationale = "Coupling to ontoref crates inverts the delegation model: the decision is that provisioning's .ontoref/config.ncl declares a domain_daemon hook, and ontoref-daemon discovers it. provisioning must not import ontoref.", + }, + ], + + ontology_check = { + decision_string = "Unify CLI+HTTP+MCP surfaces on a shared provisioning-core Registry with a three-tier fallback in Nushell, JWT+RBAC middleware only at the HTTP layer, G3 contract test asserting semantic parity, and ontoref federation via config hook instead of crate dependency", + invariants_at_risk = ["config-driven-always", "type-safety-always", "solid-boundaries"], + verdict = 'Safe, + }, + + related_adrs = ["adr-014-solid-enforcement", "adr-022-ncl-sync-daemon", "adr-025-unified-lazy-loading", "adr-026-nulib-restructure", "adr-027-prvng-cli-daemon", "adr-028-daemon-target-registry-field"], +} diff --git a/adrs/adr-030-platform-crate-naming.ncl b/adrs/adr-030-platform-crate-naming.ncl new file mode 100644 index 0000000..fd45a4e --- /dev/null +++ b/adrs/adr-030-platform-crate-naming.ncl @@ -0,0 +1,90 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-030", + title = "Platform Workspace Crate Naming Convention", + status = 'Accepted, + date = "2026-04-19", + + context = "The platform workspace accumulated three inconsistent naming patterns: 
(1) shared platform libraries had a `platform-` prefix (`platform-config`, `platform-nats`, `platform-db`); (2) the smart interface layer had a `provisioning-` prefix (`provisioning-core`, `provisioning-tool`, `provisioning-daemon`); (3) everything else had no prefix at all (`rag`, `mcp-server`, `service-clients`, `observability`, `machines`, `backup`, `encrypt`). The no-prefix group caused three concrete problems: (a) `cargo build -p rag` would become ambiguous in the workspace resolver if a second `rag` dependency ever appeared; (b) `rag = { workspace = true }` in dependency declarations gave no indication which project the dep belonged to; (c) `observability` would clash with an identically-named crate on crates.io if the crate ever needed publishing. The binary names (set independently via `[[bin]]`) were already consistent — all used `provisioning-` prefix — so the inconsistency was purely at the Cargo package name level.", + + decision = "Apply a four-rule naming convention across all workspace crates. Rule 1 — Shared platform libraries: `platform-{name}` package name; Rust crate name defaults to `platform_{name}`. Rule 2 — Smart interface layer: `provisioning-{name}` package name and binary name. Rule 3 — Service binaries: short package name (`orchestrator`, `vault-service`, etc.); binary name carries the `provisioning-` prefix via `[[bin]] name = 'provisioning-{name}'`. Rule 4 — Ecosystem crates with many existing `use` callers: `platform-{name}` package name + `[lib] name = 'old_name'` to preserve the Rust crate name across all existing `use` statements and doc tests without modifying call sites. 
Crates that are never declared as dependencies (service binaries) are exempt from the package name prefix because their Cargo identifier is only used in `cargo build -p ` invocations, never in `[dependencies]` sections.", + + rationale = [ + { + claim = "Service binaries do not need a prefix because they are never declared as dependencies", + detail = "The workspace resolver needs unique package names when packages are referenced as dependencies. Service binaries (orchestrator, control-center, vault-service, ai-service, extension-registry, ncl-sync) are leaf nodes — nothing in the workspace has `orchestrator = { ... }` in its `[dependencies]`. Their package name is only used in `cargo build -p orchestrator` and `cargo check -p orchestrator` invocations, where the directory context already disambiguates. Adding a `provisioning-` prefix would increase keystroke cost without adding disambiguation value. The binary name (the output artifact) already carries the prefix via `[[bin]]`.", + }, + { + claim = "Ecosystem crates preserve Rust crate names via [lib] name to avoid touching 30+ doc test locations", + detail = "The ecosystem crates (machines, observability, backup, encrypt) use their crate name extensively in `///` and `//!` doc comment code examples that are compiled as doc tests. For `encrypt` alone, 25+ `use encrypt::` occurrences appear across `src/` and `examples/`. Changing the Rust crate name would require updating every one. The `[lib] name = 'old_name'` field in Cargo.toml decouples the package name (used by the workspace resolver) from the crate name (used by `use` statements). 
This preserves all existing Rust code, all doc tests, and all example files unchanged while making the package names consistent in `Cargo.toml` dependency declarations.", + }, + { + claim = "platform-rag and provisioning-mcp required only Cargo.toml changes because they already had custom lib and binary names", + detail = "`rag` already had `[lib] name = 'provisioning_rag'` and `[[bin]] name = 'provisioning-rag'` — both Rust names were already correct. Only the `[package] name` field and workspace dep key needed updating. Similarly, `mcp-server` had `[lib] name = 'provisioning_mcp_server'` and two `[[bin]]` entries. Renaming these packages to `platform-rag` and `provisioning-mcp` was a pure Cargo identity change with zero impact on Rust compilation or binary output.", + }, + { + claim = "service-clients required Rust use statement updates because it had no custom lib name", + detail = "Unlike the ecosystem crates, `service-clients` had no `[lib] name` override. Its Rust crate name was `service_clients` (derived from the package name by replacing hyphens with underscores). Renaming the package to `platform-clients` changes the default crate name to `platform_clients`. There were only three call sites in active crates: two in `provisioning-core/src/sources/ssh.rs` and one in `orchestrator/src/ssh/key_deployer.rs`. 
Updating three files was less friction than adding a `[lib] name = 'service_clients'` that would permanently diverge the package name from the Rust crate name.", + }, + ], + + consequences = { + positive = [ + "Every library crate declared as a `[dependencies]` entry now carries a `platform-` or `provisioning-` prefix — the project affiliation is unambiguous in any Cargo.toml context", + "The workspace resolver cannot silently select the wrong crate if an external dependency named `rag`, `observability`, or `machines` appears in the dependency tree", + "Binary output names are unchanged — no deployment scripts, systemd units, or Application Support paths require updates", + "NCL config keys (`rag = { ... }`, `mcp-server = { ... }`) are service identifiers unrelated to Cargo package names — they are unchanged", + "Ecosystem crate Rust code (doc tests, examples, use statements) compiles without modification", + "cargo check --workspace passes immediately after the rename", + ], + negative = [ + "The package name and Rust crate name are now different for the four ecosystem crates — `platform-machines` in Cargo.toml but `use machines::` in Rust. This is a supported Cargo feature but requires contributors to know about `[lib] name`", + "Cargo.lock contains the new package names — any tooling that parses Cargo.lock by package name (dashboards, audit tools) needs to be updated if it references the old names", + ], + }, + + alternatives_considered = [ + { + option = "Add provisioning- prefix to all service binary package names", + why_rejected = "Service binaries are never declared as dependencies — the prefix adds no disambiguation value. `cargo build -p provisioning-orchestrator` is longer than `cargo build -p orchestrator` with no benefit. 
The binary output already uses `provisioning-orchestrator` via `[[bin]]`.", + }, + { + option = "Add [lib] name = 'service_clients' to platform-clients instead of updating use statements", + why_rejected = "There were only three call sites. Adding a divergent lib name permanently embeds a naming inconsistency in the codebase. Updating three files is the right call at this scale. If there had been 30+ call sites the decision would have been different.", + }, + { + option = "Rename ecosystem crates and update all use statements", + why_rejected = "encrypt/src/ alone has 25+ doc test use statements across 6 files plus 3 examples. The work is mechanical but creates a large diff with no behavioral change. [lib] name achieves the same Cargo-level disambiguation with a one-line addition per crate.", + }, + { + option = "Keep the status quo — no rename", + why_rejected = "The status quo had three inconsistent naming patterns in the same workspace. `cargo tree` output was confusing; dep declarations in Cargo.toml files were ambiguous about project affiliation; crates.io collision risk existed for generic names. 
The inconsistency was a maintenance friction that compounds with each new crate added.", + }, + ], + + constraints = [ + { + id = "new-library-crates-need-platform-prefix", + claim = "Any new library crate added to the platform workspace that will be declared as a dependency must use the platform- or provisioning- prefix in its [package] name", + scope = "platform/Cargo.toml members, platform/crates/, platform/prov-ecosystem/crates/", + severity = 'Soft, + check = { tag = 'NuCmd, cmd = "glob 'provisioning/platform/crates/*/Cargo.toml' | each {|f| open $f | get package.name } | where { not ($in =~ 'platform-|provisioning-|orchestrator|control-center|vault-service|ai-service|extension-registry|ncl-sync|contract-tests|prvng-cli') } | if ($in | is-empty) { exit 0 } else { print $in; exit 1 }", expect_exit = 0 }, + rationale = "The naming convention is only useful if it is consistently applied to new crates. A new crate named 'cache' or 'metrics' has the same disambiguation problem the renamed crates had.", + }, + { + id = "service-binary-package-names-stay-short", + claim = "Service binary package names (leaf nodes never declared as deps) must NOT get a provisioning- prefix — short name only, prefix lives in [[bin]] name", + scope = "platform/crates/orchestrator, platform/crates/control-center, platform/crates/vault-service, platform/crates/ai-service, platform/crates/extension-registry, platform/crates/ncl-sync", + severity = 'Soft, + check = { tag = 'Grep, pattern = "orchestrator\\s*=|control-center\\s*=|vault-service\\s*=|ai-service\\s*=|extension-registry\\s*=|ncl-sync\\s*=", paths = ["provisioning/platform/Cargo.toml"], must_be_empty = true }, + rationale = "Adding a prefix to service binary package names would break existing cargo build -p muscle memory and CI scripts without adding any correctness benefit. 
The rule is: prefix where disambiguation matters (dep declarations), not where it is only cosmetic (package name for leaf binaries).", + }, + ], + + ontology_check = { + decision_string = "Rename platform workspace crates to apply a coherent naming convention", + invariants_at_risk = ["config-driven-always"], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-031-unified-component-cli.ncl b/adrs/adr-031-unified-component-cli.ncl new file mode 100644 index 0000000..293a383 --- /dev/null +++ b/adrs/adr-031-unified-component-cli.ncl @@ -0,0 +1,48 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-031", + title = "Unified Component CLI: prvng component with Polymorphic Mode Dispatch", + status = 'Accepted, + date = "2026-04-19", + + context = "Before this decision the CLI exposed two separate command hierarchies for infrastructure lifecycle: `prvng taskserv {op}` for taskserv-mode components and `prvng component list|show|info` for read-only introspection. Write operations (install, delete, update, reinstall, restart, backup, restore) were routed through `taskserv.nu` regardless of the component's actual deploy mode (taskserv/cluster/container). This created three problems: (1) a cluster-mode component like postgresql required `prvng taskserv create postgresql` even though it runs as a Kubernetes deployment — the semantics were wrong and confused operators; (2) the script resolution used only the Tier-1 `install-{name}.sh` convention — Tier-2 per-op scripts (`delete-{name}.sh`, `backup-{name}.sh`) were never invoked; (3) the orchestrator endpoint `/workflows/taskserv/create` accepted a `TaskservWorkflow` body that embedded no mode information, so the worker had no way to route script execution by mode. A related gap also existed: no precondition gate validated that capability providers (e.g. 
k0s for cluster-mode, NFS for democratic_csi) were healthy before enqueuing a write.", + + decision = "Replace `prvng taskserv` with a unified `prvng component {op}` command that dispatches polymorphically by deploy mode. The implementation has five parts: (1) A new orchestrator endpoint `/api/v1/workflows/component/{op}` accepts `ComponentWorkflow` (workspace + infra + component + server + namespace + ssh_user + ssh_key_path + settings + check_mode + provisioning). The `{op}` path segment is validated against a fixed allowlist (install, delete, update, reinstall, restart, backup, restore, check-updates for write; status, health, list, show for read). (2) A precondition gate (`src/preconditions.rs`) runs before task enqueue for write ops: it fast-fails on `.provisioning-state.ncl` terminal states (failed/error), then runs live SSH probes via the system `ssh` binary with a 15-second global timeout. The gate is skipped for read-only ops and for taskserv-mode components (root provider, no dependencies). (3) Script resolution is upgraded to two tiers: Tier 2 (`{op}-{name}.sh`) is preferred; Tier 1 (`install-{name}.sh` + `CMD_TASK={op}` env) is the fallback. `cluster_deploy.nu` calls `get-component-script-path` from `extensions/discovery.nu` and merges `CMD_TASK` only for Tier-1 scripts. (4) The NuShell CLI adds eight exported lifecycle commands to `cli/components.nu` (install/delete/update/reinstall/restart/backup/restore/check-updates) and two new functions to `platform/clients/orchestrator.nu` (`orch-submit-component`, `orch-wait-task`). (5) A feature flag `orchestrator.features.enable_component_endpoint` (default: true) allows a one-line rollback to 404 without binary redeployment.", + + rationale = [ + { + claim = "Single endpoint with path-param op is cleaner than one endpoint per operation", + detail = "The alternative was `/api/v1/workflows/component/install`, `/api/v1/workflows/component/delete`, etc. — eight separate routes with identical handler bodies. 
A single parameterised route `/api/v1/workflows/component/{op}` validates the op at the top of the handler and reuses one `WorkflowTask` construction path. The operation name is forwarded as the Nu script's `CMD_TASK` env var for Tier-1 scripts, so the routing is structural, not branching.", + }, + { + claim = "SSH via tokio::process::Command is safer than the russh crate for the precondition gate", + detail = "The codebase has a `russh` dependency but `pool/executor.rs` is a stub. Adding a live russh integration solely for the precondition gate would require implementing the full connection pool — a significant scope addition. The system `ssh` binary with `BatchMode=yes StrictHostKeyChecking=no ConnectTimeout=N` is a one-file implementation with no state, no connection pool management, and no key-format assumptions. The probe is fire-and-forget (invoked once per precondition check, not per request), so there is no meaningful performance loss from process spawn overhead.", + }, + { + claim = "The feature flag is a deployment-level rollback, not an A/B switch", + detail = "The flag `enable_component_endpoint` returns HTTP 404 when false. This is intentional: callers that haven't migrated receive a 404 and fall back to the legacy `/workflows/taskserv/create` route, which remains in place. The flag is not intended for permanent dual-track operation — it exists to give a one-flag rollback path during the migration window, after which the legacy route and taskserv.nu will be deleted.", + }, + { + claim = "Deleting prvng taskserv is a breaking change accepted as deliberate", + detail = "All three call sites for the old taskserv API (NuShell CLI, control-center-ui, MCP server) are migrated atomically in this decision. The commands-registry.ncl entry is removed, the `t` single-char alias is removed, and the bash wrapper dispatch case is removed. `taskserv.nu` is retained temporarily to avoid breaking any in-flight sessions. 
Its permanent deletion is a follow-on commit after confirming no un-migrated callers exist.", + }, + ], + + consequences = { + positive = [ + "prvng component install postgresql correctly names the operation for both cluster and taskserv modes", + "Backup and restore operations are now surfaced via prvng component backup/restore — previously inaccessible from the CLI", + "Precondition gate prevents cascading failures when a capability provider is unhealthy before a write operation", + "Tier-2 per-op scripts are now invoked when present — operators can specialise delete/backup logic without patching the generic install script", + ], + negative = [ + "Breaking change: prvng taskserv is removed. Any external scripts or documentation referencing the old command require update", + "The precondition gate adds 0–15 seconds to write operations when providers are unhealthy or unreachable — healthy-path overhead is ~2s for the state-file fast-fail", + ], + neutral = [ + "taskserv.nu is not deleted in this commit — a follow-on cleanup commit removes it once in-flight migration is confirmed", + "The legacy /workflows/taskserv/create endpoint is preserved until the feature flag is toggled and the route is removed in a future cleanup", + ], + }, +} diff --git a/adrs/adr-032-node-role-scale-constraints.ncl b/adrs/adr-032-node-role-scale-constraints.ncl new file mode 100644 index 0000000..b9a942c --- /dev/null +++ b/adrs/adr-032-node-role-scale-constraints.ncl @@ -0,0 +1,59 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-032", + title = "Node Role and Scale Constraints: ControlPlane Immutability and Worker Lifecycle Gates", + status = 'Accepted, + date = "2026-04-20", + + context = "The provisioning system gained a formal `NodeRole` enum (`ControlPlane | Worker | LoadBalancer`) declared in `schemas/infrastructure/compute/scaling.ncl` alongside a `ScalePolicy` contract that captures min/max bounds and a hardware template for spawning new nodes. 
Without explicit lifecycle gates, any operator with `can_operate` permission could call `server_delete` on a ControlPlane node — destroying the k0s controller, etcd state, and all cluster API endpoints in one call. A secondary risk exists in the opposite direction: deleting the last Worker node while the ControlPlane still serves its API violates the `scale.min` bound declared in the NCL, leaving the cluster in a partially healthy state with no execution capacity.", + + decision = "Enforce role-aware lifecycle gates at three layers: (1) Schema — `delete_lock` is implicitly true for ControlPlane nodes via the `make_server` helper in each workspace's `servers.ncl`; hcloud protection mirrors the schema intent. (2) Daemon UI — a dedicated POST `/ui/workspaces/{ws}/servers/{srv}/scale-down` handler runs two sequential gates before calling `server_delete`: Gate-1 rejects any request where `role == ControlPlane` with HTTP 422; Gate-2 counts live hcloud servers whose names match the `scale.template.hostname_pattern` prefix — if removing this node would bring the count below `scale.min`, the request is rejected. (3) Teardown order — ControlPlane nodes can only be targeted for deletion through a dedicated `teardown` workflow (future) that first deprovisions all Workers; the scale-down endpoint is not the teardown path. The scale-down endpoint is the only UI-exposed deletion path for Worker/LB nodes — the raw `server_delete` tool remains available to admin-role CLI operators only.", + + rationale = [ + { + claim = "Gate at the daemon layer, not only at the schema layer", + detail = "hcloud `protection.delete = true` prevents accidental UI clicks on the hcloud console but does not fire when the provisioning daemon calls the hcloud CLI with `--force`. The daemon gate is the authoritative enforcement point because it understands role semantics. 
Schema-level `delete_lock` is a documentation and default-setting mechanism, not a runtime gate.", + }, + { + claim = "Separate scale-down endpoint instead of adding guards to the existing server_delete tool", + detail = "The `server_delete` tool is a low-level destructive primitive registered in provisioning-core. Adding role-awareness to it would couple infrastructure topology semantics into the core tool layer, which is designed to be workspace-agnostic. The scale-down UI handler is workspace-scoped — it loads `servers.ncl` for the active workspace to read the role and scale policy, then calls the primitive only after gates pass.", + }, + { + claim = "ScalePolicy.min is the authoritative lower bound, not a hardcoded value", + detail = "Different infra environments have different operational minimums. A dev workspace may tolerate 0 workers; a production cluster requires at least 2 for HA. Encoding min in the NCL `ScalePolicy` means the gate is always consistent with the declared intent, with no magic constants in daemon code.", + }, + { + claim = "Teardown order (Workers before ControlPlane) is not enforced by scale-down", + detail = "The scale-down endpoint enforces min-bound and CP-immutability but does not implement full teardown sequencing. A full teardown (destroy entire infra env) is a DAG-inverted workflow: reverse the provisioning DAG, deprovision Workers first, then ControlPlane. This is a separate concern handled by a future `teardown` workflow endpoint. 
Mixing teardown logic into scale-down would conflate two distinct operations.", + }, + ], + + consequences = { + positive = [ + "ControlPlane nodes cannot be deleted via the UI regardless of operator permission level", + "Worker deletion is gated on the declared scale.min — under-provision accidents are caught before hcloud API call", + "The daemon UI gate is the single authoritative enforcement point — no duplication across CLI, MCP, and HTTP handlers", + "ScalePolicy.min can be changed in NCL without touching daemon code", + ], + negative = [ + "Admin operators who intentionally need to delete a CP node (disaster recovery, full teardown) must use the CLI `server_delete` tool directly — the UI does not expose an override path", + "The hostname_pattern prefix heuristic for counting live workers is a string-prefix match, not a typed query — it fails if two workspaces share a hostname prefix", + ], + }, + + alternatives_considered = [ + { + option = "Add role check to the existing server_delete tool in provisioning-core", + why_rejected = "server_delete is a workspace-agnostic primitive. Loading servers.ncl inside a core tool would introduce workspace path coupling into a layer that must remain context-free. The UI handler already has workspace context.", + }, + { + option = "Use Cedar policies for role-based node protection", + why_rejected = "Cedar is configured for principal-level authorization (who can do what), not for resource-level topology constraints (which nodes are protected). The node role is a property of the infrastructure declaration, not of the actor's permissions. Cedar would need to be fed the role data per-request — more complexity than a local gate with no added safety.", + }, + { + option = "Block deletion via hcloud protection flag only", + why_rejected = "hcloud protection fires only when the hcloud CLI is called directly. 
The provisioning daemon calls the hcloud CLI with privilege — protection can be disabled before deletion in a single compound command. It is a backstop, not a gate.", + }, + ], +} diff --git a/adrs/adr-033-cluster-component-extension-pattern.ncl b/adrs/adr-033-cluster-component-extension-pattern.ncl new file mode 100644 index 0000000..399182c --- /dev/null +++ b/adrs/adr-033-cluster-component-extension-pattern.ncl @@ -0,0 +1,143 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-033", + title = "Cluster Component Extension Pattern: split-script + manifest plan authoring contract", + status = 'Accepted, + date = "2026-04-24", + + context = "ADR-031 introduced the unified `prvng component ` CLI with polymorphic mode dispatch. The orchestrator server runs `install-{name}.sh {op}` as the cluster-mode entry point. Before this decision, no authoring contract existed for cluster extensions: credential file naming (`credentials.env` vs `_credentials.env`), method implementations, and the manifest plan structure were conventions known only from reading existing extensions. The postgresql extension was authored with the legacy monolithic pattern — `credentials.env`, all logic in `install-postgresql.sh`, no `{name}-lib.sh`, no `manifest_plan.ncl`. This produced a remote failure (`POSTGRES_PASSWORD is not set`) that was undetectable by the preflight, reached the server, and left the op in failed state.", + + decision = "All cluster-mode extensions must follow the split-script pattern enforced by the preflight structural gate. 
The contract has four parts: (1) `install-{name}.sh` sources `_credentials.env` (underscore prefix, written by the bundle builder from SOPS decryption) — never `credentials.env`; (2) `{name}-lib.sh` implements `_method_{action}` for every non-builtin action declared in `manifest_plan.ncl`, including `post`/`pre` hook actions; (3) `manifest_plan.ncl` declares the operation DAG via the `ManifestPlan` Nickel contract from `schemas/lib/manifest_plan.ncl` — this contract enforces that `namespace` and `pvc` are never deleted or recreated in `update`/`delete`/`restart` plans; (4) `metadata.ncl requires[].capability` values must exactly match a `provides[].id` declared in another workspace component's `metadata.ncl` — the precondition gate does string-exact matching, generic IDs like `'storage'` do not resolve. The preflight gate in `cli/components.nu` checks all four contracts before packaging, surfacing violations as `[preflight] ❌` with the specific cause.", + + rationale = [ + { + claim = "Credential filename mismatch is undetectable without structural inspection", + detail = "The bundle builder writes `_credentials.env` (prefixed). An install script sourcing `credentials.env` (no prefix) silently skips the source — no error at local preflight, failure only on the remote node mid-plan. The structural gate reads the install script and rejects any `source.*credentials.env` line that does not contain the underscore.", + }, + { + claim = "Method coverage check prevents partial manifest plan execution", + detail = "The plan runner generates `run-init.sh` from `manifest_plan.json` and calls `_method_{action}` for each custom step. A missing method produces `command not found` mid-run, leaving the cluster in a partial state. 
The preflight exhaustively checks all actions in `init`, `update`, `delete`, `restart` plus their `pre`/`post` hooks.", + }, + { + claim = "ManifestPlan Nickel contract encodes data-safety invariants at schema time", + detail = "The `ManifestPlan` contract rejects any plan that applies `delete` or `recreate` to `namespace` or `pvc` in non-init operations. This is a compile-time safety net: the plan cannot be exported to JSON if it would destroy persistent data during a rolling update or delete operation.", + }, + { + claim = "Capability ID exact-match is the only resolution mechanism in the precondition gate", + detail = "The gate iterates workspace component NCL files, reads their `metadata.ncl provides[].id`, and matches against `requires[].capability`. There is no fuzzy matching, no aliasing, no category hierarchy. Using `'block-storage-csi'` vs `'storage'` is not a naming convention — it is a hard requirement for the gate to resolve the dependency chain.", + }, + ], + + consequences = { + positive = [ + "Credential filename bug caught at local `--check` — never reaches the remote node", + "Missing `_method_*` implementations surface as named preflight failures before any SSH", + "ManifestPlan contract prevents accidental PVC/namespace destruction by type system, not convention", + "Capability ID mismatch caught at op submission by the precondition gate with a named error", + ], + negative = [ + "Legacy monolithic extensions require backfill: add `{name}-lib.sh`, `manifest_plan.ncl`, rename `credentials.env` → `_credentials.env`", + "Typos in `manifest_plan.ncl` action names (`'wai-ready` vs `'wait-ready`) fail at preflight but not at authoring time — no schema validation of action name strings", + ], + }, + + alternatives_considered = [ + { + option = "Monolithic install-{name}.sh with case/esac per-operation dispatch", + why_rejected = "No structural contract between plan step declarations and shell method implementations. 
Credential filename bugs reach the remote node. Tested in postgresql initial authoring: produced a silent `POSTGRES_PASSWORD is not set` on the remote after a successful local preflight.", + }, + { + option = "Schema-validate action names in manifest_plan.ncl against a closed enum", + why_rejected = "Custom actions are component-specific (`'create-credentials'`, `'bootstrap-account'`, `'protect-volume'`). A closed enum would require every extension to register action names centrally — breaks the distributed authoring model of ADR-020. The method-coverage gate achieves the same safety without a registry.", + }, + { + option = "Auto-source _credentials.env at run-{op}.sh level (bundle builder injects it)", + why_rejected = "Credentials would be exported for the entire script lifetime, visible to any subcommand. The explicit `source` inside `_method_create-credentials` is the correct scope: credentials are loaded only when the method that needs them runs, and unset after. ADR-018 (secretumvault) requires minimal credential exposure time.", + }, + ], + + constraints = [ + { + id = "credential-filename-underscore", + claim = "install-{name}.sh must source _credentials.env, never credentials.env", + scope = "provisioning/extensions/components/*/cluster/install-*.sh", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "source.*[^_]credentials\\.env", + paths = ["provisioning/extensions/components/"], + must_be_empty = true, + }, + rationale = "The bundle builder writes the SOPS-decrypted secret to _credentials.env. 
Sourcing credentials.env (no underscore) silently skips the file — POSTGRES_PASSWORD (or any credential) is never set, and _require_env fails on the remote node with no local signal.", + }, + { + id = "lib-sh-required-for-cluster-components", + claim = "Every cluster extension must have {name}-lib.sh with all _method_* implementations declared in manifest_plan.ncl", + scope = "provisioning/extensions/components/*/cluster/", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep '_method_.*missing'", + expect_exit = 1, + }, + rationale = "The preflight structural gate exhaustively checks method coverage. A missing _method_X is a preflight failure, not a remote failure. Without this constraint, a partial lib.sh reaches the server and produces a bash `command not found` mid-plan, leaving the namespace in an inconsistent state.", + }, + { + id = "manifest-plan-ncl-required", + claim = "Every cluster extension must have manifest_plan.ncl validated by the ManifestPlan Nickel contract", + scope = "provisioning/extensions/components/*/cluster/manifest_plan.ncl", + severity = 'Hard, + check = { + tag = 'FileExists, + path = "provisioning/extensions/components/{name}/cluster/manifest_plan.ncl", + present = true, + }, + rationale = "Without manifest_plan.ncl the bundle builder produces an empty plan — no run-*.sh scripts are generated. 
The ManifestPlan contract is the only enforcement mechanism for the namespace/pvc deletion protection invariant.", + }, + { + id = "capability-id-exact-provider-match", + claim = "metadata.ncl requires[].capability must exactly match a provides[].id declared in a workspace component", + scope = "provisioning/extensions/components/*/metadata.ncl", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep 'no provider found'", + expect_exit = 1, + }, + rationale = "The orchestrator precondition gate in src/preconditions.rs does string-exact lookup: provides[].id == requires[].capability. Generic terms like 'storage' do not match 'block-storage-csi'. The gate rejects the op at submission time, before any SSH, with a named error. Use the exact IDs from the target provider's metadata.ncl.", + }, + { + id = "sops-file-required-for-require-env", + claim = "Every cluster extension that calls _require_env VAR in {name}-lib.sh must have infra/{ws}/secrets/{name}.sops.yaml present", + scope = "provisioning/extensions/components/*/cluster/*-lib.sh", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep 'sops.yaml not found'", + expect_exit = 1, + }, + rationale = "The preflight SOPS gate (comp-build-cluster-bundle) checks for the secrets file before attempting bundle build. A missing secrets file means _require_env variables would be unset on the remote node, causing the install script to abort mid-plan. 
The preflight check surfaces this locally before any SSH occurs.", + }, + { + id = "sops-encrypted-regex-covers-require-env-vars", + claim = "Every VAR referenced via _require_env in {name}-lib.sh must appear in sops.encrypted_regex of {name}.sops.yaml", + scope = "infra/*/secrets/*.sops.yaml", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check 2>&1 | grep 'not in sops.encrypted_regex'", + expect_exit = 1, + }, + rationale = "SOPS only encrypts keys matching encrypted_regex. A variable in _require_env that is absent from encrypted_regex is stored in plaintext in the SOPS file and silently passes decryption — it appears to work but leaks secrets in the committed YAML. The preflight checks name coverage explicitly against the regex.", + }, + ], + + ontology_check = { + decision_string = "Cluster extension authoring contract: split-script (install.sh + lib.sh + manifest_plan.ncl) + _credentials.env naming + exact capability IDs — enforced by preflight structural gate before bundle packaging", + invariants_at_risk = ["type-safety-nickel", "config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-020-extension-capability-declarations", "adr-031-unified-component-cli"], +} diff --git a/adrs/adr-034-workspace-justfile-recipe-pattern.ncl b/adrs/adr-034-workspace-justfile-recipe-pattern.ncl new file mode 100644 index 0000000..a8e5bba --- /dev/null +++ b/adrs/adr-034-workspace-justfile-recipe-pattern.ncl @@ -0,0 +1,143 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-034", + title = "Workspace Justfile Recipe Pattern: thin-wrapper dispatch + op governance contract", + status = 'Accepted, + date = "2026-04-24", + + context = "Workspace justfiles in `workspaces/{ws}/justfiles/` are the operator-facing command surface for all cluster and infrastructure operations. 
Before this decision, no formal authoring contract existed for justfile modules: logic appeared inline (conditionals, loops), cache was not cleared on write paths, op governance wiring (preflight before op start) was inconsistent, and intent parameters were not quoted — allowing spaces in intent strings to break positional argument parsing. The `op.just` deploy/redeploy/purge recipes and the `mail.just` component-specific pattern emerged as the reference implementations during the libre-wuji postgresql deployment cycle, but the constraints were tribal knowledge. This ADR formalises the contract so any new justfile module can be validated by inspection without reading the reference implementations.", + + decision = "Workspace justfile modules follow a four-part contract. (1) Module structure: each `.just` file covers exactly one functional domain, declares a module-level variable for paths/script refs (never hardcoded inline — `infra` must be defined once as `infra := \"infra/{ws}\"` and used as `{{infra}}/ops/` throughout the module), and provides a `{module}-help` recipe that uses `awk` to extract the group's recipes from `just --list`. (2) Thin-wrapper rule: recipe bodies contain zero branching logic — all logic lives in `provisioning` CLI subcommands or `nu scripts/`. The single allowed exception is multi-step shell composition (`#!/usr/bin/env bash` + `set -euo pipefail`) when the composition itself is the value (e.g. sequencing preflight → op start → deploy → op finish). A second allowed exception is `PROVISIONING_DEBUG` passthrough: multi-step recipes may check `${PROVISIONING_DEBUG:-false}` and set a `DBG_FLAG` variable to propagate debug mode to all `provisioning` calls in the recipe body — this is inline logic that cannot be pushed to the CLI because the flag must reach both the `--check` and the deploy invocations. 
(3) Write-path invariants: any recipe that mutates cluster state must `export PROVISIONING_NO_CACHE=true` before the first `provisioning` call, preventing stale Nickel config from reaching the remote node. (4) Op governance wiring: write recipes that span multiple `provisioning` calls must follow the preflight-first sequence — `provisioning component {op} {component} --check` runs and must succeed before `provisioning op start` is called; `OP_ID` is captured from `ls -t {{infra}}/ops/ | head -1` immediately after `op start`; `provisioning op finish $OP_ID success|failed` is called unconditionally in both branches. Intent parameters must be passed quoted (`\"{{intent}}\"`) in all delegate calls to preserve spaces.", + + rationale = [ + { + claim = "Inline logic in justfiles silently diverges from provisioning CLI semantics", + detail = "Just is a task runner, not a shell — variables, quoting, and flow-control behaviour differ subtly from bash. Any conditional or loop written inline in a recipe body must duplicate decisions already encoded in the provisioning CLI or nu scripts, and will drift independently. The thin-wrapper rule prevents this divergence class: the justfile remains a dispatch table, not an implementation.", + }, + { + claim = "Stale Nickel config reaching the remote is undetectable at deploy time", + detail = "The provisioning CLI caches rendered Nickel config across invocations. Without `PROVISIONING_NO_CACHE=true`, a write recipe may bundle a config that was rendered before the current edit, sending outdated field values to the orchestrator. This class of bug is invisible in the local preflight because the preflight runs against the cached bundle. 
Exporting the flag at recipe scope ensures every build in that recipe execution is fresh.", + }, + { + claim = "Op record creation before preflight failure leaves an orphaned op in failed state", + detail = "If `provisioning op start` runs before `provisioning component {op} --check`, and the preflight then fails, an op record exists in `infra/{ws}/ops/` with no matching deploy attempt. The op log shows a failed op with no cause. The preflight-first sequence guarantees that no op record is created for a configuration that was known-bad at submission time.", + }, + { + claim = "Unquoted intent parameters silently truncate multi-word intent strings", + detail = "Just passes positional parameters to shell recipes as separate words. `provisioning op start {{component}} {{operation}} {{intent}}` receives 'initial' as intent when the caller wrote 'initial mail server setup'. The quoted form `\"{{intent}}\"` preserves the full string through the shell word-splitting boundary. This is observable only when reviewing op log entries — the intent stored in the op record will be truncated without error.", + }, + { + claim = "awk-based help recipes provide self-consistent documentation without maintenance overhead", + detail = "A `{module}-help` recipe that runs `just --list | awk '/^ \\[{group}\\]/{p=1;next} p && /^ \\[/{exit} p && NF && !/-help/{print}'` extracts group recipes from the live justfile — the help output is always current. A hand-maintained help block diverges from reality as recipes are added or removed. The awk pattern is copy-exact across modules; only the group name and description line change.", + }, + { + claim = "PROVISIONING_DEBUG passthrough is the only legitimate inline conditional in multi-step recipes", + detail = "The `PROVISIONING_DEBUG=true just deploy ...` invocation pattern requires a `DBG_FLAG` variable that is passed to both the `--check` preflight and the deploy invocation. 
If the flag only reached the deploy but not the preflight, debug output would be incomplete. The flag cannot be pushed to a provisioning CLI subcommand because the shell expansion happens at recipe body scope. This is a narrow, named exception to the thin-wrapper rule — not a precedent for arbitrary inline logic.", + }, + ], + + consequences = { + positive = [ + "New module authors have a verifiable contract — a module is conformant if `nu scripts/validate-justfile.nu` produces no violations", + "Contract is machine-validated: `validate-justfile.nu` checks no-cache, preflight ordering, intent quoting, and bash strict mode across all modules", + "Op log integrity preserved: orphaned ops from failed preflights cannot occur under the contract — including secret prerequisites (missing SOPS file, uncovered `_require_env` variables) which are caught by the preflight gate before `op start`", + "Help recipes are self-maintaining — adding a recipe to the group makes it appear in `{module}-help` automatically", + "Intent strings with spaces work correctly in all contexts (op log, audit trail, status display)", + "PROVISIONING_DEBUG propagates to both preflight and deploy — debug output is complete, with no `provisioning` call in the recipe left outside the flag passthrough pattern", + ], + negative = [ + "Multi-step bash composition (deploy/redeploy/purge pattern) is explicitly allowed but must be justified — this weakens the thin-wrapper rule at the margin; authors must recognise the boundary", + "The `OP_ID=$(ls -t infra/{ws}/ops/ | head -1)` capture is a side-effect convention, not a typed return value — it breaks silently if `ops/` is on a filesystem where mtime ordering is unreliable (not a concern for git-tracked directories, but worth documenting)", + ], + }, + + alternatives_considered = [ + { + option = "Encode op governance logic in a provisioning subcommand that wraps preflight+start+deploy+finish", + why_rejected = "The deploy recipe wrapping already exists for the common case. 
But purge, redeploy, and future multi-phase operations require different sequencing (e.g. purge requires interactive confirmation between op start and the destructive action). A single CLI wrapper would need flags for every variant, reintroducing the branching the thin-wrapper rule eliminates. The composition value of justfile multi-step recipes is precisely this per-operation sequencing.", + }, + { + option = "Use just variables instead of bash for PROVISIONING_NO_CACHE", + why_rejected = "Just `export` only works for simple assignments and does not compose with multi-step bash recipe bodies. The `export PROVISIONING_NO_CACHE=true` pattern inside a `#!/usr/bin/env bash` recipe is the only form that reliably propagates the environment variable to all `provisioning` subprocess calls in that recipe body, including those in conditionals.", + }, + { + option = "Generate justfile modules from provisioning component metadata", + why_rejected = "Component-specific modules (mail.just, and future postgresql.just) contain operational domain knowledge — emergency procedures, non-standard flags, guard rails — that cannot be derived from component metadata alone. Auto-generation would produce thin scaffolding without the operational value. The module contract is an authoring guide, not a codegen target.", + }, + ], + + constraints = [ + { + id = "write-recipe-no-cache", + claim = "Every write recipe (deploy, redeploy, purge, and any recipe that calls provisioning component {op}) must export PROVISIONING_NO_CACHE=true before the first provisioning call", + scope = "workspaces/*/justfiles/*.just", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "nu workspaces/libre-wuji/scripts/validate-justfile.nu 2>&1 | grep 'write recipe missing'", + expect_exit = 1, + }, + rationale = "Stale Nickel config silently reaches the remote node when the cache is not cleared. Without PROVISIONING_NO_CACHE=true, the bundle builder may reuse a pre-edit render for the current operation. 
The flag must be set before any provisioning invocation in the recipe so that even preflight runs against a fresh render.", + }, + { + id = "op-governance-preflight-first", + claim = "In any multi-step recipe that calls provisioning op start, a provisioning component {op} {component} --check must appear before it and gate on its exit code", + scope = "workspaces/*/justfiles/*.just", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "nu workspaces/libre-wuji/scripts/validate-justfile.nu 2>&1 | grep 'op start'", + expect_exit = 1, + }, + rationale = "Op records created before a known-bad preflight produce orphaned failed ops in the audit log with no associated deploy attempt. The preflight-first sequence ensures that op start is called only when the configuration has passed structural validation. Single-line op-start delegates are exempt — they are building blocks, not deploy owners. Purge recipes are exempt — they use interactive namespace confirmation as the gate, not bundle preflight.", + }, + { + id = "intent-parameter-quoted", + claim = "Every delegate call that passes {{intent}} must quote it: \"{{intent}}\" — never bare {{intent}}", + scope = "workspaces/*/justfiles/*.just", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "provisioning op start.*{{intent}}[^\"]", + paths = ["workspaces/"], + must_be_empty = true, + }, + rationale = "Just passes positional parameters as shell words. An unquoted {{intent}} is split on whitespace by the shell, truncating multi-word intent strings silently. The op record stores only the first word. 
This is undetectable at recipe invocation time — it fails only at op log review when the intent field is wrong.", + }, + { + id = "multi-line-recipe-bash-strict", + claim = "Any recipe with a #!/usr/bin/env bash shebang must have set -euo pipefail as the second line", + scope = "workspaces/*/justfiles/*.just", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "#!/usr/bin/env bash", + paths = ["workspaces/"], + must_be_empty = false, + }, + rationale = "Just executes a shebang recipe as a single script and only sees that script's final exit status, so a failing line does not stop the recipe unless bash itself is told to stop. Without set -euo pipefail, a failing provisioning call mid-recipe does not stop execution — subsequent steps run against a broken cluster state. The `set -e` part is the critical one: it ensures that op finish is not called with 'success' after a deploy failure.", + }, + { + id = "module-help-recipe-required", + claim = "Every .just module must have a {module}-help recipe using the awk group-extraction pattern", + scope = "workspaces/*/justfiles/*.just", + severity = 'Soft, + check = { + tag = 'Grep, + pattern = "-help:", + paths = ["workspaces/"], + must_be_empty = false, + }, + rationale = "Without a {module}-help recipe, the module is invisible from the root default recipe. Operators discover available operations via 'just' (default recipe) → 'just {module}-help' → recipe detail. A module without help breaks the discovery chain. 
The awk pattern is self-maintaining — no manual synchronisation required as recipes are added.", + }, + ], + + ontology_check = { + decision_string = "Workspace justfile modules: thin-wrapper dispatch (no inline logic) + PROVISIONING_NO_CACHE=true on write paths + preflight-first op governance sequence + quoted intent parameters + awk-based self-documenting help recipes", + invariants_at_risk = ["config-driven-always", "type-safety-nickel"], + verdict = 'Safe, + }, + + related_adrs = ["adr-031-unified-component-cli", "adr-033-cluster-component-extension-pattern"], +} diff --git a/adrs/adr-035-storage-config-schema.ncl b/adrs/adr-035-storage-config-schema.ncl new file mode 100644 index 0000000..10550c2 --- /dev/null +++ b/adrs/adr-035-storage-config-schema.ncl @@ -0,0 +1,96 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-035", + title = "StorageConfig schema: provider-declared storage policies and component requires.storage contract", + status = 'Accepted, + date = "2026-04-24", + + context = "Components declare storage needs as an untyped record: `requires.storage = { size = \"20Gi\", persistent = true }`. No contract validates that the declared size is within provider bounds, that the volume mode is compatible with the storage class, or that expansion is possible if the PVC must grow later. The postgresql deployment was provisioned with a 20Gi PVC on hcloud-volumes (minimum 10Gi, expand-only). Reducing it is impossible: Hetzner CSI only allows expansion. This class of error — requesting more storage than needed on a provider that cannot shrink volumes — has no static check and no runtime signal until the operator attempts a resize and finds it rejected. A separate problem: 'block', 'nfs', and 'object' volume semantics are not represented at all; a component could request NFS access mode on a block-only storage class without any validation. 
This ADR defines the StorageConfig schema to make these constraints machine-checkable.", + + decision = "Introduce `schemas/lib/storage_config.ncl` with three exports: (1) `StorageRequires` — the contract for component `requires.storage` fields, adding `volume_mode` (block/nfs/object) and `access_mode` alongside the existing `size` and `persistent` fields; (2) `ProviderStoragePolicy` — the abstract contract for provider metadata declarations, specifying `min_size`, `max_size`, `expansion_policy` (static/expand_only/full), and `volume_modes`; (3) concrete provider policy values `HetznerCSIPolicy` and `DemocraticCSINFSPolicy` with the real constraints pre-filled. Storage class providers declare their policy in `capabilities.ncl` or `metadata.ncl` using `ProviderStoragePolicy`. Component storage requirements use `StorageRequires`. The preflight gate in `comp-build-cluster-bundle` is the enforcement point: it reads the storage class from component config, resolves the matching provider policy from capabilities, and fails if the requested size is below `min_size`. No ADR-mandated change to the component CLI is required — preflight already has access to both component config and capabilities.", + + rationale = [ + { + claim = "Hetzner CSI volumes cannot shrink — min_size enforcement must happen at deploy time, not at resize time", + detail = "The Kubernetes CSI spec allows drivers to implement VolumeExpansion but not VolumeContraction. Hetzner's hcloud-volumes driver only supports expansion. A PVC provisioned at 20Gi on hcloud-volumes cannot be reduced to 10Gi without deleting the PVC (and losing data) and reprovisioning. 
The `min_size = \"10Gi\"` field in HetznerCSIPolicy, combined with preflight validation, catches over-provisioning before the PVC is created — where the correction is a config edit, not a data migration.", + }, + { + claim = "Volume mode (block/nfs/object) is not derivable from storage class name alone", + detail = "Storage class names like 'hcloud-volumes', 'democratic-csi-nfs', 'longhorn' carry no semantics: a reader cannot determine from the name whether the class provides RWO block storage, RWX NFS, or something else. The `volume_mode` field in `StorageRequires` and `volume_modes` in `ProviderStoragePolicy` make this explicit. A component requesting `volume_mode = 'nfs` on a storage class whose policy declares `volume_modes = ['block]` is a preflight failure, not a runtime error on the remote node.", + }, + { + claim = "expansion_policy encodes the one-way door semantics of provider volume management", + detail = "Three states: 'static (no resize at all — e.g. hostPath), 'expand_only (increase only — Hetzner CSI), 'full (expand and shrink — democratic-csi NFS, some Longhorn configurations). This field is the authoritative signal for whether a future size increase in component config will be deployable. An operator who knows their provider is 'expand_only can provision conservatively (10Gi) knowing they can grow later, rather than defensively provisioning large volumes that cannot be reclaimed.", + }, + { + claim = "Concrete provider policy values (HetznerCSIPolicy, DemocraticCSINFSPolicy) eliminate per-workspace duplication", + detail = "Without pre-defined policy constants, every workspace capabilities.ncl that uses hcloud-volumes would need to manually specify `min_size = \"10Gi\"`, `expansion_policy = 'expand_only`, etc. — and could drift. By defining HetznerCSIPolicy and DemocraticCSINFSPolicy in the schema, workspaces reference the canonical policy: `storage_policy | sc.ProviderStoragePolicy = sc.HetznerCSIPolicy`. 
The Nickel contract then validates any field override against the policy shape.", + }, + ], + + consequences = { + positive = [ + "PVC over-provisioning on expand-only providers is caught at preflight before the PVC exists", + "Volume mode mismatches (NFS component on block storage class) become preflight failures", + "capabilities.ncl gains a typed storage policy declaration — provider constraints are readable without consulting Hetzner docs", + "StorageRequires contract applies to all component requires.storage fields uniformly via schema import", + "Concrete policy values (HetznerCSIPolicy) are the single source of truth — workspace drift is impossible via Nickel contract", + ], + negative = [ + "Size comparison (component.requires.storage.size >= provider.min_size) requires string-to-bytes parsing — this is done in Nu (preflight), not in Nickel, because Nickel has no byte-unit parsing in std", + "Provider policy must be declared in capabilities.ncl — a storage class used without a matching policy entry cannot be validated (validation skips rather than fails, so the gap is silent)", + ], + }, + + alternatives_considered = [ + { + option = "Add min_size / max_size directly to the storage_classes list in InfraCapabilities", + why_rejected = "InfraCapabilities.storage_classes is currently Array String (a list of class names). Changing it to a typed record would require updating all capabilities.ncl files in all workspaces simultaneously. The ProviderStoragePolicy approach allows new capabilities.ncl entries to use the typed policy while old entries continue to work — opt-in migration rather than breaking change.", + }, + { + option = "Enforce via Kubernetes admission webhook (VPA/LimitRange) instead of preflight", + why_rejected = "Admission webhooks enforce at pod scheduling time, not at bundle validation time. The gap between 'provisioning op started' and 'webhook rejects the PVC' is an orphaned in-progress op with no clean recovery path. 
Preflight enforcement keeps the invariant: if preflight passes, the deploy can succeed without external gates.", + }, + { + option = "Allow size as a Number (Gi) instead of String", + why_rejected = "Existing components use `size = \"20Gi\"` (String). Changing to Number would require a migration across all component NCL files and breaks Nickel contract compatibility. The String representation is also the form Kubernetes expects in PVC manifests, so no conversion is needed in templates.", + }, + ], + + constraints = [ + { + id = "storage-requires-uses-contract", + claim = "Any component NCL that declares requires.storage must use StorageRequires from schemas/lib/storage_config.ncl", + scope = "provisioning/catalog/components/*/nickel/contracts.ncl", + severity = 'Soft, + check = { + tag = 'Grep, + pattern = "storage_config", + paths = ["provisioning/catalog/components/"], + must_be_empty = false, + }, + rationale = "StorageRequires adds volume_mode and access_mode to the storage spec. Without the contract import, components declare an untyped record that passes Nickel validation regardless of content — the volume_mode / access_mode fields are silently ignored. The soft severity reflects that adoption is incremental — existing components without storage can be migrated on next edit.", + }, + { + id = "provider-policy-min-size-hetzner", + claim = "Any capabilities.ncl that declares hcloud-volumes must set min_size = \"10Gi\" and expansion_policy = 'expand_only", + scope = "workspaces/*/infra/*/capabilities.ncl", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "nu -c \"open workspaces/libre-wuji/infra/libre-wuji/capabilities.ncl | str contains 'hcloud'\"", + expect_exit = 0, + }, + rationale = "Hetzner hcloud-volumes is the primary block storage provider in libre-wuji. Omitting min_size means components can request 5Gi PVCs which Hetzner will reject at provisioning time with a CSI error. 
The HetznerCSIPolicy constant in storage_config.ncl provides the correct values — workspaces should reference it rather than hard-code the constraint.", + }, + ], + + ontology_check = { + decision_string = "StorageConfig schema: StorageRequires contract for components + ProviderStoragePolicy for providers + HetznerCSIPolicy/DemocraticCSINFSPolicy constants + preflight size/mode validation", + invariants_at_risk = ["type-safety-nickel", "config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-033-cluster-component-extension-pattern", "adr-020-extension-capability-declarations"], +} diff --git a/adrs/adr-036-db-operation-abstraction.ncl b/adrs/adr-036-db-operation-abstraction.ncl new file mode 100644 index 0000000..b7f68cb --- /dev/null +++ b/adrs/adr-036-db-operation-abstraction.ncl @@ -0,0 +1,97 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-036", + title = "db-* operation abstraction: standard manifest_plan actions for database lifecycle across MySQL, PostgreSQL, and SurrealDB", + status = 'Accepted, + date = "2026-04-24", + + context = "Database components (postgresql, and future mysql, surrealdb) each implement ad-hoc backup, restore, and health-check methods in their lib.sh files with no shared naming contract. `provisioning component backup postgresql` works today because ManifestPlan supports arbitrary plan section keys and the component CLI dispatches any op name to the bundle builder. However, each engine invents its own method names (`_method_backup`, `_method_dump`, `_method_db-dump`, etc.) and parameter conventions. There is no standard for: (a) how backup artifacts are named, (b) whether Object Storage is involved, (c) what 'state' means (connection count? replication lag? table sizes?), or (d) how restore locates its source. Additionally, Object Storage integration (e.g., Hetzner Object Storage, Backblaze B2) for archival is not modeled. 
This ADR establishes seven standard db-* operation names as the cross-engine contract.", + + decision = "Define seven standard manifest_plan action names for database lifecycle operations. Each action maps to a `_method_{action}` implementation in the engine's lib.sh. The operations are: `db-init` (create databases, roles, and initial schema — idempotent), `db-backup` (full consistent backup, compressed, optionally pushed to object storage — artifact named `{name}-{timestamp}.dump.gz`), `db-restore` (restore from artifact path or object storage key, passed via BACKUP_SRC env or params.src), `db-dump` (plain SQL export to stdout or local path — lighter than db-backup, no binary format), `db-state` (query operational state: database sizes, connection counts, replication lag, bloat — output to stdout as structured text), `db-query` (run ad-hoc SQL from params.sql or QUERY env — read-only by default), `db-snap` (engine-native point-in-time snapshot — e.g., pg_basebackup for PostgreSQL, file-level copy for SurrealDB). These seven names become the convention: any database component that declares `operations.backup = true` in its Nickel config must implement `_method_db-backup`. The corresponding justfile module (`justfiles/db.just`) provides generic recipes that work for any database component. Engine-specific modules (e.g., `justfiles/postgresql.just`) thin-wrap the generic db.just recipes for their component.", + + rationale = [ + { + claim = "Seven operations cover the complete database lifecycle without engine-specific command surface", + detail = "db-init handles first-time setup (idempotent). db-backup and db-restore are the data safety pair. db-dump complements backup for portability — pg_dump output is readable, binary backup formats are not. db-state is the operational health surface: sizes, connections, lag — enough to answer 'is the database healthy' without custom dashboards. db-query enables one-off queries from the operator without exec-ing into the pod. 
db-snap provides near-zero-RPO backup using engine-native mechanisms when available. No other operations have emerged across the postgresql and docker_mailserver deployment cycles.", + }, + { + claim = "Object Storage integration lives in db-backup and db-snap, not in a separate operation", + detail = "Adding a separate 'archive' operation would require sequencing: backup → archive → verify. This three-step sequence is exactly what db-backup params.dest is for: if BACKUP_DEST is set to an S3 URI (s3://bucket/prefix), the backup method uploads directly. The method retains local copy for BACKUP_KEEP_LOCAL hours before deletion. This single-operation model means `just pg-backup` and `just pg-backup dest=s3://mybucket/pg` are the same code path with different params, avoiding a separate archive stage and its op governance overhead.", + }, + { + claim = "Naming convention db-{verb} avoids collision with existing component op names", + detail = "Existing component ops (install, update, delete, restart, backup, restore) are generic and dispatched by the component CLI. The db-* prefix is reserved for database-semantic operations that require SQL engine awareness. This avoids ambiguity: 'backup' as a component op is 'snapshot the entire component state', while 'db-backup' is 'dump database contents'. Both can coexist in manifest_plan.ncl without naming conflict because they are distinct section keys.", + }, + { + claim = "params.src and params.dest are the standard interface for artifact location, not env vars", + detail = "ManifestEntry.params is a `{ _ | String }` record — arbitrary string key-value pairs passed to _method_* implementations. Using params.src (restore source) and params.dest (backup destination) is self-documenting in manifest_plan.ncl and in the justfile recipe: `just pg-backup dest=s3://bucket/pg`. Environment variables (BACKUP_SRC, BACKUP_DEST) are the fallback when params is absent — the method checks params first, env second. 
This two-tier resolution allows interactive override without modifying manifest_plan.ncl.", + }, + ], + + consequences = { + positive = [ + "Cross-engine tooling: a `justfiles/db.just` with generic recipes works for postgresql, mysql, surrealdb without modification", + "Object Storage backup path is a convention (s3://bucket/prefix), not per-engine config — backup tooling is uniform", + "db-state provides a standard operational query without exec into pod — consistent with no-SSH-for-observability principle", + "db-init idempotency means reprovisioning a database component doesn't require manual schema recreation", + "Seven operations cover backup, restore, observability, and ad-hoc queries — no further operations expected for standard OLTP databases", + ], + negative = [ + "db-snap is engine-specific: pg_basebackup for PostgreSQL, file-level copy for SurrealDB, xtrabackup for MySQL — method implementations are not portable across engines", + "params.dest S3 URI handling requires credentials (S3 access key, secret) in the component SOPS file — operators must add S3 credentials alongside DB credentials before using db-backup with object storage", + "db-state output is human-readable text whose layout differs per engine — there is no typed, machine-parseable output contract shared across engines, which limits automated parsing", + ], + }, + + alternatives_considered = [ + { + option = "Add database operations to the component manifest_plan operations field as boolean flags", + why_rejected = "The operations record (`operations.backup = true`) already controls whether the component supports an op. Adding db-specific booleans (operations.db_backup, operations.db_restore) would double the operations field without adding new information — the presence of a db-backup section in manifest_plan is the declaration. 
The operations field is for CLI feature gating, not for naming.", + }, + { + option = "Implement a separate 'db-operator' component that manages databases across engines", + why_rejected = "A cross-engine db-operator requires a running sidecar or separate deployment with access to all database pods. This adds infrastructure complexity and a failure mode (operator pod down → no backup). The lib.sh-in-bundle pattern keeps operations self-contained: the run-db-backup.sh script carries everything it needs, runs on the control plane node, and requires only kubectl + the database client binary. No additional components.", + }, + { + option = "Use Velero for backup instead of engine-native methods", + why_rejected = "Velero provides consistent volume snapshots (application-consistent requires hooks) and is CSI-level, not database-level. It cannot produce a pg_dump or mysqldump — only a filesystem snapshot. For PostgreSQL, a consistent SQL dump is more portable and restorable than a volume snapshot across different PostgreSQL versions. 
Velero is complementary (infrastructure-level DR), not a replacement for db-backup.", + }, + ], + + constraints = [ + { + id = "db-backup-method-required-if-operations-backup", + claim = "Any database component with operations.backup = true in its Nickel config must implement _method_db-backup in its lib.sh", + scope = "provisioning/extensions/components/*/cluster/*-lib.sh", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "grep -l 'backup.*=.*true' provisioning/extensions/components/*/nickel/defaults.ncl | each { |f| let comp = ($f | path dirname | path dirname | path basename); let lib = $'provisioning/extensions/components/($comp)/cluster/($comp)-lib.sh'; if ($lib | path exists) and (not (open $lib | str contains '_method_db-backup')) { print $'($comp): missing _method_db-backup' } } | str join ''", + expect_exit = 0, + }, + rationale = "The component CLI dispatches 'backup' to the bundle builder which extracts the manifest_plan.backup section. If the plan has a db-backup step but lib.sh does not implement _method_db-backup, the run script fails mid-execution on the remote node. The preflight method coverage check catches this — the constraint here documents the naming convention.", + }, + { + id = "db-backup-artifact-naming", + claim = "db-backup method implementations must produce artifacts named {component}-{timestamp}.dump.gz or {component}-{timestamp}.tar.gz", + scope = "provisioning/extensions/components/*/cluster/*-lib.sh", + severity = 'Soft, + check = { + tag = 'Grep, + pattern = "dump\\.gz\\|tar\\.gz", + paths = ["provisioning/extensions/components/"], + must_be_empty = false, + }, + rationale = "A consistent artifact naming scheme allows automated retention policies and object storage lifecycle rules to match on prefix. Without it, each engine invents its own format (pg-backup-20260424.sql, dump_2026-04-24.tar.bz2) and rotation scripts must be per-engine. 
The soft severity reflects that existing backup implementations predate this ADR.", + }, + ], + + ontology_check = { + decision_string = "db-* operation abstraction: seven standard manifest_plan action names (db-init, db-backup, db-restore, db-dump, db-state, db-query, db-snap) as cross-engine database lifecycle contract + params.src/params.dest for artifact location + Object Storage integration via BACKUP_DEST s3:// URI", + invariants_at_risk = ["config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-033-cluster-component-extension-pattern", "adr-035-storage-config-schema"], +} diff --git a/adrs/adr-037-ops-contract-dual-mode.ncl b/adrs/adr-037-ops-contract-dual-mode.ncl new file mode 100644 index 0000000..19a2a5a --- /dev/null +++ b/adrs/adr-037-ops-contract-dual-mode.ncl @@ -0,0 +1,138 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-037", + title = "Ops contract dual-mode: NATS pending queue, JWT-signed commands, and switchable signer (keeper-VM auto / operator manual) without code changes", + status = 'Accepted, + date = "2026-04-26", + + context = "The provisioning platform needs a coordination contract for runtime workload changes (deploy/scale/restart/secret_update/drain) that satisfies four constraints simultaneously: (1) the workload cluster (libre-wuji class) must be runtime-autonomous — it cannot pull from the CI cluster (libre-daoshi class) at boot or for steady-state operation; (2) operators must be able to drive ops manually from a laptop with a hardware key when the automated signer is offline, with no code changes; (3) multiple emitters (CI pipelines on libre-daoshi, operator laptops, future GitHub Actions) must be able to propose ops concurrently without distributed-lock complexity; (4) every applied op must be auditable with cryptographic provenance independent of any single node remaining online. 
The naive design — workload cluster pulls a deploy spec from a known git repo on the CI cluster — fails constraint (1); a direct RPC from CI to a signing service on a single VM fails constraint (2) when the signer dies; ad-hoc multi-emitter coordination via filesystem locks or database advisory locks fails constraint (3) under network partitions; storing audit logs only on the workload cluster fails constraint (4) when that cluster is lost. The design needs a single coordination substrate that decouples emitters from signers, serializes concurrent ops, survives signer outages without losing operations, and emits auditable provenance independent of cluster health.", + + decision = "Adopt a NATS JetStream-based ops contract with three subject namespaces and dual-mode signing. (1) Subject layout per workspace: `ops.pending..` for unsigned proposals, `ops.cmd..` for signed commands ready to apply, `ops.ack..` for application result, `ops.audit.` for the immutable audit stream. JetStream streams `OPS_PENDING_` (WorkQueue retention, 14 days) and `OPS_CMD_` (WorkQueue retention, 24 hours) plus `OPS_AUDIT_` (Limits retention, 90 days, replicas=3) implement the persistence and ordering guarantees. (2) JWT claims for every signed message: `iss` (signer identity: keeper-vm-primary | operator- | gh-actions-), `sub` (requesting principal: woodpecker-job- | manual-), `aud` (target workspace), `scopes` (allowed op_type:target tuples), `seq` (per-issuer monotonic counter — anti-replay), `jti` (UUIDv4 idempotency key), `expected_state_version` (optimistic concurrency token), `exp`/`nbf` (validity window). (3) Signer is any subscriber to `ops.pending.*` with a key in the workspace's authorized-signers set. The keeper-daemon (running on the dedicated ops-vm workspace) auto-signs operations matching a declarative policy file (see ADR-XXX keeper policy schema); the keeper-cli running on operator laptops with a YubiKey signs interactively via `keeper pending sign `. 
Both produce identical JWT-signed messages on `ops.cmd.*` — wuji's ops-controller does not distinguish between automated and manual signers, only the JWT validity. (4) Mode switch is operational, not configurational: stopping the keeper-daemon process on ops-vm degrades the system to operator-only mode without any code or config change in wuji or daoshi. Restarting it restores automated signing. A hybrid mode is supported by tuning the keeper policy to auto-sign only safe operations (e.g., scale and restart on staging targets) while leaving production deploys for manual approval. (5) Multi-emitter coordination is delegated to JetStream: emitters publish independently with their own per-issuer sequence; the stream's total order resolves concurrency; the ops-controller in wuji applies in stream order with `expected_state_version` optimistic concurrency, returning 409 conflict on the second emitter when two ops target the same state version. (6) Wuji's ops-controller is the single subscriber to `OPS_CMD_` in WorkQueue mode — there is exactly one applier per workspace, eliminating the need for distributed leader election; if the controller pod restarts, persisted state in SurrealDB allows reconciliation of in-flight ops on resume.", + + rationale = [ + { + claim = "JetStream WorkQueue retention with single subscriber gives total order without distributed locks", + detail = "Multi-emitter coordination is the load-bearing complexity in this design. JetStream's WorkQueue stream type with a single durable consumer (wuji ops-controller) provides ordered at-least-once delivery; combined with the `jti` idempotency key, each op is applied exactly once in stream order. Concurrent emitters from libre-daoshi, operator laptops, and external CI publish proposals to `ops.pending.*` independently, and signers republish them to `ops.cmd.*`; each stream sequences its messages by arrival order. No emitter needs to coordinate with another. 
The controller applies in order; optimistic concurrency on `expected_state_version` rejects ops that read stale state, which manifests to the emitter as a 409 conflict via NATS request-reply on `ops.ack.*`. This shifts coordination from client-side distributed locks (which require failure-mode reasoning across emitter, lock server, and cluster) to the broker, which has well-understood semantics.", + }, + { + claim = "Pending queue between emitters and signers makes mode switching free", + detail = "If emitters published directly to `ops.cmd.*` (signing inline) the system would couple emitter availability to signer availability. By interposing `ops.pending.*` as a separate subject namespace, emitters publish proposals without knowing or caring about who signs. Any subscriber to `ops.pending.*` with a key in the authorized-signers set can sign and republish to `ops.cmd.*`. Switching from auto-sign (keeper-daemon on ops-vm) to manual-sign (operator laptop with keeper-cli) requires no change to emitters and no change to the consumer (wuji ops-controller) — it requires only enabling or disabling the relevant subscriber. This is the same decoupling pattern as a message queue with multiple consumer groups, applied to a signing-and-republish role.", + }, + { + claim = "Mandatory JWT scope tuples prevent privilege escalation across workspaces", + detail = "Each signer's JWT is constrained by `scopes` — an array of `op_type:target_pattern` tuples (e.g., `deploy:staging-*`, `scale:vapora`). The ops-controller validates that the requested op falls within at least one scope tuple before applying. A keeper-vm-primary key with scope `deploy:staging-*` cannot sign a deploy to `production-*` even if the policy file permits it locally — the JWT scope is the authoritative declaration. This means a compromised keeper-VM cannot forge production ops if its key was issued with staging-only scopes. 
Scope rotation (narrowing or widening) is a key-rotation operation, which is auditable.", + }, + { + claim = "ops-controller persists in-flight ops to SurrealDB before ack to survive restart without duplicate apply", + detail = "The naive controller acks `ops.cmd.*` consumption first then applies, which would mean a crash between ack and apply produces a missed op (not retried by JetStream because acked). The reverse — apply first then ack — produces possible duplicate apply if the controller crashes after applying but before acking. The correct pattern is: read message, persist `(jti, op_payload, state=pending)` to SurrealDB transactionally, ack to JetStream, then apply, then update SurrealDB to `state=applied`. On restart, the controller reads SurrealDB for `state=pending` rows and reconciles each by checking whether the op was actually applied (idempotency key prevents double-apply). This requires the apply layer to be idempotent on `jti`, which is a design requirement on every op handler.", + }, + { + claim = "JWT issuer values are not service identities but key identities — survives signer migration", + detail = "The `iss` claim names a key, not a service. `keeper-vm-primary` is the key currently held by the keeper-daemon; if the keeper-daemon migrates to a different VM, it still presents the same `iss`. Scope rotation (issuing a new key with different scopes) is a separate operation. This decoupling means we can move the keeper-daemon from ops-vm to a laptop temporarily without rotating keys, and a hardware-key-only operator setup uses a different `iss` (e.g., `operator-jpl-yubikey`) so audit trails remain attributable. 
A compromised key is revoked by removing its `iss` from the workspace's authorized-signers set, which is itself an op (governance op) signed by the operator quorum.", + }, + ], + + consequences = { + positive = [ + "Wuji is runtime-autonomous: it pulls nothing from daoshi at boot or steady state — only consumes signed messages from its own NATS JetStream", + "Daoshi is replaceable: any system holding a signer key can drive ops; the platform's ops contract is not coupled to one CI provider", + "Mode switch (auto/manual/hybrid) is operational not architectural — `systemctl stop keeper-daemon` is the entire migration to operator-only", + "Multi-emitter coordination is a property of the broker (JetStream stream order), not an application concern", + "Audit trail is on a separate stream with independent retention — applying ops cannot interfere with audit log integrity", + "Replay protection (jti uniqueness + monotonic seq) prevents reissuing intercepted JWTs", + "Optimistic concurrency surfaces conflicts as explicit 409s to emitters, not as silent overwrites — emitters decide retry policy", + "ops-controller restart is safe because in-flight ops are persisted before ack — no missed ops, no duplicate applies", + ], + negative = [ + "NATS JetStream is now load-bearing for production ops — its availability constrains deploy throughput; mitigation: replicas=3 within wuji", + "Idempotency contract on every op handler is a development requirement that must be tested per op_type — adding a new op_type requires verifying double-apply safety", + "JWT clock skew between signer and verifier requires NTP/chrony on all signing hosts and on wuji nodes — operational requirement not visible from code", + "JetStream retention windows (14 days pending, 24 hours cmd, 90 days audit) must be sized against the operational rhythm — pending exhaustion in operator-only mode if quorum review takes longer than 14 days will silently drop proposals", + "Multi-emitter conflicts surface as 409s to 
emitters, who must implement retry-after-restate logic — emitters that ignore 409 will lose their op silently", + ], + }, + + alternatives_considered = [ + { + option = "Direct HTTP RPC from emitters to a centralized signer service", + why_rejected = "Couples emitter availability to signer availability and re-introduces the single-VM SPOF. Also requires the signer to be reachable on the network from every emitter, including external CI providers, which is a firewall complication. NATS JetStream as the substrate is already deployed for the orchestrator (ADR-012) and provides the same effect (decoupling, retry, audit) with no new network surface.", + }, + { + option = "Pull-based deploys: wuji pulls deploy specs from a git repo on daoshi at intervals", + why_rejected = "Violates wuji autonomy — wuji's runtime would depend on daoshi's git server being reachable. Also introduces eventual-consistency uncertainty (when does a push become visible?) without giving emitters a synchronous signal of acceptance. The pending/cmd/ack triple gives emitters a clear lifecycle: proposal accepted, op signed, op applied or rejected.", + }, + { + option = "GitOps via Flux/ArgoCD with workload cluster pulling from a Radicle repo", + why_rejected = "Solves the autonomy concern (Radicle is decentralized) but inherits GitOps' weaknesses for ops not modeled as state declarations: scale/restart/drain are imperative ops that require sequencing, not state convergence. Modeling them as state-document edits requires an awkward layer of versioned state files and reconciliation loops; pending-and-signed messages on a queue match the ops semantics directly. 
GitOps may complement this for the workload-config layer (ADR-038 covers Radicle's role in the desired-state ledger), but is not a replacement for ops coordination.", + }, + { + option = "Distributed lock via SurrealDB live queries for multi-emitter coordination", + why_rejected = "Introduces a write-write coordination problem on the lock document under concurrent emitters, recreating the distributed-lock complexity the JetStream approach avoids. JetStream's stream order is already a globally consistent total order — using it for both the message itself and the coordination semantics is simpler than separating the two concerns.", + }, + ], + + constraints = [ + { + id = "ops-controller-single-subscriber", + claim = "Exactly one ops-controller consumer subscribes to OPS_CMD_ in WorkQueue mode per workspace; multiple subscribers would break ordering guarantees", + scope = "platform/crates/ops-controller/, infra/.../components/ops_controller.ncl", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "deliver_subject|durable_consumer", + paths = ["platform/crates/ops-controller/"], + must_be_empty = false, + }, + rationale = "JetStream WorkQueue with multiple consumers distributes messages round-robin across them, which breaks the single-applier invariant that backs the optimistic-concurrency contract. 
The constraint is enforced by component config (single replica) and runtime check on consumer creation.", + }, + { + id = "jwt-scope-validation-mandatory", + claim = "ops-controller MUST validate JWT scopes against the requested op_type:target before applying; missing scope = reject with 403, do not log a 200", + scope = "platform/crates/ops-controller/src/auth.rs", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "validate_scopes|check_scope_match", + paths = ["platform/crates/ops-controller/src/"], + must_be_empty = false, + }, + rationale = "Without scope validation, any signer key with valid signature can submit any op type to any target, eliminating the privilege boundary that makes scoped keys useful. The check ensures scope validation is at least textually present; runtime tests verify behavior.", + }, + { + id = "idempotency-contract-per-op-handler", + claim = "Every op_type handler in ops-controller MUST be idempotent on jti — double-apply with same jti must produce the same final state and not duplicate side effects", + scope = "platform/crates/ops-controller/src/handlers/", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "fn handle_.*\\(.*jti.*\\)", + paths = ["platform/crates/ops-controller/src/handlers/"], + must_be_empty = false, + }, + rationale = "The persist-then-ack-then-apply protocol requires handlers to handle restart-induced re-execution. 
A handler that issues a deploy command twice is allowed by NATS semantics under restart and must produce no observable difference — typically by checking the jti against persisted apply state before issuing side effects.", + }, + { + id = "pending-queue-ttl-monitored", + claim = "OPS_PENDING_ queue depth and oldest-message age MUST be exposed as Prometheus metrics so operator-only mode (where pendings can accumulate) is observable", + scope = "platform/crates/ops-controller/, infra/.../components/observability.ncl", + severity = 'Soft, + check = { tag = 'Grep, pattern = "ops_pending_queue_depth|ops_pending_oldest_age_seconds", paths = ["platform/crates/ops-controller/src/"], must_be_empty = false }, + rationale = "In operator-only mode, pendings accumulate awaiting human signature. Without monitoring, operators may not notice that a pending sat for 13 days and is about to expire. The 14-day retention is generous but finite; observability of queue state is the operational mitigation against silent drop.", + }, + ], + + ontology_check = { + decision_string = "Ops contract dual-mode: NATS JetStream with ops.pending/ops.cmd/ops.ack/ops.audit subject namespaces + JWT-signed commands with scopes + replaceable signer (keeper-daemon auto / keeper-cli manual) + ops-controller as single per-workspace WorkQueue consumer with SurrealDB persistence of in-flight ops", + invariants_at_risk = ["solid-boundaries", "config-driven-always"], + verdict = 'Safe, + }, + + related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-014-solid-enforcement", "adr-038-radicle-decentralized-governance", "adr-039-build-infrastructure-ephemeral"], + + invariant_justification = { + invariant = "solid-boundaries", + claim = "ops-controller is a new service with a new SOLID boundary: it ONLY consumes from ops.cmd, applies via the orchestrator API, and writes to ops.audit and SurrealDB — it does not call provider APIs or auth services directly", + mitigation = "Cedar policy 
enforces that ops-controller's service identity has no permissions to call hcloud, aws, or vault directly; orchestrator interface is the only allowed dependency. Compile-time check in the ops-controller crate forbids hcloud-rs and aws-sdk-rust as dependencies.", + }, +} diff --git a/adrs/adr-038-radicle-decentralized-governance.ncl b/adrs/adr-038-radicle-decentralized-governance.ncl new file mode 100644 index 0000000..5d17cfd --- /dev/null +++ b/adrs/adr-038-radicle-decentralized-governance.ncl @@ -0,0 +1,130 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-038", + title = "Radicle Heartwood as decentralized substrate for governance, desired-state, and audit ledger across all workspaces", + status = 'Accepted, + date = "2026-04-26", + + context = "The platform requires a substrate to hold three classes of information that must survive the loss of any single cluster: (1) governance — who is authorized to sign which ops, expressed as a delegation set with M-of-N approval semantics for changes; (2) desired state — the version-controlled declaration of what each workspace should be running, used by ops emitters to compute deploy diffs; (3) audit ledger — the immutable record of which ops were applied to each workspace, signed by the applying ops-controller. All three need to be reachable by operators, ops emitters (CI on libre-daoshi, laptops, external CI), and the keeper-daemon, even when one or more nodes are unreachable. Centralized solutions (a single git server on libre-daoshi, or a hosted git provider) reintroduce the dependency the platform was designed to avoid (libre-wuji autonomy from libre-daoshi). The naive replacement — a self-hosted git server with mirroring — requires manual mirror management and does not address the governance signing question. Mutable distributed databases (etcd, Consul) handle replication but lack git's content-addressed history and signed-commit semantics, which are required for cryptographically attestable audit. 
The substrate must be peer-to-peer, support cryptographic identities for both repos and contributors, replicate via gossip without a central server, and allow patches (proposed changes) to require signatures from a configurable set of delegated keys.", + + decision = "Adopt Radicle Heartwood as the decentralized substrate for three repo families per workspace: `policy-` (keeper auto-sign policy + authorized-signers set), `-desired` (version-controlled declaration of components, settings, capabilities), and `-state` (immutable ledger of applied ops, written only by the wuji ops-controller). Each operator host (laptop), each cluster node intended to participate in governance (a designated node per cluster for libre-wuji and libre-daoshi), and the ops-vm host run a Radicle Heartwood seed node — there is no central hub. Repos are identified by their RID (Radicle ID), discovered via tracking peers. Authority on a repo is encoded in its delegation set: `policy-` and `-desired` use M-of-N delegation among operator keys (initial config: 2-of-3 for production workspaces, 1-of-1 for ops-vm); `-state` uses a single delegation — the workspace's ops-controller signing key — because the ledger is an attestation by the applying authority, not a multi-party decision. Keeper policy (consumed by keeper-daemon to decide what to auto-sign) is declarative-only Nickel (see ADR-XXX keeper-policy schema): no executable code, no Nickel function calls beyond the schema constructor. Audit events from NATS `ops.audit.*` are mirrored to `-state` via a sidecar process running in wuji that subscribes to JetStream and commits one git commit per audit message — this mirror runs at-most-once-per-message via JetStream durable consumer ack semantics. Operators may use any frontend over the local Radicle repo (plain git, jj, mob); the project does not mandate a frontend, only the substrate. 
The keeper-daemon and ops-controller use the `gix` Rust crate for direct git operations, never shelling out to git or jj — these services are not human-driven and benefit from in-process operations. The framework-level domain extension (ontoref domains/provisioning) gains a `governance` command group (governance delegations, governance signers) that reads the local Radicle clone of the workspace's policy repo and reports M-of-N quorum status.", + + rationale = [ + { + claim = "Radicle Heartwood provides cryptographic identity, gossip replication, and signed patches as a single substrate — no need to compose three lower-level primitives", + detail = "Building decentralized governance from primitives would require: a key-signed identity layer (e.g., DID), a content-addressed storage layer (git itself), a gossip replication layer (e.g., libp2p with custom protocol), and a patch/approval workflow (custom). Heartwood ships all four as a coherent system designed for source-code collaboration. The CRDT-like replication semantics of Heartwood's COB (collaborative objects) handles concurrent updates to issues, patches, and discussions correctly. We use only the patch-and-delegation subset, which is the most stable and best-tested part of the system.", + }, + { + claim = "Three repos per workspace separates concerns with different authority profiles", + detail = "Conflating policy + desired-state + audit in one repo would force a single delegation set across three semantically different actions: human governance decisions (policy), declarative configuration (desired-state), and machine attestations (audit). Splitting into three repos lets each have the right authority: M-of-N operators for policy (humans must agree), M-of-N operators + automated CI keys for desired-state (CI can propose, operators approve), and the workspace's ops-controller key alone for audit (no human approves a record of what already happened). 
It also lets the audit repo grow much faster than the others without bloating the histories that operators read frequently.", + }, + { + claim = "Keeper policy is declarative-only Nickel, evaluated by a deterministic Rust matcher — never executed as Nickel code", + detail = "If the keeper-daemon evaluated the policy by running `nickel export` on the file, a maliciously crafted policy committed by a quorum could exfiltrate keys via the eval environment or trigger unbounded computation. The decision: the policy schema (auto_sign + require_manual sections, each with image/target/scope patterns) is a closed, plain-data shape parsed by a Rust matcher. Adding new policy primitives requires updating both the schema and the matcher together — they are versioned in lock-step. This is not a general-purpose policy language and is not supposed to become one; if a future need exceeds what the schema expresses, a new ADR adds a new shape, not arbitrary expressiveness.", + }, + { + claim = "ops-controller is the sole delegate of -state because audit attests to applied ops, not approved ones", + detail = "Multiple delegates on the audit repo would mean operators or other parties could write to it. But the audit repo's value is precisely that it records what the applying authority observed — what actually happened in wuji. Allowing humans to write would let history be rewritten or fabricated; even with M-of-N controls, the value of the ledger is undermined. The ops-controller's signing key lives only on wuji, with backup encrypted online (per the decision in design discussion); rotation is rare. 
If wuji is rebuilt, the new ops-controller rotates to a new key — this is an event recorded in the policy repo (the delegation set updates), and the state repo continues with the new delegate.", + }, + { + claim = "Audit mirror from NATS to Radicle writes at most one commit per audit message (at-least-once delivery plus idempotent commit) — duplicate audit commits are not a correctness concern", + detail = "JetStream durable-consumer ack semantics guarantee at-least-once delivery of every audit message; the mirror's idempotency on commit (write commit only if the audit jti is not already present in HEAD's ancestor chain) makes the effective semantics exactly-once for the steady state. Duplicates in transient failure modes (mirror crashes between commit-write and ack) appear as a redundant commit attempt on retry, which is detected and skipped. The git history is grow-only; readers see the same content regardless of whether one or two attempts produced it.", + }, + ], + + consequences = { + positive = [ + "Governance, desired-state, and audit survive the loss of any single cluster — every operator and seed node holds a full replica via Radicle gossip", + "M-of-N delegation is a built-in primitive, not a custom approval workflow we maintain", + "Operator onboarding and offboarding are git-native operations (delegation patch signed by quorum) — no custom auth system", + "Audit history is content-addressed and signed — tampering requires forging a signature on a commit AND propagating it to all replicas, which is detectable", + "Frontends are operator choice — git, jj, mob, custom — without affecting the protocol", + "Domain-level commands (governance delegations, governance signers) work uniformly across workspaces because they read the same repo shape", + "Bootstrapping a new workspace = `rad init` three repos with appropriate delegation sets; no new infrastructure to deploy for governance", + ], + negative = [ + "Heartwood is younger than centralized git hosts — operators must learn `rad` CLI basics; mitigation: domain commands wrap common 
operations", + "Gossip replication has eventual-consistency lag — a delegation change made on one operator laptop may not be visible to keeper-daemon for seconds-to-minutes; mitigation: operations that consume policy poll for the latest commit before each decision, accepting a brief inconsistency window over hard real-time consistency", + "Audit commit rate is bounded by Radicle's gossip throughput, which is lower than NATS throughput — high-frequency ops may produce backpressure on the mirror; mitigation: batch multiple ops.audit messages into a single commit when arrival rate exceeds gossip rate", + "Operator key loss without backup is unrecoverable — a lost operator key can be removed from the delegation set by the remaining M-of-N quorum, but the operator cannot re-key without going through onboarding again", + "Cross-repo consistency (e.g., a state commit references a desired-state commit hash) is the application's responsibility — Radicle does not provide cross-repo transactions", + ], + }, + + alternatives_considered = [ + { + option = "Self-hosted Forgejo with cron-mirrored backups to other nodes", + why_rejected = "Forgejo is a centralized git server with manual mirror configuration; loss of the primary node means write operations stop until the mirror is promoted. Read replication is also pull-based and stale. The platform already runs Forgejo on libre-daoshi for human-friendly code hosting; layering decentralized governance on top of it would create two truths (Forgejo + mirrors) with potential drift. Radicle keeps governance and audit on a substrate purpose-built for the property we need.", + }, + { + option = "etcd or Consul cluster as governance store with Cedar for authorization", + why_rejected = "Distributed KV stores excel at strongly-consistent state replication but do not provide signed history. A delegation change in etcd is a write; without an external signing layer, there is no cryptographic record of who proposed and approved it. 
Cedar adds policy evaluation but not provenance. Building signed history on top of etcd requires reinventing what git+signed-commits provides natively. Radicle gives both replication and signed history in one substrate.", + }, + { + option = "OCI artifacts in zot for desired-state and audit", + why_rejected = "zot stores OCI artifacts well but is single-cluster (or replica-of-cluster) — losing wuji loses zot. Pushing desired-state and audit as OCI artifacts would couple them to wuji's availability, contradicting the requirement that governance survive cluster loss. zot's role is defined in ADR-039 (image registry with S3 backend); using it for governance would conflate two concerns.", + }, + { + option = "GitHub/GitLab repos with branch protection rules for M-of-N approval", + why_rejected = "Reintroduces a centralized provider as a hard runtime dependency, contradicting the decentralization goal. Also the approval semantics of branch protection are advisory — the API can be bypassed by an admin or by tampering with the underlying git server. Radicle's M-of-N is enforced by the protocol: a non-quorum patch is not a valid update, full stop.", + }, + ], + + constraints = [ + { + id = "policy-files-are-declarative-only", + claim = "policy.ncl files in policy- repos MUST conform to the keeper-policy schema and contain only data — no Nickel function definitions, no imports beyond the schema", + scope = "policy-*/policy.ncl across all workspaces", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "fun |let .* = fun ", + paths = ["policy-"], + must_be_empty = true, + }, + rationale = "The keeper-daemon parses policy with a Rust matcher that handles the declarative schema only. Function definitions in a policy file would be evaluated as Nickel code if accidentally piped through nickel export, opening an exfiltration vector. 
The constraint enforces the schema-only convention.", + }, + { + id = "state-repo-single-delegate", + claim = "-state Radicle repos MUST have exactly one delegate: the ops-controller key for that workspace", + scope = "Radicle delegation set of all -state repos", + severity = 'Hard, + check = { + tag = 'NuCmd, + cmd = "if (rad inspect $WORKSPACE-state | from json | get delegates | length) != 1 { exit 1 }", + expect_exit = 0, + }, + rationale = "Multi-delegate state repos would allow rewriting audit history. The constraint enforces that only the applying authority writes the audit ledger. Rotating the ops-controller key is a separate, governed operation that updates the delegate.", + }, + { + id = "audit-mirror-idempotent-on-jti", + claim = "The audit mirror sidecar MUST refuse to commit a duplicate jti — checked against the HEAD ancestor chain before committing", + scope = "platform/crates/audit-mirror/", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "check_jti_in_ancestors|already_committed", + paths = ["platform/crates/audit-mirror/"], + must_be_empty = false, + }, + rationale = "JetStream at-least-once delivery means the mirror sees duplicate messages on retry. Without the idempotency check, the audit history would contain N-1 duplicate commits per failure event, polluting the ledger. The check makes duplicate handling a no-op.", + }, + { + id = "desired-state-references-immutable", + claim = "When -state references a -desired commit hash in an audit entry, the referenced hash MUST be present in the desired repo's history at the time of audit write", + scope = "platform/crates/ops-controller/src/audit_emit.rs", + severity = 'Soft, + check = { tag = 'Grep, pattern = "desired.*commit|commit_hash|verify_commit", paths = ["platform/crates/ops-controller/src/audit_emit.rs"], must_be_empty = false }, + rationale = "If audit references a hash that disappears (e.g., desired repo is force-pushed by a buggy operator workflow), the audit becomes uninterpretable. 
Soft severity because Radicle's signed-commit model already makes force-push effectively impossible without quorum, but explicit cross-reference verification adds defense in depth.", + }, + ], + + ontology_check = { + decision_string = "Radicle Heartwood as decentralized substrate: three repos per workspace (policy / desired / state) with distinct delegation profiles (M-of-N humans / M-of-N+CI / single ops-controller) + declarative-only keeper policy schema + audit mirror from NATS to Radicle with jti idempotency + domain-level governance commands reading local Radicle clones", + invariants_at_risk = ["config-driven-always", "type-safety-nickel"], + verdict = 'Safe, + }, + + related_adrs = ["adr-037-ops-contract-dual-mode", "adr-014-solid-enforcement", "adr-018-secretumvault-integration", "adr-039-build-infrastructure-ephemeral"], +} diff --git a/adrs/adr-039-build-infrastructure-ephemeral.ncl b/adrs/adr-039-build-infrastructure-ephemeral.ncl new file mode 100644 index 0000000..d33f492 --- /dev/null +++ b/adrs/adr-039-build-infrastructure-ephemeral.ncl @@ -0,0 +1,139 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-039", + title = "Build infrastructure: golden-imaged ephemeral runners with dynamic sizing, S3-backed multi-tenant zot in workload cluster, and CI-orchestration separation", + status = 'Accepted, + date = "2026-04-26", + + context = "The platform needs to compile workloads (Rust binaries like Vapora, the orchestrator crates, ops-keeper, ops-controller) and produce OCI images for runtime consumption. 
Three constraints shape the design: (1) the CI cluster (libre-daoshi class) is sized for orchestration and source-of-truth services (forgejo, woodpecker server, postgresql) — running CPU-heavy compiles inside its k0s cluster causes scheduling pressure on the orchestration services and forces the cluster to be sized for peak build load rather than steady-state orchestration; (2) the workload cluster (libre-wuji class) must remain CI-free per ADR-037 to preserve runtime autonomy; (3) image storage must survive the loss of any single cluster — keeping zot inside libre-wuji with local volume storage means losing wuji wipes the registry, and rebuilding the registry from external sources is a slow recovery path. The orchestrator (memory: platform/vm/ subsystem) already supports VM lifecycle (spawn, persistence, golden image cache, cleanup scheduler), making ephemeral builders feasible without new infrastructure. The remaining design decisions concern image storage durability, runner sizing, cache locality, and where the registry physically lives.", + + decision = "Adopt an ephemeral-builder + central-registry architecture, described in six numbered parts. (1) BuildKit runs in ephemeral VMs spawned by the orchestrator on demand. Each VM is created from a periodically-rebuilt golden image (`buildkit-runner-golden:`) pre-installed with buildkit (rootless), sccache, nushell, and SSH server keyed for the orchestrator. Spawn time targets ~30s vs ~2min for cloud-init from a generic base image. The golden image itself is rebuilt weekly via a Woodpecker pipeline that runs in a current ephemeral runner — the chain is self-rebuilding after the initial bootstrap. 
(2) Runner sizing is dynamic per build, resolved in three tiers: explicit declaration in `.build-spec.ncl` at the repo root (BuildSpec contract: cpu, memory_gb, disk_gb, time_budget_min, cache_keys, oom_retry); historical p95 of CPU/RAM peaks for that repo from the orchestrator's SurrealDB build-metrics table, multiplied by 1.2; language defaults from the orchestrator (Cargo.toml → medium 4vCPU/8GB, package.json → small 2vCPU/4GB, etc.). Final size = max(declared, 1.2×p95_historical). OOM kill auto-retries once with one size up. Time budget enforced as VM-level kill. (3) zot lives in libre-wuji (relocated from libre-daoshi) configured with S3-compatible backend (Hetzner Object Storage, Backblaze B2, or compatible). The S3 bucket is the durable storage; zot pods are stateless and can be killed/respawned without data loss. Bucket configuration: versioning enabled (point-in-time recovery), lifecycle policy (90-day non-current version retention), optional cross-region replication to a second bucket on a different provider for catastrophic recovery. zot's auth model uses JWT integrated with the workspace's NATS account hierarchy — daoshi-ci principals have write to /images, /cache, /sccache, /crates; wuji workload pods have read on /images; operators have write on /crates; public read on /crates if the operator chooses to publish a Rust crate registry. (4) The buildkit-launcher binary (woodpecker plugin) bridges Woodpecker pipeline steps to the orchestrator: it requests a runner of the resolved size, waits for ready, ships build context via SSH, invokes buildctl with --import-cache and --export-cache pointing to zot.wuji.local, collects logs, requests destroy. The launcher carries no persistent state; orchestrator owns the lease. (5) Cache strategy uses zot as both layer cache (BuildKit registry-mode cache) and Rust object cache (sccache S3-backend pointed at zot's S3-compatible API). 
Cold runner with warm cache compiles at near-warm-runner speed because the network distance to zot is short and the cache is rich. (6) Coupling consequence: builds depend on wuji being reachable (zot lives there). When wuji is unreachable, builds can run cold-locally on the runner but cannot push results — operators acknowledge this trade-off; an optional pull-through cache mirror in libre-daoshi can be added later if the coupling produces measurable friction.", + + rationale = [ + { + claim = "Ephemeral runners + golden images give build farm bursting without fixed-cost capacity", + detail = "A persistent build VM sized for the largest workload (Vapora) wastes CPU and RAM 95% of the time it sits idle. Per-build VMs scale to zero between builds — the only cost is the spawn time, which the golden image reduces to ~30s. The orchestrator already manages VM lifecycle for taskservs (memory: platform/vm/lifecycle.nu, vm_persistence.nu, cleanup_scheduler), so adding the buildkit_runner role is a component definition and a launcher binary, not a new subsystem.", + }, + { + claim = "Three-tier dynamic sizing handles the spread between trivial CI tasks and Vapora-class compiles without overcommit", + detail = "Static sizing variants (small/medium/large/xlarge) impose two failure modes: under-sized (OOM, slow), over-sized (wasted resources, slower spawn for unnecessarily large VMs). Reading `.build-spec.ncl` lets the repo declare its needs explicitly. P95 historical fallback handles repos that never declared a spec but have build history — most repos converge to a stable size. Language defaults handle the first build of a new repo. 
The 1.2× multiplier on historical p95 absorbs typical variance without exposing builds to OOM kill on a marginally larger build than usual.", + }, + { + claim = "zot with S3 backend makes the registry stateless — DR is a property of S3, not zot", + detail = "Self-managed durable storage for a registry (cluster volumes + replication + backup) is a recurring operational task. S3-class storage (any compatible provider) gives 11-nines durability natively and supports versioning and cross-region replication as configuration. Moving zot to that backend means the kubernetes pod is replaceable on a moment's notice with no data migration — the bucket is the source of truth. The DR question reduces to: is the bucket reachable, and is its versioning intact? — both of which are provider responsibilities. Cross-provider replication (e.g., Hetzner primary + Backblaze secondary) addresses provider catastrophic loss.", + }, + { + claim = "Cache lives in the registry because BuildKit and sccache both speak S3-compatible APIs to a shared registry", + detail = "BuildKit supports `--export-cache type=registry` and `--import-cache type=registry`, writing layer cache as OCI artifacts to the same registry that holds final images. sccache supports S3 backend that can target zot's S3-compatible endpoint (zot exposes an S3 API for direct artifact upload). Both caches benefit from the same durability and replication as the images themselves. A new cold runner pulling cache from zot is essentially as fast as the cache is rich; running the cache locally on the VM gains nothing because the VM is destroyed at end of build.", + }, + { + claim = "buildkit-launcher is thin to keep state in the orchestrator, not in Woodpecker", + detail = "Putting orchestration logic (lease tracking, cleanup on failure, retry policy) in the launcher would duplicate logic the orchestrator already implements for VM-backed taskservs. 
The launcher is a wrapper: requests a runner, hands off to buildctl on the runner, collects results. If the launcher process dies mid-build, the orchestrator's cleanup scheduler reaps the orphaned VM. If the runner OOMs, the orchestrator retries with the next size. The launcher's only job is to bridge Woodpecker step semantics (env vars, exit code, log capture) to the orchestrator's leased-resource semantics.", + }, + ], + + consequences = { + positive = [ + "libre-daoshi cluster stays small and steady-state — orchestration services are not preempted by build CPU", + "Build capacity is elastic without operator intervention — concurrent builds spawn concurrent VMs up to the orchestrator's configured pool limit", + "Build cold-start with warm cache is near-warm — sccache hits at network speed from a same-provider VM", + "Image registry DR is reduced to S3 bucket configuration — versioning, lifecycle, cross-region replication are all provider features", + "zot multi-tenant layout (/images, /cache, /sccache, /crates) lets the same registry serve workload images, build cache, Rust crates, and OCI artifacts uniformly", + "Golden image rebuild via the system itself (a runner builds the next runner image) means no permanent external build dependency once bootstrapped", + "Sizing dynamism makes Vapora-class builds and trivial doc builds use appropriate resources without manual tuning per pipeline", + ], + negative = [ + "Builds depend on wuji being reachable for zot — wuji outage stops the publish step (mitigation: optional pull-through cache mirror in libre-daoshi if measured friction warrants)", + "Initial bootstrap requires producing the first golden image off-platform (laptop or external CI) — documented in playbook, but a one-time manual step", + "Per-build VM creation has spawn-cost floor (~30s with golden image) — hot-path one-second test runs are not the right shape for this model; small in-cluster runners may be added later if a workload demands sub-spawn-cost CI", 
+ "Orchestrator's VM pool limit becomes a build concurrency ceiling — needs sizing per workspace based on observed peak parallelism", + "Runner OOM auto-retry doubles VM cost for that build — repeated retries for flaky builds inflate cloud costs; mitigation: max 1 retry, with explicit failure surfaced to the developer", + "Cross-provider S3 replication has lag — the secondary bucket is eventual-consistent with the primary, so a same-second push-and-pull from secondary may miss; mitigation: cross-provider replication is for DR, not for normal reads", + ], + }, + + alternatives_considered = [ + { + option = "Persistent build VMs with strong per-VM cache locality", + why_rejected = "Sized for peak load, idle 95% of the time. Cache locality benefit is partial because cross-VM cache requires central storage anyway. Operational maintenance (patching, OS updates) on persistent VMs is recurring; ephemeral VMs from a periodically-refreshed golden image trade per-build spawn cost for zero ongoing maintenance.", + }, + { + option = "BuildKit pods inside libre-daoshi cluster", + why_rejected = "Couples build CPU to orchestration cluster — large builds cause scheduler pressure on forgejo, woodpecker server, postgresql. Sizing the cluster for peak builds wastes resources between builds. Out-of-cluster ephemeral VMs avoid this entirely with no architectural cost since the orchestrator already runs them for taskservs.", + }, + { + option = "GitHub-hosted runners or other external CI for builds", + why_rejected = "Reintroduces an external runtime dependency for the build step, contradicting the platform's autonomy goals. Also creates two CI surfaces (Woodpecker + GitHub Actions) operators must reason about. 
The orchestrator-spawned ephemeral runners give the same elasticity within the platform's own infrastructure.", + }, + { + option = "zot in libre-daoshi cluster with local volumes", + why_rejected = "Centralizes images on the wrong cluster — wuji should be the source of truth at runtime per ADR-037. Also single-cluster local-volume storage has no DR path that does not involve manual replication. S3 backend in wuji gives DR via provider features without manual replication.", + }, + { + option = "Nix as the build system instead of BuildKit", + why_rejected = "Nix delivers reproducible builds and a richer caching model, but the project is not Nix-native — workloads are built with cargo, npm, go, and language-native toolchains. Adopting Nix wholesale is a separate, larger decision. BuildKit accepts the existing Dockerfile/buildctl workflow most workloads already have. If a future workload demands bit-reproducible builds, Nix can run inside a BuildKit step without changing the surrounding architecture.", + }, + ], + + constraints = [ + { + id = "buildkit-runner-no-persistent-storage", + claim = "buildkit_runner component MUST NOT declare persistent volumes — all state lives on ephemeral disk and is destroyed with the VM", + scope = "catalog/components/buildkit_runner.ncl", + severity = 'Hard, + check = { tag = 'Grep, pattern = "persistent.*=.*false", paths = ["provisioning/catalog/components/buildkit_runner.ncl"], must_be_empty = false }, + rationale = "Persistent storage on ephemeral runners defeats the cost model and recreates the persistent-VM maintenance burden. 
Cache locality is provided by zot, not by persistent disks.", + }, + { + id = "zot-storage-must-be-s3", + claim = "zot component in libre-wuji MUST configure storage.backend = 's3' — local-volume storage is not permitted for the workload-cluster registry", + scope = "workspaces/libre-wuji/infra/libre-wuji/components/zot.ncl", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "backend = \"s3\"|backend.*s3", + paths = ["workspaces/libre-wuji/infra/libre-wuji/components/zot.ncl"], + must_be_empty = false, + }, + rationale = "Local-volume zot has no DR path consistent with the platform's resilience goals. The constraint forces the S3 backend choice at config-validation time.", + }, + { + id = "build-spec-schema-versioned", + claim = ".build-spec.ncl files in repos MUST validate against schemas/lib/build_spec.ncl — invalid specs cause launcher to fail-fast with a parse error, not silently fall back", + scope = "schemas/lib/build_spec.ncl, platform/crates/buildkit-launcher/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "SchemaError|schema_error|schema_validation|validation_diff", paths = ["provisioning/platform/crates/buildkit-launcher/src/"], must_be_empty = false }, + rationale = "Silent fallback on invalid build-spec files masks misconfigurations until a build OOMs unexpectedly. Fail-fast surfaces the issue at the next pipeline run, when the developer can fix it.", + }, + { + id = "oom-retry-bounded", + claim = "buildkit-launcher OOM retry MUST be bounded to one retry per build — repeated retries inflate cost and indicate misconfiguration that needs developer attention", + scope = "platform/crates/buildkit-launcher/src/retry.rs", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "max_oom_retries|MAX_OOM_RETRY|oom_retry_limit", + paths = ["platform/crates/buildkit-launcher/"], + must_be_empty = false, + }, + rationale = "Unbounded retries on flaky builds turn a $0.10 build into a $1+ build silently. 
The bound is policy: one retry covers transient sizing miss, repeat OOM means the developer should declare a larger spec.", + }, + { + id = "golden-image-rebuild-cadence", + claim = "buildkit-runner-golden image MUST be rebuilt at least weekly — older golden images accumulate package vulnerabilities and toolchain drift", + scope = "Woodpecker pipeline definitions, orchestrator default-image config", + severity = 'Soft, + check = { tag = 'Grep, pattern = "golden-image-rebuild", paths = [".woodpecker/"], must_be_empty = false }, + rationale = "Stale golden images are a slow-moving security problem — toolchain CVEs accumulate. Weekly rebuild is generous but acceptable; faster cadence is fine but adds noise. Soft severity because the cadence is operational policy, not a structural invariant.", + }, + ], + + ontology_check = { + decision_string = "Build infrastructure: golden-imaged ephemeral runners spawned by orchestrator + dynamic sizing (.build-spec.ncl + p95 historical + language defaults) + zot relocated to libre-wuji with S3 backend (versioning + cross-region replication) + multi-tenant zot layout (images/cache/sccache/crates) + buildkit-launcher as thin Woodpecker-to-orchestrator bridge + sccache and BuildKit cache both terminated at zot", + invariants_at_risk = ["config-driven-always", "type-safety-nickel"], + verdict = 'Safe, + }, + + related_adrs = ["adr-037-ops-contract-dual-mode", "adr-038-radicle-decentralized-governance", "adr-021-workspace-composition-dag", "adr-033-cluster-component-extension-pattern"], +} diff --git a/adrs/adr-040-lian-build-lift-out.ncl b/adrs/adr-040-lian-build-lift-out.ncl new file mode 100644 index 0000000..fa2f7ca --- /dev/null +++ b/adrs/adr-040-lian-build-lift-out.ncl @@ -0,0 +1,83 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-040", + title = "buildkit-launcher lifted out as lian-build: standalone build substrate peer project", + status = 'Accepted, + date = "2026-05-01", + + context = 
"provisioning/platform/crates/buildkit-launcher and provisioning/extensions/components/buildkit_runner implemented the build substrate — ephemeral buildkit compute provisioning, OCI cache management, golden image lifecycle — inside the provisioning workspace. ADR-039 defined the architecture; buildkit-launcher was the implementation. Two structural problems emerged: (1) the build substrate domain (ephemeral compute, OCI cache, multi-actor sessions, provider abstraction) is orthogonal to provisioning's core domain (workspace lifecycle management); evolution of cache namespacing, provider traits, or session models required provisioning releases; (2) vapora (multi-agent orchestration) and workspace CI pipelines (Woodpecker) needed build substrate access without depending on the full provisioning binary and config system. The component passed all four criteria of the ontoref lift-out pattern (ADR-016 in ontoref): orthogonal concern, consumer plurality, release cadence divergence, config path-agnostic.", + + decision = "buildkit-launcher and buildkit_runner are extracted from provisioning as lian-build (炼), a standalone build substrate project at /Users/Akasha/Development/lian-build. provisioning retains catalog/lian-build/ — the NCL schemas, defaults, and component declarations that allow workspace infras to supply BuildDirectives to lian-build. Workspace infras declare their build intent using lian-build's NCL vocabulary; provisioning's runtime calls the lian-build binary with the generated directive config. No provisioning crate is imported by lian-build as a library dependency. buildkit-launcher and buildkit_runner workspace member entries are removed from provisioning/platform/Cargo.toml.", + + rationale = [ + { + claim = "Orthogonal domain justifies independent release lifecycle", + detail = "provisioning's core loop is: read NCL workspace definition → reconcile component state → apply changes. 
lian-build's core loop is: receive BuildDirectives → provision ephemeral compute → run buildkitd → collect artifacts → tear down. These loops share no state and evolve on different schedules. Adding a second registry provider to lian-build should not require a provisioning release, and vice versa.", + }, + { + claim = "Consumer plurality is proven", + detail = "At extraction time: provisioning (workspace component builds), vapora (multi-agent build sessions), workspace CI pipelines (Woodpecker steps). Three distinct callers, each with different invocation patterns and config sources, confirm lian-build's value is not provisioning-specific.", + }, + { + claim = "catalog/lian-build/ is the correct integration surface", + detail = "provisioning loads lian-build's BuildDirectives schema as an extension, making it available to workspace infra NCL. The workspace declares build intent; provisioning validates it against lian-build's schema and passes it to the binary. This is the correct dependency direction: provisioning knows about lian-build's vocabulary, but lian-build does not know about provisioning's internal structures.", + }, + ], + + consequences = { + positive = [ + "lian-build releases independently: provider additions, cache policy changes, session model improvements do not block provisioning", + "vapora and workspace CI pipelines consume lian-build directly without routing through provisioning", + "provisioning/platform/Cargo.toml shrinks: two workspace members removed", + "ComputeProvider and RegistryProvider trait boundaries are declared at project inception, not retrofitted", + ], + negative = [ + "Two schema maintenance surfaces: lian-build/schemas/ (source of truth) and provisioning/catalog/lian-build/ (consumer-side reference)", + "Workspace infras that previously used buildkit_runner component definitions must migrate to lian-build's BuildDirectives schema", + ], + }, + + alternatives_considered = [ + { + option = "Keep buildkit-launcher as a 
provisioning crate, expose via provisioning subcommand", + why_rejected = "Prevents vapora and Woodpecker from using the build substrate without depending on provisioning. Release coupling blocks provider evolution. Four-criterion test (ontoref ADR-016) makes the extraction unambiguously correct.", + }, + ], + + constraints = [ + { + id = "extensions-not-binary-dep", + claim = "provisioning must not import lian-build as a Rust library dependency; interaction is via CLI invocation with NCL-generated config", + scope = "provisioning/platform/Cargo.toml, provisioning/catalog/", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "lian-build|buildkit-launcher|buildkit_runner", + paths = ["provisioning/platform/Cargo.toml"], + must_be_empty = true, + }, + rationale = "Library dependency would re-couple provisioning's build cycle to lian-build's. The integration surface is the CLI binary + NCL schema, not the Rust crate graph.", + }, + { + id = "extensions-lian-build-present", + claim = "provisioning/catalog/lian-build/ must be present and contain the BuildDirectives schema before the workspace member entries are removed", + scope = "provisioning/catalog/lian-build/", + severity = 'Hard, + check = { + tag = 'FileExists, + path = "provisioning/catalog/lian-build/build_directives.ncl", + present = true, + }, + rationale = "Removing the workspace member without the extension schema would break workspace infras that declare build components. 
Schema first, then removal.", + }, + ], + + related_adrs = ["adr-039-build-infrastructure-ephemeral", "ontoref:adr-016-component-lift-out-pattern"], + + ontology_check = { + decision_string = "buildkit-launcher and buildkit_runner extracted as lian-build standalone project; provisioning retains catalog/lian-build/ as integration surface; no Rust library dependency from provisioning to lian-build", + invariants_at_risk = [], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-041-cloudatasave-lift-out.ncl b/adrs/adr-041-cloudatasave-lift-out.ncl new file mode 100644 index 0000000..4b5de9e --- /dev/null +++ b/adrs/adr-041-cloudatasave-lift-out.ncl @@ -0,0 +1,92 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-041", + title = "backup-manager lifted out as cloudatasave: standalone backup orchestrator peer project", + status = 'Accepted, + date = "2026-05-01", + + context = "provisioning/platform/crates/backup-manager implemented backup orchestration — restic-first snapshots, multi-destination replication, consistency-point group management — inside the provisioning workspace. Two structural problems emerged: (1) backup orchestration domain (snapshot lifecycle, engine abstraction, restore verification, retention policy) is orthogonal to provisioning's core domain (workspace lifecycle management); (2) the implementation was coupled to provisioning's platform_config crate for config loading, preventing standalone use. Any project needing backup operations (workspace CI, scripts, standalone invocation) had to depend on the full provisioning binary. The component passed all four criteria of the ontoref lift-out pattern (ontoref ADR-016): orthogonal concern, consumer plurality, release cadence divergence, config path-agnostic.", + + decision = "backup-manager is extracted from provisioning as cloudatasave, a standalone backup orchestration project (Forgejo: LibreCloud/cloudDataSave). 
provisioning retains catalog/cloudatasave/ — the NCL schemas, defaults, and component declarations that allow workspace infras to declare BackupGroups and BackupPolicies. Workspace infras declare their backup intent using cloudatasave's NCL vocabulary; provisioning's runtime calls the cloudatasave binary with the generated policy config. No provisioning crate is imported by cloudatasave as a library dependency. backup-manager workspace member entry is removed from provisioning/platform/Cargo.toml.", + + rationale = [ + { + claim = "Backup orchestration evolves on a different cadence than workspace lifecycle", + detail = "Adding a new backup destination type (SFTP, Storj, rsync.net), implementing kopia as a second engine, or improving restore verification scheduling should not require a provisioning release. cloudatasave's only coupling to provisioning is the BackupPolicy NCL vocabulary — the schema is stable across provisioning versions once published.", + }, + { + claim = "Portable backup is a general capability, not a provisioning-specific one", + detail = "Any project — not just provisioning workspaces — may need to declare backup groups, schedule snapshots, and verify restores. A cloudatasave that does not import provisioning infrastructure can be adopted by standalone scripts, CI pipelines, and other projects without inheriting provisioning's dependency tree.", + }, + { + claim = "Verify-as-provisioning axiom requires cloudatasave to own verification state", + detail = "The principle that a backup group is not provisioned until verified is a cloudatasave invariant. If backup-manager remained inside provisioning, this invariant would be implemented as a provisioning concern, creating tight coupling between provisioning's provisioning-state tracking and backup verification. 
As a standalone project, cloudatasave owns the invariant completely.", + }, + { + claim = "catalog/cloudatasave/ is the correct integration surface", + detail = "provisioning loads cloudatasave's BackupPolicy and BackupGroup schemas as an extension, making them available to workspace infra NCL. The workspace declares backup policy; provisioning validates it against cloudatasave's schema and passes it to the binary. This is the correct dependency direction: provisioning knows about cloudatasave's vocabulary, but cloudatasave does not know about provisioning's internal structures.", + }, + ], + + consequences = { + positive = [ + "cloudatasave releases independently: engine additions, verification improvements, destination types do not block provisioning", + "Any project can adopt cloudatasave by declaring a BackupPolicy in its NCL vocabulary — not only provisioning workspaces", + "provisioning/platform/Cargo.toml shrinks: backup-manager workspace member removed", + "BackupEngine trait boundary is declared at project inception, forcing the engine abstraction to be correct from day one", + ], + negative = [ + "Two schema maintenance surfaces: cloudatasave/schemas/ (source of truth) and provisioning/catalog/cloudatasave/ (consumer-side reference)", + "Workspace infras that previously used backup_manager component definitions must migrate to cloudatasave's BackupPolicy schema", + "cloudatasave must implement its own config loading without platform_config — one-time cost at extraction", + ], + }, + + alternatives_considered = [ + { + option = "Keep backup-manager in provisioning, expose as prvng backup subcommand only", + why_rejected = "Prevents standalone invocation, CI pipeline integration without provisioning, and blocks the verify-as-provisioning model from being a cloudatasave-internal invariant. 
Four-criterion test (ontoref ADR-016) makes the extraction correct.", + }, + { + option = "Use a managed Kubernetes backup solution (Velero)", + why_rejected = "Velero targets in-cluster resource backup (PVCs, manifests). cloudatasave targets data backup: application snapshots, database dumps, object storage replication. These are complementary, not substitutes. cloudatasave's engine abstraction can later add a Velero-backend for PVC-class workloads.", + }, + ], + + constraints = [ + { + id = "extensions-not-binary-dep", + claim = "provisioning must not import cloudatasave as a Rust library dependency; interaction is via CLI invocation with NCL-generated config", + scope = "provisioning/platform/Cargo.toml, provisioning/catalog/", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "cloudatasave|backup-manager", + paths = ["provisioning/platform/Cargo.toml"], + must_be_empty = true, + }, + rationale = "Library dependency would re-couple provisioning's build cycle to cloudatasave's. The integration surface is the CLI binary + NCL schema, not the Rust crate graph.", + }, + { + id = "minimum-two-destinations-enforced", + claim = "provisioning's catalog/cloudatasave/ schema must declare minimum-two-destinations as a hard contract so workspace infras cannot declare single-destination groups", + scope = "provisioning/catalog/cloudatasave/backup_group.ncl", + severity = 'Hard, + check = { + tag = 'FileExists, + path = "provisioning/catalog/cloudatasave/backup_group.ncl", + present = true, + }, + rationale = "The multi-destination-custody axiom must be enforced at the workspace infra declaration layer, not only at cloudatasave runtime. 
Early validation prevents misconfigured groups from reaching the orchestrator.", + }, + ], + + related_adrs = ["adr-037-ops-contract-dual-mode", "ontoref:adr-016-component-lift-out-pattern"], + + ontology_check = { + decision_string = "backup-manager extracted as cloudatasave standalone project; provisioning retains catalog/cloudatasave/ as integration surface; no Rust library dependency from provisioning to cloudatasave", + invariants_at_risk = [], + verdict = 'Safe, + }, +} diff --git a/adrs/adr-042-ecosystem-integration-modes.ncl b/adrs/adr-042-ecosystem-integration-modes.ncl new file mode 100644 index 0000000..54cd9ae --- /dev/null +++ b/adrs/adr-042-ecosystem-integration-modes.ncl @@ -0,0 +1,141 @@ +let d = import "adr-defaults.ncl" in + +d.make_adr { + id = "adr-042", + title = "Ecosystem integration via federated Integration Modes mediated by versioned Domain artifacts in OCI", + status = 'Accepted, + date = "2026-05-01", + + context = "After the lift-out of lian-build (ADR-040) and cloudatasave (ADR-041) as standalone projects, no mechanism exists for them to integrate with provisioning without re-coupling to its filesystem. An earlier framing modeled the problem as 'extension of a host' — a plugin protocol with an ExtensionManifest parallel to Mode, where provisioning was the host and external projects were plugins. This framing is structurally incompatible with the ontoref protocol already adopted across the ecosystem: ontoref has Mode, Domain, and Reflection as native primitives that express the same relationship with semantic coherence. The term 'extension' is also overloaded in the codebase: it simultaneously names (a) the plugin-protocol framing (rejected) and (b) the IaC artifact catalog at provisioning/extensions/ (components, providers, taskservs, playbooks, workflows). 
These two concepts share a name by historical accident.", + + decision = "Adopt the federated Integration Mode pattern: each participant (provisioning, lian-build, cloudatasave, vapora, CI) declares its integration points as Modes with kind = 'integration in its own reflection/modes/. Participants exchange typed contracts via Domain artifacts — versioned OCI blobs hosted at reg.librecloud.online/domains/: — without reading each other's filesystems. Provisioning-side caller context (SOPS secrets, component values, literals) is materialised as Cabling files (infra//integrations/.ncl) resolved by a dedicated Rust crate (context-assembler) into a typed JSON payload sent via stdin to the Mode binary. Three primitives: Integration Mode (reflects self, declares domains_used), Domain artifact (shared OCI contract with inputs/outputs/events channels), Cabling (workspace-local materialisation of the contract). Separately, provisioning/extensions/ is renamed to provisioning/catalog/ to give the IaC building-block catalog its semantically correct name, decoupled from the integration protocol.", + + rationale = [ + { + claim = "federated Mode pattern is the native on+re idiom — no parallel schema needed", + detail = "ontoref already models operational units as Modes with DAG steps, QA specs, and capability declarations. Introducing a separate ExtensionManifest concept alongside Mode would split the semantic model. A Mode with kind = 'integration is a Mode — it fits the existing indexing, describe, and run primitives natively without new tooling.", + }, + { + claim = "OCI Domain artifacts provide content-addressed versioned contracts without a custom server", + detail = "A Domain artifact is a set of Nickel schema files (inputs.ncl, outputs.ncl, events.ncl, capabilities.ncl, version.ncl) pushed to the existing zot registry with a custom mediaType. oras-cli handles push/pull. 
Content-addressable digests provide integrity without signing in v0.1 (acceptable for internal use; cosign added in hardening). No custom server, no custom protocol: standard OCI distribution API.", + }, + { + claim = "Ruta beta for OCI CLI surface: local implementation in prvng integration, not upstream to ontoref", + detail = "ontoref v0.1.0 implements no OCI commands: no domain publish, no domain pull, no ecosystem domains, no OCI distribution API access, no custom mediaType management. Contributing upstream requires designing, reviewing, and merging a large module in a project with its own release cadence before the first proof-of-stack. Implementing locally under prvng integration and extracting after validation is coherent with the lift-out pattern already practised (lian-build and cloudatasave both originated in provisioning). Breaking changes during v0.1 stay local. Extraction is mechanical once the contract is stable.", + }, + { + claim = "Ruta B for Mode schema extension: embedded subset in provisioning schemas until stable", + detail = "For the same iteration-cost reason: a Mode with kind = 'integration and its associated fields (domains_used, invocation, direction) lives in provisioning/schemas/lib/integration_mode_manifest.ncl until validated with two real consumers (lian-build, cloudatasave). Promoting upstream after validation avoids locking the ontoref schema to an API that may still change.", + }, + { + claim = "catalog/ rename removes the semantic collision from the codebase permanently", + detail = "As long as provisioning/extensions/ exists, every future developer must reason about whether 'extension' means the IaC catalog or the rejected plugin protocol. A clean rename to provisioning/catalog/ eliminates the ambiguity at zero ongoing cost. 
The protocol framing disappears from commands, dispatch, schemas, and module paths simultaneously.", + }, + { + claim = "Modes never read caller filesystem — context arrives as typed stdin JSON", + detail = "The plugin-protocol approach allowed extension manifests to declare filesystem paths the host would populate. This creates implicit coupling: the Mode depends on the host's directory layout. Delivering context as a typed JSON payload (assembled by context-assembler from Cabling) means the Mode binary needs no knowledge of the caller's filesystem. The contract is the Domain schema, not a path convention.", + }, + ], + + consequences = { + positive = [ + "lian-build and cloudatasave integrate with provisioning by declaring Modes and consuming Domains — no filesystem coupling, no re-absorption", + "New participants (vapora, CI, future) adopt the pattern by publishing a Mode artifact; no changes to provisioning internals required", + "Domain artifacts are OCI-addressable: discovery is standard registry catalog API, not filesystem grep", + "Cabling files are workspace-local NCL — type-safe, auditable, version-controlled alongside the workspace definition", + "provisioning/catalog/ correctly names the IaC building-block library; confusion with the integration protocol is eliminated", + "context-assembler Rust crate is generically reusable by any caller: vapora, CI, future participants", + "Signing (cosign) is addable as a hardening step without architectural change — the OCI push/pull path already exists", + ], + negative = [ + "oras-cli is a new runtime dependency for the OCI CLI surface (acceptable: single binary, brew install)", + "context-assembler introduces a Rust crate boundary for what was previously a Nushell inline operation — justified by crypto-sensitive plaintext handling and typed schema validation", + "provisioning/catalog/ rename propagates across all workspace component NCL imports — one-time cost, verified by nickel export smoke tests", + "Mode 
binary distribution (how the Mode binary reaches the operator's PATH) is out-of-band for v0.1 (invocation.method = 'path_assumed); resolved in a later iteration when 'oci_blob or 'cargo_install are needed", + "Signing of Domain and Mode artifacts is deferred to hardening (TASK-14); artifacts are unsigned during v0.1, acceptable for internal-only registry", + ], + }, + + alternatives_considered = [ + { + option = "Extension protocol: host-mediated plugin manifest parallel to Mode", + why_rejected = "Introduced a second schema object (ExtensionManifest) alongside Mode in the ontoref model, creating two registration paths for essentially the same concept. The host (provisioning) becoming a plugin-loader couples its internal lifecycle to external project release cadences. Filesystem-path coupling between host and extension was implicit and unvalidated. The on+re Mode primitive already expresses this relationship — a parallel schema is an anti-PAP.", + }, + { + option = "Ruta alpha for OCI CLI: upstream contribution to ontoref", + why_rejected = "ontoref v0.1.0 has no OCI surface at all. Contributing upstream means coordinating design, review, and merge in a project with independent release cadence before proving the first round-trip. Ruta beta (local implementation, extract after validation) delivers the proof-of-stack sooner and keeps breaking changes local during the unstable v0.1 period. If ontoref gains throughput and there is demand outside provisioning, extraction is mechanical.", + }, + { + option = "Ruta A for Mode schema: upstream contribution of kind = 'integration to ontoref core", + why_rejected = "Same reasoning as ruta alpha for OCI: the schema should stabilise against two real consumers (lian-build, cloudatasave) before being locked into upstream. 
Ruta B (embedded subset) allows iteration without external coordination.", + }, + { + option = "Collapsing provisioning/extensions/ under provisioning/integrations/", + why_rejected = "A component, provider, taskserv, playbook, or workflow is not an integration in the federated Mode sense. Collapsing them under the same name would re-introduce the semantic collision the rename is trying to eliminate. The IaC building blocks are catalog items; the federated protocol participants are integration modes. Two concepts, two names: catalog/ and integrations/.", + }, + ], + + constraints = [ + { + id = "integration-mode-must-declare-domains-used", + claim = "A Mode with kind = 'integration MUST declare domains_used as a non-empty array — a Mode that declares no domains is not an integration mode", + scope = "provisioning/schemas/lib/integration_mode_manifest.ncl", + severity = 'Hard, + check = { + tag = 'FileExists, + path = "provisioning/schemas/lib/integration_mode_manifest.ncl", + present = true, + }, + rationale = "An integration Mode with no domains_used has no contract with its caller — it might as well be a regular Mode. 
The domains_used requirement enforces that every integration point is explicitly declared and versioned.", + }, + { + id = "integration-mode-no-filesystem-read-of-caller", + claim = "Integration Mode binaries MUST NOT read the caller's filesystem — context arrives exclusively as typed stdin JSON assembled by context-assembler from the Cabling file", + scope = "provisioning/schemas/lib/integration_mode_manifest.ncl, crates/context-assembler/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "path_read|filesystem_", paths = ["provisioning/schemas/lib/integration_mode_manifest.ncl"], must_be_empty = true }, + rationale = "Filesystem reads couple the Mode binary to the host's directory layout, recreating the implicit coupling that the federated pattern is designed to eliminate.", + }, + { + id = "catalog-not-integrations", + claim = "IaC building blocks (components, providers, taskservs, playbooks, workflows) MUST live under provisioning/catalog/ — they MUST NOT be placed under provisioning/integrations/", + scope = "provisioning/catalog/", + severity = 'Hard, + check = { + tag = 'FileExists, + path = "provisioning/catalog", + present = true, + }, + rationale = "Mixing IaC catalog items with integration Mode artifacts recreates the semantic collision between the old 'extension' catalog and the old 'extension' protocol. The rename to catalog/ is the permanent resolution.", + }, + { + id = "domain-artifacts-in-oci-registry", + claim = "Domain artifacts MUST be published to reg.librecloud.online/domains/: and consumed via oras pull — inline filesystem imports of domain schemas are not permitted after v0.1", + scope = "infra//integrations/, crates/context-assembler/", + severity = 'Hard, + check = { tag = 'Grep, pattern = "import.*domains/", paths = ["provisioning/core/nulib/"], must_be_empty = true }, + rationale = "Filesystem-local domain schemas cannot be discovered, versioned, or integrity-checked by oras. 
Moving to OCI is what enables the federated discovery model.", + }, + { + id = "extension-command-removed", + claim = "prvng extension (and aliases e, ext) MUST NOT exist after the catalog rename — the command is replaced by prvng integration for the federation surface and optionally prvng catalog for catalog browsing", + scope = "provisioning/core/cli/provisioning, provisioning/core/nulib/commands-registry.ncl", + severity = 'Hard, + check = { + tag = 'Grep, + pattern = "\"extension\"", + paths = ["provisioning/core/nulib/commands-registry.ncl"], + must_be_empty = true, + }, + rationale = "A prvng extension command surviving the rename would perpetuate the vocabulary collision and confuse operators about whether 'extension' refers to the catalog or the protocol.", + }, + ], + + ontology_check = { + decision_string = "Federated Integration Mode pattern via OCI Domain artifacts: Modes declare domains_used, context assembled by Rust crate from Cabling (SOPS + component + literal + env resolvers), no filesystem coupling, ruta beta OCI CLI in prvng integration, ruta B Mode schema embedded in provisioning schemas, catalog rename provisioning/extensions/ to provisioning/catalog/", + invariants_at_risk = ["config-driven-always", "type-safety-nickel"], + verdict = 'Safe, + }, + + related_adrs = ["adr-040-lian-build-lift-out", "adr-041-cloudatasave-lift-out", "adr-039-build-infrastructure-ephemeral", "adr-033-cluster-component-extension-pattern"], +} diff --git a/adrs/adr-constraints.ncl b/adrs/adr-constraints.ncl new file mode 100644 index 0000000..b0dd4e6 --- /dev/null +++ b/adrs/adr-constraints.ncl @@ -0,0 +1,51 @@ +let _adr_id_format = std.contract.custom ( + fun label => + fun value => + if std.string.is_match "^adr-[0-9]{3}$" value then + 'Ok value + else + 'Error { + message = "ADR id must match 'adr-NNN' format (e.g. 
'adr-001'), got: '%{value}'" + } +) in + +let _non_empty_constraints = std.contract.custom ( + fun label => + fun value => + if std.array.length value == 0 then + 'Error { + message = "constraints must not be empty — an ADR with no constraints is passive documentation, not an active constraint" + } + else + 'Ok value +) in + +let _non_empty_negative = std.contract.custom ( + fun label => + fun value => + if std.array.length value.negative == 0 then + 'Error { + message = "consequences.negative must not be empty — an ADR with no negative consequences is incomplete" + } + else + 'Ok value +) in + +let _requires_justification = std.contract.custom ( + fun label => + fun value => + if value.ontology_check.verdict == 'RequiresJustification + && !(std.record.has_field "invariant_justification" value) then + 'Error { + message = "ADR '%{value.id}': ontology_check.verdict = 'RequiresJustification but invariant_justification field is missing" + } + else + 'Ok value +) in + +{ + AdrIdFormat = _adr_id_format, + NonEmptyConstraints = _non_empty_constraints, + NonEmptyNegativeConsequences = _non_empty_negative, + RequiresJustificationWhenRisky = _requires_justification, +} diff --git a/adrs/adr-defaults.ncl b/adrs/adr-defaults.ncl new file mode 100644 index 0000000..e144056 --- /dev/null +++ b/adrs/adr-defaults.ncl @@ -0,0 +1,13 @@ +let s = import "adr-schema.ncl" in +let c = import "adr-constraints.ncl" in + +{ + make_adr = fun data => + let result | c.RequiresJustificationWhenRisky = s.Adr & data in + result, + make_constraint = fun data => s.Constraint & data, + + Adr = s.Adr, + Constraint = s.Constraint, + OntologyCheck = s.OntologyCheck, +} diff --git a/adrs/adr-schema.ncl b/adrs/adr-schema.ncl new file mode 100644 index 0000000..4fef4db --- /dev/null +++ b/adrs/adr-schema.ncl @@ -0,0 +1,97 @@ +let c = import "adr-constraints.ncl" in + +let status_type = [| 'Proposed, 'Accepted, 'Superseded, 'Deprecated |] in +let severity_type = [| 'Hard, 'Soft |] in 
+let verdict_type = [| 'Safe, 'RequiresJustification |] in + +let rationale_entry_type = { + claim | String, + detail | String, +} in + +let alternative_type = { + option | String, + why_rejected | String, +} in + +# Tag discriminant for typed constraint checks. +let check_tag_type = [| + 'Cargo, + 'Grep, + 'NuCmd, + 'ApiCall, + 'FileExists, +|] in + +# Typed constraint check: a tagged record, JSON-serializable. +# 'Cargo -> crate : String, forbidden_deps : Array String +# 'Grep -> pattern : String, paths : Array String, must_be_empty : Bool +# 'NuCmd -> cmd : String, expect_exit : Number +# 'ApiCall -> endpoint : String, json_path : String, expected : Dyn +# 'FileExists-> path : String, present : Bool +let constraint_check_type = { + tag | check_tag_type, + .. +} in + +let constraint_type = { + id | String, + claim | String, + scope | String, + severity | severity_type, + # Transition period: one of check or check_hint must be present. + # check_hint is deprecated — migrate existing ADRs to typed check variants. 
+ check_hint | String | optional, + check | constraint_check_type | optional, + rationale | String, +} in + +let ontology_check_type = { + decision_string | String, + invariants_at_risk | Array String, + verdict | verdict_type, +} in + +let invariant_justification_type = { + invariant | String, + claim | String, + mitigation | String, +} in + +let consequences_type = { + positive | Array String, + negative | Array String, +} in + +let adr_type = { + id | String | c.AdrIdFormat, + title | String, + status | status_type, + date | String, + + context | String, + decision | String, + rationale | Array rationale_entry_type, + consequences | consequences_type, + alternatives_considered | Array alternative_type, + + constraints | Array constraint_type | c.NonEmptyConstraints, + ontology_check | ontology_check_type, + + related_adrs | Array String | default = [], + supersedes | String | optional, + superseded_by | String | optional, + invariant_justification | invariant_justification_type | optional, +} in + +{ + AdrStatus = status_type, + Severity = severity_type, + Verdict = verdict_type, + Constraint = constraint_type, + RationaleEntry = rationale_entry_type, + Alternative = alternative_type, + OntologyCheck = ontology_check_type, + InvariantJustification = invariant_justification_type, + Adr = adr_type, +} diff --git a/justfile b/justfile index 661a263..a16e543 100644 --- a/justfile +++ b/justfile @@ -11,9 +11,14 @@ import 'justfiles/ci.just' import 'justfiles/platform.just' import 'justfiles/installer.just' import 'justfiles/book.just' +import 'justfiles/docker.just' import 'justfiles/auth.just' import 'justfiles/kms.just' import 'justfiles/orchestrator.just' +import 'justfiles/daemon.just' +import 'justfiles/distro.just' +import 'justfiles/assets.just' +import 'justfiles/crate.just' # ============================================================================ # Provisioning Configuration @@ -70,7 +75,9 @@ parallel := "true" echo " 🚀 release - Release management & 
artifacts (just release-help)" echo " 🔧 dev - Development workflows & testing (just dev-help)" echo " ⚡ platform - Platform services & orchestration (just platform-help)" + echo " 🐳 docker - Container image building (just docker-help)" echo " 📦 installer - Interactive installer & config mgmt (just installer-help)" + echo " 💾 distro - Local installation & distribution (just distro-help)" echo " 📖 book - MDBook documentation system (just book-help)" echo "" echo "🔐 PLUGIN MODULES" @@ -94,30 +101,32 @@ help MODULE="": #!/usr/bin/env bash if [ "{{MODULE}}" = "" ]; then just --justfile {{justfile()}} default - elif [ "{{MODULE}}" = "ci" ]; then - echo "🔧 CI/CD PIPELINES" - echo "===================" - echo "" - echo "Available CI commands:" - echo " just ci - CI/CD pipeline WITHOUT cleanup (fast iteration)" - echo " just ci-clean - CI/CD pipeline WITH cleanup (production)" - echo " just ci-full - Run all CI checks (formatting, linting, tests, audit)" - echo "" - echo "CI Tasks:" - echo " just ci-fmt-check - Check code formatting" - echo " just ci-fmt - Code formatting" - echo " just ci-lint - Run all linting checks" - echo " just ci-test - Run all tests" - echo " just ci-audit - Run security audits" - echo " just ci-docs - Check documentation" - echo "" - echo "ℹ️ NOTE: nu_plugins (in plugins/nushell-plugins/) are excluded from CI checks" - echo " since they are maintained as a separate project with independent CI." 
- else - echo "❌ Unknown module: {{MODULE}}" - echo "" - echo "Available modules: ci, build, package, release, dev, platform, installer, book, auth, kms, orchestrator" - fi + elif [ "{{MODULE}}" = "ci" ]; then + echo "🔧 CI/CD PIPELINES" + echo "===================" + echo "" + echo "Available CI commands:" + echo " just ci - CI/CD pipeline WITHOUT cleanup (fast iteration)" + echo " just ci-clean - CI/CD pipeline WITH cleanup (production)" + echo " just ci-full - Run all CI checks (formatting, linting, tests, audit)" + echo "" + echo "CI Tasks:" + echo " just ci-fmt-check - Check code formatting" + echo " just ci-fmt - Code formatting" + echo " just ci-lint - Run all linting checks" + echo " just ci-test - Run all tests" + echo " just ci-audit - Run security audits" + echo " just ci-docs - Check documentation" + echo "" + echo "ℹ️ NOTE: nu_plugins (in plugins/nushell-plugins/) are excluded from CI checks" + echo " since they are maintained as a separate project with independent CI." + elif [ "{{MODULE}}" = "distro" ]; then + just distro-help + else + echo "❌ Unknown module: {{MODULE}}" + echo "" + echo "Available modules: ci, distro, build, package, release, dev, platform, docker, installer, book, auth, kms, orchestrator" + fi # Show comprehensive provisioning help @help-full: @@ -177,6 +186,16 @@ help MODULE="": echo " • api-gateway - REST API gateway" echo " • platform-status - All platform services status" echo "" + echo "🐳 DOCKER MODULE (docker.just) - DETAILED" + echo " Container image building and management" + echo " • build-images - Build all or specific platform service images" + echo " • image-list - List available services" + echo " • image-validate - Validate built images" + echo " • image-clean - Remove all provisioning images" + echo " • image-info - Show image information" + echo " • docker-status - Show Docker system status" + echo " • build-verify - Build and verify images" + echo "" echo "📦 INSTALLER MODULE (installer.just) - DETAILED" echo " 
Interactive installer and configuration management" echo " • installer-build - Build installer binary" diff --git a/justfiles/assets.just b/justfiles/assets.just new file mode 100644 index 0000000..6cdaa6d --- /dev/null +++ b/justfiles/assets.just @@ -0,0 +1,36 @@ +# API catalog export +# +# Generates per-service api-catalog-*.json from #[onto_api] registered routes. +# Run after any handler annotation is added or changed. +# Commit alongside the annotation changes — they are paired artifacts. + +PLATFORM_MANIFEST := "platform/Cargo.toml" + +# Export #[onto_api] routes for all platform services +[doc("Export #[onto_api] routes for all platform services to api-catalog-*.json")] +export-api-catalog: export-api-catalog-orchestrator export-api-catalog-control-center export-api-catalog-extension-registry export-api-catalog-ai-service + @echo "all platform API catalogs exported" + +# Export orchestrator routes +[doc("Export orchestrator #[onto_api] routes to api-catalog-orchestrator.json")] +export-api-catalog-orchestrator: + cargo run --manifest-path {{PLATFORM_MANIFEST}} -p orchestrator --no-default-features --features "core,audit,compliance,platform,ssh,workflow,testing,http-api" -- --dump-api-catalog > api-catalog-orchestrator.json + @echo "orchestrator: $(cat api-catalog-orchestrator.json | jq length) routes" + +# Export control-center routes +[doc("Export control-center #[onto_api] routes to api-catalog-control-center.json")] +export-api-catalog-control-center: + cargo run --manifest-path {{PLATFORM_MANIFEST}} -p control-center --no-default-features --features "core,kms,audit,mfa,compliance,experimental" -- --dump-api-catalog > api-catalog-control-center.json + @echo "control-center: $(cat api-catalog-control-center.json | jq length) routes" + +# Export extension-registry routes +[doc("Export extension-registry #[onto_api] routes to api-catalog-extension-registry.json")] +export-api-catalog-extension-registry: + cargo run --manifest-path {{PLATFORM_MANIFEST}} -p 
extension-registry --no-default-features -- --dump-api-catalog > api-catalog-extension-registry.json + @echo "extension-registry: $(cat api-catalog-extension-registry.json | jq length) routes" + +# Export ai-service routes +[doc("Export ai-service #[onto_api] routes to api-catalog-ai-service.json")] +export-api-catalog-ai-service: + cargo run --manifest-path {{PLATFORM_MANIFEST}} -p ai-service --no-default-features -- --dump-api-catalog > api-catalog-ai-service.json + @echo "ai-service: $(cat api-catalog-ai-service.json | jq length) routes" diff --git a/justfiles/build.just b/justfiles/build.just index c14730a..a1b7f5c 100644 --- a/justfiles/build.just +++ b/justfiles/build.just @@ -114,3 +114,102 @@ --check-config \ --verbose={{verbose}} echo "✅ Build system health check completed" + +# Build platform Rust binaries in release and install to $HOME/bin +build-platform-install: + #!/usr/bin/env bash + set -euo pipefail + PLATFORM_DIR="{{provisioning_root}}/platform" + BIN_DIR="${HOME}/bin" + BINS="provisioning-orchestrator provisioning-control-center provisioning-mcp-server provisioning-extension-registry provisioning-vault-service" + echo "=== platform: cargo build --release ===" + cargo build --release --manifest-path "${PLATFORM_DIR}/Cargo.toml" \ + --bin provisioning-orchestrator \ + --bin provisioning-control-center \ + --bin provisioning-mcp-server \ + --bin provisioning-extension-registry \ + --bin provisioning-vault-service + mkdir -p "${BIN_DIR}" + echo "=== installing to ${BIN_DIR} ===" + for bin in ${BINS}; do + src="${PLATFORM_DIR}/target/release/${bin}" + if [ -f "${src}" ]; then + install -m 0755 "${src}" "${BIN_DIR}/${bin}" + echo " installed: ${bin}" + else + echo " WARN: ${bin} not in release output, skipped" + fi + done + echo "=== done — ${BIN_DIR} ===" + +# Build a single platform crate in release mode, optionally install + restart. +# +# Accepts short name (e.g. `ncl-sync`) or full binary name (`provisioning-ncl-sync`). 
+# Maps short names to package + binary using the `provisioning-` convention. +# +# After build, prompts to install to $HOME/.local/bin and pkill any running instance. +# +# Usage: +# just build-release ncl-sync +# just build-release orchestrator +# just build-release provisioning-vault-service +build-release TARGET: + #!/usr/bin/env bash + set -euo pipefail + PLATFORM_DIR="{{provisioning_root}}/platform" + TARGET="{{TARGET}}" + INSTALL_DIR="${HOME}/.local/bin" + + # Resolve short name → (package, binary) + # Convention: binary is always `provisioning-` except for the host CLI itself. + if [[ "$TARGET" == provisioning-* ]]; then + SHORT="${TARGET#provisioning-}" + BIN="$TARGET" + else + SHORT="$TARGET" + BIN="provisioning-${TARGET}" + fi + + # Most crate names match the short form (ncl-sync, orchestrator, ...). + # Exceptions can be handled here if they appear. + PACKAGE="$SHORT" + + echo "🔨 Building $BIN (package: $PACKAGE) in release mode..." + cargo build --release \ + --manifest-path "$PLATFORM_DIR/Cargo.toml" \ + --package "$PACKAGE" + + SRC="$PLATFORM_DIR/target/release/$BIN" + if [ ! -f "$SRC" ]; then + echo "❌ Built binary not found: $SRC" + echo " Check that package '$PACKAGE' defines [[bin]] name = \"$BIN\"" + exit 1 + fi + echo "✅ Built: $SRC ($(du -h "$SRC" | cut -f1))" + + # Interactive install prompt (EOF / non-interactive stdin counts as "no" instead of aborting under set -e) + read -p "Install to $INSTALL_DIR/$BIN and restart? [y/N] " -n 1 -r REPLY || REPLY="" + echo + if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then + echo "↷ Skipped install. Binary available at: $SRC" + exit 0 + fi + + mkdir -p "$INSTALL_DIR" + + # Stop running instance before replacing the binary (pattern anchored to the executable name so `just build-release $BIN` itself is never matched) + if pgrep -f "(^|/)${BIN}( |\$)" > /dev/null 2>&1; then + echo "⏹ Stopping running $BIN..." 
+ pkill -f "(^|/)${BIN}( |\$)" || true + sleep 1 + # Verify it's gone (send SIGKILL if still running) + if pgrep -f "(^|/)${BIN}( |\$)" > /dev/null 2>&1; then + echo " still running — SIGKILL" + pkill -9 -f "(^|/)${BIN}( |\$)" || true + sleep 1 + fi + fi + + install -m 0755 "$SRC" "$INSTALL_DIR/$BIN" + echo "✅ Installed: $INSTALL_DIR/$BIN" + echo " Run: $BIN --help to verify" diff --git a/justfiles/crate.just b/justfiles/crate.just new file mode 100644 index 0000000..0e8e673 --- /dev/null +++ b/justfiles/crate.just @@ -0,0 +1,475 @@ +# Platform crate — build, install, asset deployment +# =================================================== +# Generic recipes that work for any service binary in the Rust workspace. +# Accepts short aliases and auto-resolves to full package + binary names. +# +# Short alias → cargo package / output binary +# daemon → provisioning-daemon / provisioning-daemon +# orch → orchestrator / provisioning-orchestrator +# cc → control-center / provisioning-control-center +# vault → vault-service / provisioning-vault-service +# ai → ai-service / provisioning-ai-service +# mcp → provisioning-mcp / provisioning-mcp-server +# ncl-sync → ncl-sync / provisioning-ncl-sync +# tool → provisioning-tool / provisioning-tool +# rag → platform-rag / provisioning-rag +# cli → prvng-cli / prvng-cli +# +# Standalone crates (excluded from workspace — build via crate-level Cargo.toml): +# nu-daemon → nu-daemon / provisioning-nu-daemon [standalone] +# +# _resolve returns 3 fields: PKG BIN STANDALONE +# STANDALONE="standalone" → cargo build uses crates//Cargo.toml directly +# STANDALONE="" → cargo build uses workspace Cargo.toml -p +# +# Usage: +# just crate-build # interactive picker +# just crate-build daemon # short alias +# just crate-install daemon # install binary only +# just crate-assets daemon # copy templates/assets +# just crate-deploy daemon # build → install → assets +# just crate-deploy nu-daemon # standalone build (excluded from workspace) + +alias cdy := 
crate-deploy + +_pf_root := 
parent_directory(source_file()) / ".." / "platform" +_pf_bin_dir := env_var_or_default("HOME", "/tmp") / ".local" / "bin" +_pf_data := env_var_or_default("HOME", "/tmp") / ".local" / "share" / "provisioning" + +# ─── 1. Build ────────────────────────────────────────────────────────────────── + +# Build a platform crate in release mode. +[doc("Build a platform crate release binary. Omit TARGET for interactive picker.")] +crate-build target="": + #!/usr/bin/env bash + set -euo pipefail + PLATFORM="{{_pf_root}}" + + _pick_target() { + local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon) + echo "Available targets:" >&2 + local i=1 + for c in "${choices[@]}"; do + printf ' %2d) %s\n' "$i" "$c" >&2 + i=$((i+1)) + done + local n + read -rp "Select [1-${#choices[@]}]: " n + printf '%s\n' "${choices[$((n-1))]}" + } + + _resolve() { + case "$1" in + daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;; + orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;; + cc|control-center|control) echo "control-center provisioning-control-center" ;; + vault|vault-service) echo "vault-service provisioning-vault-service" ;; + ai|ai-service) echo "ai-service provisioning-ai-service" ;; + mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;; + ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;; + tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;; + rag|platform-rag) echo "platform-rag provisioning-rag" ;; + cli|prvng-cli) echo "prvng-cli prvng-cli" ;; + nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;; + *) echo "error: unknown target '$1'" >&2; return 1 ;; + esac + } + + INPUT="{{target}}" + [[ -z "$INPUT" ]] && INPUT="$(_pick_target)" + [[ -z "$INPUT" ]] && { echo "error: no target selected" >&2; exit 1; } + + read -r PKG BIN STANDALONE <<< "$(_resolve "$INPUT")" + + if [[ "$STANDALONE" == 
"standalone" ]]; then + echo "=== build: cargo build --release (standalone: crates/${PKG}) ===" + cargo build --release \ + --manifest-path "${PLATFORM}/crates/${PKG}/Cargo.toml" + BUILT="${PLATFORM}/crates/${PKG}/target/release/${BIN}" + else + echo "=== build: cargo build --release -p ${PKG} ===" + cargo build --release \ + --manifest-path "${PLATFORM}/Cargo.toml" \ + -p "${PKG}" + BUILT="${PLATFORM}/target/release/${BIN}" + fi + + SIZE=$(du -sh "$BUILT" 2>/dev/null | cut -f1 || echo "?") + echo " ok: ${BUILT} (${SIZE})" + +# ─── 2. Install ──────────────────────────────────────────────────────────────── + +# Install the release binary to ~/.local/share/provisioning/bin/. +# The release binary must already exist; run crate-build first. +[doc("Install a platform crate binary locally. Run crate-build first.")] +crate-install target="": + #!/usr/bin/env bash + set -euo pipefail + PLATFORM="{{_pf_root}}" + BIN_DIR="{{_pf_bin_dir}}" + + _pick_target() { + local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon) + echo "Available targets:" >&2 + local i=1 + for c in "${choices[@]}"; do + printf ' %2d) %s\n' "$i" "$c" >&2 + i=$((i+1)) + done + local n + read -rp "Select [1-${#choices[@]}]: " n + printf '%s\n' "${choices[$((n-1))]}" + } + + _resolve() { + case "$1" in + daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;; + orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;; + cc|control-center|control) echo "control-center provisioning-control-center" ;; + vault|vault-service) echo "vault-service provisioning-vault-service" ;; + ai|ai-service) echo "ai-service provisioning-ai-service" ;; + mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;; + ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;; + tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;; + rag|platform-rag) echo "platform-rag 
provisioning-rag" ;; + cli|prvng-cli) echo "prvng-cli prvng-cli" ;; + nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;; + *) echo "error: unknown target '$1'" >&2; return 1 ;; + esac + } + + INPUT="{{target}}" + [[ -z "$INPUT" ]] && INPUT="$(_pick_target)" + [[ -z "$INPUT" ]] && { echo "error: no target selected" >&2; exit 1; } + + read -r PKG BIN STANDALONE <<< "$(_resolve "$INPUT")" + + if [[ "$STANDALONE" == "standalone" ]]; then + BUILT="${PLATFORM}/crates/${PKG}/target/release/${BIN}" + else + BUILT="${PLATFORM}/target/release/${BIN}" + fi + if [[ ! -f "$BUILT" ]]; then + echo "error: release binary not found: ${BUILT}" >&2 + echo " run: just crate-build ${INPUT}" >&2 + exit 1 + fi + + echo "=== install: ${BIN_DIR}/${BIN} ===" + mkdir -p "${BIN_DIR}" + install -m 0755 "${BUILT}" "${BIN_DIR}/${BIN}" + echo " ok: ${BIN_DIR}/${BIN}" + + # Emit PATH hint if the bin dir is not in PATH + if ! echo "$PATH" | tr ':' '\n' | grep -qxF "${BIN_DIR}"; then + echo " hint: add to PATH: export PATH=\"\$HOME/.local/share/provisioning/bin:\$PATH\"" + fi + +# ─── 3. Assets ───────────────────────────────────────────────────────────────── + +# Copy templates and static assets to their install destinations. +# +# Destination is read from the crate's NCL service config (nickel export + jq). +# If the configured path is inside the source tree the step is skipped — the +# daemon already reads directly from there and no copy is needed. +# Falls back to the default convention when NCL is unavailable or the field +# is not set. 
+[doc("Install templates and assets for a platform crate.")]
+crate-assets target="":
+    #!/usr/bin/env bash
+    set -euo pipefail
+    PLATFORM="{{_pf_root}}"
+    DATA="{{_pf_data}}"
+
+    # Print a numbered menu on stderr and echo the chosen target on stdout.
+    # Returns 1 on empty, non-numeric or out-of-range input; without this check
+    # such input evaluates to index -1 and silently selects the last entry.
+    _pick_target() {
+        local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon)
+        echo "Available targets:" >&2
+        local i=1 c
+        for c in "${choices[@]}"; do
+            printf ' %2d) %s\n' "$i" "$c" >&2
+            i=$((i+1))
+        done
+        local n=""
+        read -rp "Select [1-${#choices[@]}]: " n || true
+        # 10# forces base 10 so that input such as "08" is not parsed as octal.
+        if [[ ! "$n" =~ ^[0-9]+$ ]] || (( 10#$n < 1 || 10#$n > ${#choices[@]} )); then
+            echo "error: invalid selection '${n}'" >&2
+            return 1
+        fi
+        printf '%s\n' "${choices[$((10#$n - 1))]}"
+    }
+
+    # Map an alias or package name to "PKG BIN [standalone]".
+    _resolve() {
+        case "$1" in
+            daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;;
+            orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;;
+            cc|control-center|control) echo "control-center provisioning-control-center" ;;
+            vault|vault-service) echo "vault-service provisioning-vault-service" ;;
+            ai|ai-service) echo "ai-service provisioning-ai-service" ;;
+            mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;;
+            ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;;
+            tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;;
+            rag|platform-rag) echo "platform-rag provisioning-rag" ;;
+            cli|prvng-cli) echo "prvng-cli prvng-cli" ;;
+            nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;;
+            *) echo "error: unknown target '$1'" >&2; return 1 ;;
+        esac
+    }
+
+    # Read a string field from the crate's NCL service config via sed.
+    # Avoids nickel schema resolution issues — works on raw config text.
+    # Prints nothing and returns 0 when the config file or the field is absent,
+    # so callers can fall back to a default. A bare `return` here would pass on
+    # the failed test's status and abort `VAR="$(_ncl_field …)"` under `set -e`.
+    _ncl_field() {
+        local service="$1" field="$2"
+        local ncl_cfg
+        if [[ "$(uname -s)" == "Darwin" ]]; then
+            ncl_cfg="${HOME}/Library/Application Support/provisioning/platform/config/${service}.ncl"
+        else
+            ncl_cfg="${HOME}/.config/provisioning/platform/config/${service}.ncl"
+        fi
+        [[ -f "$ncl_cfg" ]] || return 0
+        sed -n "s/^[[:space:]]*${field}[[:space:]]*=[[:space:]]*\"\([^\"]*\)\".*/\1/p" \
+            "$ncl_cfg" 2>/dev/null | head -1 || true
+    }
+
+    # Returns true if PATH is inside (or equal to) the source crate directory.
+    # Returns false when the crate directory cannot be resolved: an empty
+    # prefix would otherwise match every path and skip every sync.
+    _is_source_tree() {
+        local path="$1" crate_src="$2"
+        local canon_dir canon_path canon_src
+        canon_src="$(cd "$crate_src" 2>/dev/null && pwd)" || return 1
+        # The destination's parent may not exist yet; fall back to the raw path.
+        canon_dir="$(cd "$(dirname "$path")" 2>/dev/null && pwd)" || canon_dir="$(dirname "$path")"
+        canon_path="${canon_dir%/}/$(basename "$path")"
+        # Compare on a path-component boundary so that a sibling such as
+        # "<crate>-old" is not mistaken for a subdirectory of the crate.
+        [[ "$canon_path" == "$canon_src" || "$canon_path" == "$canon_src"/* ]]
+    }
+
+    # Sync one asset group: SRC → DST, with source-tree detection.
+    # $1=label $2=src_dir $3=dst_dir $4=crate_src_root
+    _sync_group() {
+        local label="$1" src="$2" dst="$3" crate_src="$4"
+
+        if [[ ! -d "$src" ]]; then
+            echo " skip [${label}]: source not found (${src})"
+            return
+        fi
+
+        if _is_source_tree "$dst" "$crate_src"; then
+            echo " skip [${label}]: destination is the source tree"
+            echo " → ${dst}"
+            echo " daemon reads directly from there — no copy needed"
+            return
+        fi
+
+        echo " sync [${label}]: ${src}"
+        echo " → ${dst}"
+        mkdir -p "${dst}"
+        # --delete mirrors the source: files missing from SRC are removed from DST.
+        rsync -a --delete "${src}/" "${dst}/"
+        local count
+        count=$(find "${dst}" -type f | wc -l | tr -d ' ')
+        echo " ok: ${count} files"
+    }
+
+    INPUT="{{target}}"
+    if [[ -z "$INPUT" ]]; then
+        INPUT="$(_pick_target)"
+    fi
+    [[ -n "$INPUT" ]] || { echo "error: no target selected" >&2; exit 1; }
+
+    # Resolve through a plain assignment so an unknown target aborts the recipe.
+    RESOLVED="$(_resolve "$INPUT")"
+    read -r PKG _BIN _STANDALONE <<< "$RESOLVED"
+
+    echo "=== crate-assets: ${PKG} ==="
+
+    case "$PKG" in
+        provisioning-daemon)
+            CRATE_SRC="${PLATFORM}/crates/provisioning-daemon"
+
+            # ui/templates — destination from NCL ui_templates_dir or convention
+            UI_DST="$(_ncl_field "provisioning-daemon" "ui_templates_dir")"
+            UI_DST="${UI_DST:-${DATA}/provisioning-daemon/ui/templates}"
+            _sync_group "ui_templates" \
+                "${CRATE_SRC}/ui/templates" \
+                "${UI_DST}" \
+                "${CRATE_SRC}"
+
+            # ontology_templates — destination from NCL ontology_templates or convention
+            ONT_DST="$(_ncl_field "provisioning-daemon" "ontology_templates")"
+            ONT_DST="${ONT_DST:-${DATA}/provisioning-daemon/ontology-templates}"
+            _sync_group "ontology_templates" \
+                "${CRATE_SRC}/ontology_templates" \
+                "${ONT_DST}" \
+                "${CRATE_SRC}"
+            ;;
+        *)
+            echo " ${PKG}: no file-system assets (uses embedded resources)"
+            ;;
+    esac
+
+    echo "=== done ==="
+
+# ─── 4. Deploy (chain) ─────────────────────────────────────────────────────────
+
+# Build → install → assets in sequence. Stops on first failure.
+# Usage: just crate-deploy [target]
+[doc("Full deploy: build release → install binary → install assets.")]
+crate-deploy target="":
+    #!/usr/bin/env bash
+    set -euo pipefail
+    PLATFORM="{{_pf_root}}"
+    BIN_DIR="{{_pf_bin_dir}}"
+    DATA="{{_pf_data}}"
+
+    # Print a numbered menu on stderr and echo the chosen target on stdout.
+    # Returns 1 on empty, non-numeric or out-of-range input; without this check
+    # such input evaluates to index -1 and silently selects the last entry.
+    _pick_target() {
+        local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon)
+        echo "Available targets:" >&2
+        local i=1 c
+        for c in "${choices[@]}"; do
+            printf ' %2d) %s\n' "$i" "$c" >&2
+            i=$((i+1))
+        done
+        local n=""
+        read -rp "Select [1-${#choices[@]}]: " n || true
+        # 10# forces base 10 so that input such as "08" is not parsed as octal.
+        if [[ ! "$n" =~ ^[0-9]+$ ]] || (( 10#$n < 1 || 10#$n > ${#choices[@]} )); then
+            echo "error: invalid selection '${n}'" >&2
+            return 1
+        fi
+        printf '%s\n' "${choices[$((10#$n - 1))]}"
+    }
+
+    # Map an alias or package name to "PKG BIN [standalone]".
+    _resolve() {
+        case "$1" in
+            daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;;
+            orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;;
+            cc|control-center|control) echo "control-center provisioning-control-center" ;;
+            vault|vault-service) echo "vault-service provisioning-vault-service" ;;
+            ai|ai-service) echo "ai-service provisioning-ai-service" ;;
+            mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;;
+            ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;;
+            tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;;
+            rag|platform-rag) echo "platform-rag provisioning-rag" ;;
+            cli|prvng-cli) echo "prvng-cli prvng-cli" ;;
+            nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;;
+            *) echo "error: unknown target '$1'" >&2; return 1 ;;
+        esac
+    }
+
+    # Read a string field from the crate's NCL service config via sed.
+    # Prints nothing and returns 0 when the file or field is absent, so the
+    # `${VAR:-default}` fallback below is reached instead of a `set -e` abort.
+    _ncl_field() {
+        local service="$1" field="$2"
+        local ncl_cfg
+        if [[ "$(uname -s)" == "Darwin" ]]; then
+            ncl_cfg="${HOME}/Library/Application Support/provisioning/platform/config/${service}.ncl"
+        else
+            ncl_cfg="${HOME}/.config/provisioning/platform/config/${service}.ncl"
+        fi
+        [[ -f "$ncl_cfg" ]] || return 0
+        sed -n "s/^[[:space:]]*${field}[[:space:]]*=[[:space:]]*\"\([^\"]*\)\".*/\1/p" \
+            "$ncl_cfg" 2>/dev/null | head -1 || true
+    }
+
+    # Returns true if PATH is inside (or equal to) the source crate directory.
+    # Returns false when the crate directory cannot be resolved.
+    _is_source_tree() {
+        local path="$1" crate_src="$2"
+        local canon_dir canon_path canon_src
+        canon_src="$(cd "$crate_src" 2>/dev/null && pwd)" || return 1
+        # The destination's parent may not exist yet; fall back to the raw path.
+        canon_dir="$(cd "$(dirname "$path")" 2>/dev/null && pwd)" || canon_dir="$(dirname "$path")"
+        canon_path="${canon_dir%/}/$(basename "$path")"
+        # Compare on a path-component boundary, not a raw string prefix.
+        [[ "$canon_path" == "$canon_src" || "$canon_path" == "$canon_src"/* ]]
+    }
+
+    # Sync one asset group: SRC → DST, with source-tree detection.
+    # $1=label $2=src_dir $3=dst_dir $4=crate_src_root
+    _sync_group() {
+        local label="$1" src="$2" dst="$3" crate_src="$4"
+        if [[ ! -d "$src" ]]; then
+            echo " skip [${label}]: source not found (${src})"
+            return
+        fi
+        if _is_source_tree "$dst" "$crate_src"; then
+            echo " skip [${label}]: destination is source tree"
+            echo " → ${dst}"
+            echo " service reads templates directly from there"
+            return
+        fi
+        echo " sync [${label}]: ${src}"
+        echo " → ${dst}"
+        mkdir -p "${dst}"
+        # --delete mirrors the source: files missing from SRC are removed from DST.
+        rsync -a --delete "${src}/" "${dst}/"
+        local count
+        count=$(find "${dst}" -type f | wc -l | tr -d ' ')
+        echo " ok: ${count} files copied"
+    }
+
+    # Prints "running" when a matching process exists, nothing otherwise.
+    # $1 is the binary name. Match binary path: [/]<bin> followed by
+    # end-of-string OR a space (arguments). The bare $ anchor fails when the
+    # process has CLI arguments like --config.
+    _svc_status() {
+        pgrep -f "[/]${1}( |$)" >/dev/null 2>&1 && echo "running" || true
+    }
+
+    INPUT="{{target}}"
+    if [[ -z "$INPUT" ]]; then
+        INPUT="$(_pick_target)"
+    fi
+    [[ -n "$INPUT" ]] || { echo "error: no target selected" >&2; exit 1; }
+
+    # Resolve through a plain assignment so an unknown target aborts the recipe.
+    RESOLVED="$(_resolve "$INPUT")"
+    read -r PKG BIN STANDALONE <<< "$RESOLVED"
+
+    echo "=== crate-deploy: ${PKG} ==="
+    echo ""
+
+    # ── pre: stop service if running ───────────────────────────────────────────
+    # Process detection uses the binary name (BIN): the package name differs
+    # from the executable for most services (orchestrator vs
+    # provisioning-orchestrator). The platform CLI is still given PKG.
+    if [[ "$(_svc_status "$BIN")" == "running" ]]; then
+        echo " service ${PKG} is running — stopping before overwrite"
+        provisioning platform stop "$PKG" >/dev/null 2>&1 || true
+        sleep 2
+        if [[ "$(_svc_status "$BIN")" == "running" ]]; then
+            echo " warning: process still running"
+        else
+            echo " stopped: ok"
+        fi
+        echo ""
+    fi
+
+    # ── step 1: build ──────────────────────────────────────────────────────────
+    echo "--- [1/3] build ---"
+    if [[ "$STANDALONE" == "standalone" ]]; then
+        cargo build --release \
+            --manifest-path "${PLATFORM}/crates/${PKG}/Cargo.toml"
+        BUILT="${PLATFORM}/crates/${PKG}/target/release/${BIN}"
+    else
+        cargo build --release \
+            --manifest-path "${PLATFORM}/Cargo.toml" \
+            -p "${PKG}"
+        BUILT="${PLATFORM}/target/release/${BIN}"
+    fi
+    SIZE=$(du -sh "$BUILT" 2>/dev/null | cut -f1 || echo "?")
+    echo " ok: ${BUILT} (${SIZE})"
+    echo ""
+
+    # ── step 2: install binary ─────────────────────────────────────────────────
+    echo "--- [2/3] install ---"
+    mkdir -p "${BIN_DIR}"
+    DEST="${BIN_DIR}/${BIN}"
+    # Record whether a previous copy exists; the install command is the same
+    # either way and only the report line differs.
+    EXISTED=false
+    [[ -f "$DEST" ]] && EXISTED=true
+    install -m 0755 "${BUILT}" "${DEST}"
+    if [[ "$EXISTED" == true ]]; then
+        echo " overwritten: ${DEST}"
+    else
+        echo " installed: ${DEST}"
+    fi
+    if ! echo "$PATH" | tr ':' '\n' | grep -qxF "${BIN_DIR}"; then
+        echo " hint: add to PATH: export PATH=\"${BIN_DIR}:\$PATH\""
+    fi
+    echo ""
+
+    # ── step 3: assets ─────────────────────────────────────────────────────────
+    echo "--- [3/3] assets ---"
+    case "$PKG" in
+        provisioning-daemon)
+            CRATE_SRC="${PLATFORM}/crates/provisioning-daemon"
+            UI_DST="$(_ncl_field "provisioning-daemon" "ui_templates_dir")"
+            UI_DST="${UI_DST:-${DATA}/provisioning-daemon/ui/templates}"
+            _sync_group "ui_templates" "${CRATE_SRC}/ui/templates" "${UI_DST}" "${CRATE_SRC}"
+            ONT_DST="$(_ncl_field "provisioning-daemon" "ontology_templates")"
+            ONT_DST="${ONT_DST:-${DATA}/provisioning-daemon/ontology-templates}"
+            _sync_group "ontology_templates" "${CRATE_SRC}/ontology_templates" "${ONT_DST}" "${CRATE_SRC}"
+            ;;
+        *)
+            echo " ${PKG}: no file-system assets"
+            ;;
+    esac
+    echo ""
+
+    # ── post: always start (restart if was running, start if was stopped) ────────
+    echo "--- start ---"
+    provisioning platform start "$PKG" >/dev/null 2>&1 || true
+    sleep 2
+    LOG="${HOME}/.provisioning/logs/${PKG}.log"
+    if [[ "$(_svc_status "$BIN")" == "running" ]]; then
+        echo " started: ok"
+        echo " logs: ${LOG}"
+    else
+        echo " warning: process not running after start"
+        echo " check: ${LOG}"
+    fi
+    echo ""
+
+    echo "=== done: ${BIN} deployed ==="
diff --git a/justfiles/daemon.just b/justfiles/daemon.just
new file mode 100644
index 0000000..0d783ad
--- /dev/null
+++ b/justfiles/daemon.just
@@ -0,0 +1,192 @@
+# provisioning-daemon — build, install, and lifecycle recipes
+# ==============================================================
+# Targets the HTTP+NATS daemon (crates/provisioning-daemon).
+# Port: 9014 (PROVISIONING_DAEMON_BIND env overrides).
+#
+# nu-daemon — standalone build (excluded from platform workspace due to rustls conflict)
+# Port: 9095. Binary: provisioning-nu-daemon → $HOME/.local/bin/
+
+_prov_root := parent_directory(source_file()) / ".."
+# (_prov_root above resolves to the provisioning/ directory)
+_platform := _prov_root / "platform"
+_scripts := _platform / "scripts"
+_bin := "provisioning-daemon"
+_install := env_var_or_default("HOME", "/usr/local") / ".local" / "share" / "provisioning" / "bin"
+
+_nu_daemon_crate := _platform / "crates" / "nu-daemon"
+_nu_daemon_bin := "provisioning-nu-daemon"
+_nu_daemon_install := env_var_or_default("HOME", "/usr/local") / ".local" / "bin"
+_log := "/tmp/provisioning-daemon.log"
+_pid := "/tmp/provisioning-daemon.pid"
+
+# ── Build ──────────────────────────────────────────────────────────────────────
+
+# Build provisioning-daemon in release mode
+daemon-build:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    echo "=== build: cargo build --release --bin {{_bin}} ==="
+    cargo build --release \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        --bin "{{_bin}}"
+    echo " binary: {{_platform}}/target/release/{{_bin}}"
+
+# Build in dev mode (faster, larger binary)
+daemon-build-dev:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    cargo build \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        --bin "{{_bin}}"
+    echo " binary: {{_platform}}/target/debug/{{_bin}}"
+
+# Run the library unit tests for provisioning-daemon (`--lib` only)
+daemon-test:
+    cargo test \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        -p provisioning-daemon \
+        --lib
+
+# Run clippy on provisioning-daemon (warnings are errors)
+daemon-lint:
+    cargo clippy \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        -p provisioning-daemon \
+        -- -D warnings
+
+# ── Install ────────────────────────────────────────────────────────────────────
+
+# Install provisioning-daemon to ~/.local/share/provisioning/bin/
+daemon-install: daemon-build
+    #!/usr/bin/env bash
+    set -euo pipefail
+    mkdir -p "{{_install}}"
+    install -m 0755 "{{_platform}}/target/release/{{_bin}}" "{{_install}}/{{_bin}}"
+    echo " installed: {{_install}}/{{_bin}}"
+
+# Build + install + restart in one step
+daemon-deploy: daemon-install
+    #!/usr/bin/env bash
+    set -euo pipefail
+    echo "=== deploy: stopping existing daemon ==="
+    if [ -f "{{_pid}}" ]; then
+        PID=$(tr -d '[:space:]' < "{{_pid}}")
+        if [ -n "$PID" ] && kill -0 "$PID" 2>/dev/null; then
+            # The process may exit between the check and the kill; that is not
+            # an error for a deploy, so do not let `set -e` abort here.
+            kill "$PID" 2>/dev/null || true
+            sleep 1
+        fi
+        rm -f "{{_pid}}"
+    fi
+    echo "=== deploy: starting daemon ==="
+    nohup "{{_install}}/{{_bin}}" >> "{{_log}}" 2>&1 &
+    echo $! > "{{_pid}}"
+    echo " PID: $(cat {{_pid}}) log: {{_log}}"
+    # wait for health (10 attempts, 1 s apart)
+    HEALTHY=false
+    for i in $(seq 1 10); do
+        if curl -sf http://127.0.0.1:9014/health > /dev/null 2>&1; then
+            echo " health: ok"
+            HEALTHY=true
+            break
+        fi
+        sleep 1
+    done
+    # Report a daemon that never became healthy instead of finishing silently.
+    if [ "$HEALTHY" != true ]; then
+        echo " warning: health check did not pass after 10 attempts — check {{_log}}" >&2
+    fi
+
+# ── Lifecycle ──────────────────────────────────────────────────────────────────
+
+# Start daemon in background (uses installed binary or release build)
+daemon-start:
+    nu "{{_scripts}}/start-provisioning-daemon.nu" start
+
+# Stop running daemon
+daemon-stop:
+    nu "{{_scripts}}/start-provisioning-daemon.nu" stop
+
+# Show daemon status and health
+daemon-status:
+    nu "{{_scripts}}/start-provisioning-daemon.nu" status
+
+# Restart daemon
+daemon-restart:
+    nu "{{_scripts}}/start-provisioning-daemon.nu" restart
+
+# Tail daemon logs
+daemon-logs:
+    nu "{{_scripts}}/start-provisioning-daemon.nu" logs
+
+# ── Dev helpers ────────────────────────────────────────────────────────────────
+
+# Run daemon in foreground with debug logging (dev mode)
+daemon-run:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    RUST_LOG=debug \
+    cargo run \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        --bin "{{_bin}}"
+
+# Run daemon with custom bind address
+daemon-run-bind bind="0.0.0.0:9014":
+    #!/usr/bin/env bash
+    set -euo pipefail
+    RUST_LOG=info \
+    cargo run \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        --bin "{{_bin}}" \
+        -- --bind "{{bind}}"
+
+# Query health endpoint
+daemon-health:
+    curl -sf http://127.0.0.1:9014/health | jq .
+
+# List registered tools via daemon API
+daemon-tools:
+    curl -sf http://127.0.0.1:9014/api/v1/tools | jq '.tools[] | {name, category}'
+
+# List ontology templates (requires JWT or solo mode)
+daemon-ontology-list:
+    curl -sf http://127.0.0.1:9014/api/v1/ontology/templates | jq .
+
+# Open admin UI in browser (`open` on macOS, `xdg-open` elsewhere)
+daemon-ui:
+    if [ "$(uname -s)" = "Darwin" ]; then open http://127.0.0.1:9014/admin/; else xdg-open http://127.0.0.1:9014/admin/; fi
+
+# ── Watch (live-reload on config change) ──────────────────────────────────────
+
+# Run daemon with config watcher on default paths.
+# PROVISIONING_WATCH_PATHS, when set, takes precedence over the `paths` argument.
+daemon-watch paths="":
+    #!/usr/bin/env bash
+    set -euo pipefail
+    WATCH_PATHS="${PROVISIONING_WATCH_PATHS:-{{paths}}}"
+    if [ -n "$WATCH_PATHS" ]; then
+        WATCH_FLAG="--watch-paths $WATCH_PATHS"
+    else
+        WATCH_FLAG=""
+    fi
+    # $WATCH_FLAG is expanded unquoted so the flag and its value(s) become
+    # separate arguments — NOTE(review): paths containing spaces will be split.
+    RUST_LOG=info \
+    cargo run \
+        --manifest-path "{{_platform}}/Cargo.toml" \
+        --bin "{{_bin}}" \
+        -- $WATCH_FLAG
+
+# ── nu-daemon (standalone — excluded from platform workspace) ─────────────────
+
+# Build nu-daemon in release mode (must build standalone, not via workspace)
+nu-daemon-build:
+    #!/usr/bin/env bash
+    set -euo pipefail
+    echo "=== build: cargo build --release (standalone) ==="
+    cargo build --release \
+        --manifest-path "{{_nu_daemon_crate}}/Cargo.toml"
+    echo " binary: {{_nu_daemon_crate}}/target/release/{{_nu_daemon_bin}}"
+
+# Install nu-daemon to $HOME/.local/bin/
+nu-daemon-install: nu-daemon-build
+    #!/usr/bin/env bash
+    set -euo pipefail
+    mkdir -p "{{_nu_daemon_install}}"
+    install -m 0755 \
+        "{{_nu_daemon_crate}}/target/release/{{_nu_daemon_bin}}" \
+        "{{_nu_daemon_install}}/{{_nu_daemon_bin}}"
+    echo " installed: {{_nu_daemon_install}}/{{_nu_daemon_bin}}"
+
+# Build + install in one step
+nu-daemon-deploy: nu-daemon-install
+    @echo " deployed: {{_nu_daemon_bin}} → {{_nu_daemon_install}}"
diff --git a/justfiles/distro.just b/justfiles/distro.just
new file mode 100644
index 0000000..fc48613
--- /dev/null
+++ b/justfiles/distro.just
@@ -0,0 +1,302 @@
+# Distro Module - Local Distribution Installation
+# ==================================================
+# Build, package, and install provisioning binaries locally
+# Integrates with package.just for distribution management
+
+# ============================================================================
+# Distro Module Configuration
+# ============================================================================
+
+distro_version := version
+distro_build_dir := provisioning_root / "platform" / "target" / "release"
+distro_install_dir := env_var_or_default("PROVISIONING_INSTALL_DIR", home_dir() / ".local" / "bin")
+# Config directories (platform-specific, macOS vs Linux)
+distro_config_dir := if os() == "macos" {
+    home_dir() / "Library" / "Application Support" / "provisioning"
+} else {
+    home_dir() / ".config" / "provisioning"
+}
+
+# Core provisioning binaries from platform workspace
+provisioning_binaries := "provisioning-ai-service provisioning-extension-registry provisioning-vault-service provisioning-rag provisioning-daemon provisioning-control-center provisioning-orchestrator provisioning-mcp-server provisioning-detector"
+
+# ============================================================================
+# Help and Information
+# ============================================================================
+
+# Show distro module help
+@distro-help:
+    echo "📦 DISTRIBUTION & LOCAL INSTALLATION MODULE"
+    echo "==========================================="
+    echo ""
+    echo "🏗️ BUILD FOR INSTALLATION"
+    echo " distro-build-release Build all platform binaries (release)"
+    echo " distro-build-debug Build all platform binaries (debug)"
+    echo ""
+    echo "💾 LOCAL INSTALLATION"
+    echo " distro-install Install binaries to ~/.local/bin"
+    echo " distro-install-system Install to /usr/local/bin (requires sudo)"
+    echo " distro-uninstall Remove installed binaries"
+    echo ""
+    echo "🔍 VERIFICATION & MANAGEMENT"
+    echo " distro-verify Verify installation integrity"
+    echo " distro-list List installed binaries"
+    echo " distro-info Show installation information"
+    echo ""
+    echo "📦 DISTRIBUTION PACKAGES"
+    echo " distro-package Create distribution package (uses package module)"
+    echo " distro-checksums Generate SHA256 checksums"
+    echo ""
+    echo "🧹 CLEANUP"
+    echo " distro-clean Clean build artifacts"
+    echo ""
+    echo "INSTALLATION DIRECTORY: {{distro_install_dir}}"
+    echo "CONFIG DIRECTORY: {{distro_config_dir}}"
+    echo ""
+    echo "EXAMPLES:"
+    echo " just distro-build-release && just distro-install"
+    echo " PROVISIONING_INSTALL_DIR=/usr/local/bin just distro-install"
+    echo " just distro-list"
+
+# ============================================================================
+# Build Recipes
+# ============================================================================
+
+# Build all platform binaries (release mode)
+@distro-build-release:
+    echo "🔨 Building provisioning binaries (release)..."
+    cd {{provisioning_root}}/platform && {{cargo}} build -r --workspace
+    echo "✅ Binaries built: {{distro_build_dir}}"
+
+# Build all platform binaries (debug mode)
+@distro-build-debug:
+    echo "🔨 Building provisioning binaries (debug)..."
+    cd {{provisioning_root}}/platform && {{cargo}} build --workspace
+    echo "✅ Debug binaries built"
+
+# ============================================================================
+# Installation Recipes
+# ============================================================================
+
+# Install binaries locally (default: ~/.local/bin).
+# The target directory comes from PROVISIONING_INSTALL_DIR (environment variable).
+# Missing or uncopyable binaries are counted and reported; the loop continues.
+distro-install:
+    #!/usr/bin/env bash
+
+    INSTALL_DIR="{{distro_install_dir}}"
+    BUILD_DIR="{{distro_build_dir}}"
+
+    echo "📦 Installing provisioning binaries to: $INSTALL_DIR"
+    mkdir -p "$INSTALL_DIR" || { echo "✗ Failed to create install directory"; exit 1; }
+
+    echo ""
+    echo "📂 Installing binaries..."
+    binaries_installed=0
+    binaries_failed=0
+
+    for binary in {{provisioning_binaries}}; do
+        SRC="$BUILD_DIR/$binary"
+        if [ -f "$SRC" ]; then
+            if cp "$SRC" "$INSTALL_DIR/$binary" 2>/dev/null && chmod +x "$INSTALL_DIR/$binary" 2>/dev/null; then
+                echo " ✓ $binary"
+                binaries_installed=$((binaries_installed + 1))
+            else
+                echo " ✗ $binary (copy failed)"
+                binaries_failed=$((binaries_failed + 1))
+            fi
+        else
+            echo " ✗ $binary (not found at $SRC)"
+            binaries_failed=$((binaries_failed + 1))
+        fi
+    done
+
+    echo ""
+    echo "📊 Installation Summary"
+    echo "====================="
+    echo "Install directory: $INSTALL_DIR"
+    echo "Binaries installed: $binaries_installed"
+    [ "$binaries_failed" -gt 0 ] && echo "Binaries failed: $binaries_failed"
+
+    # Check if install dir is in PATH.
+    # Compare whole PATH entries: a substring/regex grep reports a false
+    # positive when the directory is only a prefix of another entry.
+    if ! echo "$PATH" | tr ':' '\n' | grep -qxF "$INSTALL_DIR"; then
+        echo ""
+        echo "⚠️ $INSTALL_DIR is not in your PATH"
+        echo "Add to ~/.bashrc or ~/.zshrc:"
+        echo " export PATH=\"\$PATH:$INSTALL_DIR\""
+    fi
+
+    echo ""
+    # Do not claim success when some binaries could not be installed.
+    if [ "$binaries_failed" -gt 0 ]; then
+        echo "⚠️ Installation finished with $binaries_failed failure(s)"
+    else
+        echo "✅ Installation complete!"
+    fi
+    echo ""
+    echo "Verify installation:"
+    echo " $INSTALL_DIR/provisioning-ai-service --version"
+
+# Install to system directory (requires sudo)
+distro-install-system:
+    #!/usr/bin/env bash
+
+    INSTALL_DIR="/usr/local/bin"
+    BUILD_DIR="{{distro_build_dir}}"
+
+    echo "🔐 Installing provisioning binaries to: $INSTALL_DIR (requires sudo)"
+
+    failed=0
+    for binary in {{provisioning_binaries}}; do
+        SRC="$BUILD_DIR/$binary"
+        if [ -f "$SRC" ]; then
+            echo " Installing: $binary"
+            if sudo cp "$SRC" "$INSTALL_DIR/$binary" && sudo chmod +x "$INSTALL_DIR/$binary"; then
+                echo " ✓ $binary"
+            else
+                echo " ✗ $binary (installation failed)"
+                failed=$((failed + 1))
+            fi
+        else
+            echo " ✗ $binary (not found)"
+            failed=$((failed + 1))
+        fi
+    done
+
+    # Do not claim success when some binaries could not be installed.
+    if [ "$failed" -gt 0 ]; then
+        echo "⚠️ System installation finished with $failed failure(s)"
+    else
+        echo "✅ System installation complete!"
+    fi
+
+# Remove every binary listed in provisioning_binaries from the install directory
+distro-uninstall:
+    #!/usr/bin/env bash
+    set -e
+
+    target_dir="{{distro_install_dir}}"
+
+    echo "🗑️ Uninstalling provisioning binaries from: $target_dir"
+
+    for name in {{provisioning_binaries}}; do
+        # Binaries that are not installed are skipped without a message.
+        [[ -f "$target_dir/$name" ]] || continue
+        rm "$target_dir/$name"
+        echo " ✓ Removed $name"
+    done
+
+    echo "✅ Uninstallation complete!"
+
+# ============================================================================
+# Verification & Information
+# ============================================================================
+
+# Check that each expected binary is present and executable; exit 1 otherwise
+distro-verify:
+    #!/usr/bin/env bash
+
+    target_dir="{{distro_install_dir}}"
+
+    echo "🔍 Verifying installation in: $target_dir"
+    echo ""
+
+    ok_count=0
+    bad_count=0
+
+    for name in {{provisioning_binaries}}; do
+        candidate="$target_dir/$name"
+        if [[ -f "$candidate" && -x "$candidate" ]]; then
+            echo " ✓ $name"
+            ok_count=$((ok_count + 1))
+        else
+            echo " ✗ $name (not found or not executable)"
+            bad_count=$((bad_count + 1))
+        fi
+    done
+
+    echo ""
+    echo "📊 Summary: $ok_count found, $bad_count missing"
+
+    if (( bad_count > 0 )); then
+        echo "⚠️ Some binaries are missing!"
+        exit 1
+    fi
+
+    echo "✅ Installation verified!"
+ +# List installed binaries +distro-list: + #!/usr/bin/env bash + + INSTALL_DIR="{{distro_install_dir}}" + + echo "📋 Installed provisioning binaries in: $INSTALL_DIR" + echo "" + + if ls "$INSTALL_DIR"/provisioning-* 2>/dev/null | head -1 > /dev/null; then + ls -lh "$INSTALL_DIR"/provisioning-* + else + echo "No provisioning binaries found" + fi + +# Show installation information +distro-info: + #!/usr/bin/env bash + + INSTALL_DIR="{{distro_install_dir}}" + + echo "📦 Provisioning Installation Information" + echo "========================================" + echo "" + echo "Installation Directory: $INSTALL_DIR" + echo "" + echo "Environment Variables:" + echo " PROVISIONING_INSTALL_DIR={{distro_install_dir}}" + echo "" + echo "Status:" + if [ -d "$INSTALL_DIR" ]; then + echo " ✓ Install dir exists" + count=$(ls "$INSTALL_DIR"/provisioning-* 2>/dev/null | wc -l) + echo " Binaries installed: $count" + else + echo " ✗ Install dir not found" + fi + + echo "" + echo "In PATH:" + if echo "$PATH" | grep -q "$INSTALL_DIR"; then + echo " ✓ Install directory is in PATH" + else + echo " ✗ Install directory is NOT in PATH" + fi + +# ============================================================================ +# Packaging Recipes +# ============================================================================ + +# Create distribution package (delegates to package module) +@distro-package: + echo "📦 Creating distribution package..." + echo " (delegates to package module)" + just package-all + +# Generate checksums for distribution +@distro-checksums: + #!/usr/bin/env bash + set -e + + PACKAGES_DIR="{{packages_dir}}" + + if [ ! -d "$PACKAGES_DIR" ]; then + echo "✗ Packages directory not found: $PACKAGES_DIR" + exit 1 + fi + + echo "🔐 Generating SHA256 checksums..." 
+ cd "$PACKAGES_DIR" + + count=0 + for file in *.tar.gz *.zip 2>/dev/null; do + [ -f "$file" ] || continue + echo " Checksumming: $file" + sha256sum "$file" > "${file}.sha256" + ((count++)) + done + + echo "✅ Generated $count checksums" + +# ============================================================================ +# Cleanup +# ============================================================================ + +# Clean build artifacts +@distro-clean: + echo "🧹 Cleaning distro artifacts..." + cd {{provisioning_root}}/platform && {{cargo}} clean + echo "✅ Clean complete" diff --git a/justfiles/docker.just b/justfiles/docker.just new file mode 100644 index 0000000..24bb184 --- /dev/null +++ b/justfiles/docker.just @@ -0,0 +1,259 @@ +# Docker Module - Container image building and management (Nickel-native) +# ========================================================================= +# Source of truth: Nickel templates in schemas/platform/templates/docker/ +# Dockerfiles are GENERATED on-demand, NOT tracked in git + +# Show detailed docker help +@docker-help: + echo "🐳 DOCKER MODULE HELP (Nickel-Native Build System)" + echo "==================================================" + echo "" + echo "This module uses Nickel templates + cargo-chef for optimized Docker builds:" + echo "• Dockerfiles generated on-demand from Nickel schemas" + echo "• 4-stage builds: PLANNER → CACHER → BUILDER → RUNTIME" + echo "• 60-80% build time reduction via dependency caching" + echo "• BuildKit cache modes: local, registry, inline" + echo "• Mode-specific tuning: solo, cicd, enterprise" + echo "" + echo "SERVICES:" + echo " orchestrator - Workflow engine and task queue" + echo " control-center - Policy and RBAC management" + echo " mcp-server - AI/LLM integration" + echo " extension-registry - Plugin management" + echo " provisioning-daemon - System daemon" + echo " ai-service - AI service integration" + echo " rag - Retrieval augmented generation" + echo " vault-service - Secret management" + 
echo "" + echo "RECIPES:" + echo " docker-gen [MODE] [SERVICES...] Generate Dockerfiles from Nickel templates" + echo " docker-gen-compose Generate docker-compose.build.yml" + echo " docker-build [MODE] [REGISTRY] [SERVICES...] Build service(s) with auto-generation" + echo " docker-build-all Build all services with BuildKit cache" + echo " docker-clean-gen Remove generated Dockerfiles" + echo " image-list Show available services" + echo " image-validate Validate all built images" + echo " image-clean Remove all provisioning images" + echo "" + echo "EXAMPLES:" + echo " just docker-gen solo orchestrator # Generate Dockerfile for orchestrator" + echo " just docker-gen-compose # Generate docker-compose.build.yml" + echo " just docker-build solo localhost:5000 orchestrator # Build orchestrator (auto-generates Dockerfile)" + echo " just docker-build-all # Build all services with parallel BuildKit" + echo " just docker-clean-gen # Remove all generated Dockerfiles" + echo "" + echo "NOTES:" + echo " • Dockerfiles are NOT tracked in git (source = Nickel templates)" + echo " • docker-build auto-generates Dockerfiles before building" + echo " • Change base images in schemas/platform/docker-build.ncl" + +# Generate Dockerfiles from Nickel templates (on-demand) +docker-gen MODE='solo' +SERVICES='all': + #!/usr/bin/env bash + cd {{provisioning_root}} + if [ "{{SERVICES}}" == "all" ]; then + echo "🏗️ Generating Dockerfiles for all services (mode: {{MODE}})..." + {{nu}} scripts/docker-generate-builds.nu all --mode {{MODE}} + else + echo "🏗️ Generating Dockerfiles for: {{SERVICES}} (mode: {{MODE}})..." + for service in {{SERVICES}}; do + {{nu}} scripts/docker-generate-builds.nu "$service" --mode {{MODE}} + done + fi + +# Generate docker-compose.build.yml from Nickel template +docker-gen-compose REGISTRY='localhost:5000': + #!/usr/bin/env bash + cd {{provisioning_root}} + echo "🏗️ Generating docker-compose.build.yml (registry: {{REGISTRY}})..."
+ {{nu}} scripts/docker-generate-compose.nu --registry {{REGISTRY}} + +# Build Docker images with auto-generation and BuildKit cache +docker-build MODE='solo' REGISTRY='localhost:5000' +SERVICES='': + #!/usr/bin/env bash + cd {{provisioning_root}} + if [ -z "{{SERVICES}}" ]; then + echo "🐳 Building all services (mode: {{MODE}})..." + {{nu}} scripts/docker-build.nu --all --mode {{MODE}} --registry {{REGISTRY}} + else + echo "🐳 Building service(s): {{SERVICES}} (mode: {{MODE}})..." + {{nu}} scripts/docker-build.nu {{SERVICES}} --mode {{MODE}} --registry {{REGISTRY}} + fi + +# Build all services with BuildKit parallel builds +docker-build-all MODE='solo' REGISTRY='localhost:5000': + #!/usr/bin/env bash + cd {{provisioning_root}} + echo "🐳 Building all platform services (mode: {{MODE}})..." + {{nu}} scripts/docker-build.nu --all --mode {{MODE}} --registry {{REGISTRY}} + +# Remove all generated Dockerfiles (not tracked in git) +docker-clean-gen: + #!/usr/bin/env bash + cd {{provisioning_root}} + echo "🧹 Removing generated Dockerfiles..." 
+ find platform/crates -name "Dockerfile" -type f -delete + rm -f docker-compose.build.yml + echo "✅ Generated files cleaned" + +# Legacy: Build platform service Docker images (deprecated - use docker-build) +build-images +SERVICES='': + #!/usr/bin/env bash + echo "⚠️ DEPRECATED: Use 'just docker-build' instead" + cd {{provisioning_root}} + if [ -z "{{SERVICES}}" ]; then + {{nu}} scripts/docker-build.nu --all + else + {{nu}} scripts/docker-build.nu {{SERVICES}} + fi + +# List available service images +@image-list: + echo "📋 Available Platform Services" + echo "==============================" + echo "" + echo "Core Platform Services:" + echo " • orchestrator (Rust) - Workflow engine and task queue" + echo " • control-center (Rust) - Policy and RBAC management" + echo " • mcp-server (Rust) - AI/LLM integration" + echo " • extension-registry (Rust) - Plugin management" + echo " • rag (Rust) - Retrieval augmented generation" + echo "" + echo "Image Tags:" + echo " • provisioning-orchestrator:latest" + echo " • provisioning-control-center:latest" + echo " • provisioning-mcp-server:latest" + echo " • provisioning-extension-registry:latest" + echo " • provisioning-rag:latest" + echo "" + echo "Usage: just build-images [service...]" + echo " just build-images orchestrator" + echo " just build-images orchestrator control-center" + +# Validate all built Docker images +@image-validate: + #!/usr/bin/env bash + echo "🔍 Validating platform Docker images..." 
+ echo "" + + SERVICES=("provisioning-orchestrator:latest" \ + "provisioning-control-center:latest" \ + "provisioning-mcp-server:latest" \ + "provisioning-extension-registry:latest" \ + "provisioning-rag:latest") + + VALID=0 + INVALID=0 + + for image in "${SERVICES[@]}"; do + if docker image inspect "$image" >/dev/null 2>&1; then + echo "✅ $image" + ((VALID++)) + else + echo "❌ $image (not found)" + ((INVALID++)) + fi + done + + echo "" + echo "📊 Validation Summary" + echo "====================" + echo "Valid: $VALID" + echo "Invalid: $INVALID" + + if [ $INVALID -gt 0 ]; then + echo "" + echo "Run 'just build-images' to build missing images" + exit 1 + fi + +# Remove all provisioning platform images +@image-clean: + #!/usr/bin/env bash + echo "🧹 Removing provisioning platform Docker images..." + + IMAGES=("provisioning-orchestrator:latest" \ + "provisioning-control-center:latest" \ + "provisioning-mcp-server:latest" \ + "provisioning-extension-registry:latest" \ + "provisioning-rag:latest") + + for image in "${IMAGES[@]}"; do + if docker image inspect "$image" >/dev/null 2>&1; then + echo "Removing $image..." 
+ docker image rm "$image" || echo "⚠️ Failed to remove $image" + fi + done + + echo "✅ Image cleanup completed" + +# Show Docker image information +@image-info: + #!/usr/bin/env bash + echo "🐳 Provisioning Platform Docker Images" + echo "======================================" + echo "" + + IMAGES=("provisioning-orchestrator:latest" \ + "provisioning-control-center:latest" \ + "provisioning-mcp-server:latest" \ + "provisioning-extension-registry:latest" \ + "provisioning-rag:latest") + + for image in "${IMAGES[@]}"; do + if docker image inspect "$image" >/dev/null 2>&1; then + echo "📦 $image" + docker image inspect "$image" | jq -r '.[0] | " Created: \(.Created)\n Size: \(.Size) bytes\n OS: \(.Os)/\(.Architecture)"' + echo "" + fi + done + + if [ $(docker images --filter "reference=provisioning-*" --quiet | wc -l) -eq 0 ]; then + echo "ℹ️ No provisioning images found. Run 'just build-images' to build them." + fi + +# Build specific service image +[no-cd] +build-service SERVICE: + #!/usr/bin/env bash + cd {{provisioning_root}} + echo "🐳 Building service image: {{SERVICE}}" + {{nu}} scripts/build-images.nu {{SERVICE}} + +# Show Docker system status +@docker-status: + #!/usr/bin/env bash + echo "🐳 Docker System Status" + echo "======================" + echo "" + + if ! 
command -v docker &> /dev/null; then + echo "❌ Docker is not installed" + exit 1 + fi + + echo "Docker version:" + docker --version + echo "" + + echo "Docker daemon status:" + if docker ps -q >/dev/null 2>&1; then + echo "✅ Docker daemon is running" + else + echo "❌ Docker daemon is not running or not accessible" + exit 1 + fi + + echo "" + echo "Provisioning platform images:" + docker images --filter "reference=provisioning-*" | tail -n +2 | grep . || echo "None found" # grep . fails when no image rows remain, so the fallback can fire + + echo "" + echo "Docker disk usage:" + docker system df + +# Build and verify Docker images +@build-verify: + just build-images + just image-validate + echo "✅ Docker images built and verified" diff --git a/justfiles/nushell-automation b/justfiles/nushell-automation new file mode 120000 index 0000000..98269d2 --- /dev/null +++ b/justfiles/nushell-automation @@ -0,0 +1 @@ +/Users/Akasha/Tools/dev-system/languages/nushell/just-modules/automation \ No newline at end of file diff --git a/justfiles/nushell-script b/justfiles/nushell-script new file mode 120000 index 0000000..7060ea9 --- /dev/null +++ b/justfiles/nushell-script @@ -0,0 +1 @@ +/Users/Akasha/Tools/dev-system/languages/nushell/just-modules/script \ No newline at end of file diff --git a/justfiles/orchestrator.just b/justfiles/orchestrator.just index 33b2a02..6bbb4d4 100644 --- a/justfiles/orchestrator.just +++ b/justfiles/orchestrator.just @@ -2,6 +2,36 @@ # ============================================== # Task orchestration, workflow management, and batch operations +# ============================================================================ +# Build → Install → Restart +# ============================================================================ + +# Build orchestrator release, install to ~/.local/bin, restart service +orch-deploy: + #!/usr/bin/env bash + set -euo pipefail + # source_file() is the path to this justfile — provisioning/justfiles/orchestrator.just + # so provisioning root is two levels up from source_file() +
PROV_ROOT="$(dirname "$(dirname "{{source_file()}}")")" + PLATFORM_DIR="${PROV_ROOT}/platform" + BIN_DIR="${HOME}/.local/bin" + BIN="provisioning-orchestrator" + + echo "=== build: cargo build --release --bin ${BIN} ===" + cargo build --release \ + --manifest-path "${PLATFORM_DIR}/Cargo.toml" \ + --bin "${BIN}" + + echo "=== install: ${BIN_DIR}/${BIN} ===" + mkdir -p "${BIN_DIR}" + install -m 0755 "${PLATFORM_DIR}/target/release/${BIN}" "${BIN_DIR}/${BIN}" + echo " installed: ${BIN}" + + echo "=== restart orchestrator ===" + provisioning platform restart orchestrator + + echo "=== done ===" + # ============================================================================ # Orchestrator Status and Health # ============================================================================ diff --git a/justfiles/rust-axum b/justfiles/rust-axum new file mode 120000 index 0000000..92dcf27 --- /dev/null +++ b/justfiles/rust-axum @@ -0,0 +1 @@ +/Users/Akasha/Tools/dev-system/languages/rust/just-modules/axum \ No newline at end of file diff --git a/justfiles/rust-cargo b/justfiles/rust-cargo new file mode 120000 index 0000000..3d031bf --- /dev/null +++ b/justfiles/rust-cargo @@ -0,0 +1 @@ +/Users/Akasha/Tools/dev-system/languages/rust/just-modules/cargo \ No newline at end of file diff --git a/justfiles/rust-leptos b/justfiles/rust-leptos new file mode 120000 index 0000000..29df629 --- /dev/null +++ b/justfiles/rust-leptos @@ -0,0 +1 @@ +/Users/Akasha/Tools/dev-system/languages/rust/just-modules/leptos \ No newline at end of file diff --git a/justfiles/test.just b/justfiles/test.just new file mode 100644 index 0000000..0117b02 --- /dev/null +++ b/justfiles/test.just @@ -0,0 +1,5 @@ +# Test recipes + +[doc("Show test help")] +help: + @just --list diff --git a/justfiles/workflow.just b/justfiles/workflow.just new file mode 100644 index 0000000..b32d237 --- /dev/null +++ b/justfiles/workflow.just @@ -0,0 +1,12 @@ +# Generated by ore workflow generate +# Source: 
.ontology/workflow.ncl + +# layer: ci-standard +ci-standard: + cargo clippy --all-targets --all-features -- -D warnings + cargo nextest run --all-features --workspace --profile ci --cargo-profile ci + cargo deny check licenses advisories + cargo doc --no-deps --workspace --profile ci -q + nickel typecheck + nu --ide-check 100 + cargo build --release --workspace diff --git a/schemas/catalog/context.ncl b/schemas/catalog/context.ncl new file mode 100644 index 0000000..d550991 --- /dev/null +++ b/schemas/catalog/context.ncl @@ -0,0 +1,125 @@ +# Component Context Schema +# +# Declares the ontological layer for a component as deployed in a specific infra. +# Used in infra component configs (e.g. infra/libre-wuji/components/zot.ncl). +# +# Three-layer identity: +# what — what the component is (from the component manifest; override if needed) +# how — how it is deployed here (derived from the settings declared alongside) +# why — why it exists in this infra (intent declared by the operator) +# +# Plus governance dimensions that every component deployment must declare: +# priority, security, supervision, updates. +# +# Usage in a component contract: +# let Context = import "schemas/catalog/context.ncl" in +# { MyComponent = { context | Context.ComponentContext | optional, ... } } +# +# Usage in an infra config: +# context = { +# how = "K8s Deployment with Hetzner CSI PVC, private Cilium gateway", +# why = "Central OCI store for lian-build pipeline and cosign distribution", +# priority = 'critical, +# security = { posture = 'private }, +# updates = { policy = 'pinned, holds = ["cosign-verify"] }, +# } + +{ + # ── Priority ──────────────────────────────────────────────────────────────── + # Operational priority of this component in this infra. + # Drives incident response, update scheduling, and removal decisions. 
+ + ComponentPriority = [| + 'critical, # infra fails without it — immediate intervention required + 'essential, # core services degraded without it + 'important, # significant feature loss without it + 'standard, # normal services, managed lifecycle + 'optional, # convenience feature; removable without service impact + |], + + # ── Security posture ──────────────────────────────────────────────────────── + + SecurityPosture = [| + 'public, # intentionally internet-facing; FIP or public gateway + 'private, # private network only — VPN or private gateway required + 'internal, # cluster-internal only; no gateway exposure + 'airgapped, # no external network access whatsoever + |], + + # ── Update policy ─────────────────────────────────────────────────────────── + + UpdatePolicy = [| + 'pinned, # manual only — every version bump requires explicit approval + 'semver-patch, # auto-apply patch releases only (x.y.Z) + 'semver-minor, # auto-apply minor and patch releases (x.Y.z) + 'rolling-latest, # always track latest — only acceptable for 'optional priority + |], + + # ── Component Context ─────────────────────────────────────────────────────── + + ComponentContext = { + + # Ontological triad — the three questions any operator must be able to answer + # about any running component. + + what | String | doc "What this component is. Defaults to manifest.description; override when the deployment role narrows the description." | optional, + + how | String + | doc "How it is deployed in this infra — mode, storage, gateway, key integrations. Derived from the settings declared alongside this context block.", + + why | String + | doc "Why it exists in this infra — the purpose, the gap it fills, the service it enables.", + + # Governance dimensions + + priority | ComponentPriority + | doc "Operational priority: drives response SLA, update scheduling, and removal policy." 
+ | default = 'standard, + + security | { + posture | SecurityPosture + | doc "Network exposure posture for all endpoints." + | default = 'internal, + + tls | Bool + | doc "TLS required on all exposed endpoints." + | default = true, + + concerns | Array String + | doc "Named security concerns to track — e.g. 'credential-rotation', 'access-policy-audit'." + | default = [], + } | default = {}, + + supervision | { + health_check | Bool + | doc "Active health check configured and expected to pass." + | default = true, + + metrics | Bool + | doc "Prometheus-compatible metrics endpoint exposed." + | default = false, + + alerts | Array String + | doc "Alert conditions configured — e.g. '5xx-rate', 'storage-capacity'." + | default = [], + + sla_target | String + | doc "SLA availability target — e.g. '99.9%'. Informational." + | optional, + } | default = {}, + + updates | { + policy | UpdatePolicy + | doc "Version update policy for this component." + | default = 'pinned, + + window | String + | doc "Maintenance window — e.g. 'weekends UTC+0'. Informational for scheduling." + | optional, + + holds | Array String + | doc "Gates required before update proceeds — e.g. 'cosign-verify', 'smoke-test', 'backup-verified'." + | default = [], + } | default = {}, + }, +} diff --git a/schemas/catalog/manifest.ncl b/schemas/catalog/manifest.ncl new file mode 100644 index 0000000..88c6435 --- /dev/null +++ b/schemas/catalog/manifest.ncl @@ -0,0 +1,201 @@ +# infra-catalog Component Manifest Schema +# +# Every component distributed through any infra-catalog source — regardless of +# which registry or peer hosts it — must satisfy this contract. +# +# This schema is the sole coupling point between the catalog ecosystem and any +# tool that consumes it (Provisioning, lian-build, Vapora, or others). Tools +# validate manifests against it; they do not need to know anything else about +# the component source. +# +# Identity comes from content hash (OCI digest), not from registry location. 
+# The `source` field is informational only. +# +# Deployment modes +# A component may support 'cluster (K8s) and/or 'systemd (host-level). +# Each mode has its own installer script declared in mode_installers. +# Most components are cluster-only; systemd mode is for components that run +# outside K8s (e.g. lian_build daemon, hccm on bare metal). +# +# Extensions +# When a component is installed, the manager discovers and loads any +# extensions it declares: CLI commands (Nu module), Justfile recipes, +# or helper scripts. Extensions must be present at the declared paths. +# +# Usage: +# let Catalog = import "schemas/catalog/manifest.ncl" in +# { ... } | Catalog.Manifest + +{ + # ── Kind enum ───────────────────────────────────────────────────────────────── + # Each kind binds the component to a specific interface contract. + # Tools that need a capability load the corresponding entry point. + + ComponentKind = [| + 'ComputeProvider, # spawns/destroys ephemeral compute — implements runner.nu interface + 'StorageProvider, # manages persistent volumes — implements storage.nu interface + 'RegistryAdapter, # OCI registry operations — implements registry.nu interface + 'DeploymentComponent, # deploys a service — implements installer.nu interface + 'AgentProvider, # AI/agent capabilities — implements agent.nu interface + 'SchemaLibrary, # reusable Nickel schemas — no runtime entry point + |], + + # ── Tool requirement ────────────────────────────────────────────────────────── + # Declares an external CLI that the component's scripts invoke. + + ToolRequirement = { + name | String, + min_version | String | optional, + check | String | optional + | doc "Shell expression that exits 0 when the tool is available.", + }, + + # ── Catalog dependency ──────────────────────────────────────────────────────── + # Another catalog component required at runtime. 
+ + CatalogDep = { + name | String, + kind | ComponentKind, + version | String, + }, + + # ── Source reference ────────────────────────────────────────────────────────── + # Informational. Content trust is established by OCI digest and signature, + # not by source URL. Multiple sources are mirrors of the same content. + + SourceRef = { + radicle | String | optional + | doc "Radicle project ID, e.g. rad:z6MkhDvY...", + git | String | optional + | doc "Git remote URL (mirror or upstream).", + oci | String | optional + | doc "Canonical OCI reference (without digest — digest is in the artifact manifest).", + }, + + # ── Deployment modes ───────────────────────────────────────────────────────── + # Declares which deployment modes the component supports. + # The manager uses this to validate that the requested mode is available + # before attempting to install. + + DeploymentModes = { + cluster | Bool + | doc "Supports in-cluster K8s deployment via cluster/install.sh." + | default = false, + systemd | Bool + | doc "Supports host-level deployment as a systemd unit via systemd/install.sh." + | default = false, + }, + + # ── Mode installers ─────────────────────────────────────────────────────────── + # Paths relative to the component root after installation. + # Only required when the corresponding mode is true in deployment_modes. + # Convention: cluster → "cluster/install.sh", systemd → "systemd/install.sh". + + ModeInstallers = { + cluster | String + | doc "Installer script for cluster mode. Implements install/uninstall/status/upgrade." + | optional, + systemd | String + | doc "Installer script for systemd mode. Implements install/uninstall/enable/disable." + | optional, + }, + + # ── Extensions ──────────────────────────────────────────────────────────────── + # Capabilities the component adds to the manager when installed. + # The manager loads declared extensions at startup so callers get new + # subcommands and recipes without modifying the manager itself. 
+ # + # Convention: + # cli → extensions/cli/commands.nu (Nu module; exports " " commands) + # just → extensions/just/.just (Justfile module; recipe prefix = name) + # scripts → extensions/scripts/ (helper scripts; no discovery contract) + + Extensions = { + cli | Bool + | doc "Provides extensions/cli/commands.nu — loaded by the manager CLI dispatcher." + | default = false, + just | Bool + | doc "Provides extensions/just/.just — Justfile module for workspace recipes." + | default = false, + scripts | Bool + | doc "Provides extensions/scripts/ — helper scripts for direct invocation." + | default = false, + }, + + # ── Entry points ────────────────────────────────────────────────────────────── + # Runtime interface entry points — paths relative to component root after installation. + # Each ComponentKind mandates its corresponding entry point. + + EntryPoints = { + runner | String | optional + | doc "ComputeProvider: script implementing spawn/destroy/status/describe.", + storage | String | optional + | doc "StorageProvider: script implementing create/delete/attach/detach.", + registry | String | optional + | doc "RegistryAdapter: script implementing push/pull/list/delete.", + installer | String | optional + | doc "DeploymentComponent: fallback installer when mode_installers is absent.", + agent | String | optional + | doc "AgentProvider: script implementing run/cancel/status.", + schema | String | optional + | doc "SchemaLibrary: primary .ncl export path.", + }, + + # ── Capabilities ───────────────────────────────────────────────────────────── + # Declared before loading so callers can query without executing the component. 
+ + Capabilities = { + ephemeral_compute | Bool | default = false, + persistent_compute | Bool | default = false, + snapshot_create | Bool | default = false, + network_management | Bool | default = false, + multi_region | Bool | default = false, + arm64 | Bool | default = false, + amd64 | Bool | default = false, + storage_block | Bool | default = false, + storage_object | Bool | default = false, + storage_file | Bool | default = false, + oci_push | Bool | default = false, + oci_pull | Bool | default = false, + llm | Bool | default = false, + rag | Bool | default = false, + workflow | Bool | default = false, + }, + + # ── Manifest (root contract) ────────────────────────────────────────────────── + + Manifest = { + name | String + | doc "Lowercase, hyphenated identifier. Unique within a kind.", + + version | String + | doc "Semver string: MAJOR.MINOR.PATCH.", + + kind | ComponentKind, + + description | String + | doc "One sentence — catalog entry, search index, and discovery text.", + + requires = { + nu | String | optional | doc "Minimum Nushell version.", + nickel | String | optional | doc "Minimum Nickel version.", + tools | Array ToolRequirement | default = [], + }, + + catalog_deps | Array CatalogDep | default = [], + + deployment_modes | DeploymentModes | default = {}, + + mode_installers | ModeInstallers | default = {}, + + extensions | Extensions | default = {}, + + entry_points | EntryPoints | default = {}, + + capabilities | Capabilities | default = {}, + + source | SourceRef | default = {}, + + authors | Array String | default = [], + }, +} diff --git a/schemas/commands_registry/defaults.ncl b/schemas/commands_registry/defaults.ncl new file mode 100644 index 0000000..9345ab2 --- /dev/null +++ b/schemas/commands_registry/defaults.ncl @@ -0,0 +1,23 @@ +let cmds_reg_schema = import "./schema.ncl" in + +let base_command = { + command | default = "", + aliases | default = [], + requires_daemon | default = false, + requires_services | default = false, + 
uses_cache | default = false, + requires_args | default = false, + help_category | default = "", + description | default = "", + daemon_target | default = 'none, +} in + +{ + # Create a command: define only what you need, rest filled from defaults + # Usage: make_command { command = "help", uses_cache = true, ... } + make_command = fun overrides => + base_command & overrides, + + # Default values (validated) + defaults = base_command | cmds_reg_schema.CommandRecord, +} diff --git a/schemas/commands_registry/schema.ncl b/schemas/commands_registry/schema.ncl new file mode 100644 index 0000000..aaa94ae --- /dev/null +++ b/schemas/commands_registry/schema.ncl @@ -0,0 +1,19 @@ +# Command Registry Schema - Type Contracts +# Defines the structure and validation for all CLI commands + +# Type contract for command records +# Open record type (...) allows partial records during merge, validates on output +{ + CommandRecord = { + command | String, + aliases | Array String, + requires_daemon | Bool, + requires_services | Bool, + uses_cache | Bool, + requires_args | Bool, + help_category | String, + description | String, + daemon_target | std.enum.TagOrString | [| 'none, 'cli, 'orchestrator |], + .. + }, +} diff --git a/schemas/config/dag/main.ncl b/schemas/config/dag/main.ncl new file mode 100644 index 0000000..8241658 --- /dev/null +++ b/schemas/config/dag/main.ncl @@ -0,0 +1,13 @@ +# schemas/config/dag/main.ncl — DAG runtime configuration +# +# Exposes DAG execution defaults as runtime config following the schemas/config/ pattern. +# Consumed by the Nushell loader (lib_provisioning/config/loader/dag.nu) via nickel export. +# Workspace-level dag.ncl can override full blocks (execution, resolution, events). 
+ +let dag = import "../../lib/dag/main.ncl" in + +{ + execution = dag.defaults.composition, + resolution = dag.defaults.resolution, + events = dag.defaults.events, +} diff --git a/schemas/config/defaults/contracts.ncl b/schemas/config/defaults/contracts.ncl index 76cbd93..3700b62 100644 --- a/schemas/config/defaults/contracts.ncl +++ b/schemas/config/defaults/contracts.ncl @@ -3,7 +3,7 @@ # | Pattern: Pure schema definitions using Nickel contracts { - ServerDefaults = { + ServerDefaults = { lock | Bool, time_zone | String, running_wait | Number, @@ -31,5 +31,5 @@ domains_search | String | optional, user_ssh_key_path | String | optional, scale | Dyn | optional, - }, + } } diff --git a/schemas/config/defaults/main.ncl b/schemas/config/defaults/main.ncl index 2020514..05a1941 100644 --- a/schemas/config/defaults/main.ncl +++ b/schemas/config/defaults/main.ncl @@ -2,9 +2,9 @@ # | Migrated from: provisioning/kcl/defaults.k # | Pattern: Hybrid - defaults + makers + direct access (contracts available via import) -let contracts_lib = import "./contracts.ncl" in -let defaults_lib = import "./defaults.ncl" in -let lib = import "../../lib/main.ncl" in +#let contracts_lib = import "schemas/config/defaults/contracts.ncl" in +let defaults_lib = import "schemas/config/defaults/defaults.ncl" in +#let lib = import "../../lib/main.ncl" in { defaults = defaults_lib, diff --git a/schemas/config/settings/defaults.ncl b/schemas/config/settings/defaults.ncl index fcb8b3a..205a6d9 100644 --- a/schemas/config/settings/defaults.ncl +++ b/schemas/config/settings/defaults.ncl @@ -53,7 +53,7 @@ cluster_admin_host = "", cluster_admin_port = 22, servers_wait_started = 27, - cluster_admin_user = "root", + cluster_admin_user = "devadm", clusters_save_path = "/${main_name}/clusters", servers_paths = ["servers"], clusters_paths = ["clusters"], diff --git a/schemas/examples/deployment-with-secrets.ncl b/schemas/examples/deployment-with-secrets.ncl new file mode 100644 index 0000000..cdf019a ---
/dev/null +++ b/schemas/examples/deployment-with-secrets.ncl @@ -0,0 +1,239 @@ +# Example: Complete Deployment Configuration with Nickel + SOPS Integration +# +# This example shows the hybrid pattern: +# 1. Infrastructure config in .ncl (readable, version-controlled) +# 2. Secrets in YAML (encrypted with SOPS) +# 3. Merged at deployment time + +let sops = import "schemas/security/sops/main.ncl" in +let secrets_loader = import "schemas/security/secrets-loader.ncl" in +let config_merger = import "schemas/security/config-merger.ncl" in + +{ + # ============================================ + # STEP 1: Default Configuration + # ============================================ + defaults = { + environment = "dev", + deployment_mode = "solo", + + database = { + type = "postgresql", + host = "localhost", + port = 5432, + name = "myapp", + user = "app_user", + # Password placeholder - will be replaced by secrets + password = "${secret:database.password}", + ssl = false, + pool_size = 10, + }, + + redis = { + host = "localhost", + port = 6379, + # Password placeholder - will be replaced by secrets + password = "${secret:redis.password}", + db = 0, + ttl = 3600, + }, + + api = { + host = "0.0.0.0", + port = 8080, + # API key placeholder - will be replaced by secrets + api_key = "${secret:api.api_key}", + timeout = 30, + max_connections = 100, + }, + + tls = { + enabled = false, + # Certificate placeholders - will be replaced by secrets + certificate = "${secret:tls.certificate}", + private_key = "${secret:tls.private_key}", + }, + }, + + # ============================================ + # STEP 2: Environment-Specific Overrides + # ============================================ + environments = { + # All environments inherit these + all = { + logging = { + level = "info", + format = "json", + }, + }, + + # Development overrides + dev = { + database = { + host = "postgres-dev.local", + ssl = false, + }, + redis = { + host = "redis-dev.local", + }, + api = { + port = 8080, + }, + 
logging = { + level = "debug", + }, + }, + + # Staging overrides + staging = { + database = { + host = "postgres-staging.example.com", + ssl = true, + }, + redis = { + host = "redis-staging.example.com", + }, + api = { + port = 443, + }, + tls = { + enabled = true, + }, + logging = { + level = "info", + }, + }, + + # Production overrides + prod = { + database = { + host = "postgres-prod-cluster.example.com", + port = 5432, + ssl = true, + pool_size = 50, + }, + redis = { + host = "redis-prod-cluster.example.com", + }, + api = { + port = 443, + max_connections = 1000, + }, + tls = { + enabled = true, + }, + logging = { + level = "warn", + }, + }, + }, + + # ============================================ + # STEP 3: Deployment Modes + # ============================================ + deployment_modes = { + solo = { + replicas = 1, + resources = { + cpu = "1", + memory = "512Mi", + }, + }, + + ha = { + replicas = 3, + resources = { + cpu = "2", + memory = "2Gi", + }, + }, + + enterprise = { + replicas = 5, + resources = { + cpu = "4", + memory = "4Gi", + }, + }, + }, + + # ============================================ + # STEP 4: SOPS Configuration + # ============================================ + sops_config = { + dev = (sops.generate_sops_yaml "dev"), + staging = (sops.generate_sops_yaml "staging"), + prod = (sops.generate_sops_yaml "prod"), + }, + + # ============================================ + # STEP 5: Build Final Configuration + # ============================================ + # This function is called at deployment time with (in argument order): + # - environment: "dev" | "staging" | "prod" + # - deployment_mode: "solo" | "ha" | "enterprise" (unknown modes fall back to {}) + # - secrets: loaded from config/secrets/{env}.yaml (SOPS-encrypted) + + build_config = fun environment deployment_mode secrets => + let env_config = (let base_defaults = defaults in let env_overrides = environments in config_merger.by_environment { defaults = base_defaults, environments = env_overrides } environment) in + let mode_config = (if std.record.has_field deployment_mode deployment_modes then std.record.get deployment_mode deployment_modes else {}) in +
let base = config_merger.compose_config defaults env_config {} in + let with_mode = config_merger.compose_config base mode_config {} in + let final = config_merger.compose_config with_mode secrets {} in + + # Merge secrets into placeholders + secrets_loader.merge final secrets, + + # ============================================ + # Export Configuration for Different Scenarios + # ============================================ + + # Development configuration (for local testing) + config_dev = { + environment = "dev", + deployment_mode = "solo", + config = ( + config_merger.compose_config + defaults + (std.record.get "dev" environments) + {} + ), + }, + + # Staging configuration (requires secrets) + config_staging = { + environment = "staging", + deployment_mode = "ha", + config = ( + config_merger.compose_config + defaults + (std.record.get "staging" environments) + {} + ), + }, + + # Production configuration (requires secrets) + config_prod = { + environment = "prod", + deployment_mode = "enterprise", + config = ( + config_merger.compose_config + defaults + (std.record.get "prod" environments) + {} + ), + }, + + # ============================================ + # Validation + # ============================================ + + validate = fun configuration => + let required_paths = [ + "database.host", + "database.user", + "redis.host", + "api.port", + ] in + config_merger.validate_complete configuration required_paths, +} diff --git a/schemas/infrastructure/README.md b/schemas/infrastructure/README.md index 587b4d4..9ebe3bf 100644 --- a/schemas/infrastructure/README.md +++ b/schemas/infrastructure/README.md @@ -77,7 +77,7 @@ Infrastructure Schemas (Docker, Kubernetes, Nginx, etc.)
### Example: Service Port Definition ```bash -# Platform service schema (provisioning/schemas/platform/schemas/orchestrator.ncl) +# Platform service schema (provisioning/schemas/platform/orchestrator.ncl) server = { port | Number, # Define port once } diff --git a/schemas/infrastructure/compute/cluster/contracts.ncl b/schemas/infrastructure/compute/cluster/contracts.ncl index fef9eb3..ae8b429 100644 --- a/schemas/infrastructure/compute/cluster/contracts.ncl +++ b/schemas/infrastructure/compute/cluster/contracts.ncl @@ -1,6 +1,4 @@ -# | Cluster configuration contracts (schema definitions) -# | Migrated from: provisioning/kcl/cluster.k -# | Pattern: Pure schema definitions using Nickel contracts +let scaling = import "../scaling.ncl" in { Cluster = { @@ -16,6 +14,6 @@ admin_port | String | optional, admin_user | String | optional, ssh_key_path | String | optional, - scale | Dyn | optional, + scale | scaling.ScalePolicy | optional, }, } diff --git a/schemas/infrastructure/compute/scaling.ncl b/schemas/infrastructure/compute/scaling.ncl new file mode 100644 index 0000000..3936454 --- /dev/null +++ b/schemas/infrastructure/compute/scaling.ncl @@ -0,0 +1,26 @@ +let node_role_contract = [| 'ControlPlane, 'Worker, 'LoadBalancer |] in + +let scale_template_contract = { + server_type | String, + location | String, + hostname_pattern | String, + private_network | String, + ip_range_prefix | String, + formula_id | String | optional, + image_role | String | optional, + os_type | String | optional, + architecture | String | optional, +} in + +let scale_policy_contract = { + role | node_role_contract, + min | Number, + max | Number, + template | scale_template_contract, +} in + +{ + NodeRole = node_role_contract, + ScaleTemplate = scale_template_contract, + ScalePolicy = scale_policy_contract, +} diff --git a/schemas/infrastructure/compute/server/contracts.ncl b/schemas/infrastructure/compute/server/contracts.ncl index 486d52f..6eafe8a 100644 --- 
a/schemas/infrastructure/compute/server/contracts.ncl +++ b/schemas/infrastructure/compute/server/contracts.ncl @@ -1,6 +1,4 @@ -# | Server configuration contracts (schema definitions) -# | Migrated from: provisioning/kcl/server.k -# | Pattern: Pure schema definitions using Nickel contracts +let scaling = import "../scaling.ncl" in { Server = { @@ -37,6 +35,7 @@ main_domain | String | optional, domains_search | String | optional, user_ssh_key_path | String | optional, - scale | Dyn | optional, + role | scaling.NodeRole | optional, + scale | scaling.ScalePolicy | optional, }, } diff --git a/schemas/infrastructure/images/contracts.ncl b/schemas/infrastructure/images/contracts.ncl new file mode 100644 index 0000000..7211daa --- /dev/null +++ b/schemas/infrastructure/images/contracts.ncl @@ -0,0 +1,46 @@ +# ImageRole Contracts — type definitions for provider role images and their state. + +{ + ImageState = [| 'keep, 'delete_after_use, 'delete_time_lapse, 'archive |], + + HardwareLimits = { + min_memory_gb | Number, + min_disk_gb | Number, + allowed_types | Array String, + network_required | Bool, + ports_required | Array Number, + ssh_required | Bool, + }, + + ImagePackage = { + name | String, + version | String | optional, + }, + + ImageRole = { + name | String, + os_base | String, + provider | String, + template_name | String, + state | ImageState, + state_config | { + freshness_days | Number, + delete_after_hours | Number | optional, + archive_path | String | optional, + }, + packages | Array ImagePackage, + labels | { .. }, + hardware | HardwareLimits, + }, + + # Written to ~/.config/provisioning/images/{provider}-{role}.ncl + ImageRoleState = { + provider | String, + role | String, + snapshot_id | String, + built_at | String | optional, + last_used | String | optional, + os_base | String, + labels | { .. 
},
+  },
+}
diff --git a/schemas/infrastructure/images/defaults.ncl b/schemas/infrastructure/images/defaults.ncl
new file mode 100644
index 0000000..ec1d3dd
--- /dev/null
+++ b/schemas/infrastructure/images/defaults.ncl
@@ -0,0 +1,30 @@
+# ImageRole Defaults — base values for image role definitions.
+
+{
+  image_role | default = {
+    os_base | default = "debian-12",
+    provider | default = "hetzner",
+    template_name | default = "hetzner_build_image.j2",
+    state | default = 'keep,
+    state_config | default = {
+      freshness_days | default = 30,
+    },
+    packages | default = [],
+    labels | default = {},
+    hardware | default = {
+      min_memory_gb | default = 2,
+      min_disk_gb | default = 20,
+      allowed_types | default = ["cax11", "cax21"],
+      network_required | default = true,
+      ports_required | default = [],
+      ssh_required | default = true,
+    },
+  },
+
+  image_role_state | default = {
+    snapshot_id | default = "SNAPSHOT_PENDING",
+    # built_at / last_used deliberately have no default: ImageRoleState types them
+    # `String | optional`, which a `null` default would violate. Absent = not yet known.
+    labels | default = {},
+  },
+}
diff --git a/schemas/infrastructure/images/main.ncl b/schemas/infrastructure/images/main.ncl
new file mode 100644
index 0000000..624a069
--- /dev/null
+++ b/schemas/infrastructure/images/main.ncl
@@ -0,0 +1,17 @@
+# ImageRole public API — types and maker functions for provider role images.
+
+# NOTE(review): contracts_lib is imported but not referenced anywhere in this file —
+# confirm whether the makers below were meant to apply its contracts.
+let contracts_lib = import "./contracts.ncl" in
+let defaults_lib = import "./defaults.ncl" in
+
+{
+  # Raw defaults record, re-exported as-is.
+  defaults = defaults_lib,
+
+  # Builds an image role by merging `overrides` onto the image_role defaults.
+  # Not exported: functions cannot be serialized.
+  make_image_role | not_exported = fun overrides =>
+    defaults_lib.image_role & overrides,
+
+  # Builds an image role state by merging `overrides` onto the image_role_state defaults.
+  make_image_role_state | not_exported = fun overrides =>
+    defaults_lib.image_role_state & overrides,
+
+  # Aliases for the unmodified default records.
+  DefaultImageRole = defaults_lib.image_role,
+  DefaultImageRoleState = defaults_lib.image_role_state,
+}
diff --git a/schemas/lib/backup_group.ncl b/schemas/lib/backup_group.ncl
new file mode 100644
index 0000000..d163117
--- /dev/null
+++ b/schemas/lib/backup_group.ncl
@@ -0,0 +1,41 @@
+# Backup group contracts — consistency points across multiple components.
+# When services interact (Odoo + PostgreSQL + filestore, mail + LDAP + indexes,
+# etc.) a per-component snapshot can leave inconsistency between members in DR.
+# A BackupGroup declares a coordination window so the manager can produce a
+# consistent cut (à la Chandy-Lamport) across members in the same instant.
+
+let bp = import "backup_policy.ncl" in
+let vault = import "vault_refs.ncl" in
+
+{
+  # Member of a backup group. Either references a whole component policy or
+  # a specific scope of that policy.
+  GroupMember = {
+    component | String | doc "ComponentDef.name",
+    scope | String | optional | doc "BackupScope.name; omitted = all scopes",
+  },
+
+  # Coordination strategies. 'best_effort tags members with the same group_id
+  # but does not synchronize; 'quiesce_window runs ordered pre-hooks; 'csi_consistent_group
+  # delegates atomicity to the CSI driver (requires Longhorn ≥ supported version).
+  CoordinationStrategy = {
+    kind | [| 'best_effort, 'quiesce_window, 'csi_consistent_group |],
+    quiesce_seq | Array String | default = [],
+    max_downtime | bp.Duration | optional,
+    snapshot_class | String | optional,
+  },
+
+  # A set of members backed up on one schedule as a single consistent cut.
+  BackupGroup = {
+    name | String | doc "Identifier (used in CLI: --group )",
+    members | Array GroupMember
+      | doc "Components or scopes participating in the consistent cut",
+    schedule | bp.Schedule,
+    coordination | CoordinationStrategy,
+    retention | bp.RetentionPolicy,
+    # Shape contract first (it fills Destination defaults such as `role`), then the
+    # ≥2-destinations / one-'primary invariant that the doc string promises.
+    destinations | Array bp.Destination
+      | bp.MultiDestinationRequired
+      | doc "Same MultiDestinationRequired invariant as BackupPolicy",
+    encryption | vault.VaultKeyRef,
+    tag_strategy | bp.TagStrategy,
+    verify | bp.VerifyPolicyRef | optional,
+  },
+}
diff --git a/schemas/lib/backup_policy.ncl b/schemas/lib/backup_policy.ncl
new file mode 100644
index 0000000..18d1497
--- /dev/null
+++ b/schemas/lib/backup_policy.ncl
@@ -0,0 +1,186 @@
+# Backup policy contracts — declarative description of how a component is backed up.
+# Consumed by the backup-manager crate (one-shot, daemon, standalone, coordinator modes).
+# Encryption, multi-destination replication and non-empty scopes are enforced as
+# Nickel contracts so misconfiguration fails at `nickel export` time, not at runtime.
+
+let vault = import "vault_refs.ncl" in
+
+# Duration: a non-empty string holding a numeric amount and a time unit.
+# The check is intentionally loose (it does not parse the value): it requires at
+# least one digit and at least one of the unit letters s, m, h, d, so strings
+# such as "hello" are rejected while "5m", "24h" or "1h30m" pass.
+let _Duration = std.contract.from_validator (fun value =>
+  if !(std.is_string value)
+  then 'Error { message = "Duration must be a String" }
+  else if std.string.length value == 0
+  then 'Error { message = "Duration must be non-empty" }
+  else if !(std.string.is_match "[0-9]" value)
+  then 'Error { message = "Duration must contain a numeric amount (e.g. \"5m\", \"24h\")" }
+  else if !(std.string.contains "s" value
+            || std.string.contains "m" value
+            || std.string.contains "h" value
+            || std.string.contains "d" value)
+  then 'Error { message = "Duration must contain a time unit (s, m, h, d)" }
+  else 'Ok
+) in
+
+# CronExpr: a string with 5 or 6 space-separated fields. Field contents are not
+# validated. Empty fragments produced by repeated, leading or trailing spaces are
+# dropped before counting so they cannot pad a short expression up to 5/6 fields.
+let _CronExpr = std.contract.from_validator (fun value =>
+  if !(std.is_string value)
+  then 'Error { message = "CronExpr must be a String" }
+  else
+    let parts = std.array.filter (fun p => p != "") (std.string.split " " value) in
+    let count = std.array.length parts in
+    if count == 5 || count == 6
+    then 'Ok
+    else 'Error { message = "CronExpr must have 5 or 6 space-separated fields" }
+) in
+
+# Free-form string-to-string tag map.
+let _Tags = { _ | String } in
+
+# NOTE(review): not referenced anywhere in this file.
+let _DnsRecordsLike = { .. } in
+
+{
+  Duration = _Duration,
+  CronExpr = _CronExpr,
+  Tags = _Tags,
+
+  # Schedule discriminated union: cron, interval or NATS event-driven.
+  # Every variant-specific field is optional; the contract does not check that the
+  # fields matching `kind` are actually present.
+  Schedule = {
+    kind | [| 'cron, 'interval, 'on_event |],
+    cron_expr | _CronExpr | optional,
+    jitter_sec | Number | optional,
+    every | _Duration | optional,
+    jitter | _Duration | optional,
+    subject | String | optional | doc "NATS subject when kind = 'on_event",
+    debounce | _Duration | optional,
+  },
+
+  # Retention preset — keeps last N + N daily/weekly/monthly/yearly snapshots.
+  RetentionPolicy = {
+    keep_last | Number | default = 7,
+    keep_daily | Number | default = 7,
+    keep_weekly | Number | default = 4,
+    keep_monthly | Number | default = 6,
+    keep_yearly | Number | default = 0,
+    prune_after | _Duration | optional | doc "Delete data older than this regardless of keep_* (safety bound)",
+  },
+
+  # A backup destination (where snapshots end up). At least 2 are required
+  # when policy is enabled (MultiDestinationRequired contract).
+  Destination = {
+    name | String | doc "Stable identifier (used in metrics labels and tags)",
+    kind | [| 's3, 'b2, 'local, 'sftp, 'rest_server |],
+    uri | String | doc "restic-style URI: 's3:host/bucket', 'b2:bucket', 'sftp:user@host:/path', etc.",
+    cred_ref | vault.VaultCredRef,
+    # Defaults to 'replica, so a 'primary destination must be declared explicitly.
+    role | [| 'primary, 'replica, 'archive |] | default = 'replica,
+    region | String | optional,
+  },
+
+  # Tagging strategy for snapshots. The actual tags emitted are a deterministic
+  # function of {component, scope, parameters} computed by the manager.
+  TagStrategy = {
+    component_label | String | doc "Used as `component=` tag",
+    extra | Array String | doc "Additional static tags (k=v strings)" | default = [],
+  },
+
+  # Database dump strategy. `kind` selects one of five variants: three dump
+  # flavours ('stream_to_stdin, 'dump_to_path, 'pre_dump_then_path) and two
+  # snapshot flavours ('csi_volume_snapshot, 'app_quiesce_then_snapshot).
+  # All other fields are optional (or defaulted); the contract does not tie them to `kind`.
+  DumpStrategy = {
+    kind | [| 'stream_to_stdin, 'dump_to_path, 'pre_dump_then_path,
+              'csi_volume_snapshot, 'app_quiesce_then_snapshot |],
+    dump_command | String | optional,
+    path | String | optional,
+    cleanup | Bool | default = true,
+    volume | String | optional,
+    snapshot_class | String | optional,
+    quiesce_cmd | String | optional,
+    unquiesce_cmd | String | optional,
+  },
+
+  # Database engines a 'database scope may reference.
+  DbEngine = [| 'postgresql, 'mariadb, 'mysql, 'redis, 'mongodb, 'surrealdb, 'etcd, 'sqlite |],
+
+  # Discriminated scope: what gets backed up and how it's grouped/tagged.
+  # Fields for all scope kinds live in one record; everything except `kind` and
+  # `name` is optional or defaulted, and the contract does not tie fields to `kind`.
+  BackupScope = {
+    kind | [| 'service_full, 'per_domain, 'per_mailbox, 'database,
+              'volume_snapshot, 'logs_archive, 'kv_export |],
+    name | String | doc "Identifier within a policy (used in CLI: --scope )",
+    paths | Array String | default = [],
+    exclude | Array String | default = [],
+    domains | Array String | default = [],
+    base_path | String | default = "",
+    selector | String | optional,
+    engine | DbEngine | optional,
+    dump_strategy | DumpStrategy | optional,
+    volumes | Array String | default = [],
+    snapshot_class | String | optional,
+    sources | Array String | default = [],
+    format | [| 'jsonl_gz, 'tar_gz, 'restic_native, 'sqlite_dump |] | optional,
+    rotation | _Duration | optional,
+    source | [| 'etcd, 'consul, 'loki, 'journald, 'files |] | optional,
+    tag_prefix | String | default = "",
+    tags | _Tags | default = {},
+  },
+
+  # Pre/post hooks executed by the manager around the backup run.
+  Hooks = {
+    pre | Array String | default = [],
+    post | Array String | default = [],
+    timeout | _Duration | default = "5m",
+    abort_on_failure | Bool | default = true,
+  },
+
+  # Throttle network bandwidth (passed to provider as --limit-upload/--limit-download).
+  Throttle = {
+    upload_kbps | Number | optional,
+    download_kbps | Number | optional,
+  },
+
+  # Verify policy. Drill is a separate spec consumed by verify_policy.ncl.
+  # The inline schedule validates cron_expr with the same _CronExpr contract as
+  # Schedule.cron_expr, so a malformed verify schedule fails at the same stage.
+  VerifyPolicyRef = {
+    schedule | { kind | [| 'cron, 'interval |], cron_expr | _CronExpr | optional, every | _Duration | optional } | optional,
+    level | [| 'quick, 'deep, 'restore_drill, 'full_dr |] | default = 'quick,
+    drill_ref | String | optional | doc "Reference to a DrillSpec by name (looked up from verify-recipes/)",
+  },
+
+  # Provider reference. Manager resolves to extensions/providers/backup//.
+  BackupProviderRef = {
+    name | String | doc "Provider directory name (e.g. 
'restic', 'kopia')",
+    version | String | optional | doc "Pinned version; warn if installed CLI mismatches",
+  },
+
+  # === Contracts ===========================================================
+
+  # NonEmptyScopes: an enabled BackupPolicy must have at least one scope.
+  # Non-array input is reported as a contract error instead of failing inside
+  # std.array.length.
+  NonEmptyScopes = std.contract.from_validator (fun value =>
+    if !(std.is_array value)
+    then 'Error { message = "BackupPolicy.scopes must be an Array of BackupScope" }
+    else if std.array.length value > 0
+    then 'Ok
+    else 'Error { message = "BackupPolicy.scopes must contain at least one BackupScope" }
+  ),
+
+  # MultiDestinationRequired: enforces the off-site replication invariant.
+  # A policy must declare ≥2 destinations and at least one with role = 'primary.
+  # An entry with no `role` field is treated as non-primary (Destination.role
+  # defaults to 'replica), so the check also works on values that have not yet
+  # been through the Destination contract.
+  MultiDestinationRequired = std.contract.from_validator (fun value =>
+    if !(std.is_array value)
+    then 'Error { message = "BackupPolicy.destinations must be an Array of Destination" }
+    else if std.array.length value < 2
+    then 'Error {
+      message = "BackupPolicy.destinations must contain at least 2 entries (off-site replication is non-negotiable)",
+    }
+    else
+      let is_primary = fun d =>
+        std.is_record d && std.record.has_field "role" d && d.role == 'primary
+      in
+      let has_primary = std.array.any is_primary value in
+      if !has_primary
+      then 'Error {
+        message = "BackupPolicy.destinations must contain at least one entry with role = 'primary",
+      }
+      else 'Ok
+  ),
+
+  # === Top-level policy ====================================================
+
+  # The two invariants above are applied here, after the Array shape contracts,
+  # so a misconfigured policy is rejected when the contract is checked.
+  BackupPolicy = {
+    provider | BackupProviderRef,
+    destinations | Array Destination
+      | MultiDestinationRequired
+      | doc "≥2 destinations, at least one 'primary",
+    encryption | vault.VaultKeyRef
+      | doc "Encryption key reference in vault (E2E encryption is non-negotiable)",
+    schedule | Schedule,
+    retention | RetentionPolicy,
+    scopes | Array BackupScope
+      | NonEmptyScopes
+      | doc "1..N backup units; tagged determinístically",
+    tag_strategy | TagStrategy,
+    hooks | Hooks | optional,
+    verify | VerifyPolicyRef | optional,
+    throttle | Throttle | optional,
+    consistency_group | String | optional | doc "If set, this policy participates in a BackupGroup",
+  },
+}
diff --git a/schemas/lib/build_spec.ncl b/schemas/lib/build_spec.ncl
new file mode 100644
index 0000000..b50f6d8
--- /dev/null
+++ b/schemas/lib/build_spec.ncl
@@ -0,0 +1,71 @@
+# 
schemas/lib/build_spec.ncl — BuildSpec contract (ADR-039)
+#
+# Schema for .build-spec.ncl files at the root of each built repo.
+# buildkit-launcher validates against this schema at parse time and exits
+# non-zero on failure (constraint: build-spec-schema-versioned).
+#
+# Three-tier sizing resolution (launcher, not schema):
+#   1. Explicit declaration here (highest priority)
+#   2. P95 historical from orchestrator SurrealDB × 1.2
+#   3. Language-default fallback (lowest priority)
+#
+# Usage:
+#   let bs = import "schemas/lib/build_spec.ncl" in
+#   { .. } | bs.BuildSpec
+
+# Accepts any number strictly greater than 0. Non-numeric input is rejected with
+# a contract error rather than failing inside the `>` comparison.
+let positive_number_ =
+  std.contract.custom (
+    fun label =>
+      fun value =>
+        if !(std.is_number value) then
+          'Error {
+            message = "Expected a positive number, got a non-numeric value.\nAll resource fields must be > 0"
+          }
+        else if value > 0 then
+          'Ok value
+        else
+          'Error {
+            message = "Expected a positive number, got '%{std.to_string value}'.\nAll resource fields must be > 0"
+          }
+  )
+in
+
+# Accepts a number in the half-open range (0, 256].
+let bounded_cpu_ =
+  std.contract.custom (
+    fun label =>
+      fun value =>
+        if !(std.is_number value) then
+          'Error {
+            message = "Invalid cpu value: expected a number.\nValid range: (0, 256]"
+          }
+        else if value > 0 && value <= 256 then
+          'Ok value
+        else
+          'Error {
+            message = "Invalid cpu value '%{std.to_string value}'.\nValid range: (0, 256]"
+          }
+  )
+in
+
+# Accepts a number of minutes in the half-open range (0, 1440], i.e. at most 24 hours.
+let bounded_time_budget_ =
+  std.contract.custom (
+    fun label =>
+      fun value =>
+        if !(std.is_number value) then
+          'Error {
+            message = "Invalid time_budget_min: expected a number.\nValid range: (0, 1440] — max 24 hours"
+          }
+        else if value > 0 && value <= 1440 then
+          'Ok value
+        else
+          'Error {
+            message = "Invalid time_budget_min '%{std.to_string value}'.\nValid range: (0, 1440] — max 24 hours"
+          }
+  )
+in
+
+# Record contract for a .build-spec.ncl file; resource fields use the range
+# contracts defined above.
+let _BuildSpec = {
+  schema_version | Number | doc "Schema version — buildkit-launcher rejects files with unknown versions" | default = 1,
+  cpu | bounded_cpu_ | doc "Virtual CPUs to request for the ephemeral runner VM",
+  memory_gb | positive_number_ | doc "RAM in GiB for the runner VM",
+  disk_gb | positive_number_ | doc "Ephemeral disk in GiB; no persistent storage — all state is destroyed with the VM",
+  time_budget_min | bounded_time_budget_ | doc "Hard wall-clock limit in minutes; VM is killed on expiry",
+  cache_keys | Array String | doc "sccache / BuildKit cache key namespaces to warm for this repo" | default = [],
+  oom_retry | Bool | doc "When true, 
launcher retries once at next size tier on OOM kill; bounded to 1 retry (constraint: oom-retry-bounded)" | default = true,
+} in
+
+{
+  # Public contract names. All three range contracts used by BuildSpec are
+  # exported so callers can reuse them on their own fields.
+  BuildSpec = _BuildSpec,
+  PositiveNumber = positive_number_,
+  BoundedCpu = bounded_cpu_,
+  BoundedTimeBudget = bounded_time_budget_,
+
+  # Applies the BuildSpec contract to `data` and returns the checked record.
+  make_build_spec | not_exported = fun data => data | _BuildSpec,
+}
diff --git a/schemas/lib/capabilities.ncl b/schemas/lib/capabilities.ncl
new file mode 100644
index 0000000..90d23bc
--- /dev/null
+++ b/schemas/lib/capabilities.ncl
@@ -0,0 +1,73 @@
+# schemas/lib/capabilities.ncl — InfraCapabilities contract
+#
+# Declares what the infrastructure provides: cluster runtime, storage classes,
+# ingress, TLS, volumes, networking, and registry topology.
+# Source of truth for cross-validation against component requires.* fields
+# and for registry resolution by integration tooling (prvng i).
+#
+# Usage:
+#   let cap = import "schemas/lib/capabilities.ncl" in
+#   { provides | cap.InfraCapabilities = { ... } }
+
+{
+  # Registry roles — determines namespace ownership and sync direction.
+  #   'primary  canonical store; other registries replicate FROM it
+  #   'build    builder-local store; owns ephemeral cache namespaces
+  #   'dev      developer workstation; on-demand mirror of primary
+  #   'mirror   read-only replica with no own namespaces
+  RegistryRole = [| 'primary, 'build, 'dev, 'mirror |],
+
+  # Per-registry namespace policy.
+ # own — namespaces this registry is authoritative for + # replicate_to — ids of other registries that should receive sync of `prefixes` + # mirror_from — id of upstream registry to mirror `prefixes` from (on-demand) + # prefixes — which namespace prefixes are synced (cross-registry contracts) + RegistryNamespaces = { + own | Array String | default = [], + replicate_to | Array String | default = [], + mirror_from | String | optional, + prefixes | Array String | default = [], + }, + + RegistryEntry = { + id | String, + endpoint | String, + role | RegistryRole, + tls | Bool | default = true, + namespaces | RegistryNamespaces | default = {}, + }, + + # Multi-registry topology for a workspace. + # registries — ordered list; first 'primary entry is the canonical store + # default — id of the registry used by integration tooling when no + # --registry flag or PROVISIONING_REGISTRY env is set + RegistriesConfig = { + registries | Array RegistryEntry | default = [], + default | String | optional, + }, + + InfraCapabilities = { + cluster | { + name | String, + runtime | String, + .. + } | optional, + storage_classes | Array String | default = [], + ingress_class | String | optional, + container_runtime | String | optional, + volumes | { _ | { mount | String, size_gb | Number } } | default = {}, + networking | { + private_network | String | optional, + subnet | String | optional, + floating_ip | String | optional, + .. + } | default = {}, + tls | { + cluster_issuer | String | optional, + available | Bool | default = false, + .. + } | default = {}, + registries | RegistriesConfig | default = {}, + .. + }, +} diff --git a/schemas/lib/concerns.ncl b/schemas/lib/concerns.ncl new file mode 100644 index 0000000..9e36da2 --- /dev/null +++ b/schemas/lib/concerns.ncl @@ -0,0 +1,171 @@ +# Service Concerns Umbrella — mandatory declarative surface in ComponentDef. 
+# Every component must declare what it does (or doesn't do) for each concern: +# tls, dns, certs, backup, observability, security. Each is one of: +# 'enabled — concern is implemented; impl carries the configuration +# 'disabled — explicitly opt-out, with a stated reason +# 'pending — implementation deferred, with a backlog reference +# 'inherited — copied from a parent component (e.g. odoo profile) +# +# The umbrella absorbs the loose fields that components carry today +# (tls_secret, cluster_issuer, cert{}, dns_internal, dns_records, …) into +# typed variants. Existing 'extensions/components//nickel/main.ncl helpers +# may continue to read the loose fields for backwards compatibility while +# also emitting `concerns` for new consumers. + +let bp = import "backup_policy.ncl" in + +{ + # === Concern state ======================================================== + + # Discriminated union of concern states. Encoded as a record with a `kind` + # tag so multiple concerns can coexist in a single ServiceConcerns record + # (Nickel does not support algebraic data types directly). + ConcernState = { + kind | [| 'enabled, 'disabled, 'pending, 'inherited |], + + # 'enabled — payload depends on the concern (tls.impl, dns.impl, …); + # callers thread the right impl type via the wrapper records below. + + # 'disabled + reason | String | optional, + since | String | optional | doc "ISO date when concern was explicitly disabled", + + # 'pending + backlog_ref | String | optional | doc "Identifier of the backlog/issue tracking the implementation", + target_iteration | String | optional, + + # 'inherited + from | String | optional | doc "Name of the parent ComponentDef the concern is inherited from", + + # 'enabled payload — exactly one of these is populated based on the concern + tls_impl | { .. } | optional, + dns_impl | { .. } | optional, + certs_impl | { .. } | optional, + backup_impl | { .. } | optional, + observability_impl | { .. } | optional, + security_impl | { .. 
} | optional,
+  },
+
+  # === Concern impl types ===================================================
+
+  # TLS implementation. Absorbs `tls_secret`, `cluster_issuer`, `tls_hostnames`.
+  TlsImpl = {
+    secret_name | String | doc "K8s Secret name where cert-manager stores the cert (was tls_secret)",
+    issuer_ref | String | doc "ClusterIssuer name (was cluster_issuer)",
+    hostnames | Array String | doc "Additional SANs (was tls_hostnames)" | default = [],
+  },
+
+  # DNS implementation. Absorbs `dns_internal` (private routes via gateway),
+  # `dns_records` (public records: domain/mx/spf/dmarc/dkim_selector/autoconfig),
+  # `dns_zone`, `acme_email`.
+  # DnsRoute is one internal route entry; it is the element type of DnsImpl.internal.
+  DnsRoute = {
+    name | String,
+    zone | String,
+    gateway | String | optional,
+    target | String | optional,
+  },
+
+  # Public DNS record set; it is the type of DnsImpl.public. Every field is
+  # optional or defaulted.
+  DnsRecordSpec = {
+    domain | String | optional,
+    hostname | String | optional,
+    mx | Array { priority | Number, value | String } | default = [],
+    spf | String | optional,
+    dmarc | { policy | [| 'none, 'quarantine, 'reject |], rua | String | optional, ruf | String | optional } | optional,
+    autoconfig | String | optional,
+    dkim_selector | String | optional,
+    extra | { .. } | doc "Free-form provider-specific records" | default = {},
+  },
+
+  # Payload for an enabled DNS concern: internal routes plus an optional public record set.
+  DnsImpl = {
+    internal | Array DnsRoute | doc "Was dns_internal (dns_private.via_gateway/make_route)" | default = [],
+    public | DnsRecordSpec | optional | doc "Was dns_records",
+    zone | String | optional | doc "Was dns_zone",
+    acme_email | String | optional | doc "Was acme_email (only used when certs concern derives from this email)",
+  },
+
+  # Certificates implementation. Absorbs `cert = { acme_server, email, secret_ref, provider }`.
+  # Distinct from TLS: TLS = the request made to the issuer; Certs = the ACME issuer's configuration.
+ CertsImpl = { + acme_server | String, + email | String, + secret_ref | String | doc "DNS provider credentials secret reference", + provider | [| 'cloudflare, 'hetzner, 'aws, 'route53, 'digitalocean, 'gcp, 'azure |], + }, + + # Observability implementation. Surface only — deeper schemas land in a + # later iteration. Components most commonly declare 'pending here. + ObservabilityImpl = { + metrics | { enabled | Bool, port | Number | optional, path | String | default = "/metrics" } | default = { enabled = false }, + logs | { enabled | Bool, sink | [| 'stdout, 'loki, 'journald |] | default = 'stdout } | default = { enabled = false }, + traces | { enabled | Bool, otlp_endpoint | String | optional } | default = { enabled = false }, + alerts | Array { name | String, expr | String, severity | [| 'info, 'warning, 'critical |] | default = 'warning } | default = [], + }, + + # Security implementation. Surface only. + SecurityImpl = { + network_policy | String | optional | doc "Reference to a NetworkPolicy resource", + pod_security | [| 'restricted, 'baseline, 'privileged |] | optional, + rbac | String | optional | doc "Reference to RBAC bundle", + }, + + # === Builders ============================================================= + # Helper functions for components and migrations to construct ConcernState + # values without repeating the discriminated-union plumbing. 
+
+  # enabled_*: each wraps `impl` in a record with kind = 'enabled and stores it
+  # under the payload field for that concern. `impl` is passed through unchanged;
+  # nothing here checks it against the matching *Impl contract.
+  enabled_tls = fun impl => {
+    kind = 'enabled,
+    tls_impl = impl,
+  },
+
+  enabled_dns = fun impl => {
+    kind = 'enabled,
+    dns_impl = impl,
+  },
+
+  enabled_certs = fun impl => {
+    kind = 'enabled,
+    certs_impl = impl,
+  },
+
+  enabled_backup = fun impl => {
+    kind = 'enabled,
+    backup_impl = impl,
+  },
+
+  enabled_observability = fun impl => {
+    kind = 'enabled,
+    observability_impl = impl,
+  },
+
+  enabled_security = fun impl => {
+    kind = 'enabled,
+    security_impl = impl,
+  },
+
+  # Explicit opt-out; `reason_text` is stored in `reason`.
+  disabled = fun reason_text => {
+    kind = 'disabled,
+    reason = reason_text,
+  },
+
+  # Deferred implementation; takes the reason first, then the backlog reference.
+  pending = fun reason_text backlog => {
+    kind = 'pending,
+    reason = reason_text,
+    backlog_ref = backlog,
+  },
+
+  # Concern taken from a parent component; `parent_name` is stored in `from`.
+  inherited = fun parent_name => {
+    kind = 'inherited,
+    from = parent_name,
+  },
+
+  # === Top-level umbrella ===================================================
+
+  # All six concerns are required fields; each one is a ConcernState.
+  ServiceConcerns = {
+    tls | ConcernState,
+    dns | ConcernState,
+    certs | ConcernState,
+    backup | ConcernState,
+    observability | ConcernState,
+    security | ConcernState,
+  },
+}
diff --git a/schemas/lib/concerns_presets.ncl b/schemas/lib/concerns_presets.ncl
new file mode 100644
index 0000000..86e4b7a
--- /dev/null
+++ b/schemas/lib/concerns_presets.ncl
@@ -0,0 +1,193 @@
+# Reusable ServiceConcerns presets for component defaults.
+#
+# Component contracts.ncl files declare `concerns | _concerns_lib.ServiceConcerns | optional`
+# and their defaults.ncl files set `concerns | default = presets.` to give
+# the component an honest declarative surface without repeating boilerplate.
+# +# Presets cover the recurring archetypes in libre-wuji: +# - stateless : no TLS/DNS/data — most container runtimes, +# kernel modules, OS-level taskservs +# - infra_storage_managed : storage backends that handle their own +# backup outside per-component policies +# (Longhorn engine state via SystemBackupDef) +# - tls_endpoint_with_acme : public service with cert-manager TLS +# and ACME issuer config; backup decided +# at workspace level +# - observability_telemetry : Prometheus/Grafana/Loki/Vector — config +# in git, data either transient or already +# shipped to S3 +# - infrastructure_glue : controllers/operators with no user data +# (cilium, hccm, csi, ops-controller) + +let _pending_obs = { + kind = 'pending, + reason = "ObservabilityImpl iteration deferred — surface stub only", + backlog_ref = "OBS-001", +} in + +let _pending_sec = { + kind = 'pending, + reason = "SecurityImpl iteration deferred — surface stub only", + backlog_ref = "SEC-001", +} in + +{ + presets = { + # ── Stateless service ──────────────────────────────────────────────── + # Container runtimes (containerd, runc, crun, youki), OS modules, + # kernel-level taskservs. No persistent state, no network endpoints + # exposed at the component level. + stateless = { + tls = { kind = 'disabled, reason = "no TLS termination at this layer" }, + dns = { kind = 'disabled, reason = "no DNS records owned by this component" }, + certs = { kind = 'disabled, reason = "no ACME issuer required" }, + backup = { + kind = 'disabled, + reason = "stateless: configuration in git, no runtime data to capture", + }, + observability = _pending_obs, + security = _pending_sec, + }, + + # ── Storage backend with its own backup model ──────────────────────── + # Longhorn (engine state in SystemBackupDef.longhorn_engine), local-path + # provisioner, Hetzner CSI, democratic CSI. Their data is captured by + # the system-level backup, not per-component. 
+ infra_storage_managed = { + tls = { kind = 'disabled, reason = "internal cluster storage, no TLS endpoint" }, + dns = { kind = 'disabled, reason = "no DNS records owned by this component" }, + certs = { kind = 'disabled, reason = "no ACME issuer required" }, + backup = { + kind = 'disabled, + reason = "engine state captured by SystemBackupDef.longhorn_engine (or equivalent system target)", + }, + observability = _pending_obs, + security = _pending_sec, + }, + + # ── Public service with cert-manager TLS + ACME ────────────────────── + # docker-mailserver, odoo, zot, anything that terminates HTTPS or SMTPS + # via cert-manager. Tls/Dns/Certs concerns get populated from existing + # tls_secret/cluster_issuer/cert/dns_records fields. Backup decided at + # workspace level (concerns.backup overridden in infra//components/.ncl). + tls_endpoint_with_acme = fun args => + { + tls = { + kind = 'enabled, + tls_impl = { + secret_name = args.tls_secret, + issuer_ref = args.cluster_issuer, + hostnames = args.hostnames, + }, + }, + dns = { + kind = 'enabled, + dns_impl = { + internal = args.dns_internal, + zone = args.dns_zone, + }, + }, + certs = { + kind = 'enabled, + certs_impl = { + acme_server = args.acme_server, + email = args.acme_email, + secret_ref = args.cert_secret_ref, + provider = args.cert_provider, + }, + }, + backup = { + kind = 'pending, + reason = "BackupPolicy declared at workspace level", + backlog_ref = args.backup_backlog_ref, + }, + observability = _pending_obs, + security = _pending_sec, + }, + + # ── Observability stack components (Prometheus/Grafana/Loki/Vector) ── + # No user data; configuration in git; metric/log data either transient + # (Prometheus WAL) or already shipped to S3 (Loki via boltdb-shipper). 
+ observability_telemetry = { + tls = { kind = 'disabled, reason = "internal cluster service, ingress-level TLS handled separately" }, + dns = { kind = 'disabled, reason = "no DNS records owned by this component" }, + certs = { kind = 'disabled, reason = "no ACME issuer required" }, + backup = { + kind = 'disabled, + reason = "config in git; runtime data either transient or shipped to S3 backend", + }, + observability = { + kind = 'enabled, + observability_impl = { + metrics = { enabled = true, port = 9090, path = "/metrics" }, + logs = { enabled = true, sink = 'loki }, + traces = { enabled = false }, + alerts = [], + }, + }, + security = _pending_sec, + }, + + # ── Infrastructure glue (controllers/operators) ────────────────────── + # cilium, hccm, hetzner-csi, ops-controller. State lives in K8s API, + # captured by SystemBackupDef.cluster_resources. + infrastructure_glue = { + tls = { kind = 'disabled, reason = "controller-level RBAC, not TLS endpoint" }, + dns = { kind = 'disabled, reason = "no DNS records owned by this component" }, + certs = { kind = 'disabled, reason = "no ACME issuer required" }, + backup = { + kind = 'disabled, + reason = "state in K8s API captured by SystemBackupDef.cluster_resources", + }, + observability = _pending_obs, + security = _pending_sec, + }, + + # ── DNS provider service (CoreDNS, external-dns) ───────────────────── + # Owns DNS records but typically not TLS endpoint of its own. 
+ dns_provider = { + tls = { kind = 'disabled, reason = "DNS server, not TLS endpoint" }, + dns = { + kind = 'enabled, + dns_impl = { + internal = [], + zone = "", + }, + }, + certs = { kind = 'disabled, reason = "no ACME issuer required" }, + backup = { + kind = 'pending, + reason = "zone files captured by SystemBackupDef.external_dns", + backlog_ref = "BACKUP-DNS-001", + }, + observability = _pending_obs, + security = _pending_sec, + }, + + # ── Database (PostgreSQL, MariaDB, SurrealDB) ───────────────────────── + # Backup with database scope + dump strategy. Decided at workspace level. + database = { + tls = { kind = 'disabled, reason = "internal cluster service, ingress-level TLS handled separately" }, + dns = { kind = 'disabled, reason = "no public DNS records" }, + certs = { kind = 'disabled, reason = "no ACME issuer required" }, + backup = { + kind = 'pending, + reason = "BackupPolicy with database scope + dump_strategy declared at workspace level", + backlog_ref = "BACKUP-DB-001", + }, + observability = _pending_obs, + security = _pending_sec, + }, + }, + + # ── Helper for components that need to compose a custom ServiceConcerns + # from individual variants (rather than picking a preset wholesale). 
+ builders = { + pending = fun reason backlog => { + kind = 'pending, + reason = reason, + backlog_ref = backlog, + }, + disabled = fun reason => { kind = 'disabled, reason = reason }, + }, +} diff --git a/schemas/lib/contracts.ncl b/schemas/lib/contracts.ncl index f7d0685..2ee3e3a 100644 --- a/schemas/lib/contracts.ncl +++ b/schemas/lib/contracts.ncl @@ -2,6 +2,8 @@ # | Migrated from: provisioning/kcl/lib.k # | Pattern: Schema definitions only +let _concerns_lib = import "concerns.ncl" in + { StorageVol = { name | String, @@ -24,11 +26,26 @@ parts, }, + TaskServDependency = { + name | String, + kind | [| 'Requires, 'PrefersBefore, 'ConflictsWith |] | default = 'Requires, + condition | String | default = "", + }, + TaskServDef = { - name | String, - install_mode | String | default = "library", - profile | String | default = "default", + name | String, + install_mode | String | default = "library", + profile | String | default = "default", target_save_path | String | default = "", + depends_on | Array { + name | String, + kind | [| 'Requires, 'PrefersBefore, 'ConflictsWith |] | default = 'Requires, + condition | String | default = "", + } | default = [], + on_error | [| 'Stop, 'Continue, 'Retry |] | default = 'Stop, + max_retries | Number | default = 0, + params | { .. } | default = {}, + .. 
}, ClusterDef = { @@ -37,6 +54,130 @@ target_save_path | String | default = "", }, + # Unified component model — deployment mode selector + DeployMode = [| 'taskserv, 'cluster, 'container |], + + # Port exposure requirements declared by a component + PortRequirement = { + port | Number, + protocol | String | default = "TCP", + exposure | [| 'public, 'private, 'internal |] | default = 'internal, + }, + + # What a component needs from the infrastructure + ComponentRequires = { + storage | { size | String, persistent | Bool } | optional, + ports | Array { + port | Number, + protocol | String | default = "TCP", + exposure | [| 'public, 'private, 'internal |] | default = 'internal, + } | default = [], + credentials | Array String | default = [], + }, + + # What a component exposes to other components + ComponentProvides = { + service | String | optional, + port | Number | optional, + databases | Array String | default = [], + endpoints | Array String | default = [], + }, + + # Operations supported by a component (maps to CMD_TSK dispatch in scripts) + ComponentOperations = { + install | Bool | default = true, + update | Bool | default = false, + reinstall | Bool | default = false, + delete | Bool | default = false, + backup | Bool | default = false, + restore | Bool | default = false, + health | Bool | default = false, + config | Bool | default = false, + scripts | Bool | default = false, + restart | Bool | default = false, + }, + + # How to verify a component is live after deployment. + # Orthogonal to mode (provisioning mechanism) — describes runtime observability strategy. 
+ LiveCheckDef = { + # 'k8s_pods — kubectl get pods filtered by namespace+selector (via CP SSH) + # 'k8s_nodes — kubectl get nodes filtered by selector; healthy = all Ready (for worker components) + # 'k8s_api — proxy: apiserver reachable if kubectl returns node list + # 'systemd — systemctl is-active on target servers (skipped in ll fast path) + # 'none — no observable runtime state (one-shot ops, bare binaries) + strategy | [| 'k8s_pods, 'k8s_nodes, 'k8s_api, 'systemd, 'none |] | default = 'none, + # 'cp_only — SSH to control-plane only (kubectl sees all pods/nodes from there) + # 'target — SSH to component.target (typically CP for taskservs with explicit target) + # 'all_servers — check all servers in workspace state (systemd only; skipped in ll) + # 'workers_only — check only worker nodes (k8s_nodes for kubernetes_worker) + scope | [| 'cp_only, 'target, 'all_servers, 'workers_only |] | default = 'cp_only, + namespace | String | default = "", # overrides component.namespace for pod filter + selector | String | default = "", # overrides component.pod_selector; also used as node name filter + service | String | default = "", # systemd unit name + # Aggregation for multi-server checks (all_servers / workers_only scope): + # 'all_must_pass — any failure → degraded (runtimes, DNS) + # 'any_active — at least one live → partial acceptable + # 'majority — >50% live → healthy + aggregate | [| 'all_must_pass, 'any_active, 'majority |] | default = 'all_must_pass, + }, + + # Unified component definition — extends TaskServDef shape with mode, requires, provides. + # Open record with defaults on new fields: existing taskservs satisfy ComponentDef. 
+ ComponentDef = { + name | String, + mode | [| 'taskserv, 'cluster, 'container |] | default = 'taskserv, + target | String | optional, # server hostname (taskserv mode) + namespace | String | optional, # k8s namespace (cluster mode) + pod_selector | String | optional, # k8s pod name search pattern (overrides component name when k8s release name differs) + live_check | LiveCheckDef | default = { strategy = 'none, scope = 'cp_only, namespace = "", selector = "", service = "", aggregate = 'all_must_pass }, + node_selector | { _ | String } | optional, # k8s node affinity (cluster mode) + install_mode | String | default = "library", + profile | String | default = "default", + target_save_path | String | default = "", + depends_on | Array { + name | String, + kind | [| 'Requires, 'PrefersBefore, 'ConflictsWith |] | default = 'Requires, + condition | String | default = "", + } | default = [], + on_error | [| 'Stop, 'Continue, 'Retry |] | default = 'Stop, + max_retries | Number | default = 0, + params | { .. } | default = {}, + requires | { + storage | { size | String, persistent | Bool } | optional, + ports | Array { + port | Number, + protocol | String | default = "TCP", + exposure | [| 'public, 'private, 'internal |] | default = 'internal, + } | default = [], + credentials | Array String | default = [], + } | default = {}, + provides | { + service | String | optional, + port | Number | optional, + databases | Array String | default = [], + endpoints | Array String | default = [], + } | default = {}, + operations | { + install | Bool | default = true, + update | Bool | default = false, + reinstall | Bool | default = false, + delete | Bool | default = false, + backup | Bool | default = false, + restore | Bool | default = false, + health | Bool | default = false, + config | Bool | default = false, + scripts | Bool | default = false, + restart | Bool | default = false, + } | default = {}, + # Mandatory declarative surface for service-level concerns. 
Each entry is a + # ConcernState variant (enabled/disabled/pending/inherited). Components that + # don't implement a concern declare 'pending {reason, backlog_ref} or + # 'disabled {reason} — never omit. CI/ontoref consume this surface to emit + # backlog priorities and architecture documentation. + concerns | _concerns_lib.ServiceConcerns, + .. + }, + ScaleData = { def | String, disabled | Bool, diff --git a/schemas/lib/dag/contracts.ncl b/schemas/lib/dag/contracts.ncl new file mode 100644 index 0000000..3a5ab6b --- /dev/null +++ b/schemas/lib/dag/contracts.ncl @@ -0,0 +1,122 @@ +# schemas/lib/dag/contracts.ncl — DAG domain type contracts +# +# Three distinct DAG layers: +# 1. Capability layer — ExtensionCapability/ExtensionDependency (extension metadata) +# 2. Composition layer — WorkspaceComposition (inter-formula ordering) +# 3. Resolution layer — ResolutionPolicy (capability → extension mapping) +# +# Pattern: separate let bindings with _ prefix, same as formula.ncl. +# No self-references, no let rec — each binding is in scope for subsequent ones. + +# --------------------------------------------------------------------------- +# Capability layer +# --------------------------------------------------------------------------- +let _capability_kind = [| 'Required, 'Optional, 'ConflictsWith |] in + +let _ExtensionCapability = { + id | String, + version | String, + interface | String, +} in + +let _ExtensionDependency = { + capability | String, + kind | _capability_kind, + min_version | String | optional, +} in + +# --------------------------------------------------------------------------- +# Composition layer — inter-formula DAG +# Distinct from the intra-formula DAG in formula.ncl (per-server task ordering). +# WorkspaceComposition declares execution ordering between formulas. 
+# --------------------------------------------------------------------------- +let _composition_condition = [| 'Completed, 'Healthy, 'Running |] in + +let _FormulaDep = { + formula_id | String, + condition | _composition_condition, +} in + +let _HealthGate = { + check_cmd | String, + expect | String, + timeout_ms | Number, + retries | Number, + check_server | String | optional, +} in + +let _FormulaCompositionEntry = { + formula_id | String, + depends_on | Array _FormulaDep | default = [], + parallel | Bool | default = false, + health_gate | _HealthGate | optional, +} in + +# Base shape — used as first step inside the custom contract (same pattern as _FormulaBase +# in formula.ncl) so missing-field errors surface before cross-field validation runs. +let _WorkspaceCompositionBase = { + formulas | Array _FormulaCompositionEntry, +} in + +# Custom contract: validates referential integrity across formula entries. +# - At least one formula must have depends_on = [] (root node) +# - All depends_on[].formula_id must reference a declared formula_id +let _WorkspaceComposition = std.contract.custom (fun label value => + let base = value | _WorkspaceCompositionBase in + let ids = base.formulas |> std.array.map (fun e => e.formula_id) in + let has_root = base.formulas |> std.array.any (fun e => e.depends_on == []) in + + let bad_deps = base.formulas |> std.array.flat_map (fun e => + e.depends_on + |> std.array.filter (fun d => + !(ids |> std.array.any (fun id => id == d.formula_id)) + ) + |> std.array.map (fun d => + "formula '%{e.formula_id}' depends_on unknown '%{d.formula_id}'" + ) + ) in + + if !has_root then + std.contract.blame_with_message + "WorkspaceComposition: at least one formula must have depends_on = []" + label + else if (std.array.length bad_deps) > 0 then + std.contract.blame_with_message + "WorkspaceComposition: invalid depends_on references: %{std.string.join ", " bad_deps}" + label + else + 'Ok base +) in + +# 
--------------------------------------------------------------------------- +# Resolution layer — capability → concrete extension mapping +# --------------------------------------------------------------------------- +let _resolution_strategy = [| 'Strict, 'BestEffort |] in + +let _ResolutionEntry = { + capability_id | String, + extension_name | String, +} in + +let _ResolutionPolicy = { + strategy | _resolution_strategy, + overrides | Array _ResolutionEntry | default = [], + allow_optional_gaps | Bool, +} in + +# --------------------------------------------------------------------------- +# Exports +# --------------------------------------------------------------------------- +{ + CapabilityKind = _capability_kind, + ExtensionCapability = _ExtensionCapability, + ExtensionDependency = _ExtensionDependency, + CompositionCondition = _composition_condition, + FormulaDep = _FormulaDep, + HealthGate = _HealthGate, + FormulaCompositionEntry = _FormulaCompositionEntry, + WorkspaceComposition = _WorkspaceComposition, + ResolutionStrategy = _resolution_strategy, + ResolutionEntry = _ResolutionEntry, + ResolutionPolicy = _ResolutionPolicy, +} diff --git a/schemas/lib/dag/defaults.ncl b/schemas/lib/dag/defaults.ncl new file mode 100644 index 0000000..33bd957 --- /dev/null +++ b/schemas/lib/dag/defaults.ncl @@ -0,0 +1,26 @@ +# schemas/lib/dag/defaults.ncl — DAG domain default values +# +# Pure default values — no contracts, no functions. +# Pattern follows schemas/lib/defaults.ncl. 
+# +# Consumers: schemas/lib/dag/main.ncl exposes these as dag.defaults.* +# schemas/config/dag/main.ncl imports them for runtime config + +{ + composition = { + max_parallel = 4, + default_on_error = 'Stop, + default_retries = 0, + health_check_interval_ms = 5000, + timeout_ms = 300000, + }, + resolution = { + strategy = 'Strict, + allow_optional_gaps = false, + overrides = [], + }, + events = { + emit_nats = true, + subject_prefix = "provisioning.dag", + }, +} diff --git a/schemas/lib/dag/main.ncl b/schemas/lib/dag/main.ncl new file mode 100644 index 0000000..3165f6a --- /dev/null +++ b/schemas/lib/dag/main.ncl @@ -0,0 +1,26 @@ +# schemas/lib/dag/main.ncl — DAG domain public API +# +# Re-exports all contracts and defaults from the dag/ subdomain. +# Registered in schemas/lib/main.ncl as: dag = import "./dag/main.ncl" +# Accessible as: provisioning.lib.dag.WorkspaceComposition etc. + +let c = import "./contracts.ncl" in +let d = import "./defaults.ncl" in + +{ + # Contracts — applied via | dag.WorkspaceComposition, | dag.ResolutionPolicy, etc. 
+ CapabilityKind = c.CapabilityKind, + ExtensionCapability = c.ExtensionCapability, + ExtensionDependency = c.ExtensionDependency, + CompositionCondition = c.CompositionCondition, + FormulaDep = c.FormulaDep, + HealthGate = c.HealthGate, + FormulaCompositionEntry = c.FormulaCompositionEntry, + WorkspaceComposition = c.WorkspaceComposition, + ResolutionStrategy = c.ResolutionStrategy, + ResolutionEntry = c.ResolutionEntry, + ResolutionPolicy = c.ResolutionPolicy, + + # Default values — used by config/dag/main.ncl and workspace-level overrides + defaults = d, +} diff --git a/schemas/lib/extension-metadata.ncl b/schemas/lib/extension-metadata.ncl index 02b12d3..7faca69 100644 --- a/schemas/lib/extension-metadata.ncl +++ b/schemas/lib/extension-metadata.ncl @@ -1,20 +1,29 @@ # Extension Metadata Schema - Type-safe extension definition # Defines metadata for each extension including dependencies and best practices # Used for DAG construction and extension initialization ordering +# +# Capability fields (provides/requires/conflicts_with) added additively with defaults. +# All existing metadata.ncl files continue to export without modification. +# The detect_conflicts reflection step reads .conflicts_with // [] — the // [] null-coalesce +# was already in place; once files are migrated these fields become active. 
+ +let dag = import "./dag/contracts.ncl" in -# Schema for extension metadata let ExtensionMetadataSchema = { - name | String, - version | String, - category | String, - description | String, - dependencies | Array String, - tags | Array String, - best_practices | Array String, + name | String, + version | String, + category | String | default = "", # optional in flat components/ structure + description | String, + dependencies | Array String | default = [], # legacy flat dependency list — kept + provides | Array dag.ExtensionCapability | default = [], # capability ids this extension satisfies + requires | Array dag.ExtensionDependency | default = [], # typed capability requirements + conflicts_with | Array String | default = [], # extension names this conflicts with + tags | Array String, + modes | Array String | default = ["taskserv"], # available deployment modes + best_practices | Array String | default = [], } in -# Export schema { schema = ExtensionMetadataSchema, } diff --git a/schemas/lib/formula.ncl b/schemas/lib/formula.ncl new file mode 100644 index 0000000..1c440cf --- /dev/null +++ b/schemas/lib/formula.ncl @@ -0,0 +1,130 @@ +# schemas/lib/formula.ncl — Workspace Formula DAG +# +# A Formula is a typed DAG that is simultaneously: +# - A validatable declaration (Nickel typecheck + referential integrity) +# - An executable pipeline (Orchestrator consumes the DAG via nickel export) +# - A governable artifact (on+re tracks state, gates, and audit) +# +# Usage: +# let f = import "schemas/lib/formula.ncl" in +# f.make_formula { id = "...", nodes = [...], ... } + +let ts = import "contracts.ncl" in + +let _dep_kind = [| 'Always, 'OnSuccess, 'OnFailure |] in +let _on_error = [| 'Stop, 'Continue, 'Retry |] in + +# Dependency from one FormulaNode to another (by node id) +let _FormulaDep = { + node_id | String, + kind | _dep_kind | default = 'OnSuccess, +} in + +# A node in the formula DAG. +# Exactly one of `taskserv` or `component` must be present. 
+# - taskserv: L2 nodes — legacy field, existing formulas unchanged +# - component: L3+ nodes — unified model, orchestrator uses component.mode to resolve +let _FormulaNode = std.contract.custom (fun label value => + let base = value | { + id | String, + taskserv | ts.TaskServDef | optional, + component | ts.ComponentDef | optional, + depends_on | Array _FormulaDep | default = [], + parallel | Bool | default = false, + on_error | _on_error | default = 'Stop, + max_retries | Number | default = 0, + } in + let has_taskserv = std.record.has_field "taskserv" base in + let has_component = std.record.has_field "component" base in + if has_taskserv && has_component then + std.contract.blame_with_message + "FormulaNode '%{base.id}': exactly one of 'taskserv' or 'component' must be present, not both" + label + else if (!has_taskserv) && (!has_component) then + std.contract.blame_with_message + "FormulaNode '%{base.id}': exactly one of 'taskserv' or 'component' must be present" + label + else + 'Ok base +) in + +# An explicit edge declaration (alternative to depends_on inside nodes) +let _FormulaEdge = { + from | String, + to | String, + kind | _dep_kind | default = 'OnSuccess, +} in + +# Base structure without cross-field validation +let _FormulaBase = { + id | String, + description | String, + provider | String, + server | String, + nodes | Array _FormulaNode, + edges | Array _FormulaEdge | default = [], + max_parallel | Number | default = 4, +} in + +# Contract: all node_id values in depends_on must reference an existing node id. +# Also validates edge endpoints. 
+let _Formula = std.contract.custom (fun label value => + let base = value | _FormulaBase in + let node_ids = base.nodes |> std.array.map (fun n => n.id) in + + # Check: duplicate node ids + let dup_ids = node_ids |> std.array.fold_left (fun acc id => + if std.record.has_field id acc.seen then + { seen = acc.seen, dups = acc.dups @ [id] } + else + { seen = acc.seen & { "%{id}" = true }, dups = acc.dups } + ) { seen = {}, dups = [] } in + + if std.array.length dup_ids.dups > 0 then + std.contract.blame_with_message + "Formula '%{base.id}': duplicate node ids: %{std.string.join ", " dup_ids.dups}" + label + else + + # Check: depends_on referential integrity + let bad_deps = base.nodes |> std.array.flat_map (fun node => + node.depends_on + |> std.array.filter (fun dep => + !(node_ids |> std.array.any (fun id => id == dep.node_id)) + ) + |> std.array.map (fun dep => + "node '%{node.id}' depends_on unknown '%{dep.node_id}'" + ) + ) in + + if std.array.length bad_deps > 0 then + std.contract.blame_with_message + "Formula '%{base.id}' has invalid depends_on: %{std.string.join ", " bad_deps}" + label + else + + # Check: edge referential integrity + let bad_edges = base.edges |> std.array.filter (fun e => + !(node_ids |> std.array.any (fun id => id == e.from)) + || !(node_ids |> std.array.any (fun id => id == e.to)) + ) |> std.array.map (fun e => "'%{e.from}' -> '%{e.to}'") in + + if std.array.length bad_edges > 0 then + std.contract.blame_with_message + "Formula '%{base.id}' has invalid edge endpoints: %{std.string.join ", " bad_edges}" + label + else + 'Ok base +) in + +{ + FormulaDep = _FormulaDep, + FormulaNode = _FormulaNode, + FormulaEdge = _FormulaEdge, + Formula = _Formula, + + make_dep = fun data => _FormulaDep & data, + make_node = fun data => data | _FormulaNode, + make_edge = fun data => _FormulaEdge & data, + make_formula = fun data => data | _Formula, +} diff --git a/schemas/lib/integration/cabling.ncl b/schemas/lib/integration/cabling.ncl new file mode 100644 
index 0000000..d87001d --- /dev/null +++ b/schemas/lib/integration/cabling.ncl @@ -0,0 +1,93 @@ +# schemas/lib/integration/cabling.ncl +# +# Cabling format: per-workspace, per-mode binding file that resolves +# domain context fields to their concrete sources. +# +# Location: infra//integrations/.ncl +# +# Each entry in `bindings` maps a dotted domain field path +# (e.g. "secret-delivery.registry_password") to a Resolver record. +# +# Resolver kinds (discriminated by the `kind` field, which must be a String): +# "sops" — decrypt field from a SOPS-encrypted file +# "component" — read field from a component's output record +# "literal" — static hardcoded value +# "env" — read from an environment variable at assembly time +# +# Usage: +# "secret-delivery.registry_password" = { kind = "sops", path = "secrets/zot.sops.yaml", key = "ZOT_HTPASSWD" }, +# "secret-delivery.registry_url" = { kind = "component", name = "zot", field = "registry_url" }, +# "event-emission.subject_prefix" = { kind = "literal", value = "ws.libre-wuji.build.lian-build" }, +# "compute.api_key" = { kind = "env", env_var = "HETZNER_API_KEY" }, + +let _valid_kinds = [| 'sops, 'component, 'literal, 'env |] in + +let _Resolver = + std.contract.custom (fun label value => + if !std.is_record value then + std.contract.blame_with_message "Resolver must be a record with a 'kind' field" label + else if !std.record.has_field "kind" value then + std.contract.blame_with_message "Resolver missing required field 'kind'" label + else + match { + "sops" => + if std.record.has_field "path" value && std.record.has_field "key" value then + 'Ok value + else + std.contract.blame_with_message + "Resolver kind='sops' requires fields: path (String), key (String)" + label, + "component" => + if std.record.has_field "name" value && std.record.has_field "field" value then + 'Ok value + else + std.contract.blame_with_message + "Resolver kind='component' requires fields: name (String), field (String)" + label, + "literal" => + if std.record.has_field 
"value" value then + 'Ok value + else + std.contract.blame_with_message + "Resolver kind='literal' requires field: value (any)" + label, + "env" => + if std.record.has_field "env_var" value then + 'Ok value + else + std.contract.blame_with_message + "Resolver kind='env' requires field: env_var (String)" + label, + _ => + std.contract.blame_with_message + "Unknown Resolver kind '%{std.to_string value.kind}'. Valid (as String): sops, component, literal, env" + label, + } value.kind + ) in + +# Base shape for structural validation before cross-field checks. +let _CablingBase = { + mode_id | String + | doc "Integration mode id — e.g. 'lian-build-provisioning'", + workspace | String + | doc "Workspace identifier — e.g. 'libre-wuji'", + bindings | { _ | _Resolver } + | doc "Map of '.' to a Resolver", +} in + +# Full Cabling contract: structural + non-empty bindings. +let _Cabling = + std.contract.custom (fun label value => + let validated = value | _CablingBase in + if std.record.length validated.bindings == 0 then + std.contract.blame_with_message + "Cabling '%{validated.mode_id}': bindings must be non-empty" + label + else + 'Ok validated + ) in + +{ + Resolver = _Resolver, + Cabling = _Cabling, +} diff --git a/schemas/lib/integration/oci_artifact_format.ncl b/schemas/lib/integration/oci_artifact_format.ncl new file mode 100644 index 0000000..bddc401 --- /dev/null +++ b/schemas/lib/integration/oci_artifact_format.ncl @@ -0,0 +1,98 @@ +# schemas/lib/integration/oci_artifact_format.ncl +# +# OCI artifact descriptors for the federated integration-modes protocol. 
+# Two artifact kinds: +# DomainArtifact — typed contract pushed by the domain owner +# ModeArtifact — integration mode manifest pushed by the participant +# +# Also exports: +# Invocation — how a mode step binary is invoked +# DomainLock — per-workspace lock file written after `prvng integration pull` + +let _binary_source = [| 'path_assumed, 'cargo_install, 'oci_blob |] in + +let _invocation_method = [| 'stdin_context, 'argv_context_file |] in + +# How a mode step binary is resolved and invoked. +let _Invocation = { + method | _invocation_method + | doc "stdin_context: JSON piped to stdin; argv_context_file: path written to a temp file, passed as $1", + binary | { + source | _binary_source, + name | String, + version | String | optional, + cargo_crate | String | optional + | doc "Required when source = 'cargo_install", + oci_layer | String | optional + | doc "OCI blob reference when source = 'oci_blob — e.g. reg.librecloud.online/binaries/lian-build:0.3.0", + }, + args | Array String | default = [], + env | { _ | String } | default = {}, +} in + +# A single OCI layer descriptor inside an artifact manifest. +let _LayerDescriptor = { + media_type | String, + description | String, + required | Bool | default = true, +} in + +# DomainArtifact — pushed to reg.librecloud.online/domains/: +# mediaType: application/vnd.ontoref.domain.v1 +let _DomainArtifact = { + media_type | String + | default = "application/vnd.ontoref.domain.v1", + id | String + | doc "Stable domain identifier, e.g. 'secret-delivery'", + version | String + | doc "Semver of the domain contract", + description | String, + layers | Array _LayerDescriptor + | doc "Expected layers in the OCI image. 'contract.ncl' layer is always required.", + # ADR-017 G2 — explicit dependency declaration. References a RegistryEntry.id + # in the consuming project's manifest.registry_provides.registries[]. Enables + # impact analysis on `ore secrets close`: which artifacts are affected by a + # credential change. 
Empty = artifact does not consume registry credentials. + uses_registry | String | optional + | doc "RegistryEntry.id this artifact's runtime depends on", +} in + +# ModeArtifact — pushed to reg.librecloud.online/modes/: +# mediaType: application/vnd.ontoref.mode.v1 +let _ModeArtifact = { + media_type | String + | default = "application/vnd.ontoref.mode.v1", + id | String, + version | String, + description | String, + participant | String + | doc "Originating project/workspace that owns this mode", + layers | Array _LayerDescriptor, + uses_registry | String | optional + | doc "RegistryEntry.id this mode's runtime depends on (ADR-017 G2)", +} in + +# Written to infra//integrations/.lock.ncl after successful pull. +# Keyed by domain id, records the resolved version + digest for reproducibility. +let _DomainLockEntry = { + version | String, + digest | String + | doc "OCI manifest digest, sha256:...", + pulled_at | String + | doc "ISO-8601 timestamp", + media_type | String, +} in + +let _DomainLock = { + schema_version | String | default = "0.1.0", + domains | { _ | _DomainLockEntry }, +} in + +{ + Invocation = _Invocation, + DomainArtifact = _DomainArtifact, + ModeArtifact = _ModeArtifact, + DomainLockEntry = _DomainLockEntry, + DomainLock = _DomainLock, + LayerDescriptor = _LayerDescriptor, +} diff --git a/schemas/lib/integration_mode_manifest.ncl b/schemas/lib/integration_mode_manifest.ncl new file mode 100644 index 0000000..09c170a --- /dev/null +++ b/schemas/lib/integration_mode_manifest.ncl @@ -0,0 +1,122 @@ +# schemas/lib/integration_mode_manifest.ncl +# +# Integration Mode manifest schema for the federated integration-modes protocol. +# Each participant project declares an IntegrationMode in its own reflection/modes/. +# +# Invariants enforced at contract evaluation time: +# 1. kind must be 'integration (not 'standard — prevents mode files landing in wrong catalog) +# 2. domains_used must be non-empty (every integration mode must declare its domain deps) +# 3. 
direction='bidirectional requires at least one step with id starting "report-" +# 4. direction='event_emitter requires at least one step with id starting "emit-" +# 5. All step depends_on references resolve to existing step ids (inherited from ontoref pattern) +# +# Embedding rationale: ontoref v0.1.0 has no domain command group and no OCI surface. +# This schema is a local embedded subset; upstreaming is deferred per ADR-042. + +let oci = import "./integration/oci_artifact_format.ncl" in + +let _direction = [| 'inbound, 'outbound, 'bidirectional, 'event_emitter |] in + +# Typed reference to a domain artifact in the OCI registry. +let _DomainRef = { + id | String + | doc "Domain identifier — must match the id in the DomainArtifact pushed to the registry", + version | String + | doc "Semver constraint, e.g. '>=0.1.0, <0.2.0'", + registry | String | optional + | doc "Override registry base; defaults to reg.librecloud.online/domains", +} in + +let _Dependency = { + step | String, +} in + +let _OnError = { + strategy | [| 'Stop, 'Continue, 'Retry |] | default = 'Stop, +} in + +# A single step in an integration mode. Extends ontoref _ActionStep with an +# optional invocation descriptor (absent for manual/human steps). +let _IntegrationStep = { + id | String, + action | String, + depends_on | Array _Dependency | default = [], + actor | [| 'Human, 'Agent, 'Both |] | default = 'Agent, + invocation | oci.Invocation | optional + | doc "How to invoke the step binary. Absent for human-only steps.", + on_error | _OnError | default = { strategy = 'Stop }, + verify | String | optional, + note | String | optional, +} in + +# Base shape validated before cross-field checks. +let _IntegrationModeBase = { + id | String, + kind | [| 'integration |], + direction | _direction, + trigger | String, + participant | String + | doc "Project/workspace that owns this mode — e.g. 
'lian-build'", + domains_used | Array _DomainRef, + steps | Array _IntegrationStep, + preconditions | Array String | default = [], + postconditions | Array String | default = [], + description | String | optional, +} in + +# Full contract: structural + cross-field invariants. +let _IntegrationMode = + std.contract.custom (fun label value => + let validated = value | _IntegrationModeBase in + let steps = validated.steps in + let ids = steps |> std.array.map (fun s => s.id) in + + let bad_refs = steps |> std.array.flat_map (fun step => + step.depends_on + |> std.array.filter (fun dep => + !(ids |> std.array.any (fun i => i == dep.step)) + ) + |> std.array.map (fun dep => + "step '%{step.id}' depends_on unknown '%{dep.step}'" + ) + ) in + + # Uniqueness accumulator — folds to a record of seen ids, blames on duplicate. + let unique_acc = ids |> std.array.fold_left (fun acc id => + if std.record.has_field id acc.seen then + std.contract.blame_with_message + "IntegrationMode '%{validated.id}': duplicate step id '%{id}'" + label + else + { seen = acc.seen & { "%{id}" = true }, ok = true } + ) { seen = {}, ok = true } in + + if std.array.length validated.domains_used == 0 then + std.contract.blame_with_message + "IntegrationMode '%{validated.id}': domains_used must be non-empty — declare every domain this mode depends on" + label + else if validated.direction == 'bidirectional + && !(ids |> std.array.any (fun i => std.string.is_match "^report-" i)) then + std.contract.blame_with_message + "IntegrationMode '%{validated.id}' direction=bidirectional: requires at least one step with id starting 'report-'" + label + else if validated.direction == 'event_emitter + && !(ids |> std.array.any (fun i => std.string.is_match "^emit-" i)) then + std.contract.blame_with_message + "IntegrationMode '%{validated.id}' direction=event_emitter: requires at least one step with id starting 'emit-'" + label + else if std.array.length bad_refs > 0 then + std.contract.blame_with_message + 
"IntegrationMode '%{validated.id}' has invalid depends_on: %{std.string.join ", " bad_refs}" + label + else + # Nickel is lazy: an unused `let _ = unique_acc` binding is never evaluated, so the + # duplicate-id fold must be forced explicitly with std.seq before returning 'Ok. + std.seq unique_acc ('Ok validated) + ) in + +{ + DomainRef = _DomainRef, + IntegrationStep = _IntegrationStep, + IntegrationMode = _IntegrationMode, +} diff --git a/schemas/lib/keeper_policy.ncl b/schemas/lib/keeper_policy.ncl new file mode 100644 index 0000000..a325011 --- /dev/null +++ b/schemas/lib/keeper_policy.ncl @@ -0,0 +1,51 @@ +# schemas/lib/keeper_policy.ncl — Keeper auto-sign policy schema (ADR-038) +# +# Declarative-only closed shape parsed by the keeper-daemon Rust matcher. +# Policy files (policy-/policy.ncl) MUST conform to PolicyDef and +# MUST NOT contain Nickel function definitions or imports beyond this schema. +# Constraint: policy-files-are-declarative-only (ADR-038). +# +# Usage: +# let kp = import "schemas/lib/keeper_policy.ncl" in +# { policy | kp.PolicyDef = { auto_sign = [...], require_manual = [...] } } + +# Op type wildcard contract — superset of ops_contract.ncl OpsType that also accepts "*" +let OpTypeOrAny = + std.contract.custom ( + fun label => + fun value => + let valid = ["deploy", "scale", "restart", "secret_update", "drain", "*"] in + if std.array.any (fun x => x == value) valid then + 'Ok value + else + 'Error { + message = "Invalid op_type '%{value}'.\nValid values: deploy | scale | restart | secret_update | drain | *" + } + ) +in + +# A single match rule — all fields are glob patterns applied by the Rust matcher. +# Absent / defaulted-to-"*" field means "match any value for this dimension". +# The matcher evaluates rules top-to-bottom; first matching rule wins. 
+let _MatchRule = {
+  op_type | OpTypeOrAny | doc "Op type this rule applies to; '*' matches any op type" | default = "*",
+  image_patterns | Array String | doc "Glob patterns matched against OCI image reference in the op payload (deploy ops only)" | default = ["*"],
+  target_patterns | Array String | doc "Glob patterns matched against the op target name (e.g., 'staging-*', 'vapora')" | default = ["*"],
+  scope_patterns | Array String | doc "Glob patterns matched against JWT scope entries (:)" | default = ["*"],
+} in
+
+# Top-level policy file schema. Evaluation order: auto_sign rules checked first (top-to-bottom),
+# then require_manual. If no rule matches, the op is held pending for manual review.
+let _PolicyDef = {
+  version | Number | doc "Schema version — keeper-daemon rejects files with unknown versions" | default = 1,
+  auto_sign | Array _MatchRule | doc "Rules for operations the keeper-daemon may sign automatically" | default = [],
+  require_manual | Array _MatchRule | doc "Rules for operations that must be signed interactively via keeper-cli" | default = [],
+} in
+
+# Alias used only by the export record below. Record literals are recursive in
+# Nickel, so `{ OpTypeOrAny = OpTypeOrAny }` would make the field refer to itself
+# (infinite recursion when forced) instead of to the let-bound contract.
+let _OpTypeOrAny = OpTypeOrAny in
+
+# Public surface of this library.
+{
+  OpTypeOrAny = _OpTypeOrAny,
+  MatchRule = _MatchRule,
+  PolicyDef = _PolicyDef,
+
+  # Applies the PolicyDef contract to `data`. `not_exported` keeps this function
+  # out of serialised output.
+  make_policy | not_exported = fun data => data | _PolicyDef,
+}
diff --git a/schemas/lib/main.ncl b/schemas/lib/main.ncl
index eca3312..943c63d 100644
--- a/schemas/lib/main.ncl
+++ b/schemas/lib/main.ncl
@@ -3,7 +3,8 @@
 # | Pattern: Hybrid - defaults + makers + direct access (contracts available via import)
 
 let contracts_lib = import "./contracts.ncl" in
-let defaults_lib = import "./defaults.ncl" in
+let defaults_lib = import "./defaults.ncl" in
+let dag_lib = import "./dag/main.ncl" in
 
 {
   # ============================================================================
@@ -61,4 +62,7 @@ let defaults_lib = import "./defaults.ncl" in
   DefaultClusterDef = defaults_lib.cluster_def,
   DefaultScaleData = defaults_lib.scale_data,
   DefaultScaleResource = defaults_lib.scale_resource,
+
+  # DAG schema domain — accessible 
as provisioning.lib.dag.*
+  dag = dag_lib,
 }
diff --git a/schemas/lib/manifest_plan.ncl b/schemas/lib/manifest_plan.ncl
new file mode 100644
index 0000000..9629bec
--- /dev/null
+++ b/schemas/lib/manifest_plan.ncl
@@ -0,0 +1,53 @@
+{
+  # Action name for an entry or hook; accepts an enum tag or a string.
+  ManifestAction = std.enum.TagOrString,
+
+  # Hook attached to a manifest entry (used by `pre` / `post` below).
+  StepHook = {
+    action | ManifestAction,
+    params | { _ | String } | default = {},
+    delay | Number | default = 0,
+  },
+
+  # One entry of a plan phase. `file` is optional; `action` defaults to 'apply.
+  ManifestEntry = {
+    file | String | optional,
+    action | ManifestAction | default = 'apply,
+    skip_if_exists | Bool | default = false,
+    delay | Number | default = 0,
+    params | { _ | String } | default = {},
+    pre | Array StepHook | default = [],
+    post | Array StepHook | default = [],
+  },
+
+  # Custom contract for a plan with four phases (init, update, delete, restart).
+  # It rejects a plan when an entry in update, delete or restart has `file` equal
+  # to one of the `protected` names and an action of delete or recreate.
+  # NOTE(review): `init` entries are not scanned for violations — presumably
+  # intentional; confirm.
+  _ManifestPlanSafe = std.contract.custom (fun label value =>
+    # Structural check first; absent phases default to [].
+    let base = value | {
+      init | Array ManifestEntry | default = [],
+      update | Array ManifestEntry | default = [],
+      delete | Array ManifestEntry | default = [],
+      restart | Array ManifestEntry | default = [],
+    } in
+    let protected = ["namespace", "pvc"] in
+    # Compares against both the enum tag and the string spelling of each action.
+    let is_destructive = fun a =>
+      a == 'delete || a == "delete" || a == 'recreate || a == "recreate"
+    in
+    # Returns one "<op>:<file>" string per offending entry in `steps`.
+    let violations = fun op steps =>
+      steps
+      |> std.array.filter (fun e =>
+        std.record.has_field "file" e
+        && std.array.elem e.file protected
+        && is_destructive e.action
+      )
+      |> std.array.map (fun e => "%{op}:%{e.file}")
+    in
+    let all_violations =
+      violations "update" base.update
+      @ violations "delete" base.delete
+      @ violations "restart" base.restart
+    in
+    if std.array.length all_violations > 0 then
+      let msg = std.string.join ", " all_violations in
+      'Error { message = "ManifestPlan: protected resources cannot use delete/recreate — [%{msg}]" }
+    else
+      'Ok base
+  ),
+
+  # Public name for the contract above.
+  ManifestPlan = _ManifestPlanSafe,
+}
diff --git a/schemas/lib/op.ncl b/schemas/lib/op.ncl
new file mode 100644
index 0000000..e954f20
--- /dev/null
+++ b/schemas/lib/op.ncl
@@ -0,0 +1,113 @@
+# schemas/lib/op.ncl — Op (Operation) governance contracts
+#
+# An Op is the atomic unit of 
workspace state management — it records intent,
+# authorization, execution artifacts, and state transitions as a DAG node,
+# enabling audit, rollback, and concurrent agent control.
+#
+# Storage (per op):
+#   ops/{id}/op.json — runtime instance record (JSON, not NCL)
+#   ops/{id}/pre.json — pre-execution state snapshot
+#   ops/{id}/post.json — post-execution state snapshot (absent on failure)
+#   .ops-archive/ — restic repo (S3 backend): logs + bundles, encrypted
+#
+# Identity:
+#   actor.nid = Radicle Node ID (rad self --nid). Falls back to "local:{user}".
+#   Op ID = {nid-short}:{uuid} — globally attributable, DAG-safe
+#
+# DAG semantics:
+#   Each Op is a node. lineage.parent_op is the incoming edge.
+#   Rollback is a new forward Op — lineage.rollback_of points to the Op being undone.
+#   The DAG remains acyclic; rollback is a forward move restoring an earlier snapshot.
+
+# Who issued the op. `source` defaults to 'cli.
+let _OpActor = {
+  nid | String | doc "Radicle Node ID (rad self --nid) or 'local:{user}' fallback",
+  identity | String | doc "Human-readable label — username or agent name",
+  source | [| 'cli, 'agent, 'api |] | default = 'cli,
+} in
+
+# A pre-execution gate (used by _OpConstraints.pre). `params` is an open record.
+let _Constraint = {
+  kind | [| 'backup, 'restore, 'health_check, 'dry_run_check, 'concurrent_lock |],
+  resource | String | doc "Component name or volume/resource identifier",
+  scope | [| 'direct, 'indirect |] | default = 'direct,
+  params | { .. } | default = {},
+} in
+
+# A recovery step (used by _OpConstraints.on_failure). Accepts the same `kind`
+# tags as _Constraint, plus `from` to select the restore source.
+let _RecoveryAction = {
+  kind | [| 'backup, 'restore, 'health_check, 'dry_run_check, 'concurrent_lock |],
+  resource | String,
+  from | [| 'pre_backup, 'last_known_good |] | default = 'pre_backup,
+  params | { .. } | default = {},
+} in
+
+# Gates and recovery actions attached to an op; both lists default to empty.
+let _OpConstraints = {
+  pre | Array _Constraint | doc "Gates evaluated before execution starts" | default = [],
+  on_failure | Array _RecoveryAction | doc "Recovery actions if op fails" | default = [],
+} in
+
+# Paths of the state snapshots; `pre` is required, `post` is optional.
+let _OpSnapshots = {
+  pre | String | doc "Relative path to pre.json from workspace root",
+  post | String | optional | doc "Relative path to post.json — absent if op failed before completion",
+} in
+
+# Archive references produced by the op.
+let _OpArtifacts = {
+  archive_snapshot | String | optional | doc "Snapshot ID in the configured archive backend (restic/kopia)",
+  bundles | Array String | doc "Bundle tar.gz paths within the archive snapshot" | default = [],
+} in
+
+# DAG edges of the op; both fields are optional.
+let _OpLineage = {
+  parent_op | String | optional | doc "Op ID this state was derived from (incoming DAG edge)",
+  rollback_of | String | optional | doc "If this op is a rollback, the ID of the op it undoes",
+} in
+
+{
+  OpActor = _OpActor,
+  Constraint = _Constraint,
+  RecoveryAction = _RecoveryAction,
+  OpConstraints = _OpConstraints,
+  OpSnapshots = _OpSnapshots,
+  OpArtifacts = _OpArtifacts,
+  OpLineage = _OpLineage,
+
+  # Enum type exports — use these as field annotation values in workspace NCL
+  OpSource = [| 'cli, 'agent, 'api |],
+  OpOperation = [| 'install, 'update, 'delete, 'rollback, 'dry_run |],
+  OpStatus = [| 'pending, 'running, 'constraint_failed, 'recovering, 'success, 'failed, 'rolled_back, 'cancelled |],
+
+  # Full op record. The `operation` and `status` enums repeat the tag lists of
+  # OpOperation and OpStatus above.
+  Op = {
+    id | String | doc "Op ID: {nid-short}:{uuid}",
+    actor | _OpActor,
+    intent | String | doc "Human description of why this op is needed",
+    workspace | String | doc "Workspace name from config/provisioning.ncl",
+    component | String | doc "Component being operated on",
+    operation | [| 'install, 'update, 'delete, 'rollback, 'dry_run |],
+    targets | Array String | doc "Server hostnames targeted by this op",
+
+    constraints | _OpConstraints | default = { pre = [], on_failure = [] },
+    snapshots | _OpSnapshots,
+    artifacts | _OpArtifacts | default = { bundles = [] },
+    lineage | _OpLineage | default = {},
+
+    status | [| 'pending, 'running, 'constraint_failed, 'recovering, 'success, 'failed, 'rolled_back, 'cancelled |] | default = 'pending,
+    jj_change | String | optional | doc "jj change ID created for this op",
+    radicle_rid | String | optional | doc "Radicle Repository ID after rad sync — globally unique RID of this workspace",
+
+    started_at | String | doc "ISO 8601 UTC timestamp",
+    ended_at | String | optional,
+  },
+
+  # Workspace-level ops configuration — added under `ops` in config/provisioning.ncl
+  OpsConfig = {
+    # Where op archives are stored and which tool writes them.
+    archive = {
+      backend | [| 's3, 'local |] | default = 's3,
+      tool | [| 'restic, 'kopia |] | doc "Backup provider — must have a matching entry in extensions/providers/backup/" | default = 'restic,
+      endpoint | String | optional | doc "S3-compatible endpoint URL",
+      bucket | String | optional,
+      prefix | String | default = "ops",
+    },
+    # Retention counts; defaults are 50 / 12 / 3.
+    retention = {
+      keep_last | Number | default = 50,
+      keep_monthly | Number | default = 12,
+      keep_yearly | Number | default = 3,
+    },
+  },
+}
diff --git a/schemas/lib/ops_contract.ncl b/schemas/lib/ops_contract.ncl
new file mode 100644
index 0000000..66077f1
--- /dev/null
+++ b/schemas/lib/ops_contract.ncl
@@ -0,0 +1,117 @@
+# schemas/lib/ops_contract.ncl — Ops contract (ADR-037)
+# NATS JetStream subject namespaces, JWT signed command structure,
+# stream configuration, and workspace ops contract definition.
+
+# Contract accepting exactly the five op-type strings listed in `valid`.
+# Bound as `_OpsType` rather than `OpsType` so that the export record at the
+# bottom can expose it as `OpsType`. Record literals are recursive in Nickel,
+# so `{ OpsType = OpsType }` would make the field refer to itself and loop.
+let _OpsType =
+  std.contract.custom (
+    fun label =>
+      fun value =>
+        let valid = ["deploy", "scale", "restart", "secret_update", "drain"] in
+        if std.array.any (fun x => x == value) valid then
+          'Ok value
+        else
+          'Error {
+            message = "Invalid op_type '%{value}'.\nValid values: deploy | scale | restart | secret_update | drain"
+          }
+  )
+in
+
+let _StreamRetention = [| 'WorkQueue, 'Limits, 'Interest |] in
+
+# One (op_type, target_pattern) pair of a signer's scope.
+let _ScopeEntry = {
+  op_type | _OpsType,
+  target_pattern | String | doc "Glob pattern for allowed op targets (e.g., 'staging-*', 'vapora')",
+} in
+
+# Claims carried by a signed command JWT.
+let _JwtClaims = {
+  iss | String | doc "Signer identity: keeper-vm-primary | operator- | gh-actions-",
+  sub | String | doc "Requesting principal: woodpecker-job- | manual-",
+  aud | String | doc "Target workspace name",
+  scopes | Array _ScopeEntry | doc "Allowed (op_type, target_pattern) tuples scoped to this signer",
+  seq | Number | doc "Per-issuer monotonic counter — anti-replay",
+  jti | String | doc "UUIDv4 idempotency key",
+  expected_state_version | String | doc "Optimistic concurrency token — workspace state version this op read",
+  exp | Number | doc "Unix timestamp: token expiry",
+  nbf | Number | doc "Unix timestamp: token not-valid-before",
+} in
+
+# Configuration of one JetStream stream. `max_age_s` has no default.
+let _StreamConfig = {
+  name | String,
+  subjects | Array String,
+  retention | _StreamRetention | doc "JetStream retention policy" | default = 'WorkQueue,
+  max_age_s | Number | doc "Message TTL in seconds",
+  replicas | Number | doc "JetStream stream replica count" | default = 1,
+  max_bytes | Number | doc "Max stream storage in bytes (-1 = unlimited)" | default = -1,
+} in
+
+# Subject strings for the four ops channels of a workspace.
+let _OpsSubjects = {
+  pending | String | doc "ops.pending..> — unsigned proposals from emitters",
+  cmd | String | doc "ops.cmd..> — signed commands ready to apply",
+  ack | String | doc "ops.ack..> — application result from ops-controller",
+  audit | String | doc "ops.audit. — immutable audit stream",
+} in
+
+# Stream configurations for the pending, cmd and audit channels.
+let _OpsStreams = {
+  pending | _StreamConfig | doc "WorkQueue, 14d — buffers unsigned proposals",
+  cmd | _StreamConfig | doc "WorkQueue, 24h — signed commands awaiting application",
+  audit | _StreamConfig | doc "Limits, 90d, replicas=3 — immutable audit record",
+} in
+
+# Workspace-level ops contract — embed in workspace infra NCL as `ops_contract`
+let _OpsWorkspaceConfig = {
+  workspace | String,
+  subjects | _OpsSubjects,
+  streams | _OpsStreams,
+  authorized_signers | Array String | doc "Signer identity keys allowed to sign for this workspace" | default = [],
+} in
+
+{
+  OpsType = _OpsType,
+  StreamRetention = _StreamRetention,
+  ScopeEntry = _ScopeEntry,
+  JwtClaims = _JwtClaims,
+  StreamConfig = _StreamConfig,
+  OpsSubjects = _OpsSubjects,
+  OpsStreams = _OpsStreams,
+  OpsWorkspaceConfig = _OpsWorkspaceConfig,
+
+  # Constructs a full OpsWorkspaceConfig from a workspace name.
+  # Stream names follow ADR-037 convention: OPS_{STREAM}_{workspace}
+  # (workspace name is used verbatim; uppercase normalisation is ops-controller's concern).
+  # Takes the workspace name (a string) and returns a record with `workspace`,
+  # `subjects`, `streams` and an empty `authorized_signers` list.
+  make_ops_config | not_exported = fun workspace =>
+    # The argument is rebound as `ws` before the record is built. Record literals
+    # are recursive in Nickel: inside `{ workspace = workspace, ... }` every use
+    # of `workspace`, including `%{workspace}` in nested strings, resolves to the
+    # field being defined and loops forever.
+    let ws = workspace in
+    {
+      workspace = ws,
+      subjects = {
+        pending = "ops.pending.%{ws}.>",
+        cmd = "ops.cmd.%{ws}.>",
+        ack = "ops.ack.%{ws}.>",
+        audit = "ops.audit.%{ws}",
+      },
+      streams = {
+        # 1209600 s = 14 days
+        pending = {
+          name = "OPS_PENDING_%{ws}",
+          subjects = ["ops.pending.%{ws}.>"],
+          retention = 'WorkQueue,
+          max_age_s = 1209600,
+          replicas = 1,
+          max_bytes = -1,
+        },
+        # 86400 s = 24 hours
+        cmd = {
+          name = "OPS_CMD_%{ws}",
+          subjects = ["ops.cmd.%{ws}.>"],
+          retention = 'WorkQueue,
+          max_age_s = 86400,
+          replicas = 1,
+          max_bytes = -1,
+        },
+        # 7776000 s = 90 days
+        audit = {
+          name = "OPS_AUDIT_%{ws}",
+          subjects = ["ops.audit.%{ws}"],
+          retention = 'Limits,
+          max_age_s = 7776000,
+          replicas = 3,
+          max_bytes = -1,
+        },
+      },
+      authorized_signers = [],
+    },
+}
diff --git a/schemas/lib/playbook.ncl b/schemas/lib/playbook.ncl
new file mode 100644
index 0000000..5813d21
--- /dev/null
+++ b/schemas/lib/playbook.ncl
@@ -0,0 +1,62 @@
+# schemas/lib/playbook.ncl — PlaybookDef and PlaybookStep contracts
+#
+# Every playbook in extensions/playbooks/<id>/playbook.ncl validates against this schema.
+# validate-playbooks reflection mode (TASK-C6) checks:
+# - playbook.ncl conforms to PlaybookDef
+# - run.nu exists for each step that references it
+# - rollback.nu exists when rollback_strategy = 'automatic
+# - tests/dry_run.nu is checked with nu --ide-check when present
+#
+# Usage:
+# let pb = import "schemas/lib/playbook.ncl" in
+# { .. } | pb.PlaybookDef
+
+let _RollbackStrategy = [| 'automatic, 'manual, 'none |] in
+
+let _StepErrorAction = [| 'Stop, 'Rollback, 'Continue |] in
+
+# A declared parameter the playbook accepts — forwarded as env vars to step scripts.
+# `required` defaults to true and `default_val` defaults to the empty string.
+let _ParamDef = {
+  name | String | doc "Parameter name (becomes env var: PLAYBOOK_PARAM_)",
+  description | String,
+  required | Bool | doc "When true, absence causes the runner to abort before any step" | default = true,
+  default_val | String | doc "Default value used when required = false and the caller omits the param" | default = "",
+} in
+
+# A single step in a playbook. Each step maps to a script relative to the playbook root.
+# `depends_on` is a plain list of step id strings. This record contract does not
+# check that those ids name steps of the same playbook.
+let _PlaybookStep = {
+  id | String | doc "Unique step identifier within this playbook; used in depends_on refs",
+  name | String | doc "Human-readable step label shown in dry-run output",
+  script | String | doc "Path to the Nushell step script relative to the playbook directory (e.g., 'run.nu', 'steps/deploy.nu')",
+  dry_run_arg | String | doc "Flag appended to script invocation when running in dry-run mode" | default = "--dry-run",
+  params | { _ | String } | doc "Static key-value params forwarded to the step script as env vars; caller params overlay these" | default = {},
+  on_error | _StepErrorAction | doc "Action taken when this step exits non-zero" | default = 'Stop,
+  depends_on | Array String | doc "Step IDs that must complete successfully before this step runs" | default = [],
+} in
+
+# The full playbook declaration. Consumed by 'prvng playbook run ' and the
+# validate-playbooks reflection mode.
+# Required fields: id, name, description, steps. Everything else has a default.
+# `version` is typed as Number only; the "must be 1" rule in its doc string is
+# not enforced by this contract.
+let _PlaybookDef = {
+  id | String | doc "Machine-readable playbook identifier matching the directory name (e.g., 'bootstrap_initial')",
+  name | String | doc "Human-readable playbook title",
+  description | String,
+  version | Number | doc "Schema version — must be 1" | default = 1,
+  preconditions | Array String | doc "Human-readable preconditions the operator must verify before running; printed in dry-run output" | default = [],
+  params | Array _ParamDef | doc "Declared parameters; absent required params abort before step 1" | default = [],
+  steps | Array _PlaybookStep | doc "Ordered step declarations; topological sort applied using depends_on",
+  rollback_strategy | _RollbackStrategy | doc "automatic: rollback.nu is invoked on any step failure; manual: operator handles; none: no rollback path" | default = 'none,
+  success_criteria | Array String | doc "Human-readable criteria printed after a successful run to help the operator verify the outcome" | default = [],
+  emit_audit | Bool | doc "When true, playbook runner emits ops.audit events at step start and completion" | default = false,
+  adr_refs | Array String | doc "ADR IDs this playbook implements (e.g., 'adr-037', 'adr-039')" | default = [],
+} in
+
+# Public surface: the contracts above plus two constructor helpers.
+{
+  RollbackStrategy = _RollbackStrategy,
+  StepErrorAction = _StepErrorAction,
+  ParamDef = _ParamDef,
+  PlaybookStep = _PlaybookStep,
+  PlaybookDef = _PlaybookDef,
+
+  # Each helper applies the corresponding contract to `data`; `not_exported`
+  # keeps the functions out of serialised output.
+  make_step | not_exported = fun data => data | _PlaybookStep,
+  make_playbook | not_exported = fun data => data | _PlaybookDef,
+}
diff --git a/schemas/lib/radicle.ncl b/schemas/lib/radicle.ncl
new file mode 100644
index 0000000..4c313bf
--- /dev/null
+++ b/schemas/lib/radicle.ncl
@@ -0,0 +1,91 @@
+# schemas/lib/radicle.ncl — Radicle Heartwood governance substrate types (ADR-038)
+#
+# Three repo families per workspace: policy, desired, state — each with a distinct
+# delegation profile. Used by the audit-mirror crate and governance domain commands.
+#
+# Usage:
+# let rad = import "schemas/lib/radicle.ncl" in
+# { repos | rad.WorkspaceRepos = rad.make_workspace_repos "libre-wuji" & { ... } }
+
+# Role of a repo within the three-repo split.
+let _RepoRole = [| 'policy, 'desired, 'state |] in
+
+# Lifecycle state of a patch.
+let _PatchStatus = [| 'open, 'merged, 'rejected |] in
+
+# M-of-N delegation profile attached to a Radicle repo.
+# threshold <= length(signers) is a business invariant enforced by the Rust caller.
+# This contract only checks the field types.
+let _DelegationProfile = {
+  threshold | Number | doc "Minimum signatures required to merge a patch (M in M-of-N)",
+  signers | Array String | doc "Key IDs of authorized delegates (Radicle DID or human-readable alias)",
+} in
+
+# A Radicle repo descriptor: RID + role + delegation profile.
+# rid is empty string until the repo is initialised via 'rad init'.
+let _RadicleRepo = {
+  name | String | doc "Human-readable name (e.g., 'policy-libre-wuji')",
+  rid | String | doc "Radicle Identifier assigned by 'rad init' (rad:...); empty before init" | default = "",
+  role | _RepoRole | doc "Functional role in the three-repo split",
+  delegates | _DelegationProfile | doc "M-of-N delegation profile for patches to this repo",
+} in
+
+# A proposed change patch — governance domain commands surface these for operator review.
+# `status` defaults to 'open and `signatures` to an empty list.
+let _Patch = {
+  id | String | doc "Radicle patch ID",
+  proposed_by | String | doc "Key ID or alias of the patch author",
+  status | _PatchStatus | doc "Current lifecycle state" | default = 'open,
+  signatures | Array String | doc "Key IDs that have signed this patch" | default = [],
+  payload | String | doc "Short human-readable description of what this patch changes",
+} in
+
+# Snapshot of signature satisfaction for a pending patch.
+# `satisfied` is a plain Bool supplied by whoever builds the record; this
+# contract does not recompute it from `present` and `required`.
+let _SignatureSet = {
+  required | Number | doc "Threshold from the repo's DelegationProfile (M)",
+  present | Array String | doc "Key IDs that have already signed",
+  satisfied | Bool | doc "True when length(present) >= required",
+} in
+
+# The three repos belonging to one workspace — the canonical three-repo split.
+let _WorkspaceRepos = {
+  policy | _RadicleRepo | doc "policy-: keeper auto-sign policy + authorized-signers set; M-of-N operator delegates",
+  desired | _RadicleRepo | doc "-desired: version-controlled workspace declaration; M-of-N operators + CI keys",
+  state | _RadicleRepo | doc "-state: immutable applied-ops ledger; exactly one delegate (ops-controller key)",
+} in
+
+{
+  RepoRole = _RepoRole,
+  PatchStatus = _PatchStatus,
+  DelegationProfile = _DelegationProfile,
+  RadicleRepo = _RadicleRepo,
+  Patch = _Patch,
+  SignatureSet = _SignatureSet,
+  WorkspaceRepos = _WorkspaceRepos,
+
+  # Builds the policy / desired / state repo descriptors for one workspace.
+  # Each descriptor gets a fixed `name` and `role`, while `rid` (empty string)
+  # and `delegates` (threshold 1, no signers) are set at `| default` priority
+  # so that a later merge can replace them:
+  # (rad.make_workspace_repos "libre-wuji") & {
+  # policy.rid = "rad:abc",
+  # policy.delegates = { threshold = 2, signers = ["jpl-yubikey", "alice-key"] },
+  # state.rid = "rad:ghi",
+  # state.delegates = { threshold = 1, signers = ["ops-controller-wuji-key"] },
+  # }
+  # Alternatively, use `{ ... } | rad.WorkspaceRepos` directly with all fields populated.
+  make_workspace_repos | not_exported = fun workspace =>
+    # Shared template for the three descriptors; only name and role differ.
+    let placeholder_repo = fun repo_name repo_role => {
+      name = repo_name,
+      rid | default = "",
+      role = repo_role,
+      delegates | default = { threshold = 1, signers = [] },
+    } in
+    {
+      policy = placeholder_repo "policy-%{workspace}" 'policy,
+      desired = placeholder_repo "%{workspace}-desired" 'desired,
+      state = placeholder_repo "%{workspace}-state" 'state,
+    },
+}
diff --git a/schemas/lib/scheduler/scheduler.ncl b/schemas/lib/scheduler/scheduler.ncl
new file mode 100644
index 0000000..7427b90
--- /dev/null
+++ b/schemas/lib/scheduler/scheduler.ncl
@@ -0,0 +1,148 @@
+# Generic scheduler helper — produces a scheduling artefact for any of the
+# four runtime targets (K8s CronJob, systemd timer, cron.d entry, daemon
+# task registration). Not coupled to backup nor to Kubernetes; any task in
+# the repo that needs to be scheduled can build on top of this.
+#
+# Example:
+# let s = (import "scheduler.ncl").make_schedule {
+# name = "etcd-snapshot",
+# schedule_kind = 'cron, cron_expr = "0 */6 * * *",
+# target = { kind = 'systemd_timer, host_selector = "control_planes",
+# user = "root", unit_name = "prvng-etcd-snapshot" },
+# command = "/usr/local/bin/prvng-backup one-shot backup etcd-snapshot",
+# env = { …secret refs… },
+# } in s.systemd_units
+#
+# NOTE(review): the example target omits `after` and `persistent`, which
+# make_schedule reads for systemd timers. Presumably callers apply
+# SystemdTimerTarget to `target` first so the defaults below are filled in —
+# confirm.
+
+{
+  # === Target descriptors ===================================================
+
+  # Kubernetes CronJob target. `namespace` and `image` are required and
+  # `service_account` is optional; the remaining fields have defaults.
+  K8sCronJobTarget = {
+    kind | [| 'k8s_cronjob |],
+    namespace | String,
+    image | String,
+    image_pull_policy | [| 'IfNotPresent, 'Always, 'Never |] | default = 'IfNotPresent,
+    service_account | String | optional,
+    node_selector | { _ | String } | default = {},
+    restart_policy | [| 'OnFailure, 'Never |] | default = 'OnFailure,
+    successful_jobs_history_limit | Number | default = 3,
+    failed_jobs_history_limit | Number | default = 5,
+  },
+
+  # systemd timer target. `unit_name` and `host_selector` are required.
+  SystemdTimerTarget = {
+    kind | [| 'systemd_timer |],
+    unit_name | String,
+    host_selector | String | doc "Hostname pattern or role (e.g. 'control_planes')",
+    user | String | default = "root",
+    after | Array String | default = ["network-online.target"],
+    persistent | Bool | default = true,
+  },
+
+  # cron.d target. `file_name` and `host_selector` are required.
+  CronDTarget = {
+    kind | [| 'cron_d |],
+    file_name | String | doc "Filename under /etc/cron.d/",
+    host_selector | String,
+    user | String | default = "root",
+  },
+
+  # Daemon task target. `task_id` is required; the endpoint defaults to a unix socket.
+  DaemonTaskTarget = {
+    kind | [| 'daemon_task |],
+    task_id | String,
+    daemon_endpoint | String | default = "unix:///run/prvng-backup.sock",
+  },
+
+  # === Top-level builder ====================================================
+
+  # make_schedule returns a record with one populated branch out of:
+  # { manifests, systemd_units, cron_files, daemon_registrations }.
+  # Callers serialise the appropriate branch.
+  # Input: `spec` with `name`, `cron_expr`, `command`, `env` and `target`
+  # (`target.kind` selects the branch). The three non-matching branches are
+  # empty arrays. `spec` is read as given; no target contract is applied here.
+  make_schedule = fun spec =>
+    # Every input is rebound under a name that no field of the records below
+    # uses. Record literals are recursive in Nickel, so inside `{ name = name }`,
+    # `{ command = [.., command] }`, `{ env = env }`, or a manifest that has its
+    # own `spec` field, the right-hand side resolves to the field being defined
+    # rather than the outer binding. That gives infinite recursion, or a missing
+    # `target` field on the CronJob spec.
+    let target = spec.target in
+    let target_kind = target.kind in
+    let cron_expr = spec.cron_expr in
+    let job_name = spec.name in
+    let job_command = spec.command in
+    let job_env = spec.env in
+
+    {
+      manifests =
+        if target_kind == 'k8s_cronjob then
+          # service_account is optional on K8sCronJobTarget, so
+          # serviceAccountName is emitted only when it is present.
+          let service_account_field =
+            if std.record.has_field "service_account" target then
+              { serviceAccountName = target.service_account }
+            else
+              {}
+          in
+          # Converts the env record into a list of { name, value } pairs.
+          let container_env =
+            std.record.to_array job_env
+            |> std.array.map (fun e => { name = e.field, value = e.value })
+          in
+          [{
+            apiVersion = "batch/v1",
+            kind = "CronJob",
+            metadata = {
+              name = job_name,
+              namespace = target.namespace,
+            },
+            spec = {
+              schedule = cron_expr,
+              successfulJobsHistoryLimit = target.successful_jobs_history_limit,
+              failedJobsHistoryLimit = target.failed_jobs_history_limit,
+              jobTemplate.spec.template.spec = {
+                restartPolicy = std.string.from_enum target.restart_policy,
+                nodeSelector = target.node_selector,
+                containers = [{
+                  name = job_name,
+                  image = target.image,
+                  imagePullPolicy = std.string.from_enum target.image_pull_policy,
+                  command = ["/bin/sh", "-c", job_command],
+                  env = container_env,
+                }],
+              } & service_account_field,
+            },
+          }]
+        else [],
+
+      systemd_units =
+        if target_kind == 'systemd_timer then
+          [{
+            host_selector = target.host_selector,
+            unit_name = target.unit_name,
+            service_unit = m%"
+              [Unit]
+              Description=%{job_name}
+              After=%{std.string.join " " target.after}
+
+              [Service]
+              Type=oneshot
+              User=%{target.user}
+              ExecStart=%{job_command}
+              EnvironmentFile=-/etc/prvng-backup/%{job_name}.env
+            "%,
+            # NOTE(review): cron_expr is interpolated verbatim into OnCalendar=.
+            # systemd calendar-event syntax differs from 5-field cron, so
+            # presumably callers must pass a systemd-compatible expression for
+            # this target — confirm.
+            timer_unit = m%"
+              [Unit]
+              Description=Timer for %{job_name}
+
+              [Timer]
+              OnCalendar=%{cron_expr}
+              Persistent=%{if target.persistent then "true" else "false"}
+
+              [Install]
+              WantedBy=timers.target
+            "%,
+          }]
+        else [],
+
+      cron_files =
+        if target_kind == 'cron_d then
+          [{
+            host_selector = target.host_selector,
+            path = "/etc/cron.d/%{target.file_name}",
+            content = m%"
+              %{cron_expr} %{target.user} %{job_command}
+            "%,
+          }]
+        else [],
+
+      daemon_registrations =
+        if target_kind == 'daemon_task then
+          [{
+            task_id = target.task_id,
+            daemon_endpoint = target.daemon_endpoint,
+            schedule = cron_expr,
+            command = job_command,
+            env = job_env,
+          }]
+        else [],
+    },
+}
diff --git a/schemas/lib/storage_config.ncl b/schemas/lib/storage_config.ncl
new file mode 100644
index 0000000..076491a
--- /dev/null
+++ b/schemas/lib/storage_config.ncl
@@ -0,0 +1,52 @@
+# schemas/lib/storage_config.ncl — StorageConfig contracts
+#
+# Library file — import only, not directly exportable.
+#
+# Usage (component contracts.ncl):
+# let sc = import "schemas/lib/storage_config.ncl" in
+# requires | { storage | sc.StorageRequires | optional, ... }
+#
+# Usage (provider metadata.ncl or capabilities.ncl):
+# let sc = import "schemas/lib/storage_config.ncl" in
+# storage_policy | sc.ProviderStoragePolicy = sc.HetznerCSIPolicy
+
+{
+  VolumeMode = [| 'block, 'nfs, 'object |],
+
+  ExpansionPolicy = [| 'static, 'expand_only, 'full |],
+
+  # Contract for component requires.storage — what a component declares it needs.
+  StorageRequires = {
+    size | String,
+    persistent | Bool | default = true,
+    volume_mode | VolumeMode | default = 'block,
+    access_mode | String | default = "ReadWriteOnce",
+    storage_class | String | optional,
+  },
+
+  # Abstract contract for provider storage policies.
+  # Concrete policies (HetznerCSIPolicy, etc.) must supply all fields.
+  ProviderStoragePolicy = {
+    provider | String,
+    min_size | String | default = "1Gi",
+    max_size | String | optional,
+    expansion_policy | ExpansionPolicy | default = 'static,
+    volume_modes | Array VolumeMode | default = ['block],
+  },
+
+  # Hetzner hcloud-volumes: minimum 10Gi, expand-only (no shrink via CSI).
+  HetznerCSIPolicy | ProviderStoragePolicy = {
+    provider = "hcloud-volumes",
+    min_size = "10Gi",
+    expansion_policy = 'expand_only,
+    volume_modes = ['block],
+  },
+
+  # democratic-csi NFS: fine-grained sizing, full expand/shrink, RWX capable.
+  # Sets provider, min_size, expansion_policy and volume_modes explicitly; the
+  # optional max_size is left unset.
+  DemocraticCSINFSPolicy | ProviderStoragePolicy = {
+    provider = "democratic-csi-nfs",
+    min_size = "1Gi",
+    expansion_policy = 'full,
+    volume_modes = ['nfs],
+  },
+}
diff --git a/schemas/lib/system_backup.ncl b/schemas/lib/system_backup.ncl
new file mode 100644
index 0000000..09aa5b2
--- /dev/null
+++ b/schemas/lib/system_backup.ncl
@@ -0,0 +1,79 @@
+# System backup contracts — declarative description of how out-of-cluster
+# artefacts are backed up: etcd, k8s certs, host configs, external DNS,
+# builder environment, provisioning state itself, log archives, vault state.
+# Triggered by system cron / systemd timer / daemon coordinator.
+
+let bp = import "backup_policy.ncl" in
+let vault = import "vault_refs.ncl" in
+
+{
+  # Selector for the host(s) where the backup runs. Either an explicit list
+  # of hostnames, a control-plane role selector, or a single primary.
+  # `members` defaults to an empty list. This contract does not require it to
+  # be non-empty when kind = 'list.
+  HostSelector = {
+    kind | [| 'cp_only, 'cp_first, 'control_planes, 'workers, 'all_servers, 'list |],
+    members | Array String | doc "Hostnames when kind = 'list" | default = [],
+  },
+
+  # Discriminated target: what kind of off-cluster artefact is being captured.
+  # Single flat record covering every target kind. Only `kind` is required;
+  # each other field is optional or has an empty default. The section comments
+  # below say which kind each group belongs to, but the contract does not tie
+  # a group to its kind.
+  SystemBackupTarget = {
+    kind | [| 'etcd, 'k8s_certs, 'cluster_resources, 'longhorn_engine, 'host_configs,
+      'external_dns, 'builder_env, 'provisioning_state, 'logs_archive,
+      'sops_keys, 'vault_state |],
+
+    # 'etcd
+    endpoints | Array String | default = [],
+    ca_ref | vault.VaultCredRef | optional,
+    cert_ref | vault.VaultCredRef | optional,
+    key_ref | vault.VaultCredRef | optional,
+
+    # 'k8s_certs / 'host_configs / 'logs_archive (paths)
+    paths | Array String | default = [],
+    exclude | Array String | default = [],
+
+    # 'cluster_resources
+    namespaces | Array String | default = [],
+    kinds | Array String | default = [],
+
+    # 'longhorn_engine
+    components | Array String | default = [],
+
+    # 'external_dns
+    # NOTE(review): source_kind also lists 'loki, 'journald and 'files —
+    # presumably shared with 'logs_archive; confirm.
+    source_kind | [| 'coredns, 'powerdns, 'unbound, 'loki, 'journald, 'files |] | optional,
+    config_paths| Array String | default = [],
+    zones_paths | Array String | default = [],
+
+    # 'builder_env
+    tools | Array String | default = [],
+    secrets | Array String | doc "Secret names that must accompany the artefact" | default = [],
+
+    # 'provisioning_state
+    definitions_path | String | optional,
+    state_path | String | optional,
+    lock_path | String | optional,
+
+    # 'logs_archive
+    selector | String | optional,
+    format | [| 'jsonl_gz, 'tar_gz, 'restic_native |] | optional,
+
+    # 'sops_keys / 'vault_state
+    age_keys | Array String | default = [],
+    recipients | Array String | default = [],
+    vault_endpoint | String | optional,
+    vault_paths | Array String | default = [],
+  },
+
+  # One system backup job. Combines a target and host selector from this file
+  # with policy contracts from backup_policy.ncl (`bp`) and vault_refs.ncl
+  # (`vault`). `verify`, `hooks` and `throttle` are optional.
+  SystemBackupDef = {
+    name | String | doc "Identifier (used in CLI: prvng-backup one-shot backup )",
+    target | SystemBackupTarget,
+    host_selector | HostSelector,
+    provider | bp.BackupProviderRef,
+    schedule | bp.Schedule,
+    retention | bp.RetentionPolicy,
+    destinations | Array bp.Destination,
+    encryption | vault.VaultKeyRef,
+    tag_strategy | bp.TagStrategy,
+    verify | bp.VerifyPolicyRef | optional,
+    hooks | bp.Hooks | optional,
+    throttle | bp.Throttle | optional,
+  },
+}
diff --git a/schemas/lib/validation.ncl b/schemas/lib/validation.ncl index abe1309..ebd5c83 100644 --- a/schemas/lib/validation.ncl +++ b/schemas/lib/validation.ncl @@ -1,183 +1,113 @@ -# | Reusable Validation Library for Nickel -# | Common validation contracts and helper functions -# | Author: JesusPerezLorenzo -# | Date: 2025-12-15 -# | Status: Production Ready - -# ============================================================ -# Common Validation Contracts -# ============================================================ - -# IPv4 address validation (e.g., "192.168.1.1") -let IpV4Contract = { - label = "ValidIPv4", - predicate = fun ip => - std.string.is_match ip "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" -} - -# CIDR notation validation (e.g., "192.168.1.0/24") -let CidrContract = { - label = "ValidCIDR", - predicate = fun cidr => - std.string.is_match cidr "^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}/\\d{1,2}$" -} - -# Port range validation (1-65535) -let PortContract = { - label = "ValidPort", - predicate = fun p => - p > 0 && p < 65536 -} - -# Semantic versioning validation (e.g., "1.2.3") -let SemverContract = { - label = "ValidSemver", - predicate = fun v => - std.string.is_match v "^\\d+\\.\\d+\\.\\d+$" -} - -# Domain name validation (e.g., "example.com") -let DomainContract = { - label = "ValidDomain", - predicate = fun d => - std.string.is_match d "^[a-z0-9]([a-z0-9-\\.]{0,253}[a-z0-9])?$" -} - -# OCI tag validation (e.g., "latest", "v1.0.0", "sha256-abc123") -let OciTagContract = { - label = "ValidOCITag", - predicate = fun tag => - std.string.is_match tag "^[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}$" -} - -# ISO 8601 timestamp validation (e.g., "2025-12-15T10:30:00Z") -let Iso8601Contract = { - label = "ValidISO8601", - predicate = fun ts => - std.string.is_match ts "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$" -} - -# Filesystem path validation (simple: non-empty, no double slashes) -let PathContract = { - label = 
"ValidPath", - predicate = fun path => - std.string.length path > 0 && !std.string.contains path "//" -} - -# ============================================================ -# Helper Functions for Common Validations -# ============================================================ - -# Validate minimum string length -let min_length = fun min_val => - { - label = "MinLength%{std.to_string min_val}", - predicate = fun s => - std.string.length s >= min_val - } - -# Validate maximum string length -let max_length = fun max_val => - { - label = "MaxLength%{std.to_string max_val}", - predicate = fun s => - std.string.length s <= max_val - } - -# Validate numeric range -let range = fun min_val max_val => - { - label = "Range[%{std.to_string min_val}-%{std.to_string max_val}]", - predicate = fun n => - n >= min_val && n <= max_val - } - -# Validate enum (value must be in list) -let enum = fun values => - { - label = "Enum[%{std.string.join "," values}]", - predicate = fun v => - std.array.elem v values - } - -# Validate non-empty string -let non_empty_string = { - label = "NonEmptyString", - predicate = fun s => - std.string.length s > 0 -} - -# Validate non-negative number -let non_negative = { - label = "NonNegative", - predicate = fun n => - n >= 0 -} - -# Validate positive number -let positive = { - label = "Positive", - predicate = fun n => - n > 0 -} - -# Validate boolean value -let boolean_value = { - label = "Boolean", - predicate = fun b => - b == true || b == false -} - -# ============================================================ -# Helper Functions for Custom Validation -# ============================================================ - -# Validate all items in array satisfy predicate -let all_items = fun predicate => - fun items => - std.array.all predicate items - -# Validate at least one item in array satisfies predicate -let any_items = fun predicate => - fun items => - std.array.any predicate items - -# Validate record has all required keys -let has_keys = fun 
required_keys => - fun record => - std.array.all - (fun key => - std.record.has_field record key - ) - required_keys - -# ============================================================ -# Exports -# ============================================================ - +# Reusable Validation Library for Nickel { - # Core validation contracts - IpV4Contract = IpV4Contract, - CidrContract = CidrContract, - PortContract = PortContract, - SemverContract = SemverContract, - DomainContract = DomainContract, - OciTagContract = OciTagContract, - Iso8601Contract = Iso8601Contract, - PathContract = PathContract, + IpV4Contract = { + label = "ValidIPv4", + predicate = fun ip => + std.string.is_match "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" ip + }, - # Helper functions - min_length = min_length, - max_length = max_length, - range = range, - enum = enum, - non_empty_string = non_empty_string, - non_negative = non_negative, - positive = positive, - boolean_value = boolean_value, + CidrContract = { + label = "ValidCIDR", + predicate = fun cidr => + std.string.is_match "^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}/\\d{1,2}$" cidr + }, - # Custom validators - all_items = all_items, - any_items = any_items, - has_keys = has_keys, + PortContract = { + label = "ValidPort", + predicate = fun p => + p > 0 && p < 65536 + }, + + SemverContract = { + label = "ValidSemver", + predicate = fun v => + std.string.is_match "^\\d+\\.\\d+\\.\\d+$" v + }, + + DomainContract = { + label = "ValidDomain", + predicate = fun d => + std.string.is_match "^[a-z0-9]([a-z0-9-\\.]{0,253}[a-z0-9])?$" d + }, + + OciTagContract = { + label = "ValidOCITag", + predicate = fun tag => + std.string.is_match "^[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}$" tag + }, + + Iso8601Contract = { + label = "ValidISO8601", + predicate = fun ts => + std.string.is_match "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$" ts + }, + + PathContract = { + label = "ValidPath", + predicate = fun path => +
std.string.length path > 0 && !(std.string.contains "//" path) + }, + + min_length = fun min_val => { + label = "MinLength%{std.to_string min_val}", + predicate = fun s => std.string.length s >= min_val, + }, + + max_length = fun max_val => { + label = "MaxLength%{std.to_string max_val}", + predicate = fun s => std.string.length s <= max_val, + }, + + range = fun min_val max_val => { + label = "Range[%{std.to_string min_val}-%{std.to_string max_val}]", + predicate = fun n => n >= min_val && n <= max_val, + }, + + enum = fun values => { + label = "Enum[%{std.string.join "," values}]", + predicate = fun v => std.array.elem v values, + }, + + non_empty_string = { + label = "NonEmptyString", + predicate = fun s => std.string.length s > 0 + }, + + non_negative = { + label = "NonNegative", + predicate = fun n => n >= 0 + }, + + positive = { + label = "Positive", + predicate = fun n => n > 0 + }, + + boolean_value = { + label = "Boolean", + predicate = fun b => b == true || b == false + }, + + all_items = fun pred => fun items => std.array.all pred items, + + any_items = fun pred => fun items => std.array.any pred items, + + has_keys = fun required_keys => fun record => + std.array.all + (fun key => std.record.has_field key record) + required_keys, + + IpRef = + std.contract.custom ( + fun _label => + fun value => + if value == "" + || std.string.is_match "^(\\d{1,3}\\.){3}\\d{1,3}$" value + || std.string.contains ":" value + || std.string.contains "fip" value + then 'Ok value + else 'Error { + message = "expected empty, an IPv4 address, an IPv6 address (contains ':'), or a FIP name (contains 'fip'); got '%{value}'" + } + ), } diff --git a/schemas/lib/vault_refs.ncl b/schemas/lib/vault_refs.ncl new file mode 100644 index 0000000..3a6e13a --- /dev/null +++ b/schemas/lib/vault_refs.ncl @@ -0,0 +1,41 @@ +# Vault reference contracts — typed pointers to secretumvault entries.
+# Subsystems that need keys, credentials or signing material reference them +# by path inside vault rather than embedding the secret. + +let _VaultPath = std.contract.from_validator (fun value => + if !(std.is_string value) + then 'Error { message = "VaultPath must be a String" } + else if std.string.length value == 0 + then 'Error { message = "VaultPath must be non-empty" } + else if std.string.contains " " value + then 'Error { message = "VaultPath must not contain whitespace" } + else if !(std.string.contains "/" value) + then 'Error { message = "VaultPath must contain '/'" } + else 'Ok +) in + +{ + # Path inside secretumvault. Validated as non-empty, no whitespace, contains '/'. + VaultPath = _VaultPath, + + # Reference to a symmetric/asymmetric key stored in vault for encryption use. + VaultKeyRef = { + path | String | doc "Vault path to the key entry (e.g. 'backup-manager/master-encryption-key')", + algorithm | [| 'aes_gcm_256, 'chacha20_poly1305, 'age_x25519, 'rsa_4096, 'ecdsa_p256, 'pq_kyber768 |] | default = 'age_x25519, + derivation | { + method | [| 'none, 'hkdf_sha256 |] | default = 'none, + info | String | doc "HKDF info parameter when derivation is hkdf_sha256" | default = "", + } | default = { method = 'none, info = "" }, + }, + + # Reference to credentials (S3 access keys, B2 keys, NKey seeds, etc.) stored in vault. + VaultCredRef = { + path | String | doc "Vault path to the credentials entry (e.g. 'backup-manager/destinations/hetzner-primary')", + kind | [| 's3, 'b2, 'sftp, 'nkey, 'jwt, 'token, 'tls_cert_bundle, 'etcd_client |] | doc "Type of credential payload at the path", + }, + + # Reference to a Cedar policy bundle in vault (for RBAC across actors). 
+ VaultPolicyRef = { + path | String | doc "Vault path to the Cedar policy entry", + }, +} diff --git a/schemas/lib/verify_policy.ncl b/schemas/lib/verify_policy.ncl new file mode 100644 index 0000000..96961be --- /dev/null +++ b/schemas/lib/verify_policy.ncl @@ -0,0 +1,64 @@ +# Verify policy contracts — backup verification as parallel provisioning. +# Drill-as-recipe: instead of a boolean flag, declare a sandbox infra recipe +# the daemon coordinator spins up, restores into, and runs an integration +# test suite against. The only credible verification is one that actually +# restores and exercises the data. + +let bp = import "backup_policy.ncl" in + +{ + # Test step discriminated union. The manager runs each step in order, + # collecting pass/fail/skip; an optional step does not abort on failure. + TestStep = { + kind | [| 'http_check, 'sql_query, 'file_exists, 'cmd, 'integration |], + name | String, + optional | Bool | default = false, + timeout | bp.Duration | default = "60s", + + # 'http_check + url | String | optional, + expected_status | Number | optional, + + # 'sql_query + connection_ref | String | optional | doc "Reference to a connection profile (vault path or alias)", + query | String | optional, + expected | String | optional, + + # 'file_exists + path | String | optional, + + # 'cmd + run | String | optional, + expect_zero_exit | Bool | default = true, + + # 'integration — invokes a higher-level scenario by name + component | String | optional, + scenario | String | optional, + }, + + # Reference to a parallel provisioning recipe that materialises the sandbox. + # The recipe lives under infra//verify-recipes/ and is itself + # declarative Nickel exported to the orchestrator. 
+ ProvisioningRecipeRef = { + name | String | doc "Recipe identifier (looked up in infra//verify-recipes/)", + args | { _ | String } | doc "Per-invocation parameters passed to the recipe" | default = {}, + }, + + # Drill specification consumed by the daemon coordinator on a verify schedule. + DrillSpec = { + name | String, + parallel_infra | ProvisioningRecipeRef, + test_suite | Array TestStep, + cleanup | [| 'always, 'on_success, 'never |] | default = 'on_success, + timeout | bp.Duration | default = "30m", + schedule | bp.Schedule | optional | doc "Drill cadence; defaults to manual invocation when omitted", + }, + + # Top-level verify policy: a level (cheapest → costliest) plus an optional + # drill spec for 'restore_drill / 'full_dr levels. + VerifyPolicy = { + level | [| 'quick, 'deep, 'restore_drill, 'full_dr |] | default = 'quick, + schedule | bp.Schedule | optional, + drill | DrillSpec | optional, + }, +} diff --git a/schemas/lib/workflow.ncl b/schemas/lib/workflow.ncl new file mode 100644 index 0000000..cf2beca --- /dev/null +++ b/schemas/lib/workflow.ncl @@ -0,0 +1,78 @@ +# schemas/lib/workflow.ncl — Workflow contracts +# +# A Workflow composes operations across components, modes, and layers. +# Each step targets one or more component operations (install, update, backup, ...). +# Workflows connect to: FSM dimensions, NATS events, backlog items, action log. +# +# Relationship to DAG: +# dag.ncl — L2 server provisioning (SSH, always install, server-bound) +# workflows/ — L3 service lifecycle (cross-component, any operation, cross-mode) +# +# Usage: +# let w = import "schemas/lib/workflow.ncl" in +# { deploy_services | w.WorkflowDef = { id = "...", steps = [...] 
} } + +# Target for a single workflow step — a (component, operation) pair with optional mode override +let _WorkflowStepTarget = { + component | String, + operation | String, + mode | [| 'taskserv, 'cluster, 'container |] | optional, +} in + +# A single step in a workflow — may touch multiple components +let _WorkflowStep = { + id | String, + targets | Array _WorkflowStepTarget, + depends_on | Array String | default = [], + condition | String | optional, + on_error | [| 'Stop, 'Rollback, 'Continue |] | default = 'Stop, +} in + +# The structural definition of a workflow: ordered steps with rollback path +let _WorkflowDef = { + id | String, + description | String, + steps | Array _WorkflowStep, + rollback | Array _WorkflowStep | default = [], +} in + +# Operational metadata bundled with a workflow: authorization, NATS, FSM, backlog, triggers +let _WorkflowMetadata = { + id | String, + name | String, + description | String, + tags | Array String | default = [], + + actors | Array [| 'Developer, 'Agent, 'CI |] | default = ['Developer], + requires_approval | Bool | default = false, + + fsm_dimension | String | optional, + + notifications | { + subject_prefix | String, + on_start | Bool | default = true, + on_step | Bool | default = true, + on_complete | Bool | default = true, + on_error | Bool | default = true, + } | optional, + + backlog_refs | Array String | default = [], + procedure_doc | String | optional, + adr_refs | Array String | default = [], + + triggers | { + manual | Bool | default = true, + schedule | String | optional, + on_event | String | optional, + } | default = {}, +} in + +{ + WorkflowStepTarget = _WorkflowStepTarget, + WorkflowStep = _WorkflowStep, + WorkflowDef = _WorkflowDef, + WorkflowMetadata = _WorkflowMetadata, + + make_step = fun data => _WorkflowStep & data, + make_workflow = fun data => data | _WorkflowDef, +} diff --git a/schemas/main.ncl b/schemas/main.ncl index 2a0d0e8..ab584bc 100644 --- a/schemas/main.ncl +++ b/schemas/main.ncl @@ 
-84,6 +84,9 @@ services | doc "Service registry and definitions" = import "./infrastructure/compute/services/main.ncl", + + scaling | doc "Node role and scale policy contracts (NodeRole, ScaleTemplate, ScalePolicy)" + = import "./infrastructure/compute/scaling.ncl", }, storage | doc "Storage resources (VMs, volumes, golden images)" @@ -98,6 +101,9 @@ = import "./infrastructure/storage/golden_image/main.ncl", }, + images | doc "Provider role images (snapshot lifecycle, hardware limits, state)" + = import "./infrastructure/images/main.ncl", + provisioning | doc "Nested provisioning schemas" = { nested_provisioning | doc "Nested provisioning schemas" @@ -111,6 +117,9 @@ workflows | doc "Batch workflow schemas" = import "./operations/workflows/main.ncl", + server_deploy | doc "Server deployment workflow plan (typed step sequencing)" + = import "./operations/workflows/server_deploy/main.ncl", + batch | doc "Batch scheduler and executor schemas" = import "./operations/batch/main.ncl", diff --git a/schemas/operations/workflows/server_deploy/contracts.ncl b/schemas/operations/workflows/server_deploy/contracts.ncl new file mode 100644 index 0000000..18076d3 --- /dev/null +++ b/schemas/operations/workflows/server_deploy/contracts.ncl @@ -0,0 +1,22 @@ +# ServerDeployPlan Contracts — typed workflow for server deployment step sequencing. 
+ +{ + StepType = [| 'check, 'task, 'notify |], + FailMode = [| 'stop, 'warn, 'skip |], + + DeployStep = { + id | String, + name | String, + type | StepType, + required | Bool, + on_fail | FailMode, + depends_on | Array String | optional, + timeout_seconds | Number | optional, + }, + + ServerDeployPlan = { + name | String, + steps | Array DeployStep, + on_failure | [| 'stop_all, 'continue, 'rollback |], + }, +} diff --git a/schemas/operations/workflows/server_deploy/defaults.ncl b/schemas/operations/workflows/server_deploy/defaults.ncl new file mode 100644 index 0000000..5add658 --- /dev/null +++ b/schemas/operations/workflows/server_deploy/defaults.ncl @@ -0,0 +1,75 @@ +# ServerDeployPlan Defaults — canonical server deployment step sequence. + +{ + default_server_deploy_plan | default = { + name = "default-server-deploy", + on_failure = 'stop_all, + steps = [ + { + id = "check_network", + name = "Verify network connectivity", + type = 'check, + required = true, + on_fail = 'stop, + depends_on = [], + }, + { + id = "check_ssh_creds", + name = "Verify SSH credentials", + type = 'check, + required = true, + on_fail = 'stop, + depends_on = [], + }, + { + id = "check_image_exists", + name = "Verify role image snapshot exists", + type = 'check, + required = true, + on_fail = 'stop, + depends_on = [], + }, + { + id = "check_image_fresh", + name = "Check role image freshness", + type = 'check, + required = false, + on_fail = 'warn, + depends_on = [], + }, + { + id = "create_server", + name = "Create server via provider", + type = 'task, + required = true, + on_fail = 'stop, + depends_on = ["check_network", "check_ssh_creds", "check_image_exists"], + }, + { + id = "wait_boot", + name = "Wait for server boot", + type = 'task, + required = true, + on_fail = 'stop, + depends_on = ["create_server"], + timeout_seconds = 300, + }, + { + id = "verify_ssh", + name = "Verify SSH connectivity", + type = 'task, + required = true, + on_fail = 'stop, + depends_on = ["wait_boot"], + }, 
+ { + id = "run_taskservs", + name = "Run post-boot taskservs", + type = 'task, + required = false, + on_fail = 'warn, + depends_on = ["verify_ssh"], + }, + ], + }, +} diff --git a/schemas/operations/workflows/server_deploy/main.ncl b/schemas/operations/workflows/server_deploy/main.ncl new file mode 100644 index 0000000..d2f8d8d --- /dev/null +++ b/schemas/operations/workflows/server_deploy/main.ncl @@ -0,0 +1,13 @@ +# ServerDeployPlan public API — typed workflow for declarable server deployment sequences. + +let contracts_lib = import "./contracts.ncl" in +let defaults_lib = import "./defaults.ncl" in + +{ + defaults = defaults_lib, + + make_deploy_plan | not_exported = fun overrides => + defaults_lib.default_server_deploy_plan & overrides, + + DefaultServerDeployPlan = defaults_lib.default_server_deploy_plan, +} diff --git a/schemas/platform/README.md b/schemas/platform/README.md index 76222ec..43f9f3b 100644 --- a/schemas/platform/README.md +++ b/schemas/platform/README.md @@ -1,39 +1,100 @@ -# TypeDialog + Nickel Configuration System for Platform Services +# Platform Configuration System - Schemas & Defaults -Complete configuration system for provisioning platform services (orchestrator, control-center, mcp-server, vault-service, -extension-registry, rag, ai-service, provisioning-daemon) across multiple deployment modes (solo, multiuser, cicd, enterprise). +**Source of truth** for platform service configurations (orchestrator, control-center, mcp-server, vault-service, extension-registry, rag, ai-service, provisioning-daemon) across all deployment modes. 
-## Architecture Overview +This directory contains the **active, production configuration system** that powers: +- Type-safe configuration via Nickel schemas +- Constraint-based validation +- Multi-mode deployment (solo/multiuser/cicd/enterprise) +- Interactive TypeDialog forms +- TOML export for Rust service consumption -This system implements a **TypeDialog + Nickel configuration workflow** that provides: - -- **Type-safe configuration** via Nickel schemas with validation -- **Interactive configuration** via TypeDialog forms with real-time constraint validation -- **Multi-mode deployment** (solo/multiuser/cicd/enterprise) with mode-specific defaults -- **Configuration composition** (base defaults + mode overlays + user customization + validation) -- **Automated TOML export** for Rust service consumption -- **Docker Compose + Kubernetes templates** for infrastructure deployment - -## Directory Structure +## Directory Structure (Flat) ```bash -provisioning/.typedialog/provisioning/platform/ -├── constraints/ # Single source of truth for validation limits -├── schemas/ # Nickel type contracts (services + common + deployment modes) -├── defaults/ # Default configuration values (services + common + deployment modes) -├── validators/ # Validation logic (constraints, ranges, business rules) -├── configs/ # Generated mode-specific Nickel configurations (4 services × 4 modes = 16 configs) -├── forms/ # TypeDialog form definitions (4 main forms + flat fragments) -│ └── fragments/ # Reusable form fragments (workspace, server, database, etc.) 
-├── templates/ # Jinja2 + Nickel templates for config/deployment generation -│ ├── docker-compose/ # Docker Compose templates (solo/multiuser/cicd/enterprise) -│ ├── kubernetes/ # Kubernetes deployment templates -│ └── configs/ # Service configuration templates (TOML generation) -├── scripts/ # Nushell orchestration scripts (configure, generate, validate, deploy) -├── examples/ # Example configurations for different deployment scenarios -└── values/ # User configuration files (gitignored *.ncl) +provisioning/schemas/platform/ +├── orchestrator.ncl # Service schemas (flat, one file per service) +├── control-center.ncl +├── mcp-server.ncl +├── vault-service.ncl +├── extension-registry.ncl +├── ai-service.ncl +├── rag.ncl +├── provisioning-daemon.ncl +│ +├── common/ # Shared schemas and utilities +│ ├── constraints.ncl # GENERATED - Nickel constraint validators (do NOT edit) +│ ├── helpers.ncl # Configuration composition helpers +│ ├── server.ncl # HTTP server schema +│ ├── database.ncl # Database backend schema +│ ├── security.ncl # Authentication/encryption +│ ├── monitoring.ncl # Metrics and health checks +│ ├── logging.ncl # Log configuration +│ ├── network.ncl # Network binding and TLS +│ ├── storage.ncl # Storage backend +│ └── workspace.ncl # Workspace configuration +│ +├── deployment/ # Mode-specific schemas (flat) +│ ├── solo.ncl # 2 CPU, 4GB RAM +│ ├── multiuser.ncl # 4 CPU, 8GB RAM +│ ├── cicd.ncl # 8 CPU, 16GB RAM +│ └── enterprise.ncl # 16+ CPU, 32+ GB RAM +│ +├── defaults/ # Default values by service and mode +│ ├── orchestrator-defaults.ncl +│ ├── control-center-defaults.ncl +│ ├── mcp-server-defaults.ncl +│ ├── vault-service-defaults.ncl +│ ├── extension-registry-defaults.ncl +│ ├── ai-service-defaults.ncl +│ ├── rag-defaults.ncl +│ ├── provisioning-daemon-defaults.ncl +│ ├── common/ # Shared defaults (6 files) +│ │ ├── server-defaults.ncl +│ │ ├── database-defaults.ncl +│ │ ├── security-defaults.ncl +│ │ ├── monitoring-defaults.ncl +│ │ └── 
logging-defaults.ncl +│ └── deployment/ # Mode defaults (4 files) +│ ├── solo-defaults.ncl +│ ├── multiuser-defaults.ncl +│ ├── cicd-defaults.ncl +│ └── enterprise-defaults.ncl +│ +├── constraints/ # Single source of truth +│ ├── constraints.toml # MASTER FILE - All validation limits +│ └── README.md +│ +├── configs/ # Generated intermediate configs +│ ├── orchestrator.solo.ncl +│ ├── orchestrator.multiuser.ncl +│ ├── orchestrator.cicd.ncl +│ ├── orchestrator.enterprise.ncl +│ └── ... (8 services × 4 modes = 32 configs) +│ +├── templates/ # Output generation templates +│ ├── configs/ # TOML export templates (Jinja2) +│ ├── docker-compose/ # Docker Compose templates +│ ├── kubernetes/ # Kubernetes manifests +│ └── service-config-template.ncl +│ +├── examples/ # Reference configurations +│ ├── orchestrator-solo.ncl +│ ├── orchestrator-enterprise.ncl +│ ├── control-center-multiuser.ncl +│ └── full-platform-enterprise.ncl +│ +└── values/ # User customizations (gitignored) + ├── .gitignore # Ignores *.ncl, *.toml + └── (user configs populated at runtime) ``` +**Key Changes**: +- ✅ Flat structure (no nested `schemas/` or `defaults/`) +- ✅ Archived `validators/` (88KB dead code) +- ✅ Generated `constraints.ncl` (gitignored) +- ✅ Master `constraints.toml` (single source) + ## Configuration Workflow ### 1. 
User Interaction (TypeDialog) diff --git a/schemas/platform/schemas/ai-service.ncl b/schemas/platform/ai-service.ncl similarity index 71% rename from schemas/platform/schemas/ai-service.ncl rename to schemas/platform/ai-service.ncl index 3e9d96b..bc54734 100644 --- a/schemas/platform/schemas/ai-service.ncl +++ b/schemas/platform/ai-service.ncl @@ -1,11 +1,14 @@ # AI Service Schema # AI model integration with RAG and MCP services +let constraints = import "schemas/platform/common/constraints.ncl" in +let docker_build_schema = import "schemas/platform/docker-build.ncl" in + { AiServiceConfig = { server | { host | String, - port | Number, + port | Number | constraints.port_high, workers | Number | optional, }, @@ -34,5 +37,8 @@ logging | { level | String | default = "info", } | optional, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, }, } diff --git a/schemas/platform/common/constraints.ncl b/schemas/platform/common/constraints.ncl new file mode 100644 index 0000000..a533c33 --- /dev/null +++ b/schemas/platform/common/constraints.ncl @@ -0,0 +1,86 @@ +# Platform Constraints and Validators +# AUTOMATICALLY GENERATED from constraints.toml - DO NOT EDIT DIRECTLY +# Generated via: nickel eval scripts/generate-constraints.ncl +# Source: schemas/platform/constraints/constraints.toml +# +# Usage: Import in schemas to validate configuration fields +# Example: port | constraints.port_standard +# +# To modify constraints, edit constraints.toml and run: +# nickel eval scripts/generate-constraints.ncl > schemas/platform/common/constraints.ncl + +let contract = std.contract in + +{ + # Valid port range (avoid system ports < 1024) + port_standard = contract.from_validator (fun x => + if x >= 1024 && x <= 65535 then 'Ok + else 'Error {message = "port_standard must be between 1024 and 65535"} + ), + # Platform service ports (>= 9000) + port_high = contract.from_validator (fun x => + if x >= 9000 && x <= 65535 then 'Ok + else 'Error 
{message = "port_high must be between 9000 and 65535"} + ), + # Vault service port number + vault_port = contract.from_validator (fun x => + if x >= 1024 && x <= 65535 then 'Ok + else 'Error {message = "vault_port must be between 1024 and 65535"} + ), + # Extension registry server port + registry_port = contract.from_validator (fun x => + if x >= 1024 && x <= 65535 then 'Ok + else 'Error {message = "registry_port must be between 1024 and 65535"} + ), + # Workflow engine worker thread count + workers = contract.from_validator (fun x => + if x >= 1 && x <= 32 then 'Ok + else 'Error {message = "workers must be between 1 and 32"} + ), + # HTTP server worker thread count + server_workers = contract.from_validator (fun x => + if x >= 1 && x <= 32 then 'Ok + else 'Error {message = "server_workers must be between 1 and 32"} + ), + # Maximum concurrent HTTP connections + max_connections = contract.from_validator (fun x => + if x >= 10 && x <= 10000 then 'Ok + else 'Error {message = "max_connections must be between 10 and 10000"} + ), + # Retry attempts for failed tasks + retry_attempts = contract.from_validator (fun x => + if x >= 0 && x <= 10 then 'Ok + else 'Error {message = "retry_attempts must be between 0 and 10"} + ), + # Metrics collection interval in seconds (10s-5min) + metrics_interval = contract.from_validator (fun x => + if x >= 10 && x <= 300 then 'Ok + else 'Error {message = "metrics_interval must be between 10 and 300"} + ), + # Health check interval in seconds (5s-5min) + health_check_interval = contract.from_validator (fun x => + if x >= 5 && x <= 300 then 'Ok + else 'Error {message = "health_check_interval must be between 5 and 300"} + ), + # Task execution timeout in milliseconds (1min-24hrs) + task_timeout = contract.from_validator (fun x => + if x >= 60000 && x <= 86400000 then 'Ok + else 'Error {message = "task_timeout must be between 60000 and 86400000"} + ), + # Tool execution timeout in milliseconds (5s-10min) + tool_timeout = 
contract.from_validator (fun x => + if x >= 5000 && x <= 600000 then 'Ok + else 'Error {message = "tool_timeout must be between 5000 and 600000"} + ), + # HTTP keep-alive timeout in seconds (0=disabled) + keep_alive = contract.from_validator (fun x => + if x >= 0 && x <= 600 then 'Ok + else 'Error {message = "keep_alive must be between 0 and 600"} + ), + # Rate limiting max requests per window + rate_limit_requests = contract.from_validator (fun x => + if x >= 10 && x <= 10000 then 'Ok + else 'Error {message = "rate_limit_requests must be between 10 and 10000"} + ), +} + diff --git a/schemas/platform/schemas/common/database.ncl b/schemas/platform/common/database.ncl similarity index 73% rename from schemas/platform/schemas/common/database.ncl rename to schemas/platform/common/database.ncl index 13ab743..78e9b6b 100644 --- a/schemas/platform/schemas/common/database.ncl +++ b/schemas/platform/common/database.ncl @@ -1,8 +1,16 @@ # Database Configuration Schema # Common schema for database settings (RocksDB, SurrealDB, PostgreSQL) +let DbMode = std.contract.custom + (fun label value => + if value == "memory" || value == "embedded" || value == "server" then 'Ok value + else std.contract.blame_with_message "database.mode must be 'memory', 'embedded', or 'server'" label) + in + { DatabaseConfig = { + # Deployment mode: memory (tests), embedded (solo/RocksDB), server (WebSocket) + mode | DbMode | default = "embedded", # Database backend selection (filesystem, rocksdb, surrealdb_embedded, surrealdb_server, postgres) backend | String, diff --git a/schemas/platform/common/external-services.ncl b/schemas/platform/common/external-services.ncl new file mode 100644 index 0000000..d5d9a86 --- /dev/null +++ b/schemas/platform/common/external-services.ncl @@ -0,0 +1,60 @@ +# External Services Configuration Schema +# Unified declaration of external services required by the platform +# (Database, OCI Registry, Git Sources, Cache) + +let database_schema = import "./database.ncl" in + 
+{ + # OCI registry that extension-registry connects to for pulling/pushing extensions + OciRegistryConfig = { + id | String, + registry | String, # host:port (e.g., zot.internal:5000) + namespace | String, # OCI namespace (e.g., provisioning/extensions) + auth_token_path | String | optional, # path to auth token file + verify_ssl | Bool | default = true, + }, + + # Git source (Forgejo/Gitea/GitHub) for extension discovery and release management + GitSourceConfig = { + id | String, + provider | String, # "forgejo" | "gitea" | "github" + url | String | optional, # not needed for github.com + organization | String, + token_path | String, # path to auth token file + verify_ssl | Bool | default = true, + }, + + # Local filesystem path used as extension store or cache directory + PathConfig = { + path | String, + writable | Bool | default = true, + }, + + # Cache configuration: either local filesystem or remote service + CacheConfig = { + mode | String, # "local" | "remote" + path | String | optional, # for local mode + url | String | optional, # for remote mode (redis://host:port) + }, + + # The full external services block for a deployment mode + # Declares all external dependencies that must be operational before internal services start + ExternalServicesConfig = { + # Database: reuses DatabaseConfig from database.ncl + # Can be embedded (filesystem), local server (surrealdb_server), or remote + database | database_schema.DatabaseConfig, + + # OCI registries (zero or more) - for extension distribution + oci_registries | Array OciRegistryConfig | default = [], + + # Git sources (zero or more) - for extension discovery + git_sources | Array GitSourceConfig | default = [], + + # Local extension path fallback (used when no OCI is configured) + # Primarily for solo mode: extensions stored as files instead of in OCI registry + extension_path | PathConfig | optional, + + # Cache configuration: critical for extension-registry and orchestrator + cache | CacheConfig, + }, +} 
diff --git a/schemas/platform/common/helpers.ncl b/schemas/platform/common/helpers.ncl
index d60bcbf..d20b062 100644
--- a/schemas/platform/common/helpers.ncl
+++ b/schemas/platform/common/helpers.ncl
@@ -8,7 +8,8 @@
 
 {
   # Recursively merge two record configurations
-  # Mode config values override defaults (shallow merge at each level)
+  # Override values take precedence over base values; a null override keeps the base value
+  # When both values are records (std.is_record; std.typeof yields enum tags, not strings), merge recursively
   #
   # Example:
   #   let base = { server = { port = 9000, workers = 4 } }
@@ -16,19 +17,20 @@
   #   merge_with_override base mode
   #   # Result: { server = { port = 9000, workers = 16 } }
   merge_with_override = fun base override =>
-    if std.type.is_record base && std.type.is_record override then
+    if std.is_record base && std.is_record override then
       let base_fields = std.record.fields base in
       let override_fields = std.record.fields override in
 
+      # Pass 1: rebuild every base field from {}, taking the override (merged recursively for records) where present
       base_fields
-      |> std.array.fold
+      |> std.array.fold_left
         (fun acc key =>
-          let base_value = base |> std.record.get key in
+          let base_value = std.record.get key base in
 
-          if std.record.has_field override key then
-            let override_value = override |> std.record.get key in
+          if std.record.has_field key override then
+            let override_value = std.record.get key override in
 
-            if std.type.is_record base_value && std.type.is_record override_value then
+            if std.is_record base_value && std.is_record override_value then
               acc |> std.record.insert key (
                 merge_with_override base_value override_value
@@ -38,21 +40,27 @@
               acc |> std.record.insert key override_value
           else
             # Keep base value
-            acc
-        )
-        (override_fields
-        |> std.array.fold
-          (fun acc key =>
-            if !std.record.has_field base key then
-              acc |> std.record.insert key (override |> std.record.get key)
-            else
-              acc
-          )
-          base
+            acc |> std.record.insert key base_value
         )
+        {}
+      # Pass 2: add fields exclusive to override (keys already in base were handled in pass 1)
+      |> (fun merged =>
+        override_fields
+        |> std.array.fold_left
+          (fun acc key =>
+            if !(std.record.has_field key base) then
+              acc |> std.record.insert key (std.record.get key override)
+            else
+              acc
+          )
+          merged
+      )
     else
       # If either is not a record, override takes precedence
-      if std.type.is_null override then base else override,
+      if override == null then base else override, # null override means "not set": keep base
+
+  # Alias for backwards compatibility
+  deep_merge = fun a b => merge_with_override a b,
 
   # Compose configuration from multiple layers with proper merging
   #
@@ -80,18 +88,18 @@
   #   # Result: { "server.port" = 9000 }
   flatten_config = fun config =>
     let rec flatten_with_prefix = fun prefix config =>
-      if std.type.is_record config then
+      if std.is_record config then
         std.record.fields config
-        |> std.array.fold
+        |> std.array.fold_left
           (fun acc key =>
-            let value = config |> std.record.get key in
+            let value = std.record.get key config in
             let full_key =
               if std.string.is_empty prefix then
                 key
               else
                 prefix ++ "." ++ key
             in
 
-            if std.type.is_record value then
+            if std.is_record value then
               acc
               |> std.record.merge (flatten_with_prefix full_key value)
             else
@@ -102,41 +110,4 @@
       config
   in
   flatten_with_prefix "" config,
-
-  # Deep merge with recursive descent
-  # Used for complex nested configs
-  deep_merge = fun a b =>
-    if std.type.is_record a && std.type.is_record b then
-      let a_fields = std.record.fields a in
-
-      a_fields
-      |> std.array.fold
-        (fun acc key =>
-          let a_val = a |> std.record.get key in
-          let has_b_key = std.record.has_field b key in
-
-          if has_b_key then
-            let b_val = b |> std.record.get key in
-
-            if std.type.is_record a_val && std.type.is_record b_val then
-              acc |> std.record.insert key (deep_merge a_val b_val)
-            else
-              acc |> std.record.insert key b_val
-          else
-            acc |> std.record.insert key a_val
-        )
-        {}
-      |> (fun merged =>
-        std.record.fields b
-        |> std.array.fold
-          (fun acc key =>
-            if !std.record.has_field a key then
-              acc |> std.record.insert key (b |> std.record.get key)
-            else
-              acc
-          )
-          merged
-      )
-    else
-      b,
 }
diff --git a/schemas/platform/common/helpers_test.ncl b/schemas/platform/common/helpers_test.ncl
new file mode 100644
index 0000000..1bf8409
--- /dev/null
+++ b/schemas/platform/common/helpers_test.ncl
@@ -0,0 +1,18 @@
+let rec merge_impl = fun base override =>
+  if std.is_record base && std.is_record override then
+    let base_fields = std.record.fields base in
+    base_fields |> std.array.fold_left
+      (fun acc key =>
+        let base_value = base |> std.record.get key in
+        if std.record.has_field key override then
+          let override_value = override |> std.record.get key in
+          acc |> std.record.insert key override_value
+        else
+          acc |> std.record.insert key base_value
+      )
+      {}
+  else
+    override
+in
+
+{ merge = merge_impl }
diff --git a/schemas/platform/common/helpers_test2.ncl b/schemas/platform/common/helpers_test2.ncl
new file mode 100644
index 0000000..9e98ff9
--- /dev/null
+++ b/schemas/platform/common/helpers_test2.ncl
@@ -0,0 +1,34 @@
+# Test: Non-recursive merge (single level only)
+{
+  merge_shallow = fun base override =>
+    if std.is_record base && std.is_record override then
+      let base_fields = std.record.fields base in
+      let override_fields = std.record.fields override in
+
+      # Pass 1: iterate base fields
+      base_fields
+      |> std.array.fold_left
+        (fun acc key =>
+          let base_value = std.record.get key base in
+          if std.record.has_field key override then
+            let override_value = std.record.get key override in
+            acc |> std.record.insert key override_value
+          else
+            acc |> std.record.insert key base_value
+        )
+        {}
+      # Pass 2: add override-only fields
+      |> (fun merged =>
+        override_fields
+        |> std.array.fold_left
+          (fun acc key =>
+            if !(std.record.has_field key base) then
+              acc |> std.record.insert key (std.record.get key override)
+            else
+              acc
+          )
+          merged
+      )
+    else
+      if override == null then base else override,
+}
diff --git a/schemas/platform/common/helpers_test3.ncl b/schemas/platform/common/helpers_test3.ncl
new file mode 100644
index 0000000..b517cf1
--- /dev/null
+++ b/schemas/platform/common/helpers_test3.ncl
@@ -0,0 +1,30 @@
+# Test: Manual merge without fold
+{
+  # Hardcoded merge for 2-field record
+  merge_manual = fun base override =>
+    let base_fields = std.record.fields base in
+    let has_field_0 = std.array.length base_fields > 0 in
+    let has_field_1 = std.array.length base_fields > 1 in
+
+    let key0 = if has_field_0 then std.array.at 0 base_fields else "" in
+    let key1 = if has_field_1 then std.array.at 1 base_fields else "" in
+
+    let val0 = if has_field_0 then std.record.get key0 base else null in
+    let val1 = if has_field_1 then std.record.get key1 base else null in
+
+    let result0 = if has_field_0 then
+      if std.record.has_field key0 override then
+        {"%{key0}" = std.record.get key0 override}
+      else
+        {"%{key0}" = val0}
+    else {} in
+
+    let result1 = if has_field_1 then
+      if std.record.has_field key1 override then
+        {"%{key1}" = std.record.get key1 override}
+      else
+        {"%{key1}" = val1}
+    else {} in
+
+    result0 & result1,
+}
diff --git a/schemas/platform/schemas/common/logging.ncl b/schemas/platform/common/logging.ncl
similarity index 92%
rename from schemas/platform/schemas/common/logging.ncl
rename to schemas/platform/common/logging.ncl
index 7c5c041..67659c2 100644
--- a/schemas/platform/schemas/common/logging.ncl
+++ b/schemas/platform/common/logging.ncl
@@ -3,17 +3,17 @@
 
 {
   # Supported log levels
+  LogLevel = [| 'trace, 'debug, 'info, 'warn, 'error |],
 
   # Supported log formats
-
-  # Supported log outputs
+  LogFormat = [| 'text, 'json |],
 
   LoggingConfig = {
     # Global log level
-    level | String | default = 'info,
+    level | LogLevel | default = 'info,
 
     # Log format
-    format | String | default = 'text,
+    format | LogFormat | default = 'text,
 
     # Log output destinations
     outputs | Array String | optional,
diff --git a/schemas/platform/schemas/common/monitoring.ncl b/schemas/platform/common/monitoring.ncl
similarity index
100%
rename from schemas/platform/schemas/common/monitoring.ncl
rename to schemas/platform/common/monitoring.ncl
diff --git a/schemas/platform/common/nats.ncl b/schemas/platform/common/nats.ncl
new file mode 100644
index 0000000..19699fa
--- /dev/null
+++ b/schemas/platform/common/nats.ncl
@@ -0,0 +1,47 @@
+# NATS Message Broker Configuration Schema
+# Common schema for NATS connection settings shared by all platform services
+
+# Contract for `mode`: accepts only the strings "embedded" and "server".
+# A rejected value is reported by returning 'Error rather than by calling blame directly.
+let NatsMode = std.contract.custom
+  (fun _label value =>
+    if value == "embedded" || value == "server" then 'Ok value
+    else 'Error { message = "nats.mode must be 'embedded' or 'server'" })
+  in
+
+{
+  NatsConfig = {
+    # Connection mode: embedded (child process, solo) or server (external cluster)
+    mode | NatsMode | default = "embedded",
+
+    # NATS server URL (relevant when mode = "server"; the default below applies if omitted)
+    url | String | default = "nats://127.0.0.1:4222",
+
+    # Port for the embedded nats-server process
+    port | Number | default = 4222,
+
+    # Enable JetStream persistence
+    jetstream | Bool | default = true,
+
+    # JetStream data directory (only for embedded mode)
+    jetstream_store_dir | String | optional,
+
+    # Authentication token (optional, solo mode typically omits)
+    auth_token | String | optional,
+
+    # TLS certificate path (optional, multi-user mode)
+    tls_cert | String | optional,
+
+    # TLS private key path (optional, multi-user mode)
+    tls_key | String | optional,
+
+    # Maximum reconnect attempts for client
+    max_reconnects | Number | default = 10,
+
+    # Wait between reconnect attempts in milliseconds
+    reconnect_wait_ms | Number | default = 2000,
+
+    # Subject prefix for all provisioning events
+    subject_prefix | String | default = "provisioning",
+  },
+}
diff --git a/schemas/platform/schemas/common/network.ncl b/schemas/platform/common/network.ncl
similarity index 100%
rename from schemas/platform/schemas/common/network.ncl
rename to schemas/platform/common/network.ncl
diff --git a/schemas/platform/common/observability.ncl
b/schemas/platform/common/observability.ncl
new file mode 100644
index 0000000..906bc8f
--- /dev/null
+++ b/schemas/platform/common/observability.ncl
@@ -0,0 +1,189 @@
+# Observability Configuration Schema
+# Unified schema for centralized logging, metrics, health checks, and tracing
+
+{
+  # Observability configuration for services
+  ObservabilityConfig = {
+    # Enable/disable observability system-wide
+    enabled | Bool | default = true,
+
+    # Logging Configuration
+    logging | {
+      # Enable structured JSON logging
+      enabled | Bool | default = true,
+
+      # Log level: debug, info, warn, error
+      level | String | default = "info",
+
+      # Log format: json (for Loki ingestion) or pretty (development)
+      format | String | default = "json",
+
+      # RUST_LOG environment filter (granular module-level filtering)
+      filter | String | optional,
+
+      # Output configuration
+      output | {
+        # Log output destination: stdout, file, loki
+        destination | String | default = "stdout",
+
+        # File path for file output
+        file_path | String | optional,
+
+        # Loki endpoint (e.g., http://localhost:3100)
+        loki_endpoint | String | optional,
+
+        # Labels to attach to all Loki entries (labels become queryable)
+        # Dictionary contract: any label name is accepted, every value must be a String
+        loki_labels | { _ : String } | optional,
+      } | optional,
+
+      # Structured field configuration
+      fields | {
+        # Include service name
+        service_name | Bool | default = true,
+
+        # Include timestamp (RFC3339)
+        timestamp | Bool | default = true,
+
+        # Include log level
+        level | Bool | default = true,
+
+        # Include caller location (file:line)
+        caller | Bool | default = false,
+
+        # Include span context (trace IDs, span IDs)
+        spans | Bool | default = true,
+
+        # Custom metadata fields
+        # Dictionary contract: any field name and any value type is accepted
+        custom | { _ : Dyn } | optional,
+      } | optional,
+
+      # Performance optimization
+      sampling | {
+        # Enable log sampling to reduce volume
+        enabled | Bool | default = false,
+
+        # Sample 1 in N log entries
+        rate | Number | optional,
+      } | optional,
+    } | optional,
+
+    # Metrics Configuration (Prometheus)
+    metrics | {
+      # Enable metrics collection
+      enabled | Bool | default = true,
+
+      # Exporter backend: prometheus (default), otlp
+      exporter | String | default = "prometheus",
+
+      # Prometheus scrape endpoint path
+      prometheus_path | String | default = "/metrics",
+
+      # Metrics collection interval (seconds)
+      interval | Number | default = 60,
+
+      # Histogram buckets for request latency (milliseconds)
+      histogram_buckets | Array Number | default = [1, 5, 10, 50, 100, 500, 1000, 5000],
+
+      # Cardinality limits (prevent unbounded growth)
+      max_cardinality | Number | optional,
+
+      # Metric retention period (hours)
+      retention_hours | Number | optional,
+
+      # OpenTelemetry push endpoint (if using OTLP)
+      otlp_endpoint | String | optional,
+
+      # OTLP push interval (seconds)
+      otlp_interval | Number | optional,
+    } | optional,
+
+    # Health Check Configuration
+    health | {
+      # Enable health check endpoints
+      enabled | Bool | default = true,
+
+      # Health check HTTP server port
+      port | Number | default = 8081,
+
+      # Liveness probe endpoint
+      liveness_path | String | default = "/healthz",
+
+      # Readiness probe endpoint (depends on dependencies)
+      readiness_path | String | default = "/ready",
+
+      # Startup probe endpoint
+      startup_path | String | default = "/startup",
+
+      # Health check probe interval (seconds)
+      interval | Number | default = 10,
+
+      # Probe timeout (milliseconds)
+      timeout | Number | default = 5000,
+
+      # Number of consecutive successes to mark as healthy
+      success_threshold | Number | default = 1,
+
+      # Number of consecutive failures to mark as unhealthy
+      failure_threshold | Number | default = 3,
+
+      # Initial delay before first check (seconds)
+      initial_delay | Number | default = 0,
+    } | optional,
+
+    # Distributed Tracing Configuration (OpenTelemetry)
+    tracing | {
+      # Enable distributed tracing
+      enabled | Bool | default = false,
+
+      # Tracer backend: otlp (OpenTelemetry)
+      backend | String | default = "otlp",
+
+      # OpenTelemetry Collector endpoint (gRPC)
+      otlp_endpoint | String | optional,
+
+      # Trace sampler: always, never, parentbased
+      sampler | String | default = "parentbased",
+
+      # Sampling rate (0.0 to 1.0) for parentbased/probability samplers
+      sampling_rate | Number | optional,
+
+      # Service version
+      service_version | String | optional,
+
+      # Environment name (dev, staging, production)
+      environment | String | optional,
+    } | optional,
+
+    # Audit Logging Configuration
+    audit | {
+      # Enable workspace operation auditing
+      enabled | Bool | default = true,
+
+      # Storage backend: file, siem
+      storage | String | default = "file",
+
+      # Audit log file directory
+      log_directory | String | optional,
+
+      # Audit retention period (days)
+      retention_days | Number | default = 90,
+
+      # Include PII in audit logs (GDPR consideration)
+      include_pii | Bool | default = false,
+
+      # Export format(s): jsonl, csv, splunk, elastic
+      export_formats | Array String | default = ["jsonl"],
+
+      # SIEM endpoint (e.g., Splunk, Elastic) for real-time export
+      siem_endpoint | String | optional,
+
+      # Workspace operation tracking
+      track_workspace_operations | Bool | default = true,
+
+      # Tracked operations: create, delete, update, switch, list, sync
+      workspace_operations | Array String | default = ["create", "delete", "update", "switch", "list", "sync"],
+    } | optional,
+  },
+}
diff --git a/schemas/platform/schemas/common/security.ncl b/schemas/platform/common/security.ncl
similarity index 100%
rename from schemas/platform/schemas/common/security.ncl
rename to schemas/platform/common/security.ncl
diff --git a/schemas/platform/common/server.ncl b/schemas/platform/common/server.ncl
new file mode 100644
index 0000000..cc9ec14
--- /dev/null
+++ b/schemas/platform/common/server.ncl
@@ -0,0 +1,59 @@
+# HTTP Server Configuration Schema
+# Common schema for HTTP server settings across all services
+
+let constraints = import "./constraints.ncl" in
+
+{
+  ServerConfig = {
+    # Bind address (127.0.0.1 for local, 0.0.0.0 for all interfaces)
+    host | String | default = "127.0.0.1",
+
+    # Listen port (validated: 1024-65535)
+    port | Number | constraints.port_standard,
+
+    # Worker thread count (CPU-bound operations)
+    workers | Number | optional,
+
+    # TCP keep-alive timeout in seconds (0 = disabled)
+    keep_alive | Number | optional,
+
+    # Maximum concurrent TCP connections
+    max_connections | Number | optional,
+
+    # Request timeout in milliseconds
+    request_timeout | Number | optional,
+
+    # Enable graceful shutdown
+    graceful_shutdown | Bool | default = true,
+
+    # Graceful shutdown timeout in seconds
+    shutdown_timeout | Number | optional,
+  },
+
+  # Server config with high port constraint >= 9000 (for platform services except MCP)
+  ServerConfigHighPort = {
+    # Bind address (127.0.0.1 for local, 0.0.0.0 for all interfaces)
+    host | String | default = "127.0.0.1",
+
+    # Listen port (must be >= 9000 for platform services)
+    port | Number | constraints.port_high,
+
+    # Worker thread count (CPU-bound operations)
+    workers | Number | optional,
+
+    # TCP keep-alive timeout in seconds (0 = disabled)
+    keep_alive | Number | optional,
+
+    # Maximum concurrent TCP connections
+    max_connections | Number | optional,
+
+    # Request timeout in milliseconds
+    request_timeout | Number | optional,
+
+    # Enable graceful shutdown
+    graceful_shutdown | Bool | default = true,
+
+    # Graceful shutdown timeout in seconds
+    shutdown_timeout | Number | optional,
+  },
+}
diff --git a/schemas/platform/schemas/common/storage.ncl b/schemas/platform/common/storage.ncl
similarity index 100%
rename from schemas/platform/schemas/common/storage.ncl
rename to schemas/platform/common/storage.ncl
diff --git a/schemas/platform/schemas/common/workspace.ncl b/schemas/platform/common/workspace.ncl
similarity index 100%
rename from schemas/platform/schemas/common/workspace.ncl
rename to schemas/platform/common/workspace.ncl
diff --git a/schemas/platform/configs/ai-service.cicd.ncl b/schemas/platform/configs/ai-service.cicd.ncl
index c3d83a7..57c90a4 100644
--- a/schemas/platform/configs/ai-service.cicd.ncl +++ b/schemas/platform/configs/ai-service.cicd.ncl @@ -1,4 +1,4 @@ -let ai_schema = import "../schemas/ai-service.ncl" in +let ai_schema = import "../ai-service.ncl" in { ai_service | ai_schema.AiServiceConfig = { server = { host = "0.0.0.0", port = 8082, workers = 8, }, diff --git a/schemas/platform/configs/ai-service.enterprise.ncl b/schemas/platform/configs/ai-service.enterprise.ncl index c17911e..13b62bc 100644 --- a/schemas/platform/configs/ai-service.enterprise.ncl +++ b/schemas/platform/configs/ai-service.enterprise.ncl @@ -1,4 +1,4 @@ -let ai_schema = import "../schemas/ai-service.ncl" in +let ai_schema = import "../ai-service.ncl" in { ai_service | ai_schema.AiServiceConfig = { server = { host = "0.0.0.0", port = 8082, workers = 16, }, diff --git a/schemas/platform/configs/ai-service.multiuser.ncl b/schemas/platform/configs/ai-service.multiuser.ncl index a431bde..00eb3d3 100644 --- a/schemas/platform/configs/ai-service.multiuser.ncl +++ b/schemas/platform/configs/ai-service.multiuser.ncl @@ -1,4 +1,4 @@ -let ai_schema = import "../schemas/ai-service.ncl" in +let ai_schema = import "../ai-service.ncl" in { ai_service | ai_schema.AiServiceConfig = { server = { host = "0.0.0.0", port = 8082, workers = 4, }, diff --git a/schemas/platform/configs/ai-service.solo.ncl b/schemas/platform/configs/ai-service.solo.ncl index 520294a..e9e91d8 100644 --- a/schemas/platform/configs/ai-service.solo.ncl +++ b/schemas/platform/configs/ai-service.solo.ncl @@ -1,4 +1,4 @@ -let ai_schema = import "../schemas/ai-service.ncl" in +let ai_schema = import "../ai-service.ncl" in { ai_service | ai_schema.AiServiceConfig = { server = { host = "127.0.0.1", port = 8082, workers = 2, }, diff --git a/schemas/platform/configs/control-center.cicd.ncl b/schemas/platform/configs/control-center.cicd.ncl index c321667..4c85e6d 100644 --- a/schemas/platform/configs/control-center.cicd.ncl +++ b/schemas/platform/configs/control-center.cicd.ncl @@ 
-1,7 +1,7 @@ # Control Center Configuration - CI/CD Mode # API-centric with ephemeral workspaces and token-based auth -let control_center_schema = import "../schemas/control-center.ncl" in +let control_center_schema = import "../control-center.ncl" in let control_center_defaults = import "../defaults/control-center-defaults.ncl" in { diff --git a/schemas/platform/configs/control-center.enterprise.ncl b/schemas/platform/configs/control-center.enterprise.ncl index e90a907..2cd74fa 100644 --- a/schemas/platform/configs/control-center.enterprise.ncl +++ b/schemas/platform/configs/control-center.enterprise.ncl @@ -1,7 +1,7 @@ # Control Center Configuration - Enterprise Mode # Production HA with compliance frameworks and audit logging -let control_center_schema = import "../schemas/control-center.ncl" in +let control_center_schema = import "../control-center.ncl" in let control_center_defaults = import "../defaults/control-center-defaults.ncl" in { diff --git a/schemas/platform/configs/control-center.multiuser.ncl b/schemas/platform/configs/control-center.multiuser.ncl index e2b8590..602fbe5 100644 --- a/schemas/platform/configs/control-center.multiuser.ncl +++ b/schemas/platform/configs/control-center.multiuser.ncl @@ -1,7 +1,7 @@ # Control Center Configuration - Multi-User Mode # Team collaboration with RBAC and PostgreSQL backend -let control_center_schema = import "../schemas/control-center.ncl" in +let control_center_schema = import "../control-center.ncl" in let control_center_defaults = import "../defaults/control-center-defaults.ncl" in { diff --git a/schemas/platform/configs/control-center.solo.ncl b/schemas/platform/configs/control-center.solo.ncl index d4b7f82..37c8af1 100644 --- a/schemas/platform/configs/control-center.solo.ncl +++ b/schemas/platform/configs/control-center.solo.ncl @@ -1,7 +1,7 @@ # Control Center Configuration - Solo Mode # Single developer with simplified RBAC and local storage -let control_center_schema = import 
"../schemas/control-center.ncl" in +let control_center_schema = import "../control-center.ncl" in let control_center_defaults = import "../defaults/control-center-defaults.ncl" in { diff --git a/schemas/platform/configs/extension-registry.cicd.ncl b/schemas/platform/configs/extension-registry.cicd.ncl index 20c8f47..1cd71d0 100644 --- a/schemas/platform/configs/extension-registry.cicd.ncl +++ b/schemas/platform/configs/extension-registry.cicd.ncl @@ -1,6 +1,6 @@ # Extension Registry - CI/CD Mode # Optimized for CI/CD pipelines with distribution focus -let registry_schema = import "../schemas/extension-registry.ncl" in +let registry_schema = import "../extension-registry.ncl" in { extension_registry | registry_schema.RegistryConfig = { server = { diff --git a/schemas/platform/configs/extension-registry.enterprise.ncl b/schemas/platform/configs/extension-registry.enterprise.ncl index a40c023..5ef54a2 100644 --- a/schemas/platform/configs/extension-registry.enterprise.ncl +++ b/schemas/platform/configs/extension-registry.enterprise.ncl @@ -1,6 +1,6 @@ # Extension Registry - Enterprise Mode # High-availability multi-source, multi-registry configuration -let registry_schema = import "../schemas/extension-registry.ncl" in +let registry_schema = import "../extension-registry.ncl" in { extension_registry | registry_schema.RegistryConfig = { server = { diff --git a/schemas/platform/configs/extension-registry.multiuser.ncl b/schemas/platform/configs/extension-registry.multiuser.ncl index 26fbe21..6b30209 100644 --- a/schemas/platform/configs/extension-registry.multiuser.ncl +++ b/schemas/platform/configs/extension-registry.multiuser.ncl @@ -1,6 +1,6 @@ # Extension Registry - Multiuser Mode # Shared team environment with multiple sources and registries -let registry_schema = import "../schemas/extension-registry.ncl" in +let registry_schema = import "../extension-registry.ncl" in { extension_registry | registry_schema.RegistryConfig = { server = { diff --git 
a/schemas/platform/configs/extension-registry.solo.ncl b/schemas/platform/configs/extension-registry.solo.ncl index d09fac6..ad3ea59 100644 --- a/schemas/platform/configs/extension-registry.solo.ncl +++ b/schemas/platform/configs/extension-registry.solo.ncl @@ -1,6 +1,6 @@ # Extension Registry - Solo/Development Mode -# Single Gitea instance for local development -let registry_schema = import "../schemas/extension-registry.ncl" in +# Single Gitea instance for local development + optional local filesystem fallback for extensions +let registry_schema = import "../extension-registry.ncl" in { extension_registry | registry_schema.RegistryConfig = { server = { @@ -21,6 +21,20 @@ let registry_schema = import "../schemas/extension-registry.ncl" in }, ], }, + # Solo mode can use optional local OCI registry if available + # Default: empty (use filesystem fallback in external-services config) + # If Zot is deployed locally, uncomment the oci registry below: + distributions = { + oci = [ + # Uncomment to enable local Zot registry: + # { + # id = "local-zot", + # registry = "localhost:5000", + # namespace = "provisioning", + # verify_ssl = false, + # }, + ], + }, cache = { capacity = 100, ttl_seconds = 60, diff --git a/schemas/platform/configs/installer.cicd.ncl b/schemas/platform/configs/installer.cicd.ncl index b8bceb7..4cb08a0 100644 --- a/schemas/platform/configs/installer.cicd.ncl +++ b/schemas/platform/configs/installer.cicd.ncl @@ -1,7 +1,7 @@ # Installer Configuration - CI/CD Mode # Automated installation for pipeline integration -let installer_schema = import "../schemas/installer.ncl" in +let installer_schema = import "../installer.ncl" in let installer_defaults = import "../defaults/installer-defaults.ncl" in { diff --git a/schemas/platform/configs/installer.enterprise.ncl b/schemas/platform/configs/installer.enterprise.ncl index 96359e0..ffffa6b 100644 --- a/schemas/platform/configs/installer.enterprise.ncl +++ b/schemas/platform/configs/installer.enterprise.ncl @@ 
-1,7 +1,7 @@ # Installer Configuration - Enterprise Mode # Complex HA installation with compliance and disaster recovery -let installer_schema = import "../schemas/installer.ncl" in +let installer_schema = import "../installer.ncl" in let installer_defaults = import "../defaults/installer-defaults.ncl" in { diff --git a/schemas/platform/configs/installer.multiuser.ncl b/schemas/platform/configs/installer.multiuser.ncl index 417e847..c4f1d9b 100644 --- a/schemas/platform/configs/installer.multiuser.ncl +++ b/schemas/platform/configs/installer.multiuser.ncl @@ -1,7 +1,7 @@ # Installer Configuration - Multi-User Mode # Network installation for team environments -let installer_schema = import "../schemas/installer.ncl" in +let installer_schema = import "../installer.ncl" in let installer_defaults = import "../defaults/installer-defaults.ncl" in { diff --git a/schemas/platform/configs/installer.solo.ncl b/schemas/platform/configs/installer.solo.ncl index 48f7edf..cf0772b 100644 --- a/schemas/platform/configs/installer.solo.ncl +++ b/schemas/platform/configs/installer.solo.ncl @@ -1,7 +1,7 @@ # Installer Configuration - Solo Mode # Local installation for single developer -let installer_schema = import "../schemas/installer.ncl" in +let installer_schema = import "../installer.ncl" in let installer_defaults = import "../defaults/installer-defaults.ncl" in { diff --git a/schemas/platform/configs/mcp-server.cicd.ncl b/schemas/platform/configs/mcp-server.cicd.ncl index 98955e2..e4283c3 100644 --- a/schemas/platform/configs/mcp-server.cicd.ncl +++ b/schemas/platform/configs/mcp-server.cicd.ncl @@ -1,7 +1,7 @@ # MCP Server Configuration - CI/CD Mode # API-centric with stdio transport for pipeline integration -let mcp_server_schema = import "../schemas/mcp-server.ncl" in +let mcp_server_schema = import "../mcp-server.ncl" in let mcp_server_defaults = import "../defaults/mcp-server-defaults.ncl" in { diff --git a/schemas/platform/configs/mcp-server.enterprise.ncl 
b/schemas/platform/configs/mcp-server.enterprise.ncl index fbca2f5..3e0313d 100644 --- a/schemas/platform/configs/mcp-server.enterprise.ncl +++ b/schemas/platform/configs/mcp-server.enterprise.ncl @@ -1,7 +1,7 @@ # MCP Server Configuration - Enterprise Mode # Production HA with WebSocket clustering and advanced sampling -let mcp_server_schema = import "../schemas/mcp-server.ncl" in +let mcp_server_schema = import "../mcp-server.ncl" in let mcp_server_defaults = import "../defaults/mcp-server-defaults.ncl" in { diff --git a/schemas/platform/configs/mcp-server.multiuser.ncl b/schemas/platform/configs/mcp-server.multiuser.ncl index ced296d..b1e3504 100644 --- a/schemas/platform/configs/mcp-server.multiuser.ncl +++ b/schemas/platform/configs/mcp-server.multiuser.ncl @@ -1,7 +1,7 @@ # MCP Server Configuration - Multi-User Mode # Team collaboration with HTTP/WebSocket transport -let mcp_server_schema = import "../schemas/mcp-server.ncl" in +let mcp_server_schema = import "../mcp-server.ncl" in let mcp_server_defaults = import "../defaults/mcp-server-defaults.ncl" in { diff --git a/schemas/platform/configs/mcp-server.solo.ncl b/schemas/platform/configs/mcp-server.solo.ncl index 5688580..cf17dad 100644 --- a/schemas/platform/configs/mcp-server.solo.ncl +++ b/schemas/platform/configs/mcp-server.solo.ncl @@ -1,7 +1,7 @@ # MCP Server Configuration - Solo Mode # Single developer with local HTTP transport -let mcp_server_schema = import "../schemas/mcp-server.ncl" in +let mcp_server_schema = import "../mcp-server.ncl" in let mcp_server_defaults = import "../defaults/mcp-server-defaults.ncl" in { diff --git a/schemas/platform/configs/orchestrator.cicd.ncl b/schemas/platform/configs/orchestrator.cicd.ncl index c6a193f..a5ef5fe 100644 --- a/schemas/platform/configs/orchestrator.cicd.ncl +++ b/schemas/platform/configs/orchestrator.cicd.ncl @@ -1,7 +1,7 @@ # Orchestrator Configuration - CI/CD Mode # Ephemeral workspaces with API-centric design for pipeline integration -let 
orchestrator_schema = import "../schemas/orchestrator.ncl" in +let orchestrator_schema = import "../orchestrator.ncl" in let orchestrator_defaults = import "../defaults/orchestrator-defaults.ncl" in { diff --git a/schemas/platform/configs/orchestrator.enterprise.ncl b/schemas/platform/configs/orchestrator.enterprise.ncl index 0516fc5..763419c 100644 --- a/schemas/platform/configs/orchestrator.enterprise.ncl +++ b/schemas/platform/configs/orchestrator.enterprise.ncl @@ -1,7 +1,7 @@ # Orchestrator Configuration - Enterprise Mode # High-availability production deployment with compliance and disaster recovery -let orchestrator_schema = import "../schemas/orchestrator.ncl" in +let orchestrator_schema = import "../orchestrator.ncl" in let orchestrator_defaults = import "../defaults/orchestrator-defaults.ncl" in { diff --git a/schemas/platform/configs/orchestrator.multiuser.ncl b/schemas/platform/configs/orchestrator.multiuser.ncl index 7923265..d910bbb 100644 --- a/schemas/platform/configs/orchestrator.multiuser.ncl +++ b/schemas/platform/configs/orchestrator.multiuser.ncl @@ -1,7 +1,7 @@ # Orchestrator Configuration - Multi-User Mode # Team collaboration with PostgreSQL backend and multi-workspace support -let orchestrator_schema = import "../schemas/orchestrator.ncl" in +let orchestrator_schema = import "../orchestrator.ncl" in let orchestrator_defaults = import "../defaults/orchestrator-defaults.ncl" in { diff --git a/schemas/platform/configs/orchestrator.solo.ncl b/schemas/platform/configs/orchestrator.solo.ncl index 6eac6e3..300f9c7 100644 --- a/schemas/platform/configs/orchestrator.solo.ncl +++ b/schemas/platform/configs/orchestrator.solo.ncl @@ -2,7 +2,7 @@ # Single developer with local filesystem storage # Uses defaults from orchestrator-defaults.ncl with validation -let orchestrator_schema = import "../schemas/orchestrator.ncl" in +let orchestrator_schema = import "../orchestrator.ncl" in let orchestrator_defaults = import "../defaults/orchestrator-defaults.ncl" 
in { diff --git a/schemas/platform/configs/provisioning-daemon.cicd.ncl b/schemas/platform/configs/provisioning-daemon.cicd.ncl index af4aaeb..9f756a4 100644 --- a/schemas/platform/configs/provisioning-daemon.cicd.ncl +++ b/schemas/platform/configs/provisioning-daemon.cicd.ncl @@ -1,4 +1,4 @@ -let daemon_schema = import "../schemas/provisioning-daemon.ncl" in +let daemon_schema = import "../provisioning-daemon.ncl" in { daemon | daemon_schema.DaemonConfig = { daemon = { enabled = true, poll_interval = 10, max_workers = 8, }, diff --git a/schemas/platform/configs/provisioning-daemon.enterprise.ncl b/schemas/platform/configs/provisioning-daemon.enterprise.ncl index 57d67a7..58cbb5e 100644 --- a/schemas/platform/configs/provisioning-daemon.enterprise.ncl +++ b/schemas/platform/configs/provisioning-daemon.enterprise.ncl @@ -1,4 +1,4 @@ -let daemon_schema = import "../schemas/provisioning-daemon.ncl" in +let daemon_schema = import "../provisioning-daemon.ncl" in { daemon | daemon_schema.DaemonConfig = { daemon = { enabled = true, poll_interval = 30, max_workers = 16, }, diff --git a/schemas/platform/configs/provisioning-daemon.multiuser.ncl b/schemas/platform/configs/provisioning-daemon.multiuser.ncl index e7afdb4..ec847a6 100644 --- a/schemas/platform/configs/provisioning-daemon.multiuser.ncl +++ b/schemas/platform/configs/provisioning-daemon.multiuser.ncl @@ -1,4 +1,4 @@ -let daemon_schema = import "../schemas/provisioning-daemon.ncl" in +let daemon_schema = import "../provisioning-daemon.ncl" in { daemon | daemon_schema.DaemonConfig = { daemon = { enabled = true, poll_interval = 30, max_workers = 4, }, diff --git a/schemas/platform/configs/provisioning-daemon.solo.ncl b/schemas/platform/configs/provisioning-daemon.solo.ncl index a5b5d8d..03c1225 100644 --- a/schemas/platform/configs/provisioning-daemon.solo.ncl +++ b/schemas/platform/configs/provisioning-daemon.solo.ncl @@ -1,4 +1,4 @@ -let daemon_schema = import "../schemas/provisioning-daemon.ncl" in +let 
daemon_schema = import "../provisioning-daemon.ncl" in { daemon | daemon_schema.DaemonConfig = { daemon = { enabled = true, poll_interval = 60, max_workers = 2, }, diff --git a/schemas/platform/configs/rag.cicd.ncl b/schemas/platform/configs/rag.cicd.ncl index f89945e..85b504c 100644 --- a/schemas/platform/configs/rag.cicd.ncl +++ b/schemas/platform/configs/rag.cicd.ncl @@ -1,5 +1,5 @@ # RAG System - CI/CD Mode (Disabled) -let rag_schema = import "../schemas/rag.ncl" in +let rag_schema = import "../rag.ncl" in { rag | rag_schema.RagConfig = { rag = { enabled = false, }, diff --git a/schemas/platform/configs/rag.enterprise.ncl b/schemas/platform/configs/rag.enterprise.ncl index 7ad042a..8ad89b3 100644 --- a/schemas/platform/configs/rag.enterprise.ncl +++ b/schemas/platform/configs/rag.enterprise.ncl @@ -1,5 +1,5 @@ # RAG System - Enterprise Mode -let rag_schema = import "../schemas/rag.ncl" in +let rag_schema = import "../rag.ncl" in { rag | rag_schema.RagConfig = { rag = { enabled = true, }, diff --git a/schemas/platform/configs/rag.multiuser.ncl b/schemas/platform/configs/rag.multiuser.ncl index e8685a9..d473d54 100644 --- a/schemas/platform/configs/rag.multiuser.ncl +++ b/schemas/platform/configs/rag.multiuser.ncl @@ -1,5 +1,5 @@ # RAG System - Multiuser Mode -let rag_schema = import "../schemas/rag.ncl" in +let rag_schema = import "../rag.ncl" in { rag | rag_schema.RagConfig = { rag = { enabled = true, }, diff --git a/schemas/platform/configs/rag.solo.ncl b/schemas/platform/configs/rag.solo.ncl index 583a76f..915acb0 100644 --- a/schemas/platform/configs/rag.solo.ncl +++ b/schemas/platform/configs/rag.solo.ncl @@ -1,5 +1,5 @@ # RAG System - Solo Mode -let rag_schema = import "../schemas/rag.ncl" in +let rag_schema = import "../rag.ncl" in { rag | rag_schema.RagConfig = { rag = { enabled = true, }, diff --git a/schemas/platform/configs/vault-service.cicd.ncl b/schemas/platform/configs/vault-service.cicd.ncl index 9a3b7a2..002aa2f 100644 --- 
a/schemas/platform/configs/vault-service.cicd.ncl +++ b/schemas/platform/configs/vault-service.cicd.ncl @@ -1,7 +1,7 @@ # Vault Service - CI/CD Mode Configuration # Pipeline integration, ephemeral in-memory storage -let vault_schema = import "../schemas/vault-service.ncl" in +let vault_schema = import "../vault-service.ncl" in { vault | vault_schema.VaultServiceConfig = { diff --git a/schemas/platform/configs/vault-service.enterprise.ncl b/schemas/platform/configs/vault-service.enterprise.ncl index 58af380..6bd5973 100644 --- a/schemas/platform/configs/vault-service.enterprise.ncl +++ b/schemas/platform/configs/vault-service.enterprise.ncl @@ -1,7 +1,7 @@ # Vault Service - Enterprise Mode Configuration # Production HA, etcd cluster backend, full security -let vault_schema = import "../schemas/vault-service.ncl" in +let vault_schema = import "../vault-service.ncl" in { vault | vault_schema.VaultServiceConfig = { diff --git a/schemas/platform/configs/vault-service.multiuser.ncl b/schemas/platform/configs/vault-service.multiuser.ncl index 9f9304c..9baaa62 100644 --- a/schemas/platform/configs/vault-service.multiuser.ncl +++ b/schemas/platform/configs/vault-service.multiuser.ncl @@ -1,7 +1,7 @@ # Vault Service - Multiuser Mode Configuration # Team development, shared SurrealDB backend -let vault_schema = import "../schemas/vault-service.ncl" in +let vault_schema = import "../vault-service.ncl" in { vault | vault_schema.VaultServiceConfig = { diff --git a/schemas/platform/configs/vault-service.solo.ncl b/schemas/platform/configs/vault-service.solo.ncl index fb643a4..5bd763d 100644 --- a/schemas/platform/configs/vault-service.solo.ncl +++ b/schemas/platform/configs/vault-service.solo.ncl @@ -1,7 +1,7 @@ # Vault Service - Solo Mode Configuration # Single developer, embedded storage, minimal resources -let vault_schema = import "../schemas/vault-service.ncl" in +let vault_schema = import "../vault-service.ncl" in { vault | vault_schema.VaultServiceConfig = { diff --git 
a/schemas/platform/constraints/constraints.toml b/schemas/platform/constraints/constraints.toml index 205ed3c..0adf897 100644 --- a/schemas/platform/constraints/constraints.toml +++ b/schemas/platform/constraints/constraints.toml @@ -129,6 +129,11 @@ description = "Valid port range (avoid system ports < 1024)" max = 65535 min = 1024 +[common.server.port_high] +description = "Platform service ports (>= 9000)" +max = 65535 +min = 9000 + [common.server.workers] description = "HTTP server worker thread count" max = 32 diff --git a/schemas/platform/schemas/control-center.ncl b/schemas/platform/control-center.ncl similarity index 79% rename from schemas/platform/schemas/control-center.ncl rename to schemas/platform/control-center.ncl index 2dea6a9..95977eb 100644 --- a/schemas/platform/schemas/control-center.ncl +++ b/schemas/platform/control-center.ncl @@ -1,26 +1,27 @@ # Control Center Service Schema # Policy management, RBAC, and compliance configuration -let workspace_schema = import "./common/workspace.ncl" in -let server_schema = import "./common/server.ncl" in -let database_schema = import "./common/database.ncl" in -let security_schema = import "./common/security.ncl" in -let monitoring_schema = import "./common/monitoring.ncl" in -let logging_schema = import "./common/logging.ncl" in +let workspace_schema = import "schemas/platform/common/workspace.ncl" in +let server_schema = import "schemas/platform/common/server.ncl" in +let database_schema = import "schemas/platform/common/database.ncl" in +let security_schema = import "schemas/platform/common/security.ncl" in +let monitoring_schema = import "schemas/platform/common/monitoring.ncl" in +let logging_schema = import "schemas/platform/common/logging.ncl" in +let docker_build_schema = import "schemas/platform/docker-build.ncl" in { ControlCenterConfig = { # Workspace configuration - workspace | workspace_schema.WorkspaceConfig, + workspace | workspace_schema.WorkspaceConfig | optional, - # HTTP server settings - 
server | server_schema.ServerConfig, + # HTTP server settings (port must be >= 9000 for control-center) + server | server_schema.ServerConfigHighPort | optional, # Database configuration (policy storage) - database | database_schema.DatabaseConfig, + database | database_schema.DatabaseConfig | optional, # Security configuration (JWT, RBAC, encryption, MFA) - security | security_schema.SecurityConfig, + security | security_schema.SecurityConfig | optional, # Policy Engine Configuration policy | { @@ -43,7 +44,7 @@ let logging_schema = import "./common/logging.ncl" in } | optional, # Policy conflict resolution - }, + } | optional, # RBAC Configuration rbac | { @@ -68,7 +69,7 @@ let logging_schema = import "./common/logging.ncl" in # Role-based attribute (ABAC) attribute_based | Bool | default = false, - }, + } | optional, # User Management users | { @@ -91,7 +92,7 @@ let logging_schema = import "./common/logging.ncl" in # User audit trail audit_enabled | Bool | default = false, - }, + } | optional, # Audit Logging Configuration audit | { @@ -161,5 +162,8 @@ let logging_schema = import "./common/logging.ncl" in # Logging configuration logging | logging_schema.LoggingConfig | optional, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, }, } diff --git a/schemas/platform/defaults/ai-service-defaults.ncl b/schemas/platform/defaults/ai-service-defaults.ncl index 0a87f42..ee56d44 100644 --- a/schemas/platform/defaults/ai-service-defaults.ncl +++ b/schemas/platform/defaults/ai-service-defaults.ncl @@ -1,12 +1,44 @@ -# AI Service Defaults -let ai_schema = import "../schemas/ai-service.ncl" in +# AI Service Default Configuration +# Pattern: 3-Layer Config (flat notation + | default) + +let ai_schema = import "../ai-service.ncl" in + +let base_ai_service = { + # Server Configuration + server.host | default = "127.0.0.1", + server.port | default = 9092, + server.workers | default = 4, + + # RAG Integration + rag.enabled | default = 
false, + rag.rag_service_url | default = "http://localhost:9092", + rag.timeout | default = 30000, + + # MCP Integration + mcp.enabled | default = false, + mcp.mcp_service_url | default = "http://localhost:3000", + mcp.timeout | default = 30000, + + # DAG Workflow Configuration + dag.max_concurrent_tasks | default = 3, + dag.task_timeout | default = 300000, + dag.retry_attempts | default = 3, + + # Monitoring Configuration + monitoring.enabled | default = false, + + # Logging Configuration + logging.level | default = "info", + + # Docker Build Configuration (no | default to override schema) + build.package = "ai-service", + build.binary = "provisioning-ai-service", + build.port = 9092, + build.features = [], + build.extra_runtime_pkgs = [], +} in + { - ai_service | ai_schema.AiServiceConfig = { - server = { host = "127.0.0.1", port = 8082, workers = 2, }, - rag = { enabled = false, rag_service_url = "http://localhost:8083", timeout = 30000, }, - mcp = { enabled = false, mcp_service_url = "http://localhost:8084", timeout = 30000, }, - dag = { max_concurrent_tasks = 3, task_timeout = 300000, retry_attempts = 3, }, - monitoring = { enabled = false, }, - logging = { level = "info", }, - }, + # Base configuration with all defaults + ai_service = base_ai_service | ai_schema.AiServiceConfig, } diff --git a/schemas/platform/defaults/common/database-defaults.ncl b/schemas/platform/defaults/common/database-defaults.ncl index b2b4a46..8789446 100644 --- a/schemas/platform/defaults/common/database-defaults.ncl +++ b/schemas/platform/defaults/common/database-defaults.ncl @@ -1,7 +1,7 @@ # Database Default Values # Common defaults for database configuration -let database_schema = import "../../schemas/common/database.ncl" in +let database_schema = import "../../common/database.ncl" in { database | database_schema.DatabaseConfig = { diff --git a/schemas/platform/defaults/common/external-services-defaults.ncl b/schemas/platform/defaults/common/external-services-defaults.ncl new 
file mode 100644 index 0000000..4a1bbb9 --- /dev/null +++ b/schemas/platform/defaults/common/external-services-defaults.ncl @@ -0,0 +1,169 @@ +# External Services Default Configuration +# Per-deployment-mode defaults for database, OCI registry, Git sources, and cache + +let es_schema = import "../../common/external-services.ncl" in + +{ + # Solo/Development Mode: No external infrastructure + # - Filesystem storage for orchestrator + # - Local filesystem for extensions (no OCI) + # - Local directory cache + solo | es_schema.ExternalServicesConfig = { + database = { + backend = "filesystem", + path = "~/.provisioning/data/orchestrator", + retry = true, + }, + # Solo mode doesn't configure OCI registries or Git sources + # Extensions are discovered and loaded from local filesystem + oci_registries = [], + git_sources = [], + extension_path = { + path = "~/.provisioning/extensions", + writable = true, + }, + cache = { + mode = "local", + path = "~/.provisioning/oci-cache", + }, + }, + + # Multiuser/Team Mode: Local Docker services + # - SurrealDB server running in local Docker + # - Zot OCI registry in local Docker + # - Forgejo Git source in local Docker + # - Local directory cache + multiuser | es_schema.ExternalServicesConfig = { + database = { + backend = "surrealdb_server", + connection_string = "ws://localhost:8000", + namespace = "provisioning", + database = "main", + credentials = { + username = "root", + password = "root", + }, + retry = true, + }, + oci_registries = [ + { + id = "local-zot", + registry = "localhost:5000", + namespace = "provisioning", + verify_ssl = false, + }, + ], + git_sources = [ + { + id = "local-forgejo", + provider = "forgejo", + url = "http://localhost:3000", + organization = "provisioning", + token_path = "~/.provisioning/secrets/forgejo-token.txt", + verify_ssl = false, + }, + ], + cache = { + mode = "local", + path = "~/.provisioning/oci-cache", + }, + }, + + # CI/CD Mode: Containerized, temporary infrastructure + # - SurrealDB 
server (temporary) + # - Zot OCI registry (temporary) + # - Forgejo Git source (temporary or external) + # - Local cache for CI runners + cicd | es_schema.ExternalServicesConfig = { + database = { + backend = "surrealdb_server", + connection_string = "ws://localhost:8000", + namespace = "provisioning", + database = "cicd", + credentials = { + username = "cicd", + password = "cicd_temp", + }, + retry = true, + }, + oci_registries = [ + { + id = "ci-zot", + registry = "localhost:5000", + namespace = "provisioning/ci", + verify_ssl = false, + }, + ], + git_sources = [ + { + id = "ci-forgejo", + provider = "forgejo", + url = "http://localhost:3000", + organization = "provisioning-ci", + token_path = "/tmp/forgejo-token.txt", + verify_ssl = false, + }, + ], + cache = { + mode = "local", + path = "/tmp/provisioning-cache", + }, + }, + + # Enterprise/Production Mode: Remote, high-availability services + # - SurrealDB cluster (remote, replicated) + # - Zot OCI registry with failover + # - Forgejo + GitHub for source diversity + # - Redis for distributed cache + # NOTE: These are placeholder values. Users MUST override with actual infrastructure. 
+ enterprise | es_schema.ExternalServicesConfig = { + database = { + backend = "surrealdb_server", + connection_string = "ws://surrealdb-primary.internal:8000", + namespace = "provisioning", + database = "production", + credentials = { + username = "provisioning", + password = "REPLACE_WITH_SECRET_FROM_VAULT", + }, + retry = true, + max_retries = "5", + }, + oci_registries = [ + { + id = "primary-zot", + registry = "zot-primary.internal:5000", + namespace = "provisioning/extensions", + verify_ssl = true, + }, + { + id = "secondary-harbor", + registry = "harbor-backup.internal:443", + namespace = "provisioning", + auth_token_path = "/etc/secrets/harbor-token.txt", + verify_ssl = true, + }, + ], + git_sources = [ + { + id = "primary-forgejo", + provider = "forgejo", + url = "https://forge.internal:3000", + organization = "provisioning", + token_path = "/etc/secrets/forgejo-token.txt", + verify_ssl = true, + }, + { + id = "company-github", + provider = "github", + organization = "company-provisioning", + token_path = "/etc/secrets/github-token.txt", + verify_ssl = true, + }, + ], + cache = { + mode = "remote", + url = "redis://redis-primary.internal:6379", + }, + }, +} diff --git a/schemas/platform/defaults/common/logging-defaults.ncl b/schemas/platform/defaults/common/logging-defaults.ncl index 9c36c25..dbaee57 100644 --- a/schemas/platform/defaults/common/logging-defaults.ncl +++ b/schemas/platform/defaults/common/logging-defaults.ncl @@ -1,15 +1,15 @@ # Logging Default Values # Common defaults for log level, format, and output -let logging_schema = import "../../schemas/common/logging.ncl" in +let logging_schema = import "../../common/logging.ncl" in { logging | logging_schema.LoggingConfig = { # Default log level: info - level = "&", + level = 'info, # Default format: text (human-readable) - format = "&", + format = 'text, # Default output: stdout outputs = ["stdout"], diff --git a/schemas/platform/defaults/common/monitoring-defaults.ncl 
b/schemas/platform/defaults/common/monitoring-defaults.ncl index a509bc7..a3eab56 100644 --- a/schemas/platform/defaults/common/monitoring-defaults.ncl +++ b/schemas/platform/defaults/common/monitoring-defaults.ncl @@ -1,7 +1,7 @@ # Monitoring Default Values # Common defaults for metrics, health checks, and observability -let monitoring_schema = import "../../schemas/common/monitoring.ncl" in +let monitoring_schema = import "../../common/monitoring.ncl" in { monitoring | monitoring_schema.MonitoringConfig = { diff --git a/schemas/platform/defaults/common/observability-defaults.ncl b/schemas/platform/defaults/common/observability-defaults.ncl new file mode 100644 index 0000000..877f3cb --- /dev/null +++ b/schemas/platform/defaults/common/observability-defaults.ncl @@ -0,0 +1,72 @@ +# Observability Default Configuration +# Base defaults for logging, metrics, health checks, and audit + +let observability_schema = import "../../common/observability.ncl" in + +{ + observability | observability_schema.ObservabilityConfig = { + # Observability enabled globally + enabled = true, + + # Logging Defaults + logging = { + enabled = true, + level = "info", + format = "json", + output = { + destination = "stdout", + }, + fields = { + service_name = true, + timestamp = true, + level = true, + caller = false, + spans = true, + }, + sampling = { + enabled = false, + }, + }, + + # Metrics Defaults + metrics = { + enabled = true, + exporter = "prometheus", + prometheus_path = "/metrics", + interval = 60, + histogram_buckets = [1, 5, 10, 50, 100, 500, 1000, 5000], + }, + + # Health Check Defaults + health = { + enabled = true, + port = 8081, + liveness_path = "/healthz", + readiness_path = "/ready", + startup_path = "/startup", + interval = 10, + timeout = 5000, + success_threshold = 1, + failure_threshold = 3, + initial_delay = 0, + }, + + # Distributed Tracing Defaults + tracing = { + enabled = false, + backend = "otlp", + sampler = "parentbased", + }, + + # Audit Logging Defaults + 
audit = { + enabled = true, + storage = "file", + retention_days = 90, + include_pii = false, + export_formats = ["jsonl"], + track_workspace_operations = true, + workspace_operations = ["create", "delete", "update", "switch", "list", "sync"], + }, + }, +} diff --git a/schemas/platform/defaults/common/security-defaults.ncl b/schemas/platform/defaults/common/security-defaults.ncl index 77474c2..5e94e5f 100644 --- a/schemas/platform/defaults/common/security-defaults.ncl +++ b/schemas/platform/defaults/common/security-defaults.ncl @@ -1,7 +1,7 @@ # Security Default Values # Common defaults for authentication, RBAC, encryption -let security_schema = import "../../schemas/common/security.ncl" in +let security_schema = import "../../common/security.ncl" in { security | security_schema.SecurityConfig = { diff --git a/schemas/platform/defaults/common/server-defaults.ncl b/schemas/platform/defaults/common/server-defaults.ncl index c3b1a1f..fbee6d7 100644 --- a/schemas/platform/defaults/common/server-defaults.ncl +++ b/schemas/platform/defaults/common/server-defaults.ncl @@ -1,7 +1,7 @@ # HTTP Server Default Values # Common defaults for HTTP server settings across all services -let server_schema = import "../../schemas/common/server.ncl" in +let server_schema = import "../../common/server.ncl" in { server | server_schema.ServerConfig = { diff --git a/schemas/platform/defaults/control-center-defaults.ncl b/schemas/platform/defaults/control-center-defaults.ncl index 6dd41b0..9d0852e 100644 --- a/schemas/platform/defaults/control-center-defaults.ncl +++ b/schemas/platform/defaults/control-center-defaults.ncl @@ -1,166 +1,141 @@ # Control Center Service Default Configuration -# Policy management, RBAC, and compliance defaults +# Pattern: 3-Layer Config (flat notation + | default) -let control_center_schema = import "../schemas/control-center.ncl" in +let control_center_schema = import "../control-center.ncl" in let monitoring_defaults = import "./common/monitoring-defaults.ncl" 
in let logging_defaults = import "./common/logging-defaults.ncl" in +let base_control_center = { + # Workspace Configuration + workspace.name | default = "default", + workspace.path | default = "/var/lib/provisioning/control-center", + workspace.enabled | default = true, + workspace.multi_workspace | default = false, + + # HTTP Server Settings + server.host | default = "127.0.0.1", + server.port | default = 9091, + server.workers | default = 4, + server.keep_alive | default = 75, + server.max_connections | default = 100, + server.request_timeout | default = 30000, + server.graceful_shutdown | default = true, + server.shutdown_timeout | default = 30, + + # Database Configuration + database.backend | default = "rocksdb", + database.path | default = "/var/lib/provisioning/control-center/data", + database.pool_size | default = 10, + database.timeout | default = 30, + database.retry | default = true, + database.max_retries | default = "3", + + # Security - JWT Configuration + security.jwt.issuer | default = "control-center", + security.jwt.audience | default = "provisioning", + security.jwt.expiration | default = 3600, + security.jwt.refresh_expiration | default = 86400, + security.jwt.secret | default = "change_me_in_production", + security.jwt.algorithm | default = "HS256", + + # Security - RBAC Configuration + security.rbac.enabled | default = true, + security.rbac.inheritance | default = true, + security.rbac.default_role | default = "user", + + # Security - MFA Configuration + security.mfa.required | default = false, + security.mfa.methods | default = ["totp"], + security.mfa.max_attempts | default = "5", + security.mfa.lockout_duration | default = 15, + + # Security - Rate Limiting Configuration + security.rate_limiting.enabled | default = false, + security.rate_limiting.max_requests | default = "1000", + security.rate_limiting.window_seconds | default = 60, + + # Security - TLS Configuration + security.tls.enabled | default = false, + + # Security - CORS 
Configuration + security.cors.enabled | default = false, + + # Security - Session Configuration + security.session.max_duration | default = 86400, + security.session.idle_timeout | default = 3600, + security.session.tracking | default = false, + + # Policy Engine Configuration + policy.enabled | default = true, + policy.cache.enabled | default = true, + policy.cache.ttl | default = 3600, + policy.cache.max_policies | default = 10000, + policy.versioning.enabled | default = true, + policy.versioning.max_versions | default = 20, + + # RBAC Configuration + rbac.enabled | default = true, + rbac.hierarchy | default = true, + rbac.dynamic_roles | default = false, + rbac.default_role | default = "user", + rbac.roles.admin | default = true, + rbac.roles.operator | default = true, + rbac.roles.viewer | default = true, + rbac.attribute_based | default = false, + + # User Management Configuration + users.enabled | default = true, + users.registration.enabled | default = true, + users.registration.requires_approval | default = false, + users.registration.auto_assign_role | default = "user", + users.sessions.max_active | default = 5, + users.sessions.idle_timeout | default = 3600, + users.sessions.absolute_timeout | default = 86400, + users.audit_enabled | default = false, + + # Audit Logging Configuration + audit.enabled | default = false, + audit.storage.retention_days | default = 90, + audit.storage.immutable | default = false, + audit.redact_sensitive | default = true, + + # Compliance Configuration + compliance.enabled | default = false, + compliance.validation.enabled | default = false, + compliance.validation.interval_hours | default = 24, + compliance.data_retention.policy_years | default = 7, + compliance.data_retention.audit_log_days | default = 2555, + compliance.encryption_required | default = false, + + # Integrations Configuration + integrations.ldap.enabled | default = false, + integrations.oauth2.enabled | default = false, + integrations.webhooks.enabled | 
default = false, + + # Monitoring Configuration (from common defaults) + monitoring.enabled | default = monitoring_defaults.monitoring.enabled, + monitoring.metrics.enabled | default = monitoring_defaults.monitoring.metrics.enabled, + monitoring.metrics.interval | default = monitoring_defaults.monitoring.metrics.interval, + monitoring.health_check.enabled | default = monitoring_defaults.monitoring.health_check.enabled, + monitoring.health_check.interval | default = monitoring_defaults.monitoring.health_check.interval, + monitoring.resources.cpu | default = monitoring_defaults.monitoring.resources.cpu, + monitoring.resources.memory | default = monitoring_defaults.monitoring.resources.memory, + monitoring.resources.alert_threshold | default = monitoring_defaults.monitoring.resources.alert_threshold, + + # Logging Configuration (from common defaults) + logging.level | default = logging_defaults.logging.level, + logging.format | default = logging_defaults.logging.format, + + # Docker Build Configuration (no | default to override schema) + build.package = "control-center", + build.binary = "provisioning-control-center", + build.port = 9091, + build.features = ["all"], + build.extra_runtime_pkgs = [], + build.config_file = "config.defaults.toml", +} in + { - control_center | control_center_schema.ControlCenterConfig = { - # Workspace Configuration - workspace = { - name = "default", - path = "/var/lib/provisioning/control-center", - enabled = true, - multi_workspace = false, - }, - - # HTTP Server Settings - server = { - host = "127.0.0.1", - port = 8080, - workers = 4, - keep_alive = 75, - max_connections = 100, - request_timeout = 30000, - graceful_shutdown = true, - shutdown_timeout = 30, - }, - - # Database Configuration - database = { - backend = "rocksdb", - path = "/var/lib/provisioning/control-center/data", - pool_size = 10, - timeout = 30, - retry = true, - max_retries = "3", - }, - - # Security Configuration - security = { - jwt = { - issuer = "control-center", 
- audience = "provisioning", - expiration = 3600, - refresh_expiration = 86400, - secret = "change_me_in_production", - algorithm = "HS256", - }, - rbac = { - enabled = true, - inheritance = true, - default_role = "user", - }, - mfa = { - required = false, - methods = ["totp"], - max_attempts = "5", - lockout_duration = 15, - }, - rate_limiting = { - enabled = false, - max_requests = "1000", - window_seconds = 60, - }, - tls = { - enabled = false, - }, - cors = { - enabled = false, - }, - session = { - max_duration = 86400, - idle_timeout = 3600, - tracking = false, - }, - }, - - # Policy Engine Configuration - policy = { - enabled = true, - cache = { - enabled = true, - ttl = 3600, - max_policies = 10000, - }, - versioning = { - enabled = true, - max_versions = 20, - }, - }, - - # RBAC Configuration - rbac = { - enabled = true, - hierarchy = true, - dynamic_roles = false, - default_role = "user", - roles = { - admin = true, - operator = true, - viewer = true, - }, - attribute_based = false, - }, - - # User Management - users = { - enabled = true, - registration = { - enabled = true, - requires_approval = false, - auto_assign_role = "user", - }, - sessions = { - max_active = 5, - idle_timeout = 3600, - absolute_timeout = 86400, - }, - audit_enabled = false, - }, - - # Audit Logging - audit = { - enabled = false, - storage = { - retention_days = 90, - immutable = false, - }, - redact_sensitive = true, - }, - - # Compliance Configuration - compliance = { - enabled = false, - validation = { - enabled = false, - interval_hours = 24, - }, - data_retention = { - policy_years = 7, - audit_log_days = 2555, - }, - encryption_required = false, - }, - - # Integrations - integrations = { - ldap = { - enabled = false, - }, - oauth2 = { - enabled = false, - }, - webhooks = { - enabled = false, - }, - }, - - # Monitoring Configuration - monitoring = monitoring_defaults.monitoring, - - # Logging Configuration - logging = logging_defaults.logging, - }, + # Base configuration with 
all defaults + control_center = base_control_center | control_center_schema.ControlCenterConfig, } diff --git a/schemas/platform/defaults/deployment/cicd-defaults.ncl b/schemas/platform/defaults/deployment/cicd-defaults.ncl index 93e8042..ab82b4a 100644 --- a/schemas/platform/defaults/deployment/cicd-defaults.ncl +++ b/schemas/platform/defaults/deployment/cicd-defaults.ncl @@ -12,40 +12,64 @@ extensions.max_concurrent = 50, performance.max_memory = 16000, performance.connection_pool_size = 250, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, control_center = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, vault_service = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, mcp_server = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, extension_registry = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, rag = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, ai_service = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, provisioning_daemon = { server.workers = 8, server.max_connections = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, + build.sccache.enabled = true, }, } diff --git a/schemas/platform/defaults/deployment/enterprise-defaults.ncl 
b/schemas/platform/defaults/deployment/enterprise-defaults.ncl index 96757c3..f8ad354 100644 --- a/schemas/platform/defaults/deployment/enterprise-defaults.ncl +++ b/schemas/platform/defaults/deployment/enterprise-defaults.ncl @@ -12,40 +12,56 @@ extensions.max_concurrent = 100, performance.max_memory = 32000, performance.connection_pool_size = 500, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, control_center = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, vault_service = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, mcp_server = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, extension_registry = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, rag = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, ai_service = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, provisioning_daemon = { server.workers = 16, server.max_connections = 1000, + build.buildkit.parallel_jobs = 16, + build.buildkit.cache_mode = 'registry, }, } diff --git a/schemas/platform/defaults/deployment/observability-cicd-overrides.ncl b/schemas/platform/defaults/deployment/observability-cicd-overrides.ncl new file mode 100644 index 0000000..0ab2a80 --- /dev/null +++ b/schemas/platform/defaults/deployment/observability-cicd-overrides.ncl @@ -0,0 +1,67 @@ +# Observability Configuration for CI/CD Mode +# Optimized for automated testing with minimal overhead and JSON output + +{ + # CI/CD mode observability overrides + observability = { + logging = { + # JSON 
format for log aggregation in CI/CD pipelines + format = "json", + + # Warning level to reduce noise (focus on problems) + level = "warn", + + # Stdout only (captured by CI/CD runner) + output = { + destination = "stdout", + }, + + # Minimal fields to reduce log size + fields = { + service_name = true, + timestamp = true, + level = true, + caller = false, # Disabled in CI + spans = false, # Disabled in CI + }, + + # Sample 50% of logs to manage CI/CD log size + sampling = { + enabled = true, + rate = 0.5, + }, + }, + + metrics = { + # Metrics disabled in CI/CD (not needed for testing) + enabled = false, + }, + + health = { + # Health checks enabled, with a longer interval than the base default + enabled = true, + interval = 60, + }, + + tracing = { + # Tracing disabled in CI/CD (overhead not justified) + enabled = false, + }, + + audit = { + # Audit logs stored as files under a temporary directory + storage = "file", + log_directory = "/tmp/provisioning/audit", + retention_days = 1, # Keep only during test run + + # No PII in CI/CD logs + include_pii = false, + + # Only JSONL export (lightweight) + export_formats = ["jsonl"], + + # Track workspace operations for test verification + track_workspace_operations = true, + }, + }, +} diff --git a/schemas/platform/defaults/deployment/observability-production-overrides.ncl b/schemas/platform/defaults/deployment/observability-production-overrides.ncl new file mode 100644 index 0000000..f8ac929 --- /dev/null +++ b/schemas/platform/defaults/deployment/observability-production-overrides.ncl @@ -0,0 +1,85 @@ +# Observability Configuration for Production Mode +# Optimized for reliability with JSON logging, metrics enabled, and Loki aggregation + +{ + # Production mode observability overrides + observability = { + logging = { + # JSON structured logs for Loki/Splunk ingestion + format = "json", + + # Info level for production (less verbose) + level = "info", + + # Output to Loki for centralized aggregation + output = { + destination = "loki", + 
loki_endpoint = "http://loki:3100", + loki_labels = { + environment = "production", + cluster = "prod-cluster", + }, + }, + + # Enable caller info for troubleshooting + fields = { + caller = true, + spans = true, + }, + + # Log sampling for high-throughput services + sampling = { + enabled = false, # Disable sampling; keep all logs + # rate = 0.1, # Uncomment to sample 10% of logs if needed + }, + }, + + metrics = { + # Metrics fully enabled in production + enabled = true, + interval = 30, # More frequent collection + + # Push to OpenTelemetry Collector + exporter = "otlp", + otlp_endpoint = "http://otel-collector:4317", + otlp_interval = 30, + }, + + health = { + # More frequent health checks in production + interval = 5, + failure_threshold = 2, + success_threshold = 2, + }, + + tracing = { + # Distributed tracing enabled for debugging + enabled = true, + backend = "otlp", + otlp_endpoint = "http://otel-collector:4317", + # Sample 10% of traces (reduce overhead) + sampler = "parentbased", + sampling_rate = 0.1, + environment = "production", + }, + + audit = { + # Audit logs to filesystem + SIEM export + storage = "file", + log_directory = "/var/log/provisioning/audit", + retention_days = 365, # Keep 1 year for compliance + + # Enable PII tracking for compliance audits + include_pii = true, + + # Export to both JSONL and Splunk + export_formats = ["jsonl", "splunk"], + + # Push audit logs to Splunk HEC + siem_endpoint = "https://splunk.example.com:8088", + + # Track all workspace operations + track_workspace_operations = true, + }, + }, +} diff --git a/schemas/platform/defaults/deployment/observability-solo-overrides.ncl b/schemas/platform/defaults/deployment/observability-solo-overrides.ncl new file mode 100644 index 0000000..17960f6 --- /dev/null +++ b/schemas/platform/defaults/deployment/observability-solo-overrides.ncl @@ -0,0 +1,40 @@ +# Observability Configuration for Solo Development Mode +# Optimized for local development with pretty-printed logs and 
metrics disabled + +{ + # Development mode observability overrides + observability = { + logging = { + # Pretty-printed logs for local development (easier to read) + format = "pretty", + + # Debug level for detailed troubleshooting + level = "debug", + + # Optional: Add filter for verbose modules + # filter = "orchestrator=debug,tower_http=debug,batch=debug", + }, + + metrics = { + # Metrics disabled in solo mode (lighter footprint) + enabled = false, + }, + + health = { + # Health checks enabled but with longer interval + interval = 30, + }, + + tracing = { + # Tracing disabled by default (development) + enabled = false, + }, + + audit = { + # Audit logs to local filesystem + storage = "file", + log_directory = "/var/lib/provisioning/audit", + retention_days = 7, # Keep 7 days in development + }, + }, +} diff --git a/schemas/platform/defaults/deployment/solo-defaults.ncl b/schemas/platform/defaults/deployment/solo-defaults.ncl index b33ccc2..6cd93df 100644 --- a/schemas/platform/defaults/deployment/solo-defaults.ncl +++ b/schemas/platform/defaults/deployment/solo-defaults.ncl @@ -8,42 +8,58 @@ queue.max_concurrent_tasks = 2, batch.parallel_limit = 2, extensions.max_concurrent = 2, - performance.max_memory = 1000, - performance.connection_pool_size = 10, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, control_center = { server.workers = 2, - server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, vault_service = { server.workers = 2, server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, mcp_server = { server.workers = 2, server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, extension_registry = { server.workers = 2, - server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, - rag = { + rag_config = { server.workers = 2, 
server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, ai_service = { server.workers = 2, - server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, }, provisioning_daemon = { server.workers = 2, server.max_connections = 50, + build.buildkit.parallel_jobs = 2, + build.buildkit.cache_mode = 'local, + }, + + nu_daemon = { + server.workers = 2, + daemon.max_workers = 2, }, } diff --git a/schemas/platform/defaults/extension-registry-defaults.ncl b/schemas/platform/defaults/extension-registry-defaults.ncl index cc641c5..ed5919c 100644 --- a/schemas/platform/defaults/extension-registry-defaults.ncl +++ b/schemas/platform/defaults/extension-registry-defaults.ncl @@ -1,36 +1,42 @@ # Extension Registry Default Configuration -# Minimal defaults when no configuration is provided +# Pattern: 3-Layer Config (flat notation + | default) -let registry_schema = import "../schemas/extension-registry.ncl" in +let registry_schema = import "../extension-registry.ncl" in + +let base_extension_registry = { + # Server Configuration + server.host = "127.0.0.1", + server.port | default = 9094, + server.workers | default = 4, + server.enable_cors | default = false, + server.enable_compression | default = true, + + # Default single Gitea source (nested - arrays can't be flattened) + sources.gitea = [ + { + url = "http://localhost:3000", + organization = "provisioning", + token_path = "/etc/secrets/gitea-token.txt", + timeout_seconds = 30, + verify_ssl = false, + }, + ], + + # Cache Configuration + cache.capacity | default = 1000, + cache.ttl_seconds | default = 300, + cache.enable_metadata_cache | default = true, + cache.enable_list_cache | default = true, + + # Docker Build Configuration (no | default to override schema) + build.package = "extension-registry", + build.binary = "extension-registry", + build.port = 9094, + build.features = [], + build.extra_runtime_pkgs = [], +} in { - 
extension_registry | registry_schema.RegistryConfig = { - server = { - host = "127.0.0.1", - port = 8081, - workers = 4, - enable_cors = false, - enable_compression = true, - }, - - # Default single Gitea source (auto-migrated from legacy format if needed) - sources = { - gitea = [ - { - url = "http://localhost:3000", - organization = "provisioning", - token_path = "/etc/secrets/gitea-token.txt", - timeout_seconds = 30, - verify_ssl = false, - }, - ], - }, - - cache = { - capacity = 1000, - ttl_seconds = 300, - enable_metadata_cache = true, - enable_list_cache = true, - }, - }, + # Base configuration with all defaults + extension_registry = base_extension_registry | registry_schema.RegistryConfig, } diff --git a/schemas/platform/defaults/mcp-server-defaults.ncl b/schemas/platform/defaults/mcp-server-defaults.ncl index 95c50ee..6685442 100644 --- a/schemas/platform/defaults/mcp-server-defaults.ncl +++ b/schemas/platform/defaults/mcp-server-defaults.ncl @@ -1,138 +1,111 @@ # MCP Server Service Default Configuration -# Model Context Protocol with tools, prompts, resources, sampling +# Pattern: 3-Layer Config (flat notation + | default) -let mcp_server_schema = import "../schemas/mcp-server.ncl" in +let mcp_server_schema = import "../mcp-server.ncl" in let monitoring_defaults = import "./common/monitoring-defaults.ncl" in let logging_defaults = import "./common/logging-defaults.ncl" in +let base_mcp_server = { + # Workspace Configuration + workspace.name | default = "default", + workspace.path | default = "/var/lib/provisioning/mcp-server", + workspace.enabled | default = true, + workspace.multi_workspace | default = false, + + # HTTP Server Settings + server.host | default = "127.0.0.1", + server.port | default = 9093, + server.workers | default = 4, + server.keep_alive | default = 75, + server.max_connections | default = 100, + server.request_timeout | default = 30000, + server.graceful_shutdown | default = true, + server.shutdown_timeout | default = 30, + + # MCP 
Protocol Configuration + protocol.version | default = "1.0", + protocol.transport.endpoint | default = "http://localhost:9093", + protocol.transport.timeout | default = 30000, + + # Tools Configuration + tools.enabled | default = true, + tools.max_concurrent | default = 5, + tools.timeout | default = 30000, + tools.validation.enabled | default = true, + tools.validation.strict_mode | default = false, + tools.cache.enabled = true, + tools.cache.ttl | default = 3600, + + # Prompts Configuration + prompts.enabled | default = true, + prompts.max_templates | default = 100, + prompts.cache.enabled = true, + prompts.cache.ttl | default = 3600, + prompts.versioning.enabled | default = false, + prompts.versioning.max_versions | default = 10, + + # Resources Configuration + resources.enabled | default = true, + resources.max_size | default = 104857600, + resources.cache.enabled = true, + resources.cache.max_size_mb | default = 512, + resources.cache.ttl | default = 3600, + resources.validation.enabled | default = true, + resources.validation.max_depth | default = 10, + + # Sampling Configuration + sampling.enabled | default = false, + sampling.max_tokens | default = 4096, + sampling.temperature | default = 0.7, + sampling.cache.enabled = true, + sampling.cache.ttl | default = 3600, + + # Capabilities Declaration + capabilities.tools.enabled | default = true, + capabilities.tools.list_changed_callback | default = false, + capabilities.prompts.enabled | default = true, + capabilities.prompts.list_changed_callback | default = false, + capabilities.resources.enabled | default = true, + capabilities.resources.list_changed_callback | default = false, + capabilities.resources.subscribe | default = false, + capabilities.sampling.enabled | default = false, + + # Orchestrator Integration + orchestrator_integration.enabled | default = false, + + # Control Center Integration + control_center_integration.enabled | default = false, + control_center_integration.enforce_rbac | default = 
true, + + # Performance Tuning + performance.pool_size | default = 10, + performance.buffer_size | default = 1024, + performance.compression | default = false, + + # Docker Build Configuration (no | default to override schema) + build.package = "mcp-server", + build.binary = "provisioning-mcp-server", + build.port = 9093, + build.features = [], + build.extra_runtime_pkgs = [], + build.config_file = "config.defaults.toml", + + # Monitoring Configuration (from common defaults) + monitoring.enabled | default = monitoring_defaults.monitoring.enabled, + monitoring.metrics.enabled | default = monitoring_defaults.monitoring.metrics.enabled, + monitoring.metrics.interval | default = monitoring_defaults.monitoring.metrics.interval, + monitoring.health_check.enabled | default = monitoring_defaults.monitoring.health_check.enabled, + monitoring.health_check.interval | default = monitoring_defaults.monitoring.health_check.interval, + monitoring.resources.cpu | default = monitoring_defaults.monitoring.resources.cpu, + monitoring.resources.memory | default = monitoring_defaults.monitoring.resources.memory, + monitoring.resources.alert_threshold | default = monitoring_defaults.monitoring.resources.alert_threshold, + + # Logging Configuration (from common defaults) + logging.level | default = logging_defaults.logging.level, + logging.format | default = logging_defaults.logging.format, +} in + { - mcp_server | mcp_server_schema.MCPServerConfig = { - # Workspace Configuration - workspace = { - name = "default", - path = "/var/lib/provisioning/mcp-server", - enabled = true, - multi_workspace = false, - }, - - # HTTP Server Settings - server = { - host = "127.0.0.1", - port = 3000, - workers = 4, - keep_alive = 75, - max_connections = 100, - request_timeout = 30000, - graceful_shutdown = true, - shutdown_timeout = 30, - }, - - # MCP Protocol Configuration - protocol = { - version = "1.0", - transport = { - endpoint = "http://localhost:3000", - timeout = 30000, - }, - }, - - # Tools 
Configuration - tools = { - enabled = true, - max_concurrent = 5, - timeout = 30000, - validation = { - enabled = true, - strict_mode = false, - }, - cache = { - enabled = true, - ttl = 3600, - }, - }, - - # Prompts Configuration - prompts = { - enabled = true, - max_templates = 100, - cache = { - enabled = true, - ttl = 3600, - }, - versioning = { - enabled = false, - max_versions = 10, - }, - }, - - # Resources Configuration - resources = { - enabled = true, - max_size = 104857600, - cache = { - enabled = true, - max_size_mb = 512, - ttl = 3600, - }, - validation = { - enabled = true, - max_depth = 10, - }, - }, - - # Sampling Configuration - sampling = { - enabled = false, - max_tokens = 4096, - temperature = 0.7, - cache = { - enabled = true, - ttl = 3600, - }, - }, - - # Capabilities Declaration - capabilities = { - tools = { - enabled = true, - list_changed_callback = false, - }, - prompts = { - enabled = true, - list_changed_callback = false, - }, - resources = { - enabled = true, - list_changed_callback = false, - subscribe = false, - }, - sampling = { - enabled = false, - }, - }, - - # Orchestrator Integration - orchestrator_integration = { - enabled = false, - }, - - # Control Center Integration - control_center_integration = { - enabled = false, - enforce_rbac = true, - }, - - # Monitoring Configuration - monitoring = monitoring_defaults.monitoring, - - # Logging Configuration - logging = logging_defaults.logging, - - # Performance Tuning - performance = { - pool_size = 10, - buffer_size = 1024, - compression = false, - }, - }, + # Base configuration with all defaults + mcp_server = base_mcp_server | mcp_server_schema.MCPServerConfig, } diff --git a/schemas/platform/defaults/nu-daemon-defaults.ncl b/schemas/platform/defaults/nu-daemon-defaults.ncl new file mode 100644 index 0000000..751bf2c --- /dev/null +++ b/schemas/platform/defaults/nu-daemon-defaults.ncl @@ -0,0 +1,24 @@ +let nu_daemon_schema = import "schemas/platform/nu-daemon.ncl" in + +{ + 
nu_daemon | nu_daemon_schema.DaemonConfig = { # nu-daemon defaults; key matches the nu_daemon entry in deployment/solo-defaults.ncl + server = { + host = "0.0.0.0", + port = 9095, + workers = 2, + }, + daemon = { + enabled = true, + poll_interval = 60, + max_workers = 2, + }, + logging = { + level = "info", + file = "/tmp/nu-daemon.log", + }, + actions = { + auto_cleanup = false, + auto_update = false, + }, + }, +} diff --git a/schemas/platform/defaults/orchestrator-defaults.ncl b/schemas/platform/defaults/orchestrator-defaults.ncl index 915024d..eeccc60 100644 --- a/schemas/platform/defaults/orchestrator-defaults.ncl +++ b/schemas/platform/defaults/orchestrator-defaults.ncl @@ -1,89 +1,99 @@ # Orchestrator Service Default Configuration -# Workflow engine defaults with queue, batch, extensions +# Pattern: Commands-Registry style with | default for all fields +# Allows: (base & mode) & user merge -let orchestrator_schema = import "../schemas/orchestrator.ncl" in +let orchestrator_schema = import "../orchestrator.ncl" in let monitoring_defaults = import "./common/monitoring-defaults.ncl" in let logging_defaults = import "./common/logging-defaults.ncl" in +let base_orchestrator = { + # Workspace Configuration + workspace.name | default = "default", + workspace.path | default = "/var/lib/provisioning/orchestrator", + workspace.enabled | default = true, + workspace.multi_workspace | default = false, + + # HTTP Server Settings + server.host | default = "127.0.0.1", + server.port | default = 9090, + server.workers | default = 4, + server.keep_alive | default = 75, + server.max_connections | default = 100, + server.request_timeout | default = 30000, + server.graceful_shutdown | default = true, + server.shutdown_timeout | default = 30, + + # Storage Configuration + storage.backend | default = "filesystem", + storage.path | default = "/var/lib/provisioning/orchestrator/data", + storage.cache.enabled | default = true, + storage.cache.type | default = "in_memory", + storage.cache.eviction_policy | default = "lru", + storage.cache.ttl | default = 3600,
+ + # Queue Configuration + queue.max_concurrent_tasks | default = 5, + queue.retry_attempts | default = 3, + queue.retry_delay | default = 5000, + queue.task_timeout | default = 3600000, + queue.persist | default = true, + queue.dead_letter_queue.enabled | default = true, + queue.dead_letter_queue.max_size | default = 1000, + queue.priority_queue | default = false, + queue.metrics | default = false, + + # Batch Workflow Configuration + batch.parallel_limit | default = 5, + batch.operation_timeout_minutes | default = 30, + batch.checkpointing.enabled | default = true, + batch.checkpointing.interval | default = 100, + batch.checkpointing.max_checkpoints | default = 10, + batch.rollback.enabled | default = true, + batch.rollback.strategy | default = "checkpoint_based", + batch.rollback.max_rollback_depth | default = 5, + batch.metrics | default = false, + + # Extensions Configuration + extensions.auto_load | default = false, + extensions.discovery_interval | default = 300, + extensions.max_concurrent | default = 5, + extensions.timeout | default = 30000, + extensions.sandbox | default = true, + + # Performance Configuration + performance.profiling | default = false, + performance.cpu_affinity | default = false, + + # Docker Build Configuration (no | default to override schema defaults) + build.package = "provisioning-orchestrator", + build.binary = "provisioning-orchestrator", + build.port = 9090, + build.features = [], + build.extra_runtime_pkgs = [], + build.config_file = "config.defaults.toml", + + # Monitoring Configuration (from common defaults) + monitoring.enabled | default = monitoring_defaults.monitoring.enabled, + monitoring.metrics.enabled | default = monitoring_defaults.monitoring.metrics.enabled, + monitoring.metrics.interval | default = monitoring_defaults.monitoring.metrics.interval, + monitoring.health_check.enabled | default = monitoring_defaults.monitoring.health_check.enabled, + monitoring.health_check.interval | default = 
monitoring_defaults.monitoring.health_check.interval, + monitoring.resources.cpu | default = monitoring_defaults.monitoring.resources.cpu, + monitoring.resources.memory | default = monitoring_defaults.monitoring.resources.memory, + monitoring.resources.alert_threshold | default = monitoring_defaults.monitoring.resources.alert_threshold, + + # Logging Configuration (from common defaults) + logging.level | default = logging_defaults.logging.level, + logging.format | default = logging_defaults.logging.format, + + # Docker Build Configuration (removed - conflicts with schema defaults) +} in + { - orchestrator | orchestrator_schema.OrchestratorConfig = { - # Workspace Configuration - workspace = { - name = "default", - path = "/var/lib/provisioning/orchestrator", - enabled = true, - multi_workspace = false, - }, + # Base configuration with all defaults + orchestrator = base_orchestrator | orchestrator_schema.OrchestratorConfig, - # HTTP Server Settings - server = { - host = "127.0.0.1", - port = 9090, - workers = 4, - keep_alive = 75, - max_connections = 100, - request_timeout = 30000, - graceful_shutdown = true, - shutdown_timeout = 30, - }, - - # Storage Configuration - storage = { - backend = "filesystem", - path = "/var/lib/provisioning/orchestrator/data", - cache = { - enabled = true, - type = "in_memory", - eviction_policy = "lru", - ttl = 3600, - }, - }, - - # Queue Configuration - queue = { - max_concurrent_tasks = 5, - retry_attempts = 3, - retry_delay = 5000, - task_timeout = 3600000, - persist = true, - dead_letter_queue = { - enabled = true, - max_size = 1000, - }, - priority_queue = false, - metrics = false, - }, - - # Batch Workflow Configuration - batch = { - parallel_limit = 5, - operation_timeout = 1800000, - checkpointing = { - enabled = true, - interval = 100, - max_checkpoints = 10, - }, - rollback = { - enabled = true, - strategy = "checkpoint_based", - max_rollback_depth = 5, - }, - metrics = false, - }, - - # Extensions Configuration - extensions 
= { - auto_load = false, - discovery_interval = 300, - max_concurrent = 5, - timeout = 30000, - sandbox = true, - }, - - # Monitoring Configuration - monitoring = monitoring_defaults.monitoring, - - # Logging Configuration - logging = logging_defaults.logging, - }, + # Factory function for hybrid pattern: (base & mode) & user + make_orchestrator = fun mode_overrides => fun user_overrides => + (base_orchestrator & mode_overrides) & user_overrides, } diff --git a/schemas/platform/defaults/provisioning-daemon-defaults.ncl b/schemas/platform/defaults/provisioning-daemon-defaults.ncl index 58c4ac4..5a8256b 100644 --- a/schemas/platform/defaults/provisioning-daemon-defaults.ncl +++ b/schemas/platform/defaults/provisioning-daemon-defaults.ncl @@ -1 +1,19 @@ -let d = import "../schemas/provisioning-daemon.ncl" in { daemon | d.DaemonConfig = { daemon = { enabled = true, poll_interval = 60, max_workers = 2, }, logging = { level = "info", file = "/tmp/provisioning-daemon.log", }, actions = { auto_cleanup = false, auto_update = false, }, }, } +# Provisioning Daemon Service Default Configuration + +let provisioning_daemon_schema = import "../provisioning-daemon.ncl" in + +{ + provisioning_daemon | provisioning_daemon_schema.DaemonConfig = { + server = { + host = "0.0.0.0", + port = 9014, + workers = 2, + }, + orchestrator_url = "http://localhost:9011", + nats_url = "nats://127.0.0.1:4222", + project_name = "provisioning", + provisioning_bin = "provisioning", + watch_paths = [], + log_level = "info", + }, +} diff --git a/schemas/platform/defaults/rag-defaults.ncl b/schemas/platform/defaults/rag-defaults.ncl index 3db529e..3169f14 100644 --- a/schemas/platform/defaults/rag-defaults.ncl +++ b/schemas/platform/defaults/rag-defaults.ncl @@ -1,14 +1,47 @@ # RAG System Default Configuration -let rag_schema = import "../schemas/rag.ncl" in +let rag_schema = import "../rag.ncl" in { - rag | rag_schema.RagConfig = { + rag_config | rag_schema.RagConfig = { rag = { enabled = true, }, - 
embeddings = { provider = "local", model = "all-MiniLM-L6-v2", dimension = 384, batch_size = 32, }, - vector_db = { db_type = "memory", namespace = "provisioning", }, - llm = { provider = "ollama", model = "llama3.2", api_url = "http://localhost:11434", temperature = 0.7, max_tokens = 2048, }, - retrieval = { top_k = 5, similarity_threshold = 0.7, reranking = false, hybrid = false, }, - ingestion = { auto_ingest = true, chunk_size = 512, overlap = 50, doc_types = ["md", "txt", "toml"], }, + embeddings = { + provider = "local", + model = "all-MiniLM-L6-v2", + dimension = 384, + batch_size = 32, + }, + vector_db = { + db_type = "memory", + namespace = "provisioning", + }, + llm = { + provider = "ollama", + model = "llama3.2", + api_url = "http://localhost:11434", + temperature = 0.7, + max_tokens = 2048, + }, + retrieval = { + top_k = 5, + similarity_threshold = 0.7, + reranking = false, + hybrid = false, + }, + ingestion = { + auto_ingest = true, + chunk_size = 512, + overlap = 50, + doc_types = ["md", "txt", "toml"], + }, monitoring = { enabled = false, }, logging = { level = "info", }, + + # Docker Build Configuration + build = { + package = "rag", + binary = "provisioning-rag", + port = 9096, + features = [], + extra_runtime_pkgs = ["openssl", "libssl-dev"], + }, }, } diff --git a/schemas/platform/defaults/vault-service-defaults.ncl b/schemas/platform/defaults/vault-service-defaults.ncl index e626b89..0dceb3e 100644 --- a/schemas/platform/defaults/vault-service-defaults.ncl +++ b/schemas/platform/defaults/vault-service-defaults.ncl @@ -1,12 +1,12 @@ # Vault Service Default Configuration -let vault_schema = import "../schemas/vault-service.ncl" in +let vault_schema = import "../vault-service.ncl" in { - vault | vault_schema.VaultServiceConfig = { + vault_service | vault_schema.VaultServiceConfig = { server = { host = "127.0.0.1", - port = 8200, + port = 9094, workers = 4, keep_alive = 75, max_connections = 100, @@ -19,7 +19,7 @@ let vault_schema = import 
"../schemas/vault-service.ncl" in }, vault = { - server_url = "http://localhost:8200", + server_url = "http://localhost:9094", storage_backend = "filesystem", deployment_mode = "Embedded", mount_point = "transit", @@ -46,5 +46,14 @@ let vault_schema = import "../schemas/vault-service.ncl" in level = "info", format = "json", }, + + # Docker Build Configuration + build = { + package = "vault-service", + binary = "provisioning-vault-service", + port = 9094, + features = [], + extra_runtime_pkgs = ["libssl3"], + }, }, } diff --git a/schemas/platform/deployment-mode-example.ncl b/schemas/platform/deployment-mode-example.ncl new file mode 100644 index 0000000..15e0b84 --- /dev/null +++ b/schemas/platform/deployment-mode-example.ncl @@ -0,0 +1,92 @@ +# Platform Deployment Mode Configuration Examples +# These show how to configure the deployment mode for different infrastructure setups + +# ═══════════════════════════════════════════════════════════════════════════ +# EXAMPLE 1: Local Development (Binary Execution) +# ═══════════════════════════════════════════════════════════════════════════ +# For local development with binaries in ~/.local/bin +# Services are started as individual processes + +let local_example = { + mode = "local", + manager = { + hostname = "localhost", + port = 9090, + }, + config_dir = "~/.config/provisioning/platform/config", + health_checks_enabled = true, + startup_timeout = 60, + description = "Local development with binaries", +} in + +# ═══════════════════════════════════════════════════════════════════════════ +# EXAMPLE 2: Docker Compose +# ═══════════════════════════════════════════════════════════════════════════ +# For local development with Docker Compose +# Services are orchestrated via docker-compose.yml + +let docker_compose_example = { + mode = "docker-compose", + manager = { + host = "unix:///var/run/docker.sock", + api_version = "v1.45", + }, + config_dir = "~/.config/provisioning/platform/config", + health_checks_enabled = true, + 
startup_timeout = 120, + description = "Docker Compose local development", +} in + +# ═══════════════════════════════════════════════════════════════════════════ +# EXAMPLE 3: Docker Compose (Remote Host) +# ═══════════════════════════════════════════════════════════════════════════ +# For development on a remote Docker daemon + +let docker_remote_example = { + mode = "docker-compose", + manager = { + host = "tcp://docker-host.local:2375", + api_version = "v1.45", + }, + config_dir = "~/.config/provisioning/platform/config", + health_checks_enabled = true, + startup_timeout = 120, + description = "Docker Compose on remote daemon", +} in + +# ═══════════════════════════════════════════════════════════════════════════ +# EXAMPLE 4: Kubernetes Cluster +# ═══════════════════════════════════════════════════════════════════════════ +# For production Kubernetes deployment +# Services are deployed as K8s objects (Deployments, Services, ConfigMaps, etc.) + +let kubernetes_example = { + mode = "kubernetes", + manager = { + cluster_name = "production-cluster", + api_server = "https://k8s-master.example.com:6443", + namespace = "provisioning", + kubeconfig_path = "~/.kube/config", + ca_cert_path = "/etc/kubernetes/ca.crt", + }, + config_dir = "~/.config/provisioning/platform/config", + health_checks_enabled = true, + startup_timeout = 300, + description = "Production Kubernetes cluster", +} in + +# ═══════════════════════════════════════════════════════════════════════════ +# Default configuration selection (uncomment one to use) +# ═══════════════════════════════════════════════════════════════════════════ + +# Use local development +local_example + +# Use Docker Compose locally +# docker_compose_example + +# Use remote Docker +# docker_remote_example + +# Use Kubernetes +# kubernetes_example diff --git a/schemas/platform/deployment-mode.ncl b/schemas/platform/deployment-mode.ncl new file mode 100644 index 0000000..e2883c7 --- /dev/null +++ 
b/schemas/platform/deployment-mode.ncl @@ -0,0 +1,111 @@ +# Platform Deployment Mode Configuration Schema +# Defines how the platform is deployed: local binaries, Docker Compose, or Kubernetes +# This is separate from application modes (solo, cicd, enterprise) +# This determines INFRASTRUCTURE deployment, not application features + +let lib = import "../lib/main.ncl" in + +{ + # Deployment mode enum: local | docker-compose | kubernetes + DeploymentMode = fun label value => + if std.array.elem value ["local", "docker-compose", "kubernetes"] then + value + else + std.contract.blame_with_message "deployment_mode must be one of: local, docker-compose, kubernetes" label, + + # Local deployment manager (localhost, host IP, or custom) + LocalManager = { + hostname | String, + port | lib.PositiveNumber | default = 9090, # overridable default (a plain "= 9090" would pin the value) + }, + + # Docker Compose manager (daemon socket or host) + DockerManager = { + host | String, # e.g., "unix:///var/run/docker.sock" or "tcp://docker-host:2375" + api_version | String | default = "v1.45", # overridable default, same style as KubernetesManager.namespace + }, + + # Kubernetes cluster manager + KubernetesManager = { + cluster_name | String, + api_server | String, # e.g., "https://k8s-master:6443" + namespace | String | default = "provisioning", + kubeconfig_path | String | optional, # e.g., ~/.kube/config + ca_cert_path | String | optional, + }, + + # External service configuration (e.g., svault_server-vault, surrealdb-dbs, forgejo-git) + # Pattern: service_name-service_type (e.g., svault_server-vault where "-vault" is the service type) + ExternalService = { + # Full name with service type separator: "svault_server-vault" + name | String, + + # Service type (the part after the dash): "vault", "dbs", "git", "cdci", etc.
+ srvc | String, + + # Human-readable description + desc | String, + + # Service URL/endpoint + url | String, + + # Service port + port | lib.PositiveNumber, + + # Is this service required for deployment + required | Bool | default = false, + + # List of service names this service depends on + dependencies | Array String | default = [], + + # Optional: binary path for local services + binary_path | String | optional, + + # Optional: startup command + startup_command | String | optional, + + # Optional: health check timeout in seconds + health_check_timeout | lib.PositiveNumber | optional, + + # Optional: environment variables (key-value pairs) + env | {} | optional, + }, + + # External services collection + ExternalServices = Array ExternalService, + + # Main platform deployment mode configuration + PlatformDeploymentMode = { + # Deployment mode: how services are deployed + mode | DeploymentMode, + + # Manager configuration (type depends on mode) + manager | ( + if std.array.elem mode ["local"] then + LocalManager + else if std.array.elem mode ["docker-compose"] then + DockerManager + else if std.array.elem mode ["kubernetes"] then + KubernetesManager + else + null + ), + + # Configuration directory for user service configs (*.ncl files) + config_dir | String | optional, # e.g., ~/.config/provisioning/platform/config + + # Enable health checks and monitoring + health_checks_enabled | Bool | default = true, + + # Timeout for service startup (seconds) + startup_timeout | lib.PositiveNumber | default = 60, + + # External infrastructure services (databases, git servers, registries, CI/CD, etc.) 
external_services | ExternalServices | default = [], + + # Metadata (optional strings; leave unset rather than null, since null is not a String) + description | String | optional, + created_at | String | optional, + updated_at | String | optional, + }, +} diff --git a/schemas/platform/schemas/deployment/cicd.ncl b/schemas/platform/deployment/cicd.ncl similarity index 100% rename from schemas/platform/schemas/deployment/cicd.ncl rename to schemas/platform/deployment/cicd.ncl diff --git a/schemas/platform/schemas/deployment/enterprise.ncl b/schemas/platform/deployment/enterprise.ncl similarity index 100% rename from schemas/platform/schemas/deployment/enterprise.ncl rename to schemas/platform/deployment/enterprise.ncl diff --git a/schemas/platform/schemas/deployment/multiuser.ncl b/schemas/platform/deployment/multiuser.ncl similarity index 79% rename from schemas/platform/schemas/deployment/multiuser.ncl rename to schemas/platform/deployment/multiuser.ncl index 3d5756a..14809cb 100644 --- a/schemas/platform/schemas/deployment/multiuser.ncl +++ b/schemas/platform/deployment/multiuser.ncl @@ -5,7 +5,30 @@ { MultiUserModeConfig = { # Deployment mode identifier - mode | String = 'multiuser, + mode | String = "multiuser", + + # NATS external cluster configuration (required in multi-user mode) + nats = { + mode | String = "server", + url | String, + port | Number = 4222, + jetstream | Bool = true, + auth_token | String | optional, + tls_cert | String | optional, + tls_key | String | optional, + max_reconnects | Number = 10, + reconnect_wait_ms | Number = 2000, + subject_prefix | String = "provisioning", + }, + + # SurrealDB WebSocket server configuration + surrealdb = { + mode | String = "server", + url | String, + namespace | String = "provisioning", + username | String | optional, + password | String | optional, + }, # Resource allocation resources = { diff --git a/schemas/platform/schemas/deployment/solo.ncl b/schemas/platform/deployment/solo.ncl similarity index 76% rename from schemas/platform/schemas/deployment/solo.ncl
rename to schemas/platform/deployment/solo.ncl index dce83b0..a336a70 100644 --- a/schemas/platform/schemas/deployment/solo.ncl +++ b/schemas/platform/deployment/solo.ncl @@ -7,6 +7,24 @@ # Deployment mode identifier mode | String = "solo", + # NATS embedded broker (child process, JetStream enabled) + nats = { + mode | String = "embedded", + port | Number = 4222, + jetstream | Bool = true, + jetstream_store_dir | String | optional, + max_reconnects | Number = 10, + reconnect_wait_ms | Number = 2000, + subject_prefix | String = "provisioning", + }, + + # SurrealDB embedded RocksDB (no external server required) + surrealdb = { + mode | String = "embedded", + path | String | optional, + namespace | String = "provisioning", + }, + # Resource allocation resources = { cpu_cores | String, diff --git a/schemas/platform/docker-build.ncl b/schemas/platform/docker-build.ncl new file mode 100644 index 0000000..bc56e81 --- /dev/null +++ b/schemas/platform/docker-build.ncl @@ -0,0 +1,144 @@ +# Docker Build Configuration Schema +# Defines build-time configuration for multi-stage Docker builds with cargo-chef +# Supports BuildKit caching, sccache, cross-compilation, and feature flags + +let constraints = import "schemas/platform/common/constraints.ncl" in + +{ + # Cache mode for BuildKit layer caching + CacheMode = [| + 'local, # Local cache (solo development) + 'registry, # Registry-based cache (CI/CD, enterprise) + 'inline, # Inline cache metadata (minimal overhead) + |], + + # Docker build configuration for Rust services + DockerBuildConfig = { + # Cargo package name (from workspace Cargo.toml) + package + | doc "Cargo workspace package name" + | String, + + # Binary name (output executable name) + binary + | doc "Binary executable name" + | String, + + # Base image for build stage + base_image + | doc "Rust base image for build stage" + | String + | default = "rust:1.82-trixie", + + # Runtime image for final stage + runtime_image + | doc "Minimal runtime image for final stage" 
+ | String + | default = "debian:trixie-slim", + + # Cargo feature flags + features + | doc "Cargo features to enable during build" + | Array String + | default = [], + + # Enable cargo-chef for dependency caching + chef_enabled + | doc "Use cargo-chef for layer caching" + | Bool + | default = true, + + # Cross-compilation target (optional) + target + | doc "Rust target triple for cross-compilation" + | String + | optional, + + # sccache configuration for distributed caching + sccache + | doc "Distributed build cache configuration" + | { + enabled + | doc "Enable sccache for build caching" + | Bool + | default = false, + + endpoint + | doc "S3-compatible endpoint for cache storage" + | String + | optional, + + bucket + | doc "S3 bucket name for cache artifacts" + | String + | default = "rust-cache", + + region + | doc "S3 region for cache bucket" + | String + | default = "", + } + | default = { enabled = false, bucket = "rust-cache", region = "" }, + + # Service port (must match server.port from runtime config) + port + | doc "HTTP port for service (must be 9000-65535)" + | Number + | constraints.port_high, + + # Health check endpoint path + health_path + | doc "Health check endpoint path" + | String + | default = "/health", + + # Additional runtime packages (apt packages for runtime stage) + extra_runtime_pkgs + | doc "Additional apt packages for runtime image" + | Array String + | default = [], + + # User ID for non-root user in container + user_id + | doc "UID for non-root container user" + | Number + | default = 1000, + + # Configuration file to copy (empty string if none; the field is a String, so null is not accepted) + config_file + | doc "Config file to copy from crate (relative path)" + | String + | default = "", + + # BuildKit-specific configuration + buildkit + | doc "BuildKit advanced build options" + | { + # Cache mode strategy + cache_mode + | doc "BuildKit cache storage mode" + | CacheMode + | default = 'registry, + + # Parallel build jobs + parallel_jobs + | doc "Number of parallel cargo build jobs" + |
Number + | constraints.server_workers + | default = 4, + + # Enable BuildKit inline cache + inline_cache + | doc "Include cache metadata in image" + | Bool + | default = false, + + # Cache registry URL (for registry cache mode) + cache_registry + | doc "Container registry for build cache" + | String + | optional, + } + | default = { cache_mode = 'registry, parallel_jobs = 4, inline_cache = false }, + }, +} diff --git a/schemas/platform/extension-registry.ncl b/schemas/platform/extension-registry.ncl new file mode 100644 index 0000000..cfda5f1 --- /dev/null +++ b/schemas/platform/extension-registry.ncl @@ -0,0 +1,99 @@ +# Extension Registry Schema +# Multi-instance extension distribution via Git sources (Gitea, Forgejo, GitHub) and OCI registries + +let constraints = import "schemas/platform/common/constraints.ncl" in +let docker_build_schema = import "schemas/platform/docker-build.ncl" in + +{ + # Gitea/Forgejo/GitHub source backend configuration + SourceBackendConfig = { + id | String | optional, + url | String, + organization | String, + token_path | String, + timeout_seconds | Number | default = 30, + verify_ssl | Bool | default = true, + }, + + # OCI registry distribution backend configuration + DistributionBackendConfig = { + id | String | optional, + registry | String, + namespace | String, + auth_token_path | String | optional, + timeout_seconds | Number | default = 30, + verify_ssl | Bool | default = true, + }, + + # Multi-instance source backends configuration + SourcesConfig = { + gitea | Array SourceBackendConfig | default = [], + forgejo | Array SourceBackendConfig | default = [], + github | Array SourceBackendConfig | default = [], + }, + + # Multi-instance distribution backends configuration + DistributionsConfig = { + oci | Array DistributionBackendConfig | default = [], + }, + + # Server configuration + ServerConfig = { + host | String | default = "0.0.0.0", + port | Number | default = 9005 | constraints.port_high, + workers | Number | default = 4, 
+ enable_cors | Bool | default = false, + enable_compression | Bool | default = true, + }, + + # Cache configuration + CacheConfig = { + capacity | Number | default = 1000, + ttl_seconds | Number | default = 300, + enable_metadata_cache | Bool | default = true, + enable_list_cache | Bool | default = true, + extensions_dir | String | optional, + ttl_hours | Number | optional, + }, + + # Legacy single-instance configuration (auto-migrated to multi-instance) + LegacySourceConfig = { + url | String, + organization | String, + token_path | String, + timeout_seconds | Number | optional, + verify_ssl | Bool | optional, + }, + + LegacyDistributionConfig = { + registry | String | optional, + namespace | String | optional, + auth_token_path | String | optional, + timeout_seconds | Number | optional, + verify_ssl | Bool | optional, + registry_url | String | optional, + auth | { + enabled | Bool | optional, + } | optional, + tls_verify | Bool | optional, + }, + + # Main registry configuration + RegistryConfig = { + server | ServerConfig | default = {}, + + # New multi-instance format (recommended) + sources | SourcesConfig | default = {}, + distributions | DistributionsConfig | default = {}, + + # Legacy single-instance format (auto-migrated on startup) + gitea | LegacySourceConfig | optional, + oci | LegacyDistributionConfig | optional, + + # Cache configuration + cache | CacheConfig | default = {}, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, + }, +} diff --git a/schemas/platform/external-services.ncl b/schemas/platform/external-services.ncl new file mode 100644 index 0000000..bbf6673 --- /dev/null +++ b/schemas/platform/external-services.ncl @@ -0,0 +1,44 @@ +# External Infrastructure Services Schema +# Defines the structure for databases, registries, CI/CD, and other external services +# These services are monitored but NOT managed by provisioning platform + +{ + # Individual external service configuration type + 
ExternalService = { + # Service name with type identifier (e.g., "svault_server-vault" where "vault" is the type) + name | String, + + # Service type identifier (vault, dbs, git, register, cdci, etc.) + srvc | String, + + # Human-readable description + desc | String, + + # Service endpoint URL (http://host:port) + url | String, + + # Service port number + port | Number, + + # Whether this service is required for platform operation + required | Bool | default = false, + + # List of service names this service depends on + dependencies | Array String | default = [], + + # Optional binary path for local services + binary_path | String | optional, + + # Optional startup command for local services + startup_command | String | optional, + + # Health check timeout in seconds + health_check_timeout | Number | optional, + + # Optional environment variables for the service (open record: a bare `{}` contract is closed and would reject every variable) + env | { .. } | optional, + + # Allow extra fields for extensibility + .. + }, +} diff --git a/schemas/platform/schemas/mcp-server.ncl b/schemas/platform/mcp-server.ncl similarity index 89% rename from schemas/platform/schemas/mcp-server.ncl rename to schemas/platform/mcp-server.ncl index e1ac242..9414626 100644 --- a/schemas/platform/schemas/mcp-server.ncl +++ b/schemas/platform/mcp-server.ncl @@ -1,11 +1,12 @@ # MCP Server Schema # Model Context Protocol server with tools, prompts, resources, and sampling -let workspace_schema = import "./common/workspace.ncl" in -let server_schema = import "./common/server.ncl" in -let security_schema = import "./common/security.ncl" in -let monitoring_schema = import "./common/monitoring.ncl" in -let logging_schema = import "./common/logging.ncl" in +let workspace_schema = import "schemas/platform/common/workspace.ncl" in +let server_schema = import "schemas/platform/common/server.ncl" in +let security_schema = import "schemas/platform/common/security.ncl" in +let monitoring_schema = import "schemas/platform/common/monitoring.ncl" in +let logging_schema = import
"schemas/platform/common/logging.ncl" in +let docker_build_schema = import "schemas/platform/docker-build.ncl" in { MCPServerConfig = { @@ -62,7 +63,7 @@ let logging_schema = import "./common/logging.ncl" in enabled | Bool | default = false, ttl | Number | optional, } | optional, - }, + } | optional, # Prompts Configuration prompts | { @@ -85,7 +86,7 @@ let logging_schema = import "./common/logging.ncl" in enabled | Bool | default = false, max_versions | Number | optional, } | optional, - }, + } | optional, # Resources Configuration resources | { @@ -110,7 +111,7 @@ let logging_schema = import "./common/logging.ncl" in enabled | Bool | default = true, max_depth | Number | optional, } | optional, - }, + } | optional, # Sampling Configuration sampling | { @@ -131,7 +132,7 @@ let logging_schema = import "./common/logging.ncl" in enabled | Bool | default = true, ttl | Number | optional, } | optional, - }, + } | optional, # Capabilities Declaration capabilities | { @@ -210,5 +211,8 @@ let logging_schema = import "./common/logging.ncl" in # Compression level compression_level | String | optional, } | optional, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, }, } diff --git a/schemas/platform/ncl-sync.ncl b/schemas/platform/ncl-sync.ncl new file mode 100644 index 0000000..4473f29 --- /dev/null +++ b/schemas/platform/ncl-sync.ncl @@ -0,0 +1,41 @@ +{ + NclSyncNatsSettings = { + # When true, ncl-sync connects to NATS and subscribes to + # provisioning.workspace.ncl.{changed,removed} for event-driven cache invalidation. + # Falls back to file watcher + sync-request sidecar when NATS is unavailable. + enabled | Bool | default = false, + + # NATS URL. Empty → uses platform-nats default (nats://127.0.0.1:4222). 
+ url | String | default = "", + }, + + NclSyncConfig = { + cache_dir | String | optional, + + idle_timeout_secs | Number | default = 600, + + sync_poll_interval_ms | Number | default = 500, + + warm_concurrency | Number | default = 4, + + extra_import_paths | Array String | default = [], + + # Filename suffixes that identify library/schema files (not entry points). + # Files matching are skipped during warm-up and watcher events. + skip_patterns | Array String | default = [ + "-schema.ncl", + "-defaults.ncl", + "-constraints.ncl", + ], + + # Directory basenames that indicate non-exportable NCL files. + # Any .ncl under a directory with this basename is skipped. + skip_dirs | Array String | default = [ + "schemas", + "defaults", + "constraints", + ], + + nats | NclSyncNatsSettings | default = {}, + }, +} diff --git a/schemas/platform/nu-daemon.ncl b/schemas/platform/nu-daemon.ncl new file mode 100644 index 0000000..cf9fe7d --- /dev/null +++ b/schemas/platform/nu-daemon.ncl @@ -0,0 +1,25 @@ +let constraints = import "schemas/platform/common/constraints.ncl" in + +{ + DaemonConfig = { + server = { + host | String | default = "0.0.0.0", + port | Number | default = 9095, + workers | Number | default = 2, + }, + daemon = { + enabled | Bool | default = true, + poll_interval | Number | default = 60, + max_workers | Number | default = 2, + }, + logging = { + level | String | default = "info", + format | String | optional, + file | String | optional, + }, + actions = { + auto_cleanup | Bool | default = false, + auto_update | Bool | default = false, + }, + }, +} diff --git a/schemas/platform/schemas/orchestrator.ncl b/schemas/platform/orchestrator.ncl similarity index 77% rename from schemas/platform/schemas/orchestrator.ncl rename to schemas/platform/orchestrator.ncl index 19ca396..c2ed7f1 100644 --- a/schemas/platform/schemas/orchestrator.ncl +++ b/schemas/platform/orchestrator.ncl @@ -1,21 +1,22 @@ # Orchestrator Service Schema # Workflow engine configuration with queue,
batch, extensions, and rollback -let workspace_schema = import "./common/workspace.ncl" in -let server_schema = import "./common/server.ncl" in -let database_schema = import "./common/database.ncl" in -let security_schema = import "./common/security.ncl" in -let monitoring_schema = import "./common/monitoring.ncl" in -let logging_schema = import "./common/logging.ncl" in -let storage_schema = import "./common/storage.ncl" in +let workspace_schema = import "schemas/platform/common/workspace.ncl" in +let server_schema = import "schemas/platform/common/server.ncl" in +let database_schema = import "schemas/platform/common/database.ncl" in +let security_schema = import "schemas/platform/common/security.ncl" in +let monitoring_schema = import "schemas/platform/common/monitoring.ncl" in +let logging_schema = import "schemas/platform/common/logging.ncl" in +let storage_schema = import "schemas/platform/common/storage.ncl" in +let docker_build_schema = import "schemas/platform/docker-build.ncl" in { OrchestratorConfig = { # Workspace configuration workspace | workspace_schema.WorkspaceConfig, - # HTTP server settings - server | server_schema.ServerConfig, + # HTTP server settings (port must be >= 9000 for orchestrator) + server | server_schema.ServerConfigHighPort, # Storage configuration storage | storage_schema.StorageConfig, @@ -54,7 +55,7 @@ let storage_schema = import "./common/storage.ncl" in parallel_limit | Number | default = 5, - # Batch operation timeout in milliseconds - operation_timeout | Number | default = 1800000, + # Batch operation timeout in minutes (NOTE(review): previous default was 1800000 ms = 30 min; confirm that 2 is intended) + operation_timeout_minutes | Number | default = 2, # Checkpoint settings checkpointing | { @@ -119,5 +120,9 @@ let storage_schema = import "./common/storage.ncl" in gc_threshold | Number | optional, } | optional, } | optional, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, }, } + diff --git a/schemas/platform/provisioning-daemon.ncl b/schemas/platform/provisioning-daemon.ncl new file mode 100644 index
0000000..3840da2 --- /dev/null +++ b/schemas/platform/provisioning-daemon.ncl @@ -0,0 +1,24 @@ +let constraints = import "schemas/platform/common/constraints.ncl" in + +{ + DaemonConfig = { + server = { + host | String | default = "0.0.0.0", + port | Number | default = 9014, + workers | Number | optional, + }, + orchestrator_url | String | default = "http://localhost:9011", + nats_url | String | default = "nats://127.0.0.1:4222", + project_name | String | default = "provisioning", + provisioning_bin | String | default = "provisioning", + watch_paths | Array String | default = [], + nickel_import_path | String | optional, + project_root | String | optional, + workspaces_root | String | optional, + extensions_root | String | optional, + ontology_templates | String | optional, + ui_templates_dir | String | optional, + control_center_url | String | optional, + log_level | String | default = "info", + }, +} diff --git a/schemas/platform/schemas/rag.ncl b/schemas/platform/rag.ncl similarity index 89% rename from schemas/platform/schemas/rag.ncl rename to schemas/platform/rag.ncl index 1ad899e..de7d9fc 100644 --- a/schemas/platform/schemas/rag.ncl +++ b/schemas/platform/rag.ncl @@ -1,6 +1,8 @@ # RAG System Schema # Retrieval-Augmented Generation with embeddings, vector DB, LLM +let docker_build_schema = import "schemas/platform/docker-build.ncl" in + { RagConfig = { rag | { @@ -56,5 +58,8 @@ logging | { level | String | default = "info", } | optional, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, }, } diff --git a/schemas/platform/schemas/README.md b/schemas/platform/schemas/README.md deleted file mode 100644 index 46ea334..0000000 --- a/schemas/platform/schemas/README.md +++ /dev/null @@ -1,287 +0,0 @@ -# Schemas - -Nickel type contracts defining configuration structure and validation for all services. 
- -## Purpose - -Schemas define: -- **Type safety** - Required/optional fields, valid types (string, number, bool, record) -- **Value constraints** - Enum values, numeric bounds (via contracts) -- **Documentation** - Field descriptions and usage patterns -- **Composition** - Inheritance and merging of schema types - -## File Organization - -```bash -schemas/ -├── README.md # This file -├── common/ # Shared schemas (server, database, security, etc.) -│ ├── server.ncl # HTTP server configuration schema -│ ├── database.ncl # Database backend schema -│ ├── security.ncl # Authentication and security schema -│ ├── monitoring.ncl # Metrics and health checks schema -│ ├── logging.ncl # Log level and format schema -│ ├── network.ncl # Network binding and TLS schema -│ ├── storage.ncl # Storage backend schema -│ └── workspace.ncl # Workspace configuration schema -├── deployment/ # Mode-specific schemas -│ ├── solo.ncl # Solo mode resource constraints -│ ├── multiuser.ncl # Multi-user mode schema -│ ├── cicd.ncl # CI/CD mode schema -│ └── enterprise.ncl # Enterprise HA schema -├── orchestrator.ncl # Orchestrator service schema -├── control-center.ncl # Control Center service schema -├── mcp-server.ncl # MCP Server service schema -└── installer.ncl # Installer service schema -``` - -## Schema Patterns - -### 1. Basic Schema Definition - -```bash -# schemas/common/server.ncl -{ - Server = { - host | String, # Required string field - port | Number, # Required number field - workers | Number | default = 4, # Optional with default - keep_alive | Number | optional, # Optional field - max_connections | Number | optional, - }, -} -``` - -### 2. Type with Contract Validation - -```bash -# With constraint checking (via validators) -{ - WorkerCount = - let valid_range = fun n => - if n < 1 then - std.contract.blame "Workers must be >= 1" n - else if n > 32 then - std.contract.blame "Workers must be <= 32" n - else - n - in - Number | valid_range, -} -``` - -### 3. 
Record Merging (Composition) - -```bash -# schemas/orchestrator.ncl -let server_schema = import "./common/server.ncl" in -let database_schema = import "./common/database.ncl" in - -{ - OrchestratorConfig = { - workspace | { - name | String, - path | String, - enabled | Bool | default = true, - }, - server | server_schema.Server, # Reuse Server schema - storage | database_schema.Database, # Reuse Database schema - queue | { - max_concurrent_tasks | Number, - retry_attempts | Number | default = 3, - }, - }, -} -``` - -## Common Schemas - -### server.ncl -HTTP server configuration: -- `host` - Bind address (string) -- `port` - Listen port (number) -- `workers` - Thread count (number, optional) -- `keep_alive` - Keep-alive timeout (number, optional) -- `max_connections` - Connection limit (number, optional) - -### database.ncl -Database backend selection: -- `backend` - 'filesystem | 'rocksdb | 'surrealdb_embedded | 'surrealdb_server | 'postgres (enum) -- `path` - Storage path (string, optional) -- `connection_string` - DB URL (string, optional) -- `credentials` - Auth object (optional) - -### security.ncl -Authentication and encryption: -- `jwt_issuer` - JWT issuer (string, optional) -- `jwt_audience` - JWT audience (string, optional) -- `jwt_expiration` - Token expiration (number, optional) -- `encryption_key` - Encryption key (string, optional) -- `kms_backend` - KMS provider (string, optional) -- `mfa_required` - Require MFA (bool, optional) - -### monitoring.ncl -Metrics and health: -- `enabled` - Enable monitoring (bool, optional) -- `metrics_interval` - Metrics collection interval (number, optional) -- `health_check_interval` - Health check frequency (number, optional) -- `retention_days` - Metrics retention (number, optional) - -### logging.ncl -Log configuration: -- `level` - Log level (debug | info | warn | error) -- `format` - Log format (json | text) -- `rotation` - Log rotation policy (optional) -- `output` - Log destination (stdout | file | syslog) - -## 
Service Schemas - -### orchestrator.ncl -Workflow orchestration: - -```nickel -OrchestratorConfig = { - workspace | WorkspaceConfig, - server | Server, - storage | Database, - queue | QueueConfig, - batch | BatchConfig, - monitoring | MonitoringConfig | optional, - rollback | RollbackConfig | optional, - extensions | ExtensionsConfig | optional, -} -``` - -### control-center.ncl -Policy and RBAC: - -```nickel -ControlCenterConfig = { - workspace | WorkspaceConfig, - server | Server, - database | Database, - security | SecurityConfig, - rbac | RBACConfig | optional, - compliance | ComplianceConfig | optional, -} -``` - -### mcp-server.ncl -MCP protocol server: - -```nickel -MCPServerConfig = { - workspace | WorkspaceConfig, - server | Server, - capabilities | CapabilitiesConfig, - tools | ToolsConfig | optional, - resources | ResourcesConfig | optional, -} -``` - -## Deployment Mode Schemas - -Deployment schemas define resource constraints for each mode: - -- **solo.ncl** - 2 CPU, 4GB RAM, embedded DB -- **multiuser.ncl** - 4 CPU, 8GB RAM, PostgreSQL -- **cicd.ncl** - 8 CPU, 16GB RAM, ephemeral -- **enterprise.ncl** - 16+ CPU, 32+ GB RAM, HA - -Example: - -```bash -# schemas/deployment/solo.ncl -{ - SoloMode = { - resources = { - cpu_cores | 2, - memory_mb | 4096, - disk_gb | 50, - }, - database_backend | 'filesystem, - security_level | 'basic, - }, -} -``` - -## Validation with Schemas - -Schemas are composed with validators in config files: - -```toml -# configs/orchestrator.solo.ncl -let schemas = import "../schemas/orchestrator.ncl" in -let validators = import "../validators/orchestrator-validator.ncl" in -let defaults = import "../defaults/orchestrator-defaults.ncl" in - -# Compose: defaults + validation + schema checking -{ - orchestrator = defaults.orchestrator & { - queue = { - max_concurrent_tasks = validators.ValidConcurrentTasks 5, - }, - }, -} | schemas.OrchestratorConfig -``` - -The final `| schemas.OrchestratorConfig` applies type checking. 
- -## Type System - -### Nickel Type Syntax - -```nickel -# Required field -field | Type, - -# Optional field -field | Type | optional, - -# Field with default -field | Type | default = value, - -# Union type -field | [| 'option1, 'option2], - -# Nested record -field | { - subfield | Type, -}, -``` - -## Best Practices - -1. **Reuse common schemas** - Import and compose rather than duplicate -2. **Use enums for choices** - `'filesystem | 'rocksdb` instead of string validation -3. **Document fields** - Add comments explaining purpose -4. **Keep schemas focused** - Each file covers one logical component -5. **Test composition** - Use `nickel typecheck` to verify schema merging - -## Modifying Schemas - -When changing a schema: - -1. Update schema file (schemas/*.ncl) -2. Update corresponding defaults (defaults/*.ncl) to match schema -3. Update validators if constraints changed -4. Run typecheck: `nickel typecheck configs/orchestrator.*.ncl` -5. Verify all configs still type-check - -## Schema Testing - -```bash -# Typecheck a schema -nickel typecheck provisioning/.typedialog/provisioning/platform/schemas/orchestrator.ncl - -# Typecheck a config (which applies schema) -nickel typecheck provisioning/.typedialog/provisioning/platform/configs/orchestrator.solo.ncl - -# Evaluate a schema -nickel eval provisioning/.typedialog/provisioning/platform/schemas/orchestrator.ncl -``` - ---- - -**Version**: 1.0.0 -**Last Updated**: 2025-01-05 diff --git a/schemas/platform/schemas/common/server.ncl b/schemas/platform/schemas/common/server.ncl deleted file mode 100644 index 817254f..0000000 --- a/schemas/platform/schemas/common/server.ncl +++ /dev/null @@ -1,30 +0,0 @@ -# HTTP Server Configuration Schema -# Common schema for HTTP server settings across all services - -{ - ServerConfig = { - # Bind address (127.0.0.1 for local, 0.0.0.0 for all interfaces) - host | String | default = "127.0.0.1", - - # Listen port (validated: 1024-65535) - port | Number, - - # Worker thread count 
(CPU-bound operations) - workers | Number | optional, - - # TCP keep-alive timeout in seconds (0 = disabled) - keep_alive | Number | optional, - - # Maximum concurrent TCP connections - max_connections | Number | optional, - - # Request timeout in milliseconds - request_timeout | Number | optional, - - # Enable graceful shutdown - graceful_shutdown | Bool | default = true, - - # Graceful shutdown timeout in seconds - shutdown_timeout | Number | optional, - }, -} diff --git a/schemas/platform/schemas/extension-registry.ncl b/schemas/platform/schemas/extension-registry.ncl deleted file mode 100644 index ab88e66..0000000 --- a/schemas/platform/schemas/extension-registry.ncl +++ /dev/null @@ -1,86 +0,0 @@ -# Extension Registry Schema -# Multi-instance extension distribution via Git sources (Gitea, Forgejo, GitHub) and OCI registries - -{ - # Gitea/Forgejo/GitHub source backend configuration - SourceBackendConfig = { - id | String | optional, - url | String, - organization | String, - token_path | String, - timeout_seconds | Number | default = 30, - verify_ssl | Bool | default = true, - }, - - # OCI registry distribution backend configuration - DistributionBackendConfig = { - id | String | optional, - registry | String, - namespace | String, - auth_token_path | String | optional, - timeout_seconds | Number | default = 30, - verify_ssl | Bool | default = true, - }, - - # Multi-instance source backends configuration - SourcesConfig = { - gitea | Array SourceBackendConfig | default = [], - forgejo | Array SourceBackendConfig | default = [], - github | Array SourceBackendConfig | default = [], - }, - - # Multi-instance distribution backends configuration - DistributionsConfig = { - oci | Array DistributionBackendConfig | default = [], - }, - - # Server configuration - ServerConfig = { - host | String | default = "0.0.0.0", - port | Number | default = 8082, - workers | Number | default = 4, - enable_cors | Bool | default = false, - enable_compression | Bool | default = true, 
- }, - - # Cache configuration - CacheConfig = { - capacity | Number | default = 1000, - ttl_seconds | Number | default = 300, - enable_metadata_cache | Bool | default = true, - enable_list_cache | Bool | default = true, - }, - - # Legacy single-instance configuration (auto-migrated to multi-instance) - LegacySourceConfig = { - url | String, - organization | String, - token_path | String, - timeout_seconds | Number | optional, - verify_ssl | Bool | optional, - }, - - LegacyDistributionConfig = { - registry | String, - namespace | String, - auth_token_path | String | optional, - timeout_seconds | Number | optional, - verify_ssl | Bool | optional, - }, - - # Main registry configuration - RegistryConfig = { - server | ServerConfig | default = {}, - - # New multi-instance format (recommended) - sources | SourcesConfig | default = {}, - distributions | DistributionsConfig | default = {}, - - # Legacy single-instance format (auto-migrated on startup) - gitea | LegacySourceConfig | optional, - oci | LegacyDistributionConfig | optional, - - # Cache configuration - cache | CacheConfig | default = {}, - }, -} diff --git a/schemas/platform/schemas/provisioning-daemon.ncl b/schemas/platform/schemas/provisioning-daemon.ncl deleted file mode 100644 index 9a6a07f..0000000 --- a/schemas/platform/schemas/provisioning-daemon.ncl +++ /dev/null @@ -1,8 +0,0 @@ -{ - DaemonConfig = { - daemon | { enabled | Bool, poll_interval | Number, max_workers | Number, }, - logging | { level | String | default = "info", file | String | optional, syslog | Bool | optional, }, - actions | { auto_cleanup | Bool, auto_update | Bool, workspace_sync | Bool | optional, ephemeral_cleanup | Bool | optional, health_checks | Bool | optional, }, - monitoring | { enabled | Bool | default = false, } | optional, - }, -} diff --git a/schemas/platform/schemas/vault-service.ncl b/schemas/platform/schemas/vault-service.ncl deleted file mode 100644 index bfb267a..0000000 --- a/schemas/platform/schemas/vault-service.ncl 
+++ /dev/null @@ -1,57 +0,0 @@ -# Vault Service Schema -# Secrets management and encryption configuration - -{ - VaultServiceConfig = { - # Server configuration - server | { - host | String, - port | Number, - workers | Number | optional, - keep_alive | Number | optional, - max_connections | Number | optional, - }, - - # Storage backend configuration - storage | { - backend | String, - path | String | optional, - encryption_key_path | String | optional, - }, - - # Vault-specific settings - vault | { - server_url | String, - storage_backend | String, - deployment_mode | String, - auth_token | String | optional, - mount_point | String | default = "transit", - key_name | String | default = "provisioning-master", - tls_verify | Bool | default = false, - tls_ca_cert | String | optional, - }, - - # High Availability configuration - ha | { - enabled | Bool | default = false, - mode | String | optional, - } | optional, - - # Security configuration - security | { - encryption_algorithm | String | optional, - key_rotation_days | Number | optional, - } | optional, - - # Monitoring and logging - monitoring | { - enabled | Bool | default = false, - metrics_interval | Number | optional, - } | optional, - - logging | { - level | String | default = "info", - format | String | optional, - } | optional, - }, -} diff --git a/schemas/platform/services-deployment.ncl b/schemas/platform/services-deployment.ncl new file mode 100644 index 0000000..766317b --- /dev/null +++ b/schemas/platform/services-deployment.ncl @@ -0,0 +1,172 @@ +# Services Deployment Configuration Schema +# Validates which services are enabled and their configurations + + +let ServerConfig = { + host | String, + port | Number, + workers | Number | default = 4, +} in + +let DatabaseConfig = { + url | String, + namespace | String | optional, + database | String | optional, + username | String | optional, + password | String | optional, +} in + +let ManagerConfig = { + hostname | String, + port | Number, +
health_checks_enabled | Bool | default = true, + startup_timeout | Number | default = 60, +} in + +# Service configuration template +let ServiceConfig = { + enabled | Bool, + priority | Number, + server | ServerConfig, + database | DatabaseConfig | optional, + logging | { + level | String, + format | String | optional, + } | optional, + features | { + batch_workflows | Bool | optional, + git_integration | Bool | optional, + cicd_integration | Bool | optional, + user_management | Bool | optional, + monitoring | Bool | optional, + } | optional, +} in + +{ + # Main services deployment configuration + ServiceDeploymentConfig = { + # Deployment mode + mode | String, + description | String | optional, + created_at | String | optional, + application_mode | String | optional, + + # Manager configuration + manager | ManagerConfig | optional, + + # Git configuration (optional) + git | { + schemas_repo | { + url | String | optional, + branch | String | optional, + cache_dir | String | optional, + update_check | Bool | optional, + } | optional, + configs_repo | { + url | String | optional, + branch | String | optional, + cache_dir | String | optional, + update_check | Bool | optional, + } | optional, + .. + } | optional, + + # Services startup (alternative to individual service configs) + services | { + orchestrator | { + enabled | Bool | optional, + priority | Number | optional, + dependencies | optional, + .. + } | optional, + vault_service | { + enabled | Bool | optional, + priority | Number | optional, + dependencies | optional, + .. + } | optional, + ..
+ } | optional, + + # Orchestrator service + orchestrator | ServiceConfig & { + queue | { + max_concurrent_tasks | Number | optional, + retry_attempts | Number | optional, + retry_delay_ms | Number | optional, + } | optional, + } | optional, + + # Vault Service + vault_service | ServiceConfig & { + backend | { + backend_type | String, + secretum_vault | { + binary_path | String | optional, + workspace | String | optional, + api_endpoint | String | optional, + } | optional, + } | optional, + encryption | { + provider | String | optional, + key_file | String | optional, + } | optional, + } | optional, + + # Control Center + control_center | ServiceConfig | optional, + + # AI Service + ai_service | ServiceConfig & { + ai | { + enabled | Bool | optional, + provider | String | optional, + model | String | optional, + timeout | Number | optional, + } | optional, + } | optional, + + # Extension Registry + extension_registry | ServiceConfig & { + storage | { + type | String | optional, + path | String | optional, + } | optional, + } | optional, + + # Dependencies + dependencies | { + surrealdb | { + url | String, + required | Bool | default = true, + startup_command | String | optional, + health_check_timeout | Number | optional, + } | optional, + secretum_vault | { + binary_path | String, + required | Bool | default = true, + check_binary | Bool | default = true, + } | optional, + forgejo | { + url | String, + required | Bool | default = false, + only_if_enabled | String | optional, + } | optional, + } | optional, + + # External services (infrastructure services, databases, registries, CI/CD) + # Imported from provisioning/platform/config/external-services.ncl + external_services | Array { .. } | optional, + + # Metadata + metadata | { + created_at | String | optional, + last_updated | String | optional, + version | String | optional, + maintainer | String | optional, + } | optional, + + # Allow extra fields not listed above + ..
+ }, +} diff --git a/schemas/platform/templates/ai-service-config.ncl.j2 b/schemas/platform/templates/ai-service-config.ncl.j2 index ac1566d..a2c632c 100644 --- a/schemas/platform/templates/ai-service-config.ncl.j2 +++ b/schemas/platform/templates/ai-service-config.ncl.j2 @@ -3,7 +3,7 @@ # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu ai-service {mode} # Or manually edit and validate with: nickel typecheck -let ai_service_schema = import "../schemas/ai-service.ncl" in +let ai_service_schema = import "../ai-service.ncl" in { ai_service | ai_service_schema.AiServiceConfig = { diff --git a/schemas/platform/templates/control-center-config.ncl.j2 b/schemas/platform/templates/control-center-config.ncl.j2 index c4a9f0c..84474dd 100644 --- a/schemas/platform/templates/control-center-config.ncl.j2 +++ b/schemas/platform/templates/control-center-config.ncl.j2 @@ -2,7 +2,7 @@ # Auto-generated by provisioning TypeDialog # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu control-center {mode} -let control_center_schema = import "../schemas/control-center.ncl" in +let control_center_schema = import "../control-center.ncl" in { control_center | control_center_schema.ControlCenterConfig = { diff --git a/schemas/platform/templates/docker-compose/platform-stack.cicd.yml.ncl b/schemas/platform/templates/docker-compose/platform-stack.cicd.yml.ncl index d58cf59..f192315 100644 --- a/schemas/platform/templates/docker-compose/platform-stack.cicd.yml.ncl +++ b/schemas/platform/templates/docker-compose/platform-stack.cicd.yml.ncl @@ -3,24 +3,24 @@ # Minimal UI, focus on automation and performance { - version = "3.8", - services = { orchestrator = { - image = "provisioning-orchestrator:latest", + build = { + context = ".", + dockerfile = "crates/orchestrator/Dockerfile", + }, container_name = "orchestrator-cicd", ports = [ - "9090:9090", + "8080:8080", ], environment = { ORCHESTRATOR_MODE = "cicd", ORCHESTRATOR_SERVER_HOST = 
"0.0.0.0", - ORCHESTRATOR_SERVER_PORT = "9090", + ORCHESTRATOR_SERVER_PORT = "8080", ORCHESTRATOR_STORAGE_BACKEND = "filesystem", ORCHESTRATOR_STORAGE_PATH = "/tmp/orchestrator", ORCHESTRATOR_QUEUE_MAX_CONCURRENT_TASKS = "20", ORCHESTRATOR_BATCH_PARALLEL_LIMIT = "10", - ORCHESTRATOR_LOG_LEVEL = "warn", RUST_LOG = "warn", }, tmpfs = [ @@ -29,7 +29,7 @@ networks = ["provisioning"], restart = "no", healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:9090/health"], + test = ["CMD", "curl", "-f", "http://localhost:8080/health"], interval = "10s", timeout = "5s", retries = 3, @@ -38,17 +38,20 @@ }, api-gateway = { - image = "provisioning-api-gateway:latest", + build = { + context = ".", + dockerfile = "infrastructure/api-gateway/Dockerfile", + }, container_name = "api-gateway", ports = [ - "8000:8000", + "8083:8083", ], environment = { API_GATEWAY_MODE = "cicd", API_GATEWAY_HOST = "0.0.0.0", - API_GATEWAY_PORT = "8000", - API_GATEWAY_ORCHESTRATOR_URL = "http://orchestrator:9090", - API_GATEWAY_LOG_LEVEL = "warn", + API_GATEWAY_PORT = "8083", + ORCHESTRATOR_URL = "http://orchestrator:8080", + RUST_LOG = "warn", }, networks = ["provisioning"], restart = "no", @@ -58,7 +61,78 @@ }, }, healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:8000/health"], + test = ["CMD", "curl", "-f", "http://localhost:8083/health"], + interval = "10s", + timeout = "5s", + retries = 3, + start_period = "20s", + }, + }, + + provisioning-daemon = { + build = { + context = ".", + dockerfile = "crates/provisioning-daemon/Dockerfile", + }, + container_name = "provisioning-daemon", + ports = [ + "8079:8079", + ], + environment = { + RUST_LOG = "warn", + DATA_DIR = "/data", + PROVISIONING_DAEMON_MODE = "cicd", + PROVISIONING_CONFIG_DIR = "/etc/provisioning", + }, + tmpfs = [ + "/data", + "/etc/provisioning", + ], + networks = ["provisioning"], + restart = "no", + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", 
"curl", "-f", "http://localhost:8079/api/v1/health"], + interval = "10s", + timeout = "5s", + retries = 3, + start_period = "20s", + }, + }, + + provisioning-rag = { + build = { + context = ".", + dockerfile = "crates/rag/docker/Dockerfile", + }, + container_name = "provisioning-rag", + ports = [ + "9090:9090", + ], + environment = { + PROVISIONING_LOG_LEVEL = "warn", + PROVISIONING_API_HOST = "0.0.0.0", + PROVISIONING_API_PORT = "9090", + PROVISIONING_CACHE_SIZE = "500", + PROVISIONING_CACHE_TTL_SECS = "1800", + }, + tmpfs = [ + "/app/data", + "/app/cache", + ], + networks = ["provisioning"], + restart = "no", + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:9090/health"], interval = "10s", timeout = "5s", retries = 3, diff --git a/schemas/platform/templates/docker-compose/platform-stack.enterprise.yml.ncl b/schemas/platform/templates/docker-compose/platform-stack.enterprise.yml.ncl index 862f5e5..2cf2e20 100644 --- a/schemas/platform/templates/docker-compose/platform-stack.enterprise.yml.ncl +++ b/schemas/platform/templates/docker-compose/platform-stack.enterprise.yml.ncl @@ -3,8 +3,6 @@ # Multiple replicas, external databases, comprehensive observability { - version = "3.8", - services = { postgres = { image = "postgres:15-alpine", @@ -70,7 +68,10 @@ }, orchestrator-1 = { - image = "provisioning-orchestrator:latest", + build = { + context = ".", + dockerfile = "crates/orchestrator/Dockerfile", + }, container_name = "orchestrator-1", ports = [ "9091:9090", @@ -98,7 +99,10 @@ }, orchestrator-2 = { - image = "provisioning-orchestrator:latest", + build = { + context = ".", + dockerfile = "crates/orchestrator/Dockerfile", + }, container_name = "orchestrator-2", ports = [ "9092:9090", @@ -126,7 +130,10 @@ }, orchestrator-3 = { - image = "provisioning-orchestrator:latest", + build = { + context = ".", + dockerfile = "crates/orchestrator/Dockerfile", + }, container_name = 
"orchestrator-3", ports = [ "9093:9090", @@ -154,7 +161,10 @@ }, control-center = { - image = "provisioning-control-center:latest", + build = { + context = ".", + dockerfile = "crates/control-center/Dockerfile", + }, container_name = "control-center", ports = [ "8080:8080", @@ -165,10 +175,13 @@ CONTROL_CENTER_SERVER_PORT = "8080", CONTROL_CENTER_DATABASE = "postgres", CONTROL_CENTER_DATABASE_URL = "postgresql://provisioning:provisioning_prod@postgres/provisioning", - CONTROL_CENTER_ORCHESTRATOR_URL = "http://orchestrator-1:9090", - CONTROL_CENTER_LOG_LEVEL = "info", + ORCHESTRATOR_URL = "http://orchestrator-1:9090", + RUST_LOG = "info", CONTROL_CENTER_MFA_REQUIRED = "true", }, + volumes = [ + "control_center_data:/data", + ], networks = ["provisioning"], restart = "always", depends_on = { @@ -189,23 +202,36 @@ }, mcp-server = { - image = "provisioning-mcp-server:latest", + build = { + context = ".", + dockerfile = "crates/mcp-server/Dockerfile", + }, container_name = "mcp-server", ports = [ - "8888:8888", + "8082:8082", ], environment = { MCP_SERVER_MODE = "enterprise", MCP_SERVER_HOST = "0.0.0.0", - MCP_SERVER_PORT = "8888", - MCP_SERVER_LOG_LEVEL = "info", - MCP_SERVER_ORCHESTRATOR_URL = "http://orchestrator-1:9090", + MCP_SERVER_PORT = "8082", + RUST_LOG = "info", + ORCHESTRATOR_URL = "http://orchestrator-1:9090", }, + volumes = [ + "mcp_server_data:/data", + ], networks = ["provisioning"], restart = "always", - depends_on = ["orchestrator-1", "control-center"], + depends_on = { + orchestrator-1 = { + condition = "service_healthy", + }, + control-center = { + condition = "service_healthy", + }, + }, healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:8888/health"], + test = ["CMD", "curl", "-f", "http://localhost:8082/health"], interval = "30s", timeout = "10s", retries = 3, @@ -213,6 +239,80 @@ }, }, + provisioning-daemon = { + build = { + context = ".", + dockerfile = "crates/provisioning-daemon/Dockerfile", + }, + container_name = 
"provisioning-daemon", + ports = [ + "8079:8079", + ], + environment = { + RUST_LOG = "info", + DATA_DIR = "/data", + PROVISIONING_DAEMON_MODE = "enterprise", + PROVISIONING_CONFIG_DIR = "/etc/provisioning", + }, + volumes = [ + "daemon_data:/data", + "daemon_config:/etc/provisioning", + ], + networks = ["provisioning"], + restart = "always", + depends_on = { + orchestrator-1 = { + condition = "service_healthy", + }, + control-center = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:8079/api/v1/health"], + interval = "30s", + timeout = "10s", + retries = 3, + start_period = "30s", + }, + }, + + provisioning-rag = { + build = { + context = ".", + dockerfile = "crates/rag/docker/Dockerfile", + }, + container_name = "provisioning-rag", + ports = [ + "9090:9090", + ], + environment = { + PROVISIONING_LOG_LEVEL = "info", + PROVISIONING_API_HOST = "0.0.0.0", + PROVISIONING_API_PORT = "9090", + PROVISIONING_CACHE_SIZE = "5000", + PROVISIONING_CACHE_TTL_SECS = "7200", + }, + volumes = [ + "rag_data:/app/data", + "rag_cache:/app/cache", + ], + networks = ["provisioning"], + restart = "always", + depends_on = { + orchestrator-1 = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:9090/health"], + interval = "30s", + timeout = "10s", + retries = 3, + start_period = "5s", + }, + }, + nginx = { image = "nginx:alpine", container_name = "nginx-lb", @@ -311,6 +411,12 @@ postgres_primary = null, surrealdb_1 = null, surrealdb_2 = null, + control_center_data = null, + mcp_server_data = null, + daemon_data = null, + daemon_config = null, + rag_data = null, + rag_cache = null, nginx_cache = null, prometheus_data = null, grafana_data = null, diff --git a/schemas/platform/templates/docker-compose/platform-stack.multiuser.yml.ncl b/schemas/platform/templates/docker-compose/platform-stack.multiuser.yml.ncl index 9382bf8..ddc924d 100644 --- 
a/schemas/platform/templates/docker-compose/platform-stack.multiuser.yml.ncl +++ b/schemas/platform/templates/docker-compose/platform-stack.multiuser.yml.ncl @@ -3,8 +3,6 @@ # For team collaboration and staging environments { - version = "3.8", - services = { postgres = { image = "postgres:15-alpine", @@ -28,23 +26,27 @@ }, orchestrator = { - image = "provisioning-orchestrator:latest", + build = { + context = ".", + dockerfile = "crates/orchestrator/Dockerfile", + }, container_name = "orchestrator", ports = [ - "9090:9090", + "8080:8080", ], environment = { ORCHESTRATOR_MODE = "multiuser", ORCHESTRATOR_SERVER_HOST = "0.0.0.0", - ORCHESTRATOR_SERVER_PORT = "9090", + ORCHESTRATOR_SERVER_PORT = "8080", ORCHESTRATOR_STORAGE_BACKEND = "surrealdb_server", ORCHESTRATOR_SURREALDB_URL = "surrealdb://surrealdb:8000", ORCHESTRATOR_SURREALDB_NAMESPACE = "provisioning", ORCHESTRATOR_SURREALDB_DATABASE = "orchestrator", - ORCHESTRATOR_LOG_LEVEL = "debug", + RUST_LOG = "debug", }, volumes = [ - "orchestrator_logs:/var/log/provisioning/orchestrator", + "orchestrator_data:/data", + "orchestrator_logs:/var/log/orchestrator", ], networks = ["provisioning"], restart = "unless-stopped", @@ -53,43 +55,6 @@ condition = "service_healthy", }, }, - healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:9090/health"], - interval = "30s", - timeout = "10s", - retries = 3, - start_period = "40s", - }, - }, - - control-center = { - image = "provisioning-control-center:latest", - container_name = "control-center", - ports = [ - "8080:8080", - ], - environment = { - CONTROL_CENTER_MODE = "multiuser", - CONTROL_CENTER_SERVER_HOST = "0.0.0.0", - CONTROL_CENTER_SERVER_PORT = "8080", - CONTROL_CENTER_DATABASE = "postgres", - CONTROL_CENTER_DATABASE_URL = "postgresql://provisioning:provisioning_dev@postgres/provisioning", - CONTROL_CENTER_LOG_LEVEL = "debug", - CONTROL_CENTER_MFA_REQUIRED = "false", - }, - volumes = [ - "control_center_logs:/var/log/provisioning/control-center", - ], - 
networks = ["provisioning"], - restart = "unless-stopped", - depends_on = { - postgres = { - condition = "service_healthy", - }, - orchestrator = { - condition = "service_healthy", - }, - }, healthcheck = { test = ["CMD", "curl", "-f", "http://localhost:8080/health"], interval = "30s", @@ -99,27 +64,40 @@ }, }, - mcp-server = { - image = "provisioning-mcp-server:latest", - container_name = "mcp-server", + control-center = { + build = { + context = ".", + dockerfile = "crates/control-center/Dockerfile", + }, + container_name = "control-center", ports = [ - "8888:8888", + "8081:8081", ], environment = { - MCP_SERVER_MODE = "multiuser", - MCP_SERVER_HOST = "0.0.0.0", - MCP_SERVER_PORT = "8888", - MCP_SERVER_PROTOCOL = "stdio", - MCP_SERVER_LOG_LEVEL = "debug", + CONTROL_CENTER_MODE = "multiuser", + CONTROL_CENTER_SERVER_HOST = "0.0.0.0", + CONTROL_CENTER_SERVER_PORT = "8081", + CONTROL_CENTER_DATABASE = "postgres", + CONTROL_CENTER_DATABASE_URL = "postgresql://provisioning:provisioning_dev@postgres/provisioning", + ORCHESTRATOR_URL = "http://orchestrator:8080", + RUST_LOG = "debug", + CONTROL_CENTER_MFA_REQUIRED = "false", }, volumes = [ - "mcp_server_logs:/var/log/provisioning/mcp-server", + "control_center_data:/data", ], networks = ["provisioning"], restart = "unless-stopped", - depends_on = ["orchestrator", "control-center"], + depends_on = { + postgres = { + condition = "service_healthy", + }, + orchestrator = { + condition = "service_healthy", + }, + }, healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:8888/health"], + test = ["CMD", "curl", "-f", "http://localhost:8081/health"], interval = "30s", timeout = "10s", retries = 3, @@ -127,6 +105,119 @@ }, }, + mcp-server = { + build = { + context = ".", + dockerfile = "crates/mcp-server/Dockerfile", + }, + container_name = "mcp-server", + ports = [ + "8082:8082", + ], + environment = { + MCP_SERVER_MODE = "multiuser", + MCP_SERVER_HOST = "0.0.0.0", + MCP_SERVER_PORT = "8082", + MCP_SERVER_PROTOCOL = 
"stdio", + ORCHESTRATOR_URL = "http://orchestrator:8080", + RUST_LOG = "debug", + }, + volumes = [ + "mcp_server_data:/data", + ], + networks = ["provisioning"], + restart = "unless-stopped", + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + control-center = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:8082/health"], + interval = "30s", + timeout = "10s", + retries = 3, + start_period = "40s", + }, + }, + + provisioning-daemon = { + build = { + context = ".", + dockerfile = "crates/provisioning-daemon/Dockerfile", + }, + container_name = "provisioning-daemon", + ports = [ + "8079:8079", + ], + environment = { + RUST_LOG = "debug", + DATA_DIR = "/data", + PROVISIONING_DAEMON_MODE = "multiuser", + PROVISIONING_CONFIG_DIR = "/etc/provisioning", + }, + volumes = [ + "daemon_data:/data", + "daemon_config:/etc/provisioning", + ], + networks = ["provisioning"], + restart = "unless-stopped", + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + postgres = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:8079/api/v1/health"], + interval = "30s", + timeout = "10s", + retries = 3, + start_period = "30s", + }, + }, + + provisioning-rag = { + build = { + context = ".", + dockerfile = "crates/rag/docker/Dockerfile", + }, + container_name = "provisioning-rag", + ports = [ + "9090:9090", + ], + environment = { + PROVISIONING_LOG_LEVEL = "debug", + PROVISIONING_API_HOST = "0.0.0.0", + PROVISIONING_API_PORT = "9090", + PROVISIONING_CACHE_SIZE = "2000", + PROVISIONING_CACHE_TTL_SECS = "5400", + }, + volumes = [ + "rag_data:/app/data", + "rag_cache:/app/cache", + ], + networks = ["provisioning"], + restart = "unless-stopped", + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:9090/health"], + interval = "30s", + 
timeout = "10s", + retries = 3, + start_period = "5s", + }, + }, + surrealdb = { image = "surrealdb/surrealdb:latest", container_name = "surrealdb", @@ -177,9 +268,14 @@ volumes = { postgres_data = null, + orchestrator_data = null, orchestrator_logs = null, - control_center_logs = null, - mcp_server_logs = null, + control_center_data = null, + mcp_server_data = null, + daemon_data = null, + daemon_config = null, + rag_data = null, + rag_cache = null, surrealdb_data = null, gitea_data = null, }, diff --git a/schemas/platform/templates/docker-compose/platform-stack.solo.yml.ncl b/schemas/platform/templates/docker-compose/platform-stack.solo.yml.ncl index 615bb57..527b8ec 100644 --- a/schemas/platform/templates/docker-compose/platform-stack.solo.yml.ncl +++ b/schemas/platform/templates/docker-compose/platform-stack.solo.yml.ncl @@ -1,60 +1,149 @@ # Docker Compose Platform Stack - Solo Mode -# Imports configuration from values/orchestrator.solo.ncl and control-center.solo.ncl -# Exports to JSON, then converted to YAML by render-docker-compose.nu -# Usage: nickel export --format json platform-stack.solo.yml.ncl | yq -P +# Imports configuration from user config files +# User configs are located in ~/Library/Application Support/provisioning/platform/config/ (macOS) +# or ~/.config/provisioning/platform/config/ (Linux) +# NICKEL_IMPORT_PATH is set by platform-generate-manifests.nu using with-env +# Usage: ./provisioning/scripts/platform-generate-manifests.nu docker + +let orchestrator_config = (import "orchestrator.ncl").orchestrator in +let control_center_config = (import "control-center.ncl").control_center in +let mcp_server_config = (import "mcp-server.ncl").mcp_server in { - version = "3.8", - services = { + # External Infrastructure Services (required for platform to function) + + # SurrealDB: Database for orchestrator and control-center + # Solo mode uses in-memory storage (memory) for simplicity + # Change to rocksdb:/data for persistent storage + surrealdb = { + 
image = "surrealdb/surrealdb:latest", + container_name = "surrealdb-solo", + command = "start --log=warn --bind 0.0.0.0:8000 memory", + ports = [ + "8000:8000", + ], + networks = ["provisioning-net"], + restart = "unless-stopped", + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:8000/health"], + interval = "10s", + timeout = "5s", + retries = 3, + start_period = "10s", + }, + }, + + # Zot: OCI registry for extension distribution + # Optional: Can be disabled if extensions are loaded from filesystem + zot = { + image = "ghcr.io/project-zot/zot:latest", + container_name = "zot-solo", + ports = [ + "5000:5000", + ], + environment = { + ZOT_LOG_LEVEL = "info", + }, + volumes = [ + "zot_data:/var/lib/registry", + ], + networks = ["provisioning-net"], + restart = "unless-stopped", + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:5000/v2/"], + interval = "10s", + timeout = "5s", + retries = 3, + start_period = "10s", + }, + }, + + # Forgejo: Git source for extension discovery and releases + # Optional: Can use external Forgejo/Gitea instance instead + forgejo = { + image = "codeberg.org/forgejo/forgejo:latest", + container_name = "forgejo-solo", + ports = [ + "3000:3000", + "2222:22", + ], + environment = { + USER_UID = "1000", + USER_GID = "1000", + FORGEJO_SECURITY_SECRET_KEY = "changeme_in_production", + FORGEJO_SECURITY_INSTALL_LOCK = "true", + }, + volumes = [ + "forgejo_data:/data", + ], + networks = ["provisioning-net"], + restart = "unless-stopped", + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:3000/api/v1/version"], + interval = "10s", + timeout = "5s", + retries = 3, + start_period = "30s", + }, + }, + + # Platform Services (internal) + orchestrator = { - image = "provisioning-orchestrator:latest", + build = { + context = ".", + dockerfile = "crates/orchestrator/Dockerfile", + }, container_name = "orchestrator", ports = [ - "9090:9090", + (std.string.from_number orchestrator_config.server.port) ++ ":8080", 
], environment = { ORCHESTRATOR_MODE = "solo", - ORCHESTRATOR_SERVER_HOST = "0.0.0.0", - ORCHESTRATOR_SERVER_PORT = "9090", - ORCHESTRATOR_STORAGE_BACKEND = "filesystem", - ORCHESTRATOR_STORAGE_PATH = "/var/lib/provisioning/orchestrator/data", - ORCHESTRATOR_LOG_LEVEL = "info", + ORCHESTRATOR_SERVER_HOST = orchestrator_config.server.host, + ORCHESTRATOR_SERVER_PORT = "8080", + ORCHESTRATOR_STORAGE_BACKEND = orchestrator_config.storage.backend, + ORCHESTRATOR_STORAGE_PATH = orchestrator_config.storage.path, + RUST_LOG = "info", }, volumes = [ - "orchestrator_data:/var/lib/provisioning/orchestrator/data", - "orchestrator_logs:/var/log/provisioning/orchestrator", + "orchestrator_data:/data", + "orchestrator_logs:/var/log/orchestrator", ], - networks = ["provisioning"], + networks = ["provisioning-net"], restart = "unless-stopped", healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:9090/health"], - interval = "30s", - timeout = "10s", + test = ["CMD", "curl", "-f", "http://localhost:8080/health"], + interval = "10s", + timeout = "5s", retries = 3, - start_period = "40s", + start_period = "10s", }, }, control-center = { - image = "provisioning-control-center:latest", + build = { + context = ".", + dockerfile = "crates/control-center/Dockerfile", + }, container_name = "control-center", ports = [ - "8080:8080", + (std.string.from_number control_center_config.server.port) ++ ":8081", ], environment = { CONTROL_CENTER_MODE = "solo", - CONTROL_CENTER_SERVER_HOST = "0.0.0.0", - CONTROL_CENTER_SERVER_PORT = "8080", - CONTROL_CENTER_DATABASE = "rocksdb", - CONTROL_CENTER_DATABASE_PATH = "/var/lib/provisioning/control-center/db", - CONTROL_CENTER_LOG_LEVEL = "info", + CONTROL_CENTER_SERVER_HOST = control_center_config.server.host, + CONTROL_CENTER_SERVER_PORT = "8081", + CONTROL_CENTER_DATABASE = control_center_config.database.backend, + CONTROL_CENTER_DATABASE_PATH = control_center_config.database.path, + ORCHESTRATOR_URL = "http://orchestrator:8080", + RUST_LOG = 
"info", }, volumes = [ - "control_center_data:/var/lib/provisioning/control-center/db", - "control_center_logs:/var/log/provisioning/control-center", + "control_center_data:/data", ], - networks = ["provisioning"], + networks = ["provisioning-net"], restart = "unless-stopped", depends_on = { orchestrator = { @@ -62,53 +151,105 @@ }, }, healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:8080/health"], - interval = "30s", - timeout = "10s", + test = ["CMD", "curl", "-f", "http://localhost:8081/health"], + interval = "10s", + timeout = "5s", retries = 3, - start_period = "40s", + start_period = "10s", }, }, mcp-server = { - image = "provisioning-mcp-server:latest", + build = { + context = ".", + dockerfile = "crates/mcp-server/Dockerfile", + }, container_name = "mcp-server", ports = [ - "8888:8888", + (std.string.from_number mcp_server_config.server.port) ++ ":8082", ], environment = { MCP_SERVER_MODE = "solo", - MCP_SERVER_HOST = "0.0.0.0", - MCP_SERVER_PORT = "8888", + MCP_SERVER_HOST = mcp_server_config.server.host, + MCP_SERVER_PORT = "8082", MCP_SERVER_PROTOCOL = "stdio", - MCP_SERVER_LOG_LEVEL = "info", + ORCHESTRATOR_URL = "http://orchestrator:8080", + RUST_LOG = "info", }, volumes = [ - "mcp_server_logs:/var/log/provisioning/mcp-server", + "mcp_server_data:/data", ], - networks = ["provisioning"], + networks = ["provisioning-net"], restart = "unless-stopped", - depends_on = ["orchestrator", "control-center"], + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + }, healthcheck = { - test = ["CMD", "curl", "-f", "http://localhost:8888/health"], + test = ["CMD", "curl", "-f", "http://localhost:8082/health"], + interval = "10s", + timeout = "5s", + retries = 3, + start_period = "10s", + }, + }, + + provisioning-daemon = { + build = { + context = ".", + dockerfile = "crates/provisioning-daemon/Dockerfile", + }, + container_name = "provisioning-daemon", + ports = [ + "8079:8079", + ], + environment = { + RUST_LOG = "info", + 
DATA_DIR = "/data", + PROVISIONING_DAEMON_MODE = "solo", + PROVISIONING_CONFIG_DIR = "/etc/provisioning", + }, + volumes = [ + "daemon_data:/data", + "daemon_config:/etc/provisioning", + ], + networks = ["provisioning-net"], + restart = "unless-stopped", + depends_on = { + orchestrator = { + condition = "service_healthy", + }, + }, + healthcheck = { + test = ["CMD", "curl", "-f", "http://localhost:8079/api/v1/health"], interval = "30s", timeout = "10s", retries = 3, - start_period = "40s", + start_period = "30s", }, }, + + # TODO: provisioning-rag requires stratum-llm and stratum-embeddings workspace resolution + # Disabled temporarily until workspace dependencies are resolved + # provisioning-rag = { ... }, }, volumes = { + # External services volumes + zot_data = null, + forgejo_data = null, + # Platform services volumes orchestrator_data = null, orchestrator_logs = null, control_center_data = null, - control_center_logs = null, - mcp_server_logs = null, + mcp_server_data = null, + daemon_data = null, + daemon_config = null, }, networks = { - provisioning = { + provisioning-net = { driver = "bridge", }, }, diff --git a/schemas/platform/templates/docker/Dockerfile.chef.ncl b/schemas/platform/templates/docker/Dockerfile.chef.ncl new file mode 100644 index 0000000..325c2ea --- /dev/null +++ b/schemas/platform/templates/docker/Dockerfile.chef.ncl @@ -0,0 +1,179 @@ +# Dockerfile Template Generator with cargo-chef Multi-Stage Build +# Generates optimized 4-stage Dockerfile with dependency caching +# +# Usage: +# Pass build config as parameter: +# let template = import "this-file.ncl" in +# let defaults = import "../../defaults/orchestrator-defaults.ncl" in +# template defaults.orchestrator.build +# +# Stages: +# 1. PLANNER - Generate cargo-chef recipe.json (dependency graph) +# 2. CACHER - Build dependencies only (cached layer) +# 3. BUILDER - Build source code (uses CACHER artifacts) +# 4. 
RUNTIME - Minimal runtime image with binary only
+
+# Template function: build_config record -> complete Dockerfile text (a single String)
+fun build_config =>
+  let package = build_config.package in
+  let binary = build_config.binary in
+  let base_image = build_config.base_image in
+  let runtime_image = build_config.runtime_image in
+  let port = build_config.port in
+  let health_path = build_config.health_path in
+  let features = build_config.features in
+  let extra_runtime_pkgs = build_config.extra_runtime_pkgs in
+  let user_id = build_config.user_id in
+  let config_file = build_config.config_file in
+  # NOTE: build_config.chef_enabled is not read here - all four stages are always emitted
+  let sccache_enabled = build_config.sccache.enabled in
+  let buildkit_jobs = build_config.buildkit.parallel_jobs in
+
+  # Conditional fragments. features_arg carries its own leading space so an empty feature list leaves no trailing whitespace
+  let features_arg =
+    if std.array.length features > 0 then
+      " --features " ++ (std.string.join "," features)
+    else
+      "" in
+  # Used in both the cacher and builder stages; becomes a plain Dockerfile comment when sccache is off
+  let sccache_install =
+    if sccache_enabled then
+      "RUN cargo install sccache --version 0.8.0\nENV RUSTC_WRAPPER=sccache"
+    else
+      "# sccache disabled" in
+  # Continuation lines appended after "curl" in the runtime apt-get install
+  let extra_pkgs_str =
+    if std.array.length extra_runtime_pkgs > 0 then
+      " \\\n " ++ (std.string.join " \\\n " extra_runtime_pkgs)
+    else
+      "" in
+  # Per-crate defaults file, copied only when config_file is non-empty
+  let config_copy =
+    if config_file != "" then
+      "COPY crates/" ++ package ++ "/" ++ config_file ++ " /etc/provisioning/config.defaults.toml"
+    else
+      "# No config file to copy" in
+  # /app is pre-created and chowned in stage 4 so the WORKDIR is writable by the non-root runtime user
+  # Generate Dockerfile content: array elements are joined with "\n"
+  std.string.join "\n" [
+    "# Multi-stage build for " ++ package,
+    "# Generated from Nickel template - DO NOT EDIT DIRECTLY",
+    "# Source: provisioning/schemas/platform/templates/docker/Dockerfile.chef.ncl",
+    "",
+    "# ============================================================================",
+    "# Stage 1: PLANNER - Generate dependency recipe",
+    "# ============================================================================",
+    "FROM " ++ base_image ++ " AS planner",
+    "",
+    "WORKDIR /workspace",
+    "",
+    "# Install cargo-chef",
+    "RUN cargo install cargo-chef --version 0.1.67",
+    "",
+    "# Copy workspace manifests",
+    "COPY Cargo.toml Cargo.lock ./",
+    "COPY crates ./crates",
+    "COPY daemon-cli ./daemon-cli",
+    "COPY secretumvault ./secretumvault",
+    "COPY prov-ecosystem ./prov-ecosystem",
+    "COPY stratumiops ./stratumiops",
+    "",
+    "# Generate recipe.json (dependency graph)",
+    "RUN cargo chef prepare --recipe-path recipe.json --bin " ++ binary,
+    "",
+    "# ============================================================================",
+    "# Stage 2: CACHER - Build dependencies only",
+    "# ============================================================================",
+    "FROM " ++ base_image ++ " AS cacher",
+    "",
+    "WORKDIR /workspace",
+    "",
+    "# Install build dependencies",
+    "RUN apt-get update && apt-get install -y \\",
+    " pkg-config \\",
+    " libssl-dev \\",
+    " && rm -rf /var/lib/apt/lists/*",
+    "",
+    "# Install cargo-chef",
+    "RUN cargo install cargo-chef --version 0.1.67",
+    "",
+    sccache_install,
+    "",
+    "# Copy recipe from planner",
+    "COPY --from=planner /workspace/recipe.json recipe.json",
+    "",
+    "# Build dependencies - This layer will be cached",
+    "RUN cargo chef cook --release --recipe-path recipe.json" ++ features_arg,
+    "",
+    "# ============================================================================",
+    "# Stage 3: BUILDER - Build source code",
+    "# ============================================================================",
+    "FROM " ++ base_image ++ " AS builder",
+    "",
+    "WORKDIR /workspace",
+    "",
+    "# Install build dependencies",
+    "RUN apt-get update && apt-get install -y \\",
+    " pkg-config \\",
+    " libssl-dev \\",
+    " && rm -rf /var/lib/apt/lists/*",
+    "",
+    sccache_install,
+    "",
+    "# Copy cached dependencies from cacher stage",
+    "COPY --from=cacher /workspace/target target",
+    "COPY --from=cacher /usr/local/cargo /usr/local/cargo",
+    "",
+    "# Copy source code",
+    "COPY Cargo.toml Cargo.lock ./",
+    "COPY crates ./crates",
+    "COPY daemon-cli ./daemon-cli",
+    "COPY secretumvault ./secretumvault",
+    "COPY prov-ecosystem ./prov-ecosystem",
+    "COPY stratumiops ./stratumiops",
+    "",
+    "# Build release binary with parallelism",
+    "ENV CARGO_BUILD_JOBS=" ++ std.string.from_number buildkit_jobs,
+    "RUN cargo build --release --package " ++ package ++ features_arg,
+    "",
+    "# ============================================================================",
+    "# Stage 4: RUNTIME - Minimal runtime image",
+    "# ============================================================================",
+    "FROM " ++ runtime_image,
+    "",
+    "# Install runtime dependencies",
+    "RUN apt-get update && apt-get install -y \\",
+    " ca-certificates \\",
+    " curl" ++ extra_pkgs_str ++ " \\",
+    " && rm -rf /var/lib/apt/lists/*",
+    "",
+    "# Create non-root user",
+    "RUN useradd -m -u " ++ std.string.from_number user_id ++ " provisioning && \\",
+    " mkdir -p /app /data /var/log/" ++ package ++ " && \\",
+    " chown -R provisioning:provisioning /app /data /var/log/" ++ package,
+    "",
+    "# Copy binary from builder",
+    "COPY --from=builder /workspace/target/release/" ++ binary ++ " /usr/local/bin/" ++ binary,
+    "RUN chmod +x /usr/local/bin/" ++ binary,
+    "",
+    config_copy,
+    "",
+    "# Switch to non-root user",
+    "USER provisioning",
+    "WORKDIR /app",
+    "",
+    "# Expose service port",
+    "EXPOSE " ++ std.string.from_number port,
+    "",
+    "# Environment variables",
+    "ENV RUST_LOG=info",
+    "ENV DATA_DIR=/data",
+    "",
+    "# Health check",
+    "HEALTHCHECK --interval=30s --timeout=10s --start-period=30s --retries=3 \\",
+    " CMD curl -f http://localhost:" ++ std.string.from_number port ++ health_path ++ " || exit 1",
+    "",
+    "# Run the binary",
+    "CMD [\"" ++ binary ++ "\"]",
+    "",
+  ]
diff --git a/schemas/platform/templates/docker/docker-compose.build.yml.ncl b/schemas/platform/templates/docker/docker-compose.build.yml.ncl
new file mode 100644
index 0000000..c1441dd
--- /dev/null
+++
b/schemas/platform/templates/docker/docker-compose.build.yml.ncl
@@ -0,0 +1,135 @@
+# Docker Compose Build Configuration
+# Generates docker-compose file optimized for building all services with BuildKit caching
+# Usage: nickel export --format yaml docker-compose.build.yml.ncl > docker-compose.build.yml
+#
+# Pattern:
+# - Imports all service defaults to extract build configs
+# - Generates build-only services (no runtime config)
+# - Supports BuildKit cache modes: registry, local, inline
+# - Enables parallel builds with --parallel flag
+#
+# Generated file usage:
+# docker compose -f docker-compose.build.yml build --parallel
+# docker compose -f docker-compose.build.yml build orchestrator
+
+# Import all service defaults
+let orchestrator_defaults = import "../../defaults/orchestrator-defaults.ncl" in
+let control_center_defaults = import "../../defaults/control-center-defaults.ncl" in
+let extension_registry_defaults = import "../../defaults/extension-registry-defaults.ncl" in
+let mcp_server_defaults = import "../../defaults/mcp-server-defaults.ncl" in
+let daemon_defaults = import "../../defaults/provisioning-daemon-defaults.ncl" in
+let ai_service_defaults = import "../../defaults/ai-service-defaults.ncl" in
+let rag_defaults = import "../../defaults/rag-defaults.ncl" in
+let vault_defaults = import "../../defaults/vault-service-defaults.ncl" in
+
+# cache_config: build the BuildKit `cache_from` / `cache_to` lists for one service.
+#   build_cfg - build record; reads `buildkit.cache_mode` and `package`
+#   registry  - registry host used in the cache ref when the mode is 'registry
+# Any mode other than 'registry or 'local falls through to the inline branch.
+let cache_config = fun build_cfg registry =>
+  let mode = build_cfg.buildkit.cache_mode in
+  if mode == 'registry then
+    {
+      cache_from = [
+        "type=registry,ref=" ++ registry ++ "/" ++ build_cfg.package ++ ":buildcache"
+      ],
+      cache_to = [
+        "type=registry,ref=" ++ registry ++ "/" ++ build_cfg.package ++ ":buildcache,mode=max"
+      ],
+    }
+  else if mode == 'local then
+    {
+      cache_from = [
+        "type=local,src=/tmp/docker-cache/" ++ build_cfg.package
+      ],
+      cache_to = [
+        "type=local,dest=/tmp/docker-cache/" ++ build_cfg.package ++ ",mode=max"
+      ],
+    }
+  else # inline
+    {
+      cache_from = [],
+      cache_to = [
+        "type=inline"
+      ],
+    }
+in
+
+# build_args: Docker build args for one service; CARGO_BUILD_JOBS comes from
+# build_cfg.buildkit.parallel_jobs, RUST_LOG is fixed to "info".
+let build_args = fun build_cfg => {
+  CARGO_BUILD_JOBS = std.string.from_number build_cfg.buildkit.parallel_jobs,
+  RUST_LOG = "info",
+} in
+
+# service_build: one compose service entry (build section merged with cache settings, plus image tag).
+#   name            - service name; kept for call-site readability, not used in the output
+#   dockerfile_path - Dockerfile path, relative to the build context
+#   build_cfg       - build record passed on to build_args / cache_config; `package` names the image
+#   registry        - registry host prefixed to the image tag
+let service_build = fun name dockerfile_path build_cfg registry => {
+  build = {
+    context = "../../platform",
+    dockerfile = dockerfile_path,
+    args = build_args build_cfg,
+  } & (cache_config build_cfg registry),
+  image = registry ++ "/" ++ build_cfg.package ++ ":latest",
+} in
+
+# Default registry used for image tags and registry-mode cache refs.
+# NOTE(review): this is a let-binding rather than a field of the exported record, so it is
+# presumably out of reach of `nickel export ... -- --override`; edit the value here to change it.
+let registry = "localhost:5000" in
+
+{
+  # No top-level `version` key, consistent with the platform-stack.*.yml.ncl templates
+  services = {
+    orchestrator = service_build
+      "orchestrator"
+      "crates/orchestrator/Dockerfile"
+      orchestrator_defaults.orchestrator.build
+      registry,
+
+    control-center = service_build
+      "control-center"
+      "crates/control-center/Dockerfile"
+      control_center_defaults.control_center.build
+      registry,
+
+    extension-registry = service_build
+      "extension-registry"
+      "crates/extension-registry/Dockerfile"
+      extension_registry_defaults.extension_registry.build
+      registry,
+
+    mcp-server = service_build
+      "mcp-server"
+      "crates/mcp-server/Dockerfile"
+      mcp_server_defaults.mcp_server.build
+      registry,
+
+    provisioning-daemon = service_build
+      "provisioning-daemon"
+      "crates/provisioning-daemon/Dockerfile"
+      daemon_defaults.provisioning_daemon.build
+      registry,
+
+    ai-service = service_build
+      "ai-service"
+      "crates/ai-service/Dockerfile"
+      ai_service_defaults.ai_service.build
+      registry,
+
+    rag = service_build
+      "rag"
+      "crates/rag/docker/Dockerfile"
+      rag_defaults.rag.build
+      registry,
+
+    vault-service = service_build
+      "vault-service"
+      "crates/vault-service/Dockerfile"
+      vault_defaults.vault.build
+      registry,
+  },
+}
diff --git
a/schemas/platform/templates/extension-registry-config.ncl.j2 b/schemas/platform/templates/extension-registry-config.ncl.j2 index cdfe389..734c65d 100644 --- a/schemas/platform/templates/extension-registry-config.ncl.j2 +++ b/schemas/platform/templates/extension-registry-config.ncl.j2 @@ -3,7 +3,7 @@ # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu extension-registry {mode} # Or manually edit and validate with: nickel typecheck -let registry_schema = import "../schemas/extension-registry.ncl" in +let registry_schema = import "../extension-registry.ncl" in { extension_registry | registry_schema.RegistryConfig = { diff --git a/schemas/platform/templates/installer-config.ncl.j2 b/schemas/platform/templates/installer-config.ncl.j2 index 6abfa55..c13e0db 100644 --- a/schemas/platform/templates/installer-config.ncl.j2 +++ b/schemas/platform/templates/installer-config.ncl.j2 @@ -2,7 +2,7 @@ # Auto-generated by provisioning TypeDialog # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu installer {mode} -let installer_schema = import "../schemas/installer.ncl" in +let installer_schema = import "../installer.ncl" in { installer | installer_schema.InstallerConfig = { diff --git a/schemas/platform/templates/kubernetes/control-center-deployment.yaml.ncl b/schemas/platform/templates/kubernetes/control-center-deployment.yaml.ncl index 95ef86a..ea51320 100644 --- a/schemas/platform/templates/kubernetes/control-center-deployment.yaml.ncl +++ b/schemas/platform/templates/kubernetes/control-center-deployment.yaml.ncl @@ -2,9 +2,13 @@ # Policy and RBAC management service # Supports 4 deployment modes: solo, multiuser, cicd, enterprise # -# Usage: -# nickel eval --format json control-center-deployment.yaml.ncl | yq -P > control-center-deployment.yaml -# kubectl apply -f control-center-deployment.yaml +# Imports user configuration from control-center.ncl +# Extracts port and host values for service configuration +# +# 
Usage (called by generate-manifests.nu): +# ./provisioning/scripts/platform-generate-manifests.nu kubernetes + +let control_center_config = (import "control-center.ncl").control_center in { apiVersion = "apps/v1", @@ -64,12 +68,7 @@ ports = [ { name = "http", - containerPort = 8080, - protocol = "TCP", - }, - { - name = "metrics", - containerPort = 8081, + containerPort = control_center_config.server.port, protocol = "TCP", }, ], @@ -81,11 +80,11 @@ }, { name = "CONTROL_CENTER_SERVER_HOST", - value = "0.0.0.0", + value = control_center_config.server.host, }, { name = "CONTROL_CENTER_SERVER_PORT", - value = "8080", + value = std.string.from_number control_center_config.server.port, }, { name = "CONTROL_CENTER_DATABASE", @@ -188,7 +187,7 @@ livenessProbe = { httpGet = { path = "/health", - port = 8080, + port = control_center_config.server.port, }, initialDelaySeconds = 30, periodSeconds = 10, @@ -199,7 +198,7 @@ readinessProbe = { httpGet = { path = "/ready", - port = 8080, + port = control_center_config.server.port, }, initialDelaySeconds = 20, periodSeconds = 5, diff --git a/schemas/platform/templates/kubernetes/control-center-service.yaml.ncl b/schemas/platform/templates/kubernetes/control-center-service.yaml.ncl index 8a1545f..4e1bb59 100644 --- a/schemas/platform/templates/kubernetes/control-center-service.yaml.ncl +++ b/schemas/platform/templates/kubernetes/control-center-service.yaml.ncl @@ -1,9 +1,11 @@ # Control Center Kubernetes Service # Exposes Control Center API and UI +# Imports user configuration from control-center.ncl # -# Usage: -# nickel eval --format json control-center-service.yaml.ncl | yq -P > control-center-service.yaml -# kubectl apply -f control-center-service.yaml +# Usage (called by generate-manifests.nu): +# ./provisioning/scripts/platform-generate-manifests.nu kubernetes + +let control_center_config = (import "control-center.ncl").control_center in { apiVersion = "v1", @@ -36,14 +38,8 @@ { name = "http", protocol = "TCP", - port = 8080, 
- targetPort = 8080, - }, - { - name = "metrics", - protocol = "TCP", - port = 8081, - targetPort = 8081, + port = control_center_config.server.port, + targetPort = control_center_config.server.port, }, ], }, diff --git a/schemas/platform/templates/kubernetes/mcp-server-deployment.yaml.ncl b/schemas/platform/templates/kubernetes/mcp-server-deployment.yaml.ncl index 46be1b5..a77a2d7 100644 --- a/schemas/platform/templates/kubernetes/mcp-server-deployment.yaml.ncl +++ b/schemas/platform/templates/kubernetes/mcp-server-deployment.yaml.ncl @@ -1,10 +1,12 @@ # MCP Server Kubernetes Deployment # Model Context Protocol server for AI integration # Provides tools, resources, and prompts to Claude and other LLMs +# Imports user configuration from mcp-server.ncl # -# Usage: -# nickel eval --format json mcp-server-deployment.yaml.ncl | yq -P > mcp-server-deployment.yaml -# kubectl apply -f mcp-server-deployment.yaml +# Usage (called by generate-manifests.nu): +# ./provisioning/scripts/platform-generate-manifests.nu kubernetes + +let mcp_server_config = (import "mcp-server.ncl").mcp_server in { apiVersion = "apps/v1", @@ -37,7 +39,7 @@ }, annotations = { "prometheus.io/scrape" = "true", - "prometheus.io/port" = "8888", + "prometheus.io/port" = std.string.from_number mcp_server_config.server.port, "prometheus.io/path" = "/metrics", }, }, @@ -64,12 +66,7 @@ ports = [ { name = "http", - containerPort = 8888, - protocol = "TCP", - }, - { - name = "metrics", - containerPort = 8889, + containerPort = mcp_server_config.server.port, protocol = "TCP", }, ], @@ -81,11 +78,11 @@ }, { name = "MCP_SERVER_HOST", - value = "0.0.0.0", + value = mcp_server_config.server.host, }, { name = "MCP_SERVER_PORT", - value = "8888", + value = std.string.from_number mcp_server_config.server.port, }, { name = "MCP_SERVER_PROTOCOL", @@ -197,7 +194,7 @@ livenessProbe = { httpGet = { path = "/health", - port = 8888, + port = mcp_server_config.server.port, }, initialDelaySeconds = 30, periodSeconds = 15, @@ 
-208,7 +205,7 @@ readinessProbe = { httpGet = { path = "/ready", - port = 8888, + port = mcp_server_config.server.port, }, initialDelaySeconds = 20, periodSeconds = 5, diff --git a/schemas/platform/templates/kubernetes/mcp-server-service.yaml.ncl b/schemas/platform/templates/kubernetes/mcp-server-service.yaml.ncl index 89ca483..596574d 100644 --- a/schemas/platform/templates/kubernetes/mcp-server-service.yaml.ncl +++ b/schemas/platform/templates/kubernetes/mcp-server-service.yaml.ncl @@ -1,9 +1,11 @@ # MCP Server Kubernetes Service # Exposes MCP server for AI/LLM integration +# Imports user configuration from mcp-server.ncl # -# Usage: -# nickel eval --format json mcp-server-service.yaml.ncl | yq -P > mcp-server-service.yaml -# kubectl apply -f mcp-server-service.yaml +# Usage (called by generate-manifests.nu): +# ./provisioning/scripts/platform-generate-manifests.nu kubernetes + +let mcp_server_config = (import "mcp-server.ncl").mcp_server in { apiVersion = "v1", @@ -30,14 +32,8 @@ { name = "http", protocol = "TCP", - port = 8888, - targetPort = 8888, - }, - { - name = "metrics", - protocol = "TCP", - port = 8889, - targetPort = 8889, + port = mcp_server_config.server.port, + targetPort = mcp_server_config.server.port, }, ], }, diff --git a/schemas/platform/templates/kubernetes/namespace.yaml.ncl b/schemas/platform/templates/kubernetes/namespace.yaml.ncl index 13b0220..89df0f0 100644 --- a/schemas/platform/templates/kubernetes/namespace.yaml.ncl +++ b/schemas/platform/templates/kubernetes/namespace.yaml.ncl @@ -2,7 +2,7 @@ # Supports 4 deployment modes: solo, multiuser, cicd, enterprise # Includes RBAC setup, resource limits, and network isolation # -# Usage: +# Usage (called by generate-manifests.nu): # nickel eval --format json namespace.yaml.ncl | yq -P > namespace.yaml # kubectl apply -f namespace.yaml diff --git a/schemas/platform/templates/kubernetes/network-policy.yaml.ncl b/schemas/platform/templates/kubernetes/network-policy.yaml.ncl index 
be72703..a20b65a 100644 --- a/schemas/platform/templates/kubernetes/network-policy.yaml.ncl +++ b/schemas/platform/templates/kubernetes/network-policy.yaml.ncl @@ -3,7 +3,7 @@ # Default: deny all ingress (except specific rules below) # Allow: orchestrator <-> control-center <-> mcp-server # -# Usage: +# Usage (called by generate-manifests.nu): # nickel eval --format json network-policy.yaml.ncl | yq -P > network-policy.yaml # kubectl apply -f network-policy.yaml diff --git a/schemas/platform/templates/kubernetes/orchestrator-deployment.yaml.ncl b/schemas/platform/templates/kubernetes/orchestrator-deployment.yaml.ncl index 5f39a1f..ce0b42a 100644 --- a/schemas/platform/templates/kubernetes/orchestrator-deployment.yaml.ncl +++ b/schemas/platform/templates/kubernetes/orchestrator-deployment.yaml.ncl @@ -1,10 +1,12 @@ # Orchestrator Kubernetes Deployment # Supports 4 deployment modes: solo, multiuser, cicd, enterprise -# Exports to YAML via: nickel export --format json | yq -P +# Imports user configuration from orchestrator.ncl +# NICKEL_IMPORT_PATH is set by generate-manifests.nu # -# Usage: -# nickel eval --format json orchestrator-deployment.yaml.ncl | yq -P > orchestrator-deployment.yaml -# kubectl apply -f orchestrator-deployment.yaml +# Usage (called by generate-manifests.nu): +# ./provisioning/scripts/platform-generate-manifests.nu kubernetes + +let orchestrator_config = (import "orchestrator.ncl").orchestrator in { apiVersion = "apps/v1", @@ -37,7 +39,7 @@ }, annotations = { "prometheus.io/scrape" = "true", - "prometheus.io/port" = "9090", + "prometheus.io/port" = std.string.from_number orchestrator_config.server.port, "prometheus.io/path" = "/metrics", }, }, @@ -65,12 +67,7 @@ ports = [ { name = "http", - containerPort = 9090, - protocol = "TCP", - }, - { - name = "metrics", - containerPort = 9091, + containerPort = orchestrator_config.server.port, protocol = "TCP", }, ], @@ -82,24 +79,19 @@ }, { name = "ORCHESTRATOR_SERVER_HOST", - value = "0.0.0.0", + value 
= orchestrator_config.server.host, }, { name = "ORCHESTRATOR_SERVER_PORT", - value = "9090", + value = std.string.from_number orchestrator_config.server.port, }, { name = "ORCHESTRATOR_STORAGE_BACKEND", - valueFrom = { - configMapKeyRef = { - name = "orchestrator-config", - key = "storage_backend", - }, - }, + value = orchestrator_config.storage.backend, }, { name = "ORCHESTRATOR_STORAGE_PATH", - value = "/var/lib/provisioning/orchestrator/data", + value = orchestrator_config.storage.path, }, { name = "ORCHESTRATOR_QUEUE_MAX_CONCURRENT_TASKS", @@ -146,11 +138,11 @@ }, ], - # Health check: HTTP GET /health on port 9090 + # Health check: HTTP GET /health livenessProbe = { httpGet = { path = "/health", - port = 9090, + port = orchestrator_config.server.port, }, initialDelaySeconds = 30, periodSeconds = 10, @@ -162,7 +154,7 @@ readinessProbe = { httpGet = { path = "/health", - port = 9090, + port = orchestrator_config.server.port, }, initialDelaySeconds = 20, periodSeconds = 5, diff --git a/schemas/platform/templates/kubernetes/orchestrator-service.yaml.ncl b/schemas/platform/templates/kubernetes/orchestrator-service.yaml.ncl index 915116b..c17b623 100644 --- a/schemas/platform/templates/kubernetes/orchestrator-service.yaml.ncl +++ b/schemas/platform/templates/kubernetes/orchestrator-service.yaml.ncl @@ -1,10 +1,12 @@ # Orchestrator Kubernetes Service # Exposes orchestrator deployment internally and externally # Supports ClusterIP (internal) and LoadBalancer (external) service types +# Imports user configuration from orchestrator.ncl # -# Usage: -# nickel eval --format json orchestrator-service.yaml.ncl | yq -P > orchestrator-service.yaml -# kubectl apply -f orchestrator-service.yaml +# Usage (called by generate-manifests.nu): +# ./provisioning/scripts/platform-generate-manifests.nu kubernetes + +let orchestrator_config = (import "orchestrator.ncl").orchestrator in { apiVersion = "v1", @@ -43,14 +45,8 @@ { name = "http", protocol = "TCP", - port = 9090, - targetPort = 
9090, - }, - { - name = "metrics", - protocol = "TCP", - port = 9091, - targetPort = 9091, + port = orchestrator_config.server.port, + targetPort = orchestrator_config.server.port, }, ], diff --git a/schemas/platform/templates/kubernetes/platform-ingress.yaml.ncl b/schemas/platform/templates/kubernetes/platform-ingress.yaml.ncl index 1090d4e..e4e20da 100644 --- a/schemas/platform/templates/kubernetes/platform-ingress.yaml.ncl +++ b/schemas/platform/templates/kubernetes/platform-ingress.yaml.ncl @@ -6,7 +6,7 @@ # - Nginx Ingress Controller or similar # - TLS certificate (from Let's Encrypt or self-signed) # -# Usage: +# Usage (called by generate-manifests.nu): # nickel eval --format json platform-ingress.yaml.ncl | yq -P > platform-ingress.yaml # kubectl apply -f platform-ingress.yaml diff --git a/schemas/platform/templates/kubernetes/rbac.yaml.ncl b/schemas/platform/templates/kubernetes/rbac.yaml.ncl index cfcacd9..6d59c5a 100644 --- a/schemas/platform/templates/kubernetes/rbac.yaml.ncl +++ b/schemas/platform/templates/kubernetes/rbac.yaml.ncl @@ -1,7 +1,7 @@ # Kubernetes RBAC (Role-Based Access Control) for Provisioning # Creates ServiceAccounts and Roles for each service # -# Usage: +# Usage (called by generate-manifests.nu): # nickel eval --format json rbac.yaml.ncl | yq -P > rbac.yaml # kubectl apply -f rbac.yaml diff --git a/schemas/platform/templates/kubernetes/resource-quota.yaml.ncl b/schemas/platform/templates/kubernetes/resource-quota.yaml.ncl index 898c77d..199e2f2 100644 --- a/schemas/platform/templates/kubernetes/resource-quota.yaml.ncl +++ b/schemas/platform/templates/kubernetes/resource-quota.yaml.ncl @@ -6,7 +6,7 @@ # - CI/CD: 16 CPU, 32GB RAM, 50 storage, 50 pods max (ephemeral workloads) # - Enterprise: Unlimited (define via other means) # -# Usage: +# Usage (called by generate-manifests.nu): # nickel eval --format json resource-quota.yaml.ncl | yq -P > resource-quota.yaml # kubectl apply -f resource-quota.yaml diff --git 
a/schemas/platform/templates/mcp-server-config.ncl.j2 b/schemas/platform/templates/mcp-server-config.ncl.j2 index 2278707..098c997 100644 --- a/schemas/platform/templates/mcp-server-config.ncl.j2 +++ b/schemas/platform/templates/mcp-server-config.ncl.j2 @@ -2,7 +2,7 @@ # Auto-generated by provisioning TypeDialog # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu mcp-server {mode} -let mcp_server_schema = import "../schemas/mcp-server.ncl" in +let mcp_server_schema = import "../mcp-server.ncl" in { mcp_server | mcp_server_schema.MCPServerConfig = { diff --git a/schemas/platform/templates/orchestrator-config.ncl.j2 b/schemas/platform/templates/orchestrator-config.ncl.j2 index 8ed3eca..a3428a9 100644 --- a/schemas/platform/templates/orchestrator-config.ncl.j2 +++ b/schemas/platform/templates/orchestrator-config.ncl.j2 @@ -3,7 +3,7 @@ # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu orchestrator {mode} # Or manually edit and validate with: nickel typecheck -let orchestrator_schema = import "../schemas/orchestrator.ncl" in +let orchestrator_schema = import "../orchestrator.ncl" in { orchestrator | orchestrator_schema.OrchestratorConfig = { diff --git a/schemas/platform/templates/provisioning-daemon-config.ncl.j2 b/schemas/platform/templates/provisioning-daemon-config.ncl.j2 index f883933..2401b5c 100644 --- a/schemas/platform/templates/provisioning-daemon-config.ncl.j2 +++ b/schemas/platform/templates/provisioning-daemon-config.ncl.j2 @@ -3,7 +3,7 @@ # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu provisioning-daemon {mode} # Or manually edit and validate with: nickel typecheck -let daemon_schema = import "../schemas/provisioning-daemon.ncl" in +let daemon_schema = import "../provisioning-daemon.ncl" in { provisioning_daemon | daemon_schema.ProvisioningDaemonConfig = { diff --git a/schemas/platform/templates/rag-config.ncl.j2 
b/schemas/platform/templates/rag-config.ncl.j2 index b610c59..14991bc 100644 --- a/schemas/platform/templates/rag-config.ncl.j2 +++ b/schemas/platform/templates/rag-config.ncl.j2 @@ -3,7 +3,7 @@ # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu rag {mode} # Or manually edit and validate with: nickel typecheck -let rag_schema = import "../schemas/rag.ncl" in +let rag_schema = import "../rag.ncl" in { rag | rag_schema.RagConfig = { diff --git a/schemas/platform/templates/service-config-template.ncl b/schemas/platform/templates/service-config-template.ncl new file mode 100644 index 0000000..2436051 --- /dev/null +++ b/schemas/platform/templates/service-config-template.ncl @@ -0,0 +1,50 @@ +# Service Configuration Template +# Use this as a pattern for migrating user configs to schema-validated versions +# This example shows the pattern for any service configuration + +# Template for: SERVICE_NAME Configuration (SERVICE_NAME, DEPLOYMENT_MODE, service_name and ServiceNameConfig below are placeholders; substitute the real service names before evaluating) +# 1. Import the service schema +# 2. Import service defaults +# 3. Import deployment mode defaults +# 4. Import helpers for deep merge +# 5. Define user overrides (only fields to change) +# 6. Use compose_config to merge: defaults + mode + overrides +# 7. 
Validate by applying the schema contract (the `| service_schema.ServiceNameConfig` annotation on the composed record) + +let service_schema = import "../SERVICE_NAME.ncl" in +let service_defaults = import "../defaults/SERVICE_NAME-defaults.ncl" in +let mode_config = import "../defaults/deployment/DEPLOYMENT_MODE-defaults.ncl" in +let helpers = import "../common/helpers.ncl" in + +# Define user-specific overrides (only override what you need) +let user_overrides = { + # Example: workspace configuration + workspace = { + name = "my-service", + path = "/var/lib/provisioning/my-service", + }, + + # Example: server configuration + server = { + host = "0.0.0.0", + port = 9000, + workers = 4, + }, + + # Example: storage configuration (use only schema-supported fields) + storage = { + backend = "filesystem", + path = "/data/my-service", + }, + + # Add only the fields your service needs + # Other fields will be provided by defaults +} in + +# Compose: pass defaults, then mode-specific tuning, then user overrides to compose_config; the result is wrapped as { service_name = ... } with the schema contract applied +# This ensures all required fields have values +helpers.compose_config + service_defaults.service_name + mode_config.service_name + user_overrides +|> (fun config => {service_name = config | service_schema.ServiceNameConfig}) diff --git a/schemas/platform/templates/vault-service-config.ncl.j2 b/schemas/platform/templates/vault-service-config.ncl.j2 index 634fca5..dd4b9c8 100644 --- a/schemas/platform/templates/vault-service-config.ncl.j2 +++ b/schemas/platform/templates/vault-service-config.ncl.j2 @@ -3,7 +3,7 @@ # Edit via: nu provisioning/.typedialog/provisioning/platform/scripts/configure.nu vault-service {mode} # Or manually edit and validate with: nickel typecheck -let vault_schema = import "../schemas/vault-service.ncl" in +let vault_schema = import "../vault-service.ncl" in { vault_service | vault_schema.VaultServiceConfig = { diff --git 
a/schemas/platform/validators/README.md b/schemas/platform/validators/README.md deleted file mode 100644 index b9aff49..0000000 --- a/schemas/platform/validators/README.md +++ /dev/null @@ -1,329 +0,0 @@ -# 
Validators - -Validation logic for configuration values using constraints and business rules. - -## Purpose - -Validators provide: -- **Constraint checking** - Numeric ranges, required fields -- **Business logic validation** - Service-specific constraints -- **Error messages** - Clear feedback on invalid values -- **Composition with configs** - Validators applied during config generation - -## File Organization - -```bash -validators/ -├── README.md # This file -├── common-validator.ncl # Ports, positive numbers, strings -├── network-validator.ncl # IP addresses, bind addresses -├── path-validator.ncl # File paths, directories -├── resource-validator.ncl # CPU, memory, disk -├── string-validator.ncl # Workspace names, identifiers -├── orchestrator-validator.ncl # Queue, workflow validation -├── control-center-validator.ncl # RBAC, policy validation -├── mcp-server-validator.ncl # MCP tools, capabilities -└── deployment-validator.ncl # Resource allocation -``` - -## Validation Patterns - -### 1. Basic Range Validation - -```bash -# validators/common-validator.ncl -let constraints = import "../constraints/constraints.toml" in - -{ - ValidPort = fun port => - if port < constraints.common.server.port.min then - std.contract.blame_with_message "Port < 1024" port - else if port > constraints.common.server.port.max then - std.contract.blame_with_message "Port > 65535" port - else - port, -} -``` - -### 2. Range Validator (Reusable) - -```bash -# Reusable validator for any numeric range -ValidRange = fun min max value => - if value < min then - std.contract.blame_with_message "Value < %{std.to_string min}" value - else if value > max then - std.contract.blame_with_message "Value > %{std.to_string max}" value - else - value, -``` - -### 3. 
Enum Validation - -```json -{ - ValidStorageBackend = fun backend => - if backend != 'filesystem && - backend != 'rocksdb && - backend != 'surrealdb && - backend != 'postgres then - std.contract.blame_with_message "Invalid backend" backend - else - backend, -} -``` - -### 4. String Validation - -```json -{ - ValidNonEmptyString = fun s => - if s == "" then - std.contract.blame_with_message "Cannot be empty" s - else - s, - - ValidWorkspaceName = fun name => - if std.string.matches "^[a-z0-9_-]+$" name then - name - else - std.contract.blame_with_message "Invalid workspace name" name, -} -``` - -## Common Validators - -### common-validator.ncl - -```javascript -let constraints = import "../constraints/constraints.toml" in - -{ - # Port validation - ValidPort = fun port => - if port < constraints.common.server.port.min then error "Port too low" - else if port > constraints.common.server.port.max then error "Port too high" - else port, - - # Positive integer - ValidPositiveNumber = fun n => - if n <= 0 then error "Must be positive" - else n, - - # Non-empty string - ValidNonEmptyString = fun s => - if s == "" then error "Cannot be empty" - else s, - - # Generic range validator - ValidRange = fun min max value => - if value < min then error "Value below minimum" - else if value > max then error "Value above maximum" - else value, -} -``` - -### resource-validator.ncl - -```javascript -let constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in - -{ - # Validate CPU cores for deployment mode - ValidCPUCores = fun mode cores => - let limits = constraints.deployment.{mode} in - common.ValidRange limits.cpu.min limits.cpu.max cores, - - # Validate memory allocation - ValidMemory = fun mode memory_mb => - let limits = constraints.deployment.{mode} in - common.ValidRange limits.memory_mb.min limits.memory_mb.max memory_mb, -} -``` - -## Service-Specific Validators - -### orchestrator-validator.ncl - -```javascript -let 
constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in - -{ - # Validate worker count - ValidWorkers = fun workers => - common.ValidRange - constraints.orchestrator.workers.min - constraints.orchestrator.workers.max - workers, - - # Validate queue concurrency - ValidConcurrentTasks = fun tasks => - common.ValidRange - constraints.orchestrator.queue.concurrent_tasks.min - constraints.orchestrator.queue.concurrent_tasks.max - tasks, - - # Validate batch parallelism - ValidParallelLimit = fun limit => - common.ValidRange - constraints.orchestrator.batch.parallel_limit.min - constraints.orchestrator.batch.parallel_limit.max - limit, - - # Validate task timeout (ms) - ValidTaskTimeout = fun timeout => - if timeout < 1000 then error "Timeout < 1 second" - else if timeout > 86400000 then error "Timeout > 24 hours" - else timeout, -} -``` - -### control-center-validator.ncl - -```json -{ - # JWT token expiration - ValidTokenExpiration = fun seconds => - if seconds < 300 then error "Token expiration < 5 min" - else if seconds > 604800 then error "Token expiration > 7 days" - else seconds, - - # Rate limit threshold - ValidRateLimit = fun requests_per_minute => - if requests_per_minute < 10 then error "Rate limit too low" - else if requests_per_minute > 10000 then error "Rate limit too high" - else requests_per_minute, -} -``` - -### mcp-server-validator.ncl - -```json -{ - # Max concurrent tool executions - ValidConcurrentTools = fun count => - if count < 1 then error "Must allow >= 1 concurrent" - else if count > 20 then error "Max 20 concurrent tools" - else count, - - # Max resource size - ValidMaxResourceSize = fun bytes => - if bytes < 1048576 then error "Min 1 MB" - else if bytes > 1073741824 then error "Max 1 GB" - else bytes, -} -``` - -## Composition with Configs - -Validators are applied in config files: - -```toml -# configs/orchestrator.solo.ncl -let validators = import 
"../validators/orchestrator-validator.ncl" in - -{ - orchestrator = { - server.workers = validators.ValidWorkers 2, # Validated - queue.max_concurrent_tasks = validators.ValidConcurrentTasks 3, # Validated - }, -} -``` - -Validation happens at: -1. **Config composition** - When config is evaluated -2. **Nickel typecheck** - When config is typechecked -3. **Form submission** - When TypeDialog form is submitted (constraints) -4. **TOML export** - When Nickel is exported to TOML - -## Error Handling - -### Validation Errors - -```bash -# If validation fails during config evaluation: -# Error: Port too high -``` - -### Meaningful Messages - -Always provide context in error messages: - -```bash -# Bad -std.contract.blame "Invalid" value - -# Good -std.contract.blame_with_message "Port must be 1024-65535, got %{std.to_string value}" port -``` - -## Best Practices - -1. **Reuse common validators** - Build from common-validator.ncl -2. **Name clearly** - Prefix with "Valid" (ValidPort, ValidWorkers, etc.) -3. **Error messages** - Include valid range or enum in message -4. **Test edge cases** - Verify min/max boundary values -5. **Document assumptions** - Why a constraint exists - -## Testing Validators - -```bash -# Test a single validator -nickel eval -c 'import "validators/orchestrator-validator.ncl" as v in v.ValidWorkers 2' - -# Test config with validators -nickel typecheck provisioning/.typedialog/provisioning/platform/configs/orchestrator.solo.ncl - -# Evaluate config (runs validators) -nickel eval provisioning/.typedialog/provisioning/platform/configs/orchestrator.solo.ncl - -# Export to TOML (validates during export) -nickel export --format toml provisioning/.typedialog/provisioning/platform/configs/orchestrator.solo.ncl -``` - -## Adding a New Validator - -1. 
**Create validator function** in appropriate file: - - ```nickel - ValidMyValue = fun value => - if value < minimum then error "Too low" - else if value > maximum then error "Too high" - else value, - ``` - -2. **Add constraint** to constraints.toml if needed: - - ```toml - [service.feature.my_value] - min = 1 - max = 100 - ``` - -3. **Use in config**: - - ```nickel - my_value = validators.ValidMyValue 50, - ``` - -4. **Add form constraint** (if interactive): - - ```toml - [[elements]] - name = "my_value" - min = "${constraint.service.feature.my_value.min}" - max = "${constraint.service.feature.my_value.max}" - ``` - -5. **Test**: - - ```bash - nickel typecheck configs/service.mode.ncl - ``` - ---- - -**Version**: 1.0.0 -**Last Updated**: 2025-01-05 diff --git a/schemas/platform/validators/ai-service-validator.ncl b/schemas/platform/validators/ai-service-validator.ncl deleted file mode 100644 index 2786d80..0000000 --- a/schemas/platform/validators/ai-service-validator.ncl +++ /dev/null @@ -1,72 +0,0 @@ -# AI Service Validator - -let ai_service_schema = import "../schemas/ai-service.ncl" in -let constraints = import "../constraints/constraints.toml" in - -{ - validate_ai_service_config | ai_service_schema.AiServiceConfig -> Array String = fun config => - let errors = [] in - - # Server port validation - let errors = if config.server.port < 1024 || config.server.port > 65535 - then errors @ ["Server port must be between 1024 and 65535"] - else errors in - - # Server workers validation - let errors = if config.server.workers < constraints.ai_service.workers.min - then errors @ ["Workers below minimum (#{constraints.ai_service.workers.min})"] - else if config.server.workers > constraints.ai_service.workers.max - then errors @ ["Workers above maximum (#{constraints.ai_service.workers.max})"] - else errors in - - # RAG integration validation - let errors = if config.rag.enabled == true - then - let e = [] in - let e = if std.array.length config.rag.rag_service_url == 0 
- then e @ ["RAG service URL cannot be empty when enabled"] - else e in - let e = if config.rag.timeout < 1000 - then e @ ["RAG timeout must be at least 1000ms"] - else e in - errors @ e - else errors in - - # MCP integration validation - let errors = if config.mcp.enabled == true - then - let e = [] in - let e = if std.array.length config.mcp.mcp_service_url == 0 - then e @ ["MCP service URL cannot be empty when enabled"] - else e in - let e = if config.mcp.timeout < 1000 - then e @ ["MCP timeout must be at least 1000ms"] - else e in - errors @ e - else errors in - - # DAG configuration validation - let errors = if config.dag.max_concurrent_tasks < constraints.ai_service.max_concurrent_tasks.min - then errors @ ["Max concurrent tasks below minimum (#{constraints.ai_service.max_concurrent_tasks.min})"] - else if config.dag.max_concurrent_tasks > constraints.ai_service.max_concurrent_tasks.max - then errors @ ["Max concurrent tasks above maximum (#{constraints.ai_service.max_concurrent_tasks.max})"] - else errors in - - let errors = if config.dag.task_timeout < 1000 - then errors @ ["Task timeout must be at least 1000ms"] - else errors in - - let errors = if config.dag.retry_attempts < 0 || config.dag.retry_attempts > 20 - then errors @ ["Retry attempts must be between 0 and 20"] - else errors in - - # At least one integration should be enabled in production - let errors = - let rag_enabled = config.rag.enabled in - let mcp_enabled = config.mcp.enabled in - if !rag_enabled && !mcp_enabled - then errors @ ["At least one integration (RAG or MCP) should be enabled"] - else errors in - - errors, -} diff --git a/schemas/platform/validators/common-validator.ncl b/schemas/platform/validators/common-validator.ncl deleted file mode 100644 index d8600e1..0000000 --- a/schemas/platform/validators/common-validator.ncl +++ /dev/null @@ -1,112 +0,0 @@ -# Common Validators -# Reusable validation logic for ports, positive numbers, strings, ranges - -let constraints = import 
"../constraints/constraints.toml" in - -{ - # Validate port number within allowed range - ValidPort = fun port => - if port < constraints.common.server.port.min then - std.contract.blame_with_message - "Port must be >= %{std.to_string constraints.common.server.port.min}" - port - else if port > constraints.common.server.port.max then - std.contract.blame_with_message - "Port must be <= %{std.to_string constraints.common.server.port.max}" - port - else - port, - - # Validate positive number (> 0) - ValidPositiveNumber = fun n => - if n <= 0 then - std.contract.blame_with_message "Value must be positive (> 0)" n - else - n, - - # Validate non-negative number (>= 0) - ValidNonNegativeNumber = fun n => - if n < 0 then - std.contract.blame_with_message "Value must be non-negative (>= 0)" n - else - n, - - # Validate non-empty string - ValidNonEmptyString = fun s => - if s == "" then - std.contract.blame_with_message "String cannot be empty" s - else - s, - - # Validate string length - ValidStringLength = fun min_len max_len value => - let len = std.string.length value in - if len < min_len then - std.contract.blame_with_message - "String length must be >= %{std.to_string min_len}" - value - else if len > max_len then - std.contract.blame_with_message - "String length must be <= %{std.to_string max_len}" - value - else - value, - - # Validate generic range (min to max inclusive) - ValidRange = fun min_val max_val value => - if value < min_val then - std.contract.blame_with_message - "Value must be >= %{std.to_string min_val}, got %{std.to_string value}" - value - else if value > max_val then - std.contract.blame_with_message - "Value must be <= %{std.to_string max_val}, got %{std.to_string value}" - value - else - value, - - # Validate enum value in allowed set - ValidEnum = fun allowed_values value => - if std.array.elem value allowed_values then - value - else - std.contract.blame_with_message - "Value must be one of: %{std.to_string allowed_values}" - value, - - # 
Validate timeout duration in milliseconds - ValidTimeoutMs = fun timeout => - if timeout < 1000 then - std.contract.blame_with_message "Timeout must be >= 1000ms (1 second)" timeout - else if timeout > 86400000 then - std.contract.blame_with_message "Timeout must be <= 86400000ms (24 hours)" timeout - else - timeout, - - # Validate interval/period in seconds - ValidIntervalSeconds = fun interval => - if interval < 1 then - std.contract.blame_with_message "Interval must be >= 1 second" interval - else if interval > 3600 then - std.contract.blame_with_message "Interval must be <= 3600 seconds (1 hour)" interval - else - interval, - - # Validate percentage (0-100) - ValidPercentage = fun percentage => - if percentage < 0 then - std.contract.blame_with_message "Percentage must be >= 0" percentage - else if percentage > 100 then - std.contract.blame_with_message "Percentage must be <= 100" percentage - else - percentage, - - # Validate sample rate (0.0-1.0) - ValidSampleRate = fun rate => - if rate < 0.0 then - std.contract.blame_with_message "Sample rate must be >= 0.0" rate - else if rate > 1.0 then - std.contract.blame_with_message "Sample rate must be <= 1.0" rate - else - rate, -} diff --git a/schemas/platform/validators/control-center-validator.ncl b/schemas/platform/validators/control-center-validator.ncl deleted file mode 100644 index 6a6734d..0000000 --- a/schemas/platform/validators/control-center-validator.ncl +++ /dev/null @@ -1,139 +0,0 @@ -# Control Center Validators -# JWT, RBAC, policies, compliance, and security validation - -let constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in -let string_val = import "./string-validator.ncl" in - -{ - # Validate JWT token expiration in seconds - ValidJwtTokenExpiration = fun expiration => - common.ValidRange - constraints.control_center.jwt.token_expiration.min - constraints.control_center.jwt.token_expiration.max - expiration, - - # Validate JWT refresh token 
expiration - ValidJwtRefreshExpiration = fun expiration => - common.ValidRange - constraints.control_center.jwt.refresh_expiration.min - constraints.control_center.jwt.refresh_expiration.max - expiration, - - # Validate JWT issuer - ValidJwtIssuer = fun issuer => - string_val.ValidIdentifier issuer, - - # Validate JWT audience - ValidJwtAudience = fun audience => - string_val.ValidIdentifier audience, - - # Validate rate limiting max requests - ValidRateLimitMaxRequests = fun requests => - common.ValidRange - constraints.control_center.rate_limiting.max_requests.min - constraints.control_center.rate_limiting.max_requests.max - requests, - - # Validate rate limiting window in seconds - ValidRateLimitWindow = fun window => - common.ValidRange - constraints.control_center.rate_limiting.window_seconds.min - constraints.control_center.rate_limiting.window_seconds.max - window, - - # Validate session max duration in seconds - ValidSessionMaxDuration = fun duration => - common.ValidRange - constraints.control_center.session.max_duration.min - constraints.control_center.session.max_duration.max - duration, - - # Validate MFA max attempts - ValidMfaMaxAttempts = fun attempts => - common.ValidRange - constraints.control_center.mfa.max_attempts.min - constraints.control_center.mfa.max_attempts.max - attempts, - - # Validate audit log retention in days - ValidAuditRetentionDays = fun days => - common.ValidRange - constraints.control_center.audit.retention_days.min - constraints.control_center.audit.retention_days.max - days, - - # Validate role name - ValidRoleName = fun role => - string_val.ValidIdentifier role, - - # Validate policy name - ValidPolicyName = fun policy => - string_val.ValidIdentifier policy, - - # Validate RBAC configuration consistency - ValidRbacConfig = fun rbac_config => - if rbac_config.enabled && rbac_config.default_role == "" then - std.contract.blame_with_message - "RBAC enabled but default_role is empty" - rbac_config - else if !rbac_config.enabled 
&& rbac_config.hierarchy then - std.contract.blame_with_message - "RBAC hierarchy cannot be enabled when RBAC is disabled" - rbac_config - else - rbac_config, - - # Validate policy cache TTL - ValidPolicyCacheTtl = fun ttl => - if ttl < 1 then - std.contract.blame_with_message "Policy cache TTL must be >= 1 second" ttl - else if ttl > 86400 then - std.contract.blame_with_message "Policy cache TTL must be <= 86400 seconds" ttl - else - ttl, - - # Validate max policies in cache - ValidMaxPoliciesInCache = fun count => - if count < 10 then - std.contract.blame_with_message "Max policies must be >= 10" count - else if count > 100000 then - std.contract.blame_with_message "Max policies must be <= 100000" count - else - count, - - # Validate max policy versions - ValidMaxPolicyVersions = fun count => - if count < 1 then - std.contract.blame_with_message "Max policy versions must be >= 1" count - else if count > 100 then - std.contract.blame_with_message "Max policy versions must be <= 100" count - else - count, - - # Validate data retention years for compliance - ValidDataRetentionYears = fun years => - if years < 1 then - std.contract.blame_with_message "Data retention must be >= 1 year" years - else if years > 10 then - std.contract.blame_with_message "Data retention must be <= 10 years" years - else - years, - - # Validate LDAP configuration completeness - ValidLdapConfig = fun ldap_config => - if ldap_config.enabled then - if ldap_config.server_url == null || ldap_config.server_url == "" then - std.contract.blame_with_message - "LDAP enabled but server_url is empty" - ldap_config - else - ldap_config - else - ldap_config, - - # Validate compliance framework selection - ValidComplianceFramework = fun framework => - let valid_frameworks = ['soc2, 'hipaa, 'pci_dss, 'gdpr] in - common.ValidEnum valid_frameworks framework, -} diff --git a/schemas/platform/validators/deployment-validator.ncl b/schemas/platform/validators/deployment-validator.ncl deleted file mode 100644 
index a184e47..0000000 --- a/schemas/platform/validators/deployment-validator.ncl +++ /dev/null @@ -1,145 +0,0 @@ -# Deployment Validators -# Deployment mode, resource allocation, and HA configuration validation - -let constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in -let resource_val = import "./resource-validator.ncl" in - -{ - # Validate deployment mode - ValidDeploymentMode = fun mode => - let valid_modes = ['solo, 'multiuser, 'cicd, 'enterprise] in - common.ValidEnum valid_modes mode, - - # Validate cloud provider - ValidCloudProvider = fun provider => - let valid_providers = ['aws, 'gcp, 'azure, 'digitalocean, 'linode, 'custom] in - common.ValidEnum valid_providers provider, - - # Validate deployment target type - ValidDeploymentTarget = fun target => - let valid_targets = ['local, 'docker, 'kubernetes, 'vm, 'bare_metal] in - common.ValidEnum valid_targets target, - - # Validate resource consistency for solo mode - ValidSoloModeResources = fun resources => - let cpu_valid = resource_val.ValidCpuCores 'solo resources.cpu_cores in - let mem_valid = resource_val.ValidMemoryMb 'solo resources.memory_mb in - let disk_valid = resource_val.ValidDiskGb 'solo resources.disk_gb in - if cpu_valid && mem_valid && disk_valid then - resources - else - std.contract.blame_with_message "Solo mode resource constraints violated" resources, - - # Validate resource consistency for multiuser mode - ValidMultiUserModeResources = fun resources => - let cpu_valid = resource_val.ValidCpuCores 'multiuser resources.cpu_cores in - let mem_valid = resource_val.ValidMemoryMb 'multiuser resources.memory_mb in - let disk_valid = resource_val.ValidDiskGb 'multiuser resources.disk_gb in - if cpu_valid && mem_valid && disk_valid then - resources - else - std.contract.blame_with_message "MultiUser mode resource constraints violated" resources, - - # Validate resource consistency for cicd mode - ValidCicdModeResources = fun resources => - 
let cpu_valid = resource_val.ValidCpuCores 'cicd resources.cpu_cores in - let mem_valid = resource_val.ValidMemoryMb 'cicd resources.memory_mb in - let disk_valid = resource_val.ValidDiskGb 'cicd resources.disk_gb in - if cpu_valid && mem_valid && disk_valid then - resources - else - std.contract.blame_with_message "CI/CD mode resource constraints violated" resources, - - # Validate resource consistency for enterprise mode - ValidEnterpriseModeResources = fun resources => - let cpu_valid = resource_val.ValidCpuCores 'enterprise resources.cpu_cores in - let mem_valid = resource_val.ValidMemoryMb 'enterprise resources.memory_mb in - let disk_valid = resource_val.ValidDiskGb 'enterprise resources.disk_gb in - if cpu_valid && mem_valid && disk_valid then - resources - else - std.contract.blame_with_message "Enterprise mode resource constraints violated" resources, - - # Validate HA replica count for enterprise - ValidHaReplicaCount = fun replicas => - let mode_constraints = constraints.deployment.enterprise in - if replicas < mode_constraints.replicas.min then - std.contract.blame_with_message - "HA replicas must be >= %{std.to_string mode_constraints.replicas.min} (minimum for quorum)" - replicas - else if replicas > mode_constraints.replicas.max then - std.contract.blame_with_message - "HA replicas must be <= %{std.to_string mode_constraints.replicas.max}" - replicas - else - replicas, - - # Validate installation method - ValidInstallationMethod = fun method => - let valid_methods = ['docker_compose, 'kubernetes, 'shell_script, 'terraform] in - common.ValidEnum valid_methods method, - - # Validate deployment strategy - ValidDeploymentStrategy = fun strategy => - let valid_strategies = ['rolling, 'blue_green, 'canary] in - common.ValidEnum valid_strategies strategy, - - # Validate upgrade strategy - ValidUpgradeStrategy = fun strategy => - let valid_strategies = ['rolling, 'blue_green, 'canary] in - common.ValidEnum valid_strategies strategy, - - # Validate 
installation timeout in minutes - ValidInstallationTimeout = fun timeout => - if timeout < 5 then - std.contract.blame_with_message "Installation timeout must be >= 5 minutes" timeout - else if timeout > 600 then - std.contract.blame_with_message "Installation timeout must be <= 600 minutes (10 hours)" timeout - else - timeout, - - # Validate parallel services count - ValidParallelServices = fun count => - if count < 1 then - std.contract.blame_with_message "Parallel services must be >= 1" count - else if count > 10 then - std.contract.blame_with_message "Parallel services must be <= 10" count - else - count, - - # Validate backup retention days - ValidBackupRetentionDays = fun days => - if days < 1 then - std.contract.blame_with_message "Backup retention must be >= 1 day" days - else if days > 3650 then - std.contract.blame_with_message "Backup retention must be <= 3650 days (10 years)" days - else - days, - - # Validate database backend for mode - ValidDatabaseBackendForMode = fun mode backend => - if mode == 'solo && (backend == 'postgres || backend == 'surrealdb_server) then - std.contract.blame_with_message - "Solo mode cannot use server-based databases (postgres, surrealdb_server)" - backend - else if mode == 'enterprise && backend == 'filesystem then - std.contract.blame_with_message - "Enterprise mode must use robust database (not filesystem)" - backend - else - backend, - - # Validate HA configuration consistency - ValidHaConfiguration = fun ha_config => - if ha_config.enabled && ha_config.replicas < 3 then - std.contract.blame_with_message - "HA configuration requires at least 3 replicas, got %{std.to_string ha_config.replicas}" - ha_config - else if !ha_config.enabled && ha_config.replicas > 1 then - std.contract.blame_with_message - "HA is disabled but replicas > 1" - ha_config - else - ha_config, -} diff --git a/schemas/platform/validators/extension-registry-validator.ncl b/schemas/platform/validators/extension-registry-validator.ncl deleted file mode 
100644 index feab012..0000000 --- a/schemas/platform/validators/extension-registry-validator.ncl +++ /dev/null @@ -1,116 +0,0 @@ -# Extension Registry Validator -# Multi-instance configuration validator - -let registry_schema = import "../schemas/extension-registry.ncl" in -let constraints = import "../constraints/constraints.toml" in - -{ - validate_registry_config | registry_schema.RegistryConfig -> Array String = fun config => - let errors = [] in - - # Server port validation - let errors = if config.server.port < 1024 || config.server.port > 65535 - then errors @ ["Server port must be between 1024 and 65535"] - else errors in - - # Workers validation - let errors = if config.server.workers < constraints.registry.workers.min - then errors @ ["Workers below minimum (#{constraints.registry.workers.min})"] - else if config.server.workers > constraints.registry.workers.max - then errors @ ["Workers above maximum (#{constraints.registry.workers.max})"] - else errors in - - # Cache capacity validation - let errors = if config.cache.capacity < constraints.registry.cache_capacity.min - then errors @ ["Cache capacity below minimum (#{constraints.registry.cache_capacity.min})"] - else if config.cache.capacity > constraints.registry.cache_capacity.max - then errors @ ["Cache capacity above maximum (#{constraints.registry.cache_capacity.max})"] - else errors in - - # Cache TTL validation - let errors = if config.cache.ttl_seconds < constraints.registry.cache_ttl.min - then errors @ ["Cache TTL below minimum (#{constraints.registry.cache_ttl.min})"] - else if config.cache.ttl_seconds > constraints.registry.cache_ttl.max - then errors @ ["Cache TTL above maximum (#{constraints.registry.cache_ttl.max})"] - else errors in - - # Validate multi-instance Gitea configurations - let validate_gitea_instances = fun instances => - std.array.fold (fun acc inst => - let inst_errors = [] in - let inst_errors = if inst.url == "" - then inst_errors @ ["Gitea URL cannot be empty"] - else 
inst_errors in - let inst_errors = if inst.organization == "" - then inst_errors @ ["Gitea organization cannot be empty"] - else inst_errors in - let inst_errors = if inst.token_path == "" - then inst_errors @ ["Gitea token_path cannot be empty"] - else inst_errors in - acc @ inst_errors - ) [] instances in - - let errors = errors @ validate_gitea_instances config.sources.gitea in - - # Validate multi-instance Forgejo configurations - let validate_forgejo_instances = fun instances => - std.array.fold (fun acc inst => - let inst_errors = [] in - let inst_errors = if inst.url == "" - then inst_errors @ ["Forgejo URL cannot be empty"] - else inst_errors in - let inst_errors = if inst.organization == "" - then inst_errors @ ["Forgejo organization cannot be empty"] - else inst_errors in - let inst_errors = if inst.token_path == "" - then inst_errors @ ["Forgejo token_path cannot be empty"] - else inst_errors in - acc @ inst_errors - ) [] instances in - - let errors = errors @ validate_forgejo_instances config.sources.forgejo in - - # Validate multi-instance GitHub configurations - let validate_github_instances = fun instances => - std.array.fold (fun acc inst => - let inst_errors = [] in - let inst_errors = if inst.organization == "" - then inst_errors @ ["GitHub organization cannot be empty"] - else inst_errors in - let inst_errors = if inst.token_path == "" - then inst_errors @ ["GitHub token_path cannot be empty"] - else inst_errors in - acc @ inst_errors - ) [] instances in - - let errors = errors @ validate_github_instances config.sources.github in - - # Validate multi-instance OCI configurations - let validate_oci_instances = fun instances => - std.array.fold (fun acc inst => - let inst_errors = [] in - let inst_errors = if inst.registry == "" - then inst_errors @ ["OCI registry cannot be empty"] - else inst_errors in - let inst_errors = if inst.namespace == "" - then inst_errors @ ["OCI namespace cannot be empty"] - else inst_errors in - acc @ inst_errors - ) [] 
instances in - - let errors = errors @ validate_oci_instances config.distributions.oci in - - # At least one backend must be configured - let has_sources = std.array.length config.sources.gitea > 0 - || std.array.length config.sources.forgejo > 0 - || std.array.length config.sources.github > 0 in - let has_distributions = std.array.length config.distributions.oci > 0 in - let has_legacy_gitea = config.gitea != null in - let has_legacy_oci = config.oci != null in - - let errors = if !has_sources && !has_distributions && !has_legacy_gitea && !has_legacy_oci - then errors @ ["At least one backend must be configured (sources or distributions)"] - else errors in - - errors, -} diff --git a/schemas/platform/validators/mcp-server-validator.ncl b/schemas/platform/validators/mcp-server-validator.ncl deleted file mode 100644 index 6a5d530..0000000 --- a/schemas/platform/validators/mcp-server-validator.ncl +++ /dev/null @@ -1,126 +0,0 @@ -# MCP Server Validators -# Tools, prompts, resources, capabilities, and sampling validation - -let constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in -let string_val = import "./string-validator.ncl" in - -{ - # Validate max concurrent tool executions - ValidMaxConcurrentTools = fun count => - common.ValidRange - constraints.mcp_server.tools.max_concurrent.min - constraints.mcp_server.tools.max_concurrent.max - count, - - # Validate tool execution timeout in milliseconds - ValidToolTimeout = fun timeout => - common.ValidRange - constraints.mcp_server.tools.timeout.min - constraints.mcp_server.tools.timeout.max - timeout, - - # Validate max resource size in bytes - ValidMaxResourceSize = fun size => - common.ValidRange - constraints.mcp_server.resources.max_size.min - constraints.mcp_server.resources.max_size.max - size, - - # Validate resource cache TTL in seconds - ValidResourceCacheTtl = fun ttl => - common.ValidRange - constraints.mcp_server.resources.cache_ttl.min - 
constraints.mcp_server.resources.cache_ttl.max - ttl, - - # Validate max custom prompt templates - ValidMaxPromptTemplates = fun count => - common.ValidRange - constraints.mcp_server.prompts.max_templates.min - constraints.mcp_server.prompts.max_templates.max - count, - - # Validate max tokens for sampling - ValidMaxSamplingTokens = fun tokens => - common.ValidRange - constraints.mcp_server.sampling.max_tokens.min - constraints.mcp_server.sampling.max_tokens.max - tokens, - - # Validate temperature value (0.0-2.0 typical range) - ValidTemperature = fun temperature => - if temperature < 0.0 then - std.contract.blame_with_message "Temperature must be >= 0.0" temperature - else if temperature > 2.0 then - std.contract.blame_with_message "Temperature must be <= 2.0" temperature - else - temperature, - - # Validate tool name - ValidToolName = fun name => - string_val.ValidIdentifier name, - - # Validate prompt template name - ValidPromptTemplateName = fun name => - string_val.ValidIdentifier name, - - # Validate resource type - ValidResourceType = fun resource_type => - string_val.ValidCategoryName resource_type, - - # Validate tool category - ValidToolCategory = fun category => - string_val.ValidCategoryName category, - - # Validate protocol version - ValidProtocolVersion = fun version => - if version == "" then - std.contract.blame_with_message "Protocol version cannot be empty" version - else if !std.string.matches "^[0-9]+(\\.[0-9]+)*$" version then - std.contract.blame_with_message - "Protocol version must be semantic version (e.g., 1.0, 2.1.3)" - version - else - version, - - # Validate tool definition completeness - ValidToolDefinition = fun tool => - if tool.name == null || tool.name == "" then - std.contract.blame_with_message "Tool must have a non-empty name" tool - else if tool.description == null || tool.description == "" then - std.contract.blame_with_message "Tool must have a non-empty description" tool - else - tool, - - # Validate prompt template 
structure - ValidPromptTemplate = fun prompt => - if prompt.name == null || prompt.name == "" then - std.contract.blame_with_message "Prompt must have a non-empty name" prompt - else - prompt, - - # Validate resource path - ValidResourcePath = fun path => - string_val.ValidFilesystemPath path, - - # Validate MCP capabilities configuration - ValidCapabilitiesConfig = fun capabilities => - if !capabilities.tools.enabled && - !capabilities.prompts.enabled && - !capabilities.resources.enabled then - std.contract.blame_with_message - "At least one capability (tools, prompts, or resources) must be enabled" - capabilities - else - capabilities, - - # Validate template engine - ValidTemplateEngine = fun engine => - let valid_engines = ['jinja2, 'tera, 'handlebars] in - common.ValidEnum valid_engines engine, - - # Validate sampling model identifier - ValidSamplingModel = fun model => - string_val.ValidIdentifier model, -} diff --git a/schemas/platform/validators/orchestrator-validator.ncl b/schemas/platform/validators/orchestrator-validator.ncl deleted file mode 100644 index 9bb20c1..0000000 --- a/schemas/platform/validators/orchestrator-validator.ncl +++ /dev/null @@ -1,122 +0,0 @@ -# Orchestrator Validators -# Queue, batch, workflow, and orchestration-specific validation logic - -let constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in - -{ - # Validate worker count within constraints - ValidWorkers = fun workers => - common.ValidRange - constraints.orchestrator.workers.min - constraints.orchestrator.workers.max - workers, - - # Validate max concurrent tasks in queue - ValidConcurrentTasks = fun tasks => - common.ValidRange - constraints.orchestrator.queue.concurrent_tasks.min - constraints.orchestrator.queue.concurrent_tasks.max - tasks, - - # Validate retry attempts - ValidRetryAttempts = fun attempts => - common.ValidRange - constraints.orchestrator.queue.retry_attempts.min - 
constraints.orchestrator.queue.retry_attempts.max - attempts, - - # Validate retry delay in milliseconds - ValidRetryDelay = fun delay => - common.ValidRange - constraints.orchestrator.queue.retry_delay.min - constraints.orchestrator.queue.retry_delay.max - delay, - - # Validate task timeout in milliseconds - ValidTaskTimeout = fun timeout => - common.ValidRange - constraints.orchestrator.queue.task_timeout.min - constraints.orchestrator.queue.task_timeout.max - timeout, - - # Validate batch parallel limit - ValidParallelLimit = fun limit => - common.ValidRange - constraints.orchestrator.batch.parallel_limit.min - constraints.orchestrator.batch.parallel_limit.max - limit, - - # Validate batch operation timeout in milliseconds - ValidBatchOperationTimeout = fun timeout => - common.ValidRange - constraints.orchestrator.batch.operation_timeout.min - constraints.orchestrator.batch.operation_timeout.max - timeout, - - # Validate checkpoint interval (task count) - ValidCheckpointInterval = fun interval => - if interval < 1 then - std.contract.blame_with_message "Checkpoint interval must be >= 1 task" interval - else if interval > 10000 then - std.contract.blame_with_message "Checkpoint interval must be <= 10000 tasks" interval - else - interval, - - # Validate max checkpoints to retain - ValidMaxCheckpoints = fun count => - if count < 1 then - std.contract.blame_with_message "Max checkpoints must be >= 1" count - else if count > 100 then - std.contract.blame_with_message "Max checkpoints must be <= 100" count - else - count, - - # Validate rollback max depth - ValidRollbackDepth = fun depth => - if depth < 1 then - std.contract.blame_with_message "Rollback depth must be >= 1" depth - else if depth > 100 then - std.contract.blame_with_message "Rollback depth must be <= 100" depth - else - depth, - - # Validate max concurrent extensions - ValidMaxConcurrentExtensions = fun count => - common.ValidRange - constraints.orchestrator.extensions.max_concurrent.min - 
constraints.orchestrator.extensions.max_concurrent.max - count, - - # Validate extension discovery interval - ValidExtensionDiscoveryInterval = fun interval => - if interval < 10 then - std.contract.blame_with_message "Extension discovery interval must be >= 10 seconds" interval - else if interval > 3600 then - std.contract.blame_with_message "Extension discovery interval must be <= 3600 seconds" interval - else - interval, - - # Validate dead letter queue max size - ValidDlqMaxSize = fun size => - if size < 10 then - std.contract.blame_with_message "DLQ max size must be >= 10" size - else if size > 100000 then - std.contract.blame_with_message "DLQ max size must be <= 100000" size - else - size, - - # Validate workflow is structurally sound - ValidWorkflowStructure = fun workflow => - if workflow == null then - std.contract.blame_with_message "Workflow cannot be null" workflow - else - workflow, - - # Validate task definition is complete - ValidTaskDefinition = fun task => - if task.name == null || task.name == "" then - std.contract.blame_with_message "Task must have a non-empty name" task - else - task, -} diff --git a/schemas/platform/validators/provisioning-daemon-validator.ncl b/schemas/platform/validators/provisioning-daemon-validator.ncl deleted file mode 100644 index 300e6fb..0000000 --- a/schemas/platform/validators/provisioning-daemon-validator.ncl +++ /dev/null @@ -1,97 +0,0 @@ -# Provisioning Daemon Validator - -let daemon_schema = import "../schemas/provisioning-daemon.ncl" in -let constraints = import "../constraints/constraints.toml" in - -{ - validate_daemon_config | daemon_schema.DaemonConfig -> Array String = fun config => - let errors = [] in - - # Daemon configuration validation - let errors = if config.daemon.poll_interval < constraints.daemon.poll_interval.min - then errors @ ["Poll interval below minimum (#{constraints.daemon.poll_interval.min})s"] - else if config.daemon.poll_interval > constraints.daemon.poll_interval.max - then errors @ 
["Poll interval above maximum (#{constraints.daemon.poll_interval.max})s"] - else errors in - - let errors = if config.daemon.max_workers < constraints.daemon.max_workers.min - then errors @ ["Max workers below minimum (#{constraints.daemon.max_workers.min})"] - else if config.daemon.max_workers > constraints.daemon.max_workers.max - then errors @ ["Max workers above maximum (#{constraints.daemon.max_workers.max})"] - else errors in - - let errors = if config.daemon.startup_delay < 0 || config.daemon.startup_delay > 300 - then errors @ ["Startup delay must be between 0 and 300 seconds"] - else errors in - - let errors = if config.daemon.graceful_shutdown_timeout < 5 || config.daemon.graceful_shutdown_timeout > 300 - then errors @ ["Graceful shutdown timeout must be between 5 and 300 seconds"] - else errors in - - # Logging configuration validation - let errors = if std.array.length config.logging.file == 0 - then errors @ ["Log file path cannot be empty"] - else errors in - - let errors = if config.logging.max_size < 1 || config.logging.max_size > 1000 - then errors @ ["Max log size must be between 1 and 1000 MB"] - else errors in - - let errors = if config.logging.retention_days < 1 || config.logging.retention_days > 365 - then errors @ ["Log retention days must be between 1 and 365"] - else errors in - - # Actions configuration validation - let errors = if config.actions.cleanup_interval < 1 || config.actions.cleanup_interval > 168 - then errors @ ["Cleanup interval must be between 1 and 168 hours"] - else errors in - - let errors = if config.actions.sync_interval < 5 || config.actions.sync_interval > 1440 - then errors @ ["Sync interval must be between 5 and 1440 minutes"] - else errors in - - # Workers configuration validation - let errors = if config.workers.thread_pool_size < 1 || config.workers.thread_pool_size > 32 - then errors @ ["Thread pool size must be between 1 and 32"] - else errors in - - let errors = if config.workers.max_task_queue_depth < 10 || 
config.workers.max_task_queue_depth > 100000 - then errors @ ["Max task queue depth must be between 10 and 100000"] - else errors in - - let errors = if config.workers.task_timeout < 10 || config.workers.task_timeout > 3600 - then errors @ ["Task timeout must be between 10 and 3600 seconds"] - else errors in - - let errors = if config.workers.worker_idle_timeout < 10 || config.workers.worker_idle_timeout > 600 - then errors @ ["Worker idle timeout must be between 10 and 600 seconds"] - else errors in - - # Health configuration validation - let errors = if config.health.health_check_interval < 10 || config.health.health_check_interval > 600 - then errors @ ["Health check interval must be between 10 and 600 seconds"] - else errors in - - let errors = if config.health.liveness_timeout < 30 || config.health.liveness_timeout > 600 - then errors @ ["Liveness timeout must be between 30 and 600 seconds"] - else errors in - - let errors = if config.health.readiness_timeout < 5 || config.health.readiness_timeout > 300 - then errors @ ["Readiness timeout must be between 5 and 300 seconds"] - else errors in - - let errors = if config.health.metrics_port < 1024 || config.health.metrics_port > 65535 - then errors @ ["Metrics port must be between 1024 and 65535"] - else errors in - - let errors = if config.health.alert_threshold < 10 || config.health.alert_threshold > 100 - then errors @ ["Alert threshold must be between 10 and 100 percent"] - else errors in - - # Ensure timeout consistency - let errors = if config.health.liveness_timeout <= config.health.readiness_timeout - then errors @ ["Liveness timeout must be greater than readiness timeout"] - else errors in - - errors, -} diff --git a/schemas/platform/validators/rag-validator.ncl b/schemas/platform/validators/rag-validator.ncl deleted file mode 100644 index 83f94c0..0000000 --- a/schemas/platform/validators/rag-validator.ncl +++ /dev/null @@ -1,102 +0,0 @@ -# RAG System Validator - -let rag_schema = import 
"../schemas/rag.ncl" in -let constraints = import "../constraints/constraints.toml" in - -{ - validate_rag_config | rag_schema.RagConfig -> Array String = fun config => - let errors = [] in - - # If RAG is disabled, skip subsystem validation - let errors = if config.rag.enabled != true - then [] # No validation needed for disabled RAG - else - let errors = [] in - - # Embeddings validation - let errors = if config.embeddings != null - then - let e = [] in - let e = if std.array.length config.embeddings.model == 0 - then e @ ["Embeddings model cannot be empty"] - else e in - let e = if config.embeddings.dimension < 1 - then e @ ["Embeddings dimension must be positive"] - else e in - let e = if config.embeddings.provider == "openai" || config.embeddings.provider == "anthropic" - then if std.is_null config.embeddings.api_key - then e @ ["API key required for #{config.embeddings.provider} embeddings"] - else e - else e in - errors @ e - else errors in - - # Vector DB validation - let errors = if config.vector_db != null - then - let e = [] in - let e = if config.vector_db.db_type == "surrealdb" || config.vector_db.db_type == "qdrant" || config.vector_db.db_type == "milvus" - then if std.is_null config.vector_db.url || config.vector_db.url == "" - then e @ ["Database URL required for #{config.vector_db.db_type}"] - else e - else e in - let e = if std.array.length config.vector_db.namespace == 0 - then e @ ["Namespace cannot be empty"] - else e in - errors @ e - else errors in - - # LLM validation - let errors = if config.llm != null - then - let e = [] in - let e = if std.array.length config.llm.model == 0 - then e @ ["LLM model cannot be empty"] - else e in - let e = if config.llm.provider == "anthropic" || config.llm.provider == "openai" - then if std.is_null config.llm.api_key - then e @ ["API key required for #{config.llm.provider} LLM"] - else e - else e in - let e = if config.llm.temperature < 0.0 || config.llm.temperature > 1.0 - then e @ ["Temperature must be 
between 0.0 and 1.0"] - else e in - let e = if config.llm.max_tokens < 1 - then e @ ["Max tokens must be positive"] - else e in - errors @ e - else errors in - - # Retrieval validation - let errors = if config.retrieval != null - then - let e = [] in - let e = if config.retrieval.top_k < 1 - then e @ ["Top K must be at least 1"] - else e in - let e = if config.retrieval.similarity_threshold < 0.0 || config.retrieval.similarity_threshold > 1.0 - then e @ ["Similarity threshold must be between 0.0 and 1.0"] - else e in - errors @ e - else errors in - - # Ingestion validation - let errors = if config.ingestion != null - then - let e = [] in - let e = if config.ingestion.chunk_size < 1 - then e @ ["Chunk size must be positive"] - else e in - let e = if config.ingestion.overlap >= config.ingestion.chunk_size - then e @ ["Overlap must be less than chunk size"] - else e in - let e = if std.array.length config.ingestion.doc_types == 0 - then e @ ["At least one document type must be specified"] - else e in - errors @ e - else errors in - - errors in - - errors, -} diff --git a/schemas/platform/validators/resource-validator.ncl b/schemas/platform/validators/resource-validator.ncl deleted file mode 100644 index 4d5dbb9..0000000 --- a/schemas/platform/validators/resource-validator.ncl +++ /dev/null @@ -1,42 +0,0 @@ -# Resource Validators -# CPU, memory, and disk allocation validation for deployment modes - -let constraints = import "../constraints/constraints.toml" in -let common = import "./common-validator.ncl" in - -{ - # Validate CPU cores for deployment mode - ValidCpuCores = fun mode cpu_cores => - let min_cpu = constraints.deployment.solo.cpu.min in - let max_cpu = constraints.deployment.enterprise.cpu.max in - common.ValidRange min_cpu max_cpu cpu_cores, - - # Validate memory allocation in MB for deployment mode - ValidMemoryMb = fun mode memory_mb => - let min_mem = constraints.deployment.solo.memory_mb.min in - let max_mem = 
constraints.deployment.enterprise.memory_mb.max in - common.ValidRange min_mem max_mem memory_mb, - - # Validate disk allocation in GB for deployment mode - ValidDiskGb = fun mode disk_gb => - let min_disk = constraints.deployment.solo.disk_gb.min in - let max_disk = constraints.deployment.enterprise.disk_gb.max in - common.ValidRange min_disk max_disk disk_gb, - - # Validate file size in bytes - ValidFileSize = fun min_bytes max_bytes size => - common.ValidRange min_bytes max_bytes size, - - # Validate memory size in MB - ValidMemorySizeMb = fun min_mb max_mb size_mb => - common.ValidRange min_mb max_mb size_mb, - - # Validate pool size (number of connections/threads) - ValidPoolSize = fun pool_size => - if pool_size < 1 then - std.contract.blame_with_message "Pool size must be >= 1" pool_size - else if pool_size > 1000 then - std.contract.blame_with_message "Pool size must be <= 1000" pool_size - else - pool_size, -} diff --git a/schemas/platform/validators/string-validator.ncl b/schemas/platform/validators/string-validator.ncl deleted file mode 100644 index 76b7b8e..0000000 --- a/schemas/platform/validators/string-validator.ncl +++ /dev/null @@ -1,131 +0,0 @@ -# String Validators -# Workspace names, identifiers, and other string validations - -{ - # Validate workspace name (alphanumeric, hyphen, underscore, max 64 chars) - ValidWorkspaceName = fun name => - if name == "" then - std.contract.blame_with_message "Workspace name cannot be empty" name - else if std.string.length name > 64 then - std.contract.blame_with_message "Workspace name must be <= 64 characters" name - else if !std.string.matches "^[a-z0-9_-]+$" name then - std.contract.blame_with_message - "Workspace name must contain only lowercase alphanumeric, hyphen, or underscore" - name - else - name, - - # Validate service name - ValidServiceName = fun name => - if name == "" then - std.contract.blame_with_message "Service name cannot be empty" name - else if std.string.length name > 64 then - 
std.contract.blame_with_message "Service name must be <= 64 characters" name - else if !std.string.matches "^[a-z0-9_-]+$" name then - std.contract.blame_with_message - "Service name must contain only lowercase alphanumeric, hyphen, or underscore" - name - else - name, - - # Validate filesystem path - ValidFilesystemPath = fun path => - if path == "" then - std.contract.blame_with_message "Path cannot be empty" path - else if std.string.length path > 255 then - std.contract.blame_with_message "Path must be <= 255 characters" path - else if std.string.starts_with path "/" then - path - else - std.contract.blame_with_message "Path must be absolute (start with /)" path, - - # Validate relative path - ValidRelativePath = fun path => - if path == "" then - std.contract.blame_with_message "Path cannot be empty" path - else if std.string.length path > 255 then - std.contract.blame_with_message "Path must be <= 255 characters" path - else - path, - - # Validate hostname/domain - ValidHostname = fun hostname => - if hostname == "" then - std.contract.blame_with_message "Hostname cannot be empty" hostname - else if std.string.length hostname > 253 then - std.contract.blame_with_message "Hostname must be <= 253 characters" hostname - else if !std.string.matches "^[a-zA-Z0-9.-]+$" hostname then - std.contract.blame_with_message - "Hostname must contain only alphanumeric, dot, or hyphen" - hostname - else - hostname, - - # Validate IP address (basic check) - ValidIpAddress = fun ip => - if ip == "" then - std.contract.blame_with_message "IP address cannot be empty" ip - else if !std.string.matches "^[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}$|^[0-9a-fA-F:]+$" ip then - std.contract.blame_with_message - "IP address format invalid (IPv4 or IPv6)" - ip - else - ip, - - # Validate URL - ValidUrl = fun url => - if url == "" then - std.contract.blame_with_message "URL cannot be empty" url - else if !std.string.matches "^https?://" url then - std.contract.blame_with_message - 
"URL must start with http:// or https://" - url - else - url, - - # Validate password minimum length - ValidPassword = fun password => - if std.string.length password < 8 then - std.contract.blame_with_message - "Password must be >= 8 characters" - password - else if std.string.length password > 128 then - std.contract.blame_with_message - "Password must be <= 128 characters" - password - else - password, - - # Validate JWT token format - ValidJwtToken = fun token => - if token == "" then - std.contract.blame_with_message "JWT token cannot be empty" token - else if !std.string.matches "^[a-zA-Z0-9_-]+\\.[a-zA-Z0-9_-]+\\.[a-zA-Z0-9_-]+$" token then - std.contract.blame_with_message - "JWT token format invalid (must be three base64url parts separated by dots)" - token - else - token, - - # Validate identifier (alphanumeric, underscore, hyphen) - ValidIdentifier = fun identifier => - if identifier == "" then - std.contract.blame_with_message "Identifier cannot be empty" identifier - else if !std.string.matches "^[a-z0-9_-]+$" identifier then - std.contract.blame_with_message - "Identifier must contain only lowercase alphanumeric, underscore, or hyphen" - identifier - else - identifier, - - # Validate category/tag name - ValidCategoryName = fun category => - if category == "" then - std.contract.blame_with_message "Category cannot be empty" category - else if !std.string.matches "^[a-z0-9_]+$" category then - std.contract.blame_with_message - "Category must contain only lowercase alphanumeric or underscore" - category - else - category, -} diff --git a/schemas/platform/validators/vault-service-validator.ncl b/schemas/platform/validators/vault-service-validator.ncl deleted file mode 100644 index a8e6543..0000000 --- a/schemas/platform/validators/vault-service-validator.ncl +++ /dev/null @@ -1,31 +0,0 @@ -# Vault Service Validator - -let vault_schema = import "../schemas/vault-service.ncl" in -let constraints = import "../constraints/constraints.toml" in - -{ - 
validate_vault_config | vault_schema.VaultServiceConfig -> Array String = fun config => - let errors = [] in - let errors = if config.server.port < constraints.vault_service.port.min - then errors @ ["Server port below minimum (#{constraints.vault_service.port.min})"] - else if config.server.port > constraints.vault_service.port.max - then errors @ ["Server port above maximum (#{constraints.vault_service.port.max})"] - else errors in - let errors = if std.array.length config.vault.mount_point == 0 - then errors @ ["Mount point cannot be empty"] - else errors in - let errors = if config.vault.key_name |> std.array.length < 1 - then errors @ ["Key name is required"] - else errors in - let errors = if config.vault.storage_backend == "surrealdb" - then if std.is_null (std.string.contains ":" config.vault.server_url) - then errors @ ["SurrealDB mode requires valid server URL"] - else errors - else errors in - let errors = if config.vault.tls_verify == true - then if std.is_null config.vault.tls_ca_cert - then errors @ ["TLS verification enabled but CA cert not provided"] - else errors - else errors in - errors, -} diff --git a/schemas/platform/values/.gitignore b/schemas/platform/values/.gitignore new file mode 100644 index 0000000..6841cbb --- /dev/null +++ b/schemas/platform/values/.gitignore @@ -0,0 +1,7 @@ +# User configuration values (private/deployment-specific) +*.ncl +*.toml + +# Backup files +*.bak +*.backup diff --git a/schemas/platform/values/README.md b/schemas/platform/values/README.md deleted file mode 100644 index 2d66c52..0000000 --- a/schemas/platform/values/README.md +++ /dev/null @@ -1,311 +0,0 @@ -# Values - -User configuration files for provisioning platform services (gitignored). 
- -## Purpose - -The values directory stores: -- **User configurations** - Service-specific settings for each deployment mode -- **Generated Nickel configs** - Output from TypeDialog configuration wizard -- **Customizations** - User-specific overrides to defaults -- **Runtime data** - Persisted configuration state - -## File Organization - -```bash -values/ -├── .gitignore # Ignore *.ncl user configs -├── README.md # This file -├── orchestrator.solo.ncl # User config (gitignored) -├── orchestrator.multiuser.ncl -├── orchestrator.cicd.ncl -├── orchestrator.enterprise.ncl -├── control-center.solo.ncl -├── control-center.multiuser.ncl -├── control-center.cicd.ncl -├── control-center.enterprise.ncl -├── mcp-server.solo.ncl -├── mcp-server.multiuser.ncl -├── mcp-server.cicd.ncl -├── mcp-server.enterprise.ncl -├── installer.solo.ncl -├── installer.multiuser.ncl -├── installer.cicd.ncl -├── installer.enterprise.ncl -└── orchestrator.example.ncl # Example template (tracked) -``` - -## Configuration Files - -Each config file (`{service}.{mode}.ncl`) is: -- **Generated by TypeDialog** - Via `configure.nu` wizard -- **User-specific** - Contains customizations for that environment -- **Gitignored** - NOT tracked in version control -- **Runtime data** - Created/updated by scripts and forms - -Example: - -```bash -# values/orchestrator.solo.ncl (auto-generated, user-editable) -{ - orchestrator = { - workspace = { - name = "my-workspace", - path = "/home/user/workspace", - enabled = true, - }, - server = { - host = "127.0.0.1", - port = 9090, - workers = 2, - }, - storage = { - backend = 'filesystem, - path = "/home/user/.provisioning/data", - }, - }, -} -``` - -## .gitignore Pattern - -```bash -# values/.gitignore -*.ncl # Ignore all Nickel config files (user-specific) -!*.example.ncl # EXCEPT example files (tracked for documentation) -``` - -This ensures: -- User configs (`orchestrator.solo.ncl`) are NOT committed -- Example configs (`orchestrator.example.ncl`) ARE committed -- 
Each user has their own configs without merge conflicts - -## Example Template - -`orchestrator.example.ncl` provides a documented template: - -```nickel -# orchestrator.example.ncl -# Example configuration for Orchestrator service -# Copy to orchestrator.{mode}.ncl and customize for your environment - -{ - orchestrator = { - # Workspace Configuration - workspace = { - # Name of the workspace - name = "default", - - # Absolute path to workspace directory - path = "/var/lib/provisioning/orchestrator", - - # Enable this workspace - enabled = true, - - # Allow serving multiple workspaces - multi_workspace = false, - }, - - # HTTP Server Configuration - server = { - # Bind address (127.0.0.1 for local only, 0.0.0.0 for network) - host = "127.0.0.1", - - # Listen port - port = 9090, - - # Worker thread count - workers = 4, - - # Keep-alive timeout (seconds) - keep_alive = 75, - }, - - # Storage Configuration - storage = { - # Backend: 'filesystem | 'rocksdb | 'surrealdb | 'postgres - backend = 'filesystem, - - # Path for filesystem/rocksdb storage - path = "/var/lib/provisioning/orchestrator/data", - }, - - # Queue Configuration - queue = { - # Maximum concurrent tasks - max_concurrent_tasks = 5, - - # Retry attempts for failed tasks - retry_attempts = 3, - - # Delay between retries (milliseconds) - retry_delay = 5000, - - # Task execution timeout (milliseconds) - task_timeout = 3600000, - }, - }, -} -``` - -## Configuration Workflow - -### 1. Generate Initial Config - -```toml -nu scripts/configure.nu orchestrator solo -``` - -Creates `values/orchestrator.solo.ncl` from form input. - -### 2. Edit Configuration - -```toml -# Manually edit if needed -vi values/orchestrator.solo.ncl - -# Or reconfigure with wizard -nu scripts/configure.nu orchestrator solo --backend web -``` - -### 3. Validate Configuration - -```toml -nu scripts/validate-config.nu values/orchestrator.solo.ncl -``` - -### 4. 
Generate TOML for Services - -```toml -nu scripts/generate-configs.nu orchestrator solo -``` - -Exports to `provisioning/platform/config/orchestrator.solo.toml` (consumed by Rust services). - -## Configuration Composition - -User configs are composed with defaults during generation: - -```toml -defaults/orchestrator-defaults.ncl (base values) - ↓ & -values/orchestrator.solo.ncl (user customizations) - ↓ -configs/orchestrator.solo.ncl (final generated config) - ↓ -provisioning/platform/config/orchestrator.solo.toml (Rust service config) -``` - -## Best Practices - -1. **Start with example** - Copy `orchestrator.example.ncl` as template -2. **Document changes** - Add inline comments explaining customizations -3. **Use TypeDialog** - Let wizard handle configuration for you -4. **Validate before deploying** - Always run `validate-config.nu` -5. **Keep defaults** - Only override what you need to change -6. **Backup important configs** - Save known-good configurations - -## Sharing Configurations - -Since user configs are gitignored, sharing requires: - -### Option 1: Share via File - -```bash -# Export current config -cat values/orchestrator.solo.ncl > /tmp/orchestrator-config.ncl - -# Import on another system -cp /tmp/orchestrator-config.ncl values/orchestrator.solo.ncl -``` - -### Option 2: Use Example Template -Share setup instructions instead of raw config: - -```toml -# Document the setup steps -cat > SETUP.md << EOF -1. Run: nu scripts/configure.nu orchestrator solo -2. Set workspace path: /shared/workspace -3. Set storage backend: postgres -4. 
Set server workers: 8 -EOF -``` - -### Option 3: Store in Separate Repo -For team configs, use a separate private repository: - -```toml -# Clone team configs -git clone private-repo/provisioning-configs values/ - -# Use team configs -cp values/team-orchestrator-solo.ncl values/orchestrator.solo.ncl -``` - -## File Permissions - -User config files should have restricted permissions: - -```toml -# Secure config file (if contains secrets) -chmod 600 values/orchestrator.solo.ncl -``` - -## Recovery - -If you accidentally delete a user config: - -### Option 1: Regenerate from TypeDialog - -```nushell -nu scripts/configure.nu orchestrator solo -``` - -### Option 2: Copy from Backup - -```bash -cp /backup/provisioning-values/orchestrator.solo.ncl values/ -``` - -### Option 3: Use Example as Base - -```bash -cp examples/orchestrator-solo.ncl values/orchestrator.solo.ncl -# Customize as needed -nu scripts/configure.nu orchestrator solo --backend web -``` - -## Troubleshooting - -### Config File Missing - -```toml -# Regenerate from defaults -nu scripts/configure.nu orchestrator solo -``` - -### Config Won't Validate - -```toml -# Check for syntax errors -nickel eval values/orchestrator.solo.ncl - -# Compare with example -diff examples/orchestrator-solo.ncl values/orchestrator.solo.ncl -``` - -### Changes Not Taking Effect - -```bash -# Regenerate TOML from Nickel -nu scripts/generate-configs.nu orchestrator solo - -# Verify TOML was updated -ls -la provisioning/platform/config/orchestrator.solo.toml -``` - ---- - -**Version**: 1.0.0 -**Last Updated**: 2025-01-05 diff --git a/schemas/platform/vault-service.ncl b/schemas/platform/vault-service.ncl new file mode 100644 index 0000000..efd44b0 --- /dev/null +++ b/schemas/platform/vault-service.ncl @@ -0,0 +1,131 @@ +# Vault Service Schema +# Secrets management and encryption configuration + +let constraints = import "schemas/platform/common/constraints.ncl" in +let docker_build_schema = import 
"schemas/platform/docker-build.ncl" in + +let VaultStorage = + std.contract.custom ( + fun label => + fun value => + let valid_backends = ["surrealdb", "etcd", "postgresql", "filesystem"] in + if std.array.any (fun x => x == value) valid_backends then + 'Ok value + else + 'Error { + message = "Invalid storage_backend '%{value}'.\nValid values: surrealdb | etcd | postgresql | filesystem" + } + ) in + +let DeploymentMode = + std.contract.custom ( + fun label => + fun value => + let valid_modes = ["local", "docker", "kubernetes"] in + if std.array.any (fun x => x == value) valid_modes then + 'Ok value + else + 'Error { + message = "Invalid deployment_mode '%{value}'.\nValid values: local | docker | kubernetes" + } + ) in + +let LogLevel = + std.contract.custom ( + fun label => + fun value => + let valid_levels = ["debug", "info", "warn", "error"] in + if std.array.any (fun x => x == value) valid_levels then + 'Ok value + else + 'Error { + message = "Invalid log level '%{value}'.\nValid values: debug | info | warn | error" + } + ) in + +let HAMode = + std.contract.custom ( + fun label => + fun value => + let valid_modes = ["active-passive", "active-active"] in + if std.array.any (fun x => x == value) valid_modes then + 'Ok value + else + 'Error { + message = "Invalid HA mode '%{value}'.\nValid values: active-passive | active-active" + } + ) in + +let EncryptionAlgorithm = + std.contract.custom ( + fun label => + fun value => + let valid_algos = ["aes-256-gcm", "aes-128-gcm", "chacha20-poly1305"] in + if std.array.any (fun x => x == value) valid_algos then + 'Ok value + else + 'Error { + message = "Invalid encryption_algorithm '%{value}'.\nValid values: aes-256-gcm | aes-128-gcm | chacha20-poly1305" + } + ) in + +{ + VaultServiceConfig = { + # Server configuration (port must be >= 9000 for vault-service) + server | { + host | String, + port | Number | constraints.port_high, + workers | Number | optional, + keep_alive | Number | optional, + max_connections | Number | 
optional, + } | optional, + + # Storage backend configuration + storage | { + backend | VaultStorage, + path | String | optional, + encryption_key_path | String | optional, + } | optional, + + # Vault-specific settings + vault | { + server_url | String, + storage_backend + | doc "Storage Backend for Vault" + | VaultStorage + | default = "filesystem", + deployment_mode | DeploymentMode | optional, + auth_token | String | optional, + mount_point | String | default = "transit", + key_name | String | default = "provisioning-master", + tls_verify | Bool | default = false, + tls_ca_cert | String | optional, + } | optional, + + # High Availability configuration + ha | { + enabled | Bool | default = false, + mode | HAMode | optional, + } | optional, + + # Security configuration + security | { + encryption_algorithm | EncryptionAlgorithm | optional, + key_rotation_days | Number | optional, + } | optional, + + # Monitoring and logging + monitoring | { + enabled | Bool | default = false, + metrics_interval | Number | optional, + } | optional, + + logging | { + level | LogLevel | default = "info", + format | String | optional, + } | optional, + + # Docker build configuration + build | docker_build_schema.DockerBuildConfig | optional, + }, +} diff --git a/schemas/project-card.ncl b/schemas/project-card.ncl new file mode 100644 index 0000000..f9aeec9 --- /dev/null +++ b/schemas/project-card.ncl @@ -0,0 +1,20 @@ +{ + ProjectCard = { + id | String, + name | String, + tagline | String | default = "", + description | String | default = "", + version | String | default = "", + status | [| 'Active, 'Archived, 'Planned, 'Paused |] | default = 'Active, + source | [| 'Local, 'Remote, 'External |] | default = 'Local, + url | String | default = "", + repo | String | default = "", + started_at | String | default = "", + tags | Array String | default = [], + tools | Array String | default = [], + features | Array String | default = [], + featured | Bool | default = false, + sort_order | 
Number | default = 99, + logo | String | default = "", + }, +} diff --git a/schemas/providers/backup.ncl b/schemas/providers/backup.ncl new file mode 100644 index 0000000..d2dfa6c --- /dev/null +++ b/schemas/providers/backup.ncl @@ -0,0 +1,106 @@ +# Backup provider contract — the interface every backup/restore provider must implement. +# Consumed by op.nu to build and execute archive commands generically. +# Each provider declares its binary, features, environment requirements, and +# the exact subcommands + flags needed for each operation. + +let _Features = { + tags | Bool | doc "Supports --tag key=value on snapshot create" | default = true, + ui | Bool | doc "Has built-in web UI (e.g. kopia server start --ui)" | default = false, + verify | Bool | default = true, + mount | Bool | doc "Supports FUSE mount of snapshots" | default = false, + encryption | Bool | doc "Encrypts snapshots end-to-end (non-negotiable for this stack)" | default = true, + compression | Bool | default = false, + dedup | [| 'none, 'per_repo, 'global |] | default = 'per_repo, + streaming | Bool | doc "Supports backup from stdin (pg_dump pipe, etc.)" | default = false, +} in + +let _Env = { + required | Array String | doc "Must be set before any operation", + optional | Array String | default = [], +} in + +let _Connection = { + required | Bool | doc "Must call connect before backup (kopia=true, restic=false)" | default = false, + status_subcmd | String | optional | doc "Subcommand to check if already connected", + connect_subcmd | String | optional | doc "Subcommand to establish connection to the repo", + state_file | String | optional | doc "Filename stored under ops_dir for per-workspace connection state (e.g. '.kopia-config')", + s3_flags | { + bucket | String | optional, + endpoint | String | optional, + prefix | String | optional, + } | default = {}, +} in + +let _BackupCmd = { + subcmd | String | doc "Subcommand (may contain spaces, e.g. 
'snapshot create')", + repo_flag | String | optional | doc "Flag for repository URL; absent if config-file based", + tag_flag | String | optional | doc "Flag for a single tag entry (repeated per tag)", + snapshot_id_regex | String | doc "Regex with named group 'id' to extract snapshot ID from stdout", +} in + +let _RestoreCmd = { + subcmd | String, + repo_flag | String | optional, + target_flag | String | doc "Flag for restore destination; empty string = positional arg", +} in + +let _ListCmd = { + subcmd | String, + repo_flag | String | optional, + tag_flag | String | optional, +} in + +let _ForgetCmd = { + subcmd | String | doc "Subcommand that removes old snapshots", + repo_flag | String | optional, + tag_flag | String | optional | doc "Scope forget to a specific tag (workspace isolation)", + keep_last_flag | String | optional | doc "Absent when retention is controlled via policy_subcmd", + keep_monthly_flag | String | optional, + keep_yearly_flag | String | optional, + extra_flags | Array String | default = [], + policy_subcmd | String | optional | doc "If set, run policy subcommand before forget (kopia model)", + policy_keep_flags | { + keep_latest | String | default = "--keep-latest", + keep_monthly | String | default = "--keep-monthly", + keep_annual | String | default = "--keep-annual", + } | optional, +} in + +let _VerifyCmd = { + subcmd | String, + repo_flag | String | optional, +} in + +# EncryptionRequired contract: provider features must include encryption=true. +# A BackupPolicy referencing a provider that does not encrypt fails at +# `nickel export` time, not at runtime — this is how the non-negotiable +# E2E encryption invariant is enforced. 
+let _EncryptionRequired = std.contract.from_validator (fun value => + if value.features.encryption == true + then 'Ok + else 'Error { + message = "BackupProvider lacks 'encryption in features (E2E encryption is non-negotiable)", + } +) in + +{ + BackupProvider = { + name | String | doc "Provider identifier — must match the directory name under extensions/providers/backup/", + binary | String | doc "CLI binary invoked for all operations", + features | _Features | default = {}, + env | _Env, + connection | _Connection | default = {}, + mount_capable | Bool | doc "Convenience flag mirroring features.mount" | default = false, + streaming_capable | Bool | doc "Convenience flag mirroring features.streaming" | default = false, + commands = { + backup | _BackupCmd, + restore | _RestoreCmd, + list | _ListCmd, + forget | _ForgetCmd, + verify | _VerifyCmd | optional, + }, + }, + + # Apply this contract to provider definitions to enforce E2E encryption. + EncryptionRequired = _EncryptionRequired, +} diff --git a/schemas/security/config-merger.ncl b/schemas/security/config-merger.ncl new file mode 100644 index 0000000..cab6848 --- /dev/null +++ b/schemas/security/config-merger.ncl @@ -0,0 +1,125 @@ +# Config Merger - Combine defaults, infrastructure config, and secrets + +let helpers = import "schemas/platform/common/helpers.ncl" in + +{ + # Merge configuration with overrides using deep merge + # Preserves nested structures, doesn't lose partial updates + merge_deep = fun base override => + helpers.compose_config base override base, + + # Configuration pipeline: defaults → infrastructure → secrets + pipeline = fun defaults infrastructure_config secrets => + let step1 = compose_config defaults infrastructure_config {} in + compose_config step1 secrets {}, + + # Compose configurations with proper precedence + # Order: base (lowest) → layer1 → layer2 (highest) + compose_config = fun base layer1 layer2 => + let merge_two = fun a b => + std.record.fold ( + fun acc key value => + if 
std.record.has_field key a && std.record.is_record (std.record.get key a) && std.record.is_record value then + acc & { + (key) = merge_two (std.record.get key a) value, + } + else + acc & { + (key) = value, + } + ) a b in + let combined = merge_two base layer1 in + merge_two combined layer2, + + # Extract environment-specific configuration + by_environment = fun config environment => + let base_config = config | std.record.get_path ["environments", "all"] in + let env_config = config | std.record.get_path ["environments", environment] in + compose_config base_config env_config {}, + + # Build final deployment configuration + # Inputs: defaults, user overrides, secrets, deployment mode + build = fun defaults user_config secrets deployment_mode environment => + let base = compose_config defaults user_config {} in + let with_secrets = compose_config base secrets {} in + let mode_config = ( + with_secrets | std.record.get_path ["deployment_modes", deployment_mode] | default {} + ) in + let final = compose_config with_secrets mode_config {} in + { + config = final, + environment = environment, + deployment_mode = deployment_mode, + merged_from = ["defaults", "user_config", "secrets", "mode_specific"], + }, + + # Validate merged configuration completeness + validate_complete = fun config required_paths => + let check_path = fun path => + let try_get = std.record.get_path (std.string.split "." 
path) config in
+      if try_get == null || (std.string.is_string try_get && std.string.is_empty try_get) then
+        { path = path, valid = false }
+      else
+        { path = path, valid = true } in
+    let results = std.array.map check_path required_paths in
+    let invalid = std.array.filter (fun r => not r.valid) results in
+    {
+      valid = std.array.length invalid == 0,
+      checked_paths = std.array.length required_paths,
+      invalid_paths = invalid,
+    },
+
+  # Return the sub-record stored under `component_name`; yields {} when the field
+  # is absent or is not a record (useful for creating component-specific config files)
+  extract_component = fun config component_name =>
+    let component = std.record.get_or component_name {} config in
+    if std.is_record component then
+      component
+    else
+      {},
+
+  # Flatten nested configuration into "<prefix>_<key>" string fields (for environment variables)
+  flatten = fun config prefix =>
+    let rec flatten_impl = fun obj current_prefix =>
+      std.array.fold_left (
+        fun acc { field = key, value } =>
+          let new_key = "%{current_prefix}_%{key}" in
+          if std.is_record value then
+            acc & (flatten_impl value new_key)
+          else
+            acc & {
+              "%{new_key}" = if std.is_string value then value else std.to_string value,
+            }
+      ) {} (std.record.to_array obj) in
+    flatten_impl config prefix,
+
+  # Create configuration snapshot for audit/compliance (config_hash = SHA-256 of the JSON serialization)
+  snapshot = fun config metadata =>
+    {
+      timestamp = metadata.timestamp,
+      environment = metadata.environment,
+      version = metadata.config_version,
+      deployment_mode = metadata.deployment_mode,
+      config_hash = (
+        std.hash 'Sha256 (std.serialize 'Json config)
+      ),
+      merged_sources = metadata.sources,
+      audit_log = metadata.audit_log,
+    },
+
+  # Deep merge utility - recursive merge of nested records; later records override earlier ones
+  deep_merge = fun records =>
+    let merge_pair = fun a b =>
+      std.array.fold_left (
+        fun acc { field = key, value } =>
+          if std.record.has_field key a then
+            let existing = std.record.get key a in
+            if std.is_record existing && std.is_record value then
+              std.record.update key (deep_merge [existing, value]) acc
+            else
+              std.record.update key value acc
+          else
+            std.record.update key value acc
+      ) a (std.record.to_array b) in
+    std.array.fold_left merge_pair {} records,
+}
diff --git a/schemas/security/main.ncl b/schemas/security/main.ncl
new file mode 100644
index 0000000..64cd6e1
--- /dev/null
+++ b/schemas/security/main.ncl
@@ -0,0 +1,59 @@
+# Security Module - Unified secrets and encryption management
+# Integrates SOPS, Age keys, and vault-service for GitOps-native secret management
+
+let sops_lib = import "sops/main.ncl" in
+
+{
+  # SOPS encryption configuration (bound to sops_lib: `sops = sops` would refer to this field itself)
+  sops = sops_lib,
+
+  # Security configuration for a deployment
+  SecurityConfig = {
+    # Which encryption system to use (sops, sealed-secrets, etc.)
+    encryption_system | String | doc "Encryption system: 'sops' or 'sealed-secrets'" | default = "sops",
+
+    # Environment-specific SOPS configuration
+    sops_config | sops_lib.SopsEnvironmentConfig | doc "SOPS configuration per environment"
+      | optional,
+
+    # Environment: dev, staging, prod
+    environment | String | doc "Deployment environment" | default = "dev",
+
+    # Age key version for tracking rotations
+    age_key_version | Number | doc "Age key version (tracks rotations)" | default = 1,
+
+    # Vault service configuration (left unset rather than null, which is not a String)
+    vault_service_url | String | doc "Vault-service endpoint URL"
+      | optional,
+
+    # Key rotation schedule (optional)
+    key_rotation_interval_days | Number | doc "Days between key rotations"
+      | optional,
+
+    # Audit logging configuration
+    audit_logging | Bool | doc "Enable audit logging for secret access" | default = true,
+  },
+
+  # Initialize security for an environment
+  init = fun environment =>
+    {
+      encryption_system = "sops",
+      environment = environment,
+      age_key_version = 1,
+      audit_logging = true,
+    },
+
+  # Helper to get SOPS rules for an environment
+  get_sops_rules = fun environment =>
+    sops_lib.generate_sops_yaml environment,
+
+  # Helper to generate .sops.yaml content for deployment
+  generate_sops_file = fun environment age_public_key =>
+    sops_lib.generate_file environment age_public_key,
+
+  # Validate security configuration: sops only, known environment, key version >= 1
+  validate = fun config =>
+    config.encryption_system == "sops" &&
+    std.array.elem config.environment ["dev", "staging", "prod"] &&
+    config.age_key_version >= 1,
+}
diff --git a/schemas/security/secrets-loader.ncl b/schemas/security/secrets-loader.ncl
new file mode 100644
index 0000000..d27c722
--- /dev/null
+++ b/schemas/security/secrets-loader.ncl
@@ -0,0 +1,126 @@
+# Secrets Loader - Import and merge encrypted YAML secrets into Nickel configs
+
+{
+  # Type for a secrets configuration
+  SecretsConfig = {
+    # Source YAML file path (can be SOPS-encrypted)
+    source_path | String | doc "Path to YAML secrets file (relative or absolute)",
+    # Environment (dev, staging, prod) - determines which secrets file to load
+    environment | String | doc "Environment: dev, staging, or prod" | default = "dev",
+    # Whether to merge with defaults
+    merge_defaults | Bool | doc "Merge with default configuration" | default = true,
+  },
+
+  # Load secrets from a YAML file
+  # NOTE(review): stub - returns the JSON text of `{ path = source_path }`; no file is read or decrypted here
+  load = fun source_path =>
+    let contents = std.string.trim (
+      # Import the YAML file as a string
+      # At deployment time, this will be decrypted SOPS file
+      std.serialize 'Json { path = source_path }
+    ) in
+    # Parse YAML as record structure
+    # Assumes file is in format:
+    # key1:
+    #   nested: value
+    # key2: value
+    contents,
+
+  # Build the candidate paths for environment-specific secrets
+  # Intended lookup order: secrets.{env}.yaml → secrets.yaml → {} (this function only returns the paths)
+  load_env = fun base_path environment =>
+    let env_path = "%{base_path}/secrets.%{environment}.yaml" in
+    let default_path = "%{base_path}/secrets.yaml" in
+    {
+      env_specific = env_path,
+      default = default_path,
+      environment = environment,
+    },
+
+  # Merge secrets into configuration template
+  # Replaces placeholder values with actual secrets (recurses into nested records)
+  merge = fun template secrets =>
+    let rec replace_placeholders = fun obj =>
+      std.record.map (
+        fun _key value =>
+          if std.is_string value then
+            # Check if value is a placeholder like "${secret:database.password}"
+            if std.string.is_match "^\\$\\{secret:" value then
+              let secret_path = (
+                value
+                |> std.string.replace "${secret:" ""
+                |> std.string.replace "}" ""
+              ) in
+              # Navigate to secret_path in secrets record
+              # e.g., "database.password" → secrets.database.password
+              std.array.fold_left (fun node key => std.record.get key node) secrets (std.string.split "." secret_path)
+            else
+              value
+          else if std.is_record value then
+            replace_placeholders value
+          else
+            value
+      ) obj in
+    replace_placeholders template,
+
+  # Extract secrets whose top-level key contains `pattern`
+  # Useful for selecting subset of secrets
+  extract_by_path = fun secrets pattern =>
+    let matches_pattern = fun key =>
+      key |> std.string.contains pattern in
+    secrets
+    |> std.record.to_array
+    |> std.array.filter (fun item => matches_pattern item.field)
+    |> std.array.fold_left (fun acc item =>
+      acc & { "%{item.field}" = item.value }
+    ) {},
+
+  # Validate that every key in required_keys is a top-level field of secrets
+  validate_secrets = fun secrets required_keys =>
+    let missing = (
+      required_keys
+      |> std.array.filter (fun key =>
+        !(std.record.has_field key secrets)
+      )
+    ) in
+    if std.array.length missing > 0 then
+      {
+        valid = false,
+        missing_keys = missing,
+        error = "Missing required secret keys: %{std.string.join ", " missing}",
+      }
+    else
+      {
+        valid = true,
+        missing_keys = [],
+        error = null,
+      },
+
+  # Resolve secrets from multiple sources (environment variables, files, defaults)
+  # NOTE(review): `std.string.from_env` is not a stdlib function I can confirm - Nickel may have no env access; verify
+  resolve = fun secret_specs =>
+    let resolved = fun spec =>
+      let env_var = "PROVISIONING_SECRET_%{spec.key}" in
+      {
+        key = spec.key,
+        source = (
+          if (std.string.from_env env_var) == "" then
+            "file"
+          else
+            "env"
+        ),
+        value = (
+          std.string.from_env env_var
+          |> (fun env_val =>
+            if env_val == "" then
+              spec.default_value
+            else
+              env_val
+          )
+        ),
+      } in
+    secret_specs
+    |> std.array.map resolved
+    |> std.array.fold_left (fun acc item =>
+      acc & { "%{item.key}" = item.value }
+    ) {},
+}
diff --git
a/schemas/security/sops/contracts.ncl b/schemas/security/sops/contracts.ncl
new file mode 100644
index 0000000..fc2cff9
--- /dev/null
+++ b/schemas/security/sops/contracts.ncl
@@ -0,0 +1,36 @@
+# SOPS Configuration Schema - Type contracts and validation
+
+{
+  SopsRule = {
+    # Regex pattern to match file paths for encryption
+    path_regex | String | doc "File path regex pattern (e.g., '\\.prod\\.yaml$')"
+      | optional,
+    # Age public key for encryption (mandatory: fields are required unless marked optional)
+    age | String
+      | doc "Age public key (x25519 format: age1...)",
+    # Regex to match fields that should be encrypted within matched files
+    encrypted_regex | String | doc "Field name regex for encryption (e.g., '^(password|token)$')"
+      | optional,
+    # Key version for tracking key rotation
+    key_version | Number | doc "Version number for key rotation tracking"
+      | optional,
+  },
+
+  SopsConfig = {
+    # Creation rules define which Age key encrypts which files (mandatory)
+    creation_rules | Array SopsRule
+      | doc "Array of encryption rules, evaluated sequentially",
+    # Decryption rules (optional, for post-rotation compatibility)
+    key_groups | Array (Array SopsRule) | doc "Array of key groups for backward compatibility with rotated keys"
+      | optional,
+  },
+
+  SopsEnvironmentConfig = {
+    # Dev environment configuration
+    dev | SopsConfig | doc "Development environment SOPS rules" | optional,
+    # Staging environment configuration
+    staging | SopsConfig | doc "Staging environment SOPS rules" | optional,
+    # Production environment configuration
+    prod | SopsConfig | doc "Production environment SOPS rules" | optional,
+  },
+}
diff --git a/schemas/security/sops/defaults.ncl b/schemas/security/sops/defaults.ncl
new file mode 100644
index 0000000..fb35509
--- /dev/null
+++ b/schemas/security/sops/defaults.ncl
@@ -0,0 +1,71 @@
+# SOPS Configuration Defaults - Environment-specific encryption rules
+
+let SopsRule = import "contracts.ncl" in
+
+{
+  # Development environment: Single Age key, encrypts all YAML files
+  dev = {
+    creation_rules = [
+      {
+        path_regex = "\\.dev\\.yaml$",
+        age = "", # Will be populated by vault-service
+        encrypted_regex = "^(password|token|key|secret|api_key)$",
+        key_version = 1,
+      },
+      {
+        # Catchall for dev
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key)$",
+        key_version = 1,
+      },
+    ],
+  },
+
+  # Staging environment: Single Age key, more restrictive encryption
+  staging = {
+    creation_rules = [
+      {
+        path_regex = "\\.staging\\.yaml$",
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key|database_url)$",
+        key_version = 1,
+      },
+      {
+        path_regex = "\\.stg\\.yaml$",
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key|database_url)$",
+        key_version = 1,
+      },
+      {
+        # Catchall for staging
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key|database_url)$",
+        key_version = 1,
+      },
+    ],
+  },
+
+  # Production environment: Single Age key, strictest encryption
+  prod = {
+    creation_rules = [
+      {
+        path_regex = "\\.prod\\.yaml$",
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key|database_url|tls_cert|tls_key)$",
+        key_version = 1,
+      },
+      {
+        path_regex = "\\.k\\.prod\\.yaml$",
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key|database_url|tls_cert|tls_key)$",
+        key_version = 1,
+      },
+      {
+        # Catchall for prod
+        age = "",
+        encrypted_regex = "^(password|token|key|secret|api_key|database_url|tls_cert|tls_key)$",
+        key_version = 1,
+      },
+    ],
+  },
+}
diff --git a/schemas/security/sops/generator.ncl b/schemas/security/sops/generator.ncl
new file mode 100644
index 0000000..51bc521
--- /dev/null
+++ b/schemas/security/sops/generator.ncl
@@ -0,0 +1,69 @@
+# SOPS YAML Generator - Converts Nickel SOPS config to .sops.yaml format
+
+let contracts = import "contracts.ncl" in
+let defaults = import "defaults.ncl" in
+
+{
+  # Generate a single SOPS rule as YAML-compatible record (path_regex is omitted when absent or empty)
+  rule_to_yaml = fun rule =>
+    let base = {
+      age = rule.age,
+    } in
+    let with_path = if !(std.record.has_field "path_regex" rule) || rule.path_regex == "" then base else
+      base & { path_regex = rule.path_regex } in
+    let with_regex = if std.record.has_field "encrypted_regex" rule then
+      with_path & { encrypted_regex = rule.encrypted_regex } else
+      with_path in
+    with_regex,
+
+  # Generate creation_rules section
+  generate_creation_rules = fun config =>
+    {
+      creation_rules = std.array.map rule_to_yaml config.creation_rules,
+    },
+
+  # Generate full .sops.yaml configuration for an environment
+  generate_sops_yaml = fun environment =>
+    let env_config = std.record.get environment defaults in
+    let rules_section = generate_creation_rules env_config in
+    rules_section,
+
+  # Generate all environment configurations (takes one ignored argument; `fun =>` has no parameter to bind)
+  generate_all_environments = fun _ =>
+    {
+      dev = generate_sops_yaml "dev",
+      staging = generate_sops_yaml "staging",
+      prod = generate_sops_yaml "prod",
+    },
+
+  # Helper: Get Age public key from vault-service response
+  extract_public_key = fun vault_response =>
+    vault_response.public_key,
+
+  # Helper: Update config with actual Age keys from vault-service
+  # NOTE(review): returns sops_config unchanged - update_rule is defined but never applied; confirm intent
+  inject_vault_keys = fun sops_config vault_keys =>
+    let update_rule = fun rule environment =>
+      rule & {
+        age = std.record.get environment vault_keys,
+      } in
+    sops_config,
+
+  # Serialize SOPS config to YAML-compatible text (rule keys are indented under their list dash)
+  serialize_to_yaml = fun config =>
+    let serialize_rule = fun rule =>
+      let parts = [] in
+      let parts = if std.record.has_field "path_regex" rule then
+        parts @ [ "  - path_regex: %{rule.path_regex}" ] else
+        parts @ [ "  -" ] in
+      let parts = parts @ [
+        "    age: '%{rule.age}'"
+      ] in
+      let parts = if std.record.has_field "encrypted_regex" rule then
+        parts @ [ "    encrypted_regex: '%{rule.encrypted_regex}'" ] else
+        parts in
+      std.string.join "\n" parts in
+
+    let rules_text = std.string.join "\n"
+      (std.array.map serialize_rule config.creation_rules) in
+    "# SOPS creation rules - evaluated sequentially, first match wins\ncreation_rules:\n%{rules_text}\n",
+}
diff --git
a/schemas/security/sops/main.ncl b/schemas/security/sops/main.ncl
new file mode 100644
index 0000000..31f678f
--- /dev/null
+++ b/schemas/security/sops/main.ncl
@@ -0,0 +1,49 @@
+# SOPS Integration Module - Manage encrypted secrets with Age keys
+
+let contracts = import "contracts.ncl" in
+let sops_defaults = import "defaults.ncl" in
+let generator = import "generator.ncl" in
+
+{
+  # Type exports
+  SopsRule = contracts.SopsRule,
+  SopsConfig = contracts.SopsConfig,
+  SopsEnvironmentConfig = contracts.SopsEnvironmentConfig,
+
+  # Configuration exports (bound to sops_defaults: `defaults = defaults` would refer to this field itself)
+  defaults = sops_defaults,
+
+  # Generator exports
+  generate_creation_rules = generator.generate_creation_rules,
+  generate_sops_yaml = generator.generate_sops_yaml,
+  generate_all_environments = generator.generate_all_environments,
+  serialize_to_yaml = generator.serialize_to_yaml,
+
+  # Initialize SOPS configuration for an environment: every rule's `age` is replaced by age_public_key
+  init_environment = fun environment age_public_key =>
+    let config = generator.generate_sops_yaml environment in
+    let update_rules = fun rules =>
+      std.array.map (fun rule => std.record.update "age" age_public_key rule) rules in
+    std.record.update "creation_rules" (update_rules config.creation_rules) config,
+
+  # Merge user SOPS rules with defaults (user rules take precedence; defaults apply when none are given)
+  merge_with_defaults = fun environment user_config =>
+    let default_config = generator.generate_sops_yaml environment in
+    {
+      creation_rules = std.record.get_or "creation_rules" default_config.creation_rules user_config,
+    },
+
+  # Validate SOPS configuration: at least one rule, and every rule has a plausible Age key
+  validate_config = fun config =>
+    let has_rules = std.array.length config.creation_rules > 0 in
+    let all_rules_valid = std.array.all (fun rule =>
+      rule.age != "" &&
+      std.string.length rule.age > 10 # Basic Age key format check
+    ) config.creation_rules in
+    has_rules && all_rules_valid,
+
+  # Generate complete .sops.yaml file content
+  generate_file = fun environment age_public_key =>
+    let config = init_environment environment age_public_key in
+    generator.serialize_to_yaml config,
+}
diff --git a/schemas/tests/fixtures/backup_empty_scopes.ncl b/schemas/tests/fixtures/backup_empty_scopes.ncl
new file mode 100644
index 0000000..16d0658
--- /dev/null
+++ b/schemas/tests/fixtures/backup_empty_scopes.ncl
@@ -0,0 +1,7 @@
+# Fixture: BackupPolicy with empty scopes array.
+# Expected: NonEmptyScopes contract rejects.
+let bp = import "schemas/lib/backup_policy.ncl" in
+
+{
+  scopes | Array bp.BackupScope | bp.NonEmptyScopes = [],
+}
diff --git a/schemas/tests/fixtures/backup_no_encryption.ncl b/schemas/tests/fixtures/backup_no_encryption.ncl
new file mode 100644
index 0000000..924667b
--- /dev/null
+++ b/schemas/tests/fixtures/backup_no_encryption.ncl
@@ -0,0 +1,33 @@
+# Fixture: BackupProvider declared with encryption=false.
+# Expected: EncryptionRequired contract rejects at nickel export time.
+let Schema = import "schemas/providers/backup.ncl" in
+
+{
+  provider | Schema.BackupProvider | Schema.EncryptionRequired = {
+    name = "no-crypto-provider",
+    binary = "noop",
+    features = {
+      tags = false,
+      ui = false,
+      verify = false,
+      mount = false,
+      encryption = false, # ← deliberately broken
+      compression = false,
+      dedup = 'none,
+      streaming = false,
+    },
+    env = { required = [] },
+    commands = {
+      backup = {
+        subcmd = "backup",
+        snapshot_id_regex = "id (?P\\w+)",
+      },
+      restore = {
+        subcmd = "restore",
+        target_flag = "--target",
+      },
+      list = { subcmd = "list" },
+      forget = { subcmd = "forget" },
+    },
+  },
+}
diff --git a/schemas/tests/fixtures/backup_single_destination.ncl b/schemas/tests/fixtures/backup_single_destination.ncl
new file mode 100644
index 0000000..a4ca318
--- /dev/null
+++ b/schemas/tests/fixtures/backup_single_destination.ncl
@@ -0,0 +1,15 @@
+# Fixture: BackupPolicy with only 1 destination.
+# Expected: MultiDestinationRequired contract rejects.
+let bp = import "schemas/lib/backup_policy.ncl" in
+
+let dest = {
+  name = "only-primary",
+  kind = 's3,
+  uri = "s3:host/bucket",
+  cred_ref = { path = "creds/test", kind = 's3 },
+  role = 'primary,
+} in
+
+{
+  destinations | Array bp.Destination | bp.MultiDestinationRequired = [dest],
+}
diff --git a/schemas/tests/fixtures/component_missing_concerns.ncl b/schemas/tests/fixtures/component_missing_concerns.ncl
new file mode 100644
index 0000000..817d600
--- /dev/null
+++ b/schemas/tests/fixtures/component_missing_concerns.ncl
@@ -0,0 +1,10 @@
+# Fixture: ComponentDef that sets only `name` and `mode`, omitting the `concerns` field.
+# Expected: the lib.ComponentDef schema rejects it (concerns is mandatory).
+let lib = import "schemas/lib/contracts.ncl" in
+
+{
+  component | lib.ComponentDef = {
+    name = "no-concerns",
+    mode = 'taskserv,
+  },
+}
diff --git a/schemas/tests/fixtures/component_valid.ncl b/schemas/tests/fixtures/component_valid.ncl
new file mode 100644
index 0000000..3549a3a
--- /dev/null
+++ b/schemas/tests/fixtures/component_valid.ncl
@@ -0,0 +1,19 @@
+# Fixture: minimal ComponentDef with all 6 concerns declared (3 disabled, 3 pending).
+# Expected: passes; exports valid JSON.
+let lib = import "schemas/lib/contracts.ncl" in +let c = import "schemas/lib/concerns.ncl" in + +{ + component | lib.ComponentDef = { + name = "valid-component", + mode = 'taskserv, + concerns = { + tls = c.disabled "stateless service, no TLS termination needed", + dns = c.disabled "no DNS records owned by this component", + certs = c.disabled "no ACME issuer config", + backup = c.pending "policy to be defined" "BACKUP-001", + observability = c.pending "metrics surface to be defined" "OBS-001", + security = c.pending "rbac/networkpolicy to be defined" "SEC-001", + }, + }, +} diff --git a/schemas/workspace/state.ncl b/schemas/workspace/state.ncl new file mode 100644 index 0000000..192ad1b --- /dev/null +++ b/schemas/workspace/state.ncl @@ -0,0 +1,58 @@ +let node_state_type = [| 'pending, 'running, 'completed, 'failed, 'blocked, 'unknown |] in +let operation_type = [| 'create, 'update, 'delete |] in +let source_type = [| 'cli, 'orchestrator, 'sync |] in +let provider_state_type = [| 'running, 'off, 'unknown |] in + +let actor_type = { + identity | String | doc "Username from CLI session, or 'system' for orchestrator-initiated writes", + source | source_type | doc "Write origin: cli = user invocation, orchestrator = daemon, sync = reconcile command", +} in + +let log_entry_type = { + ts | String | doc "ISO-8601 timestamp of the transition", + event | String | doc "Transition description: started | completed | failed | skipped | sync-confirmed", + source | source_type, +} in + +let taskserv_state_type = { + state | node_state_type | default = 'pending, + operation | operation_type | default = 'create, + profile | String | default = "", + started_at | String | default = "", + ended_at | String | default = "", + blocker + | String + | doc "Taskserv name that is blocking this node (non-empty only when state = 'blocked)" + | default = "", + actor = { + identity | String | default = "", + source | source_type | default = 'orchestrator, + }, + log | Array log_entry_type | 
default = [], +} in + +let server_state_type = { + provider_id | String | default = "", + provider_state | provider_state_type | default = 'unknown, + last_sync | String | default = "", + taskservs | { _ | taskserv_state_type } | default = {}, +} in + +let workspace_state_type = { + workspace | String | doc "Workspace name (directory basename)", + cluster | String | doc "Primary cluster this state file describes", + schema_version | String | default = "2.0", + servers | { _ | server_state_type } | default = {}, +} in + +{ + NodeState = node_state_type, + Operation = operation_type, + Source = source_type, + ProviderState = provider_state_type, + Actor = actor_type, + LogEntry = log_entry_type, + TaskservState = taskserv_state_type, + ServerState = server_state_type, + WorkspaceState = workspace_state_type, +} diff --git a/scripts/audit-workspace.nu b/scripts/audit-workspace.nu new file mode 100644 index 0000000..ce22fee --- /dev/null +++ b/scripts/audit-workspace.nu @@ -0,0 +1,142 @@ +#!/usr/bin/env nu + +# Workspace Audit Query Commands +# Query and analyze workspace operation audit logs + +def get_audit_dir [] { + let home = ($env.HOME | path expand) + $home + "/.local/share/provisioning/audit" +} + +def read_audit_logs [filter_fn: closure] { + let audit_dir = (get_audit_dir) + + if not ($audit_dir | path exists) { + return (error make { msg: $"Audit directory not found: {$audit_dir}" }) + } + + glob $"($audit_dir)/*.jsonl" | each { |file| + cat $file | lines | each { |line| + if ($line | is-empty) { return null } + try { + let event = ($line | from json) + (do $filter_fn $event) | if $in { $event } else { null } + } + } + } | compact +} + +export def "audit workspace operations" [workspace: string] { + let workspace_ops = ["WorkspaceCreate", "WorkspaceDelete", "WorkspaceUpdate", + "WorkspaceSwitch", "WorkspaceList", "WorkspaceSync"] + + let events = (read_audit_logs { |event| + (($event.action.workspace == $workspace) and + ($workspace_ops | any { |op| 
$event.action.action_type == $op })) + }) + + $events + | sort-by timestamp -r + | each { |e| + { + timestamp: $e.timestamp, + action: $e.action.action_type, + user: $e.user.name, + resource: $e.action.resource, + status: $e.result.status, + duration_ms: $e.result.duration_ms, + } + } +} + +export def "audit workspace summary" [days?: int] { + let days = if ($days == null) { 30 } else { $days } + let cutoff_time = ((now) - ($days * 24 * 60 * 60 | into duration)) + let workspace_ops = ["WorkspaceCreate", "WorkspaceDelete", "WorkspaceUpdate", + "WorkspaceSwitch", "WorkspaceList", "WorkspaceSync"] + + let events = (read_audit_logs { |event| + let event_time = ($event.timestamp | into datetime) + (($event_time > $cutoff_time) and + ($workspace_ops | any { |op| $event.action.action_type == $op })) + }) + + # Simple summary: just list unique workspaces and total ops count + let unique_workspaces = ($events | map { |e| $e.action.workspace } | unique) + + $unique_workspaces | each { |ws| + let ws_events = ($events | where { |e| $e.action.workspace == $ws }) + { + workspace: $ws, + total_operations: ($ws_events | length), + operations: ( + $ws_events + | map { |e| $e.action.action_type } + | unique + | each { |op| + { + action: $op, + count: ($ws_events | where { |e| $e.action.action_type == $op } | length) + } + } + ), + } + } | sort-by workspace +} + +export def "audit workspace failures" [workspace: string] { + let events = (read_audit_logs { |event| + (($event.action.workspace == $workspace) and + ($event.result.status != "Success")) + }) + + $events + | sort-by timestamp -r + | each { |e| + { + timestamp: $e.timestamp, + action: $e.action.action_type, + resource: $e.action.resource, + status: $e.result.status, + error: $e.result.error, + duration_ms: $e.result.duration_ms, + } + } +} + +export def "audit workspace switches" [user_id: string] { + let events = (read_audit_logs { |event| + (($event.user.id == $user_id) and + ($event.action.action_type == 
"WorkspaceSwitch")) + }) + + $events + | sort-by timestamp -r + | each { |e| + { + timestamp: $e.timestamp, + user: $e.user.name, + to_workspace: $e.action.workspace, + status: $e.result.status, + duration_ms: $e.result.duration_ms, + } + } +} + +export def "audit workspace user-actions" [user_id: string] { + let events = (read_audit_logs { |event| + $event.user.id == $user_id + }) + + $events + | sort-by timestamp -r + | each { |e| + { + timestamp: $e.timestamp, + action: $e.action.action_type, + workspace: $e.action.workspace, + resource: $e.action.resource, + status: $e.result.status, + } + } +} diff --git a/scripts/build-images.nu b/scripts/build-images.nu new file mode 100755 index 0000000..70f99c8 --- /dev/null +++ b/scripts/build-images.nu @@ -0,0 +1,137 @@ +#!/usr/bin/env nu + +# Build Docker images for provisioning platform +# Usage: ./build-images.nu [service...] +# ./build-images.nu # Build all services +# ./build-images.nu orchestrator # Build orchestrator only +# ./build-images.nu orchestrator control-center mcp-server # Multiple services + +use std log + +def main [...services: string] { + let provisioning_root = (pwd) + + # Define all available services and their Dockerfiles + let all_services = { + orchestrator: { + dockerfile: "provisioning/platform/crates/orchestrator/Dockerfile", + tag: "provisioning-orchestrator:latest", + context: "provisioning/platform", + description: "Orchestrator - workflow engine and task queue", + }, + control-center: { + dockerfile: "provisioning/platform/crates/control-center/Dockerfile", + tag: "provisioning-control-center:latest", + context: "provisioning/platform", + description: "Control Center - policy and RBAC management", + }, + mcp-server: { + dockerfile: "provisioning/platform/crates/mcp-server/Dockerfile", + tag: "provisioning-mcp-server:latest", + context: "provisioning/platform", + description: "MCP Server - AI/LLM integration", + }, + extension-registry: { + dockerfile: 
"provisioning/platform/crates/extension-registry/Dockerfile", + tag: "provisioning-extension-registry:latest", + context: "provisioning/platform", + description: "Extension Registry - plugin management", + }, + rag: { + dockerfile: "provisioning/platform/crates/rag/docker/Dockerfile", + tag: "provisioning-rag:latest", + context: "provisioning/platform", + description: "RAG Service - retrieval augmented generation", + }, + provisioning-daemon: { + dockerfile: "provisioning/platform/crates/provisioning-daemon/Dockerfile", + tag: "provisioning-daemon:latest", + context: "provisioning/platform", + description: "Provisioning Daemon - Nushell execution and config rendering", + }, + } + + # Determine which services to build + let target_services = if ($services | is-empty) { + ["orchestrator", "control-center", "provisioning-daemon", "mcp-server", "extension-registry", "rag"] + } else { + $services + } + + # Validate services + for service in $target_services { + try { + $all_services | get $service | ignore + } catch { + log error $"Unknown service: $service" + log error $"Available services:" + log error $" • orchestrator - Orchestrator - workflow engine and task queue" + log error $" • control-center - Control Center - policy and RBAC management" + log error $" • mcp-server - MCP Server - AI/LLM integration" + log error $" • extension-registry - Extension Registry - plugin management" + log error $" • rag - RAG Service - retrieval augmented generation" + log error $" • provisioning-daemon - Provisioning Daemon - Nushell execution and config rendering" + error make { msg: "Invalid service specified" } + } + } + + log info "🐳 Building Docker images for provisioning platform..." 
+ log info "" + log info "Services to build:" + for service in $target_services { + let svc_config = $all_services | get $service + log info $" • ($service) - ($svc_config.description)" + } + log info "" + + # Build each service + let build_results = ( + $target_services | each { |service| + let svc_cfg = ($all_services | get $service) + let dockerfile = $"($provisioning_root)/($svc_cfg.dockerfile)" + let tag = $svc_cfg.tag + let context = $"($provisioning_root)/($svc_cfg.context)" + + log info $"📦 Building ($service)..." + log info $" Dockerfile: ($svc_cfg.dockerfile)" + log info $" Tag: ($tag)" + log info $" Context: ($svc_cfg.context)" + + let build_result = (do { + ^docker build -f $dockerfile -t $tag $context + } | complete) + + if $build_result.exit_code == 0 { + log info $"✅ ($service) built successfully" + { service: $service, status: "success", tag: $tag } + } else { + log error $"❌ ($service) build failed" + log error $"Error: ($build_result.stderr)" + { service: $service, status: "failed", tag: $tag, error: $build_result.stderr } + } + } + ) + + # Summary + log info "" + log info "📊 Build Summary:" + let successful = ($build_results | where { $in.status == "success" } | length) + let failed = ($build_results | where { $in.status == "failed" } | length) + + $build_results | each { |result| + if $result.status == "success" { + log info $" ✅ ($result.service) → ($result.tag)" + } else { + log error $" ❌ ($result.service) - FAILED" + } + } + + log info "" + log info $" Total: ($successful) successful, ($failed) failed" + + if $failed > 0 { + error make { msg: "Some builds failed" } + } + + log info "✨ All images built successfully!" 
+}
diff --git a/scripts/check-malformed-fences.nu b/scripts/check-malformed-fences.nu
index feaf3c4..3cf38ee 100755
--- a/scripts/check-malformed-fences.nu
+++ b/scripts/check-malformed-fences.nu
@@ -23,6 +23,9 @@ def main [
         | where { |f| $f !~ "dist" }
         | where { |f| $f !~ ".coder" }
         | where { |f| $f !~ ".claude" }
+        | where { |f| $f !~ ".wrks" }
+        | where { |f| $f !~ ".vale" }
+        | where { |f| $f !~ ".aider" }
         | where { |f| $f !~ "old_config" }
     } else {
         $files
@@ -34,7 +37,7 @@ def main [
     for file in $md_files {
         $checked_count = $checked_count + 1
 
-        let content = open $file | lines
+        let content = open --raw $file | decode utf-8 | lines
         mut in_fence = false
         mut fence_lang = ""
         mut fence_start = 0
diff --git a/scripts/config-deploy.nu b/scripts/config-deploy.nu
new file mode 100644
index 0000000..b15fb56
--- /dev/null
+++ b/scripts/config-deploy.nu
@@ -0,0 +1,352 @@
+#!/usr/bin/env nu
+# Configuration Deployment Orchestrator
+# Orchestrates Nickel configuration + SOPS secrets workflow
+# Handles: config generation → secrets decryption → deployment
+# Usage: config-deploy <action> [options]
+
+use std log
+
+# Vault endpoint from VAULT_SERVICE_URL, falling back to the local default.
+# Nushell's `//` operator is floor division, so the fallback uses `default`.
+def get-vault-url [] {
+    $env.VAULT_SERVICE_URL? | default "http://localhost:9094"
+}
+
+# Vault token from VAULT_SERVICE_TOKEN, or an empty string when unset.
+def get-vault-token [] {
+    $env.VAULT_SERVICE_TOKEN? | default ""
+}
+
+# True when `tool` is on PATH and `<tool> --version` exits 0. Never raises,
+# so it serves both the hard checks below and the `status` report.
+def tool-available [tool: string] {
+    if (which $tool | is-empty) {
+        return false
+    }
+    (^$tool --version | complete).exit_code == 0
+}
+
+# Raise a descriptive error unless Nickel is usable; returns true otherwise.
+def check-nickel [] {
+    if not (tool-available "nickel") {
+        error make {
+            msg: "Nickel not found or not in PATH"
+            label: {
+                text: "Install Nickel or add to PATH: https://github.com/tweag/nickel"
+            }
+        }
+    }
+    true
+}
+
+# Raise a descriptive error unless SOPS is usable; returns true otherwise.
+def check-sops [] {
+    if not (tool-available "sops") {
+        error make {
+            msg: "SOPS not found or not in PATH"
+            label: {
+                text: "Install SOPS: https://github.com/mozilla/sops"
+            }
+        }
+    }
+    true
+}
+
+# Raise unless `environment` is one of dev, staging or prod.
+def validate-environment [environment: string] {
+    if $environment not-in ["dev", "staging", "prod"] {
+        error make {
+            msg: "Invalid environment"
+            label: {
+                text: "Must be: dev, staging, or prod"
+            }
+        }
+    }
+    true
+}
+
+# Decrypt secrets.<environment>.yaml.enc and return the decrypted file's path.
+# Returns "" after a warning when no encrypted file exists, so the result is
+# always a string that callers can pass on as a path.
+def decrypt-secrets [environment: string] {
+    let secrets_file = $"provisioning/config/secrets/secrets.($environment).yaml.enc"
+
+    if not ($secrets_file | path exists) {
+        print $"⚠️ Secrets file not found: ($secrets_file)"
+        return ""
+    }
+
+    print $"Decrypting secrets for ($environment)..."
+    let token = (get-vault-token)
+    if ($token | is-empty) {
+        error make {
+            msg: "VAULT_SERVICE_TOKEN required for secret decryption"
+            label: {
+                text: "Set environment variable: export VAULT_SERVICE_TOKEN=..."
+            }
+        }
+    }
+
+    let output_file = $"provisioning/config/secrets/secrets.($environment).yaml"
+    # A child `nu` process returns text, not a record, so success is read from
+    # the exit status captured by `complete`.
+    # NOTE(review): assumes secrets-decrypt.nu exits non-zero on failure - confirm.
+    let result = (
+        ^nu provisioning/scripts/secrets-decrypt.nu $secrets_file --environment $environment --output $output_file
+        | complete
+    )
+
+    if $result.exit_code != 0 {
+        error make {
+            msg: "Failed to decrypt secrets"
+            label: {
+                text: ($result.stderr | str trim)
+            }
+        }
+    }
+
+    print "✓ Secrets decrypted successfully"
+    $output_file
+}
+
+# Export the Nickel file at `config_path` as JSON and return the parsed value.
+# `environment` and `secrets_path` are part of the call signature but are not
+# read by the export itself.
+def generate-nickel-config [config_path: string, environment: string, secrets_path: string] {
+    print "Generating configuration from Nickel..."
+
+    # `nickel export` is the subcommand that serialises to JSON; honour the
+    # caller's --config-path instead of a hard-coded example file.
+    let result = (^nickel export --format json $config_path | complete)
+
+    if $result.exit_code != 0 {
+        error make {
+            msg: "Nickel evaluation failed"
+            label: {
+                text: $result.stderr
+            }
+        }
+    }
+
+    print "✓ Configuration generated"
+    $result.stdout | from json
+}
+
+# Raise unless `config` has the database, redis and api sections.
+def validate-config [config: record] {
+    print "Validating configuration..."
+
+    let required_keys = ["database", "redis", "api"]
+    let present = ($config | columns)
+    let missing = ($required_keys | where {|key| $key not-in $present })
+
+    if ($missing | is-not-empty) {
+        let missing_str = ($missing | str join ", ")
+        error make {
+            msg: "Configuration validation failed"
+            label: {
+                text: $"Missing required sections: ($missing_str)"
+            }
+        }
+    }
+
+    print "✓ Configuration validated"
+    true
+}
+
+# Write config.json, config.yaml and manifest.json into `output_dir` (created
+# when missing) and return `output_dir`.
+def create-deployment-package [config: record, environment: string, output_dir: string] {
+    print $"Creating deployment package for ($environment)..."
+
+    # Built-in mkdir creates parents and is a no-op for an existing directory.
+    mkdir $output_dir
+
+    # Write configuration as JSON
+    let config_file = $"($output_dir)/config.json"
+    $config | to json | save -f $config_file
+    print $"✓ Config: ($config_file)"
+
+    # Write configuration as YAML (for readability)
+    let yaml_file = $"($output_dir)/config.yaml"
+    $config | to yaml | save -f $yaml_file
+    print $"✓ Config: ($yaml_file)"
+
+    # Convert to UTC first: the format string hard-codes the Z suffix.
+    let timestamp = (date now | date to-timezone UTC | format date "%Y-%m-%dT%H:%M:%SZ")
+
+    # Create deployment manifest. Optional access (`?`) plus `default` keeps a
+    # section that lacks host/port from aborting the package.
+    let manifest = {
+        environment: $environment,
+        timestamp: $timestamp,
+        config_version: "1.0",
+        components: [
+            { name: "database", host: ($config.database.host? | default "unknown") },
+            { name: "redis", host: ($config.redis.host? | default "unknown") },
+            { name: "api", port: ($config.api.port? | default 0) },
+        ],
+    }
+
+    let manifest_file = $"($output_dir)/manifest.json"
+    $manifest | to json | save -f $manifest_file
+    print $"✓ Manifest: ($manifest_file)"
+
+    $output_dir
+}
+
+# Print each known environment with a marker showing whether its encrypted
+# secrets file exists.
+def print-environments [] {
+    print "Available environments:"
+    for env_name in ["dev", "staging", "prod"] {
+        let secrets_file = $"provisioning/config/secrets/secrets.($env_name).yaml.enc"
+        let exists = if ($secrets_file | path exists) { "✓" } else { "✗" }
+        print $" ($exists) ($env_name) - ($secrets_file)"
+    }
+}
+
+# Entry point: dispatch on `action`; the "help" arm lists every action.
+def main [
+    action: string = "help"
+    --environment: string = "dev"
+    --config-path: string = "provisioning/schemas/examples/deployment-with-secrets.ncl"
+    --output-dir: string = "deploy/output"
+    --clean
+] {
+    match $action {
+        "validate-tools" => {
+            print "Checking required tools..."
+            check-nickel
+            print "✓ Nickel: installed"
+            check-sops
+            print "✓ SOPS: installed"
+            print ""
+            print "All tools present"
+        },
+
+        "generate" => {
+            validate-environment $environment
+
+            print ""
+            print $"====== Generate Configuration for ($environment) ======"
+            print ""
+
+            print "Step 1: Checking tools..."
+            check-nickel
+            check-sops
+            print "✓ Tools validated"
+
+            print ""
+            print "Step 2: Decrypting secrets..."
+            let secrets_file = (decrypt-secrets $environment)
+
+            print ""
+            print "Step 3: Generating configuration..."
+            let config = (generate-nickel-config $config_path $environment $secrets_file)
+
+            print ""
+            print "Step 4: Validating configuration..."
+            validate-config $config
+
+            print ""
+            print "Step 5: Creating deployment package..."
+            let output = (create-deployment-package $config $environment $output_dir)
+
+            print ""
+            print $"====== Configuration Ready for Deployment ======"
+            print $"Output directory: ($output)"
+            print ""
+
+            # Only remove what was actually decrypted; the path is "" when no
+            # encrypted secrets existed for this environment.
+            if $clean and ($secrets_file | is-not-empty) {
+                print "Cleaning up decrypted secrets..."
+                rm -f $secrets_file
+                print "✓ Secrets cleaned up"
+            }
+        },
+
+        "validate-nickel" => {
+            print "Validating Nickel configuration..."
+
+            # `typecheck` is the Nickel CLI subcommand for static validation.
+            let result = (^nickel typecheck $config_path | complete)
+            if $result.exit_code == 0 {
+                print "✓ Nickel configuration valid"
+            } else {
+                error make {
+                    msg: "Nickel validation failed"
+                    label: {
+                        text: $result.stderr
+                    }
+                }
+            }
+        },
+
+        "list-environments" => {
+            print-environments
+        },
+
+        "status" => {
+            print ""
+            print "====== Deployment Configuration Status ======"
+            print ""
+
+            print "Tools:"
+            print (if (tool-available "nickel") { " ✓ Nickel" } else { " ✗ Nickel" })
+            print (if (tool-available "sops") { " ✓ SOPS" } else { " ✗ SOPS" })
+
+            print ""
+            print "Vault Service:"
+            print $" URL: (get-vault-url)"
+            let token_state = if (get-vault-token | is-empty) { "not set" } else { "set" }
+            print $" Token: ($token_state)"
+
+            print ""
+            print "Configuration:"
+            print $" Nickel config: ($config_path)"
+            print $" Output dir: ($output_dir)"
+
+            print ""
+            print "Secrets:"
+            print-environments
+
+            print ""
+        },
+
+        "help" => {
+            print "Configuration Deployment Orchestrator"
+            print ""
+            print "Combines Nickel infrastructure-as-code with SOPS secrets"
+            print ""
+            print "Usage: config-deploy <action> [--environment dev|staging|prod] [--config-path <path>] [--output-dir <dir>]"
+            print ""
+            print "Actions:"
+            print " validate-tools - Check Nickel and SOPS installation"
+            print " validate-nickel - Validate Nickel configuration syntax"
+            print " list-environments - List available secret environments"
+            print " status - Show deployment configuration status"
+            print " generate - Full workflow: decrypt → generate → validate → package"
+            print " help - Show this help message"
+            print ""
+            print "Examples:"
+            print " config-deploy validate-tools"
+            print " config-deploy status"
+            print " config-deploy generate --environment prod --clean"
+            print " config-deploy generate --environment dev --output-dir ./deploy/dev"
+            print ""
+            print "Environment Variables:"
+            print " VAULT_SERVICE_URL - Vault endpoint (default: http://localhost:9094)"
+            print " VAULT_SERVICE_TOKEN - Vault authentication token (required for decrypt)"
+            print " PROVISIONING_ENV - Environment (dev, staging, prod)"
+            print ""
+        },
+
+        _ => {
+            print $"Unknown action: ($action)"
+            print "Run 'config-deploy help' for available commands"
+            exit 1
+        }
+    }
+}
diff --git a/scripts/deploy-librecloud-hetzner.nu b/scripts/deploy-librecloud-hetzner.nu
new file mode 100755
index 0000000..132ac6e
--- /dev/null
+++ b/scripts/deploy-librecloud-hetzner.nu
@@ -0,0 +1,622 @@
+#!/usr/bin/env nu
+# Deploy LibreCloud Kubernetes cluster to Hetzner Cloud + NixOS
+# Complete infrastructure orchestration in 8 phases
+
+use std log
+
+def result_ok [value: any] { {ok: $value, err: null} }
+def result_err [message: string] { {ok: null, err: $message} }
+def is_ok [result: record] { $result.err == null }
+
+# ============================================================================
+# PHASE 1: Environment Validation
+# ============================================================================
+
+def validate_environment [] {
+    log info "Phase 1: Validating environment and prerequisites..."
+
+    let checks = [
+        {tool: "nickel", msg: "Nickel not found"}
+        {tool: "nix", msg: "Nix not found"}
+        {tool: "nixos-anywhere", msg: "nixos-anywhere not found"}
+        {tool: "hcloud", msg: "hcloud CLI not found"}
+        {tool: "ssh", msg: "ssh not found"}
+        {tool: "curl", msg: "curl not found"}
+    ]
+
+    for check in $checks {
+        if (which $check.tool | is-empty) {
+            return (result_err $check.msg)
+        }
+    }
+
+    let workspace_dir = "workspaces/librecloud_hetzner"
+    if not ($workspace_dir | path exists) {
+        return (result_err $"Workspace not found: ($workspace_dir)")
+    }
+
+    let main_ncl = $"($workspace_dir)/infra/main/main.ncl"
+    if not ($main_ncl | path exists) {
+        return (result_err $"Main Nickel config missing: ($main_ncl)")
+    }
+
+    let servers_ncl = $"($workspace_dir)/infra/main/servers.ncl"
+    if not ($servers_ncl | path exists) {
+        return (result_err $"Servers config missing: ($servers_ncl)")
+    }
+
+    log info "✓ Environment validated"
+    result_ok {workspace_dir: $workspace_dir}
+}
+
+# ============================================================================
+# SHARED HELPERS
+# ============================================================================
+
+# Hostnames whose flakes are generated, validated and reported on.
+def known_hostnames [] {
+    ["wuji-cp-0", "wuji-strg-0", "wuji-wrkr-0", "sgoyol-0"]
+}
+
+# True when record `rec` has a top-level field named `key`.
+def has_key [rec: record, key: string] {
+    $key in ($rec | columns)
+}
+
+# Return `value` unchanged when it is a list/table, otherwise wrap it in a list.
+def as_list [value: any] {
+    let kind = ($value | describe)
+    if ($kind | str starts-with "list") or ($kind | str starts-with "table") {
+        $value
+    } else {
+        [$value]
+    }
+}
+
+# True when HCLOUD_TOKEN is unset or empty.
+def hcloud_token_missing [] {
+    $env.HCLOUD_TOKEN? | is-empty
+}
+
+# SSH options shared by every phase: short connect timeout, and no host-key
+# persistence because the host key changes when NixOS replaces the image.
+def ssh_options [] {
+    [
+        "-o" "ConnectTimeout=5"
+        "-o" "StrictHostKeyChecking=accept-new"
+        "-o" "UserKnownHostsFile=/dev/null"
+    ]
+}
+
+# Build the `root@<ip>` destination. A bareword such as root@$ip is passed to
+# external commands literally, so the address has to be interpolated.
+def ssh_target [ip: any] {
+    $"root@($ip)"
+}
+
+# ============================================================================
+# PHASE 2: Export Nickel Configuration
+# ============================================================================
+
+# Export main.ncl to JSON and return it as a result record. Fails when the
+# export errors, is empty, or lacks infrastructure.servers.
+def export_nickel_config [workspace_dir: string] {
+    log info "Phase 2: Exporting Nickel infrastructure configuration..."
+
+    try {
+        let main_ncl = $"($workspace_dir)/infra/main/main.ncl"
+        let export_result = (^nickel export $main_ncl | complete)
+
+        if $export_result.exit_code != 0 {
+            return (result_err $"Nickel export failed: ($export_result.stderr)")
+        }
+
+        let config = ($export_result.stdout | from json)
+        if ($config | is-empty) {
+            return (result_err "Nickel export produced empty configuration")
+        }
+        if not (has_key $config "infrastructure") {
+            return (result_err "Configuration missing 'infrastructure' key")
+        }
+
+        let infra = $config.infrastructure
+        if not (has_key $infra "servers") {
+            return (result_err "Configuration missing 'servers' list")
+        }
+
+        log info $"✓ Exported Nickel config with ($infra.servers | length) servers"
+        result_ok $config
+    } catch { |err|
+        result_err $"Nickel export exception: ($err.msg)"
+    }
+}
+
+# ============================================================================
+# PHASE 3: Generate NixOS Flakes
+# ============================================================================
+
+# Run the flake generator on servers.ncl and confirm a flake.nix exists for
+# every known hostname.
+def generate_nixos_flakes [workspace_dir: string] {
+    log info "Phase 3: Generating NixOS flakes from Nickel configuration..."
+
+    let script = "provisioning/scripts/nixos/generate-hetzner-nixos-flake.nu"
+    if not ($script | path exists) {
+        return (result_err $"Flake generation script not found: ($script)")
+    }
+
+    try {
+        let servers_ncl = $"($workspace_dir)/infra/main/servers.ncl"
+        let output_dir = $"($workspace_dir)/nixos"
+
+        let gen_result = (^nu $script $servers_ncl --output-dir $output_dir | complete)
+        if $gen_result.exit_code != 0 {
+            return (result_err $"Flake generation failed: ($gen_result.stderr)")
+        }
+
+        # Verify all flakes were created
+        let all_exist = (known_hostnames | all { |h|
+            ($"($output_dir)/($h)/flake.nix" | path exists)
+        })
+
+        if not $all_exist {
+            return (result_err "Flake generation incomplete")
+        }
+
+        log info "✓ Generated NixOS flakes for all servers"
+        result_ok "OK"
+    } catch { |err|
+        result_err $"Flake generation exception: ($err.msg)"
+    }
+}
+
+# ============================================================================
+# PHASE 4: Validate NixOS Flakes
+# ============================================================================
+
+# Check one host's flake with `nix flake show`; returns {hostname, valid, error}.
+def validate_one_flake [workspace_dir: string, hostname: string] {
+    let flake_path = $"($workspace_dir)/nixos/($hostname)"
+
+    if not ($flake_path | path exists) {
+        return {hostname: $hostname, valid: false, error: "Directory not found"}
+    }
+
+    if not ($"($flake_path)/flake.nix" | path exists) {
+        return {hostname: $hostname, valid: false, error: "flake.nix not found"}
+    }
+
+    try {
+        let show_result = (^nix flake show $flake_path | complete)
+        if $show_result.exit_code == 0 {
+            {hostname: $hostname, valid: true, error: null}
+        } else {
+            {hostname: $hostname, valid: false, error: $show_result.stderr}
+        }
+    } catch { |err|
+        {hostname: $hostname, valid: false, error: $"Exception: ($err.msg)"}
+    }
+}
+
+# Validate every known host's flake; logs each failure and returns a result
+# record carrying the per-host outcomes.
+def validate_nixos_flakes [workspace_dir: string] {
+    log info "Phase 4: Validating NixOS flakes..."
+
+    mut validation_results = []
+    for h in (known_hostnames) {
+        let result = (validate_one_flake $workspace_dir $h)
+        $validation_results = ($validation_results | append $result)
+    }
+
+    let invalid = ($validation_results | where { |r| not $r.valid })
+    if ($invalid | is-not-empty) {
+        log error "Flake validation failed:"
+        for item in $invalid {
+            log error $" • ($item.hostname): ($item.error)"
+        }
+        return (result_err "Flake validation failed")
+    }
+
+    log info "✓ All NixOS flakes validated"
+    result_ok $validation_results
+}
+
+# ============================================================================
+# PHASE 5: Create Hetzner Infrastructure
+# ============================================================================
+
+# Find or create one Hetzner server described by `server` (hostname,
+# server_type, location, networking.private_ip). Returns a result record whose
+# ok value is {hostname, server_id, public_ip, private_ip, status}.
+def ensure_server [server: record] {
+    let hostname = $server.hostname
+    let server_type = $server.server_type
+    let location = $server.location
+
+    # Parentheses meant as text must be escaped inside $"..." or they are
+    # evaluated as an expression.
+    log info $" Creating server: ($hostname) \(($server_type) in ($location)\)..."
+
+    try {
+        let existing = (as_list (^hcloud server list --output json | from json))
+        let matches = ($existing | where { |s| $s.name == $hostname })
+
+        if ($matches | is-not-empty) {
+            log debug $" ✓ Server already exists"
+            let found = ($matches | first)
+            result_ok {
+                hostname: $hostname,
+                server_id: $found.id,
+                public_ip: $found.public_net?.ipv4?.ip?,
+                private_ip: $server.networking.private_ip,
+                status: $found.status
+            }
+        } else {
+            # Nushell has no backslash line continuation, so the arguments are
+            # collected in a list and spread into the call.
+            let create_args = [
+                "--type" $server_type
+                "--location" $location
+                "--image" "ubuntu-24.04"
+                "--network" "librecloud-private"
+                "--name" $hostname
+            ]
+            let create_result = (^hcloud server create ...$create_args | complete)
+
+            if $create_result.exit_code != 0 {
+                result_err $"Failed to create server ($hostname): ($create_result.stderr)"
+            } else {
+                let server_info = (^hcloud server describe $hostname --output json | from json)
+                log info $" ✓ Server created \(ID: ($server_info.id)\)"
+
+                result_ok {
+                    hostname: $hostname,
+                    server_id: $server_info.id,
+                    public_ip: $server_info.public_net.ipv4.ip,
+                    private_ip: $server.networking.private_ip,
+                    status: $server_info.status
+                }
+            }
+        }
+    } catch { |err|
+        result_err $"Server creation error for ($hostname): ($err.msg)"
+    }
+}
+
+# Ensure the private network exists, then find or create every server listed
+# in infra_config.infrastructure.servers. Returns a result record with the
+# list of server descriptions, or the first error.
+def create_hetzner_servers [infra_config: record] {
+    log info "Phase 5: Creating Hetzner Cloud infrastructure..."
+
+    if (hcloud_token_missing) {
+        log warning " HCLOUD_TOKEN not set. Cannot create servers."
+        return (result_err "HCLOUD_TOKEN environment variable not set")
+    }
+
+    # Ensure private network exists
+    log info " Checking private network: librecloud-private..."
+    try {
+        let networks = (as_list (^hcloud network list --output json | from json))
+        let exists = ($networks | any { |n| $n.name == "librecloud-private" })
+
+        if not $exists {
+            log info " Creating private network: 10.11.0.0/16..."
+            let create_result = (^hcloud network create --name "librecloud-private" --ip-range "10.11.0.0/16" | complete)
+            if $create_result.exit_code != 0 {
+                return (result_err $"Network creation failed: ($create_result.stderr)")
+            }
+        }
+    } catch { |err|
+        return (result_err $"Network error: ($err.msg)")
+    }
+
+    # Create servers. The accumulator must be `mut`: re-binding it with `let`
+    # inside the loop only shadows it and leaves this list empty.
+    let servers_config = $infra_config.infrastructure.servers
+    mut servers_created = []
+
+    for server in $servers_config {
+        let outcome = (ensure_server $server)
+        if not (is_ok $outcome) {
+            return $outcome
+        }
+        $servers_created = ($servers_created | append $outcome.ok)
+    }
+
+    if ($servers_created | is-empty) {
+        return (result_err "No servers were created or found")
+    }
+
+    log info $"✓ Infrastructure ready: ($servers_created | length) servers"
+    result_ok $servers_created
+}
+
+# ============================================================================
+# PHASE 6: Setup SSH Connectivity
+# ============================================================================
+
+# Wait for SSH on each server (30 attempts, 10 s apart) and set its hostname.
+# Unreachable servers are reported and skipped; the phase is best-effort and
+# always returns an ok result.
+def setup_ssh_connectivity [servers_info: list] {
+    log info "Phase 6: Setting up SSH connectivity..."
+
+    let max_attempts = 30
+    let opts = (ssh_options)
+
+    for server in $servers_info {
+        let hostname = $server.hostname
+        let target = (ssh_target $server.public_ip)
+
+        log info $" Configuring SSH for ($hostname)..."
+
+        # `complete` captures a failing exit status without raising, so the
+        # wait loop needs neither try/catch nor a break inside it.
+        mut reachable = false
+        mut attempts = 0
+        while (not $reachable) and ($attempts < $max_attempts) {
+            let check = (^ssh ...$opts $target "echo OK" | complete)
+            $reachable = ($check.exit_code == 0)
+            $attempts = $attempts + 1
+            if (not $reachable) and ($attempts < $max_attempts) {
+                sleep 10sec
+            }
+        }
+
+        if not $reachable {
+            log warning $" ⚠ SSH not reachable on ($hostname) after ($max_attempts) attempts"
+            continue
+        }
+
+        # Set hostname
+        let set_hostname = (^ssh ...$opts $target $"hostnamectl set-hostname ($hostname)" | complete)
+        if $set_hostname.exit_code == 0 {
+            log debug $" ✓ Hostname set"
+        } else {
+            log warning $" ⚠ Could not set hostname: ($set_hostname.stderr | str trim)"
+        }
+    }
+
+    log info "✓ SSH connectivity configured"
+    result_ok null
+}
+
+# ============================================================================
+# PHASE 7: Deploy NixOS via nixos-anywhere
+# ============================================================================
+
+# Install NixOS on one server from its generated flake; returns
+# {hostname, success, error}.
+def deploy_one_server [workspace_dir: string, server: record] {
+    let hostname = $server.hostname
+    let flake_path = $"($workspace_dir)/nixos/($hostname)"
+
+    log info $" Deploying NixOS to ($hostname)..."
+
+    if not ($flake_path | path exists) {
+        return {hostname: $hostname, success: false, error: "Flake not found"}
+    }
+
+    try {
+        let deploy_result = (
+            ^nixos-anywhere --flake $"($flake_path)#($hostname)" (ssh_target $server.public_ip)
+            | complete
+        )
+
+        if $deploy_result.exit_code == 0 {
+            log info $" ✓ NixOS deployment complete"
+            {hostname: $hostname, success: true, error: null}
+        } else {
+            log warning $" ✗ Deployment failed"
+            {hostname: $hostname, success: false, error: $deploy_result.stderr}
+        }
+    } catch { |err|
+        log warning $" ✗ Deployment exception: ($err.msg)"
+        {hostname: $hostname, success: false, error: $err.msg}
+    }
+}
+
+# Deploy every server in turn. Always returns an ok result carrying the
+# per-server outcomes; failures are logged as warnings.
+def deploy_nixos [workspace_dir: string, servers_info: list] {
+    log info "Phase 7: Deploying NixOS via nixos-anywhere..."
+
+    mut deployments = []
+    for s in $servers_info {
+        let result = (deploy_one_server $workspace_dir $s)
+        $deployments = ($deployments | append $result)
+    }
+
+    let failed = ($deployments | where { |d| not $d.success })
+    if ($failed | is-empty) {
+        log info "✓ All servers deployed successfully"
+    } else {
+        log warning $"⚠ Some deployments failed: ($failed | length) of ($deployments | length)"
+    }
+    result_ok $deployments
+}
+
+# ============================================================================
+# PHASE 8: Post-Deployment Validation
+# ============================================================================
+
+# Probe each server over SSH and record whether it answered and whether
+# /etc/os-release identifies NixOS.
+def validate_deployment [servers_info: list] {
+    log info "Phase 8: Validating post-deployment status..."
+
+    let opts = (ssh_options)
+    mut validation_results = []
+    for server in $servers_info {
+        let hostname = $server.hostname
+        let public_ip = $server.public_ip
+
+        log info $" Validating ($hostname)..."
+
+        # Connect by IP: the cloud hostname is not guaranteed to resolve from
+        # the machine running this script.
+        let os_check = (^ssh ...$opts (ssh_target $public_ip) "head -1 /etc/os-release" | complete)
+        let is_reachable = ($os_check.exit_code == 0)
+        let is_nixos = ($is_reachable and ($os_check.stdout | str contains "NixOS"))
+        if $is_nixos {
+            log debug $" ✓ NixOS confirmed"
+        }
+
+        let result = {
+            hostname: $hostname,
+            public_ip: $public_ip,
+            is_nixos: $is_nixos,
+            is_reachable: $is_reachable
+        }
+        $validation_results = ($validation_results | append $result)
+    }
+
+    log info "✓ Post-deployment validation complete"
+    result_ok $validation_results
+}
+
+# ============================================================================
+# CLEANUP: Destroy Infrastructure
+# ============================================================================
+
+# Delete every server in `servers_info` (records with a `hostname` field).
+# Returns an ok result listing {hostname, success} per server.
+def destroy_infrastructure [servers_info: list] {
+    log info "Phase X: Destroying Hetzner infrastructure (cleanup)..."
+
+    if (hcloud_token_missing) {
+        return (result_err "HCLOUD_TOKEN not set")
+    }
+
+    mut destroyed = []
+    for server in $servers_info {
+        let hostname = $server.hostname
+        log info $" Deleting server: ($hostname)..."
+
+        let del_result = (try {
+            ^hcloud server delete $hostname --force | complete
+        } catch {
+            {exit_code: 1, stderr: $"Exception"}
+        })
+
+        let deleted = ($del_result.exit_code == 0)
+        if $deleted {
+            log debug $" ✓ Server deleted"
+        } else {
+            log warning $" ⚠ Failed to delete"
+        }
+        $destroyed = ($destroyed | append {hostname: $hostname, success: $deleted})
+    }
+
+    log info $"✓ Cleanup complete"
+    result_ok $destroyed
+}
+
+# ============================================================================
+# UTILITY: Status Check
+# ============================================================================
+
+# Report which known hosts already have a generated flake.nix.
+def show_deployment_status [workspace_dir: string] {
+    log info "Checking deployment status..."
+
+    mut flakes_status = []
+    for h in (known_hostnames) {
+        let flake_path = $"($workspace_dir)/nixos/($h)/flake.nix"
+        let result = {hostname: $h, flake_exists: ($flake_path | path exists)}
+        $flakes_status = ($flakes_status | append $result)
+    }
+
+    log info ""
+    for status in $flakes_status {
+        if $status.flake_exists {
+            log info $" ✓ ($status.hostname): flake generated"
+        } else {
+            log info $" ✗ ($status.hostname): flake missing"
+        }
+    }
+
+    let all_exist = ($flakes_status | all { |s| $s.flake_exists })
+    log info ""
+    if $all_exist {
+        log info "✓ All flakes ready. Deploy with: HCLOUD_TOKEN=xxx nu provisioning/scripts/deploy-librecloud-hetzner.nu"
+    } else {
+        log info "⚠ Some flakes missing. Run: nu provisioning/scripts/deploy-librecloud-hetzner.nu --generate-only"
+    }
+}
+
+# ============================================================================
+# SUMMARY
+# ============================================================================
+
+# Print the final report: server addresses, per-server deployment outcome
+# (when `deployment_results` has a `deployments` field) and follow-up commands.
+def show_deployment_summary [servers_info: list, deployment_results: record] {
+    log info ""
+    log info "=========================================="
+    log info "LibreCloud Hetzner Deployment Complete"
+    log info "=========================================="
+    log info ""
+    log info "Servers:"
+    for server in $servers_info {
+        log info $" • ($server.hostname): ($server.public_ip) \(private: ($server.private_ip)\)"
+    }
+    log info ""
+    if (has_key $deployment_results "deployments") {
+        log info "Deployment Status:"
+        for deploy in $deployment_results.deployments {
+            if $deploy.success {
+                log info $" ✓ ($deploy.hostname): deployed"
+            } else {
+                log info $" ✗ ($deploy.hostname): failed"
+            }
+        }
+        log info ""
+    }
+    log info "Next Steps:"
+    log info " 1. Verify cluster: kubectl --kubeconfig=<kubeconfig> get nodes"
+    log info " 2. Check node status: ssh root@<server-ip> systemctl status kubelet"
+    log info " 3. View logs: ssh root@<server-ip> journalctl -u kubelet -f"
+    log info ""
+    log info "Documentation: workspaces/librecloud_hetzner/README.md"
+    log info "=========================================="
+}
+
+# ============================================================================
+# MAIN
+# ============================================================================
+
+# Run the deployment phases in order. --status, --generate-only and --dry-run
+# stop before any cloud change; --destroy deletes the configured servers
+# instead of creating them.
+def main [
+    --dry-run
+    --generate-only
+    --destroy
+    --status
+] {
+    # Phase 1: Validate environment
+    let env_check = (validate_environment)
+    if not (is_ok $env_check) {
+        log error $env_check.err
+        exit 1
+    }
+
+    let workspace_dir = $env_check.ok.workspace_dir
+
+    # If --status flag, show status and exit
+    if $status {
+        show_deployment_status $workspace_dir
+        exit 0
+    }
+
+    # Phase 2: Export Nickel configuration
+    let nickel_config = (export_nickel_config $workspace_dir)
+    if not (is_ok $nickel_config) {
+        log error $nickel_config.err
+        exit 1
+    }
+
+    # Phase 3: Generate NixOS flakes
+    let flakes_result = (generate_nixos_flakes $workspace_dir)
+    if not (is_ok $flakes_result) {
+        log error $flakes_result.err
+        exit 1
+    }
+
+    # Phase 4: Validate flakes
+    let validation_result = (validate_nixos_flakes $workspace_dir)
+    if not (is_ok $validation_result) {
+        log error $validation_result.err
+        exit 1
+    }
+
+    # If --generate-only, stop here
+    if $generate_only {
+        log info "✓ NixOS flakes generated and validated"
+        exit 0
+    }
+
+    # If --dry-run, stop here
+    if $dry_run {
+        log info "✓ Dry-run completed (no infrastructure changes)"
+        exit 0
+    }
+
+    # If --destroy flag, clean up the servers named in the configuration.
+    # This replaces Phase 5 so that a cleanup never provisions servers first.
+    if $destroy {
+        let cleanup_result = (destroy_infrastructure $nickel_config.ok.infrastructure.servers)
+        if not (is_ok $cleanup_result) {
+            log warning $cleanup_result.err
+            exit 1
+        }
+        exit 0
+    }
+
+    # Phase 5: Create Hetzner infrastructure
+    let infra_result = (create_hetzner_servers $nickel_config.ok)
+    if not (is_ok $infra_result) {
+        log error $infra_result.err
+        exit 1
+    }
+
+    let servers_info = $infra_result.ok
+
+    # Phase 6: Setup SSH connectivity
+    let ssh_result = (setup_ssh_connectivity $servers_info)
+    if not (is_ok $ssh_result) {
+        log warning $ssh_result.err
+    }
+
+    # Phase 7: Deploy NixOS
+    let deploy_result = (deploy_nixos $workspace_dir $servers_info)
+    if not (is_ok $deploy_result) {
+        log warning $deploy_result.err
+    }
+
+    let deployments = $deploy_result.ok
+
+    # Phase 8: Validate deployment
+    let validation = (validate_deployment $servers_info)
+    if not (is_ok $validation) {
+        log warning $validation.err
+    }
+
+    # Show final summary
+    show_deployment_summary $servers_info {
+        deployments: $deployments,
+        validation: $validation.ok
+    }
+
+    log info ""
+    log info "Deployment complete! IPs for reference:"
+    for server in $servers_info {
+        log info $" ($server.hostname) = ($server.public_ip)"
+    }
+}
+
+# No explicit `main` call here: `nu <script>` invokes `main` with the
+# command-line flags by itself, and a bare call would first run an extra,
+# flag-less deployment.
diff --git a/scripts/docker-build.nu b/scripts/docker-build.nu
new file mode 100755
index 0000000..456ef2a
--- /dev/null
+++ b/scripts/docker-build.nu
@@ -0,0 +1,335 @@
+#!/usr/bin/env nu
+# Docker Build Execution Script
+# Builds Docker images with BuildKit caching and cargo-chef optimization
+#
+# Usage:
+#   docker-build.nu extension-registry --mode solo
+#   docker-build.nu orchestrator --mode cicd --push
+#   docker-build.nu --all --mode cicd
+#
+# Environment:
+#   PROVISIONING_ROOT - Optional: override project root detection
+#
+# Patterns:
+#   - Pipeline let binding
+#   - Result pattern (NO try-catch)
+#   - Memory management with unlet for large datasets
+
+# Search up directory tree for provisioning root (helper)
+def search-up-for-provisioning [dir: string]: nothing -> string {
+    if ($"($dir)/provisioning/schemas/platform" | path exists) {
+        $"($dir)/provisioning"
+    } else {
+        let parent = ($dir | path dirname)
+        if $parent == $dir {
+            ""
+        } else {
+            search-up-for-provisioning $parent
+        }
+    }
+}
+
+# Detect provisioning project root
+def get-provisioning-root []: nothing -> string {
+    # 1. Check environment variable
+    if ($env.PROVISIONING_ROOT?
!= null) { + return $env.PROVISIONING_ROOT + } + + # 2. Check if we're in provisioning/ directory + let cwd = (pwd) + if ($cwd | path basename) == "provisioning" { + # We're inside provisioning/, use current dir + if ("schemas/platform" | path exists) { + return "." + } + } + + # 3. Check if provisioning/ exists as subdirectory + if ("provisioning/schemas/platform" | path exists) { + return "provisioning" + } + + # 4. Search up the directory tree + let found = (search-up-for-provisioning $cwd) + if $found != "" { + return $found + } + + # 5. Fallback to "provisioning" (will fail with clear error) + "provisioning" +} + +# Import generator script +use docker-generate-builds.nu + +# Valid service names (from generator) +const VALID_SERVICES = [ + "orchestrator" + "control-center" + "extension-registry" + "mcp-server" + "provisioning-daemon" + "ai-service" + "rag" + "vault-service" +] + +# Valid deployment modes +const VALID_MODES = ["solo", "cicd", "enterprise"] + +# Service name to package directory mapping +const SERVICE_TO_DIR = { + orchestrator: "orchestrator", + control-center: "control-center", + extension-registry: "extension-registry", + mcp-server: "mcp-server", + provisioning-daemon: "daemon", + ai-service: "ai-service", + rag: "rag", + vault-service: "vault-service", +} + +# Service name to Nickel schema key mapping +const SERVICE_TO_SCHEMA_KEY = { + orchestrator: "orchestrator", + control-center: "control_center", + extension-registry: "extension_registry", + mcp-server: "mcp_server", + provisioning-daemon: "provisioning_daemon", + ai-service: "ai_service", + rag: "rag", + vault-service: "vault", +} + +# Read build config from Nickel for a service +def get-build-config [ + service: string, + mode: string, +]: nothing -> record { + let prov_root = (get-provisioning-root) + let schema_key = $SERVICE_TO_SCHEMA_KEY | get $service + let defaults_file = $"($prov_root)/schemas/platform/defaults/($service)-defaults.ncl" + + # Export build config as JSON + let 
# Build container images for one or more platform services.
#
# For each requested service: regenerate its Dockerfile with
# docker-generate-builds.nu, read its BuildKit settings via get-build-config,
# run `docker buildx build`, and push the image when --push is given.
# A failure in any stage is recorded for that service and the remaining
# services are still attempted.
#
# Returns one record per service: {service, ok, stage, error, duration},
# where stage is "generate", "build", "push" or "complete" and duration is
# the buildx wall-clock time in seconds (0 when generation failed).
# Raises an error when no service is selected or --mode is not in VALID_MODES.
def main [
    ...services: string,                     # Service names to build (or use --all)
    --all,                                   # Build all services
    --mode: string = "solo",                 # Deployment mode
    --push,                                  # Push to registry after build
    --no-cache,                              # Disable BuildKit cache
    --registry: string = "localhost:5000"    # Container registry for cache/push
]: nothing -> table {
    # Determine services to build
    let services_to_build = if $all {
        $VALID_SERVICES
    } else if ($services | is-empty) {
        error make {
            msg: "No services specified. Use service names or --all"
        }
    } else {
        $services
    }

    # Guard: Validate mode
    if not ($mode in $VALID_MODES) {
        error make {
            msg: $"Invalid mode: ($mode). Valid: ($VALID_MODES | str join ', ')"
        }
    }

    # Build each service; `return` inside the closure ends only that service's iteration
    let results = ($services_to_build | each {|service|
        print $"Building ($service) with mode ($mode)..."

        # Step 1: Generate Dockerfile
        print " → Generating Dockerfile from Nickel template..."
        let prov_root = (get-provisioning-root)
        let gen_script = $"($prov_root)/scripts/docker-generate-builds.nu"

        # The generator is a separate script. Calling `nu <script>` directly only
        # yields rendered text, so source it in a child shell and have that shell
        # serialise the {ok, err, path} record as JSON.
        let gen_cmd = $"source ($gen_script | to json); main ($service | to json) --mode ($mode | to json) | to json"
        let gen_run = (^nu --commands $gen_cmd | complete)
        let gen_result = if $gen_run.exit_code == 0 {
            $gen_run.stdout | from json
        } else {
            {ok: false, err: ($gen_run.stderr | str trim), path: ""}
        }

        if not $gen_result.ok {
            print $" ✗ Generation failed: ($gen_result.err)"
            return {
                service: $service,
                ok: false,
                stage: "generate",
                error: $gen_result.err,
                duration: 0
            }
        }

        print $" ✓ Generated: ($gen_result.path)"

        # Step 2: Read build config (only the BuildKit settings are needed here)
        let build_config = (get-build-config $service $mode)
        let cache_mode = $build_config.buildkit.cache_mode
        let parallel_jobs = $build_config.buildkit.parallel_jobs

        # Step 3: Prepare build context
        let target_dir = $SERVICE_TO_DIR | get $service
        let dockerfile_path = $"($prov_root)/platform/crates/($target_dir)/Dockerfile"
        let build_context = $"($prov_root)/platform"
        let image_tag = if $push {
            $"($registry)/provisioning-($service):latest"
        } else {
            $"provisioning-($service):latest"
        }

        # Step 4: Construct cache flags; unknown cache modes fall back to no cache flags
        let cache_args = if $no_cache {
            []
        } else {
            match $cache_mode {
                "local" => [
                    "--cache-from" $"type=local,src=/tmp/buildkit-cache/($service)",
                    "--cache-to" $"type=local,dest=/tmp/buildkit-cache/($service),mode=max"
                ],
                "registry" => [
                    "--cache-from" $"type=registry,ref=($registry)/cache:($service)",
                    "--cache-to" $"type=registry,ref=($registry)/cache:($service),mode=max"
                ],
                "inline" => [
                    "--cache-from" $"type=registry,ref=($image_tag)",
                    "--cache-to" "type=inline"
                ],
                _ => []
            }
        }

        # Step 5: Execute docker build
        print $" → Building Docker image: ($image_tag)"
        print $" Cache mode: ($cache_mode), Parallel jobs: ($parallel_jobs)"

        let start_time = (date now)
        let docker_result = (^docker buildx build --file $dockerfile_path --tag $image_tag ...$cache_args --build-arg $"CARGO_BUILD_JOBS=($parallel_jobs)" --progress plain $build_context | complete)
        let end_time = (date now)
        # Duration converts to nanoseconds; report seconds
        let duration = ($end_time - $start_time | into int) / 1_000_000_000

        if $docker_result.exit_code != 0 {
            print $" ✗ Build failed after ($duration)s"
            print $" Error: ($docker_result.stderr)"
            return {
                service: $service,
                ok: false,
                stage: "build",
                error: $docker_result.stderr,
                duration: $duration
            }
        }

        print $" ✓ Build completed in ($duration)s"

        # Step 6: Push to registry if requested
        if $push {
            print $" → Pushing to registry: ($registry)"

            let push_result = (^docker push $image_tag | complete)

            if $push_result.exit_code != 0 {
                print $" ✗ Push failed"
                return {
                    service: $service,
                    ok: false,
                    stage: "push",
                    error: $push_result.stderr,
                    duration: $duration
                }
            }

            print $" ✓ Pushed successfully"
        }

        # Return success
        {
            service: $service,
            ok: true,
            stage: "complete",
            error: "",
            duration: $duration
        }
    })

    # Display summary
    print ""
    print "Build Summary:"
    print "=============="

    let successful = ($results | where ok == true | length)
    let failed = ($results | where ok == false | length)
    let total_duration = ($results | get duration | math sum)

    print $" Total: ($results | length) services"
    print $" ✓ Successful: ($successful)"
    print $" ✗ Failed: ($failed)"
    print $" Total time: ($total_duration)s"

    $results
}
# Walk from `dir` towards the filesystem root looking for a directory that
# contains provisioning/schemas/platform.
# Returns "<match>/provisioning" for the nearest match (starting with `dir`
# itself), or "" when a directory is reached whose dirname is itself.
def search-up-for-provisioning [dir: string]: nothing -> string {
    mut current = $dir
    mut found = ""
    mut searching = true

    while $searching {
        if ($"($current)/provisioning/schemas/platform" | path exists) {
            $found = $"($current)/provisioning"
            $searching = false
        } else {
            let parent = ($current | path dirname)
            if $parent == $current {
                # dirname no longer changes the path: nothing above to inspect
                $searching = false
            } else {
                $current = $parent
            }
        }
    }

    $found
}
# Render the Dockerfile for one service from the shared Nickel template.
#
# Validates `service` and `--mode`, locates the service defaults and the
# Dockerfile.chef.ncl template under the provisioning root, evaluates
# `template defaults.<schema_key>.build` with `nickel export --format raw`,
# and writes the result to platform/crates/<dir>/Dockerfile.
#
# Returns {ok, err, path}: on success `path` is the written file and `err`
# is ""; on any failure `ok` is false, `err` explains why and `path` is "".
# NOTE(review): --mode is validated but not referenced when building the
# Nickel expression below — confirm whether it should select a profile.
def main [
    service: string,           # Service name (e.g., "orchestrator", "extension-registry")
    --mode: string = "solo",   # Deployment mode (solo, cicd, enterprise)
]: nothing -> record {
    # All failures share one record shape; only the message differs
    let failure = {|message: string| {ok: false, err: $message, path: ""} }

    # Guard: service name
    if $service not-in $VALID_SERVICES {
        return (do $failure $"Invalid service: ($service). Valid: ($VALID_SERVICES | str join ', ')")
    }

    # Guard: deployment mode
    if $mode not-in $VALID_MODES {
        return (do $failure $"Invalid mode: ($mode). Valid: ($VALID_MODES | str join ', ')")
    }

    # Per-service lookups: Nickel schema key and crate directory
    let schema_key = ($SERVICE_TO_SCHEMA_KEY | get $service)
    let crate_dir = ($SERVICE_TO_DIR | get $service)

    # Input and output locations, all relative to the detected root
    let root = (get-provisioning-root)
    let schemas = $"($root)/schemas/platform"
    let defaults_file = $"($schemas)/defaults/($service)-defaults.ncl"
    let template_file = $"($schemas)/templates/docker/Dockerfile.chef.ncl"
    let output_file = $"($root)/platform/crates/($crate_dir)/Dockerfile"

    # Guard: both Nickel inputs must exist
    if not ($defaults_file | path exists) {
        return (do $failure $"Defaults file not found: ($defaults_file)")
    }
    if not ($template_file | path exists) {
        return (do $failure $"Template file not found: ($template_file)")
    }

    # Nickel program fed to `nickel export` on stdin; the empty first and last
    # items give the text its leading and trailing newline
    let nickel_program = ([
        ""
        $'let template = import "($template_file)" in'
        $'let defaults = import "($defaults_file)" in'
        $"template defaults.($schema_key).build"
        ""
    ] | str join "\n")

    let export = ($nickel_program | nickel export --format raw | complete)
    if $export.exit_code != 0 {
        return (do $failure $"Nickel export failed: ($export.stderr)")
    }

    let rendered = $export.stdout
    if ($rendered | is-empty) {
        return (do $failure "Generated Dockerfile is empty")
    }

    # Create the crate directory on demand, then overwrite any previous Dockerfile
    let parent_dir = ($output_file | path dirname)
    if not ($parent_dir | path exists) {
        mkdir $parent_dir
    }
    $rendered | save --force $output_file

    {ok: true, err: "", path: $output_file}
}
# Walk from `dir` towards the filesystem root looking for a directory that
# itself contains schemas/platform.
# Returns the nearest matching directory (starting with `dir`), or "" when a
# directory is reached whose dirname is itself.
def search-up-for-provisioning [dir: string]: nothing -> string {
    mut current = $dir
    mut found = ""
    mut searching = true

    while $searching {
        if ($"($current)/schemas/platform" | path exists) {
            $found = $current
            $searching = false
        } else {
            let parent = ($current | path dirname)
            if $parent == $current {
                # dirname no longer changes the path: nothing above to inspect
                $searching = false
            } else {
                $current = $parent
            }
        }
    }

    $found
}
# Generate docker-compose.build.yml from the Nickel compose template.
#
# Exports schemas/platform/templates/docker/docker-compose.build.yml.ncl as
# YAML and writes it to --output, or to <root>/docker-compose.build.yml when
# --output is empty.
#
# Returns {ok, err, path}: `path` is the written file on success; on failure
# `ok` is false, `err` carries the reason and `path` is "".
# Note: --registry is accepted but never read here; registry customization
# must be done by editing the template or the generated file.
def main [
    --registry: string = "localhost:5000",   # Docker registry for image tags
    --output: string = "",                   # Output file path (default: docker-compose.build.yml)
]: nothing -> record {
    let root = (get-provisioning-root)
    let template_file = $"($root)/schemas/platform/templates/docker/docker-compose.build.yml.ncl"

    # An explicit --output wins; otherwise write next to the provisioning root
    let output_file = if ($output | is-empty) {
        $"($root)/docker-compose.build.yml"
    } else {
        $output
    }

    if ($template_file | path exists) {
        let export = (nickel export --format yaml $template_file | complete)

        if $export.exit_code != 0 {
            {ok: false, err: $"Nickel export failed: ($export.stderr)", path: ""}
        } else if ($export.stdout | is-empty) {
            {ok: false, err: "Generated docker-compose.yml is empty", path: ""}
        } else {
            # Overwrite any previously generated compose file
            $export.stdout | save --force $output_file
            {ok: true, err: "", path: $output_file}
        }
    } else {
        {ok: false, err: $"Template file not found: ($template_file)", path: ""}
    }
}
# Run build benchmark for a service.
#
# Regenerates the service Dockerfile (solo mode), then times `docker buildx
# build` in two scenarios: COLD (builder cache pruned before each iteration,
# --no-cache) unless --skip-cold is given, and WARM (registry cache-from /
# cache-to). Prints per-iteration timings and a summary.
#
# Returns {service, cold, warm, cache_savings_percent, image_size}. `cold`
# and `cache_savings_percent` are null when the cold run was skipped; `cold`
# and `warm` hold {avg, min, max, times} in seconds.
# Raises an error when --iterations is below 1, when Dockerfile generation
# fails, or when any build exits non-zero.
def main [
    service: string,                         # Service to benchmark
    --iterations: int = 1,                   # Number of iterations per test
    --skip-cold,                             # Skip cold build (fastest)
    --registry: string = "localhost:5000"    # Registry for cache
]: nothing -> record {
    # Guard: `1..0` is a descending range, so 0 would silently run two iterations
    if $iterations < 1 {
        error make {
            msg: $"--iterations must be at least 1, got ($iterations)"
        }
    }

    print $"Docker Build Validation: ($service)"
    print "========================================"
    print ""

    # Step 1: Generate Dockerfile
    print "→ Generating Dockerfile..."
    let prov_root = (get-provisioning-root)
    let gen_script = $"($prov_root)/scripts/docker-generate-builds.nu"

    # The generator is a separate script. Calling `nu <script>` directly only
    # yields rendered text, so source it in a child shell and have that shell
    # serialise the {ok, err, path} record as JSON.
    let gen_cmd = $"source ($gen_script | to json); main ($service | to json) --mode solo | to json"
    let gen_run = (^nu --commands $gen_cmd | complete)
    let gen_result = if $gen_run.exit_code == 0 {
        $gen_run.stdout | from json
    } else {
        {ok: false, err: ($gen_run.stderr | str trim), path: ""}
    }

    if not $gen_result.ok {
        error make {
            msg: $"Failed to generate Dockerfile: ($gen_result.err)"
        }
    }

    print $" ✓ Generated: ($gen_result.path)"
    print ""

    let build_context = $"($prov_root)/platform"
    let dockerfile_path = $gen_result.path
    let image_tag = $"provisioning-($service):test"

    # One timed buildx run shared by both scenarios; returns elapsed seconds
    let timed_build = {|label: string, tag: string, extra_args: list<string>|
        let start = (date now)
        let result = (^docker buildx build --file $dockerfile_path --tag $tag ...$extra_args --progress plain $build_context | complete)
        let end = (date now)
        # Duration converts to nanoseconds; report seconds
        let duration = (($end - $start) | into int) / 1_000_000_000

        if $result.exit_code != 0 {
            error make {
                msg: $"($label) build failed: ($result.stderr)"
            }
        }

        print $" ✓ Completed in ($duration)s"
        $duration
    }

    # Step 2: Cold build (no cache)
    let cold_results = if $skip_cold {
        print "→ Skipping COLD build"
        print ""
        null
    } else {
        print "→ Running COLD build (no cache)..."
        print " This measures full build time including dependencies"

        let cold_times = (1..$iterations | each {|i|
            print $" Iteration ($i)/($iterations)..."

            # Clear Docker build cache (outside the timed section)
            ^docker builder prune --all --force | complete | ignore

            do $timed_build "Cold" $"($image_tag)-cold" ["--no-cache"]
        })

        let avg = ($cold_times | math avg)
        let min = ($cold_times | math min)
        let max = ($cold_times | math max)

        print ""
        print $" Average: ($avg)s, Min: ($min)s, Max: ($max)s"
        print ""

        {
            avg: $avg,
            min: $min,
            max: $max,
            times: $cold_times
        }
    }

    # Step 3: Warm build (with dependency cache)
    print "→ Running WARM build (with cargo-chef cache)..."
    print " This measures build time with cached dependencies"

    let warm_cache_args = [
        "--cache-from" $"type=registry,ref=($registry)/cache:($service)"
        "--cache-to" $"type=registry,ref=($registry)/cache:($service),mode=max"
    ]

    let warm_times = (1..$iterations | each {|i|
        print $" Iteration ($i)/($iterations)..."
        do $timed_build "Warm" $"($image_tag)-warm" $warm_cache_args
    })

    let warm = {
        avg: ($warm_times | math avg),
        min: ($warm_times | math min),
        max: ($warm_times | math max),
        times: $warm_times
    }

    print ""
    print $" Average: ($warm.avg)s, Min: ($warm.min)s, Max: ($warm.max)s"
    print ""

    # Step 4: Get image sizes
    print "→ Measuring image sizes..."

    let size_warm = (^docker images $"($image_tag)-warm" --format "{{.Size}}" | str trim)

    print $" Warm build image: ($size_warm)"
    print ""

    # Step 5: Calculate savings (only meaningful when a cold baseline exists)
    let cache_savings = if $cold_results == null {
        null
    } else {
        (($cold_results.avg - $warm.avg) / $cold_results.avg * 100)
    }

    print "Summary:"
    print "========"
    if $cold_results != null {
        print $" Cold build (no cache): ($cold_results.avg)s"
    }
    print $" Warm build (dep cache): ($warm.avg)s"
    if $cache_savings != null {
        print $" Cache savings: ($cache_savings | into int)%"
    }
    print $" Image size: ($size_warm)"
    print ""

    {
        service: $service,
        cold: $cold_results,
        warm: $warm,
        cache_savings_percent: $cache_savings,
        image_size: $size_warm
    }
}
# Generate Nickel constraints from TOML master file
# Usage: nickel export --format raw scripts/generate-constraints.ncl > schemas/platform/common/constraints.ncl
# (this script evaluates to a single string; the raw export format writes that
# string verbatim, whereas `nickel eval` prints it as a quoted string value)
#
# NOTE(review): the import path below is written relative to the repository
# root, not to this script's directory — presumably the root must be on the
# Nickel import path when running this; confirm.

let constraints = import "schemas/platform/constraints/constraints.toml" in

# Format constraint function - builds Nickel validator code as string.
# min_val / max_val arrive as numbers from the TOML file and must be converted
# explicitly: Nickel string interpolation only accepts string values.
let format_constraint = fun name desc min_val max_val =>
  let lo = std.to_string min_val in
  let hi = std.to_string max_val in
  " # " ++ desc ++ "\n" ++
  " " ++ name ++ " = contract.from_validator (fun x =>\n" ++
  " if x >= " ++ lo ++ " && x <= " ++ hi ++ " then 'Ok\n" ++
  " else 'Error {message = \"" ++ name ++ " must be between " ++ lo ++ " and " ++ hi ++ "\"}\n" ++
  " ),"
in

# Shorthand: every TOML entry used below carries description / min / max
let from_entry = fun name entry =>
  format_constraint name entry.description entry.min entry.max
in

# Header
let header = [
  "# Platform Constraints and Validators",
  "# AUTOMATICALLY GENERATED from constraints.toml - DO NOT EDIT DIRECTLY",
  "# Generated via: nickel export --format raw scripts/generate-constraints.ncl",
  "# Source: schemas/platform/constraints/constraints.toml",
  "#",
  "# Usage: Import in schemas to validate configuration fields",
  "# Example: port | constraints.port_standard",
  "#",
  "# To modify constraints, edit constraints.toml and run:",
  "# nickel export --format raw scripts/generate-constraints.ncl > schemas/platform/common/constraints.ncl",
  "",
  "let contract = std.contract in",
  "",
  "{",
] in

# Constraints definitions
let constraints_defs = [
  from_entry "port_standard" constraints.common.server.port,
  from_entry "port_high" constraints.common.server.port_high,
  from_entry "vault_port" constraints.vault_service.port,
  from_entry "registry_port" constraints.registry.server_port,
  from_entry "workers" constraints.orchestrator.workers,
  from_entry "server_workers" constraints.common.server.workers,
  from_entry "max_connections" constraints.common.server.max_connections,
  from_entry "retry_attempts" constraints.orchestrator.queue.retry_attempts,
  from_entry "metrics_interval" constraints.common.monitoring.metrics_interval,
  from_entry "health_check_interval" constraints.common.monitoring.health_check_interval,
  from_entry "task_timeout" constraints.orchestrator.queue.task_timeout,
  from_entry "tool_timeout" constraints.mcp_server.tools.timeout,
  from_entry "keep_alive" constraints.common.server.keep_alive,
  from_entry "rate_limit_requests" constraints.control_center.rate_limiting.max_requests,
] in

# The trailing "" makes the generated file end with a newline
let footer = ["}", ""] in

# Join all parts, one generated line (or multi-line definition) per element
std.string.join "\n" (header @ constraints_defs @ footer)
# Phase 0: Map taskserv names to extension paths
#
# Returns the "<category>/<taskserv>" path for a known taskserv name, or
# null when the name has no entry in the table below.
def map-taskserv-to-path [taskserv_name: string] {
    let extension_paths = {
        etcd: "cluster/etcd",
        kubernetes: "cluster/kubernetes",
        coredns: "cluster/coredns",
        k8s_nodejoin: "cluster/k8s_nodejoin",
        containerd: "container_runtime/containerd",
        crio: "container_runtime/crio",
        podman: "container_runtime/podman",
        crun: "container_runtime/crun",
        youki: "container_runtime/youki",
        runc: "container_runtime/runc",
        cilium: "networking/cilium",
        coredns_dns: "networking/coredns",
        resolv: "networking/resolv",
        proxy: "networking/proxy",
        rook_ceph: "storage/rook_ceph",
        external_nfs: "storage/external_nfs",
        postgres: "databases/postgres",
        redis: "databases/redis",
        os: "infrastructure/os",
        webhook: "infrastructure/webhook",
        provisioning: "infrastructure/provisioning",
        kubectl: "infrastructure/kubectl",
        prometheus: "development/prometheus",
        grafana: "development/grafana",
        loki: "development/loki",
        gitea: "development/gitea",
        oras: "development/oras",
    }

    # Exact-name lookup; anything not listed maps to null
    if $taskserv_name in ($extension_paths | columns) {
        $extension_paths | get $taskserv_name
    } else {
        null
    }
}
# Phase 2c: Generate flake.nix for a server
#
# Parameters:
#   hostname          - name used for nixosConfigurations.<hostname>
#   enabled_taskservs - kept for interface compatibility; not read here
#   system            - Nix system string written into nixosSystem
#   flake_inputs      - list of records with input_name, input_line and skip
#                       fields (the shape build-flake-input produces)
#
# Returns the flake.nix text as one newline-joined string.
def generate-flake-nix [hostname: string, enabled_taskservs: list, system: string, flake_inputs: any] {
    # Inputs flagged skip == true only contribute their warning comment line;
    # they are neither `outputs` arguments nor imported modules.
    let active_inputs = ($flake_inputs | where { |inp| $inp.skip == false })

    let input_lines = ($flake_inputs | each { |inp| $inp.input_line })

    let outputs_params = (
        ["self" "nixpkgs" "flake-utils"]
        | append ($active_inputs | each { |inp| $inp.input_name })
        | str join ", "
    )

    let module_imports = (
        $active_inputs
        | each { |inp| $"        inputs.($inp.input_name).nixosModules.default" }
    )

    let header = [
        "{"
        $'  description = "NixOS Flake for ($hostname)";'
        ""
        "  inputs = {"
        '    nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";'
        '    flake-utils.url = "github:numtide/flake-utils";'
    ]

    let outputs_head = [
        "  };"
        ""
        $"  outputs = { ($outputs_params) }@inputs:"
        "    flake-utils.lib.eachDefaultSystem (system:"
        # `pkgs` must be bound here: the dev shell uses `inherit pkgs;`, and an
        # unbound name makes Nix reject the whole flake, not just the shell.
        "      let"
        '        pkgs = nixpkgs.legacyPackages.${system};'
        "      in {"
        "        devShells.default = import ./shell.nix { inherit pkgs; };"
        "      }) // {"
        $"      nixosConfigurations.($hostname) = nixpkgs.lib.nixosSystem {"
        $'        system = "($system)";'
        "        specialArgs = { inherit inputs; };"
        "        modules = ["
        "          ./configuration.nix"
        "          ./hardware-configuration.nix"
    ]

    let footer = [
        "        ];"
        "      };"
        "    };"
        "}"
    ]

    $header
    | append $input_lines
    | append $outputs_head
    | append $module_imports
    | append $footer
    | str join "\n"
}
== true } + | each { |kv| + let name = $kv.key + let config = $kv.value + let enabled_str = " enable = true;" + + # Build taskserv-specific configuration + let ts_config = ( + match $name { + "etcd" => [ + " provisioning.taskservs.etcd = {" + $enabled_str + ' listen_client_urls = "http://0.0.0.0:2379";' + $' advertise_client_urls = "http://($hostname).internal:2379";' + ' listen_peer_urls = "http://0.0.0.0:2380";' + $' initial_advertise_peer_urls = "http://($hostname).internal:2380";' + $' initial_cluster = "($hostname)=http://($hostname).internal:2380";' + ' cluster_token = "provisioned-cluster";' + " };" + ], + "kubernetes" => { + let role = ($config.role? | default "worker") + [ + " provisioning.taskservs.kubernetes = {" + $enabled_str + $' role = "($role)";' + (if ($config.role? == "control-plane") { ' apiServerAdvertiseAddress = "";' } else { "" }) + " };" + ] + | where { |x| $x != "" } + }, + "cilium" => [ + " provisioning.taskservs.cilium = {" + $enabled_str + ' ipam = "kubernetes";' + " };" + ], + "rook_ceph" => [ + " provisioning.taskservs.rook_ceph = {" + $enabled_str + $' cluster_name = "($config.cluster_name? | default "ceph")";' + " };" + ], + "containerd" => [ + " provisioning.taskservs.containerd = {" + $enabled_str + " };" + ], + "coredns" => [ + " provisioning.taskservs.coredns = {" + $enabled_str + " };" + ], + "resolv" => [ + " provisioning.taskservs.resolv = {" + $enabled_str + " };" + ], + "prometheus" => [ + " provisioning.taskservs.prometheus = {" + $enabled_str + $' retention_days = ($config.retention_days? 
| default 30);' + " };" + ], + "grafana" => [ + " provisioning.taskservs.grafana = {" + $enabled_str + " };" + ], + "loki" => [ + " provisioning.taskservs.loki = {" + $enabled_str + " };" + ], + _ => [ + $" provisioning.taskservs.($name) = {" + $enabled_str + " };" + ] + } + ) + + $ts_config | str join "\n" + } + } + ) + + let firewall_ports = [ + " # Firewall ports for enabled services" + " networking.firewall.allowedTCPPorts = [" + " 22 # SSH" + " 80 # HTTP" + " 443 # HTTPS" + ] | append ( + if ($taskservs_record.etcd?.enable? == true or $taskservs_record.etcd?.enabled? == true) { + [" 2379 # etcd client", " 2380 # etcd peer"] + } else { + [] + } + ) | append ( + if ($taskservs_record.kubernetes?.enable? == true or $taskservs_record.kubernetes?.enabled? == true) { + [" 6443 # Kubernetes API"] + } else { + [] + } + ) | append ( + if ($taskservs_record.prometheus?.enable? == true or $taskservs_record.prometheus?.enabled? == true) { + [" 9090 # Prometheus"] + } else { + [] + } + ) | append ( + if ($taskservs_record.grafana?.enable? == true or $taskservs_record.grafana?.enabled? == true) { + [" 3000 # Grafana"] + } else { + [] + } + ) | append [" ];"] + + let base = [ + "{ config, pkgs, inputs, ... 
# Phase 2e: Generate hardware-configuration.nix with system-specific settings
#
# Parameters:
#   hostname    - written into the header comment of the generated file
#   system      - Nix system string; a value matching "aarch64" selects the
#                 "kvm-arm" kernel module, anything else "kvm-intel"
#   server_type - kept for interface compatibility; not read here
#
# Returns the file content as one newline-joined string.
def generate-hardware-configuration-nix [hostname: string, system: string, server_type: any] {
    let is_aarch64 = ($system =~ "aarch64")
    let kernel_module = (if $is_aarch64 { "kvm-arm" } else { "kvm-intel" })

    [
        # Nushell only interpolates parenthesised expressions inside $"...";
        # a bare $name would be emitted literally.
        $"# Hardware configuration for ($hostname)"
        "# Generated by provisioning system - customize as needed"
        $"# System: ($system)"
        "{ config, lib, pkgs, modulesPath, ... }:"
        ""
        "{"
        "  imports = ["
        '    (modulesPath + "/profiles/qemu-guest.nix")'
        "  ];"
        ""
        "  boot = {"
        "    loader.grub = {"
        "      enable = true;"
        '      device = "/dev/sda";'
        "    };"
        "    initrd.availableKernelModules = ["
        '      "ata_piix"'
        '      "uhci_pci"'
        '      "virtio_pci"'
        '      "sr_mod"'
        '      "virtio_blk"'
        "    ];"
        $'    kernelModules = [ "($kernel_module)" ];'
        "  };"
        ""
        '  fileSystems."/" = {'
        '    device = "/dev/sda1";'
        '    fsType = "ext4";'
        "  };"
        ""
        "  swapDevices = ["
        '    { device = "/dev/sda2"; }'
        "  ];"
        ""
        "  networking.usePredictableInterfaceNames = true;"
        ""
        "  nix.settings.max-jobs = lib.mkDefault 2;"
        "  nix.settings.cores = lib.mkDefault 2;"
        "}"
    ] | str join "\n"
}
$dry_run { + log info $"[DRY-RUN] Would write: ($hardware_path)" + } else { + $hardware_configuration_nix | save --force $hardware_path + if not ($hardware_path | path exists) { + return (err $"Failed to write: ($hardware_path)") + } + } + + ok true +} + +# Main: Process a single server +def process-server [ + server: record, + workspace_path: string, + provisioning_root: string, + --dry-run +] { + # Guard: hostname field exists + let hostname = $server.hostname? | default "" + if ($hostname | is-empty) { + return (err "Server missing hostname field") + } + + # Guard: os_type must be nixos + let os_type = $server.os_type? | default "" + if ($os_type != "nixos") { + log debug $"Skipping ($hostname) - os_type is ($os_type), not nixos" + return (ok true) + } + + # Guard: enabled field exists and is true (optional check) + let enabled = $server.enabled? | default true + if not $enabled { + log debug $"Skipping ($hostname) - enabled is false" + return (ok true) + } + + # Determine output directory from nixos.flake_path or generate standard path + let output_dir = ( + if not ($server.nixos.flake_path? | is-empty) { + $server.nixos.flake_path + } else { + let workspace_name = ($workspace_path | path basename) + $"($workspace_path)/../nixos/($hostname)" + } + ) + + # Get system architecture + let system = ($server.nixos.system? | default "aarch64-linux") + + log info $"Processing ($hostname)..." + + # Phase 2: Extract enabled taskservs + log info " Extracting taskservs..." + let taskservs_result = (extract-enabled-taskservs $server.taskservs?) + if not (is-ok $taskservs_result) { + return (err $"Failed to extract taskservs: (unwrap-err $taskservs_result)") + } + + let enabled_taskservs = (unwrap-ok $taskservs_result) + let ts_list = ($enabled_taskservs | str join ", ") + log info $" Enabled taskservs: ($ts_list)" + + # Phase 2b: Generate flake inputs + log info " Generating flake inputs..." 
# Main entry point
#
# Exports the workspace's servers.ncl through Nickel and generates the flake
# files for every exported server via process-server.
#
# Exit status: 1 when the workspace argument is missing, when the Nickel
# export fails, or when at least one server failed; otherwise the script ends
# normally (status 0). `exit` is used instead of `return 1` because a value
# returned from `main` is only printed and does not set the exit status.
def main [
    workspace_path?: string,  # Path to workspace (e.g., workspaces/librecloud_hetzner)
    --dry-run,                # Dry-run mode (show what would be done)
] {
    # Detect provisioning root
    let prov_root = (get-provisioning-root)
    log debug $"Provisioning root: ($prov_root)"

    # Guard: workspace_path provided
    if ($workspace_path | is-empty) {
        log error "Usage: generate-flakes.nu WORKSPACE_PATH [--dry-run]"
        log error "Example: generate-flakes.nu workspaces/librecloud_hetzner"
        exit 1
    }

    # Resolve workspace path (relative or absolute).
    # Workspaces are at repo root/workspaces, not provisioning/workspaces.
    let ws_path = (
        if ($workspace_path | str starts-with "/") {
            $workspace_path
        } else {
            # Prefer the path relative to the repo root (parent of the provisioning root)
            let repo_root = ($prov_root | path dirname)
            let candidate = $"($repo_root)/($workspace_path)"
            if ($candidate | path exists) {
                $candidate
            } else {
                # Fallback to provisioning root
                $"($prov_root)/($workspace_path)"
            }
        }
    )

    log info "═══════════════════════════════════════════════════════════════════"
    log info "NixOS Flake Generator"
    log info "═══════════════════════════════════════════════════════════════════"
    log info $"Workspace: ($ws_path)"
    log info $"Dry-run: ($dry_run)"

    # Phase 1: Export servers config from Nickel
    log info ""
    log info "Phase 1: Exporting servers.ncl from Nickel..."
    let export_result = (export-servers-config $ws_path $prov_root)

    if not (is-ok $export_result) {
        log error $"Export failed: (unwrap-err $export_result)"
        exit 1
    }

    let servers = (unwrap-ok $export_result).servers
    let server_count = ($servers | length)

    log info $"Exported ($server_count) servers from Nickel"

    # Phase 2-3: Process each server
    log info ""
    log info "Phase 2-3: Generating flakes for each server..."
    mut generation_errors = []

    for server in $servers {
        let result = (process-server $server $ws_path $prov_root --dry-run=$dry_run)
        if not (is-ok $result) {
            # Reassign the mutable list; piping into `let` would only create a
            # new binding local to this loop body and lose the error.
            $generation_errors = ($generation_errors | append (unwrap-err $result))
        }
    }

    # Final report
    log info ""
    log info "═══════════════════════════════════════════════════════════════════"

    if ($generation_errors | is-empty) {
        log info "All flakes generated successfully!"
        return
    }

    let error_count = ($generation_errors | length)
    log error $"Failed to generate ($error_count) flakes"
    for e in $generation_errors {
        log error $"  • ($e)"
    }
    exit 1
}

# Script is executed as:
#   nu generate-flakes.nu WORKSPACE_PATH [--dry-run]
# Nushell automatically passes CLI arguments to main
# Generate flake.nix template
#
# Builds the text line by line. Lines that are pure Nix are plain (raw)
# strings so that Nix syntax such as `(system:`, `${system}` and `$out` is
# never handed to Nushell's interpolation; only lines that embed a Nushell
# value use $'...' with a parenthesised expression.
# Returns the file content, ending with a newline.
def generate-flake [taskserv: string, description: string] {
    [
        "{"
        $'  description = "($description)";'
        ""
        "  inputs = {"
        '    nixpkgs.url = "github:NixOS/nixpkgs/nixos-24.11";'
        '    flake-utils.url = "github:numtide/flake-utils";'
        "  };"
        ""
        "  outputs = { self, nixpkgs, flake-utils }:"
        "    flake-utils.lib.eachDefaultSystem (system:"
        "      let"
        '        pkgs = nixpkgs.legacyPackages.${system};'
        "      in"
        "      {"
        "        checks = {"
        $'          format = pkgs.runCommand "($taskserv)-format-check"'
        "            {"
        "              buildInputs = with pkgs; [ nixpkgs-fmt ];"
        "            }"
        "            ''"
        '              nixpkgs-fmt --check ${./.}'
        '              mkdir $out'
        "            '';"
        "        };"
        ""
        "        devShells.default = pkgs.mkShell {"
        "          buildInputs = with pkgs; ["
        "            nixpkgs-fmt"
        "          ];"
        "        };"
        "      }"
        "    ) // {"
        "      nixosModules = {"
        "        default = import ./nixos-module.nix;"
        "      };"
        "    };"
        "}"
        ""
    ] | str join "\n"
}

# Generate nixos-module.nix template
#
# taskserv     - attribute name under provisioning.taskservs
# nixos_module - NixOS option namespace, written into a reference comment
# Returns the file content, ending with a newline.
def generate-module [taskserv: string, nixos_module: string] {
    [
        "{ config, lib, pkgs, ... }:"
        "with lib;"
        "let"
        $"  cfg = config.provisioning.taskservs.($taskserv);"
        "in"
        "{"
        $"  options.provisioning.taskservs.($taskserv) = {"
        $'    enable = mkEnableOption "($taskserv) service";'
        "  };"
        ""
        "  config = mkIf cfg.enable {"
        "    # NixOS module configuration goes here"
        $"    # Reference: ($nixos_module)"
        ""
        "    # Metadata"
        "    assertions = ["
        "      {"
        "        assertion = true;"
        $'        message = "($taskserv) properly configured";'
        "      }"
        "    ];"
        "  };"
        "}"
        ""
    ] | str join "\n"
}

# Generate main.ncl
#
# `description` is written both as taskserv_type and into
# metadata.description. Returns the file content, ending with a newline.
def generate-main-ncl [taskserv: string, description: string] {
    [
        $"# ($taskserv) Taskserv Main Configuration"
        ""
        'let defaults = import "defaults.ncl" in'
        ""
        "{"
        $'  taskserv_name | doc "Taskserv identifier" = "($taskserv)",'
        $'  taskserv_type | doc "Taskserv category" = "($description)",'
        '  taskserv_version | doc "Taskserv version" = "1.0.0",'
        ""
        # Field name must match the one emitted by generate-defaults-ncl.
        $'  config | doc "Default taskserv configuration" = defaults.($taskserv)_config,'
        ""
        '  metadata | doc "Taskserv documentation" = {'
        $'    description | doc "What this taskserv provides" = "($description) for NixOS",'
        '    author | doc "Taskserv author" = "Provisioning Team",'
        '    documentation_path | doc "Path to documentation" = "./README.md",'
        "  },"
        "}"
        ""
    ] | str join "\n"
}

# Generate defaults.ncl
#
# Emits a record with a single `<taskserv>_config` field whose `enable`
# defaults to true. Returns the file content, ending with a newline.
def generate-defaults-ncl [taskserv: string] {
    [
        $"# ($taskserv) Taskserv Defaults"
        ""
        "{"
        $"  ($taskserv)_config | default = {"
        $'    enable | doc "Enable ($taskserv)" | default = true,'
        "  },"
        "}"
        ""
    ] | str join "\n"
}

# Generate contracts.ncl
#
# The contract is named after the capitalised taskserv name plus "Config"
# (e.g. "etcd" -> "EtcdConfig"). Returns the file content, ending with a
# newline.
def generate-contracts-ncl [taskserv: string] {
    let contract_name = $"($taskserv | str capitalize)Config"

    [
        $"# ($taskserv) Taskserv Contracts"
        ""
        "{"
        $"  ($contract_name) = {"
        $'    enable | Bool | doc "Enable ($taskserv)",'
        "  },"
        "}"
        ""
    ] | str join "\n"
}
for ($taskservs | length) services..." + + for taskserv in $taskservs { + if not ($metadata | has $taskserv) { + print $"⚠ ($taskserv): not in metadata, skipping" + continue + } + + let meta = $metadata | get $taskserv + let taskserv_dir = $"($output_dir)/($meta.category)/($taskserv)" + + mkdir $taskserv_dir + mkdir $"($taskserv_dir)/nickel" + mkdir $"($taskserv_dir)/modules" + mkdir $"($taskserv_dir)/templates" + mkdir $"($taskserv_dir)/scripts" + + # Generate flake.nix + let flake_content = (generate-flake $taskserv $meta.description) + $flake_content | save --force $"($taskserv_dir)/flake.nix" + + # Generate nixos-module.nix + let module_content = (generate-module $taskserv $meta.nixos_module) + $module_content | save --force $"($taskserv_dir)/nixos-module.nix" + + # Generate Nickel files + let main_content = (generate-main-ncl $taskserv $meta.type) + $main_content | save --force $"($taskserv_dir)/nickel/main.ncl" + + let defaults_content = (generate-defaults-ncl $taskserv) + $defaults_content | save --force $"($taskserv_dir)/nickel/defaults.ncl" + + let contracts_content = (generate-contracts-ncl $taskserv) + $contracts_content | save --force $"($taskserv_dir)/nickel/contracts.ncl" + + print $"✓ ($taskserv): skeleton generated at ($taskserv_dir)" + } + + print "All taskserv skeletons generated successfully" +} diff --git a/scripts/init-nickel-repos.nu b/scripts/init-nickel-repos.nu new file mode 100755 index 0000000..c74bb3f --- /dev/null +++ b/scripts/init-nickel-repos.nu @@ -0,0 +1,254 @@ +#!/usr/bin/env nu +# Initialize Nickel schema and config repos +# REQUIRES: .env loaded before execution +# Usage: source .env && nu provisioning/scripts/init-nickel-repos.nu + +# Constants +let git_org = "provisioning" +let repos = ["provisioning-schemas", "provisioning-configs"] +let tmp_dir = $env.TMPDIR? | default "/tmp" + +# Verify required env vars +if ($env.GITREPO_URL? 
| is-empty) { + print "❌ Missing GITREPO_URL in environment" + exit 1 +} + +if ($env.GITREPO_TOKEN? | is-empty) { + print "❌ Missing GITREPO_TOKEN in environment" + exit 1 +} + +if ($env.GITREPO_USER? | is-empty) { + print "❌ Missing GITREPO_USER in environment" + exit 1 +} + +print "🔧 Initializing Nickel repos" +print $" Git URL: ($env.GITREPO_URL)" +print $" Organization: ($git_org)" +print $" Repos: ($repos | str join ', ')" +print "" + +# ============================================================================ +# Ensure organization exists +# ============================================================================ + +def ensure-org [] { + let org = "provisioning" + print $"📦 Ensuring organization '($org)' exists..." + + let check = (curl -s -H $"Authorization: token ($env.GITREPO_TOKEN)" $"($env.GITREPO_URL)/api/v1/orgs/($org)" --output /dev/null -w "%{http_code}") + + if $check != "200" { + print " Creating organization..." + curl -s -X POST -H $"Authorization: token ($env.GITREPO_TOKEN)" -H "Content-Type: application/json" -d '{"username":"provisioning","full_name":"Provisioning"}' $"($env.GITREPO_URL)/api/v1/user/orgs" > /dev/null + print " ✓ Created" + } else { + print " ✓ Already exists" + } +} + +# ============================================================================ +# Ensure repository exists +# ============================================================================ + +def ensure-repo [repo: string] { + let org = "provisioning" + print $"📝 Ensuring repo: ($repo)" + + let check = (curl -s -H $"Authorization: token ($env.GITREPO_TOKEN)" $"($env.GITREPO_URL)/api/v1/repos/($org)/($repo)" --output /dev/null -w "%{http_code}") + + if $check != "200" { + print " Creating..." 
# ============================================================================
# Sync schemas repository
# ============================================================================

# Clone (or update) the provisioning-schemas repository into a working
# directory under the temp dir, write the initial schema files, then commit
# and push them to `main`.
#
# Reads GITREPO_USER and GITREPO_TOKEN from the environment, and TMPDIR
# (default "/tmp") for the working directory location.
def sync-schemas [] {
    let repo = "provisioning-schemas"
    let org = "provisioning"
    # Resolved here because a script-level `let` is not visible inside a `def`.
    let tmp_dir = ($env.TMPDIR? | default "/tmp")
    let repo_dir = $"($tmp_dir)/nickel-repos-init/($repo)"
    # NOTE(review): host and port are hard-coded while the API calls elsewhere
    # in this script use GITREPO_URL - confirm both always point at the same server.
    let clone_url = $"http://($env.GITREPO_USER):($env.GITREPO_TOKEN)@localhost:3000/($org)/($repo).git"

    print ""
    print $"📂 Syncing ($repo)..."

    # Decide between pull and clone BEFORE creating any directory: testing the
    # directory after `mkdir` is always true and would skip the clone forever.
    if ($"($repo_dir)/.git" | path exists) {
        git -C $repo_dir pull origin main out+err> /dev/null
    } else {
        mkdir -v ($repo_dir | path dirname)
        git clone $clone_url $repo_dir out+err> /dev/null
    }

    # Create directories
    mkdir -v $"($repo_dir)/platform/services"
    mkdir -v $"($repo_dir)/platform/defaults/deployment"
    mkdir -v $"($repo_dir)/platform/common"

    # VERSION
    "1.0.0" | save -f $"($repo_dir)/VERSION"

    # Types
    '{
  Hostname = String,
  Port = Number,
  Url = String,
  LogLevel = [| "trace", "debug", "info", "warn", "error" |],
}' | save -f $"($repo_dir)/platform/common/types.ncl"

    # Helpers
    '{
  compose_config = fun defaults mode user_overrides =>
    let merge = fun a b =>
      if (std.record.is_record a) and (std.record.is_record b) then
        a | std.record.merge b else b
    in merge (merge defaults mode) user_overrides,
}' | save -f $"($repo_dir)/platform/common/helpers.ncl"

    # Orchestrator service
    '{
  service = {name = "orchestrator", version = "4.0.0"},
  enabled = true,
  server = {host = "127.0.0.1", port = 9090, workers = 4},
  database = {url = "ws://127.0.0.1:8000", namespace = "provisioning", database = "orchestrator"},
  queue = {max_concurrent_tasks = 5, retry_attempts = 3},
  mode = {deployment = "local"},
  logging = {level = "info", format = "compact"},
}' | save -f $"($repo_dir)/platform/services/orchestrator.ncl"

    # Vault service
    '{
  service = {name = "vault-service", version = "1.0.0"},
  enabled = true,
  server = {host = "127.0.0.1", port = 8082, workers = 2},
  database = {url = "ws://127.0.0.1:8000", namespace = "provisioning", database = "vault"},
  backend = {backend_type = "secretum-vault", secretum_vault = {binary_path = "~/.local/bin/svault"}},
  mode = {deployment = "local"},
  logging = {level = "info", format = "compact"},
}' | save -f $"($repo_dir)/platform/services/vault-service.ncl"

    # Local defaults
    '{
  server = {host = "127.0.0.1", workers = 2},
  database = {url = "ws://127.0.0.1:8000"},
  mode = {deployment = "local"},
  logging = {level = "debug", format = "compact"},
}' | save -f $"($repo_dir)/platform/defaults/deployment/local.ncl"

    # README
    '# Provisioning Schemas

Nickel schema definitions for provisioning platform services.

## Structure
- `platform/services/` - Service definitions
- `platform/defaults/deployment/` - Deployment mode defaults
- `platform/common/` - Shared types and helpers' | save -f $"($repo_dir)/README.md"

    # Commit (git -C avoids changing the working directory of the script)
    git -C $repo_dir add .
    git -C $repo_dir commit -m "Initial schema structure" --allow-empty out+err> /dev/null
    git -C $repo_dir push -u origin main out+err> /dev/null

    print " ✓ Synced"
}
+ + mkdir -v $repo_dir + + if ($repo_dir | path exists) { + cd $repo_dir + git pull origin main out+err> /dev/null + cd - + } else { + git clone $clone_url $repo_dir out+err> /dev/null + } + + # Local config + '{ + enabled_services = ["orchestrator", "vault_service"], + + orchestrator = { + service = {name = "orchestrator", version = "4.0.0"}, + enabled = true, + server = {host = "127.0.0.1", port = 9090, workers = 4}, + database = {url = "ws://127.0.0.1:8000", namespace = "provisioning", database = "orchestrator"}, + mode = {deployment = "local"}, + logging = {level = "debug", format = "compact"}, + }, + + vault_service = { + service = {name = "vault-service", version = "1.0.0"}, + enabled = true, + server = {host = "127.0.0.1", port = 8082, workers = 2}, + database = {url = "ws://127.0.0.1:8000", namespace = "provisioning", database = "vault"}, + backend = {backend_type = "secretum-vault", secretum_vault = {binary_path = "~/.local/bin/svault"}}, + mode = {deployment = "local"}, + logging = {level = "info", format = "compact"}, + }, +}' | save -f $"($repo_dir)/local.ncl" + + # README + '# Provisioning Configs + +Environment-specific configurations for provisioning services. + +## Files +- `local.ncl` - Local development +- `staging.ncl` - Staging (future) +- `production.ncl` - Production (future)' | save -f $"($repo_dir)/README.md" + + # Commit + cd $repo_dir + git add . + git commit -m "Initial configuration structure" --allow-empty out+err> /dev/null + git push -u origin main out+err> /dev/null + cd - + + print " ✓ Synced" +} + +# ============================================================================ +# Main execution +# ============================================================================ + +ensure-org +$repos | each { |repo| ensure-repo $repo } +sync-schemas +sync-configs + +print "" +print "✅ Done!" 
# Detect current OS type
#
# Classifies the host from `uname -s` and, on Linux, from substrings found in
# /etc/os-release. Returns one of "nixos", "ubuntu", "debian", "alpine",
# "linux-generic", "macos", "windows", or the text "unknown" immediately
# followed by the raw `uname -s` value.
export def detect [] {
    let os_info = (^uname -s | str trim)

    if $os_info == "Linux" {
        if not ("/etc/os-release" | path exists) {
            return "linux-generic"
        }

        let release_info = (open /etc/os-release)

        # Each test is parenthesised on its own: written as one unparenthesised
        # pipeline, `or` and everything after it become arguments of the first
        # `str contains` instead of a second condition.
        let is_nixos = (
            ($release_info | str contains "nixos") or ($release_info | str contains "NixOS")
        )

        # Check for NixOS first (most distinctive); Ubuntu is tested before Debian.
        if $is_nixos {
            "nixos"
        } else if ($release_info | str contains "Ubuntu") {
            "ubuntu"
        } else if ($release_info | str contains "Debian") {
            "debian"
        } else if ($release_info | str contains "Alpine") {
            "alpine"
        } else {
            "linux-generic"
        }
    } else if $os_info == "Darwin" {
        "macos"
    } else if ($os_info | str starts-with "MINGW") {
        "windows"
    } else {
        $"unknown($os_info)"
    }
}
detect-arch [] { + let arch_raw = (^uname -m | str trim) + + match $arch_raw { + "x86_64" => { "x86_64" } + "aarch64" | "arm64" => { "aarch64" } + "armv7l" | "armv7" => { "armv7" } + "i386" | "i686" => { "i386" } + _ => { $arch_raw } + } +} + +# Get system information as record +export def info [] { + { + os: (detect), + arch: (detect-arch), + kernel: (^uname -s | str trim), + release: (^uname -r | str trim), + } +} + +# Validate OS type is supported +export def is-supported [os_type: string] { + $os_type in ["debian" "ubuntu" "nixos" "alpine" "linux-generic" "macos"] +} + +# Check if running on NixOS +export def is-nixos [] { + (detect) == "nixos" +} + +# Check if running on Debian-based +export def is-debian-based [] { + let current = (detect) + $current in ["debian" "ubuntu"] +} + +# Main: Print OS info +export def main [] { + let system_info = (info) + + print $"OS: ($system_info.os)" + print $"Architecture: ($system_info.arch)" + print $"Kernel: ($system_info.kernel)" + print $"Release: ($system_info.release)" +} diff --git a/scripts/libs/os/pkg.nu b/scripts/libs/os/pkg.nu new file mode 100644 index 0000000..3ad55a3 --- /dev/null +++ b/scripts/libs/os/pkg.nu @@ -0,0 +1,235 @@ +#!/usr/bin/env nu +# Package management abstraction +# Provides OS-agnostic package operations + +use std +use ./detect.nu as os_detect + +# Helper: Result type +def ok [value: any] { {ok: $value, err: null} } +def err [message: string] { {ok: null, err: $message} } +def is-ok [result: record] { $result.err == null } + +# Install package(s) +export def install [ + packages: string # Package name or space-separated list + --assume-yes: bool = true # Skip confirmation prompts +] { + # GUARD: Validate input + if ($packages | is-empty) { + return (err "packages cannot be empty") + } + + # PRECONDITION: Check OS support + let current_os = (os_detect detect) + if not (os_detect is-supported $current_os) { + return (err $"OS not supported for package management: ($current_os)") + } + + # MAIN 
LOGIC: OS-specific commands
+    let result = match $current_os {
+        "debian" | "ubuntu" => {
+            let cmd = if $assume_yes { "apt-get install -y" } else { "apt-get install" }
+            (^sh -c $"($cmd) ($packages)" | complete)
+        }
+
+        "nixos" => {
+            return (err "Use NixOS modules for package management, not imperative install")
+        }
+
+        "alpine" => {
+            let cmd = "apk add"  # apk never prompts and has no -y flag
+            (^sh -c $"($cmd) ($packages)" | complete)
+        }
+
+        "macos" => {
+            let cmd = "brew install"  # brew install has no confirmation prompt
+            (^sh -c $"($cmd) ($packages)" | complete)
+        }
+
+        _ => {
+            return (err $"Package installation not supported on ($current_os)")
+        }
+    }
+
+    if $result.exit_code == 0 {
+        ok "Packages installed successfully"
+    } else {
+        err $"Package installation failed: ($result.stderr)"
+    }
+}
+
+# Remove package(s)
+export def remove [
+    packages: string
+    --assume-yes: bool = true
+] {
+    # GUARD: Validate input
+    if ($packages | is-empty) {
+        return (err "packages cannot be empty")
+    }
+
+    let current_os = (os_detect detect)
+
+    # MAIN LOGIC
+    let result = match $current_os {
+        "debian" | "ubuntu" => {
+            let cmd = if $assume_yes { "apt-get remove -y" } else { "apt-get remove" }
+            (^sh -c $"($cmd) ($packages)" | complete)
+        }
+
+        "nixos" => {
+            return (err "Use NixOS modules, not imperative remove")
+        }
+
+        "alpine" => {
+            let cmd = "apk del"  # apk never prompts and has no -y flag
+            (^sh -c $"($cmd) ($packages)" | complete)
+        }
+
+        "macos" => {
+            let cmd = "brew uninstall"  # brew uninstall has no confirmation prompt
+            (^sh -c $"($cmd) ($packages)" | complete)
+        }
+
+        _ => {
+            return (err $"Package removal not supported on ($current_os)")
+        }
+    }
+
+    if $result.exit_code == 0 {
+        ok "Packages removed successfully"
+    } else {
+        err $"Package removal failed: ($result.stderr)"
+    }
+}
+
+# Update package cache/index
+export def update [] {
+    let current_os = (os_detect detect)
+
+    let result = match $current_os {
+        "debian" | "ubuntu" => {
+            (^apt-get update |
complete) + } + + "nixos" => { + err "Use 'nix flake update' or nixpkgs channel management" + } + + "alpine" => { + (^apk update 2>&1 | complete) + } + + "macos" => { + (^brew update 2>&1 | complete) + } + + _ => { + return (err $"Package update not supported on ($current_os)") + } + } + + if $result.exit_code == 0 { + ok "Package cache updated" + } else { + err $"Package update failed: ($result.stderr)" + } +} + +# Check if package is installed +export def is-installed [package: string] { + # GUARD: Validate input + if ($package | is-empty) { + return (err "package name cannot be empty") + } + + let current_os = (os_detect detect) + + let cmd_result = match $current_os { + "debian" | "ubuntu" => { + (^sh -c $"dpkg -l | grep -q '^ii.*$package'" 2>&1 | complete) + } + + "nixos" => { + (^sh -c $"nix eval --raw -f '' 'lib.attrsets.hasAttr \"$package\" self'" 2>&1 | complete) + } + + "alpine" => { + (^apk info $package 2>&1 | complete) + } + + "macos" => { + (^brew list $package 2>&1 | complete) + } + + _ => { + return (err $"Package check not supported on ($current_os)") + } + } + + ok ($cmd_result.exit_code == 0) +} + +# Get package version +export def version [package: string] { + # GUARD: Validate input + if ($package | is-empty) { + return (err "package name cannot be empty") + } + + let current_os = (os_detect detect) + + let version_info = match $current_os { + "debian" | "ubuntu" => { + let dpkg_result = (^sh -c $"dpkg -l | grep '^ii.*$package' | awk '{{print $3}}'" 2>&1 | complete) + if $dpkg_result.exit_code == 0 { + $dpkg_result.stdout | str trim + } else { + "not installed" + } + } + + "nixos" => { + "use nix version checking" + } + + "alpine" => { + let apk_result = (^apk info $package 2>&1 | complete) + if $apk_result.exit_code == 0 { + $apk_result.stdout | str trim + } else { + "not installed" + } + } + + "macos" => { + let brew_result = (^brew list --versions $package 2>&1 | complete) + if $brew_result.exit_code == 0 { + $brew_result.stdout | str trim + 
} else {
+                "not installed"
+            }
+        }
+
+        _ => {
+            return (err $"Version check not supported on ($current_os)")
+        }
+    }
+
+    ok $version_info
+}
+
+# Main: Show usage
+export def main [] {
+    print "Package management abstraction library"
+    print ""
+    print "Usage:"
+    print " pkg install [--assume-yes]"
+    print " pkg remove [--assume-yes]"
+    print " pkg update"
+    print " pkg is-installed "
+    print " pkg version "
+    print ""
+    print "Supported OS: debian, ubuntu, alpine, nixos, macos"
+}
diff --git a/scripts/libs/os/svc.nu b/scripts/libs/os/svc.nu
new file mode 100644
index 0000000..dc61563
--- /dev/null
+++ b/scripts/libs/os/svc.nu
@@ -0,0 +1,335 @@
+#!/usr/bin/env nu
+# Service management abstraction
+# Provides OS-agnostic service operations (enable, start, stop, status, etc.)
+
+use std
+use ./detect.nu as os_detect
+
+# Helper: Result type
+def ok [value: any] { {ok: $value, err: null} }
+def err [message: string] { {ok: null, err: $message} }
+def is-ok [result: record] { $result.err == null }
+
+# Enable service (start on boot)
+export def enable [service: string] {
+    # GUARD: Validate input
+    if ($service | is-empty) {
+        return (err "service name cannot be empty")
+    }
+
+    let current_os = (os_detect detect)
+
+    let result = match $current_os {
+        "debian" | "ubuntu" => {
+            (^systemctl enable $service | complete)
+        }
+
+        "nixos" => {
+            return (err "Use NixOS modules to enable services")
+        }
+
+        "alpine" => {
+            (^rc-update add $service default | complete)  # boot-time registration; mirrors `rc-update del` in disable
+        }
+
+        "macos" => {
+            (^launchctl load $"/Library/LaunchDaemons/($service).plist" | complete)
+        }
+
+        _ => {
+            return (err $"Service enable not supported on ($current_os)")
+        }
+    }
+
+    if $result.exit_code == 0 {
+        ok $"Service ($service) enabled"
+    } else {
+        err $"Failed to enable ($service): ($result.stderr)"
+    }
+}
+
+# Disable service (prevent start on boot)
+export def disable [service: string] {
+    # GUARD: Validate input
+    if ($service | is-empty) {
+        return (err "service name cannot be empty")
+    }
+
+    let
current_os = (os_detect detect) + + let result = match $current_os { + "debian" | "ubuntu" => { + (^systemctl disable $service 2>&1 | complete) + } + + "nixos" => { + err "Use NixOS modules to disable services" + } + + "alpine" => { + (^rc-update del $service default 2>&1 | complete) + } + + "macos" => { + (^launchctl unload $"/Library/LaunchDaemons/($service).plist" 2>&1 | complete) + } + + _ => { + return (err $"Service disable not supported on ($current_os)") + } + } + + if $result.exit_code == 0 { + ok $"Service ($service) disabled" + } else { + err $"Failed to disable ($service): ($result.stderr)" + } +} + +# Start service (immediately) +export def start [service: string] { + # GUARD: Validate input + if ($service | is-empty) { + return (err "service name cannot be empty") + } + + let current_os = (os_detect detect) + + let result = match $current_os { + "debian" | "ubuntu" => { + (^systemctl start $service 2>&1 | complete) + } + + "nixos" => { + (^systemctl start $service 2>&1 | complete) + } + + "alpine" => { + (^rc-service $service start 2>&1 | complete) + } + + "macos" => { + (^launchctl start $service 2>&1 | complete) + } + + _ => { + return (err $"Service start not supported on ($current_os)") + } + } + + if $result.exit_code == 0 { + ok $"Service ($service) started" + } else { + err $"Failed to start ($service): ($result.stderr)" + } +} + +# Stop service +export def stop [service: string] { + # GUARD: Validate input + if ($service | is-empty) { + return (err "service name cannot be empty") + } + + let current_os = (os_detect detect) + + let result = match $current_os { + "debian" | "ubuntu" => { + (^systemctl stop $service 2>&1 | complete) + } + + "nixos" => { + (^systemctl stop $service 2>&1 | complete) + } + + "alpine" => { + (^rc-service $service stop 2>&1 | complete) + } + + "macos" => { + (^launchctl stop $service 2>&1 | complete) + } + + _ => { + return (err $"Service stop not supported on ($current_os)") + } + } + + if $result.exit_code == 0 { + 
ok $"Service ($service) stopped" + } else { + err $"Failed to stop ($service): ($result.stderr)" + } +} + +# Restart service +export def restart [service: string] { + # GUARD: Validate input + if ($service | is-empty) { + return (err "service name cannot be empty") + } + + let current_os = (os_detect detect) + + let result = match $current_os { + "debian" | "ubuntu" => { + (^systemctl restart $service 2>&1 | complete) + } + + "nixos" => { + (^systemctl restart $service 2>&1 | complete) + } + + "alpine" => { + (^rc-service $service restart 2>&1 | complete) + } + + "macos" => { + (^launchctl stop $service 2>&1 | complete) + (^launchctl start $service 2>&1 | complete) + } + + _ => { + return (err $"Service restart not supported on ($current_os)") + } + } + + if $result.exit_code == 0 { + ok $"Service ($service) restarted" + } else { + err $"Failed to restart ($service): ($result.stderr)" + } +} + +# Get service status +export def status [service: string] { + # GUARD: Validate input + if ($service | is-empty) { + return (err "service name cannot be empty") + } + + let current_os = (os_detect detect) + + let status_output = match $current_os { + "debian" | "ubuntu" => { + let status_result = (^systemctl status $service 2>&1 | complete) + $status_result.stdout + } + + "nixos" => { + let status_result = (^systemctl status $service 2>&1 | complete) + $status_result.stdout + } + + "alpine" => { + let status_result = (^rc-service $service status 2>&1 | complete) + $status_result.stdout + } + + "macos" => { + let status_result = (^launchctl list $service 2>&1 | complete) + $status_result.stdout + } + + _ => { + return (err $"Service status not supported on ($current_os)") + } + } + + ok ($status_output | str trim) +} + +# Check if service is running +export def is-running [service: string] { + # GUARD: Validate input + if ($service | is-empty) { + return (err "service name cannot be empty") + } + + let current_os = (os_detect detect) + + let is_running = match $current_os { + 
"debian" | "ubuntu" => { + (^systemctl is-active $service 2>&1 | complete).exit_code == 0 + } + + "nixos" => { + (^systemctl is-active $service 2>&1 | complete).exit_code == 0 + } + + "alpine" => { + (^rc-service $service status 2>&1 | complete).exit_code == 0 + } + + "macos" => { + let list_result = (^launchctl list $service 2>&1 | complete) + $list_result.exit_code == 0 + } + + _ => { + return (err $"Service check not supported on ($current_os)") + } + } + + ok $is_running +} + +# Reload service configuration (for services that support it) +export def reload [service: string] { + # GUARD: Validate input + if ($service | is-empty) { + return (err "service name cannot be empty") + } + + let current_os = (os_detect detect) + + let result = match $current_os { + "debian" | "ubuntu" => { + (^systemctl reload $service 2>&1 | complete) + } + + "nixos" => { + (^systemctl reload $service 2>&1 | complete) + } + + "alpine" => { + # Alpine doesn't have reload, use restart + (^rc-service $service restart 2>&1 | complete) + } + + "macos" => { + # macOS doesn't have reload, use restart + (^launchctl stop $service 2>&1 | complete) + (^launchctl start $service 2>&1 | complete) + } + + _ => { + return (err $"Service reload not supported on ($current_os)") + } + } + + if $result.exit_code == 0 { + ok $"Service ($service) reloaded" + } else { + err $"Failed to reload ($service): ($result.stderr)" + } +} + +# Main: Show usage +export def main [] { + print "Service management abstraction library" + print "" + print "Usage:" + print " svc enable " + print " svc disable " + print " svc start " + print " svc stop " + print " svc restart " + print " svc reload " + print " svc status " + print " svc is-running " + print "" + print "Supported OS: debian, ubuntu, alpine, nixos, macos" + print "" + print "Note: NixOS services are defined via configuration.nix, not imperatively" +} diff --git a/scripts/local-install.sh b/scripts/local-install.sh new file mode 100755 index 0000000..0d7543a --- 
/dev/null
+++ b/scripts/local-install.sh
@@ -0,0 +1,10 @@
+#!/bin/bash
+# Build the release distribution and install it locally.
+# Requires PROVISIONING to point at the provisioning repository root.
+
+set -euo pipefail
+
+# Abort with a clear message when PROVISIONING is unset or empty (a bare
+# `cd` would silently switch to $HOME); quote it so paths with spaces work.
+cd "${PROVISIONING:?PROVISIONING must be set to the provisioning repo root}"
+just distro-build-release && just distro-install
diff --git a/scripts/migrate-to-target-configs.nu b/scripts/migrate-to-target-configs.nu
index 0c0a4d3..f314238 100755
--- a/scripts/migrate-to-target-configs.nu
+++ b/scripts/migrate-to-target-configs.nu
@@ -8,7 +8,7 @@
 # 4. Migrates provider settings
 # 5. Creates user context
 
-use ../core/nulib/lib_provisioning *
+use ../core/nulib/domain *
 
 def main [
     --workspace-name: string = "default" # Name for new workspace
@@ -27,7 +27,7 @@ def main [
     # 1. Detect old system
     print "Step 1: Detecting old configuration..."
 
-    let old_config_path = "/Users/Akasha/project-provisioning/provisioning/config/config.defaults.toml"
+    let old_config_path = ($env.PROVISIONING | path join "config/config.defaults.toml")
 
     if not ($old_config_path | path exists) {
         print "✅ No old config found. System may already be migrated."
@@ -140,7 +140,7 @@ def main [
     for provider in $providers {
         print $" • Migrating ($provider)..."
- let template_file = $"/Users/Akasha/project-provisioning/provisioning/extensions/providers/($provider)/config.defaults.toml" + let template_file = ($env.PROVISIONING | path join $"extensions/providers/($provider)/config.defaults.toml") if not ($template_file | path exists) { print $" ⚠️ Template not found, skipping" diff --git a/scripts/nixos/generate-hetzner-nixos-flake.nu b/scripts/nixos/generate-hetzner-nixos-flake.nu new file mode 100644 index 0000000..2d40ed7 --- /dev/null +++ b/scripts/nixos/generate-hetzner-nixos-flake.nu @@ -0,0 +1,202 @@ +#!/usr/bin/env nu +# Generate NixOS flake for Hetzner server from provisioning config +# Combines workspace Nickel config + os-nixos taskserv to generate target flake + +use std + +# Validate input files +def validate-inputs [servers_ncl: string, output_dir: string] { + if not ($servers_ncl | path exists) { + error make {msg: $"servers.ncl not found: ($servers_ncl)"} + } + + if not ($output_dir | path exists) { + mkdir $output_dir + } +} + +# Export Nickel servers config to JSON +def export-servers-config [servers_ncl: string] { + let export_result = (nickel export $servers_ncl 2>&1 | complete) + + if $export_result.exit_code != 0 { + error make {msg: $"nickel export failed: ($export_result.stderr)"} + } + + $export_result.stdout | from json +} + +# Generate flake.nix from server config +def generate-flake-nix [server: record, taskservs_dir: string, output_path: string] { + let hostname = $server.hostname + let server_type = $server.server_type + let os_type = ($server.os_type? 
| default "debian")
+    let system = if ($server_type | str contains "cax") { "aarch64-linux" } else { "x86_64-linux" }  # decided here: parentheses inside the template would be evaluated by Nushell
+    let flake_content = $"
+{
+  description = \"NixOS configuration for Hetzner server ($hostname)\";
+
+  inputs = {
+    nixpkgs.url = \"github:NixOS/nixpkgs/nixos-24.11\";
+    nixos-anywhere = {
+      url = \"github:nix-community/nixos-anywhere\";
+      inputs.nixpkgs.follows = \"nixpkgs\";
+    };
+    provisioning-os-nixos = {
+      url = \"path:($taskservs_dir)/infrastructure/os-nixos\";
+    };
+  };
+
+  outputs = { self, nixpkgs, nixos-anywhere, provisioning-os-nixos }: {
+    nixosConfigurations.default = nixpkgs.lib.nixosSystem {
+      system = \"($system)\";
+
+      modules = [
+        provisioning-os-nixos.nixosModules.default
+        provisioning-os-nixos.nixosModules.hetzner-($server_type)
+        ./hardware-configuration.nix
+        ./configuration.nix
+      ];
+    };
+  };
+}
+"
+
+    $flake_content | save --force $output_path
+    print $"Generated flake.nix at ($output_path)"
+}
+
+# Generate hardware-configuration.nix based on server type
+def generate-hardware-config [server: record, output_path: string] {
+    let server_type = $server.server_type
+    let arch = if ($server_type | str contains "cax") { "aarch64" } else { "x86_64" }
+
+    let hardware_config = $"
+# Auto-generated hardware configuration for Hetzner ($server_type)
+{ config, lib, pkgs, ...
}: { + imports = [ + + + ]; + + boot.initrd.availableKernelModules = [ \"ata_piix\" \"uhci_hcd\" \"ahci\" \"virtio_pci\" \"virtio_blk\" ]; + boot.kernelModules = [ ]; + boot.extraModulePackages = [ ]; + + fileSystems.\"/\" = { + device = \"/dev/sda1\"; + fsType = \"ext4\"; + }; + + swapDevices = [ ]; + + hardware.cpu.intel.updateMicrocode = lib.mkDefault (pkgs.stdenv.hostPlatform.isx86_64); + system.stateVersion = \"24.11\"; +} +" + + $hardware_config | save --force $output_path + print $"Generated hardware-configuration.nix at ($output_path)" +} + +# Generate base configuration.nix +def generate-configuration [server: record, output_path: string] { + let hostname = $server.hostname + let domain = "librecloud.online" + + let config = $" +# NixOS configuration for ($hostname) +{ config, lib, pkgs, ... }: { + networking.hostName = \"$hostname\"; + networking.domain = \"$domain\"; + networking.fqdn = \"$hostname.$domain\"; + + time.timeZone = \"UTC\"; + + i18n.defaultLocale = \"en_US.UTF-8\"; + + services.openssh = { + enable = true; + settings = { + PasswordAuthentication = false; + PubkeyAuthentication = true; + }; + }; + + users.users.devadm = { + isNormalUser = true; + home = \"/home/devadm\"; + group = \"devadm\"; + groups = [ \"wheel\" ]; + shell = pkgs.bash; + openssh.authorizedKeys.keys = [ ]; + }; + + users.groups.devadm = { }; + + security.sudo.wheelNeedsPassword = false; + + system.activationScripts.mkdir-provisioning = '' + mkdir -p /nix/provisioning + chown devadm:devadm /nix/provisioning + ''; + + nix.settings.experimental-features = [ \"nix-command\" \"flakes\" ]; + + system.stateVersion = \"24.11\"; +} +" + + $config | save --force $output_path + print $"Generated configuration.nix at ($output_path)" +} + +# Main function +export def main [ + servers_ncl: string # Path to servers.ncl + --output-dir: string # Output directory for flakes + --taskservs-dir: string # Path to provisioning/extensions/taskservs (for imports) + --filter: string # Filter 
servers by hostname pattern +] { + let output_base = ($output_dir? // "./nixos") + let taskservs_base = ($taskservs_dir? // "./provisioning/extensions/taskservs") + + validate-inputs $servers_ncl $output_base + + let servers_config = (export-servers-config $servers_ncl) + let servers_list = if ($servers_config | has "servers") { + $servers_config.servers + } else { + [$servers_config] + } + + print $"Generating flakes for ($servers_list | length) server(s)..." + + for server in $servers_list { + let hostname = $server.hostname + let os_type = ($server.os_type? // "debian") + + # Skip Debian servers + if $os_type != "nixos" { + print $"⊘ ($hostname): os_type is ($os_type), skipping (not NixOS)" + continue + } + + # Apply filter if specified + if ($filter != null) and not ($hostname | str contains $filter) { + print $"⊘ ($hostname): filtered out" + continue + } + + let server_dir = $"($output_base)/($hostname)" + mkdir $server_dir + + generate-flake-nix $server $taskservs_base $"($server_dir)/flake.nix" + generate-hardware-config $server $"($server_dir)/hardware-configuration.nix" + generate-configuration $server $"($server_dir)/configuration.nix" + + print $"✓ ($hostname): flake generated at ($server_dir)" + } + + print "All flakes generated successfully" +} diff --git a/scripts/nixos/hetzner-nixos-anywhere.nu b/scripts/nixos/hetzner-nixos-anywhere.nu new file mode 100644 index 0000000..f239a28 --- /dev/null +++ b/scripts/nixos/hetzner-nixos-anywhere.nu @@ -0,0 +1,116 @@ +#!/usr/bin/env nu +# Provision Hetzner server with NixOS using nixos-anywhere +# Converts existing Debian/Ubuntu server to NixOS via nixos-anywhere + +use std + +# Configuration contracts +def validate-inputs [server_ip: string, flake_path: string] { + if ($server_ip | is-empty) { error make {msg: "server_ip is required"} } + if ($flake_path | is-empty) { error make {msg: "flake_path is required"} } + + if not ($flake_path | path exists) { + error make {msg: $"flake_path does not exist: 
($flake_path)"} + } + + if not ($flake_path | path join "flake.nix" | path exists) { + error make {msg: $"flake.nix not found in ($flake_path)"} + } +} + +# Check if nixos-anywhere is installed +def check-nixos-anywhere [] { + if (which nixos-anywhere | is-empty) { + error make {msg: "nixos-anywhere not found. Install with: nix-shell -p nixos-anywhere"} + } +} + +# Validate SSH connectivity +def validate-ssh [target_host: string] { + let ssh_check = (ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=accept-new $target_host "echo OK" 2>&1 | complete) + + if $ssh_check.exit_code != 0 { + error make {msg: $"SSH connection failed to ($target_host). Output: ($ssh_check.stderr)"} + } +} + +# Execute nixos-anywhere provisioning +def provision-with-nixos-anywhere [target_host: string, flake_path: string, --no-reboot: bool] { + let flake_uri = $"git+file://($flake_path)#nixosConfigurations.default" + + print $"Provisioning ($target_host) with NixOS from ($flake_path)" + print $"Using flake URI: ($flake_uri)" + + let cmd = if $no_reboot { + ["nixos-anywhere" "--no-reboot" "--flake" $flake_uri $target_host] + } else { + ["nixos-anywhere" "--flake" $flake_uri $target_host] + } + + let result = (^$cmd[0] ...$cmd[1..] 2>&1 | complete) + + if $result.exit_code != 0 { + error make {msg: $"nixos-anywhere failed with exit code ($result.exit_code). Output: ($result.stderr)"} + } + + $result.stdout +} + +# Validate NixOS boot (post-deployment) +def validate-nixos-boot [target_host: string] { + print $"Validating NixOS boot on ($target_host)..." 
+
+    let boot_check = (ssh -o ConnectTimeout=10 $target_host "uname -s" | complete)
+
+    if $boot_check.exit_code != 0 {
+        error make {msg: $"Failed to reach ($target_host) after provisioning"}
+    }
+
+    if not ($boot_check.stdout | str contains "Linux") {
+        error make {msg: "System did not boot to Linux"}
+    }
+}
+
+# Main provisioning function
+export def main [
+    server_ip: string # Hetzner server IP (e.g., 192.0.2.1)
+    flake_path: string # Path to flake.nix directory
+    --ssh-user: string = "root" # SSH user (default: root)
+    --no-reboot # Don't reboot after provisioning
+    --skip-validation # Skip pre-flight checks
+] {
+    if not $skip_validation {
+        validate-inputs $server_ip $flake_path
+        check-nixos-anywhere
+    }
+
+    let target_host = $"($ssh_user)@($server_ip)"
+
+    if not $skip_validation {
+        validate-ssh $target_host
+    }
+
+    # Execute provisioning
+    let provision_output = (provision-with-nixos-anywhere $target_host $flake_path --no-reboot=$no_reboot)
+
+    print $"Provisioning complete. Output:\n($provision_output)"
+
+    # Wait for system to come back online (if not --no-reboot)
+    if not $no_reboot {
+        print "Waiting for system to reboot..."
+        sleep 30sec
+
+        mut last_error = "boot not checked"
+        for attempt in 1..12 {
+            # Empty string means validate-nixos-boot did not raise
+            $last_error = (try { validate-nixos-boot $target_host; "" } catch {|e| $e.msg })
+            if ($last_error | is-empty) {
+                print "✅ NixOS boot validated successfully"
+                break
+            }
+            print $"⏳ Boot check ($attempt)/12 - ($last_error)"
+            sleep 10sec
+        }
+        if not ($last_error | is-empty) { error make {msg: $"NixOS boot validation failed on ($target_host): ($last_error)"} }
+    }
+}
diff --git a/scripts/ontoref b/scripts/ontoref
new file mode 100755
index 0000000..8c0050b
--- /dev/null
+++ b/scripts/ontoref
@@ -0,0 +1,45 @@
+#!/bin/bash
+# scripts/ontoref — thin wrapper for projects consuming the ontoref protocol
+# Set ONTOREF_ROOT to the ontoref checkout, then delegate to its entry point.
+#
+# Usage: ./scripts/ontoref [args...]
+# Alias: Add `alias ontoref="./scripts/ontoref"` to your shell profile.
+# +# Required env vars (exported here): +# ONTOREF_ROOT — absolute path to ontoref checkout +# ONTOREF_PROJECT_ROOT — absolute path to THIS project root + +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +readonly SCRIPT_DIR +PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" +readonly PROJECT_ROOT + +# ── Ontoref root ────────────────────────────────────────────────────────────── +# Point to your ontoref checkout. Can be overridden via env var. + +ONTOREF_ROOT="${ONTOREF_ROOT:-/Users/Akasha/Development/ontoref}" +readonly ONTOREF_ROOT + +if [[ ! -f "${ONTOREF_ROOT}/ontoref" ]]; then + echo "ontoref: cannot find ontoref entry point at ${ONTOREF_ROOT}/ontoref" + echo " Set ONTOREF_ROOT to the correct path or update this script." + exit 1 +fi + +# ── Export project context ──────────────────────────────────────────────────── + +export ONTOREF_ROOT +export ONTOREF_PROJECT_ROOT="${PROJECT_ROOT}" + +# Prepend project-specific paths so they take priority over any inherited +# NICKEL_IMPORT_PATH. Existing value is preserved as a fallback at the end. +_project_paths="${PROJECT_ROOT}:${PROJECT_ROOT}/.ontology:${PROJECT_ROOT}/adrs:${ONTOREF_ROOT}/adrs:${ONTOREF_ROOT}/ontology/schemas:${ONTOREF_ROOT}" +export NICKEL_IMPORT_PATH="${_project_paths}${NICKEL_IMPORT_PATH:+:${NICKEL_IMPORT_PATH}}" +unset _project_paths + +# Preserve caller name for dispatcher help output +export ONTOREF_CALLER="${ONTOREF_CALLER:-./scripts/ontoref}" + +exec "${ONTOREF_ROOT}/ontoref" "$@" diff --git a/scripts/platform-generate-manifests.nu b/scripts/platform-generate-manifests.nu new file mode 100755 index 0000000..3fe301a --- /dev/null +++ b/scripts/platform-generate-manifests.nu @@ -0,0 +1,388 @@ +#!/usr/bin/env nu + +use std log + +# Helper: Determine platform configuration base directory +# Uses PROVISIONING_USER_PLATFORM env var if set, otherwise defaults to OS-specific path +def get-platform-base-dir [] { + let env_path = ($env.PROVISIONING_USER_PLATFORM? 
| default "") + + if ($env_path | is-empty) { + # Fallback to OS-specific default location + let home = $nu.home-dir + let os = $nu.os-info.name + + if $os == "macos" { + $"($home)/Library/Application Support/provisioning/platform" + } else { + # Linux and other Unix-like systems + $"($home)/.config/provisioning/platform" + } + } else { + $env_path + } +} + +# Display help information +def show-help [] { + let home = $nu.home-dir + let os = $nu.os-info.name + let macos_config = $"($home)/Library/Application Support/provisioning/platform" + let linux_config = $"($home)/.config/provisioning/platform" + let default_config = if $os == "macos" { $macos_config } else { $linux_config } + + print "╭─ PROVISIONING MANIFEST GENERATOR ──────────────────────────────────────" + print "│" + print "│ Generate Kubernetes and Docker Compose manifests from Nickel templates" + print "│" + print "├─ PLATFORM-SPECIFIC PATHS ─────────────────────────────────────────────" + print "│" + print "│ macOS:" + print "│ Config Source: ~/Library/Application Support/provisioning/platform/config/" + print "│ Output Directory: ~/Library/Application Support/provisioning/platform/" + print "│" + print "│ Linux:" + print "│ Config Source: ~/.config/provisioning/platform/config/" + print "│ Output Directory: ~/.config/provisioning/platform/" + print "│" + print "├─ USAGE ───────────────────────────────────────────────────────────────" + print "│" + print "│ Get help:" + print "│ nu scripts/platform_generate-manifests.nu help" + print "│ nu scripts/platform_generate-manifests.nu -h (shows Nushell built-in help)" + print "│" + print "│ Generate both Docker and Kubernetes (default):" + print "│ nu scripts/platform_generate-manifests.nu" + print "│" + print "│ Generate specific target:" + print "│ nu scripts/platform_generate-manifests.nu docker # Docker Compose only" + print "│ nu scripts/platform_generate-manifests.nu kubernetes # Kubernetes only" + print "│ nu scripts/platform_generate-manifests.nu all 
# Both (same as no args)" + print "│" + print "│ With validation:" + print "│ nu scripts/platform_generate-manifests.nu docker --validate" + print "│ nu scripts/platform_generate-manifests.nu all --validate" + print "│" + print "│ Custom paths:" + print "│ nu scripts/platform_generate-manifests.nu kubernetes \\" + print "│ --config-dir /path/to/configs \\" + print "│ --output-dir /path/to/output" + print "│" + print "├─ OPTIONS ─────────────────────────────────────────────────────────────" + print "│" + print "│ TARGET (optional):" + print "│ docker Generate Docker Compose manifests only" + print "│ kubernetes (or k8s) Generate Kubernetes manifests only" + print "│ all Generate both Docker and Kubernetes" + print "│ (none) Generate both (default behavior)" + print "│" + print "│ FLAGS:" + print "│ help Show this help message" + print "│ -h Show Nushell built-in help" + print "│ --validate Validate generated manifests (Docker, Kubernetes)" + print "│ --config-dir Source directory for user configs" + print $"│ Default: ($default_config)/config/" + print "│" + print "│ --output-dir Destination directory for manifests" + print $"│ Default: ($default_config)/" + print "│" + print "├─ ENVIRONMENT VARIABLES ────────────────────────────────────────────────" + print "│" + print "│ PROVISIONING Provisioning repository root" + print "│ Default: Current working directory" + print "│" + print "├─ OUTPUT ──────────────────────────────────────────────────────────────" + print "│" + print "│ Generated Files:" + print "│ • docker-compose.yml Docker Compose stack configuration" + print "│ • k8s/ Kubernetes manifests directory" + print "│ ├─ namespace.yaml" + print "│ ├─ resource-quota.yaml" + print "│ ├─ rbac.yaml" + print "│ ├─ network-policy.yaml" + print "│ ├─ orchestrator-*.yaml" + print "│ ├─ control-center-*.yaml" + print "│ ├─ mcp-server-*.yaml" + print "│ └─ platform-ingress.yaml" + print "│" + print "├─ EXAMPLES ─────────────────────────────────────────────────────────────" 
+ print "│" + print "│ 1. Generate both Docker and Kubernetes (default):" + print "│ nu scripts/platform_generate-manifests.nu" + print "│" + print "│ 2. Generate Docker Compose only:" + print "│ nu scripts/platform_generate-manifests.nu docker" + print "│" + print "│ 3. Generate Kubernetes manifests only:" + print "│ nu scripts/platform_generate-manifests.nu kubernetes" + print "│" + print "│ 4. Generate both with validation:" + print "│ nu scripts/platform_generate-manifests.nu all --validate" + print "│" + print "│ 5. Docker Compose with custom output directory:" + print "│ nu scripts/platform_generate-manifests.nu docker \\" + print "│ --output-dir /tmp/manifests" + print "│" + print "│ 6. Kubernetes with custom config and output directories:" + print "│ nu scripts/platform_generate-manifests.nu kubernetes \\" + print "│ --config-dir ~/.provisioning-custom/config \\" + print "│ --output-dir /tmp/k8s-manifests \\" + print "│ --validate" + print "│" + print "├─ REQUIREMENTS ─────────────────────────────────────────────────────────" + print "│" + print "│ • Nushell 0.110.0+" + print "│ • Nickel CLI" + print "│ • PROVISIONING environment variable or current directory as repo root" + print "│" + print "│ Optional (for validation):" + print "│ • Docker and Docker Compose (for docker-compose validation)" + print "│ • kubectl (for Kubernetes validation)" + print "│" + print "├─ TROUBLESHOOTING ──────────────────────────────────────────────────────" + print "│" + print "│ Config directory not found?" + print "│ → Check that you have user configs in the default location" + print "│ → Or use --config-dir to specify a custom path" + print "│" + print "│ Docker/Kubernetes validation failing?" + print "│ → Ensure Docker and kubectl are installed" + print "│ → Run without --validate flag to skip validation" + print "│" + print "│ Template rendering errors?" 
# Troubleshooting footer of the help banner (text unchanged).
    print "│ → Verify PROVISIONING environment variable is set:"
    print "│ export PROVISIONING=/path/to/provisioning"
    print "│ → Check that templates exist in:"
    print "│ \$PROVISIONING/schemas/platform/templates/"
    print "│"
    print "╰────────────────────────────────────────────────────────────────────────"
}

# Render platform manifests from the Nickel templates.
#
# target:       "docker", "kubernetes" (alias "k8s"), or "all"; a missing/empty value means "all".
# --validate:   validate the generated manifests after rendering.
# --output-dir: where manifests are written (falls back to get-platform-base-dir).
# --config-dir: base directory whose config/ subdirectory holds the user *.ncl files
#               (falls back to get-platform-base-dir).
#
# Raises "Invalid target specified" for any other target value.
def generate-manifests [
    target?: string        # What to generate: "docker" (docker-compose only), "kubernetes" (k8s only), or "all" (both)
    --validate = false     # Validate generated manifests
    --output-dir: string   # Output directory for manifests
    --config-dir: string   # Config directory with user service configs
] {
    # Use PROVISIONING environment variable for repo root, fallback to current directory
    let provisioning_root = $env.PROVISIONING? | default (pwd)

    # Determine output directory (where manifests are written)
    let platform_output_dir = if ($output_dir == null or ($output_dir | is-empty)) {
        get-platform-base-dir
    } else {
        $output_dir
    }

    # Determine config directory (where user configs are read from)
    let platform_config_dir = if ($config_dir == null or ($config_dir | is-empty)) {
        get-platform-base-dir
    } else {
        $config_dir
    }

    # Templates are now in schemas/platform/templates/
    let templates_dir = $"($provisioning_root)/schemas/platform/templates"
    # NICKEL_IMPORT_PATH: provisioning root first (for schemas), then user config directory
    let nickel_import_path = $"($provisioning_root):($platform_config_dir)/config"

    # Determine what to generate
    let generate_target = if ($target == null or ($target | is-empty)) {
        # No argument provided - default to generating both
        "all"
    } else {
        # Validate provided argument (case-insensitive, "k8s" is an alias)
        let t = ($target | str downcase)
        if ($t == "docker" or $t == "k8s" or $t == "kubernetes" or $t == "all") {
            if $t == "k8s" { "kubernetes" } else { $t }
        } else {
            # Interpolated so the offending value actually appears in the log.
            log error $"Invalid target: ($target)"
            log error "Valid options: docker, kubernetes (or k8s), all"
            error make { msg: "Invalid target specified" }
        }
    }

    log info "🔧 Generating manifests from Nickel templates..."
    log info $" Target: ($generate_target)"
    log info $" Provisioning repo: ($provisioning_root)"
    log info $" Config source: ($platform_config_dir)/config/"
    log info $" Output destination: ($platform_output_dir)"

    # Create output directory if it doesn't exist
    if not ($platform_output_dir | path exists) {
        ^mkdir -p $platform_output_dir
        log info $"📁 Created output directory: ($platform_output_dir)"
    }

    # Check for user config files to inform build context.
    # The directory that is scanned (and named in the warning) is <config-dir>/config,
    # so that is the path whose existence is tested.
    let user_config_dir = $"($platform_config_dir)/config"
    if ($user_config_dir | path exists) {
        # `complete` already captures stderr; POSIX `2>/dev/null` is not Nushell redirection syntax.
        let find_result = (do { ^find $user_config_dir -name "*.ncl" -type f } | complete)
        let config_files = ($find_result.stdout | lines | where {|line| $line != "" })
        let config_count = ($config_files | length)
        if $config_count > 0 {
            log info $"📋 Found ($config_count) service config files:"
            for f in $config_files {
                let relative = ($f | str replace $"($platform_config_dir)/" "")
                log info $" • ($relative)"
            }
        }
    } else {
        log warning $"⚠️ Config directory not found: ($platform_config_dir)/config"
        log info " Manifests will be generated from templates only"
    }

    # Generate docker-compose (if requested)
    if ($generate_target == "docker" or $generate_target == "all") {
        log info "📦 Generating docker-compose.yml..."
+ + let template = "schemas/platform/templates/docker-compose/platform-stack.solo.yml.ncl" + let nickel_path = $"($platform_config_dir)/config:($provisioning_root):($provisioning_root)/schemas/platform" + + let dc_result = (do { + with-env { NICKEL_IMPORT_PATH: $nickel_path } { + cd $provisioning_root + ^nickel export --format yaml $template + } + } | complete) + + if $dc_result.exit_code == 0 { + $dc_result.stdout | save --force $"($platform_output_dir)/docker-compose.yml" + log info "✅ docker-compose.yml generated" + } else { + log error $"Failed to generate docker-compose.yml: ($dc_result.stderr)" + error make { msg: "Docker Compose generation failed" } + } + } + + # Generate Kubernetes manifests (if requested) + if ($generate_target == "kubernetes" or $generate_target == "all") { + let k8s_dir = $"($platform_output_dir)/k8s" + if not ($k8s_dir | path exists) { + ^mkdir -p $k8s_dir + } + + # Kubernetes templates available in new location + let k8s_templates = [ + "namespace", + "resource-quota", + "rbac", + "network-policy", + "orchestrator-deployment", + "orchestrator-service", + "control-center-deployment", + "control-center-service", + "mcp-server-deployment", + "mcp-server-service", + "platform-ingress", + ] + + let nickel_path = $"($platform_config_dir)/config:($provisioning_root):($provisioning_root)/schemas/platform" + + for template in $k8s_templates { + log info $"☸️ Generating ($template).yaml..." 
+ let template_path = $"schemas/platform/templates/kubernetes/($template).yaml.ncl" + + let k8s_result = (do { + with-env { NICKEL_IMPORT_PATH: $nickel_path } { + cd $provisioning_root + ^nickel export --format yaml $template_path + } + } | complete) + + if $k8s_result.exit_code == 0 { + $k8s_result.stdout | save --force $"($k8s_dir)/($template).yaml" + log info $"✅ ($template).yaml generated" + } else { + log error $"Failed to generate ($template).yaml: ($k8s_result.stderr)" + if not $validate { + # Only fail on missing templates if not in validate-only mode + continue + } + } + } + } + + if $validate { + log info "🔍 Validating manifests..." + if ($generate_target == "docker" or $generate_target == "all") { + validate-compose $"($platform_output_dir)/docker-compose.yml" + } + if ($generate_target == "kubernetes" or $generate_target == "all") { + let k8s_dir = $"($platform_output_dir)/k8s" + validate-kubernetes $k8s_dir + } + } + + log info "✨ Generation complete!" + log info $"📂 Output: ($platform_output_dir)" +} + +def validate-compose [path: string] { + log info "Validating Docker Compose..." 
+ + let docker_check = (do { ^docker --version } | complete) + if $docker_check.exit_code != 0 { + log warning "⚠️ Docker not found - skipping validation" + return + } + + let compose_check = (do { ^docker compose version } | complete) + if $compose_check.exit_code != 0 { + log warning "⚠️ Docker Compose not available - skipping validation" + return + } + + let result = (do { ^docker compose -f $path config } | complete) + + if $result.exit_code == 0 { + let service_count = ($result.stdout | lines | where { $in | str contains "container_name:" } | length) + log info $"✅ docker-compose.yml is valid" + log info $" Services: ($service_count)" + } else { + log error "❌ Docker Compose validation failed" + log error $result.stderr + error make { msg: "Docker Compose validation error" } + } +} + +def validate-kubernetes [k8s_dir: string] { + log info "Validating Kubernetes manifests..." + + let kubectl_check = (do { ^kubectl version --client } | complete) + if $kubectl_check.exit_code != 0 { + log warning "⚠️ kubectl not found - skipping validation" + return + } + + let result = (do { ^kubectl apply --dry-run=client -f $k8s_dir } | complete) + + if $result.exit_code == 0 { + let yaml_files = (do { ^find $k8s_dir -name "*.yaml" -type f } | complete).stdout | lines | where { $in != "" } | length + log info "✅ Kubernetes manifests are valid" + log info $" Manifest files: ($yaml_files)" + } else { + if ($result.stderr | str contains "connection refused") { + log warning "⚠️ Cannot reach Kubernetes cluster - syntax validation only" + } else { + log error "❌ Kubernetes validation failed" + log error $result.stderr + error make { msg: "Kubernetes validation error" } + } + } +} + +# Entry point +def main [ + target?: string # What to generate: 'docker', 'kubernetes' (or 'k8s'), 'all', or 'help' + --validate = false # Validate generated manifests + --output-dir: string # Output directory for manifests + --config-dir: string # Config directory with user service configs +] { + # 
Check if help was requested + if ($target == "help") or ($target == "-h") or ($target == "--h") { + show-help + return + } + + # Delegate to generate-manifests + generate-manifests $target --validate=$validate --output-dir=$output_dir --config-dir=$config_dir +} diff --git a/scripts/regenerate-constraints.sh b/scripts/regenerate-constraints.sh new file mode 100755 index 0000000..1aa98d9 --- /dev/null +++ b/scripts/regenerate-constraints.sh @@ -0,0 +1,30 @@ +#!/bin/bash +# Regenerate Nickel constraints from TOML master file + +set -e + +OUTPUT_FILE="schemas/platform/common/constraints.ncl" + +echo "🔄 Generating constraints from TOML master..." + +# Run Nickel generator and properly unescape output +nickel eval scripts/generate-constraints.ncl \ + | sed 's/^"//;s/"$//' \ + | sed 's/\\n/\n/g' \ + | sed 's/\\"/"/g' \ + > "$OUTPUT_FILE" + +echo "✅ Generated: $OUTPUT_FILE" + +# Validate +if nickel typecheck "$OUTPUT_FILE" > /dev/null 2>&1; then + echo "✅ Validation passed" +else + echo "❌ Validation failed" + nickel typecheck "$OUTPUT_FILE" + exit 1 +fi + +# Count constraints +COUNT=$(grep -c "contract.from_validator" "$OUTPUT_FILE") +echo "📊 Generated $COUNT constraints" diff --git a/scripts/secrets-decrypt.nu b/scripts/secrets-decrypt.nu new file mode 100755 index 0000000..8f7a93a --- /dev/null +++ b/scripts/secrets-decrypt.nu @@ -0,0 +1,112 @@ +#!/usr/bin/env nu +# SOPS decryption with vault-service age keys +# Decrypts SOPS-encrypted files using Age private key from vault-service +# Usage: secrets-decrypt [--environment dev|staging|prod] [--output ] + +use std log + +def get-vault-url [] { + $env.VAULT_SERVICE_URL? // "http://localhost:9094" +} + +def get-vault-token [] { + $env.VAULT_SERVICE_TOKEN? 
| default ""
}

# Retrieve the Age private key for `environment` from vault-service.
#
# Requires VAULT_SERVICE_TOKEN (read through get-vault-token); raises when the token is
# empty or when the HTTP request fails. Returns the `private_key` field of the response.
def fetch-age-private-key [environment: string] {
    let url = $"(get-vault-url)/api/v1/age/get-private?env=($environment)"
    let token = (get-vault-token)

    if ($token | is-empty) {
        error make {
            msg: "VAULT_SERVICE_TOKEN required for private key retrieval"
            label: {
                text: "Set environment variable or pass token"
                span: (metadata $environment).span
            }
        }
    }

    # `http get` is an internal command: it raises on failure and has no exit code,
    # so the failure path is a try/catch rather than `| complete`.
    let response = try {
        http get -H {"X-Vault-Token": $token} $url
    } catch {
        error make {
            msg: "Failed to fetch age private key"
            label: {
                text: "vault-service rejected request (check token and permissions)"
                span: (metadata $environment).span
            }
        }
    }

    # `http get` parses JSON bodies itself when the server labels them as JSON;
    # fall back to explicit parsing for a raw string body.
    let json = if ($response | describe | str starts-with "record") {
        $response
    } else {
        $response | from json
    }
    $json.private_key
}

# Decrypt a SOPS-encrypted YAML file with the Age private key held by vault-service.
#
# file:          path of the encrypted input (must exist).
# --environment: dev | staging | prod (default "dev").
# --output:      destination path; defaults to the input path with '.enc' removed and '.dec' appended.
def main [
    file: string
    --environment: string = "dev"
    --output: string = ""
] {
    let input_path = if ($file | path exists) {
        $file
    } else {
        error make {
            msg: "File not found"
            label: {
                text: $file
                span: (metadata $file).span
            }
        }
    }

    # Validate environment (the error points at the offending --environment value)
    if $environment not-in ["dev", "staging", "prod"] {
        error make {
            msg: "Invalid environment"
            label: {
                text: "Must be: dev, staging, or prod"
                span: (metadata $environment).span
            }
        }
    }

    let output_path = if ($output | is-empty) {
        let base = ($input_path | str replace '.enc' '')
        $"($base).dec"
    } else {
        $output
    }

    print $"SENSITIVE: Fetching Age private key for ($environment) from vault-service..."
    let privkey = (fetch-age-private-key $environment)
    print "✓ Age private key retrieved (check audit logs for access tracking)"

    print $"Decrypting ($input_path) with SOPS..."
+ let result = ( + with-env {"SOPS_AGE_KEY": $privkey} { + ^sops --decrypt --input-type "yaml" --output-type "yaml" --output $output_path $input_path + | complete + } + ) + + if $result.exit_code != 0 { + let stderr = $result.stderr + error make { + msg: "SOPS decryption failed" + label: { + text: $stderr + span: (metadata $file).span + } + } + } + + print $"✓ Decrypted file: ($output_path)" + { + success: true + input: $input_path + output: $output_path + environment: $environment + message: "Private key was used from vault-service and is NOT stored locally" + } +} diff --git a/scripts/secrets-encrypt.nu b/scripts/secrets-encrypt.nu new file mode 100755 index 0000000..1ed8c3d --- /dev/null +++ b/scripts/secrets-encrypt.nu @@ -0,0 +1,105 @@ +#!/usr/bin/env nu +# SOPS encryption with vault-service age keys +# Encrypts configuration files using Age public key from vault-service +# Usage: secrets-encrypt [--environment dev|staging|prod] [--output ] + +use std log + +def get-vault-url [] { + $env.VAULT_SERVICE_URL? // "http://localhost:9094" +} + +def get-vault-token [] { + $env.VAULT_SERVICE_TOKEN? 
// "" +} + +def fetch-age-public-key [environment: string] { + let url = $"(get-vault-url)/api/v1/age/get-public?env=($environment)" + let token = (get-vault-token) + + let headers = if ($token | is-empty) { + {} + } else { + {"X-Vault-Token": $token} + } + + let response = (http get -H $headers $url | complete) + + if $response.exit_code != 0 { + error make { + msg: "Cannot connect to vault-service" + label: { + text: $"Check VAULT_SERVICE_URL (set to: (get-vault-url))" + span: (metadata $environment).span + } + } + } + + let json = ($response.stdout | from json) + $json.public_key +} + +def main [ + file: string + --environment: string = "dev" + --output: string = "" +] { + let input_path = if ($file | path exists) { + $file + } else { + error make { + msg: "File not found" + label: { + text: $file + span: (metadata $file).span + } + } + } + + # Validate environment + if $environment not-in ["dev", "staging", "prod"] { + error make { + msg: "Invalid environment" + label: { + text: "Must be: dev, staging, or prod" + span: (metadata $file).span + } + } + } + + let output_path = if ($output | is-empty) { + $"($input_path).enc" + } else { + $output + } + + print $"Fetching Age public key for ($environment)..." + let pubkey = (fetch-age-public-key $environment) + print $"✓ Age public key: ($pubkey | str substring 0..20)..." + + print $"Encrypting ($input_path) with SOPS..." 
+ let result = ( + ^sops --encrypt --age $pubkey --encrypted-regex '^(password|token|key|secret|api_key)$' --input-type "yaml" --output-type "yaml" --output $output_path $input_path + | complete + ) + + if $result.exit_code != 0 { + let stderr = $result.stderr + error make { + msg: "SOPS encryption failed" + label: { + text: $stderr + span: (metadata $file).span + } + } + } + + print $"✓ Encrypted file: ($output_path)" + { + success: true + input: $input_path + output: $output_path + environment: $environment + public_key: $pubkey + } +} diff --git a/scripts/secrets-manage.nu b/scripts/secrets-manage.nu new file mode 100644 index 0000000..31e0762 --- /dev/null +++ b/scripts/secrets-manage.nu @@ -0,0 +1,265 @@ +#!/usr/bin/env nu +# Unified secrets management orchestrator +# Manages complete SOPS + vault-service workflow for all environments +# Usage: secrets-manage [options] + +use std log + +def get-vault-url [] { + $env.VAULT_SERVICE_URL? // "http://localhost:9094" +} + +def get-vault-token [] { + $env.VAULT_SERVICE_TOKEN? 
| default ""
}

# Probe the vault-service health endpoint (`<vault-url>/health`).
# Returns true on success; raises "Vault service unreachable" when the request fails.
def check-vault-connectivity [] {
    let url = (get-vault-url)
    let health_url = $"($url)/health"

    # `http get` is an internal command: it raises on failure and has no exit code,
    # so the failure path is a try/catch rather than `| complete`.
    try {
        http get $health_url | ignore
    } catch {
        error make {
            msg: "Vault service unreachable"
            label: {
                text: $"Cannot connect to ($health_url)"
            }
        }
    }

    true
}

# Environments this orchestrator accepts.
def list-environments [] {
    ["dev", "staging", "prod"]
}

# Print the vault URL, the environment label, and whether a vault token is configured.
# The token value itself is never printed.
def check-environment-setup [environment: string] {
    let token = (get-vault-token)
    let token_missing = ($token | is-empty)
    if $token_missing {
        # Plain string on purpose: inside $"..." the parenthesised text would be run as a command.
        print "⚠️ VAULT_SERVICE_TOKEN not set (required for operations)"
    }

    let vault_url = (get-vault-url)
    let token_state = if $token_missing { "not set" } else { "set" }
    print $"Environment: ($environment)"
    print $"Vault URL: ($vault_url)"
    print $"Token: ($token_state)"
}

# Show connectivity, the known environments and the current token setup.
# Aborts with the connectivity error when vault-service cannot be reached.
def status [] {
    print ""
    print "====== SOPS + Vault-Service Status ======"
    print ""

    check-vault-connectivity | ignore
    print $"✓ Vault-service connectivity: OK"

    print ""
    print "Available environments:"
    for env_name in (list-environments) {
        print $" • ($env_name)"
    }

    print ""
    print "Setup:"
    check-environment-setup "current"
    print ""
}

# Initialise SOPS for one environment: validate the name, check connectivity,
# then generate the .sops.yaml configuration.
def init [environment: string] {
    if $environment not-in (list-environments) {
        error make {
            msg: "Invalid environment"
            label: {
                text: "Must be: dev, staging, or prod"
            }
        }
    }

    print ""
    print $"====== Initialize SOPS for ($environment) ======"
    print ""

    print "Step 1: Checking vault-service connectivity..."
    check-vault-connectivity | ignore
    print "✓ Vault-service is accessible"

    print ""
    print "Step 2: Generating .sops.yaml configuration..."
+ let sops_init_result = ( + nu provisioning/scripts/sops-init.nu --environment $environment --validate + ) + + if ($sops_init_result.success) { + print $"✓ SOPS configuration: ($sops_init_result.output_path)" + } else { + error make { + msg: "Failed to generate SOPS configuration" + label: { text: $sops_init_result } + } + } + + print "" + print $"====== ($environment) environment initialized ======" + print "" + print "Next steps:" + print " 1. Verify .sops.yaml: cat $sops_init_result.output_path" + print " 2. Encrypt files: provisioning/scripts/secrets-encrypt.nu --environment $environment" + print " 3. Commit to Git: git add config/secrets/$environment/" + print "" +} + +def encrypt [file: string, environment: string] { + if not ($file | path exists) { + error make { + msg: "File not found" + label: { text: $file } + } + } + + print "" + print $"====== Encrypt ($file) for ($environment) ======" + print "" + + let result = ( + nu provisioning/scripts/secrets-encrypt.nu $file --environment $environment + ) + + if ($result.success) { + print $"✓ Encrypted: ($result.output)" + print $"Ready to commit: git add ($result.output)" + } else { + error make { msg: "Encryption failed" } + } + + print "" +} + +def decrypt [file: string, environment: string] { + if not ($file | path exists) { + error make { + msg: "File not found" + label: { text: $file } + } + } + + print "" + print $"====== Decrypt ($file) for ($environment) ======" + print "⚠️ SENSITIVE OPERATION: Private key will be retrieved" + print "" + + let token = (get-vault-token) + if ($token | is-empty) { + error make { + msg: "VAULT_SERVICE_TOKEN required for decryption" + label: { text: "Set environment variable" } + } + } + + let result = ( + nu provisioning/scripts/secrets-decrypt.nu $file --environment $environment + ) + + if ($result.success) { + print $"✓ Decrypted: ($result.output)" + print "⚠️ REMEMBER: Delete decrypted file after use" + } else { + error make { msg: "Decryption failed" } + } + + print 
"" +} + +def rotate [environment: string] { + if $environment not-in (list-environments) { + error make { + msg: "Invalid environment" + label: { text: "Must be: dev, staging, or prod" } + } + } + + print "" + print $"====== Key Rotation for ($environment) ======" + print "⚠️ This operation will:" + print " 1. Generate new Age keypair in vault-service" + print " 2. Re-encrypt all SOPS files with new key" + print " 3. Update key version tracking" + print "" + + let token = (get-vault-token) + if ($token | is-empty) { + error make { + msg: "VAULT_SERVICE_TOKEN required for key rotation" + label: { text: "Set environment variable" } + } + } + + let result = ( + nu provisioning/scripts/secrets-rotate-keys.nu --environment $environment + ) + + if ($result.success) { + print $"✓ Key rotation complete" + print $" Previous version: ($result.previous_version)" + print $" New version: ($result.new_version)" + print $" Files updated: ($result.files_updated)" + if ($result.files_failed > 0) { + print $" ⚠️ Files failed: ($result.files_failed)" + } + print "" + print "Verification step:" + print $" ($result.next_steps)" + } else { + error make { msg: "Key rotation failed" } + } + + print "" +} + +def main [ + action: string = "status" + file: string = "" + --environment: string = "dev" +] { + match $action { + "status" => (status), + "init" => (init $environment), + "encrypt" => { + if ($file | is-empty) { + error make { msg: "File required for encrypt action" } + } + (encrypt $file $environment) + }, + "decrypt" => { + if ($file | is-empty) { + error make { msg: "File required for decrypt action" } + } + (decrypt $file $environment) + }, + "rotate" => (rotate $environment), + "help" => { + print "Secrets Management Orchestrator" + print "" + print "Usage: secrets-manage [file] [--environment dev|staging|prod]" + print "" + print "Actions:" + print " status - Show SOPS + vault-service status" + print " init - Initialize environment (generate .sops.yaml)" + print " encrypt - 
Encrypt configuration file" + print " decrypt - Decrypt SOPS file (requires token)" + print " rotate - Rotate Age keys (requires token)" + print " help - Show this help message" + print "" + print "Examples:" + print " secrets-manage status" + print " secrets-manage init --environment prod" + print " secrets-manage encrypt config.yaml --environment dev" + print " secrets-manage decrypt config.enc.yaml --environment dev" + print " secrets-manage rotate --environment prod" + }, + _ => { + print $"Unknown action: ($action)" + print "Run 'secrets-manage help' for available commands" + } + } +} diff --git a/scripts/secrets-rotate-keys.nu b/scripts/secrets-rotate-keys.nu new file mode 100755 index 0000000..e812d93 --- /dev/null +++ b/scripts/secrets-rotate-keys.nu @@ -0,0 +1,162 @@ +#!/usr/bin/env nu +# Rotate Age keys and re-encrypt all SOPS files +# Generates new Age keypair in vault-service, then re-encrypts all SOPS files +# Usage: secrets-rotate-keys [--environment dev|staging|prod] [--pattern ] + +use std log + +def get-vault-url [] { + $env.VAULT_SERVICE_URL? // "http://localhost:9094" +} + +def get-vault-token [] { + $env.VAULT_SERVICE_TOKEN? // "" +} + +def rotate-keypair-in-vault [environment: string] { + let url = $"(get-vault-url)/api/v1/age/rotate?env=($environment)" + let token = (get-vault-token) + + if ($token | is-empty) { + error make { + msg: "VAULT_SERVICE_TOKEN required for key rotation" + label: { + text: "Set environment variable" + span: (metadata $environment).span + } + } + } + + print "🔑 Rotating Age keypair in vault-service..." 
# Remainder of rotate-keypair-in-vault: send the rotation request and return the parsed response.

    # `http post` is an internal command: it raises on failure and has no exit code,
    # so the failure path is a try/catch rather than `| complete`.
    let response = try {
        http post -H {"X-Vault-Token": $token} $url {}
    } catch {
        error make {
            msg: "Failed to rotate keypair"
            label: {
                text: "vault-service rejected request"
                span: (metadata $environment).span
            }
        }
    }

    # `http post` parses JSON bodies itself when the server labels them as JSON;
    # fall back to explicit parsing for a raw string body.
    let json = if ($response | describe | str starts-with "record") {
        $response
    } else {
        $response | from json
    }
    print $"✓ Rotated: version ($json.previous_version) → ($json.new_version)"
    $json
}

# List the files to re-encrypt, sorted.
# An empty `pattern` selects config/secrets/<environment>/**/*.yaml; otherwise `pattern` is used as the glob.
def find-sops-files [environment: string, pattern: string] {
    let glob_pattern = if ($pattern | is-empty) {
        $"config/secrets/($environment)/**/*.yaml"
    } else {
        $pattern
    }

    let files = (glob $glob_pattern | sort)
    if ($files | is-empty) {
        print $"⚠️ No SOPS files found for pattern: ($glob_pattern)"
    }
    $files
}

# Run `sops updatekeys --yes` on one file.
# Returns true on success, false (after printing stderr) on failure; never raises.
def update-sops-file [file: string] {
    print $" Re-encrypting: ($file)..."

    let result = (
        ^sops updatekeys --yes $file
        | complete
    )

    if $result.exit_code != 0 {
        let stderr = $result.stderr
        print $" ❌ Failed: ($stderr)"
        false
    } else {
        print $" ✓ Re-encrypted"
        true
    }
}

# Rotate the Age keypair for an environment, then re-encrypt every matching SOPS file.
#
# --environment: dev | staging | prod (default "dev").
# --pattern:     optional glob overriding the default config/secrets/<environment>/**/*.yaml.
#
# Returns a summary record; raises when the environment is invalid, the rotation
# request fails, or at least one file could not be re-encrypted.
def main [
    --environment: string = "dev"
    --pattern: string = ""
] {
    # Validate environment
    if $environment not-in ["dev", "staging", "prod"] {
        error make {
            msg: "Invalid environment"
            label: {
                text: "Must be: dev, staging, or prod"
                span: (metadata $environment).span
            }
        }
    }

    print $"====== Age Key Rotation for ($environment) ======"
    print ""

    # Step 1: Rotate keypair
    let rotation = (rotate-keypair-in-vault $environment)

    print ""
    print "🔄 Re-encrypting SOPS files with new public key..."

    # Step 2: Find SOPS files
    let sops_files = (find-sops-files $environment $pattern)

    if ($sops_files | is-empty) {
        print "⚠️ No SOPS files to re-encrypt"
        return {
            success: true
            rotated: true
            files_updated: 0
            message: "Key rotated but no files found to re-encrypt"
        }
    }

    print $"Found ($sops_files | length) files to re-encrypt"
    print ""

    # Step 3: Re-encrypt each file exactly once and derive both counters from the
    # same result list (`each` is Nushell's mapping command; there is no `map`).
    let results = ($sops_files | each {|file| update-sops-file $file })
    let success_count = ($results | where {|ok| $ok } | length)
    let fail_count = ($results | where {|ok| not $ok } | length)

    print ""
    print "====== Rotation Complete ======"
    print $"✓ Success: ($success_count) files"
    if ($fail_count > 0) {
        print $"❌ Failed: ($fail_count) files"
    }
    print ""

    if ($fail_count > 0) {
        error make {
            msg: "Some files failed to re-encrypt"
            label: {
                text: "Review errors above and retry manually"
                span: (metadata $environment).span
            }
        }
    }

    {
        success: true
        environment: $environment
        rotated: true
        previous_version: $rotation.previous_version
        new_version: $rotation.new_version
        files_updated: $success_count
        files_failed: $fail_count
        next_steps: $"Verify decryption: for f in config/secrets/($environment)/*.yaml; do sops --decrypt \\$f | head -1; done"
    }
}
diff --git a/scripts/setup-platform-config.sh b/scripts/setup-platform-config.sh index 9152b50..b49d319 100755 --- a/scripts/setup-platform-config.sh +++ b/scripts/setup-platform-config.sh @@ -20,12 +20,12 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" source "${SCRIPT_DIR}/ensure-typedialog.sh" || true # Check TypeDialog availability -if ! command -v typedialog &> /dev/null; then - echo -e "\033[1;33m⚠️ TypeDialog not found. Attempting installation...\033[0m" - ensure_typedialog_installed true || { - echo -e "\033[0;31m❌ Failed to install TypeDialog\033[0m" - exit 1 - } +if !
command -v typedialog &>/dev/null; then + echo -e "\033[1;33m⚠️ TypeDialog not found. Attempting installation...\033[0m" + ensure_typedialog_installed true || { + echo -e "\033[0;31m❌ Failed to install TypeDialog\033[0m" + exit 1 + } fi # Colors for output @@ -40,8 +40,23 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" TYPEDIALOG_SCRIPTS="${PROJECT_ROOT}/.typedialog/platform/scripts" SCHEMAS_PLATFORM="${PROJECT_ROOT}/schemas/platform" -CONFIG_RUNTIME="${PROJECT_ROOT}/config/runtime" -CONFIG_GENERATED="${CONFIG_RUNTIME}/generated" +SCHEMAS_VALUES="${SCHEMAS_PLATFORM}/values" +SCHEMAS_CONFIGS="${SCHEMAS_PLATFORM}/configs" +PLATFORM_CONFIG="${PROJECT_ROOT}/../platform/config" + +# User platform config path (set by CLI via PROVISIONING_USER_PLATFORM or fallback to OS-specific default) +# CLI sets: PROVISIONING_USER_PLATFORM="${HOME}/Library/Application Support/provisioning/platform" (darwin) +# PROVISIONING_USER_PLATFORM="${HOME}/.config/provisioning/platform" (linux) +if [[ -n "${PROVISIONING_USER_PLATFORM:-}" ]]; then + PLATFORM_CONFIG_BASE="${PROVISIONING_USER_PLATFORM}" +elif [[ "$OSTYPE" == "darwin"* ]]; then + PLATFORM_CONFIG_BASE="${HOME}/Library/Application Support/provisioning/platform" +else + PLATFORM_CONFIG_BASE="${HOME}/.config/provisioning/platform" +fi + +# User config directory for service-specific configs (*.ncl files) +USER_CONFIG_DIR="${PLATFORM_CONFIG_BASE}/config" # Available services and modes SERVICES=("orchestrator" "control-center" "mcp-server" "vault-service" "extension-registry" "rag" "ai-service" "provisioning-daemon") @@ -59,33 +74,33 @@ FORCE=false # ============================================================================ print_header() { - echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" - echo -e "${BLUE}$1${NC}" - echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" + echo -e 
"${BLUE}════════════════════════════════════════════════════════════${NC}" + echo -e "${BLUE}$1${NC}" + echo -e "${BLUE}════════════════════════════════════════════════════════════${NC}" } print_info() { - echo -e "${BLUE}ℹ️ $1${NC}" + echo -e "${BLUE}ℹ️ $1${NC}" } print_success() { - echo -e "${GREEN}✅ $1${NC}" + echo -e "${GREEN}✅ $1${NC}" } print_warning() { - echo -e "${YELLOW}⚠️ $1${NC}" + echo -e "${YELLOW}⚠️ $1${NC}" } print_error() { - echo -e "${RED}❌ $1${NC}" >&2 + echo -e "${RED}❌ $1${NC}" >&2 } # Check if directory/file exists check_exists() { - if [[ ! -e "$1" ]]; then - print_error "Not found: $1" - return 1 - fi + if [[ ! -e "$1" ]]; then + print_error "Not found: $1" + return 1 + fi } # ============================================================================ @@ -93,83 +108,216 @@ check_exists() { # ============================================================================ validate_service() { - local service="$1" + local service="$1" - # "all" is handled by bash script itself, not passed to Nushell - if [[ "$service" == "all" ]]; then - return 0 - fi + # "all" is handled by bash script itself, not passed to Nushell + if [[ "$service" == "all" ]]; then + return 0 + fi - if [[ ! " ${SERVICES[*]} " =~ " ${service} " ]]; then - print_error "Invalid service: $service" - echo "Valid services: ${SERVICES[*]}, all" - return 1 - fi + if [[ ! " ${SERVICES[*]} " =~ " ${service} " ]]; then + print_error "Invalid service: $service" + echo "Valid services: ${SERVICES[*]}, all" + return 1 + fi } validate_mode() { - local mode="$1" - if [[ ! " ${MODES[*]} " =~ " ${mode} " ]]; then - print_error "Invalid mode: $mode" - echo "Valid modes: ${MODES[*]}" - return 1 - fi + local mode="$1" + if [[ ! 
" ${MODES[*]} " =~ " ${mode} " ]]; then + print_error "Invalid mode: $mode" + echo "Valid modes: ${MODES[*]}" + return 1 + fi } # ============================================================================ # Directory Setup # ============================================================================ -ensure_runtime_dir() { - if [[ ! -d "$CONFIG_RUNTIME" ]]; then - print_info "Creating runtime config directory: $CONFIG_RUNTIME" - mkdir -p "$CONFIG_RUNTIME" "$CONFIG_GENERATED" - fi +ensure_config_dirs() { + # Ensure schemas/platform/values directory exists for user configs + if [[ ! -d "$SCHEMAS_VALUES" ]]; then + print_info "Creating values directory: $SCHEMAS_VALUES" + mkdir -p "$SCHEMAS_VALUES" + fi - # Create .gitignore if not present - if [[ ! -f "$CONFIG_RUNTIME/.gitignore" ]]; then - cat > "$CONFIG_RUNTIME/.gitignore" <"$SCHEMAS_VALUES/.gitignore" <"$deployment_mode_file" <<'EOF' +# Platform Deployment Mode Configuration +# Generated by setup-platform-config.sh on initial setup +# +# Modes: "local" | "docker-compose" | "kubernetes" +# This determines HOW platform services are deployed, not WHAT features are enabled + +{ + # Deployment mode: local | docker-compose | kubernetes + mode = "local", + + # Manager configuration (adapts based on mode) + manager = { + # Local: localhost or custom hostname + hostname = "localhost", + port = 9090, + }, + + # User service configurations directory + config_dir = "$USER_CONFIG_DIR", + + # Enable health checks and monitoring + health_checks_enabled = true, + + # Timeout for service startup (seconds) + startup_timeout = 60, + + # External infrastructure services (databases, registries, CI/CD, etc.) 
+ external_services = [], + + # Metadata + description = "Local development with binaries in ~/.local/bin", + created_at = "$(date -u +"%Y-%m-%dT%H:%M:%SZ")", +} +EOF + + if [[ -f "$deployment_mode_file" ]]; then + print_success "Created deployment mode config: $deployment_mode_file" + print_info "Mode: local (binaries in ~/.local/bin)" + print_info "To change mode, edit: $deployment_mode_file" + return 0 + else + print_error "Failed to create deployment mode config" + return 1 + fi +} + +print_deployment_mode_help() { + cat </dev/null 2>&1; then - echo "existing" - return 0 - fi - echo "empty" - return 0 +detect_user_configs() { + # Check if any user config exists in user config directory + shopt -s nullglob + local files=("$USER_CONFIG_DIR"/*.ncl) + shopt -u nullglob + + if [[ ${#files[@]} -gt 0 ]]; then + echo "existing" + else + echo "empty" + fi + return 0 } -list_runtime_services() { - local services=() - for file in "$CONFIG_RUNTIME"/*.ncl; do - if [[ -f "$file" ]]; then - local basename=$(basename "$file" .ncl) - services+=("$basename") - fi - done +list_user_services() { + local services=() + shopt -s nullglob + for file in "$USER_CONFIG_DIR"/*.ncl; do + if [[ -f "$file" ]]; then + local basename=$(basename "$file" .ncl) + services+=("$basename") + fi + done + shopt -u nullglob - if [[ ${#services[@]} -gt 0 ]]; then - printf '%s\n' "${services[@]}" - fi + if [[ ${#services[@]} -gt 0 ]]; then + printf '%s\n' "${services[@]}" + fi +} + +# Generate TOML from NCL config in USER_CONFIG_DIR for deployment use +generate_toml_for_user_config() { + local service="$1" + local mode="$2" + local ncl_file="${USER_CONFIG_DIR}/${service}.${mode}.ncl" + local toml_file="${USER_CONFIG_DIR}/${service}.${mode}.toml" + + if [[ ! -f "$ncl_file" ]]; then + print_warning "Nickel config not found: $ncl_file" + return 1 + fi + + print_info "Generating TOML for $service ($mode)..." 
+ + # Generate TOML from Nickel with proper NICKEL_IMPORT_PATH + if NICKEL_IMPORT_PATH="$PROJECT_ROOT" nickel export --format toml "$ncl_file" >"$toml_file" 2>/dev/null; then + print_success "Generated: $toml_file" + return 0 + else + print_error "Failed to generate TOML for $service ($mode)" + return 1 + fi } # ============================================================================ @@ -177,289 +325,411 @@ list_runtime_services() { # ============================================================================ prompt_action_existing_config() { - while true; do - echo "" - print_warning "Runtime configuration already exists in: $CONFIG_RUNTIME" - echo "" - echo "Choose action:" - echo " 1) Clean up and start fresh (removes all .ncl and .toml files)" - echo " 2) Use TypeDialog to update configuration [default]" - echo " 3) Setup quick mode (solo/multiuser/cicd/enterprise)" - echo " 4) List existing configurations" - echo " 5) Cancel" - echo "" - echo "Press CTRL-C to cancel at any time" - echo "" - read -rp "Enter choice [1-5] (default: 2): " choice + while true; do + echo "" + print_warning "User configurations already exist in: $USER_CONFIG_DIR" + echo "" + echo "Choose action:" + echo " 1) Clean up and start fresh (removes all .ncl and .toml files)" + echo " 2) Use TypeDialog to update configuration [default]" + echo " 3) Setup quick mode (solo/multiuser/cicd/enterprise)" + echo " 4) List existing configurations" + echo " 5) Cancel" + echo "" + echo "Press CTRL-C to cancel at any time" + echo "" + read -rp "Enter choice [1-5] (default: 2): " choice - # Default to 2 (TypeDialog update) - choice="${choice:-2}" + # Default to 2 (TypeDialog update) + choice="${choice:-2}" - case "$choice" in - 1) ACTION="clean-start"; return 0 ;; - 2) ACTION="typedialog"; return 0 ;; - 3) ACTION="quick-mode"; return 0 ;; - 4) ACTION="list"; return 0 ;; - 5) print_info "Cancelled."; exit 0 ;; - *) print_error "Invalid choice. Please enter 1-5 (or press CTRL-C to abort)." 
;; - esac - done + case "$choice" in + 1) + ACTION="clean-start" + return 0 + ;; + 2) + ACTION="typedialog" + return 0 + ;; + 3) + ACTION="quick-mode" + return 0 + ;; + 4) + ACTION="list" + return 0 + ;; + 5) + print_info "Cancelled." + exit 0 + ;; + *) print_error "Invalid choice. Please enter 1-5 (or press CTRL-C to abort)." ;; + esac + done } prompt_action_empty_config() { - while true; do - echo "" - echo "Choose how to setup platform configuration:" - echo " 1) Interactive TypeDialog (recommended, with UI form) [default]" - echo " 2) Quick mode setup (choose solo/multiuser/cicd/enterprise)" - echo " 3) Cancel" - echo "" - echo "Press CTRL-C to cancel at any time" - echo "" - read -rp "Enter choice [1-3] (default: 1): " choice + while true; do + echo "" + echo "Choose how to setup platform configuration:" + echo " 1) Interactive TypeDialog (recommended, with UI form) [default]" + echo " 2) Quick mode setup (choose solo/multiuser/cicd/enterprise)" + echo " 3) Cancel" + echo "" + echo "Press CTRL-C to cancel at any time" + echo "" + read -rp "Enter choice [1-3] (default: 1): " choice - # Default to 1 if empty - choice="${choice:-1}" + # Default to 1 if empty + choice="${choice:-1}" - case "$choice" in - 1) ACTION="typedialog"; return 0 ;; - 2) ACTION="quick-mode"; return 0 ;; - 3) print_info "Cancelled."; exit 0 ;; - *) print_error "Invalid choice. Please enter 1, 2, or 3 (or press CTRL-C to abort)." ;; - esac - done + case "$choice" in + 1) + ACTION="typedialog" + return 0 + ;; + 2) + ACTION="quick-mode" + return 0 + ;; + 3) + print_info "Cancelled." + exit 0 + ;; + *) print_error "Invalid choice. Please enter 1, 2, or 3 (or press CTRL-C to abort)." 
;; + esac + done } prompt_for_service() { - local max_choice=$((${#SERVICES[@]}+1)) + local max_choice=$((${#SERVICES[@]} + 1)) - while true; do - echo "" - echo "Select service to configure:" - for i in "${!SERVICES[@]}"; do - echo " $((i+1))) ${SERVICES[$i]}" - done - echo " $max_choice) Configure all services [default]" - echo "" - echo "Press CTRL-C to cancel" - echo "" - read -rp "Enter choice [1-$max_choice] (default: $max_choice): " choice + while true; do + echo "" + echo "Select service to configure:" + for i in "${!SERVICES[@]}"; do + echo " $((i + 1))) ${SERVICES[$i]}" + done + echo " $max_choice) Configure all services [default]" + echo "" + echo "Press CTRL-C to cancel" + echo "" + read -rp "Enter choice [1-$max_choice] (default: $max_choice): " choice - # Default to "all services" - choice="${choice:-$max_choice}" + # Default to "all services" + choice="${choice:-$max_choice}" - # Validate numeric input - if ! [[ "$choice" =~ ^[0-9]+$ ]]; then - print_error "Invalid input. Please enter a number (or press CTRL-C to abort)." - continue - fi + # Validate numeric input + if ! [[ "$choice" =~ ^[0-9]+$ ]]; then + print_error "Invalid input. Please enter a number (or press CTRL-C to abort)." + continue + fi - if [[ "$choice" -ge 1 && "$choice" -le "$max_choice" ]]; then - if [[ "$choice" == "$max_choice" ]]; then - SERVICE="all" - else - SERVICE="${SERVICES[$((choice-1))]}" - fi - return 0 - else - print_error "Invalid choice. Please enter a number between 1 and $max_choice (or press CTRL-C to abort)." - fi - done + if [[ "$choice" -ge 1 && "$choice" -le "$max_choice" ]]; then + if [[ "$choice" == "$max_choice" ]]; then + SERVICE="all" + else + SERVICE="${SERVICES[$((choice - 1))]}" + fi + return 0 + else + print_error "Invalid choice. Please enter a number between 1 and $max_choice (or press CTRL-C to abort)." 
+ fi + done } prompt_for_mode() { - local max_choice=${#MODES[@]} + local max_choice=${#MODES[@]} - while true; do - echo "" - echo "Select deployment mode:" - for i in "${!MODES[@]}"; do - local marker="" - # Mark solo as default - if [[ "${MODES[$i]}" == "solo" ]]; then - marker=" [default]" - fi - echo " $((i+1))) ${MODES[$i]}$marker" - done - echo "" - echo "Press CTRL-C to cancel" - echo "" - read -rp "Enter choice [1-$max_choice] (default: 1): " choice + while true; do + echo "" + echo "Select deployment mode:" + for i in "${!MODES[@]}"; do + local marker="" + # Mark solo as default + if [[ "${MODES[$i]}" == "solo" ]]; then + marker=" [default]" + fi + echo " $((i + 1))) ${MODES[$i]}$marker" + done + echo "" + echo "Press CTRL-C to cancel" + echo "" + read -rp "Enter choice [1-$max_choice] (default: 1): " choice - # Default to 1 (solo) - choice="${choice:-1}" + # Default to 1 (solo) + choice="${choice:-1}" - # Validate numeric input - if ! [[ "$choice" =~ ^[0-9]+$ ]]; then - print_error "Invalid input. Please enter a number (or press CTRL-C to abort)." - continue - fi + # Validate numeric input + if ! [[ "$choice" =~ ^[0-9]+$ ]]; then + print_error "Invalid input. Please enter a number (or press CTRL-C to abort)." + continue + fi - if [[ "$choice" -ge 1 && "$choice" -le "$max_choice" ]]; then - MODE="${MODES[$((choice-1))]}" - return 0 - else - print_error "Invalid choice. Please enter a number between 1 and $max_choice (or press CTRL-C to abort)." - fi - done + if [[ "$choice" -ge 1 && "$choice" -le "$max_choice" ]]; then + MODE="${MODES[$((choice - 1))]}" + return 0 + else + print_error "Invalid choice. Please enter a number between 1 and $max_choice (or press CTRL-C to abort)." 
+ fi + done } prompt_for_backend() { - while true; do - echo "" - echo "Select TypeDialog backend:" - echo " 1) web (browser-based, recommended) [default]" - echo " 2) tui (terminal UI)" - echo " 3) cli (command-line)" - echo "" - echo "Press CTRL-C to cancel" - echo "" - read -rp "Enter choice [1-3] (default: 1): " choice + while true; do + echo "" + echo "Select TypeDialog backend:" + echo " 1) web (browser-based, recommended) [default]" + echo " 2) tui (terminal UI)" + echo " 3) cli (command-line)" + echo "" + echo "Press CTRL-C to cancel" + echo "" + read -rp "Enter choice [1-3] (default: 1): " choice - # Default to 1 (web) - choice="${choice:-1}" + # Default to 1 (web) + choice="${choice:-1}" - case "$choice" in - 1) BACKEND="web"; return 0 ;; - 2) BACKEND="tui"; return 0 ;; - 3) BACKEND="cli"; return 0 ;; - *) print_error "Invalid choice. Please enter 1, 2, or 3 (or press CTRL-C to abort)." ;; - esac - done + case "$choice" in + 1) + BACKEND="web" + return 0 + ;; + 2) + BACKEND="tui" + return 0 + ;; + 3) + BACKEND="cli" + return 0 + ;; + *) print_error "Invalid choice. Please enter 1, 2, or 3 (or press CTRL-C to abort)." ;; + esac + done +} + +# ============================================================================ +# User Config Management +# ============================================================================ + +get_or_create_user_config() { + local service="$1" + local mode="$2" + local user_config_file="${USER_CONFIG_DIR}/${service}.${mode}.ncl" + + # Ensure user config directory exists + if [[ ! 
-d "$USER_CONFIG_DIR" ]]; then + mkdir -p "$USER_CONFIG_DIR" || { + print_error "Failed to create user config directory: $USER_CONFIG_DIR" + return 1 + } + fi + + # If user config exists, return path + if [[ -f "$user_config_file" ]]; then + echo "$user_config_file" + return 0 + fi + + # User config doesn't exist - initialize from template + local template_path="${SCHEMAS_PLATFORM}/defaults/${service}-defaults.ncl" + local mode_defaults="${SCHEMAS_PLATFORM}/defaults/deployment/${mode}-defaults.ncl" + + if [[ ! -f "$template_path" ]]; then + print_error "Default template not found: $template_path" + return 1 + fi + + if [[ ! -f "$mode_defaults" ]]; then + print_error "Mode defaults not found: $mode_defaults" + return 1 + fi + + print_info "Initializing user config from defaults: $user_config_file" + + # Create initial config by composing defaults + mode overlay + cat >"$user_config_file" < "$toml_file" 2>/dev/null; then - print_success "Generated: $toml_file" - return 0 - else - print_error "Failed to generate TOML for $service ($mode)" - return 1 - fi -} - -generate_all_tomls() { - echo "" - print_header "Generating TOML Exports" - - local generated_count=0 - local failed_count=0 - - # Scan for all .ncl files in runtime - for ncl_file in "$CONFIG_RUNTIME"/*.ncl; do - if [[ -f "$ncl_file" ]]; then - local basename=$(basename "$ncl_file" .ncl) - local service="${basename%.*}" # Remove mode suffix - local mode="${basename##*.}" # Extract mode - - if generate_toml_for_service "$service" "$mode"; then - ((generated_count++)) - else - ((failed_count++)) - fi - fi - done - - echo "" - print_success "Generated $generated_count TOML files" - if [[ $failed_count -gt 0 ]]; then - print_warning "$failed_count files failed" - fi -} # ============================================================================ # TypeDialog Configuration # ============================================================================ +run_typedialog() { + local service="$1" + local mode="$2" + 
local backend="$3" + + local typedialog_dir="${PROJECT_ROOT}/.typedialog/platform" + local form_path="${typedialog_dir}/forms/${service}-form.toml" + local template_path="${SCHEMAS_PLATFORM}/templates/${service}-config.ncl.j2" + local output_path="${USER_CONFIG_DIR}/${service}.${mode}.ncl" + + # Verify form exists + if [[ ! -f "$form_path" ]]; then + print_error "Form not found for $service: $form_path" + return 1 + fi + + # Verify template exists + if [[ ! -f "$template_path" ]]; then + print_error "Template not found for $service: $template_path" + return 1 + fi + + # Get or create user config (source file for nickel-roundtrip) + local user_config_path + user_config_path=$(get_or_create_user_config "$service" "$mode") || return 1 + + # Select TypeDialog binary based on backend + local typedialog_cmd="typedialog" + case "$backend" in + web) + if ! command -v typedialog-web &>/dev/null; then + print_error "typedialog-web not found. Install with: cargo install typedialog --features web" + return 1 + fi + typedialog_cmd="typedialog-web" + ;; + tui) + if ! command -v typedialog-tui &>/dev/null; then + print_error "typedialog-tui not found. Install with: cargo install typedialog --features tui" + return 1 + fi + typedialog_cmd="typedialog-tui" + ;; + cli) + if ! command -v typedialog &>/dev/null; then + print_error "typedialog not found. Install with: cargo install typedialog" + return 1 + fi + ;; + *) + print_error "Invalid backend: $backend" + return 1 + ;; + esac + + # Run TypeDialog with proper argument ordering + # Arguments: nickel-roundtrip --output --ncl-template