docs: update README and CHANGELOG for nickel branch (2026-05-12)

This commit is contained in:
Jesús Pérez 2026-05-12 02:23:01 +01:00
parent 8e721582a7
commit 749cbcf8b6
Signed by: jesus
GPG key ID: 9F243E355E0BC939
288 changed files with 19873 additions and 4320 deletions

View file

@ -0,0 +1,92 @@
# ADR-012: NATS JetStream as the exclusive inter-service event broker.
# The record below is handed to `make_adr` from adr-defaults.ncl.
# NOTE(review): presumably make_adr applies the ADR contract/defaults — confirm there.
let d = import "adr-defaults.ncl" in
d.make_adr {
  id = "adr-012",
  title = "NATS JetStream as Exclusive Inter-Service Event Broker",
  status = 'Accepted,
  date = "2026-02-17",
  context = "The provisioning platform has four runtime execution contexts — CLI, platform services (Orchestrator, Control Center, Vault, Extension Registry), remote taskservs, and AI/MCP — that must coordinate without leaking credentials or state into transient channels. Prior to this decision, services communicated via direct HTTP polling, shared filesystem state, and environment variables. This created audit gaps (no durable record of which service triggered which operation), credential leakage (provider tokens passed as env vars or written to disk by the CLI process), race conditions (multiple CLI invocations racing over shared config files with no delivery guarantee), and no backpressure (a slow consumer could starve or block a fast producer with no visibility).",
  decision = "NATS with JetStream is the exclusive inter-service event broker. All inter-service communication that is not synchronous credential retrieval (Vault HTTPS) or session validation (Control Center HTTPS) must use NATS subjects under the `provisioning.>` hierarchy. Six JetStream streams are defined at startup by Orchestrator: TASKS (work queue), VAULT (interest), AUTH (interest), WORKSPACE (7-day limits), AUDIT (90-day limits), HEALTH (interest). Credentials never travel over NATS — only identifiers (lease_id, task_id, session_id) are published. Solo mode: nats-server -js as child process. Multi-user: external NATS cluster.",
  rationale = [
    {
      claim = "At-least-once delivery with durable persistence",
      detail = "JetStream provides durable message persistence for task log replay and audit trail reconstruction. Pull consumers ack explicitly; unacknowledged messages are redelivered.",
    },
    {
      claim = "Work-queue semantics enforce SOLID — CLI cannot call providers directly",
      detail = "CLI submits to provisioning.tasks.submitted only. It cannot call provider APIs directly. This is the primary structural enforcement of the SOLID boundary between CLI and Orchestrator.",
    },
    {
      claim = "Push semantics for real-time status streaming without polling",
      detail = "Control Center streams task status to browser via WebSocket without polling Orchestrator. NATS push consumers bridge the event stream to the WebSocket layer.",
    },
    {
      claim = "Multi-tenant subject namespacing maps to bounded contexts",
      detail = "The provisioning.> hierarchy with six streams maps each stream to its bounded context (tasks, vault, auth, workspace, audit, health). Each service subscribes only to its own subjects.",
    },
  ],
  consequences = {
    positive = [
      "Full audit trail: every state transition is a durable NATS message consumed by AuditCollector",
      "No polling: Control Center streams task status to browser via WebSocket",
      "Backpressure: JetStream consumers ack explicitly; unacknowledged messages are redelivered",
      "SOLID enforcement: CLI can only submit to provisioning.tasks.submitted; cannot call provider APIs directly",
    ],
    negative = [
      "nats-server is a required external process in solo mode, adding a startup step",
      "Message ordering within a subject is guaranteed but cross-subject ordering is not",
      "JetStream persistence requires disk space for AUDIT stream (90-day retention)",
      "Pull consumers in VAULT stream add one round-trip vs direct HTTP for lease issuance",
    ],
  },
  alternatives_considered = [
    {
      option = "Direct HTTP polling between services",
      why_rejected = "Creates coupling between services and requires each service to know the addresses of others. No delivery guarantee, no audit trail, polling adds latency.",
    },
    {
      option = "Redis Pub/Sub for event distribution",
      why_rejected = "Redis Pub/Sub has no persistence — messages are lost if no subscriber is listening. No work-queue semantics, no backpressure, no durable audit trail.",
    },
  ],
  # Machine-checkable constraints. Each `check` is a tagged record; for 'NuCmd
  # the command's exit status is compared against `expect_exit`.
  constraints = [
    {
      id = "credentials-never-in-nats",
      claim = "Actual credentials (tokens, secrets, keys) must never be published to any NATS subject",
      scope = "platform/crates/orchestrator/src/, platform/crates/platform-nats/",
      severity = 'Hard,
      check = { tag = 'NuCmd, cmd = "rg 'publish|nats' platform/crates/ -l | xargs rg -l 'token|secret|password|key'", expect_exit = 1 },
      rationale = "Credentials in NATS messages would be visible to all subscribers on the subject. Only lease_id, task_id, and session_id travel over NATS; actual secrets are fetched over HTTPS from Vault.",
    },
    {
      id = "six-streams-defined-by-orchestrator",
      claim = "JetStream stream definitions (TASKS, VAULT, AUTH, WORKSPACE, AUDIT, HEALTH) are created by Orchestrator on startup and must not be redefined by other services",
      scope = "platform/crates/orchestrator/src/nats.rs",
      severity = 'Hard,
      # ripgrep filters files with --glob/-g; `--include` is a grep flag that rg
      # rejects, which left grep -v with empty input (exit 1) and made the check
      # pass without inspecting any file.
      check = { tag = 'NuCmd, cmd = "rg 'create_stream|add_stream' platform/crates/ --glob '*.rs' -l | grep -v orchestrator", expect_exit = 1 },
      rationale = "Single point of stream definition prevents conflicting stream configurations. Other services are consumers only.",
    },
    {
      id = "nats-subject-hierarchy",
      claim = "All NATS subjects must be under the provisioning.> hierarchy with the stream-to-subject mapping documented in schemas/platform/common/nats.ncl",
      scope = "platform/crates/platform-nats/",
      severity = 'Soft,
      # Same --include -> --glob correction as above.
      # NOTE(review): the pattern matches any string literal starting with a
      # lowercase letter across platform/crates/, which is wider than `scope`;
      # confirm this is the intended breadth now that the command really runs.
      check = { tag = 'NuCmd, cmd = "rg '\"[a-z]' platform/crates/ --glob '*.rs' | grep -v 'provisioning\\.'", expect_exit = 1 },
      rationale = "Consistent subject hierarchy enables subject-level access control and prevents cross-context pollution.",
    },
  ],
  related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"],
  ontology_check = {
    decision_string = "NATS JetStream is the exclusive inter-service event broker; credentials never travel over NATS; six streams defined by Orchestrator",
    invariants_at_risk = ["solid-boundaries"],
    verdict = 'Safe,
  },
}

View file

@ -0,0 +1,95 @@
# ADR-013: SurrealDB as the global persistent store.
# The record below is handed to `make_adr` from adr-defaults.ncl.
# NOTE(review): presumably make_adr applies the ADR contract/defaults — confirm there.
let d = import "adr-defaults.ncl" in
d.make_adr {
  id = "adr-013",
  title = "SurrealDB as the Global Persistent Store",
  status = 'Accepted,
  date = "2026-02-17",
  context = "The platform needs a single persistent data store that can operate embedded (RocksDB, zero external process) in solo mode, run as an external WebSocket server in multi-user deployments without schema changes, support five distinct service namespaces with well-typed schemas (orchestrator, vault, control_center, audit, workspace), store heterogeneous data (task logs append-only, secrets as encrypted blobs, Cedar policies as documents, audit events as time-series, git sync state as mutable), and be queryable by the AI/MCP service for context gathering without a separate analytics database. PostgreSQL requires a server process, SQLite has no native namespacing, Redis has no real persistence. SurrealDB is the only option supporting all five requirements simultaneously.",
  decision = "SurrealDB is the exclusive persistent store for all platform state. No service reads raw files or environment variables for credentials at runtime — all reads go through SurrealDB (secrets via Vault Service, which stores ciphertext in SurrealDB). Namespace layout under `provisioning` database: orchestrator (tasks, task_events, execution_logs, config_hashes, provider_cache), vault (secrets, keys, leases, secret_versions, audit_trail), control_center (users, sessions, cedar_policies, policy_evaluations), audit (events, metrics), workspace (registrations, deployments, git_sync_state, extensions). Mode selection via DbConfig: Memory (tests), Embedded (solo, RocksDB), Server (multi-user, WebSocket). Schema initialization via DEFINE TABLE IF NOT EXISTS DDL — no migration framework for additive changes.",
  rationale = [
    {
      claim = "Single storage abstraction across solo and multi-user modes",
      detail = "SurrealPool is Clone (Arc<Surreal<Any>> internally), shareable across tokio tasks. The same codebase connects to embedded RocksDB in solo mode and WebSocket server in multi-user — only the DbConfig changes.",
    },
    {
      claim = "No external process in solo mode",
      detail = "Embedded RocksDB starts with the service binary. Solo mode requires no external database process, reducing startup dependencies and enabling CI runs without infrastructure.",
    },
    {
      claim = "AI/MCP context without ETL pipelines",
      detail = "The AI service queries audit:events and orchestrator:tasks directly for context. SurrealDB's document+relational model handles heterogeneous schemas without separate analytics infrastructure.",
    },
    {
      claim = "Test isolation via DbConfig::Memory",
      detail = "In-process Surreal<Mem> requires no external binary — every cargo test run gets a fresh, isolated database. Integration tests run without external infrastructure.",
    },
  ],
  consequences = {
    positive = [
      "Single storage abstraction: SurrealPool is Clone, shareable across tokio tasks",
      "No external process in solo mode: embedded RocksDB starts with the service binary",
      "AI/MCP context: AI service queries audit:events and orchestrator:tasks directly without ETL pipelines",
      "Test isolation: DbConfig::Memory (in-process Surreal<Mem>) requires no external binary",
    ],
    negative = [
      "SurrealDB v2 API uses snake_case builtins; bind() requires owned values; ID fields in structs needed to avoid RecordId parsing issues",
      "MVCC conflicts under concurrent write load require retry_on_conflict with exponential backoff + jitter on store_secret, store_key, and lease operations",
      "Full-text search and graph queries are available but deferred to avoid over-engineering",
    ],
  },
  alternatives_considered = [
    {
      option = "PostgreSQL",
      why_rejected = "Requires an external server process — no embedded mode for solo deployment. Schema evolution requires explicit migration tooling. No native document storage for Cedar policies.",
    },
    {
      option = "SQLite",
      why_rejected = "No native namespace/tenant isolation. No document model. Concurrent write performance under multiple async tasks is constrained. No WebSocket server mode for multi-user.",
    },
    {
      option = "Redis",
      why_rejected = "No real persistence guarantees (AOF is not the same as durable embedded storage). Key-value only — no document or relational queries for audit trail or task history.",
    },
  ],
  # Machine-checkable constraints. Three check shapes appear here:
  # 'Grep (pattern + paths), 'FileExists (path) and 'NuCmd (command + exit code).
  constraints = [
    {
      id = "cli-no-surrealdb-direct",
      claim = "CLI code (.nu files) must NOT access SurrealDB directly — all reads/writes from CLI go through service HTTP APIs",
      scope = "core/nulib/, extensions/",
      severity = 'Hard,
      # NOTE(review): `scope` names extensions/ while `paths` scans catalog/ —
      # confirm which directory is current and align the two.
      check = { tag = 'Grep, pattern = "surrealdb|surreal|SurrealDB", paths = ["core/nulib/", "catalog/"], must_be_empty = true },
      rationale = "Direct SurrealDB access from CLI violates the SOLID boundary (ADR-014). All state mutations must go through the service layer to maintain audit trail and authorization.",
    },
    {
      id = "namespace-layout-fixed",
      claim = "The five namespaces (orchestrator, vault, control_center, audit, workspace) under the `provisioning` database must not be changed without an ADR",
      scope = "platform/crates/platform-db/",
      severity = 'Hard,
      check = { tag = 'FileExists, path = "platform/crates/platform-db/", present = true },
      rationale = "Namespace layout is the boundary contract between services. Changing it without an ADR risks data loss and cross-service coupling.",
    },
    {
      id = "retry-on-mvcc-conflict",
      claim = "Operations on store_secret, store_key, and lease operations must use retry_on_conflict with exponential backoff + jitter",
      scope = "platform/crates/platform-db/src/retry.rs, platform/secretumvault/",
      severity = 'Soft,
      # ripgrep filters files with --glob/-g; `--include` is a grep flag that rg
      # rejects, which left grep -v with empty input (exit 1) and made the check
      # pass without inspecting any file.
      check = { tag = 'NuCmd, cmd = "rg 'store_secret|store_key|create_lease' platform/ --glob '*.rs' | grep -v retry", expect_exit = 1 },
      rationale = "SurrealDB MVCC conflicts are expected under concurrent write load. Without retry, concurrent task executions silently fail on lease creation.",
    },
  ],
  related_adrs = ["adr-012-nats-event-broker", "adr-014-solid-enforcement", "adr-015-solo-mode-architecture"],
  ontology_check = {
    decision_string = "SurrealDB is the exclusive persistent store; CLI accesses state only via service HTTP APIs; five fixed namespaces under provisioning database",
    invariants_at_risk = ["solid-boundaries", "config-driven-always"],
    verdict = 'Safe,
  },
}

View file

@ -0,0 +1,99 @@
# ADR-014: SOLID architecture boundaries with multi-layer enforcement.
# The record below is handed to `make_adr` from adr-defaults.ncl.
# NOTE(review): presumably make_adr applies the ADR contract/defaults — confirm there.
let d = import "adr-defaults.ncl" in
d.make_adr {
  id = "adr-014",
  title = "SOLID Architecture Boundaries with Multi-Layer Enforcement",
  status = 'Accepted,
  date = "2026-02-17",
  context = "As the platform expanded from a CLI tool to a multi-service control plane, a critical failure mode emerged: the Nushell CLI directly called cloud provider CLIs (hcloud, aws, doctl). This violated Single Responsibility (the CLI acquired infrastructure execution responsibility) and Dependency Inversion (CLI depended on concrete provider CLIs instead of the Orchestrator abstraction). Consequences: provider credentials leaked into CLI process environment (HCLOUD_TOKEN as env var), no audit trail for provider API calls made outside Orchestrator, SSH operations done by CLI bypassed the task state machine and rollback capability, auth decisions (JWT validation) duplicated across services instead of delegated to Control Center, secret files read directly by multiple services bypassing Vault's lease lifecycle. Documentation alone fails to enforce boundaries: engineers under time pressure skip it. The enforcement must be structural.",
  decision = "Six hard boundaries are enforced at six independent layers. Each layer is a fail-safe — any single layer catching a violation is sufficient to prevent it shipping. The six boundaries: (1) Provider API calls only in orchestrator crate, (2) SSH operations only in orchestrator+machines crates, (3) SurrealDB access from CLI forbidden, (4) Secret credentials forbidden in NATS messages, (5) Auth decisions only in control-center crate, (6) Raw secret file/env reads in services forbidden. Enforcement layers: compile-time (pub(crate) visibility), dev-time (Claude PreToolUse hook), pre-commit (git hook), CI (architecture tests), runtime (Cedar policies), continuous audit (NATS audit subject).",
  rationale = [
    {
      claim = "Documentation alone is insufficient — enforcement must be structural",
      detail = "Engineers under time pressure bypass documentation. The six-layer enforcement stack means a violation must simultaneously evade compile-time type checking, the Claude dev hook, the pre-commit grep, the CI architecture test, Cedar policy evaluation, and the NATS audit collector. Any single layer is sufficient to catch it.",
    },
    {
      claim = "Compile-time is the cheapest enforcement layer",
      detail = "Provider client types are pub(crate) inside orchestrator. Other crates cannot import them — the Rust compiler rejects the build before any test runs. This is O(0) runtime cost.",
    },
    {
      claim = "AUTH corollaries prevent auth fragmentation",
      detail = "solo_auth_middleware is the only documented auth bypass, gated behind --mode solo. All protected routes are inside route_layer(). UserContext is extracted from request extensions, never from headers directly. Cedar policies are the only authorization mechanism — no ad-hoc role checks.",
    },
    {
      claim = "NATS audit subject provides continuous violation detection at runtime",
      detail = "provisioning.audit.violation.solid is published on runtime violations. AuditCollector persists these to SurrealDB. Violations discovered after deployment are recorded and queryable.",
    },
  ],
  consequences = {
    positive = [
      "All provider credentials are scoped to Orchestrator's process — no credential leakage path to CLI",
      "Task state machine in Orchestrator provides rollback for every provider operation",
      "Auth defects are isolated to Control Center — other services cannot accidentally implement auth",
      "SOLID violations are caught at the earliest possible layer (usually compile-time or dev-time), not in production",
    ],
    negative = [
      "Adding a new cloud provider requires changes to Orchestrator only — correct by design but requires understanding the dispatch model",
      "The pre-commit hook adds ~200ms to commit time for grep scans",
      "CLI cannot query provider state directly — must call Orchestrator API, adding one HTTP hop",
    ],
  },
  alternatives_considered = [
    {
      option = "Compile-time enforcement only via crate visibility",
      why_rejected = "Insufficient for Nushell code which has no compile-time type system. Pre-commit and Claude hooks are needed to cover .nu files where the Rust compiler cannot help.",
    },
    {
      option = "Documentation + code review process",
      why_rejected = "The failure mode this ADR addresses (direct provider CLI calls from Nushell) was introduced despite existing documentation. Enforcement must be automatic, not manual.",
    },
  ],
  # Machine-checkable constraints. The 'NuCmd checks below previously passed
  # `--include='*.rs'` to rg. ripgrep filters files with --glob/-g and rejects
  # `--include` (a grep flag), so grep -v received empty input, exited 1, and
  # every such check passed without inspecting any file.
  constraints = [
    {
      id = "provider-calls-orchestrator-only",
      claim = "Provider API calls (hcloud, aws, doctl, upctl) must only exist in the orchestrator crate",
      scope = "provisioning/",
      severity = 'Hard,
      check = { tag = 'NuCmd, cmd = "rg 'hcloud|aws|doctl|upctl' --glob '*.rs' provisioning/ | grep -v 'orchestrator'", expect_exit = 1 },
      rationale = "All provider API calls must flow through the Orchestrator dispatch model to maintain audit trail and rollback capability.",
    },
    {
      id = "ssh-orchestrator-machines-only",
      claim = "SSH operations (russh, ssh2) must only exist in orchestrator and machines crates",
      scope = "provisioning/platform/crates/",
      severity = 'Hard,
      check = { tag = 'NuCmd, cmd = "rg 'russh|ssh2' --glob '*.rs' provisioning/platform/crates/ | grep -v 'orchestrator\\|machines'", expect_exit = 1 },
      rationale = "SSH operations that bypass Orchestrator bypass the task state machine and lose rollback capability and audit trail.",
    },
    {
      id = "solo-auth-middleware-single-bypass",
      claim = "solo_auth_middleware is the only place in the codebase where auth is bypassed; it must be gated behind --mode solo and never used in production routing",
      scope = "platform/crates/control-center/src/middleware/",
      severity = 'Hard,
      # NOTE(review): must_be_empty = true over all of platform/crates/ would also
      # flag the documented solo bypass if its source mentions "bypass" — confirm.
      check = { tag = 'Grep, pattern = "bypass|skip.*auth|no.*auth", paths = ["platform/crates/"], must_be_empty = true },
      rationale = "A single documented and tested auth bypass is auditable. Multiple bypass paths create an audit surface that cannot be systematically verified.",
    },
    {
      id = "cedar-only-authorization",
      claim = "No ad-hoc role checks (if user.roles.contains) in business logic — Cedar policies are the only authorization mechanism",
      scope = "platform/crates/",
      severity = 'Soft,
      check = { tag = 'NuCmd, cmd = "rg 'roles.contains|role ==' --glob '*.rs' platform/crates/ | grep -v test", expect_exit = 1 },
      rationale = "Ad-hoc role checks create authorization logic scattered across services that cannot be audited or modified atomically.",
    },
  ],
  related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-015-solo-mode-architecture"],
  ontology_check = {
    decision_string = "Six hard SOLID boundaries enforced at six independent layers; solo_auth_middleware is the only documented auth bypass; Cedar is the only authorization mechanism",
    invariants_at_risk = ["solid-boundaries", "provider-abstraction"],
    verdict = 'Safe,
  },
}

View file

@ -0,0 +1,91 @@
# ADR-015: Solo mode runs the full architecture with relaxed auth.
# The record below is handed to `make_adr` from adr-defaults.ncl.
# NOTE(review): presumably make_adr applies the ADR contract/defaults — confirm there.
let d = import "adr-defaults.ncl" in
d.make_adr {
  id = "adr-015",
  title = "Solo Mode — Full Architecture with Relaxed Auth",
  status = 'Accepted,
  date = "2026-02-17",
  context = "The platform must run on a single operator's laptop for local development, testing, and single-operator production deployments. Two options were available: (1) Simplified mode — stripped-down binary bypassing services, writing directly to disk/files, skipping NATS and SurrealDB; (2) Full architecture with relaxed auth — same services, same NATS subjects, same SurrealDB schema, but auth middleware replaced with a no-op that auto-creates an admin session. Option 1 creates two separate code paths: solo vs multi-user. Scripts, integrations, and the CLI behave differently per mode. Testing in solo mode cannot validate multi-user behavior. Option 2 preserves a single code path with auth as the only runtime difference.",
  decision = "Solo mode uses the full architecture with relaxed auth. Every service (Orchestrator, Control Center, Vault, Extension Registry, AI/MCP) runs as the same binary with the same NATS subjects and the same SurrealDB schema. Runtime differences: SurrealDB uses embedded RocksDB in solo vs WebSocket server in multi-user; NATS uses nats-server -js child process in solo vs external cluster; auth middleware is solo_auth_middleware (auto-session, no JWT) in solo vs auth_middleware (JWT + Cedar) in multi-user; Vault auto-unseals with local age key in solo vs Shamir threshold or KMS; Cedar default-permits local user in solo vs full policy evaluation. solo_auth_middleware injects fixed UserContext { roles: [admin], mfa_verified: true, user_id: Uuid::nil() } and is gated behind --mode solo runtime flag.",
  rationale = [
    {
      claim = "Single code path eliminates solo/multi-user behavioral divergence",
      detail = "Any script or integration written for solo mode works in multi-user without modification — only the connection strings change. This makes solo mode a valid staging environment for multi-user behavior.",
    },
    {
      claim = "solo_auth_middleware is isolated and auditable",
      detail = "The auth bypass is in one function, gated behind a runtime flag, explicitly tested. Auditing solo mode auth is a grep away: rg 'solo_auth_middleware'. This is safer than multiple ad-hoc bypasses scattered across services.",
    },
    {
      claim = "SurrealDB and NATS data persist across restarts in solo mode",
      detail = "RocksDB + JetStream storage persist to disk. Solo mode is not ephemeral — state survives service restarts, enabling realistic local testing of long-running task scenarios.",
    },
    {
      claim = "CI can run integration tests against the solo mode harness without external infrastructure",
      detail = "The solo mode harness (embedded RocksDB, child nats-server) runs in CI without network or external service dependencies. Full integration test coverage without infrastructure overhead.",
    },
  ],
  consequences = {
    positive = [
      "Any script or integration written for solo mode works in multi-user without modification — only connection strings change",
      "The auth bypass is isolated to one function (solo_auth_middleware) — auditing solo mode auth is a grep away",
      "SurrealDB and NATS data persist across restarts in solo mode (RocksDB + JetStream storage to disk)",
      "CI can run the full integration test suite against the solo mode harness without external infrastructure",
    ],
    negative = [
      "Solo mode requires starting three service binaries (vs one monolith) — managed by service-manager.nu",
      "The age key on disk is the only credential that bypasses Vault — its path must be chmod 600",
      "nats-server must be in $PATH for solo mode startup",
    ],
  },
  alternatives_considered = [
    {
      option = "Simplified mono-binary for solo, full services for multi-user",
      why_rejected = "Creates two code paths. Testing in solo mode does not validate multi-user behavior. Scripts written for solo mode require adaptation for multi-user. Doubles the maintenance surface.",
    },
    {
      option = "Feature flags at compile time (cfg(solo)) to disable auth",
      why_rejected = "Compile-time flags prevent running the same binary in both modes. Deployment would require two separate builds. A runtime flag (--mode solo) is more operationally flexible.",
    },
  ],
  # Machine-checkable constraints; for 'NuCmd the command's exit status is
  # compared against `expect_exit`.
  constraints = [
    {
      id = "solo-mode-runtime-flag-only",
      claim = "solo_auth_middleware must only be activated via --mode solo runtime flag, never via environment variable or compile-time feature",
      scope = "platform/crates/control-center/src/lib.rs",
      severity = 'Hard,
      # ripgrep filters files with --glob/-g; `--include` is a grep flag that rg
      # rejects. With no rg output grep -v exits 1, so expect_exit = 0 could
      # never be met.
      check = { tag = 'NuCmd, cmd = "rg 'solo_auth_middleware' --glob '*.rs' platform/ | grep -v '#\\[cfg(test)'", expect_exit = 0 },
      rationale = "A runtime flag is explicit and auditable in process listings. An environment variable or compile-time flag creates an invisible bypass that cannot be detected without reading code or config.",
    },
    {
      id = "age-key-file-permissions",
      claim = "The age key at ${data_dir}/vault/master.age must be created with mode 0600 and must be the only file-based secret in the platform",
      scope = "platform/secretumvault/src/solo.rs",
      severity = 'Hard,
      # Two corrections: --include -> --glob (see above), and grep -Ev instead
      # of grep -v — in basic regex `|` is a literal character, so the original
      # filter only excluded lines containing the text "chmod|0o600|0600".
      check = { tag = 'NuCmd, cmd = "rg 'master.age|vault.*key' --glob '*.rs' platform/ | grep -Ev 'chmod|0o600|0600'", expect_exit = 1 },
      rationale = "The age key is the bootstrap secret — the only credential that bypasses Vault. Strict file permissions are the only protection. Any additional file-based secrets would violate the single-secret constraint.",
    },
    {
      id = "nats-server-child-lifecycle",
      claim = "Orchestrator must start nats-server -js as a managed child process with TCP availability wait (10s timeout) and SIGTERM on shutdown",
      scope = "platform/crates/orchestrator/src/nats.rs",
      severity = 'Soft,
      check = { tag = 'Grep, pattern = "nats-server|nats_server", paths = ["platform/crates/orchestrator/"], must_be_empty = false },
      rationale = "Unmanaged nats-server processes leak across service restarts and leave stale JetStream state. The 10s TCP wait prevents race conditions between Orchestrator and the NATS server on startup.",
    },
  ],
  related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-014-solid-enforcement"],
  ontology_check = {
    decision_string = "Solo mode uses full architecture with solo_auth_middleware as the only auth bypass, gated behind --mode solo runtime flag",
    invariants_at_risk = ["solid-boundaries"],
    verdict = 'Safe,
  },
}

View file

@ -0,0 +1,104 @@
# ADR-016: workspace taskserv execution expressed as typed DAGs (Formula pattern).
# The record below is handed to `make_adr` from adr-defaults.ncl.
# NOTE(review): presumably make_adr applies the ADR contract/defaults — confirm there.
let d = import "adr-defaults.ncl" in
d.make_adr {
  id = "adr-016",
  title = "Workspace Taskserv Execution as Typed DAGs (Formula Pattern)",
  status = 'Accepted,
  date = "2026-03-14",

  context = "Workspace server definitions declare taskservs as positional arrays — e.g. `taskservs = [etcd, kubernetes, containerd, cilium]`. The provisioning platform executes these in strict linear order regardless of actual dependencies between tasks. This model has three problems: (1) It cannot express parallelism — `containerd` and `coredns` are independent of each other but are serialized behind `kubernetes`. (2) It cannot express conditional edges — a failed `etcd` should halt `kubernetes` but a failed `coredns` should not. (3) The execution intent is implicit — there is no machine-readable artifact that declares which tasks depend on which, so no validation is possible at schema time. The Orchestrator already implements a full `DependencyGraph` with topological sort and `max_parallel_tasks` in `workflow.rs`, but `batch.rs` was building a linear chain from the positional array, ignoring the graph entirely.",

  decision = "Workspace infrastructure definitions declare taskserv execution order as typed DAGs via a `Formula` Nickel record exported from `schemas/lib/formula.ncl`. Each `FormulaNode` carries: `id`, a `TaskServDef` (name, profile, target_save_path), `depends_on: Array FormulaDep` (referential edges by node_id + DepKind), `parallel: Bool`, `on_error: [| Stop | Continue | Retry |]`, and `max_retries: u8`. The Formula is validated at schema time by a custom Nickel contract that checks: no duplicate node IDs, every `depends_on.node_id` references a declared node, every `edges.{from,to}` references a declared node. At runtime, `Formula::from_json` in `formula.rs` deserializes the JSON export and `Formula::into_workflow(FormulaWorkflowConfig)` converts it into a `WorkflowDefinition` fed directly to `BatchWorkflowEngine::execute_workflow`, which runs the existing `DependencyGraph` topological sort with `max_parallel_tasks`. Positional `taskservs` arrays remain valid — they are the per-server composition definition and are retrocompatible. Formulas are an additive artifact in the same `servers.ncl` file.",

  rationale = [
    {
      claim = "Schema-time referential integrity catches broken DAGs before deployment",
      detail = "The `_Formula` custom Nickel contract validates all `depends_on.node_id` and edge endpoints against the declared `nodes` array. A missing node ID is a typecheck error, not a runtime panic. This enforces the type-safety-nickel axiom on execution topology.",
    },
    {
      claim = "Parallelism is now explicit and governed",
      detail = "Nodes marked `parallel = true` with no shared dependency run concurrently up to `max_parallel`. The control plane formula runs etcd first, then kubernetes, containerd, and coredns in parallel (3 workers), then cilium after k8s+containerd. This halved the estimated provisioning time for a 5-node cluster compared to the linear chain.",
    },
    {
      claim = "on_error semantics are declarative, not implicit",
      detail = "`on_error = 'Stop` halts the entire workflow on node failure (required for etcd, kubernetes). `on_error = 'Continue` allows the workflow to proceed past a non-critical failure (coredns can fail without blocking cilium). `on_error = 'Retry` retries up to max_retries times before propagating. Previously all failures were treated as Stop with no way to express Continue.",
    },
    {
      claim = "Retrocompatible — zero migration cost for existing servers",
      detail = "TaskServDef now has `depends_on`, `on_error`, `max_retries` fields with defaults. Existing `servers.ncl` files typecheck unchanged. Formulas are an opt-in additive array alongside the existing `servers` array. Batch.rs preserves the linear execution path when no formula is supplied.",
    },
    {
      claim = "Single runtime path — the existing DependencyGraph is reused",
      detail = "No new execution engine was written. `Formula::into_workflow` produces a standard `WorkflowDefinition` consumed by the existing `BatchWorkflowEngine::execute_workflow`. The DependencyGraph topological sort and parallel dispatch already existed in workflow.rs and were simply never reached via the batch coordinator.",
    },
  ],

  consequences = {
    positive = [
      "Parallel taskserv execution is now possible and schema-validated",
      "DAG structure is a first-class artifact — diffable, auditable, versionable in git",
      "on+re reflection mode `provisioning-validate-formula` provides cross-validation (taskserv existence, ConflictsWith, cycle detection)",
      "FormulaWorkflowConfig<'a> groups conversion parameters — batch.rs call sites are explicit and lint-clean",
      "Ontology node `formula-dag-execution` registers this pattern for on+re governance",
    ],
    negative = [
      "Two parallel models exist: positional `taskservs` arrays (per-server composition) and `formulas` (execution DAGs). Authors must understand the distinction.",
      "Formula node IDs are a new namespace within a server definition — ID collisions across formulas in the same file are not currently detected at parse time (only within a single formula).",
      "Nickel's custom contract for referential integrity runs at export time, not at typecheck time — `nickel typecheck` alone is insufficient; `nickel export` is required for full validation.",
    ],
  },

  alternatives_considered = [
    {
      option = "Positional array with dependency annotations as comments",
      why_rejected = "Comments are not machine-readable. Cannot be validated, cannot drive runtime parallelism, cannot be consumed by on+re modes. Violates the type-safety-nickel axiom.",
    },
    {
      option = "Separate formula file per server (e.g. wuji-formula.ncl)",
      why_rejected = "Separates declaration from context. The `servers.ncl` file already owns the server definition including its taskservs — the formula belongs alongside it. Import proliferation adds no structural benefit.",
    },
    {
      option = "Encode DAG as a TOML/YAML file consumed by the Orchestrator",
      why_rejected = "Breaks the type-safety-nickel axiom. TOML/YAML have no contracts, no referential integrity, no schema composition. The Formula pattern allows the Nickel schema to own the execution topology, which is where it belongs.",
    },
    {
      option = "Extend TaskServDef directly with execution metadata (depends_on, on_error) and derive the DAG implicitly",
      why_rejected = "Conflates composition (which taskservs a server needs) with orchestration (in what order and how). The Formula is a separate, named artifact that can be versioned, validated, and governed independently from the taskserv list.",
    },
  ],

  # Machine-checkable constraints; each `check` is a tagged record
  # ('NuCmd, 'FileExists or 'Grep).
  constraints = [
    {
      id = "formula-node-ids-unique-within-formula",
      claim = "Node IDs must be unique within a single Formula — the custom Nickel contract enforces this at export time",
      scope = "schemas/lib/formula.ncl (_Formula contract), workspaces/*/infra/*/servers.ncl",
      severity = 'Hard,
      check = {
        tag = 'NuCmd,
        cmd = "nickel export --format json examples/workspaces/basic/servers.ncl 2>/dev/null | jq '[.formulas[].nodes[].id] | group_by(.) | map(select(length > 1)) | length == 0' | grep -q true",
        expect_exit = 0,
      },
      rationale = "Duplicate node IDs produce ambiguous depends_on resolution. The contract catches this before the JSON reaches formula.rs.",
    },
    {
      id = "formula-depends-on-declared-nodes-only",
      claim = "Every depends_on.node_id and edge endpoint must reference a declared node in the same formula",
      scope = "schemas/lib/formula.ncl (_Formula contract)",
      severity = 'Hard,
      check = {
        tag = 'FileExists,
        path = "schemas/lib/formula.ncl",
        present = true,
      },
      rationale = "A reference to a non-existent node_id would silently drop the dependency at runtime, producing an incorrect execution order with no error.",
    },
    {
      id = "formula-runtime-conversion-via-formula-rs-only",
      claim = "All Formula-to-WorkflowDefinition conversion must go through Formula::into_workflow — no ad-hoc JSON parsing in batch.rs or elsewhere",
      scope = "platform/crates/orchestrator/src/batch.rs, platform/crates/orchestrator/src/formula.rs",
      severity = 'Hard,
      # NOTE(review): this requires "nickel export" to appear under
      # platform/crates/; it is not obvious how that verifies the claim above —
      # confirm the intended pattern and must_be_empty value.
      check = {
        tag = 'Grep,
        pattern = "nickel export",
        paths = ["platform/crates/"],
        must_be_empty = false,
      },
      rationale = "The FormulaWorkflowConfig struct and into_workflow carry the semantic mapping (task names, arg construction, metadata injection). Bypassing it risks silent divergence between schema intent and runtime behavior.",
    },
  ],

  related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"],

  ontology_check = {
    decision_string = "Workspace taskserv execution topology as typed DAGs via Formula Nickel pattern, converted to WorkflowDefinition at runtime by formula.rs",
    invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
    verdict = 'Safe,
  },
}

View file

@ -0,0 +1,78 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-017",
title = "TypeDialog — Schema-Driven Web Form Backend for Workspace Configuration",
status = 'Accepted,
date = "2026-01-08",
context = "Workspace configuration requires validated user input across multiple fields with interdependencies (e.g. selecting 'kubernetes' deployment enables K8s-specific options). Nushell's `input` command is single-line text only — no validation, no conditional fields, no multi-user collaboration. Nickel is declarative and cannot prompt users interactively. Two options considered: (1) TUI dialogs — Rust-native terminal forms, keyboard-driven, works over SSH; (2) Web form backend — browser-accessible, schema-generated forms, multi-user workflow support.",
decision = "TypeDialog implements a schema-driven web form backend. Nickel contracts in `.typedialog/provisioning/schemas/` are the single source of truth; form fields are generated directly from schema types and constraints with zero manual form code. The web UI is embedded in the control center dashboard (accessible from any browser) and supports draft → review → approve workflows for team environments. A TUI fallback is available for SSH-only environments. Generated config is written as NCL and validated against the same contracts that drove form generation.",
rationale = [
{
claim = "Schema drift between form and config is structurally impossible",
detail = "Form fields are generated from Nickel contracts at render time. If a schema changes, the form changes. No manual sync required — the source of truth is a single file.",
},
{
claim = "Web UI enables multi-user collaborative workflows impossible with TUI",
detail = "Platform engineers, security teams, and dev leads can each access the form from their own browser without SSH. Draft configs persist across sessions. Approval flows are built into the form lifecycle.",
},
{
claim = "TypeDialog fragments encode valid configuration combinations",
detail = "Fragments (database-postgres, deployment-k8s, etc.) capture the full parameter space for each configuration pattern. The fragment system prevents invalid combinations (e.g. PostgreSQL-specific options appearing for a SQLite config) without runtime validation logic.",
},
],
consequences = {
positive = [
"Zero manual form code — schema changes propagate to UI automatically",
"Multi-user collaboration via browser; no SSH required",
"Draft config persistence with audit trail of who configured what",
"TypeDialog fragments prevent invalid config combinations structurally",
],
negative = [
"Control center must be running for web UI access — SSH-only environments use TUI fallback",
"Form generation requires contract files to be valid NCL — schema errors surface as broken forms, not compile errors",
],
},
alternatives_considered = [
{
option = "TUI-only (Ratatui forms, keyboard-driven)",
why_rejected = "Single-user, requires interactive terminal, no multi-user collaboration, no draft persistence, no browser access. Adequate for solo mode but insufficient for team deployments.",
},
{
option = "Custom HTML forms maintained separately from schemas",
why_rejected = "Manual maintenance creates drift between form fields and schema types. Every schema change requires a form update. Two sources of truth.",
},
],
constraints = [
{
id = "schema-is-form-source-of-truth",
claim = "TypeDialog form fields must be generated from Nickel contracts — no manual form field definitions allowed",
scope = ".typedialog/provisioning/schemas/",
severity = 'Hard,
check = { tag = 'FileExists, path = ".typedialog/provisioning/schemas/provisioning-config.ncl", present = true },
rationale = "Manual form definitions diverge from schemas. The contract is the form.",
},
{
id = "generated-config-validates-against-contract",
claim = "Config output by TypeDialog must validate against the same schema contracts used for form generation",
scope = ".typedialog/provisioning/generated/",
severity = 'Hard,
check = { tag = 'FileExists, path = ".typedialog/provisioning/generated", present = true },
rationale = "A form that produces invalid config provides false confidence. Validation must be end-to-end.",
},
],
related_adrs = ["adr-013-surrealdb-global-store", "adr-014-solid-enforcement"],
ontology_check = {
decision_string = "TypeDialog generates web forms from Nickel contracts; schema contracts are the single source of truth for both validation and UI generation",
invariants_at_risk = ["type-safety-nickel"],
verdict = 'Safe,
},
}

View file

@ -0,0 +1,87 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-018",
title = "SecretumVault — Dynamic Secrets Complementary to SOPS+Age",
status = 'Accepted,
date = "2026-01-08",
context = "The platform manages two distinct classes of secrets: (1) static gitops secrets — API keys, TLS certs, SSH keys committed as encrypted SOPS+Age files; (2) dynamic runtime secrets — temporary database passwords, short-lived tokens, auto-rotating credentials. SOPS+Age handles class 1 well but has no concept of TTL, auto-rotation, or access audit trails. HashiCorp Vault handles class 2 but is a Go binary (not Rust-native), uses the BSL license (not permissive), and uses HCL policies (incompatible with the Cedar authorization model).",
decision = "SecretumVault provides dynamic runtime secrets management. It is embedded in the platform as a Rust-native library (path dependency `../../../Development/secretumvault`) with Cedar policy enforcement, SurrealDB-backed storage, and filesystem backend for solo mode (age key at `${data_dir}/vault/master.age`). SOPS+Age remains for static gitops secrets. SecretumVault complements, not replaces, SOPS.",
rationale = [
{
claim = "Rust-native: zero subprocess overhead, same authorization model as the platform",
detail = "SecretumVault is linked as a Rust library. No subprocess, no gRPC, no network hop for secret retrieval. Cedar policies for secrets are evaluated by the same Cedar engine used for infrastructure authorization — one policy model, one audit trail.",
},
{
claim = "HashiCorp Vault rejected: BSL license and HCL policies incompatible with platform constraints",
detail = "BSL license restricts commercial use without a subscription. HCL policies require maintaining a separate policy language alongside Cedar. Neither aligns with the platform's Rust/Cedar/Nickel stack.",
},
{
claim = "SOPS+Age remains authoritative for static gitops secrets",
detail = "Static secrets (API keys in config, TLS certs) are already gitops-managed via SOPS. Replacing SOPS would break the gitops workflow and add migration risk. SecretumVault handles only runtime-dynamic secrets.",
},
],
consequences = {
positive = [
"Dynamic credential TTL and auto-rotation for database passwords and tokens",
"Cedar-gated secret access with audit trail — compliance-ready",
"Solo mode uses filesystem age key — no infrastructure dependency for single-operator deployments",
"Consistent authorization model: Cedar governs both infrastructure operations and secret access",
],
negative = [
"HA deployment requires 3-node Raft cluster — additional infrastructure for production",
"Age key at `${data_dir}/vault/master.age` is the only bootstrap credential — requires chmod 600 and secure backup",
"Local path dependency — requires secretumvault repo checkout alongside provisioning",
],
},
alternatives_considered = [
{
option = "HashiCorp Vault",
why_rejected = "BSL license incompatible with open distribution. Go binary introduces subprocess overhead. HCL policies require a second policy language alongside Cedar. No Rust-native integration.",
},
{
option = "Extend SOPS+Age to handle dynamic secrets",
why_rejected = "SOPS is a file encryption tool, not a secrets engine. TTL, auto-rotation, and audit trails are not concepts SOPS is designed for. Extending it would be a reimplementation of a secrets engine with worse UX.",
},
],
constraints = [
{
# Constraint: SOPS+Age and SecretumVault own disjoint secret classes.
id = "sops-and-vault-complementary",
claim = "SOPS+Age handles static gitops secrets; SecretumVault handles runtime dynamic secrets — no overlap",
scope = "platform/secretumvault/",
severity = 'Hard,
# NOTE(review): this check only confirms that the string "secretumvault" occurs
# somewhere under platform/secretumvault/; it does not inspect which secret
# classes are stored where — confirm whether a stricter check is intended.
check = { tag = 'Grep, pattern = "secretumvault", paths = ["platform/secretumvault/"], must_be_empty = false },
rationale = "Mixing the two systems creates ambiguity about which is authoritative for a given secret class. Clear separation prevents accidental migration of static secrets into the vault.",
},
{
id = "cedar-governs-vault-access",
claim = "All SecretumVault read operations must be authorized via Cedar policies — no bypass path",
scope = "platform/secretumvault/src/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "cedar|authorize", paths = ["platform/secretumvault/"], must_be_empty = false },
rationale = "Cedar is the single authorization point for the platform. Vault access bypassing Cedar creates an unaudited path to secrets.",
},
{
# Constraint: every Rust line that references master.age must also carry the
# 0600 permission literal.
id = "vault-age-key-permissions",
claim = "The bootstrap age key at vault/master.age must be chmod 600 and must be the only file-based credential",
scope = "platform/secretumvault/src/solo.rs",
severity = 'Hard,
# The pipeline lists .rs lines mentioning master.age and drops those that carry
# 0o600/0600; grep -v exits 1 when nothing is left, hence expect_exit = 1.
# File filtering uses ripgrep's --glob: rg has no --include option (that is GNU
# grep syntax), so the previous form made rg fail with no output and the check
# passed without scanning any file.
check = { tag = 'NuCmd, cmd = "rg 'master.age' --glob '*.rs' platform/ | grep -v '0o600\\|0600'", expect_exit = 1 },
rationale = "The age key is the bootstrap trust anchor. If it is world-readable, the entire vault is compromised.",
},
],
related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"],
ontology_check = {
decision_string = "SecretumVault provides dynamic runtime secrets with Cedar authorization; SOPS+Age remains for static gitops secrets; complementary, not competing",
invariants_at_risk = ["solid-boundaries"],
verdict = 'Safe,
},
}

View file

@ -0,0 +1,87 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-019",
title = "Schema-Aware AI and RAG — Nickel Contracts Constrain AI Config Generation",
status = 'Accepted,
date = "2026-01-08",
context = "Infrastructure configuration generation via LLM is unreliable without grounding: generic AI produces plausible but structurally invalid configs (wrong field names, invalid enum values, incompatible option combinations). Two risks: (1) hallucination — AI generates configs that fail schema validation; (2) security — AI agents with unrestricted access to secrets and deployment operations create unaudited paths. The platform has Nickel schemas for all configuration surfaces and Cedar for authorization — both can be used to constrain AI behavior.",
decision = "AI config generation is constrained by Nickel schemas at generation time and by Cedar policies at authorization time. The ai-service is the HTTP entry point for all AI operations. RAG indexes Nickel schemas, documentation, and past deployments as retrieval context — AI generates WITH schema context, making hallucination structurally harder. Cedar policy forbids ai-service from accessing any secret and requires `context.human_approved == true` before any deployment operation. The mcp-server exposes tool calling (nickel_validate, schema_query, best_practices) to LLM agents.",
rationale = [
{
claim = "Schema-constrained generation eliminates invalid config hallucination",
detail = "Generic LLMs generate `engine = 'postgresql'` when the contract says `engine | [| 'postgres, 'mysql |]`. Providing the schema as RAG context gives the model the exact valid values. Post-generation nickel export validates the output against the same contract.",
},
{
claim = "Cedar is the enforcement layer — not prompt engineering",
detail = "Prompting AI to 'not access secrets' is not a security boundary. Cedar policy `forbid(principal == Service::\"ai-service\", action == Action::\"read\", resource in Secret::\"*\")` is enforced at the platform layer regardless of what the LLM requests.",
},
{
claim = "RAG over project artifacts is more accurate than generic LLM for project-specific configs",
detail = "Indexing `schemas/`, `docs/`, and past successful deployments means AI answers are grounded in actual project patterns — not generic infrastructure knowledge that may conflict with project constraints.",
},
],
consequences = {
positive = [
"AI cannot generate configs that fail Nickel schema validation — structural correctness enforced",
"Cedar prevents AI from accessing secrets or deploying without human approval",
"RAG over project artifacts reduces hallucination on project-specific options",
"MCP tool calling (nickel_validate, schema_query) enables LLM agents to self-correct",
],
negative = [
"RAG index must be kept current as schemas and docs evolve — stale index degrades answer quality",
"ai-service adds a service dependency for all AI-assisted operations",
"Cost tracking required: rate limiting at 60 req/min, 1M tokens/day, $100/day",
],
},
alternatives_considered = [
{
option = "Generic LLM without schema grounding (GitHub Copilot style)",
why_rejected = "Generates syntactically valid but semantically wrong configs — wrong enum values, missing required fields, invalid option combinations. Schema validation must happen after generation and frequently fails.",
},
{
option = "Fine-tuned model on project schemas",
why_rejected = "Fine-tuning is expensive, requires retraining on every schema change, and does not generalize across projects. RAG is dynamic and always reflects the current schema state.",
},
],
constraints = [
{
id = "ai-cannot-access-secrets",
claim = "ai-service must have a Cedar policy explicitly forbidding access to any Secret resource",
scope = "platform/crates/control-center/src/policies/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "ai-service.*Secret|Secret.*ai-service", paths = ["platform/"], must_be_empty = false },
rationale = "AI agents with secret access create unaudited credential exposure. The constraint must be at the authorization layer, not in the LLM prompt.",
},
{
id = "ai-deployment-requires-human-approval",
claim = "Any deployment action triggered by ai-service must have context.human_approved == true in the Cedar evaluation context",
scope = "platform/crates/orchestrator/src/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "human_approved", paths = ["platform/"], must_be_empty = false },
rationale = "Autonomous deployment without human review is an unacceptable risk for production infrastructure. The approval gate is enforced by Cedar, not by AI self-restraint.",
},
{
id = "ai-generation-validates-against-schema",
claim = "All AI-generated Nickel configs must be validated via nickel export before being presented to the user or submitted to the orchestrator",
scope = "platform/crates/ai-service/src/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "nickel.*export|nickel_validate", paths = ["platform/crates/ai-service/"], must_be_empty = false },
rationale = "Post-generation validation closes the loop — if the LLM generates an invalid config despite schema grounding, the user sees a validation error, not a deployment failure.",
},
],
related_adrs = ["adr-014-solid-enforcement", "adr-017-typedialog-web-ui", "adr-018-secretumvault-integration"],
ontology_check = {
decision_string = "AI config generation is constrained by Nickel schemas (RAG grounding) and Cedar policies (secret isolation, human approval gate)",
invariants_at_risk = ["solid-boundaries", "type-safety-nickel"],
verdict = 'Safe,
},
}

View file

@ -0,0 +1,103 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-020",
title = "Extension Capability Declarations: provides/requires/conflicts_with Taxonomy",
status = 'Accepted,
date = "2026-04-03",
context = "ADR-016 introduced typed Formula DAGs for intra-server taskserv execution order. To enable formula dependency resolution at the workspace composition layer (inter-formula DAGs, ADR-021), the Orchestrator needs a machine-readable declaration of what each taskserv produces and what it depends on. Without this, a workspace composition DAG cannot validate that a Formula consuming `kubernetes-api-server` has at least one upstream Formula that provides it — the constraint is implicit and unenforced. Ten built-in taskservs existed with only `name/version/description/supported_providers` metadata — no capability declarations.",
decision = "Every taskserv `metadata.ncl` file must declare three fields: `provides: Array CapabilityEntry` (capabilities this taskserv makes available after successful execution), `requires: Array CapabilityRequirement` (capabilities this taskserv needs from another provider before it can run), and `conflicts_with: Array String` (taskserv names that are mutually exclusive — installing both would produce an irreconcilable conflict). A `CapabilityEntry` carries `id: String` (dot-namespaced, e.g. `kubernetes.api-server`), `kind: CapabilityKind` (`'Service | 'StorageClass | 'NetworkPolicy | 'Runtime | 'CertManager | 'Monitoring | 'Registry | 'DNS | 'Auth`), and `description: String`. A `CapabilityRequirement` carries `capability: String` (the capability `id`), `kind: RequirementKind` (`'Required | 'Optional`), and `description: String`. These fields are validated by `schemas/lib/extension-metadata.ncl` at Nickel typecheck time and audited at runtime by the `provisioning-dag-integrity` reflection mode.",
rationale = [
{
claim = "Machine-readable capability declarations enable schema-time resolution validation",
detail = "The `provisioning-dag-integrity` reflection mode cross-checks every `Required` capability in `requires[]` against the set of `provides[].id` values across all taskservs. An unresolved Required capability is a hard error surfaced before any deployment attempt. Without typed declarations, this check requires reading code comments or documentation.",
},
{
claim = "ConflictsWith enforces mutual exclusion at the registry level",
detail = "A taskserv pair in `conflicts_with` that both appear in a Formula is caught by `provisioning-validate-formula`. The registry-level declaration makes conflicts auditable and tool-enforceable — no runtime failure needed to discover an incompatible combination.",
},
{
claim = "CapabilityKind enum scopes the semantic surface of each capability",
detail = "Using a typed enum (`'Service | 'StorageClass | ...`) rather than free-form strings prevents capability ID sprawl. The DAG resolution query `extensions capabilities --type Service` is only possible with a bounded kind set.",
},
{
claim = "Optional vs Required separation allows partial-graph deployments",
detail = "`'Optional` requirements express soft preferences (e.g. coredns optionally uses an upstream DNS). `'Required` requirements are hard blockers. The distinction enables the orchestrator to warn on unresolved Optional capabilities while failing on unresolved Required ones.",
},
{
claim = "Dot-namespaced capability IDs provide scoping without a global registry",
detail = "IDs like `kubernetes.api-server`, `storage.ceph-block`, `network.cni` are self-documenting and conflict-resistant without requiring a central registry. The namespace prefix is the domain (kubernetes, storage, network, container, tls, dns, monitoring, identity).",
},
],
consequences = {
positive = [
"All 10 built-in taskservs now have typed capability declarations — `provisioning-dag-integrity` runs clean",
"CLI `provisioning catalog capabilities` and `provisioning extensions graph` are powered by these declarations",
"WorkspaceComposition dependency resolution can validate inter-formula capability chains at dag.ncl export time",
"Capability declarations are a first-class artifact — diffable, auditable, versionable in git",
],
negative = [
"New taskservs must populate provides/requires/conflicts_with or fail schema validation — increases authoring burden",
"Capability IDs are not validated against a central registry — a typo in `requires[].capability` fails silently if no provider declares the misspelled ID",
"CapabilityKind enum is closed — adding a new kind requires updating `schemas/lib/dag/contracts.ncl` and re-exporting all metadata files that use it",
],
},
alternatives_considered = [
{
option = "Free-form capability tags (Array String) instead of typed CapabilityEntry",
why_rejected = "Free strings cannot be validated for kind, cannot be queried by type, and cannot carry descriptions. The typed record is required for `provisioning catalog capabilities --type Service` to function and for the `provisioning-dag-integrity` mode to distinguish Required from Optional resolution failures.",
},
{
option = "Single `capabilities` array with a `direction` discriminant (provides/requires encoded as field)",
why_rejected = "A flat array conflates semantically different operations — providing a capability and requiring one have different validation rules and different consumers. Separate `provides` and `requires` arrays make intent explicit and allow independent schema validation.",
},
{
option = "Encode conflicts_with as capability-level conflicts (A.provides X conflicts with B.provides X)",
why_rejected = "Capability-level conflicts are more granular but harder to author — taskserv authors must reason about every capability pair. Taskserv-level mutual exclusion (`conflicts_with: [\"containerd\"]`) is the correct granularity for installation-time enforcement and maps directly to the package manager mental model.",
},
{
option = "Central capability registry file (a single capabilities.ncl across all extensions)",
why_rejected = "A central registry creates a write-contention hotspot when multiple extensions are developed in parallel. Distributed declarations in each metadata.ncl, aggregated by the reflection mode and CLI, achieve the same discoverability with independent authoring.",
},
],
constraints = [
{
# Constraint: capability identifiers must be written as <domain>.<name>.
id = "capability-ids-dot-namespaced",
claim = "All capability IDs in provides[].id and requires[].capability must use dot-namespaced format: `<domain>.<name>` (e.g. `kubernetes.api-server`, `storage.ceph-block`)",
scope = "catalog/taskservs/*/metadata.ncl",
severity = 'Hard,
# The pattern matches a quoted value with no dot assigned to either field the
# claim names: `id` (provides entries) and `capability` (requires entries).
# Previously only `id` was matched, so flat IDs in requires[] went unchecked.
check = { tag = 'Grep, pattern = "(id|capability) = \"[^.]+\"", paths = ["catalog/taskservs/"], must_be_empty = true },
rationale = "Flat IDs (no dot) are ambiguous and collision-prone. The dot namespace convention is the only disambiguation mechanism without a central registry.",
},
{
id = "all-taskservs-must-declare-capability-fields",
claim = "Every taskserv metadata.ncl must declare provides, requires, and conflicts_with — even if as empty arrays",
scope = "catalog/taskservs/*/metadata.ncl, schemas/lib/extension-metadata.ncl",
severity = 'Hard,
check = { tag = 'NuCmd, cmd = "provisioning catalog capabilities 2>/dev/null | length | test { $in > 0 }", expect_exit = 0 },
rationale = "Missing fields are caught by the schema contract but also by `provisioning-dag-integrity`. A taskserv without declarations is invisible to capability resolution — it will never be identified as a provider or dependency.",
},
{
# Constraint: conflicts_with entries are taskserv directory names.
id = "conflicts-with-holds-taskserv-names-not-capability-ids",
claim = "conflicts_with[] must contain taskserv directory names (e.g. `\"containerd\"`), not capability IDs",
scope = "catalog/taskservs/*/metadata.ncl",
severity = 'Hard,
# NOTE(review): this command only lists the directory basenames under
# catalog/taskservs/; it never reads a conflicts_with value, so a zero exit
# does not appear to demonstrate the claim — confirm whether a comparison
# against metadata.ncl contents is intended.
check = { tag = 'NuCmd, cmd = "nu -c 'ls catalog/taskservs/ | get name | each { |d| $d | path basename }'", expect_exit = 0 },
rationale = "The conflict resolution algorithm in `provisioning-validate-formula` looks up taskserv names in the extensions registry. Capability IDs in conflicts_with would never match and silently fail to enforce the constraint.",
},
],
ontology_check = {
decision_string = "Extension capability declarations via provides/requires/conflicts_with typed fields in metadata.ncl, validated by extension-metadata schema and provisioning-dag-integrity reflection mode",
invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-016-workspace-formula-dag"],
}

View file

@ -0,0 +1,117 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-021",
title = "Workspace Composition DAG: Multi-Formula Orchestration and Task Namespacing",
status = 'Accepted,
date = "2026-04-03",
context = "ADR-016 established the Formula pattern for intra-server taskserv execution as a typed DAG. A workspace may contain multiple servers, each with a Formula defining its taskserv execution order. When a workspace is provisioned end-to-end, the Orchestrator needs to execute these Formulas in dependency order — the storage cluster Formula must wait for the control plane Formula to reach a stable state before Ceph OSD initialization begins. This inter-formula coordination layer was missing: batch.rs executed Formulas sequentially in declaration order with no dependency semantics and no health-gate mechanism to verify cluster readiness between Formula groups.",
decision = "Workspace infrastructure definitions declare inter-formula execution order in a `dag.ncl` artifact at `{workspace_root}/infra/{infra}/dag.ncl`. Each entry in `dag.ncl.composition.formulas[]` is a `FormulaCompositionEntry` carrying: `formula_id` (matching a formula declared in `servers.ncl`), `depends_on: Array FormulaDep` (edges to other formula_ids with a `condition: String`), `parallel: Bool`, and an optional `health_gate: HealthGateConfig` (a shell command + expected output + timeout). At runtime, `WorkspaceComposition::into_workflow` merges all formula WorkflowDefinitions into a single unified WorkflowDefinition: task names are rewritten as `formula_id::task_name` to prevent collisions across formulas, health gates are injected as synthetic `WorkflowTaskDefinition` records (with `metadata[\"type\"] = \"health-gate\"`), and inter-formula edges wire the terminal tasks of an upstream formula to the health gate (if present) or directly to the root tasks of the downstream formula. The merged WorkflowDefinition is executed by the existing `BatchWorkflowEngine::execute_workflow`. NATS events are emitted on `provisioning.dag.formula.{started,completed,failed}` and `provisioning.dag.healthgate.{checking,passed,failed}` (gated by the `nats` Cargo feature).",
rationale = [
{
claim = "Single WorkflowDefinition preserves the existing parallel dispatch and dependency resolution",
detail = "The `BatchWorkflowEngine` already implements topological sort and `max_parallel_tasks` dispatch via `DependencyGraph`. Merging all formulas into one WorkflowDefinition reuses this path without a new execution layer. Alternative: run each formula as a separate workflow, sequenced by the batch coordinator — this loses cross-formula parallelism (a worker-plane formula could start concurrently with a storage formula if neither depends on the other).",
},
{
claim = "formula_id::task_name namespacing is the minimal collision prevention mechanism",
detail = "Both `wuji-cp-0-formula` and `wuji-strg-0-formula` declare a node named `etcd_create`. Without namespacing, merging these into one WorkflowDefinition produces duplicate task names, breaking DependencyGraph.find(). The `::` separator was chosen over `.` (conflicts with Nickel field access in debug output and is the token separator in NATS subjects) and `/` (ambiguous with path separators). Formula_id recovery at runtime is `task_name.split_once(\"::\").map(|(fid, _)| fid)`.",
},
{
claim = "Health gates as synthetic tasks avoid a separate inter-formula synchronization mechanism",
detail = "A health gate that checks `kubectl get nodes --field-selector=... | wc -l` must block the dependent formula's root tasks until the cluster reports the expected node count. Injecting the gate as a `WorkflowTaskDefinition` with `metadata[\"type\"] = \"health-gate\"` reuses the existing task dependency mechanism — no new scheduling primitive is needed. The health gate command runs as a polling loop (up to timeout_seconds) inside `execute_task_with_retry`.",
},
{
claim = "dag.ncl at infra/{infra}/dag.ncl separates composition from server topology",
detail = "Composition (which formulas run in what order) is a different concern from server topology (which servers exist and what taskservs they run). Keeping dag.ncl separate from servers.ncl allows the composition DAG to evolve (e.g. adding a monitoring formula) without touching the server definitions, and vice versa. The CLI `provisioning dag show/validate/export` resolves dag.ncl independently of servers.ncl.",
},
{
claim = "NATS event emission on provisioning.dag.* enables live DAG visualization in ontoref-daemon",
detail = "The `ontoref-daemon` DagGraphProvider plugin subscribes to `provisioning.dag.>` to render live formula execution state. The subject hierarchies `provisioning.dag.formula.{started,completed,failed}` and `provisioning.dag.healthgate.{checking,passed,failed}` map directly to the formula and health-gate layers of the three-layer DAG (formula, health gate, task). Gating emission behind `#[cfg(feature = \"nats\")]` means the orchestrator compiles and runs without NATS — the feature is additive.",
},
],
consequences = {
positive = [
"Inter-formula dependency order is a first-class schema artifact — validated by `provisioning-validate-formula` and `validate-topology` reflection mode",
"Health gates enforce cluster readiness between formula groups without ad-hoc sleep loops in provisioning scripts",
"Cross-formula parallelism: independent formulas (no depends_on edge) run concurrently within the same workspace provisioning run",
"NATS events on `provisioning.dag.*` feed live DAG visualization and audit trail",
"formula_id::task_name convention is recoverable — any consumer can extract formula_id from the task name without additional metadata",
],
negative = [
"Two DAG artifacts must be kept consistent: formula_ids in dag.ncl must match formula names in servers.ncl — inconsistency causes runtime panic in WorkspaceComposition::from_json (no schema-time cross-file validation)",
"Health gate commands are shell strings — not typed, not validated at schema time. A broken gate command is discovered at runtime when the gate task fails",
"Task names in NATS events and logs are `formula_id::task_name` — tooling must understand the `::` separator to route events correctly",
"Merging N formulas into one WorkflowDefinition means a single task failure with `fail_fast = true` halts all formulas, even those with no dependency on the failing formula",
],
},
alternatives_considered = [
{
option = "Sequential formula execution in declaration order (existing batch.rs behavior)",
why_rejected = "No parallelism, no dependency semantics, no health gates. The storage formula runs after the worker formula even if workers could have started in parallel. Execution order is implicit (declaration order) not explicit (DAG edges).",
},
{
option = "Separate WorkflowDefinition per formula, coordinated by a top-level orchestrator loop",
why_rejected = "Requires a new scheduling layer above BatchWorkflowEngine. Loses cross-formula parallelism. The existing DependencyGraph already handles the scheduling — merging into one definition reuses it at zero cost.",
},
{
option = "Health gate as a special Formula node type (within the formula, not between formulas)",
why_rejected = "A health gate that checks cluster-wide readiness (e.g. `kubectl get nodes`) crosses formula boundaries — it is a property of the composition, not of a single formula. Encoding it within a formula would couple the formula to knowledge of other formulas' outputs.",
},
{
option = "formula_id.task_name (dot separator) for task namespacing",
why_rejected = "Dot conflicts with Nickel field access notation in debug output and log messages. It also conflicts with NATS subject segment separator conventions where dots are meaningful separators. `::` is idiomatic in Rust and unambiguous in all output contexts.",
},
{
option = "Encode inter-formula edges in servers.ncl alongside the formula declaration",
why_rejected = "servers.ncl owns the server topology — which servers exist, what they run. Composition topology (which formulas execute in which order) is a workspace-level concern that may vary without changing the server definitions. A separate dag.ncl allows composition reuse across workspaces that share server topologies.",
},
],
constraints = [
{
# Constraint: dag.ncl may only reference formulas declared in the sibling servers.ncl.
id = "dag-ncl-formula-ids-must-match-servers-ncl",
claim = "Every formula_id in dag.ncl.composition.formulas[] must correspond to a formula declared in the same infra's servers.ncl",
scope = "workspaces/*/infra/*/dag.ncl, workspaces/*/infra/*/servers.ncl",
severity = 'Hard,
check = { tag = 'NuCmd, cmd = "provisioning dag validate 2>/dev/null | grep -q 'CROSS_REF_OK'", expect_exit = 0 },
# NOTE(review): consequences.negative in this ADR states that a formula_id
# mismatch causes a runtime panic in WorkspaceComposition::from_json, whereas
# the rationale below says the formula is silently skipped — confirm which
# behavior is current and align the two statements.
rationale = "A dag.ncl formula_id with no matching servers.ncl formula causes WorkspaceComposition::from_json to silently skip the formula, producing an incomplete workflow. The validate-topology reflection mode and `provisioning dag validate` enforce this cross-file constraint.",
},
{
id = "task-namespacing-via-double-colon-only",
claim = "All task names in a composed WorkflowDefinition must use the `formula_id::task_name` format — no other separator is permitted",
scope = "platform/crates/orchestrator/src/formula.rs (WorkspaceComposition::into_workflow), platform/crates/orchestrator/src/workflow.rs",
severity = 'Hard,
check = { tag = 'Grep, pattern = "split_once\\(\"::\"\\)", paths = ["platform/crates/orchestrator/src/"], must_be_empty = false },
rationale = "The `::` separator is the runtime contract between WorkspaceComposition::into_workflow (producer) and workflow.rs result processing (consumer). Using a different separator in any new composition code would break formula_id extraction and NATS event routing.",
},
{
# Constraint: a health gate is only meaningful when an upstream formula exists.
id = "health-gate-requires-nonempty-depends-on",
claim = "A FormulaCompositionEntry with a health_gate set must have at least one entry in depends_on",
scope = "workspaces/*/infra/*/dag.ncl, schemas/lib/dag/contracts.ncl",
severity = 'Hard,
# The jq filter prints true when no entry has a health_gate with an empty
# depends_on. jq is run with -e so its exit status reflects that result:
# without -e jq exits 0 even when it prints `false` or receives no input
# (e.g. a failed nickel export), which made expect_exit = 0 always hold.
check = { tag = 'NuCmd, cmd = "nickel export --format json --import-path . workspaces/librecloud_renew/infra/wuji/dag.ncl 2>/dev/null | jq -e '[.composition.formulas[] | select(.health_gate != null and (.depends_on | length) == 0)] | length == 0'", expect_exit = 0 },
rationale = "A health gate with no upstream formula has no terminal tasks to wire the gate to. The gate task gets empty depends_on and runs immediately, defeating its purpose. The dag.nu DOT exporter crashes on this case (`$entry.depends_on.0` index-out-of-bounds when depends_on is empty).",
},
{
# Constraint: NATS event emission must be compiled only with the `nats` feature.
id = "nats-emission-behind-feature-flag",
claim = "All NATS event emission in workflow.rs must be behind `#[cfg(feature = \"nats\")]` — the orchestrator must compile and function without NATS",
scope = "platform/crates/orchestrator/src/workflow.rs, platform/crates/orchestrator/Cargo.toml",
severity = 'Hard,
# NOTE(review): the pattern only confirms that `emit_dag_event` appears in
# workflow.rs; it does not look for the cfg attribute, so it would not flag
# an unconditional emission — confirm whether a stricter check is intended.
check = { tag = 'Grep, pattern = "emit_dag_event", paths = ["platform/crates/orchestrator/src/workflow.rs"], must_be_empty = false },
rationale = "NATS is an optional external dependency. The orchestrator must run in solo mode (no NATS) and in test environments without a running NATS server. Unconditional NATS calls would make all workflow tests depend on NATS availability.",
},
],
ontology_check = {
decision_string = "Workspace composition DAG via dag.ncl with formula_id::task_name namespacing, health gates as synthetic tasks, and NATS event emission on provisioning.dag.* subjects",
invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-016-workspace-formula-dag", "adr-020-extension-capability-declarations", "adr-012-nats-event-broker"],
}

View file

@ -0,0 +1,95 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-022",
title = "ncl-sync: Nickel Configuration Sync Daemon",
status = 'Accepted,
date = "2026-04-16",
context = "Every `prvng` CLI invocation that reads configuration runs `nickel export --format json` at least once, often multiple times. There are 124 call sites across the Nu codebase; each export costs 2-5s. The Nu module parse cost (~600-1200ms for 345 files) is a separate problem. This plan targets only the Nickel export cost. `lib_provisioning/config/cache/` already existed with the correct API shape (`cache-lookup`, `lookup-nickel-cache`, etc.) but every function was a no-op — the infrastructure was there but never wired to actual storage. Additionally, `nu_plugin_nickel` already implements file-content-based caching (`nickel-eval` command) but lacked `--import-path` support, which is why all 124 call sites used `^nickel export` directly instead of the plugin.",
decision = "A Rust daemon (`ncl-sync`) compiles NCL to JSON proactively and maintains a shared cache at `~/.cache/provisioning/config-cache/`. The daemon and the `nu_plugin_nickel` plugin share a single cache directory and a single key derivation strategy: `SHA256(file_content + sorted_import_paths_joined_by_colon + format)`. This makes the key content-addressed — identical file content produces the same key regardless of path, and the daemon's pre-warmed entries are immediately visible to `nickel-eval` without any coordination protocol. Nu call sites in the hot path replace `^nickel export --format json ... | from json` with `nickel-eval ... --import-path [...]`. For soft-failure call sites (where export failure is acceptable), a `ncl-eval-soft` wrapper in `lib_provisioning/utils/nickel_processor.nu` isolates the single necessary `try/catch` and exposes clean call sites. The daemon is started by `prvng platform start` via `ncl-sync-start` in `service-manager.nu` and stopped by `prvng platform stop`. Nu processes signal re-export needs after mutations by writing `.sync-<pid>.json` sidecar files (atomic rename); the daemon drains these every 500ms.",
rationale = [
{
claim = "Shared cache dir + content-based key eliminates the need for a socket or IPC between daemon and Nu processes",
detail = "The plugin's `lookup_cache` reads `~/.cache/provisioning/config-cache/<key>.json` directly from disk. The daemon writes to the same path. There is no runtime coordination — the plugin simply finds the file or falls back to direct `nickel export`. Alternative: daemon exposes a Unix socket for reads (prvng-cli daemon plan) — requires Nu processes to know the socket path, handle connection failures, and adds 10-15ms of socket overhead. The file-based approach gives <5ms reads and zero coupling.",
},
{
claim = "Content-addressed key (SHA256 of file content) is more correct than path+mtime-based key",
detail = "A path+mtime key would falsely invalidate the cache if a file is touched without content change (e.g. `git checkout`, `touch`). A content-based key ensures that identical NCL files share a cache entry regardless of path, and that the cache only misses when the file actually changed. The tradeoff is that the key computation requires reading the file — mitigated by the daemon doing this proactively at warm-up rather than on each Nu invocation.",
},
{
claim = "Extending nu_plugin_nickel with --import-path is the correct fix for the 124 ^nickel call sites",
detail = "The plugin existed precisely for this purpose but lacked `--import-path` support, forcing all provisioning code to use `^nickel export` directly. Adding `--import-path` to `nickel-eval` and `nickel-export` unblocks the migration. The plugin already converts JSON to Nu values natively (eliminating `| from json`), handles caching, and preserves error semantics via `LabeledError`.",
},
{
claim = "ncl-sync does not require NATS, SurrealDB, or any platform service",
detail = "The daemon watches the filesystem via `notify`, runs `nickel` as a subprocess, and writes JSON files. It has no network dependencies. If ncl-sync depended on NATS to function, it would have a bootstrap circularity: NATS is a platform service whose configuration is described in NCL. A config cache daemon cannot depend on the services whose configuration it caches.",
},
{
claim = "Nu processes are never writers to the cache directory",
detail = "Single-writer principle: only ncl-sync writes `<key>.json` files to the cache. Nu processes write `.sync-<pid>.json` sidecar files as signals to the daemon, then immediately continue execution. The daemon drains sidecars and writes cache entries. This prevents concurrent-write corruption of cache files without requiring locks.",
},
],
consequences = {
positive = [
"prvng component list, workflow list: ~1.5s (from ~3-7s) — Nu module parse only, no nickel export stall",
"prvng deploy: ~3-5s (from ~15-30s) — multiple nickel exports are cache hits",
"Cache survives across prvng invocations — warm-up on platform start amortizes the cost for the whole session",
"nu_plugin_nickel is now usable for all config reads (--import-path gap closed)",
],
negative = [
"Nu startup cost (~1.2s module parse) is unaffected — a separate problem",
"First invocation of the day: cache cold until daemon warm-up completes (~500ms-2s)",
"ncl-sync binary must be installed and in PATH for performance benefits; absence degrades gracefully to direct nickel export",
],
},
alternatives_considered = [
{
option = "prvng-cli daemon: route read-only CLI commands to a separate Rust HTTP server via Unix socket",
why_rejected = "Solves only specific read commands (<100ms), not the general nickel export cost. Adds a second daemon with socket/PID lifecycle. Nu call sites still need output formatting to match Nu tables. Doesn't help operation-path commands that also call nickel export.",
},
{
option = "Lazy-load Nu modules (refactor main_provisioning/mod.nu)",
why_rejected = "The dispatcher already lazy-loads commands/ subdirectory. The Nu interpreter startup (~200-400ms) is unavoidable regardless. Module parse cost is ~600-1200ms — a real problem but separate from the nickel export stall. This plan targets nickel export; module parse is a future orthogonal improvement.",
},
{
option = "Nu-side cache with file-mtime check (no daemon)",
why_rejected = "Nu processes are ephemeral — no proactive warming. First command of each session still pays the nickel export cost. Concurrent Nu processes (Makefile, CI) cause cache stampede: multiple processes miss simultaneously and all run nickel export. No file watching — cache becomes stale silently after NCL edits.",
},
{
option = "Separate cache directories for daemon and plugin",
why_rejected = "Requires a coordination protocol (socket, IPC, or manifest polling) so the plugin can find daemon-written entries. The shared-directory approach eliminates coordination entirely — the key derivation IS the coordination protocol.",
},
],
ontology_check = {
decision_string = "ncl-sync Rust daemon + nu_plugin_nickel shared cache at ~/.cache/provisioning/config-cache/ with content-based key SHA256(content+imports+format)",
invariants_at_risk = ["config-driven-always", "type-safety-nickel"],
verdict = 'Safe,
},
related_adrs = ["adr-023-ncl-export-wrapper"],
constraints = [
{
id = "ncl-sync-single-writer",
claim = "Nu processes NEVER write .json files to the cache directory directly",
scope = "provisioning/core/nulib/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "save.*config-cache.*\\.json", paths = ["provisioning/core/nulib/"], must_be_empty = true },
rationale = "Single-writer principle: concurrent Nu processes writing cache files would corrupt manifest state and produce partial JSON. Only ncl-sync daemon writes to the cache directory.",
},
{
id = "ncl-sync-no-platform-services",
claim = "ncl-sync binary must not depend on platform-nats, platform-db, or surrealdb",
scope = "provisioning/platform/crates/ncl-sync/Cargo.toml",
severity = 'Hard,
check = { tag = 'Grep, pattern = "platform-nats|platform-db|surrealdb", paths = ["provisioning/platform/crates/ncl-sync/"], must_be_empty = true },
rationale = "Bootstrap circularity: NATS and SurrealDB are platform services whose configuration is cached by ncl-sync. The daemon cannot depend on the services whose configuration it caches.",
},
],
}

View file

@ -0,0 +1,73 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-023",
title = "ncl-eval wrapper: nu_plugin_nickel as the single ^nickel export abstraction in Nu",
status = 'Accepted,
date = "2026-04-16",
context = "After ADR-022 established the ncl-sync daemon and shared cache, Nu call sites needed to be migrated from `^nickel export --format json ... | from json` to the plugin. Two call patterns exist: hard-failure (export failure should propagate as an error — uses `error make`) and soft-failure (export failure should return a fallback value — uses `if $result.exit_code != 0`). Distributing try/catch across 124 call sites would violate the guideline against widespread use of try/catch for Nu plugin commands.",
decision = "Two wrapper functions in `lib_provisioning/utils/nickel_processor.nu` serve as the single abstraction layer: `ncl-eval [path import_paths]` for hard-failure call sites (error propagates from the plugin directly — no try/catch needed), and `ncl-eval-soft [path import_paths fallback]` for soft-failure call sites (a single try/catch returns `fallback` on any plugin error). Block C1 migrates the four hot-path call sites: `dispatcher.nu` (commands-registry), `components.nu` (comp-ncl-export helper + servers.ncl), `workflow.nu` (wf-ncl-export helper + settings.ncl + state.ncl), `extensions.nu` (metadata.ncl per taskserv). Block C2/C3 cover the remaining operation-path and validation call sites.",
rationale = [
{
claim = "ncl-eval-soft isolates the single try/catch to one location",
detail = "In Nu 0.111.0, try/catch is valid for Nu internal commands (including plugins). However, dispersing try/catch across dozens of call sites increases cognitive load and creates inconsistency. Centralizing it in ncl-eval-soft means the pattern is reviewed once and applied uniformly. Callers declare intent via the `fallback` parameter (`{}`, `[]`, or `null`) rather than embedding error-handling logic inline.",
},
{
claim = "ncl-eval (hard-failure) requires no try/catch — plugin LabeledError propagates naturally",
detail = "When `nickel-eval` fails, it raises a `LabeledError` that Nu surfaces as a structured error. This is identical in behavior to `error make { msg: ... }` in the existing code. The call site is simply `ncl-eval $path [$ws $prov]` — one line instead of four. No error handling is needed because the error propagation is the correct behavior.",
},
{
claim = "nickel-eval returns Nu values natively, eliminating | from json",
detail = "The plugin converts `serde_json::Value` to `nu_protocol::Value` via `json_value_to_nu_value`. Call sites receive a Nu record or list directly and can use cell path access (`$data.components`, `$data.dimensions`) without an intermediate string parse step. This removes a class of parse errors where `from json` would fail on empty stdout from a cached result.",
},
],
consequences = {
positive = [
"Hot-path call sites (4 files, C1) are now cache-backed via nu_plugin_nickel",
"Single try/catch location for soft-failure pattern — easy to audit",
"| from json eliminated from migrated call sites",
],
negative = [
"nu_plugin_nickel must be registered in the Nu session for performance benefits; unregistered sessions fall back to the `^nickel export` path in ncl-eval-soft (via the catch branch)",
"Block C2/C3 (remaining 120 call sites) are not yet migrated — those paths still use ^nickel export directly",
],
},
alternatives_considered = [
{
option = "Wrap each call site individually with do { } | complete (existing pattern)",
why_rejected = "Works only for external commands, not for Nu plugin commands. Plugin commands raise LabeledError — not catchable via complete. Keeping ^nickel export at call sites means all cache benefits are lost.",
},
{
option = "Single ncl-export.nu wrapper delegating to ^nickel export with inline cache check",
why_rejected = "Duplicates the cache logic already inside nu_plugin_nickel. Two cache implementations with different key strategies would diverge. The plugin is the correct cache owner — the wrapper should delegate to it.",
},
{
option = "Migrate all 124 call sites at once",
why_rejected = "Risk surface too large. Priority-ordered migration (C1 hot-path first) allows validating cache correctness on the most-exercised paths before touching validation, bootstrap, and diagnostic paths that are harder to test.",
},
],
ontology_check = {
decision_string = "ncl-eval + ncl-eval-soft wrappers in nickel_processor.nu replace ^nickel export at hot-path call sites; single try/catch in ncl-eval-soft",
invariants_at_risk = ["config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-022-ncl-sync-daemon"],
constraints = [
{
# Hard constraint: the four C1 hot-path files must route every Nickel export through ncl-eval / ncl-eval-soft.
id = "c1-no-direct-nickel-export",
claim = "Hot-path files (C1) must not contain direct ^nickel export calls after migration",
scope = "dispatcher.nu, components.nu, workflow.nu, extensions.nu",
severity = 'Hard,
# The caret is escaped so the regex matches the literal Nu external-command sigil in `^nickel export`. Unescaped, a leading `^` is a start-of-line anchor: it would only match lines that begin with `nickel export`, letting this `must_be_empty` check pass even when direct calls are present.
check = { tag = 'Grep, pattern = "\\^nickel export", paths = ["provisioning/core/nulib/main_provisioning/dispatcher.nu", "provisioning/core/nulib/main_provisioning/components.nu", "provisioning/core/nulib/main_provisioning/workflow.nu", "provisioning/core/nulib/main_provisioning/extensions.nu"], must_be_empty = true },
rationale = "Direct ^nickel export in C1 files bypasses the plugin cache, negating the performance benefit of ADR-022. All C1 exports must go through ncl-eval or ncl-eval-soft.",
},
],
}

View file

@ -0,0 +1,99 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-024",
title = "ncl-sync: Event-driven cache invalidation via NATS",
status = 'Accepted,
date = "2026-04-17",
context = "ADR-022 established the ncl-sync daemon with a file watcher (notify) as the automatic invalidation mechanism, together with an explicit sync-request sidecar written by Nu processes (state-write). Both mechanisms have limitations: the file watcher has a debounce window (~100ms) where cache can be momentarily stale, and sync-request polling adds 500ms latency. The orchestrator (Rust) writes state files from a separate process — it cannot easily participate in the file-watcher's same-process events, and requiring it to write sync-request sidecars would couple it to ncl-sync's internal protocol. NATS is already used by the orchestrator for DAG events (`provisioning.dag.*`) — extending it for cache invalidation is a natural fit.",
decision = "ncl-sync gains an optional NATS subscriber behind the `nats` Cargo feature (default-enabled). The subscriber listens on two subjects: `provisioning.workspace.ncl.changed` (file modified) and `provisioning.workspace.ncl.removed` (file deleted). Payload is a JSON object `{workspace, path, import_paths, source}`. On receipt, the subscriber validates that `workspace` matches its watched workspace, then calls `export_ncl` or `evict` directly — bypassing the file-watcher debounce and the sync-request poll. Cache is refreshed in <15ms vs ~100ms (watcher) or ~500ms (sidecar). The mechanism is opt-in via `ncl_sync.nats.enabled = true` in the config — without NATS, the daemon runs identically to before (watcher + sidecar fallback).",
rationale = [
{
claim = "NATS subscriber complements rather than replaces the file watcher",
detail = "Three invalidation mechanisms now exist with different failure characteristics: (1) file watcher — always active, catches any write including manual edits, ~100ms latency; (2) sync-request sidecar — written by Nu state-write, catches Nu-originated writes, ~500ms latency; (3) NATS events — written by any publisher, zero coupling to filesystem, <15ms latency. Each covers a different failure mode: watcher catches untracked writers, sidecar catches Nu writers, NATS catches Rust writers. Redundancy is intentional — duplicate events are idempotent (same cache_key, same content).",
},
{
claim = "Workspace validation prevents cross-daemon interference",
detail = "Multiple ncl-sync daemons may run (one per workspace). All subscribe to the same subject hierarchy. The subscriber canonicalizes both its watched workspace path and the event's workspace path; only events matching its workspace are processed. This allows NATS events to fan out to all relevant daemons without coordination.",
},
{
claim = "Subject hierarchy matches the workspace event model, not the orchestrator DAG model",
detail = "`provisioning.dag.*` subjects are about workflow execution. `provisioning.workspace.ncl.*` subjects are about configuration state. Keeping them separate lets ncl-sync subscribe narrowly (two subjects) without parsing unrelated events. Future publishers (installer, backup restore, etc.) use the same namespace.",
},
{
claim = "Cargo feature flag keeps NATS optional",
detail = "`default = [\"nats\"]` enables NATS in release builds. `cargo build --no-default-features` produces a binary without async-nats linkage — useful for minimal containers, air-gapped environments, or testing. The config field `ncl_sync.nats.enabled` is an additional runtime gate independent of the compile-time feature.",
},
],
consequences = {
positive = [
"Orchestrator-driven state mutations invalidate cache in <15ms (vs ~100ms via file watcher)",
"Zero coupling between orchestrator and ncl-sync — only the subject contract is shared",
"Other subscribers (dashboard UI, audit log) can watch the same subjects without touching ncl-sync",
"Redundant with watcher+sidecar — graceful degradation if NATS is down",
],
negative = [
"Adds ~6MB to ncl-sync binary size (async-nats + dependencies)",
"NATS must be running before ncl-sync connects (but failure is non-fatal — falls back to watcher)",
"Publishers (orchestrator, etc.) must be updated to emit the new subjects — until then, NATS layer has no effect",
],
},
alternatives_considered = [
{
option = "Single mechanism: file watcher only",
why_rejected = "Misses the ~100ms debounce window. For interactive CLI this is fine; for rapid orchestrator-driven state changes (deploy with many state updates), the cache can lag.",
},
{
option = "Single mechanism: NATS only",
why_rejected = "Hard dependency on NATS — ncl-sync fails if NATS isn't running. Manual NCL edits (user opens editor) wouldn't be caught. File watcher must remain as baseline.",
},
{
option = "HTTP endpoint on ncl-sync for invalidation",
why_rejected = "Requires every publisher to know the daemon's Unix socket or HTTP port. NATS decouples publishers from subscribers.",
},
{
option = "Reuse provisioning.dag.* subjects",
why_rejected = "DAG events are about workflow state, not config state. Overloading the subject hierarchy would force ncl-sync to filter noisy events it doesn't care about.",
},
],
ontology_check = {
decision_string = "ncl-sync adds opt-in NATS subscriber on provisioning.workspace.ncl.{changed,removed} for event-driven cache invalidation; watcher + sidecar remain as fallback",
invariants_at_risk = ["config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-022-ncl-sync-daemon", "adr-023-ncl-export-wrapper"],
constraints = [
{
id = "ncl-sync-nats-optional",
claim = "NATS subscriber must be an optional Cargo feature, and runtime-gated by config",
scope = "provisioning/platform/crates/ncl-sync/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "cfg\\(feature = \"nats\"\\)|#\\[cfg\\(feature = \"nats\"\\)\\]", paths = ["provisioning/platform/crates/ncl-sync/src/"], must_be_empty = false },
rationale = "Air-gapped environments, minimal containers, and testing scenarios require ncl-sync to build and run without NATS. Removing the feature flag would violate this.",
},
{
id = "ncl-sync-nats-fallback",
claim = "NATS connection failure must be non-fatal — daemon continues with watcher + sidecar",
scope = "provisioning/platform/crates/ncl-sync/src/main.rs",
severity = 'Hard,
check = { tag = 'Grep, pattern = "tracing::warn", paths = ["provisioning/platform/crates/ncl-sync/src/main.rs"], must_be_empty = false },
rationale = "Hard dependency on NATS would break the workspace-local, zero-platform-service guarantee from ADR-022.",
},
{
id = "ncl-sync-workspace-scope",
claim = "Subscriber must filter events by workspace — only process events matching its watched workspace",
scope = "provisioning/platform/crates/ncl-sync/src/nats_subscriber.rs",
severity = 'Hard,
check = { tag = 'Grep, pattern = "workspace_matches", paths = ["provisioning/platform/crates/ncl-sync/src/nats_subscriber.rs"], must_be_empty = false },
rationale = "Multiple ncl-sync daemons share the subject namespace. Without filtering, daemon A would process events for workspace B's cache.",
},
],
}

View file

@ -0,0 +1,83 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-026",
title = "nulib M6 Restructure: 6-Layer ADR-025-Compliant Module Tree",
status = 'Accepted,
date = "2026-04-18",
context = "ADR-025 mandated empty root mod.nu, selective imports, and bash-direct dispatch but deferred the file layout question: the existing `lib_provisioning/` and `main_provisioning/` directories were flat accretions with no enforced layer contracts. `lib_provisioning/` contained 242 files spanning primitives (string utils, logging), platform concerns (SOPS, KMS, SSH), domain logic (workspace state, orchestrator queries), and CLI command handlers — all in one directory. `main_provisioning/` held per-command Nu scripts with mixed dependency depth. Cross-layer violations were undetectable: a primitive utility could `use` a domain module without any structural signal. The ADR-025 pre-commit hook checked for star-imports but could not enforce dependency direction. The result was a codebase where adding a new utility required navigating 242 flat files and guessing import depth.",
decision = "Reorganize provisioning/core/nulib/ into a strict 6-layer tree with one-directional dependency flow: primitives/ → tools/ → platform/ → domain/ → orchestration/ → cli/. Each layer may only import from layers below it; violations are detectable by grep on import paths. The migration uses the strangler-fig pattern: (1) move real implementation to the new tree; (2) leave a transition shim in the original location (`# Transition shim (ADR-026 M6)` as first line); (3) update external callers to the new path. All 242 lib_provisioning/ files are either moved to the new tree or archived to .wrks/core_nulib/shimmed/ when they have zero callers. The shim layer is a pure re-export façade: `export use new/path.nu *`. config/accessor is placed in platform/ (not domain/) because it already depended on platform/target.nu — layer placement follows actual dependency topology, not naming intuition.",
rationale = [
{
claim = "Flat directories are structurally unenforceable — layer violations are invisible at review time",
detail = "In a 242-file flat directory, `use lib_provisioning/utils/settings.nu [fn]` and `use lib_provisioning/domain/workspace/state.nu [fn]` look identical to a reviewer. The first is a primitives import; the second crosses a domain boundary. Without directory structure that encodes layer, pre-commit hooks can only check for star-imports, not dependency direction. The 6-layer tree makes violations visible: any `use platform/X` inside `primitives/Y` is a directory-level signal that something is wrong.",
},
{
claim = "Strangler-fig migration preserves working code during the transition",
detail = "A big-bang migration of 242 files would require updating all callers atomically. With shims, each file moves independently: the shim at the old path keeps callers working until they are individually updated. This decouples the migration from caller updates and allows incremental validation. The shim marker (`# Transition shim (ADR-026 M6)`) enables bulk identification via grep for post-migration cleanup.",
},
{
claim = "config/accessor belongs in platform/, not domain/ — discovered empirically",
detail = "The original plan placed config/accessor in domain/ because it felt like business-layer logic. During migration, it was found that config/accessor/core.nu already imported platform/target.nu. Placing it in domain/ would have required domain/ → platform/ imports, a layer violation in reverse. Moving it to platform/ eliminated all cross-layer violations from primitives/ and platform/ simultaneously and was the correct structural choice — layer assignment must follow actual dependency topology.",
},
{
claim = "Archiving zero-caller shims to .wrks/ preserves history without polluting the live tree",
detail = "Of the 242 lib_provisioning/ files, a significant fraction had zero external callers — they were either dead code or superseded by newer implementations. Deleting them would lose their history; keeping them in the live tree would require maintaining shims forever. .wrks/core_nulib/shimmed/ is the designated archive for these files: not in git history, not in the live module tree, but recoverable if a caller is discovered later.",
},
],
consequences = {
positive = [
"Layer violations are detectable by grep on import path prefixes — no AST tooling required",
"New files have an obvious home: a string utility goes in primitives/, a SOPS wrapper in platform/, a workspace query in domain/",
"Shim layer enables incremental caller migration without breaking the working tree at any point",
"242 files reduced to a structured 6-layer tree with clear ownership boundaries",
],
negative = [
"Shims must be explicitly removed once all callers migrate — they are migration debt, not permanent architecture",
"The .wrks/core_nulib/shimmed/ archive is outside git tracking; files there are recoverable only from the local filesystem",
"Contributors must learn the 6-layer contract; the ADR-025 pre-commit hook alone does not enforce layer direction",
],
},
alternatives_considered = [
{
option = "Keep lib_provisioning/ flat and enforce layers via naming convention (lib_primitives_, lib_platform_, etc.)",
why_rejected = "Naming conventions degrade under refactoring pressure. A file renamed from lib_platform_foo to just foo loses the signal. Directory structure is enforced by the filesystem and grep; naming is enforced only by discipline.",
},
{
option = "Big-bang migration: move all 242 files and update all callers atomically",
why_rejected = "The caller surface spans provisioning/core/nulib/, provisioning/extensions/, provisioning/platform/crates/ (Nushell test scripts), and workspaces/. Updating all callers atomically requires a multi-day coordinated change that cannot be validated incrementally. A single broken caller would fail the entire migration. Strangler-fig allows per-file validation.",
},
],
constraints = [
{
# Hard constraint: imports may only point at lower layers (primitives → tools → platform → domain → orchestration → cli).
id = "layer-import-direction",
claim = "Files in primitives/ must not import from tools/, platform/, domain/, orchestration/, or cli/. Files in tools/ must not import from platform/ or above. The rule extends transitively up each layer.",
scope = "provisioning/core/nulib/",
severity = 'Hard,
# `tools` is included in the alternation because the claim forbids primitives/ → tools/ imports as well; without it that violation went undetected.
# NOTE(review): this automated check covers only primitives/; the rules for tools/ and the higher layers stated in `claim` are not checked here.
check = { tag = 'Grep, pattern = "^use (tools|platform|domain|orchestration|cli)/", paths = ["provisioning/core/nulib/primitives/"], must_be_empty = true },
rationale = "One-directional dependency flow is the architectural guarantee of the 6-layer tree. Without it, the tree is a cosmetic rename of the flat directory.",
},
{
id = "shim-marker-required",
claim = "Every transition shim in lib_provisioning/ or main_provisioning/ must have `# Transition shim (ADR-026 M6)` as its first line",
scope = "provisioning/core/nulib/lib_provisioning/, provisioning/core/nulib/main_provisioning/",
severity = 'Soft,
check = { tag = 'Manual, description = "grep -rL 'Transition shim' provisioning/core/nulib/lib_provisioning/ — must list only empty mod.nu files" },
rationale = "The marker enables bulk identification of shims for post-migration cleanup. Without it, shims are indistinguishable from real implementations by file content alone.",
},
],
ontology_check = {
decision_string = "Reorganize provisioning/core/nulib/ into 6-layer tree (primitives/tools/platform/domain/orchestration/cli/) with strangler-fig migration and shim layer at lib_provisioning/",
invariants_at_risk = ["config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-025-unified-lazy-loading"],
}

View file

@ -0,0 +1,104 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-027",
title = "prvng-cli: Unix-Socket Registry Daemon Eliminating Nu Startup Cost Per Invocation",
status = 'Accepted,
date = "2026-04-18",
context = "Every `prvng` invocation runs `_validate_command` in the bash wrapper, which determines whether a command exists in the registry and whether it requires the orchestrator daemon. The baseline implementation had two paths: (1) `nickel export` to rebuild the JSON cache (~2-5s, only on cache miss), and (2) pure bash grep over the JSON cache to extract `found` and `requires_daemon`. The bash grep path has a correctness flaw: `grep -o '\"[a-zA-Z0-9_\\-\\+\\.]*\"'` extracts every quoted string in the JSON file, not just command names, so a description substring matching the query command produces a false positive. Additionally, as the registry grows, a grep+sed window extraction over a multi-kilobyte JSON file for each invocation adds unneeded I/O. ADR-022 (ncl-sync) already established the pattern of using a lightweight Rust daemon for operations that benefit from persistent in-memory state. The registry is a read-heavy, write-rare structure — exactly the right profile for an in-memory cache behind a Unix socket.",
decision = "Implement `prvng-cli` as a standalone Rust binary in `platform/crates/prvng-cli/`. The binary: (1) reads `~/.cache/provisioning/commands-registry.json` at startup into a `HashMap<String, CommandEntry>` indexed by both canonical name and all aliases; (2) listens on a Unix domain socket at `~/.local/share/provisioning/cli.sock`; (3) serves JSON-framed lookup requests with newline termination; (4) watches the JSON cache file via the `notify` crate and hot-reloads the index on any `Modify` or `Create` event without restarting; (5) shuts down automatically after 60s of idle — the bash wrapper restarts it on demand. The bash wrapper gains three functions: `_ensure_cli_daemon` (starts the binary if the socket is absent, waits up to 300ms for the socket to appear), `_cli_query` (sends `{\"op\":\"lookup\",\"command\":\"<cmd>\"}` via `nc -U -w 1`, parses `found` and `requires_daemon` from the response), and a three-tier `_validate_command` (socket → bash grep JSON cache → Nu script fallback). The binary is a workspace member of `platform/Cargo.toml` with no dependency on Nushell, SurrealDB, NATS, or any platform service.",
rationale = [
{
claim = "The in-memory HashMap indexed by both canonical name and aliases eliminates the bash grep false-positive problem",
detail = "The HashMap is built at load time by `Registry::into_index()`: for each CommandEntry, the canonical name is inserted as a key, then each alias is inserted as an additional key pointing to the same entry. A lookup on `\"h\"` returns the `help` entry without scanning the description or any other field. The bash grep approach extracted every quoted string from the JSON, meaning a description containing `\"h\"` (e.g., as part of another word) would have matched. The HashMap provides O(1) exact lookup with no false positives.",
},
{
claim = "Unix socket with newline-framed JSON is simpler and lower-latency than HTTP for same-host IPC",
detail = "HTTP adds header parsing, keep-alive negotiation, and TCP stack overhead. For a query that returns ~200 bytes, the round-trip overhead of HTTP is larger than the payload. Unix domain sockets bypass the network stack entirely and are available on all Unix targets. `nc -U -w 1` is universally available on macOS and Linux without additional tooling. The newline frame is parseable by any shell with `grep -o '\"field\":value'` — no JSON parser required in the caller.",
},
{
claim = "notify-based hot reload eliminates the need to restart the daemon after `nickel export` updates the cache",
detail = "The workflow for a registry change is: edit `commands-registry.ncl` → `nickel export` writes the JSON cache → the file watcher detects the write event → the daemon reloads the HashMap in-place. No socket downtime, no bash logic to detect staleness, no version negotiation. The watcher monitors the parent directory with `RecursiveMode::NonRecursive` to catch atomic writes (where editors write to a temp file then rename into place, which does not trigger a `Modify` on the original path but does trigger `Create` on the canonical path).",
},
{
claim = "Idle shutdown at 60s keeps resource usage zero between `prvng` invocations",
detail = "The daemon is not a long-running service — it is an on-demand cache server. On a developer workstation, `prvng` may not be invoked for hours. A daemon that runs continuously would hold an open file descriptor on the socket and consume memory permanently. The 60s idle timeout means the daemon self-terminates after a session of commands, and `_ensure_cli_daemon` restarts it on the next invocation. The restart cost is ~100ms (binary start + HashMap load for 40 commands); this is amortized across all commands in a session.",
},
{
claim = "Three-tier fallback in `_validate_command` preserves correctness when the daemon is unavailable",
detail = "The socket path can fail in three ways: the binary is not installed, the daemon is starting (race condition during `_ensure_cli_daemon`), or `nc` returns no output. The fallback chain is: socket (fast, correct) → bash grep on JSON cache (fast, has false-positive risk but handles 99% of cases) → Nu script (slow, always correct). The Nu fallback is the pre-ADR-027 behavior; it is retained as the last resort to ensure `_validate_command` never hard-fails due to daemon absence.",
},
],
consequences = {
positive = [
"`_validate_command` completes in <5ms when the daemon is running vs ~50-200ms for bash grep+sed",
"Registry lookup correctness: HashMap indexed by exact name/alias, no substring false positives",
"Hot reload: `nickel export` → daemon reloads automatically, no restart needed",
"Zero resource usage between sessions: idle shutdown at 60s",
"No additional system dependencies: `nc` (netcat) is present on all macOS/Linux targets",
],
negative = [
"Up to 300ms cold-start latency on the first invocation after idle shutdown (300ms is the `_ensure_cli_daemon` socket-wait ceiling; a typical restart is ~100ms) — amortized across the session but visible on the very first `prvng` call",
"`nc -U -w 1` behavior differs between GNU netcat (`-q 1`) and BSD netcat (`-w 1`) — the bash wrapper must use `-w 1` for macOS compatibility",
"The binary must be installed to `~/.local/share/provisioning/bin/prvng-cli` before `_ensure_cli_daemon` can start it; the installer script must include this step",
],
},
alternatives_considered = [
{
option = "Keep pure bash grep over JSON cache as the only validation path",
why_rejected = "False-positive risk: grep extracts every quoted token from the JSON, not just command names. For 40 commands with aliases and descriptions, the extracted token list contains ~300 strings. A description containing a common word that matches an input typo would suppress the 'unknown command' error. Correctness requires exact-name matching against the command/alias fields only.",
},
{
option = "Reuse the existing `provisioning-daemon` (platform/crates/daemon/) for registry queries",
why_rejected = "The provisioning-daemon is a full platform service: SurrealDB, NATS, auth middleware, provider APIs. It requires the orchestrator infrastructure to be running and is not designed for sub-millisecond local queries. Starting it solely for registry lookup is architectural misuse. ADR-022's ncl-sync daemon established the correct pattern: a separate binary scoped to one responsibility.",
},
{
option = "HTTP server on localhost instead of Unix socket",
why_rejected = "HTTP requires a port allocation, adds TCP stack overhead, and exposes the registry to other processes on the host. Unix sockets are file-permission-controlled, zero-overhead, and already the established IPC pattern for this codebase (the orchestrator uses WebSocket-over-Unix-socket for SurrealDB embedded mode).",
},
{
option = "Shared memory or mmap for the registry index",
why_rejected = "Requires either a file-format contract for the serialized HashMap or a memory-mapped file with a custom reader. `nc`-over-Unix-socket is implementable in bash with one line; mmap requires a dedicated reader binary or Nushell plugin. The complexity trade-off is negative: the index is only 40 entries in a multi-kilobyte JSON file, so mmap buys nothing measurable.",
},
],
constraints = [
{
id = "prvng-cli-no-platform-deps",
claim = "platform/crates/prvng-cli/Cargo.toml must not depend on nushell, surrealdb, async-nats, platform-config, service-clients, or any crate that transitively requires them",
scope = "platform/crates/prvng-cli/",
severity = 'Hard,
check = { tag = 'Manual, description = "cargo tree -p prvng-cli | grep -E 'nushell|surrealdb|async-nats' — must be empty" },
rationale = "prvng-cli is a lightweight daemon with a single responsibility: serve registry lookups. Platform service dependencies would pull in rustls version conflicts (nushell pins rustls=0.23.28; surrealdb requires ^0.23.36) and increase binary size by 10-50x. Keeping it dependency-minimal ensures it builds fast and stays buildable independently of the platform workspace conflicts.",
},
{
id = "socket-path-via-xdg",
claim = "The socket path must be derived from XDG_DATA_HOME (defaulting to ~/.local/share), never hardcoded",
scope = "platform/crates/prvng-cli/src/main.rs, provisioning/core/cli/provisioning",
severity = 'Hard,
check = { tag = 'Grep, pattern = "\\.local/share/provisioning/cli\\.sock", paths = ["platform/crates/prvng-cli/src/main.rs"], must_be_empty = true },
rationale = "Hardcoded paths break in NixOS, container environments, and CI runners where HOME may not exist or XDG_DATA_HOME points elsewhere. The PRVNG_CLI_SOCKET environment variable allows per-invocation override for testing.",
},
{
# Constraint: the bash wrapper must stay compatible with BSD netcat (macOS).
id = "bsd-nc-compatibility",
claim = "All `nc` invocations in provisioning/core/cli/provisioning must use `-w 1` for timeout, never `-q 1`",
scope = "provisioning/core/cli/provisioning",
severity = 'Hard,
# The pattern anchors `nc` as a standalone command word (start of line or a
# non-identifier character before it, whitespace after it) and then looks for a
# `-q` flag among its arguments. The earlier pattern "nc.*-q" was unanchored and
# also matched unrelated lines such as `rsync -q` or `sync ... -q`, which would
# fail this must_be_empty check spuriously. POSIX character classes are used so
# the pattern needs no backslash escapes inside the Nickel string.
check = { tag = 'Grep, pattern = "(^|[^[:alnum:]_])nc[[:space:]](.*[[:space:]])?-q", paths = ["provisioning/core/cli/provisioning"], must_be_empty = true },
rationale = "macOS ships BSD netcat which does not implement `-q` (GNU netcat timeout flag). BSD netcat uses `-w` for connection timeout. Using `-q 1` causes nc to exit with an error on macOS, making `_cli_query` always fail and fall through to the bash grep path, silently degrading to the pre-ADR-027 behavior.",
},
],
ontology_check = {
decision_string = "Rust Unix-socket daemon serving in-memory HashMap registry lookups with file-watcher hot-reload and 60s idle shutdown; bash wrapper gains three-tier _validate_command (socket → grep → Nu)",
invariants_at_risk = ["config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-022-ncl-sync-daemon", "adr-025-unified-lazy-loading", "adr-028-daemon-target-registry-field"],
}

View file

@ -0,0 +1,85 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-028",
title = "daemon_target: Registry Field for CLI Query Routing",
status = 'Accepted,
date = "2026-04-18",
context = "The commands-registry.ncl schema had `requires_daemon: Bool` — a binary flag indicating whether the orchestrator daemon must be running before the command can execute. This is a runtime precondition check, not a routing directive. As prvng-cli (ADR-027) introduces a second daemon (the registry query daemon) and the orchestrator remains a third service, the question of which system should serve a given CLI query becomes a distinct concern from whether that query's command needs the orchestrator running at execution time. A future fourth service (e.g., an AI assistant backend) might need to handle `prvng ai` queries. Without an explicit routing field in the registry, the bash wrapper must embed routing logic as ad-hoc case statements that drift out of sync with the registry. The registry is already the authoritative source of truth for command metadata; routing belongs there.",
decision = "Add `daemon_target` as an enum field to the CommandEntry schema with three values: `none` (command is handled locally by Nu thin handlers), `cli` (command's query should be routed to the prvng-cli Unix socket), `orchestrator` (command's query should be routed to the orchestrator service on port 9011). Default is `none`. The field is added to: (1) `schemas/commands_registry/schema.ncl` as `daemon_target | std.enum.TagOrString | [| 'none, 'cli, 'orchestrator |]`; (2) `schemas/commands_registry/defaults.ncl` as `daemon_target | default = 'none`; (3) `platform/crates/prvng-cli/src/registry.rs` as `DaemonTarget` enum with serde `rename_all = \"lowercase\"` and `Default = None`; (4) `LookupResult` in registry.rs as `daemon_target: Option<String>` serialized in every socket response. The bash wrapper reads `daemon_target` from the socket response but does not yet act on it — the field is present for forward compatibility, enabling future routing without a schema migration.",
rationale = [
{
claim = "Routing intent belongs in the registry, not in bash case statements",
detail = "The current bash wrapper routes commands via hard-coded case branches. Adding a new service requires editing the bash wrapper in two places: the dispatch block and the `_validate_command` daemon check. With `daemon_target` in the registry, routing is data: a new service is a new enum variant, and the bash wrapper reads the variant rather than containing the routing logic. This is the configuration-driven principle applied to service dispatch.",
},
{
claim = "`daemon_target` and `requires_daemon` are orthogonal — both must exist",
detail = "`requires_daemon: Bool` answers 'does this command need the orchestrator running at execution time?' — it is a precondition for command execution. `daemon_target` answers 'which service should handle the CLI query for this command?' — it is a routing directive. A command can have `requires_daemon = true` (needs orchestrator to execute) and `daemon_target = none` (CLI query is handled locally by Nu). A command could have `daemon_target = orchestrator` (the orchestrator itself handles the lookup) with `requires_daemon = false` (but this is currently unused). Conflating them would require complex boolean combinations to express future routing needs.",
},
{
claim = "Enum over TagOrString allows gradual adoption without schema breakage",
detail = "`std.enum.TagOrString` is Nickel's mechanism for enums that also accept plain strings. This means existing JSON consumers that read `daemon_target` as a string continue to work. Future enum variants (e.g., `'ai-service`) can be added to the schema without forcing all existing entries to be updated — they continue to deserialize as `none` via the Rust `Default` impl.",
},
{
claim = "Returning `daemon_target` in every LookupResult socket response costs ~20 bytes and adds zero latency",
detail = "The socket response is already a JSON object. Adding `\"daemon_target\":\"none\"` to the 40-command registry adds ~20 bytes per response. At Unix socket speeds (loopback, no copy), this is below measurement threshold. Omitting the field from responses would require schema versioning if it is added later; including it from the start avoids that migration.",
},
],
consequences = {
positive = [
"Future services can be added to the routing table by adding an enum variant — no bash wrapper changes required for the routing data layer",
"Registry is the single source of truth for both command metadata and routing intent",
"`LookupResult` carries routing information, enabling smart clients (IDE plugins, MCP tools) to route queries without duplicating registry logic",
],
negative = [
"The bash wrapper currently ignores `daemon_target` from the socket response — it reads `requires_daemon` only. Acting on `daemon_target` requires a future bash wrapper change that maps `daemon_target=orchestrator` to an HTTP/WebSocket call to port 9011 instead of local Nu execution.",
"Adding a new `daemon_target` variant requires: (1) schema.ncl update, (2) Rust enum update + rebuild, (3) re-export of commands-registry.json. The schema and Rust must stay in sync manually — there is no codegen.",
],
},
alternatives_considered = [
{
option = "Use `requires_daemon` as a proxy for routing — orchestrator-requiring commands route to orchestrator",
why_rejected = "The semantics differ. `requires_daemon = true` means the command cannot execute without the orchestrator — it does not mean the orchestrator should handle the CLI query. A future command might need the orchestrator for data but want its query interface served by prvng-cli (e.g., cached orchestrator state). Overloading `requires_daemon` would require a boolean override field for these cases, which is worse than having a dedicated routing field.",
},
{
option = "Encode routing in command naming convention (prefix: `orch:workspace`, `cli:help`)",
why_rejected = "Naming conventions require parser logic in every consumer and break when commands are renamed. A dedicated schema field is strongly typed, validated by `nickel typecheck`, and queryable by grep without special parsing.",
},
{
option = "Add routing to a separate registry file (routing-registry.ncl)",
why_rejected = "Two registries for the same command set creates synchronization debt: adding a command requires editing both files, and a mismatch is not detectable without running both through a diff tool. The registry is already the authoritative command list; routing is command metadata and belongs in the same record.",
},
],
constraints = [
{
id = "daemon-target-rust-enum-in-sync",
claim = "The DaemonTarget enum in platform/crates/prvng-cli/src/registry.rs must contain exactly the variants declared in schemas/commands_registry/schema.ncl: None, Cli, Orchestrator",
scope = "platform/crates/prvng-cli/src/registry.rs, provisioning/schemas/commands_registry/schema.ncl",
severity = 'Hard,
check = { tag = 'Manual, description = "grep -c 'None\\|Cli\\|Orchestrator' platform/crates/prvng-cli/src/registry.rs — must equal 3; grep TagOrString provisioning/schemas/commands_registry/schema.ncl — must find daemon_target line" },
rationale = "A variant in the schema with no Rust counterpart causes serde deserialization to fail at runtime on any registry entry using the new variant. Manual sync is required until codegen is available.",
},
{
id = "daemon-target-default-none",
claim = "All commands in commands-registry.ncl that do not explicitly set daemon_target must resolve to `none` via the schema default",
scope = "provisioning/schemas/commands_registry/defaults.ncl",
severity = 'Hard,
check = { tag = 'Grep, pattern = "daemon_target.*default.*none", paths = ["provisioning/schemas/commands_registry/defaults.ncl"], must_be_empty = false },
rationale = "The default ensures backward compatibility: existing registry entries without daemon_target are valid and route locally. A missing default would make the field required and break all existing make_command calls.",
},
],
ontology_check = {
decision_string = "Add daemon_target enum field (none|cli|orchestrator) to CommandEntry schema and LookupResult for forward-compatible CLI query routing without conflating with requires_daemon precondition",
invariants_at_risk = ["config-driven-always", "type-safety-always"],
verdict = 'Safe,
},
related_adrs = ["adr-027-prvng-cli-daemon"],
}

View file

@ -0,0 +1,127 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-029",
title = "Smart Interface Unification: CLI ↔ HTTP ↔ MCP via Shared Registry",
status = 'Accepted,
date = "2026-04-19",
context = "Before this decision the provisioning platform exposed three user-facing surfaces — the Nushell CLI (`provisioning ...`), the MCP stdio server (`crates/mcp-server`), and the future admin HTTP UI — as three independent codebases. Each had its own dispatch logic, its own parameter validation, and its own response formatting. A single operation like `workspace list` was implemented once in Nushell for the CLI and once as a `simple_main.rs` MCP tool with separate logic. The admin UI was pending because there was no shared backend it could consume. This divergence was already causing drift: `provision_cluster_create` in MCP accepted a different parameter shape than `provisioning cluster create` in the CLI, and neither agreed with the orchestrator's HTTP POST body. The user's irrenunciable requirement was ontoref-style synchronization — one operation, one semantics, three surfaces — without forcing any surface to depend on the others (CLI must work offline; MCP stdio must not require an HTTP daemon; admin UI must not embed the CLI).",
decision = "Introduce a four-crate layered architecture: (1) `provisioning-core` is a pure library exposing the `Tool` trait and `Registry`; all 37 operations are implemented as `impl Tool` inside it. (2) `provisioning-tool` is a thin CLI binary that instantiates the Registry and exposes `list`/`schema`/`invoke` over stdout JSON. (3) `provisioning-daemon` is an Axum HTTP+NATS server that wraps the Registry with JWT+RBAC middleware, domain-state tracking, a config-file watcher, an embedded admin UI, and Tera ontology templates. (4) `mcp-server` is reimplemented internally as a JSON-RPC 2.0 dispatcher over the same Registry, consumed via `McpServer::handle_request` for in-process tests and via stdin/stdout for the MCP protocol. The Nushell CLI uses a three-tier fallback chain (`platform/clients/fallback.nu::tool-call`): tier 1 is the HTTP daemon if reachable; tier 2 is the `provisioning-tool` child process; tier 3 is the caller-supplied Nushell legacy closure. A G3 contract test (`crates/contract-tests`) asserts that the same tool invoked through all three surfaces produces semantically equivalent payloads after envelope normalisation and validates each tier's output against a shared JSON Schema. An `.ontoref/config.ncl` hook (`domain_daemon`) declares provisioning as an external domain so ontoref-daemon can delegate `provisioning.*` ontology queries without provisioning importing any ontoref crates.",
rationale = [
{
claim = "A shared Rust library is the only architecture that gives autonomy + sync simultaneously",
detail = "The three surfaces have incompatible runtime models: the CLI can run without any long-running process, MCP stdio cannot share a process with an HTTP server (stdio hijacks stdin/stdout), and the admin UI requires a persistent backend. A shared service (daemon-only) forces the CLI to depend on the daemon — breaks autonomy. A shared protocol (REST-only) forces MCP to wrap HTTP — breaks stdio's contract. A shared library is the only option where each surface instantiates Registry independently and dispatches identically. Autonomy is structural; sync is guaranteed by construction because the dispatch code is literally the same function call.",
},
{
claim = "The three-tier fallback keeps the CLI offline-first by construction",
detail = "The user's current workflow is `provisioning workspace list` on a laptop with no daemon running. Tier 3 (Nushell legacy closure) preserves that behavior indefinitely. Tier 1 (HTTP daemon) opportunistically accelerates when the daemon is up — lets multi-developer setups cache Registry state. Tier 2 (provisioning-tool child) is the bridge: it reuses the Rust Registry but spawns a fresh process, so operations don't require a daemon yet also don't reimplement logic in Nushell. The chain is checked at call time, not configuration time, so the user never manages daemon state — it either works faster or it works the same.",
},
{
claim = "G3 contract test converts 'sync irrenunciable' into a CI invariant",
detail = "Without G3, the three surfaces would drift silently as new tools are added. G3 asserts that for each fixture tool, all three tiers produce the same normalised payload and the same error code. This is structural: the test doesn't know which tier is 'right' — it knows they must agree. If a future change to the HTTP envelope breaks parity with MCP, CI fails. If a new error variant is added to ToolError but not mapped in `routes.rs::tool_error_code` or `registry_server.rs::tool_error_to_rpc`, the G3 error-code tests catch it. The contract cost is one integration test crate; the insurance is architectural.",
},
{
claim = "Ontoref federation via config hook, not crate dependency",
detail = "Earlier plan revisions had provisioning-daemon depending on `ontoref-ontology` and `ontoref-derive` crates. This would force provisioning's release cadence onto ontoref's and vice versa. The `domain_daemon` config hook in `.ontoref/config.ncl` inverts the dependency: provisioning declares its HTTP URL and ontology endpoints; ontoref-daemon reads this config and delegates. provisioning has zero compile-time ontoref deps. The coupling is runtime, one-directional, and can be disabled by setting `domain_daemon.required = false` (the default).",
},
{
claim = "37 tools, not 45+ as originally planned",
detail = "A0 inventory revealed 37 actual tools in mcp-server (7 provision_*, 5 guidance_*, 7 installer_*, 17 legacy infra, 1 ai_query). The remaining 'tools' counted in early plans were enum values for taskservs (cicd, coredns, grafana…), not operations. Renaming to `<domain>_<action>` (workspace_list, server_create, dag_show) preserves the 37 operations under cleaner names since no external MCP consumers exist yet.",
},
],
consequences = {
positive = [
"Adding a new operation is a single `impl Tool` in provisioning-core — it appears in all three surfaces at once without surface-specific code",
"The admin UI is unblocked: it calls the same HTTP API the CLI uses, consuming the same Registry responses",
"MCP stdio and HTTP daemon can be deployed or disabled independently without affecting the CLI's offline workflow",
"G3 contract test catches silent drift at CI time instead of production",
"Schema is generated once by `Tool::schema()` and consumed by tools/list (MCP), GET /api/v1/tools (HTTP), and `provisioning-tool schema <name>` (CLI) — no duplicate JSON Schema files",
"`--fmt text|json|yaml|toml|md` and `--clip` global CLI flags replace the scattered `--format`, `--output`, `--json` per-handler options",
],
negative = [
"The Nushell legacy branch (tier 3) must be maintained until every handler is migrated to the fallback chain — currently only `workspace list` is wired; the other 36 operations still call Nushell legacy directly",
"Adding a tool now requires Rust compilation — faster iteration is lost versus the previous 'edit a Nushell file, reload' pattern. Mitigated by `cargo watch -x 'build -p provisioning-daemon'` during development",
"The fallback chain incurs up to two failed probes (daemon ping + `which provisioning-tool`) before falling through to tier 3 on cold offline use. Latency measured at ~50ms on macOS — acceptable but not zero",
"G3 can only assert semantic equivalence on payloads it can normalise. Fields not listed in `normalise()` (trace_id/timestamp/etc.) could still mask real divergence if an unknown volatile field is introduced. Mitigated by reviewing the normaliser when any new metadata field is added",
"The mcp-server binary `provisioning-mcp-server` still exists alongside `prov-mcp` (the new Registry-backed binary) during migration. Users must be told which to use",
],
},
alternatives_considered = [
{
option = "Single binary with feature flags for CLI/HTTP/MCP surfaces",
why_rejected = "stdio hijack (MCP) and persistent HTTP server are incompatible runtime modes in one process without complex flag matrices. The feature-flag model also bloats binary size — every CLI user ships the full HTTP server. The separate-binary model with shared library gives the same code-reuse guarantee without the runtime coupling.",
},
{
option = "Ship only the daemon — CLI becomes a thin HTTP client",
why_rejected = "The user's current workflow is CLI-first and offline-first. Requiring a daemon would regress the unsurprising property that `provisioning workspace list` works with no running services. Autonomy was listed as irrenunciable in the A0 decisions.",
},
{
option = "Keep mcp-server and CLI as independent codebases, add the daemon as a third",
why_rejected = "Sync irrenunciable fails. Every new operation would need implementation in three places, and divergence was already observable (parameter shape mismatches between MCP tools and CLI handlers). Adding a third surface would multiply drift rather than fix it.",
},
{
option = "Use MCP stdio as the 'backend' — HTTP daemon and CLI would invoke MCP internally",
why_rejected = "MCP is a client-server protocol designed for stdin/stdout framing. Using it as an internal backend forces the HTTP daemon to spawn and manage an MCP subprocess for every request — adding latency and serialisation overhead — and couples the daemon's availability to MCP protocol versioning. A shared library avoids both issues.",
},
{
option = "Use ontoref-ontology crate as the ontology source for provisioning-daemon",
why_rejected = "Compile-time dependency on ontoref would force coordinated releases and embed ontoref's SurrealDB+schema choices into provisioning's build. The `domain_daemon` config hook achieves delegation with no crate coupling — provisioning owns its domain ontology; ontoref-daemon discovers and delegates at runtime.",
},
],
constraints = [
{
id = "registry-sole-dispatch-path",
claim = "All three surfaces (CLI via provisioning-tool, HTTP via provisioning-daemon, MCP via mcp-server) must invoke operations through Registry::invoke — no surface may bypass the Registry with direct tool instantiation",
scope = "platform/crates/provisioning-tool, platform/crates/provisioning-daemon, platform/crates/mcp-server",
severity = 'Hard,
check = { tag = 'Grep, pattern = "Tool::invoke|tool\\.invoke\\(", paths = ["platform/crates/provisioning-tool/src", "platform/crates/provisioning-daemon/src", "platform/crates/mcp-server/src"], must_be_empty = true },
rationale = "A surface that bypasses the Registry makes the G3 contract test meaningless for that operation because the shared dispatch path is not exercised. Enforcing Registry::invoke keeps the three surfaces contractually equivalent.",
},
{
id = "g3-contract-test-must-pass",
claim = "The contract-tests crate must pass with 5 tests: listing agreement, echo agreement, invalid-param error agreement, failing-tool error agreement, and tools/list count agreement",
scope = "platform/crates/contract-tests",
severity = 'Hard,
check = { tag = 'NuCmd, cmd = "cargo test -p contract-tests --manifest-path platform/Cargo.toml", expect_exit = 0 },
rationale = "G3 is the mechanism that converts sync-irrenunciable into an architectural invariant. A failing G3 means one surface has silently diverged from the others.",
},
{
id = "nushell-fallback-legacy-closure-required",
claim = "Every call to tool-call / tool-list in Nushell must pass an explicit legacy closure — not a stub, not an error, but a working Nushell-native implementation",
scope = "provisioning/core/nulib/domain",
severity = 'Hard,
check = { tag = 'Grep, pattern = "tool-call|tool-list", paths = ["provisioning/core/nulib/domain"], must_be_empty = false },
rationale = "Tier 3 is the offline-first guarantee. If the legacy closure errors or is empty, the fallback chain breaks when the daemon is down and provisioning-tool is not installed. This is the retirement gate: tier 3 can only be removed per-operation after G3 passes for that operation.",
},
{
id = "mcp-dispatch-exposed-via-handle-request",
claim = "McpServer must expose `pub async fn handle_request(Value) -> Value` — the in-process entry point used by G3 contract tests",
scope = "platform/crates/mcp-server/src/registry_server.rs",
severity = 'Hard,
check = { tag = 'Grep, pattern = "pub async fn handle_request", paths = ["platform/crates/mcp-server/src/registry_server.rs"], must_be_empty = false },
rationale = "Without handle_request the G3 MCP tier would require spawning a subprocess with pipes — brittle under concurrent test execution. Keeping handle_request public is a testability contract.",
},
{
# Constraint: provisioning must have no compile-time dependency on ontoref crates.
id = "ontoref-zero-crate-dependency",
claim = "provisioning workspace Cargo.toml must not contain ontoref-* path dependencies or the `ai` feature flag enabling them at the workspace level",
scope = "provisioning/platform/Cargo.toml, provisioning/platform/crates/provisioning-core/Cargo.toml, provisioning/platform/crates/provisioning-daemon/Cargo.toml",
severity = 'Soft,
# The workspace manifest is included in `paths` because the claim is specifically
# about the workspace Cargo.toml; scanning only the two crate directories would
# leave the file named in the claim and scope unchecked.
check = { tag = 'Grep, pattern = "ontoref-ontology|ontoref-derive", paths = ["provisioning/platform/Cargo.toml", "provisioning/platform/crates/provisioning-core", "provisioning/platform/crates/provisioning-daemon"], must_be_empty = true },
rationale = "Coupling to ontoref crates inverts the delegation model: the decision is that provisioning's .ontoref/config.ncl declares a domain_daemon hook, and ontoref-daemon discovers it. provisioning must not import ontoref.",
},
],
ontology_check = {
decision_string = "Unify CLI+HTTP+MCP surfaces on a shared provisioning-core Registry with a three-tier fallback in Nushell, JWT+RBAC middleware only at the HTTP layer, G3 contract test asserting semantic parity, and ontoref federation via config hook instead of crate dependency",
invariants_at_risk = ["config-driven-always", "type-safety-always", "solid-boundaries"],
verdict = 'Safe,
},
related_adrs = ["adr-014-solid-enforcement", "adr-022-ncl-sync-daemon", "adr-025-unified-lazy-loading", "adr-026-nulib-restructure", "adr-027-prvng-cli-daemon", "adr-028-daemon-target-registry-field"],
}

View file

@ -0,0 +1,90 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-030",
title = "Platform Workspace Crate Naming Convention",
status = 'Accepted,
date = "2026-04-19",
context = "The platform workspace accumulated three inconsistent naming patterns: (1) shared platform libraries had a `platform-` prefix (`platform-config`, `platform-nats`, `platform-db`); (2) the smart interface layer had a `provisioning-` prefix (`provisioning-core`, `provisioning-tool`, `provisioning-daemon`); (3) everything else had no prefix at all (`rag`, `mcp-server`, `service-clients`, `observability`, `machines`, `backup`, `encrypt`). The no-prefix group caused three concrete problems: (a) `cargo build -p rag` would become ambiguous in the workspace resolver if a second `rag` dependency ever appeared; (b) `rag = { workspace = true }` in dependency declarations gave no indication which project the dep belonged to; (c) `observability` would clash with an identically-named crate on crates.io if the crate ever needed publishing. The binary names (set independently via `[[bin]]`) were already consistent — all used `provisioning-` prefix — so the inconsistency was purely at the Cargo package name level.",
decision = "Apply a four-rule naming convention across all workspace crates. Rule 1 — Shared platform libraries: `platform-<name>` package name; Rust crate name defaults to `platform_<name>`. Rule 2 — Smart interface layer: `provisioning-<name>` package name and binary name. Rule 3 — Service binaries: short package name (`orchestrator`, `vault-service`, etc.); binary name carries the `provisioning-` prefix via `[[bin]] name = 'provisioning-<name>'`. Rule 4 — Ecosystem crates with many existing `use` callers: `platform-<name>` package name + `[lib] name = 'old_name'` to preserve the Rust crate name across all existing `use` statements and doc tests without modifying call sites. Crates that are never declared as dependencies (service binaries) are exempt from the package name prefix because their Cargo identifier is only used in `cargo build -p <name>` invocations, never in `[dependencies]` sections.",
rationale = [
{
claim = "Service binaries do not need a prefix because they are never declared as dependencies",
detail = "The workspace resolver needs unique package names when packages are referenced as dependencies. Service binaries (orchestrator, control-center, vault-service, ai-service, extension-registry, ncl-sync) are leaf nodes — nothing in the workspace has `orchestrator = { ... }` in its `[dependencies]`. Their package name is only used in `cargo build -p orchestrator` and `cargo check -p orchestrator` invocations, where the directory context already disambiguates. Adding a `provisioning-` prefix would increase keystroke cost without adding disambiguation value. The binary name (the output artifact) already carries the prefix via `[[bin]]`.",
},
{
claim = "Ecosystem crates preserve Rust crate names via [lib] name to avoid touching 30+ doc test locations",
detail = "The ecosystem crates (machines, observability, backup, encrypt) use their crate name extensively in `///` and `//!` doc comment code examples that are compiled as doc tests. For `encrypt` alone, 25+ `use encrypt::` occurrences appear across `src/` and `examples/`. Changing the Rust crate name would require updating every one. The `[lib] name = 'old_name'` field in Cargo.toml decouples the package name (used by the workspace resolver) from the crate name (used by `use` statements). This preserves all existing Rust code, all doc tests, and all example files unchanged while making the package names consistent in `Cargo.toml` dependency declarations.",
},
{
claim = "platform-rag and provisioning-mcp required only Cargo.toml changes because they already had custom lib and binary names",
detail = "`rag` already had `[lib] name = 'provisioning_rag'` and `[[bin]] name = 'provisioning-rag'` — both Rust names were already correct. Only the `[package] name` field and workspace dep key needed updating. Similarly, `mcp-server` had `[lib] name = 'provisioning_mcp_server'` and two `[[bin]]` entries. Renaming these packages to `platform-rag` and `provisioning-mcp` was a pure Cargo identity change with zero impact on Rust compilation or binary output.",
},
{
claim = "service-clients required Rust use statement updates because it had no custom lib name",
detail = "Unlike the ecosystem crates, `service-clients` had no `[lib] name` override. Its Rust crate name was `service_clients` (derived from the package name by replacing hyphens with underscores). Renaming the package to `platform-clients` changes the default crate name to `platform_clients`. There were only three call sites in active crates: two in `provisioning-core/src/sources/ssh.rs` and one in `orchestrator/src/ssh/key_deployer.rs`. Updating three files was less friction than adding a `[lib] name = 'service_clients'` that would permanently diverge the package name from the Rust crate name.",
},
],
consequences = {
positive = [
"Every library crate declared as a `[dependencies]` entry now carries a `platform-` or `provisioning-` prefix — the project affiliation is unambiguous in any Cargo.toml context",
"The workspace resolver cannot silently select the wrong crate if an external dependency named `rag`, `observability`, or `machines` appears in the dependency tree",
"Binary output names are unchanged — no deployment scripts, systemd units, or Application Support paths require updates",
"NCL config keys (`rag = { ... }`, `mcp-server = { ... }`) are service identifiers unrelated to Cargo package names — they are unchanged",
"Ecosystem crate Rust code (doc tests, examples, use statements) compiles without modification",
"cargo check --workspace passes immediately after the rename",
],
negative = [
"The package name and Rust crate name are now different for the four ecosystem crates — `platform-machines` in Cargo.toml but `use machines::` in Rust. This is a supported Cargo feature but requires contributors to know about `[lib] name`",
"Cargo.lock contains the new package names — any tooling that parses Cargo.lock by package name (dashboards, audit tools) needs to be updated if it references the old names",
],
},
alternatives_considered = [
{
option = "Add provisioning- prefix to all service binary package names",
why_rejected = "Service binaries are never declared as dependencies — the prefix adds no disambiguation value. `cargo build -p provisioning-orchestrator` is longer than `cargo build -p orchestrator` with no benefit. The binary output already uses `provisioning-orchestrator` via `[[bin]]`.",
},
{
option = "Add [lib] name = 'service_clients' to platform-clients instead of updating use statements",
why_rejected = "There were only three call sites. Adding a divergent lib name permanently embeds a naming inconsistency in the codebase. Updating three files is the right call at this scale. If there had been 30+ call sites the decision would have been different.",
},
{
option = "Rename ecosystem crates and update all use statements",
why_rejected = "encrypt/src/ alone has 25+ doc test use statements across 6 files plus 3 examples. The work is mechanical but creates a large diff with no behavioral change. [lib] name achieves the same Cargo-level disambiguation with a one-line addition per crate.",
},
{
option = "Keep the status quo — no rename",
why_rejected = "The status quo had three inconsistent naming patterns in the same workspace. `cargo tree` output was confusing; dep declarations in Cargo.toml files were ambiguous about project affiliation; crates.io collision risk existed for generic names. The inconsistency was a maintenance friction that compounds with each new crate added.",
},
],
constraints = [
{
# Constraint: every library crate that can appear in a `[dependencies]` section
# must carry a `platform-` or `provisioning-` prefix. Service binaries and the
# other names listed in the regex below are exempt by exact name.
id = "new-library-crates-need-platform-prefix",
claim = "Any new library crate added to the platform workspace that will be declared as a dependency must use the platform- or provisioning- prefix in its [package] name",
scope = "platform/Cargo.toml members, platform/crates/, platform/prov-ecosystem/crates/",
severity = 'Soft,
# The allowlist regex is anchored: the prefixes must start the package name and
# the exempt names must match in full. An unanchored alternation would accept
# any name merely containing one of the tokens (e.g. `my-platform-cache`,
# `orchestrator-utils`), defeating the prefix rule.
# NOTE(review): scope also lists platform/prov-ecosystem/crates/ but only
# platform/crates/ is globbed here — confirm whether ecosystem crates should be checked too.
check = { tag = 'NuCmd, cmd = "glob 'provisioning/platform/crates/*/Cargo.toml' | each {|f| open $f | get package.name } | where { not ($in =~ '^(platform-|provisioning-)|^(orchestrator|control-center|vault-service|ai-service|extension-registry|ncl-sync|contract-tests|prvng-cli)$') } | if ($in | is-empty) { exit 0 } else { print $in; exit 1 }", expect_exit = 0 },
rationale = "The naming convention is only useful if it is consistently applied to new crates. A new crate named 'cache' or 'metrics' has the same disambiguation problem the renamed crates had.",
},
{
# Constraint: the six service-binary packages keep their short Cargo package
# names; the `provisioning-` prefix appears only on the `[[bin]]` output name.
id = "service-binary-package-names-stay-short",
claim = "Service binary package names (leaf nodes never declared as deps) must NOT get a provisioning- prefix — short name only, prefix lives in [[bin]] name",
scope = "platform/crates/orchestrator, platform/crates/control-center, platform/crates/vault-service, platform/crates/ai-service, platform/crates/extension-registry, platform/crates/ncl-sync",
severity = 'Soft,
# The Grep looks for `<service-name> =` keys in the workspace Cargo.toml and
# requires zero hits.
# NOTE(review): this appears to verify the premise (service binaries are never
# declared as dependencies) rather than the package name itself — confirm that
# is the intended enforcement. The unanchored alternatives would also match a
# prefixed key such as `provisioning-orchestrator =`.
check = { tag = 'Grep, pattern = "orchestrator\\s*=|control-center\\s*=|vault-service\\s*=|ai-service\\s*=|extension-registry\\s*=|ncl-sync\\s*=", paths = ["provisioning/platform/Cargo.toml"], must_be_empty = true },
rationale = "Adding a prefix to service binary package names would break existing cargo build -p <name> muscle memory and CI scripts without adding any correctness benefit. The rule is: prefix where disambiguation matters (dep declarations), not where it is only cosmetic (package name for leaf binaries).",
},
],
ontology_check = {
decision_string = "Rename platform workspace crates to apply a coherent naming convention",
invariants_at_risk = ["config-driven-always"],
verdict = 'Safe,
},
}

View file

@ -0,0 +1,48 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-031",
title = "Unified Component CLI: prvng component <op> with Polymorphic Mode Dispatch",
status = 'Accepted,
date = "2026-04-19",
context = "Before this decision the CLI exposed two separate command hierarchies for infrastructure lifecycle: `prvng taskserv <op>` for taskserv-mode components and `prvng component list|show|info` for read-only introspection. Write operations (install, delete, update, reinstall, restart, backup, restore) were routed through `taskserv.nu` regardless of the component's actual deploy mode (taskserv/cluster/container). This created three problems: (1) a cluster-mode component like postgresql required `prvng taskserv create postgresql` even though it runs as a Kubernetes deployment — the semantics were wrong and confused operators; (2) the script resolution used only the Tier-1 `install-<name>.sh` convention — Tier-2 per-op scripts (`delete-<name>.sh`, `backup-<name>.sh`) were never invoked; (3) the orchestrator endpoint `/workflows/taskserv/create` accepted a `TaskservWorkflow` body that embedded no mode information, so the worker had no way to route script execution by mode. A fourth, independent gap also existed: no precondition gate validated that capability providers (e.g. k0s for cluster-mode, NFS for democratic_csi) were healthy before enqueuing a write.",
decision = "Replace `prvng taskserv` with a unified `prvng component <op>` command that dispatches polymorphically by deploy mode. The implementation has five parts: (1) A new orchestrator endpoint `/api/v1/workflows/component/{op}` accepts `ComponentWorkflow` (workspace + infra + component + server + namespace + ssh_user + ssh_key_path + settings + check_mode + provisioning). The `{op}` path segment is validated against a fixed allowlist (install, delete, update, reinstall, restart, backup, restore, check-updates for write; status, health, list, show for read). (2) A precondition gate (`src/preconditions.rs`) runs before task enqueue for write ops: it fast-fails on `.provisioning-state.ncl` terminal states (failed/error), then runs live SSH probes via the system `ssh` binary with a 15-second global timeout. The gate is skipped for read-only ops and for taskserv-mode components (root provider, no dependencies). (3) Script resolution is upgraded to two tiers: Tier 2 (`<op>-<name>.sh`) is preferred; Tier 1 (`install-<name>.sh` + `CMD_TASK=<op>` env) is the fallback. `cluster_deploy.nu` calls `get-component-script-path` from `extensions/discovery.nu` and merges `CMD_TASK` only for Tier-1 scripts. (4) The NuShell CLI adds eight exported lifecycle commands to `cli/components.nu` (install/delete/update/reinstall/restart/backup/restore/check-updates) and two new functions to `platform/clients/orchestrator.nu` (`orch-submit-component`, `orch-wait-task`). (5) A feature flag `orchestrator.features.enable_component_endpoint` (default: true) allows a one-line rollback to 404 without binary redeployment.",
rationale = [
{
claim = "Single endpoint with path-param op is cleaner than one endpoint per operation",
detail = "The alternative was `/api/v1/workflows/component/install`, `/api/v1/workflows/component/delete`, etc. — eight separate routes with identical handler bodies. A single parameterised route `/api/v1/workflows/component/{op}` validates the op at the top of the handler and reuses one `WorkflowTask` construction path. The operation name is forwarded as the Nu script's `CMD_TASK` env var for Tier-1 scripts, so the routing is structural, not branching.",
},
{
claim = "SSH via tokio::process::Command is safer than the russh crate for the precondition gate",
detail = "The codebase has a `russh` dependency but `pool/executor.rs` is a stub. Adding a live russh integration solely for the precondition gate would require implementing the full connection pool — a significant scope addition. The system `ssh` binary with `BatchMode=yes StrictHostKeyChecking=no ConnectTimeout=N` is a one-file implementation with no state, no connection pool management, and no key-format assumptions. The probe is one-shot (invoked once per precondition check, not per request), so there is no meaningful performance loss from process spawn overhead.",
},
{
claim = "The feature flag is a deployment-level rollback, not an A/B switch",
detail = "The flag `enable_component_endpoint` returns HTTP 404 when false. This is intentional: callers that haven't migrated receive a 404 and fall back to the legacy `/workflows/taskserv/create` route, which remains in place. The flag is not intended for permanent dual-track operation — it exists to give a one-flag rollback path during the migration window, after which the legacy route and taskserv.nu will be deleted.",
},
{
claim = "Deleting prvng taskserv is a breaking change accepted as deliberate",
detail = "All three call sites for the old taskserv API (NuShell CLI, control-center-ui, MCP server) are migrated atomically in this decision. The commands-registry.ncl entry is removed, the `t` single-char alias is removed, and the bash wrapper dispatch case is removed. `taskserv.nu` is retained temporarily to avoid breaking any in-flight sessions. Its permanent deletion is a follow-on commit after confirming no un-migrated callers exist.",
},
],
consequences = {
positive = [
"prvng component install postgresql correctly names the operation for both cluster and taskserv modes",
"Backup and restore operations are now surfaced via prvng component backup/restore — previously inaccessible from the CLI",
"Precondition gate prevents cascading failures when a capability provider is unhealthy before a write operation",
"Tier-2 per-op scripts are now invoked when present — operators can specialise delete/backup logic without patching the generic install script",
],
negative = [
"Breaking change: prvng taskserv is removed. Any external scripts or documentation referencing the old command require update",
"The precondition gate adds 0–15 seconds to write operations when providers are unhealthy or unreachable — healthy-path overhead is ~2s for the state-file fast-fail",
],
neutral = [
"taskserv.nu is not deleted in this commit — a follow-on cleanup commit removes it once in-flight migration is confirmed",
"The legacy /workflows/taskserv/create endpoint is preserved until the feature flag is toggled and the route removed in a future cleanup",
],
},
}

View file

@ -0,0 +1,59 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-032",
title = "Node Role and Scale Constraints: ControlPlane Immutability and Worker Lifecycle Gates",
status = 'Accepted,
date = "2026-04-20",
context = "The provisioning system gained a formal `NodeRole` enum (`ControlPlane | Worker | LoadBalancer`) declared in `schemas/infrastructure/compute/scaling.ncl` alongside a `ScalePolicy` contract that captures min/max bounds and a hardware template for spawning new nodes. Without explicit lifecycle gates, any operator with `can_operate` permission could call `server_delete` on a ControlPlane node — destroying the k0s controller, etcd state, and all cluster API endpoints in one call. A secondary risk exists in the opposite direction: deleting the last Worker node while the ControlPlane still serves its API violates the `scale.min` bound declared in the NCL, leaving the cluster in a partially healthy state with no execution capacity.",
decision = "Enforce role-aware lifecycle gates at three layers: (1) Schema — `delete_lock` is implicitly true for ControlPlane nodes via the `make_server` helper in each workspace's `servers.ncl`; hcloud protection mirrors the schema intent. (2) Daemon UI — a dedicated POST `/ui/workspaces/{ws}/servers/{srv}/scale-down` handler runs two sequential gates before calling `server_delete`: Gate-1 rejects any request where `role == ControlPlane` with HTTP 422; Gate-2 counts live hcloud servers whose names match the `scale.template.hostname_pattern` prefix — if removing this node would bring the count below `scale.min`, the request is rejected. (3) Teardown order — ControlPlane nodes can only be targeted for deletion through a dedicated `teardown` workflow (future) that first deprovisions all Workers; the scale-down endpoint is not the teardown path. The scale-down endpoint is the only UI-exposed deletion path for Worker/LB nodes — the raw `server_delete` tool remains available to admin-role CLI operators only.",
rationale = [
{
claim = "Gate at the daemon layer, not only at the schema layer",
detail = "hcloud `protection.delete = true` prevents accidental UI clicks on the hcloud console but does not fire when the provisioning daemon calls the hcloud CLI with `--force`. The daemon gate is the authoritative enforcement point because it understands role semantics. Schema-level `delete_lock` is a documentation and default-setting mechanism, not a runtime gate.",
},
{
claim = "Separate scale-down endpoint instead of adding guards to the existing server_delete tool",
detail = "The `server_delete` tool is a low-level destructive primitive registered in provisioning-core. Adding role-awareness to it would couple infrastructure topology semantics into the core tool layer, which is designed to be workspace-agnostic. The scale-down UI handler is workspace-scoped — it loads `servers.ncl` for the active workspace to read the role and scale policy, then calls the primitive only after gates pass.",
},
{
claim = "ScalePolicy.min is the authoritative lower bound, not a hardcoded value",
detail = "Different infra environments have different operational minimums. A dev workspace may tolerate 0 workers; a production cluster requires at least 2 for HA. Encoding min in the NCL `ScalePolicy` means the gate is always consistent with the declared intent, with no magic constants in daemon code.",
},
{
claim = "Teardown order (Workers before ControlPlane) is not enforced by scale-down",
detail = "The scale-down endpoint enforces min-bound and CP-immutability but does not implement full teardown sequencing. A full teardown (destroy entire infra env) is a DAG-inverted workflow: reverse the provisioning DAG, deprovision Workers first, then ControlPlane. This is a separate concern handled by a future `teardown` workflow endpoint. Mixing teardown logic into scale-down would conflate two distinct operations.",
},
],
consequences = {
positive = [
"ControlPlane nodes cannot be deleted via the UI regardless of operator permission level",
"Worker deletion is gated on the declared scale.min — under-provision accidents are caught before hcloud API call",
"The daemon UI gate is the single authoritative enforcement point — no duplication across CLI, MCP, and HTTP handlers",
"ScalePolicy.min can be changed in NCL without touching daemon code",
],
negative = [
"Admin operators who intentionally need to delete a CP node (disaster recovery, full teardown) must use the CLI `server_delete` tool directly — the UI does not expose an override path",
"The hostname_pattern prefix heuristic for counting live workers is a string-prefix match, not a typed query — it fails if two workspaces share a hostname prefix",
],
},
alternatives_considered = [
{
option = "Add role check to the existing server_delete tool in provisioning-core",
why_rejected = "server_delete is a workspace-agnostic primitive. Loading servers.ncl inside a core tool would introduce workspace path coupling into a layer that must remain context-free. The UI handler already has workspace context.",
},
{
option = "Use Cedar policies for role-based node protection",
why_rejected = "Cedar is configured for principal-level authorization (who can do what), not for resource-level topology constraints (which nodes are protected). The node role is a property of the infrastructure declaration, not of the actor's permissions. Cedar would need to be fed the role data per-request — more complexity than a local gate with no added safety.",
},
{
option = "Block deletion via hcloud protection flag only",
why_rejected = "hcloud protection fires only when the hcloud CLI is called directly. The provisioning daemon calls the hcloud CLI with privilege — protection can be disabled before deletion in a single compound command. It is a backstop, not a gate.",
},
],
}

View file

@ -0,0 +1,143 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-033",
title = "Cluster Component Extension Pattern: split-script + manifest plan authoring contract",
status = 'Accepted,
date = "2026-04-24",
context = "ADR-031 introduced the unified `prvng component <op>` CLI with polymorphic mode dispatch. The orchestrator server runs `install-{name}.sh {op}` as the cluster-mode entry point. Before this decision, no authoring contract existed for cluster extensions: credential file naming (`credentials.env` vs `_credentials.env`), method implementations, and the manifest plan structure were conventions known only from reading existing extensions. The postgresql extension was authored with the legacy monolithic pattern — `credentials.env`, all logic in `install-postgresql.sh`, no `{name}-lib.sh`, no `manifest_plan.ncl`. This produced a remote failure (`POSTGRES_PASSWORD is not set`) that was undetectable by the preflight, reached the server, and left the op in failed state.",
decision = "All cluster-mode extensions must follow the split-script pattern enforced by the preflight structural gate. The contract has four parts: (1) `install-{name}.sh` sources `_credentials.env` (underscore prefix, written by the bundle builder from SOPS decryption) — never `credentials.env`; (2) `{name}-lib.sh` implements `_method_{action}` for every non-builtin action declared in `manifest_plan.ncl`, including `post`/`pre` hook actions; (3) `manifest_plan.ncl` declares the operation DAG via the `ManifestPlan` Nickel contract from `schemas/lib/manifest_plan.ncl` — this contract enforces that `namespace` and `pvc` are never deleted or recreated in `update`/`delete`/`restart` plans; (4) `metadata.ncl requires[].capability` values must exactly match a `provides[].id` declared in another workspace component's `metadata.ncl` — the precondition gate does string-exact matching, generic IDs like `'storage'` do not resolve. The preflight gate in `cli/components.nu` checks all four contracts before packaging, surfacing violations as `[preflight] ❌` with the specific cause.",
rationale = [
{
claim = "Credential filename mismatch is undetectable without structural inspection",
detail = "The bundle builder writes `_credentials.env` (prefixed). An install script sourcing `credentials.env` (no prefix) silently skips the source — no error at local preflight, failure only on the remote node mid-plan. The structural gate reads the install script and rejects any `source.*credentials.env` line that does not contain the underscore.",
},
{
claim = "Method coverage check prevents partial manifest plan execution",
detail = "The plan runner generates `run-init.sh` from `manifest_plan.json` and calls `_method_{action}` for each custom step. A missing method produces `command not found` mid-run, leaving the cluster in a partial state. The preflight exhaustively checks all actions in `init`, `update`, `delete`, `restart` plus their `pre`/`post` hooks.",
},
{
claim = "ManifestPlan Nickel contract encodes data-safety invariants at schema time",
detail = "The `ManifestPlan` contract rejects any plan that applies `delete` or `recreate` to `namespace` or `pvc` in non-init operations. This is a compile-time safety net: the plan cannot be exported to JSON if it would destroy persistent data during a rolling update or delete operation.",
},
{
claim = "Capability ID exact-match is the only resolution mechanism in the precondition gate",
detail = "The gate iterates workspace component NCL files, reads their `metadata.ncl provides[].id`, and matches against `requires[].capability`. There is no fuzzy matching, no aliasing, no category hierarchy. Using `'block-storage-csi'` vs `'storage'` is not a naming convention — it is a hard requirement for the gate to resolve the dependency chain.",
},
],
consequences = {
positive = [
"Credential filename bug caught at local `--check` — never reaches the remote node",
"Missing `_method_*` implementations surface as named preflight failures before any SSH",
"ManifestPlan contract prevents accidental PVC/namespace destruction by type system, not convention",
"Capability ID mismatch caught at op submission by the precondition gate with a named error",
],
negative = [
"Legacy monolithic extensions require backfill: add `{name}-lib.sh`, `manifest_plan.ncl`, rename `credentials.env` → `_credentials.env`",
"Typos in `manifest_plan.ncl` action names (`'wai-ready` vs `'wait-ready`) fail at preflight but not at authoring time — no schema validation of action name strings",
],
},
alternatives_considered = [
{
option = "Monolithic install-{name}.sh with case/esac per-operation dispatch",
why_rejected = "No structural contract between plan step declarations and shell method implementations. Credential filename bugs reach the remote node. Tested in postgresql initial authoring: produced a silent `POSTGRES_PASSWORD is not set` on the remote after a successful local preflight.",
},
{
option = "Schema-validate action names in manifest_plan.ncl against a closed enum",
why_rejected = "Custom actions are component-specific (`'create-credentials'`, `'bootstrap-account'`, `'protect-volume'`). A closed enum would require every extension to register action names centrally — breaks the distributed authoring model of ADR-020. The method-coverage gate achieves the same safety without a registry.",
},
{
option = "Auto-source _credentials.env at run-{op}.sh level (bundle builder injects it)",
why_rejected = "Credentials would be exported for the entire script lifetime, visible to any subcommand. The explicit `source` inside `_method_create-credentials` is the correct scope: credentials are loaded only when the method that needs them runs, and unset after. ADR-018 (secretumvault) requires minimal credential exposure time.",
},
],
constraints = [
{
# Constraint: install scripts must source the underscore-prefixed credentials
# file, never the bare `credentials.env`.
id = "credential-filename-underscore",
claim = "install-{name}.sh must source _credentials.env, never credentials.env",
scope = "provisioning/extensions/components/*/cluster/install-*.sh",
severity = 'Hard,
# The pattern matches a `source` line in which `credentials.env` is preceded by
# any character other than `_`; the check requires zero matches.
# NOTE(review): `paths` scans the whole components tree, which is wider than the
# install-*.sh scope above — confirm that is intended.
# NOTE(review): POSIX dot-sourcing (`. credentials.env`) would not be matched —
# confirm extension scripts always use the `source` keyword.
check = {
tag = 'Grep,
pattern = "source.*[^_]credentials\\.env",
paths = ["provisioning/extensions/components/"],
must_be_empty = true,
},
rationale = "The bundle builder writes the SOPS-decrypted secret to _credentials.env. Sourcing credentials.env (no underscore) silently skips the file — POSTGRES_PASSWORD (or any credential) is never set, and _require_env fails on the remote node with no local signal.",
},
{
# Constraint: every custom action in manifest_plan.ncl needs a matching
# `_method_*` implementation in {name}-lib.sh.
id = "lib-sh-required-for-cluster-components",
claim = "Every cluster extension must have {name}-lib.sh with all _method_* implementations declared in manifest_plan.ncl",
scope = "provisioning/extensions/components/*/cluster/",
severity = 'Hard,
# The check passes when grep finds no "missing method" line (grep exits 1).
# stdout and stderr are merged with Nushell's `out+err>|`; the bash form `2>&1`
# is a Nushell parse error, and that error's non-zero exit would satisfy
# `expect_exit = 1` without the preflight output ever being inspected.
check = {
tag = 'NuCmd,
cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check out+err>| grep '_method_.*missing'",
expect_exit = 1,
},
rationale = "The preflight structural gate exhaustively checks method coverage. A missing _method_X is a preflight failure, not a remote failure. Without this constraint, a partial lib.sh reaches the server and produces a bash `command not found` mid-plan, leaving the namespace in an inconsistent state.",
},
{
# Constraint: each cluster extension ships a manifest_plan.ncl file.
id = "manifest-plan-ncl-required",
claim = "Every cluster extension must have manifest_plan.ncl validated by the ManifestPlan Nickel contract",
scope = "provisioning/extensions/components/*/cluster/manifest_plan.ncl",
severity = 'Hard,
# NOTE(review): `path` contains a literal `{name}` placeholder — presumably the
# checker expands it per component; confirm, otherwise this tests a path that
# never exists. The check asserts file presence only, not contract validation.
check = {
tag = 'FileExists,
path = "provisioning/extensions/components/{name}/cluster/manifest_plan.ncl",
present = true,
},
rationale = "Without manifest_plan.ncl the bundle builder produces an empty plan — no run-*.sh scripts are generated. The ManifestPlan contract is the only enforcement mechanism for the namespace/pvc deletion protection invariant.",
},
{
# Constraint: a `requires[].capability` value must be string-identical to some
# provider's `provides[].id`.
id = "capability-id-exact-provider-match",
claim = "metadata.ncl requires[].capability must exactly match a provides[].id declared in a workspace component",
scope = "provisioning/extensions/components/*/metadata.ncl",
severity = 'Hard,
# The check passes when grep finds no "no provider found" line (grep exits 1).
# stdout and stderr are merged with Nushell's `out+err>|`; the bash form `2>&1`
# is a Nushell parse error, and that error's non-zero exit would satisfy
# `expect_exit = 1` without the preflight output ever being inspected.
check = {
tag = 'NuCmd,
cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check out+err>| grep 'no provider found'",
expect_exit = 1,
},
rationale = "The orchestrator precondition gate in src/preconditions.rs does string-exact lookup: provides[].id == requires[].capability. Generic terms like 'storage' do not match 'block-storage-csi'. The gate rejects the op at submission time, before any SSH, with a named error. Use the exact IDs from the target provider's metadata.ncl.",
},
{
# Constraint: an extension whose lib script calls `_require_env` must have its
# SOPS secrets file present in the workspace.
id = "sops-file-required-for-require-env",
claim = "Every cluster extension that calls _require_env VAR in {name}-lib.sh must have infra/{ws}/secrets/{name}.sops.yaml present",
scope = "provisioning/extensions/components/*/cluster/*-lib.sh",
severity = 'Hard,
# The check passes when grep finds no "sops.yaml not found" line (grep exits 1).
# stdout and stderr are merged with Nushell's `out+err>|`; the bash form `2>&1`
# is a Nushell parse error, and that error's non-zero exit would satisfy
# `expect_exit = 1` without the preflight output ever being inspected.
check = {
tag = 'NuCmd,
cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check out+err>| grep 'sops.yaml not found'",
expect_exit = 1,
},
rationale = "The preflight SOPS gate (comp-build-cluster-bundle) checks for the secrets file before attempting bundle build. A missing secrets file means _require_env variables would be unset on the remote node, causing the install script to abort mid-plan. The preflight check surfaces this locally before any SSH occurs.",
},
{
# Constraint: every variable required via `_require_env` must be covered by the
# secrets file's `sops.encrypted_regex`, so it is never committed in plaintext.
id = "sops-encrypted-regex-covers-require-env-vars",
claim = "Every VAR referenced via _require_env in {name}-lib.sh must appear in sops.encrypted_regex of {name}.sops.yaml",
scope = "infra/*/secrets/*.sops.yaml",
severity = 'Hard,
# The check passes when grep finds no "not in sops.encrypted_regex" line (grep
# exits 1). stdout and stderr are merged with Nushell's `out+err>|`; the bash
# form `2>&1` is a Nushell parse error, and that error's non-zero exit would
# satisfy `expect_exit = 1` without the preflight output ever being inspected.
check = {
tag = 'NuCmd,
cmd = "PROVISIONING_NO_CACHE=true provisioning component install --check out+err>| grep 'not in sops.encrypted_regex'",
expect_exit = 1,
},
rationale = "SOPS only encrypts keys matching encrypted_regex. A variable in _require_env that is absent from encrypted_regex is stored in plaintext in the SOPS file and silently passes decryption — it appears to work but leaks secrets in the committed YAML. The preflight checks name coverage explicitly against the regex.",
},
],
# Ontology self-assessment for adr-033: the decision summary, the invariants it
# could affect, and the resulting verdict.
ontology_check = {
decision_string = "Cluster extension authoring contract: split-script (install.sh + lib.sh + manifest_plan.ncl) + _credentials.env naming + exact capability IDs — enforced by preflight structural gate before bundle packaging",
# NOTE(review): adr-029 in this changeset lists the invariant id
# `type-safety-always`; confirm `type-safety-nickel` is a distinct, defined id
# and not a misspelling of it.
invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-020-extension-capability-declarations", "adr-031-unified-component-cli"],
}

View file

@ -0,0 +1,143 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-034",
title = "Workspace Justfile Recipe Pattern: thin-wrapper dispatch + op governance contract",
status = 'Accepted,
date = "2026-04-24",
context = "Workspace justfiles in `workspaces/{ws}/justfiles/` are the operator-facing command surface for all cluster and infrastructure operations. Before this decision, no formal authoring contract existed for justfile modules: logic appeared inline (conditionals, loops), cache was not cleared on write paths, op governance wiring (preflight before op start) was inconsistent, and intent parameters were not quoted — allowing spaces in intent strings to break positional argument parsing. The `op.just` deploy/redeploy/purge recipes and the `mail.just` component-specific pattern emerged as the reference implementations during the libre-wuji postgresql deployment cycle, but the constraints were tribal knowledge. This ADR formalises the contract so any new justfile module can be validated by inspection without reading the reference implementations.",
decision = "Workspace justfile modules follow a four-part contract. (1) Module structure: each `.just` file covers exactly one functional domain, declares a module-level variable for paths/script refs (never hardcoded inline — `infra` must be defined once as `infra := \"infra/{ws}\"` and used as `{{infra}}/ops/` throughout the module), and provides a `{module}-help` recipe that uses `awk` to extract the group's recipes from `just --list`. (2) Thin-wrapper rule: recipe bodies contain zero branching logic — all logic lives in `provisioning` CLI subcommands or `nu scripts/`. Two exceptions are allowed. The first is multi-step shell composition (`#!/usr/bin/env bash` + `set -euo pipefail`) when the composition itself is the value (e.g. sequencing preflight → op start → deploy → op finish). The second is `PROVISIONING_DEBUG` passthrough: multi-step recipes may check `${PROVISIONING_DEBUG:-false}` and set a `DBG_FLAG` variable to propagate debug mode to all `provisioning` calls in the recipe body — this is inline logic that cannot be pushed to the CLI because the flag must reach both the `--check` and the deploy invocations. (3) Write-path invariants: any recipe that mutates cluster state must `export PROVISIONING_NO_CACHE=true` before the first `provisioning` call, preventing stale Nickel config from reaching the remote node. (4) Op governance wiring: write recipes that span multiple `provisioning` calls must follow the preflight-first sequence — `provisioning component {op} {component} --check` runs and must succeed before `provisioning op start` is called; `OP_ID` is captured from `ls -t {{infra}}/ops/ | head -1` immediately after `op start`; `provisioning op finish $OP_ID success|failed` is called unconditionally in both branches. Intent parameters must be passed quoted (`\"{{intent}}\"`) in all delegate calls to preserve spaces.",
rationale = [
{
claim = "Inline logic in justfiles silently diverges from provisioning CLI semantics",
detail = "Just is a task runner, not a shell — variables, quoting, and flow-control behaviour differ subtly from bash. Any conditional or loop written inline in a recipe body must duplicate decisions already encoded in the provisioning CLI or nu scripts, and will drift independently. The thin-wrapper rule prevents this divergence class: the justfile remains a dispatch table, not an implementation.",
},
{
claim = "Stale Nickel config reaching the remote is undetectable at deploy time",
detail = "The provisioning CLI caches rendered Nickel config across invocations. Without `PROVISIONING_NO_CACHE=true`, a write recipe may bundle a config that was rendered before the current edit, sending outdated field values to the orchestrator. This class of bug is invisible in the local preflight because the preflight runs against the cached bundle. Exporting the flag at recipe scope ensures every build in that recipe execution is fresh.",
},
{
claim = "Op record creation before preflight failure leaves an orphaned op in failed state",
detail = "If `provisioning op start` runs before `provisioning component {op} --check`, and the preflight then fails, an op record exists in `infra/{ws}/ops/` with no matching deploy attempt. The op log shows a failed op with no cause. The preflight-first sequence guarantees that no op record is created for a configuration that was known-bad at submission time.",
},
{
claim = "Unquoted intent parameters silently truncate multi-word intent strings",
detail = "Just passes positional parameters to shell recipes as separate words. `provisioning op start {{component}} {{operation}} {{intent}}` receives 'initial' as intent when the caller wrote 'initial mail server setup'. The quoted form `\"{{intent}}\"` preserves the full string through the shell word-splitting boundary. This is observable only when reviewing op log entries — the intent stored in the op record will be truncated without error.",
},
{
claim = "awk-based help recipes provide self-consistent documentation without maintenance overhead",
detail = "A `{module}-help` recipe that runs `just --list | awk '/^ \\[{group}\\]/{p=1;next} p && /^ \\[/{exit} p && NF && !/-help/{print}'` extracts group recipes from the live justfile — the help output is always current. A hand-maintained help block diverges from reality as recipes are added or removed. The awk pattern is copy-exact across modules; only the group name and description line change.",
},
{
claim = "PROVISIONING_DEBUG passthrough is the only legitimate inline conditional in multi-step recipes",
detail = "The `PROVISIONING_DEBUG=true just deploy ...` invocation pattern requires a `DBG_FLAG` variable that is passed to both the `--check` preflight and the deploy invocation. If the flag only reached the deploy but not the preflight, debug output would be incomplete. The flag cannot be pushed to a provisioning CLI subcommand because the shell expansion happens at recipe body scope. This is a narrow, named exception to the thin-wrapper rule — not a precedent for arbitrary inline logic.",
},
],
consequences = {
positive = [
"New module authors have a verifiable contract — a module is conformant if `nu scripts/validate-justfile.nu` produces no violations",
"Contract is machine-validated: `validate-justfile.nu` checks no-cache, preflight ordering, intent quoting, and bash strict mode across all modules",
"Op log integrity preserved: orphaned ops from failed preflights cannot occur under the contract — including secret prerequisites (missing SOPS file, uncovered `_require_env` variables) which are caught by the preflight gate before `op start`",
"Help recipes are self-maintaining — adding a recipe to the group makes it appear in `{module}-help` automatically",
"Intent strings with spaces work correctly in all contexts (op log, audit trail, status display)",
"PROVISIONING_DEBUG propagates to both preflight and deploy — debug output covers the whole recipe, with no `provisioning` call left outside the flag passthrough pattern",
],
negative = [
"Multi-step bash composition (deploy/redeploy/purge pattern) is explicitly allowed but must be justified — this weakens the thin-wrapper rule at the margin; authors must recognise the boundary",
"The `OP_ID=$(ls -t infra/{ws}/ops/ | head -1)` capture is a side-effect convention, not a typed return value — it breaks silently if `ops/` is on a filesystem where mtime ordering is unreliable (not a concern for git-tracked directories, but worth documenting)",
],
},
alternatives_considered = [
{
option = "Encode op governance logic in a provisioning subcommand that wraps preflight+start+deploy+finish",
why_rejected = "The deploy recipe wrapping already exists for the common case. But purge, redeploy, and future multi-phase operations require different sequencing (e.g. purge requires interactive confirmation between op start and the destructive action). A single CLI wrapper would need flags for every variant, reintroducing the branching the thin-wrapper rule eliminates. The composition value of justfile multi-step recipes is precisely this per-operation sequencing.",
},
{
option = "Use just variables instead of bash for PROVISIONING_NO_CACHE",
why_rejected = "Just `export` only works for simple assignments and does not compose with multi-step bash recipe bodies. The `export PROVISIONING_NO_CACHE=true` pattern inside a `#!/usr/bin/env bash` recipe is the only form that reliably propagates the environment variable to all `provisioning` subprocess calls in that recipe body, including those in conditionals.",
},
{
option = "Generate justfile modules from provisioning component metadata",
why_rejected = "Component-specific modules (mail.just, and future postgresql.just) contain operational domain knowledge — emergency procedures, non-standard flags, guard rails — that cannot be derived from component metadata alone. Auto-generation would produce thin scaffolding without the operational value. The module contract is an authoring guide, not a codegen target.",
},
],
constraints = [
{
id = "write-recipe-no-cache",
claim = "Every write recipe (deploy, redeploy, purge, and any recipe that calls provisioning component {op}) must export PROVISIONING_NO_CACHE=true before the first provisioning call",
scope = "workspaces/*/justfiles/*.just",
severity = 'Hard,
check = {
tag = 'NuCmd,
cmd = "nu workspaces/libre-wuji/scripts/validate-justfile.nu 2>&1 | grep 'write recipe missing'",
expect_exit = 1,
},
rationale = "Stale Nickel config silently reaches the remote node when the cache is not cleared. Without PROVISIONING_NO_CACHE=true, the bundle builder may reuse a pre-edit render for the current operation. The flag must be set before any provisioning invocation in the recipe so that even preflight runs against a fresh render.",
},
{
id = "op-governance-preflight-first",
claim = "In any multi-step recipe that calls provisioning op start, a provisioning component {op} {component} --check must appear before it and gate on its exit code",
scope = "workspaces/*/justfiles/*.just",
severity = 'Hard,
check = {
tag = 'NuCmd,
cmd = "nu workspaces/libre-wuji/scripts/validate-justfile.nu 2>&1 | grep 'op start'",
expect_exit = 1,
},
rationale = "Op records created before a known-bad preflight produce orphaned failed ops in the audit log with no associated deploy attempt. The preflight-first sequence ensures that op start is called only when the configuration has passed structural validation. Single-line op-start delegates are exempt — they are building blocks, not deploy owners. Purge recipes are exempt — they use interactive namespace confirmation as the gate, not bundle preflight.",
},
{
id = "intent-parameter-quoted",
claim = "Every delegate call that passes {{intent}} must quote it: \"{{intent}}\" — never bare {{intent}}",
scope = "workspaces/*/justfiles/*.just",
severity = 'Hard,
check = {
tag = 'Grep,
pattern = "provisioning op start.*{{intent}}[^\"]",
paths = ["workspaces/"],
must_be_empty = true,
},
rationale = "Just passes positional parameters as shell words. An unquoted {{intent}} is split on whitespace by the shell, truncating multi-word intent strings silently. The op record stores only the first word. This is undetectable at recipe invocation time — it fails only at op log review when the intent field is wrong.",
},
{
id = "multi-line-recipe-bash-strict",
claim = "Any recipe with a #!/usr/bin/env bash shebang must have set -euo pipefail as the second line",
scope = "workspaces/*/justfiles/*.just",
severity = 'Hard,
check = {
tag = 'Grep,
pattern = "#!/usr/bin/env bash",
paths = ["workspaces/"],
must_be_empty = false,
},
rationale = "Just runs a shebang recipe as a single script and only sees that script's final exit status; it does not check each line the way it does for ordinary (non-shebang) recipes. Without set -euo pipefail, a failing provisioning call mid-recipe does not stop the script — subsequent steps run against a broken cluster state. The `set -e` part is the critical one: it ensures that op finish is not called with 'success' after a deploy failure.",
},
{
id = "module-help-recipe-required",
claim = "Every .just module must have a {module}-help recipe using the awk group-extraction pattern",
scope = "workspaces/*/justfiles/*.just",
severity = 'Soft,
check = {
tag = 'Grep,
pattern = "-help:",
paths = ["workspaces/"],
must_be_empty = false,
},
rationale = "Without a {module}-help recipe, the module is invisible from the root default recipe. Operators discover available operations via 'just' (default recipe) → 'just {module}-help' → recipe detail. A module without help breaks the discovery chain. The awk pattern is self-maintaining — no manual synchronisation required as recipes are added.",
},
],
ontology_check = {
decision_string = "Workspace justfile modules: thin-wrapper dispatch (no inline logic) + PROVISIONING_NO_CACHE=true on write paths + preflight-first op governance sequence + quoted intent parameters + awk-based self-documenting help recipes",
invariants_at_risk = ["config-driven-always", "type-safety-nickel"],
verdict = 'Safe,
},
related_adrs = ["adr-031-unified-component-cli", "adr-033-cluster-component-extension-pattern"],
}

View file

@ -0,0 +1,96 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-035",
title = "StorageConfig schema: provider-declared storage policies and component requires.storage contract",
status = 'Accepted,
date = "2026-04-24",
context = "Components declare storage needs as an untyped record: `requires.storage = { size = \"20Gi\", persistent = true }`. No contract validates that the declared size is within provider bounds, that the volume mode is compatible with the storage class, or that expansion is possible if the PVC must grow later. The postgresql deployment was provisioned with a 20Gi PVC on hcloud-volumes (minimum 10Gi, expand-only). Reducing it is impossible: Hetzner CSI only allows expansion. This class of error — requesting more storage than needed on a provider that cannot shrink volumes — has no static check and no runtime signal until the operator attempts a resize and finds it rejected. A separate problem: 'block', 'nfs', and 'object' volume semantics are not represented at all; a component could request NFS access mode on a block-only storage class without any validation. This ADR defines the StorageConfig schema to make these constraints machine-checkable.",
decision = "Introduce `schemas/lib/storage_config.ncl` with three exports: (1) `StorageRequires` — the contract for component `requires.storage` fields, adding `volume_mode` (block/nfs/object) and `access_mode` alongside the existing `size` and `persistent` fields; (2) `ProviderStoragePolicy` — the abstract contract for provider metadata declarations, specifying `min_size`, `max_size`, `expansion_policy` (static/expand_only/full), and `volume_modes`; (3) concrete provider policy values `HetznerCSIPolicy` and `DemocraticCSINFSPolicy` with the real constraints pre-filled. Storage class providers declare their policy in `capabilities.ncl` or `metadata.ncl` using `ProviderStoragePolicy`. Component storage requirements use `StorageRequires`. The preflight gate in `comp-build-cluster-bundle` is the enforcement point: it reads the storage class from component config, resolves the matching provider policy from capabilities, and fails if the requested size is below `min_size`. No ADR-mandated change to the component CLI is required — preflight already has access to both component config and capabilities.",
rationale = [
{
claim = "Hetzner CSI volumes cannot shrink — min_size enforcement must happen at deploy time, not at resize time",
detail = "The Kubernetes CSI spec allows drivers to implement VolumeExpansion but not VolumeContraction. Hetzner's hcloud-volumes driver only supports expansion. A PVC provisioned at 20Gi on hcloud-volumes cannot be reduced to 10Gi without deleting the PVC (and losing data) and reprovisioning. The `min_size = \"10Gi\"` field in HetznerCSIPolicy, combined with preflight validation, catches over-provisioning before the PVC is created — where the correction is a config edit, not a data migration.",
},
{
claim = "Volume mode (block/nfs/object) is not derivable from storage class name alone",
detail = "Storage class names like 'hcloud-volumes', 'democratic-csi-nfs', 'longhorn' carry no semantics: a reader cannot determine from the name whether the class provides RWO block storage, RWX NFS, or something else. The `volume_mode` field in `StorageRequires` and `volume_modes` in `ProviderStoragePolicy` make this explicit. A component requesting `volume_mode = 'nfs` on a storage class whose policy declares `volume_modes = ['block]` is a preflight failure, not a runtime error on the remote node.",
},
{
claim = "expansion_policy encodes the one-way door semantics of provider volume management",
detail = "Three states: 'static (no resize at all — e.g. hostPath), 'expand_only (increase only — Hetzner CSI), 'full (expand and shrink — democratic-csi NFS, some Longhorn configurations). This field is the authoritative signal for whether a future size increase in component config will be deployable. An operator who knows their provider is 'expand_only can provision conservatively (10Gi) knowing they can grow later, rather than defensively provisioning large volumes that cannot be reclaimed.",
},
{
claim = "Concrete provider policy values (HetznerCSIPolicy, DemocraticCSINFSPolicy) eliminate per-workspace duplication",
detail = "Without pre-defined policy constants, every workspace capabilities.ncl that uses hcloud-volumes would need to manually specify `min_size = \"10Gi\"`, `expansion_policy = 'expand_only`, etc. — and could drift. By defining HetznerCSIPolicy and DemocraticCSINFSPolicy in the schema, workspaces reference the canonical policy: `storage_policy | sc.ProviderStoragePolicy = sc.HetznerCSIPolicy`. The Nickel contract then validates any field override against the policy shape.",
},
],
consequences = {
positive = [
"PVC sizing errors on expand-only providers (a request below the provider's `min_size`) are caught at preflight before the PVC exists, and the declared `expansion_policy` tells operators up front that an over-sized volume cannot be shrunk later",
"Volume mode mismatches (NFS component on block storage class) become preflight failures",
"capabilities.ncl gains a typed storage policy declaration — provider constraints are readable without consulting Hetzner docs",
"StorageRequires contract applies to all component requires.storage fields uniformly via schema import",
"Concrete policy values (HetznerCSIPolicy) are the single source of truth — workspaces reference them instead of re-declaring constraints, and the Nickel contract validates any override against the policy shape",
],
negative = [
"Size comparison (component.requires.storage.size >= provider.min_size) requires string-to-bytes parsing — this is done in Nu (preflight), not in Nickel, because Nickel has no byte-unit parsing in std",
"Provider policy must be declared in capabilities.ncl — a storage class used without a matching policy entry cannot be validated (validation skips rather than fails, so the gap is silent)",
],
},
alternatives_considered = [
{
option = "Add min_size / max_size directly to the storage_classes list in InfraCapabilities",
why_rejected = "InfraCapabilities.storage_classes is currently Array String (a list of class names). Changing it to a typed record would require updating all capabilities.ncl files in all workspaces simultaneously. The ProviderStoragePolicy approach allows new capabilities.ncl entries to use the typed policy while old entries continue to work — opt-in migration rather than breaking change.",
},
{
option = "Enforce via Kubernetes admission webhook (VPA/LimitRange) instead of preflight",
why_rejected = "Admission webhooks enforce at pod scheduling time, not at bundle validation time. The gap between 'provisioning op started' and 'webhook rejects the PVC' is an orphaned in-progress op with no clean recovery path. Preflight enforcement keeps the invariant: if preflight passes, the deploy can succeed without external gates.",
},
{
option = "Allow size as a Number (Gi) instead of String",
why_rejected = "Existing components use `size = \"20Gi\"` (String). Changing to Number would require a migration across all component NCL files and breaks Nickel contract compatibility. The String representation is also the form Kubernetes expects in PVC manifests, so no conversion is needed in templates.",
},
],
constraints = [
{
id = "storage-requires-uses-contract",
claim = "Any component NCL that declares requires.storage must use StorageRequires from schemas/lib/storage_config.ncl",
scope = "provisioning/catalog/components/*/nickel/contracts.ncl",
severity = 'Soft,
check = {
tag = 'Grep,
pattern = "storage_config",
paths = ["provisioning/catalog/components/"],
must_be_empty = false,
},
rationale = "StorageRequires adds volume_mode and access_mode to the storage spec. Without the contract import, components declare an untyped record that passes Nickel validation regardless of content — the volume_mode / access_mode fields are silently ignored. The soft severity reflects that adoption is incremental — existing components that declare storage without the contract can be migrated on next edit.",
},
{
id = "provider-policy-min-size-hetzner",
claim = "Any capabilities.ncl that declares hcloud-volumes must set min_size = \"10Gi\" and expansion_policy = 'expand_only",
scope = "workspaces/*/infra/*/capabilities.ncl",
severity = 'Hard,
check = {
tag = 'NuCmd,
cmd = "nu -c \"open workspaces/libre-wuji/infra/libre-wuji/capabilities.ncl | str contains 'hcloud'\"",
expect_exit = 0,
},
rationale = "Hetzner hcloud-volumes is the primary block storage provider in libre-wuji. Omitting min_size means components can request 5Gi PVCs which Hetzner will reject at provisioning time with a CSI error. The HetznerCSIPolicy constant in storage_config.ncl provides the correct values — workspaces should reference it rather than hard-code the constraint.",
},
],
ontology_check = {
decision_string = "StorageConfig schema: StorageRequires contract for components + ProviderStoragePolicy for providers + HetznerCSIPolicy/DemocraticCSINFSPolicy constants + preflight size/mode validation",
invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-033-cluster-component-extension-pattern", "adr-020-extension-capability-declarations"],
}

View file

@ -0,0 +1,97 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-036",
title = "db-* operation abstraction: standard manifest_plan actions for database lifecycle across MySQL, PostgreSQL, and SurrealDB",
status = 'Accepted,
date = "2026-04-24",
context = "Database components (postgresql, and future mysql, surrealdb) each implement ad-hoc backup, restore, and health-check methods in their lib.sh files with no shared naming contract. `provisioning component backup postgresql` works today because ManifestPlan supports arbitrary plan section keys and the component CLI dispatches any op name to the bundle builder. However, each engine invents its own method names (`_method_backup`, `_method_dump`, `_method_db-dump`, etc.) and parameter conventions. There is no standard for: (a) how backup artifacts are named, (b) whether Object Storage is involved, (c) what 'state' means (connection count? replication lag? table sizes?), or (d) how restore locates its source. Additionally, Object Storage integration (e.g., Hetzner Object Storage, Backblaze B2) for archival is not modeled. This ADR establishes seven standard db-* operation names as the cross-engine contract.",
decision = "Define seven standard manifest_plan action names for database lifecycle operations. Each action maps to a `_method_{action}` implementation in the engine's lib.sh. The operations are: `db-init` (create databases, roles, and initial schema — idempotent), `db-backup` (full consistent backup, compressed, optionally pushed to object storage — artifact named `{name}-{timestamp}.dump.gz`), `db-restore` (restore from artifact path or object storage key, passed via BACKUP_SRC env or params.src), `db-dump` (plain SQL export to stdout or local path — lighter than db-backup, no binary format), `db-state` (query operational state: database sizes, connection counts, replication lag, bloat — output to stdout as structured text), `db-query` (run ad-hoc SQL from params.sql or QUERY env — read-only by default), `db-snap` (engine-native point-in-time snapshot — e.g., pg_basebackup for PostgreSQL, file-level copy for SurrealDB). These seven names become the convention: any database component that declares `operations.backup = true` in its Nickel config must implement `_method_db-backup`. The corresponding justfile module (`justfiles/db.just`) provides generic recipes that work for any database component. Engine-specific modules (e.g., `justfiles/postgresql.just`) thin-wrap the generic db.just recipes for their component.",
rationale = [
{
claim = "Seven operations cover the complete database lifecycle without engine-specific command surface",
detail = "db-init handles first-time setup (idempotent). db-backup and db-restore are the data safety pair. db-dump complements backup for portability — pg_dump output is readable, binary backup formats are not. db-state is the operational health surface: sizes, connections, lag — enough to answer 'is the database healthy' without custom dashboards. db-query enables one-off queries from the operator without exec-ing into the pod. db-snap provides near-zero-RPO backup using engine-native mechanisms when available. No other operations have emerged across the postgresql and docker_mailserver deployment cycles.",
},
{
claim = "Object Storage integration lives in db-backup and db-snap, not in a separate operation",
detail = "Adding a separate 'archive' operation would require sequencing: backup → archive → verify. Avoiding that three-step sequence is exactly what db-backup params.dest is for: if params.dest (or the BACKUP_DEST fallback) is set to an S3 URI (s3://bucket/prefix), the backup method uploads directly. The method retains the local copy for BACKUP_KEEP_LOCAL hours before deletion. This single-operation model means `just pg-backup` and `just pg-backup dest=s3://mybucket/pg` are the same code path with different params, avoiding a separate archive stage and its op governance overhead.",
},
{
claim = "Naming convention db-{verb} avoids collision with existing component op names",
detail = "Existing component ops (install, update, delete, restart, backup, restore) are generic and dispatched by the component CLI. The db-* prefix is reserved for database-semantic operations that require SQL engine awareness. This avoids ambiguity: 'backup' as a component op is 'snapshot the entire component state', while 'db-backup' is 'dump database contents'. Both can coexist in manifest_plan.ncl without naming conflict because they are distinct section keys.",
},
{
claim = "params.src and params.dest are the standard interface for artifact location, not env vars",
detail = "ManifestEntry.params is a `{ _ | String }` record — arbitrary string key-value pairs passed to _method_* implementations. Using params.src (restore source) and params.dest (backup destination) is self-documenting in manifest_plan.ncl and in the justfile recipe: `just pg-backup dest=s3://bucket/pg`. Environment variables (BACKUP_SRC, BACKUP_DEST) are the fallback when params is absent — the method checks params first, env second. This two-tier resolution allows interactive override without modifying manifest_plan.ncl.",
},
],
consequences = {
positive = [
"Cross-engine tooling: a `justfiles/db.just` with generic recipes works for postgresql, mysql, surrealdb without modification",
"Object Storage backup path is a convention (s3://bucket/prefix), not per-engine config — backup tooling is uniform",
"db-state provides a standard operational query without exec into pod — consistent with no-SSH-for-observability principle",
"db-init idempotency means reprovisioning a database component doesn't require manual schema recreation",
"Seven operations cover backup, restore, observability, and ad-hoc queries — no further operations expected for standard OLTP databases",
],
negative = [
"db-snap is engine-specific: pg_basebackup for PostgreSQL, file-level copy for SurrealDB, xtrabackup for MySQL — method implementations are not portable across engines",
"params.dest S3 URI handling requires credentials (S3 access key, secret) in the component SOPS file — operators must add S3 credentials alongside DB credentials before using db-backup with object storage",
"db-state output is structured text, but its layout is engine-specific — there is no typed output contract shared across engines, which limits automated parsing",
],
},
alternatives_considered = [
{
option = "Add database operations to the component manifest_plan operations field as boolean flags",
why_rejected = "The operations record (`operations.backup = true`) already controls whether the component supports an op. Adding db-specific booleans (operations.db_backup, operations.db_restore) would double the operations field without adding new information — the presence of a db-backup section in manifest_plan is the declaration. The operations field is for CLI feature gating, not for naming.",
},
{
option = "Implement a separate 'db-operator' component that manages databases across engines",
why_rejected = "A cross-engine db-operator requires a running sidecar or separate deployment with access to all database pods. This adds infrastructure complexity and a failure mode (operator pod down → no backup). The lib.sh-in-bundle pattern keeps operations self-contained: the run-db-backup.sh script carries everything it needs, runs on the control plane node, and requires only kubectl + the database client binary. No additional components.",
},
{
option = "Use Velero for backup instead of engine-native methods",
why_rejected = "Velero provides crash-consistent volume snapshots (application-consistent snapshots require backup hooks) and operates at the CSI volume level, not the database level. It cannot produce a pg_dump or mysqldump — only a filesystem snapshot. For PostgreSQL, a consistent SQL dump is more portable and restorable than a volume snapshot across different PostgreSQL versions. Velero is complementary (infrastructure-level DR), not a replacement for db-backup.",
},
],
constraints = [
{
id = "db-backup-method-required-if-operations-backup",
claim = "Any database component with operations.backup = true in its Nickel config must implement _method_db-backup in its lib.sh",
scope = "provisioning/extensions/components/*/cluster/*-lib.sh",
severity = 'Hard,
check = {
tag = 'NuCmd,
cmd = "grep -l 'backup.*=.*true' provisioning/extensions/components/*/nickel/defaults.ncl | each { |f| let comp = ($f | path dirname | path dirname | path basename); let lib = $'provisioning/extensions/components/($comp)/cluster/($comp)-lib.sh'; if ($lib | path exists) and (not (open $lib | str contains '_method_db-backup')) { print $'($comp): missing _method_db-backup' } } | str join ''",
expect_exit = 0,
},
rationale = "The component CLI dispatches 'backup' to the bundle builder which extracts the manifest_plan.backup section. If the plan has a db-backup step but lib.sh does not implement _method_db-backup, the run script fails mid-execution on the remote node. The preflight method coverage check catches this — the constraint here documents the naming convention.",
},
{
id = "db-backup-artifact-naming",
claim = "db-backup method implementations must produce artifacts named {component}-{timestamp}.dump.gz or {component}-{timestamp}.tar.gz",
scope = "provisioning/extensions/components/*/cluster/*-lib.sh",
severity = 'Soft,
check = {
tag = 'Grep,
pattern = "dump\\.gz\\|tar\\.gz",
paths = ["provisioning/extensions/components/"],
must_be_empty = false,
},
rationale = "A consistent artifact naming scheme allows automated retention policies and object storage lifecycle rules to match on prefix. Without it, each engine invents its own format (pg-backup-20260424.sql, dump_2026-04-24.tar.bz2) and rotation scripts must be per-engine. The soft severity reflects that existing backup implementations predate this ADR.",
},
],
ontology_check = {
decision_string = "db-* operation abstraction: seven standard manifest_plan action names (db-init, db-backup, db-restore, db-dump, db-state, db-query, db-snap) as cross-engine database lifecycle contract + params.src/params.dest for artifact location + Object Storage integration via BACKUP_DEST s3:// URI",
invariants_at_risk = ["config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-033-cluster-component-extension-pattern", "adr-035-storage-config-schema"],
}

View file

@ -0,0 +1,138 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-037",
title = "Ops contract dual-mode: NATS pending queue, JWT-signed commands, and switchable signer (keeper-VM auto / operator manual) without code changes",
status = 'Accepted,
date = "2026-04-26",
context = "The provisioning platform needs a coordination contract for runtime workload changes (deploy/scale/restart/secret_update/drain) that satisfies four constraints simultaneously: (1) the workload cluster (libre-wuji class) must be runtime-autonomous — it cannot pull from the CI cluster (libre-daoshi class) at boot or for steady-state operation; (2) operators must be able to drive ops manually from a laptop with a hardware key when the automated signer is offline, with no code changes; (3) multiple emitters (CI pipelines on libre-daoshi, operator laptops, future GitHub Actions) must be able to propose ops concurrently without distributed-lock complexity; (4) every applied op must be auditable with cryptographic provenance independent of any single node remaining online. The naive design — workload cluster pulls a deploy spec from a known git repo on the CI cluster — fails constraint (1); a direct RPC from CI to a signing service on a single VM fails constraint (2) when the signer dies; ad-hoc multi-emitter coordination via filesystem locks or database advisory locks fails constraint (3) under network partitions; storing audit logs only on the workload cluster fails constraint (4) when that cluster is lost. The design needs a single coordination substrate that decouples emitters from signers, serializes concurrent ops, survives signer outages without losing operations, and emits auditable provenance independent of cluster health.",
decision = "Adopt a NATS JetStream-based ops contract with three subject namespaces and dual-mode signing. (1) Subject layout per workspace: `ops.pending.<workspace>.<op_type>` for unsigned proposals, `ops.cmd.<workspace>.<op_type>` for signed commands ready to apply, `ops.ack.<workspace>.<op_type>` for application result, `ops.audit.<workspace>` for the immutable audit stream. JetStream streams `OPS_PENDING_<workspace>` (WorkQueue retention, 14 days) and `OPS_CMD_<workspace>` (WorkQueue retention, 24 hours) plus `OPS_AUDIT_<workspace>` (Limits retention, 90 days, replicas=3) implement the persistence and ordering guarantees. (2) JWT claims for every signed message: `iss` (signer identity: keeper-vm-primary | operator-<id> | gh-actions-<id>), `sub` (requesting principal: woodpecker-job-<id> | manual-<operator>), `aud` (target workspace), `scopes` (allowed op_type:target tuples), `seq` (per-issuer monotonic counter — anti-replay), `jti` (UUIDv4 idempotency key), `expected_state_version` (optimistic concurrency token), `exp`/`nbf` (validity window). (3) Signer is any subscriber to `ops.pending.*` with a key in the workspace's authorized-signers set. The keeper-daemon (running on the dedicated ops-vm workspace) auto-signs operations matching a declarative policy file (see ADR-XXX keeper policy schema); the keeper-cli running on operator laptops with a YubiKey signs interactively via `keeper pending sign <id>`. Both produce identical JWT-signed messages on `ops.cmd.*` — wuji's ops-controller does not distinguish between automated and manual signers, only the JWT validity. (4) Mode switch is operational, not configurational: stopping the keeper-daemon process on ops-vm degrades the system to operator-only mode without any code or config change in wuji or daoshi. Restarting it restores automated signing. 
A hybrid mode is supported by tuning the keeper policy to auto-sign only safe operations (e.g., scale and restart on staging targets) while leaving production deploys for manual approval. (5) Multi-emitter coordination is delegated to JetStream: emitters publish independently with their own per-issuer sequence; the stream's total order resolves concurrency; the ops-controller in wuji applies in stream order with `expected_state_version` optimistic concurrency, returning 409 conflict on the second emitter when two ops target the same state version. (6) Wuji's ops-controller is the single subscriber to `OPS_CMD_<workspace>` in WorkQueue mode — there is exactly one applier per workspace, eliminating the need for distributed leader election; if the controller pod restarts, persisted state in SurrealDB allows reconciliation of in-flight ops on resume.",
rationale = [
{
claim = "JetStream WorkQueue retention with single subscriber gives total order without distributed locks",
detail = "Multi-emitter coordination is the load-bearing complexity in this design. JetStream's WorkQueue stream type with a single durable consumer (wuji ops-controller) provides exactly-once delivery in stream order. Concurrent emitters from libre-daoshi, operator laptops, and external CI publish proposals to `ops.pending.*` independently, and signers republish them to `ops.cmd.*`; the stream sequences them by arrival time. No emitter needs to coordinate with another. The controller applies in order; optimistic concurrency on `expected_state_version` rejects ops that read stale state, which manifests to the emitter as a 409 conflict via NATS request-reply on `ops.ack.*`. This shifts coordination from client-side distributed locks (which require failure-mode reasoning across emitter, lock server, and cluster) to the broker, which has well-understood semantics.",
},
{
claim = "Pending queue between emitters and signers makes mode switching free",
detail = "If emitters published directly to `ops.cmd.*` (signing inline) the system would couple emitter availability to signer availability. By interposing `ops.pending.*` as a separate subject namespace, emitters publish proposals without knowing or caring about who signs. Any subscriber to `ops.pending.*` with a key in the authorized-signers set can sign and republish to `ops.cmd.*`. Switching from auto-sign (keeper-daemon on ops-vm) to manual-sign (operator laptop with keeper-cli) requires no change to emitters and no change to the consumer (wuji ops-controller) — it requires only enabling or disabling the relevant subscriber. This is the same decoupling pattern as a message queue with multiple consumer groups, applied to a signing-and-republish role.",
},
{
claim = "Mandatory JWT scope tuples prevent privilege escalation across workspaces",
detail = "Each signer's JWT is constrained by `scopes` — an array of `op_type:target_pattern` tuples (e.g., `deploy:staging-*`, `scale:vapora`). The ops-controller validates that the requested op falls within at least one scope tuple before applying. A keeper-vm-primary key with scope `deploy:staging-*` cannot sign a deploy to `production-*` even if the policy file permits it locally — the JWT scope is the authoritative declaration. This means a compromised keeper-VM cannot forge production ops if its key was issued with staging-only scopes. Scope rotation (narrowing or widening) is a key-rotation operation, which is auditable.",
},
{
claim = "ops-controller persists in-flight ops to SurrealDB before ack to survive restart without duplicate apply",
detail = "The naive controller acks `ops.cmd.*` consumption first then applies, which would mean a crash between ack and apply produces a missed op (not retried by JetStream because acked). The reverse — apply first then ack — produces possible duplicate apply if the controller crashes after applying but before acking. The correct pattern is: read message, persist `(jti, op_payload, state=pending)` to SurrealDB transactionally, ack to JetStream, then apply, then update SurrealDB to `state=applied`. On restart, the controller reads SurrealDB for `state=pending` rows and reconciles each by checking whether the op was actually applied (idempotency key prevents double-apply). This requires the apply layer to be idempotent on `jti`, which is a design requirement on every op handler.",
},
{
claim = "JWT issuer values are not service identities but key identities — survives signer migration",
detail = "The `iss` claim names a key, not a service. `keeper-vm-primary` is the key currently held by the keeper-daemon; if the keeper-daemon migrates to a different VM, it still presents the same `iss`. Scope rotation (issuing a new key with different scopes) is a separate operation. This decoupling means we can move the keeper-daemon from ops-vm to a laptop temporarily without rotating keys, and a hardware-key-only operator setup uses a different `iss` (e.g., `operator-jpl-yubikey`) so audit trails remain attributable. A compromised key is revoked by removing its `iss` from the workspace's authorized-signers set, which is itself an op (governance op) signed by the operator quorum.",
},
],
consequences = {
positive = [
"Wuji is runtime-autonomous: it pulls nothing from daoshi at boot or steady state — only consumes signed messages from its own NATS JetStream",
"Daoshi is replaceable: any system holding a signer key can drive ops; the platform's ops contract is not coupled to one CI provider",
"Mode switch (auto/manual/hybrid) is operational not architectural — `systemctl stop keeper-daemon` is the entire migration to operator-only",
"Multi-emitter coordination is a property of the broker (JetStream stream order), not an application concern",
"Audit trail is on a separate stream with independent retention — applying ops cannot interfere with audit log integrity",
"Replay protection (jti uniqueness + monotonic seq) prevents reissuing intercepted JWTs",
"Optimistic concurrency surfaces conflicts as explicit 409s to emitters, not as silent overwrites — emitters decide retry policy",
"ops-controller restart is safe because in-flight ops are persisted before ack — no missed ops, no duplicate applies",
],
negative = [
"NATS JetStream is now load-bearing for production ops — its availability constrains deploy throughput; mitigation: replicas=3 within wuji",
"Idempotency contract on every op handler is a development requirement that must be tested per op_type — adding a new op_type requires verifying double-apply safety",
"JWT clock skew between signer and verifier requires NTP/chrony on all signing hosts and on wuji nodes — operational requirement not visible from code",
"JetStream retention windows (14 days pending, 24 hours cmd, 90 days audit) must be sized against the operational rhythm — if quorum review takes longer than 14 days in operator-only mode, pending proposals expire and are silently dropped",
"Multi-emitter conflicts surface as 409s to emitters, who must implement retry-after-restate logic — emitters that ignore 409 will lose their op silently",
],
},
alternatives_considered = [
{
option = "Direct HTTP RPC from emitters to a centralized signer service",
why_rejected = "Couples emitter availability to signer availability and re-introduces the single-VM SPOF. Also requires the signer to be reachable on the network from every emitter, including external CI providers, which is a firewall complication. NATS JetStream as the substrate is already deployed for the orchestrator (ADR-012) and provides the same effect (decoupling, retry, audit) with no new network surface.",
},
{
option = "Pull-based deploys: wuji pulls deploy specs from a git repo on daoshi at intervals",
why_rejected = "Violates wuji autonomy — wuji's runtime would depend on daoshi's git server being reachable. Also introduces eventual-consistency uncertainty (when does a push become visible?) without giving emitters a synchronous signal of acceptance. The pending/cmd/ack triple gives emitters a clear lifecycle: proposal accepted, op signed, op applied or rejected.",
},
{
option = "GitOps via Flux/ArgoCD with workload cluster pulling from a Radicle repo",
why_rejected = "Solves the autonomy concern (Radicle is decentralized) but inherits GitOps' weaknesses for ops not modeled as state declarations: scale/restart/drain are imperative ops that require sequencing, not state convergence. Modeling them as state-document edits requires an awkward layer of versioned state files and reconciliation loops; pending-and-signed messages on a queue match the ops semantics directly. GitOps may complement this for the workload-config layer (ADR-038 covers Radicle's role in the desired-state ledger), but is not a replacement for ops coordination.",
},
{
option = "Distributed lock via SurrealDB live queries for multi-emitter coordination",
why_rejected = "Introduces a write-write coordination problem on the lock document under concurrent emitters, recreating the distributed-lock complexity the JetStream approach avoids. JetStream's stream order is already a globally consistent total order — using it for both the message itself and the coordination semantics is simpler than separating the two concerns.",
},
],
constraints = [
{
# Hard constraint: one ops-controller consumer per workspace on OPS_CMD_<workspace>.
id = "ops-controller-single-subscriber",
claim = "Exactly one ops-controller consumer subscribes to OPS_CMD_<workspace> in WorkQueue mode per workspace; multiple subscribers would break ordering guarantees",
scope = "platform/crates/ops-controller/, infra/.../components/ops_controller.ncl",
severity = 'Hard,
# Grep check over the ops-controller crate for either consumer-related identifier.
# With must_be_empty = false the check presumably passes when a match exists.
# NOTE(review): a textual match does not by itself show that only one consumer
# exists; the rationale below names component config (single replica) and a
# runtime check on consumer creation as the actual enforcement.
check = {
tag = 'Grep,
pattern = "deliver_subject|durable_consumer",
paths = ["platform/crates/ops-controller/"],
must_be_empty = false,
},
rationale = "JetStream WorkQueue with multiple consumers distributes messages round-robin across them, which breaks the single-applier invariant that backs the optimistic-concurrency contract. The constraint is enforced by component config (single replica) and runtime check on consumer creation.",
},
{
# Hard constraint: the ops-controller must check JWT scopes before applying an op.
id = "jwt-scope-validation-mandatory",
claim = "ops-controller MUST validate JWT scopes against the requested op_type:target before applying; missing scope = reject with 403, do not log a 200",
scope = "platform/crates/ops-controller/src/auth.rs",
severity = 'Hard,
# Textual presence check only: one of the two validator names must appear
# somewhere under src/. Note the search path is the whole src/ directory,
# which is wider than the auth.rs file named in `scope`.
check = {
tag = 'Grep,
pattern = "validate_scopes|check_scope_match",
paths = ["platform/crates/ops-controller/src/"],
must_be_empty = false,
},
rationale = "Without scope validation, any signer key with valid signature can submit any op type to any target, eliminating the privilege boundary that makes scoped keys useful. The check ensures scope validation is at least textually present; runtime tests verify behavior.",
},
{
# Hard constraint: every op_type handler must be idempotent on the JWT `jti`.
id = "idempotency-contract-per-op-handler",
claim = "Every op_type handler in ops-controller MUST be idempotent on jti — double-apply with same jti must produce the same final state and not duplicate side effects",
scope = "platform/crates/ops-controller/src/handlers/",
severity = 'Hard,
# Grep for a `fn handle_*` signature that mentions `jti` between literal parentheses.
# NOTE(review): with must_be_empty = false this presumably needs only ONE matching
# handler, which is weaker than the "every handler" claim. The pattern is also
# single-line, so a signature wrapped across lines may not match — confirm against
# the checker.
check = {
tag = 'Grep,
pattern = "fn handle_.*\\(.*jti.*\\)",
paths = ["platform/crates/ops-controller/src/handlers/"],
must_be_empty = false,
},
rationale = "The persist-then-ack-then-apply protocol requires handlers to handle restart-induced re-execution. A handler that issues a deploy command twice is allowed by NATS semantics under restart and must produce no observable difference — typically by checking the jti against persisted apply state before issuing side effects.",
},
{
  # Soft constraint: the pending queue's depth and oldest-message age must be
  # exported as Prometheus metrics so accumulation in operator-only mode is visible.
  id = "pending-queue-ttl-monitored",
  claim = "OPS_PENDING_<workspace> queue depth and oldest-message age MUST be exposed as Prometheus metrics so operator-only mode (where pendings can accumulate) is observable",
  scope = "platform/crates/ops-controller/, infra/.../components/observability.ncl",
  severity = 'Soft,
  # Looks for either metric name in the ops-controller sources.
  check = {
    tag = 'Grep,
    pattern = "ops_pending_queue_depth|ops_pending_oldest_age_seconds",
    paths = ["platform/crates/ops-controller/src/"],
    must_be_empty = false,
  },
  rationale = "In operator-only mode, pendings accumulate awaiting human signature. Without monitoring, operators may not notice that a pending sat for 13 days and is about to expire. The 14-day retention is generous but finite; observability of queue state is the operational mitigation against silent drop.",
},
],
ontology_check = {
decision_string = "Ops contract dual-mode: NATS JetStream with ops.pending/ops.cmd/ops.ack/ops.audit subject namespaces + JWT-signed commands with scopes + replaceable signer (keeper-daemon auto / keeper-cli manual) + ops-controller as single per-workspace WorkQueue consumer with SurrealDB persistence of in-flight ops",
invariants_at_risk = ["solid-boundaries", "config-driven-always"],
verdict = 'Safe,
},
related_adrs = ["adr-012-nats-event-broker", "adr-013-surrealdb-global-store", "adr-014-solid-enforcement", "adr-038-radicle-decentralized-governance", "adr-039-build-infrastructure-ephemeral"],
invariant_justification = {
invariant = "solid-boundaries",
claim = "ops-controller is a new service with a new SOLID boundary: it ONLY consumes from ops.cmd, applies via the orchestrator API, and writes to ops.audit and SurrealDB — it does not call provider APIs or auth services directly",
mitigation = "Cedar policy enforces that ops-controller's service identity has no permissions to call hcloud, aws, or vault directly; orchestrator interface is the only allowed dependency. Compile-time check in the ops-controller crate forbids hcloud-rs and aws-sdk-rust as dependencies.",
},
}

View file

@ -0,0 +1,130 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-038",
title = "Radicle Heartwood as decentralized substrate for governance, desired-state, and audit ledger across all workspaces",
status = 'Accepted,
date = "2026-04-26",
context = "The platform requires a substrate to hold three classes of information that must survive the loss of any single cluster: (1) governance — who is authorized to sign which ops, expressed as a delegation set with M-of-N approval semantics for changes; (2) desired state — the version-controlled declaration of what each workspace should be running, used by ops emitters to compute deploy diffs; (3) audit ledger — the immutable record of which ops were applied to each workspace, signed by the applying ops-controller. All three need to be reachable by operators, ops emitters (CI on libre-daoshi, laptops, external CI), and the keeper-daemon, even when one or more nodes are unreachable. Centralized solutions (a single git server on libre-daoshi, or a hosted git provider) reintroduce the dependency the platform was designed to avoid (libre-wuji autonomy from libre-daoshi). The naive replacement — a self-hosted git server with mirroring — requires manual mirror management and does not address the governance signing question. Mutable distributed databases (etcd, Consul) handle replication but lack git's content-addressed history and signed-commit semantics, which are required for cryptographically attestable audit. The substrate must be peer-to-peer, support cryptographic identities for both repos and contributors, replicate via gossip without a central server, and allow patches (proposed changes) to require signatures from a configurable set of delegated keys.",
decision = "Adopt Radicle Heartwood as the decentralized substrate for three repo families per workspace: `policy-<workspace>` (keeper auto-sign policy + authorized-signers set), `<workspace>-desired` (version-controlled declaration of components, settings, capabilities), and `<workspace>-state` (immutable ledger of applied ops, written only by the wuji ops-controller). Each operator host (laptop), each cluster node intended to participate in governance (a designated node per cluster for libre-wuji and libre-daoshi), and the ops-vm host run a Radicle Heartwood seed node — there is no central hub. Repos are identified by their RID (Radicle ID), discovered via tracking peers. Authority on a repo is encoded in its delegation set: `policy-<workspace>` and `<workspace>-desired` use M-of-N delegation among operator keys (initial config: 2-of-3 for production workspaces, 1-of-1 for ops-vm); `<workspace>-state` uses a single delegation — the workspace's ops-controller signing key — because the ledger is an attestation by the applying authority, not a multi-party decision. Keeper policy (consumed by keeper-daemon to decide what to auto-sign) is declarative-only Nickel (see ADR-XXX keeper-policy schema): no executable code, no Nickel function calls beyond the schema constructor. Audit events from NATS `ops.audit.*` are mirrored to `<workspace>-state` via a sidecar process running in wuji that subscribes to JetStream and commits one git commit per audit message — this mirror commits each message effectively exactly once: JetStream durable consumer ack semantics give at-least-once delivery and the mirror skips any jti already committed. Operators may use any frontend over the local Radicle repo (plain git, jj, mob); the project does not mandate a frontend, only the substrate. The keeper-daemon and ops-controller use the `gix` Rust crate for direct git operations, never shelling out to git or jj — these services are not human-driven and benefit from in-process operations. 
The framework-level domain extension (ontoref domains/provisioning) gains a `governance` command group (governance delegations, governance signers) that reads the local Radicle clone of the workspace's policy repo and reports M-of-N quorum status.",
rationale = [
{
claim = "Radicle Heartwood provides cryptographic identity, gossip replication, and signed patches as a single substrate — no need to compose three lower-level primitives",
detail = "Building decentralized governance from primitives would require: a key-signed identity layer (e.g., DID), a content-addressed storage layer (git itself), a gossip replication layer (e.g., libp2p with custom protocol), and a patch/approval workflow (custom). Heartwood ships all four as a coherent system designed for source-code collaboration. The CRDT-like replication semantics of Heartwood's COB (collaborative objects) handles concurrent updates to issues, patches, and discussions correctly. We use only the patch-and-delegation subset, which is the most stable and best-tested part of the system.",
},
{
claim = "Three repos per workspace separates concerns with different authority profiles",
detail = "Conflating policy + desired-state + audit in one repo would force a single delegation set across three semantically different actions: human governance decisions (policy), declarative configuration (desired-state), and machine attestations (audit). Splitting into three repos lets each have the right authority: M-of-N operators for policy (humans must agree), M-of-N operators + automated CI keys for desired-state (CI can propose, operators approve), and the workspace's ops-controller key alone for audit (no human approves a record of what already happened). It also lets the audit repo grow much faster than the others without bloating the histories that operators read frequently.",
},
{
claim = "Keeper policy is declarative-only Nickel, evaluated by a deterministic Rust matcher — never executed as Nickel code",
detail = "If the keeper-daemon evaluated the policy by running `nickel export` on the file, a maliciously crafted policy committed by a quorum could exfiltrate keys via the eval environment or trigger unbounded computation. The decision: the policy schema (auto_sign + require_manual sections, each with image/target/scope patterns) is a closed, plain-data shape parsed by a Rust matcher. Adding new policy primitives requires updating both the schema and the matcher together — they are versioned in lock-step. This is not a general-purpose policy language and is not supposed to become one; if a future need exceeds what the schema expresses, a new ADR adds a new shape, not arbitrary expressiveness.",
},
{
claim = "ops-controller is the sole delegate of <workspace>-state because audit attests to applied ops, not approved ones",
detail = "Multiple delegates on the audit repo would mean operators or other parties could write to it. But the audit repo's value is precisely that it records what the applying authority observed — what actually happened in wuji. Allowing humans to write would let history be rewritten or fabricated; even with M-of-N controls, the value of the ledger is undermined. The ops-controller's signing key lives only on wuji, with backup encrypted online (per the decision in design discussion); rotation is rare. If wuji is rebuilt, the new ops-controller rotates to a new key — this is an event recorded in the policy repo (the delegation set updates), and the state repo continues with the new delegate.",
},
{
claim = "Audit mirror from NATS to Radicle is effectively exactly-once per message (at-least-once delivery plus jti idempotency) — duplicate audit commits are not a correctness concern",
detail = "JetStream durable-consumer ack semantics guarantee at-least-once delivery of every audit message; the mirror's idempotency on commit (write commit only if the audit jti is not already present in HEAD's ancestor chain) makes the effective semantics exactly-once for the steady state. Duplicates in transient failure modes (mirror crashes between commit-write and ack) appear as a no-op commit on retry that is detected and skipped. The git history is grow-only; readers see the same content regardless of whether one or two attempts produced it.",
},
],
consequences = {
positive = [
"Governance, desired-state, and audit survive the loss of any single cluster — every operator and seed node holds a full replica via Radicle gossip",
"M-of-N delegation is a built-in primitive, not a custom approval workflow we maintain",
"Operator onboarding and offboarding are git-native operations (delegation patch signed by quorum) — no custom auth system",
"Audit history is content-addressed and signed — tampering requires forging a signature on a commit AND propagating it to all replicas, which is detectable",
"Frontends are operator choice — git, jj, mob, custom — without affecting the protocol",
"Domain-level commands (governance delegations, governance signers) work uniformly across workspaces because they read the same repo shape",
"Bootstrapping a new workspace = `rad init` three repos with appropriate delegation sets; no new infrastructure to deploy for governance",
],
negative = [
"Heartwood is younger than centralized git hosts — operators must learn `rad` CLI basics; mitigation: domain commands wrap common operations",
"Gossip replication has eventual-consistency lag — a delegation change made on one operator laptop may not be visible to keeper-daemon for seconds-to-minutes; mitigation: operations that consume policy poll for the latest commit before each decision, accepting a brief inconsistency window over hard real-time consistency",
"Audit commit rate is bounded by Radicle's gossip throughput, which is lower than NATS throughput — high-frequency ops may produce backpressure on the mirror; mitigation: batch multiple ops.audit messages into a single commit when arrival rate exceeds gossip rate",
"Operator key loss without backup is unrecoverable — a lost operator key can be removed from the delegation set by the remaining M-of-N quorum, but the operator cannot re-key without going through onboarding again",
"Cross-repo consistency (e.g., a state commit references a desired-state commit hash) is the application's responsibility — Radicle does not provide cross-repo transactions",
],
},
alternatives_considered = [
{
option = "Self-hosted Forgejo with cron-mirrored backups to other nodes",
why_rejected = "Forgejo is a centralized git server with manual mirror configuration; loss of the primary node means write operations stop until the mirror is promoted. Read replication is also pull-based and stale. The platform already runs Forgejo on libre-daoshi for human-friendly code hosting; layering decentralized governance on top of it would create two truths (Forgejo + mirrors) with potential drift. Radicle keeps governance and audit on a substrate purpose-built for the property we need.",
},
{
option = "etcd or Consul cluster as governance store with Cedar for authorization",
why_rejected = "Distributed KV stores excel at strongly-consistent state replication but do not provide signed history. A delegation change in etcd is a write; without an external signing layer, there is no cryptographic record of who proposed and approved it. Cedar adds policy evaluation but not provenance. Building signed history on top of etcd requires reinventing what git+signed-commits provides natively. Radicle gives both replication and signed history in one substrate.",
},
{
option = "OCI artifacts in zot for desired-state and audit",
why_rejected = "zot stores OCI artifacts well but is single-cluster (or replica-of-cluster) — losing wuji loses zot. Pushing desired-state and audit as OCI artifacts would couple them to wuji's availability, contradicting the requirement that governance survive cluster loss. zot's role is defined in ADR-039 (image registry with S3 backend); using it for governance would conflate two concerns.",
},
{
option = "GitHub/GitLab repos with branch protection rules for M-of-N approval",
why_rejected = "Reintroduces a centralized provider as a hard runtime dependency, contradicting the decentralization goal. Also the approval semantics of branch protection are advisory — the API can be bypassed by an admin or by tampering with the underlying git server. Radicle's M-of-N is enforced by the protocol: a non-quorum patch is not a valid update, full stop.",
},
],
constraints = [
{
# Hard constraint: keeper policy files must be plain data, not executable Nickel.
id = "policy-files-are-declarative-only",
claim = "policy.ncl files in policy-<workspace> repos MUST conform to the keeper-policy schema and contain only data — no Nickel function definitions, no imports beyond the schema",
scope = "policy-*/policy.ncl across all workspaces",
severity = 'Hard,
# Grep for Nickel function syntax; must_be_empty = true means any match is a violation.
# NOTE(review): every line matched by the second alternative ("let .* = fun ")
# already contains "fun ", so the first alternative subsumes it.
# NOTE(review): the pattern does not look for `import`, although the claim also
# forbids imports beyond the schema — confirm whether that is enforced elsewhere.
check = {
tag = 'Grep,
pattern = "fun |let .* = fun ",
paths = ["policy-"],
must_be_empty = true,
},
rationale = "The keeper-daemon parses policy with a Rust matcher that handles the declarative schema only. Function definitions in a policy file would be evaluated as Nickel code if accidentally piped through nickel export, opening an exfiltration vector. The constraint enforces the schema-only convention.",
},
{
  # Hard constraint: the audit ledger repo has exactly one delegate, the
  # workspace's ops-controller key.
  id = "state-repo-single-delegate",
  claim = "<workspace>-state Radicle repos MUST have exactly one delegate: the ops-controller key for that workspace",
  scope = "Radicle delegation set of all <workspace>-state repos",
  severity = 'Hard,
  check = {
    tag = 'NuCmd,
    # The command must exit non-zero unless the delegate count is exactly 1.
    # Printing the count alone exits 0 for any number of delegates, so
    # `expect_exit = 0` would never detect a multi-delegate repo.
    # NOTE(review): `$WORKSPACE-state` is kept exactly as originally written —
    # confirm how the check runner binds the workspace name.
    cmd = "let delegates = (rad inspect $WORKSPACE-state | from json | get delegates | length); if $delegates != 1 { print $'state repo has ($delegates) delegates, expected exactly 1'; exit 1 }",
    expect_exit = 0,
  },
  rationale = "Multi-delegate state repos would allow rewriting audit history. The constraint enforces that only the applying authority writes the audit ledger. Rotating the ops-controller key is a separate, governed operation that updates the delegate.",
},
{
# Hard constraint: the audit mirror must not commit the same jti twice.
id = "audit-mirror-idempotent-on-jti",
claim = "The audit mirror sidecar MUST refuse to commit a duplicate jti — checked against the HEAD ancestor chain before committing",
scope = "platform/crates/audit-mirror/",
severity = 'Hard,
# Grep for either duplicate-detection identifier in the audit-mirror crate.
# With must_be_empty = false the check presumably passes when a match exists;
# it shows the check is textually present, not that it behaves correctly.
check = {
tag = 'Grep,
pattern = "check_jti_in_ancestors|already_committed",
paths = ["platform/crates/audit-mirror/"],
must_be_empty = false,
},
rationale = "JetStream at-least-once delivery means the mirror sees duplicate messages on retry. Without the idempotency check, the audit history would contain N-1 duplicate commits per failure event, polluting the ledger. The check makes duplicate handling a no-op.",
},
{
  # Soft constraint: a desired-state commit hash cited by an audit entry must
  # exist in the desired repo's history when the audit entry is written.
  id = "desired-state-references-immutable",
  claim = "When <workspace>-state references a <workspace>-desired commit hash in an audit entry, the referenced hash MUST be present in the desired repo's history at the time of audit write",
  scope = "platform/crates/ops-controller/src/audit_emit.rs",
  severity = 'Soft,
  # Looks for commit-reference handling in the audit emitter source file.
  check = {
    tag = 'Grep,
    pattern = "desired.*commit|commit_hash|verify_commit",
    paths = ["platform/crates/ops-controller/src/audit_emit.rs"],
    must_be_empty = false,
  },
  rationale = "If audit references a hash that disappears (e.g., desired repo is force-pushed by a buggy operator workflow), the audit becomes uninterpretable. Soft severity because Radicle's signed-commit model already makes force-push effectively impossible without quorum, but explicit cross-reference verification adds defense in depth.",
},
],
ontology_check = {
decision_string = "Radicle Heartwood as decentralized substrate: three repos per workspace (policy / desired / state) with distinct delegation profiles (M-of-N humans / M-of-N+CI / single ops-controller) + declarative-only keeper policy schema + audit mirror from NATS to Radicle with jti idempotency + domain-level governance commands reading local Radicle clones",
invariants_at_risk = ["config-driven-always", "type-safety-nickel"],
verdict = 'Safe,
},
related_adrs = ["adr-037-ops-contract-dual-mode", "adr-014-solid-enforcement", "adr-018-secretumvault-integration", "adr-039-build-infrastructure-ephemeral"],
}

View file

@ -0,0 +1,139 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
# ADR-039 identity and lifecycle metadata.
id = "adr-039",
title = "Build infrastructure: golden-imaged ephemeral runners with dynamic sizing, S3-backed multi-tenant zot in workload cluster, and CI-orchestration separation",
status = 'Accepted,
date = "2026-04-26",
# Problem statement: three numbered constraints (CI cluster sizing, CI-free workload cluster,
# registry durability) plus the existing orchestrator VM lifecycle support that makes the design feasible.
context = "The platform needs to compile workloads (Rust binaries like Vapora, the orchestrator crates, ops-keeper, ops-controller) and produce OCI images for runtime consumption. Three constraints shape the design: (1) the CI cluster (libre-daoshi class) is sized for orchestration and source-of-truth services (forgejo, woodpecker server, postgresql) — running CPU-heavy compiles inside its k0s cluster causes scheduling pressure on the orchestration services and forces the cluster to be sized for peak build load rather than steady-state orchestration; (2) the workload cluster (libre-wuji class) must remain CI-free per ADR-037 to preserve runtime autonomy; (3) image storage must survive the loss of any single cluster — keeping zot inside libre-wuji with local volume storage means losing wuji wipes the registry, and rebuilding the registry from external sources is a slow recovery path. The orchestrator (memory: platform/vm/ subsystem) already supports VM lifecycle (spawn, persistence, golden image cache, cleanup scheduler), making ephemeral builders feasible without new infrastructure. The remaining design decisions concern image storage durability, runner sizing, cache locality, and where the registry physically lives.",
# The decision, written as six numbered parts: (1) ephemeral BuildKit VMs from a golden image,
# (2) three-tier dynamic runner sizing, (3) zot in libre-wuji on an S3 backend, (4) the buildkit-launcher
# bridge, (5) cache strategy, (6) the resulting coupling to wuji reachability.
# The string contains one literal line break before "(4)"; it is kept as found.
decision = "Adopt an ephemeral-builder + central-registry architecture with six parts. (1) BuildKit runs in ephemeral VMs spawned by the orchestrator on demand. Each VM is created from a periodically-rebuilt golden image (`buildkit-runner-golden:<date>`) pre-installed with buildkit (rootless), sccache, nushell, and SSH server keyed for the orchestrator. Spawn time targets ~30s vs ~2min for cloud-init from a generic base image. The golden image itself is rebuilt weekly via a Woodpecker pipeline that runs in a current ephemeral runner — the chain is self-rebuilding after the initial bootstrap. (2) Runner sizing is dynamic per build, resolved in three tiers: explicit declaration in `.build-spec.ncl` at the repo root (BuildSpec contract: cpu, memory_gb, disk_gb, time_budget_min, cache_keys, oom_retry); historical p95 of CPU/RAM peaks for that repo from the orchestrator's SurrealDB build-metrics table, multiplied by 1.2; language defaults from the orchestrator (Cargo.toml → medium 4vCPU/8GB, package.json → small 2vCPU/4GB, etc.). Final size = max(declared, 1.2×p95_historical). OOM kill auto-retries once with one size up. Time budget enforced as VM-level kill. (3) zot lives in libre-wuji (relocated from libre-daoshi) configured with S3-compatible backend (Hetzner Object Storage, Backblaze B2, or compatible). The S3 bucket is the durable storage; zot pods are stateless and can be killed/respawned without data loss. Bucket configuration: versioning enabled (point-in-time recovery), lifecycle policy (90-day non-current version retention), optional cross-region replication to a second bucket on a different provider for catastrophic recovery. zot's auth model uses JWT integrated with the workspace's NATS account hierarchy — daoshi-ci principals have write to /images, /cache, /sccache, /crates; wuji workload pods have read on /images; operators have write on /crates; public read on /crates if the operator chooses to publish a Rust crate registry. 
(4) The buildkit-launcher binary (woodpecker plugin) bridges Woodpecker pipeline steps to the orchestrator: it requests a runner of the resolved size, waits for ready, ships build context via SSH, invokes buildctl with --import-cache and --export-cache pointing to zot.wuji.local, collects logs, requests destroy. The launcher carries no persistent state; orchestrator owns the lease. (5) Cache strategy uses zot as both layer cache (BuildKit registry-mode cache) and Rust object cache (sccache S3-backend pointed at zot's S3-compatible API). Cold runner with warm cache compiles at near-warm-runner speed because the network distance to zot is short and the cache is rich. (6) Coupling consequence: builds depend on wuji being reachable (zot lives there). When wuji is unreachable, builds can run cold-locally on the runner but cannot push results — operators acknowledge this trade-off; an optional pull-through cache mirror in libre-daoshi can be added later if the coupling produces measurable friction.",
# Rationale: five claim/detail pairs, one per design choice
# (ephemeral runners, dynamic sizing, S3-backed zot, registry-resident cache, thin launcher).
rationale = [
{
claim = "Ephemeral runners + golden images give build farm bursting without fixed-cost capacity",
detail = "A persistent build VM sized for the largest workload (Vapora) wastes CPU and RAM 95% of the time it sits idle. Per-build VMs scale to zero between builds — the only cost is the spawn time, which the golden image reduces to ~30s. The orchestrator already manages VM lifecycle for taskservs (memory: platform/vm/lifecycle.nu, vm_persistence.nu, cleanup_scheduler), so adding the buildkit_runner role is a component definition and a launcher binary, not a new subsystem.",
},
{
claim = "Three-tier dynamic sizing handles the spread between trivial CI tasks and Vapora-class compiles without overcommit",
detail = "Static sizing variants (small/medium/large/xlarge) impose two failure modes: under-sized (OOM, slow), over-sized (wasted resources, slower spawn for unnecessarily large VMs). Reading `.build-spec.ncl` lets the repo declare its needs explicitly. P95 historical fallback handles repos that never declared a spec but have build history — most repos converge to a stable size. Language defaults handle the first build of a new repo. The 1.2× multiplier on historical p95 absorbs typical variance without exposing builds to OOM kill on a marginally larger build than usual.",
},
{
claim = "zot with S3 backend makes the registry stateless — DR is a property of S3, not zot",
detail = "Self-managed durable storage for a registry (cluster volumes + replication + backup) is a recurring operational task. S3-class storage (any compatible provider) gives 11-nines durability natively and supports versioning and cross-region replication as configuration. Moving zot to that backend means the kubernetes pod is replaceable on a moment's notice with no data migration — the bucket is the source of truth. The DR question reduces to: is the bucket reachable, and is its versioning intact? — both of which are provider responsibilities. Cross-provider replication (e.g., Hetzner primary + Backblaze secondary) addresses provider catastrophic loss.",
},
{
# NOTE(review): this entry asserts that zot itself serves an S3-compatible endpoint for sccache.
# That is not established anywhere in this file — confirm against zot's documentation; if zot only
# uses S3 as a storage backend, sccache would have to target the bucket directly.
claim = "Cache lives in the registry because BuildKit and sccache both speak S3-compatible APIs to a shared registry",
detail = "BuildKit supports `--export-cache type=registry` and `--import-cache type=registry`, writing layer cache as OCI artifacts to the same registry that holds final images. sccache supports S3 backend that can target zot's S3-compatible endpoint (zot exposes an S3 API for direct artifact upload). Both caches benefit from the same durability and replication as the images themselves. A new cold runner pulling cache from zot is essentially as fast as the cache is rich; running the cache locally on the VM gains nothing because the VM is destroyed at end of build.",
},
{
claim = "buildkit-launcher is thin to keep state in the orchestrator, not in Woodpecker",
detail = "Putting orchestration logic (lease tracking, cleanup on failure, retry policy) in the launcher would duplicate logic the orchestrator already implements for VM-backed taskservs. The launcher is a wrapper: requests a runner, hands off to buildctl on the runner, collects results. If the launcher process dies mid-build, the orchestrator's cleanup scheduler reaps the orphaned VM. If the runner OOMs, the orchestrator retries with the next size. The launcher's only job is to bridge Woodpecker step semantics (env vars, exit code, log capture) to the orchestrator's leased-resource semantics.",
},
],
# Consequences: seven benefits and six costs. Several of the negative entries state their
# mitigation inline after "mitigation:".
consequences = {
positive = [
"libre-daoshi cluster stays small and steady-state — orchestration services are not preempted by build CPU",
"Build capacity is elastic without operator intervention — concurrent builds spawn concurrent VMs up to the orchestrator's configured pool limit",
"Build cold-start with warm cache is near-warm — sccache hits at network speed from a same-provider VM",
"Image registry DR is reduced to S3 bucket configuration — versioning, lifecycle, cross-region replication are all provider features",
"zot multi-tenant layout (/images, /cache, /sccache, /crates) lets the same registry serve workload images, build cache, Rust crates, and OCI artifacts uniformly",
"Golden image rebuild via the system itself (a runner builds the next runner image) means no permanent external build dependency once bootstrapped",
"Sizing dynamism makes Vapora-class builds and trivial doc builds use appropriate resources without manual tuning per pipeline",
],
negative = [
"Builds depend on wuji being reachable for zot — wuji outage stops the publish step (mitigation: optional pull-through cache mirror in libre-daoshi if measured friction warrants)",
"Initial bootstrap requires producing the first golden image off-platform (laptop or external CI) — documented in playbook, but a one-time manual step",
"Per-build VM creation has spawn-cost floor (~30s with golden image) — hot-path one-second test runs are not the right shape for this model; small in-cluster runners may be added later if a workload demands sub-spawn-cost CI",
"Orchestrator's VM pool limit becomes a build concurrency ceiling — needs sizing per workspace based on observed peak parallelism",
"Runner OOM auto-retry doubles VM cost for that build — repeated retries for flaky builds inflate cloud costs; mitigation: max 1 retry, with explicit failure surfaced to the developer",
"Cross-provider S3 replication has lag — the secondary bucket is eventual-consistent with the primary, so a same-second push-and-pull from secondary may miss; mitigation: cross-provider replication is for DR, not for normal reads",
],
},
# Alternatives: five rejected options, each paired with the reason it was rejected
# (persistent VMs, in-cluster BuildKit pods, external CI, zot on local volumes, Nix).
alternatives_considered = [
{
option = "Persistent build VMs with strong per-VM cache locality",
why_rejected = "Sized for peak load, idle 95% of the time. Cache locality benefit is partial because cross-VM cache requires central storage anyway. Operational maintenance (patching, OS updates) on persistent VMs is recurring; ephemeral VMs from a periodically-refreshed golden image trade per-build spawn cost for zero ongoing maintenance.",
},
{
option = "BuildKit pods inside libre-daoshi cluster",
why_rejected = "Couples build CPU to orchestration cluster — large builds cause scheduler pressure on forgejo, woodpecker server, postgresql. Sizing the cluster for peak builds wastes resources between builds. Out-of-cluster ephemeral VMs avoid this entirely with no architectural cost since the orchestrator already runs them for taskservs.",
},
{
option = "GitHub-hosted runners or other external CI for builds",
why_rejected = "Reintroduces an external runtime dependency for the build step, contradicting the platform's autonomy goals. Also creates two CI surfaces (Woodpecker + GitHub Actions) operators must reason about. The orchestrator-spawned ephemeral runners give the same elasticity within the platform's own infrastructure.",
},
{
option = "zot in libre-daoshi cluster with local volumes",
why_rejected = "Centralizes images on the wrong cluster — wuji should be the source of truth at runtime per ADR-037. Also single-cluster local-volume storage has no DR path that does not involve manual replication. S3 backend in wuji gives DR via provider features without manual replication.",
},
{
option = "Nix as the build system instead of BuildKit",
why_rejected = "Nix delivers reproducible builds and a richer caching model, but the project is not Nix-native — workloads are built with cargo, npm, go, and language-native toolchains. Adopting Nix wholesale is a separate, larger decision. BuildKit accepts the existing Dockerfile/buildctl workflow most workloads already have. If a future workload demands bit-reproducible builds, Nix can run inside a BuildKit step without changing the surrounding architecture.",
},
],
# Constraints: five entries (four 'Hard, one 'Soft), each with a Grep `check` that sets
# must_be_empty = false.
# NOTE(review): presumably must_be_empty = false means "the pattern must match somewhere under
# `paths`" — confirm against the checker that consumes these records.
# NOTE(review): ADR-040 states that buildkit-launcher and buildkit_runner were extracted from
# provisioning; the checks below that grep those locations may need to follow — confirm.
constraints = [
{
id = "buildkit-runner-no-persistent-storage",
claim = "buildkit_runner component MUST NOT declare persistent volumes — all state lives on ephemeral disk and is destroyed with the VM",
# NOTE(review): `scope` is rooted at catalog/ while `paths` is rooted at provisioning/catalog/ —
# confirm which root the checker resolves against.
scope = "catalog/components/buildkit_runner.ncl",
severity = 'Hard,
# NOTE(review): the pattern requires a "persistent … = … false" line to be present; on its own it
# does not rule out a persistent volume being declared elsewhere in the file.
check = { tag = 'Grep, pattern = "persistent.*=.*false", paths = ["provisioning/catalog/components/buildkit_runner.ncl"], must_be_empty = false },
rationale = "Persistent storage on ephemeral runners defeats the cost model and recreates the persistent-VM maintenance burden. Cache locality is provided by zot, not by persistent disks.",
},
{
id = "zot-storage-must-be-s3",
claim = "zot component in libre-wuji MUST configure storage.backend = 's3' — local-volume storage is not permitted for the workload-cluster registry",
scope = "workspaces/libre-wuji/infra/libre-wuji/components/zot.ncl",
severity = 'Hard,
check = {
tag = 'Grep,
# NOTE(review): assuming regex alternation, the second alternative (backend.*s3) already matches
# everything the first one does, so the first alternative adds nothing.
pattern = "backend = \"s3\"|backend.*s3",
paths = ["workspaces/libre-wuji/infra/libre-wuji/components/zot.ncl"],
must_be_empty = false,
},
rationale = "Local-volume zot has no DR path consistent with the platform's resilience goals. The constraint forces the S3 backend choice at config-validation time.",
},
{
id = "build-spec-schema-versioned",
claim = ".build-spec.ncl files in repos MUST validate against schemas/lib/build_spec.ncl — invalid specs cause launcher to fail-fast with a parse error, not silently fall back",
scope = "schemas/lib/build_spec.ncl, platform/crates/buildkit-launcher/",
severity = 'Hard,
# NOTE(review): this check greps provisioning/platform/crates/buildkit-launcher/src/, while the
# oom-retry-bounded check below greps platform/crates/buildkit-launcher/ for the same crate.
# The two path roots cannot both be right for one working directory — confirm and align.
check = { tag = 'Grep, pattern = "SchemaError|schema_error|schema_validation|validation_diff", paths = ["provisioning/platform/crates/buildkit-launcher/src/"], must_be_empty = false },
rationale = "Silent fallback on invalid build-spec files masks misconfigurations until a build OOMs unexpectedly. Fail-fast surfaces the issue at the next pipeline run, when the developer can fix it.",
},
{
id = "oom-retry-bounded",
claim = "buildkit-launcher OOM retry MUST be bounded to one retry per build — repeated retries inflate cost and indicate misconfiguration that needs developer attention",
scope = "platform/crates/buildkit-launcher/src/retry.rs",
severity = 'Hard,
check = {
tag = 'Grep,
# Matches the presence of a named retry limit; the pattern does not inspect the limit's value.
pattern = "max_oom_retries|MAX_OOM_RETRY|oom_retry_limit",
paths = ["platform/crates/buildkit-launcher/"],
must_be_empty = false,
},
rationale = "Unbounded retries on flaky builds turn a $0.10 build into a $1+ build silently. The bound is policy: one retry covers transient sizing miss, repeat OOM means the developer should declare a larger spec.",
},
{
id = "golden-image-rebuild-cadence",
claim = "buildkit-runner-golden image MUST be rebuilt at least weekly — older golden images accumulate package vulnerabilities and toolchain drift",
scope = "Woodpecker pipeline definitions, orchestrator default-image config",
severity = 'Soft,
# Matches the presence of a "golden-image-rebuild" string under .woodpecker/; the pattern does not
# inspect any schedule, so the weekly cadence itself is not verified by this check.
check = { tag = 'Grep, pattern = "golden-image-rebuild", paths = [".woodpecker/"], must_be_empty = false },
rationale = "Stale golden images are a slow-moving security problem — toolchain CVEs accumulate. Weekly rebuild is generous but acceptable; faster cadence is fine but adds noise. Soft severity because the cadence is operational policy, not a structural invariant.",
},
],
# Ontology check: a one-string summary of the decision, the invariants it touches, and a verdict.
# NOTE(review): two invariants are listed as at risk while the verdict is 'Safe — presumably
# "reviewed and found not violated"; confirm that reading against the ADR schema.
ontology_check = {
decision_string = "Build infrastructure: golden-imaged ephemeral runners spawned by orchestrator + dynamic sizing (.build-spec.ncl + p95 historical + language defaults) + zot relocated to libre-wuji with S3 backend (versioning + cross-region replication) + multi-tenant zot layout (images/cache/sccache/crates) + buildkit-launcher as thin Woodpecker-to-orchestrator bridge + sccache and BuildKit cache both terminated at zot",
invariants_at_risk = ["config-driven-always", "type-safety-nickel"],
verdict = 'Safe,
},
# NOTE(review): ADR-040 lists adr-039 as related, but adr-040 is not listed here — confirm whether
# back-references are expected.
related_adrs = ["adr-037-ops-contract-dual-mode", "adr-038-radicle-decentralized-governance", "adr-021-workspace-composition-dag", "adr-033-cluster-component-extension-pattern"],
}

View file

@ -0,0 +1,83 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
# ADR-040 identity and lifecycle metadata.
id = "adr-040",
title = "buildkit-launcher lifted out as lian-build: standalone build substrate peer project",
status = 'Accepted,
date = "2026-05-01",
# Problem statement: two structural problems with keeping the build substrate inside provisioning,
# and the four lift-out criteria it satisfied.
context = "provisioning/platform/crates/buildkit-launcher and provisioning/extensions/components/buildkit_runner implemented the build substrate — ephemeral buildkit compute provisioning, OCI cache management, golden image lifecycle — inside the provisioning workspace. ADR-039 defined the architecture; buildkit-launcher was the implementation. Two structural problems emerged: (1) the build substrate domain (ephemeral compute, OCI cache, multi-actor sessions, provider abstraction) is orthogonal to provisioning's core domain (workspace lifecycle management); evolution of cache namespacing, provider traits, or session models required provisioning releases; (2) vapora (multi-agent orchestration) and workspace CI pipelines (Woodpecker) needed build substrate access without depending on the full provisioning binary and config system. The component passed all four criteria of the ontoref lift-out pattern (ADR-016 in ontoref): orthogonal concern, consumer plurality, release cadence divergence, config path-agnostic.",
# NOTE(review): the decision names the project location as an absolute path on a developer
# machine (/Users/Akasha/…); ADR-041 cites a Forgejo repository for its project instead — confirm
# whether a repository reference should be used here as well.
# NOTE(review): the context places buildkit_runner under provisioning/extensions/components/, yet
# the last sentence says its workspace member entry is removed from Cargo.toml — confirm that
# buildkit_runner was actually a Cargo workspace member.
decision = "buildkit-launcher and buildkit_runner are extracted from provisioning as lian-build (炼), a standalone build substrate project at /Users/Akasha/Development/lian-build. provisioning retains catalog/lian-build/ — the NCL schemas, defaults, and component declarations that allow workspace infras to supply BuildDirectives to lian-build. Workspace infras declare their build intent using lian-build's NCL vocabulary; provisioning's runtime calls the lian-build binary with the generated directive config. No provisioning crate is imported by lian-build as a library dependency. buildkit-launcher and buildkit_runner workspace member entries are removed from provisioning/platform/Cargo.toml.",
# Rationale: three claim/detail pairs (independent release lifecycle, three distinct consumers,
# catalog/lian-build/ as the integration surface).
rationale = [
{
claim = "Orthogonal domain justifies independent release lifecycle",
detail = "provisioning's core loop is: read NCL workspace definition → reconcile component state → apply changes. lian-build's core loop is: receive BuildDirectives → provision ephemeral compute → run buildkitd → collect artifacts → tear down. These loops share no state and evolve on different schedules. Adding a second registry provider to lian-build should not require a provisioning release, and vice versa.",
},
{
claim = "Consumer plurality is proven",
detail = "At extraction time: provisioning (workspace component builds), vapora (multi-agent build sessions), workspace CI pipelines (Woodpecker steps). Three distinct callers, each with different invocation patterns and config sources, confirm lian-build's value is not provisioning-specific.",
},
{
claim = "catalog/lian-build/ is the correct integration surface",
detail = "provisioning loads lian-build's BuildDirectives schema as an extension, making it available to workspace infra NCL. The workspace declares build intent; provisioning validates it against lian-build's schema and passes it to the binary. This is the correct dependency direction: provisioning knows about lian-build's vocabulary, but lian-build does not know about provisioning's internal structures.",
},
],
# Consequences: four benefits and two costs of the extraction.
consequences = {
positive = [
"lian-build releases independently: provider additions, cache policy changes, session model improvements do not block provisioning",
"vapora and workspace CI pipelines consume lian-build directly without routing through provisioning",
"provisioning/platform/Cargo.toml shrinks: two workspace members removed",
"ComputeProvider and RegistryProvider trait boundaries are declared at project inception, not retrofitted",
],
negative = [
"Two schema maintenance surfaces: lian-build/schemas/ (source of truth) and provisioning/catalog/lian-build/ (consumer-side reference)",
"Workspace infras that previously used buildkit_runner component definitions must migrate to lian-build's BuildDirectives schema",
],
},
# Alternatives: the single rejected option (keep the crate in provisioning behind a subcommand)
# and the reason it was rejected.
alternatives_considered = [
{
option = "Keep buildkit-launcher as a provisioning crate, expose via provisioning subcommand",
why_rejected = "Prevents vapora and Woodpecker from using the build substrate without depending on provisioning. Release coupling blocks provider evolution. Four-criterion test (ontoref ADR-016) makes the extraction unambiguously correct.",
},
],
# Constraints: two 'Hard entries that keep the provisioning / lian-build boundary at the
# CLI binary + NCL schema level.
constraints = [
{
# NOTE(review): the id says "not-binary-dep" while the claim forbids a *library* dependency and
# requires interaction via the binary; the id is kept unchanged because other files may reference it.
id = "extensions-not-binary-dep",
claim = "provisioning must not import lian-build as a Rust library dependency; interaction is via CLI invocation with NCL-generated config",
scope = "provisioning/platform/Cargo.toml, provisioning/catalog/",
severity = 'Hard,
check = {
tag = 'Grep,
# Besides the new project name, also match the two names the decision says are removed from this
# Cargo.toml. This mirrors ADR-041's parallel check, which greps "cloudatasave|backup-manager",
# so a left-over workspace member entry is caught as well as a new library dependency.
pattern = "lian-build|buildkit-launcher|buildkit_runner",
paths = ["provisioning/platform/Cargo.toml"],
# Presumably "the grep must return no match" — confirm against the checker.
must_be_empty = true,
},
rationale = "Library dependency would re-couple provisioning's build cycle to lian-build's. The integration surface is the CLI binary + NCL schema, not the Rust crate graph.",
},
{
id = "extensions-lian-build-present",
claim = "provisioning/catalog/lian-build/ must be present and contain the BuildDirectives schema before the workspace member entries are removed",
scope = "provisioning/catalog/lian-build/",
severity = 'Hard,
# Verifies only that the schema file exists; the "before the entries are removed" ordering in the
# claim is not something this check can observe.
check = {
tag = 'FileExists,
path = "provisioning/catalog/lian-build/build_directives.ncl",
present = true,
},
rationale = "Removing the workspace member without the extension schema would break workspace infras that declare build components. Schema first, then removal.",
},
],
# Related ADRs. The second entry carries an "ontoref:" prefix — presumably a reference into the
# ontoref project's ADR set rather than this repository's; confirm.
related_adrs = ["adr-039-build-infrastructure-ephemeral", "ontoref:adr-016-component-lift-out-pattern"],
# Ontology check: one-string summary of the decision, no invariants listed as at risk, verdict 'Safe.
ontology_check = {
decision_string = "buildkit-launcher and buildkit_runner extracted as lian-build standalone project; provisioning retains catalog/lian-build/ as integration surface; no Rust library dependency from provisioning to lian-build",
invariants_at_risk = [],
verdict = 'Safe,
},
}

View file

@ -0,0 +1,92 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
# ADR-041 identity and lifecycle metadata.
id = "adr-041",
title = "backup-manager lifted out as cloudatasave: standalone backup orchestrator peer project",
status = 'Accepted,
date = "2026-05-01",
# Problem statement: two structural problems with keeping backup orchestration inside provisioning,
# and the four lift-out criteria it satisfied.
context = "provisioning/platform/crates/backup-manager implemented backup orchestration — restic-first snapshots, multi-destination replication, consistency-point group management — inside the provisioning workspace. Two structural problems emerged: (1) backup orchestration domain (snapshot lifecycle, engine abstraction, restore verification, retention policy) is orthogonal to provisioning's core domain (workspace lifecycle management); (2) the implementation was coupled to provisioning's platform_config crate for config loading, preventing standalone use. Any project needing backup operations (workspace CI, scripts, standalone invocation) had to depend on the full provisioning binary. The component passed all four criteria of the ontoref lift-out pattern (ontoref ADR-016): orthogonal concern, consumer plurality, release cadence divergence, config path-agnostic.",
# NOTE(review): the project is spelled "cloudatasave" throughout, but the Forgejo reference is
# "LibreCloud/cloudDataSave" — confirm which spelling is canonical.
decision = "backup-manager is extracted from provisioning as cloudatasave, a standalone backup orchestration project (Forgejo: LibreCloud/cloudDataSave). provisioning retains catalog/cloudatasave/ — the NCL schemas, defaults, and component declarations that allow workspace infras to declare BackupGroups and BackupPolicies. Workspace infras declare their backup intent using cloudatasave's NCL vocabulary; provisioning's runtime calls the cloudatasave binary with the generated policy config. No provisioning crate is imported by cloudatasave as a library dependency. backup-manager workspace member entry is removed from provisioning/platform/Cargo.toml.",
# Rationale: four claim/detail pairs (release cadence, general-purpose capability, ownership of
# the verification invariant, catalog/cloudatasave/ as the integration surface).
rationale = [
{
claim = "Backup orchestration evolves on a different cadence than workspace lifecycle",
detail = "Adding a new backup destination type (SFTP, Storj, rsync.net), implementing kopia as a second engine, or improving restore verification scheduling should not require a provisioning release. cloudatasave's only coupling to provisioning is the BackupPolicy NCL vocabulary — the schema is stable across provisioning versions once published.",
},
{
claim = "Portable backup is a general capability, not a provisioning-specific one",
detail = "Any project — not just provisioning workspaces — may need to declare backup groups, schedule snapshots, and verify restores. A cloudatasave that does not import provisioning infrastructure can be adopted by standalone scripts, CI pipelines, and other projects without inheriting provisioning's dependency tree.",
},
{
claim = "Verify-as-provisioning axiom requires cloudatasave to own verification state",
detail = "The principle that a backup group is not provisioned until verified is a cloudatasave invariant. If backup-manager remained inside provisioning, this invariant would be implemented as a provisioning concern, creating tight coupling between provisioning's provisioning-state tracking and backup verification. As a standalone project, cloudatasave owns the invariant completely.",
},
{
claim = "catalog/cloudatasave/ is the correct integration surface",
detail = "provisioning loads cloudatasave's BackupPolicy and BackupGroup schemas as an extension, making them available to workspace infra NCL. The workspace declares backup policy; provisioning validates it against cloudatasave's schema and passes it to the binary. This is the correct dependency direction: provisioning knows about cloudatasave's vocabulary, but cloudatasave does not know about provisioning's internal structures.",
},
],
# Consequences: four benefits and three costs of the extraction.
consequences = {
positive = [
"cloudatasave releases independently: engine additions, verification improvements, destination types do not block provisioning",
"Any project can adopt cloudatasave by declaring a BackupPolicy in its NCL vocabulary — not only provisioning workspaces",
"provisioning/platform/Cargo.toml shrinks: backup-manager workspace member removed",
"BackupEngine trait boundary is declared at project inception, forcing the engine abstraction to be correct from day one",
],
negative = [
"Two schema maintenance surfaces: cloudatasave/schemas/ (source of truth) and provisioning/catalog/cloudatasave/ (consumer-side reference)",
"Workspace infras that previously used backup_manager component definitions must migrate to cloudatasave's BackupPolicy schema",
"cloudatasave must implement its own config loading without platform_config — one-time cost at extraction",
],
},
# Alternatives: two rejected options (keep the crate behind a prvng subcommand; use Velero),
# each paired with the reason it was rejected.
alternatives_considered = [
{
option = "Keep backup-manager in provisioning, expose as prvng backup subcommand only",
why_rejected = "Prevents standalone invocation, CI pipeline integration without provisioning, and blocks the verify-as-provisioning model from being a cloudatasave-internal invariant. Four-criterion test (ontoref ADR-016) makes the extraction correct.",
},
{
option = "Use a managed Kubernetes backup solution (Velero)",
why_rejected = "Velero targets in-cluster resource backup (PVCs, manifests). cloudatasave targets data backup: application snapshots, database dumps, object storage replication. These are complementary, not substitutes. cloudatasave's engine abstraction can later add a Velero-backend for PVC-class workloads.",
},
],
# Constraints: two 'Hard entries — no library dependency on cloudatasave, and a minimum of two
# backup destinations declared at the schema level.
constraints = [
{
# NOTE(review): this id is identical to the first constraint id in ADR-040 — confirm that
# constraint ids only need to be unique within one ADR.
id = "extensions-not-binary-dep",
claim = "provisioning must not import cloudatasave as a Rust library dependency; interaction is via CLI invocation with NCL-generated config",
scope = "provisioning/platform/Cargo.toml, provisioning/catalog/",
severity = 'Hard,
check = {
tag = 'Grep,
# Matches either the new project name or the old crate name in the workspace Cargo.toml.
pattern = "cloudatasave|backup-manager",
paths = ["provisioning/platform/Cargo.toml"],
# Presumably "the grep must return no match" — confirm against the checker.
must_be_empty = true,
},
rationale = "Library dependency would re-couple provisioning's build cycle to cloudatasave's. The integration surface is the CLI binary + NCL schema, not the Rust crate graph.",
},
{
id = "minimum-two-destinations-enforced",
claim = "provisioning's catalog/cloudatasave/ schema must declare minimum-two-destinations as a hard contract so workspace infras cannot declare single-destination groups",
scope = "provisioning/catalog/cloudatasave/backup_group.ncl",
severity = 'Hard,
# NOTE(review): the check only verifies that backup_group.ncl exists. It does not inspect the file,
# so the "minimum two destinations" contract stated in the claim is not verified by this check —
# consider a content check once the contract's spelling in the schema is known.
check = {
tag = 'FileExists,
path = "provisioning/catalog/cloudatasave/backup_group.ncl",
present = true,
},
rationale = "The multi-destination-custody axiom must be enforced at the workspace infra declaration layer, not only at cloudatasave runtime. Early validation prevents misconfigured groups from reaching the orchestrator.",
},
],
# Related ADRs. The second entry carries an "ontoref:" prefix — presumably a reference into the
# ontoref project's ADR set rather than this repository's; confirm.
related_adrs = ["adr-037-ops-contract-dual-mode", "ontoref:adr-016-component-lift-out-pattern"],
# Ontology check: one-string summary of the decision, no invariants listed as at risk, verdict 'Safe.
ontology_check = {
decision_string = "backup-manager extracted as cloudatasave standalone project; provisioning retains catalog/cloudatasave/ as integration surface; no Rust library dependency from provisioning to cloudatasave",
invariants_at_risk = [],
verdict = 'Safe,
},
}

View file

@ -0,0 +1,141 @@
let d = import "adr-defaults.ncl" in
d.make_adr {
id = "adr-042",
title = "Ecosystem integration via federated Integration Modes mediated by versioned Domain artifacts in OCI",
status = 'Accepted,
date = "2026-05-01",
context = "After the lift-out of lian-build (ADR-040) and cloudatasave (ADR-041) as standalone projects, no mechanism exists for them to integrate with provisioning without re-coupling to its filesystem. An earlier framing modeled the problem as 'extension of a host' — a plugin protocol with an ExtensionManifest parallel to Mode, where provisioning was the host and external projects were plugins. This framing is structurally incompatible with the ontoref protocol already adopted across the ecosystem: ontoref has Mode, Domain, and Reflection as native primitives that express the same relationship with semantic coherence. The term 'extension' is also overloaded in the codebase: it simultaneously names (a) the plugin-protocol framing (rejected) and (b) the IaC artifact catalog at provisioning/extensions/ (components, providers, taskservs, playbooks, workflows). These two concepts share a name by historical accident.",
decision = "Adopt the federated Integration Mode pattern: each participant (provisioning, lian-build, cloudatasave, vapora, CI) declares its integration points as Modes with kind = 'integration in its own reflection/modes/. Participants exchange typed contracts via Domain artifacts — versioned OCI blobs hosted at reg.librecloud.online/domains/<id>:<semver> — without reading each other's filesystems. Provisioning-side caller context (SOPS secrets, component values, literals) is materialised as Cabling files (infra/<ws>/integrations/<mode-id>.ncl) resolved by a dedicated Rust crate (context-assembler) into a typed JSON payload sent via stdin to the Mode binary. Three primitives: Integration Mode (reflects self, declares domains_used), Domain artifact (shared OCI contract with inputs/outputs/events channels), Cabling (workspace-local materialisation of the contract). Separately, provisioning/extensions/ is renamed to provisioning/catalog/ to give the IaC building-block catalog its semantically correct name, decoupled from the integration protocol.",
rationale = [
{
claim = "federated Mode pattern is the native on+re idiom — no parallel schema needed",
detail = "ontoref already models operational units as Modes with DAG steps, QA specs, and capability declarations. Introducing a separate ExtensionManifest concept alongside Mode would split the semantic model. A Mode with kind = 'integration is a Mode — it fits the existing indexing, describe, and run primitives natively without new tooling.",
},
{
claim = "OCI Domain artifacts provide content-addressed versioned contracts without a custom server",
detail = "A Domain artifact is a set of Nickel schema files (inputs.ncl, outputs.ncl, events.ncl, capabilities.ncl, version.ncl) pushed to the existing zot registry with a custom mediaType. oras-cli handles push/pull. Content-addressable digests provide integrity without signing in v0.1 (acceptable for internal use; cosign added in hardening). No custom server, no custom protocol: standard OCI distribution API.",
},
{
claim = "Ruta beta for OCI CLI surface: local implementation in prvng integration, not upstream to ontoref",
detail = "ontoref v0.1.0 implements no OCI commands: no domain publish, no domain pull, no ecosystem domains, no OCI distribution API access, no custom mediaType management. Contributing upstream requires designing, reviewing, and merging a large module in a project with its own release cadence before the first proof-of-stack. Implementing locally under prvng integration and extracting after validation is coherent with the lift-out pattern already practised (lian-build and cloudatasave both originated in provisioning). Breaking changes during v0.1 stay local. Extraction is mechanical once the contract is stable.",
},
{
claim = "Ruta B for Mode schema extension: embedded subset in provisioning schemas until stable",
detail = "For the same iteration-cost reason: a Mode with kind = 'integration and its associated fields (domains_used, invocation, direction) lives in provisioning/schemas/lib/integration_mode_manifest.ncl until validated with two real consumers (lian-build, cloudatasave). Promoting upstream after validation avoids locking the ontoref schema to an API that may still change.",
},
{
claim = "catalog/ rename removes the semantic collision from the codebase permanently",
detail = "As long as provisioning/extensions/ exists, every future developer must reason about whether 'extension' means the IaC catalog or the rejected plugin protocol. A clean rename to provisioning/catalog/ eliminates the ambiguity at zero ongoing cost. The protocol framing disappears from commands, dispatch, schemas, and module paths simultaneously.",
},
{
claim = "Modes never read caller filesystem — context arrives as typed stdin JSON",
detail = "The plugin-protocol approach allowed extension manifests to declare filesystem paths the host would populate. This creates implicit coupling: the Mode depends on the host's directory layout. Delivering context as a typed JSON payload (assembled by context-assembler from Cabling) means the Mode binary needs no knowledge of the caller's filesystem. The contract is the Domain schema, not a path convention.",
},
],
consequences = {
positive = [
"lian-build and cloudatasave integrate with provisioning by declaring Modes and consuming Domains — no filesystem coupling, no re-absorption",
"New participants (vapora, CI, future) adopt the pattern by publishing a Mode artifact; no changes to provisioning internals required",
"Domain artifacts are OCI-addressable: discovery is standard registry catalog API, not filesystem grep",
"Cabling files are workspace-local NCL — type-safe, auditable, version-controlled alongside the workspace definition",
"provisioning/catalog/ correctly names the IaC building-block library; confusion with the integration protocol is eliminated",
"context-assembler Rust crate is generically reusable by any caller: vapora, CI, future participants",
"Signing (cosign) is addable as a hardening step without architectural change — the OCI push/pull path already exists",
],
negative = [
"oras-cli is a new runtime dependency for the OCI CLI surface (acceptable: single binary, brew install)",
"context-assembler introduces a Rust crate boundary for what was previously a Nushell inline operation — justified by crypto-sensitive plaintext handling and typed schema validation",
"provisioning/catalog/ rename propagates across all workspace component NCL imports — one-time cost, verified by nickel export smoke tests",
"Mode binary distribution (how the Mode binary reaches the operator's PATH) is out-of-band for v0.1 (invocation.method = 'path_assumed); resolved in a later iteration when 'oci_blob or 'cargo_install are needed",
"Signing of Domain and Mode artifacts is deferred to hardening (TASK-14); artifacts are unsigned during v0.1, acceptable for internal-only registry",
],
},
alternatives_considered = [
{
option = "Extension protocol: host-mediated plugin manifest parallel to Mode",
why_rejected = "Introduced a second schema object (ExtensionManifest) alongside Mode in the ontoref model, creating two registration paths for essentially the same concept. The host (provisioning) becoming a plugin-loader couples its internal lifecycle to external project release cadences. Filesystem-path coupling between host and extension was implicit and unvalidated. The on+re Mode primitive already expresses this relationship — a parallel schema is an anti-PAP.",
},
{
option = "Ruta alpha for OCI CLI: upstream contribution to ontoref",
why_rejected = "ontoref v0.1.0 has no OCI surface at all. Contributing upstream means coordinating design, review, and merge in a project with independent release cadence before proving the first round-trip. Ruta beta (local implementation, extract after validation) delivers the proof-of-stack sooner and keeps breaking changes local during the unstable v0.1 period. If ontoref gains throughput and there is demand outside provisioning, extraction is mechanical.",
},
{
option = "Ruta A for Mode schema: upstream contribution of kind = 'integration to ontoref core",
why_rejected = "Same reasoning as ruta alpha for OCI: the schema should stabilise against two real consumers (lian-build, cloudatasave) before being locked into upstream. Ruta B (embedded subset) allows iteration without external coordination.",
},
{
option = "Collapsing provisioning/extensions/ under provisioning/integrations/",
why_rejected = "A component, provider, taskserv, playbook, or workflow is not an integration in the federated Mode sense. Collapsing them under the same name would re-introduce the semantic collision the rename is trying to eliminate. The IaC building blocks are catalog items; the federated protocol participants are integration modes. Two concepts, two names: catalog/ and integrations/.",
},
],
constraints = [
{
id = "integration-mode-must-declare-domains-used",
claim = "A Mode with kind = 'integration MUST declare domains_used as a non-empty array — a Mode that declares no domains is not an integration mode",
scope = "provisioning/schemas/lib/integration_mode_manifest.ncl",
severity = 'Hard,
check = {
tag = 'FileExists,
path = "provisioning/schemas/lib/integration_mode_manifest.ncl",
present = true,
},
rationale = "An integration Mode with no domains_used has no contract with its caller — it might as well be a regular Mode. The domains_used requirement enforces that every integration point is explicitly declared and versioned.",
},
{
id = "integration-mode-no-filesystem-read-of-caller",
claim = "Integration Mode binaries MUST NOT read the caller's filesystem — context arrives exclusively as typed stdin JSON assembled by context-assembler from the Cabling file",
scope = "provisioning/schemas/lib/integration_mode_manifest.ncl, crates/context-assembler/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "path_read|filesystem_", paths = ["provisioning/schemas/lib/integration_mode_manifest.ncl"], must_be_empty = true },
rationale = "Filesystem reads couple the Mode binary to the host's directory layout, recreating the implicit coupling that the federated pattern is designed to eliminate.",
},
{
id = "catalog-not-integrations",
claim = "IaC building blocks (components, providers, taskservs, playbooks, workflows) MUST live under provisioning/catalog/ — they MUST NOT be placed under provisioning/integrations/",
scope = "provisioning/catalog/",
severity = 'Hard,
check = {
tag = 'FileExists,
path = "provisioning/catalog",
present = true,
},
rationale = "Mixing IaC catalog items with integration Mode artifacts recreates the semantic collision between the old 'extension' catalog and the old 'extension' protocol. The rename to catalog/ is the permanent resolution.",
},
{
id = "domain-artifacts-in-oci-registry",
claim = "Domain artifacts MUST be published to reg.librecloud.online/domains/<id>:<semver> and consumed via oras pull — inline filesystem imports of domain schemas are not permitted after v0.1",
scope = "infra/<ws>/integrations/, crates/context-assembler/",
severity = 'Hard,
check = { tag = 'Grep, pattern = "import.*domains/", paths = ["provisioning/core/nulib/"], must_be_empty = true },
rationale = "Filesystem-local domain schemas cannot be discovered, versioned, or integrity-checked by oras. Moving to OCI is what enables the federated discovery model.",
},
{
id = "extension-command-removed",
claim = "prvng extension (and aliases e, ext) MUST NOT exist after the catalog rename — the command is replaced by prvng integration for the federation surface and optionally prvng catalog for catalog browsing",
scope = "provisioning/core/cli/provisioning, provisioning/core/nulib/commands-registry.ncl",
severity = 'Hard,
check = {
tag = 'Grep,
pattern = "\"extension\"",
paths = ["provisioning/core/nulib/commands-registry.ncl"],
must_be_empty = true,
},
rationale = "A prvng extension command surviving the rename would perpetuate the vocabulary collision and confuse operators about whether 'extension' refers to the catalog or the protocol.",
},
],
ontology_check = {
decision_string = "Federated Integration Mode pattern via OCI Domain artifacts: Modes declare domains_used, context assembled by Rust crate from Cabling (SOPS + component + literal + env resolvers), no filesystem coupling, ruta beta OCI CLI in prvng integration, ruta B Mode schema embedded in provisioning schemas, catalog rename provisioning/extensions/ to provisioning/catalog/",
invariants_at_risk = ["config-driven-always", "type-safety-nickel"],
verdict = 'Safe,
},
related_adrs = ["adr-040-lian-build-lift-out", "adr-041-cloudatasave-lift-out", "adr-039-build-infrastructure-ephemeral", "adr-033-cluster-component-extension-pattern"],
}

51
adrs/adr-constraints.ncl Normal file
View file

@ -0,0 +1,51 @@
# Contract for ADR identifiers: the literal prefix `adr-` followed by exactly
# three decimal digits (regex `^adr-[0-9]{3}$`). Anything else is rejected with
# a message that echoes the offending value.
let _adr_id_format = std.contract.custom (
  fun _label candidate =>
    let well_formed = std.string.is_match "^adr-[0-9]{3}$" candidate in
    if !well_formed then
      'Error {
        message = "ADR id must match 'adr-NNN' format (e.g. 'adr-001'), got: '%{candidate}'"
      }
    else
      'Ok candidate
) in
# Contract for the `constraints` array: an array with at least one element is
# returned unchanged; an empty array is rejected.
let _non_empty_constraints = std.contract.custom (
  fun _label entries =>
    if std.array.length entries > 0 then
      'Ok entries
    else
      'Error {
        message = "constraints must not be empty — an ADR with no constraints is passive documentation, not an active constraint"
      }
) in
# Contract: the list of negative consequences must not be empty.
#
# The checked value may be either
#   * a `consequences` record (`{ positive, negative }`), or
#   * a whole ADR record (which carries `id` and `consequences`).
#
# The previous version read `value.negative` but interpolated `value.id` into
# the error message. No single record shape has both fields, so a failing
# check surfaced as a "missing field" error instead of the intended message.
# The id is now only mentioned when the checked record actually has one.
let _non_empty_negative = std.contract.custom (
  fun _label value =>
    # Locate the `negative` array for whichever record shape was passed in.
    let negative =
      if std.record.has_field "consequences" value then
        value.consequences.negative
      else
        value.negative
    in
    # Optional " on id='…'" suffix; empty when the record has no `id` field.
    let subject =
      if std.record.has_field "id" value then
        " on id='%{value.id}'"
      else
        ""
    in
    if std.array.length negative == 0 then
      'Error {
        message = "consequences.negative must not be empty%{subject} — an ADR with no negative consequences is incomplete"
      }
    else
      'Ok value
) in
# Cross-field contract over a whole ADR record: when the ontology verdict is
# 'RequiresJustification, the record must also carry an
# `invariant_justification` field. Every other record passes through unchanged.
let _requires_justification = std.contract.custom (
  fun _label adr =>
    let flagged = adr.ontology_check.verdict == 'RequiresJustification in
    # Only forced when `flagged` is true, because `&&` short-circuits.
    let justified = std.record.has_field "invariant_justification" adr in
    if flagged && !justified then
      'Error {
        message = "ADR '%{adr.id}': ontology_check.verdict = 'RequiresJustification but invariant_justification field is missing"
      }
    else
      'Ok adr
) in
# Public contracts exported by this file, under the names importers refer to.
{
# ADR id must match `^adr-[0-9]{3}$`.
AdrIdFormat = _adr_id_format,
# The checked array must contain at least one element.
NonEmptyConstraints = _non_empty_constraints,
# The checked record's `negative` array must contain at least one element.
NonEmptyNegativeConsequences = _non_empty_negative,
# A 'RequiresJustification verdict requires an `invariant_justification` field.
RequiresJustificationWhenRisky = _requires_justification,
}

13
adrs/adr-defaults.ncl Normal file
View file

@ -0,0 +1,13 @@
let s = import "adr-schema.ncl" in
let c = import "adr-constraints.ncl" in
# Authoring helpers for ADR files, plus re-exports of the schema contracts.
{
  # Schema contracts re-exported from adr-schema.ncl.
  Adr = s.Adr,
  Constraint = s.Constraint,
  OntologyCheck = s.OntologyCheck,

  # Merge `data` with the constraint schema.
  make_constraint = fun data => s.Constraint & data,

  # Merge `data` with the ADR schema, then apply the cross-field check that a
  # 'RequiresJustification verdict is accompanied by a justification.
  make_adr = fun data =>
    let checked | c.RequiresJustificationWhenRisky = s.Adr & data in
    checked,
}

97
adrs/adr-schema.ncl Normal file
View file

@ -0,0 +1,97 @@
# ADR schema: record contracts describing the shape of an ADR and its parts.
# `c` provides the custom contracts (id format, non-empty constraints) used below.
let c = import "adr-constraints.ncl" in
# Closed enumerations for the status, severity and verdict fields.
let status_type = [| 'Proposed, 'Accepted, 'Superseded, 'Deprecated |] in
let severity_type = [| 'Hard, 'Soft |] in
let verdict_type = [| 'Safe, 'RequiresJustification |] in
# One rationale item: a short claim and its supporting detail.
let rationale_entry_type = {
claim | String,
detail | String,
} in
# One alternative that was considered, and why it was rejected.
let alternative_type = {
option | String,
why_rejected | String,
} in
# Tag discriminant for typed constraint checks.
let check_tag_type = [|
'Cargo,
'Grep,
'NuCmd,
'ApiCall,
'FileExists,
|] in
# Typed constraint check: a tagged record, JSON-serializable.
# 'Cargo -> crate : String, forbidden_deps : Array String
# 'Grep -> pattern : String, paths : Array String, must_be_empty : Bool
# 'NuCmd -> cmd : String, expect_exit : Number
# 'ApiCall -> endpoint : String, json_path : String, expected : Dyn
# 'FileExists-> path : String, present : Bool
# Only `tag` is validated by this contract; the trailing `..` leaves the record
# open, so the per-variant fields listed above are not checked here.
let constraint_check_type = {
tag | check_tag_type,
..
} in
# A single enforceable constraint attached to an ADR.
let constraint_type = {
id | String,
claim | String,
scope | String,
severity | severity_type,
# Transition period: one of check or check_hint must be present.
# check_hint is deprecated — migrate existing ADRs to typed check variants.
# NOTE(review): both fields are declared `optional` and nothing in this record
# enforces that at least one is present — confirm whether that is checked elsewhere.
check_hint | String | optional,
check | constraint_check_type | optional,
rationale | String,
} in
# Result of checking the decision against the ontology's invariants.
let ontology_check_type = {
decision_string | String,
invariants_at_risk | Array String,
verdict | verdict_type,
} in
# Justification record: the invariant concerned, the claim made, and the mitigation.
let invariant_justification_type = {
invariant | String,
claim | String,
mitigation | String,
} in
# Positive and negative consequences, each a list of sentences.
# NOTE(review): neither list is required to be non-empty by this record.
let consequences_type = {
positive | Array String,
negative | Array String,
} in
# The full ADR record contract.
let adr_type = {
# Must be a String and satisfy c.AdrIdFormat.
id | String | c.AdrIdFormat,
title | String,
status | status_type,
date | String,
context | String,
decision | String,
rationale | Array rationale_entry_type,
# NOTE(review): c.NonEmptyNegativeConsequences is not applied to this field in this file.
consequences | consequences_type,
alternatives_considered | Array alternative_type,
# Must be an array of constraints and satisfy c.NonEmptyConstraints.
constraints | Array constraint_type | c.NonEmptyConstraints,
ontology_check | ontology_check_type,
# Defaults to an empty list when the ADR does not set it.
related_adrs | Array String | default = [],
supersedes | String | optional,
superseded_by | String | optional,
invariant_justification | invariant_justification_type | optional,
} in
# Public names under which the contracts above are exported.
{
AdrStatus = status_type,
Severity = severity_type,
Verdict = verdict_type,
Constraint = constraint_type,
RationaleEntry = rationale_entry_type,
Alternative = alternative_type,
OntologyCheck = ontology_check_type,
InvariantJustification = invariant_justification_type,
Adr = adr_type,
}

View file

@ -11,9 +11,14 @@ import 'justfiles/ci.just'
import 'justfiles/platform.just'
import 'justfiles/installer.just'
import 'justfiles/book.just'
import 'justfiles/docker.just'
import 'justfiles/auth.just'
import 'justfiles/kms.just'
import 'justfiles/orchestrator.just'
import 'justfiles/daemon.just'
import 'justfiles/distro.just'
import 'justfiles/assets.just'
import 'justfiles/crate.just'
# ============================================================================
# Provisioning Configuration
@ -70,7 +75,9 @@ parallel := "true"
echo " 🚀 release - Release management & artifacts (just release-help)"
echo " 🔧 dev - Development workflows & testing (just dev-help)"
echo " ⚡ platform - Platform services & orchestration (just platform-help)"
echo " 🐳 docker - Container image building (just docker-help)"
echo " 📦 installer - Interactive installer & config mgmt (just installer-help)"
echo " 💾 distro - Local installation & distribution (just distro-help)"
echo " 📖 book - MDBook documentation system (just book-help)"
echo ""
echo "🔐 PLUGIN MODULES"
@ -94,30 +101,32 @@ help MODULE="":
#!/usr/bin/env bash
if [ "{{MODULE}}" = "" ]; then
just --justfile {{justfile()}} default
elif [ "{{MODULE}}" = "ci" ]; then
echo "🔧 CI/CD PIPELINES"
echo "==================="
echo ""
echo "Available CI commands:"
echo " just ci - CI/CD pipeline WITHOUT cleanup (fast iteration)"
echo " just ci-clean - CI/CD pipeline WITH cleanup (production)"
echo " just ci-full - Run all CI checks (formatting, linting, tests, audit)"
echo ""
echo "CI Tasks:"
echo " just ci-fmt-check - Check code formatting"
echo " just ci-fmt - Code formatting"
echo " just ci-lint - Run all linting checks"
echo " just ci-test - Run all tests"
echo " just ci-audit - Run security audits"
echo " just ci-docs - Check documentation"
echo ""
echo " NOTE: nu_plugins (in plugins/nushell-plugins/) are excluded from CI checks"
echo " since they are maintained as a separate project with independent CI."
else
echo "❌ Unknown module: {{MODULE}}"
echo ""
echo "Available modules: ci, build, package, release, dev, platform, installer, book, auth, kms, orchestrator"
fi
elif [ "{{MODULE}}" = "ci" ]; then
echo "🔧 CI/CD PIPELINES"
echo "==================="
echo ""
echo "Available CI commands:"
echo " just ci - CI/CD pipeline WITHOUT cleanup (fast iteration)"
echo " just ci-clean - CI/CD pipeline WITH cleanup (production)"
echo " just ci-full - Run all CI checks (formatting, linting, tests, audit)"
echo ""
echo "CI Tasks:"
echo " just ci-fmt-check - Check code formatting"
echo " just ci-fmt - Code formatting"
echo " just ci-lint - Run all linting checks"
echo " just ci-test - Run all tests"
echo " just ci-audit - Run security audits"
echo " just ci-docs - Check documentation"
echo ""
echo " NOTE: nu_plugins (in plugins/nushell-plugins/) are excluded from CI checks"
echo " since they are maintained as a separate project with independent CI."
elif [ "{{MODULE}}" = "distro" ]; then
just distro-help
else
echo "❌ Unknown module: {{MODULE}}"
echo ""
echo "Available modules: ci, distro, build, package, release, dev, platform, docker, installer, book, auth, kms, orchestrator"
fi
# Show comprehensive provisioning help
@help-full:
@ -177,6 +186,16 @@ help MODULE="":
echo " • api-gateway - REST API gateway"
echo " • platform-status - All platform services status"
echo ""
echo "🐳 DOCKER MODULE (docker.just) - DETAILED"
echo " Container image building and management"
echo " • build-images - Build all or specific platform service images"
echo " • image-list - List available services"
echo " • image-validate - Validate built images"
echo " • image-clean - Remove all provisioning images"
echo " • image-info - Show image information"
echo " • docker-status - Show Docker system status"
echo " • build-verify - Build and verify images"
echo ""
echo "📦 INSTALLER MODULE (installer.just) - DETAILED"
echo " Interactive installer and configuration management"
echo " • installer-build - Build installer binary"

36
justfiles/assets.just Normal file
View file

@ -0,0 +1,36 @@
# API catalog export
#
# Generates per-service api-catalog-*.json from #[onto_api] registered routes.
# Run after any handler annotation is added or changed.
# Commit alongside the annotation changes — they are paired artifacts.
PLATFORM_MANIFEST := "platform/Cargo.toml"
# Aggregate recipe: runs every per-service export, then prints a summary line.
[doc("Export #[onto_api] routes for all platform services to api-catalog-*.json")]
export-api-catalog: export-api-catalog-orchestrator export-api-catalog-control-center export-api-catalog-extension-registry export-api-catalog-ai-service
    @printf '%s\n' "all platform API catalogs exported"
# Dump the orchestrator route catalog to JSON, then report the number of entries.
[doc("Export orchestrator #[onto_api] routes to api-catalog-orchestrator.json")]
export-api-catalog-orchestrator:
    cargo run --manifest-path {{PLATFORM_MANIFEST}} -p orchestrator \
        --no-default-features \
        --features "core,audit,compliance,platform,ssh,workflow,testing,http-api" \
        -- --dump-api-catalog > api-catalog-orchestrator.json
    @echo "orchestrator: $(jq length < api-catalog-orchestrator.json) routes"
# Dump the control-center route catalog to JSON, then report the number of entries.
[doc("Export control-center #[onto_api] routes to api-catalog-control-center.json")]
export-api-catalog-control-center:
    cargo run --manifest-path {{PLATFORM_MANIFEST}} -p control-center \
        --no-default-features \
        --features "core,kms,audit,mfa,compliance,experimental" \
        -- --dump-api-catalog > api-catalog-control-center.json
    @echo "control-center: $(jq length < api-catalog-control-center.json) routes"
# Dump the extension-registry route catalog to JSON, then report the number of entries.
[doc("Export extension-registry #[onto_api] routes to api-catalog-extension-registry.json")]
export-api-catalog-extension-registry:
    cargo run --manifest-path {{PLATFORM_MANIFEST}} -p extension-registry \
        --no-default-features \
        -- --dump-api-catalog > api-catalog-extension-registry.json
    @echo "extension-registry: $(jq length < api-catalog-extension-registry.json) routes"
# Dump the ai-service route catalog to JSON, then report the number of entries.
[doc("Export ai-service #[onto_api] routes to api-catalog-ai-service.json")]
export-api-catalog-ai-service:
    cargo run --manifest-path {{PLATFORM_MANIFEST}} -p ai-service \
        --no-default-features \
        -- --dump-api-catalog > api-catalog-ai-service.json
    @echo "ai-service: $(jq length < api-catalog-ai-service.json) routes"

View file

@ -114,3 +114,102 @@
--check-config \
--verbose={{verbose}}
echo "✅ Build system health check completed"
# Release-build the platform service binaries and copy each one into $HOME/bin.
# A binary missing from the release output is reported and skipped, not fatal.
build-platform-install:
    #!/usr/bin/env bash
    set -euo pipefail

    platform_dir="{{provisioning_root}}/platform"
    dest_dir="${HOME}/bin"

    # Single list of binaries; it drives both the cargo flags and the install loop.
    declare -a binaries=(
        provisioning-orchestrator
        provisioning-control-center
        provisioning-mcp-server
        provisioning-extension-registry
        provisioning-vault-service
    )

    declare -a bin_flags=()
    for name in "${binaries[@]}"; do
        bin_flags+=(--bin "${name}")
    done

    echo "=== platform: cargo build --release ==="
    cargo build --release --manifest-path "${platform_dir}/Cargo.toml" "${bin_flags[@]}"

    mkdir -p "${dest_dir}"
    echo "=== installing to ${dest_dir} ==="
    for name in "${binaries[@]}"; do
        artifact="${platform_dir}/target/release/${name}"
        if [ ! -f "${artifact}" ]; then
            echo " WARN: ${name} not in release output, skipped"
            continue
        fi
        install -m 0755 "${artifact}" "${dest_dir}/${name}"
        echo " installed: ${name}"
    done
    echo "=== done — ${dest_dir} ==="
# Build a single platform crate in release mode, optionally install it and stop
# any running instance.
#
# Accepts short name (e.g. `ncl-sync`) or full binary name (`provisioning-ncl-sync`).
# Maps short names to package + binary using the `provisioning-<short>` convention.
#
# After build, prompts to install to $HOME/.local/bin. On "y" a running instance
# is stopped first (SIGTERM, then SIGKILL if it is still alive one second later).
# Without a usable answer (anything but y/Y, or no stdin) the install is skipped.
#
# Usage:
# just build-release ncl-sync
# just build-release orchestrator
# just build-release provisioning-vault-service
build-release TARGET:
    #!/usr/bin/env bash
    set -euo pipefail
    PLATFORM_DIR="{{provisioning_root}}/platform"
    TARGET="{{TARGET}}"
    INSTALL_DIR="${HOME}/.local/bin"

    # Resolve short name → (package, binary)
    # Convention: binary is always `provisioning-<target>` except for the host CLI itself.
    if [[ "$TARGET" == provisioning-* ]]; then
        SHORT="${TARGET#provisioning-}"
        BIN="$TARGET"
    else
        SHORT="$TARGET"
        BIN="provisioning-${TARGET}"
    fi

    # Most crate names match the short form (ncl-sync, orchestrator, ...).
    # Exceptions can be handled here if they appear.
    PACKAGE="$SHORT"

    echo "🔨 Building $BIN (package: $PACKAGE) in release mode..."
    cargo build --release \
        --manifest-path "$PLATFORM_DIR/Cargo.toml" \
        --package "$PACKAGE"

    SRC="$PLATFORM_DIR/target/release/$BIN"
    if [ ! -f "$SRC" ]; then
        echo "❌ Built binary not found: $SRC"
        echo " Check that package '$PACKAGE' defines [[bin]] name = \"$BIN\""
        exit 1
    fi
    echo "✅ Built: $SRC ($(du -h "$SRC" | cut -f1))"

    # Interactive install prompt. `read` returns non-zero at EOF (no terminal,
    # piped stdin); under `set -e` that would abort silently, so treat it as "no".
    REPLY=""
    read -p "Install to $INSTALL_DIR/$BIN and restart? [y/N] " -n 1 -r REPLY || true
    echo
    if [[ ! "$REPLY" =~ ^[Yy]$ ]]; then
        echo "↷ Skipped install. Binary available at: $SRC"
        exit 0
    fi

    mkdir -p "$INSTALL_DIR"

    # Stop running instance before replacing the binary.
    # The pattern is anchored to argv[0]: an optional directory prefix, the
    # binary name, then a space or end of line. A bare `-f "$BIN"` would match
    # any command line that merely contains the name, including this very
    # `just build-release provisioning-…` invocation, and would kill it.
    PROC_PATTERN="^([^ ]*/)?${BIN}( |\$)"
    if pgrep -f "$PROC_PATTERN" > /dev/null 2>&1; then
        echo "⏹ Stopping running $BIN..."
        pkill -f "$PROC_PATTERN" || true
        sleep 1
        # Verify it's gone (send SIGKILL if still running)
        if pgrep -f "$PROC_PATTERN" > /dev/null 2>&1; then
            echo " still running — SIGKILL"
            pkill -9 -f "$PROC_PATTERN" || true
            sleep 1
        fi
    fi

    install -m 0755 "$SRC" "$INSTALL_DIR/$BIN"
    echo "✅ Installed: $INSTALL_DIR/$BIN"
    echo " Run: $BIN --help to verify"

475
justfiles/crate.just Normal file
View file

@ -0,0 +1,475 @@
# Platform crate — build, install, asset deployment
# ===================================================
# Generic recipes that work for any service binary in the Rust workspace.
# Accepts short aliases and auto-resolves to full package + binary names.
#
# Short alias → cargo package / output binary
# daemon → provisioning-daemon / provisioning-daemon
# orch → orchestrator / provisioning-orchestrator
# cc → control-center / provisioning-control-center
# vault → vault-service / provisioning-vault-service
# ai → ai-service / provisioning-ai-service
# mcp → provisioning-mcp / provisioning-mcp-server
# ncl-sync → ncl-sync / provisioning-ncl-sync
# tool → provisioning-tool / provisioning-tool
# rag → platform-rag / provisioning-rag
# cli → prvng-cli / prvng-cli
#
# Standalone crates (excluded from workspace — build via crate-level Cargo.toml):
# nu-daemon → nu-daemon / provisioning-nu-daemon [standalone]
#
# _resolve returns 3 fields: PKG BIN STANDALONE
# STANDALONE="standalone" → cargo build uses crates/<PKG>/Cargo.toml directly
# STANDALONE="" → cargo build uses workspace Cargo.toml -p <PKG>
#
# Usage:
# just crate-build # interactive picker
# just crate-build daemon # short alias
# just crate-install daemon # install binary only
# just crate-assets daemon # copy templates/assets
# just crate-deploy daemon # build → install → assets
# just crate-deploy nu-daemon # standalone build (excluded from workspace)
alias cdy := crate-deploy
_pf_root := parent_directory(source_file()) / ".." / "platform"
_pf_bin_dir := env_var_or_default("HOME", "/tmp") / ".local" / "bin"
_pf_data := env_var_or_default("HOME", "/tmp") / ".local" / "share" / "provisioning"
# ─── 1. Build ──────────────────────────────────────────────────────────────────
# Build a platform crate in release mode.
#
# TARGET is a short alias or full package name; when omitted, a numbered menu
# is shown. An invalid menu choice or an unknown target aborts the recipe
# before cargo is invoked.
[doc("Build a platform crate release binary. Omit TARGET for interactive picker.")]
crate-build target="":
    #!/usr/bin/env bash
    set -euo pipefail
    PLATFORM="{{_pf_root}}"

    # Print a numbered menu on stderr, read a choice, and echo the chosen
    # target on stdout. Returns non-zero on empty, non-numeric or out-of-range
    # input (previously an empty answer silently selected the last entry).
    _pick_target() {
        local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon)
        echo "Available targets:" >&2
        local i=1
        for c in "${choices[@]}"; do
            printf ' %2d) %s\n' "$i" "$c" >&2
            i=$((i+1))
        done
        local n=""
        read -rp "Select [1-${#choices[@]}]: " n || true
        if [[ ! "$n" =~ ^[1-9][0-9]*$ ]] || (( n > ${#choices[@]} )); then
            echo "error: invalid selection '${n}'" >&2
            return 1
        fi
        printf '%s\n' "${choices[$((n-1))]}"
    }

    # Map an alias to "PKG BIN [standalone]"; unknown names return non-zero.
    _resolve() {
        case "$1" in
            daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;;
            orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;;
            cc|control-center|control) echo "control-center provisioning-control-center" ;;
            vault|vault-service) echo "vault-service provisioning-vault-service" ;;
            ai|ai-service) echo "ai-service provisioning-ai-service" ;;
            mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;;
            ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;;
            tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;;
            rag|platform-rag) echo "platform-rag provisioning-rag" ;;
            cli|prvng-cli) echo "prvng-cli prvng-cli" ;;
            nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;;
            *) echo "error: unknown target '$1'" >&2; return 1 ;;
        esac
    }

    INPUT="{{target}}"
    if [[ -z "$INPUT" ]]; then
        INPUT="$(_pick_target)" || exit 1
    fi
    [[ -z "$INPUT" ]] && { echo "error: no target selected" >&2; exit 1; }

    # Resolve in a plain assignment so a failure is visible: a failing command
    # substitution inside a here-string does not stop the script.
    RESOLVED="$(_resolve "$INPUT")" || exit 1
    read -r PKG BIN STANDALONE <<< "$RESOLVED"

    if [[ "$STANDALONE" == "standalone" ]]; then
        echo "=== build: cargo build --release (standalone: crates/${PKG}) ==="
        cargo build --release \
            --manifest-path "${PLATFORM}/crates/${PKG}/Cargo.toml"
        BUILT="${PLATFORM}/crates/${PKG}/target/release/${BIN}"
    else
        echo "=== build: cargo build --release -p ${PKG} ==="
        cargo build --release \
            --manifest-path "${PLATFORM}/Cargo.toml" \
            -p "${PKG}"
        BUILT="${PLATFORM}/target/release/${BIN}"
    fi

    # Size is informational only; "?" is shown when it cannot be determined.
    SIZE=$(du -sh "$BUILT" 2>/dev/null | cut -f1 || echo "?")
    echo " ok: ${BUILT} (${SIZE})"
# ─── 2. Install ────────────────────────────────────────────────────────────────
# Install the release binary into the directory held by `_pf_bin_dir`.
# The release binary must already exist; run crate-build first.
#
# TARGET is a short alias or full package name; when omitted, a numbered menu
# is shown. An invalid menu choice or an unknown target aborts the recipe.
[doc("Install a platform crate binary locally. Run crate-build first.")]
crate-install target="":
    #!/usr/bin/env bash
    set -euo pipefail
    PLATFORM="{{_pf_root}}"
    BIN_DIR="{{_pf_bin_dir}}"

    # Print a numbered menu on stderr, read a choice, and echo the chosen
    # target on stdout. Returns non-zero on empty, non-numeric or out-of-range
    # input (previously an empty answer silently selected the last entry).
    _pick_target() {
        local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon)
        echo "Available targets:" >&2
        local i=1
        for c in "${choices[@]}"; do
            printf ' %2d) %s\n' "$i" "$c" >&2
            i=$((i+1))
        done
        local n=""
        read -rp "Select [1-${#choices[@]}]: " n || true
        if [[ ! "$n" =~ ^[1-9][0-9]*$ ]] || (( n > ${#choices[@]} )); then
            echo "error: invalid selection '${n}'" >&2
            return 1
        fi
        printf '%s\n' "${choices[$((n-1))]}"
    }

    # Map an alias to "PKG BIN [standalone]"; unknown names return non-zero.
    _resolve() {
        case "$1" in
            daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;;
            orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;;
            cc|control-center|control) echo "control-center provisioning-control-center" ;;
            vault|vault-service) echo "vault-service provisioning-vault-service" ;;
            ai|ai-service) echo "ai-service provisioning-ai-service" ;;
            mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;;
            ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;;
            tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;;
            rag|platform-rag) echo "platform-rag provisioning-rag" ;;
            cli|prvng-cli) echo "prvng-cli prvng-cli" ;;
            nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;;
            *) echo "error: unknown target '$1'" >&2; return 1 ;;
        esac
    }

    INPUT="{{target}}"
    if [[ -z "$INPUT" ]]; then
        INPUT="$(_pick_target)" || exit 1
    fi
    [[ -z "$INPUT" ]] && { echo "error: no target selected" >&2; exit 1; }

    # Resolve in a plain assignment so a failure is visible: a failing command
    # substitution inside a here-string does not stop the script, which used to
    # end in a misleading "release binary not found" for an unknown target.
    RESOLVED="$(_resolve "$INPUT")" || exit 1
    read -r PKG BIN STANDALONE <<< "$RESOLVED"

    # Standalone crates have their own target/ directory next to their Cargo.toml.
    if [[ "$STANDALONE" == "standalone" ]]; then
        BUILT="${PLATFORM}/crates/${PKG}/target/release/${BIN}"
    else
        BUILT="${PLATFORM}/target/release/${BIN}"
    fi

    if [[ ! -f "$BUILT" ]]; then
        echo "error: release binary not found: ${BUILT}" >&2
        echo " run: just crate-build ${INPUT}" >&2
        exit 1
    fi

    echo "=== install: ${BIN_DIR}/${BIN} ==="
    mkdir -p "${BIN_DIR}"
    install -m 0755 "${BUILT}" "${BIN_DIR}/${BIN}"
    echo " ok: ${BIN_DIR}/${BIN}"

    # Emit PATH hint if the bin dir is not in PATH. The hint names the directory
    # the binary was actually installed into (it previously pointed at
    # ~/.local/share/provisioning/bin, where nothing is installed).
    if ! echo "$PATH" | tr ':' '\n' | grep -qxF "${BIN_DIR}"; then
        echo " hint: add to PATH: export PATH=\"${BIN_DIR}:\$PATH\""
    fi
# ─── 3. Assets ─────────────────────────────────────────────────────────────────
# Copy templates and static assets to their install destinations.
#
# Destination is read from the crate's NCL service config (plain-text sed match on the raw file).
# If the configured path is inside the source tree the step is skipped — the
# daemon already reads directly from there and no copy is needed.
# Falls back to the default convention when NCL is unavailable or the field
# is not set.
[doc("Install templates and assets for a platform crate.")]
crate-assets target="":
    #!/usr/bin/env bash
    # Syncs the template/asset directories of TARGET to their install
    # destinations. Only provisioning-daemon has file-system assets; every
    # other target just prints a notice. With an empty TARGET a menu is shown.
    set -euo pipefail
    PLATFORM="{{_pf_root}}"
    DATA="{{_pf_data}}"

    # Print a numbered menu on stderr and the chosen target name on stdout.
    # Returns 1 on an empty, non-numeric or out-of-range answer.
    _pick_target() {
        local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon)
        echo "Available targets:" >&2
        local i=1
        for c in "${choices[@]}"; do
            printf ' %2d) %s\n' "$i" "$c" >&2
            i=$((i+1))
        done
        local n=""
        # EOF on stdin leaves n empty; the validation below rejects it.
        read -rp "Select [1-${#choices[@]}]: " n || true
        # Without this check an empty answer or "0" would index choices[-1]
        # and silently select the last entry.
        if [[ ! "$n" =~ ^[1-9][0-9]*$ ]] || (( n > ${#choices[@]} )); then
            echo "error: invalid selection '${n}'" >&2
            return 1
        fi
        printf '%s\n' "${choices[$((n-1))]}"
    }

    # Map a target name or alias to "<package> <binary> [standalone]".
    # Returns 1 (message on stderr) for an unknown target.
    _resolve() {
        case "$1" in
            daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;;
            orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;;
            cc|control-center|control) echo "control-center provisioning-control-center" ;;
            vault|vault-service) echo "vault-service provisioning-vault-service" ;;
            ai|ai-service) echo "ai-service provisioning-ai-service" ;;
            mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;;
            ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;;
            tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;;
            rag|platform-rag) echo "platform-rag provisioning-rag" ;;
            cli|prvng-cli) echo "prvng-cli prvng-cli" ;;
            nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;;
            *) echo "error: unknown target '$1'" >&2; return 1 ;;
        esac
    }

    # Read a string field from the crate's NCL service config via sed.
    # Avoids nickel schema resolution issues — works on raw config text.
    # $1=service (config file stem) $2=field name.
    # Prints the first quoted value of `field = "..."`, or nothing. Always
    # returns 0 so callers can fall back to their default.
    _ncl_field() {
        local service="$1" field="$2"
        local ncl_cfg
        if [[ "$(uname -s)" == "Darwin" ]]; then
            ncl_cfg="${HOME}/Library/Application Support/provisioning/platform/config/${service}.ncl"
        else
            ncl_cfg="${HOME}/.config/provisioning/platform/config/${service}.ncl"
        fi
        # A bare `return` would propagate the failed test (status 1) and,
        # under set -e, abort the caller's assignment instead of falling back.
        [[ -f "$ncl_cfg" ]] || return 0
        # `|| true`: with pipefail a sed error or SIGPIPE would otherwise
        # abort the script; a failed lookup just means "use the default".
        sed -n "s/^[[:space:]]*${field}[[:space:]]*=[[:space:]]*\"\([^\"]*\)\".*/\1/p" \
            "$ncl_cfg" 2>/dev/null | head -1 || true
    }

    # Returns true if PATH is inside (or equal to) the source crate directory.
    # $1=path to test $2=crate source root.
    _is_source_tree() {
        local path="$1" crate_src="$2"
        local canon_path canon_src
        # An unresolvable crate dir cannot contain anything; an empty prefix
        # would otherwise match every path.
        canon_src="$(cd "$crate_src" 2>/dev/null && pwd)" || return 1
        canon_path="$(cd "$(dirname "$path")" 2>/dev/null && pwd)/$(basename "$path")"
        # Match on a path boundary so a sibling such as "<crate>-extra" does
        # not count as inside "<crate>".
        [[ "$canon_path" == "$canon_src" || "$canon_path" == "$canon_src"/* ]]
    }

    # Sync one asset group: SRC → DST, with source-tree detection.
    # $1=label $2=src_dir $3=dst_dir $4=crate_src_root
    # rsync --delete makes DST an exact mirror: extra files in DST are removed.
    _sync_group() {
        local label="$1" src="$2" dst="$3" crate_src="$4"
        if [[ ! -d "$src" ]]; then
            echo " skip [${label}]: source not found (${src})"
            return
        fi
        if _is_source_tree "$dst" "$crate_src"; then
            echo " skip [${label}]: destination is the source tree"
            echo " → ${dst}"
            echo " daemon reads directly from there — no copy needed"
            return
        fi
        echo " sync [${label}]: ${src}"
        echo " → ${dst}"
        mkdir -p "${dst}"
        rsync -a --delete "${src}/" "${dst}/"
        local count
        count=$(find "${dst}" -type f | wc -l | tr -d ' ')
        echo " ok: ${count} files"
    }

    INPUT="{{target}}"
    if [[ -z "$INPUT" ]]; then
        INPUT="$(_pick_target)" || exit 1
    fi
    [[ -z "$INPUT" ]] && { echo "error: no target selected" >&2; exit 1; }

    # Resolve in a plain assignment so an unknown target stops the recipe.
    RESOLVED="$(_resolve "$INPUT")" || exit 1
    read -r PKG _BIN _STANDALONE <<< "$RESOLVED"

    echo "=== crate-assets: ${PKG} ==="
    case "$PKG" in
        provisioning-daemon)
            CRATE_SRC="${PLATFORM}/crates/provisioning-daemon"
            # ui/templates — destination from NCL ui_templates_dir or convention
            UI_DST="$(_ncl_field "provisioning-daemon" "ui_templates_dir")"
            UI_DST="${UI_DST:-${DATA}/provisioning-daemon/ui/templates}"
            _sync_group "ui_templates" \
                "${CRATE_SRC}/ui/templates" \
                "${UI_DST}" \
                "${CRATE_SRC}"
            # ontology_templates — destination from NCL ontology_templates or convention
            ONT_DST="$(_ncl_field "provisioning-daemon" "ontology_templates")"
            ONT_DST="${ONT_DST:-${DATA}/provisioning-daemon/ontology-templates}"
            _sync_group "ontology_templates" \
                "${CRATE_SRC}/ontology_templates" \
                "${ONT_DST}" \
                "${CRATE_SRC}"
            ;;
        *)
            echo " ${PKG}: no file-system assets (uses embedded resources)"
            ;;
    esac
    echo "=== done ==="
# ─── 4. Deploy (chain) ─────────────────────────────────────────────────────────
# Stops the service if it is running, then build → install → assets in sequence, then starts the service. Stops on first failure.
# Usage: just crate-deploy [target]
[doc("Full deploy: build release → install binary → install assets.")]
crate-deploy target="":
    #!/usr/bin/env bash
    # Full redeploy of one platform crate:
    #   stop (if running) -> build -> install binary -> sync assets -> start.
    # The service is always started at the end, whether or not it was running
    # before. With an empty TARGET an interactive menu is shown.
    set -euo pipefail
    PLATFORM="{{_pf_root}}"
    BIN_DIR="{{_pf_bin_dir}}"
    DATA="{{_pf_data}}"

    # Print a numbered menu on stderr and the chosen target name on stdout.
    # Returns 1 on an empty, non-numeric or out-of-range answer.
    _pick_target() {
        local choices=(daemon orchestrator control-center vault-service ai-service provisioning-mcp ncl-sync provisioning-tool platform-rag prvng-cli nu-daemon)
        echo "Available targets:" >&2
        local i=1
        for c in "${choices[@]}"; do
            printf ' %2d) %s\n' "$i" "$c" >&2
            i=$((i+1))
        done
        local n=""
        # EOF on stdin leaves n empty; the validation below rejects it.
        read -rp "Select [1-${#choices[@]}]: " n || true
        # Without this check an empty answer or "0" would index choices[-1]
        # and silently deploy the last entry.
        if [[ ! "$n" =~ ^[1-9][0-9]*$ ]] || (( n > ${#choices[@]} )); then
            echo "error: invalid selection '${n}'" >&2
            return 1
        fi
        printf '%s\n' "${choices[$((n-1))]}"
    }

    # Map a target name or alias to "<package> <binary> [standalone]".
    # Returns 1 (message on stderr) for an unknown target.
    _resolve() {
        case "$1" in
            daemon|provisioning-daemon) echo "provisioning-daemon provisioning-daemon" ;;
            orch|orchestrator) echo "orchestrator provisioning-orchestrator" ;;
            cc|control-center|control) echo "control-center provisioning-control-center" ;;
            vault|vault-service) echo "vault-service provisioning-vault-service" ;;
            ai|ai-service) echo "ai-service provisioning-ai-service" ;;
            mcp|provisioning-mcp) echo "provisioning-mcp provisioning-mcp-server" ;;
            ncl-sync) echo "ncl-sync provisioning-ncl-sync" ;;
            tool|provisioning-tool) echo "provisioning-tool provisioning-tool" ;;
            rag|platform-rag) echo "platform-rag provisioning-rag" ;;
            cli|prvng-cli) echo "prvng-cli prvng-cli" ;;
            nu-daemon|nu_daemon) echo "nu-daemon provisioning-nu-daemon standalone" ;;
            *) echo "error: unknown target '$1'" >&2; return 1 ;;
        esac
    }

    # Print the first quoted value of `field = "..."` from the service's NCL
    # config file, or nothing. $1=service (file stem) $2=field name.
    # Always returns 0 so callers can fall back to their default.
    _ncl_field() {
        local service="$1" field="$2"
        local ncl_cfg
        if [[ "$(uname -s)" == "Darwin" ]]; then
            ncl_cfg="${HOME}/Library/Application Support/provisioning/platform/config/${service}.ncl"
        else
            ncl_cfg="${HOME}/.config/provisioning/platform/config/${service}.ncl"
        fi
        # A bare `return` would propagate the failed test (status 1) and,
        # under set -e, abort the caller's assignment instead of falling back.
        [[ -f "$ncl_cfg" ]] || return 0
        # `|| true`: with pipefail a sed error or SIGPIPE would otherwise
        # abort the script; a failed lookup just means "use the default".
        sed -n "s/^[[:space:]]*${field}[[:space:]]*=[[:space:]]*\"\([^\"]*\)\".*/\1/p" \
            "$ncl_cfg" 2>/dev/null | head -1 || true
    }

    # Returns true if $1 is inside (or equal to) the crate source dir $2.
    _is_source_tree() {
        local path="$1" crate_src="$2"
        local canon_path canon_src
        # An unresolvable crate dir cannot contain anything; an empty prefix
        # would otherwise match every path.
        canon_src="$(cd "$crate_src" 2>/dev/null && pwd)" || return 1
        canon_path="$(cd "$(dirname "$path")" 2>/dev/null && pwd)/$(basename "$path")"
        # Match on a path boundary so a sibling such as "<crate>-extra" does
        # not count as inside "<crate>".
        [[ "$canon_path" == "$canon_src" || "$canon_path" == "$canon_src"/* ]]
    }

    # Sync one asset group. $1=label $2=src_dir $3=dst_dir $4=crate_src_root
    # rsync --delete makes DST an exact mirror: extra files in DST are removed.
    _sync_group() {
        local label="$1" src="$2" dst="$3" crate_src="$4"
        if [[ ! -d "$src" ]]; then
            echo " skip [${label}]: source not found (${src})"
            return
        fi
        if _is_source_tree "$dst" "$crate_src"; then
            echo " skip [${label}]: destination is source tree"
            echo " → ${dst}"
            echo " service reads templates directly from there"
            return
        fi
        echo " sync [${label}]: ${src}"
        echo " → ${dst}"
        mkdir -p "${dst}"
        rsync -a --delete "${src}/" "${dst}/"
        local count
        count=$(find "${dst}" -type f | wc -l | tr -d ' ')
        echo " ok: ${count} files copied"
    }

    # Returns "running" or empty. $1 = binary name.
    # Match binary path: [/]<bin> followed by end-of-string OR a space (arguments).
    # The bare $ anchor fails when the process has CLI arguments like --config.
    _svc_status() {
        pgrep -f "[/]${1}( |$)" >/dev/null 2>&1 && echo "running" || true
    }

    INPUT="{{target}}"
    if [[ -z "$INPUT" ]]; then
        INPUT="$(_pick_target)" || exit 1
    fi
    [[ -z "$INPUT" ]] && { echo "error: no target selected" >&2; exit 1; }

    # Resolve in a plain assignment so an unknown target stops the recipe
    # before anything is stopped or built.
    RESOLVED="$(_resolve "$INPUT")" || exit 1
    read -r PKG BIN STANDALONE <<< "$RESOLVED"

    echo "=== crate-deploy: ${PKG} ==="
    echo ""

    # ── pre: stop service if running ───────────────────────────────────────────
    # Processes are matched by binary name (BIN); for most targets the package
    # name differs from the binary and would never match.
    if [[ "$(_svc_status "$BIN")" == "running" ]]; then
        echo " service ${PKG} is running — stopping before overwrite"
        provisioning platform stop "$PKG" >/dev/null 2>&1 || true
        sleep 2
        if [[ "$(_svc_status "$BIN")" == "running" ]]; then
            echo " warning: process still running"
        else
            echo " stopped: ok"
        fi
        echo ""
    fi

    # ── step 1: build ──────────────────────────────────────────────────────────
    echo "--- [1/3] build ---"
    if [[ "$STANDALONE" == "standalone" ]]; then
        cargo build --release \
            --manifest-path "${PLATFORM}/crates/${PKG}/Cargo.toml"
        BUILT="${PLATFORM}/crates/${PKG}/target/release/${BIN}"
    else
        cargo build --release \
            --manifest-path "${PLATFORM}/Cargo.toml" \
            -p "${PKG}"
        BUILT="${PLATFORM}/target/release/${BIN}"
    fi
    SIZE=$(du -sh "$BUILT" 2>/dev/null | cut -f1 || echo "?")
    echo " ok: ${BUILT} (${SIZE})"
    echo ""

    # ── step 2: install binary ─────────────────────────────────────────────────
    echo "--- [2/3] install ---"
    mkdir -p "${BIN_DIR}"
    DEST="${BIN_DIR}/${BIN}"
    # Only the label depends on whether a previous binary existed.
    if [[ -f "$DEST" ]]; then
        VERB="overwritten"
    else
        VERB="installed"
    fi
    install -m 0755 "${BUILT}" "${DEST}"
    echo " ${VERB}: ${DEST}"
    # Exact PATH component match without a pipeline, so pipefail and SIGPIPE
    # cannot produce a false hint.
    case ":${PATH}:" in
        *":${BIN_DIR}:"*) ;;
        *) echo " hint: add to PATH: export PATH=\"${BIN_DIR}:\$PATH\"" ;;
    esac
    echo ""

    # ── step 3: assets ─────────────────────────────────────────────────────────
    echo "--- [3/3] assets ---"
    case "$PKG" in
        provisioning-daemon)
            CRATE_SRC="${PLATFORM}/crates/provisioning-daemon"
            UI_DST="$(_ncl_field "provisioning-daemon" "ui_templates_dir")"
            UI_DST="${UI_DST:-${DATA}/provisioning-daemon/ui/templates}"
            _sync_group "ui_templates" "${CRATE_SRC}/ui/templates" "${UI_DST}" "${CRATE_SRC}"
            ONT_DST="$(_ncl_field "provisioning-daemon" "ontology_templates")"
            ONT_DST="${ONT_DST:-${DATA}/provisioning-daemon/ontology-templates}"
            _sync_group "ontology_templates" "${CRATE_SRC}/ontology_templates" "${ONT_DST}" "${CRATE_SRC}"
            ;;
        *)
            echo " ${PKG}: no file-system assets"
            ;;
    esac
    echo ""

    # ── post: always start (restart if was running, start if was stopped) ────────
    echo "--- start ---"
    provisioning platform start "$PKG" >/dev/null 2>&1 || true
    sleep 2
    LOG="${HOME}/.provisioning/logs/${PKG}.log"
    if [[ "$(_svc_status "$BIN")" == "running" ]]; then
        echo " started: ok"
        echo " logs: ${LOG}"
    else
        echo " warning: process not running after start"
        echo " check: ${LOG}"
    fi
    echo ""
    echo "=== done: ${BIN} deployed ==="

192
justfiles/daemon.just Normal file
View file

@ -0,0 +1,192 @@
# provisioning-daemon — build, install, and lifecycle recipes
# ==============================================================
# Targets the HTTP+NATS daemon (crates/provisioning-daemon).
# Port: 9014 (PROVISIONING_DAEMON_BIND env overrides).
#
# nu-daemon — standalone build (excluded from platform workspace due to rustls conflict)
# Port: 9095. Binary: provisioning-nu-daemon → $HOME/.local/bin/
# Paths are derived from the location of this justfile, not from the
# invocation directory.
_prov_root := parent_directory(source_file()) / ".." # provisioning/
_platform := _prov_root / "platform"
_scripts := _platform / "scripts"
# Binary name of the workspace daemon and its install directory
# (falls back to /usr/local/... when HOME is unset).
_bin := "provisioning-daemon"
_install := env_var_or_default("HOME", "/usr/local") / ".local" / "share" / "provisioning" / "bin"
# nu-daemon has its own crate directory, binary name and install directory.
_nu_daemon_crate := _platform / "crates" / "nu-daemon"
_nu_daemon_bin := "provisioning-nu-daemon"
_nu_daemon_install := env_var_or_default("HOME", "/usr/local") / ".local" / "bin"
# Log and pid file used by daemon-deploy.
_log := "/tmp/provisioning-daemon.log"
_pid := "/tmp/provisioning-daemon.pid"
# ── Build ──────────────────────────────────────────────────────────────────────
# Build provisioning-daemon in release mode
daemon-build:
    #!/usr/bin/env bash
    set -euo pipefail
    # Builds through the platform workspace manifest; the binary lands in the
    # workspace target/release directory.
    manifest="{{_platform}}/Cargo.toml"
    echo "=== build: cargo build --release --bin {{_bin}} ==="
    cargo build --release --manifest-path "$manifest" --bin "{{_bin}}"
    echo " binary: {{_platform}}/target/release/{{_bin}}"
# Build in dev mode (faster, larger binary)
daemon-build-dev:
    #!/usr/bin/env bash
    set -euo pipefail
    # Same as daemon-build without --release; output goes to target/debug.
    manifest="{{_platform}}/Cargo.toml"
    cargo build --manifest-path "$manifest" --bin "{{_bin}}"
    echo " binary: {{_platform}}/target/debug/{{_bin}}"
# Selects the provisioning-daemon package from the platform workspace.
# --lib restricts the run to the library target's tests.
# Run the library (--lib) tests for provisioning-daemon
daemon-test:
cargo test \
--manifest-path "{{_platform}}/Cargo.toml" \
-p provisioning-daemon \
--lib
# `-- -D warnings` turns every clippy warning into an error, so the recipe
# fails on any lint finding.
# Run clippy on provisioning-daemon
daemon-lint:
cargo clippy \
--manifest-path "{{_platform}}/Cargo.toml" \
-p provisioning-daemon \
-- -D warnings
# ── Install ────────────────────────────────────────────────────────────────────
# Install provisioning-daemon to ~/.local/share/provisioning/bin/
daemon-install: daemon-build
    #!/usr/bin/env bash
    set -euo pipefail
    # daemon-build runs first (recipe dependency), so the release binary exists.
    src="{{_platform}}/target/release/{{_bin}}"
    dest="{{_install}}/{{_bin}}"
    mkdir -p "{{_install}}"
    install -m 0755 "$src" "$dest"
    echo " installed: $dest"
# Build + install + restart in one step
daemon-deploy: daemon-install
    #!/usr/bin/env bash
    # Stops the daemon recorded in the pid file (if alive), starts the freshly
    # installed binary in the background, then polls the health endpoint for
    # up to ten seconds.
    set -euo pipefail
    echo "=== deploy: stopping existing daemon ==="
    if [ -f "{{_pid}}" ]; then
        PID=$(tr -d '[:space:]' < "{{_pid}}")
        if [ -n "$PID" ] && kill -0 "$PID" 2>/dev/null; then
            # The process may exit between the probe and the signal; that
            # must not abort the deploy under set -e.
            kill "$PID" 2>/dev/null || true
            sleep 1
        fi
        rm -f "{{_pid}}"
    fi
    echo "=== deploy: starting daemon ==="
    nohup "{{_install}}/{{_bin}}" >> "{{_log}}" 2>&1 &
    echo $! > "{{_pid}}"
    echo " PID: $(cat "{{_pid}}") log: {{_log}}"
    # wait for health
    healthy=false
    for _ in $(seq 1 10); do
        if curl -sf http://127.0.0.1:9014/health > /dev/null 2>&1; then
            healthy=true
            break
        fi
        sleep 1
    done
    if [ "$healthy" = true ]; then
        echo " health: ok"
    else
        # Previously the loop ended silently, making a dead daemon look like
        # a successful deploy.
        echo " warning: daemon not healthy after 10s — check {{_log}}" >&2
    fi
# ── Lifecycle ──────────────────────────────────────────────────────────────────
# All lifecycle recipes delegate to the same Nushell script and pass the
# action (start/stop/status/restart/logs) as its only argument.
# Start daemon in background (uses installed binary or release build)
daemon-start:
nu "{{_scripts}}/start-provisioning-daemon.nu" start
# Stop running daemon
daemon-stop:
nu "{{_scripts}}/start-provisioning-daemon.nu" stop
# Show daemon status and health
daemon-status:
nu "{{_scripts}}/start-provisioning-daemon.nu" status
# Restart daemon
daemon-restart:
nu "{{_scripts}}/start-provisioning-daemon.nu" restart
# Tail daemon logs
daemon-logs:
nu "{{_scripts}}/start-provisioning-daemon.nu" logs
# ── Dev helpers ────────────────────────────────────────────────────────────────
# Run daemon in foreground with debug logging (dev mode)
daemon-run:
    #!/usr/bin/env bash
    set -euo pipefail
    # RUST_LOG is exported for the cargo child process only; this script ends
    # with the cargo invocation.
    export RUST_LOG=debug
    cargo run --manifest-path "{{_platform}}/Cargo.toml" --bin "{{_bin}}"
# Run daemon with custom bind address
daemon-run-bind bind="0.0.0.0:9014":
    #!/usr/bin/env bash
    set -euo pipefail
    # Everything after `--` is passed to the daemon, not to cargo.
    export RUST_LOG=info
    manifest="{{_platform}}/Cargo.toml"
    cargo run --manifest-path "$manifest" --bin "{{_bin}}" -- --bind "{{bind}}"
# The helpers below target 127.0.0.1:9014 literally; a daemon started on a
# different bind address is not reached. curl -sf is silent and exits
# non-zero on HTTP errors; responses are piped through jq.
# Query health endpoint
daemon-health:
curl -sf http://127.0.0.1:9014/health | jq .
# List registered tools via daemon API
daemon-tools:
curl -sf http://127.0.0.1:9014/api/v1/tools | jq '.tools[] | {name, category}'
# List ontology templates (requires JWT or solo mode)
daemon-ontology-list:
curl -sf http://127.0.0.1:9014/api/v1/ontology/templates | jq .
# NOTE(review): `open` is presumably the macOS launcher; other platforms
# would need a different command — confirm.
# Open admin UI in browser
daemon-ui:
open http://127.0.0.1:9014/admin/
# ── Watch (live-reload on config change) ──────────────────────────────────────
# Run daemon with config watcher on default paths
daemon-watch paths="":
    #!/usr/bin/env bash
    set -euo pipefail
    # PROVISIONING_WATCH_PATHS from the environment wins over the recipe
    # argument when it is set and non-empty.
    WATCH_PATHS="${PROVISIONING_WATCH_PATHS:-{{paths}}}"
    WATCH_FLAG=""
    [ -z "$WATCH_PATHS" ] || WATCH_FLAG="--watch-paths $WATCH_PATHS"
    export RUST_LOG=info
    # WATCH_FLAG is intentionally unquoted: it expands to nothing when empty
    # and is word-split into the flag and its value(s) otherwise.
    cargo run --manifest-path "{{_platform}}/Cargo.toml" --bin "{{_bin}}" -- $WATCH_FLAG
# ── nu-daemon (standalone — excluded from platform workspace) ─────────────────
# Build nu-daemon in release mode (must build standalone, not via workspace)
nu-daemon-build:
    #!/usr/bin/env bash
    set -euo pipefail
    # Uses the crate's own manifest, so output goes to the crate-local target
    # directory rather than the workspace one.
    crate="{{_nu_daemon_crate}}"
    echo "=== build: cargo build --release (standalone) ==="
    cargo build --release --manifest-path "${crate}/Cargo.toml"
    echo " binary: ${crate}/target/release/{{_nu_daemon_bin}}"
# Install nu-daemon to $HOME/.local/bin/
nu-daemon-install: nu-daemon-build
    #!/usr/bin/env bash
    set -euo pipefail
    # nu-daemon-build runs first (recipe dependency), so the binary exists.
    src="{{_nu_daemon_crate}}/target/release/{{_nu_daemon_bin}}"
    dest="{{_nu_daemon_install}}/{{_nu_daemon_bin}}"
    mkdir -p "{{_nu_daemon_install}}"
    install -m 0755 "$src" "$dest"
    echo " installed: $dest"
# All work happens in the nu-daemon-install dependency (which itself depends
# on nu-daemon-build); this recipe only prints a confirmation line.
# Build + install in one step
nu-daemon-deploy: nu-daemon-install
@echo " deployed: {{_nu_daemon_bin}} → {{_nu_daemon_install}}"

302
justfiles/distro.just Normal file
View file

@ -0,0 +1,302 @@
# Distro Module - Local Distribution Installation
# ==================================================
# Build, package, and install provisioning binaries locally
# Integrates with package.just for distribution management
# ============================================================================
# Distro Module Configuration
# ============================================================================
# `version` and `provisioning_root` are not defined in this module; they are
# expected to come from the importing justfile.
distro_version := version
# Release output directory of the platform cargo workspace.
distro_build_dir := provisioning_root / "platform" / "target" / "release"
# Install destination; the PROVISIONING_INSTALL_DIR environment variable
# overrides the ~/.local/bin default.
distro_install_dir := env_var_or_default("PROVISIONING_INSTALL_DIR", home_dir() / ".local" / "bin")
# Config directories (platform-specific, macOS vs Linux)
distro_config_dir := if os() == "macos" {
home_dir() / "Library" / "Application Support" / "provisioning"
} else {
home_dir() / ".config" / "provisioning"
}
# Space-separated list; the install/uninstall/verify recipes iterate over it
# with shell word splitting.
# Core provisioning binaries from platform workspace
provisioning_binaries := "provisioning-ai-service provisioning-extension-registry provisioning-vault-service provisioning-rag provisioning-daemon provisioning-control-center provisioning-orchestrator provisioning-mcp-server provisioning-detector"
# ============================================================================
# Help and Information
# ============================================================================
# The install directory is taken from the PROVISIONING_INSTALL_DIR environment
# variable, so the override example sets it in the environment in front of
# `just` (distro-install accepts no arguments).
# Show distro module help
@distro-help:
    echo "📦 DISTRIBUTION & LOCAL INSTALLATION MODULE"
    echo "==========================================="
    echo ""
    echo "🏗️ BUILD FOR INSTALLATION"
    echo " distro-build-release Build all platform binaries (release)"
    echo " distro-build-debug Build all platform binaries (debug)"
    echo ""
    echo "💾 LOCAL INSTALLATION"
    echo " distro-install Install binaries to ~/.local/bin"
    echo " distro-install-system Install to /usr/local/bin (requires sudo)"
    echo " distro-uninstall Remove installed binaries"
    echo ""
    echo "🔍 VERIFICATION & MANAGEMENT"
    echo " distro-verify Verify installation integrity"
    echo " distro-list List installed binaries"
    echo " distro-info Show installation information"
    echo ""
    echo "📦 DISTRIBUTION PACKAGES"
    echo " distro-package Create distribution package (uses package module)"
    echo " distro-checksums Generate SHA256 checksums"
    echo ""
    echo "🧹 CLEANUP"
    echo " distro-clean Clean build artifacts"
    echo ""
    echo "INSTALLATION DIRECTORY: {{distro_install_dir}}"
    echo "CONFIG DIRECTORY: {{distro_config_dir}}"
    echo ""
    echo "EXAMPLES:"
    echo " just distro-build-release && just distro-install"
    echo " PROVISIONING_INSTALL_DIR=/usr/local/bin just distro-install"
    echo " just distro-list"
# ============================================================================
# Build Recipes
# ============================================================================
# Builds every workspace member (--workspace) with the release profile (-r)
# from inside the platform directory. The leading @ suppresses command echo.
# Build all platform binaries (release mode)
@distro-build-release:
echo "🔨 Building provisioning binaries (release)..."
cd {{provisioning_root}}/platform && {{cargo}} build -r --workspace
echo "✅ Binaries built: {{distro_build_dir}}"
# Same as distro-build-release without -r, i.e. the default (debug) profile.
# Build all platform binaries (debug mode)
@distro-build-debug:
echo "🔨 Building provisioning binaries (debug)..."
cd {{provisioning_root}}/platform && {{cargo}} build --workspace
echo "✅ Debug binaries built"
# ============================================================================
# Installation Recipes
# ============================================================================
# Install binaries locally (default: ~/.local/bin)
distro-install:
    #!/usr/bin/env bash
    # Copies every binary named in provisioning_binaries from the release build
    # directory into the install directory. Missing or uncopyable binaries are
    # counted and reported but do not abort the run.
    INSTALL_DIR="{{distro_install_dir}}"
    BUILD_DIR="{{distro_build_dir}}"
    echo "📦 Installing provisioning binaries to: $INSTALL_DIR"
    mkdir -p "$INSTALL_DIR" || { echo "✗ Failed to create install directory"; exit 1; }
    echo ""
    echo "📂 Installing binaries..."
    binaries_installed=0
    binaries_failed=0
    for binary in {{provisioning_binaries}}; do
        SRC="$BUILD_DIR/$binary"
        if [ -f "$SRC" ]; then
            if cp "$SRC" "$INSTALL_DIR/$binary" 2>/dev/null && chmod +x "$INSTALL_DIR/$binary" 2>/dev/null; then
                echo " ✓ $binary"
                binaries_installed=$((binaries_installed + 1))
            else
                echo " ✗ $binary (copy failed)"
                binaries_failed=$((binaries_failed + 1))
            fi
        else
            echo " ✗ $binary (not found at $SRC)"
            binaries_failed=$((binaries_failed + 1))
        fi
    done
    echo ""
    echo "📊 Installation Summary"
    echo "====================="
    echo "Install directory: $INSTALL_DIR"
    echo "Binaries installed: $binaries_installed"
    [ "$binaries_failed" -gt 0 ] && echo "Binaries failed: $binaries_failed"
    # Check if install dir is in PATH. Compare whole PATH components: a plain
    # grep treats the directory as a regex and also matches longer paths that
    # merely contain it.
    case ":${PATH}:" in
        *":${INSTALL_DIR}:"*) ;;
        *)
            echo ""
            echo "⚠️ $INSTALL_DIR is not in your PATH"
            echo "Add to ~/.bashrc or ~/.zshrc:"
            echo " export PATH=\"\$PATH:$INSTALL_DIR\""
            ;;
    esac
    echo ""
    echo "✅ Installation complete!"
    echo ""
    echo "Verify installation:"
    echo " $INSTALL_DIR/provisioning-ai-service --version"
# Install to system directory (requires sudo)
distro-install-system:
    #!/usr/bin/env bash
    # Copies each built binary into /usr/local/bin via sudo. Failures are
    # reported per binary; the loop always runs to the end.
    INSTALL_DIR="/usr/local/bin"
    BUILD_DIR="{{distro_build_dir}}"
    echo "🔐 Installing provisioning binaries to: $INSTALL_DIR (requires sudo)"
    for binary in {{provisioning_binaries}}; do
        built="$BUILD_DIR/$binary"
        target="$INSTALL_DIR/$binary"
        if [ ! -f "$built" ]; then
            echo " ✗ $binary (not found)"
            continue
        fi
        echo " Installing: $binary"
        if ! { sudo cp "$built" "$target" && sudo chmod +x "$target"; }; then
            echo " ✗ $binary (installation failed)"
            continue
        fi
        echo " ✓ $binary"
    done
    echo "✅ System installation complete!"
# Uninstall binaries
distro-uninstall:
    #!/usr/bin/env bash
    # Removes each listed binary from the install directory; binaries that
    # are not present are skipped silently.
    set -e
    INSTALL_DIR="{{distro_install_dir}}"
    echo "🗑️ Uninstalling provisioning binaries from: $INSTALL_DIR"
    for binary in {{provisioning_binaries}}; do
        installed="$INSTALL_DIR/$binary"
        [ -f "$installed" ] || continue
        rm "$installed"
        echo " ✓ Removed $binary"
    done
    echo "✅ Uninstallation complete!"
# ============================================================================
# Verification & Information
# ============================================================================
# Verify installation integrity
distro-verify:
    #!/usr/bin/env bash
    # Checks that every listed binary exists as an executable regular file in
    # the install directory; exits 1 when at least one is missing.
    INSTALL_DIR="{{distro_install_dir}}"
    echo "🔍 Verifying installation in: $INSTALL_DIR"
    echo ""
    found=0
    missing=0
    for binary in {{provisioning_binaries}}; do
        candidate="$INSTALL_DIR/$binary"
        if [[ -f "$candidate" && -x "$candidate" ]]; then
            echo " ✓ $binary"
            found=$((found + 1))
            continue
        fi
        echo " ✗ $binary (not found or not executable)"
        missing=$((missing + 1))
    done
    echo ""
    echo "📊 Summary: $found found, $missing missing"
    if (( missing > 0 )); then
        echo "⚠️ Some binaries are missing!"
        exit 1
    fi
    echo "✅ Installation verified!"
# List installed binaries
distro-list:
    #!/usr/bin/env bash
    # Long-lists every provisioning-* file in the install directory, or prints
    # a notice when there is none.
    INSTALL_DIR="{{distro_install_dir}}"
    echo "📋 Installed provisioning binaries in: $INSTALL_DIR"
    echo ""
    # Test ls itself: piping it into head made the condition always true
    # (the pipeline status is head's), so the fallback was unreachable.
    if ls "$INSTALL_DIR"/provisioning-* > /dev/null 2>&1; then
        ls -lh "$INSTALL_DIR"/provisioning-*
    else
        echo "No provisioning binaries found"
    fi
# Show installation information
distro-info:
    #!/usr/bin/env bash
    # Prints the install directory, how many provisioning-* files it holds and
    # whether the directory is a PATH component.
    INSTALL_DIR="{{distro_install_dir}}"
    echo "📦 Provisioning Installation Information"
    echo "========================================"
    echo ""
    echo "Installation Directory: $INSTALL_DIR"
    echo ""
    echo "Environment Variables:"
    echo " PROVISIONING_INSTALL_DIR={{distro_install_dir}}"
    echo ""
    echo "Status:"
    if [ -d "$INSTALL_DIR" ]; then
        echo " ✓ Install dir exists"
        # tr strips the padding some wc implementations put before the number.
        count=$(ls "$INSTALL_DIR"/provisioning-* 2>/dev/null | wc -l | tr -d ' ')
        echo " Binaries installed: $count"
    else
        echo " ✗ Install dir not found"
    fi
    echo ""
    echo "In PATH:"
    # Compare whole PATH components: a plain grep treats the directory as a
    # regex and also matches longer paths that merely contain it.
    case ":${PATH}:" in
        *":${INSTALL_DIR}:"*) echo " ✓ Install directory is in PATH" ;;
        *) echo " ✗ Install directory is NOT in PATH" ;;
    esac
# ============================================================================
# Packaging Recipes
# ============================================================================
# Thin wrapper: re-invokes just with the package-all recipe, which is defined
# outside this module.
# Create distribution package (delegates to package module)
@distro-package:
echo "📦 Creating distribution package..."
echo " (delegates to package module)"
just package-all
# Generate checksums for distribution
@distro-checksums:
    #!/usr/bin/env bash
    # Writes a <file>.sha256 next to every *.tar.gz and *.zip in the packages
    # directory. Exits 1 when the directory or a SHA-256 tool is missing.
    set -e
    PACKAGES_DIR="{{packages_dir}}"
    if [ ! -d "$PACKAGES_DIR" ]; then
        echo "✗ Packages directory not found: $PACKAGES_DIR"
        exit 1
    fi
    # sha256sum is not available everywhere; shasum -a 256 writes the same
    # "<digest>  <file>" format.
    if command -v sha256sum > /dev/null 2>&1; then
        hasher=(sha256sum)
    elif command -v shasum > /dev/null 2>&1; then
        hasher=(shasum -a 256)
    else
        echo "✗ No SHA256 tool found (sha256sum or shasum)"
        exit 1
    fi
    echo "🔐 Generating SHA256 checksums..."
    cd "$PACKAGES_DIR"
    count=0
    # A redirection is not allowed in a for word list (bash rejects it as a
    # syntax error); an unmatched glob stays literal and is skipped below.
    for file in *.tar.gz *.zip; do
        [ -f "$file" ] || continue
        echo " Checksumming: $file"
        "${hasher[@]}" "$file" > "${file}.sha256"
        # Not ((count++)): that returns status 1 when count is 0 and would
        # abort the script under set -e after the first file.
        count=$((count + 1))
    done
    echo "✅ Generated $count checksums"
# ============================================================================
# Cleanup
# ============================================================================
# Runs `cargo clean` inside the platform workspace, which removes its whole
# target directory.
# Clean build artifacts
@distro-clean:
echo "🧹 Cleaning distro artifacts..."
cd {{provisioning_root}}/platform && {{cargo}} clean
echo "✅ Clean complete"

259
justfiles/docker.just Normal file
View file

@ -0,0 +1,259 @@
# Docker Module - Container image building and management (Nickel-native)
# =========================================================================
# Source of truth: Nickel templates in schemas/platform/templates/docker/
# Dockerfiles are GENERATED on-demand, NOT tracked in git
# docker-gen takes MODE before SERVICES and docker-build takes MODE and
# REGISTRY before SERVICES, so the examples spell out the leading positional
# arguments; a lone service name would be bound to MODE.
# Show detailed docker help
@docker-help:
    echo "🐳 DOCKER MODULE HELP (Nickel-Native Build System)"
    echo "=================================================="
    echo ""
    echo "This module uses Nickel templates + cargo-chef for optimized Docker builds:"
    echo "• Dockerfiles generated on-demand from Nickel schemas"
    echo "• 4-stage builds: PLANNER → CACHER → BUILDER → RUNTIME"
    echo "• 60-80% build time reduction via dependency caching"
    echo "• BuildKit cache modes: local, registry, inline"
    echo "• Mode-specific tuning: solo, cicd, enterprise"
    echo ""
    echo "SERVICES:"
    echo " orchestrator - Workflow engine and task queue"
    echo " control-center - Policy and RBAC management"
    echo " mcp-server - AI/LLM integration"
    echo " extension-registry - Plugin management"
    echo " provisioning-daemon - System daemon"
    echo " ai-service - AI service integration"
    echo " rag - Retrieval augmented generation"
    echo " vault-service - Secret management"
    echo ""
    echo "RECIPES:"
    echo " docker-gen Generate Dockerfiles from Nickel templates"
    echo " docker-gen-compose Generate docker-compose.build.yml"
    echo " docker-build SERVICES Build service(s) with auto-generation"
    echo " docker-build-all Build all services with BuildKit cache"
    echo " docker-clean-gen Remove generated Dockerfiles"
    echo " image-list Show available services"
    echo " image-validate Validate all built images"
    echo " image-clean Remove all provisioning images"
    echo ""
    echo "EXAMPLES:"
    echo " just docker-gen solo orchestrator # Generate Dockerfile for orchestrator"
    echo " just docker-gen-compose # Generate docker-compose.build.yml"
    echo " just docker-build solo localhost:5000 orchestrator # Build orchestrator (auto-generates Dockerfile)"
    echo " just docker-build-all # Build all services with parallel BuildKit"
    echo " just docker-clean-gen # Remove all generated Dockerfiles"
    echo ""
    echo "NOTES:"
    echo " • Dockerfiles are NOT tracked in git (source = Nickel templates)"
    echo " • docker-build auto-generates Dockerfiles before building"
    echo " • Change base images in schemas/platform/docker-build.ncl"
# Generate Dockerfiles from Nickel templates (on-demand)
docker-gen MODE='solo' +SERVICES='all':
    #!/usr/bin/env bash
    # SERVICES equal to "all" triggers one generator call for everything;
    # otherwise the generator runs once per listed service.
    cd {{provisioning_root}}
    case "{{SERVICES}}" in
        all)
            echo "🏗️ Generating Dockerfiles for all services (mode: {{MODE}})..."
            {{nu}} scripts/docker-generate-builds.nu all --mode {{MODE}}
            ;;
        *)
            echo "🏗️ Generating Dockerfiles for: {{SERVICES}} (mode: {{MODE}})..."
            for service in {{SERVICES}}; do
                {{nu}} scripts/docker-generate-builds.nu "$service" --mode {{MODE}}
            done
            ;;
    esac
# Generate docker-compose.build.yml from Nickel template
docker-gen-compose REGISTRY='localhost:5000':
    #!/usr/bin/env bash
    # Runs the compose generator from the provisioning root, passing the
    # registry through unchanged.
    cd {{provisioning_root}}
    registry={{REGISTRY}}
    echo "🏗️ Generating docker-compose.build.yml (registry: {{REGISTRY}})..."
    {{nu}} scripts/docker-generate-compose.nu --registry $registry
# Build Docker images with auto-generation and BuildKit cache
docker-build MODE='solo' REGISTRY='localhost:5000' +SERVICES='':
    #!/usr/bin/env bash
    # With SERVICES given, only those are built; with none, the build script
    # is called with --all.
    cd {{provisioning_root}}
    if [ -n "{{SERVICES}}" ]; then
        echo "🐳 Building service(s): {{SERVICES}} (mode: {{MODE}})..."
        {{nu}} scripts/docker-build.nu {{SERVICES}} --mode {{MODE}} --registry {{REGISTRY}}
    else
        echo "🐳 Building all services (mode: {{MODE}})..."
        {{nu}} scripts/docker-build.nu --all --mode {{MODE}} --registry {{REGISTRY}}
    fi
# Build all services with BuildKit parallel builds
docker-build-all MODE='solo' REGISTRY='localhost:5000':
    #!/usr/bin/env bash
    # Always passes --all to the build script, with the given mode and registry.
    cd {{provisioning_root}}
    printf '%s\n' "🐳 Building all platform services (mode: {{MODE}})..."
    {{nu}} scripts/docker-build.nu --all --mode {{MODE}} --registry {{REGISTRY}}
# Remove all generated Dockerfiles (not tracked in git)
docker-clean-gen:
    #!/usr/bin/env bash
    # Deletes every file named "Dockerfile" below platform/crates and the
    # generated docker-compose.build.yml in the provisioning root.
    # Abort if the directory change fails: the deletions below use relative
    # paths and would otherwise run in whatever directory the caller is in.
    cd "{{provisioning_root}}" || { echo "✗ Cannot enter {{provisioning_root}}" >&2; exit 1; }
    echo "🧹 Removing generated Dockerfiles..."
    find platform/crates -name "Dockerfile" -type f -delete
    rm -f docker-compose.build.yml
    echo "✅ Generated files cleaned"
# Legacy: Build platform service Docker images (deprecated - use docker-build)
build-images +SERVICES='':
    #!/usr/bin/env bash
    # Prints the deprecation notice, then calls the build script with the
    # given services, or with --all when none are given.
    echo "⚠️ DEPRECATED: Use 'just docker-build' instead"
    cd {{provisioning_root}}
    if [ -n "{{SERVICES}}" ]; then
        {{nu}} scripts/docker-build.nu {{SERVICES}}
    else
        {{nu}} scripts/docker-build.nu --all
    fi
# Static text only: nothing is queried from docker. The five image tags
# listed here are the same ones image-validate, image-clean and image-info
# iterate over.
# List available service images
@image-list:
echo "📋 Available Platform Services"
echo "=============================="
echo ""
echo "Core Platform Services:"
echo " • orchestrator (Rust) - Workflow engine and task queue"
echo " • control-center (Rust) - Policy and RBAC management"
echo " • mcp-server (Rust) - AI/LLM integration"
echo " • extension-registry (Rust) - Plugin management"
echo " • rag (Rust) - Retrieval augmented generation"
echo ""
echo "Image Tags:"
echo " • provisioning-orchestrator:latest"
echo " • provisioning-control-center:latest"
echo " • provisioning-mcp-server:latest"
echo " • provisioning-extension-registry:latest"
echo " • provisioning-rag:latest"
echo ""
echo "Usage: just build-images [service...]"
echo " just build-images orchestrator"
echo " just build-images orchestrator control-center"
# Validate all built Docker images
@image-validate:
    #!/usr/bin/env bash
    # Checks that each expected provisioning-<service>:latest image exists
    # locally; exits 1 when at least one is missing.
    echo "🔍 Validating platform Docker images..."
    echo ""
    ok=0
    bad=0
    for svc in orchestrator control-center mcp-server extension-registry rag; do
        tag="provisioning-${svc}:latest"
        if ! docker image inspect "$tag" >/dev/null 2>&1; then
            echo "❌ $tag (not found)"
            bad=$((bad + 1))
            continue
        fi
        echo "✅ $tag"
        ok=$((ok + 1))
    done
    echo ""
    echo "📊 Validation Summary"
    echo "===================="
    echo "Valid: $ok"
    echo "Invalid: $bad"
    if (( bad > 0 )); then
        echo ""
        echo "Run 'just build-images' to build missing images"
        exit 1
    fi
# Remove all provisioning platform images
@image-clean:
    #!/usr/bin/env bash
    # Removes each expected provisioning-<service>:latest image that exists
    # locally; a failed removal is reported and the loop continues.
    echo "🧹 Removing provisioning platform Docker images..."
    for svc in orchestrator control-center mcp-server extension-registry rag; do
        tag="provisioning-${svc}:latest"
        docker image inspect "$tag" >/dev/null 2>&1 || continue
        echo "Removing $tag..."
        if ! docker image rm "$tag"; then
            echo "⚠️ Failed to remove $tag"
        fi
    done
    echo "✅ Image cleanup completed"
# For every expected image that exists locally, prints creation time, size in
# bytes and OS/architecture taken from `docker image inspect`. The final check
# prints a hint when no image matching provisioning-* exists at all.
# Show Docker image information
@image-info:
#!/usr/bin/env bash
echo "🐳 Provisioning Platform Docker Images"
echo "======================================"
echo ""
IMAGES=("provisioning-orchestrator:latest" \
"provisioning-control-center:latest" \
"provisioning-mcp-server:latest" \
"provisioning-extension-registry:latest" \
"provisioning-rag:latest")
for image in "${IMAGES[@]}"; do
if docker image inspect "$image" >/dev/null 2>&1; then
echo "📦 $image"
docker image inspect "$image" | jq -r '.[0] | " Created: \(.Created)\n Size: \(.Size) bytes\n OS: \(.Os)/\(.Architecture)"'
echo ""
fi
done
if [ $(docker images --filter "reference=provisioning-*" --quiet | wc -l) -eq 0 ]; then
echo " No provisioning images found. Run 'just build-images' to build them."
fi
# Build specific service image
[no-cd]
build-service SERVICE:
    #!/usr/bin/env bash
    # Builds the image for a single service by delegating to
    # scripts/build-images.nu, run from the provisioning root.
    #
    # Abort on the first failing command: without this, a failed `cd` would
    # let the build script run from whatever directory the recipe was
    # invoked in ([no-cd] keeps the caller's working directory).
    set -euo pipefail
    # Quoted so a root path containing spaces is passed to `cd` as one word.
    cd "{{provisioning_root}}"
    echo "🐳 Building service image: {{SERVICE}}"
    {{nu}} scripts/build-images.nu {{SERVICE}}
# Show Docker system status
@docker-status:
    #!/usr/bin/env bash
    # Reports the Docker CLI version, whether the daemon answers, the local
    # `provisioning-*` images and Docker disk usage. Exits 1 when the CLI is
    # missing or the daemon cannot be reached.
    echo "🐳 Docker System Status"
    echo "======================"
    echo ""
    if ! command -v docker &> /dev/null; then
        echo "❌ Docker is not installed"
        exit 1
    fi
    echo "Docker version:"
    docker --version
    echo ""
    echo "Docker daemon status:"
    # `docker ps` needs a reachable daemon, so its exit status is the probe.
    if docker ps -q >/dev/null 2>&1; then
        echo "✅ Docker daemon is running"
    else
        echo "❌ Docker daemon is not running or not accessible"
        exit 1
    fi
    echo ""
    echo "Provisioning platform images:"
    # `tail` exits 0 even when it prints nothing, so an `|| echo` fallback on
    # the pipeline never fires. Capture the listing (header row dropped) and
    # test it for emptiness instead.
    platform_images=$(docker images --filter "reference=provisioning-*" | tail -n +2)
    if [ -n "$platform_images" ]; then
        echo "$platform_images"
    else
        echo "None found"
    fi
    echo ""
    echo "Docker disk usage:"
    docker system df
# Build and verify Docker images
@build-verify:
    just build-images && just image-validate && echo "✅ Docker images built and verified"

View file

@ -0,0 +1 @@
/Users/Akasha/Tools/dev-system/languages/nushell/just-modules/automation

1
justfiles/nushell-script Symbolic link
View file

@ -0,0 +1 @@
/Users/Akasha/Tools/dev-system/languages/nushell/just-modules/script

View file

@ -2,6 +2,36 @@
# ==============================================
# Task orchestration, workflow management, and batch operations
# ============================================================================
# Build → Install → Restart
# ============================================================================
# Build orchestrator release, install to ~/.local/bin, restart service
orch-deploy:
    #!/usr/bin/env bash
    # Stop at the first failing step (also on unset variables and on failures
    # inside pipelines) so a failed build is never installed or restarted.
    set -euo pipefail
    # source_file() is the path to this justfile — provisioning/justfiles/orchestrator.just
    # so provisioning root is two levels up from source_file()
    PROV_ROOT="$(dirname "$(dirname "{{source_file()}}")")"
    # Cargo manifest and build output are both looked up under platform/.
    PLATFORM_DIR="${PROV_ROOT}/platform"
    BIN_DIR="${HOME}/.local/bin"
    BIN="provisioning-orchestrator"
    echo "=== build: cargo build --release --bin ${BIN} ==="
    cargo build --release \
    --manifest-path "${PLATFORM_DIR}/Cargo.toml" \
    --bin "${BIN}"
    echo "=== install: ${BIN_DIR}/${BIN} ==="
    # Create the per-user bin directory on first use.
    mkdir -p "${BIN_DIR}"
    # Copy the release binary into place with mode 0755.
    install -m 0755 "${PLATFORM_DIR}/target/release/${BIN}" "${BIN_DIR}/${BIN}"
    echo " installed: ${BIN}"
    echo "=== restart orchestrator ==="
    # Restart through the `provisioning` CLI, which must be on PATH.
    provisioning platform restart orchestrator
    echo "=== done ==="
# ============================================================================
# Orchestrator Status and Health
# ============================================================================

1
justfiles/rust-axum Symbolic link
View file

@ -0,0 +1 @@
/Users/Akasha/Tools/dev-system/languages/rust/just-modules/axum

1
justfiles/rust-cargo Symbolic link
View file

@ -0,0 +1 @@
/Users/Akasha/Tools/dev-system/languages/rust/just-modules/cargo

1
justfiles/rust-leptos Symbolic link
View file

@ -0,0 +1 @@
/Users/Akasha/Tools/dev-system/languages/rust/just-modules/leptos

5
justfiles/test.just Normal file
View file

@ -0,0 +1,5 @@
# Test recipes
[doc("Show test help")]
help:
@just --list

12
justfiles/workflow.just Normal file
View file

@ -0,0 +1,12 @@
# Generated by ore workflow generate
# Source: .ontology/workflow.ncl
# layer: ci-standard
ci-standard:
cargo clippy --all-targets --all-features -- -D warnings
cargo nextest run --all-features --workspace --profile ci --cargo-profile ci
cargo deny check licenses advisories
cargo doc --no-deps --workspace --profile ci -q
nickel typecheck
nu --ide-check 100
cargo build --release --workspace

125
schemas/catalog/context.ncl Normal file
View file

@ -0,0 +1,125 @@
# Component Context Schema
#
# Declares the ontological layer for a component as deployed in a specific infra.
# Used in infra component configs (e.g. infra/libre-wuji/components/zot.ncl).
#
# Three-layer identity:
# what — what the component is (from the component manifest; override if needed)
# how — how it is deployed here (derived from the settings declared alongside)
# why — why it exists in this infra (intent declared by the operator)
#
# Plus governance dimensions that every component deployment must declare:
# priority, security, supervision, updates.
#
# Usage in a component contract:
# let Context = import "schemas/catalog/context.ncl" in
# { MyComponent = { context | Context.ComponentContext | optional, ... } }
#
# Usage in an infra config:
# context = {
# how = "K8s Deployment with Hetzner CSI PVC, private Cilium gateway",
# why = "Central OCI store for lian-build pipeline and cosign distribution",
# priority = 'critical,
# security = { posture = 'private },
# updates = { policy = 'pinned, holds = ["cosign-verify"] },
# }
{
# ── Priority ────────────────────────────────────────────────────────────────
# Operational priority of this component in this infra.
# Drives incident response, update scheduling, and removal decisions.
ComponentPriority = [|
'critical, # infra fails without it — immediate intervention required
'essential, # core services degraded without it
'important, # significant feature loss without it
'standard, # normal services, managed lifecycle
'optional, # convenience feature; removable without service impact
|],
# ── Security posture ────────────────────────────────────────────────────────
SecurityPosture = [|
'public, # intentionally internet-facing; FIP or public gateway
'private, # private network only — VPN or private gateway required
'internal, # cluster-internal only; no gateway exposure
'airgapped, # no external network access whatsoever
|],
# ── Update policy ───────────────────────────────────────────────────────────
UpdatePolicy = [|
'pinned, # manual only — every version bump requires explicit approval
'semver-patch, # auto-apply patch releases only (x.y.Z)
'semver-minor, # auto-apply minor and patch releases (x.Y.z)
'rolling-latest, # always track latest — only acceptable for 'optional priority
|],
# ── Component Context ───────────────────────────────────────────────────────
ComponentContext = {
# Ontological triad — the three questions any operator must be able to answer
# about any running component.
what | String | doc "What this component is. Defaults to manifest.description; override when the deployment role narrows the description." | optional,
how | String
| doc "How it is deployed in this infra — mode, storage, gateway, key integrations. Derived from the settings declared alongside this context block.",
why | String
| doc "Why it exists in this infra — the purpose, the gap it fills, the service it enables.",
# Governance dimensions
priority | ComponentPriority
| doc "Operational priority: drives response SLA, update scheduling, and removal policy."
| default = 'standard,
security | {
posture | SecurityPosture
| doc "Network exposure posture for all endpoints."
| default = 'internal,
tls | Bool
| doc "TLS required on all exposed endpoints."
| default = true,
concerns | Array String
| doc "Named security concerns to track — e.g. 'credential-rotation', 'access-policy-audit'."
| default = [],
} | default = {},
supervision | {
health_check | Bool
| doc "Active health check configured and expected to pass."
| default = true,
metrics | Bool
| doc "Prometheus-compatible metrics endpoint exposed."
| default = false,
alerts | Array String
| doc "Alert conditions configured — e.g. '5xx-rate', 'storage-capacity'."
| default = [],
sla_target | String
| doc "SLA availability target — e.g. '99.9%'. Informational."
| optional,
} | default = {},
updates | {
policy | UpdatePolicy
| doc "Version update policy for this component."
| default = 'pinned,
window | String
| doc "Maintenance window — e.g. 'weekends UTC+0'. Informational for scheduling."
| optional,
holds | Array String
| doc "Gates required before update proceeds — e.g. 'cosign-verify', 'smoke-test', 'backup-verified'."
| default = [],
} | default = {},
},
}

View file

@ -0,0 +1,201 @@
# infra-catalog Component Manifest Schema
#
# Every component distributed through any infra-catalog source — regardless of
# which registry or peer hosts it — must satisfy this contract.
#
# This schema is the sole coupling point between the catalog ecosystem and any
# tool that consumes it (Provisioning, lian-build, Vapora, or others). Tools
# validate manifests against it; they do not need to know anything else about
# the component source.
#
# Identity comes from content hash (OCI digest), not from registry location.
# The `source` field is informational only.
#
# Deployment modes
# A component may support 'cluster (K8s) and/or 'systemd (host-level).
# Each mode has its own installer script declared in mode_installers.
# Most components are cluster-only; systemd mode is for components that run
# outside K8s (e.g. lian_build daemon, hccm on bare metal).
#
# Extensions
# When a component is installed, the manager discovers and loads any
# extensions it declares: CLI commands (Nu module), Justfile recipes,
# or helper scripts. Extensions must be present at the declared paths.
#
# Usage:
# let Catalog = import "schemas/catalog/manifest.ncl" in
# { ... } | Catalog.Manifest
{
# ── Kind enum ─────────────────────────────────────────────────────────────────
# Each kind binds the component to a specific interface contract.
# Tools that need a capability load the corresponding entry point.
ComponentKind = [|
'ComputeProvider, # spawns/destroys ephemeral compute — implements runner.nu interface
'StorageProvider, # manages persistent volumes — implements storage.nu interface
'RegistryAdapter, # OCI registry operations — implements registry.nu interface
'DeploymentComponent, # deploys a service — implements installer.nu interface
'AgentProvider, # AI/agent capabilities — implements agent.nu interface
'SchemaLibrary, # reusable Nickel schemas — no runtime entry point
|],
# ── Tool requirement ──────────────────────────────────────────────────────────
# Declares an external CLI that the component's scripts invoke.
ToolRequirement = {
name | String,
min_version | String | optional,
check | String | optional
| doc "Shell expression that exits 0 when the tool is available.",
},
# ── Catalog dependency ────────────────────────────────────────────────────────
# Another catalog component required at runtime.
CatalogDep = {
name | String,
kind | ComponentKind,
version | String,
},
# ── Source reference ──────────────────────────────────────────────────────────
# Informational. Content trust is established by OCI digest and signature,
# not by source URL. Multiple sources are mirrors of the same content.
SourceRef = {
radicle | String | optional
| doc "Radicle project ID, e.g. rad:z6MkhDvY...",
git | String | optional
| doc "Git remote URL (mirror or upstream).",
oci | String | optional
| doc "Canonical OCI reference (without digest — digest is in the artifact manifest).",
},
# ── Deployment modes ─────────────────────────────────────────────────────────
# Declares which deployment modes the component supports.
# The manager uses this to validate that the requested mode is available
# before attempting to install.
DeploymentModes = {
cluster | Bool
| doc "Supports in-cluster K8s deployment via cluster/install.sh."
| default = false,
systemd | Bool
| doc "Supports host-level deployment as a systemd unit via systemd/install.sh."
| default = false,
},
# ── Mode installers ───────────────────────────────────────────────────────────
# Paths relative to the component root after installation.
# Only required when the corresponding mode is true in deployment_modes.
# Convention: cluster → "cluster/install.sh", systemd → "systemd/install.sh".
ModeInstallers = {
cluster | String
| doc "Installer script for cluster mode. Implements install/uninstall/status/upgrade."
| optional,
systemd | String
| doc "Installer script for systemd mode. Implements install/uninstall/enable/disable."
| optional,
},
# ── Extensions ────────────────────────────────────────────────────────────────
# Capabilities the component adds to the manager when installed.
# The manager loads declared extensions at startup so callers get new
# subcommands and recipes without modifying the manager itself.
#
# Convention:
# cli → extensions/cli/commands.nu (Nu module; exports "<noun> <verb>" commands)
# just → extensions/just/<name>.just (Justfile module; recipe prefix = name)
# scripts → extensions/scripts/ (helper scripts; no discovery contract)
Extensions = {
cli | Bool
| doc "Provides extensions/cli/commands.nu — loaded by the manager CLI dispatcher."
| default = false,
just | Bool
| doc "Provides extensions/just/<name>.just — Justfile module for workspace recipes."
| default = false,
scripts | Bool
| doc "Provides extensions/scripts/ — helper scripts for direct invocation."
| default = false,
},
# ── Entry points ──────────────────────────────────────────────────────────────
# Runtime interface entry points — paths relative to component root after installation.
# Each ComponentKind mandates its corresponding entry point.
EntryPoints = {
runner | String | optional
| doc "ComputeProvider: script implementing spawn/destroy/status/describe.",
storage | String | optional
| doc "StorageProvider: script implementing create/delete/attach/detach.",
registry | String | optional
| doc "RegistryAdapter: script implementing push/pull/list/delete.",
installer | String | optional
| doc "DeploymentComponent: fallback installer when mode_installers is absent.",
agent | String | optional
| doc "AgentProvider: script implementing run/cancel/status.",
schema | String | optional
| doc "SchemaLibrary: primary .ncl export path.",
},
# ── Capabilities ─────────────────────────────────────────────────────────────
# Declared before loading so callers can query without executing the component.
Capabilities = {
ephemeral_compute | Bool | default = false,
persistent_compute | Bool | default = false,
snapshot_create | Bool | default = false,
network_management | Bool | default = false,
multi_region | Bool | default = false,
arm64 | Bool | default = false,
amd64 | Bool | default = false,
storage_block | Bool | default = false,
storage_object | Bool | default = false,
storage_file | Bool | default = false,
oci_push | Bool | default = false,
oci_pull | Bool | default = false,
llm | Bool | default = false,
rag | Bool | default = false,
workflow | Bool | default = false,
},
# ── Manifest (root contract) ──────────────────────────────────────────────────
Manifest = {
name | String
| doc "Lowercase, hyphenated identifier. Unique within a kind.",
version | String
| doc "Semver string: MAJOR.MINOR.PATCH.",
kind | ComponentKind,
description | String
| doc "One sentence — catalog entry, search index, and discovery text.",
requires = {
nu | String | optional | doc "Minimum Nushell version.",
nickel | String | optional | doc "Minimum Nickel version.",
tools | Array ToolRequirement | default = [],
},
catalog_deps | Array CatalogDep | default = [],
deployment_modes | DeploymentModes | default = {},
mode_installers | ModeInstallers | default = {},
extensions | Extensions | default = {},
entry_points | EntryPoints | default = {},
capabilities | Capabilities | default = {},
source | SourceRef | default = {},
authors | Array String | default = [],
},
}

View file

@ -0,0 +1,23 @@
let schema = import "./schema.ncl" in

# Baseline command record. Every field carries `default` priority so that a
# plain definition supplied by the caller wins when the two are merged.
let command_defaults = {
  command | default = "",
  description | default = "",
  help_category | default = "",
  aliases | default = [],
  daemon_target | default = 'none,
  requires_args | default = false,
  requires_daemon | default = false,
  requires_services | default = false,
  uses_cache | default = false,
} in

{
  # The baseline on its own, checked against the CommandRecord contract.
  defaults = command_defaults | schema.CommandRecord,

  # Build a command from a partial record: fields present in `overrides`
  # replace the baseline, everything else keeps its default value.
  # Usage: make_command { command = "help", uses_cache = true, ... }
  make_command = fun overrides => command_defaults & overrides,
}

View file

@ -0,0 +1,19 @@
# Command Registry Schema - Type Contracts
# Defines the structure and validation for all CLI commands
# Type contract for command records
# Open record contract (trailing `..`): fields beyond those listed are accepted, which lets partial records be merged; the listed fields are validated on output
{
CommandRecord = {
command | String,
aliases | Array String,
requires_daemon | Bool,
requires_services | Bool,
uses_cache | Bool,
requires_args | Bool,
help_category | String,
description | String,
daemon_target | std.enum.TagOrString | [| 'none, 'cli, 'orchestrator |],
..
},
}

View file

@ -0,0 +1,13 @@
# schemas/config/dag/main.ncl — DAG runtime configuration
#
# Exposes DAG execution defaults as runtime config following the schemas/config/ pattern.
# Consumed by the Nushell loader (lib_provisioning/config/loader/dag.nu) via nickel export.
# Workspace-level dag.ncl can override full blocks (execution, resolution, events).
let dag = import "../../lib/dag/main.ncl" in
{
execution = dag.defaults.composition,
resolution = dag.defaults.resolution,
events = dag.defaults.events,
}

View file

@ -3,7 +3,7 @@
# | Pattern: Pure schema definitions using Nickel contracts
{
ServerDefaults = {
ServerDefaults {
lock | Bool,
time_zone | String,
running_wait | Number,
@ -31,5 +31,5 @@
domains_search | String | optional,
user_ssh_key_path | String | optional,
scale | Dyn | optional,
},
}
}

View file

@ -2,9 +2,9 @@
# | Migrated from: provisioning/kcl/defaults.k
# | Pattern: Hybrid - defaults + makers + direct access (contracts available via import)
let contracts_lib = import "./contracts.ncl" in
let defaults_lib = import "./defaults.ncl" in
let lib = import "../../lib/main.ncl" in
#let contracts_lib = import "schemas/config/defaults/contracts.ncl" in
let defaults_lib = import "schemas/config/defaults/defaults.ncl" in
#let lib = import "../../lib/main.ncl" in
{
defaults = defaults_lib,

View file

@ -53,7 +53,7 @@
cluster_admin_host = "",
cluster_admin_port = 22,
servers_wait_started = 27,
cluster_admin_user = "root",
cluster_admin_user = "devadm",
clusters_save_path = "/${main_name}/clusters",
servers_paths = ["servers"],
clusters_paths = ["clusters"],

View file

@ -0,0 +1,239 @@
# Example: Complete Deployment Configuration with Nickel + SOPS Integration
#
# This example shows the hybrid pattern:
# 1. Infrastructure config in .ncl (readable, version-controlled)
# 2. Secrets in YAML (encrypted with SOPS)
# 3. Merged at deployment time
let sops = import "schemas/security/sops/main.ncl" in
let secrets_loader = import "schemas/security/secrets-loader.ncl" in
let config_merger = import "schemas/security/config-merger.ncl" in
{
# ============================================
# STEP 1: Default Configuration
# ============================================
defaults = {
environment = "dev",
deployment_mode = "solo",
database = {
type = "postgresql",
host = "localhost",
port = 5432,
name = "myapp",
user = "app_user",
# Password placeholder - will be replaced by secrets
password = "${secret:database.password}",
ssl = false,
pool_size = 10,
},
redis = {
host = "localhost",
port = 6379,
# Password placeholder - will be replaced by secrets
password = "${secret:redis.password}",
db = 0,
ttl = 3600,
},
api = {
host = "0.0.0.0",
port = 8080,
# API key placeholder - will be replaced by secrets
api_key = "${secret:api.api_key}",
timeout = 30,
max_connections = 100,
},
tls = {
enabled = false,
# Certificate placeholders - will be replaced by secrets
certificate = "${secret:tls.certificate}",
private_key = "${secret:tls.private_key}",
},
},
# ============================================
# STEP 2: Environment-Specific Overrides
# ============================================
environments = {
# All environments inherit these
all = {
logging = {
level = "info",
format = "json",
},
},
# Development overrides
dev = {
database = {
host = "postgres-dev.local",
ssl = false,
},
redis = {
host = "redis-dev.local",
},
api = {
port = 8080,
},
logging = {
level = "debug",
},
},
# Staging overrides
staging = {
database = {
host = "postgres-staging.example.com",
ssl = true,
},
redis = {
host = "redis-staging.example.com",
},
api = {
port = 443,
},
tls = {
enabled = true,
},
logging = {
level = "info",
},
},
# Production overrides
prod = {
database = {
host = "postgres-prod-cluster.example.com",
port = 5432,
ssl = true,
pool_size = 50,
},
redis = {
host = "redis-prod-cluster.example.com",
},
api = {
port = 443,
max_connections = 1000,
},
tls = {
enabled = true,
},
logging = {
level = "warn",
},
},
},
# ============================================
# STEP 3: Deployment Modes
# ============================================
deployment_modes = {
solo = {
replicas = 1,
resources = {
cpu = "1",
memory = "512Mi",
},
},
ha = {
replicas = 3,
resources = {
cpu = "2",
memory = "2Gi",
},
},
enterprise = {
replicas = 5,
resources = {
cpu = "4",
memory = "4Gi",
},
},
},
# ============================================
# STEP 4: SOPS Configuration
# ============================================
sops_config = {
dev = (sops.generate_sops_yaml "dev"),
staging = (sops.generate_sops_yaml "staging"),
prod = (sops.generate_sops_yaml "prod"),
},
# ============================================
# STEP 5: Build Final Configuration
# ============================================
# This function is called at deployment time with:
# - environment: "dev" | "staging" | "prod"
# - secrets: loaded from config/secrets/{env}.yaml (SOPS-encrypted)
# - deployment_mode: "solo" | "ha" | "enterprise"
# Composes the final configuration in layers: defaults, then the environment
# overrides, then the deployment-mode block, then the secrets record, and
# finally passes the result and the secrets to `secrets_loader.merge`.
#
# NOTE(review): several constructs below do not look like valid Nickel and
# should be confirmed with `nickel typecheck` before this example is relied on:
#   - `$.defaults`, `$.environments`, `$.deployment_modes`: sibling fields of a
#     record are normally referenced by bare name; `$.` is presumably a
#     leftover from another configuration language.
#   - `config_merger.by_environment @ { ... } environment`: `@` is Nickel's
#     array concatenation operator, yet it sits between a function and a
#     record here; plain application was presumably intended.
#   - `$.deployment_modes | std.record.get deployment_mode | default {}`: `|`
#     attaches a contract or metadata, the forward pipe is `|>`; a lookup with
#     a fallback (e.g. `std.record.get_or`) was presumably intended.
# NOTE(review): `secrets` is applied twice, once through `compose_config` and
# once through `secrets_loader.merge`; confirm that both steps are wanted.
build_config = fun environment deployment_mode secrets =>
  let env_config = config_merger.by_environment @ { defaults = $.defaults, environments = $.environments } environment in
  let mode_config = ($.deployment_modes | std.record.get deployment_mode | default {}) in
  let base = config_merger.compose_config $.defaults env_config {} in
  let with_mode = config_merger.compose_config base mode_config {} in
  let final = config_merger.compose_config with_mode secrets {} in
  # Merge secrets into placeholders
  secrets_loader.merge final secrets,
# ============================================
# Export Configuration for Different Scenarios
# ============================================
# Development configuration (for local testing)
config_dev = {
environment = "dev",
deployment_mode = "solo",
config = (
config_merger.compose_config
$.defaults
($.environments | std.record.get "dev")
{}
),
},
# Staging configuration (requires secrets)
config_staging = {
environment = "staging",
deployment_mode = "ha",
config = (
config_merger.compose_config
$.defaults
($.environments | std.record.get "staging")
{}
),
},
# Production configuration (requires secrets)
config_prod = {
environment = "prod",
deployment_mode = "enterprise",
config = (
config_merger.compose_config
$.defaults
($.environments | std.record.get "prod")
{}
),
},
# ============================================
# Validation
# ============================================
validate = fun configuration =>
let required_paths = [
"database.host",
"database.user",
"redis.host",
"api.port",
] in
config_merger.validate_complete configuration required_paths,
}

View file

@ -77,7 +77,7 @@ Infrastructure Schemas (Docker, Kubernetes, Nginx, etc.)
### Example: Service Port Definition
```bash
# Platform service schema (provisioning/schemas/platform/schemas/orchestrator.ncl)
# Platform service schema (provisioning/schemas/platform/orchestrator.ncl)
server = {
port | Number, # Define port once
}

View file

@ -1,6 +1,4 @@
# | Cluster configuration contracts (schema definitions)
# | Migrated from: provisioning/kcl/cluster.k
# | Pattern: Pure schema definitions using Nickel contracts
let scaling = import "../scaling.ncl" in
{
Cluster = {
@ -16,6 +14,6 @@
admin_port | String | optional,
admin_user | String | optional,
ssh_key_path | String | optional,
scale | Dyn | optional,
scale | scaling.ScalePolicy | optional,
},
}

View file

@ -0,0 +1,26 @@
# Roles a node can take.
let role_enum = [| 'ControlPlane, 'Worker, 'LoadBalancer |] in

# Settings for the servers created by a scale policy. The first five fields
# are mandatory; the rest may be omitted.
let template_schema = {
  server_type | String,
  location | String,
  hostname_pattern | String,
  private_network | String,
  ip_range_prefix | String,

  formula_id | String | optional,
  image_role | String | optional,
  os_type | String | optional,
  architecture | String | optional,
} in

# Scaling bounds for one role, together with the template used for its servers.
let policy_schema = {
  role | role_enum,
  min | Number,
  max | Number,
  template | template_schema,
} in

{
  NodeRole = role_enum,
  ScaleTemplate = template_schema,
  ScalePolicy = policy_schema,
}

View file

@ -1,6 +1,4 @@
# | Server configuration contracts (schema definitions)
# | Migrated from: provisioning/kcl/server.k
# | Pattern: Pure schema definitions using Nickel contracts
let scaling = import "../scaling.ncl" in
{
Server = {
@ -37,6 +35,7 @@
main_domain | String | optional,
domains_search | String | optional,
user_ssh_key_path | String | optional,
scale | Dyn | optional,
role | scaling.NodeRole | optional,
scale | scaling.ScalePolicy | optional,
},
}

View file

@ -0,0 +1,46 @@
# ImageRole Contracts — type definitions for provider role images and their state.
{
ImageState = [| 'keep, 'delete_after_use, 'delete_time_lapse, 'archive |],
HardwareLimits = {
min_memory_gb | Number,
min_disk_gb | Number,
allowed_types | Array String,
network_required | Bool,
ports_required | Array Number,
ssh_required | Bool,
},
ImagePackage = {
name | String,
version | String | optional,
},
ImageRole = {
name | String,
os_base | String,
provider | String,
template_name | String,
state | ImageState,
state_config | {
freshness_days | Number,
delete_after_hours | Number | optional,
archive_path | String | optional,
},
packages | Array ImagePackage,
labels | { .. },
hardware | HardwareLimits,
},
# Written to ~/.config/provisioning/images/{provider}-{role}.ncl
ImageRoleState = {
provider | String,
role | String,
snapshot_id | String,
built_at | String | optional,
last_used | String | optional,
os_base | String,
labels | { .. },
},
}

View file

@ -0,0 +1,30 @@
# ImageRole Defaults — base values for image role definitions.
{
  # Base values for an image role definition. Every field has `default`
  # priority, so a plain definition merged on top replaces it. No `name` is
  # provided here; callers supply it.
  image_role | default = {
    os_base | default = "debian-12",
    provider | default = "hetzner",
    template_name | default = "hetzner_build_image.j2",
    state | default = 'keep,
    state_config | default = {
      freshness_days | default = 30,
    },
    packages | default = [],
    labels | default = {},
    hardware | default = {
      min_memory_gb | default = 2,
      min_disk_gb | default = 20,
      allowed_types | default = ["cax11", "cax21"],
      network_required | default = true,
      ports_required | default = [],
      ssh_required | default = true,
    },
  },
  # Base values for the recorded state of an image role. `provider`, `role`
  # and `os_base` have no default here.
  image_role_state | default = {
    # Placeholder used until a real snapshot id is supplied.
    snapshot_id | default = "SNAPSHOT_PENDING",
    # NOTE(review): the ImageRoleState contract declares `built_at` and
    # `last_used` as `String | optional`; a `null` default would presumably
    # fail that contract if it is ever applied to this record. Confirm whether
    # these fields should simply be left undefined instead.
    built_at | default = null,
    last_used | default = null,
    labels | default = {},
  },
}

View file

@ -0,0 +1,17 @@
# ImageRole public API — types and maker functions for provider role images.
# NOTE(review): `contracts_lib` is bound here but never referenced in the
# record below; neither maker applies a contract to its result. Confirm
# whether validation was meant to happen here.
let contracts_lib = import "./contracts.ncl" in
let defaults_lib = import "./defaults.ncl" in
{
  # The raw defaults record, re-exposed unchanged.
  defaults = defaults_lib,
  # Maker: merges `overrides` on top of the default image role.
  # `not_exported` keeps the function out of serialized output.
  make_image_role | not_exported = fun overrides =>
    defaults_lib.image_role & overrides,
  # Maker: merges `overrides` on top of the default image role state.
  make_image_role_state | not_exported = fun overrides =>
    defaults_lib.image_role_state & overrides,
  # Direct access to the two default records.
  DefaultImageRole = defaults_lib.image_role,
  DefaultImageRoleState = defaults_lib.image_role_state,
}

View file

@ -0,0 +1,41 @@
# Backup group contracts — consistency points across multiple components.
# When services interact (Odoo + PostgreSQL + filestore, mail + LDAP + indexes,
# etc.) a per-component snapshot can leave inconsistency between members in DR.
# A BackupGroup declares a coordination window so the manager can produce a
# consistent cut (à la Chandy-Lamport) across members in the same instant.
let bp = import "backup_policy.ncl" in
let vault = import "vault_refs.ncl" in
{
# Member of a backup group. Either references a whole component policy or
# a specific scope of that policy.
GroupMember = {
component | String | doc "ComponentDef.name",
scope | String | optional | doc "BackupScope.name; omitted = all scopes",
},
# Coordination strategies. 'best_effort tags members with the same group_id
# but does not synchronize; 'quiesce_window runs ordered pre-hooks; 'csi_consistent_group
# delegates atomicity to the CSI driver (requires Longhorn ≥ supported version).
CoordinationStrategy = {
kind | [| 'best_effort, 'quiesce_window, 'csi_consistent_group |],
quiesce_seq | Array String | default = [],
max_downtime | bp.Duration | optional,
snapshot_class | String | optional,
},
BackupGroup = {
name | String | doc "Identifier (used in CLI: --group <name>)",
members | Array GroupMember
| doc "Components or scopes participating in the consistent cut",
schedule | bp.Schedule,
coordination | CoordinationStrategy,
retention | bp.RetentionPolicy,
destinations | Array bp.Destination
| doc "Same MultiDestinationRequired invariant as BackupPolicy",
encryption | vault.VaultKeyRef,
tag_strategy | bp.TagStrategy,
verify | bp.VerifyPolicyRef | optional,
},
}

View file

@ -0,0 +1,186 @@
# Backup policy contracts — declarative description of how a component is backed up.
# Consumed by the backup-manager crate (one-shot, daemon, standalone, coordinator modes).
# Encryption, multi-destination replication and non-empty scopes are enforced as
# Nickel contracts so misconfiguration fails at `nickel export` time, not at runtime.
let vault = import "vault_refs.ncl" in
# Duration contract: accepts a non-empty string that mentions at least one of
# the unit letters s, m, h or d.
# NOTE(review): the letter may appear anywhere in the string, so the check is
# loose; confirm which duration syntax the consumer actually parses.
let _Duration = std.contract.from_validator (fun value =>
  let unit_markers = ["s", "m", "h", "d"] in
  # Evaluated lazily, i.e. only after the string checks below have passed.
  let mentions_unit =
    std.array.any (fun marker => std.string.contains marker value) unit_markers
  in
  if !(std.is_string value) then
    'Error { message = "Duration must be a String" }
  else if value == "" then
    'Error { message = "Duration must be non-empty" }
  else if mentions_unit then
    'Ok
  else
    'Error { message = "Duration must contain a time unit (s, m, h, d)" }
) in
# Cron expression contract: accepts a string made of 5 or 6 fields separated
# by spaces. Only the number of fields is checked, not their content.
let _CronExpr = std.contract.from_validator (fun value =>
  if !(std.is_string value) then
    'Error { message = "CronExpr must be a String" }
  else
    # `split` yields an empty string for every leading, trailing or repeated
    # space; drop those so that only real fields are counted.
    let fields =
      value
      |> std.string.split " "
      |> std.array.filter (fun field => field != "")
    in
    let field_count = std.array.length fields in
    if field_count == 5 || field_count == 6 then
      'Ok
    else
      'Error { message = "CronExpr must have 5 or 6 space-separated fields" }
) in
# Free-form tag map: any field name is allowed, every value must be a String.
# (The former `_DnsRecordsLike` binding was never referenced in this file and
# has been dropped.)
let _Tags = { _ | String } in
{
Duration = _Duration,
CronExpr = _CronExpr,
Tags = _Tags,
# Schedule discriminated union: cron, interval or NATS event-driven.
Schedule = {
# Variant tag. Every other field is optional: this record does not check that
# the fields matching the chosen `kind` are actually present.
kind | [| 'cron, 'interval, 'on_event |],
# NOTE(review): presumably read when kind = 'cron — confirm against the consumer.
cron_expr | _CronExpr | optional,
jitter_sec | Number | optional,
# NOTE(review): presumably read when kind = 'interval — confirm.
every | _Duration | optional,
jitter | _Duration | optional,
# Event-driven variant (see the doc string on `subject`).
subject | String | optional | doc "NATS subject when kind = 'on_event",
debounce | _Duration | optional,
},
# Retention preset — keeps last N + N daily/weekly/monthly/yearly snapshots.
RetentionPolicy = {
# Every keep_* counter has a default, so an empty record is a valid policy.
# Values are plain Numbers; no lower bound or integer check is applied here.
keep_last | Number | default = 7,
keep_daily | Number | default = 7,
keep_weekly | Number | default = 4,
keep_monthly | Number | default = 6,
keep_yearly | Number | default = 0,
prune_after | _Duration | optional | doc "Delete data older than this regardless of keep_* (safety bound)",
},
# A backup destination (where snapshots end up). At least 2 are required
# when policy is enabled (MultiDestinationRequired contract).
Destination = {
name | String | doc "Stable identifier (used in metrics labels and tags)",
kind | [| 's3, 'b2, 'local, 'sftp, 'rest_server |],
# Plain String: the URI is not checked against `kind` by this record.
uri | String | doc "restic-style URI: 's3:host/bucket', 'b2:bucket', 'sftp:user@host:/path', etc.",
cred_ref | vault.VaultCredRef,
# Defaults to 'replica, so a 'primary destination must be declared explicitly.
role | [| 'primary, 'replica, 'archive |] | default = 'replica,
region | String | optional,
},
# Tagging strategy for snapshots. The actual tags emitted are a deterministic
# function of {component, scope, parameters} computed by the manager.
TagStrategy = {
# Required; `extra` is the only optional part and defaults to no extra tags.
component_label | String | doc "Used as `component=<value>` tag",
extra | Array String | doc "Additional static tags (k=v strings)" | default = [],
},
# Database dump strategy. The `kind` variants cover the consistency/atomicity
# matrix; the enum below lists five kinds (three dump-based, two snapshot-based).
# All other fields are optional or defaulted and are not tied to `kind` here.
DumpStrategy = {
kind | [| 'stream_to_stdin, 'dump_to_path, 'pre_dump_then_path,
'csi_volume_snapshot, 'app_quiesce_then_snapshot |],
dump_command | String | optional,
path | String | optional,
# Defaults to true.
# NOTE(review): presumably removes the dump at `path` afterwards — confirm.
cleanup | Bool | default = true,
volume | String | optional,
snapshot_class | String | optional,
quiesce_cmd | String | optional,
unquiesce_cmd | String | optional,
},
DbEngine = [| 'postgresql, 'mariadb, 'mysql, 'redis, 'mongodb, 'surrealdb, 'etcd, 'sqlite |],
# Discriminated scope: what gets backed up and how it's grouped/tagged.
BackupScope = {
# Variant tag. Only `kind` and `name` are required; every other field is
# optional or defaulted, and nothing here ties a field to a given kind.
kind | [| 'service_full, 'per_domain, 'per_mailbox, 'database,
'volume_snapshot, 'logs_archive, 'kv_export |],
name | String | doc "Identifier within a policy (used in CLI: --scope <name>)",
paths | Array String | default = [],
exclude | Array String | default = [],
domains | Array String | default = [],
base_path | String | default = "",
selector | String | optional,
# `engine` and `dump_strategy` reference the sibling DbEngine / DumpStrategy contracts.
engine | DbEngine | optional,
dump_strategy | DumpStrategy | optional,
volumes | Array String | default = [],
snapshot_class | String | optional,
sources | Array String | default = [],
format | [| 'jsonl_gz, 'tar_gz, 'restic_native, 'sqlite_dump |] | optional,
rotation | _Duration | optional,
source | [| 'etcd, 'consul, 'loki, 'journald, 'files |] | optional,
tag_prefix | String | default = "",
tags | _Tags | default = {},
},
# Pre/post hooks executed by the manager around the backup run.
Hooks = {
# Both hook lists default to empty, so a bare `{}` is a valid Hooks value.
pre | Array String | default = [],
post | Array String | default = [],
# Default timeout is "5m".
# NOTE(review): per-hook or for the whole list — not visible here.
timeout | _Duration | default = "5m",
abort_on_failure | Bool | default = true,
},
# Throttle network bandwidth (passed to provider as --limit-upload/--limit-download).
Throttle = {
# Both limits are optional Numbers; an omitted field sets no value at all.
upload_kbps | Number | optional,
download_kbps | Number | optional,
},
# Verify policy. Drill is a separate spec consumed by verify_policy.ncl.
VerifyPolicyRef = {
  # Verification schedule: only the cron and interval variants are accepted
  # (no 'on_event). `cron_expr` uses the same _CronExpr contract as
  # Schedule.cron_expr, so a malformed expression is rejected here too
  # instead of being accepted as an arbitrary String.
  schedule
    | {
      kind | [| 'cron, 'interval |],
      cron_expr | _CronExpr | optional,
      every | _Duration | optional,
    }
    | optional,
  # Verification depth; defaults to 'quick.
  level | [| 'quick, 'deep, 'restore_drill, 'full_dr |] | default = 'quick,
  drill_ref | String | optional | doc "Reference to a DrillSpec by name (looked up from verify-recipes/)",
},
# Provider reference. Manager resolves to extensions/providers/backup/<name>/.
BackupProviderRef = {
# `name` is required; `version` may be omitted.
name | String | doc "Provider directory name (e.g. 'restic', 'kopia')",
version | String | optional | doc "Pinned version; warn if installed CLI mismatches",
},
# === Contracts ===========================================================
# NonEmptyScopes: an enabled BackupPolicy must have at least one scope.
NonEmptyScopes = std.contract.from_validator (fun value =>
  # Guard the type first: std.array.length on a non-array would abort with a
  # raw evaluation error instead of a contract error carrying this message.
  if !(std.is_array value) then
    'Error { message = "BackupPolicy.scopes must be an Array" }
  else if std.array.length value > 0 then
    'Ok
  else
    'Error { message = "BackupPolicy.scopes must contain at least one BackupScope" }
),
# MultiDestinationRequired: enforces the off-site replication invariant.
# A policy must declare ≥2 destinations and at least one with role = 'primary.
MultiDestinationRequired = std.contract.from_validator (fun value =>
  # Guard the type first so a non-array value is reported as a contract error
  # rather than as a raw failure inside std.array.length.
  if !(std.is_array value) then
    'Error { message = "BackupPolicy.destinations must be an Array" }
  else if std.array.length value < 2 then
    'Error {
      message = "BackupPolicy.destinations must contain at least 2 entries (off-site replication is non-negotiable)",
    }
  # Entries are expected to expose `role`; apply `Array Destination` before
  # this contract so the 'replica default is already in place.
  else if !(std.array.any (fun d => d.role == 'primary) value) then
    'Error {
      message = "BackupPolicy.destinations must contain at least one entry with role = 'primary",
    }
  else
    'Ok
),
# === Top-level policy ====================================================
# Top-level backup policy. The destination and scope invariants described in
# the file header are attached here, so a policy with fewer than two
# destinations, no 'primary destination, or no scope fails at evaluation time.
BackupPolicy = {
  provider | BackupProviderRef,
  # Element contract first (fills in the `role` default), invariant second.
  destinations
    | Array Destination
    | MultiDestinationRequired
    | doc "≥2 destinations, at least one 'primary",
  encryption
    | vault.VaultKeyRef
    | doc "Encryption key reference in vault (E2E encryption is non-negotiable)",
  schedule | Schedule,
  retention | RetentionPolicy,
  scopes
    | Array BackupScope
    | NonEmptyScopes
    | doc "1..N backup units; tagged determinístically",
  tag_strategy | TagStrategy,
  hooks | Hooks | optional,
  verify | VerifyPolicyRef | optional,
  throttle | Throttle | optional,
  consistency_group | String | optional | doc "If set, this policy participates in a BackupGroup",
},
}

View file

@ -0,0 +1,71 @@
# schemas/lib/build_spec.ncl — BuildSpec contract (ADR-039)
#
# Schema for .build-spec.ncl files at the root of each built repo.
# buildkit-launcher validates against this schema at parse time and exits
# non-zero on failure (constraint: build-spec-schema-versioned).
#
# Three-tier sizing resolution (launcher, not schema):
# 1. Explicit declaration here (highest priority)
# 2. P95 historical from orchestrator SurrealDB × 1.2
# 3. Language-default fallback (lowest priority)
#
# Usage:
# let bs = import "schemas/lib/build_spec.ncl" in
# { .. } | bs.BuildSpec
# Contract accepting any Number strictly greater than zero.
# Non-numeric input is rejected with a contract error; without the type guard
# the `>` comparison would fail with a primitive-operator type error instead.
let positive_number_ =
  std.contract.custom (
    fun label =>
      fun value =>
        if !(std.is_number value) then
          'Error {
            message = "Expected a positive number, got a value of type '%{std.to_string (std.typeof value)}'.\nAll resource fields must be > 0"
          }
        else if value > 0 then
          'Ok value
        else
          'Error {
            message = "Expected a positive number, got '%{std.to_string value}'.\nAll resource fields must be > 0"
          }
  )
in
# Contract accepting a Number in the half-open range (0, 256].
# The type guard turns non-numeric input into a contract error instead of a
# primitive-operator type error raised by the comparison.
let bounded_cpu_ =
  std.contract.custom (
    fun label =>
      fun value =>
        if !(std.is_number value) then
          'Error {
            message = "Invalid cpu value of type '%{std.to_string (std.typeof value)}'.\nValid range: (0, 256]"
          }
        else if value > 0 && value <= 256 then
          'Ok value
        else
          'Error {
            message = "Invalid cpu value '%{std.to_string value}'.\nValid range: (0, 256]"
          }
  )
in
# Contract accepting a Number of minutes in the range (0, 1440].
# The type guard turns non-numeric input into a contract error instead of a
# primitive-operator type error raised by the comparison.
let bounded_time_budget_ =
  std.contract.custom (
    fun label =>
      fun value =>
        if !(std.is_number value) then
          'Error {
            message = "Invalid time_budget_min of type '%{std.to_string (std.typeof value)}'.\nValid range: (0, 1440] — max 24 hours"
          }
        else if value > 0 && value <= 1440 then
          'Ok value
        else
          'Error {
            message = "Invalid time_budget_min '%{std.to_string value}'.\nValid range: (0, 1440] — max 24 hours"
          }
  )
in
# Record contract for a build spec. `cpu`, `memory_gb`, `disk_gb` and
# `time_budget_min` are required; the remaining fields carry defaults.
let _BuildSpec = {
  schema_version
    | Number
    | doc "Schema version — buildkit-launcher rejects files with unknown versions"
    | default = 1,
  cpu
    | bounded_cpu_
    | doc "Virtual CPUs to request for the ephemeral runner VM",
  memory_gb
    | positive_number_
    | doc "RAM in GiB for the runner VM",
  disk_gb
    | positive_number_
    | doc "Ephemeral disk in GiB; no persistent storage — all state is destroyed with the VM",
  time_budget_min
    | bounded_time_budget_
    | doc "Hard wall-clock limit in minutes; VM is killed on expiry",
  cache_keys
    | Array String
    | doc "sccache / BuildKit cache key namespaces to warm for this repo"
    | default = [],
  oom_retry
    | Bool
    | doc "When true, launcher retries once at next size tier on OOM kill; bounded to 1 retry (constraint: oom-retry-bounded)"
    | default = true,
} in
# Public surface of the module: the record contract, the reusable numeric
# contracts, and a helper that applies BuildSpec to a value.
{
  BuildSpec = _BuildSpec,
  PositiveNumber = positive_number_,
  BoundedCpu = bounded_cpu_,
  # Exported alongside its two siblings so consumers can reuse the same
  # time-budget bound that BuildSpec.time_budget_min is checked against.
  BoundedTimeBudget = bounded_time_budget_,
  # `not_exported` keeps the function out of serialized output.
  make_build_spec | not_exported = fun data => data | _BuildSpec,
}

View file

@ -0,0 +1,73 @@
# schemas/lib/capabilities.ncl — InfraCapabilities contract
#
# Declares what the infrastructure provides: cluster runtime, storage classes,
# ingress, TLS, volumes, networking, and registry topology.
# Source of truth for cross-validation against component requires.* fields
# and for registry resolution by integration tooling (prvng i).
#
# Usage:
# let cap = import "schemas/lib/capabilities.ncl" in
# { provides | cap.InfraCapabilities = { ... } }
{
  # Role of a registry within the topology; drives namespace ownership and
  # the direction of synchronisation.
  #   'primary  canonical store; other registries replicate FROM it
  #   'build    builder-local store; owns ephemeral cache namespaces
  #   'dev      developer workstation; on-demand mirror of primary
  #   'mirror   read-only replica with no own namespaces
  RegistryRole = [| 'primary, 'build, 'dev, 'mirror |],

  # Namespace policy attached to a single registry.
  #   own           namespaces this registry is authoritative for
  #   replicate_to  ids of other registries that should receive sync of `prefixes`
  #   mirror_from   id of upstream registry to mirror `prefixes` from (on-demand)
  #   prefixes      which namespace prefixes are synced (cross-registry contracts)
  RegistryNamespaces = {
    own
      | Array String
      | default = [],
    replicate_to
      | Array String
      | default = [],
    mirror_from
      | String
      | optional,
    prefixes
      | Array String
      | default = [],
  },

  # One registry of the topology. TLS is on unless disabled explicitly, and
  # the namespace policy falls back to an empty RegistryNamespaces.
  RegistryEntry = {
    id | String,
    endpoint | String,
    role | RegistryRole,
    tls
      | Bool
      | default = true,
    namespaces
      | RegistryNamespaces
      | default = {},
  },

  # Multi-registry topology of a workspace.
  #   registries  ordered list; first 'primary entry is the canonical store
  #   default     id of the registry used by integration tooling when no
  #               --registry flag or PROVISIONING_REGISTRY env is set
  RegistriesConfig = {
    registries
      | Array RegistryEntry
      | default = [],
    default
      | String
      | optional,
  },

  # What the infrastructure provides. The record itself and its `cluster`,
  # `networking` and `tls` sub-records are open (`..`), so extra fields pass.
  InfraCapabilities = {
    cluster
      | {
        name | String,
        runtime | String,
        ..
      }
      | optional,
    storage_classes
      | Array String
      | default = [],
    ingress_class
      | String
      | optional,
    container_runtime
      | String
      | optional,
    # Map of volume name to its mount point and size.
    volumes
      | { _ | { mount | String, size_gb | Number } }
      | default = {},
    networking
      | {
        private_network | String | optional,
        subnet | String | optional,
        floating_ip | String | optional,
        ..
      }
      | default = {},
    tls
      | {
        cluster_issuer | String | optional,
        available | Bool | default = false,
        ..
      }
      | default = {},
    registries
      | RegistriesConfig
      | default = {},
    ..
  },
}

171
schemas/lib/concerns.ncl Normal file
View file

@ -0,0 +1,171 @@
# Service Concerns Umbrella — mandatory declarative surface in ComponentDef.
# Every component must declare what it does (or doesn't do) for each concern:
# tls, dns, certs, backup, observability, security. Each is one of:
# 'enabled <impl> — concern is implemented; impl carries the configuration
# 'disabled — explicitly opt-out, with a stated reason
# 'pending — implementation deferred, with a backlog reference
# 'inherited — copied from a parent component (e.g. odoo profile)
#
# The umbrella absorbs the loose fields that components carry today
# (tls_secret, cluster_issuer, cert{}, dns_internal, dns_records, …) into
# typed variants. Existing 'extensions/components/<x>/nickel/main.ncl helpers
# may continue to read the loose fields for backwards compatibility while
# also emitting `concerns` for new consumers.
let bp = import "backup_policy.ncl" in
{
# === Concern state ========================================================
# Discriminated union of concern states. Encoded as a record with a `kind`
# tag so multiple concerns can coexist in a single ServiceConcerns record
# (Nickel does not support algebraic data types directly).
ConcernState = {
# `kind` is the only required field. All variant-specific fields below are
# optional, and this record does not check that the ones matching `kind` are
# set. Each *_impl payload is an open record (`{ .. }`), so the typed impl
# contracts defined further down are not applied here.
kind | [| 'enabled, 'disabled, 'pending, 'inherited |],
# 'enabled — payload depends on the concern (tls.impl, dns.impl, …);
# callers thread the right impl type via the wrapper records below.
# 'disabled
reason | String | optional,
since | String | optional | doc "ISO date when concern was explicitly disabled",
# 'pending
backlog_ref | String | optional | doc "Identifier of the backlog/issue tracking the implementation",
target_iteration | String | optional,
# 'inherited
from | String | optional | doc "Name of the parent ComponentDef the concern is inherited from",
# 'enabled payload — exactly one of these is populated based on the concern
tls_impl | { .. } | optional,
dns_impl | { .. } | optional,
certs_impl | { .. } | optional,
backup_impl | { .. } | optional,
observability_impl | { .. } | optional,
security_impl | { .. } | optional,
},
# === Concern impl types ===================================================
# TLS implementation. Absorbs `tls_secret`, `cluster_issuer`, `tls_hostnames`.
TlsImpl = {
# `secret_name` and `issuer_ref` are required; `hostnames` defaults to empty.
secret_name | String | doc "K8s Secret name where cert-manager stores the cert (was tls_secret)",
issuer_ref | String | doc "ClusterIssuer name (was cluster_issuer)",
hostnames | Array String | doc "Additional SANs (was tls_hostnames)" | default = [],
},
# DNS implementation. Absorbs `dns_internal` (private routes via gateway),
# `dns_records` (public records: domain/mx/spf/dmarc/dkim_selector/autoconfig),
# `dns_zone`, `acme_email`.
# One internal DNS route: `name` and `zone` are required, `gateway` and
# `target` are optional.
DnsRoute = {
name | String,
zone | String,
gateway | String | optional,
target | String | optional,
},
# Public DNS records for a domain. Every field is optional or defaulted, so an
# empty record satisfies this contract.
DnsRecordSpec = {
domain | String | optional,
hostname | String | optional,
mx | Array { priority | Number, value | String } | default = [],
spf | String | optional,
dmarc | { policy | [| 'none, 'quarantine, 'reject |], rua | String | optional, ruf | String | optional } | optional,
autoconfig | String | optional,
dkim_selector | String | optional,
extra | { .. } | doc "Free-form provider-specific records" | default = {},
},
# DNS concern payload combining internal routes (DnsRoute) and public records
# (DnsRecordSpec). All fields are optional or defaulted.
DnsImpl = {
internal | Array DnsRoute | doc "Was dns_internal (dns_private.via_gateway/make_route)" | default = [],
public | DnsRecordSpec | optional | doc "Was dns_records",
zone | String | optional | doc "Was dns_zone",
acme_email | String | optional | doc "Was acme_email (only used when certs concern derives from this email)",
},
# Certificates implementation. Absorbs `cert = { acme_server, email, secret_ref, provider }`.
# Distinct from TLS: TLS = the request made to the issuer; Certs = the
# configuration of the ACME issuer itself.
# All four fields are required.
CertsImpl = {
acme_server | String,
email | String,
secret_ref | String | doc "DNS provider credentials secret reference",
provider | [| 'cloudflare, 'hetzner, 'aws, 'route53, 'digitalocean, 'gcp, 'azure |],
},
# Observability implementation. Surface only — deeper schemas land in a
# later iteration. Components most commonly declare 'pending here.
ObservabilityImpl = {
  # Each signal is a small sub-record that is switched off by default.
  metrics
    | {
      enabled | Bool,
      port | Number | optional,
      path | String | default = "/metrics",
    }
    | default = { enabled = false },
  logs
    | {
      enabled | Bool,
      sink | [| 'stdout, 'loki, 'journald |] | default = 'stdout,
    }
    | default = { enabled = false },
  traces
    | {
      enabled | Bool,
      otlp_endpoint | String | optional,
    }
    | default = { enabled = false },
  # Alert rules; severity falls back to 'warning when not given.
  alerts
    | Array {
      name | String,
      expr | String,
      severity | [| 'info, 'warning, 'critical |] | default = 'warning,
    }
    | default = [],
},
# Security implementation. Surface only.
SecurityImpl = {
# All three fields are optional; an empty record satisfies this contract.
network_policy | String | optional | doc "Reference to a NetworkPolicy resource",
pod_security | [| 'restricted, 'baseline, 'privileged |] | optional,
rbac | String | optional | doc "Reference to RBAC bundle",
},
# === Builders =============================================================
# Helper functions for components and migrations to construct ConcernState
# values without repeating the discriminated-union plumbing.
# 'enabled builders: each wraps its payload in the matching *_impl field.
# Parameter names deliberately differ from every field name, because a field
# written as `x = x` inside a record would refer to itself.
enabled_tls = fun payload => { kind = 'enabled, tls_impl = payload },
enabled_dns = fun payload => { kind = 'enabled, dns_impl = payload },
enabled_certs = fun payload => { kind = 'enabled, certs_impl = payload },
enabled_backup = fun payload => { kind = 'enabled, backup_impl = payload },
enabled_observability = fun payload => { kind = 'enabled, observability_impl = payload },
enabled_security = fun payload => { kind = 'enabled, security_impl = payload },
# 'disabled: explicit opt-out carrying the stated reason.
disabled = fun why => { kind = 'disabled, reason = why },
# 'pending: deferred implementation with a reason and a backlog reference.
pending = fun why ticket => { kind = 'pending, reason = why, backlog_ref = ticket },
# 'inherited: concern taken over from the named parent component.
inherited = fun parent => { kind = 'inherited, from = parent },
# === Top-level umbrella ===================================================
ServiceConcerns = {
# All six concerns are required fields (none is optional or defaulted), so a
# value must state a ConcernState for each one.
tls | ConcernState,
dns | ConcernState,
certs | ConcernState,
backup | ConcernState,
observability | ConcernState,
security | ConcernState,
},
}

View file

@ -0,0 +1,193 @@
# Reusable ServiceConcerns presets for component defaults.
#
# Component contracts.ncl files declare `concerns | _concerns_lib.ServiceConcerns | optional`
# and their defaults.ncl files set `concerns | default = presets.<preset>` to give
# the component an honest declarative surface without repeating boilerplate.
#
# Presets cover the recurring archetypes in libre-wuji:
# - stateless : no TLS/DNS/data — most container runtimes,
# kernel modules, OS-level taskservs
# - infra_storage_managed : storage backends that handle their own
# backup outside per-component policies
# (Longhorn engine state via SystemBackupDef)
# - tls_endpoint_with_acme : public service with cert-manager TLS
# and ACME issuer config; backup decided
# at workspace level
# - observability_telemetry : Prometheus/Grafana/Loki/Vector — config
# in git, data either transient or already
# shipped to S3
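# - infrastructure_glue : controllers/operators with no user data
# (cilium, hccm, csi, ops-controller)
# - dns_provider : DNS services (CoreDNS, external-dns) that own
# DNS records but expose no TLS endpoint
# - database : database engines (PostgreSQL, MariaDB, SurrealDB);
# backup policy decided at workspace level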
# Shared 'pending stubs reused by the presets below for the two concerns whose
# implementation is deferred.
let _pending_obs = {
  backlog_ref = "OBS-001",
  reason = "ObservabilityImpl iteration deferred — surface stub only",
  kind = 'pending,
} in
let _pending_sec = {
  backlog_ref = "SEC-001",
  reason = "SecurityImpl iteration deferred — surface stub only",
  kind = 'pending,
} in
{
presets = {
# ── Stateless service ────────────────────────────────────────────────
# Container runtimes (containerd, runc, crun, youki), OS modules,
# kernel-level taskservs. No persistent state, no network endpoints
# exposed at the component level.
stateless = {
  # Network-facing concerns are all switched off, each with its own reason.
  tls = {
    kind = 'disabled,
    reason = "no TLS termination at this layer",
  },
  dns = {
    kind = 'disabled,
    reason = "no DNS records owned by this component",
  },
  certs = {
    kind = 'disabled,
    reason = "no ACME issuer required",
  },
  backup = {
    kind = 'disabled,
    reason = "stateless: configuration in git, no runtime data to capture",
  },
  # Deferred concerns reuse the shared pending stubs.
  observability = _pending_obs,
  security = _pending_sec,
},
# ── Storage backend with its own backup model ────────────────────────
# Longhorn (engine state in SystemBackupDef.longhorn_engine), local-path
# provisioner, Hetzner CSI, democratic CSI. Their data is captured by
# the system-level backup, not per-component.
infra_storage_managed = {
  # No TLS, DNS or certificate surface at the component level.
  tls = {
    kind = 'disabled,
    reason = "internal cluster storage, no TLS endpoint",
  },
  dns = {
    kind = 'disabled,
    reason = "no DNS records owned by this component",
  },
  certs = {
    kind = 'disabled,
    reason = "no ACME issuer required",
  },
  # Backup is disabled here; the reason points at the system-level backup.
  backup = {
    kind = 'disabled,
    reason = "engine state captured by SystemBackupDef.longhorn_engine (or equivalent system target)",
  },
  observability = _pending_obs,
  security = _pending_sec,
},
# ── Public service with cert-manager TLS + ACME ──────────────────────
# docker-mailserver, odoo, zot, anything that terminates HTTPS or SMTPS
# via cert-manager. Tls/Dns/Certs concerns get populated from existing
# tls_secret/cluster_issuer/cert/dns_records fields. Backup decided at
# workspace level (concerns.backup overridden in infra/<workspace>/components/<x>.ncl).
# Builds a ServiceConcerns-shaped record from `args`. Fields read from `args`:
# tls_secret, cluster_issuer, hostnames, dns_internal, dns_zone, acme_server,
# acme_email, cert_secret_ref, cert_provider, backup_backlog_ref.
tls_endpoint_with_acme = fun args =>
  let tls_concern = {
    kind = 'enabled,
    tls_impl = {
      secret_name = args.tls_secret,
      issuer_ref = args.cluster_issuer,
      hostnames = args.hostnames,
    },
  } in
  let dns_concern = {
    kind = 'enabled,
    dns_impl = {
      internal = args.dns_internal,
      zone = args.dns_zone,
    },
  } in
  let certs_concern = {
    kind = 'enabled,
    certs_impl = {
      acme_server = args.acme_server,
      email = args.acme_email,
      secret_ref = args.cert_secret_ref,
      provider = args.cert_provider,
    },
  } in
  # Backup stays pending: only the backlog reference comes from the caller.
  let backup_concern = {
    kind = 'pending,
    reason = "BackupPolicy declared at workspace level",
    backlog_ref = args.backup_backlog_ref,
  } in
  {
    tls = tls_concern,
    dns = dns_concern,
    certs = certs_concern,
    backup = backup_concern,
    observability = _pending_obs,
    security = _pending_sec,
  },
# ── Observability stack components (Prometheus/Grafana/Loki/Vector) ──
# No user data; configuration in git; metric/log data either transient
# (Prometheus WAL) or already shipped to S3 (Loki via boltdb-shipper).
observability_telemetry = {
  tls = {
    kind = 'disabled,
    reason = "internal cluster service, ingress-level TLS handled separately",
  },
  dns = {
    kind = 'disabled,
    reason = "no DNS records owned by this component",
  },
  certs = {
    kind = 'disabled,
    reason = "no ACME issuer required",
  },
  backup = {
    kind = 'disabled,
    reason = "config in git; runtime data either transient or shipped to S3 backend",
  },
  # The only preset that ships an enabled observability payload: metrics on
  # port 9090 at /metrics, logs to the 'loki sink, traces off, no alert rules.
  observability = {
    kind = 'enabled,
    observability_impl = {
      metrics = {
        enabled = true,
        port = 9090,
        path = "/metrics",
      },
      logs = {
        enabled = true,
        sink = 'loki,
      },
      traces = {
        enabled = false,
      },
      alerts = [],
    },
  },
  security = _pending_sec,
},
# ── Infrastructure glue (controllers/operators) ──────────────────────
# cilium, hccm, hetzner-csi, ops-controller. State lives in K8s API,
# captured by SystemBackupDef.cluster_resources.
infrastructure_glue = {
  tls = {
    kind = 'disabled,
    reason = "controller-level RBAC, not TLS endpoint",
  },
  dns = {
    kind = 'disabled,
    reason = "no DNS records owned by this component",
  },
  certs = {
    kind = 'disabled,
    reason = "no ACME issuer required",
  },
  # Disabled at the component level; the reason names the system-level target.
  backup = {
    kind = 'disabled,
    reason = "state in K8s API captured by SystemBackupDef.cluster_resources",
  },
  observability = _pending_obs,
  security = _pending_sec,
},
# ── DNS provider service (CoreDNS, external-dns) ─────────────────────
# Owns DNS records but typically not TLS endpoint of its own.
dns_provider = {
  tls = {
    kind = 'disabled,
    reason = "DNS server, not TLS endpoint",
  },
  # DNS is the one enabled concern; the payload starts with no internal routes
  # and an empty zone string.
  dns = {
    kind = 'enabled,
    dns_impl = {
      internal = [],
      zone = "",
    },
  },
  certs = {
    kind = 'disabled,
    reason = "no ACME issuer required",
  },
  backup = {
    kind = 'pending,
    reason = "zone files captured by SystemBackupDef.external_dns",
    backlog_ref = "BACKUP-DNS-001",
  },
  observability = _pending_obs,
  security = _pending_sec,
},
# ── Database (PostgreSQL, MariaDB, SurrealDB) ─────────────────────────
# Backup with database scope + dump strategy. Decided at workspace level.
database = {
  tls = {
    kind = 'disabled,
    reason = "internal cluster service, ingress-level TLS handled separately",
  },
  dns = {
    kind = 'disabled,
    reason = "no public DNS records",
  },
  certs = {
    kind = 'disabled,
    reason = "no ACME issuer required",
  },
  # Backup is pending rather than disabled: it is tracked under a backlog id.
  backup = {
    kind = 'pending,
    reason = "BackupPolicy with database scope + dump_strategy declared at workspace level",
    backlog_ref = "BACKUP-DB-001",
  },
  observability = _pending_obs,
  security = _pending_sec,
},
},
# ── Helper for components that need to compose a custom ServiceConcerns
# from individual variants (rather than picking a preset wholesale).
builders = {
  # Nickel records are recursive: inside `{ reason = reason }` the right-hand
  # `reason` is the field itself, not the function argument, which makes the
  # field an infinite recursion. The parameters therefore use names that do
  # not collide with any field of the record being built.

  # Builds a 'pending concern state from a reason and a backlog reference.
  pending = fun reason_text backlog => {
    kind = 'pending,
    reason = reason_text,
    backlog_ref = backlog,
  },
  # Builds a 'disabled concern state from a reason.
  disabled = fun reason_text => {
    kind = 'disabled,
    reason = reason_text,
  },
},
}

View file

@ -2,6 +2,8 @@
# | Migrated from: provisioning/kcl/lib.k
# | Pattern: Schema definitions only
let _concerns_lib = import "concerns.ncl" in
{
StorageVol = {
name | String,
@ -24,11 +26,26 @@
parts,
},
TaskServDependency = {
# Only `name` is required: `kind` defaults to 'Requires and `condition` to "".
name | String,
kind | [| 'Requires, 'PrefersBefore, 'ConflictsWith |] | default = 'Requires,
condition | String | default = "",
},
TaskServDef = {
name | String,
install_mode | String | default = "library",
profile | String | default = "default",
name | String,
install_mode | String | default = "library",
profile | String | default = "default",
target_save_path | String | default = "",
depends_on | Array {
name | String,
kind | [| 'Requires, 'PrefersBefore, 'ConflictsWith |] | default = 'Requires,
condition | String | default = "",
} | default = [],
on_error | [| 'Stop, 'Continue, 'Retry |] | default = 'Stop,
max_retries | Number | default = 0,
params | { .. } | default = {},
..
},
ClusterDef = {
@ -37,6 +54,130 @@
target_save_path | String | default = "",
},
# Unified component model — deployment mode selector
DeployMode = [| 'taskserv, 'cluster, 'container |],
# Port exposure requirements declared by a component
PortRequirement = {
port | Number,
protocol | String | default = "TCP",
exposure | [| 'public, 'private, 'internal |] | default = 'internal,
},
# What a component needs from the infrastructure
ComponentRequires = {
  # Optional storage request; when present both fields are required.
  storage | { size | String, persistent | Bool } | optional,
  # Reuses the sibling PortRequirement contract instead of repeating its
  # fields inline, so the two definitions cannot drift apart.
  ports | Array PortRequirement | default = [],
  credentials | Array String | default = [],
},
# What a component exposes to other components
ComponentProvides = {
# Every field is optional or defaulted; an empty record is valid.
service | String | optional,
port | Number | optional,
databases | Array String | default = [],
endpoints | Array String | default = [],
},
# Operations supported by a component (maps to CMD_TSK dispatch in scripts)
ComponentOperations = {
# `install` is the only operation enabled by default; all others default to false.
install | Bool | default = true,
update | Bool | default = false,
reinstall | Bool | default = false,
delete | Bool | default = false,
backup | Bool | default = false,
restore | Bool | default = false,
health | Bool | default = false,
config | Bool | default = false,
scripts | Bool | default = false,
restart | Bool | default = false,
},
# How to verify a component is live after deployment.
# Orthogonal to mode (provisioning mechanism) — describes runtime observability strategy.
LiveCheckDef = {
# Every field carries a default, so `{}` is a valid LiveCheckDef: it resolves
# to strategy 'none, scope 'cp_only, empty strings and 'all_must_pass.
# 'k8s_pods — kubectl get pods filtered by namespace+selector (via CP SSH)
# 'k8s_nodes — kubectl get nodes filtered by selector; healthy = all Ready (for worker components)
# 'k8s_api — proxy: apiserver reachable if kubectl returns node list
# 'systemd — systemctl is-active <service> on target servers (skipped in ll fast path)
# 'none — no observable runtime state (one-shot ops, bare binaries)
strategy | [| 'k8s_pods, 'k8s_nodes, 'k8s_api, 'systemd, 'none |] | default = 'none,
# 'cp_only — SSH to control-plane only (kubectl sees all pods/nodes from there)
# 'target — SSH to component.target (typically CP for taskservs with explicit target)
# 'all_servers — check all servers in workspace state (systemd only; skipped in ll)
# 'workers_only — check only worker nodes (k8s_nodes for kubernetes_worker)
scope | [| 'cp_only, 'target, 'all_servers, 'workers_only |] | default = 'cp_only,
namespace | String | default = "", # overrides component.namespace for pod filter
selector | String | default = "", # overrides component.pod_selector; also used as node name filter
service | String | default = "", # systemd unit name
# Aggregation for multi-server checks (all_servers / workers_only scope):
# 'all_must_pass — any failure → degraded (runtimes, DNS)
# 'any_active — at least one live → partial acceptable
# 'majority — >50% live → healthy
aggregate | [| 'all_must_pass, 'any_active, 'majority |] | default = 'all_must_pass,
},
# Unified component definition — extends TaskServDef shape with mode, requires, provides.
# Open record with defaults on the new fields. The one exception is `concerns`,
# which has no default and is not optional, so it must always be supplied.
# Sub-shapes reference the named sibling contracts (DeployMode,
# TaskServDependency, ComponentRequires, ComponentProvides,
# ComponentOperations) instead of repeating their fields inline, so each
# shape is defined in exactly one place.
ComponentDef = {
  name | String,
  mode | DeployMode | default = 'taskserv,
  target | String | optional, # server hostname (taskserv mode)
  namespace | String | optional, # k8s namespace (cluster mode)
  pod_selector | String | optional, # k8s pod name search pattern (overrides component name when k8s release name differs)
  live_check | LiveCheckDef | default = { strategy = 'none, scope = 'cp_only, namespace = "", selector = "", service = "", aggregate = 'all_must_pass },
  node_selector | { _ | String } | optional, # k8s node affinity (cluster mode)
  install_mode | String | default = "library",
  profile | String | default = "default",
  target_save_path | String | default = "",
  depends_on | Array TaskServDependency | default = [],
  on_error | [| 'Stop, 'Continue, 'Retry |] | default = 'Stop,
  max_retries | Number | default = 0,
  params | { .. } | default = {},
  requires | ComponentRequires | default = {},
  provides | ComponentProvides | default = {},
  operations | ComponentOperations | default = {},
  # Mandatory declarative surface for service-level concerns. Each entry is a
  # ConcernState variant (enabled/disabled/pending/inherited). Components that
  # don't implement a concern declare 'pending {reason, backlog_ref} or
  # 'disabled {reason} — never omit. CI/ontoref consume this surface to emit
  # backlog priorities and architecture documentation.
  concerns | _concerns_lib.ServiceConcerns,
  ..
},
ScaleData = {
def | String,
disabled | Bool,

View file

@ -0,0 +1,122 @@
# schemas/lib/dag/contracts.ncl — DAG domain type contracts
#
# Three distinct DAG layers:
# 1. Capability layer — ExtensionCapability/ExtensionDependency (extension metadata)
# 2. Composition layer — WorkspaceComposition (inter-formula ordering)
# 3. Resolution layer — ResolutionPolicy (capability → extension mapping)
#
# Pattern: separate let bindings with _ prefix, same as formula.ncl.
# No self-references, no let rec — each binding is in scope for subsequent ones.
# ---------------------------------------------------------------------------
# Capability layer
# ---------------------------------------------------------------------------
# How a dependency on a capability is classified.
let _capability_kind = [| 'Required, 'Optional, 'ConflictsWith |] in
# A capability declared by an extension; all three fields are required Strings.
let _ExtensionCapability = {
id | String,
version | String,
interface | String,
} in
# A dependency on a capability. `capability` is a plain String.
# NOTE(review): presumably an ExtensionCapability id — not enforced here.
let _ExtensionDependency = {
capability | String,
kind | _capability_kind,
min_version | String | optional,
} in
# ---------------------------------------------------------------------------
# Composition layer — inter-formula DAG
# Distinct from the intra-formula DAG in formula.ncl (per-server task ordering).
# WorkspaceComposition declares execution ordering between formulas.
# ---------------------------------------------------------------------------
# State a dependency formula must reach (enum only; semantics live in the consumer).
let _composition_condition = [| 'Completed, 'Healthy, 'Running |] in
# One edge of the inter-formula graph: the referenced formula and the condition.
let _FormulaDep = {
formula_id | String,
condition | _composition_condition,
} in
# Health gate attached to a formula entry. Everything except `check_server`
# is required.
let _HealthGate = {
check_cmd | String,
expect | String,
timeout_ms | Number,
retries | Number,
check_server | String | optional,
} in
# One formula in the composition. `depends_on` defaults to [] (a root entry)
# and `parallel` defaults to false.
let _FormulaCompositionEntry = {
formula_id | String,
depends_on | Array _FormulaDep | default = [],
parallel | Bool | default = false,
health_gate | _HealthGate | optional,
} in
# Base shape — used as first step inside the custom contract (same pattern as _FormulaBase
# in formula.ncl) so missing-field errors surface before cross-field validation runs.
let _WorkspaceCompositionBase = {
formulas | Array _FormulaCompositionEntry,
} in
# Custom contract: validates referential integrity across formula entries.
# - At least one formula must have depends_on = [] (root node)
# - All depends_on[].formula_id must reference a declared formula_id
# Cross-field contract for a workspace composition.
# Checks, in order:
#   1. the value matches _WorkspaceCompositionBase (shape and defaults);
#   2. at least one entry has depends_on = [] (a root node);
#   3. every depends_on[].formula_id names a declared formula_id.
# Failures are returned as 'Error { message } — the return protocol of
# std.contract.custom — instead of calling blame_with_message from inside the
# contract body. On success the defaulted base value is returned.
let _WorkspaceComposition = std.contract.custom (fun label value =>
  let base = value | _WorkspaceCompositionBase in
  let ids = base.formulas |> std.array.map (fun e => e.formula_id) in
  let has_root = base.formulas |> std.array.any (fun e => e.depends_on == []) in
  # One human-readable line per dependency pointing at an undeclared formula.
  let bad_deps =
    base.formulas
    |> std.array.flat_map (fun e =>
      e.depends_on
      |> std.array.filter (fun d =>
        !(ids |> std.array.any (fun id => id == d.formula_id))
      )
      |> std.array.map (fun d =>
        "formula '%{e.formula_id}' depends_on unknown '%{d.formula_id}'"
      )
    )
  in
  if !has_root then
    'Error {
      message = "WorkspaceComposition: at least one formula must have depends_on = []",
    }
  else if (std.array.length bad_deps) > 0 then
    'Error {
      message = "WorkspaceComposition: invalid depends_on references: %{std.string.join ", " bad_deps}",
    }
  else
    'Ok base
) in
# ---------------------------------------------------------------------------
# Resolution layer — capability → concrete extension mapping
# ---------------------------------------------------------------------------
# Closed enum of resolution strategies accepted by ResolutionPolicy.strategy.
let _resolution_strategy = [| 'Strict, 'BestEffort |] in
# One explicit mapping from a capability id to an extension name (both required strings).
let _ResolutionEntry = {
capability_id | String,
extension_name | String,
} in
# Resolution policy record. `strategy` and `allow_optional_gaps` are required;
# `overrides` defaults to an empty list.
let _ResolutionPolicy = {
strategy | _resolution_strategy,
overrides | Array _ResolutionEntry | default = [],
allow_optional_gaps | Bool,
} in
# ---------------------------------------------------------------------------
# Exports
# ---------------------------------------------------------------------------
# Each public name maps to the underscore-prefixed let binding that defines it.
{
CapabilityKind = _capability_kind,
ExtensionCapability = _ExtensionCapability,
ExtensionDependency = _ExtensionDependency,
CompositionCondition = _composition_condition,
FormulaDep = _FormulaDep,
HealthGate = _HealthGate,
FormulaCompositionEntry = _FormulaCompositionEntry,
WorkspaceComposition = _WorkspaceComposition,
ResolutionStrategy = _resolution_strategy,
ResolutionEntry = _ResolutionEntry,
ResolutionPolicy = _ResolutionPolicy,
}

View file

@ -0,0 +1,26 @@
# schemas/lib/dag/defaults.ncl — DAG domain default values
#
# Pure default values — no contracts, no functions.
# Pattern follows schemas/lib/defaults.ncl.
#
# Consumers: schemas/lib/dag/main.ncl exposes these as dag.defaults.*
# schemas/config/dag/main.ncl imports them for runtime config
{
# Defaults for formula composition.
composition = {
max_parallel = 4,
default_on_error = 'Stop,
default_retries = 0,
# 5000 ms = 5 seconds.
health_check_interval_ms = 5000,
# 300000 ms = 5 minutes.
timeout_ms = 300000,
},
# Same three keys as the ResolutionPolicy contract in contracts.ncl
# (strategy, overrides, allow_optional_gaps).
resolution = {
strategy = 'Strict,
allow_optional_gaps = false,
overrides = [],
},
# Event emission defaults.
# NOTE(review): how emit_nats and subject_prefix are consumed is not visible here.
events = {
emit_nats = true,
subject_prefix = "provisioning.dag",
},
}

26
schemas/lib/dag/main.ncl Normal file
View file

@ -0,0 +1,26 @@
# schemas/lib/dag/main.ncl — DAG domain public API
#
# Re-exports all contracts and defaults from the dag/ subdomain.
# Registered in schemas/lib/main.ncl as: dag = import "./dag/main.ncl"
# Accessible as: provisioning.lib.dag.WorkspaceComposition etc.
let c = import "./contracts.ncl" in
let d = import "./defaults.ncl" in
{
# Contracts — applied via | dag.WorkspaceComposition, | dag.ResolutionPolicy, etc.
CapabilityKind = c.CapabilityKind,
ExtensionCapability = c.ExtensionCapability,
ExtensionDependency = c.ExtensionDependency,
CompositionCondition = c.CompositionCondition,
FormulaDep = c.FormulaDep,
HealthGate = c.HealthGate,
FormulaCompositionEntry = c.FormulaCompositionEntry,
WorkspaceComposition = c.WorkspaceComposition,
ResolutionStrategy = c.ResolutionStrategy,
ResolutionEntry = c.ResolutionEntry,
ResolutionPolicy = c.ResolutionPolicy,
# Default values — used by config/dag/main.ncl and workspace-level overrides
defaults = d,
}

View file

@ -1,20 +1,29 @@
# Extension Metadata Schema - Type-safe extension definition
# Defines metadata for each extension including dependencies and best practices
# Used for DAG construction and extension initialization ordering
#
# Capability fields (provides/requires/conflicts_with) added additively with defaults.
# All existing metadata.ncl files continue to export without modification.
# The detect_conflicts reflection step reads .conflicts_with // [] — the // [] null-coalesce
# was already in place; once files are migrated these fields become active.
let dag = import "./dag/contracts.ncl" in
# Schema for extension metadata
let ExtensionMetadataSchema = {
name | String,
version | String,
category | String,
description | String,
dependencies | Array String,
tags | Array String,
best_practices | Array String,
name | String,
version | String,
category | String | default = "", # optional in flat components/ structure
description | String,
dependencies | Array String | default = [], # legacy flat dependency list — kept
provides | Array dag.ExtensionCapability | default = [], # capability ids this extension satisfies
requires | Array dag.ExtensionDependency | default = [], # typed capability requirements
conflicts_with | Array String | default = [], # extension names this conflicts with
tags | Array String,
modes | Array String | default = ["taskserv"], # available deployment modes
best_practices | Array String | default = [],
}
in
# Export schema
{
schema = ExtensionMetadataSchema,
}

130
schemas/lib/formula.ncl Normal file
View file

@ -0,0 +1,130 @@
# schemas/lib/formula.ncl — Workspace Formula DAG
#
# A Formula is a typed DAG that is simultaneously:
# - A validatable declaration (Nickel typecheck + referential integrity)
# - An executable pipeline (Orchestrator consumes the DAG via nickel export)
# - A governable artifact (on+re tracks state, gates, and audit)
#
# Usage:
# let f = import "schemas/lib/formula.ncl" in
# f.make_formula { id = "...", nodes = [...], ... }
let ts = import "contracts.ncl" in
# Closed enum of dependency kinds accepted by FormulaDep.kind and FormulaEdge.kind.
let _dep_kind = [| 'Always, 'OnSuccess, 'OnFailure |] in
# Closed enum of values accepted by a node's on_error field.
let _on_error = [| 'Stop, 'Continue, 'Retry |] in
# Dependency from one FormulaNode to another (by node id)
# `kind` falls back to 'OnSuccess when the caller omits it.
let _FormulaDep = {
node_id | String,
kind | _dep_kind | default = 'OnSuccess,
} in
# A node in the formula DAG.
# Exactly one of `taskserv` or `component` must be present.
# - taskserv: L2 nodes — legacy field, existing formulas unchanged
# - component: L3+ nodes — unified model, orchestrator uses component.mode to resolve
let _FormulaNode = std.contract.custom (fun label value =>
  # Structural check first; the taskserv/component exclusivity rule follows.
  let node = value | {
    id | String,
    taskserv | ts.TaskServDef | optional,
    component | ts.ComponentDef | optional,
    depends_on | Array _FormulaDep | default = [],
    parallel | Bool | default = false,
    on_error | _on_error | default = 'Stop,
    max_retries | Number | default = 0,
  } in
  # Number of payload fields (taskserv / component) actually present: 0, 1 or 2.
  let payload_count =
    ["taskserv", "component"]
    |> std.array.filter (fun field_name => std.record.has_field field_name node)
    |> std.array.length
  in
  if payload_count == 1 then
    'Ok node
  else if payload_count == 2 then
    std.contract.blame_with_message
      "FormulaNode '%{node.id}': exactly one of 'taskserv' or 'component' must be present, not both"
      label
  else
    std.contract.blame_with_message
      "FormulaNode '%{node.id}': exactly one of 'taskserv' or 'component' must be present"
      label
) in
# An explicit edge declaration (alternative to depends_on inside nodes)
# Edge record: `from` and `to` are required strings; `kind` defaults to 'OnSuccess.
let _FormulaEdge = {
from | String,
to | String,
kind | _dep_kind | default = 'OnSuccess,
} in
# Base structure without cross-field validation
# Required: id, description, provider, server, nodes.
# Defaulted: edges (empty list) and max_parallel (4).
let _FormulaBase = {
id | String,
description | String,
provider | String,
server | String,
nodes | Array _FormulaNode,
edges | Array _FormulaEdge | default = [],
max_parallel | Number | default = 4,
} in
# Contract: all node_id values in depends_on must reference an existing node id.
# Also validates edge endpoints.
let _Formula = std.contract.custom (fun label value =>
  # Structural validation first; the graph checks below read the validated record.
  let formula = value | _FormulaBase in
  let known_ids = std.array.map (fun node => node.id) formula.nodes in
  let is_known = fun candidate => std.array.elem candidate known_ids in
  # Single pass over the ids: `seen` is used as a set, `dups` collects every
  # id that shows up after its first occurrence.
  let scan =
    std.array.fold_left
      (fun state id =>
        if std.record.has_field id state.seen then
          { seen = state.seen, dups = state.dups @ [id] }
        else
          { seen = state.seen & { "%{id}" = true }, dups = state.dups }
      )
      { seen = {}, dups = [] }
      known_ids
  in
  # depends_on entries that point at an undeclared node id.
  let dangling_deps =
    std.array.flat_map
      (fun node =>
        node.depends_on
        |> std.array.filter (fun dep => !(is_known dep.node_id))
        |> std.array.map (fun dep =>
          "node '%{node.id}' depends_on unknown '%{dep.node_id}'"
        )
      )
      formula.nodes
  in
  # Edges with at least one endpoint that is not a declared node id.
  let dangling_edges =
    formula.edges
    |> std.array.filter (fun edge => !(is_known edge.from && is_known edge.to))
    |> std.array.map (fun edge => "'%{edge.from}' -> '%{edge.to}'")
  in
  # Checks run in this order: duplicates, depends_on, edges. The first failure blames.
  if std.array.length scan.dups > 0 then
    std.contract.blame_with_message
      "Formula '%{formula.id}': duplicate node ids: %{std.string.join ", " scan.dups}"
      label
  else if std.array.length dangling_deps > 0 then
    std.contract.blame_with_message
      "Formula '%{formula.id}' has invalid depends_on: %{std.string.join ", " dangling_deps}"
      label
  else if std.array.length dangling_edges > 0 then
    std.contract.blame_with_message
      "Formula '%{formula.id}' has invalid edge endpoints: %{std.string.join ", " dangling_edges}"
      label
  else
    'Ok formula
) in
# Public API of this module: four contracts plus four constructor helpers.
# make_dep and make_edge merge the caller's record with the record contract (`&`);
# make_node and make_formula apply a contract annotation (`|`) — _FormulaNode and
# _Formula are custom contracts, not plain records.
{
FormulaDep = _FormulaDep,
FormulaNode = _FormulaNode,
FormulaEdge = _FormulaEdge,
Formula = _Formula,
make_dep = fun data => _FormulaDep & data,
make_node = fun data => data | _FormulaNode,
make_edge = fun data => _FormulaEdge & data,
make_formula = fun data => data | _Formula,
}

View file

@ -0,0 +1,93 @@
# schemas/lib/integration/cabling.ncl
#
# Cabling format: per-workspace, per-mode binding file that resolves
# domain context fields to their concrete sources.
#
# Location: infra/<ws>/integrations/<mode-id>.ncl
#
# Each entry in `bindings` maps a dotted domain field path
# (e.g. "secret-delivery.registry_password") to a Resolver record.
#
# Resolver kinds (discriminated by the `kind` field):
# "sops" — decrypt field from a SOPS-encrypted file
# "component" — read field from a component's output record
# "literal" — static hardcoded value
# "env" — read from an environment variable at assembly time
#
# Usage:
# "secret-delivery.registry_password" = { kind = "sops", path = "secrets/zot.sops.yaml", key = "ZOT_HTPASSWD" },
# "secret-delivery.registry_url" = { kind = "component", name = "zot", field = "registry_url" },
# "event-emission.subject_prefix" = { kind = "literal", value = "ws.libre-wuji.build.lian-build" },
# "compute.api_key" = { kind = "env", env_var = "HETZNER_API_KEY" },
# NOTE(review): _valid_kinds is not referenced by _Resolver or by the export record
# below; _Resolver matches `kind` against string literals, not these enum tags.
let _valid_kinds = [| 'sops, 'component, 'literal, 'env |] in
let _Resolver =
  std.contract.custom (fun label value =>
    # Blame helper bound to this contract's label.
    let fail = fun msg => std.contract.blame_with_message msg label in
    # Accept the resolver when every listed field is present, otherwise blame with `msg`.
    # Only presence is checked; the field values are not type-checked here.
    let need_fields = fun field_names msg =>
      if std.array.all (fun field_name => std.record.has_field field_name value) field_names then
        'Ok value
      else
        fail msg
    in
    if !(std.is_record value) then
      fail "Resolver must be a record with a 'kind' field"
    else if !(std.record.has_field "kind" value) then
      fail "Resolver missing required field 'kind'"
    else
      # Dispatch on the discriminator; each kind has its own required fields.
      value.kind
      |> match {
        "sops" =>
          need_fields
            ["path", "key"]
            "Resolver kind='sops' requires fields: path (String), key (String)",
        "component" =>
          need_fields
            ["name", "field"]
            "Resolver kind='component' requires fields: name (String), field (String)",
        "literal" =>
          need_fields
            ["value"]
            "Resolver kind='literal' requires field: value (any)",
        "env" =>
          need_fields
            ["env_var"]
            "Resolver kind='env' requires field: env_var (String)",
        _ =>
          fail "Unknown Resolver kind '%{value.kind}'. Valid: sops, component, literal, env",
      }
  ) in
# Base shape for structural validation before cross-field checks.
# All three fields are required. `{ _ | _Resolver }` is a dictionary contract:
# any field name is accepted under `bindings`, and every value is checked against _Resolver.
let _CablingBase = {
mode_id | String
| doc "Integration mode id — e.g. 'lian-build-provisioning'",
workspace | String
| doc "Workspace identifier — e.g. 'libre-wuji'",
bindings | { _ | _Resolver }
| doc "Map of '<domain-id>.<field>' to a Resolver",
} in
# Full Cabling contract: structural + non-empty bindings.
let _Cabling =
  std.contract.custom (fun label value =>
    # Structural pass: field presence, field types and per-binding Resolver checks.
    let cabling = value | _CablingBase in
    let binding_count = std.record.length cabling.bindings in
    if binding_count > 0 then
      'Ok cabling
    else
      # A cabling file that binds nothing is rejected.
      std.contract.blame_with_message
        "Cabling '%{cabling.mode_id}': bindings must be non-empty"
        label
  ) in
# Public contracts of this module.
{
  Cabling = _Cabling,
  Resolver = _Resolver,
}

View file

@ -0,0 +1,98 @@
# schemas/lib/integration/oci_artifact_format.ncl
#
# OCI artifact descriptors for the federated integration-modes protocol.
# Two artifact kinds:
# DomainArtifact — typed contract pushed by the domain owner
# ModeArtifact — integration mode manifest pushed by the participant
#
# Also exports:
# Invocation — how a mode step binary is invoked
# DomainLock — per-workspace lock file written after `prvng integration pull`
# Closed enum of values accepted by Invocation.binary.source.
let _binary_source = [| 'path_assumed, 'cargo_install, 'oci_blob |] in
# Closed enum of values accepted by Invocation.method.
let _invocation_method = [| 'stdin_context, 'argv_context_file |] in
# How a mode step binary is resolved and invoked.
# Required: method, binary.source, binary.name. `args` defaults to an empty list
# and `env` to an empty record of string values.
# NOTE(review): cargo_crate and oci_layer are declared `optional`; the
# "Required when source = ..." rule stated in their doc text is not enforced
# by this record contract.
let _Invocation = {
method | _invocation_method
| doc "stdin_context: JSON piped to stdin; argv_context_file: path written to a temp file, passed as $1",
binary | {
source | _binary_source,
name | String,
version | String | optional,
cargo_crate | String | optional
| doc "Required when source = 'cargo_install",
oci_layer | String | optional
| doc "OCI blob reference when source = 'oci_blob — e.g. reg.librecloud.online/binaries/lian-build:0.3.0",
},
args | Array String | default = [],
env | { _ | String } | default = {},
} in
# A single OCI layer descriptor inside an artifact manifest.
let _LayerDescriptor = {
media_type | String,
description | String,
required | Bool | default = true,
} in
# DomainArtifact — pushed to reg.librecloud.online/domains/<id>:<semver>
# mediaType: application/vnd.ontoref.domain.v1
let _DomainArtifact = {
media_type | String
| default = "application/vnd.ontoref.domain.v1",
id | String
| doc "Stable domain identifier, e.g. 'secret-delivery'",
version | String
| doc "Semver of the domain contract",
description | String,
layers | Array _LayerDescriptor
| doc "Expected layers in the OCI image. 'contract.ncl' layer is always required.",
# ADR-017 G2 — explicit dependency declaration. References a RegistryEntry.id
# in the consuming project's manifest.registry_provides.registries[]. Enables
# impact analysis on `ore secrets close`: which artifacts are affected by a
# credential change. Empty = artifact does not consume registry credentials.
uses_registry | String | optional
| doc "RegistryEntry.id this artifact's runtime depends on",
} in
# ModeArtifact — pushed to reg.librecloud.online/modes/<id>:<semver>
# mediaType: application/vnd.ontoref.mode.v1
let _ModeArtifact = {
media_type | String
| default = "application/vnd.ontoref.mode.v1",
id | String,
version | String,
description | String,
participant | String
| doc "Originating project/workspace that owns this mode",
layers | Array _LayerDescriptor,
uses_registry | String | optional
| doc "RegistryEntry.id this mode's runtime depends on (ADR-017 G2)",
} in
# Written to infra/<ws>/integrations/<mode-id>.lock.ncl after successful pull.
# Keyed by domain id, records the resolved version + digest for reproducibility.
# One locked domain: all four fields are required strings.
# NOTE(review): the digest and timestamp formats are described in the doc text only;
# this contract checks that they are strings, nothing more.
let _DomainLockEntry = {
version | String,
digest | String
| doc "OCI manifest digest, sha256:...",
pulled_at | String
| doc "ISO-8601 timestamp",
media_type | String,
} in
# Lock file shape: `domains` is a dictionary whose values are all checked against
# _DomainLockEntry; schema_version defaults to "0.1.0".
let _DomainLock = {
schema_version | String | default = "0.1.0",
domains | { _ | _DomainLockEntry },
} in
# Public contracts of this module.
{
Invocation = _Invocation,
DomainArtifact = _DomainArtifact,
ModeArtifact = _ModeArtifact,
DomainLockEntry = _DomainLockEntry,
DomainLock = _DomainLock,
LayerDescriptor = _LayerDescriptor,
}

View file

@ -0,0 +1,122 @@
# schemas/lib/integration_mode_manifest.ncl
#
# Integration Mode manifest schema for the federated integration-modes protocol.
# Each participant project declares an IntegrationMode in its own reflection/modes/.
#
# Invariants enforced at contract evaluation time:
# 1. kind must be 'integration (not 'standard — prevents mode files landing in wrong catalog)
# 2. domains_used must be non-empty (every integration mode must declare its domain deps)
# 3. direction='bidirectional requires at least one step with id starting "report-"
# 4. direction='event_emitter requires at least one step with id starting "emit-"
# 5. All step depends_on references resolve to existing step ids (inherited from ontoref pattern)
#
# Embedding rationale: ontoref v0.1.0 has no domain command group and no OCI surface.
# This schema is a local embedded subset; upstreaming is deferred per ADR-042.
let oci = import "./integration/oci_artifact_format.ncl" in
let _direction = [| 'inbound, 'outbound, 'bidirectional, 'event_emitter |] in
# Typed reference to a domain artifact in the OCI registry.
let _DomainRef = {
id | String
| doc "Domain identifier — must match the id in the DomainArtifact pushed to the registry",
version | String
| doc "Semver constraint, e.g. '>=0.1.0, <0.2.0'",
registry | String | optional
| doc "Override registry base; defaults to reg.librecloud.online/domains",
} in
# A dependency on another step, written as a record `{ step = "<step id>" }`.
let _Dependency = {
step | String,
} in
# Error-handling record; `strategy` defaults to 'Stop.
let _OnError = {
strategy | [| 'Stop, 'Continue, 'Retry |] | default = 'Stop,
} in
# A single step in an integration mode. Extends ontoref _ActionStep with an
# optional invocation descriptor (absent for manual/human steps).
# Required: id, action. Defaults: depends_on = [], actor = 'Agent,
# on_error = { strategy = 'Stop }. invocation, verify and note may be omitted.
let _IntegrationStep = {
id | String,
action | String,
depends_on | Array _Dependency | default = [],
actor | [| 'Human, 'Agent, 'Both |] | default = 'Agent,
invocation | oci.Invocation | optional
| doc "How to invoke the step binary. Absent for human-only steps.",
on_error | _OnError | default = { strategy = 'Stop },
verify | String | optional,
note | String | optional,
} in
# Base shape validated before cross-field checks.
# `kind` is a single-tag enum, so 'integration is the only accepted value.
# This record does not check that domains_used is non-empty, nor the
# direction-specific step-id rules; those checks live in the _IntegrationMode
# custom contract, which applies this record as its first step.
let _IntegrationModeBase = {
id | String,
kind | [| 'integration |],
direction | _direction,
trigger | String,
participant | String
| doc "Project/workspace that owns this mode — e.g. 'lian-build'",
domains_used | Array _DomainRef,
steps | Array _IntegrationStep,
preconditions | Array String | default = [],
postconditions | Array String | default = [],
description | String | optional,
} in
# Full contract: structural + cross-field invariants.
let _IntegrationMode =
  std.contract.custom (fun label value =>
    # Structural validation first; the cross-field checks read the validated record.
    let validated = value | _IntegrationModeBase in
    let steps = validated.steps in
    let ids = steps |> std.array.map (fun s => s.id) in
    # depends_on entries whose `step` is not the id of any declared step.
    let bad_refs = steps |> std.array.flat_map (fun step =>
      step.depends_on
      |> std.array.filter (fun dep =>
        !(ids |> std.array.any (fun i => i == dep.step))
      )
      |> std.array.map (fun dep =>
        "step '%{step.id}' depends_on unknown '%{dep.step}'"
      )
    ) in
    # Duplicate scan: `seen` is used as a set, `dups` collects every id that
    # shows up after its first occurrence. The result is consumed by an explicit
    # branch below, so the check does not depend on whether an otherwise unused
    # let-binding gets forced by the evaluator.
    let dup_scan = ids |> std.array.fold_left (fun acc id =>
      if std.record.has_field id acc.seen then
        { seen = acc.seen, dups = acc.dups @ [id] }
      else
        { seen = acc.seen & { "%{id}" = true }, dups = acc.dups }
    ) { seen = {}, dups = [] } in
    # Checks run in this order; the first failing one blames.
    if std.array.length validated.domains_used == 0 then
      std.contract.blame_with_message
        "IntegrationMode '%{validated.id}': domains_used must be non-empty — declare every domain this mode depends on"
        label
    else if validated.direction == 'bidirectional
      && !(ids |> std.array.any (fun i => std.string.is_match "^report-" i)) then
      std.contract.blame_with_message
        "IntegrationMode '%{validated.id}' direction=bidirectional: requires at least one step with id starting 'report-'"
        label
    else if validated.direction == 'event_emitter
      && !(ids |> std.array.any (fun i => std.string.is_match "^emit-" i)) then
      std.contract.blame_with_message
        "IntegrationMode '%{validated.id}' direction=event_emitter: requires at least one step with id starting 'emit-'"
        label
    else if std.array.length bad_refs > 0 then
      std.contract.blame_with_message
        "IntegrationMode '%{validated.id}' has invalid depends_on: %{std.string.join ", " bad_refs}"
        label
    else if std.array.length dup_scan.dups > 0 then
      # Report the first duplicate encountered, in step order.
      std.contract.blame_with_message
        "IntegrationMode '%{validated.id}': duplicate step id '%{std.array.first dup_scan.dups}'"
        label
    else
      'Ok validated
  ) in
# Public contracts of this module.
{
  DomainRef = _DomainRef,
  IntegrationStep = _IntegrationStep,
  IntegrationMode = _IntegrationMode,
}

View file

@ -0,0 +1,51 @@
# schemas/lib/keeper_policy.ncl — Keeper auto-sign policy schema (ADR-038)
#
# Declarative-only closed shape parsed by the keeper-daemon Rust matcher.
# Policy files (policy-<workspace>/policy.ncl) MUST conform to PolicyDef and
# MUST NOT contain Nickel function definitions or imports beyond this schema.
# Constraint: policy-files-are-declarative-only (ADR-038).
#
# Usage:
# let kp = import "schemas/lib/keeper_policy.ncl" in
# { policy | kp.PolicyDef = { auto_sign = [...], require_manual = [...] } }
# Op type wildcard contract — superset of ops_contract.ncl OpsType that also accepts "*"
let OpTypeOrAny =
  std.contract.custom (fun label value =>
    # The five concrete op types plus the "*" wildcard.
    let allowed = ["deploy", "scale", "restart", "secret_update", "drain", "*"] in
    if !(std.array.elem value allowed) then
      'Error {
        message = "Invalid op_type '%{value}'.\nValid values: deploy | scale | restart | secret_update | drain | *"
      }
    else
      'Ok value
  )
in
# A single match rule — all fields are glob patterns applied by the Rust matcher.
# Absent / defaulted-to-"*" field means "match any value for this dimension".
# The matcher evaluates rules top-to-bottom; first matching rule wins.
let _MatchRule = {
op_type | OpTypeOrAny | doc "Op type this rule applies to; '*' matches any op type" | default = "*",
image_patterns | Array String | doc "Glob patterns matched against OCI image reference in the op payload (deploy ops only)" | default = ["*"],
target_patterns | Array String | doc "Glob patterns matched against the op target name (e.g., 'staging-*', 'vapora')" | default = ["*"],
scope_patterns | Array String | doc "Glob patterns matched against JWT scope entries (<op_type>:<target_pattern>)" | default = ["*"],
} in
# Top-level policy file schema. Evaluation order: auto_sign rules checked first (top-to-bottom),
# then require_manual. If no rule matches, the op is held pending for manual review.
let _PolicyDef = {
version | Number | doc "Schema version — keeper-daemon rejects files with unknown versions" | default = 1,
auto_sign | Array _MatchRule | doc "Rules for operations the keeper-daemon may sign automatically" | default = [],
require_manual | Array _MatchRule | doc "Rules for operations that must be signed interactively via keeper-cli" | default = [],
} in
# Record fields are recursive in Nickel: inside the record below, the name
# `OpTypeOrAny` would resolve to the field itself rather than to the outer
# let-binding, so `OpTypeOrAny = OpTypeOrAny` loops when the field is accessed.
# Re-binding the contract under a different name first avoids the self-reference.
let _OpTypeOrAnyContract = OpTypeOrAny in
{
  OpTypeOrAny = _OpTypeOrAnyContract,
  MatchRule = _MatchRule,
  PolicyDef = _PolicyDef,
  # Applies PolicyDef to caller data; marked not_exported so it is skipped on export.
  make_policy | not_exported = fun data => data | _PolicyDef,
}

View file

@ -3,7 +3,8 @@
# | Pattern: Hybrid - defaults + makers + direct access (contracts available via import)
let contracts_lib = import "./contracts.ncl" in
let defaults_lib = import "./defaults.ncl" in
let defaults_lib = import "./defaults.ncl" in
let dag_lib = import "./dag/main.ncl" in
{
# ============================================================================
@ -61,4 +62,7 @@ let defaults_lib = import "./defaults.ncl" in
DefaultClusterDef = defaults_lib.cluster_def,
DefaultScaleData = defaults_lib.scale_data,
DefaultScaleResource = defaults_lib.scale_resource,
# DAG schema domain — accessible as provisioning.lib.dag.*
dag = dag_lib,
}

View file

@ -0,0 +1,53 @@
{
  # Action identifier: accepts an enum tag or a string (std.enum.TagOrString).
  ManifestAction = std.enum.TagOrString,

  # Hook attached to a manifest entry; only `action` is required.
  StepHook = {
    action | ManifestAction,
    params | { _ | String } | default = {},
    delay | Number | default = 0,
  },

  # One entry of a plan phase. Every field is optional or defaulted; `action`
  # defaults to 'apply.
  ManifestEntry = {
    file | String | optional,
    action | ManifestAction | default = 'apply,
    skip_if_exists | Bool | default = false,
    delay | Number | default = 0,
    params | { _ | String } | default = {},
    pre | Array StepHook | default = [],
    post | Array StepHook | default = [],
  },

  # Plan contract: validates the four phase lists, then rejects any update /
  # delete / restart entry that targets a protected file with a destructive
  # action. Entries under `init` are not scanned.
  _ManifestPlanSafe = std.contract.custom (fun label value =>
    let plan = value | {
      init | Array ManifestEntry | default = [],
      update | Array ManifestEntry | default = [],
      delete | Array ManifestEntry | default = [],
      restart | Array ManifestEntry | default = [],
    } in
    let protected_files = ["namespace", "pvc"] in
    # Both the tag and the string spelling of each destructive action.
    let destructive_actions = ['delete, "delete", 'recreate, "recreate"] in
    # "<phase>:<file>" for every offending entry of one phase.
    let offending = fun phase entries =>
      entries
      |> std.array.filter (fun entry =>
        std.record.has_field "file" entry
        && std.array.elem entry.file protected_files
        && std.array.elem entry.action destructive_actions
      )
      |> std.array.map (fun entry => "%{phase}:%{entry.file}")
    in
    let found = std.array.flatten [
      offending "update" plan.update,
      offending "delete" plan.delete,
      offending "restart" plan.restart,
    ] in
    if std.array.length found == 0 then
      'Ok plan
    else
      'Error { message = "ManifestPlan: protected resources cannot use delete/recreate — [%{std.string.join ", " found}]" }
  ),

  # Public name for the plan contract.
  ManifestPlan = _ManifestPlanSafe,
}

113
schemas/lib/op.ncl Normal file
View file

@ -0,0 +1,113 @@
# schemas/lib/op.ncl — Op (Operation) governance contracts
#
# An Op is the atomic unit of workspace state management — it records intent,
# authorization, execution artifacts, and state transitions as a DAG node,
# enabling audit, rollback, and concurrent agent control.
#
# Storage (per op):
# ops/{id}/op.json — runtime instance record (JSON, not NCL)
# ops/{id}/pre.json — pre-execution state snapshot
# ops/{id}/post.json — post-execution state snapshot (absent on failure)
# .ops-archive/ — restic repo (S3 backend): logs + bundles, encrypted
#
# Identity:
# actor.nid = Radicle Node ID (rad self --nid). Falls back to "local:{user}".
# Op ID = {nid-short}:{uuid} — globally attributable, DAG-safe
#
# DAG semantics:
# Each Op is a node. lineage.parent_op is the incoming edge.
# Rollback is a new forward Op — lineage.rollback_of points to the Op being undone.
# The DAG remains acyclic; rollback is a forward move restoring an earlier snapshot.
let _OpActor = {
nid | String | doc "Radicle Node ID (rad self --nid) or 'local:{user}' fallback",
identity | String | doc "Human-readable label — username or agent name",
source | [| 'cli, 'agent, 'api |] | default = 'cli,
} in
# Constraint record. `kind` and `resource` are required; `scope` defaults to 'direct.
# `params` is an open record (`{ .. }`), so any keys are accepted; it defaults to {}.
let _Constraint = {
kind | [| 'backup, 'restore, 'health_check, 'dry_run_check, 'concurrent_lock |],
resource | String | doc "Component name or volume/resource identifier",
scope | [| 'direct, 'indirect |] | default = 'direct,
params | { .. } | default = {},
} in
# Recovery action record. Accepts the same `kind` tags as _Constraint;
# `from` defaults to 'pre_backup and `params` is an open record defaulting to {}.
let _RecoveryAction = {
kind | [| 'backup, 'restore, 'health_check, 'dry_run_check, 'concurrent_lock |],
resource | String,
from | [| 'pre_backup, 'last_known_good |] | default = 'pre_backup,
params | { .. } | default = {},
} in
let _OpConstraints = {
pre | Array _Constraint | doc "Gates evaluated before execution starts" | default = [],
on_failure | Array _RecoveryAction | doc "Recovery actions if op fails" | default = [],
} in
let _OpSnapshots = {
pre | String | doc "Relative path to pre.json from workspace root",
post | String | optional | doc "Relative path to post.json — absent if op failed before completion",
} in
let _OpArtifacts = {
archive_snapshot | String | optional | doc "Snapshot ID in the configured archive backend (restic/kopia)",
bundles | Array String | doc "Bundle tar.gz paths within the archive snapshot" | default = [],
} in
let _OpLineage = {
parent_op | String | optional | doc "Op ID this state was derived from (incoming DAG edge)",
rollback_of | String | optional | doc "If this op is a rollback, the ID of the op it undoes",
} in
# Enum contracts shared between the standalone exports and the Op record fields.
let _op_source = [| 'cli, 'agent, 'api |] in
let _op_operation = [| 'install, 'update, 'delete, 'rollback, 'dry_run |] in
let _op_status = [| 'pending, 'running, 'constraint_failed, 'recovering, 'success, 'failed, 'rolled_back, 'cancelled |] in
{
  OpActor = _OpActor,
  Constraint = _Constraint,
  RecoveryAction = _RecoveryAction,
  OpConstraints = _OpConstraints,
  OpSnapshots = _OpSnapshots,
  OpArtifacts = _OpArtifacts,
  OpLineage = _OpLineage,

  # Enum type exports — use these as field annotation values in workspace NCL
  OpSource = _op_source,
  OpOperation = _op_operation,
  OpStatus = _op_status,

  # Full op record. `operation` and `status` accept the same tags as the
  # OpOperation / OpStatus exports; `status` starts at 'pending.
  Op = {
    id | String | doc "Op ID: {nid-short}:{uuid}",
    actor | _OpActor,
    intent | String | doc "Human description of why this op is needed",
    workspace | String | doc "Workspace name from config/provisioning.ncl",
    component | String | doc "Component being operated on",
    operation | _op_operation,
    targets | Array String | doc "Server hostnames targeted by this op",
    constraints | _OpConstraints | default = { pre = [], on_failure = [] },
    snapshots | _OpSnapshots,
    artifacts | _OpArtifacts | default = { bundles = [] },
    lineage | _OpLineage | default = {},
    status | _op_status | default = 'pending,
    jj_change | String | optional | doc "jj change ID created for this op",
    radicle_rid | String | optional | doc "Radicle Repository ID after rad sync — globally unique RID of this workspace",
    started_at | String | doc "ISO 8601 UTC timestamp",
    ended_at | String | optional,
  },

  # Workspace-level ops configuration — added under `ops` in config/provisioning.ncl
  OpsConfig = {
    archive = {
      backend | [| 's3, 'local |] | default = 's3,
      tool | [| 'restic, 'kopia |] | doc "Backup provider — must have a matching entry in extensions/providers/backup/" | default = 'restic,
      endpoint | String | optional | doc "S3-compatible endpoint URL",
      bucket | String | optional,
      prefix | String | default = "ops",
    },
    retention = {
      keep_last | Number | default = 50,
      keep_monthly | Number | default = 12,
      keep_yearly | Number | default = 3,
    },
  },
}

View file

@ -0,0 +1,117 @@
# schemas/lib/ops_contract.ncl — Ops contract (ADR-037)
# NATS JetStream subject namespaces, JWT signed command structure,
# stream configuration, and workspace ops contract definition.
let OpsType =
  std.contract.custom (fun label value =>
    # The five op types accepted by this contract.
    let allowed = ["deploy", "scale", "restart", "secret_update", "drain"] in
    if !(std.array.elem value allowed) then
      'Error {
        message = "Invalid op_type '%{value}'.\nValid values: deploy | scale | restart | secret_update | drain"
      }
    else
      'Ok value
  )
in
let _StreamRetention = [| 'WorkQueue, 'Limits, 'Interest |] in
let _ScopeEntry = {
op_type | OpsType,
target_pattern | String | doc "Glob pattern for allowed op targets (e.g., 'staging-*', 'vapora')",
} in
let _JwtClaims = {
iss | String | doc "Signer identity: keeper-vm-primary | operator-<id> | gh-actions-<id>",
sub | String | doc "Requesting principal: woodpecker-job-<id> | manual-<operator>",
aud | String | doc "Target workspace name",
scopes | Array _ScopeEntry | doc "Allowed (op_type, target_pattern) tuples scoped to this signer",
seq | Number | doc "Per-issuer monotonic counter — anti-replay",
jti | String | doc "UUIDv4 idempotency key",
expected_state_version | String | doc "Optimistic concurrency token — workspace state version this op read",
exp | Number | doc "Unix timestamp: token expiry",
nbf | Number | doc "Unix timestamp: token not-valid-before",
} in
let _StreamConfig = {
name | String,
subjects | Array String,
retention | _StreamRetention | doc "JetStream retention policy" | default = 'WorkQueue,
max_age_s | Number | doc "Message TTL in seconds",
replicas | Number | doc "JetStream stream replica count" | default = 1,
max_bytes | Number | doc "Max stream storage in bytes (-1 = unlimited)" | default = -1,
} in
let _OpsSubjects = {
pending | String | doc "ops.pending.<workspace>.> — unsigned proposals from emitters",
cmd | String | doc "ops.cmd.<workspace>.> — signed commands ready to apply",
ack | String | doc "ops.ack.<workspace>.> — application result from ops-controller",
audit | String | doc "ops.audit.<workspace> — immutable audit stream",
} in
let _OpsStreams = {
pending | _StreamConfig | doc "WorkQueue, 14d — buffers unsigned proposals",
cmd | _StreamConfig | doc "WorkQueue, 24h — signed commands awaiting application",
audit | _StreamConfig | doc "Limits, 90d, replicas=3 — immutable audit record",
} in
# Workspace-level ops contract — embed in workspace infra NCL as `ops_contract`
let _OpsWorkspaceConfig = {
workspace | String,
subjects | _OpsSubjects,
streams | _OpsStreams,
authorized_signers | Array String | doc "Signer identity keys allowed to sign for this workspace" | default = [],
} in
# Record fields are recursive in Nickel: inside the record below, the name
# `OpsType` would resolve to the field itself rather than to the outer
# let-binding, so `OpsType = OpsType` loops when the field is accessed.
# Re-binding the contract under a different name first avoids the self-reference.
let _OpsTypeContract = OpsType in
{
  OpsType = _OpsTypeContract,
  StreamRetention = _StreamRetention,
  ScopeEntry = _ScopeEntry,
  JwtClaims = _JwtClaims,
  StreamConfig = _StreamConfig,
  OpsSubjects = _OpsSubjects,
  OpsStreams = _OpsStreams,
  OpsWorkspaceConfig = _OpsWorkspaceConfig,

  # Constructs a full OpsWorkspaceConfig from a workspace name.
  # Stream names follow ADR-037 convention: OPS_{STREAM}_{workspace}
  # (workspace name is used verbatim; uppercase normalisation is ops-controller's concern).
  make_ops_config | not_exported = fun workspace =>
    # The returned record has a field named `workspace`. Because records are
    # recursive, `workspace` inside it (including inside nested records and
    # string interpolations) would refer to that field, not to the argument.
    # `ws` keeps a handle on the argument under a name no field shadows.
    let ws = workspace in
    {
      workspace = ws,
      subjects = {
        pending = "ops.pending.%{ws}.>",
        cmd = "ops.cmd.%{ws}.>",
        ack = "ops.ack.%{ws}.>",
        audit = "ops.audit.%{ws}",
      },
      streams = {
        pending = {
          name = "OPS_PENDING_%{ws}",
          subjects = ["ops.pending.%{ws}.>"],
          retention = 'WorkQueue,
          # 1209600 s = 14 days.
          max_age_s = 1209600,
          replicas = 1,
          max_bytes = -1,
        },
        cmd = {
          name = "OPS_CMD_%{ws}",
          subjects = ["ops.cmd.%{ws}.>"],
          retention = 'WorkQueue,
          # 86400 s = 24 hours.
          max_age_s = 86400,
          replicas = 1,
          max_bytes = -1,
        },
        audit = {
          name = "OPS_AUDIT_%{ws}",
          subjects = ["ops.audit.%{ws}"],
          retention = 'Limits,
          # 7776000 s = 90 days.
          max_age_s = 7776000,
          replicas = 3,
          max_bytes = -1,
        },
      },
      authorized_signers = [],
    },
}

62
schemas/lib/playbook.ncl Normal file
View file

@ -0,0 +1,62 @@
# schemas/lib/playbook.ncl — PlaybookDef and PlaybookStep contracts
#
# Every playbook in extensions/playbooks/<name>/playbook.ncl validates against this schema.
# validate-playbooks reflection mode (TASK-C6) checks:
# - playbook.ncl conforms to PlaybookDef
# - run.nu exists for each step that references it
# - rollback.nu exists when rollback_strategy = 'automatic
# - tests/dry_run.nu is checked with nu --ide-check when present
#
# Usage:
# let pb = import "schemas/lib/playbook.ncl" in
# { .. } | pb.PlaybookDef
# Allowed values for a playbook's rollback_strategy field.
let _RollbackStrategy = [| 'automatic, 'manual, 'none |] in
# Allowed values for a step's on_error field (action when the step exits non-zero).
let _StepErrorAction = [| 'Stop, 'Rollback, 'Continue |] in
# A declared parameter the playbook accepts — forwarded as env vars to step scripts.
# `required` defaults to true, so a parameter is mandatory unless it is
# explicitly declared with required = false; `default_val` defaults to "".
let _ParamDef = {
name | String | doc "Parameter name (becomes env var: PLAYBOOK_PARAM_<NAME>)",
description | String,
required | Bool | doc "When true, absence causes the runner to abort before any step" | default = true,
default_val | String | doc "Default value used when required = false and the caller omits the param" | default = "",
} in
# A single step in a playbook. Each step maps to a script relative to the playbook root.
# Required fields: id, name, script. The contract only types depends_on as an
# array of strings; it does not check that the referenced step IDs exist.
let _PlaybookStep = {
id | String | doc "Unique step identifier within this playbook; used in depends_on refs",
name | String | doc "Human-readable step label shown in dry-run output",
script | String | doc "Path to the Nushell step script relative to the playbook directory (e.g., 'run.nu', 'steps/deploy.nu')",
dry_run_arg | String | doc "Flag appended to script invocation when running in dry-run mode" | default = "--dry-run",
params | { _ | String } | doc "Static key-value params forwarded to the step script as env vars; caller params overlay these" | default = {},
on_error | _StepErrorAction | doc "Action taken when this step exits non-zero" | default = 'Stop,
depends_on | Array String | doc "Step IDs that must complete successfully before this step runs" | default = [],
} in
# The full playbook declaration. Consumed by 'prvng playbook run <id>' and the
# validate-playbooks reflection mode.
# Required fields: id, name, description, steps; everything else has a default.
# NOTE(review): `version` is documented as "must be 1" but is only typed as
# Number here, so any number passes this contract — confirm whether the
# constraint is enforced elsewhere.
let _PlaybookDef = {
id | String | doc "Machine-readable playbook identifier matching the directory name (e.g., 'bootstrap_initial')",
name | String | doc "Human-readable playbook title",
description | String,
version | Number | doc "Schema version — must be 1" | default = 1,
preconditions | Array String | doc "Human-readable preconditions the operator must verify before running; printed in dry-run output" | default = [],
params | Array _ParamDef | doc "Declared parameters; absent required params abort before step 1" | default = [],
steps | Array _PlaybookStep | doc "Ordered step declarations; topological sort applied using depends_on",
rollback_strategy | _RollbackStrategy | doc "automatic: rollback.nu is invoked on any step failure; manual: operator handles; none: no rollback path" | default = 'none,
success_criteria | Array String | doc "Human-readable criteria printed after a successful run to help the operator verify the outcome" | default = [],
emit_audit | Bool | doc "When true, playbook runner emits ops.audit events at step start and completion" | default = false,
adr_refs | Array String | doc "ADR IDs this playbook implements (e.g., 'adr-037', 'adr-039')" | default = [],
} in
{
  # Contracts, re-exported under their public names.
  RollbackStrategy = _RollbackStrategy,
  StepErrorAction = _StepErrorAction,
  ParamDef = _ParamDef,
  PlaybookStep = _PlaybookStep,
  PlaybookDef = _PlaybookDef,
  # Constructors: return the argument with the matching contract applied.
  # Marked not_exported because functions cannot be serialised.
  make_step | not_exported = fun step => (step | _PlaybookStep),
  make_playbook | not_exported = fun playbook => (playbook | _PlaybookDef),
}

91
schemas/lib/radicle.ncl Normal file
View file

@ -0,0 +1,91 @@
# schemas/lib/radicle.ncl — Radicle Heartwood governance substrate types (ADR-038)
#
# Three repo families per workspace: policy, desired, state — each with a distinct
# delegation profile. Used by the audit-mirror crate and governance domain commands.
#
# Usage:
# let rad = import "schemas/lib/radicle.ncl" in
# { repos | rad.WorkspaceRepos = rad.make_workspace_repos "libre-wuji" & { ... } }
# Role of a repo within the three-repo split (one repo per role per workspace).
let _RepoRole = [| 'policy, 'desired, 'state |] in
# Lifecycle states a patch can be in.
let _PatchStatus = [| 'open, 'merged, 'rejected |] in
# M-of-N delegation profile attached to a Radicle repo.
# threshold <= length(signers) is a business invariant enforced by the Rust caller.
# This contract itself only checks the field types (Number, Array String).
let _DelegationProfile = {
threshold | Number | doc "Minimum signatures required to merge a patch (M in M-of-N)",
signers | Array String | doc "Key IDs of authorized delegates (Radicle DID or human-readable alias)",
} in
# A Radicle repo descriptor: RID + role + delegation profile.
# rid is empty string until the repo is initialised via 'rad init'.
# name, role and delegates are required; rid defaults to "".
let _RadicleRepo = {
name | String | doc "Human-readable name (e.g., 'policy-libre-wuji')",
rid | String | doc "Radicle Identifier assigned by 'rad init' (rad:...); empty before init" | default = "",
role | _RepoRole | doc "Functional role in the three-repo split",
delegates | _DelegationProfile | doc "M-of-N delegation profile for patches to this repo",
} in
# A proposed change patch — governance domain commands surface these for operator review.
# id, proposed_by and payload are required; status defaults to 'open and
# signatures to an empty list.
let _Patch = {
id | String | doc "Radicle patch ID",
proposed_by | String | doc "Key ID or alias of the patch author",
status | _PatchStatus | doc "Current lifecycle state" | default = 'open,
signatures | Array String | doc "Key IDs that have signed this patch" | default = [],
payload | String | doc "Short human-readable description of what this patch changes",
} in
# Snapshot of signature satisfaction for a pending patch.
# `satisfied` is a plain Bool supplied by whoever builds the record; this
# contract does not derive it from `required` and `present` or cross-check it.
let _SignatureSet = {
required | Number | doc "Threshold from the repo's DelegationProfile (M)",
present | Array String | doc "Key IDs that have already signed",
satisfied | Bool | doc "True when length(present) >= required",
} in
# The three repos belonging to one workspace — the canonical three-repo split.
# All three entries are required. The delegate rules in the doc strings (for
# example "exactly one delegate" for state) are descriptive; the contract only
# requires each entry to be a _RadicleRepo.
let _WorkspaceRepos = {
policy | _RadicleRepo | doc "policy-<workspace>: keeper auto-sign policy + authorized-signers set; M-of-N operator delegates",
desired | _RadicleRepo | doc "<workspace>-desired: version-controlled workspace declaration; M-of-N operators + CI keys",
state | _RadicleRepo | doc "<workspace>-state: immutable applied-ops ledger; exactly one delegate (ops-controller key)",
} in
{
  RepoRole = _RepoRole,
  PatchStatus = _PatchStatus,
  DelegationProfile = _DelegationProfile,
  RadicleRepo = _RadicleRepo,
  Patch = _Patch,
  SignatureSet = _SignatureSet,
  WorkspaceRepos = _WorkspaceRepos,
  # Builds the three-repo skeleton for a workspace: names are derived from the
  # workspace name, RIDs are empty and each delegation profile is a placeholder
  # (threshold 1, no signers).
  # `rid` and `delegates` are set with `| default` priority so a caller can
  # replace them by merging:
  #   (rad.make_workspace_repos "libre-wuji") & {
  #     policy.rid = "rad:abc",
  #     policy.delegates = { threshold = 2, signers = ["jpl-yubikey", "alice-key"] },
  #     state.rid = "rad:ghi",
  #     state.delegates = { threshold = 1, signers = ["ops-controller-wuji-key"] },
  #   }
  # A fully populated record can instead be checked directly with
  # `{ ... } | rad.WorkspaceRepos`.
  make_workspace_repos | not_exported = fun workspace =>
    # One repo entry; only the name and the role differ between the three repos.
    let repo_entry = fun repo_name repo_role => {
      name = repo_name,
      rid | default = "",
      role = repo_role,
      delegates | default = { threshold = 1, signers = [] },
    } in
    {
      policy = repo_entry "policy-%{workspace}" 'policy,
      desired = repo_entry "%{workspace}-desired" 'desired,
      state = repo_entry "%{workspace}-state" 'state,
    },
}

View file

@ -0,0 +1,148 @@
# Generic scheduler helper — produces a scheduling artefact for any of the
# four runtime targets (K8s CronJob, systemd timer, cron.d entry, daemon
# task registration). Not coupled to backup nor to Kubernetes; any task in
# the repo that needs to be scheduled can build on top of this.
#
# Example:
# let s = (import "scheduler.ncl").make_schedule {
# name = "etcd-snapshot",
# schedule_kind = 'cron, cron_expr = "0 */6 * * *",
# target = { kind = 'systemd_timer, host_selector = "control_planes",
# user = "root", unit_name = "prvng-etcd-snapshot" },
# command = "/usr/local/bin/prvng-backup one-shot backup etcd-snapshot",
# env = { …secret refs… },
# } in s.systemd_units
{
# === Target descriptors ===================================================
# Kubernetes CronJob target. kind, namespace and image are required;
# service_account is optional (may be absent); the remaining fields have defaults.
K8sCronJobTarget = {
kind | [| 'k8s_cronjob |],
namespace | String,
image | String,
image_pull_policy | [| 'IfNotPresent, 'Always, 'Never |] | default = 'IfNotPresent,
service_account | String | optional,
node_selector | { _ | String } | default = {},
restart_policy | [| 'OnFailure, 'Never |] | default = 'OnFailure,
successful_jobs_history_limit | Number | default = 3,
failed_jobs_history_limit | Number | default = 5,
},
# systemd timer target. kind, unit_name and host_selector are required;
# user, after and persistent have defaults.
SystemdTimerTarget = {
kind | [| 'systemd_timer |],
unit_name | String,
host_selector | String | doc "Hostname pattern or role (e.g. 'control_planes')",
user | String | default = "root",
after | Array String | default = ["network-online.target"],
persistent | Bool | default = true,
},
# cron.d entry target. kind, file_name and host_selector are required;
# user defaults to "root".
CronDTarget = {
kind | [| 'cron_d |],
file_name | String | doc "Filename under /etc/cron.d/",
host_selector | String,
user | String | default = "root",
},
# Daemon task registration target. kind and task_id are required;
# daemon_endpoint defaults to a unix socket path.
DaemonTaskTarget = {
kind | [| 'daemon_task |],
task_id | String,
daemon_endpoint | String | default = "unix:///run/prvng-backup.sock",
},
# === Top-level builder ====================================================
# make_schedule returns a record with four branches:
#   { manifests, systemd_units, cron_files, daemon_registrations }.
# Only the branch matching spec.target.kind is non-empty; the others are [].
# Callers serialise the appropriate branch.
#
# Fields read from spec: name, cron_expr, command, target, and env (env is only
# read by the k8s_cronjob and daemon_task branches).
#
# Implementation notes:
#  * Nickel record literals are recursive, so inside a record that defines
#    `name`, `command`, `env` or `spec`, those identifiers refer to the record's
#    own fields. The inputs are therefore bound to names (job_name, job_command,
#    job_env, target) that no generated record uses as a field name.
#  * The target is merged with the descriptor matching its kind so that the
#    declared defaults (user, after, persistent, restart_policy, ...) are
#    available even when the caller passes a bare record, as the example at the
#    top of this file does. Merging (rather than annotating) keeps any extra
#    fields the caller supplied.
make_schedule = fun spec =>
  let target_kind = spec.target.kind in
  let target =
    if target_kind == 'k8s_cronjob then K8sCronJobTarget & spec.target
    else if target_kind == 'systemd_timer then SystemdTimerTarget & spec.target
    else if target_kind == 'cron_d then CronDTarget & spec.target
    else if target_kind == 'daemon_task then DaemonTaskTarget & spec.target
    else spec.target
  in
  let cron_expr = spec.cron_expr in
  let job_name = spec.name in
  let job_command = spec.command in
  let job_env = spec.env in
  {
    manifests =
      if target_kind == 'k8s_cronjob then
        let pod_spec =
          {
            restartPolicy = std.string.from_enum target.restart_policy,
            nodeSelector = target.node_selector,
            containers = [{
              name = job_name,
              image = target.image,
              imagePullPolicy = std.string.from_enum target.image_pull_policy,
              command = ["/bin/sh", "-c", job_command],
              # One { name, value } entry per key of the env record.
              env =
                std.record.to_array job_env
                |> std.array.map (fun e => { name = e.field, value = e.value }),
            }],
          }
          # service_account is optional on the target: emit serviceAccountName
          # only when the caller provided it.
          & (
            if std.record.has_field "service_account" target then
              { serviceAccountName = target.service_account }
            else
              {}
          )
        in
        [{
          apiVersion = "batch/v1",
          kind = "CronJob",
          metadata = {
            name = job_name,
            namespace = target.namespace,
          },
          spec = {
            schedule = cron_expr,
            successfulJobsHistoryLimit = target.successful_jobs_history_limit,
            failedJobsHistoryLimit = target.failed_jobs_history_limit,
            jobTemplate.spec.template.spec = pod_spec,
          },
        }]
      else [],
    systemd_units =
      if target_kind == 'systemd_timer then
        [{
          host_selector = target.host_selector,
          unit_name = target.unit_name,
          service_unit = m%"
            [Unit]
            Description=%{job_name}
            After=%{std.string.join " " target.after}
            [Service]
            Type=oneshot
            User=%{target.user}
            ExecStart=%{job_command}
            EnvironmentFile=-/etc/prvng-backup/%{job_name}.env
          "%,
          # NOTE(review): cron_expr is written to OnCalendar verbatim. systemd
          # calendar expressions are not cron syntax, so callers presumably must
          # pass a systemd-compatible expression for this target — confirm.
          timer_unit = m%"
            [Unit]
            Description=Timer for %{job_name}
            [Timer]
            OnCalendar=%{cron_expr}
            Persistent=%{if target.persistent then "true" else "false"}
            [Install]
            WantedBy=timers.target
          "%,
        }]
      else [],
    cron_files =
      if target_kind == 'cron_d then
        [{
          host_selector = target.host_selector,
          path = "/etc/cron.d/%{target.file_name}",
          # The entry ends with a newline: cron does not run a final line that
          # is not newline-terminated.
          content = "%{cron_expr} %{target.user} %{job_command}\n",
        }]
      else [],
    daemon_registrations =
      if target_kind == 'daemon_task then
        [{
          task_id = target.task_id,
          daemon_endpoint = target.daemon_endpoint,
          schedule = cron_expr,
          command = job_command,
          env = job_env,
        }]
      else [],
  },
}

View file

@ -0,0 +1,52 @@
# schemas/lib/storage_config.ncl — StorageConfig contracts
#
# Library file — import only, not directly exportable.
#
# Usage (component contracts.ncl):
# let sc = import "schemas/lib/storage_config.ncl" in
# requires | { storage | sc.StorageRequires | optional, ... }
#
# Usage (provider metadata.ncl or capabilities.ncl):
# let sc = import "schemas/lib/storage_config.ncl" in
# storage_policy | sc.ProviderStoragePolicy = sc.HetznerCSIPolicy
{
# Kind of volume a component or provider deals with.
VolumeMode = [| 'block, 'nfs, 'object |],
# How far a provider lets a volume be resized after creation.
ExpansionPolicy = [| 'static, 'expand_only, 'full |],
# Contract for component requires.storage — what a component declares it needs.
# Only `size` is required; storage_class is optional and the rest have defaults.
# `size` and `access_mode` are plain strings: their format is not validated here.
StorageRequires = {
size | String,
persistent | Bool | default = true,
volume_mode | VolumeMode | default = 'block,
access_mode | String | default = "ReadWriteOnce",
storage_class | String | optional,
},
# Abstract contract for provider storage policies.
# Only `provider` is required: min_size, expansion_policy and volume_modes fall
# back to the defaults below and max_size is optional, so a concrete policy
# (HetznerCSIPolicy, etc.) needs to set only the fields that differ.
ProviderStoragePolicy = {
provider | String,
min_size | String | default = "1Gi",
max_size | String | optional,
expansion_policy | ExpansionPolicy | default = 'static,
volume_modes | Array VolumeMode | default = ['block],
},
# Hetzner hcloud-volumes: minimum 10Gi, expand-only (no shrink via CSI).
# Checked against ProviderStoragePolicy; max_size is left unset.
HetznerCSIPolicy | ProviderStoragePolicy = {
provider = "hcloud-volumes",
min_size = "10Gi",
expansion_policy = 'expand_only,
volume_modes = ['block],
},
# democratic-csi NFS: fine-grained sizing, full expand/shrink, RWX capable.
# Checked against ProviderStoragePolicy; max_size is left unset. The RWX
# capability mentioned above is not represented by any field of this record.
DemocraticCSINFSPolicy | ProviderStoragePolicy = {
provider = "democratic-csi-nfs",
min_size = "1Gi",
expansion_policy = 'full,
volume_modes = ['nfs],
},
}

View file

@ -0,0 +1,79 @@
# System backup contracts — declarative description of how out-of-cluster
# artefacts are backed up: etcd, k8s certs, host configs, external DNS,
# builder environment, provisioning state itself, log archives, vault state.
# Triggered by system cron / systemd timer / daemon coordinator.
let bp = import "backup_policy.ncl" in
let vault = import "vault_refs.ncl" in
{
# Selector for the host(s) where the backup runs. Either an explicit list
# of hostnames, a control-plane role selector, or a single primary.
# `members` defaults to []; the contract does not require it to be non-empty
# when kind = 'list.
HostSelector = {
kind | [| 'cp_only, 'cp_first, 'control_planes, 'workers, 'all_servers, 'list |],
members | Array String | doc "Hostnames when kind = 'list" | default = [],
},
# Discriminated target: what kind of off-cluster artefact is being captured.
# Only `kind` is required. The per-kind fields below are grouped by comment but
# are all either optional or defaulted, so the contract does not tie any field
# to a particular `kind` value.
SystemBackupTarget = {
kind | [| 'etcd, 'k8s_certs, 'cluster_resources, 'longhorn_engine, 'host_configs,
'external_dns, 'builder_env, 'provisioning_state, 'logs_archive,
'sops_keys, 'vault_state |],
# 'etcd
endpoints | Array String | default = [],
ca_ref | vault.VaultCredRef | optional,
cert_ref | vault.VaultCredRef | optional,
key_ref | vault.VaultCredRef | optional,
# 'k8s_certs / 'host_configs / 'logs_archive (paths)
paths | Array String | default = [],
exclude | Array String | default = [],
# 'cluster_resources
namespaces | Array String | default = [],
kinds | Array String | default = [],
# 'longhorn_engine
components | Array String | default = [],
# 'external_dns
# NOTE(review): source_kind also lists 'loki, 'journald and 'files, which look
# like log sources rather than DNS servers — presumably shared with
# 'logs_archive; confirm.
source_kind | [| 'coredns, 'powerdns, 'unbound, 'loki, 'journald, 'files |] | optional,
config_paths| Array String | default = [],
zones_paths | Array String | default = [],
# 'builder_env
tools | Array String | default = [],
secrets | Array String | doc "Secret names that must accompany the artefact" | default = [],
# 'provisioning_state
definitions_path | String | optional,
state_path | String | optional,
lock_path | String | optional,
# 'logs_archive
selector | String | optional,
format | [| 'jsonl_gz, 'tar_gz, 'restic_native |] | optional,
# 'sops_keys / 'vault_state
age_keys | Array String | default = [],
recipients | Array String | default = [],
vault_endpoint | String | optional,
vault_paths | Array String | default = [],
},
# One system backup job: what to capture (target), where it runs
# (host_selector) and how it is scheduled, retained, stored and encrypted.
# verify, hooks and throttle are optional; every other field is required.
SystemBackupDef = {
name | String | doc "Identifier (used in CLI: prvng-backup one-shot backup <name>)",
target | SystemBackupTarget,
host_selector | HostSelector,
provider | bp.BackupProviderRef,
schedule | bp.Schedule,
retention | bp.RetentionPolicy,
destinations | Array bp.Destination,
encryption | vault.VaultKeyRef,
tag_strategy | bp.TagStrategy,
verify | bp.VerifyPolicyRef | optional,
hooks | bp.Hooks | optional,
throttle | bp.Throttle | optional,
},
}

View file

@ -1,183 +1,113 @@
# | Reusable Validation Library for Nickel
# | Common validation contracts and helper functions
# | Author: JesusPerezLorenzo
# | Date: 2025-12-15
# | Status: Production Ready
# ============================================================
# Common Validation Contracts
# ============================================================
# IPv4 address validation (e.g., "192.168.1.1")
let IpV4Contract = {
label = "ValidIPv4",
predicate = fun ip =>
std.string.is_match ip "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$"
}
# CIDR notation validation (e.g., "192.168.1.0/24")
let CidrContract = {
label = "ValidCIDR",
predicate = fun cidr =>
std.string.is_match cidr "^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}/\\d{1,2}$"
}
# Port range validation (1-65535)
let PortContract = {
label = "ValidPort",
predicate = fun p =>
p > 0 && p < 65536
}
# Semantic versioning validation (e.g., "1.2.3")
let SemverContract = {
label = "ValidSemver",
predicate = fun v =>
std.string.is_match v "^\\d+\\.\\d+\\.\\d+$"
}
# Domain name validation (e.g., "example.com")
let DomainContract = {
label = "ValidDomain",
predicate = fun d =>
std.string.is_match d "^[a-z0-9]([a-z0-9-\\.]{0,253}[a-z0-9])?$"
}
# OCI tag validation (e.g., "latest", "v1.0.0", "sha256-abc123")
let OciTagContract = {
label = "ValidOCITag",
predicate = fun tag =>
std.string.is_match tag "^[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}$"
}
# ISO 8601 timestamp validation (e.g., "2025-12-15T10:30:00Z")
let Iso8601Contract = {
label = "ValidISO8601",
predicate = fun ts =>
std.string.is_match ts "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$"
}
# Filesystem path validation (simple: non-empty, no double slashes)
let PathContract = {
label = "ValidPath",
predicate = fun path =>
std.string.length path > 0 && !std.string.contains path "//"
}
# ============================================================
# Helper Functions for Common Validations
# ============================================================
# Validate minimum string length
let min_length = fun min_val =>
{
label = "MinLength%{std.to_string min_val}",
predicate = fun s =>
std.string.length s >= min_val
}
# Validate maximum string length
let max_length = fun max_val =>
{
label = "MaxLength%{std.to_string max_val}",
predicate = fun s =>
std.string.length s <= max_val
}
# Validate numeric range
let range = fun min_val max_val =>
{
label = "Range[%{std.to_string min_val}-%{std.to_string max_val}]",
predicate = fun n =>
n >= min_val && n <= max_val
}
# Validate enum (value must be in list)
let enum = fun values =>
{
label = "Enum[%{std.string.join "," values}]",
predicate = fun v =>
std.array.elem v values
}
# Validate non-empty string
let non_empty_string = {
label = "NonEmptyString",
predicate = fun s =>
std.string.length s > 0
}
# Validate non-negative number
let non_negative = {
label = "NonNegative",
predicate = fun n =>
n >= 0
}
# Validate positive number
let positive = {
label = "Positive",
predicate = fun n =>
n > 0
}
# Validate boolean value
let boolean_value = {
label = "Boolean",
predicate = fun b =>
b == true || b == false
}
# ============================================================
# Helper Functions for Custom Validation
# ============================================================
# Validate all items in array satisfy predicate
let all_items = fun predicate =>
fun items =>
std.array.all predicate items
# Validate at least one item in array satisfies predicate
let any_items = fun predicate =>
fun items =>
std.array.any predicate items
# Validate record has all required keys
let has_keys = fun required_keys =>
fun record =>
std.array.all
(fun key =>
std.record.has_field record key
)
required_keys
# ============================================================
# Exports
# ============================================================
# Reusable Validation Library for Nickel
{
# Core validation contracts
IpV4Contract = IpV4Contract,
CidrContract = CidrContract,
PortContract = PortContract,
SemverContract = SemverContract,
DomainContract = DomainContract,
OciTagContract = OciTagContract,
Iso8601Contract = Iso8601Contract,
PathContract = PathContract,
# IPv4 dotted-quad validator (e.g. "192.168.1.1"); every octet must be 0-255.
# std.string.is_match takes the regular expression first and the string to
# test second.
IpV4Contract = {
  label = "ValidIPv4",
  predicate = fun ip =>
    std.string.is_match "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)$" ip,
},
# Helper functions
min_length = min_length,
max_length = max_length,
range = range,
enum = enum,
non_empty_string = non_empty_string,
non_negative = non_negative,
positive = positive,
boolean_value = boolean_value,
# CIDR notation validator (e.g. "192.168.1.0/24"). Checks the shape only:
# octet values and the prefix length are not range-checked.
# std.string.is_match takes the regular expression first, the string second.
CidrContract = {
  label = "ValidCIDR",
  predicate = fun cidr =>
    std.string.is_match "^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}/\\d{1,2}$" cidr,
},
# Custom validators
all_items = all_items,
any_items = any_items,
has_keys = has_keys,
# Port number validator: accepts numbers strictly between 0 and 65536.
PortContract = {
  label = "ValidPort",
  predicate = fun port => 0 < port && 65536 > port,
},
# Regex-based string validators. In each predicate the regular expression is
# the first argument of std.string.is_match and the tested string the second.

# Three dot-separated numeric components (e.g. "1.2.3"); no pre-release or
# build suffixes.
SemverContract = {
  label = "ValidSemver",
  predicate = fun v =>
    std.string.is_match "^\\d+\\.\\d+\\.\\d+$" v,
},
# Lowercase domain name (e.g. "example.com"): starts and ends with an
# alphanumeric character, with letters, digits, '-' and '.' in between.
DomainContract = {
  label = "ValidDomain",
  predicate = fun d =>
    std.string.is_match "^[a-z0-9]([a-z0-9-\\.]{0,253}[a-z0-9])?$" d,
},
# OCI tag (e.g. "latest", "v1.0.0"): up to 128 characters, first one
# alphanumeric or '_'.
OciTagContract = {
  label = "ValidOCITag",
  predicate = fun tag =>
    std.string.is_match "^[a-zA-Z0-9_][a-zA-Z0-9._-]{0,127}$" tag,
},
# UTC timestamp of the exact form "YYYY-MM-DDTHH:MM:SSZ"; field values are not
# range-checked.
Iso8601Contract = {
  label = "ValidISO8601",
  predicate = fun ts =>
    std.string.is_match "^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$" ts,
},
# Filesystem path validator (simple): non-empty and free of "//".
# std.string.contains takes the substring to look for first and the string to
# search second.
PathContract = {
  label = "ValidPath",
  predicate = fun path =>
    std.string.length path > 0 && !(std.string.contains "//" path),
},
# Validator builders: each takes its bound(s) and returns a { label, predicate }
# record whose label embeds the bound(s).

# String length must be at least min_val.
min_length = fun min_val => {
  label = "MinLength%{std.to_string min_val}",
  predicate = fun text => min_val <= std.string.length text,
},
# String length must be at most max_val.
max_length = fun max_val => {
  label = "MaxLength%{std.to_string max_val}",
  predicate = fun text => max_val >= std.string.length text,
},
# Number must lie in the closed interval [min_val, max_val].
range = fun min_val max_val => {
  label = "Range[%{std.to_string min_val}-%{std.to_string max_val}]",
  predicate = fun num => min_val <= num && max_val >= num,
},
# Value must be an element of `values`; the label joins the values with ",",
# so they are expected to be strings.
enum = fun values => {
  label = "Enum[%{std.string.join "," values}]",
  predicate = fun candidate => std.array.elem candidate values,
},
# Fixed validators, each a { label, predicate } record.

# String with at least one character.
non_empty_string = {
  label = "NonEmptyString",
  predicate = fun text => 0 < std.string.length text,
},
# Number greater than or equal to zero.
non_negative = {
  label = "NonNegative",
  predicate = fun num => 0 <= num,
},
# Number strictly greater than zero.
positive = {
  label = "Positive",
  predicate = fun num => 0 < num,
},
# Value equal to one of the two boolean constants.
boolean_value = {
  label = "Boolean",
  predicate = fun flag => flag == true || flag == false,
},
# Lift an element predicate to arrays: true when every element satisfies it.
all_items = fun element_ok => fun elements =>
  elements |> std.array.all element_ok,
# Lift an element predicate to arrays: true when at least one element satisfies it.
any_items = fun element_ok => fun elements =>
  elements |> std.array.any element_ok,
# True when `record` defines every field named in `required_keys`.
# std.record.has_field takes the field name first and the record second.
has_keys = fun required_keys => fun record =>
  std.array.all
    (fun key => std.record.has_field key record)
    required_keys,
# Contract for a loosely-typed IP reference. Accepts, in order of checking:
#   - the empty string,
#   - a dotted quad of 1-3 digit groups (octet values are not range-checked),
#   - any string containing ':' (treated as IPv6),
#   - any string containing "fip" (treated as a FIP name).
# Non-string values are rejected up front so they produce a contract error
# instead of a type error inside the string primitives.
IpRef =
  std.contract.custom (
    fun _label =>
      fun value =>
        if !(std.is_string value) then
          'Error {
            message = "expected a String: empty, an IPv4 address, an IPv6 address (contains ':'), or a FIP name (contains 'fip')"
          }
        else if value == ""
          || std.string.is_match "^(\\d{1,3}\\.){3}\\d{1,3}$" value
          || std.string.contains ":" value
          || std.string.contains "fip" value
        then 'Ok value
        else 'Error {
          message = "expected empty, an IPv4 address, an IPv6 address (contains ':'), or a FIP name (contains 'fip'); got '%{value}'"
        }
  ),
}

View file

@ -0,0 +1,41 @@
# Vault reference contracts — typed pointers to secretumvault entries.
# Subsystems that need keys, credentials or signing material reference them
# by path inside vault rather than embedding the secret.
# Validator-based contract for a vault path. Checks, in order: the value is a
# String, is non-empty, contains no whitespace character, and contains '/'.
# Whitespace is matched with the regex class \s so that tabs and newlines are
# rejected as well as spaces, as the error message states.
let _VaultPath = std.contract.from_validator (fun value =>
  if !(std.is_string value)
  then 'Error { message = "VaultPath must be a String" }
  else if std.string.length value == 0
  then 'Error { message = "VaultPath must be non-empty" }
  else if std.string.is_match "\\s" value
  then 'Error { message = "VaultPath must not contain whitespace" }
  else if !(std.string.contains "/" value)
  then 'Error { message = "VaultPath must contain '/'" }
  else 'Ok
) in
{
# Path inside secretumvault. Validated as non-empty, no whitespace, contains '/'.
VaultPath = _VaultPath,
# NOTE(review): the `path` fields of the three reference contracts below are
# typed as plain String, so the VaultPath validation above is not applied to
# them — confirm whether that is intentional.
# Reference to a symmetric/asymmetric key stored in vault for encryption use.
# Only `path` is required; algorithm and derivation have defaults.
VaultKeyRef = {
path | String | doc "Vault path to the key entry (e.g. 'backup-manager/master-encryption-key')",
algorithm | [| 'aes_gcm_256, 'chacha20_poly1305, 'age_x25519, 'rsa_4096, 'ecdsa_p256, 'pq_kyber768 |] | default = 'age_x25519,
derivation | {
method | [| 'none, 'hkdf_sha256 |] | default = 'none,
info | String | doc "HKDF info parameter when derivation is hkdf_sha256" | default = "",
} | default = { method = 'none, info = "" },
},
# Reference to credentials (S3 access keys, B2 keys, NKey seeds, etc.) stored in vault.
# Both fields are required.
VaultCredRef = {
path | String | doc "Vault path to the credentials entry (e.g. 'backup-manager/destinations/hetzner-primary')",
kind | [| 's3, 'b2, 'sftp, 'nkey, 'jwt, 'token, 'tls_cert_bundle, 'etcd_client |] | doc "Type of credential payload at the path",
},
# Reference to a Cedar policy bundle in vault (for RBAC across actors).
VaultPolicyRef = {
path | String | doc "Vault path to the Cedar policy entry",
},
}

View file

@ -0,0 +1,64 @@
# Verify policy contracts — backup verification as parallel provisioning.
# Drill-as-recipe: instead of a boolean flag, declare a sandbox infra recipe
# the daemon coordinator spins up, restores into, and runs an integration
# test suite against. The only credible verification is one that actually
# restores and exercises the data.
let bp = import "backup_policy.ncl" in
{
# Test step discriminated union. The manager runs each step in order,
# collecting pass/fail/skip; an optional step does not abort on failure.
# Only `kind` and `name` are required. The per-kind fields are all optional or
# defaulted at the contract level, so nothing here checks that the fields
# matching a given `kind` are actually present.
TestStep = {
kind | [| 'http_check, 'sql_query, 'file_exists, 'cmd, 'integration |],
name | String,
optional | Bool | default = false,
timeout | bp.Duration | default = "60s",
# 'http_check
url | String | optional,
expected_status | Number | optional,
# 'sql_query
connection_ref | String | optional | doc "Reference to a connection profile (vault path or alias)",
query | String | optional,
expected | String | optional,
# 'file_exists
path | String | optional,
# 'cmd
run | String | optional,
expect_zero_exit | Bool | default = true,
# 'integration — invokes a higher-level scenario by name
component | String | optional,
scenario | String | optional,
},
# Reference to a parallel provisioning recipe that materialises the sandbox.
# The recipe lives under infra/<workspace>/verify-recipes/ and is itself
# declarative Nickel exported to the orchestrator.
# `name` is required; `args` is a string-to-string map that defaults to {}.
ProvisioningRecipeRef = {
name | String | doc "Recipe identifier (looked up in infra/<workspace>/verify-recipes/)",
args | { _ | String } | doc "Per-invocation parameters passed to the recipe" | default = {},
},
# Drill specification consumed by the daemon coordinator on a verify schedule.
# name, parallel_infra and test_suite are required; cleanup defaults to
# 'on_success, timeout to "30m", and schedule may be omitted.
DrillSpec = {
name | String,
parallel_infra | ProvisioningRecipeRef,
test_suite | Array TestStep,
cleanup | [| 'always, 'on_success, 'never |] | default = 'on_success,
timeout | bp.Duration | default = "30m",
schedule | bp.Schedule | optional | doc "Drill cadence; defaults to manual invocation when omitted",
},
# Top-level verify policy: a level (cheapest → costliest) plus an optional
# drill spec for 'restore_drill / 'full_dr levels.
# `drill` is optional for every level: this contract does not require it when
# level is 'restore_drill or 'full_dr.
VerifyPolicy = {
level | [| 'quick, 'deep, 'restore_drill, 'full_dr |] | default = 'quick,
schedule | bp.Schedule | optional,
drill | DrillSpec | optional,
},
}

78
schemas/lib/workflow.ncl Normal file
View file

@ -0,0 +1,78 @@
# schemas/lib/workflow.ncl — Workflow contracts
#
# A Workflow composes operations across components, modes, and layers.
# Each step targets one or more component operations (install, update, backup, ...).
# Workflows connect to: FSM dimensions, NATS events, backlog items, action log.
#
# Relationship to DAG:
# dag.ncl — L2 server provisioning (SSH, always install, server-bound)
# workflows/ — L3 service lifecycle (cross-component, any operation, cross-mode)
#
# Usage:
# let w = import "schemas/lib/workflow.ncl" in
# { deploy_services | w.WorkflowDef = { id = "...", steps = [...] } }
# Target for a single workflow step — a (component, operation) pair with optional mode override
# `operation` is a free-form string: the set of valid operations is not
# constrained by this contract.
let _WorkflowStepTarget = {
component | String,
operation | String,
mode | [| 'taskserv, 'cluster, 'container |] | optional,
} in
# A single step in a workflow — may touch multiple components
# id and targets are required; depends_on defaults to [], on_error to 'Stop,
# and condition may be omitted.
let _WorkflowStep = {
id | String,
targets | Array _WorkflowStepTarget,
depends_on | Array String | default = [],
condition | String | optional,
on_error | [| 'Stop, 'Rollback, 'Continue |] | default = 'Stop,
} in
# The structural definition of a workflow: ordered steps with rollback path
# id, description and steps are required; rollback is a list of steps of the
# same shape and defaults to [].
let _WorkflowDef = {
id | String,
description | String,
steps | Array _WorkflowStep,
rollback | Array _WorkflowStep | default = [],
} in
# Operational metadata bundled with a workflow: authorization, NATS, FSM, backlog, triggers
# id, name and description are required. `notifications` may be omitted, but
# when present its subject_prefix is required. `triggers` defaults to {},
# which in turn resolves to manual = true with no schedule or event.
let _WorkflowMetadata = {
id | String,
name | String,
description | String,
tags | Array String | default = [],
actors | Array [| 'Developer, 'Agent, 'CI |] | default = ['Developer],
requires_approval | Bool | default = false,
fsm_dimension | String | optional,
notifications | {
subject_prefix | String,
on_start | Bool | default = true,
on_step | Bool | default = true,
on_complete | Bool | default = true,
on_error | Bool | default = true,
} | optional,
backlog_refs | Array String | default = [],
procedure_doc | String | optional,
adr_refs | Array String | default = [],
triggers | {
manual | Bool | default = true,
schedule | String | optional,
on_event | String | optional,
} | default = {},
} in
{
  # Contracts, re-exported under their public names.
  WorkflowStepTarget = _WorkflowStepTarget,
  WorkflowStep = _WorkflowStep,
  WorkflowDef = _WorkflowDef,
  WorkflowMetadata = _WorkflowMetadata,
  # Constructors: return the argument with the matching contract applied.
  # make_step applies the contract the same way make_workflow does, so a
  # misspelled or unknown field is reported when the step is built rather than
  # later, when the step is checked as part of a WorkflowDef.
  make_step = fun data => data | _WorkflowStep,
  make_workflow = fun data => data | _WorkflowDef,
}

View file

@ -84,6 +84,9 @@
services | doc "Service registry and definitions"
= import "./infrastructure/compute/services/main.ncl",
scaling | doc "Node role and scale policy contracts (NodeRole, ScaleTemplate, ScalePolicy)"
= import "./infrastructure/compute/scaling.ncl",
},
storage | doc "Storage resources (VMs, volumes, golden images)"
@ -98,6 +101,9 @@
= import "./infrastructure/storage/golden_image/main.ncl",
},
images | doc "Provider role images (snapshot lifecycle, hardware limits, state)"
= import "./infrastructure/images/main.ncl",
provisioning | doc "Nested provisioning schemas"
= {
nested_provisioning | doc "Nested provisioning schemas"
@ -111,6 +117,9 @@
workflows | doc "Batch workflow schemas"
= import "./operations/workflows/main.ncl",
server_deploy | doc "Server deployment workflow plan (typed step sequencing)"
= import "./operations/workflows/server_deploy/main.ncl",
batch | doc "Batch scheduler and executor schemas"
= import "./operations/batch/main.ncl",

View file

@ -0,0 +1,22 @@
# ServerDeployPlan Contracts — typed workflow for server deployment step sequencing.
{
# Kind of work a deploy step performs.
StepType = [| 'check, 'task, 'notify |],
# Per-step reaction to a failure.
FailMode = [| 'stop, 'warn, 'skip |],
# One step of a server deployment plan. depends_on and timeout_seconds are
# optional; every other field is required (none has a default).
DeployStep = {
id | String,
name | String,
type | StepType,
required | Bool,
on_fail | FailMode,
depends_on | Array String | optional,
timeout_seconds | Number | optional,
},
# A named sequence of deploy steps plus the plan-level failure policy.
# All three fields are required.
ServerDeployPlan = {
name | String,
steps | Array DeployStep,
on_failure | [| 'stop_all, 'continue, 'rollback |],
},
}

View file

@ -0,0 +1,75 @@
# ServerDeployPlan Defaults — canonical server deployment step sequence.
{
# Canonical plan. The field itself carries `| default` priority.
# Dependency structure declared below:
#   check_network, check_ssh_creds, check_image_exists -> create_server
#   create_server -> wait_boot -> verify_ssh -> run_taskservs
# check_image_fresh is advisory (required = false, on_fail = 'warn) and no
# other step depends on it; run_taskservs is likewise non-required.
default_server_deploy_plan | default = {
name = "default-server-deploy",
on_failure = 'stop_all,
steps = [
{
id = "check_network",
name = "Verify network connectivity",
type = 'check,
required = true,
on_fail = 'stop,
depends_on = [],
},
{
id = "check_ssh_creds",
name = "Verify SSH credentials",
type = 'check,
required = true,
on_fail = 'stop,
depends_on = [],
},
{
id = "check_image_exists",
name = "Verify role image snapshot exists",
type = 'check,
required = true,
on_fail = 'stop,
depends_on = [],
},
{
id = "check_image_fresh",
name = "Check role image freshness",
type = 'check,
required = false,
on_fail = 'warn,
depends_on = [],
},
{
id = "create_server",
name = "Create server via provider",
type = 'task,
required = true,
on_fail = 'stop,
depends_on = ["check_network", "check_ssh_creds", "check_image_exists"],
},
{
id = "wait_boot",
name = "Wait for server boot",
type = 'task,
required = true,
on_fail = 'stop,
depends_on = ["create_server"],
# Only step with an explicit timeout.
timeout_seconds = 300,
},
{
id = "verify_ssh",
name = "Verify SSH connectivity",
type = 'task,
required = true,
on_fail = 'stop,
depends_on = ["wait_boot"],
},
{
id = "run_taskservs",
name = "Run post-boot taskservs",
type = 'task,
required = false,
on_fail = 'warn,
depends_on = ["verify_ssh"],
},
],
},
}

View file

@ -0,0 +1,13 @@
# ServerDeployPlan public API — typed workflow for declarable server deployment sequences.
#
# Fields of the resulting record:
#   defaults                 the record imported from ./defaults.ncl, exposed as-is
#   DefaultServerDeployPlan  the `default_server_deploy_plan` value of that record
#   make_deploy_plan         function that merges (`&`) the caller's overrides onto the
#                            default plan; annotated `not_exported`
let contracts_lib = import "./contracts.ncl" in
let defaults_lib = import "./defaults.ncl" in
# NOTE(review): contracts_lib is bound but never referenced below — confirm whether
# the plans were meant to be checked against contracts_lib.ServerDeployPlan.
let base_plan = defaults_lib.default_server_deploy_plan in
{
  DefaultServerDeployPlan = base_plan,
  defaults = defaults_lib,
  make_deploy_plan
    | not_exported
    = fun overrides => base_plan & overrides,
}

View file

@ -1,39 +1,100 @@
# TypeDialog + Nickel Configuration System for Platform Services
# Platform Configuration System - Schemas & Defaults
Complete configuration system for provisioning platform services (orchestrator, control-center, mcp-server, vault-service,
extension-registry, rag, ai-service, provisioning-daemon) across multiple deployment modes (solo, multiuser, cicd, enterprise).
**Source of truth** for platform service configurations (orchestrator, control-center, mcp-server, vault-service, extension-registry, rag, ai-service, provisioning-daemon) across all deployment modes.
## Architecture Overview
This directory contains the **active, production configuration system** that powers:
- Type-safe configuration via Nickel schemas
- Constraint-based validation
- Multi-mode deployment (solo/multiuser/cicd/enterprise)
- Interactive TypeDialog forms
- TOML export for Rust service consumption
This system implements a **TypeDialog + Nickel configuration workflow** that provides:
- **Type-safe configuration** via Nickel schemas with validation
- **Interactive configuration** via TypeDialog forms with real-time constraint validation
- **Multi-mode deployment** (solo/multiuser/cicd/enterprise) with mode-specific defaults
- **Configuration composition** (base defaults + mode overlays + user customization + validation)
- **Automated TOML export** for Rust service consumption
- **Docker Compose + Kubernetes templates** for infrastructure deployment
## Directory Structure
## Directory Structure (Flat)
```text
provisioning/.typedialog/provisioning/platform/
├── constraints/ # Single source of truth for validation limits
├── schemas/ # Nickel type contracts (services + common + deployment modes)
├── defaults/ # Default configuration values (services + common + deployment modes)
├── validators/ # Validation logic (constraints, ranges, business rules)
├── configs/ # Generated mode-specific Nickel configurations (4 services × 4 modes = 16 configs)
├── forms/ # TypeDialog form definitions (4 main forms + flat fragments)
│ └── fragments/ # Reusable form fragments (workspace, server, database, etc.)
├── templates/ # Jinja2 + Nickel templates for config/deployment generation
│ ├── docker-compose/ # Docker Compose templates (solo/multiuser/cicd/enterprise)
│ ├── kubernetes/ # Kubernetes deployment templates
│ └── configs/ # Service configuration templates (TOML generation)
├── scripts/ # Nushell orchestration scripts (configure, generate, validate, deploy)
├── examples/ # Example configurations for different deployment scenarios
└── values/ # User configuration files (gitignored *.ncl)
provisioning/schemas/platform/
├── orchestrator.ncl # Service schemas (flat, one file per service)
├── control-center.ncl
├── mcp-server.ncl
├── vault-service.ncl
├── extension-registry.ncl
├── ai-service.ncl
├── rag.ncl
├── provisioning-daemon.ncl
├── common/ # Shared schemas and utilities
│ ├── constraints.ncl # GENERATED - Nickel constraint validators (do NOT edit)
│ ├── helpers.ncl # Configuration composition helpers
│ ├── server.ncl # HTTP server schema
│ ├── database.ncl # Database backend schema
│ ├── security.ncl # Authentication/encryption
│ ├── monitoring.ncl # Metrics and health checks
│ ├── logging.ncl # Log configuration
│ ├── network.ncl # Network binding and TLS
│ ├── storage.ncl # Storage backend
│ └── workspace.ncl # Workspace configuration
├── deployment/ # Mode-specific schemas (flat)
│ ├── solo.ncl # 2 CPU, 4GB RAM
│ ├── multiuser.ncl # 4 CPU, 8GB RAM
│ ├── cicd.ncl # 8 CPU, 16GB RAM
│ └── enterprise.ncl # 16+ CPU, 32+ GB RAM
├── defaults/ # Default values by service and mode
│ ├── orchestrator-defaults.ncl
│ ├── control-center-defaults.ncl
│ ├── mcp-server-defaults.ncl
│ ├── vault-service-defaults.ncl
│ ├── extension-registry-defaults.ncl
│ ├── ai-service-defaults.ncl
│ ├── rag-defaults.ncl
│ ├── provisioning-daemon-defaults.ncl
│ ├── common/ # Shared defaults (6 files)
│ │ ├── server-defaults.ncl
│ │ ├── database-defaults.ncl
│ │ ├── security-defaults.ncl
│ │ ├── monitoring-defaults.ncl
│ │ └── logging-defaults.ncl
│ └── deployment/ # Mode defaults (4 files)
│ ├── solo-defaults.ncl
│ ├── multiuser-defaults.ncl
│ ├── cicd-defaults.ncl
│ └── enterprise-defaults.ncl
├── constraints/ # Single source of truth
│ ├── constraints.toml # MASTER FILE - All validation limits
│ └── README.md
├── configs/ # Generated intermediate configs
│ ├── orchestrator.solo.ncl
│ ├── orchestrator.multiuser.ncl
│ ├── orchestrator.cicd.ncl
│ ├── orchestrator.enterprise.ncl
│ └── ... (8 services × 4 modes = 32 configs)
├── templates/ # Output generation templates
│ ├── configs/ # TOML export templates (Jinja2)
│ ├── docker-compose/ # Docker Compose templates
│ ├── kubernetes/ # Kubernetes manifests
│ └── service-config-template.ncl
├── examples/ # Reference configurations
│ ├── orchestrator-solo.ncl
│ ├── orchestrator-enterprise.ncl
│ ├── control-center-multiuser.ncl
│ └── full-platform-enterprise.ncl
└── values/ # User customizations (gitignored)
├── .gitignore # Ignores *.ncl, *.toml
└── (user configs populated at runtime)
```
**Key Changes**:
- ✅ Flat structure (no nested `schemas/` or `defaults/`)
- ✅ Archived `validators/` (88KB dead code)
- ✅ Generated `constraints.ncl` (gitignored)
- ✅ Master `constraints.toml` (single source)
## Configuration Workflow
### 1. User Interaction (TypeDialog)

View file

@ -1,11 +1,14 @@
# AI Service Schema
# AI model integration with RAG and MCP services
let constraints = import "schemas/platform/common/constraints.ncl" in
let docker_build_schema = import "schemas/platform/docker-build.ncl" in
{
AiServiceConfig = {
server | {
host | String,
port | Number,
port | Number | constraints.port_high,
workers | Number | optional,
},
@ -34,5 +37,8 @@
logging | {
level | String | default = "info",
} | optional,
# Docker build configuration
build | docker_build_schema.DockerBuildConfig | optional,
},
}

View file

@ -0,0 +1,86 @@
# Platform Constraints and Validators
# AUTOMATICALLY GENERATED from constraints.toml - DO NOT EDIT DIRECTLY
# Generated via: nickel eval scripts/generate-constraints.ncl
# Source: schemas/platform/constraints/constraints.toml
#
# Usage: Import in schemas to validate configuration fields
# Example: port | constraints.port_standard
#
# To modify constraints, edit constraints.toml and run:
# nickel eval scripts/generate-constraints.ncl > schemas/platform/common/constraints.ncl

# Builds a contract that accepts a value x when lo <= x <= hi (both bounds
# inclusive) and otherwise fails with the given message.
let bounded = fun lo hi msg =>
  std.contract.from_validator (fun x =>
    if x >= lo && x <= hi then
      'Ok
    else
      'Error { message = msg }
  )
in
{
  # Valid port range (avoid system ports < 1024)
  port_standard = bounded 1024 65535 "port_standard must be between 1024 and 65535",

  # Platform service ports (>= 9000)
  port_high = bounded 9000 65535 "port_high must be between 9000 and 65535",

  # Vault service port number
  vault_port = bounded 1024 65535 "vault_port must be between 1024 and 65535",

  # Extension registry server port
  registry_port = bounded 1024 65535 "registry_port must be between 1024 and 65535",

  # Workflow engine worker thread count
  workers = bounded 1 32 "workers must be between 1 and 32",

  # HTTP server worker thread count
  server_workers = bounded 1 32 "server_workers must be between 1 and 32",

  # Maximum concurrent HTTP connections
  max_connections = bounded 10 10000 "max_connections must be between 10 and 10000",

  # Retry attempts for failed tasks
  retry_attempts = bounded 0 10 "retry_attempts must be between 0 and 10",

  # Metrics collection interval in seconds (10s-5min)
  metrics_interval = bounded 10 300 "metrics_interval must be between 10 and 300",

  # Health check interval in seconds (5s-5min)
  health_check_interval = bounded 5 300 "health_check_interval must be between 5 and 300",

  # Task execution timeout in milliseconds (1min-24hrs)
  task_timeout = bounded 60000 86400000 "task_timeout must be between 60000 and 86400000",

  # Tool execution timeout in milliseconds (5s-10min)
  tool_timeout = bounded 5000 600000 "tool_timeout must be between 5000 and 600000",

  # HTTP keep-alive timeout in seconds (0=disabled)
  keep_alive = bounded 0 600 "keep_alive must be between 0 and 600",

  # Rate limiting max requests per window
  rate_limit_requests = bounded 10 10000 "rate_limit_requests must be between 10 and 10000",
}

Some files were not shown because too many files have changed in this diff Show more