# ADR-016: Workspace Taskserv Execution as Typed DAGs (Formula Pattern)
#
# This file evaluates to the result of `make_adr` (imported from
# `adr-defaults.ncl`) applied to the record below. The record carries the
# decision metadata, context, rationale, consequences, rejected alternatives,
# machine-checkable constraints, and the ontology verdict for this decision.
let d = import "adr-defaults.ncl" in

d.make_adr {
  # --- Identity and lifecycle -------------------------------------------------
  id = "adr-016",
  title = "Workspace Taskserv Execution as Typed DAGs (Formula Pattern)",
  status = 'Accepted,
  date = "2026-03-14",

  # --- Problem statement ------------------------------------------------------
  context = "Workspace server definitions declare taskservs as positional arrays — e.g. `taskservs = [etcd, kubernetes, containerd, cilium]`. The provisioning platform executes these in strict linear order regardless of actual dependencies between tasks. This model has three problems: (1) It cannot express parallelism — `containerd` and `coredns` are independent of each other but are serialized behind `kubernetes`. (2) It cannot express conditional edges — a failed `etcd` should halt `kubernetes` but a failed `coredns` should not. (3) The execution intent is implicit — there is no machine-readable artifact that declares which tasks depend on which, so no validation is possible at schema time. The Orchestrator already implements a full `DependencyGraph` with topological sort and `max_parallel_tasks` in `workflow.rs`, but `batch.rs` was building a linear chain from the positional array, ignoring the graph entirely.",

  # --- What was decided -------------------------------------------------------
  decision = "Workspace infrastructure definitions declare taskserv execution order as typed DAGs via a `Formula` Nickel record exported from `schemas/lib/formula.ncl`. Each `FormulaNode` carries: `id`, a `TaskServDef` (name, profile, target_save_path), `depends_on: Array FormulaDep` (referential edges by node_id + DepKind), `parallel: Bool`, `on_error: [| Stop | Continue | Retry |]`, and `max_retries: u8`. The Formula is validated at schema time by a custom Nickel contract that checks: no duplicate node IDs, every `depends_on.node_id` references a declared node, every `edges.{from,to}` references a declared node. At runtime, `Formula::from_json` in `formula.rs` deserializes the JSON export and `Formula::into_workflow(FormulaWorkflowConfig)` converts it into a `WorkflowDefinition` fed directly to `BatchWorkflowEngine::execute_workflow`, which runs the existing `DependencyGraph` topological sort with `max_parallel_tasks`. Positional `taskservs` arrays remain valid — they are the per-server composition definition and are retrocompatible. Formulas are an additive artifact in the same `servers.ncl` file.",

  # --- Why: one claim/detail pair per argument --------------------------------
  rationale = [
    {
      claim = "Schema-time referential integrity catches broken DAGs before deployment",
      # Wording aligned with the "negative" consequence and the first
      # constraint in this ADR: the custom contract fires on `nickel export`,
      # not on `nickel typecheck`.
      detail = "The `_Formula` custom Nickel contract validates all `depends_on.node_id` and edge endpoints against the declared `nodes` array. A missing node ID is a contract violation reported at `nickel export` time, not a runtime panic. This enforces the type-safety-nickel axiom on execution topology.",
    },
    {
      claim = "Parallelism is now explicit and governed",
      detail = "Nodes marked `parallel = true` with no shared dependency run concurrently up to `max_parallel`. The control plane formula runs etcd first, then kubernetes, containerd, and coredns in parallel (3 workers), then cilium after k8s+containerd. This halved the estimated provisioning time for a 5-node cluster compared to the linear chain.",
    },
    {
      claim = "on_error semantics are declarative, not implicit",
      detail = "`on_error = 'Stop` halts the entire workflow on node failure (required for etcd, kubernetes). `on_error = 'Continue` allows the workflow to proceed past a non-critical failure (coredns can fail without blocking cilium). `on_error = 'Retry` retries up to max_retries times before propagating. Previously all failures were treated as Stop with no way to express Continue.",
    },
    {
      claim = "Retrocompatible — zero migration cost for existing servers",
      detail = "TaskServDef now has `depends_on`, `on_error`, `max_retries` fields with defaults. Existing `servers.ncl` files typecheck unchanged. Formulas are an opt-in additive array alongside the existing `servers` array. Batch.rs preserves the linear execution path when no formula is supplied.",
    },
    {
      claim = "Single runtime path — the existing DependencyGraph is reused",
      detail = "No new execution engine was written. `Formula::into_workflow` produces a standard `WorkflowDefinition` consumed by the existing `BatchWorkflowEngine::execute_workflow`. The DependencyGraph topological sort and parallel dispatch already existed in workflow.rs and were simply never reached via the batch coordinator.",
    },
  ],

  # --- Outcomes of the decision -----------------------------------------------
  consequences = {
    positive = [
      "Parallel taskserv execution is now possible and schema-validated",
      "DAG structure is a first-class artifact — diffable, auditable, versionable in git",
      "on+re reflection mode `provisioning-validate-formula` provides cross-validation (taskserv existence, ConflictsWith, cycle detection)",
      "FormulaWorkflowConfig<'a> groups conversion parameters — batch.rs call sites are explicit and lint-clean",
      "Ontology node `formula-dag-execution` registers this pattern for on+re governance",
    ],
    negative = [
      "Two parallel models exist: positional `taskservs` arrays (per-server composition) and `formulas` (execution DAGs). Authors must understand the distinction.",
      "Formula node IDs are a new namespace within a server definition — ID collisions across formulas in the same file are not currently detected at parse time (only within a single formula).",
      "Nickel's custom contract for referential integrity runs at export time, not at typecheck time — `nickel typecheck` alone is insufficient; `nickel export` is required for full validation.",
    ],
  },

  # --- Options that were evaluated and discarded ------------------------------
  alternatives_considered = [
    {
      option = "Positional array with dependency annotations as comments",
      why_rejected = "Comments are not machine-readable. Cannot be validated, cannot drive runtime parallelism, cannot be consumed by on+re modes. Violates the type-safety-nickel axiom.",
    },
    {
      option = "Separate formula file per server (e.g. wuji-formula.ncl)",
      why_rejected = "Separates declaration from context. The `servers.ncl` file already owns the server definition including its taskservs — the formula belongs alongside it. Import proliferation adds no structural benefit.",
    },
    {
      option = "Encode DAG as a TOML/YAML file consumed by the Orchestrator",
      why_rejected = "Breaks the type-safety-nickel axiom. TOML/YAML have no contracts, no referential integrity, no schema composition. The Formula pattern allows the Nickel schema to own the execution topology, which is where it belongs.",
    },
    {
      option = "Extend TaskServDef directly with execution metadata (depends_on, on_error) and derive the DAG implicitly",
      why_rejected = "Conflates composition (which taskservs a server needs) with orchestration (in what order and how). The Formula is a separate, named artifact that can be versioned, validated, and governed independently from the taskserv list.",
    },
  ],

  # --- Enforceable constraints: each pairs a claim with an automated check ----
  constraints = [
    {
      id = "formula-node-ids-unique-within-formula",
      claim = "Node IDs must be unique within a single Formula — the custom Nickel contract enforces this at export time",
      scope = "schemas/lib/formula.ncl (_Formula contract), workspaces/*/infra/*/servers.ncl",
      severity = 'Hard,
      # NOTE(review): the command uses POSIX-style `2>/dev/null` redirection;
      # confirm that the runner behind 'NuCmd accepts this syntax.
      check = { tag = 'NuCmd, cmd = "nickel export --format json examples/workspaces/basic/servers.ncl 2>/dev/null | jq '[.formulas[].nodes[].id] | group_by(.) | map(select(length > 1)) | length == 0' | grep -q true", expect_exit = 0 },
      rationale = "Duplicate node IDs produce ambiguous depends_on resolution. The contract catches this before the JSON reaches formula.rs.",
    },
    {
      id = "formula-depends-on-declared-nodes-only",
      claim = "Every depends_on.node_id and edge endpoint must reference a declared node in the same formula",
      scope = "schemas/lib/formula.ncl (_Formula contract)",
      severity = 'Hard,
      # NOTE(review): this check only asserts that the schema file exists; it
      # does not exercise the referential-integrity contract itself.
      check = { tag = 'FileExists, path = "schemas/lib/formula.ncl", present = true },
      rationale = "A reference to a non-existent node_id would silently drop the dependency at runtime, producing an incorrect execution order with no error.",
    },
    {
      id = "formula-runtime-conversion-via-formula-rs-only",
      claim = "All Formula-to-WorkflowDefinition conversion must go through Formula::into_workflow — no ad-hoc JSON parsing in batch.rs or elsewhere",
      scope = "platform/crates/orchestrator/src/batch.rs, platform/crates/orchestrator/src/formula.rs",
      severity = 'Hard,
      # NOTE(review): the claim is a prohibition, yet `must_be_empty = false`
      # presumably requires the pattern to be found — confirm the intended
      # pattern and polarity of this grep.
      check = { tag = 'Grep, pattern = "nickel export", paths = ["platform/crates/"], must_be_empty = false },
      rationale = "The FormulaWorkflowConfig struct and into_workflow carry the semantic mapping (task names, arg construction, metadata injection). Bypassing it risks silent divergence between schema intent and runtime behavior.",
    },
  ],

  # --- Ontology governance verdict --------------------------------------------
  ontology_check = {
    decision_string = "Workspace taskserv execution topology as typed DAGs via Formula Nickel pattern, converted to WorkflowDefinition at runtime by formula.rs",
    invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
    verdict = 'Safe,
  },

  related_adrs = ["adr-014-solid-enforcement", "adr-015-solo-mode-architecture"],
}