# provisioning/adrs/adr-021-workspace-composition-dag.ncl

let d = import "adr-defaults.ncl" in
d.make_adr {
# Identity and lifecycle fields of this ADR record (passed to `d.make_adr`).
id = "adr-021",
title = "Workspace Composition DAG: Multi-Formula Orchestration and Task Namespacing",
status = 'Accepted,
date = "2026-04-03",
# Problem statement: formulas (ADR-016) order taskservs within one server, but
# nothing ordered or health-gated the formulas of a multi-server workspace.
context = "ADR-016 established the Formula pattern for intra-server taskserv execution as a typed DAG. A workspace may contain multiple servers, each with a Formula defining its taskserv execution order. When a workspace is provisioned end-to-end, the Orchestrator needs to execute these Formulas in dependency order — the storage cluster Formula must wait for the control plane Formula to reach a stable state before Ceph OSD initialization begins. This inter-formula coordination layer was missing: batch.rs executed Formulas sequentially in declaration order with no dependency semantics and no health-gate mechanism to verify cluster readiness between Formula groups.",
# The decision: a per-infra `dag.ncl` declares inter-formula edges; at runtime all
# formulas are merged into one WorkflowDefinition with `formula_id::task_name`
# task names, synthetic health-gate tasks, and NATS events on `provisioning.dag.*`.
decision = "Workspace infrastructure definitions declare inter-formula execution order in a `dag.ncl` artifact at `{workspace_root}/infra/{infra}/dag.ncl`. Each entry in `dag.ncl.composition.formulas[]` is a `FormulaCompositionEntry` carrying: `formula_id` (matching a formula declared in `servers.ncl`), `depends_on: Array FormulaDep` (edges to other formula_ids with a `condition: String`), `parallel: Bool`, and an optional `health_gate: HealthGateConfig` (a shell command + expected output + timeout). At runtime, `WorkspaceComposition::into_workflow` merges all formula WorkflowDefinitions into a single unified WorkflowDefinition: task names are rewritten as `formula_id::task_name` to prevent collisions across formulas, health gates are injected as synthetic `WorkflowTaskDefinition` records (with `metadata[\"type\"] = \"health-gate\"`), and inter-formula edges wire the terminal tasks of an upstream formula to the health gate (if present) or directly to the root tasks of the downstream formula. The merged WorkflowDefinition is executed by the existing `BatchWorkflowEngine::execute_workflow`. NATS events are emitted on `provisioning.dag.formula.{started,completed,failed}` and `provisioning.dag.healthgate.{checking,passed,failed}` (gated by the `nats` Cargo feature).",
# Rationale: one `{ claim, detail }` record per argument supporting the decision.
rationale = [
# 1. Why all formulas are merged into a single WorkflowDefinition.
{
claim = "Single WorkflowDefinition preserves the existing parallel dispatch and dependency resolution",
detail = "The `BatchWorkflowEngine` already implements topological sort and `max_parallel_tasks` dispatch via `DependencyGraph`. Merging all formulas into one WorkflowDefinition reuses this path without a new execution layer. Alternative: run each formula as a separate workflow, sequenced by the batch coordinator — this loses cross-formula parallelism (a worker-plane formula could start concurrently with a storage formula if neither depends on the other).",
},
# 2. Why task names are prefixed with `formula_id::`.
# NOTE(review): the detail attributes `/` path separators to NATS subjects, while the
# dot-separator entry in `alternatives_considered` says dots are the NATS segment
# separator — confirm the intended reason for rejecting `/`.
{
claim = "formula_id::task_name namespacing is the minimal collision prevention mechanism",
detail = "Both `wuji-cp-0-formula` and `wuji-strg-0-formula` declare a node named `etcd_create`. Without namespacing, merging these into one WorkflowDefinition produces duplicate task names, breaking DependencyGraph.find(). The `::` separator was chosen over `.` (conflicts with Nickel field access in debug output) and `/` (ambiguous with path separators in NATS subjects). Formula_id recovery at runtime is `task_name.split_once(\"::\").map(|(fid, _)| fid)`.",
},
# 3. Why health gates are modelled as ordinary (synthetic) workflow tasks.
{
claim = "Health gates as synthetic tasks avoid a separate inter-formula synchronization mechanism",
detail = "A health gate that checks `kubectl get nodes --field-selector=... | wc -l` must block the dependent formula's root tasks until the cluster reports the expected node count. Injecting the gate as a `WorkflowTaskDefinition` with `metadata[\"type\"] = \"health-gate\"` reuses the existing task dependency mechanism — no new scheduling primitive is needed. The health gate command runs as a polling loop (up to timeout_seconds) inside `execute_task_with_retry`.",
},
# 4. Why composition lives in its own dag.ncl instead of servers.ncl.
{
claim = "dag.ncl at infra/{infra}/dag.ncl separates composition from server topology",
detail = "Composition (which formulas run in what order) is a different concern from server topology (which servers exist and what taskservs they run). Keeping dag.ncl separate from servers.ncl allows the composition DAG to evolve (e.g. adding a monitoring formula) without touching the server definitions, and vice versa. The CLI `provisioning dag show/validate/export` resolves dag.ncl independently of servers.ncl.",
},
# 5. Why NATS events are emitted, and why emission is feature-gated.
# The subject list below is kept identical to the one in `decision`: formula events use
# {started,completed,failed}, health-gate events use {checking,passed,failed}.
{
claim = "NATS event emission on provisioning.dag.* enables live DAG visualization in ontoref-daemon",
detail = "The `ontoref-daemon` DagGraphProvider plugin subscribes to `provisioning.dag.>` to render live formula execution state. The subject hierarchy (`provisioning.dag.formula.{started,completed,failed}` and `provisioning.dag.healthgate.{checking,passed,failed}`) maps directly to the three-layer DAG node types (formula, health gate, task). Gating emission behind `#[cfg(feature = \"nats\")]` means the orchestrator compiles and runs without NATS — the feature is additive.",
},
],
# Consequences of the decision, split into benefits and accepted costs.
consequences = {
# Benefits gained by adopting the composition DAG.
positive = [
"Inter-formula dependency order is a first-class schema artifact — validated by `provisioning-validate-formula` and `validate-topology` reflection mode",
"Health gates enforce cluster readiness between formula groups without ad-hoc sleep loops in provisioning scripts",
"Cross-formula parallelism: independent formulas (no depends_on edge) run concurrently within the same workspace provisioning run",
"NATS events on `provisioning.dag.*` feed live DAG visualization and audit trail",
"formula_id::task_name convention is recoverable — any consumer can extract formula_id from the task name without additional metadata",
],
# Known costs and risks.
# NOTE(review): the first entry says a dag.ncl/servers.ncl formula_id mismatch causes a
# runtime panic in WorkspaceComposition::from_json, whereas the rationale of constraint
# `dag-ncl-formula-ids-must-match-servers-ncl` in this file says from_json silently
# skips the formula — confirm the actual behavior and align the two statements.
negative = [
"Two DAG artifacts must be kept consistent: formula_ids in dag.ncl must match formula names in servers.ncl — inconsistency causes runtime panic in WorkspaceComposition::from_json (no schema-time cross-file validation)",
"Health gate commands are shell strings — not typed, not validated at schema time. A broken gate command is discovered at runtime when the gate task fails",
"Task names in NATS events and logs are `formula_id::task_name` — tooling must understand the `::` separator to route events correctly",
"Merging N formulas into one WorkflowDefinition means a single task failure with `fail_fast = true` halts all formulas, even those with no dependency on the failing formula",
],
},
# Alternatives that were evaluated and rejected; each record pairs the option with
# the reason it was not chosen.
alternatives_considered = [
# Status quo: keep running formulas one after another in declaration order.
{
option = "Sequential formula execution in declaration order (existing batch.rs behavior)",
why_rejected = "No parallelism, no dependency semantics, no health gates. The storage formula runs after the worker formula even if workers could have started in parallel. Execution order is implicit (declaration order) not explicit (DAG edges).",
},
# One workflow per formula with an extra coordinator on top.
{
option = "Separate WorkflowDefinition per formula, coordinated by a top-level orchestrator loop",
why_rejected = "Requires a new scheduling layer above BatchWorkflowEngine. Loses cross-formula parallelism. The existing DependencyGraph already handles the scheduling — merging into one definition reuses it at zero cost.",
},
# Health gate placed inside a formula instead of between formulas.
{
option = "Health gate as a special Formula node type (within the formula, not between formulas)",
why_rejected = "A health gate that checks cluster-wide readiness (e.g. `kubectl get nodes`) crosses formula boundaries — it is a property of the composition, not of a single formula. Encoding it within a formula would couple the formula to knowledge of other formulas' outputs.",
},
# Dot instead of `::` as the namespacing separator.
{
option = "formula_id.task_name (dot separator) for task namespacing",
why_rejected = "Dot conflicts with Nickel field access notation in debug output and log messages. It also conflicts with NATS subject segment separator conventions where dots are meaningful separators. `::` is idiomatic in Rust and unambiguous in all output contexts.",
},
# Inter-formula edges stored in servers.ncl instead of a separate dag.ncl.
{
option = "Encode inter-formula edges in servers.ncl alongside the formula declaration",
why_rejected = "servers.ncl owns the server topology — which servers exist, what they run. Composition topology (which formulas execute in which order) is a workspace-level concern that may vary without changing the server definitions. A separate dag.ncl allows composition reuse across workspaces that share server topologies.",
},
],
# Machine-checkable constraints derived from the decision. Each record carries an id,
# the claim, the file scope it applies to, a severity, an automated `check`, and the
# rationale for enforcing it.
constraints = [
# dag.ncl formula_ids must exist in servers.ncl. The check passes only when
# `provisioning dag validate` prints CROSS_REF_OK (grep -q exits 0 on a match).
{
id = "dag-ncl-formula-ids-must-match-servers-ncl",
claim = "Every formula_id in dag.ncl.composition.formulas[] must correspond to a formula declared in the same infra's servers.ncl",
scope = "workspaces/*/infra/*/dag.ncl, workspaces/*/infra/*/servers.ncl",
severity = 'Hard,
check = { tag = 'NuCmd, cmd = "provisioning dag validate 2>/dev/null | grep -q 'CROSS_REF_OK'", expect_exit = 0 },
rationale = "A dag.ncl formula_id with no matching servers.ncl formula causes WorkspaceComposition::from_json to silently skip the formula, producing an incomplete workflow. The validate-topology reflection mode and `provisioning dag validate` enforce this cross-file constraint.",
},
# Composed task names must use the `::` separator. The grep looks for the
# `split_once("::")` consumer call under the orchestrator sources;
# `must_be_empty = false` presumably means a match is required — confirm against the
# check runner.
{
id = "task-namespacing-via-double-colon-only",
claim = "All task names in a composed WorkflowDefinition must use the `formula_id::task_name` format — no other separator is permitted",
scope = "platform/crates/orchestrator/src/formula.rs (WorkspaceComposition::into_workflow), platform/crates/orchestrator/src/workflow.rs",
severity = 'Hard,
check = { tag = 'Grep, pattern = "split_once\\(\"::\"\\)", paths = ["platform/crates/orchestrator/src/"], must_be_empty = false },
rationale = "The `::` separator is the runtime contract between WorkspaceComposition::into_workflow (producer) and workflow.rs result processing (consumer). Using a different separator in any new composition code would break formula_id extraction and NATS event routing.",
},
# A health gate needs at least one upstream formula.
# `jq -e` is required here: without it jq exits 0 whether the expression prints `true`
# or `false` (and also when the export fails and jq receives no input), so
# `expect_exit = 0` could never fail. With `-e`, a `false` result or missing output
# yields a non-zero exit status.
# NOTE(review): the command inspects one concrete workspace path, while `scope` covers
# workspaces/*/infra/*/dag.ncl — confirm whether other workspaces should be checked too.
{
id = "health-gate-requires-nonempty-depends-on",
claim = "A FormulaCompositionEntry with a health_gate set must have at least one entry in depends_on",
scope = "workspaces/*/infra/*/dag.ncl, schemas/lib/dag/contracts.ncl",
severity = 'Hard,
check = { tag = 'NuCmd, cmd = "nickel export --format json --import-path . workspaces/librecloud_renew/infra/wuji/dag.ncl 2>/dev/null | jq -e '[.composition.formulas[] | select(.health_gate != null and (.depends_on | length) == 0)] | length == 0'", expect_exit = 0 },
rationale = "A health gate with no upstream formula has no terminal tasks to wire the gate to. The gate task gets empty depends_on and runs immediately, defeating its purpose. The dag.nu DOT exporter crashes on this case (`$entry.depends_on.0` index-out-of-bounds when depends_on is empty).",
},
# NATS emission must be feature-gated.
# NOTE(review): the grep only shows that `emit_dag_event` appears in workflow.rs; it
# does not by itself verify the `#[cfg(feature = "nats")]` gating that the claim requires.
{
id = "nats-emission-behind-feature-flag",
claim = "All NATS event emission in workflow.rs must be behind `#[cfg(feature = \"nats\")]` — the orchestrator must compile and function without NATS",
scope = "platform/crates/orchestrator/src/workflow.rs, platform/crates/orchestrator/Cargo.toml",
severity = 'Hard,
check = { tag = 'Grep, pattern = "emit_dag_event", paths = ["platform/crates/orchestrator/src/workflow.rs"], must_be_empty = false },
rationale = "NATS is an optional external dependency. The orchestrator must run in solo mode (no NATS) and in test environments without a running NATS server. Unconditional NATS calls would make all workflow tests depend on NATS availability.",
},
],
# Ontology review of the decision: a one-line summary of the decision, the named
# invariants it could affect, and the resulting verdict.
ontology_check = {
decision_string = "Workspace composition DAG via dag.ncl with formula_id::task_name namespacing, health gates as synthetic tasks, and NATS event emission on provisioning.dag.* subjects",
invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
verdict = 'Safe,
},
# Identifiers of other ADRs related to this one (the Formula DAG, extension
# capability declarations, and the NATS event broker, going by their ids).
related_adrs = ["adr-016-workspace-formula-dag", "adr-020-extension-capability-declarations", "adr-012-nats-event-broker"],
}