# reflection/modes/ops-radicle-build-info.ncl
#
# Info mode: structured architectural knowledge for the ops contract + Radicle
# governance + ephemeral build infrastructure implemented in ADRs 037/038/039.
#
# Run by an agent to answer questions like:
#   "What planes exist and what components run in each?"
#   "What is the data flow for an ops command?"
#   "What Radicle repos does a workspace require?"
#   "How does the build pipeline work?"
#   "What playbooks are available and what do they do?"
#   "Which reflection modes validate this subsystem?"
#
# Agent entry point:
#   ONTOREF_ACTOR=agent ontoref run start ops-radicle-build-info --task "describe architecture"
#   # then per step: ontoref step report ops-radicle-build-info --status pass
#
# Direct execution from provisioning root:
#   nu reflection/handlers/ops-radicle-build-info.nu --provisioning-root .
#
# Structure of this record:
#   - `preconditions` / `postconditions` are human-readable assertions.
#   - Every step is tagged `actor = 'Agent` and uses `on_error = 'Continue`,
#     so a failing step does not abort the remaining ones.
#   - Most steps `echo` a static JSON document; `artifact_paths` and
#     `available_playbooks` instead query the working tree via nickel + jq.
#
# NOTE(review): several emitted strings look like they lost an angle-bracket
# placeholder somewhere upstream (e.g. "OPS_PENDING_ WorkQueue", "OPS_CMD_:",
# "policy-", "-desired", "-state", "DELETE /api/v1/vm-pool/,",
# "reflection/handlers/.nu -- "). They are kept verbatim here; confirm the
# intended tokens against the ADRs before relying on them.
#
# NOTE(review): preconditions name paths under `provisioning/`, while the
# `artifact_paths` cmd reads `.ontology/core.ncl` relative to the current
# directory. Presumably preconditions are evaluated from the repo root and
# cmds from the provisioning root; confirm.
{
  id = "ops-radicle-build-info",
  trigger = "Describe the complete ops + Radicle governance + build infrastructure architecture (ADRs 037/038/039)",
  strategy = 'Override,

  preconditions = [
    "nickel is available in PATH",
    "jq is available in PATH",
    "provisioning/.ontology/core.ncl is readable",
    "provisioning/adrs/adr-037-ops-contract-dual-mode.ncl is readable",
    "provisioning/adrs/adr-038-radicle-decentralized-governance.ncl is readable",
    "provisioning/adrs/adr-039-build-infrastructure-ephemeral.ncl is readable",
  ],

  steps = [
    # ── Plane topology ────────────────────────────────────────────────────────
    # Static JSON: the three planes (workload, ci-orchestration, signing),
    # the workspace backing each one, and the components listed for it.
    {
      id = "plane_topology",
      action = "describe_three_planes",
      actor = 'Agent,
      cmd = "echo '{\"planes\": [{\"name\": \"workload\", \"workspace\": \"libre-wuji\", \"role\": \"production runtime, runtime-autonomous\", \"components\": [\"k8s\", \"zot-s3\", \"ops-controller\", \"audit-mirror\", \"radicle-seed\", \"observability-stack\"]}, {\"name\": \"ci-orchestration\", \"workspace\": \"libre-daoshi\", \"role\": \"build + VCS + CI dispatch, replaceable\", \"components\": [\"k0s\", \"forgejo\", \"woodpecker\", \"postgresql\", \"radicle-seed\", \"buildkit-launcher\"]}, {\"name\": \"signing\", \"workspace\": \"ops-vm\", \"role\": \"auto-sign (keeper) or manual-sign, replaceable\", \"components\": [\"keeper-daemon\", \"radicle-seed\"]}]}'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Three-plane separation: workload plane is runtime-autonomous (continues when signing or CI planes are down). ops-vm is replaceable (its only persistent state is the keeper signing key, backed up and rotatable via the rotate_keys playbook). libre-daoshi is replaceable (all source is in Radicle; it can be re-provisioned without data loss). Both ADR-037 and ADR-038.",
    },

    # ── Ops command data flow ─────────────────────────────────────────────────
    # Static JSON: the four-step path of an ops command plus its invariants.
    # NOTE(review): the first invariant attributes exactly-once delivery to
    # AckPolicy=explicit. Explicit acks alone are normally described as
    # at-least-once, with the expected_state_version idempotency check (second
    # invariant) covering redelivery; confirm wording against JetStream docs.
    {
      id = "ops_command_flow",
      action = "describe_ops_data_flow",
      actor = 'Agent,
      cmd = "echo '{\"flow\": [{\"step\": 1, \"actor\": \"operator or keeper-daemon\", \"action\": \"sign JWT op command with Ed25519 key, POST to ops.pending.\"}, {\"step\": 2, \"actor\": \"keeper-daemon (auto) or keeper-cli (manual)\", \"action\": \"subscribe OPS_PENDING_ WorkQueue, validate JWT claims+scope+sequence, ack, publish to ops.cmd.\"}, {\"step\": 3, \"actor\": \"ops-controller\", \"action\": \"WorkQueue consumer of OPS_CMD_: persist to SurrealDB BEFORE ack, apply via orchestrator API, write result to ops.audit.\"}, {\"step\": 4, \"actor\": \"audit-mirror\", \"action\": \"subscribe ops.audit., batch commits to -state Radicle repo with jti idempotency\"}], \"invariants\": [\"WorkQueue semantics: exactly-once delivery guaranteed by JetStream AckPolicy=explicit\", \"Idempotency: expected_state_version prevents double-apply across restarts\", \"Audit trail: every command lands in Radicle ledger — tamper-evident, gossip-replicated\"]}'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Four-step command flow. Step 3 is the SOLID boundary: ops-controller touches only orchestrator API and NATS — never provider APIs or secrets directly. Step 4 is async best-effort (audit-mirror on_error=Continue); the ops command is not blocked by audit replication.",
    },

    # ── Mode switch (auto/manual/hybrid) ──────────────────────────────────────
    # Static JSON: the three keeper signing modes and how to switch between
    # them. The playbook annotations follow the `available_playbooks` note:
    # switch_to_vm_ops enables keeper-daemon auto-sign (manual→auto) and
    # switch_to_operator_only disables it (auto→manual).
    {
      id = "signing_mode_switch",
      action = "describe_keeper_modes",
      actor = 'Agent,
      cmd = "echo '{\"modes\": [{\"name\": \"auto\", \"keeper_daemon\": \"running\", \"human_approval\": false, \"use_when\": \"routine deployments in business hours\"}, {\"name\": \"manual\", \"keeper_daemon\": \"stopped\", \"human_approval\": true, \"use_when\": \"destructive ops or off-hours\"}, {\"name\": \"hybrid\", \"keeper_daemon\": \"running with narrow auto_sign policy\", \"human_approval\": \"only for ops not matching auto_sign\", \"use_when\": \"gradual trust ramp-up\"}], \"switch_mechanism\": \"operational — start/stop keeper-daemon + edit keeper_policy.ncl; no code change, no architectural change\", \"playbooks\": [\"switch_to_vm_ops (manual→auto)\", \"switch_to_operator_only (auto→manual)\"]}'",
      depends_on = [
        { step = "ops_command_flow", kind = 'Always },
      ],
      on_error = { strategy = 'Continue },
      note = "Mode switching is operational, not architectural — there are no mode constants in code. The keeper_policy.ncl schema is declarative-only Nickel (ADR-038 constraint: policy-files-are-declarative-only). Any Nickel function definition in a policy file is a hard schema violation.",
    },

    # ── Radicle repo structure ────────────────────────────────────────────────
    # Static JSON: the three Radicle repos per workspace (policy, desired,
    # state), their delegates, and who may write to each.
    {
      id = "radicle_repo_structure",
      action = "describe_radicle_repos_per_workspace",
      actor = 'Agent,
      cmd = "echo '{\"repos_per_workspace\": [{\"name\": \"policy-\", \"purpose\": \"keeper auto-sign policy + M-of-N operator delegation\", \"delegates\": \"M-of-N operators (M>=2 for production)\", \"writeable_by\": \"any delegate via rad patch create + M approvals\"}, {\"name\": \"-desired\", \"purpose\": \"desired infrastructure state, CI-consumable\", \"delegates\": \"M-of-N operators + CI JWT keys\", \"writeable_by\": \"operators (GitOps) or Woodpecker (automated PR merge)\"}, {\"name\": \"-state\", \"purpose\": \"authoritative audit ledger of applied ops\", \"delegates\": \"exactly one: ops-controller signing key\", \"writeable_by\": \"ops-controller only (via audit-mirror commits)\"}], \"jti_idempotency\": \"audit-mirror checks commit message for jti field before pushing — prevents duplicate audit entries on retry\"}'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Three-repo split is the core ADR-038 invariant. The state repo having exactly ONE delegate (ops-controller) is what makes the audit trail tamper-evident — no human can push to it. The M-of-N quorum on policy/desired repos prevents unilateral operator action. Verified by validate-radicle-governance reflection mode.",
    },

    # ── Build pipeline ────────────────────────────────────────────────────────
    # Static JSON: the six-step build flow, the OOM retry rule, and the hcloud
    # tier table.
    {
      id = "build_pipeline",
      action = "describe_build_flow",
      actor = 'Agent,
      cmd = "echo '{\"pipeline\": [{\"step\": 1, \"actor\": \"Woodpecker CI\", \"action\": \"invoke buildkit-launcher as plugin step\"}, {\"step\": 2, \"actor\": \"buildkit-launcher\", \"action\": \"resolve size (3-tier: .build-spec.ncl > p95*1.2 > language-defaults), call POST /api/v1/vm-pool/spawn\"}, {\"step\": 3, \"actor\": \"orchestrator vm_pool\", \"action\": \"create hcloud server from golden image snapshot, wait for SSH, return SpawnResponse with lease_id+host+port\"}, {\"step\": 4, \"actor\": \"buildkit-launcher\", \"action\": \"rsync build context to runner via SSH, run buildctl on runner via SSH\"}, {\"step\": 5, \"actor\": \"buildctl on runner\", \"action\": \"build Dockerfile, push image to zot, export cache to zot /cache\"}, {\"step\": 6, \"actor\": \"buildkit-launcher\", \"action\": \"DELETE /api/v1/vm-pool/, POST /api/v1/vm-pool/metrics\"}], \"oom_retry\": \"exit 137 → retry once at next hcloud tier (cx22→cx32→cx42→cx52)\", \"hcloud_tiers\": {\"cx22\": \"2cpu/4GB\", \"cx32\": \"4cpu/8GB\", \"cx42\": \"8cpu/16GB\", \"cx52\": \"16cpu/32GB\"}}'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Runners are always destroyed — buildkit-launcher calls DELETE even on build failure (destroy is in a tokio::spawn so it doesn't block error propagation). The golden image is rebuilt weekly by a Woodpecker pipeline; reflect mode validate-build-infrastructure checks image age < 14 days.",
    },

    # ── zot multi-tenant layout ───────────────────────────────────────────────
    # Static JSON: where zot lives, its storage backend, the registry
    # namespaces, and the auth scoping.
    {
      id = "zot_layout",
      action = "describe_zot_multi_tenant_layout",
      actor = 'Agent,
      cmd = "echo '{\"zot_location\": \"libre-wuji (not libre-daoshi — relocated by ADR-039)\", \"backend\": \"S3-compatible with versioning and cross-region replication\", \"namespaces\": [{\"path\": \"/images\", \"purpose\": \"final built images pushed by buildkit-launcher\"}, {\"path\": \"/cache\", \"purpose\": \"BuildKit registry-cache (warm cache across runner spawns)\"}, {\"path\": \"/sccache\", \"purpose\": \"sccache S3 backend for Rust/C++ compiler caches\"}, {\"path\": \"/crates\", \"purpose\": \"private crates.io mirror for Rust dependencies\"}, {\"path\": \"/golden\", \"purpose\": \"golden image artifacts (tar/OCI layers) pre-push to hcloud snapshot\"}], \"auth\": \"JWT-scoped per namespace — Woodpecker token can push /images and /cache only\"}'",
      depends_on = [
        { step = "build_pipeline", kind = 'Always },
      ],
      on_error = { strategy = 'Continue },
      note = "zot relocation to libre-wuji is an ADR-039 invariant. libre-daoshi is replaceable, so its zot was removed — all caching and registry state is now in the workload plane which has S3-backed persistence and DR replication.",
    },

    # ── Live artifact_paths from ontology ─────────────────────────────────────
    # Exports .ontology/core.ncl to JSON and keeps id, description and
    # artifact_paths for the three ontology nodes named in the jq filter.
    # NOTE(review): nickel's stderr is discarded and the pipeline's exit status
    # is normally that of jq, so a nickel failure may show up as empty output
    # rather than a failed step; verify how the runner detects failure.
    {
      id = "artifact_paths",
      action = "list_all_implementation_artifacts",
      actor = 'Agent,
      cmd = "nickel export --format json --import-path . .ontology/core.ncl 2>/dev/null | jq '[.nodes[] | select(.id == \"ops-contract-dual-mode\" or .id == \"decentralized-governance-radicle\" or .id == \"ephemeral-build-infrastructure\") | {id: .id, description: .description, artifact_paths: .artifact_paths}]'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Live read from .ontology/core.ncl — artifact_paths lists every file that implements a given node. Use these paths to navigate to the implementation. If this step fails (nickel not in PATH), fall back to the static list: adr-037→ops-keeper/+ops-controller/+ops_contract.ncl; adr-038→audit-mirror/+radicle.ncl+keeper_policy.ncl; adr-039→buildkit-launcher/+buildkit_runner.ncl+build_spec.ncl.",
    },

    # ── Available playbooks ───────────────────────────────────────────────────
    # Lists each directory directly under catalog/playbooks and emits a JSON
    # array of {playbook, description}. The description is read from that
    # directory's playbook.ncl (.description, then .trigger).
    # Each object is built with `jq -n --arg` so that quotes, backslashes or
    # newlines in a description are escaped instead of corrupting the JSON fed
    # to the final `jq -s`. If playbook.ncl is missing or fails to export, the
    # description falls back to "(no description)".
    {
      id = "available_playbooks",
      action = "catalog_ops_playbooks",
      actor = 'Agent,
      cmd = "find catalog/playbooks -maxdepth 1 -mindepth 1 -type d | sort | while IFS= read -r d; do name=$(basename \"$d\"); desc=$(nickel export --format json \"$d/playbook.ncl\" 2>/dev/null | jq -r '.description // .trigger // \"(no description)\"'); jq -n --arg playbook \"$name\" --arg description \"${desc:-(no description)}\" '{playbook: $playbook, description: $description}'; done | jq -s '.'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Key playbooks: bootstrap_initial (one-time full system bring-up), switch_to_vm_ops (enable keeper-daemon auto-sign), switch_to_operator_only (disable auto-sign), rotate_keys (rotate any signing key via Radicle delegation), onboard_operator/offboard_operator (NATS cred + Radicle delegation), dr_wuji_lost/dr_daoshi_lost (disaster recovery).",
    },

    # ── Validation modes ──────────────────────────────────────────────────────
    # Static JSON: the six validate-* reflection modes and what each checks,
    # plus the handler path convention.
    {
      id = "validation_modes",
      action = "list_available_reflection_modes",
      actor = 'Agent,
      cmd = "echo '{\"modes\": [{\"id\": \"validate-ops-contract\", \"validates\": \"NATS streams exist with WorkQueue retention, single ops-controller consumer\"}, {\"id\": \"validate-radicle-governance\", \"validates\": \"three repos per workspace, correct delegation profiles, signed commits\"}, {\"id\": \"validate-keeper-policy\", \"validates\": \"policy files are declarative-only Nickel, schema conformance\"}, {\"id\": \"validate-build-infrastructure\", \"validates\": \"golden image freshness, zot reachable, buildkit_runner component registered\"}, {\"id\": \"validate-observability\", \"validates\": \"Vector+Loki+Grafana healthy, Prometheus scraping ops-controller metrics\"}, {\"id\": \"validate-playbooks\", \"validates\": \"playbook.ncl schema, script existence, nu ide-check, dry-run tests\"}], \"handlers_path\": \"reflection/handlers/\", \"handler_convention\": \"nu reflection/handlers/.nu -- \"}'",
      depends_on = [],
      on_error = { strategy = 'Continue },
      note = "Six validation modes cover all three ADRs. Run validate-ops-contract and validate-radicle-governance daily as health checks. Run validate-build-infrastructure weekly (checks golden image age). Run validate-playbooks before any playbook execution window.",
    },

    # ── Edge cases and known constraints ──────────────────────────────────────
    # Static JSON: the five hard constraints. Declared after the three steps
    # whose subject matter the constraints summarise.
    {
      id = "constraints_and_edge_cases",
      action = "describe_key_invariants",
      actor = 'Agent,
      cmd = "echo '{\"hard_constraints\": [{\"id\": \"policy-files-are-declarative-only\", \"rule\": \"keeper_policy.ncl files must not contain Nickel function definitions (fun keyword). Schema: PolicyDef in schemas/lib/keeper_policy.ncl.\"}, {\"id\": \"ops-controller-single-workqueue-consumer\", \"rule\": \"exactly one ops-controller consumer per workspace OPS_CMD stream. Multiple consumers cause ops to be split across instances — undefined behavior.\"}, {\"id\": \"state-repo-single-delegate\", \"rule\": \"-state Radicle repo has exactly one delegate (ops-controller key). Any additional delegate breaks audit tamper-evidence.\"}, {\"id\": \"oom-retry-bounded\", \"rule\": \"buildkit-launcher retries OOM (exit 137) at most once, at the next hcloud tier. No further retries — surface as build failure.\"}, {\"id\": \"runner-always-destroyed\", \"rule\": \"vm_pool lease is destroyed by buildkit-launcher regardless of build result. Never leak runners.\"}]}'",
      depends_on = [
        { step = "ops_command_flow", kind = 'Always },
        { step = "radicle_repo_structure", kind = 'Always },
        { step = "build_pipeline", kind = 'Always },
      ],
      on_error = { strategy = 'Continue },
      note = "These five constraints are the invariants that, if violated, cause silent data loss or audit gaps. ops-controller-single-workqueue-consumer and state-repo-single-delegate are verified by validate-ops-contract and validate-radicle-governance respectively. The others are enforced at code level (keeper_policy schema fail-fast, retry.rs bounded logic, always-destroy in main.rs).",
    },
  ],

  postconditions = [
    "Agent has a complete picture of the three-plane topology",
    "Agent understands the ops command data flow end-to-end",
    "Agent can navigate to any implementation artifact via artifact_paths",
    "Agent knows which playbook to invoke for common operational scenarios",
    "Agent knows which validation mode to run for each subsystem",
  ],
}