provisioning/reflection/modes/ops-radicle-build-info.ncl

# reflection/modes/ops-radicle-build-info.ncl
#
# Info mode: structured architectural knowledge for the ops contract + Radicle
# governance + ephemeral build infrastructure implemented in ADRs 037/038/039.
#
# Run by an agent to answer questions like:
#   "What planes exist and what components run in each?"
#   "What is the data flow for an ops command?"
#   "What Radicle repos does a workspace require?"
#   "How does the build pipeline work?"
#   "What playbooks are available and what do they do?"
#   "Which reflection modes validate this subsystem?"
#
# Agent entry point:
#   ONTOREF_ACTOR=agent ontoref run start ops-radicle-build-info --task "describe architecture"
#   # then per step: ontoref step report ops-radicle-build-info <step-id> --status pass
#
# Direct execution from provisioning root:
#   nu reflection/handlers/ops-radicle-build-info.nu --provisioning-root .

{
  # Identity of this reflection mode and the request text that selects it.
  id = "ops-radicle-build-info",
  trigger = "Describe the complete ops + Radicle governance + build infrastructure architecture (ADRs 037/038/039)",
  # NOTE(review): the meaning of 'Override is defined by the mode schema, which
  # is not visible in this file — confirm before changing.
  strategy = 'Override,

  # Tools and files the steps below rely on: `nickel` and `jq` are invoked by
  # the live steps; core.ncl is read by the artifact_paths step; the three ADR
  # files are the sources this mode summarises.
  preconditions = [
    "nickel is available in PATH",
    "jq is available in PATH",
    "provisioning/.ontology/core.ncl is readable",
    "provisioning/adrs/adr-037-ops-contract-dual-mode.ncl is readable",
    "provisioning/adrs/adr-038-radicle-decentralized-governance.ncl is readable",
    "provisioning/adrs/adr-039-build-infrastructure-ephemeral.ncl is readable",
  ],

  steps = [

    # ── Plane topology ────────────────────────────────────────────────────────

    {
      # Static step: `cmd` only echoes a JSON literal. The document has a single
      # `planes` array with three entries (workload, ci-orchestration, signing),
      # each carrying `name`, `workspace`, `role` and `components`.
      id = "plane_topology",
      actor = 'Agent,
      action = "describe_three_planes",
      # No prerequisites; a failure here does not stop the remaining steps.
      depends_on = [],
      on_error = { strategy = 'Continue },
      cmd = "echo '{\"planes\": [{\"name\": \"workload\", \"workspace\": \"libre-wuji\", \"role\": \"production runtime, runtime-autonomous\", \"components\": [\"k8s\", \"zot-s3\", \"ops-controller\", \"audit-mirror\", \"radicle-seed\", \"observability-stack\"]}, {\"name\": \"ci-orchestration\", \"workspace\": \"libre-daoshi\", \"role\": \"build + VCS + CI dispatch, replaceable\", \"components\": [\"k0s\", \"forgejo\", \"woodpecker\", \"postgresql\", \"radicle-seed\", \"buildkit-launcher\"]}, {\"name\": \"signing\", \"workspace\": \"ops-vm\", \"role\": \"auto-sign (keeper) or manual-sign, replaceable\", \"components\": [\"keeper-daemon\", \"radicle-seed\"]}]}'",
      note = "Three-plane separation: workload plane is runtime-autonomous (continues when signing or CI planes are down). ops-vm is replaceable (its only persistent state is the keeper signing key, backed up and rotatable via the rotate_keys playbook). libre-daoshi is replaceable (all source is in Radicle; it can be re-provisioned without data loss). Both ADR-037 and ADR-038.",
    },

    # ── Ops command data flow ─────────────────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal with two keys:
      #   flow        four ordered entries (step, actor, action) tracing a command
      #               from signing through keeper, ops-controller and audit-mirror
      #   invariants  three statements about delivery, idempotency and audit
      #
      # The delivery invariant states at-least-once, not exactly-once: explicit
      # acknowledgement means an unacked message is redelivered, which is why
      # step 3 persists before acking and why the idempotency invariant exists.
      # The JSON is wrapped in single quotes for the shell, so no text inside it
      # may contain a single quote.
      id         = "ops_command_flow",
      action     = "describe_ops_data_flow",
      actor      = 'Agent,
      cmd        = "echo '{\"flow\": [{\"step\": 1, \"actor\": \"operator or keeper-daemon\", \"action\": \"sign JWT op command with Ed25519 key, POST to ops.pending.<workspace>\"}, {\"step\": 2, \"actor\": \"keeper-daemon (auto) or keeper-cli (manual)\", \"action\": \"subscribe OPS_PENDING_<workspace> WorkQueue, validate JWT claims+scope+sequence, ack, publish to ops.cmd.<workspace>\"}, {\"step\": 3, \"actor\": \"ops-controller\", \"action\": \"WorkQueue consumer of OPS_CMD_<workspace>: persist to SurrealDB BEFORE ack, apply via orchestrator API, write result to ops.audit.<workspace>\"}, {\"step\": 4, \"actor\": \"audit-mirror\", \"action\": \"subscribe ops.audit.<workspace>, batch commits to <workspace>-state Radicle repo with jti idempotency\"}], \"invariants\": [\"WorkQueue semantics: at-least-once delivery (JetStream AckPolicy=explicit redelivers any unacked message); duplicates are neutralised by the idempotency invariant\", \"Idempotency: expected_state_version prevents double-apply across restarts\", \"Audit trail: every command lands in Radicle ledger — tamper-evident, gossip-replicated\"]}'",
      depends_on = [],
      # A failure of this step does not abort the remaining steps.
      on_error   = { strategy = 'Continue },
      note       = "Four-step command flow. Step 3 is the SOLID boundary: ops-controller touches only orchestrator API and NATS — never provider APIs or secrets directly. Step 4 is async best-effort (audit-mirror on_error=Continue); the ops command is not blocked by audit replication.",
    },

    # ── Mode switch (auto/manual/hybrid) ─────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal describing the three keeper
      # signing modes (auto, manual, hybrid), the switch mechanism, and the
      # playbooks that perform a switch.
      #
      # Playbook direction labels follow the playbook catalogue note in this
      # file: switch_to_vm_ops enables keeper-daemon auto-sign (manual→auto) and
      # switch_to_operator_only disables it (auto→manual).
      id         = "signing_mode_switch",
      action     = "describe_keeper_modes",
      actor      = 'Agent,
      cmd        = "echo '{\"modes\": [{\"name\": \"auto\", \"keeper_daemon\": \"running\", \"human_approval\": false, \"use_when\": \"routine deployments in business hours\"}, {\"name\": \"manual\", \"keeper_daemon\": \"stopped\", \"human_approval\": true, \"use_when\": \"destructive ops or off-hours\"}, {\"name\": \"hybrid\", \"keeper_daemon\": \"running with narrow auto_sign policy\", \"human_approval\": \"only for ops not matching auto_sign\", \"use_when\": \"gradual trust ramp-up\"}], \"switch_mechanism\": \"operational — start/stop keeper-daemon + edit keeper_policy.ncl; no code change, no architectural change\", \"playbooks\": [\"switch_to_vm_ops (manual→auto)\", \"switch_to_operator_only (auto→manual)\"]}'",
      # Ordered after the command-flow description.
      # NOTE(review): 'Always presumably means "run regardless of the
      # dependency's outcome" — confirm against the mode schema.
      depends_on = [
        { step = "ops_command_flow", kind = 'Always },
      ],
      on_error   = { strategy = 'Continue },
      note       = "Mode switching is operational, not architectural — there are no mode constants in code. The keeper_policy.ncl schema is declarative-only Nickel (ADR-038 constraint: policy-files-are-declarative-only). Any Nickel function definition in a policy file is a hard schema violation.",
    },

    # ── Radicle repo structure ─────────────────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal. `repos_per_workspace` lists the
      # three repositories (policy-<workspace>, <workspace>-desired,
      # <workspace>-state) with purpose, delegates and writers; `jti_idempotency`
      # describes how audit-mirror avoids duplicate audit entries.
      id = "radicle_repo_structure",
      actor = 'Agent,
      action = "describe_radicle_repos_per_workspace",
      # Independent of every other step; errors do not halt the run.
      depends_on = [],
      on_error = { strategy = 'Continue },
      cmd = "echo '{\"repos_per_workspace\": [{\"name\": \"policy-<workspace>\", \"purpose\": \"keeper auto-sign policy + M-of-N operator delegation\", \"delegates\": \"M-of-N operators (M>=2 for production)\", \"writeable_by\": \"any delegate via rad patch create + M approvals\"}, {\"name\": \"<workspace>-desired\", \"purpose\": \"desired infrastructure state, CI-consumable\", \"delegates\": \"M-of-N operators + CI JWT keys\", \"writeable_by\": \"operators (GitOps) or Woodpecker (automated PR merge)\"}, {\"name\": \"<workspace>-state\", \"purpose\": \"authoritative audit ledger of applied ops\", \"delegates\": \"exactly one: ops-controller signing key\", \"writeable_by\": \"ops-controller only (via audit-mirror commits)\"}], \"jti_idempotency\": \"audit-mirror checks commit message for jti field before pushing — prevents duplicate audit entries on retry\"}'",
      note = "Three-repo split is the core ADR-038 invariant. The state repo having exactly ONE delegate (ops-controller) is what makes the audit trail tamper-evident — no human can push to it. The M-of-N quorum on policy/desired repos prevents unilateral operator action. Verified by validate-radicle-governance reflection mode.",
    },

    # ── Build pipeline ─────────────────────────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal with three keys:
      #   pipeline      six ordered entries (step, actor, action), from the
      #                 Woodpecker plugin invocation to lease deletion + metrics
      #   oom_retry     the retry rule for exit code 137
      #   hcloud_tiers  cpu/memory for cx22, cx32, cx42 and cx52
      id = "build_pipeline",
      actor = 'Agent,
      action = "describe_build_flow",
      # No prerequisites; a failure here does not stop the remaining steps.
      depends_on = [],
      on_error = { strategy = 'Continue },
      cmd = "echo '{\"pipeline\": [{\"step\": 1, \"actor\": \"Woodpecker CI\", \"action\": \"invoke buildkit-launcher as plugin step\"}, {\"step\": 2, \"actor\": \"buildkit-launcher\", \"action\": \"resolve size (3-tier: .build-spec.ncl > p95*1.2 > language-defaults), call POST /api/v1/vm-pool/spawn\"}, {\"step\": 3, \"actor\": \"orchestrator vm_pool\", \"action\": \"create hcloud server from golden image snapshot, wait for SSH, return SpawnResponse with lease_id+host+port\"}, {\"step\": 4, \"actor\": \"buildkit-launcher\", \"action\": \"rsync build context to runner via SSH, run buildctl on runner via SSH\"}, {\"step\": 5, \"actor\": \"buildctl on runner\", \"action\": \"build Dockerfile, push image to zot, export cache to zot /cache\"}, {\"step\": 6, \"actor\": \"buildkit-launcher\", \"action\": \"DELETE /api/v1/vm-pool/<lease_id>, POST /api/v1/vm-pool/metrics\"}], \"oom_retry\": \"exit 137 → retry once at next hcloud tier (cx22→cx32→cx42→cx52)\", \"hcloud_tiers\": {\"cx22\": \"2cpu/4GB\", \"cx32\": \"4cpu/8GB\", \"cx42\": \"8cpu/16GB\", \"cx52\": \"16cpu/32GB\"}}'",
      note = "Runners are always destroyed — buildkit-launcher calls DELETE even on build failure (destroy is in a tokio::spawn so it doesn't block error propagation). The golden image is rebuilt weekly by a Woodpecker pipeline; reflect mode validate-build-infrastructure checks image age < 14 days.",
    },

    # ── zot multi-tenant layout ────────────────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal giving the registry location,
      # its storage backend, the five namespaces (/images, /cache, /sccache,
      # /crates, /golden) with their purpose, and the per-namespace auth rule.
      id = "zot_layout",
      actor = 'Agent,
      action = "describe_zot_multi_tenant_layout",
      # Ordered after the build pipeline description.
      # NOTE(review): 'Always presumably means "run regardless of the
      # dependency's outcome" — confirm against the mode schema.
      depends_on = [
        { step = "build_pipeline", kind = 'Always },
      ],
      on_error = { strategy = 'Continue },
      cmd = "echo '{\"zot_location\": \"libre-wuji (not libre-daoshi — relocated by ADR-039)\", \"backend\": \"S3-compatible with versioning and cross-region replication\", \"namespaces\": [{\"path\": \"/images\", \"purpose\": \"final built images pushed by buildkit-launcher\"}, {\"path\": \"/cache\", \"purpose\": \"BuildKit registry-cache (warm cache across runner spawns)\"}, {\"path\": \"/sccache\", \"purpose\": \"sccache S3 backend for Rust/C++ compiler caches\"}, {\"path\": \"/crates\", \"purpose\": \"private crates.io mirror for Rust dependencies\"}, {\"path\": \"/golden\", \"purpose\": \"golden image artifacts (tar/OCI layers) pre-push to hcloud snapshot\"}], \"auth\": \"JWT-scoped per namespace — Woodpecker token can push /images and /cache only\"}'",
      note = "zot relocation to libre-wuji is an ADR-039 invariant. libre-daoshi is replaceable, so its zot was removed — all caching and registry state is now in the workload plane which has S3-backed persistence and DR replication.",
    },

    # ── Live artifact_paths from ontology ─────────────────────────────────────

    {
      # Live step: exports .ontology/core.ncl to JSON with nickel, then uses jq
      # to keep only the three nodes for this subsystem, projecting each to
      # {id, description, artifact_paths}. The path is relative, so the command
      # must run from the provisioning root.
      #
      # nickel's stderr is discarded, and a pipeline reports the status of its
      # last command, so without `-e` a failed export would feed jq empty input
      # and the step would succeed with no output. `jq -e` exits non-zero when no
      # result is produced, so the failure is visible and the fallback described
      # in `note` can be applied. A successful run always prints an array (even
      # an empty one), which `-e` treats as success.
      id         = "artifact_paths",
      action     = "list_all_implementation_artifacts",
      actor      = 'Agent,
      cmd        = "nickel export --format json --import-path . .ontology/core.ncl 2>/dev/null | jq -e '[.nodes[] | select(.id == \"ops-contract-dual-mode\" or .id == \"decentralized-governance-radicle\" or .id == \"ephemeral-build-infrastructure\") | {id: .id, description: .description, artifact_paths: .artifact_paths}]'",
      depends_on = [],
      # A failure of this step does not abort the remaining steps.
      on_error   = { strategy = 'Continue },
      note       = "Live read from .ontology/core.ncl — artifact_paths lists every file that implements a given node. Use these paths to navigate to the implementation. If this step fails (nickel not in PATH), fall back to the static list: adr-037→ops-keeper/+ops-controller/+ops_contract.ncl; adr-038→audit-mirror/+radicle.ncl+keeper_policy.ncl; adr-039→buildkit-launcher/+buildkit_runner.ncl+build_spec.ncl.",
    },

    # ── Available playbooks ────────────────────────────────────────────────────

    {
      # Live step: lists the immediate sub-directories of catalog/playbooks in
      # sorted order and, for each, exports its playbook.ncl and takes
      # `.description`, falling back to `.trigger`, then to "(no description)".
      # The per-playbook objects are collected into one JSON array by `jq -s`.
      #
      # Each object is built with `jq -n --arg` rather than string interpolation
      # so that quotes, backslashes or newlines in a description are escaped
      # correctly; one malformed object would otherwise make the final `jq -s`
      # reject the whole catalogue. `IFS= read -r` and the quoted "$d" keep
      # directory names with spaces or backslashes intact.
      #
      # If a playbook.ncl fails to export, its description is the empty string
      # (nickel's stderr is discarded and jq receives no input).
      id         = "available_playbooks",
      action     = "catalog_ops_playbooks",
      actor      = 'Agent,
      cmd        = "find catalog/playbooks -maxdepth 1 -mindepth 1 -type d | sort | while IFS= read -r d; do name=$(basename \"$d\"); desc=$(nickel export --format json \"$d/playbook.ncl\" 2>/dev/null | jq -r '.description // .trigger // \"(no description)\"'); jq -n --arg playbook \"$name\" --arg description \"$desc\" '{playbook: $playbook, description: $description}'; done | jq -s '.'",
      depends_on = [],
      # A failure of this step does not abort the remaining steps.
      on_error   = { strategy = 'Continue },
      note       = "Key playbooks: bootstrap_initial (one-time full system bring-up), switch_to_vm_ops (enable keeper-daemon auto-sign), switch_to_operator_only (disable auto-sign), rotate_keys (rotate any signing key via Radicle delegation), onboard_operator/offboard_operator (NATS cred + Radicle delegation), dr_wuji_lost/dr_daoshi_lost (disaster recovery).",
    },

    # ── Validation modes ──────────────────────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal. `modes` lists six validation
      # modes (id + what each validates); `handlers_path` and
      # `handler_convention` say where the handlers live and how to invoke one.
      id = "validation_modes",
      actor = 'Agent,
      action = "list_available_reflection_modes",
      # Independent of every other step; errors do not halt the run.
      depends_on = [],
      on_error = { strategy = 'Continue },
      cmd = "echo '{\"modes\": [{\"id\": \"validate-ops-contract\", \"validates\": \"NATS streams exist with WorkQueue retention, single ops-controller consumer\"}, {\"id\": \"validate-radicle-governance\", \"validates\": \"three repos per workspace, correct delegation profiles, signed commits\"}, {\"id\": \"validate-keeper-policy\", \"validates\": \"policy files are declarative-only Nickel, schema conformance\"}, {\"id\": \"validate-build-infrastructure\", \"validates\": \"golden image freshness, zot reachable, buildkit_runner component registered\"}, {\"id\": \"validate-observability\", \"validates\": \"Vector+Loki+Grafana healthy, Prometheus scraping ops-controller metrics\"}, {\"id\": \"validate-playbooks\", \"validates\": \"playbook.ncl schema, script existence, nu ide-check, dry-run tests\"}], \"handlers_path\": \"reflection/handlers/\", \"handler_convention\": \"nu reflection/handlers/<mode-id>.nu --<param> <value>\"}'",
      note = "Six validation modes cover all three ADRs. Run validate-ops-contract and validate-radicle-governance daily as health checks. Run validate-build-infrastructure weekly (checks golden image age). Run validate-playbooks before any playbook execution window.",
    },

    # ── Edge cases and known constraints ─────────────────────────────────────

    {
      # Static step: `cmd` echoes a JSON literal whose `hard_constraints` array
      # holds five entries (id + rule): declarative-only policy files, a single
      # ops-controller WorkQueue consumer, a single state-repo delegate, bounded
      # OOM retry, and unconditional runner destruction.
      id = "constraints_and_edge_cases",
      actor = 'Agent,
      action = "describe_key_invariants",
      # Ordered after the three descriptive steps whose subsystems these
      # constraints apply to.
      # NOTE(review): 'Always presumably means "run regardless of the
      # dependency's outcome" — confirm against the mode schema.
      depends_on = [
        { step = "ops_command_flow", kind = 'Always },
        { step = "radicle_repo_structure", kind = 'Always },
        { step = "build_pipeline", kind = 'Always },
      ],
      on_error = { strategy = 'Continue },
      cmd = "echo '{\"hard_constraints\": [{\"id\": \"policy-files-are-declarative-only\", \"rule\": \"keeper_policy.ncl files must not contain Nickel function definitions (fun keyword). Schema: PolicyDef in schemas/lib/keeper_policy.ncl.\"}, {\"id\": \"ops-controller-single-workqueue-consumer\", \"rule\": \"exactly one ops-controller consumer per workspace OPS_CMD stream. Multiple consumers cause ops to be split across instances — undefined behavior.\"}, {\"id\": \"state-repo-single-delegate\", \"rule\": \"<workspace>-state Radicle repo has exactly one delegate (ops-controller key). Any additional delegate breaks audit tamper-evidence.\"}, {\"id\": \"oom-retry-bounded\", \"rule\": \"buildkit-launcher retries OOM (exit 137) at most once, at the next hcloud tier. No further retries — surface as build failure.\"}, {\"id\": \"runner-always-destroyed\", \"rule\": \"vm_pool lease is destroyed by buildkit-launcher regardless of build result. Never leak runners.\"}]}'",
      note = "These five constraints are the invariants that, if violated, cause silent data loss or audit gaps. ops-controller-single-workqueue-consumer and state-repo-single-delegate are verified by validate-ops-contract and validate-radicle-governance respectively. The others are enforced at code level (keeper_policy schema fail-fast, retry.rs bounded logic, always-destroy in main.rs).",
    },

  ],

  # Outcomes the agent is expected to have reached once the steps have run.
  # These are descriptive statements; nothing in this file evaluates them.
  postconditions = [
    "Agent has a complete picture of the three-plane topology",
    "Agent understands the ops command data flow end-to-end",
    "Agent can navigate to any implementation artifact via artifact_paths",
    "Agent knows which playbook to invoke for common operational scenarios",
    "Agent knows which validation mode to run for each subsystem",
  ],
}