# provisioning/adrs/adr-039-build-infrastructure-ephemeral.ncl

let d = import "adr-defaults.ncl" in

d.make_adr {
  id     = "adr-039",
  title  = "Build infrastructure: golden-imaged ephemeral runners with dynamic sizing, S3-backed multi-tenant zot in workload cluster, and CI-orchestration separation",
  status = 'Accepted,
  date   = "2026-04-26",

  context = "The platform needs to compile workloads (Rust binaries like Vapora, the orchestrator crates, ops-keeper, ops-controller) and produce OCI images for runtime consumption. Three constraints shape the design: (1) the CI cluster (libre-daoshi class) is sized for orchestration and source-of-truth services (forgejo, woodpecker server, postgresql) — running CPU-heavy compiles inside its k0s cluster causes scheduling pressure on the orchestration services and forces the cluster to be sized for peak build load rather than steady-state orchestration; (2) the workload cluster (libre-wuji class) must remain CI-free per ADR-037 to preserve runtime autonomy; (3) image storage must survive the loss of any single cluster — keeping zot inside libre-wuji with local volume storage means losing wuji wipes the registry, and rebuilding the registry from external sources is a slow recovery path. The orchestrator (memory: platform/vm/ subsystem) already supports VM lifecycle (spawn, persistence, golden image cache, cleanup scheduler), making ephemeral builders feasible without new infrastructure. The remaining design decisions concern image storage durability, runner sizing, cache locality, and where the registry physically lives.",

  decision = "Adopt an ephemeral-builder + central-registry architecture with six parts. (1) BuildKit runs in ephemeral VMs spawned by the orchestrator on demand. Each VM is created from a periodically-rebuilt golden image (`buildkit-runner-golden:<date>`) pre-installed with buildkit (rootless), sccache, nushell, and an SSH server keyed for the orchestrator. Spawn time targets ~30s vs ~2min for cloud-init from a generic base image. The golden image itself is rebuilt weekly via a Woodpecker pipeline that runs in a current ephemeral runner — the chain is self-rebuilding after the initial bootstrap. (2) Runner sizing is dynamic per build, resolved in three tiers: explicit declaration in `.build-spec.ncl` at the repo root (BuildSpec contract: cpu, memory_gb, disk_gb, time_budget_min, cache_keys, oom_retry); historical p95 of CPU/RAM peaks for that repo from the orchestrator's SurrealDB build-metrics table, multiplied by 1.2; language defaults from the orchestrator (Cargo.toml → medium 4vCPU/8GB, package.json → small 2vCPU/4GB, etc.). Final size = max(declared, 1.2×p95_historical). OOM kill auto-retries once with one size up. Time budget enforced as VM-level kill. (3) zot lives in libre-wuji (relocated from libre-daoshi) configured with S3-compatible backend (Hetzner Object Storage, Backblaze B2, or compatible). The S3 bucket is the durable storage; zot pods are stateless and can be killed/respawned without data loss. Bucket configuration: versioning enabled (point-in-time recovery), lifecycle policy (90-day non-current version retention), optional cross-region replication to a second bucket on a different provider for catastrophic recovery. zot's auth model uses JWT integrated with the workspace's NATS account hierarchy — daoshi-ci principals have write to /images, /cache, /sccache, /crates; wuji workload pods have read on /images; operators have write on /crates; public read on /crates if the operator chooses to publish a Rust crate registry. 
(4) The buildkit-launcher binary (woodpecker plugin) bridges Woodpecker pipeline steps to the orchestrator: it requests a runner of the resolved size, waits for ready, ships build context via SSH, invokes buildctl with --import-cache and --export-cache pointing to zot.wuji.local, collects logs, requests destroy. The launcher carries no persistent state; orchestrator owns the lease. (5) Cache strategy uses zot as both layer cache (BuildKit registry-mode cache) and Rust object cache (sccache S3-backend pointed at zot's S3-compatible API). Cold runner with warm cache compiles at near-warm-runner speed because the network distance to zot is short and the cache is rich. (6) Coupling consequence: builds depend on wuji being reachable (zot lives there). When wuji is unreachable, builds can run cold-locally on the runner but cannot push results — operators acknowledge this trade-off; an optional pull-through cache mirror in libre-daoshi can be added later if the coupling produces measurable friction.",

  rationale = [
    {
      claim  = "Ephemeral runners + golden images give build farm bursting without fixed-cost capacity",
      detail = "A persistent build VM sized for the largest workload (Vapora) wastes CPU and RAM 95% of the time it sits idle. Per-build VMs scale to zero between builds — the only cost is the spawn time, which the golden image reduces to ~30s. The orchestrator already manages VM lifecycle for taskservs (memory: platform/vm/lifecycle.nu, vm_persistence.nu, cleanup_scheduler), so adding the buildkit_runner role is a component definition and a launcher binary, not a new subsystem.",
    },
    {
      claim  = "Three-tier dynamic sizing handles the spread between trivial CI tasks and Vapora-class compiles without overcommit",
      detail = "Static sizing variants (small/medium/large/xlarge) impose two failure modes: under-sized (OOM, slow), over-sized (wasted resources, slower spawn for unnecessarily large VMs). Reading `.build-spec.ncl` lets the repo declare its needs explicitly. P95 historical fallback handles repos that never declared a spec but have build history — most repos converge to a stable size. Language defaults handle the first build of a new repo. The 1.2× multiplier on historical p95 absorbs typical variance without exposing builds to OOM kill on a marginally larger build than usual.",
    },
    {
      claim  = "zot with S3 backend makes the registry stateless — DR is a property of S3, not zot",
      detail = "Self-managed durable storage for a registry (cluster volumes + replication + backup) is a recurring operational task. S3-class storage (any compatible provider) gives 11-nines durability natively and supports versioning and cross-region replication as configuration. Moving zot to that backend means the Kubernetes pod is replaceable on a moment's notice with no data migration — the bucket is the source of truth. The DR question reduces to: is the bucket reachable, and is its versioning intact? — both of which are provider responsibilities. Cross-provider replication (e.g., Hetzner primary + Backblaze secondary) addresses catastrophic provider loss.",
    },
    {
      # Rationale entry: why build caches are stored alongside images in the registry.
      # NOTE(review): the text below states that zot exposes an S3-compatible API that
      # sccache can target. Verify this before implementation — zot's documented client
      # interface appears to be the OCI Distribution API, with S3 used only as a storage
      # driver behind it. If that is the case, sccache's S3 backend would have to point at
      # the backing bucket directly, and the "/sccache" tenant in zot would not apply.
      claim  = "Cache lives in the registry because BuildKit and sccache both speak S3-compatible APIs to a shared registry",
      detail = "BuildKit supports `--export-cache type=registry` and `--import-cache type=registry`, writing layer cache as OCI artifacts to the same registry that holds final images. sccache supports S3 backend that can target zot's S3-compatible endpoint (zot exposes an S3 API for direct artifact upload). Both caches benefit from the same durability and replication as the images themselves. A new cold runner pulling cache from zot is essentially as fast as the cache is rich; running the cache locally on the VM gains nothing because the VM is destroyed at end of build.",
    },
    {
      claim  = "buildkit-launcher is thin to keep state in the orchestrator, not in Woodpecker",
      detail = "Putting orchestration logic (lease tracking, cleanup on failure, retry policy) in the launcher would duplicate logic the orchestrator already implements for VM-backed taskservs. The launcher is a wrapper: requests a runner, hands off to buildctl on the runner, collects results. If the launcher process dies mid-build, the orchestrator's cleanup scheduler reaps the orphaned VM. If the runner OOMs, the orchestrator retries with the next size. The launcher's only job is to bridge Woodpecker step semantics (env vars, exit code, log capture) to the orchestrator's leased-resource semantics.",
    },
  ],

  consequences = {
    positive = [
      "libre-daoshi cluster stays small and steady-state — orchestration services are not preempted by build CPU",
      "Build capacity is elastic without operator intervention — concurrent builds spawn concurrent VMs up to the orchestrator's configured pool limit",
      "Build cold-start with warm cache is near-warm — sccache hits at network speed from a same-provider VM",
      "Image registry DR is reduced to S3 bucket configuration — versioning, lifecycle, cross-region replication are all provider features",
      "zot multi-tenant layout (/images, /cache, /sccache, /crates) lets the same registry serve workload images, build cache, Rust crates, and OCI artifacts uniformly",
      "Golden image rebuild via the system itself (a runner builds the next runner image) means no permanent external build dependency once bootstrapped",
      "Sizing dynamism makes Vapora-class builds and trivial doc builds use appropriate resources without manual tuning per pipeline",
    ],
    negative = [
      "Builds depend on wuji being reachable for zot — wuji outage stops the publish step (mitigation: optional pull-through cache mirror in libre-daoshi if measured friction warrants)",
      "Initial bootstrap requires producing the first golden image off-platform (laptop or external CI) — documented in playbook, but a one-time manual step",
      "Per-build VM creation has spawn-cost floor (~30s with golden image) — hot-path one-second test runs are not the right shape for this model; small in-cluster runners may be added later if a workload demands sub-spawn-cost CI",
      "Orchestrator's VM pool limit becomes a build concurrency ceiling — needs sizing per workspace based on observed peak parallelism",
      "Runner OOM auto-retry doubles VM cost for that build — repeated retries for flaky builds inflate cloud costs; mitigation: max 1 retry, with explicit failure surfaced to the developer",
      "Cross-provider S3 replication has lag — the secondary bucket is eventually consistent with the primary, so a same-second push-and-pull from the secondary may miss; mitigation: cross-provider replication is for DR, not for normal reads",
    ],
  },

  alternatives_considered = [
    {
      option       = "Persistent build VMs with strong per-VM cache locality",
      why_rejected = "Sized for peak load, idle 95% of the time. Cache locality benefit is partial because cross-VM cache requires central storage anyway. Operational maintenance (patching, OS updates) on persistent VMs is recurring; ephemeral VMs from a periodically-refreshed golden image trade per-build spawn cost for zero ongoing maintenance.",
    },
    {
      option       = "BuildKit pods inside libre-daoshi cluster",
      why_rejected = "Couples build CPU to orchestration cluster — large builds cause scheduler pressure on forgejo, woodpecker server, postgresql. Sizing the cluster for peak builds wastes resources between builds. Out-of-cluster ephemeral VMs avoid this entirely with no architectural cost since the orchestrator already runs them for taskservs.",
    },
    {
      option       = "GitHub-hosted runners or other external CI for builds",
      why_rejected = "Reintroduces an external runtime dependency for the build step, contradicting the platform's autonomy goals. Also creates two CI surfaces (Woodpecker + GitHub Actions) operators must reason about. The orchestrator-spawned ephemeral runners give the same elasticity within the platform's own infrastructure.",
    },
    {
      option       = "zot in libre-daoshi cluster with local volumes",
      why_rejected = "Centralizes images on the wrong cluster — wuji should be the source of truth at runtime per ADR-037. Also single-cluster local-volume storage has no DR path that does not involve manual replication. S3 backend in wuji gives DR via provider features without manual replication.",
    },
    {
      option       = "Nix as the build system instead of BuildKit",
      why_rejected = "Nix delivers reproducible builds and a richer caching model, but the project is not Nix-native — workloads are built with cargo, npm, go, and language-native toolchains. Adopting Nix wholesale is a separate, larger decision. BuildKit accepts the existing Dockerfile/buildctl workflow most workloads already have. If a future workload demands bit-reproducible builds, Nix can run inside a BuildKit step without changing the surrounding architecture.",
    },
  ],

  constraints = [
    {
      # Hard constraint: the buildkit_runner component keeps all state on ephemeral disk.
      id        = "buildkit-runner-no-persistent-storage",
      claim     = "buildkit_runner component MUST NOT declare persistent volumes — all state lives on ephemeral disk and is destroyed with the VM",
      scope     = "catalog/components/buildkit_runner.ncl",
      severity  = 'Hard,
      # Grep check against the component definition.
      # NOTE(review): `must_be_empty = false` presumably requires at least one matching
      # line, i.e. an explicit `persistent … = … false` declaration — confirm against the
      # check runner. A match alone does not rule out a volume declared elsewhere in the file.
      check     = {
        tag           = 'Grep,
        pattern       = "persistent.*=.*false",
        paths         = ["provisioning/catalog/components/buildkit_runner.ncl"],
        must_be_empty = false,
      },
      rationale = "Persistent storage on ephemeral runners defeats the cost model and recreates the persistent-VM maintenance burden. Cache locality is provided by zot, not by persistent disks.",
    },
    {
      # Hard constraint: the zot registry in libre-wuji must use an S3 storage backend.
      id        = "zot-storage-must-be-s3",
      claim     = "zot component in libre-wuji MUST configure storage.backend = 's3' — local-volume storage is not permitted for the workload-cluster registry",
      scope     = "workspaces/libre-wuji/infra/libre-wuji/components/zot.ncl",
      severity  = 'Hard,
      # Grep check against the zot component file.
      # NOTE(review): the second alternative `backend.*s3` already matches every line the
      # first alternative matches, so the first is redundant. It also matches any line where
      # "s3" merely appears after "backend" (for example in a comment), so a match does not
      # prove the claim. Tighten the pattern once the exact syntax used in zot.ncl is known
      # (the claim writes 's3' with single quotes, the first alternative expects double quotes).
      # NOTE(review): this path has no `provisioning/` prefix, unlike the buildkit_runner
      # check path — confirm which root the check runner resolves paths from.
      check     = {
        tag         = 'Grep,
        pattern     = "backend = \"s3\"|backend.*s3",
        paths       = ["workspaces/libre-wuji/infra/libre-wuji/components/zot.ncl"],
        must_be_empty = false,
      },
      rationale = "Local-volume zot has no DR path consistent with the platform's resilience goals. The constraint forces the S3 backend choice at config-validation time.",
    },
    {
      # Hard constraint: an invalid .build-spec.ncl must fail the launcher, never fall back silently.
      id        = "build-spec-schema-versioned",
      claim     = ".build-spec.ncl files in repos MUST validate against schemas/lib/build_spec.ncl — invalid specs cause launcher to fail-fast with a parse error, not silently fall back",
      scope     = "schemas/lib/build_spec.ncl, platform/crates/buildkit-launcher/",
      severity  = 'Hard,
      # Grep check over the launcher sources for one of the schema-error identifiers.
      # NOTE(review): the presence of such an identifier shows that an error path exists;
      # it does not by itself show that the fallback path is absent.
      check     = {
        tag           = 'Grep,
        pattern       = "SchemaError|schema_error|schema_validation|validation_diff",
        paths         = ["provisioning/platform/crates/buildkit-launcher/src/"],
        must_be_empty = false,
      },
      rationale = "Silent fallback on invalid build-spec files masks misconfigurations until a build OOMs unexpectedly. Fail-fast surfaces the issue at the next pipeline run, when the developer can fix it.",
    },
    {
      # Hard constraint: an OOM-killed build is retried at most once.
      # NOTE(review): the rationale entry "buildkit-launcher is thin…" in this ADR says
      # retry policy belongs to the orchestrator ("the orchestrator retries with the next
      # size"), while this constraint scopes the retry bound to the launcher crate.
      # Confirm which component owns the retry and align scope/paths accordingly.
      id        = "oom-retry-bounded",
      claim     = "buildkit-launcher OOM retry MUST be bounded to one retry per build — repeated retries inflate cost and indicate misconfiguration that needs developer attention",
      scope     = "platform/crates/buildkit-launcher/src/retry.rs",
      severity  = 'Hard,
      # Grep check over the launcher crate for one of the retry-limit identifiers.
      # NOTE(review): a match shows that a named limit exists, not that its value is 1.
      check     = {
        tag           = 'Grep,
        pattern       = "max_oom_retries|MAX_OOM_RETRY|oom_retry_limit",
        # Rooted at `provisioning/` so it resolves from the same directory as the
        # schema-validation check on this crate
        # (provisioning/platform/crates/buildkit-launcher/src/).
        paths         = ["provisioning/platform/crates/buildkit-launcher/"],
        must_be_empty = false,
      },
      rationale = "Unbounded retries on flaky builds turn a $0.10 build into a $1+ build silently. The bound is policy: one retry covers transient sizing miss, repeat OOM means the developer should declare a larger spec.",
    },
    {
      # Soft constraint: the runner golden image is rebuilt at least weekly.
      id        = "golden-image-rebuild-cadence",
      claim     = "buildkit-runner-golden image MUST be rebuilt at least weekly — older golden images accumulate package vulnerabilities and toolchain drift",
      scope     = "Woodpecker pipeline definitions, orchestrator default-image config",
      severity  = 'Soft,
      # Grep check for the literal "golden-image-rebuild" under .woodpecker/.
      # NOTE(review): this shows that a pipeline with that name exists; the weekly
      # schedule itself is not verified by the pattern.
      check     = {
        tag           = 'Grep,
        pattern       = "golden-image-rebuild",
        paths         = [".woodpecker/"],
        must_be_empty = false,
      },
      rationale = "Stale golden images are a slow-moving security problem — toolchain CVEs accumulate. Weekly rebuild is generous but acceptable; faster cadence is fine but adds noise. Soft severity because the cadence is operational policy, not a structural invariant.",
    },
  ],

  ontology_check = {
    decision_string    = "Build infrastructure: golden-imaged ephemeral runners spawned by orchestrator + dynamic sizing (.build-spec.ncl + p95 historical + language defaults) + zot relocated to libre-wuji with S3 backend (versioning + cross-region replication) + multi-tenant zot layout (images/cache/sccache/crates) + buildkit-launcher as thin Woodpecker-to-orchestrator bridge + sccache and BuildKit cache both terminated at zot",
    invariants_at_risk = ["config-driven-always", "type-safety-nickel"],
    verdict            = 'Safe,
  },

  related_adrs = ["adr-037-ops-contract-dual-mode", "adr-038-radicle-decentralized-governance", "adr-021-workspace-composition-dag", "adr-033-cluster-component-extension-pattern"],
}