# ADR-035 — StorageConfig schema: provider-declared storage policies and the component requires.storage contract.
|
|
let d = import "adr-defaults.ncl" in
|
||
|
|
|
||
|
|
d.make_adr {
|
||
|
|
id = "adr-035",
|
||
|
|
title = "StorageConfig schema: provider-declared storage policies and component requires.storage contract",
|
||
|
|
status = 'Accepted,
|
||
|
|
date = "2026-04-24",
|
||
|
|
|
||
|
|
context = "Components declare storage needs as an untyped record: `requires.storage = { size = \"20Gi\", persistent = true }`. No contract validates that the declared size is within provider bounds, that the volume mode is compatible with the storage class, or that expansion is possible if the PVC must grow later. The postgresql deployment was provisioned with a 20Gi PVC on hcloud-volumes (minimum 10Gi, expand-only). Reducing it is impossible: Hetzner CSI only allows expansion. This class of error — requesting more storage than needed on a provider that cannot shrink volumes — has no static check and no runtime signal until the operator attempts a resize and finds it rejected. A separate problem: 'block', 'nfs', and 'object' volume semantics are not represented at all; a component could request NFS access mode on a block-only storage class without any validation. This ADR defines the StorageConfig schema to make these constraints machine-checkable.",
|
||
|
|
|
||
|
|
decision = "Introduce `schemas/lib/storage_config.ncl` with three exports: (1) `StorageRequires` — the contract for component `requires.storage` fields, adding `volume_mode` (block/nfs/object) and `access_mode` alongside the existing `size` and `persistent` fields; (2) `ProviderStoragePolicy` — the abstract contract for provider metadata declarations, specifying `min_size`, `max_size`, `expansion_policy` (static/expand_only/full), and `volume_modes`; (3) concrete provider policy values `HetznerCSIPolicy` and `DemocraticCSINFSPolicy` with the real constraints pre-filled. Storage class providers declare their policy in `capabilities.ncl` or `metadata.ncl` using `ProviderStoragePolicy`. Component storage requirements use `StorageRequires`. The preflight gate in `comp-build-cluster-bundle` is the enforcement point: it reads the storage class from component config, resolves the matching provider policy from capabilities, and fails if the requested size is below `min_size` or the requested `volume_mode` is not listed in the policy's `volume_modes`. No ADR-mandated change to the component CLI is required — preflight already has access to both component config and capabilities.",
|
||
|
|
|
||
|
|
rationale = [
|
||
|
|
{
|
||
|
|
claim = "Hetzner CSI volumes cannot shrink — min_size enforcement must happen at deploy time, not at resize time",
|
||
|
|
detail = "The CSI spec defines volume expansion (ControllerExpandVolume / NodeExpandVolume) but no operation for shrinking a volume. Hetzner's hcloud-volumes driver therefore only supports expansion. A PVC provisioned at 20Gi on hcloud-volumes cannot be reduced to 10Gi without deleting the PVC (and losing data) and reprovisioning. The `min_size = \"10Gi\"` field in HetznerCSIPolicy, combined with preflight validation, rejects requests below the provider minimum, and `expansion_policy = 'expand_only` marks any larger request as irreversible — so sizing mistakes surface before the PVC is created, where the correction is a config edit, not a data migration.",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
claim = "Volume mode (block/nfs/object) is not derivable from storage class name alone",
|
||
|
|
detail = "Storage class names like 'hcloud-volumes', 'democratic-csi-nfs', 'longhorn' carry no semantics: a reader cannot determine from the name whether the class provides RWO block storage, RWX NFS, or something else. The `volume_mode` field in `StorageRequires` and `volume_modes` in `ProviderStoragePolicy` make this explicit. A component requesting `volume_mode = 'nfs` on a storage class whose policy declares `volume_modes = ['block]` is a preflight failure, not a runtime error on the remote node.",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
claim = "expansion_policy encodes the one-way door semantics of provider volume management",
|
||
|
|
detail = "Three states: 'static (no resize at all — e.g. hostPath), 'expand_only (increase only — Hetzner CSI), 'full (expand and shrink — democratic-csi NFS, some Longhorn configurations). This field is the authoritative signal for whether a future size increase in component config will be deployable. An operator who knows their provider is 'expand_only can provision conservatively (10Gi) knowing they can grow later, rather than defensively provisioning large volumes that cannot be reclaimed.",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
claim = "Concrete provider policy values (HetznerCSIPolicy, DemocraticCSINFSPolicy) eliminate per-workspace duplication",
|
||
|
|
detail = "Without pre-defined policy constants, every workspace capabilities.ncl that uses hcloud-volumes would need to manually specify `min_size = \"10Gi\"`, `expansion_policy = 'expand_only`, etc. — and could drift. By defining HetznerCSIPolicy and DemocraticCSINFSPolicy in the schema, workspaces reference the canonical policy: `storage_policy | sc.ProviderStoragePolicy = sc.HetznerCSIPolicy`. The Nickel contract then validates any field override against the policy shape.",
|
||
|
|
},
|
||
|
|
],
|
||
|
|
|
||
|
|
consequences = {
|
||
|
|
positive = [
|
||
|
|
"PVC over-provisioning on expand-only providers is caught at preflight before the PVC exists",
|
||
|
|
"Volume mode mismatches (NFS component on block storage class) become preflight failures",
|
||
|
|
"capabilities.ncl gains a typed storage policy declaration — provider constraints are readable without consulting Hetzner docs",
|
||
|
|
"StorageRequires contract applies to all component requires.storage fields uniformly via schema import",
|
||
|
|
"Concrete policy values (HetznerCSIPolicy) are the single source of truth — workspace drift is impossible via Nickel contract",
|
||
|
|
],
|
||
|
|
negative = [
|
||
|
|
"Size comparison (component.requires.storage.size >= provider.min_size) requires string-to-bytes parsing — this is done in Nu (preflight), not in Nickel, because Nickel has no byte-unit parsing in std",
|
||
|
|
"Provider policy must be declared in capabilities.ncl — a storage class used without a matching policy entry cannot be validated (validation skips rather than fails, so the gap is silent)",
|
||
|
|
],
|
||
|
|
},
|
||
|
|
|
||
|
|
alternatives_considered = [
|
||
|
|
{
|
||
|
|
option = "Add min_size / max_size directly to the storage_classes list in InfraCapabilities",
|
||
|
|
why_rejected = "InfraCapabilities.storage_classes is currently Array String (a list of class names). Changing it to a typed record would require updating all capabilities.ncl files in all workspaces simultaneously. The ProviderStoragePolicy approach allows new capabilities.ncl entries to use the typed policy while old entries continue to work — opt-in migration rather than breaking change.",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
option = "Enforce via Kubernetes admission webhook (VPA/LimitRange) instead of preflight",
|
||
|
|
why_rejected = "Admission webhooks enforce when the PVC or pod is submitted to the cluster API, not at bundle validation time. The gap between 'provisioning op started' and 'webhook rejects the PVC' is an orphaned in-progress op with no clean recovery path. Preflight enforcement keeps the invariant: if preflight passes, the deploy can succeed without external gates.",
|
||
|
|
},
|
||
|
|
{
|
||
|
|
option = "Allow size as a Number (Gi) instead of String",
|
||
|
|
why_rejected = "Existing components use `size = \"20Gi\"` (String). Changing to Number would require a migration across all component NCL files and would break Nickel contract compatibility. The String representation is also the form Kubernetes expects in PVC manifests, so no conversion is needed in templates.",
|
||
|
|
},
|
||
|
|
],
|
||
|
|
|
||
|
|
constraints = [
|
||
|
|
# Constraint "storage-requires-uses-contract" (Soft): component NCL files that
# declare requires.storage are expected to use the StorageRequires contract
# from schemas/lib/storage_config.ncl.
{
|
||
|
|
id = "storage-requires-uses-contract",
|
||
|
|
claim = "Any component NCL that declares requires.storage must use StorageRequires from schemas/lib/storage_config.ncl",
|
||
|
|
scope = "provisioning/catalog/components/*/nickel/contracts.ncl",
|
||
|
|
severity = 'Soft,
|
||
|
|
# Grep check for the literal "storage_config" under the components tree.
# NOTE(review): with must_be_empty = false this presumably passes as soon as
# any single file matches, which would not show that every component declaring
# requires.storage imports the contract — confirm against the Grep check
# semantics. `paths` also covers the whole components directory, which is
# broader than the contracts.ncl files named in `scope`.
check = {
|
||
|
|
tag = 'Grep,
|
||
|
|
pattern = "storage_config",
|
||
|
|
paths = ["provisioning/catalog/components/"],
|
||
|
|
must_be_empty = false,
|
||
|
|
},
|
||
|
|
# NOTE(review): "existing components without storage" below likely means
# components that declare storage without the contract — confirm the wording.
rationale = "StorageRequires adds volume_mode and access_mode to the storage spec. Without the contract import, components declare an untyped record that passes Nickel validation regardless of content — the volume_mode / access_mode fields are silently ignored. The soft severity reflects that adoption is incremental — existing components without storage can be migrated on next edit.",
|
||
|
|
},
|
||
|
|
# Constraint "provider-policy-min-size-hetzner" (Hard): every capabilities.ncl
# that declares the hcloud-volumes storage class must carry the Hetzner policy
# values (min_size = "10Gi", expansion_policy = 'expand_only).
{
  id = "provider-policy-min-size-hetzner",
  claim = "Any capabilities.ncl that declares hcloud-volumes must set min_size = \"10Gi\" and expansion_policy = 'expand_only",
  scope = "workspaces/*/infra/*/capabilities.ncl",
  severity = 'Hard,
  # The command scans every file matched by `scope` instead of one hard-coded
  # workspace. A file violates the constraint when it mentions "hcloud-volumes"
  # but neither references HetznerCSIPolicy nor contains both "expand_only" and
  # "10Gi". This is a textual heuristic; it does not evaluate the Nickel.
  #
  # `nu` exits 0 after merely printing a boolean, so the script calls `exit 1`
  # explicitly to make `expect_exit = 0` meaningful. The script is wrapped in
  # single quotes so that `$f`, `$t` and `$bad` reach nu unexpanded.
  check = {
    tag = 'NuCmd,
    cmd = "nu -c 'let bad = (glob \"workspaces/*/infra/*/capabilities.ncl\" | where {|f| let t = (open --raw $f); ($t | str contains \"hcloud-volumes\") and (not (($t | str contains \"HetznerCSIPolicy\") or (($t | str contains \"expand_only\") and ($t | str contains \"10Gi\")))) }); if ($bad | length) > 0 { print $bad; exit 1 }'",
    expect_exit = 0,
  },
  rationale = "Hetzner hcloud-volumes is the primary block storage provider in libre-wuji. Omitting min_size means components can request 5Gi PVCs which Hetzner will reject at provisioning time with a CSI error. The HetznerCSIPolicy constant in storage_config.ncl provides the correct values — workspaces should reference it rather than hard-code the constraint.",
},
|
||
|
|
],
|
||
|
|
|
||
|
|
ontology_check = {
|
||
|
|
decision_string = "StorageConfig schema: StorageRequires contract for components + ProviderStoragePolicy for providers + HetznerCSIPolicy/DemocraticCSINFSPolicy constants + preflight size/mode validation",
|
||
|
|
invariants_at_risk = ["type-safety-nickel", "config-driven-always"],
|
||
|
|
verdict = 'Safe,
|
||
|
|
},
|
||
|
|
|
||
|
|
related_adrs = ["adr-033-cluster-component-extension-pattern", "adr-020-extension-capability-declarations"],
|
||
|
|
}
|