prvng_core/scripts/build-nixos-image-remote.sh

198 lines
5.7 KiB
Bash
Raw Permalink Normal View History

feat(core): three-layer DAG, unified component arch, commands-registry cache, Nushell 0.112.2 migration - DAG architecture: `dag show/validate/export` (nulib/main_provisioning/dag.nu), config loader (lib_provisioning/config/loader/dag.nu), taskserv dag-executor. Backed by schemas/lib/dag/*.ncl; orchestrator emits NATS events via WorkspaceComposition::into_workflow. See ADR-020, ADR-021. - Unified Component Architecture: components/mod.nu, main_provisioning/ {components,workflow,extensions,ontoref-queries}.nu. Full workflow engine with topological sort and NATS subject emission. Blocks A-H complete (libre-daoshi). - Commands-registry: nulib/commands-registry.ncl (Nickel source, 314 lines) + JSON cache at ~/.cache/provisioning/commands-registry.json rebuilt on source change. cli/provisioning fast-path alias expansion avoids cold Nu startup. ADDING_COMMANDS.md documents new-command workflow. - Platform service manager: service-manager.nu (+573), startup.nu (+611), service-check.nu (+255); autostart/bootstrap/health/target refactored. - Nushell 0.112.2 migration: removed all try/catch and bash redirections; external commands prefixed with ^; type signatures enforced. Driven by scripts/refactor-try-catch{,-simplified}.nu. - TTY stack: removed shlib/*-tty.sh; replaced by cli/tty-dispatch.sh, tty-filter.sh, tty-commands.conf. - New domain modules: images/ (golden image lifecycle), workspace/{state,sync}.nu, main_provisioning/{bootstrap,cluster-deploy,fip,state}.nu, commands/{state, build,integrations/auth,utilities/alias}.nu, platform.nu expanded (+874). - Config loader overhaul: loader/core.nu slimmed (-759), cache/core.nu refactored (-454), removed legacy loaders/file_loader.nu (-330). - Thirteen new provisioning-<domain>.nu top-level modules for bash dispatcher. - Tests: test_workspace_state.nu (+351); updates to test_oci_registry, test_services. - README + CHANGELOG updated.
2026-04-17 04:27:33 +01:00
#!/bin/bash
# Build NixOS image on remote Hetzner server (cross-platform builds)
# Usage: ./build-nixos-image-remote.sh [role] [location] [project_path]
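# Output: progress messages on stdout; on success they include a "SNAPSHOT_ID=<id>" line
# Environment: HCLOUD_TOKEN (required), SSH_KEY (name passed to hcloud --ssh-key, default: htz_ops)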
# Strict mode: stop on a failing command, an unset variable, or a failing
# pipeline stage.
set -o errexit -o nounset -o pipefail

# --- Positional arguments (all optional) ---
ROLE=${1:-cp}             # image role; selects the flake output built later
LOCATION=${2:-nbg1}       # location passed to `hcloud server create`
PROJECT_PATH=${3:-.}      # local tree that gets archived and uploaded

# --- Environment ---
SSH_KEY=${SSH_KEY:-htz_ops}
# Abort right away with a clear message when no API token is provided.
HCLOUD_TOKEN=${HCLOUD_TOKEN:?HCLOUD_TOKEN required}

# --- Derived values ---
TIMESTAMP=$(date -u '+%Y-%m-%dT%H%M%SZ')
# $$ (this shell's PID) keeps the temporary server name distinct per run.
TEMP_NAME="build-nixos-${ROLE}-$$"
FLAKE_DIR='workspaces/librecloud_hetzner/nixos'
DESCRIPTION="nixos-${ROLE}-aarch64-${TIMESTAMP}"

printf '%s\n' "=== Building NixOS ${ROLE} image on Hetzner ==="
printf '%s\n' "Temp server: $TEMP_NAME | Role: $ROLE | Location: $LOCATION"
# Create temporary build server
echo "=== 1. Creating temp server $TEMP_NAME ==="

# cleanup: EXIT handler. Deletes the temporary server (best effort) and
# removes the local scratch files this script writes under /tmp.
# Globals: SERVER_ID (read, may still be unset/empty), TEMP_NAME (read)
cleanup() {
  echo "=== Cleanup: deleting server ==="
  # Until the ID has been looked up, fall back to the server name so a
  # failure right after creation still removes the server.
  hcloud server delete "${SERVER_ID:-$TEMP_NAME}" 2>/dev/null || true
  rm -f /tmp/build-remote-*.sh /tmp/project-build.tar.gz
}

hcloud server create \
  --name "$TEMP_NAME" \
  --type cax11 \
  --location "$LOCATION" \
  --image debian-12 \
  --ssh-key "$SSH_KEY" > /dev/null

# Register the handler as soon as the server exists. Installing it only after
# the describe calls below would leak the server if one of them failed.
trap cleanup EXIT

SERVER_ID=$(hcloud server describe "$TEMP_NAME" -o format='{{.ID}}')
SERVER_IP=$(hcloud server describe "$TEMP_NAME" -o format='{{.PublicNet.IPv4.IP}}')
echo "Created: $TEMP_NAME (ID=$SERVER_ID, IP=$SERVER_IP)"
# Wait for SSH
# Probe the new server until an SSH login succeeds: at most 60 attempts, each
# bounded by ConnectTimeout=3 and followed by a 5-second pause on failure.
echo "=== 2. Waiting for SSH connectivity ==="
SSH_READY=0
# SECONDS is bash's running wall-clock counter. Measuring against it reports
# the real wait, including connect timeouts, instead of an attempt-count guess.
SSH_WAIT_START=$SECONDS
for ((i = 1; i <= 60; i++)); do
  # BatchMode=yes makes ssh fail instead of prompting, so the loop never blocks.
  if ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 -o BatchMode=yes root@"${SERVER_IP}" true 2>/dev/null; then
    echo "SSH ready after $((SECONDS - SSH_WAIT_START)) seconds"
    SSH_READY=1
    break
  fi
  printf "."
  sleep 5
done
if [ "$SSH_READY" -eq 0 ]; then
  echo ""
  echo "ERROR: SSH timeout after $((SECONDS - SSH_WAIT_START)) seconds"
  echo "Server: $SERVER_IP"
  echo "Check: ssh -o StrictHostKeyChecking=no root@${SERVER_IP}"
  exit 1
fi
echo ""
# Transfer project
# Pack the project tree into a gzip tarball, upload it to the build server,
# and unpack it there under /tmp.
printf '%s\n' "=== 3. Transferring project ==="

# Shared ssh/scp options. Kept as a plain string, so every use below expands
# it unquoted on purpose to split it into separate arguments.
SSH_OPTS="-o StrictHostKeyChecking=no -o ServerAliveInterval=60 -o ServerAliveCountMax=10"

# Archive everything below PROJECT_PATH except the excluded patterns.
tar -czf /tmp/project-build.tar.gz \
  --exclude='.git/objects' \
  --exclude='.git/logs' \
  --exclude='.nix' \
  --exclude='result*' \
  --exclude='*.img' \
  --exclude='target' \
  --exclude='.coder' \
  -C "$PROJECT_PATH" .

# The fifth column of `ls -lh` is the human-readable file size.
SIZE=$(ls -lh /tmp/project-build.tar.gz | awk '{print $5}')
printf '%s\n' "Uploading $SIZE..."

if ! scp $SSH_OPTS /tmp/project-build.tar.gz "root@${SERVER_IP}:/tmp/"; then
  printf '%s\n' "ERROR: Failed to upload project"
  exit 1
fi

# Unpack on the server and drop the tarball once it has been extracted.
if ! ssh $SSH_OPTS root@"${SERVER_IP}" "cd /tmp && tar -xzf project-build.tar.gz && rm project-build.tar.gz && echo 'Project extracted'"; then
  printf '%s\n' "ERROR: Failed to extract project"
  exit 1
fi

printf '%s\n' "Project transferred"
# Install Nix and build
# Write a small installer script locally, copy it to the build server and run
# it there. The heredoc delimiter is quoted, so nothing in the script body is
# expanded on this machine.
echo "=== 4. Installing Nix on server ==="
cat > /tmp/build-remote-install.sh << 'INSTALL_NIX'
#!/bin/bash
set -euo pipefail
apt-get update -qq
apt-get install -y -qq curl xz-utils
# --fail makes curl exit non-zero on an HTTP error instead of piping the
# error page into bash; pipefail then aborts this script.
curl --fail --location https://nixos.org/nix/install | bash -s -- --no-daemon --yes 2>/dev/null
# Make the freshly installed nix binary visible to this shell.
export PATH="${HOME}/.nix-profile/bin:$PATH"
nix --version
INSTALL_NIX
# SSH_OPTS is expanded unquoted on purpose: it holds several separate options.
scp $SSH_OPTS /tmp/build-remote-install.sh "root@${SERVER_IP}:/tmp/"
ssh $SSH_OPTS root@"${SERVER_IP}" bash /tmp/build-remote-install.sh
echo "=== 5. Building image ==="
# Generate the remote build script. The heredoc delimiter is unquoted, so
# ${ROLE} and ${FLAKE_DIR} are filled in on this machine, while the
# backslash-escaped expansions are left for the remote shell to evaluate.
cat > /tmp/build-remote-build.sh << BUILD_IMAGE
#!/bin/bash
set -euo pipefail
export PATH="\${HOME}/.nix-profile/bin:\$PATH"
export NIX_CONFIG="experimental-features = nix-command flakes"
cd /tmp
echo "Building ${ROLE} image..."
nix build "${FLAKE_DIR}#packages.aarch64-linux.${ROLE}-image" \
--out-link "/tmp/nixos-${ROLE}-image" \
--print-build-logs 2>&1 | tail -20
# The out-link is a symlink into the Nix store. find does not follow symlinks
# by default, so -L is required to look inside it. -print -quit stops at the
# first match without needing a pipe to head.
IMG=\$(find -L "/tmp/nixos-${ROLE}-image" -name "*.img" -print -quit)
if [ -z "\$IMG" ]; then
echo "ERROR: image not found"
exit 1
fi
ls -lh "\$IMG"
echo "SUCCESS: Image built"
BUILD_IMAGE
# SSH_OPTS is expanded unquoted on purpose: it holds several separate options.
scp $SSH_OPTS /tmp/build-remote-build.sh "root@${SERVER_IP}:/tmp/"
ssh $SSH_OPTS root@"${SERVER_IP}" bash /tmp/build-remote-build.sh
# Fetch image
# Copy the built *.img from the build server into /tmp/nixos-build and record
# its local path in IMAGE_LOCAL for the write step.
echo "=== 6. Fetching image back ==="
mkdir -p /tmp/nixos-build
# The directory outlives a run. Remove images left by earlier runs so the
# lookup below cannot pick up a stale file.
rm -f /tmp/nixos-build/*.img
# SSH_OPTS is expanded unquoted on purpose; the remote *.img glob is quoted
# locally so it is expanded on the server side.
scp $SSH_OPTS "root@${SERVER_IP}:/tmp/nixos-${ROLE}-image/*.img" /tmp/nixos-build/ 2>/dev/null || {
  echo "ERROR: Failed to fetch image"
  exit 1
}
IMAGE_LOCAL=$(find /tmp/nixos-build -name "*.img" | head -n 1)
# Refuse to continue without an image instead of failing later in an obscure way.
if [ -z "$IMAGE_LOCAL" ]; then
  echo "ERROR: Failed to fetch image"
  exit 1
fi
echo "Image: $(ls -lh "$IMAGE_LOCAL" | awk '{print $5, $9}')"
# Reboot and deploy
# Switch the server into the rescue system and wait until that system, not the
# previously installed OS, answers over SSH.
echo "=== 7. Rebooting into rescue ==="
# NOTE(review): confirm the installed hcloud CLI accepts --force on
# `server reboot`.
hcloud server reboot "$SERVER_ID" --force
sleep 15
hcloud server enable-rescue "$SERVER_ID" --type linux64 --ssh-key "$SSH_KEY" > /dev/null
hcloud server reboot "$SERVER_ID"
echo "Waiting for rescue SSH..."
RESCUE_READY=0
for ((i = 1; i <= 60; i++)); do
  # A bare login test can succeed against the old system while it is still
  # shutting down. Also require that no /dev/sda partition is mounted, which
  # holds only once the machine runs from something other than the target disk.
  # SSH_OPTS is expanded unquoted on purpose: it holds several separate options.
  if ssh $SSH_OPTS -o ConnectTimeout=3 -o BatchMode=yes root@"${SERVER_IP}" \
    '! grep -q "^/dev/sda" /proc/mounts' 2>/dev/null; then
    echo "Rescue ready"
    RESCUE_READY=1
    break
  fi
  printf "."
  sleep 5
done
if [ "$RESCUE_READY" -eq 0 ]; then
  echo ""
  echo "ERROR: Rescue SSH timeout"
  exit 1
fi
echo ""
# Write image to disk
# Stream the local image onto the server's /dev/sda, power the server off,
# snapshot it, print the snapshot ID, then delete the temporary server.
echo "=== 8. Writing image to /dev/sda ==="
# -f together with -c makes gzip pass data through unchanged when the file is
# not gzip-compressed, so both raw and gzip-compressed images work.
# ${IMAGE_LOCAL:?} aborts with a clear message if no image path was recorded.
# SSH_OPTS is expanded unquoted on purpose: it holds several separate options.
gzip -dcf "${IMAGE_LOCAL:?no local image to write}" | ssh $SSH_OPTS root@"${SERVER_IP}" \
  "dd of=/dev/sda bs=4M conv=fsync status=progress"
echo "=== 9. Powering off ==="
hcloud server poweroff "$SERVER_ID"
sleep 15
echo "=== 10. Creating snapshot ==="
SNAPSHOT_ID=$(hcloud server create-image "$SERVER_ID" \
  --type snapshot \
  --description "$DESCRIPTION" \
  -o format='{{.ID}}')
echo ""
echo "════════════════════════════════════════"
echo "✓ BUILD SUCCESS"
echo "════════════════════════════════════════"
echo "SNAPSHOT_ID=$SNAPSHOT_ID"
echo ""
echo "Next: Update servers.ncl for role '$ROLE':"
echo " image = \"$SNAPSHOT_ID\""
echo "════════════════════════════════════════"
# Keep snapshot, delete server
# The EXIT handler is disabled so a failed delete is reported rather than
# swallowed. Its temp-file removal therefore has to be repeated here.
trap - EXIT
rm -f /tmp/build-remote-*.sh /tmp/project-build.tar.gz
hcloud server delete "$SERVER_ID"