prvng_kcl/vm_lifecycle.k
2025-12-11 22:17:44 +00:00

249 lines
7.5 KiB
Plaintext

# VM Lifecycle and Persistence Schemas (Phase 2)
#
# Extends core VmConfig with persistence and lifecycle management.
# Follows KCL patterns: schema-first, explicit types, check blocks.
schema VmPersistence:
    """
    VM persistence configuration for permanent/temporary management.
    Controls VM behavior across reboots and cleanup policies.

    Attributes
    ----------
    mode : "permanent" | "temporary", default is "permanent"
        Persistence mode of the VM.
    auto_start : bool, default is False
        Start on host boot. Rejected when mode is "temporary".
    restart_policy : "no" | "always" | "on-failure", default is "always"
        Restart behavior.
    max_retries : int, default is 5
        Max restart attempts. Must be non-negative.
    ttl_hours : int, default is 24
        Time to live in hours. Must be 1-8760.
    auto_cleanup : bool, default is True
        Auto-delete on TTL.
    force_cleanup : bool, default is False
        Force cleanup without graceful shutdown. Requires auto_cleanup.
    cleanup_grace_period : int, default is 60
        Seconds to wait before force kill. Must be 0-300.
    created_at_unix : int, required
        Creation timestamp (Unix epoch).
    scheduled_cleanup : int, optional
        Cleanup timestamp if scheduled.
    last_state_change : int, optional
        Last state change timestamp.

    Examples
    --------
        # Permanent VM - persists across reboots
        VmPersistence {
            mode = "permanent"
            auto_start = True
            restart_policy = "always"
            created_at_unix = 1765491464
        }

        # Temporary VM - auto-cleanup after TTL
        VmPersistence {
            mode = "temporary"
            ttl_hours = 24
            auto_cleanup = True
            force_cleanup = False
            created_at_unix = 1765491464
        }
    """
    # Persistence mode (Pattern 8: Union types)
    mode: "permanent" | "temporary" = "permanent"

    # Permanent VM settings
    # Start on host boot
    auto_start: bool = False
    restart_policy: "no" | "always" | "on-failure" = "always"
    # Max restart attempts
    max_retries: int = 5

    # Temporary VM settings
    # Time to live
    ttl_hours: int = 24
    # Auto-delete on TTL
    auto_cleanup: bool = True
    # Force cleanup without graceful shutdown
    force_cleanup: bool = False
    # Seconds to wait before force kill
    cleanup_grace_period: int = 60

    # State tracking
    # Creation timestamp (Unix epoch)
    created_at_unix: int
    # Cleanup timestamp if scheduled
    scheduled_cleanup?: int
    # Last state change timestamp
    last_state_change?: int

    check:
        # TTL validation (applies in every mode; the default of 24 satisfies it)
        ttl_hours > 0 and ttl_hours <= 8760, "TTL must be 1-8760 hours (1 year max)"
        # Auto-start validation
        not (auto_start and mode == "temporary"), "Temporary VMs cannot have auto_start enabled"
        # Cleanup validation
        not (force_cleanup and not auto_cleanup), "force_cleanup requires auto_cleanup enabled"
        # Grace period validation
        cleanup_grace_period >= 0 and cleanup_grace_period <= 300, "Grace period must be 0-300 seconds"
        # Retry count validation: a negative attempt count has no meaning
        max_retries >= 0, "Max retries cannot be negative"
schema VmLifecyclePolicy:
    """
    Policy describing how a VM reacts to host-level events.

    Covers host reboot, host shutdown, resource contention
    (memory pressure, full disk) and which resource limits are enforced.
    """
    # Action taken when the host comes back from a reboot
    on_host_reboot: "start" | "keep-stopped" | "destroy" = "start"

    # Action taken when the host is shutting down
    on_host_shutdown: "shutdown" | "save-state" | "destroy" = "shutdown"

    # Reactions to resource contention; "none" is the default for both
    on_memory_pressure: "suspend" | "kill" | "none" = "none"
    on_disk_full: "suspend" | "kill" | "none" = "none"

    # Which resource limits are enforced
    enforce_memory_limit: bool = True
    enforce_cpu_limit: bool = True
    # Off by default: flagged as risky if enabled
    enforce_disk_limit: bool = False

    check:
        # Saving state on shutdown is rejected when the VM is destroyed on reboot
        not (on_host_shutdown == "save-state" and on_host_reboot == "destroy"), "Cannot save-state on shutdown if VM is destroyed on reboot"
schema VmCleanupSchedule:
    """
    Cleanup scheduling information for temporary VMs.
    Tracks when VMs are scheduled for cleanup and status.

    Attributes
    ----------
    vm_name : str, required
        VM name. Must be non-empty.
    vm_id : str, required
        VM identifier. Must be non-empty.
    mode : "temporary", default is "temporary"
        Fixed to "temporary"; no other value is accepted by the type.
    created_at : str, required
        ISO 8601 timestamp (format is not validated by this schema).
    scheduled_cleanup_at : str, required
        ISO 8601 timestamp (format is not validated by this schema).
    ttl_hours : int, required
        Time to live in hours. Must be 1-8760, the same range
        VmPersistence accepts for its ttl_hours.
    cleanup_status : "pending" | "in-progress" | "completed" | "failed", default is "pending"
        Cleanup status.
    cleanup_attempts : int, default is 0
        Number of cleanup attempts. Must be non-negative.
    last_cleanup_attempt : str, optional
        ISO 8601 timestamp (format is not validated by this schema).
    cleanup_error : str, optional
        Cleanup error text.
    """
    vm_name: str
    vm_id: str
    mode: "temporary" = "temporary"

    # ISO 8601 timestamp
    created_at: str
    # ISO 8601 timestamp
    scheduled_cleanup_at: str
    ttl_hours: int

    # Cleanup status tracking
    cleanup_status: "pending" | "in-progress" | "completed" | "failed" = "pending"
    cleanup_attempts: int = 0
    # ISO 8601 timestamp
    last_cleanup_attempt?: str
    cleanup_error?: str

    check:
        len(vm_name) > 0, "VM name required"
        len(vm_id) > 0, "VM ID required"
        ttl_hours > 0, "TTL must be positive"
        # Upper bound kept in step with VmPersistence.ttl_hours
        ttl_hours <= 8760, "TTL must be 1-8760 hours (1 year max)"
        cleanup_attempts >= 0, "Cleanup attempts cannot be negative"
schema VmRecoveryState:
    """
    VM state snapshot for recovery after host reboot.
    Captures VM state before shutdown for restoration.

    Attributes
    ----------
    vm_name : str, required
        VM name. Must be non-empty.
    vm_id : str, required
        VM identifier. Must be non-empty.
    state_before_shutdown : "running" | "stopped" | "paused", required
        State the VM was in before shutdown.
    creation_timestamp : str, required
        ISO 8601 (format is not validated by this schema).
    last_checkpoint : str, required
        ISO 8601 (format is not validated by this schema).
    memory_snapshot : str, optional
        Path to memory dump file (for save-state).
    memory_size_mb : int, optional
        Memory size in MB.
    config_snapshot : {str: any}, required
        Full VmConfig at snapshot time (stored as JSON/dict).
    """
    vm_name: str
    vm_id: str
    state_before_shutdown: "running" | "stopped" | "paused"

    # ISO 8601
    creation_timestamp: str
    # ISO 8601
    last_checkpoint: str

    # Memory state (for save-state)
    # Path to memory dump file
    memory_snapshot?: str
    memory_size_mb?: int

    # Configuration snapshot
    # Full VmConfig at snapshot time (stored as JSON/dict)
    config_snapshot: {str: any}

    check:
        len(vm_name) > 0, "VM name required"
        # vm_id is required by type; also reject the empty string
        len(vm_id) > 0, "VM ID required"
        state_before_shutdown in ["running", "stopped", "paused"], "Invalid shutdown state"
schema VmAutoStartConfig:
    """
    Configuration for automatic VM startup on host boot.
    Manages order and dependencies for VM startup.

    Attributes
    ----------
    vm_name : str, required
        VM name. Must be non-empty.
    enabled : bool, default is True
        Whether auto-start is enabled.
    start_order : int, default is 0
        Lower numbers start first. Must be non-negative.
    start_delay_seconds : int, default is 0
        Delay before starting. Must be non-negative.
    wait_for_ssh : bool, default is True
        Wait for SSH before continuing.
    ssh_timeout_seconds : int, default is 300
        Max wait time. Must be positive.
    on_start_failure : "stop" | "retry" | "ignore", default is "retry"
        Policy applied when the VM fails to start.
    max_start_retries : int, default is 3
        Must be positive.
    depends_on : [str], default is []
        Other VMs to start first. Must not contain vm_name itself.
    """
    vm_name: str
    enabled: bool = True

    # Lower numbers start first
    start_order: int = 0
    # Delay before starting
    start_delay_seconds: int = 0
    # Wait for SSH before continuing
    wait_for_ssh: bool = True
    # Max wait time
    ssh_timeout_seconds: int = 300

    on_start_failure: "stop" | "retry" | "ignore" = "retry"
    max_start_retries: int = 3

    # Dependencies
    # Other VMs to start first
    depends_on: [str] = []

    check:
        len(vm_name) > 0, "VM name required"
        start_order >= 0, "Start order must be non-negative"
        start_delay_seconds >= 0, "Delay must be non-negative"
        ssh_timeout_seconds > 0, "SSH timeout must be positive"
        max_start_retries > 0, "Max retries must be positive"
        on_start_failure in ["stop", "retry", "ignore"], "Invalid start failure policy"
        # A VM listed in its own depends_on could never satisfy "start these first"
        vm_name not in depends_on, "VM cannot depend on itself"
schema VmCleanupPolicy:
    """
    System-wide cleanup policy applied to all temporary VMs.

    Groups scheduling, concurrency limits, safety switches and
    logging/retention settings for cleanup.
    """
    # --- Scheduling ---
    cleanup_enabled: bool = True
    # Minutes between cleanup checks (1-1440)
    check_interval_minutes: int = 60
    # Window bounds in HH:MM format (format is not validated by this schema)
    cleanup_window_start: str = "02:00"
    cleanup_window_end: str = "06:00"
    cleanup_in_window_only: bool = True

    # --- Resource constraints ---
    # Upper limit on VMs cleaning up simultaneously
    max_concurrent_cleanups: int = 3
    # Upper limit on VMs checked per batch
    cleanup_batch_size: int = 10

    # --- Safety features ---
    # Require approval before cleanup
    require_confirmation: bool = False
    # Log cleanups without executing
    dry_run_mode: bool = False
    # Skip cleanup if system busy
    skip_on_low_resources: bool = True

    # --- Logging and monitoring ---
    log_cleanup_operations: bool = True
    alert_on_cleanup_failure: bool = True
    # Days to keep cleanup logs
    retention_days: int = 7

    check:
        0 < check_interval_minutes <= 1440, "Check interval must be 1-1440 minutes (1 day max)"
        max_concurrent_cleanups >= 1, "Must allow at least 1 concurrent cleanup"
        cleanup_batch_size >= 1, "Batch size must be positive"
        not (retention_days < 0), "Retention days cannot be negative"
schema VmStateSnapshot:
    """
    Point-in-time record of a VM's state.

    Holds the VM state value together with resource usage,
    network addresses and uptime/restart counters.
    """
    vm_name: str
    # ISO 8601 timestamp (format is not validated by this schema)
    snapshot_time: str
    vm_state: "stopped" | "starting" | "running" | "stopping" | "paused" | "error"

    # --- Resource usage at snapshot time ---
    cpu_usage_percent: float
    memory_usage_mb: int
    disk_usage_gb: int

    # --- Network state ---
    ip_addresses: [str]
    mac_addresses: [str]

    # --- Performance counters ---
    uptime_seconds: int
    restart_count: int

    check:
        vm_name != "", "VM name required"
        0 <= cpu_usage_percent <= 100, "CPU usage must be 0-100%"
        not (memory_usage_mb < 0), "Memory usage cannot be negative"
        not (disk_usage_gb < 0), "Disk usage cannot be negative"
        not (uptime_seconds < 0), "Uptime cannot be negative"
        not (restart_count < 0), "Restart count cannot be negative"