# prvng_kcl/vm_lifecycle.k

# VM Lifecycle and Persistence Schemas (Phase 2)
#
# Extends core VmConfig with persistence and lifecycle management.
# Follows KCL patterns: schema-first, explicit types, check blocks.
schema VmPersistence:
    """
    VM persistence configuration for permanent/temporary management.

    Groups three kinds of attributes: settings for permanent VMs
    (auto start, restart policy, retry limit), settings for temporary VMs
    (TTL and cleanup behavior), and state-tracking timestamps. Every
    attribute is declared regardless of `mode`; only the combinations
    listed in the check block are rejected.

    Attributes:
        mode: "permanent" or "temporary" (default "permanent").
        auto_start: start on host boot; rejected when mode is "temporary".
        restart_policy: "no", "always" or "on-failure".
        max_retries: max restart attempts; must not be negative.
        ttl_hours: time to live, 1-8760 hours.
        auto_cleanup: auto-delete on TTL.
        force_cleanup: force cleanup without graceful shutdown; requires
            auto_cleanup.
        cleanup_grace_period: seconds to wait before force kill, 0-300.
        created_at_unix: creation timestamp (Unix epoch); required.
        scheduled_cleanup: optional cleanup timestamp if scheduled.
        last_state_change: optional timestamp of the last state change.

    Examples:
        # Permanent VM - persists across reboots
        VmPersistence {
            mode = "permanent"
            auto_start = True
            restart_policy = "always"
            created_at_unix = 1700000000
        }

        # Temporary VM - auto-cleanup after TTL
        VmPersistence {
            mode = "temporary"
            ttl_hours = 24
            auto_cleanup = True
            force_cleanup = False
            created_at_unix = 1700000000
        }
    """
    # Persistence mode (Pattern 8: Union types)
    mode: "permanent" | "temporary" = "permanent"

    # Permanent VM settings
    # Start on host boot
    auto_start: bool = False
    restart_policy: "no" | "always" | "on-failure" = "always"
    # Max restart attempts
    max_retries: int = 5

    # Temporary VM settings
    # Time to live
    ttl_hours: int = 24
    # Auto-delete on TTL
    auto_cleanup: bool = True
    # Force cleanup without graceful shutdown
    force_cleanup: bool = False
    # Seconds to wait before force kill
    cleanup_grace_period: int = 60

    # State tracking
    # Creation timestamp (Unix epoch)
    created_at_unix: int
    # Cleanup timestamp if scheduled
    scheduled_cleanup?: int
    # Last state change timestamp
    last_state_change?: int

    check:
        # TTL validation
        ttl_hours > 0 and ttl_hours <= 8760, "TTL must be 1-8760 hours (1 year max)"
        # Restart policy validation
        not (auto_start and mode == "temporary"), "Temporary VMs cannot have auto_start enabled"
        # A negative number of restart attempts has no meaning
        max_retries >= 0, "Max retries cannot be negative"
        # Cleanup validation
        not (force_cleanup and not auto_cleanup), "force_cleanup requires auto_cleanup enabled"
        # Grace period validation
        cleanup_grace_period >= 0 and cleanup_grace_period <= 300, "Grace period must be 0-300 seconds"

schema VmLifecyclePolicy:
    """
    Policy describing what happens to a VM on system events.

    Covers host reboot, host shutdown, resource contention (memory
    pressure, full disk) and which resource limits are enforced.
    The single check rejects pairing "save-state" on shutdown with
    "destroy" on reboot.
    """
    # Host reboot: "start", "keep-stopped" or "destroy"
    on_host_reboot: "start" | "keep-stopped" | "destroy" = "start"

    # Host shutdown: "shutdown", "save-state" or "destroy"
    on_host_shutdown: "shutdown" | "save-state" | "destroy" = "shutdown"

    # Reaction to resource contention; "none" is the default for both
    on_memory_pressure: "suspend" | "kill" | "none" = "none"
    on_disk_full: "suspend" | "kill" | "none" = "none"

    # Which resource limits are enforced
    enforce_memory_limit: bool = True
    enforce_cpu_limit: bool = True
    # Disabled by default: risky if enabled
    enforce_disk_limit: bool = False

    check:
        # Saved state would be lost if the VM is destroyed on the next reboot
        not (on_host_shutdown == "save-state" and on_host_reboot == "destroy"), "Cannot save-state on shutdown if VM is destroyed on reboot"

schema VmCleanupSchedule:
    """
    Scheduling record for the cleanup of a temporary VM.

    Holds the VM identity, when it was created, when cleanup is
    scheduled, and the progress of the cleanup itself (status, attempt
    count, last attempt time, last error).
    """
    vm_name: str
    vm_id: str
    # The only accepted value is "temporary"
    mode: "temporary" = "temporary"
    # Creation time, ISO 8601 timestamp
    created_at: str
    # Planned cleanup time, ISO 8601 timestamp
    scheduled_cleanup_at: str
    ttl_hours: int

    # Progress of the cleanup
    cleanup_status: "pending" | "in-progress" | "completed" | "failed" = "pending"
    cleanup_attempts: int = 0
    # Time of the most recent attempt, ISO 8601 timestamp
    last_cleanup_attempt?: str
    cleanup_error?: str

    check:
        # Both identifiers must be non-empty strings
        len(vm_name) >= 1, "VM name required"
        len(vm_id) >= 1, "VM ID required"
        # Integer TTL of at least one hour
        ttl_hours >= 1, "TTL must be positive"
        0 <= cleanup_attempts, "Cleanup attempts cannot be negative"

schema VmRecoveryState:
    """
    VM state snapshot for recovery after host reboot.

    Captures VM state before shutdown for restoration.

    Attributes:
        vm_name: VM name; must be non-empty.
        vm_id: VM identifier; must be non-empty.
        state_before_shutdown: "running", "stopped" or "paused".
        creation_timestamp: ISO 8601 timestamp.
        last_checkpoint: ISO 8601 timestamp.
        memory_snapshot: optional path to a memory dump file
            (for save-state).
        memory_size_mb: optional memory size; must not be negative
            when provided.
        config_snapshot: full VmConfig at snapshot time, stored as a
            dict with string keys.
    """
    vm_name: str
    vm_id: str
    state_before_shutdown: "running" | "stopped" | "paused"
    # ISO 8601
    creation_timestamp: str
    # ISO 8601
    last_checkpoint: str

    # Memory state (for save-state)
    # Path to memory dump file
    memory_snapshot?: str
    memory_size_mb?: int
    # Configuration snapshot
    # Full VmConfig at snapshot time (stored as JSON/dict)
    config_snapshot: {str: any}

    check:
        len(vm_name) > 0, "VM name required"
        # vm_id is a required identifier and must not be empty either
        len(vm_id) > 0, "VM ID required"
        # Mirrors the union type declared on the attribute
        state_before_shutdown in ["running", "stopped", "paused"], "Invalid shutdown state"
        # Evaluated only when a non-zero size is provided; unset stays valid
        memory_size_mb >= 0 if memory_size_mb, "Memory size cannot be negative"

schema VmAutoStartConfig:
    """
    Configuration for automatic VM startup on host boot.

    Manages order and dependencies for VM startup.

    Attributes:
        vm_name: VM name; must be non-empty.
        enabled: whether this auto-start entry is enabled.
        start_order: lower numbers start first; non-negative.
        start_delay_seconds: delay before starting; non-negative.
        wait_for_ssh: wait for SSH before continuing.
        ssh_timeout_seconds: max wait time; must be positive.
        on_start_failure: "stop", "retry" or "ignore".
        max_start_retries: must be positive.
        depends_on: names of other VMs to start first; must not
            contain vm_name itself.
    """
    vm_name: str
    enabled: bool = True
    # Lower numbers start first
    start_order: int = 0
    # Delay before starting
    start_delay_seconds: int = 0
    # Wait for SSH before continuing
    wait_for_ssh: bool = True
    # Max wait time
    ssh_timeout_seconds: int = 300
    on_start_failure: "stop" | "retry" | "ignore" = "retry"
    max_start_retries: int = 3

    # Dependencies
    # Other VMs to start first
    depends_on: [str] = []

    check:
        len(vm_name) > 0, "VM name required"
        start_order >= 0, "Start order must be non-negative"
        start_delay_seconds >= 0, "Delay must be non-negative"
        ssh_timeout_seconds > 0, "SSH timeout must be positive"
        max_start_retries > 0, "Max retries must be positive"
        # Mirrors the union type declared on the attribute
        on_start_failure in ["stop", "retry", "ignore"], "Invalid start failure policy"
        # A VM listed in its own dependencies could never satisfy "start first"
        vm_name not in depends_on, "VM cannot depend on itself"

schema VmCleanupPolicy:
    """
    System-wide cleanup policy applied to all temporary VMs.

    Groups scheduling options (interval and time window), resource
    constraints (concurrency and batch size), safety switches, and
    logging/monitoring options. The window bounds are plain strings;
    this schema does not validate their format.
    """
    # --- Scheduling ---
    cleanup_enabled: bool = True
    # Minutes between cleanup checks
    check_interval_minutes: int = 60
    # Window start, HH:MM format
    cleanup_window_start: str = "02:00"
    # Window end, HH:MM format
    cleanup_window_end: str = "06:00"
    cleanup_in_window_only: bool = True

    # --- Resource constraints ---
    # Upper bound on VMs cleaning up simultaneously
    max_concurrent_cleanups: int = 3
    # Upper bound on VMs checked per batch
    cleanup_batch_size: int = 10

    # --- Safety features ---
    # Approval required before cleanup
    require_confirmation: bool = False
    # Cleanups are logged without being executed
    dry_run_mode: bool = False
    # Cleanup is skipped if the system is busy
    skip_on_low_resources: bool = True

    # --- Logging and monitoring ---
    log_cleanup_operations: bool = True
    alert_on_cleanup_failure: bool = True
    # Number of days cleanup logs are kept
    retention_days: int = 7

    check:
        # 1440 minutes is one day
        1 <= check_interval_minutes <= 1440, "Check interval must be 1-1440 minutes (1 day max)"
        max_concurrent_cleanups >= 1, "Must allow at least 1 concurrent cleanup"
        cleanup_batch_size >= 1, "Batch size must be positive"
        0 <= retention_days, "Retention days cannot be negative"

schema VmStateSnapshot:
    """
    Point-in-time record of a VM's state, for persistence and recovery.

    Stores the lifecycle state together with resource usage, network
    addresses and performance counters taken at `snapshot_time`.
    All attributes are required.
    """
    vm_name: str
    # When the snapshot was taken, ISO 8601 timestamp
    snapshot_time: str
    vm_state: "stopped" | "starting" | "running" | "stopping" | "paused" | "error"

    # Resource usage at snapshot time
    cpu_usage_percent: float
    memory_usage_mb: int
    disk_usage_gb: int

    # Network addresses
    ip_addresses: [str]
    mac_addresses: [str]

    # Performance counters
    uptime_seconds: int
    restart_count: int

    check:
        len(vm_name) >= 1, "VM name required"
        # Percentage, bounds inclusive
        0 <= cpu_usage_percent <= 100, "CPU usage must be 0-100%"
        # None of the counters below may be negative
        0 <= memory_usage_mb, "Memory usage cannot be negative"
        0 <= disk_usage_gb, "Disk usage cannot be negative"
        0 <= uptime_seconds, "Uptime cannot be negative"
        0 <= restart_count, "Restart count cannot be negative"