968 lines
35 KiB
Rust
Raw Normal View History

2025-10-07 10:59:52 +01:00
use anyhow::{Context, Result};
use axum::{
extract::{Path, State},
http::StatusCode,
response::Json,
routing::{get, post},
Router,
};
use clap::Parser;
use serde::{Deserialize, Serialize};
use std::{
process::Stdio,
sync::Arc,
};
use tokio::process::Command;
use tower_http::cors::CorsLayer;
use tracing::{error, info, warn};
use uuid::Uuid;
// Use types from the library
use provisioning_orchestrator::{
Args, WorkflowTask, TaskStatus, CreateServerWorkflow, TaskservWorkflow, ClusterWorkflow,
storage::{create_storage_from_args, TaskStorage},
workflow::{WorkflowExecutionState},
batch::{BatchCoordinator, BatchCoordinatorFactory, BatchOperationRequest, BatchOperationResult},
dependency::{DependencyResolver},
state::{WorkflowStateManager, StateConfig, ProgressTracker, ProgressInfo, StateSnapshot, SystemMetrics, StateManagerStatistics},
monitor::{MonitoringSystem, MonitoringConfig, SystemResourceHealthCheck, MonitoringEvent, MonitoringEventType, SystemHealthStatus},
rollback::{RollbackSystem, RollbackConfig, RollbackResult, Checkpoint, RollbackStatistics, RollbackStrategy, providers::*},
test_environment::{TestEnvironment, CreateTestEnvironmentRequest, RunTestRequest, TestEnvironmentResponse, TestResult},
test_orchestrator::TestOrchestrator,
dns::DnsManager,
extensions::ExtensionManager,
oci::OciManager,
services::ServiceOrchestrator,
};
// Application state for the orchestrator
/// Cheaply-clonable, thread-safe handle to the orchestrator state shared by
/// every HTTP handler and the background task processor.
pub type SharedState = Arc<AppState>;
/// Aggregates every long-lived subsystem the orchestrator needs. Built once
/// in [`AppState::new`] and shared behind an `Arc` for the process lifetime.
pub struct AppState {
    // Queue-backed persistence for workflow tasks (backend chosen from `args`).
    pub task_storage: Arc<dyn TaskStorage>,
    // Executes multi-task batch operations.
    pub batch_coordinator: BatchCoordinator,
    // Resolves inter-task dependency ordering.
    pub dependency_resolver: DependencyResolver,
    // Tracks workflow execution state and snapshots.
    pub state_manager: Arc<WorkflowStateManager>,
    // Health checks, metrics collection, and event publishing.
    pub monitoring_system: Arc<MonitoringSystem>,
    // Real-time per-workflow progress reporting (built on the state manager).
    pub progress_tracker: Arc<ProgressTracker>,
    // Checkpointing and rollback/recovery of provisioned resources.
    pub rollback_system: Arc<RollbackSystem>,
    // Manages ephemeral test environments and test runs.
    pub test_orchestrator: Arc<TestOrchestrator>,
    // DNS record management.
    pub dns_manager: Arc<DnsManager>,
    // Loads/reloads provider, taskserv, and cluster extensions.
    pub extension_manager: Arc<ExtensionManager>,
    // OCI registry artifact access with a local cache.
    pub oci_manager: Arc<OciManager>,
    // Starts/queries auxiliary services.
    pub service_orchestrator: Arc<ServiceOrchestrator>,
    // Parsed CLI configuration (paths, port, storage type, ...).
    pub args: Args,
}
impl AppState {
    /// Construct and wire up every orchestrator subsystem.
    ///
    /// Initialization order matters: storage comes first (everything persists
    /// through it), then the state manager and monitoring system (which hold
    /// storage handles), then the trackers and managers layered on top.
    ///
    /// # Errors
    /// Returns an error if any subsystem fails to create or initialize; the
    /// `.context(...)` string identifies which one.
    pub async fn new(args: Args) -> Result<Self> {
        // Create storage using the factory pattern
        let task_storage = create_storage_from_args(&args).await
            .context("Failed to create storage backend")?;
        info!("Successfully initialized {} storage backend", args.storage_type);
        // Create batch coordinator
        let batch_coordinator = BatchCoordinatorFactory::create_from_args(
            task_storage.clone(),
            args.clone(),
        ).await.context("Failed to create batch coordinator")?;
        info!("Successfully initialized batch coordinator");
        // Create dependency resolver with default configuration
        let dependency_resolver = DependencyResolver::new();
        info!("Successfully initialized dependency resolver");
        // Create state manager with default configuration
        let state_config = StateConfig::default();
        let state_manager = Arc::new(WorkflowStateManager::new(task_storage.clone(), state_config));
        // Initialize state manager
        state_manager.init().await
            .context("Failed to initialize state manager")?;
        info!("Successfully initialized state manager");
        // Create monitoring system with default configuration
        let monitoring_config = MonitoringConfig::default();
        let monitoring_system = Arc::new(MonitoringSystem::new(
            monitoring_config,
            task_storage.clone(),
            state_manager.clone(),
        ));
        // Initialize monitoring system
        monitoring_system.init().await
            .context("Failed to initialize monitoring system")?;
        info!("Successfully initialized monitoring system");
        // Register system resource health check
        let health_monitor = monitoring_system.health_monitor();
        health_monitor.register_health_check(
            "system_resources".to_string(),
            SystemResourceHealthCheck::new(1024, 80.0), // 1GB memory, 80% CPU thresholds
        ).await;
        // Create progress tracker
        let progress_tracker = Arc::new(ProgressTracker::new(state_manager.clone()));
        info!("Successfully initialized progress tracker");
        // Create rollback system with configuration-driven strategy
        let rollback_config = RollbackConfig {
            checkpoint_interval_seconds: 300, // 5 minutes
            auto_checkpoint_enabled: true,
            strategy: RollbackStrategy::ConfigDriven,
            ..RollbackConfig::default()
        };
        let mut rollback_system = RollbackSystem::new(task_storage.clone(), rollback_config);
        // Register provider-specific rollback handlers
        rollback_system.resource_tracker.register_provider_handler(
            Arc::new(UpCloudRollbackHandler {})
        ).await;
        rollback_system.resource_tracker.register_provider_handler(
            Arc::new(AwsRollbackHandler {})
        ).await;
        rollback_system.resource_tracker.register_provider_handler(
            Arc::new(LocalRollbackHandler {})
        ).await;
        // Initialize rollback system
        rollback_system.init().await
            .context("Failed to initialize rollback system")?;
        info!("Successfully initialized rollback system with {} providers", 3);
        let rollback_system = Arc::new(rollback_system);
        // Create test orchestrator
        let test_orchestrator = Arc::new(TestOrchestrator::new()
            .context("Failed to initialize test orchestrator")?);
        info!("Successfully initialized test orchestrator");
        // Create DNS manager
        let dns_manager = Arc::new(DnsManager::new(
            "http://localhost:53".to_string(), // TODO: Read from config
            true, // auto_register
            300, // default_ttl
        ));
        info!("Successfully initialized DNS manager");
        // Create extension manager
        let extension_manager = Arc::new(ExtensionManager::new(
            args.nu_path.clone(),
            args.provisioning_path.clone(),
        ));
        info!("Successfully initialized extension manager");
        // Create OCI manager
        let oci_manager = Arc::new(OciManager::new(
            "http://localhost:5000".to_string(), // TODO: Read from config
            "provisioning-extensions".to_string(), // namespace
            std::path::PathBuf::from(&args.data_dir).join("oci-cache"),
        ));
        info!("Successfully initialized OCI manager");
        // Create service orchestrator
        let service_orchestrator = Arc::new(ServiceOrchestrator::new(
            args.nu_path.clone(),
            args.provisioning_path.clone(),
            true, // auto_start_dependencies
        ));
        info!("Successfully initialized service orchestrator");
        Ok(Self {
            task_storage,
            batch_coordinator,
            dependency_resolver,
            state_manager,
            monitoring_system,
            progress_tracker,
            rollback_system,
            test_orchestrator,
            dns_manager,
            extension_manager,
            oci_manager,
            service_orchestrator,
            args,
        })
    }
    /// Run `command` with `args` through the configured Nushell binary via
    /// `nu -c "<command> <args...>"`, returning captured stdout on success.
    ///
    /// NOTE(review): args are joined with single spaces and not quoted, so an
    /// argument containing whitespace or Nushell metacharacters will be
    /// re-tokenized by the shell — confirm enqueued task args never need
    /// quoting, or escape them here.
    ///
    /// # Errors
    /// Fails if the process cannot be spawned, or returns the command's
    /// stderr as the error message when it exits non-zero.
    pub async fn execute_nushell_command(&self, command: &str, args: &[String]) -> Result<String> {
        let mut cmd = Command::new(&self.args.nu_path);
        cmd.arg("-c")
            .arg(format!("{} {}", command, args.join(" ")))
            .stdout(Stdio::piped())
            .stderr(Stdio::piped());
        let output = cmd.output().await
            .context("Failed to execute Nushell command")?;
        if output.status.success() {
            Ok(String::from_utf8_lossy(&output.stdout).to_string())
        } else {
            let error = String::from_utf8_lossy(&output.stderr);
            Err(anyhow::anyhow!("Nushell command failed: {}", error))
        }
    }
}
/// Uniform JSON envelope returned by every REST endpoint.
#[derive(Serialize)]
struct ApiResponse<T> {
    // True when the request succeeded; `data` is then populated.
    success: bool,
    // Payload on success; `None` on failure.
    data: Option<T>,
    // Human-readable message on failure; `None` on success.
    error: Option<String>,
}
impl<T> ApiResponse<T> {
    /// Build a success envelope wrapping `data`.
    fn success(data: T) -> Self {
        Self { success: true, data: Some(data), error: None }
    }

    /// Build a failure envelope carrying `message`.
    fn error(message: String) -> Self {
        Self { success: false, data: None, error: Some(message) }
    }
}
// REST API Routes
/// POST /workflows/servers/create — enqueue a server-creation task.
///
/// Returns the generated task id; the task itself runs asynchronously in the
/// background processor. 500 when the task cannot be enqueued.
async fn create_server_workflow(
    State(state): State<SharedState>,
    Json(workflow): Json<CreateServerWorkflow>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    let task_id = Uuid::new_v4().to_string();
    // Base arguments, with optional flags appended only when requested.
    let mut args = vec![
        "--infra".to_string(),
        workflow.infra.clone(),
        "--settings".to_string(),
        workflow.settings.clone(),
    ];
    if workflow.check_mode {
        args.push("--check".to_string());
    }
    if workflow.wait {
        args.push("--wait".to_string());
    }
    let task = WorkflowTask {
        id: task_id.clone(),
        name: "create_servers".to_string(),
        command: format!("{} servers create", state.args.provisioning_path),
        args,
        dependencies: Vec::new(),
        status: TaskStatus::Pending,
        created_at: chrono::Utc::now(),
        started_at: None,
        completed_at: None,
        output: None,
        error: None,
    };
    if let Err(e) = state.task_storage.enqueue(task, 5).await {
        error!("Failed to enqueue task: {}", e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    info!("Enqueued server creation workflow: {}", task_id);
    Ok(Json(ApiResponse::success(task_id)))
}
/// POST /workflows/taskserv/create — enqueue a taskserv operation task.
///
/// The operation verb comes from the request body; returns the generated task
/// id, or 500 when the task cannot be enqueued.
async fn create_taskserv_workflow(
    State(state): State<SharedState>,
    Json(workflow): Json<TaskservWorkflow>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    let task_id = Uuid::new_v4().to_string();
    // Positional taskserv name first, then the common options; optional
    // flags are appended only when set.
    let mut args = vec![
        workflow.taskserv.clone(),
        "--infra".to_string(),
        workflow.infra.clone(),
        "--settings".to_string(),
        workflow.settings.clone(),
    ];
    if workflow.check_mode {
        args.push("--check".to_string());
    }
    if workflow.wait {
        args.push("--wait".to_string());
    }
    let task = WorkflowTask {
        id: task_id.clone(),
        name: format!("{}_taskserv", workflow.operation),
        command: format!("{} taskserv {}", state.args.provisioning_path, workflow.operation),
        args,
        dependencies: Vec::new(),
        status: TaskStatus::Pending,
        created_at: chrono::Utc::now(),
        started_at: None,
        completed_at: None,
        output: None,
        error: None,
    };
    if let Err(e) = state.task_storage.enqueue(task, 6).await {
        error!("Failed to enqueue task: {}", e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    info!("Enqueued taskserv {} workflow: {}", workflow.operation, task_id);
    Ok(Json(ApiResponse::success(task_id)))
}
/// POST /workflows/cluster/create — enqueue a cluster operation task.
///
/// Returns the generated task id, or 500 when the task cannot be enqueued.
async fn create_cluster_workflow(
    State(state): State<SharedState>,
    Json(workflow): Json<ClusterWorkflow>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    let task_id = Uuid::new_v4().to_string();
    // Positional cluster type first, then common options; optional flags
    // appended only when set.
    let mut args = vec![
        workflow.cluster_type.clone(),
        "--infra".to_string(),
        workflow.infra.clone(),
        "--settings".to_string(),
        workflow.settings.clone(),
    ];
    if workflow.check_mode {
        args.push("--check".to_string());
    }
    if workflow.wait {
        args.push("--wait".to_string());
    }
    let task = WorkflowTask {
        id: task_id.clone(),
        name: format!("{}_cluster", workflow.operation),
        command: format!("{} cluster {}", state.args.provisioning_path, workflow.operation),
        args,
        dependencies: Vec::new(),
        status: TaskStatus::Pending,
        created_at: chrono::Utc::now(),
        started_at: None,
        completed_at: None,
        output: None,
        error: None,
    };
    if let Err(e) = state.task_storage.enqueue(task, 7).await {
        error!("Failed to enqueue task: {}", e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    info!("Enqueued cluster {} workflow: {}", workflow.operation, task_id);
    Ok(Json(ApiResponse::success(task_id)))
}
/// GET /tasks/{id} — look up a single task by id.
///
/// 404 when the task does not exist, 500 on storage errors.
async fn get_task_status(
    State(state): State<SharedState>,
    Path(task_id): Path<String>,
) -> Result<Json<ApiResponse<WorkflowTask>>, StatusCode> {
    let found = state.task_storage.get_task(&task_id).await.map_err(|e| {
        error!("Failed to get task {}: {}", task_id, e);
        StatusCode::INTERNAL_SERVER_ERROR
    })?;
    found
        .map(|task| Json(ApiResponse::success(task)))
        .ok_or(StatusCode::NOT_FOUND)
}
/// GET /tasks — list every known task (no status filter applied).
async fn list_tasks(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<WorkflowTask>>>, StatusCode> {
    state
        .task_storage
        .list_tasks(None)
        .await
        .map(|task_list| Json(ApiResponse::success(task_list)))
        .map_err(|e| {
            error!("Failed to list tasks: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
/// GET /health — liveness probe; always reports healthy.
async fn health_check() -> Json<ApiResponse<String>> {
    let message = String::from("Orchestrator is healthy");
    Json(ApiResponse::success(message))
}
// Batch operation routes
/// POST /batch/execute — run a batch operation via the coordinator.
///
/// 500 when the coordinator reports a failure.
async fn execute_batch_operation(
    State(state): State<SharedState>,
    Json(request): Json<BatchOperationRequest>,
) -> Result<Json<ApiResponse<BatchOperationResult>>, StatusCode> {
    state
        .batch_coordinator
        .execute_batch_operation(request)
        .await
        .map(|result| Json(ApiResponse::success(result)))
        .map_err(|e| {
            error!("Failed to execute batch operation: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
/// GET /batch/operations/{id} — fetch the execution state of one batch
/// operation. 404 when the operation is unknown.
async fn get_batch_operation_status(
    State(state): State<SharedState>,
    Path(operation_id): Path<String>,
) -> Result<Json<ApiResponse<WorkflowExecutionState>>, StatusCode> {
    state
        .batch_coordinator
        .get_operation_status(&operation_id)
        .await
        .map(|exec_state| Json(ApiResponse::success(exec_state)))
        .ok_or(StatusCode::NOT_FOUND)
}
/// GET /batch/operations — list the execution state of all batch operations.
async fn list_batch_operations(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<WorkflowExecutionState>>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.batch_coordinator.list_operations().await,
    )))
}
/// POST /batch/operations/{id}/cancel — request cancellation of a batch
/// operation. 500 when cancellation fails.
async fn cancel_batch_operation(
    State(state): State<SharedState>,
    Path(operation_id): Path<String>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    if let Err(e) = state.batch_coordinator.cancel_operation(&operation_id).await {
        error!("Failed to cancel operation {}: {}", operation_id, e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    Ok(Json(ApiResponse::success(format!("Operation {} cancelled", operation_id))))
}
// Rollback and recovery routes
/// Request body for POST /rollback/checkpoints.
#[derive(Deserialize)]
struct CreateCheckpointRequest {
    // Display name for the checkpoint.
    name: String,
    // Optional free-form description.
    description: Option<String>,
}
/// Request body for POST /rollback/execute. Exactly one of the two fields
/// should be set; `checkpoint_id` takes precedence when both are present,
/// and neither yields 400 Bad Request.
#[derive(Deserialize)]
struct RollbackRequest {
    // Roll the whole system back to this checkpoint.
    checkpoint_id: Option<String>,
    // Or: partially roll back just these operations.
    operation_ids: Option<Vec<String>>,
}
/// POST /rollback/checkpoints — create a named checkpoint and return its id.
///
/// 500 when the checkpoint manager fails.
async fn create_checkpoint(
    State(state): State<SharedState>,
    Json(request): Json<CreateCheckpointRequest>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    let checkpoint_id = state
        .rollback_system
        .checkpoint_manager
        .create_checkpoint(request.name, request.description)
        .await
        .map_err(|e| {
            error!("Failed to create checkpoint: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })?;
    info!("Created checkpoint: {}", checkpoint_id);
    Ok(Json(ApiResponse::success(checkpoint_id)))
}
/// GET /rollback/checkpoints — list all known checkpoints.
async fn list_checkpoints(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<Checkpoint>>>, StatusCode> {
    let all = state.rollback_system.checkpoint_manager.list_checkpoints().await;
    Ok(Json(ApiResponse::success(all)))
}
/// GET /rollback/checkpoints/{id} — fetch one checkpoint by id.
/// 404 when it does not exist.
async fn get_checkpoint(
    State(state): State<SharedState>,
    Path(checkpoint_id): Path<String>,
) -> Result<Json<ApiResponse<Checkpoint>>, StatusCode> {
    state
        .rollback_system
        .checkpoint_manager
        .get_checkpoint(&checkpoint_id)
        .await
        .map(|checkpoint| Json(ApiResponse::success(checkpoint)))
        .ok_or(StatusCode::NOT_FOUND)
}
/// POST /rollback/execute — run either a full rollback to a checkpoint or a
/// partial rollback of specific operations.
///
/// `checkpoint_id` wins when both fields are set; 400 when neither is set,
/// 500 when the rollback executor fails outright. A rollback that ran but
/// had per-operation failures is still reported as a 200 with details.
async fn execute_rollback(
    State(state): State<SharedState>,
    Json(request): Json<RollbackRequest>,
) -> Result<Json<ApiResponse<RollbackResult>>, StatusCode> {
    let executor = &state.rollback_system.rollback_executor;
    let result = match (request.checkpoint_id, request.operation_ids) {
        // Full rollback to a named checkpoint takes precedence.
        (Some(checkpoint_id), _) => executor
            .rollback_to_checkpoint(&checkpoint_id)
            .await
            .map_err(|e| {
                error!("Failed to execute rollback to checkpoint {}: {}", checkpoint_id, e);
                StatusCode::INTERNAL_SERVER_ERROR
            })?,
        // Partial rollback of an explicit operation list.
        (None, Some(operation_ids)) => executor
            .rollback_operations(operation_ids)
            .await
            .map_err(|e| {
                error!("Failed to execute partial rollback: {}", e);
                StatusCode::INTERNAL_SERVER_ERROR
            })?,
        // The caller must specify one of the two modes.
        (None, None) => return Err(StatusCode::BAD_REQUEST),
    };
    if result.success {
        info!("Rollback executed successfully: {} operations", result.operations_executed);
    } else {
        warn!("Rollback completed with errors: {}/{} operations failed",
            result.operations_failed, result.operations_executed + result.operations_failed);
    }
    Ok(Json(ApiResponse::success(result)))
}
/// GET /rollback/statistics — aggregate rollback-system statistics.
async fn get_rollback_statistics(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<RollbackStatistics>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.rollback_system.get_statistics().await,
    )))
}
/// POST /rollback/restore/{id} — restore system state from a checkpoint.
///
/// 500 when the state restorer fails.
async fn restore_from_checkpoint(
    State(state): State<SharedState>,
    Path(checkpoint_id): Path<String>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    if let Err(e) = state.rollback_system.state_restorer.restore_from_checkpoint(&checkpoint_id).await {
        error!("Failed to restore from checkpoint {}: {}", checkpoint_id, e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    info!("Successfully restored state from checkpoint: {}", checkpoint_id);
    Ok(Json(ApiResponse::success(format!("State restored from checkpoint {}", checkpoint_id))))
}
// State management and monitoring routes
/// GET /state/workflows/{id}/progress — real-time progress for a workflow.
/// 404 when the workflow is unknown.
async fn get_workflow_progress(
    State(state): State<SharedState>,
    Path(workflow_id): Path<String>,
) -> Result<Json<ApiResponse<ProgressInfo>>, StatusCode> {
    state
        .progress_tracker
        .get_real_time_progress(&workflow_id)
        .await
        .map(|progress| Json(ApiResponse::success(progress)))
        .ok_or(StatusCode::NOT_FOUND)
}
/// GET /state/workflows/{id}/snapshots — state snapshots recorded for a
/// workflow (empty list when none exist).
async fn get_workflow_snapshots(
    State(state): State<SharedState>,
    Path(workflow_id): Path<String>,
) -> Result<Json<ApiResponse<Vec<StateSnapshot>>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.state_manager.get_workflow_snapshots(&workflow_id).await,
    )))
}
/// GET /state/system/metrics — current system metrics from the state manager.
async fn get_system_metrics(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<SystemMetrics>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.state_manager.get_system_metrics().await,
    )))
}
/// GET /state/system/health — aggregated health status from the monitor.
async fn get_system_health(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<SystemHealthStatus>>, StatusCode> {
    let health = state.monitoring_system.health_monitor().get_system_health().await;
    Ok(Json(ApiResponse::success(health)))
}
/// GET /state/statistics — state-manager statistics; 500 on failure.
async fn get_state_statistics(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<StateManagerStatistics>>, StatusCode> {
    state
        .state_manager
        .get_statistics()
        .await
        .map(|stats| Json(ApiResponse::success(stats)))
        .map_err(|e| {
            error!("Failed to get state statistics: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
// Test environment routes
/// POST /test/environments/create — provision a new test environment.
/// 500 when creation fails.
async fn create_test_environment(
    State(state): State<SharedState>,
    Json(request): Json<CreateTestEnvironmentRequest>,
) -> Result<Json<ApiResponse<TestEnvironmentResponse>>, StatusCode> {
    state
        .test_orchestrator
        .create_environment(request)
        .await
        .map(|response| Json(ApiResponse::success(response)))
        .map_err(|e| {
            error!("Failed to create test environment: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
/// GET /test/environments/{id} — fetch one test environment by id.
/// 404 when it does not exist.
async fn get_test_environment(
    State(state): State<SharedState>,
    Path(env_id): Path<String>,
) -> Result<Json<ApiResponse<TestEnvironment>>, StatusCode> {
    state
        .test_orchestrator
        .get_environment(&env_id)
        .await
        .map(|env| Json(ApiResponse::success(env)))
        .ok_or(StatusCode::NOT_FOUND)
}
/// GET /test/environments — list every active test environment.
async fn list_test_environments(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<TestEnvironment>>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.test_orchestrator.list_environments().await,
    )))
}
/// POST /test/environments/{id}/run — execute tests in an environment and
/// return per-test results. 500 when the test run fails to execute.
async fn run_environment_tests(
    State(state): State<SharedState>,
    Path(env_id): Path<String>,
    Json(request): Json<RunTestRequest>,
) -> Result<Json<ApiResponse<Vec<TestResult>>>, StatusCode> {
    state
        .test_orchestrator
        .run_tests(&env_id, request)
        .await
        .map(|results| Json(ApiResponse::success(results)))
        .map_err(|e| {
            error!("Failed to run tests: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
/// DELETE /test/environments/{id} — tear down a test environment
/// (non-forced cleanup). 500 when cleanup fails.
async fn cleanup_test_environment(
    State(state): State<SharedState>,
    Path(env_id): Path<String>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    if let Err(e) = state.test_orchestrator.cleanup_environment(&env_id, false).await {
        error!("Failed to cleanup environment: {}", e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    Ok(Json(ApiResponse::success(format!("Environment {} cleaned up", env_id))))
}
/// GET /test/environments/{id}/logs — collected log lines for an
/// environment. 500 when log retrieval fails.
async fn get_environment_logs(
    State(state): State<SharedState>,
    Path(env_id): Path<String>,
) -> Result<Json<ApiResponse<Vec<String>>>, StatusCode> {
    state
        .test_orchestrator
        .get_environment_logs(&env_id)
        .await
        .map(|logs| Json(ApiResponse::success(logs)))
        .map_err(|e| {
            error!("Failed to get environment logs: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
// DNS Management Routes
/// GET /api/v1/dns/records — all DNS records known to the DNS manager.
/// 500 when the lookup fails.
async fn list_dns_records(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::dns::DnsRecord>>>, StatusCode> {
    state
        .dns_manager
        .list_records()
        .await
        .map(|records| Json(ApiResponse::success(records)))
        .map_err(|e| {
            error!("Failed to list DNS records: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
// Extension Management Routes
/// GET /api/v1/extensions/loaded — every currently loaded extension.
async fn list_loaded_extensions(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::extensions::Extension>>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.extension_manager.list_loaded_extensions().await,
    )))
}
/// Request body for POST /api/v1/extensions/reload.
#[derive(Deserialize)]
struct ReloadExtensionRequest {
    // One of "provider", "taskserv", or "cluster"; anything else is rejected
    // with 400 Bad Request.
    extension_type: String,
    // Name of the extension to reload.
    name: String,
}
/// POST /api/v1/extensions/reload — hot-reload a single extension.
///
/// 400 for an unrecognized extension type, 500 when the reload fails.
async fn reload_extension(
    State(state): State<SharedState>,
    Json(request): Json<ReloadExtensionRequest>,
) -> Result<Json<ApiResponse<String>>, StatusCode> {
    use provisioning_orchestrator::extensions::ExtensionType;
    // Map the wire-format discriminator onto the typed enum.
    let ext_type = match request.extension_type.as_str() {
        "provider" => ExtensionType::Provider,
        "taskserv" => ExtensionType::Taskserv,
        "cluster" => ExtensionType::Cluster,
        _ => return Err(StatusCode::BAD_REQUEST),
    };
    if let Err(e) = state.extension_manager.reload_extension(ext_type, request.name.clone()).await {
        error!("Failed to reload extension: {}", e);
        return Err(StatusCode::INTERNAL_SERVER_ERROR);
    }
    Ok(Json(ApiResponse::success(format!("Extension {} reloaded", request.name))))
}
// OCI Registry Routes
/// Request body for POST /api/v1/oci/artifacts.
#[derive(Deserialize)]
struct ListOciArtifactsRequest {
    // Registry namespace to enumerate artifacts under.
    namespace: String,
}
/// POST /api/v1/oci/artifacts — list OCI artifacts in a namespace.
/// 500 when the registry query fails.
async fn list_oci_artifacts(
    State(state): State<SharedState>,
    Json(request): Json<ListOciArtifactsRequest>,
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::oci::OciArtifact>>>, StatusCode> {
    state
        .oci_manager
        .list_oci_artifacts(&request.namespace)
        .await
        .map(|artifacts| Json(ApiResponse::success(artifacts)))
        .map_err(|e| {
            error!("Failed to list OCI artifacts: {}", e);
            StatusCode::INTERNAL_SERVER_ERROR
        })
}
// Service Orchestration Routes
/// GET /api/v1/services/list — all services known to the orchestrator.
async fn list_services(
    State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::services::Service>>>, StatusCode> {
    Ok(Json(ApiResponse::success(
        state.service_orchestrator.list_services().await,
    )))
}
/// One entry in the GET /api/v1/services/status response.
#[derive(Serialize)]
struct ServiceStatusResponse {
    // Service name as reported by the service orchestrator.
    name: String,
    // Current status of that service.
    status: provisioning_orchestrator::services::ServiceStatus,
}
async fn get_services_status(
State(state): State<SharedState>,
) -> Result<Json<ApiResponse<Vec<ServiceStatusResponse>>>, StatusCode> {
let services = state.service_orchestrator.list_services().await;
let mut statuses = Vec::new();
for service in services {
if let Ok(status) = state.service_orchestrator.get_service_status(&service.name).await {
statuses.push(ServiceStatusResponse {
name: service.name,
status,
});
}
}
Ok(Json(ApiResponse::success(statuses)))
}
// Task processor - runs tasks from the queue
/// Background worker loop: dequeues one task at a time, executes it via
/// Nushell, records metrics, publishes a monitoring event, persists the
/// final task state, and (on failure) asks storage to requeue it for retry.
/// Never returns; intended to run in its own spawned tokio task.
async fn process_tasks(state: SharedState) {
    info!("Starting task processor");
    let metrics_collector = state.monitoring_system.metrics_collector();
    loop {
        match state.task_storage.dequeue().await {
            Ok(Some(mut task)) => {
                // Track task dequeue
                metrics_collector.increment_task_counter();
                metrics_collector.record_storage_operation(true);
                // Mark the task Running BEFORE execution; if that update
                // fails, skip this task (it has already been dequeued).
                if let Err(e) = state.task_storage.update_task_status(&task.id, TaskStatus::Running).await {
                    error!("Failed to update task status: {}", e);
                    metrics_collector.record_storage_operation(false);
                    continue;
                }
                info!("Processing task: {} ({})", task.id, task.name);
                let task_start = std::time::Instant::now();
                let result = state.execute_nushell_command(&task.command, &task.args).await;
                let task_duration = task_start.elapsed();
                match result {
                    Ok(output) => {
                        info!("Task {} completed successfully", task.id);
                        task.output = Some(output);
                        task.status = TaskStatus::Completed;
                        task.completed_at = Some(chrono::Utc::now());
                        // Record metrics
                        metrics_collector.record_task_completion(task_duration.as_millis() as u64);
                        // Publish monitoring event
                        let event = MonitoringEvent {
                            event_type: MonitoringEventType::TaskStatusChanged,
                            timestamp: chrono::Utc::now(),
                            // Serialization failure degrades to a null payload
                            // rather than dropping the event.
                            data: serde_json::to_value(&task).unwrap_or_default(),
                            metadata: {
                                let mut meta = std::collections::HashMap::new();
                                meta.insert("task_id".to_string(), task.id.clone());
                                meta.insert("status".to_string(), "completed".to_string());
                                meta.insert("duration_ms".to_string(), task_duration.as_millis().to_string());
                                meta
                            },
                        };
                        if let Err(e) = state.monitoring_system.publish_event(event).await {
                            error!("Failed to publish monitoring event: {}", e);
                        }
                        if let Err(e) = state.task_storage.update_task(task).await {
                            error!("Failed to update task: {}", e);
                            metrics_collector.record_storage_operation(false);
                        } else {
                            metrics_collector.record_storage_operation(true);
                        }
                    }
                    Err(e) => {
                        error!("Task {} failed: {}", task.id, e);
                        task.error = Some(e.to_string());
                        task.status = TaskStatus::Failed;
                        task.completed_at = Some(chrono::Utc::now());
                        // Record metrics
                        metrics_collector.record_task_failure();
                        // Publish monitoring event
                        let event = MonitoringEvent {
                            event_type: MonitoringEventType::TaskStatusChanged,
                            timestamp: chrono::Utc::now(),
                            data: serde_json::to_value(&task).unwrap_or_default(),
                            metadata: {
                                let mut meta = std::collections::HashMap::new();
                                meta.insert("task_id".to_string(), task.id.clone());
                                meta.insert("status".to_string(), "failed".to_string());
                                meta.insert("error".to_string(), e.to_string());
                                meta
                            },
                        };
                        if let Err(e) = state.monitoring_system.publish_event(event).await {
                            error!("Failed to publish monitoring event: {}", e);
                        }
                        // Clone because `task.id` is still needed for the
                        // requeue call below.
                        if let Err(e) = state.task_storage.update_task(task.clone()).await {
                            error!("Failed to update task: {}", e);
                            metrics_collector.record_storage_operation(false);
                        } else {
                            metrics_collector.record_storage_operation(true);
                        }
                        // Try to requeue for retry
                        if let Err(e) = state.task_storage.requeue_failed_task(&task.id).await {
                            error!("Failed to requeue task: {}", e);
                            metrics_collector.record_storage_operation(false);
                        }
                    }
                }
            }
            Ok(None) => {
                // No tasks in queue, sleep
                tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
            }
            Err(e) => {
                // Storage error: back off longer than the idle poll interval.
                error!("Error dequeuing task: {}", e);
                metrics_collector.record_storage_operation(false);
                tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
            }
        }
    }
}
#[tokio::main]
/// Entry point: parse CLI args, build the shared [`AppState`], spawn the
/// background task processor, register all HTTP routes, and serve on
/// 0.0.0.0:{port} until the server exits.
async fn main() -> Result<()> {
    tracing_subscriber::fmt::init();
    let args = Args::parse();
    let port = args.port;
    info!("Starting provisioning orchestrator on port {}", port);
    let state = Arc::new(AppState::new(args).await?);
    // Start task processor
    // Runs detached for the process lifetime; the handle is intentionally
    // dropped.
    let processor_state = state.clone();
    tokio::spawn(async move {
        process_tasks(processor_state).await;
    });
    // Route table. Note: registering the same path twice with different
    // methods (e.g. /rollback/checkpoints POST+GET) relies on axum merging
    // method routers for identical paths.
    let app = Router::new()
        .route("/health", get(health_check))
        .route("/tasks", get(list_tasks))
        .route("/tasks/{id}", get(get_task_status))
        .route("/workflows/servers/create", post(create_server_workflow))
        .route("/workflows/taskserv/create", post(create_taskserv_workflow))
        .route("/workflows/cluster/create", post(create_cluster_workflow))
        // Batch operation routes
        .route("/batch/execute", post(execute_batch_operation))
        .route("/batch/operations", get(list_batch_operations))
        .route("/batch/operations/{id}", get(get_batch_operation_status))
        .route("/batch/operations/{id}/cancel", post(cancel_batch_operation))
        // State management routes
        .route("/state/workflows/{id}/progress", get(get_workflow_progress))
        .route("/state/workflows/{id}/snapshots", get(get_workflow_snapshots))
        .route("/state/system/metrics", get(get_system_metrics))
        .route("/state/system/health", get(get_system_health))
        .route("/state/statistics", get(get_state_statistics))
        // Rollback and recovery routes
        .route("/rollback/checkpoints", post(create_checkpoint))
        .route("/rollback/checkpoints", get(list_checkpoints))
        .route("/rollback/checkpoints/{id}", get(get_checkpoint))
        .route("/rollback/execute", post(execute_rollback))
        .route("/rollback/restore/{id}", post(restore_from_checkpoint))
        .route("/rollback/statistics", get(get_rollback_statistics))
        // Test environment routes
        .route("/test/environments/create", post(create_test_environment))
        .route("/test/environments", get(list_test_environments))
        .route("/test/environments/{id}", get(get_test_environment))
        .route("/test/environments/{id}/run", post(run_environment_tests))
        .route("/test/environments/{id}", axum::routing::delete(cleanup_test_environment))
        .route("/test/environments/{id}/logs", get(get_environment_logs))
        // DNS integration routes
        .route("/api/v1/dns/records", get(list_dns_records))
        // Extension loading routes
        .route("/api/v1/extensions/loaded", get(list_loaded_extensions))
        .route("/api/v1/extensions/reload", post(reload_extension))
        // OCI registry routes
        .route("/api/v1/oci/artifacts", post(list_oci_artifacts))
        // Service orchestration routes
        .route("/api/v1/services/list", get(list_services))
        .route("/api/v1/services/status", get(get_services_status))
        // Merge monitoring routes (includes /metrics, /ws, /events)
        .merge(state.monitoring_system.create_routes())
        // NOTE(review): permissive CORS allows any origin — confirm this is
        // intended for production deployments.
        .layer(CorsLayer::permissive())
        .with_state(state);
    let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{}", port))
        .await
        .context("Failed to bind to address")?;
    info!("Orchestrator listening on port {}", port);
    axum::serve(listener, app)
        .await
        .context("Server error")?;
    Ok(())
}