//! Provisioning orchestrator service binary: axum HTTP API plus a background
//! task processor that executes queued workflow tasks through Nushell.
use anyhow::{Context, Result};
|
|
use axum::{
|
|
extract::{Path, State},
|
|
http::StatusCode,
|
|
response::Json,
|
|
routing::{get, post},
|
|
Router,
|
|
};
|
|
use clap::Parser;
|
|
use serde::{Deserialize, Serialize};
|
|
use std::{
|
|
process::Stdio,
|
|
sync::Arc,
|
|
};
|
|
use tokio::process::Command;
|
|
use tower_http::cors::CorsLayer;
|
|
use tracing::{error, info, warn};
|
|
use uuid::Uuid;
|
|
|
|
// Use types from the library
|
|
use provisioning_orchestrator::{
|
|
Args, WorkflowTask, TaskStatus, CreateServerWorkflow, TaskservWorkflow, ClusterWorkflow,
|
|
storage::{create_storage_from_args, TaskStorage},
|
|
workflow::{WorkflowExecutionState},
|
|
batch::{BatchCoordinator, BatchCoordinatorFactory, BatchOperationRequest, BatchOperationResult},
|
|
dependency::{DependencyResolver},
|
|
state::{WorkflowStateManager, StateConfig, ProgressTracker, ProgressInfo, StateSnapshot, SystemMetrics, StateManagerStatistics},
|
|
monitor::{MonitoringSystem, MonitoringConfig, SystemResourceHealthCheck, MonitoringEvent, MonitoringEventType, SystemHealthStatus},
|
|
rollback::{RollbackSystem, RollbackConfig, RollbackResult, Checkpoint, RollbackStatistics, RollbackStrategy, providers::*},
|
|
test_environment::{TestEnvironment, CreateTestEnvironmentRequest, RunTestRequest, TestEnvironmentResponse, TestResult},
|
|
test_orchestrator::TestOrchestrator,
|
|
dns::DnsManager,
|
|
extensions::ExtensionManager,
|
|
oci::OciManager,
|
|
services::ServiceOrchestrator,
|
|
};
|
|
|
|
// Application state for the orchestrator
|
|
/// Handlers receive the application state shared behind an `Arc`.
pub type SharedState = Arc<AppState>;

/// Aggregates every long-lived subsystem of the orchestrator.
///
/// Built once by [`AppState::new`] at startup, then shared read-only with all
/// HTTP handlers and the background task processor.
pub struct AppState {
    /// Pluggable persistence backend for the workflow task queue.
    pub task_storage: Arc<dyn TaskStorage>,
    /// Coordinates multi-task batch operations.
    pub batch_coordinator: BatchCoordinator,
    /// Dependency resolution for workflow tasks (constructed here; consumed by subsystems).
    pub dependency_resolver: DependencyResolver,
    /// Tracks workflow execution state and snapshots.
    pub state_manager: Arc<WorkflowStateManager>,
    /// Metrics, monitoring events, and health checks.
    pub monitoring_system: Arc<MonitoringSystem>,
    /// Real-time workflow progress reporting.
    pub progress_tracker: Arc<ProgressTracker>,
    /// Checkpointing and rollback of provisioning operations.
    pub rollback_system: Arc<RollbackSystem>,
    /// Manages ephemeral test environments.
    pub test_orchestrator: Arc<TestOrchestrator>,
    /// DNS record listing/registration.
    pub dns_manager: Arc<DnsManager>,
    /// Loads and hot-reloads provider/taskserv/cluster extensions.
    pub extension_manager: Arc<ExtensionManager>,
    /// OCI registry artifact access with a local cache directory.
    pub oci_manager: Arc<OciManager>,
    /// Lists managed services and reports their status.
    pub service_orchestrator: Arc<ServiceOrchestrator>,
    /// Parsed CLI configuration.
    pub args: Args,
}
|
|
|
|
impl AppState {
    /// Construct and wire up every orchestrator subsystem.
    ///
    /// Initialization is strictly ordered: storage first (everything else
    /// persists through it), then batch coordination, state management,
    /// monitoring, progress tracking, rollback, and finally the auxiliary
    /// managers (tests, DNS, extensions, OCI, services). Any failure aborts
    /// startup with `anyhow` context attached.
    pub async fn new(args: Args) -> Result<Self> {
        // Create storage using the factory pattern
        let task_storage = create_storage_from_args(&args).await
            .context("Failed to create storage backend")?;

        info!("Successfully initialized {} storage backend", args.storage_type);

        // Create batch coordinator
        let batch_coordinator = BatchCoordinatorFactory::create_from_args(
            task_storage.clone(),
            args.clone(),
        ).await.context("Failed to create batch coordinator")?;

        info!("Successfully initialized batch coordinator");

        // Create dependency resolver with default configuration
        let dependency_resolver = DependencyResolver::new();

        info!("Successfully initialized dependency resolver");

        // Create state manager with default configuration
        let state_config = StateConfig::default();
        let state_manager = Arc::new(WorkflowStateManager::new(task_storage.clone(), state_config));

        // Initialize state manager
        state_manager.init().await
            .context("Failed to initialize state manager")?;

        info!("Successfully initialized state manager");

        // Create monitoring system with default configuration
        let monitoring_config = MonitoringConfig::default();
        let monitoring_system = Arc::new(MonitoringSystem::new(
            monitoring_config,
            task_storage.clone(),
            state_manager.clone(),
        ));

        // Initialize monitoring system
        monitoring_system.init().await
            .context("Failed to initialize monitoring system")?;

        info!("Successfully initialized monitoring system");

        // Register system resource health check
        let health_monitor = monitoring_system.health_monitor();
        health_monitor.register_health_check(
            "system_resources".to_string(),
            SystemResourceHealthCheck::new(1024, 80.0), // 1GB memory, 80% CPU thresholds
        ).await;

        // Create progress tracker
        let progress_tracker = Arc::new(ProgressTracker::new(state_manager.clone()));

        info!("Successfully initialized progress tracker");

        // Create rollback system with configuration-driven strategy
        let rollback_config = RollbackConfig {
            checkpoint_interval_seconds: 300, // 5 minutes
            auto_checkpoint_enabled: true,
            strategy: RollbackStrategy::ConfigDriven,
            ..RollbackConfig::default()
        };

        // Mutable only while handlers are registered; frozen behind an Arc below.
        let mut rollback_system = RollbackSystem::new(task_storage.clone(), rollback_config);

        // Register provider-specific rollback handlers
        rollback_system.resource_tracker.register_provider_handler(
            Arc::new(UpCloudRollbackHandler {})
        ).await;

        rollback_system.resource_tracker.register_provider_handler(
            Arc::new(AwsRollbackHandler {})
        ).await;

        rollback_system.resource_tracker.register_provider_handler(
            Arc::new(LocalRollbackHandler {})
        ).await;

        // Initialize rollback system
        rollback_system.init().await
            .context("Failed to initialize rollback system")?;

        // NOTE: the literal 3 must track the number of handlers registered above.
        info!("Successfully initialized rollback system with {} providers", 3);

        let rollback_system = Arc::new(rollback_system);

        // Create test orchestrator
        let test_orchestrator = Arc::new(TestOrchestrator::new()
            .context("Failed to initialize test orchestrator")?);

        info!("Successfully initialized test orchestrator");

        // Create DNS manager
        let dns_manager = Arc::new(DnsManager::new(
            "http://localhost:53".to_string(), // TODO: Read from config
            true, // auto_register
            300, // default_ttl
        ));

        info!("Successfully initialized DNS manager");

        // Create extension manager
        let extension_manager = Arc::new(ExtensionManager::new(
            args.nu_path.clone(),
            args.provisioning_path.clone(),
        ));

        info!("Successfully initialized extension manager");

        // Create OCI manager
        let oci_manager = Arc::new(OciManager::new(
            "http://localhost:5000".to_string(), // TODO: Read from config
            "provisioning-extensions".to_string(), // namespace
            std::path::PathBuf::from(&args.data_dir).join("oci-cache"),
        ));

        info!("Successfully initialized OCI manager");

        // Create service orchestrator
        let service_orchestrator = Arc::new(ServiceOrchestrator::new(
            args.nu_path.clone(),
            args.provisioning_path.clone(),
            true, // auto_start_dependencies
        ));

        info!("Successfully initialized service orchestrator");

        Ok(Self {
            task_storage,
            batch_coordinator,
            dependency_resolver,
            state_manager,
            monitoring_system,
            progress_tracker,
            rollback_system,
            test_orchestrator,
            dns_manager,
            extension_manager,
            oci_manager,
            service_orchestrator,
            args,
        })
    }

    /// Run `command` with `args` through the configured Nushell binary
    /// (`nu -c "<command> <args...>"`) and return captured stdout on success.
    ///
    /// # Errors
    /// Fails if the process cannot be spawned, or with the captured stderr
    /// text when the command exits non-zero.
    ///
    /// NOTE(review): args are joined with spaces and NOT shell-quoted, so an
    /// argument containing whitespace or Nushell metacharacters would be
    /// re-tokenized by nu — confirm all callers pass simple flag-like args.
    pub async fn execute_nushell_command(&self, command: &str, args: &[String]) -> Result<String> {
        let mut cmd = Command::new(&self.args.nu_path);
        cmd.arg("-c")
            .arg(format!("{} {}", command, args.join(" ")))
            .stdout(Stdio::piped())
            .stderr(Stdio::piped());

        let output = cmd.output().await
            .context("Failed to execute Nushell command")?;

        if output.status.success() {
            Ok(String::from_utf8_lossy(&output.stdout).to_string())
        } else {
            let error = String::from_utf8_lossy(&output.stderr);
            Err(anyhow::anyhow!("Nushell command failed: {}", error))
        }
    }
}
|
|
|
|
/// Uniform JSON envelope returned by every REST endpoint.
#[derive(Serialize)]
struct ApiResponse<T> {
    // True when the request succeeded and `data` is populated.
    success: bool,
    // Payload on success; `None` on failure.
    data: Option<T>,
    // Human-readable error message on failure; `None` on success.
    error: Option<String>,
}
|
|
|
|
impl<T> ApiResponse<T> {
|
|
fn success(data: T) -> Self {
|
|
Self {
|
|
success: true,
|
|
data: Some(data),
|
|
error: None,
|
|
}
|
|
}
|
|
|
|
fn error(message: String) -> Self {
|
|
Self {
|
|
success: false,
|
|
data: None,
|
|
error: Some(message),
|
|
}
|
|
}
|
|
}
|
|
|
|
// REST API Routes
|
|
|
|
async fn create_server_workflow(
|
|
State(state): State<SharedState>,
|
|
Json(workflow): Json<CreateServerWorkflow>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
let task_id = Uuid::new_v4().to_string();
|
|
|
|
let task = WorkflowTask {
|
|
id: task_id.clone(),
|
|
name: "create_servers".to_string(),
|
|
command: format!("{} servers create", state.args.provisioning_path),
|
|
args: vec![
|
|
"--infra".to_string(),
|
|
workflow.infra.clone(),
|
|
"--settings".to_string(),
|
|
workflow.settings.clone(),
|
|
if workflow.check_mode { "--check".to_string() } else { "".to_string() },
|
|
if workflow.wait { "--wait".to_string() } else { "".to_string() },
|
|
].into_iter().filter(|s| !s.is_empty()).collect(),
|
|
dependencies: vec![],
|
|
status: TaskStatus::Pending,
|
|
created_at: chrono::Utc::now(),
|
|
started_at: None,
|
|
completed_at: None,
|
|
output: None,
|
|
error: None,
|
|
};
|
|
|
|
match state.task_storage.enqueue(task, 5).await {
|
|
Ok(()) => {
|
|
info!("Enqueued server creation workflow: {}", task_id);
|
|
Ok(Json(ApiResponse::success(task_id)))
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to enqueue task: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn create_taskserv_workflow(
|
|
State(state): State<SharedState>,
|
|
Json(workflow): Json<TaskservWorkflow>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
let task_id = Uuid::new_v4().to_string();
|
|
|
|
let task = WorkflowTask {
|
|
id: task_id.clone(),
|
|
name: format!("{}_taskserv", workflow.operation),
|
|
command: format!("{} taskserv {}", state.args.provisioning_path, workflow.operation),
|
|
args: vec![
|
|
workflow.taskserv.clone(),
|
|
"--infra".to_string(),
|
|
workflow.infra.clone(),
|
|
"--settings".to_string(),
|
|
workflow.settings.clone(),
|
|
if workflow.check_mode { "--check".to_string() } else { "".to_string() },
|
|
if workflow.wait { "--wait".to_string() } else { "".to_string() },
|
|
].into_iter().filter(|s| !s.is_empty()).collect(),
|
|
dependencies: vec![],
|
|
status: TaskStatus::Pending,
|
|
created_at: chrono::Utc::now(),
|
|
started_at: None,
|
|
completed_at: None,
|
|
output: None,
|
|
error: None,
|
|
};
|
|
|
|
match state.task_storage.enqueue(task, 6).await {
|
|
Ok(()) => {
|
|
info!("Enqueued taskserv {} workflow: {}", workflow.operation, task_id);
|
|
Ok(Json(ApiResponse::success(task_id)))
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to enqueue task: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn create_cluster_workflow(
|
|
State(state): State<SharedState>,
|
|
Json(workflow): Json<ClusterWorkflow>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
let task_id = Uuid::new_v4().to_string();
|
|
|
|
let task = WorkflowTask {
|
|
id: task_id.clone(),
|
|
name: format!("{}_cluster", workflow.operation),
|
|
command: format!("{} cluster {}", state.args.provisioning_path, workflow.operation),
|
|
args: vec![
|
|
workflow.cluster_type.clone(),
|
|
"--infra".to_string(),
|
|
workflow.infra.clone(),
|
|
"--settings".to_string(),
|
|
workflow.settings.clone(),
|
|
if workflow.check_mode { "--check".to_string() } else { "".to_string() },
|
|
if workflow.wait { "--wait".to_string() } else { "".to_string() },
|
|
].into_iter().filter(|s| !s.is_empty()).collect(),
|
|
dependencies: vec![],
|
|
status: TaskStatus::Pending,
|
|
created_at: chrono::Utc::now(),
|
|
started_at: None,
|
|
completed_at: None,
|
|
output: None,
|
|
error: None,
|
|
};
|
|
|
|
match state.task_storage.enqueue(task, 7).await {
|
|
Ok(()) => {
|
|
info!("Enqueued cluster {} workflow: {}", workflow.operation, task_id);
|
|
Ok(Json(ApiResponse::success(task_id)))
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to enqueue task: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn get_task_status(
|
|
State(state): State<SharedState>,
|
|
Path(task_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<WorkflowTask>>, StatusCode> {
|
|
match state.task_storage.get_task(&task_id).await {
|
|
Ok(Some(task)) => Ok(Json(ApiResponse::success(task))),
|
|
Ok(None) => Err(StatusCode::NOT_FOUND),
|
|
Err(e) => {
|
|
error!("Failed to get task {}: {}", task_id, e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn list_tasks(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<WorkflowTask>>>, StatusCode> {
|
|
match state.task_storage.list_tasks(None).await {
|
|
Ok(task_list) => Ok(Json(ApiResponse::success(task_list))),
|
|
Err(e) => {
|
|
error!("Failed to list tasks: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn health_check() -> Json<ApiResponse<String>> {
|
|
Json(ApiResponse::success("Orchestrator is healthy".to_string()))
|
|
}
|
|
|
|
// Batch operation routes
|
|
|
|
async fn execute_batch_operation(
|
|
State(state): State<SharedState>,
|
|
Json(request): Json<BatchOperationRequest>,
|
|
) -> Result<Json<ApiResponse<BatchOperationResult>>, StatusCode> {
|
|
match state.batch_coordinator.execute_batch_operation(request).await {
|
|
Ok(result) => Ok(Json(ApiResponse::success(result))),
|
|
Err(e) => {
|
|
error!("Failed to execute batch operation: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn get_batch_operation_status(
|
|
State(state): State<SharedState>,
|
|
Path(operation_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<WorkflowExecutionState>>, StatusCode> {
|
|
match state.batch_coordinator.get_operation_status(&operation_id).await {
|
|
Some(state) => Ok(Json(ApiResponse::success(state))),
|
|
None => Err(StatusCode::NOT_FOUND),
|
|
}
|
|
}
|
|
|
|
async fn list_batch_operations(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<WorkflowExecutionState>>>, StatusCode> {
|
|
let operations = state.batch_coordinator.list_operations().await;
|
|
Ok(Json(ApiResponse::success(operations)))
|
|
}
|
|
|
|
async fn cancel_batch_operation(
|
|
State(state): State<SharedState>,
|
|
Path(operation_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
match state.batch_coordinator.cancel_operation(&operation_id).await {
|
|
Ok(_) => Ok(Json(ApiResponse::success(format!("Operation {} cancelled", operation_id)))),
|
|
Err(e) => {
|
|
error!("Failed to cancel operation {}: {}", operation_id, e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Rollback and recovery routes
|
|
|
|
/// Body for POST /rollback/checkpoints.
#[derive(Deserialize)]
struct CreateCheckpointRequest {
    // Human-readable checkpoint name.
    name: String,
    // Optional free-form description.
    description: Option<String>,
}
|
|
|
|
/// Body for POST /rollback/execute. At least one field must be set;
/// `checkpoint_id` takes precedence when both are present.
#[derive(Deserialize)]
struct RollbackRequest {
    // Roll the system back to this checkpoint.
    checkpoint_id: Option<String>,
    // Or: roll back only these specific operations (partial rollback).
    operation_ids: Option<Vec<String>>,
}
|
|
|
|
async fn create_checkpoint(
|
|
State(state): State<SharedState>,
|
|
Json(request): Json<CreateCheckpointRequest>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
match state.rollback_system.checkpoint_manager.create_checkpoint(
|
|
request.name,
|
|
request.description,
|
|
).await {
|
|
Ok(checkpoint_id) => {
|
|
info!("Created checkpoint: {}", checkpoint_id);
|
|
Ok(Json(ApiResponse::success(checkpoint_id)))
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to create checkpoint: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn list_checkpoints(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<Checkpoint>>>, StatusCode> {
|
|
let checkpoints = state.rollback_system.checkpoint_manager.list_checkpoints().await;
|
|
Ok(Json(ApiResponse::success(checkpoints)))
|
|
}
|
|
|
|
async fn get_checkpoint(
|
|
State(state): State<SharedState>,
|
|
Path(checkpoint_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<Checkpoint>>, StatusCode> {
|
|
match state.rollback_system.checkpoint_manager.get_checkpoint(&checkpoint_id).await {
|
|
Some(checkpoint) => Ok(Json(ApiResponse::success(checkpoint))),
|
|
None => Err(StatusCode::NOT_FOUND),
|
|
}
|
|
}
|
|
|
|
async fn execute_rollback(
|
|
State(state): State<SharedState>,
|
|
Json(request): Json<RollbackRequest>,
|
|
) -> Result<Json<ApiResponse<RollbackResult>>, StatusCode> {
|
|
let result = if let Some(checkpoint_id) = request.checkpoint_id {
|
|
// Rollback to specific checkpoint
|
|
match state.rollback_system.rollback_executor.rollback_to_checkpoint(&checkpoint_id).await {
|
|
Ok(result) => result,
|
|
Err(e) => {
|
|
error!("Failed to execute rollback to checkpoint {}: {}", checkpoint_id, e);
|
|
return Err(StatusCode::INTERNAL_SERVER_ERROR);
|
|
}
|
|
}
|
|
} else if let Some(operation_ids) = request.operation_ids {
|
|
// Partial rollback of specific operations
|
|
match state.rollback_system.rollback_executor.rollback_operations(operation_ids).await {
|
|
Ok(result) => result,
|
|
Err(e) => {
|
|
error!("Failed to execute partial rollback: {}", e);
|
|
return Err(StatusCode::INTERNAL_SERVER_ERROR);
|
|
}
|
|
}
|
|
} else {
|
|
return Err(StatusCode::BAD_REQUEST);
|
|
};
|
|
|
|
if result.success {
|
|
info!("Rollback executed successfully: {} operations", result.operations_executed);
|
|
} else {
|
|
warn!("Rollback completed with errors: {}/{} operations failed",
|
|
result.operations_failed, result.operations_executed + result.operations_failed);
|
|
}
|
|
|
|
Ok(Json(ApiResponse::success(result)))
|
|
}
|
|
|
|
async fn get_rollback_statistics(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<RollbackStatistics>>, StatusCode> {
|
|
let stats = state.rollback_system.get_statistics().await;
|
|
Ok(Json(ApiResponse::success(stats)))
|
|
}
|
|
|
|
async fn restore_from_checkpoint(
|
|
State(state): State<SharedState>,
|
|
Path(checkpoint_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
match state.rollback_system.state_restorer.restore_from_checkpoint(&checkpoint_id).await {
|
|
Ok(_) => {
|
|
info!("Successfully restored state from checkpoint: {}", checkpoint_id);
|
|
Ok(Json(ApiResponse::success(format!("State restored from checkpoint {}", checkpoint_id))))
|
|
}
|
|
Err(e) => {
|
|
error!("Failed to restore from checkpoint {}: {}", checkpoint_id, e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// State management and monitoring routes
|
|
|
|
async fn get_workflow_progress(
|
|
State(state): State<SharedState>,
|
|
Path(workflow_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<ProgressInfo>>, StatusCode> {
|
|
match state.progress_tracker.get_real_time_progress(&workflow_id).await {
|
|
Some(progress) => Ok(Json(ApiResponse::success(progress))),
|
|
None => Err(StatusCode::NOT_FOUND),
|
|
}
|
|
}
|
|
|
|
async fn get_workflow_snapshots(
|
|
State(state): State<SharedState>,
|
|
Path(workflow_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<Vec<StateSnapshot>>>, StatusCode> {
|
|
let snapshots = state.state_manager.get_workflow_snapshots(&workflow_id).await;
|
|
Ok(Json(ApiResponse::success(snapshots)))
|
|
}
|
|
|
|
async fn get_system_metrics(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<SystemMetrics>>, StatusCode> {
|
|
let metrics = state.state_manager.get_system_metrics().await;
|
|
Ok(Json(ApiResponse::success(metrics)))
|
|
}
|
|
|
|
async fn get_system_health(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<SystemHealthStatus>>, StatusCode> {
|
|
let health_status = state.monitoring_system.health_monitor().get_system_health().await;
|
|
Ok(Json(ApiResponse::success(health_status)))
|
|
}
|
|
|
|
async fn get_state_statistics(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<StateManagerStatistics>>, StatusCode> {
|
|
match state.state_manager.get_statistics().await {
|
|
Ok(stats) => Ok(Json(ApiResponse::success(stats))),
|
|
Err(e) => {
|
|
error!("Failed to get state statistics: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Test environment routes
|
|
|
|
async fn create_test_environment(
|
|
State(state): State<SharedState>,
|
|
Json(request): Json<CreateTestEnvironmentRequest>,
|
|
) -> Result<Json<ApiResponse<TestEnvironmentResponse>>, StatusCode> {
|
|
match state.test_orchestrator.create_environment(request).await {
|
|
Ok(response) => Ok(Json(ApiResponse::success(response))),
|
|
Err(e) => {
|
|
error!("Failed to create test environment: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn get_test_environment(
|
|
State(state): State<SharedState>,
|
|
Path(env_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<TestEnvironment>>, StatusCode> {
|
|
match state.test_orchestrator.get_environment(&env_id).await {
|
|
Some(env) => Ok(Json(ApiResponse::success(env))),
|
|
None => Err(StatusCode::NOT_FOUND),
|
|
}
|
|
}
|
|
|
|
async fn list_test_environments(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<TestEnvironment>>>, StatusCode> {
|
|
let environments = state.test_orchestrator.list_environments().await;
|
|
Ok(Json(ApiResponse::success(environments)))
|
|
}
|
|
|
|
async fn run_environment_tests(
|
|
State(state): State<SharedState>,
|
|
Path(env_id): Path<String>,
|
|
Json(request): Json<RunTestRequest>,
|
|
) -> Result<Json<ApiResponse<Vec<TestResult>>>, StatusCode> {
|
|
match state.test_orchestrator.run_tests(&env_id, request).await {
|
|
Ok(results) => Ok(Json(ApiResponse::success(results))),
|
|
Err(e) => {
|
|
error!("Failed to run tests: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn cleanup_test_environment(
|
|
State(state): State<SharedState>,
|
|
Path(env_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
match state.test_orchestrator.cleanup_environment(&env_id, false).await {
|
|
Ok(_) => Ok(Json(ApiResponse::success(format!("Environment {} cleaned up", env_id)))),
|
|
Err(e) => {
|
|
error!("Failed to cleanup environment: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn get_environment_logs(
|
|
State(state): State<SharedState>,
|
|
Path(env_id): Path<String>,
|
|
) -> Result<Json<ApiResponse<Vec<String>>>, StatusCode> {
|
|
match state.test_orchestrator.get_environment_logs(&env_id).await {
|
|
Ok(logs) => Ok(Json(ApiResponse::success(logs))),
|
|
Err(e) => {
|
|
error!("Failed to get environment logs: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// DNS Management Routes
|
|
|
|
async fn list_dns_records(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::dns::DnsRecord>>>, StatusCode> {
|
|
match state.dns_manager.list_records().await {
|
|
Ok(records) => Ok(Json(ApiResponse::success(records))),
|
|
Err(e) => {
|
|
error!("Failed to list DNS records: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Extension Management Routes
|
|
|
|
async fn list_loaded_extensions(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::extensions::Extension>>>, StatusCode> {
|
|
let extensions = state.extension_manager.list_loaded_extensions().await;
|
|
Ok(Json(ApiResponse::success(extensions)))
|
|
}
|
|
|
|
/// Body for POST /api/v1/extensions/reload.
#[derive(Deserialize)]
struct ReloadExtensionRequest {
    // One of "provider", "taskserv", or "cluster" (validated in the handler).
    extension_type: String,
    // Name of the extension to reload.
    name: String,
}
|
|
|
|
async fn reload_extension(
|
|
State(state): State<SharedState>,
|
|
Json(request): Json<ReloadExtensionRequest>,
|
|
) -> Result<Json<ApiResponse<String>>, StatusCode> {
|
|
use provisioning_orchestrator::extensions::ExtensionType;
|
|
|
|
let ext_type = match request.extension_type.as_str() {
|
|
"provider" => ExtensionType::Provider,
|
|
"taskserv" => ExtensionType::Taskserv,
|
|
"cluster" => ExtensionType::Cluster,
|
|
_ => return Err(StatusCode::BAD_REQUEST),
|
|
};
|
|
|
|
match state.extension_manager.reload_extension(ext_type, request.name.clone()).await {
|
|
Ok(_) => Ok(Json(ApiResponse::success(format!("Extension {} reloaded", request.name)))),
|
|
Err(e) => {
|
|
error!("Failed to reload extension: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// OCI Registry Routes
|
|
|
|
/// Body for POST /api/v1/oci/artifacts.
#[derive(Deserialize)]
struct ListOciArtifactsRequest {
    // Registry namespace to enumerate.
    namespace: String,
}
|
|
|
|
async fn list_oci_artifacts(
|
|
State(state): State<SharedState>,
|
|
Json(request): Json<ListOciArtifactsRequest>,
|
|
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::oci::OciArtifact>>>, StatusCode> {
|
|
match state.oci_manager.list_oci_artifacts(&request.namespace).await {
|
|
Ok(artifacts) => Ok(Json(ApiResponse::success(artifacts))),
|
|
Err(e) => {
|
|
error!("Failed to list OCI artifacts: {}", e);
|
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
|
}
|
|
}
|
|
}
|
|
|
|
// Service Orchestration Routes
|
|
|
|
async fn list_services(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<provisioning_orchestrator::services::Service>>>, StatusCode> {
|
|
let services = state.service_orchestrator.list_services().await;
|
|
Ok(Json(ApiResponse::success(services)))
|
|
}
|
|
|
|
/// Per-service entry returned by GET /api/v1/services/status.
#[derive(Serialize)]
struct ServiceStatusResponse {
    // Service name as reported by the service orchestrator.
    name: String,
    // Current status of that service.
    status: provisioning_orchestrator::services::ServiceStatus,
}
|
|
|
|
async fn get_services_status(
|
|
State(state): State<SharedState>,
|
|
) -> Result<Json<ApiResponse<Vec<ServiceStatusResponse>>>, StatusCode> {
|
|
let services = state.service_orchestrator.list_services().await;
|
|
let mut statuses = Vec::new();
|
|
|
|
for service in services {
|
|
if let Ok(status) = state.service_orchestrator.get_service_status(&service.name).await {
|
|
statuses.push(ServiceStatusResponse {
|
|
name: service.name,
|
|
status,
|
|
});
|
|
}
|
|
}
|
|
|
|
Ok(Json(ApiResponse::success(statuses)))
|
|
}
|
|
|
|
// Task processor - runs tasks from the queue
|
|
/// Long-running background loop: dequeue tasks, execute them via Nushell,
/// persist the outcome, and emit metrics plus monitoring events.
///
/// Never returns; intended to be `tokio::spawn`ed once at startup.
async fn process_tasks(state: SharedState) {
    info!("Starting task processor");

    let metrics_collector = state.monitoring_system.metrics_collector();

    loop {
        match state.task_storage.dequeue().await {
            Ok(Some(mut task)) => {
                // Track task dequeue
                metrics_collector.increment_task_counter();
                metrics_collector.record_storage_operation(true);

                // Mark the task Running before execution; if that write fails,
                // skip the task rather than running it in an unknown state.
                if let Err(e) = state.task_storage.update_task_status(&task.id, TaskStatus::Running).await {
                    error!("Failed to update task status: {}", e);
                    metrics_collector.record_storage_operation(false);
                    continue;
                }

                info!("Processing task: {} ({})", task.id, task.name);

                // Time the execution for the completion metric below.
                let task_start = std::time::Instant::now();
                let result = state.execute_nushell_command(&task.command, &task.args).await;
                let task_duration = task_start.elapsed();

                match result {
                    Ok(output) => {
                        info!("Task {} completed successfully", task.id);

                        task.output = Some(output);
                        task.status = TaskStatus::Completed;
                        task.completed_at = Some(chrono::Utc::now());

                        // Record metrics
                        metrics_collector.record_task_completion(task_duration.as_millis() as u64);

                        // Publish monitoring event
                        let event = MonitoringEvent {
                            event_type: MonitoringEventType::TaskStatusChanged,
                            timestamp: chrono::Utc::now(),
                            // Serialization failure degrades to a default value
                            // rather than dropping the event.
                            data: serde_json::to_value(&task).unwrap_or_default(),
                            metadata: {
                                let mut meta = std::collections::HashMap::new();
                                meta.insert("task_id".to_string(), task.id.clone());
                                meta.insert("status".to_string(), "completed".to_string());
                                meta.insert("duration_ms".to_string(), task_duration.as_millis().to_string());
                                meta
                            },
                        };

                        // Event delivery is best-effort: a publish failure is
                        // logged but does not fail the task.
                        if let Err(e) = state.monitoring_system.publish_event(event).await {
                            error!("Failed to publish monitoring event: {}", e);
                        }

                        if let Err(e) = state.task_storage.update_task(task).await {
                            error!("Failed to update task: {}", e);
                            metrics_collector.record_storage_operation(false);
                        } else {
                            metrics_collector.record_storage_operation(true);
                        }
                    }
                    Err(e) => {
                        error!("Task {} failed: {}", task.id, e);

                        task.error = Some(e.to_string());
                        task.status = TaskStatus::Failed;
                        task.completed_at = Some(chrono::Utc::now());

                        // Record metrics
                        metrics_collector.record_task_failure();

                        // Publish monitoring event
                        let event = MonitoringEvent {
                            event_type: MonitoringEventType::TaskStatusChanged,
                            timestamp: chrono::Utc::now(),
                            data: serde_json::to_value(&task).unwrap_or_default(),
                            metadata: {
                                let mut meta = std::collections::HashMap::new();
                                meta.insert("task_id".to_string(), task.id.clone());
                                meta.insert("status".to_string(), "failed".to_string());
                                meta.insert("error".to_string(), e.to_string());
                                meta
                            },
                        };

                        if let Err(e) = state.monitoring_system.publish_event(event).await {
                            error!("Failed to publish monitoring event: {}", e);
                        }

                        // Cloned because `task.id` is still needed for the
                        // requeue call below.
                        if let Err(e) = state.task_storage.update_task(task.clone()).await {
                            error!("Failed to update task: {}", e);
                            metrics_collector.record_storage_operation(false);
                        } else {
                            metrics_collector.record_storage_operation(true);
                        }

                        // Try to requeue for retry
                        // NOTE(review): presumably this resets the task for a
                        // retry attempt — confirm retry semantics in the
                        // storage implementation.
                        if let Err(e) = state.task_storage.requeue_failed_task(&task.id).await {
                            error!("Failed to requeue task: {}", e);
                            metrics_collector.record_storage_operation(false);
                        }
                    }
                }
            }
            Ok(None) => {
                // No tasks in queue, sleep
                tokio::time::sleep(tokio::time::Duration::from_secs(1)).await;
            }
            Err(e) => {
                // Storage error while dequeuing: back off longer before retrying.
                error!("Error dequeuing task: {}", e);
                metrics_collector.record_storage_operation(false);
                tokio::time::sleep(tokio::time::Duration::from_secs(5)).await;
            }
        }
    }
}
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
tracing_subscriber::fmt::init();
|
|
|
|
let args = Args::parse();
|
|
let port = args.port;
|
|
|
|
info!("Starting provisioning orchestrator on port {}", port);
|
|
|
|
let state = Arc::new(AppState::new(args).await?);
|
|
|
|
// Start task processor
|
|
let processor_state = state.clone();
|
|
tokio::spawn(async move {
|
|
process_tasks(processor_state).await;
|
|
});
|
|
|
|
let app = Router::new()
|
|
.route("/health", get(health_check))
|
|
.route("/tasks", get(list_tasks))
|
|
.route("/tasks/{id}", get(get_task_status))
|
|
.route("/workflows/servers/create", post(create_server_workflow))
|
|
.route("/workflows/taskserv/create", post(create_taskserv_workflow))
|
|
.route("/workflows/cluster/create", post(create_cluster_workflow))
|
|
// Batch operation routes
|
|
.route("/batch/execute", post(execute_batch_operation))
|
|
.route("/batch/operations", get(list_batch_operations))
|
|
.route("/batch/operations/{id}", get(get_batch_operation_status))
|
|
.route("/batch/operations/{id}/cancel", post(cancel_batch_operation))
|
|
// State management routes
|
|
.route("/state/workflows/{id}/progress", get(get_workflow_progress))
|
|
.route("/state/workflows/{id}/snapshots", get(get_workflow_snapshots))
|
|
.route("/state/system/metrics", get(get_system_metrics))
|
|
.route("/state/system/health", get(get_system_health))
|
|
.route("/state/statistics", get(get_state_statistics))
|
|
// Rollback and recovery routes
|
|
.route("/rollback/checkpoints", post(create_checkpoint))
|
|
.route("/rollback/checkpoints", get(list_checkpoints))
|
|
.route("/rollback/checkpoints/{id}", get(get_checkpoint))
|
|
.route("/rollback/execute", post(execute_rollback))
|
|
.route("/rollback/restore/{id}", post(restore_from_checkpoint))
|
|
.route("/rollback/statistics", get(get_rollback_statistics))
|
|
// Test environment routes
|
|
.route("/test/environments/create", post(create_test_environment))
|
|
.route("/test/environments", get(list_test_environments))
|
|
.route("/test/environments/{id}", get(get_test_environment))
|
|
.route("/test/environments/{id}/run", post(run_environment_tests))
|
|
.route("/test/environments/{id}", axum::routing::delete(cleanup_test_environment))
|
|
.route("/test/environments/{id}/logs", get(get_environment_logs))
|
|
// DNS integration routes
|
|
.route("/api/v1/dns/records", get(list_dns_records))
|
|
// Extension loading routes
|
|
.route("/api/v1/extensions/loaded", get(list_loaded_extensions))
|
|
.route("/api/v1/extensions/reload", post(reload_extension))
|
|
// OCI registry routes
|
|
.route("/api/v1/oci/artifacts", post(list_oci_artifacts))
|
|
// Service orchestration routes
|
|
.route("/api/v1/services/list", get(list_services))
|
|
.route("/api/v1/services/status", get(get_services_status))
|
|
// Merge monitoring routes (includes /metrics, /ws, /events)
|
|
.merge(state.monitoring_system.create_routes())
|
|
.layer(CorsLayer::permissive())
|
|
.with_state(state);
|
|
|
|
let listener = tokio::net::TcpListener::bind(format!("0.0.0.0:{}", port))
|
|
.await
|
|
.context("Failed to bind to address")?;
|
|
|
|
info!("Orchestrator listening on port {}", port);
|
|
|
|
axum::serve(listener, app)
|
|
.await
|
|
.context("Server error")?;
|
|
|
|
Ok(())
|
|
}
|
|
|