From 7a95a3d7dec122dd2f40433db8b72d6a4144895e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jesu=CC=81s=20Pe=CC=81rez?= Date: Wed, 14 Jan 2026 03:20:59 +0000 Subject: [PATCH] chore: fix docs --- .pre-commit-config.yaml | 38 +- .typedialog/README.md | 372 +------- Cargo.toml | 2 +- README.md | 586 +------------ config/README.md | 110 +-- config/examples/README.md | 202 +---- crates/control-center-ui/README.md | 369 +------- crates/control-center-ui/REFERENCE.md | 34 +- crates/control-center-ui/auth-system.md | 382 +------- .../upstream-dependency-issue.md | 146 +--- crates/control-center/README.md | 372 +------- .../docs/security-considerations.md | 711 +-------------- crates/control-center/src/kms/README.md | 454 +--------- crates/control-center/web/README.md | 181 +--- crates/extension-registry/API.md | 587 +------------ crates/extension-registry/README.md | 636 +------------- crates/mcp-server/README.md | 136 +-- crates/orchestrator/README.md | 516 +---------- crates/orchestrator/docs/dns-integration.md | 222 +---- crates/orchestrator/docs/extension-loading.md | 377 +------- crates/orchestrator/docs/oci-integration.md | 418 +-------- .../docs/service-orchestration.md | 468 +--------- .../orchestrator/docs/ssh-key-management.md | 526 +---------- crates/orchestrator/docs/storage-backends.md | 386 +-------- .../orchestrator/scripts/migrate-storage.nu | 0 crates/orchestrator/wrks/readme-testing.md | 393 +-------- crates/vault-service/README.md | 468 +--------- crates/vault-service/scripts/start-kms.nu | 0 docs/README.md | 10 +- docs/deployment/deployment-guide.md | 758 +--------------- docs/deployment/guide.md | 469 +--------- docs/deployment/known-issues.md | 98 +-- docs/guides/quick-start.md | 283 +----- infrastructure/README.md | 19 +- infrastructure/oci-registry/README.md | 818 +----------------- scripts/deploy-platform.nu | 0 scripts/generate-infrastructure-configs.nu | 0 scripts/health-check.nu | 0 scripts/run-docker.nu | 0 scripts/run-native.nu | 0 scripts/start-provisioning-daemon.nu | 0 scripts/validate-configs.nu | 0 scripts/validate-infrastructure.nu | 0 43 files changed, 60 insertions(+), 11487 deletions(-) mode change 100755 => 100644 crates/orchestrator/scripts/migrate-storage.nu mode change 100755 => 100644 crates/vault-service/scripts/start-kms.nu mode change 100755 => 100644 scripts/deploy-platform.nu mode change 100755 => 100644 scripts/generate-infrastructure-configs.nu mode change 100755 => 100644 scripts/health-check.nu mode change 100755 => 100644 scripts/run-docker.nu mode change 100755 => 100644 scripts/run-native.nu mode change 100755 => 100644 scripts/start-provisioning-daemon.nu mode change 100755 => 100644 scripts/validate-configs.nu mode change 100755 => 100644 scripts/validate-infrastructure.nu diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index b1fdc1e..5287958 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -41,6 +41,22 @@ repos: # pass_filenames: false # stages: [pre-push] + # ============================================================================ + # Nickel Hooks (ACTIVE) + # ============================================================================ + - repo: local + hooks: + - id: nickel-typecheck + name: Nickel type checking + entry: >- + bash -c 'export NICKEL_IMPORT_PATH="../:."; for f in $(git diff --cached --name-only --diff-filter=ACM | grep "\.ncl$"); do + echo "Checking: $f"; nickel typecheck "$f" || exit 1; done' + language: system + types: [file] + files: \.ncl$ + pass_filenames: false + stages: 
[pre-commit] + # ============================================================================ # Markdown Hooks (ACTIVE) # ============================================================================ @@ -53,16 +69,18 @@ repos: types: [markdown] stages: [pre-commit] - # NOTE: Disabled - markdownlint-cli2 already catches syntax issues - # This script is redundant and causing false positives - # - id: check-malformed-fences - # name: Check malformed closing fences - # entry: bash -c 'cd .. && nu scripts/check-malformed-fences.nu $(git diff --cached --name-only --diff-filter=ACM | grep "\.md$" | grep -v ".coder/" | grep -v ".claude/" | grep -v "old_config/" | tr "\n" " ")' - # language: system - # types: [markdown] - # pass_filenames: false - # stages: [pre-commit] - # exclude: ^\.coder/|^\.claude/|^old_config/ + # CRITICAL: markdownlint-cli2 MD040 only checks opening fences for language. + # It does NOT catch malformed closing fences (e.g., ```plaintext) - CommonMark violation. + # This hook is ESSENTIAL to prevent malformed closing fences from entering the repo. + # See: .markdownlint-cli2.jsonc line 22-24 for details. + - id: check-malformed-fences + name: Check malformed closing fences (CommonMark) + entry: bash -c 'nu ../scripts/check-malformed-fences.nu $(git diff --cached --name-only --diff-filter=ACM | grep "\.md$" | grep -v ".coder/" | grep -v ".claude/" | grep -v "old_config/" | tr "\n" " ")' + language: system + types: [markdown] + pass_filenames: false + stages: [pre-commit] + exclude: ^\.coder/|^\.claude/|^old_config/ # ============================================================================ # General Pre-commit Hooks diff --git a/.typedialog/README.md b/.typedialog/README.md index 5b322ac..bba9a49 100644 --- a/.typedialog/README.md +++ b/.typedialog/README.md @@ -1,371 +1 @@ -# TypeDialog Integration - -TypeDialog enables interactive form-based configuration from Nickel schemas. - -## Status - -- **TypeDialog Binary**: Not yet installed (planned: `typedialog` command) -- **TypeDialog Forms**: Created and ready (setup wizard, auth login, MFA enrollment) -- **Bash Wrappers**: Implemented to handle TTY input properly -- **ForminQuire**: DEPRECATED - Archived to `.coder/archive/forminquire/` - -## Directory Structure - -```plaintext -.typedialog/ -└── provisioning/platform/ - ├── README.md # This file - ├── forms/ # Form definitions (to be generated) - │ ├── orchestrator.form.toml - │ ├── control-center.form.toml - │ └── ... - ├── templates/ # Jinja2 templates for schema rendering - │ └── service-form.template.j2 - ├── schemas/ # Symlink to Nickel schemas - │ └── platform/schemas/ → ../../../schemas/platform/schemas/ - └── constraints/ # Validation constraints - └── constraints.toml # Shared validation rules -```text - -## How TypeDialog Would Work - -### 1. Form Generation from Schemas - -```bash -# Auto-generate form from Nickel schema -typedialog generate-form --schema orchestrator.ncl \ - --output forms/orchestrator.form.toml -```text - -### 2. Interactive Configuration - -```bash -# Run interactive form -typedialog run-form --form forms/orchestrator.form.toml \ - --output orchestrator-configured.ncl -```text - -### 3. 
Validation - -```bash -# Validate user input against schema -typedialog validate --form forms/orchestrator.form.toml \ - --data user-config.ncl -```text - -## Current Status: TypeDialog Forms Ready - -TypeDialog forms have been created and are ready to use: - -**Form Locations**: -- Setup wizard: `provisioning/.typedialog/core/forms/setup-wizard.toml` -- Auth login: `provisioning/.typedialog/core/forms/auth-login.toml` -- MFA enrollment: `provisioning/.typedialog/core/forms/mfa-enroll.toml` - -**Bash Wrappers** (TTY-safe, handle input properly): -- `provisioning/core/shlib/setup-wizard-tty.sh` -- `provisioning/core/shlib/auth-login-tty.sh` -- `provisioning/core/shlib/mfa-enroll-tty.sh` - -**Usage Pattern**: -1. Bash wrapper calls TypeDialog (handles TTY input) -2. TypeDialog generates Nickel config file -3. Nushell scripts read the generated config (no input issues) - -**Example**: - -```bash -# Run TypeDialog setup wizard -./provisioning/core/shlib/setup-wizard-tty.sh - -# Nushell reads the generated config -let config = (open provisioning/.typedialog/core/generated/setup-wizard-result.json | from json) -```text - -**Note**: ForminQuire (Jinja2-based forms) has been archived to `provisioning/.coder/archive/forminquire/` and is no longer in use. - -## Integration Plan (When TypeDialog Available) - -### Step 1: Install TypeDialog - -```bash -cargo install --path /Users/Akasha/Development/typedialog -typedialog --version -```text - -### Step 2: Generate Forms from Schemas - -```bash -# Batch generate all forms -for schema in provisioning/schemas/platform/schemas/*.ncl; do - service=$(basename $schema .ncl) - typedialog generate-form \ - --schema $schema \ - --output provisioning/platform/.typedialog/forms/${service}.form.toml -done -```text - -### Step 3: Create Setup Wizard - -```bash -# Unified setup workflow -provisioning setup-platform \ - --mode solo|multiuser|enterprise \ - --provider docker|kubernetes \ - --interactive # Uses TypeDialog forms -```text - -### Step 4: Update Platform Setup Script - -```bash -# provisioning/platform/scripts/setup-platform-config.sh - -if command -v typedialog &> /dev/null; then - # TypeDialog is installed - use bash wrapper for proper TTY handling - ./provisioning/core/shlib/setup-wizard-tty.sh - - # Read generated JSON config - # Nushell scripts can now read the config without input issues -else - # Fallback to basic prompts - echo "TypeDialog not available. Using basic interactive prompts..." 
- # Nushell wizard with basic input prompts - nu -c "use provisioning/core/nulib/lib_provisioning/setup/wizard.nu *; run-setup-wizard" -fi -```text - -## Form Definition Example - -```toml -# provisioning/platform/.typedialog/forms/orchestrator.form.toml -[metadata] -name = "Orchestrator Configuration" -description = "Configure the Orchestrator service" -version = "1.0.0" -schema = "orchestrator.ncl" - -[fields.mode] -type = "enum" -label = "Deployment Mode" -description = "Select deployment mode: solo, multiuser, or enterprise" -options = ["solo", "multiuser", "enterprise"] -default = "solo" -required = true - -[fields.server.port] -type = "number" -label = "Server Port" -description = "HTTP server port (1-65535)" -min = 1 -max = 65535 -default = 8080 -required = true - -[fields.database.host] -type = "string" -label = "Database Host" -description = "PostgreSQL host" -default = "localhost" -required = true - -[fields.logging.level] -type = "enum" -label = "Logging Level" -options = ["debug", "info", "warning", "error"] -default = "info" -required = false -```text - -## Validation Constraints - -```toml -# provisioning/platform/.typedialog/constraints/constraints.toml - -[orchestrator] -mode = ["solo", "multiuser", "enterprise"] -port = "range(1, 65535)" -database_pool_size = "range(1, 100)" -memory = "pattern(^\\d+[MG]B$)" - -[control-center] -port = "range(1, 65535)" -replicas = "range(1, 10)" - -[nginx] -worker_processes = "range(1, 32)" -worker_connections = "range(1, 65536)" -```text - -## Workflow: Setup to Deployment - -```plaintext -1. User runs setup command - ↓ -2. TypeDialog displays form - ↓ -3. User fills form with validation - ↓ -4. Form data → Nickel config - ↓ -5. Nickel config → TOML (via ConfigLoader) - ↓ -6. Service reads TOML config - ↓ -7. 
Service starts with configured values -```text - -## Benefits of TypeDialog Integration - -- ✅ **Type-safe forms** - Generated from Nickel schemas -- ✅ **Real-time validation** - Enforce constraints as user types -- ✅ **Progressive disclosure** - Show advanced options only when needed -- ✅ **Consistent UX** - Same forms across platforms (CLI, Web, TUI) -- ✅ **Auto-generated** - Forms stay in sync with schemas automatically -- ✅ **TTY handling** - Bash wrappers solve Nushell input stack issues -- ✅ **Graceful fallback** - Falls back to basic prompts if TypeDialog unavailable - -## Testing TypeDialog Forms - -```bash -# Validate form structure -typedialog check-form provisioning/platform/.typedialog/forms/orchestrator.form.toml - -# Run form with test data -typedialog run-form \ - --form provisioning/platform/.typedialog/forms/orchestrator.form.toml \ - --test-mode # Automated validation - -# Generate sample output -typedialog generate-sample \ - --form provisioning/platform/.typedialog/forms/orchestrator.form.toml \ - --output /tmp/orchestrator-sample.ncl -```text - -## Migration Path - -### Phase A: Legacy (DEPRECATED) - -```plaintext -FormInquire (Jinja2) → Nushell processing → TOML config -Status: ARCHIVED to .coder/archive/forminquire/ -```text - -### Phase B: Current Implementation - -```plaintext -Bash wrapper → TypeDialog (TTY input) → Nickel config → JSON export → Nushell reads JSON -Status: IMPLEMENTED with forms ready -```text - -### Phase C: TypeDialog Binary Available (Future) - -```plaintext -TypeDialog binary installed → Full nickel-roundtrip workflow → Auto-sync with schemas -Status: PLANNED - awaiting TypeDialog binary release -```text - -### Phase D: Unified (Future) - -```plaintext -ConfigLoader discovers config → Service reads → TypeDialog updates UI -```text - -## Integration with Infrastructure Schemas - -TypeDialog forms work seamlessly with infrastructure schemas: - -### Infrastructure Configuration Workflow - -**1. Define Infrastructure Schemas** (completed) -- Location: `provisioning/schemas/infrastructure/` -- 6 schemas: docker-compose, kubernetes, nginx, prometheus, systemd, oci-registry -- All validated with `nickel typecheck` - -**2. Generate Infrastructure Configs** (completed) -- Script: `provisioning/platform/scripts/generate-infrastructure-configs.nu` -- Supports: solo, multiuser, enterprise, cicd modes -- Formats: YAML, JSON, conf, service - -**3. Validate Generated Configs** (completed) -- Script: `provisioning/platform/scripts/validate-infrastructure.nu` -- Tools: docker-compose config, kubectl apply --dry-run, nginx -t, promtool check -- Examples: `examples-solo-deployment.ncl`, `examples-enterprise-deployment.ncl` - -**4. 
Interactive Setup with Forms** (TypeDialog ready) -- Script: `provisioning/platform/scripts/setup-with-forms.sh` -- Bash wrappers: `provisioning/core/shlib/*-tty.sh` (handle TTY input) -- Forms ready: setup-wizard, auth-login, mfa-enroll -- Fallback: Basic Nushell prompts if TypeDialog unavailable - -### Current Status: Full Infrastructure Support - -| Component | Status | Details | -| ----------- | -------- | --------- | -| **Schemas** | ✅ Complete | 6 infrastructure schemas (1,577 lines) | -| **Examples** | ✅ Complete | 2 deployment examples (solo, enterprise) | -| **Generation Script** | ✅ Complete | Auto-generates configs for all modes | -| **Validation Script** | ✅ Complete | Validates Docker, K8s, Nginx, Prometheus | -| **Setup Wizard** | ✅ Complete | TypeDialog forms + bash wrappers ready | -| **TypeDialog Integration** | ⏳ Pending | Structure ready, awaiting binary | - -### Validated Examples - -**Solo Deployment** (`examples-solo-deployment.ncl`): -- ✅ Type-checks without errors -- ✅ Exports to 198 lines of JSON -- ✅ 5 Docker Compose services -- ✅ Resource limits: 1.0-4.0 CPU, 256M-1024M RAM -- ✅ Prometheus: 4 scrape jobs -- ✅ Registry backend: Zot (filesystem) - -**Enterprise Deployment** (`examples-enterprise-deployment.ncl`): -- ✅ Type-checks without errors -- ✅ Exports to 313 lines of JSON -- ✅ 6 Docker Compose services with HA -- ✅ Resource limits: 2.0-4.0 CPU, 512M-4096M RAM -- ✅ Prometheus: 7 scrape jobs with remote storage -- ✅ Registry backend: Harbor (S3 distributed) - -### Test Infrastructure Generation - -```bash -# Export solo infrastructure -nickel export --format json provisioning/schemas/infrastructure/examples-solo-deployment.ncl > /tmp/solo.json - -# Validate JSON -jq . /tmp/solo.json - -# Check Docker Compose services -jq '.docker_compose_services | keys' /tmp/solo.json - -# Compare resource allocation (solo vs enterprise) -jq '.docker_compose_services.orchestrator.deploy.resources.limits' /tmp/solo.json -jq '.docker_compose_services.orchestrator.deploy.resources.limits' /tmp/enterprise.json -```text - -## Next Steps - -1. **Infrastructure Setup** (available now): - - Generate infrastructure configs with automation scripts - - Validate with format-specific tools - - Use interactive setup wizard for configuration - -2. **When TypeDialog binary becomes available**: - - Install TypeDialog binary - - Forms already created and ready to use - - Bash wrappers handle TTY input (no Nushell stack issues) - - Full nickel-roundtrip workflow will be enabled - -3. 
**Production Deployment**: - - Use validated infrastructure configs - - Deploy with ConfigLoader + infrastructure schemas - - Monitor via Prometheus (auto-generated from schemas) - ---- - -**Version**: 1.2.0 (TypeDialog Forms + Bash Wrappers) -**Status**: TypeDialog forms ready with bash wrappers; Awaiting TypeDialog Binary -**Last Updated**: 2025-01-09 -**ForminQuire Status**: DEPRECATED - Archived to .coder/archive/forminquire/ -**Fallback**: Basic Nushell prompts if TypeDialog unavailable -**Tested**: Infrastructure examples (solo + enterprise) validated +# TypeDialog Integration\n\nTypeDialog enables interactive form-based configuration from Nickel schemas.\n\n## Status\n\n- **TypeDialog Binary**: Not yet installed (planned: `typedialog` command)\n- **TypeDialog Forms**: Created and ready (setup wizard, auth login, MFA enrollment)\n- **Bash Wrappers**: Implemented to handle TTY input properly\n- **ForminQuire**: DEPRECATED - Archived to `.coder/archive/forminquire/`\n\n## Directory Structure\n\n```\n.typedialog/\n└── provisioning/platform/\n ├── README.md # This file\n ├── forms/ # Form definitions (to be generated)\n │ ├── orchestrator.form.toml\n │ ├── control-center.form.toml\n │ └── ...\n ├── templates/ # Jinja2 templates for schema rendering\n │ └── service-form.template.j2\n ├── schemas/ # Symlink to Nickel schemas\n │ └── platform/schemas/ → ../../../schemas/platform/schemas/\n └── constraints/ # Validation constraints\n └── constraints.toml # Shared validation rules\n```\n\n## How TypeDialog Would Work\n\n### 1. Form Generation from Schemas\n\n```\n# Auto-generate form from Nickel schema\ntypedialog generate-form --schema orchestrator.ncl \\n --output forms/orchestrator.form.toml\n```\n\n### 2. Interactive Configuration\n\n```\n# Run interactive form\ntypedialog run-form --form forms/orchestrator.form.toml \\n --output orchestrator-configured.ncl\n```\n\n### 3. Validation\n\n```\n# Validate user input against schema\ntypedialog validate --form forms/orchestrator.form.toml \\n --data user-config.ncl\n```\n\n## Current Status: TypeDialog Forms Ready\n\nTypeDialog forms have been created and are ready to use:\n\n**Form Locations**:\n- Setup wizard: `provisioning/.typedialog/core/forms/setup-wizard.toml`\n- Auth login: `provisioning/.typedialog/core/forms/auth-login.toml`\n- MFA enrollment: `provisioning/.typedialog/core/forms/mfa-enroll.toml`\n\n**Bash Wrappers** (TTY-safe, handle input properly):\n- `provisioning/core/shlib/setup-wizard-tty.sh`\n- `provisioning/core/shlib/auth-login-tty.sh`\n- `provisioning/core/shlib/mfa-enroll-tty.sh`\n\n**Usage Pattern**:\n1. Bash wrapper calls TypeDialog (handles TTY input)\n2. TypeDialog generates Nickel config file\n3. 
Nushell scripts read the generated config (no input issues)\n\n**Example**:\n\n```\n# Run TypeDialog setup wizard\n./provisioning/core/shlib/setup-wizard-tty.sh\n\n# Nushell reads the generated config\nlet config = (open provisioning/.typedialog/core/generated/setup-wizard-result.json | from json)\n```\n\n**Note**: ForminQuire (Jinja2-based forms) has been archived to `provisioning/.coder/archive/forminquire/` and is no longer in use.\n\n## Integration Plan (When TypeDialog Available)\n\n### Step 1: Install TypeDialog\n\n```\ncargo install --path /Users/Akasha/Development/typedialog\ntypedialog --version\n```\n\n### Step 2: Generate Forms from Schemas\n\n```\n# Batch generate all forms\nfor schema in provisioning/schemas/platform/schemas/*.ncl; do\n service=$(basename $schema .ncl)\n typedialog generate-form \\n --schema $schema \\n --output provisioning/platform/.typedialog/forms/${service}.form.toml\ndone\n```\n\n### Step 3: Create Setup Wizard\n\n```\n# Unified setup workflow\nprovisioning setup-platform \\n --mode solo|multiuser|enterprise \\n --provider docker|kubernetes \\n --interactive # Uses TypeDialog forms\n```\n\n### Step 4: Update Platform Setup Script\n\n```\n# provisioning/platform/scripts/setup-platform-config.sh\n\nif command -v typedialog &> /dev/null; then\n # TypeDialog is installed - use bash wrapper for proper TTY handling\n ./provisioning/core/shlib/setup-wizard-tty.sh\n\n # Read generated JSON config\n # Nushell scripts can now read the config without input issues\nelse\n # Fallback to basic prompts\n echo "TypeDialog not available. Using basic interactive prompts..."\n # Nushell wizard with basic input prompts\n nu -c "use provisioning/core/nulib/lib_provisioning/setup/wizard.nu *; run-setup-wizard"\nfi\n```\n\n## Form Definition Example\n\n```\n# provisioning/platform/.typedialog/forms/orchestrator.form.toml\n[metadata]\nname = "Orchestrator Configuration"\ndescription = "Configure the Orchestrator service"\nversion = "1.0.0"\nschema = "orchestrator.ncl"\n\n[fields.mode]\ntype = "enum"\nlabel = "Deployment Mode"\ndescription = "Select deployment mode: solo, multiuser, or enterprise"\noptions = ["solo", "multiuser", "enterprise"]\ndefault = "solo"\nrequired = true\n\n[fields.server.port]\ntype = "number"\nlabel = "Server Port"\ndescription = "HTTP server port (1-65535)"\nmin = 1\nmax = 65535\ndefault = 8080\nrequired = true\n\n[fields.database.host]\ntype = "string"\nlabel = "Database Host"\ndescription = "PostgreSQL host"\ndefault = "localhost"\nrequired = true\n\n[fields.logging.level]\ntype = "enum"\nlabel = "Logging Level"\noptions = ["debug", "info", "warning", "error"]\ndefault = "info"\nrequired = false\n```\n\n## Validation Constraints\n\n```\n# provisioning/platform/.typedialog/constraints/constraints.toml\n\n[orchestrator]\nmode = ["solo", "multiuser", "enterprise"]\nport = "range(1, 65535)"\ndatabase_pool_size = "range(1, 100)"\nmemory = "pattern(^\\d+[MG]B$)"\n\n[control-center]\nport = "range(1, 65535)"\nreplicas = "range(1, 10)"\n\n[nginx]\nworker_processes = "range(1, 32)"\nworker_connections = "range(1, 65536)"\n```\n\n## Workflow: Setup to Deployment\n\n```\n1. User runs setup command\n ↓\n2. TypeDialog displays form\n ↓\n3. User fills form with validation\n ↓\n4. Form data → Nickel config\n ↓\n5. Nickel config → TOML (via ConfigLoader)\n ↓\n6. Service reads TOML config\n ↓\n7. 
Service starts with configured values\n```\n\n## Benefits of TypeDialog Integration\n\n- ✅ **Type-safe forms** - Generated from Nickel schemas\n- ✅ **Real-time validation** - Enforce constraints as user types\n- ✅ **Progressive disclosure** - Show advanced options only when needed\n- ✅ **Consistent UX** - Same forms across platforms (CLI, Web, TUI)\n- ✅ **Auto-generated** - Forms stay in sync with schemas automatically\n- ✅ **TTY handling** - Bash wrappers solve Nushell input stack issues\n- ✅ **Graceful fallback** - Falls back to basic prompts if TypeDialog unavailable\n\n## Testing TypeDialog Forms\n\n```\n# Validate form structure\ntypedialog check-form provisioning/platform/.typedialog/forms/orchestrator.form.toml\n\n# Run form with test data\ntypedialog run-form \\n --form provisioning/platform/.typedialog/forms/orchestrator.form.toml \\n --test-mode # Automated validation\n\n# Generate sample output\ntypedialog generate-sample \\n --form provisioning/platform/.typedialog/forms/orchestrator.form.toml \\n --output /tmp/orchestrator-sample.ncl\n```\n\n## Migration Path\n\n### Phase A: Legacy (DEPRECATED)\n\n```\nFormInquire (Jinja2) → Nushell processing → TOML config\nStatus: ARCHIVED to .coder/archive/forminquire/\n```\n\n### Phase B: Current Implementation\n\n```\nBash wrapper → TypeDialog (TTY input) → Nickel config → JSON export → Nushell reads JSON\nStatus: IMPLEMENTED with forms ready\n```\n\n### Phase C: TypeDialog Binary Available (Future)\n\n```\nTypeDialog binary installed → Full nickel-roundtrip workflow → Auto-sync with schemas\nStatus: PLANNED - awaiting TypeDialog binary release\n```\n\n### Phase D: Unified (Future)\n\n```\nConfigLoader discovers config → Service reads → TypeDialog updates UI\n```\n\n## Integration with Infrastructure Schemas\n\nTypeDialog forms work seamlessly with infrastructure schemas:\n\n### Infrastructure Configuration Workflow\n\n**1. Define Infrastructure Schemas** (completed)\n- Location: `provisioning/schemas/infrastructure/`\n- 6 schemas: docker-compose, kubernetes, nginx, prometheus, systemd, oci-registry\n- All validated with `nickel typecheck`\n\n**2. Generate Infrastructure Configs** (completed)\n- Script: `provisioning/platform/scripts/generate-infrastructure-configs.nu`\n- Supports: solo, multiuser, enterprise, cicd modes\n- Formats: YAML, JSON, conf, service\n\n**3. Validate Generated Configs** (completed)\n- Script: `provisioning/platform/scripts/validate-infrastructure.nu`\n- Tools: docker-compose config, kubectl apply --dry-run, nginx -t, promtool check\n- Examples: `examples-solo-deployment.ncl`, `examples-enterprise-deployment.ncl`\n\n**4. 
Interactive Setup with Forms** (TypeDialog ready)\n- Script: `provisioning/platform/scripts/setup-with-forms.sh`\n- Bash wrappers: `provisioning/core/shlib/*-tty.sh` (handle TTY input)\n- Forms ready: setup-wizard, auth-login, mfa-enroll\n- Fallback: Basic Nushell prompts if TypeDialog unavailable\n\n### Current Status: Full Infrastructure Support\n\n| Component | Status | Details |\n| ----------- | -------- | --------- |\n| **Schemas** | ✅ Complete | 6 infrastructure schemas (1,577 lines) |\n| **Examples** | ✅ Complete | 2 deployment examples (solo, enterprise) |\n| **Generation Script** | ✅ Complete | Auto-generates configs for all modes |\n| **Validation Script** | ✅ Complete | Validates Docker, K8s, Nginx, Prometheus |\n| **Setup Wizard** | ✅ Complete | TypeDialog forms + bash wrappers ready |\n| **TypeDialog Integration** | ⏳ Pending | Structure ready, awaiting binary |\n\n### Validated Examples\n\n**Solo Deployment** (`examples-solo-deployment.ncl`):\n- ✅ Type-checks without errors\n- ✅ Exports to 198 lines of JSON\n- ✅ 5 Docker Compose services\n- ✅ Resource limits: 1.0-4.0 CPU, 256M-1024M RAM\n- ✅ Prometheus: 4 scrape jobs\n- ✅ Registry backend: Zot (filesystem)\n\n**Enterprise Deployment** (`examples-enterprise-deployment.ncl`):\n- ✅ Type-checks without errors\n- ✅ Exports to 313 lines of JSON\n- ✅ 6 Docker Compose services with HA\n- ✅ Resource limits: 2.0-4.0 CPU, 512M-4096M RAM\n- ✅ Prometheus: 7 scrape jobs with remote storage\n- ✅ Registry backend: Harbor (S3 distributed)\n\n### Test Infrastructure Generation\n\n```\n# Export solo infrastructure\nnickel export --format json provisioning/schemas/infrastructure/examples-solo-deployment.ncl > /tmp/solo.json\n\n# Validate JSON\njq . /tmp/solo.json\n\n# Check Docker Compose services\njq '.docker_compose_services | keys' /tmp/solo.json\n\n# Compare resource allocation (solo vs enterprise)\njq '.docker_compose_services.orchestrator.deploy.resources.limits' /tmp/solo.json\njq '.docker_compose_services.orchestrator.deploy.resources.limits' /tmp/enterprise.json\n```\n\n## Next Steps\n\n1. **Infrastructure Setup** (available now):\n - Generate infrastructure configs with automation scripts\n - Validate with format-specific tools\n - Use interactive setup wizard for configuration\n\n2. **When TypeDialog binary becomes available**:\n - Install TypeDialog binary\n - Forms already created and ready to use\n - Bash wrappers handle TTY input (no Nushell stack issues)\n - Full nickel-roundtrip workflow will be enabled\n\n3. 
**Production Deployment**:\n - Use validated infrastructure configs\n - Deploy with ConfigLoader + infrastructure schemas\n - Monitor via Prometheus (auto-generated from schemas)\n\n---\n\n**Version**: 1.2.0 (TypeDialog Forms + Bash Wrappers)\n**Status**: TypeDialog forms ready with bash wrappers; Awaiting TypeDialog Binary\n**Last Updated**: 2025-01-09\n**ForminQuire Status**: DEPRECATED - Archived to .coder/archive/forminquire/\n**Fallback**: Basic Nushell prompts if TypeDialog unavailable\n**Tested**: Infrastructure examples (solo + enterprise) validated \ No newline at end of file diff --git a/Cargo.toml b/Cargo.toml index 841b37a..e517408 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -21,7 +21,7 @@ resolver = "2" edition = "2021" license = "MIT" repository = "https://github.com/jesusperezlorenzo/provisioning" - version = "0.1.0" + version = "1.0.11" [workspace.dependencies] # ============================================================================ diff --git a/README.md b/README.md index 79644c8..a6400f6 100644 --- a/README.md +++ b/README.md @@ -1,585 +1 @@ -

-[Centered HTML banner: Provisioning logo image and "Provisioning" title]
- ---- - -# Platform Services - -Platform-level services for the [Provisioning project](https://repo.jesusperez.pro/jesus/provisioning) infrastructure automation platform. -These services provide the high-performance execution layer, management interfaces, and supporting infrastructure for the entire provisioning system. - -## Overview - -The Platform layer consists of **production-ready services** built primarily in Rust, providing: - -- **Workflow Execution** - High-performance orchestration and task coordination -- **Management Interfaces** - Web UI and REST APIs for infrastructure management -- **Security & Authorization** - Enterprise-grade access control and permissions -- **Installation & Distribution** - Multi-mode installer with TUI, CLI, and unattended modes -- **AI Integration** - Model Context Protocol (MCP) server for intelligent assistance -- **Extension Management** - OCI-based registry for distributing modules - ---- - -## Core Platform Services - -### 1. **Orchestrator** (`orchestrator/`) - -High-performance Rust/Nushell hybrid orchestrator for workflow execution. - -**Language**: Rust + Nushell integration - -**Purpose**: Workflow execution, task scheduling, state management - -**Key Features**: - -- File-based persistence for reliability -- Priority processing with retry logic -- Checkpoint recovery and automatic rollback -- REST API endpoints for external integration -- Solves deep call stack limitations -- Parallel task execution with dependency resolution - -**Status**: ✅ Production Ready (v3.0.0) - -**Documentation**: See [.claude/features/orchestrator-architecture.md](../../.claude/features/orchestrator-architecture.md) - -**Quick Start**: - -```bash -cd orchestrator -./scripts/start-orchestrator.nu --background -```text - -**REST API**: - -- `GET http://localhost:8080/health` - Health check -- `GET http://localhost:8080/tasks` - List all tasks -- `POST http://localhost:8080/workflows/servers/create` - Server workflow -- `POST http://localhost:8080/workflows/taskserv/create` - Taskserv workflow - ---- - -### 2. **Control Center** (`control-center/`) - -Backend control center service with authorization and permissions management. - -**Language**: Rust - -**Purpose**: Web-based infrastructure management with RBAC - -**Key Features**: - -- **Authorization and permissions control** (enterprise security) -- Role-Based Access Control (RBAC) -- Audit logging and compliance tracking -- System management APIs -- Configuration management -- Resource monitoring - -**Status**: ✅ Active Development - -**Security Features**: - -- Fine-grained permissions system -- User authentication and session management -- API key management -- Activity audit logs - ---- - -### 3. **Control Center UI** (`control-center-ui/`) - -Frontend web interface for infrastructure management. - -**Language**: Web (HTML/CSS/JavaScript) - -**Purpose**: User-friendly dashboard and administration interface - -**Key Features**: - -- Dashboard with real-time monitoring -- Configuration management interface -- System administration tools -- Workflow visualization -- Log viewing and search - -**Status**: ✅ Active Development - -**Integration**: Communicates with Control Center backend and Orchestrator APIs - ---- - -### 4. **Installer** (`installer/`) - -Multi-mode platform installation system with interactive TUI, headless CLI, and unattended modes. 
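-
-The unattended flow consumes a TOML config file. A minimal hypothetical sketch (the field names below are illustrative assumptions, not the installer's actual schema; see `installer/docs/` for the real reference):
-
-```bash
-# Hypothetical: write a minimal unattended-install config and run it.
-# Field names are assumptions for illustration only.
-cat > config.toml <<'EOF'
-mode = "solo"        # solo | multiuser | cicd | enterprise
-provider = "docker"  # docker | podman | kubernetes | orbstack
-EOF
-provisioning-installer --unattended --config config.toml
-```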
- -**Language**: Rust (Ratatui TUI) + Nushell scripts - -**Purpose**: Platform installation and configuration generation - -**Key Features**: - -- **Interactive TUI Mode**: Beautiful terminal UI with 7 screens -- **Headless Mode**: CLI automation for scripted installations -- **Unattended Mode**: Zero-interaction CI/CD deployments -- **Deployment Modes**: Solo (2 CPU/4GB), MultiUser (4 CPU/8GB), CICD (8 CPU/16GB), Enterprise (16 CPU/32GB) -- **MCP Integration**: 7 AI-powered settings tools for intelligent configuration -- **Nushell Scripts**: Complete deployment automation for Docker, Podman, Kubernetes, OrbStack - -**Status**: ✅ Production Ready (v3.5.0) - -**Quick Start**: - -```bash -# Interactive TUI -provisioning-installer - -# Headless mode -provisioning-installer --headless --mode solo --yes - -# Unattended CI/CD -provisioning-installer --unattended --config config.toml -```text - -**Documentation**: `installer/docs/` - Complete guides and references - ---- - -### 5. **MCP Server** (`mcp-server/`) - -Model Context Protocol server for AI-powered assistance. - -**Language**: Nushell - -**Purpose**: AI integration for intelligent configuration and assistance - -**Key Features**: - -- 7 AI-powered settings tools -- Intelligent config completion -- Natural language infrastructure queries -- Configuration validation and suggestions -- Context-aware help system - -**Status**: ✅ Active Development - -**MCP Tools**: - -- Settings generation -- Configuration validation -- Best practice recommendations -- Infrastructure planning assistance -- Error diagnosis and resolution - ---- - -### 6. **OCI Registry** (`infrastructure/oci-registry/`) - -OCI-compliant registry for extension distribution and versioning. - -**Purpose**: Distributing and managing extensions - -**Key Features**: - -- Task service packages -- Provider packages -- Cluster templates -- Workflow definitions -- Version management and updates -- Dependency resolution - -**Status**: 🔄 Planned - -**Benefits**: - -- Centralized extension management -- Version control and rollback -- Dependency tracking -- Community marketplace ready - ---- - -### 7. **API Gateway** (`infrastructure/api-gateway/`) - -Unified REST API gateway for external integration. - -**Language**: Rust - -**Purpose**: API routing, authentication, and rate limiting - -**Key Features**: - -- Request routing to backend services -- Authentication and authorization -- Rate limiting and throttling -- API versioning -- Request validation -- Metrics and monitoring - -**Status**: 🔄 Planned - -**Endpoints** (Planned): - -- `/api/v1/servers/*` - Server management -- `/api/v1/taskservs/*` - Task service operations -- `/api/v1/clusters/*` - Cluster operations -- `/api/v1/workflows/*` - Workflow management - ---- - -### 8. **Extension Registry** (`extension-registry/`) - -Registry and catalog for browsing and discovering extensions. - -**Purpose**: Extension discovery and metadata management - -**Key Features**: - -- Extension catalog -- Search and filtering -- Version history -- Dependency information -- Documentation links -- Community ratings (future) - -**Status**: 🔄 Planned - ---- - -### 9. **Provisioning Server** (`provisioning-server/`) - -Alternative provisioning service implementation. - -**Purpose**: Additional provisioning service capabilities - -**Status**: 🔄 In Development - ---- - -## Supporting Services - -### CoreDNS (`config/coredns/`) - -DNS service configuration for cluster environments. 
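-
-A quick way to smoke-test a CoreDNS configuration locally (the file name, port, and record below are assumptions for illustration):
-
-```bash
-# Run CoreDNS on an unprivileged port, then query it.
-coredns -conf config/coredns/Corefile -dns.port 1053 &
-dig @127.0.0.1 -p 1053 orchestrator.platform.local +short
-```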
- -**Purpose**: Service discovery and DNS resolution - -**Status**: ✅ Configuration Ready - ---- - -### Monitoring (`infrastructure/monitoring/`) - -Observability and monitoring infrastructure. - -**Purpose**: Metrics, logging, and alerting - -**Components**: - -- Prometheus configuration -- Grafana dashboards -- Alert rules - -**Status**: ✅ Configuration Ready - ---- - -### Nginx (`infrastructure/nginx/`) - -Reverse proxy and load balancer configurations. - -**Purpose**: HTTP routing and SSL termination - -**Status**: ✅ Configuration Ready - ---- - -### Docker Compose (`infrastructure/docker/`) - -Docker Compose configurations for local development. - -**Purpose**: Quick local platform deployment - -**Status**: ✅ Ready for Development - ---- - -### Systemd (`infrastructure/systemd/`) - -Systemd service units for platform services. - -**Purpose**: Production deployment with systemd - -**Status**: ✅ Ready for Production - ---- - -## Architecture - -```plaintext -┌───────────────────────────────────────────────── -────────────┐ -│ User Interfaces │ -│ • CLI (provisioning command) │ -│ • Web UI (Control Center UI) │ -│ • API Clients │ -└───────────────────────────────────────────────── -────────────┘ - ↓ -┌───────────────────────────────────────────────── -────────────┐ -│ API Gateway │ -│ • Request Routing │ -│ • Authentication & Authorization │ -│ • Rate Limiting │ -└───────────────────────────────────────────────── -────────────┘ - ↓ -┌───────────────────────────────────────────────── -────────────┐ -│ Platform Services Layer │ -│ │ -│ ┌──────────────┐ ┌──────────────┐ -┌──────────────┐ │ -│ │ Orchestrator │ │Control Center│ │ MCP Server │ │ -│ │ (Rust) │ │ (Rust) │ │ (Nushell) │ │ -│ └──────────────┘ └──────────────┘ -└──────────────┘ │ -│ │ -│ ┌──────────────┐ ┌──────────────┐ -┌──────────────┐ │ -│ │ Installer │ │ OCI Registry │ │ Extension │ │ -│ │(Rust/Nushell)│ │ │ │ Registry │ │ -│ └──────────────┘ └──────────────┘ -└──────────────┘ │ -└───────────────────────────────────────────────── -────────────┘ - ↓ -┌───────────────────────────────────────────────── -────────────┐ -│ Data & State Layer │ -│ • SurrealDB (State Management) │ -│ • File-based Persistence (Checkpoints) │ -│ • Configuration Storage │ -└───────────────────────────────────────────────── -────────────┘ -```text - ---- - -## Technology Stack - -### Primary Languages - -| Language | Usage | Services | -| ---------- | ------- | ---------- | -| **Rust** | Platform services, performance layer | Orchestrator, Control Center, Installer, API Gateway | -| **Nushell** | Scripting, automation, MCP integration | MCP Server, Installer scripts | -| **Web** | Frontend interfaces | Control Center UI | - -### Key Dependencies - -- **tokio** - Async runtime for Rust services -- **axum** / **actix-web** - Web frameworks -- **serde** - Serialization/deserialization -- **bollard** - Docker API client (test environments) -- **ratatui** - Terminal UI framework (installer) -- **SurrealDB** - State management database - ---- - -## Deployment Modes - -### 1. **Development Mode** - -```bash -# Docker Compose for local development -docker-compose -f infrastructure/docker/dev.yml up -```text - -### 2. **Production Mode (Systemd)** - -```bash -# Install systemd units -sudo cp infrastructure/systemd/*.service /etc/infrastructure/systemd/system/ -sudo systemctl daemon-reload -sudo systemctl enable --now provisioning-orchestrator -sudo systemctl enable --now provisioning-control-center -```text - -### 3. 
**Kubernetes Deployment** - -```bash -# Deploy platform services to Kubernetes -kubectl apply -f k8s/ -```text - ---- - -## Security Features - -### Enterprise Security Stack - -1. **Authorization & Permissions** (Control Center) - - Role-Based Access Control (RBAC) - - Fine-grained permissions - - Audit logging - -2. **Authentication** - - API key management - - Session management - - Token-based auth (JWT) - -3. **Secrets Management** - - Integration with SOPS/Age - - Cosmian KMS support - - Secure configuration storage - -4. **Policy Enforcement** - - Cedar policy engine integration - - Compliance checking - - Anomaly detection - ---- - -## Getting Started - -### Prerequisites - -- **Rust** - Latest stable (for building platform services) -- **Nushell 0.107.1+** - For MCP server and scripts -- **Docker** (optional) - For containerized deployment -- **Kubernetes** (optional) - For K8s deployment - -### Building Platform Services - -```bash -# Build all Rust services -cd orchestrator && cargo build --release -cd ../control-center && cargo build --release -cd ../installer && cargo build --release -```text - -### Running Services - -```bash -# Start orchestrator -cd orchestrator -./scripts/start-orchestrator.nu --background - -# Start control center -cd control-center -cargo run --release - -# Start MCP server -cd mcp-server -nu run.nu -```text - ---- - -## Development - -### Project Structure - -```plaintext -platform/ -├── orchestrator/ # Rust orchestrator service -├── control-center/ # Rust control center backend -├── control-center-ui/ # Web frontend -├── installer/ # Rust/Nushell installer -├── mcp-server/ # Nushell MCP server -├── infrastructure/api-gateway/ # Rust API gateway (planned) -├── infrastructure/oci-registry/ # OCI registry (planned) -├── extension-registry/ # Extension catalog (planned) -├── provisioning-server/# Alternative service -├── infrastructure/docker/ # Docker Compose configs -├── k8s/ # Kubernetes manifests -├── infrastructure/systemd/ # Systemd units -└── docs/ # Platform documentation -```text - -### Adding New Services - -1. Create service directory in `platform/` -2. Add README.md with service description -3. Implement service following architecture patterns -4. Add tests and documentation -5. Update platform/README.md (this file) -6. Add deployment configurations (docker-compose, k8s, systemd) - ---- - -## Integration with [Provisioning](../../PROVISIONING.md) - -Platform services integrate seamlessly with the [Provisioning](../../PROVISIONING.md) system: - -- **Core Engine** (`../core/`) provides CLI and libraries -- **Extensions** (`../extensions/`) provide providers, taskservs, clusters -- **Platform Services** (this directory) provide execution and management -- **Configuration** (`../kcl/`, `../config/`) defines infrastructure - ---- - -## Documentation - -### Platform Documentation - -- **Orchestrator**: [.claude/features/orchestrator-architecture.md](../../.claude/features/orchestrator-architecture.md) -- **Installer**: `installer/docs/` directory -- **Test Environments**: [.claude/features/test-environment-service.md](../../.claude/features/test-environment-service.md) - -### API Documentation - -- **REST API Reference**: `docs/api/` (when orchestrator is running) -- **MCP Tools Reference**: `mcp-server/docs/` - -### Architecture Documentation - -- **Main Project**: [PROVISIONING.md](../../PROVISIONING.md) -- **Project Architecture**: [CLAUDE.md](../../CLAUDE.md) - ---- - -## Contributing - -When contributing to platform services: - -1. 
**Follow Rust Best Practices** - Idiomatic Rust, proper error handling -2. **Security First** - Always consider security implications -3. **Performance Matters** - Platform services are performance-critical -4. **Document APIs** - All REST endpoints must be documented -5. **Add Tests** - Unit tests and integration tests required -6. **Update Docs** - Keep README and API docs current - ---- - -## Status Legend - -- ✅ **Production Ready** - Fully implemented and tested -- ✅ **Active Development** - Working implementation, ongoing improvements -- ✅ **Configuration Ready** - Configuration files ready for deployment -- 🔄 **Planned** - Design phase, implementation pending -- 🔄 **In Development** - Early implementation stage - ---- - -## Support - -For platform service issues: - -- Check service-specific README in service directory -- Review logs: `journalctl -u provisioning-*` (systemd) -- API documentation: `http://localhost:8080/docs` (when running) -- See [Provisioning project](https://repo.jesusperez.pro/jesus/provisioning) for general support - ---- - -**Maintained By**: Platform Team -**Last Updated**: 2025-10-07 -**Platform Version**: 3.5.0 +

[Centered HTML banner: Provisioning logo image and "Provisioning" title]
\n\n---\n\n# Platform Services\n\nPlatform-level services for the [Provisioning project](https://repo.jesusperez.pro/jesus/provisioning) infrastructure automation platform.\nThese services provide the high-performance execution layer, management interfaces, and supporting infrastructure for the entire provisioning system.\n\n## Overview\n\nThe Platform layer consists of **production-ready services** built primarily in Rust, providing:\n\n- **Workflow Execution** - High-performance orchestration and task coordination\n- **Management Interfaces** - Web UI and REST APIs for infrastructure management\n- **Security & Authorization** - Enterprise-grade access control and permissions\n- **Installation & Distribution** - Multi-mode installer with TUI, CLI, and unattended modes\n- **AI Integration** - Model Context Protocol (MCP) server for intelligent assistance\n- **Extension Management** - OCI-based registry for distributing modules\n\n---\n\n## Core Platform Services\n\n### 1. **Orchestrator** (`orchestrator/`)\n\nHigh-performance Rust/Nushell hybrid orchestrator for workflow execution.\n\n**Language**: Rust + Nushell integration\n\n**Purpose**: Workflow execution, task scheduling, state management\n\n**Key Features**:\n\n- File-based persistence for reliability\n- Priority processing with retry logic\n- Checkpoint recovery and automatic rollback\n- REST API endpoints for external integration\n- Solves deep call stack limitations\n- Parallel task execution with dependency resolution\n\n**Status**: ✅ Production Ready (v3.0.0)\n\n**Documentation**: See [.claude/features/orchestrator-architecture.md](../../.claude/features/orchestrator-architecture.md)\n\n**Quick Start**:\n\n```\ncd orchestrator\n./scripts/start-orchestrator.nu --background\n```\n\n**REST API**:\n\n- `GET http://localhost:8080/health` - Health check\n- `GET http://localhost:8080/tasks` - List all tasks\n- `POST http://localhost:8080/workflows/servers/create` - Server workflow\n- `POST http://localhost:8080/workflows/taskserv/create` - Taskserv workflow\n\n---\n\n### 2. **Control Center** (`control-center/`)\n\nBackend control center service with authorization and permissions management.\n\n**Language**: Rust\n\n**Purpose**: Web-based infrastructure management with RBAC\n\n**Key Features**:\n\n- **Authorization and permissions control** (enterprise security)\n- Role-Based Access Control (RBAC)\n- Audit logging and compliance tracking\n- System management APIs\n- Configuration management\n- Resource monitoring\n\n**Status**: ✅ Active Development\n\n**Security Features**:\n\n- Fine-grained permissions system\n- User authentication and session management\n- API key management\n- Activity audit logs\n\n---\n\n### 3. **Control Center UI** (`control-center-ui/`)\n\nFrontend web interface for infrastructure management.\n\n**Language**: Web (HTML/CSS/JavaScript)\n\n**Purpose**: User-friendly dashboard and administration interface\n\n**Key Features**:\n\n- Dashboard with real-time monitoring\n- Configuration management interface\n- System administration tools\n- Workflow visualization\n- Log viewing and search\n\n**Status**: ✅ Active Development\n\n**Integration**: Communicates with Control Center backend and Orchestrator APIs\n\n---\n\n### 4. 
**Installer** (`installer/`)\n\nMulti-mode platform installation system with interactive TUI, headless CLI, and unattended modes.\n\n**Language**: Rust (Ratatui TUI) + Nushell scripts\n\n**Purpose**: Platform installation and configuration generation\n\n**Key Features**:\n\n- **Interactive TUI Mode**: Beautiful terminal UI with 7 screens\n- **Headless Mode**: CLI automation for scripted installations\n- **Unattended Mode**: Zero-interaction CI/CD deployments\n- **Deployment Modes**: Solo (2 CPU/4GB), MultiUser (4 CPU/8GB), CICD (8 CPU/16GB), Enterprise (16 CPU/32GB)\n- **MCP Integration**: 7 AI-powered settings tools for intelligent configuration\n- **Nushell Scripts**: Complete deployment automation for Docker, Podman, Kubernetes, OrbStack\n\n**Status**: ✅ Production Ready (v3.5.0)\n\n**Quick Start**:\n\n```\n# Interactive TUI\nprovisioning-installer\n\n# Headless mode\nprovisioning-installer --headless --mode solo --yes\n\n# Unattended CI/CD\nprovisioning-installer --unattended --config config.toml\n```\n\n**Documentation**: `installer/docs/` - Complete guides and references\n\n---\n\n### 5. **MCP Server** (`mcp-server/`)\n\nModel Context Protocol server for AI-powered assistance.\n\n**Language**: Nushell\n\n**Purpose**: AI integration for intelligent configuration and assistance\n\n**Key Features**:\n\n- 7 AI-powered settings tools\n- Intelligent config completion\n- Natural language infrastructure queries\n- Configuration validation and suggestions\n- Context-aware help system\n\n**Status**: ✅ Active Development\n\n**MCP Tools**:\n\n- Settings generation\n- Configuration validation\n- Best practice recommendations\n- Infrastructure planning assistance\n- Error diagnosis and resolution\n\n---\n\n### 6. **OCI Registry** (`infrastructure/oci-registry/`)\n\nOCI-compliant registry for extension distribution and versioning.\n\n**Purpose**: Distributing and managing extensions\n\n**Key Features**:\n\n- Task service packages\n- Provider packages\n- Cluster templates\n- Workflow definitions\n- Version management and updates\n- Dependency resolution\n\n**Status**: 🔄 Planned\n\n**Benefits**:\n\n- Centralized extension management\n- Version control and rollback\n- Dependency tracking\n- Community marketplace ready\n\n---\n\n### 7. **API Gateway** (`infrastructure/api-gateway/`)\n\nUnified REST API gateway for external integration.\n\n**Language**: Rust\n\n**Purpose**: API routing, authentication, and rate limiting\n\n**Key Features**:\n\n- Request routing to backend services\n- Authentication and authorization\n- Rate limiting and throttling\n- API versioning\n- Request validation\n- Metrics and monitoring\n\n**Status**: 🔄 Planned\n\n**Endpoints** (Planned):\n\n- `/api/v1/servers/*` - Server management\n- `/api/v1/taskservs/*` - Task service operations\n- `/api/v1/clusters/*` - Cluster operations\n- `/api/v1/workflows/*` - Workflow management\n\n---\n\n### 8. **Extension Registry** (`extension-registry/`)\n\nRegistry and catalog for browsing and discovering extensions.\n\n**Purpose**: Extension discovery and metadata management\n\n**Key Features**:\n\n- Extension catalog\n- Search and filtering\n- Version history\n- Dependency information\n- Documentation links\n- Community ratings (future)\n\n**Status**: 🔄 Planned\n\n---\n\n### 9. 
**Provisioning Server** (`provisioning-server/`)\n\nAlternative provisioning service implementation.\n\n**Purpose**: Additional provisioning service capabilities\n\n**Status**: 🔄 In Development\n\n---\n\n## Supporting Services\n\n### CoreDNS (`config/coredns/`)\n\nDNS service configuration for cluster environments.\n\n**Purpose**: Service discovery and DNS resolution\n\n**Status**: ✅ Configuration Ready\n\n---\n\n### Monitoring (`infrastructure/monitoring/`)\n\nObservability and monitoring infrastructure.\n\n**Purpose**: Metrics, logging, and alerting\n\n**Components**:\n\n- Prometheus configuration\n- Grafana dashboards\n- Alert rules\n\n**Status**: ✅ Configuration Ready\n\n---\n\n### Nginx (`infrastructure/nginx/`)\n\nReverse proxy and load balancer configurations.\n\n**Purpose**: HTTP routing and SSL termination\n\n**Status**: ✅ Configuration Ready\n\n---\n\n### Docker Compose (`infrastructure/docker/`)\n\nDocker Compose configurations for local development.\n\n**Purpose**: Quick local platform deployment\n\n**Status**: ✅ Ready for Development\n\n---\n\n### Systemd (`infrastructure/systemd/`)\n\nSystemd service units for platform services.\n\n**Purpose**: Production deployment with systemd\n\n**Status**: ✅ Ready for Production\n\n---\n\n## Architecture\n\n```\n┌─────────────────────────────────────────────────\n────────────┐\n│ User Interfaces │\n│ • CLI (provisioning command) │\n│ • Web UI (Control Center UI) │\n│ • API Clients │\n└─────────────────────────────────────────────────\n────────────┘\n ↓\n┌─────────────────────────────────────────────────\n────────────┐\n│ API Gateway │\n│ • Request Routing │\n│ • Authentication & Authorization │\n│ • Rate Limiting │\n└─────────────────────────────────────────────────\n────────────┘\n ↓\n┌─────────────────────────────────────────────────\n────────────┐\n│ Platform Services Layer │\n│ │\n│ ┌──────────────┐ ┌──────────────┐ \n┌──────────────┐ │\n│ │ Orchestrator │ │Control Center│ │ MCP Server │ │\n│ │ (Rust) │ │ (Rust) │ │ (Nushell) │ │\n│ └──────────────┘ └──────────────┘ \n└──────────────┘ │\n│ │\n│ ┌──────────────┐ ┌──────────────┐ \n┌──────────────┐ │\n│ │ Installer │ │ OCI Registry │ │ Extension │ │\n│ │(Rust/Nushell)│ │ │ │ Registry │ │\n│ └──────────────┘ └──────────────┘ \n└──────────────┘ │\n└─────────────────────────────────────────────────\n────────────┘\n ↓\n┌─────────────────────────────────────────────────\n────────────┐\n│ Data & State Layer │\n│ • SurrealDB (State Management) │\n│ • File-based Persistence (Checkpoints) │\n│ • Configuration Storage │\n└─────────────────────────────────────────────────\n────────────┘\n```\n\n---\n\n## Technology Stack\n\n### Primary Languages\n\n| Language | Usage | Services |\n| ---------- | ------- | ---------- |\n| **Rust** | Platform services, performance layer | Orchestrator, Control Center, Installer, API Gateway |\n| **Nushell** | Scripting, automation, MCP integration | MCP Server, Installer scripts |\n| **Web** | Frontend interfaces | Control Center UI |\n\n### Key Dependencies\n\n- **tokio** - Async runtime for Rust services\n- **axum** / **actix-web** - Web frameworks\n- **serde** - Serialization/deserialization\n- **bollard** - Docker API client (test environments)\n- **ratatui** - Terminal UI framework (installer)\n- **SurrealDB** - State management database\n\n---\n\n## Deployment Modes\n\n### 1. **Development Mode**\n\n```\n# Docker Compose for local development\ndocker-compose -f infrastructure/docker/dev.yml up\n```\n\n### 2. 
**Production Mode (Systemd)**\n\n```\n# Install systemd units\nsudo cp infrastructure/systemd/*.service /etc/infrastructure/systemd/system/\nsudo systemctl daemon-reload\nsudo systemctl enable --now provisioning-orchestrator\nsudo systemctl enable --now provisioning-control-center\n```\n\n### 3. **Kubernetes Deployment**\n\n```\n# Deploy platform services to Kubernetes\nkubectl apply -f k8s/\n```\n\n---\n\n## Security Features\n\n### Enterprise Security Stack\n\n1. **Authorization & Permissions** (Control Center)\n - Role-Based Access Control (RBAC)\n - Fine-grained permissions\n - Audit logging\n\n2. **Authentication**\n - API key management\n - Session management\n - Token-based auth (JWT)\n\n3. **Secrets Management**\n - Integration with SOPS/Age\n - Cosmian KMS support\n - Secure configuration storage\n\n4. **Policy Enforcement**\n - Cedar policy engine integration\n - Compliance checking\n - Anomaly detection\n\n---\n\n## Getting Started\n\n### Prerequisites\n\n- **Rust** - Latest stable (for building platform services)\n- **Nushell 0.107.1+** - For MCP server and scripts\n- **Docker** (optional) - For containerized deployment\n- **Kubernetes** (optional) - For K8s deployment\n\n### Building Platform Services\n\n```\n# Build all Rust services\ncd orchestrator && cargo build --release\ncd ../control-center && cargo build --release\ncd ../installer && cargo build --release\n```\n\n### Running Services\n\n```\n# Start orchestrator\ncd orchestrator\n./scripts/start-orchestrator.nu --background\n\n# Start control center\ncd control-center\ncargo run --release\n\n# Start MCP server\ncd mcp-server\nnu run.nu\n```\n\n---\n\n## Development\n\n### Project Structure\n\n```\nplatform/\n├── orchestrator/ # Rust orchestrator service\n├── control-center/ # Rust control center backend\n├── control-center-ui/ # Web frontend\n├── installer/ # Rust/Nushell installer\n├── mcp-server/ # Nushell MCP server\n├── infrastructure/api-gateway/ # Rust API gateway (planned)\n├── infrastructure/oci-registry/ # OCI registry (planned)\n├── extension-registry/ # Extension catalog (planned)\n├── provisioning-server/# Alternative service\n├── infrastructure/docker/ # Docker Compose configs\n├── k8s/ # Kubernetes manifests\n├── infrastructure/systemd/ # Systemd units\n└── docs/ # Platform documentation\n```\n\n### Adding New Services\n\n1. Create service directory in `platform/`\n2. Add README.md with service description\n3. Implement service following architecture patterns\n4. Add tests and documentation\n5. Update platform/README.md (this file)\n6. 
Add deployment configurations (docker-compose, k8s, systemd)\n\n---\n\n## Integration with [Provisioning](../../PROVISIONING.md)\n\nPlatform services integrate seamlessly with the [Provisioning](../../PROVISIONING.md) system:\n\n- **Core Engine** (`../core/`) provides CLI and libraries\n- **Extensions** (`../extensions/`) provide providers, taskservs, clusters\n- **Platform Services** (this directory) provide execution and management\n- **Configuration** (`../kcl/`, `../config/`) defines infrastructure\n\n---\n\n## Documentation\n\n### Platform Documentation\n\n- **Orchestrator**: [.claude/features/orchestrator-architecture.md](../../.claude/features/orchestrator-architecture.md)\n- **Installer**: `installer/docs/` directory\n- **Test Environments**: [.claude/features/test-environment-service.md](../../.claude/features/test-environment-service.md)\n\n### API Documentation\n\n- **REST API Reference**: `docs/api/` (when orchestrator is running)\n- **MCP Tools Reference**: `mcp-server/docs/`\n\n### Architecture Documentation\n\n- **Main Project**: [PROVISIONING.md](../../PROVISIONING.md)\n- **Project Architecture**: [CLAUDE.md](../../CLAUDE.md)\n\n---\n\n## Contributing\n\nWhen contributing to platform services:\n\n1. **Follow Rust Best Practices** - Idiomatic Rust, proper error handling\n2. **Security First** - Always consider security implications\n3. **Performance Matters** - Platform services are performance-critical\n4. **Document APIs** - All REST endpoints must be documented\n5. **Add Tests** - Unit tests and integration tests required\n6. **Update Docs** - Keep README and API docs current\n\n---\n\n## Status Legend\n\n- ✅ **Production Ready** - Fully implemented and tested\n- ✅ **Active Development** - Working implementation, ongoing improvements\n- ✅ **Configuration Ready** - Configuration files ready for deployment\n- 🔄 **Planned** - Design phase, implementation pending\n- 🔄 **In Development** - Early implementation stage\n\n---\n\n## Support\n\nFor platform service issues:\n\n- Check service-specific README in service directory\n- Review logs: `journalctl -u provisioning-*` (systemd)\n- API documentation: `http://localhost:8080/docs` (when running)\n- See [Provisioning project](https://repo.jesusperez.pro/jesus/provisioning) for general support\n\n---\n\n**Maintained By**: Platform Team\n**Last Updated**: 2025-10-07\n**Platform Version**: 3.5.0 \ No newline at end of file diff --git a/config/README.md b/config/README.md index 428b40c..9c4d9aa 100644 --- a/config/README.md +++ b/config/README.md @@ -1,109 +1 @@ -# Platform Service Configuration Files - -This directory contains **16 production-ready TOML configuration files** generated from Nickel schemas -for all platform services across all deployment modes. 
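-
-The files can be regenerated from their Nickel schemas at any time. An illustrative sketch (the schema path is an assumption; see Related Documentation below for the actual schema locations):
-
-```bash
-# Illustrative: re-export one mode-specific config from its Nickel schema,
-# then parse-check the result with Nushell.
-nickel export --format toml schemas/platform/orchestrator.ncl > orchestrator.solo.toml
-nu -c "open orchestrator.solo.toml" > /dev/null && echo "✅ orchestrator.solo.toml parses"
-```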
- -## Generated Files - -**4 Services × 4 Deployment Modes = 16 Configuration Files** - -```plaintext -orchestrator.{solo,multiuser,cicd,enterprise}.toml (2.2 kB each) -control-center.{solo,multiuser,cicd,enterprise}.toml (3.4 kB each) -mcp-server.{solo,multiuser,cicd,enterprise}.toml (2.7 kB each) -installer.{solo,multiuser,cicd,enterprise}.toml (2.5 kB each) -```text - -**Total**: ~45 KB, all validated and ready for deployment - -## Deployment Modes - -| Mode | Resources | Database | Use Case | Load | -| ------ | ----------- | ---------- | ---------- | ------ | -| **solo** | 2 CPU, 4 GB | Embedded | Development | `ORCHESTRATOR_MODE=solo` | -| **multiuser** | 4 CPU, 8 GB | PostgreSQL/SurrealDB | Team Staging | `ORCHESTRATOR_MODE=multiuser` | -| **cicd** | 8 CPU, 16 GB | Ephemeral | CI/CD Pipelines | `ORCHESTRATOR_MODE=cicd` | -| **enterprise** | 16+ CPU, 32+ GB | SurrealDB HA | Production | `ORCHESTRATOR_MODE=enterprise` | - -## Quick Start - -### Load a configuration mode - -```bash -# Solo mode (single developer) -export ORCHESTRATOR_MODE=solo -export CONTROL_CENTER_MODE=solo - -# Multiuser mode (team development) -export ORCHESTRATOR_MODE=multiuser -export CONTROL_CENTER_MODE=multiuser - -# Enterprise mode (production HA) -export ORCHESTRATOR_MODE=enterprise -export CONTROL_CENTER_MODE=enterprise -```text - -### Override individual fields - -```bash -export ORCHESTRATOR_SERVER_WORKERS=8 -export ORCHESTRATOR_SERVER_PORT=9090 -export CONTROL_CENTER_REQUIRE_MFA=true -```text - -## Configuration Loading Hierarchy - -Each service loads configuration with this priority: - -1. **Explicit path** — `{SERVICE}_CONFIG` environment variable -2. **Mode-specific** — `{SERVICE}_MODE` → `provisioning/platform/config/{service}.{mode}.toml` -3. **Legacy** — `config.user.toml` (backward compatibility) -4. **Defaults** — `config.defaults.toml` or built-in -5. **Field overrides** — `{SERVICE}_*` environment variables - -## Docker Compose Integration - -```bash -export DEPLOYMENT_MODE=multiuser -docker-compose -f provisioning/platform/infrastructure/docker/docker-compose.yml up -```text - -## Kubernetes Integration - -```bash -# Load enterprise mode configs into K8s -kubectl create configmap orchestrator-config \ - --from-file=provisioning/platform/config/orchestrator.enterprise.toml -```text - -## Validation - -Verify all configs parse correctly: - -```bash -for file in *.toml; do - nu -c "open '$file'" && echo "✅ $file" || echo "❌ $file" -done -```text - -## Structure - -- **orchestrator.*.toml** — Workflow engine configuration -- **control-center.*.toml** — Policy/RBAC backend configuration -- **mcp-server.*.toml** — MCP server configuration -- **installer.*.toml** — Installation/bootstrap configuration - -Each file contains service-specific settings for networking, storage, security, logging, and monitoring. 
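The loading hierarchy above is easiest to see as code. A minimal sketch in Rust, assuming a hypothetical `resolve_config_path` helper; the real platform-config crate may resolve paths differently:

```rust
use std::{env, path::PathBuf};

/// Sketch of the documented lookup order: explicit path, then mode-specific
/// file, then legacy user config, then shipped defaults. Field-level
/// {SERVICE}_* environment overrides are applied after the file is parsed.
/// `resolve_config_path` is illustrative, not the real platform-config API.
fn resolve_config_path(service: &str) -> Option<PathBuf> {
    let prefix = service.to_uppercase().replace('-', "_");
    // 1. Explicit path: {SERVICE}_CONFIG wins outright.
    if let Ok(path) = env::var(format!("{prefix}_CONFIG")) {
        return Some(PathBuf::from(path));
    }
    // 2. Mode-specific file selected by {SERVICE}_MODE.
    if let Ok(mode) = env::var(format!("{prefix}_MODE")) {
        return Some(PathBuf::from(format!(
            "provisioning/platform/config/{service}.{mode}.toml"
        )));
    }
    // 3. Legacy user config, then 4. shipped defaults (first one that exists).
    ["config.user.toml", "config.defaults.toml"]
        .into_iter()
        .map(PathBuf::from)
        .find(|p| p.exists())
}
```

Keeping the whole precedence chain in one function makes the rule "explicit beats mode beats legacy beats defaults" easy to audit.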
- -## Related Documentation - -- **Configuration workflow**: `provisioning/.typedialog/provisioning/platform/configuration-workflow.md` -- **Usage guide**: `provisioning/.typedialog/provisioning/platform/usage-guide.md` -- **Schema definitions**: `provisioning/.typedialog/provisioning/platform/schemas/` -- **Default values**: `provisioning/.typedialog/provisioning/platform/defaults/` - -## Generated By - -**Framework**: TypeDialog + Nickel Configuration System -**Date**: 2026-01-05 -**Status**: ✅ Production Ready +# Platform Service Configuration Files\n\nThis directory contains **16 production-ready TOML configuration files** generated from Nickel schemas\nfor all platform services across all deployment modes.\n\n## Generated Files\n\n**4 Services × 4 Deployment Modes = 16 Configuration Files**\n\n```\norchestrator.{solo,multiuser,cicd,enterprise}.toml (2.2 kB each)\ncontrol-center.{solo,multiuser,cicd,enterprise}.toml (3.4 kB each)\nmcp-server.{solo,multiuser,cicd,enterprise}.toml (2.7 kB each)\ninstaller.{solo,multiuser,cicd,enterprise}.toml (2.5 kB each)\n```\n\n**Total**: ~45 KB, all validated and ready for deployment\n\n## Deployment Modes\n\n| Mode | Resources | Database | Use Case | Load |\n| ------ | ----------- | ---------- | ---------- | ------ |\n| **solo** | 2 CPU, 4 GB | Embedded | Development | `ORCHESTRATOR_MODE=solo` |\n| **multiuser** | 4 CPU, 8 GB | PostgreSQL/SurrealDB | Team Staging | `ORCHESTRATOR_MODE=multiuser` |\n| **cicd** | 8 CPU, 16 GB | Ephemeral | CI/CD Pipelines | `ORCHESTRATOR_MODE=cicd` |\n| **enterprise** | 16+ CPU, 32+ GB | SurrealDB HA | Production | `ORCHESTRATOR_MODE=enterprise` |\n\n## Quick Start\n\n### Load a configuration mode\n\n```\n# Solo mode (single developer)\nexport ORCHESTRATOR_MODE=solo\nexport CONTROL_CENTER_MODE=solo\n\n# Multiuser mode (team development)\nexport ORCHESTRATOR_MODE=multiuser\nexport CONTROL_CENTER_MODE=multiuser\n\n# Enterprise mode (production HA)\nexport ORCHESTRATOR_MODE=enterprise\nexport CONTROL_CENTER_MODE=enterprise\n```\n\n### Override individual fields\n\n```\nexport ORCHESTRATOR_SERVER_WORKERS=8\nexport ORCHESTRATOR_SERVER_PORT=9090\nexport CONTROL_CENTER_REQUIRE_MFA=true\n```\n\n## Configuration Loading Hierarchy\n\nEach service loads configuration with this priority:\n\n1. **Explicit path** — `{SERVICE}_CONFIG` environment variable\n2. **Mode-specific** — `{SERVICE}_MODE` → `provisioning/platform/config/{service}.{mode}.toml`\n3. **Legacy** — `config.user.toml` (backward compatibility)\n4. **Defaults** — `config.defaults.toml` or built-in\n5. 
**Field overrides** — `{SERVICE}_*` environment variables\n\n## Docker Compose Integration\n\n```\nexport DEPLOYMENT_MODE=multiuser\ndocker-compose -f provisioning/platform/infrastructure/docker/docker-compose.yml up\n```\n\n## Kubernetes Integration\n\n```\n# Load enterprise mode configs into K8s\nkubectl create configmap orchestrator-config \\n --from-file=provisioning/platform/config/orchestrator.enterprise.toml\n```\n\n## Validation\n\nVerify all configs parse correctly:\n\n```\nfor file in *.toml; do\n nu -c "open '$file'" && echo "✅ $file" || echo "❌ $file"\ndone\n```\n\n## Structure\n\n- **orchestrator.*.toml** — Workflow engine configuration\n- **control-center.*.toml** — Policy/RBAC backend configuration\n- **mcp-server.*.toml** — MCP server configuration\n- **installer.*.toml** — Installation/bootstrap configuration\n\nEach file contains service-specific settings for networking, storage, security, logging, and monitoring.\n\n## Related Documentation\n\n- **Configuration workflow**: `provisioning/.typedialog/provisioning/platform/configuration-workflow.md`\n- **Usage guide**: `provisioning/.typedialog/provisioning/platform/usage-guide.md`\n- **Schema definitions**: `provisioning/.typedialog/provisioning/platform/schemas/`\n- **Default values**: `provisioning/.typedialog/provisioning/platform/defaults/`\n\n## Generated By\n\n**Framework**: TypeDialog + Nickel Configuration System\n**Date**: 2026-01-05\n**Status**: ✅ Production Ready \ No newline at end of file diff --git a/config/examples/README.md b/config/examples/README.md index e1960a6..2578008 100644 --- a/config/examples/README.md +++ b/config/examples/README.md @@ -1,201 +1 @@ -# Platform Configuration Examples - -This directory contains example Nickel files demonstrating how to generate platform configurations for different deployment modes. - -## File Structure - -```plaintext -examples/ -├── README.md # This file -├── orchestrator.solo.example.ncl # Solo deployment (1 CPU, 1GB memory) -├── orchestrator.multiuser.example.ncl # Multiuser deployment (2 CPU, 2GB memory, HA) -├── orchestrator.enterprise.example.ncl # Enterprise deployment (4 CPU, 4GB memory, 3 replicas) -└── control-center.solo.example.ncl # Control Center solo deployment -```text - -## Usage - -To generate actual TOML configuration from an example: - -```bash -# Export to TOML (placed in runtime/generated/) -nickel export --format toml examples/orchestrator.solo.example.ncl > runtime/generated/orchestrator.solo.toml - -# Export to JSON for inspection -nickel export --format json examples/orchestrator.solo.example.ncl | jq . - -# Type check example -nickel typecheck examples/orchestrator.solo.example.ncl -```text - -## Key Concepts - -### 1. Schemas Reference -All examples import from the schema library: -- `provisioning/schemas/platform/schemas/orchestrator.ncl` -- `provisioning/schemas/platform/defaults/orchestrator-defaults.ncl` - -### 2. Mode-Based Composition -Each example uses composition helpers to overlay mode-specific settings: - -```nickel -let helpers = import "../../schemas/platform/common/helpers.ncl" in -let defaults = import "../../schemas/platform/defaults/orchestrator-defaults.ncl" in -let mode = import "../../schemas/platform/defaults/deployment/solo-defaults.ncl" in - -helpers.compose_config defaults mode { - # User-specific overrides here -} -```text - -### 3. 
ConfigLoader Integration -Generated TOML files are automatically loaded by Rust services: - -```rust -use platform_config::OrchestratorConfig; - -let config = OrchestratorConfig::load().expect("Failed to load orchestrator config"); -println!("Orchestrator listening on port: {}", config.server.port); -```text - -## Mode Reference - -| Mode | CPU | Memory | Replicas | Use Case | -| ------ | ----- | -------- | ---------- | ---------- | -| **solo** | 1.0 | 1024M | 1 | Development, testing | -| **multiuser** | 2.0 | 2048M | 2 | Staging, small production | -| **enterprise** | 4.0 | 4096M | 3+ | Large production deployments | -| **cicd** | 2.0 | 2048M | 1 | CI/CD pipelines | - -## Workflow: Platform Configuration - -1. **Choose deployment mode** → select example file (orchestrator.solo.example.ncl, etc.) -2. **Customize if needed** → modify the example -3. **Generate config** → `nickel export --format toml` -4. **Place in runtime/generated/** → ConfigLoader picks it up automatically -5. **Service reads config** → via platform-config crate - -## Infrastructure Generation - -These platform configuration examples work together with infrastructure schemas to create complete deployments. - -### Complete Infrastructure Stack - -Beyond platform configs, you can generate complete infrastructure from schemas: - -**Infrastructure Examples**: -- `provisioning/schemas/infrastructure/examples-solo-deployment.ncl` - Solo infrastructure -- `provisioning/schemas/infrastructure/examples-enterprise-deployment.ncl` - Enterprise infrastructure - -**What Gets Generated**: - -```bash -# Solo deployment infrastructure -nickel export --format json provisioning/schemas/infrastructure/examples-solo-deployment.ncl - -# Exports: -# - docker_compose_services (5 services) -# - nginx_config (load balancer setup) -# - prometheus_config (4 scrape jobs) -# - oci_registry_config (container registry) -```text - -**Integration Pattern**: - -```plaintext -Platform Config (Orchestrator, Control Center, etc.) - ↓ ConfigLoader reads TOML - ↓ Services start with config - -Infrastructure Config (Docker, Nginx, Prometheus, etc.) 
- ↓ nickel export → YAML/JSON - ↓ Deploy with Docker/Kubernetes/Nginx -```text - -### Generation and Validation - -**Generate all infrastructure configs**: - -```bash -provisioning/platform/scripts/generate-infrastructure-configs.nu --mode solo --format yaml -provisioning/platform/scripts/generate-infrastructure-configs.nu --mode enterprise --format json -```text - -**Validate generated configs**: - -```bash -provisioning/platform/scripts/validate-infrastructure.nu --config-dir /tmp/infra - -# Output shows validation results for: -# - Docker Compose (docker-compose config --quiet) -# - Kubernetes (kubectl apply --dry-run=client) -# - Nginx (nginx -t) -# - Prometheus (promtool check config) -```text - -**Interactive setup**: - -```bash -bash provisioning/platform/scripts/setup-with-forms.sh -# Uses TypeDialog bash wrappers (TTY-safe) or basic Nushell prompts as fallback -```text - -## Error Handling - -If configuration fails to load: - -```bash -# Validate Nickel syntax -nickel typecheck examples/orchestrator.solo.example.ncl - -# Check TOML validity -cargo test --package platform-config --test validation - -# Verify path resolution -provisioning validate-config --check-paths -```text - -## Environment Variable Overrides - -Even with TOML configs, environment variables take precedence: - -```bash -export PROVISIONING_MODE=multiuser -export ORCHESTRATOR_PORT=9000 -provisioning orchestrator start # Uses env overrides -```text - -## Adding New Configurations - -To add a new service configuration: - -1. Create `service-name.mode.example.ncl` in this directory -2. Import the service schema: `import "../../schemas/platform/schemas/service-name.ncl"` -3. Compose using helpers: `helpers.compose_config defaults mode {}` -4. Document in this README -5. Test with: `nickel typecheck` and `nickel export --format json` - -## Platform vs Infrastructure Configuration - -**Platform Configuration** (this directory): -- Service-specific settings (port, database host, logging level) -- Loaded by ConfigLoader at service startup -- Format: TOML files in `runtime/generated/` -- Examples: orchestrator.solo.example.ncl, orchestrator.multiuser.example.ncl - -**Infrastructure Configuration** (provisioning/schemas/infrastructure/): -- Deployment-specific settings (replicas, resources, networking) -- Generated and validated separately -- Formats: YAML (Docker/Kubernetes), JSON (registries), conf (Nginx) -- Examples: examples-solo-deployment.ncl, examples-enterprise-deployment.ncl - -**Why Both?**: -- Platform config: How should Orchestrator behave? (internal settings) -- Infrastructure config: How should Orchestrator be deployed? 
(external deployment) - ---- - -**Last Updated**: 2025-01-06 (Updated with Infrastructure Integration Guide) -**ConfigLoader Version**: 2.0.0 -**Nickel Version**: Latest -**Infrastructure Integration**: Complete with schemas, examples, and validation scripts +# Platform Configuration Examples\n\nThis directory contains example Nickel files demonstrating how to generate platform configurations for different deployment modes.\n\n## File Structure\n\n```\nexamples/\n├── README.md # This file\n├── orchestrator.solo.example.ncl # Solo deployment (1 CPU, 1GB memory)\n├── orchestrator.multiuser.example.ncl # Multiuser deployment (2 CPU, 2GB memory, HA)\n├── orchestrator.enterprise.example.ncl # Enterprise deployment (4 CPU, 4GB memory, 3 replicas)\n└── control-center.solo.example.ncl # Control Center solo deployment\n```\n\n## Usage\n\nTo generate actual TOML configuration from an example:\n\n```\n# Export to TOML (placed in runtime/generated/)\nnickel export --format toml examples/orchestrator.solo.example.ncl > runtime/generated/orchestrator.solo.toml\n\n# Export to JSON for inspection\nnickel export --format json examples/orchestrator.solo.example.ncl | jq .\n\n# Type check example\nnickel typecheck examples/orchestrator.solo.example.ncl\n```\n\n## Key Concepts\n\n### 1. Schemas Reference\nAll examples import from the schema library:\n- `provisioning/schemas/platform/schemas/orchestrator.ncl`\n- `provisioning/schemas/platform/defaults/orchestrator-defaults.ncl`\n\n### 2. Mode-Based Composition\nEach example uses composition helpers to overlay mode-specific settings:\n\n```\nlet helpers = import "../../schemas/platform/common/helpers.ncl" in\nlet defaults = import "../../schemas/platform/defaults/orchestrator-defaults.ncl" in\nlet mode = import "../../schemas/platform/defaults/deployment/solo-defaults.ncl" in\n\nhelpers.compose_config defaults mode {\n # User-specific overrides here\n}\n```\n\n### 3. ConfigLoader Integration\nGenerated TOML files are automatically loaded by Rust services:\n\n```\nuse platform_config::OrchestratorConfig;\n\nlet config = OrchestratorConfig::load().expect("Failed to load orchestrator config");\nprintln!("Orchestrator listening on port: {}", config.server.port);\n```\n\n## Mode Reference\n\n| Mode | CPU | Memory | Replicas | Use Case |\n| ------ | ----- | -------- | ---------- | ---------- |\n| **solo** | 1.0 | 1024M | 1 | Development, testing |\n| **multiuser** | 2.0 | 2048M | 2 | Staging, small production |\n| **enterprise** | 4.0 | 4096M | 3+ | Large production deployments |\n| **cicd** | 2.0 | 2048M | 1 | CI/CD pipelines |\n\n## Workflow: Platform Configuration\n\n1. **Choose deployment mode** → select example file (orchestrator.solo.example.ncl, etc.)\n2. **Customize if needed** → modify the example\n3. **Generate config** → `nickel export --format toml`\n4. **Place in runtime/generated/** → ConfigLoader picks it up automatically\n5. 
**Service reads config** → via platform-config crate\n\n## Infrastructure Generation\n\nThese platform configuration examples work together with infrastructure schemas to create complete deployments.\n\n### Complete Infrastructure Stack\n\nBeyond platform configs, you can generate complete infrastructure from schemas:\n\n**Infrastructure Examples**:\n- `provisioning/schemas/infrastructure/examples-solo-deployment.ncl` - Solo infrastructure\n- `provisioning/schemas/infrastructure/examples-enterprise-deployment.ncl` - Enterprise infrastructure\n\n**What Gets Generated**:\n\n```\n# Solo deployment infrastructure\nnickel export --format json provisioning/schemas/infrastructure/examples-solo-deployment.ncl\n\n# Exports:\n# - docker_compose_services (5 services)\n# - nginx_config (load balancer setup)\n# - prometheus_config (4 scrape jobs)\n# - oci_registry_config (container registry)\n```\n\n**Integration Pattern**:\n\n```\nPlatform Config (Orchestrator, Control Center, etc.)\n ↓ ConfigLoader reads TOML\n ↓ Services start with config\n\nInfrastructure Config (Docker, Nginx, Prometheus, etc.)\n ↓ nickel export → YAML/JSON\n ↓ Deploy with Docker/Kubernetes/Nginx\n```\n\n### Generation and Validation\n\n**Generate all infrastructure configs**:\n\n```\nprovisioning/platform/scripts/generate-infrastructure-configs.nu --mode solo --format yaml\nprovisioning/platform/scripts/generate-infrastructure-configs.nu --mode enterprise --format json\n```\n\n**Validate generated configs**:\n\n```\nprovisioning/platform/scripts/validate-infrastructure.nu --config-dir /tmp/infra\n\n# Output shows validation results for:\n# - Docker Compose (docker-compose config --quiet)\n# - Kubernetes (kubectl apply --dry-run=client)\n# - Nginx (nginx -t)\n# - Prometheus (promtool check config)\n```\n\n**Interactive setup**:\n\n```\nbash provisioning/platform/scripts/setup-with-forms.sh\n# Uses TypeDialog bash wrappers (TTY-safe) or basic Nushell prompts as fallback\n```\n\n## Error Handling\n\nIf configuration fails to load:\n\n```\n# Validate Nickel syntax\nnickel typecheck examples/orchestrator.solo.example.ncl\n\n# Check TOML validity\ncargo test --package platform-config --test validation\n\n# Verify path resolution\nprovisioning validate-config --check-paths\n```\n\n## Environment Variable Overrides\n\nEven with TOML configs, environment variables take precedence:\n\n```\nexport PROVISIONING_MODE=multiuser\nexport ORCHESTRATOR_PORT=9000\nprovisioning orchestrator start # Uses env overrides\n```\n\n## Adding New Configurations\n\nTo add a new service configuration:\n\n1. Create `service-name.mode.example.ncl` in this directory\n2. Import the service schema: `import "../../schemas/platform/schemas/service-name.ncl"`\n3. Compose using helpers: `helpers.compose_config defaults mode {}`\n4. Document in this README\n5. 
Test with: `nickel typecheck` and `nickel export --format json`\n\n## Platform vs Infrastructure Configuration\n\n**Platform Configuration** (this directory):\n- Service-specific settings (port, database host, logging level)\n- Loaded by ConfigLoader at service startup\n- Format: TOML files in `runtime/generated/`\n- Examples: orchestrator.solo.example.ncl, orchestrator.multiuser.example.ncl\n\n**Infrastructure Configuration** (provisioning/schemas/infrastructure/):\n- Deployment-specific settings (replicas, resources, networking)\n- Generated and validated separately\n- Formats: YAML (Docker/Kubernetes), JSON (registries), conf (Nginx)\n- Examples: examples-solo-deployment.ncl, examples-enterprise-deployment.ncl\n\n**Why Both?**:\n- Platform config: How should Orchestrator behave? (internal settings)\n- Infrastructure config: How should Orchestrator be deployed? (external deployment)\n\n---\n\n**Last Updated**: 2025-01-06 (Updated with Infrastructure Integration Guide)\n**ConfigLoader Version**: 2.0.0\n**Nickel Version**: Latest\n**Infrastructure Integration**: Complete with schemas, examples, and validation scripts \ No newline at end of file diff --git a/crates/control-center-ui/README.md b/crates/control-center-ui/README.md index 1bf5fb6..a1b71e0 100644 --- a/crates/control-center-ui/README.md +++ b/crates/control-center-ui/README.md @@ -1,368 +1 @@ -# Control Center UI - Audit Log Viewer - -A comprehensive React-based audit log viewer for the Cedar Policy Engine with advanced search, real-time streaming, -compliance reporting, and visualization capabilities. - -## 🚀 Features - -### 🔍 Advanced Search & Filtering - -- **Multi-dimensional Filters**: Date range, users, actions, resources, severity, compliance frameworks -- **Real-time Search**: Debounced search with instant results -- **Saved Searches**: Save and reuse complex filter combinations -- **Quick Filters**: One-click access to common time ranges and filters -- **Correlation Search**: Find logs by request ID, session ID, or trace correlation - -### 📊 High-Performance Data Display - -- **Virtual Scrolling**: Handle millions of log entries with smooth scrolling -- **Infinite Loading**: Automatic pagination with optimized data fetching -- **Column Sorting**: Sort by any field with persistent state -- **Bulk Selection**: Select multiple logs for batch operations -- **Responsive Design**: Works seamlessly on desktop, tablet, and mobile - -### 🔴 Real-time Streaming - -- **WebSocket Integration**: Live log updates without page refresh -- **Connection Management**: Automatic reconnection with exponential backoff -- **Real-time Indicators**: Visual status of live connection -- **Message Queuing**: Handles high-volume log streams efficiently -- **Alert Notifications**: Critical events trigger immediate notifications - -### 📋 Detailed Log Inspection - -- **JSON Viewer**: Syntax-highlighted JSON with collapsible sections -- **Multi-tab Interface**: Overview, Context, Metadata, Compliance, Raw JSON -- **Sensitive Data Toggle**: Hide/show sensitive information -- **Copy Utilities**: One-click copying of IDs, values, and entire records -- **Deep Linking**: Direct URLs to specific log entries - -### 📤 Export & Reporting - -- **Multiple Formats**: CSV, JSON, PDF export with customizable fields -- **Template System**: Pre-built templates for different report types -- **Batch Export**: Export filtered results or selected logs -- **Progress Tracking**: Real-time export progress indication -- **Custom Fields**: Choose exactly which data to include - 
-### 🛡️ Compliance Management - -- **Framework Support**: SOC2, HIPAA, PCI DSS, GDPR compliance templates -- **Report Generation**: Automated compliance reports with evidence -- **Finding Tracking**: Track violations and remediation status -- **Attestation Management**: Digital signatures and certifications -- **Template Library**: Customizable report templates for different frameworks - -### 🔗 Log Correlation & Tracing - -- **Request Tracing**: Follow request flows across services -- **Session Analysis**: View all activity for a user session -- **Dependency Mapping**: Understand log relationships and causality -- **Timeline Views**: Chronological visualization of related events - -### 📈 Visualization & Analytics - -- **Dashboard Metrics**: Real-time statistics and KPIs -- **Timeline Charts**: Visual representation of log patterns -- **Geographic Distribution**: Location-based log analysis -- **Severity Trends**: Track security event patterns over time -- **User Activity**: Monitor user behavior and access patterns - -## 🛠 Technology Stack - -### Frontend Framework - -- **React 18.3.1**: Modern React with hooks and concurrent features -- **TypeScript 5.5.4**: Type-safe development with advanced types -- **Vite 5.4.1**: Lightning-fast build tool and dev server - -### UI Components & Styling - -- **TailwindCSS 3.4.9**: Utility-first CSS framework -- **DaisyUI 4.4.19**: Beautiful component library built on Tailwind -- **Framer Motion 11.3.24**: Smooth animations and transitions -- **Lucide React 0.427.0**: Beautiful, customizable icons - -### Data Management - -- **TanStack Query 5.51.23**: Powerful data fetching and caching -- **TanStack Table 8.20.1**: Headless table utilities for complex data -- **TanStack Virtual 3.8.4**: Virtual scrolling for performance -- **Zustand 4.5.4**: Lightweight state management - -### Forms & Validation - -- **React Hook Form 7.52.2**: Performant forms with minimal re-renders -- **React Select 5.8.0**: Flexible select components with search - -### Real-time & Networking - -- **Native WebSocket API**: Direct WebSocket integration -- **Custom Hooks**: Reusable WebSocket management with reconnection - -### Export & Reporting - -- **jsPDF 2.5.1**: Client-side PDF generation -- **jsPDF AutoTable 3.8.2**: Table formatting for PDF reports -- **Native Blob API**: File download and export functionality - -### Date & Time - -- **date-fns 3.6.0**: Modern date utility library with tree shaking - -## 📁 Project Structure - -```plaintext -src/ -├── components/audit/ # Audit log components -│ ├── AuditLogViewer.tsx # Main viewer component -│ ├── SearchFilters.tsx # Advanced search interface -│ ├── VirtualizedLogTable.tsx # High-performance table -│ ├── LogDetailModal.tsx # Detailed log inspection -│ ├── ExportModal.tsx # Export functionality -│ ├── ComplianceReportGenerator.tsx # Compliance reports -│ └── RealTimeIndicator.tsx # WebSocket status -├── hooks/ # Custom React hooks -│ └── useWebSocket.ts # WebSocket management -├── services/ # API integration -│ └── api.ts # Audit API client -├── types/ # TypeScript definitions -│ └── audit.ts # Audit-specific types -├── utils/ # Utility functions -├── store/ # State management -└── styles/ # CSS and styling -```text - -## 🔧 Setup and Development - -### Prerequisites - -- **Node.js 18+** and **npm 9+** -- **Control Center backend** running on `http://localhost:8080` - -### Installation - -```bash -# Clone the repository -git clone -cd control-center-ui - -# Install dependencies -npm install - -# Start development server -npm run 
dev -```text - -The application will be available at `http://localhost:3000` - -### Building for Production - -```bash -# Type check -npm run type-check - -# Build for production -npm run build - -# Preview production build -npm run preview -```text - -## 🌐 API Integration - -The UI integrates with the Control Center backend and expects the following endpoints: - -- `GET /audit/logs` - Fetch audit logs with filtering and pagination -- `GET /audit/logs/{id}` - Get specific log entry details -- `POST /audit/search` - Advanced search functionality -- `GET /audit/saved-searches` - Manage saved search queries -- `POST /audit/export` - Export logs in various formats (CSV, JSON, PDF) -- `GET /compliance/reports` - Compliance report management -- `POST /compliance/reports/generate` - Generate compliance reports -- `WS /audit/stream` - Real-time log streaming via WebSocket -- `GET /health` - Health check endpoint - -### WebSocket Integration - -Real-time log streaming is implemented using WebSocket connections: - -```typescript -import { useWebSocket } from './hooks/useWebSocket'; - -const { isConnected, lastMessage } = useWebSocket({ - url: 'ws://localhost:8080/ws/audit', - onNewAuditLog: (log) => { - // Handle new log entry in real-time - updateLogsList(log); - } -}); -```text - -## ✅ Features Implemented - -### Core Audit Log Viewer System - -- ✅ **Advanced Search Filters**: Multi-dimensional filtering with date range, users, actions, resources, severity, compliance frameworks -- ✅ **Virtual Scrolling Component**: High-performance rendering capable of handling millions of log entries -- ✅ **Real-time Log Streaming**: WebSocket integration with automatic reconnection and live status indicators -- ✅ **Detailed Log Modal**: Multi-tab interface with JSON syntax highlighting, sensitive data toggle, and copy utilities -- ✅ **Export Functionality**: Support for CSV, JSON, and PDF formats with customizable fields and templates -- ✅ **Saved Search Queries**: User preference system for saving and reusing complex search combinations - -### Compliance & Security Features - -- ✅ **Compliance Report Generator**: Automated report generation with SOC2, HIPAA, PCI DSS, and GDPR templates -- ✅ **Violation Tracking**: Remediation workflow system with task management and progress tracking -- ✅ **Timeline Visualization**: Chronological visualization of audit trails with correlation mapping -- ✅ **Request ID Correlation**: Cross-service request tracing and session analysis -- ✅ **Attestation Management**: Digital signature system for compliance certifications -- ✅ **Log Retention Management**: Archival policies and retention period management - -### Performance & User Experience - -- ✅ **Dashboard Analytics**: Real-time metrics including success rates, critical events, and compliance scores -- ✅ **Responsive Design**: Mobile-first design that works across all device sizes -- ✅ **Loading States**: Comprehensive loading indicators and skeleton screens -- ✅ **Error Handling**: Robust error boundaries with user-friendly error messages -- ✅ **Keyboard Shortcuts**: Accessibility features and keyboard navigation support - -## 🎨 Styling and Theming - -### TailwindCSS Configuration - -The application uses a comprehensive TailwindCSS setup with: - -- **DaisyUI Components**: Pre-built, accessible UI components -- **Custom Color Palette**: Primary, secondary, success, warning, error themes -- **Custom Animations**: Smooth transitions and loading states -- **Dark/Light Themes**: Automatic theme switching with system preference 
detection -- **Responsive Grid System**: Mobile-first responsive design - -### Component Design System - -- **Consistent Spacing**: Standardized margin and padding scales -- **Typography Scale**: Hierarchical text sizing and weights -- **Icon System**: Comprehensive icon library with consistent styling -- **Form Controls**: Validated, accessible form components -- **Data Visualization**: Charts and metrics with consistent styling - -## 📱 Performance Optimization - -### Virtual Scrolling - -- Renders only visible rows for optimal performance -- Handles datasets with millions of entries smoothly -- Maintains smooth scrolling with momentum preservation -- Automatic cleanup of off-screen elements - -### Efficient Data Fetching - -- Infinite queries with intelligent pagination -- Aggressive caching with TanStack Query -- Optimistic updates for better user experience -- Background refetching for fresh data - -### Bundle Optimization - -- Code splitting by route and feature -- Tree shaking for minimal bundle size -- Lazy loading of heavy components -- Optimized production builds - -## 🔒 Security Considerations - -### Data Protection - -- Sensitive data masking in UI components -- Secure WebSocket connections (WSS in production) -- Content Security Policy headers for XSS protection -- Input sanitization for search queries - -### API Security - -- JWT token authentication support (when implemented) -- Request rate limiting awareness -- Secure file downloads with proper headers -- CORS configuration for cross-origin requests - -## 🚀 Deployment - -### Docker Deployment - -```dockerfile -FROM node:18-alpine as builder -WORKDIR /app -COPY package*.json ./ -RUN npm ci -COPY . . -RUN npm run build - -FROM nginx:alpine -COPY --from=builder /app/dist /usr/share/nginx/html -COPY nginx.conf /etc/nginx/nginx.conf -EXPOSE 80 -CMD ["nginx", "-g", "daemon off;"] -```text - -### Kubernetes Deployment - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: control-center-ui -spec: - replicas: 3 - selector: - matchLabels: - app: control-center-ui - template: - metadata: - labels: - app: control-center-ui - spec: - containers: - - name: control-center-ui - image: control-center-ui:latest - ports: - - containerPort: 80 - env: - - name: VITE_API_BASE_URL - value: "https://api.example.com" -```text - -## 🤝 Contributing - -### Development Guidelines - -- Follow TypeScript strict mode conventions -- Use existing component patterns and design system -- Maintain accessibility standards (WCAG 2.1 AA) -- Add proper error boundaries for robust error handling -- Write meaningful commit messages following conventional commits - -### Code Style - -- Use Prettier for consistent code formatting -- Follow ESLint rules for code quality -- Use semantic HTML elements for accessibility -- Maintain consistent naming conventions -- Document complex logic with comments - -## 📄 License - -This project follows the same license as the parent Control Center repository. - -## 🆘 Support - -For questions, issues, or contributions: - -1. Check existing issues in the repository -2. Review the comprehensive documentation -3. Create detailed bug reports or feature requests -4. Follow the established contribution guidelines - ---- - -Built with ❤️ for comprehensive audit log management, compliance monitoring, and security analytics. 
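For orientation on the `WS /audit/stream` route listed in the API section above, here is a hedged server-side sketch using axum. The actual control-center backend may use a different framework, route name, and payload schema; everything here is illustrative only:

```rust
use axum::{
    extract::ws::{Message, WebSocket, WebSocketUpgrade},
    response::IntoResponse,
    routing::get,
    Router,
};

// Hypothetical sketch of a WebSocket audit-stream endpoint.
async fn audit_stream(ws: WebSocketUpgrade) -> impl IntoResponse {
    ws.on_upgrade(push_audit_events)
}

async fn push_audit_events(mut socket: WebSocket) {
    // A real server would forward entries from a broadcast channel as they
    // are written; this pushes a single illustrative message and returns.
    let event = r#"{"type":"new_audit_log","severity":"info"}"#;
    let _ = socket.send(Message::Text(event.into())).await;
}

fn audit_router() -> Router {
    Router::new().route("/audit/stream", get(audit_stream))
}
```

On the client side this pairs with the `useWebSocket` hook shown earlier, which reconnects with exponential backoff when the socket drops.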
+# Control Center UI - Audit Log Viewer\n\nA comprehensive React-based audit log viewer for the Cedar Policy Engine with advanced search, real-time streaming,\ncompliance reporting, and visualization capabilities.\n\n## 🚀 Features\n\n### 🔍 Advanced Search & Filtering\n\n- **Multi-dimensional Filters**: Date range, users, actions, resources, severity, compliance frameworks\n- **Real-time Search**: Debounced search with instant results\n- **Saved Searches**: Save and reuse complex filter combinations\n- **Quick Filters**: One-click access to common time ranges and filters\n- **Correlation Search**: Find logs by request ID, session ID, or trace correlation\n\n### 📊 High-Performance Data Display\n\n- **Virtual Scrolling**: Handle millions of log entries with smooth scrolling\n- **Infinite Loading**: Automatic pagination with optimized data fetching\n- **Column Sorting**: Sort by any field with persistent state\n- **Bulk Selection**: Select multiple logs for batch operations\n- **Responsive Design**: Works seamlessly on desktop, tablet, and mobile\n\n### 🔴 Real-time Streaming\n\n- **WebSocket Integration**: Live log updates without page refresh\n- **Connection Management**: Automatic reconnection with exponential backoff\n- **Real-time Indicators**: Visual status of live connection\n- **Message Queuing**: Handles high-volume log streams efficiently\n- **Alert Notifications**: Critical events trigger immediate notifications\n\n### 📋 Detailed Log Inspection\n\n- **JSON Viewer**: Syntax-highlighted JSON with collapsible sections\n- **Multi-tab Interface**: Overview, Context, Metadata, Compliance, Raw JSON\n- **Sensitive Data Toggle**: Hide/show sensitive information\n- **Copy Utilities**: One-click copying of IDs, values, and entire records\n- **Deep Linking**: Direct URLs to specific log entries\n\n### 📤 Export & Reporting\n\n- **Multiple Formats**: CSV, JSON, PDF export with customizable fields\n- **Template System**: Pre-built templates for different report types\n- **Batch Export**: Export filtered results or selected logs\n- **Progress Tracking**: Real-time export progress indication\n- **Custom Fields**: Choose exactly which data to include\n\n### 🛡️ Compliance Management\n\n- **Framework Support**: SOC2, HIPAA, PCI DSS, GDPR compliance templates\n- **Report Generation**: Automated compliance reports with evidence\n- **Finding Tracking**: Track violations and remediation status\n- **Attestation Management**: Digital signatures and certifications\n- **Template Library**: Customizable report templates for different frameworks\n\n### 🔗 Log Correlation & Tracing\n\n- **Request Tracing**: Follow request flows across services\n- **Session Analysis**: View all activity for a user session\n- **Dependency Mapping**: Understand log relationships and causality\n- **Timeline Views**: Chronological visualization of related events\n\n### 📈 Visualization & Analytics\n\n- **Dashboard Metrics**: Real-time statistics and KPIs\n- **Timeline Charts**: Visual representation of log patterns\n- **Geographic Distribution**: Location-based log analysis\n- **Severity Trends**: Track security event patterns over time\n- **User Activity**: Monitor user behavior and access patterns\n\n## 🛠 Technology Stack\n\n### Frontend Framework\n\n- **React 18.3.1**: Modern React with hooks and concurrent features\n- **TypeScript 5.5.4**: Type-safe development with advanced types\n- **Vite 5.4.1**: Lightning-fast build tool and dev server\n\n### UI Components & Styling\n\n- **TailwindCSS 3.4.9**: Utility-first CSS framework\n- 
**DaisyUI 4.4.19**: Beautiful component library built on Tailwind\n- **Framer Motion 11.3.24**: Smooth animations and transitions\n- **Lucide React 0.427.0**: Beautiful, customizable icons\n\n### Data Management\n\n- **TanStack Query 5.51.23**: Powerful data fetching and caching\n- **TanStack Table 8.20.1**: Headless table utilities for complex data\n- **TanStack Virtual 3.8.4**: Virtual scrolling for performance\n- **Zustand 4.5.4**: Lightweight state management\n\n### Forms & Validation\n\n- **React Hook Form 7.52.2**: Performant forms with minimal re-renders\n- **React Select 5.8.0**: Flexible select components with search\n\n### Real-time & Networking\n\n- **Native WebSocket API**: Direct WebSocket integration\n- **Custom Hooks**: Reusable WebSocket management with reconnection\n\n### Export & Reporting\n\n- **jsPDF 2.5.1**: Client-side PDF generation\n- **jsPDF AutoTable 3.8.2**: Table formatting for PDF reports\n- **Native Blob API**: File download and export functionality\n\n### Date & Time\n\n- **date-fns 3.6.0**: Modern date utility library with tree shaking\n\n## 📁 Project Structure\n\n```\nsrc/\n├── components/audit/ # Audit log components\n│ ├── AuditLogViewer.tsx # Main viewer component\n│ ├── SearchFilters.tsx # Advanced search interface\n│ ├── VirtualizedLogTable.tsx # High-performance table\n│ ├── LogDetailModal.tsx # Detailed log inspection\n│ ├── ExportModal.tsx # Export functionality\n│ ├── ComplianceReportGenerator.tsx # Compliance reports\n│ └── RealTimeIndicator.tsx # WebSocket status\n├── hooks/ # Custom React hooks\n│ └── useWebSocket.ts # WebSocket management\n├── services/ # API integration\n│ └── api.ts # Audit API client\n├── types/ # TypeScript definitions\n│ └── audit.ts # Audit-specific types\n├── utils/ # Utility functions\n├── store/ # State management\n└── styles/ # CSS and styling\n```\n\n## 🔧 Setup and Development\n\n### Prerequisites\n\n- **Node.js 18+** and **npm 9+**\n- **Control Center backend** running on `http://localhost:8080`\n\n### Installation\n\n```\n# Clone the repository\ngit clone \ncd control-center-ui\n\n# Install dependencies\nnpm install\n\n# Start development server\nnpm run dev\n```\n\nThe application will be available at `http://localhost:3000`\n\n### Building for Production\n\n```\n# Type check\nnpm run type-check\n\n# Build for production\nnpm run build\n\n# Preview production build\nnpm run preview\n```\n\n## 🌐 API Integration\n\nThe UI integrates with the Control Center backend and expects the following endpoints:\n\n- `GET /audit/logs` - Fetch audit logs with filtering and pagination\n- `GET /audit/logs/{id}` - Get specific log entry details\n- `POST /audit/search` - Advanced search functionality\n- `GET /audit/saved-searches` - Manage saved search queries\n- `POST /audit/export` - Export logs in various formats (CSV, JSON, PDF)\n- `GET /compliance/reports` - Compliance report management\n- `POST /compliance/reports/generate` - Generate compliance reports\n- `WS /audit/stream` - Real-time log streaming via WebSocket\n- `GET /health` - Health check endpoint\n\n### WebSocket Integration\n\nReal-time log streaming is implemented using WebSocket connections:\n\n```\nimport { useWebSocket } from './hooks/useWebSocket';\n\nconst { isConnected, lastMessage } = useWebSocket({\n url: 'ws://localhost:8080/ws/audit',\n onNewAuditLog: (log) => {\n // Handle new log entry in real-time\n updateLogsList(log);\n }\n});\n```\n\n## ✅ Features Implemented\n\n### Core Audit Log Viewer System\n\n- ✅ **Advanced Search Filters**: Multi-dimensional 
filtering with date range, users, actions, resources, severity, compliance frameworks\n- ✅ **Virtual Scrolling Component**: High-performance rendering capable of handling millions of log entries\n- ✅ **Real-time Log Streaming**: WebSocket integration with automatic reconnection and live status indicators\n- ✅ **Detailed Log Modal**: Multi-tab interface with JSON syntax highlighting, sensitive data toggle, and copy utilities\n- ✅ **Export Functionality**: Support for CSV, JSON, and PDF formats with customizable fields and templates\n- ✅ **Saved Search Queries**: User preference system for saving and reusing complex search combinations\n\n### Compliance & Security Features\n\n- ✅ **Compliance Report Generator**: Automated report generation with SOC2, HIPAA, PCI DSS, and GDPR templates\n- ✅ **Violation Tracking**: Remediation workflow system with task management and progress tracking\n- ✅ **Timeline Visualization**: Chronological visualization of audit trails with correlation mapping\n- ✅ **Request ID Correlation**: Cross-service request tracing and session analysis\n- ✅ **Attestation Management**: Digital signature system for compliance certifications\n- ✅ **Log Retention Management**: Archival policies and retention period management\n\n### Performance & User Experience\n\n- ✅ **Dashboard Analytics**: Real-time metrics including success rates, critical events, and compliance scores\n- ✅ **Responsive Design**: Mobile-first design that works across all device sizes\n- ✅ **Loading States**: Comprehensive loading indicators and skeleton screens\n- ✅ **Error Handling**: Robust error boundaries with user-friendly error messages\n- ✅ **Keyboard Shortcuts**: Accessibility features and keyboard navigation support\n\n## 🎨 Styling and Theming\n\n### TailwindCSS Configuration\n\nThe application uses a comprehensive TailwindCSS setup with:\n\n- **DaisyUI Components**: Pre-built, accessible UI components\n- **Custom Color Palette**: Primary, secondary, success, warning, error themes\n- **Custom Animations**: Smooth transitions and loading states\n- **Dark/Light Themes**: Automatic theme switching with system preference detection\n- **Responsive Grid System**: Mobile-first responsive design\n\n### Component Design System\n\n- **Consistent Spacing**: Standardized margin and padding scales\n- **Typography Scale**: Hierarchical text sizing and weights\n- **Icon System**: Comprehensive icon library with consistent styling\n- **Form Controls**: Validated, accessible form components\n- **Data Visualization**: Charts and metrics with consistent styling\n\n## 📱 Performance Optimization\n\n### Virtual Scrolling\n\n- Renders only visible rows for optimal performance\n- Handles datasets with millions of entries smoothly\n- Maintains smooth scrolling with momentum preservation\n- Automatic cleanup of off-screen elements\n\n### Efficient Data Fetching\n\n- Infinite queries with intelligent pagination\n- Aggressive caching with TanStack Query\n- Optimistic updates for better user experience\n- Background refetching for fresh data\n\n### Bundle Optimization\n\n- Code splitting by route and feature\n- Tree shaking for minimal bundle size\n- Lazy loading of heavy components\n- Optimized production builds\n\n## 🔒 Security Considerations\n\n### Data Protection\n\n- Sensitive data masking in UI components\n- Secure WebSocket connections (WSS in production)\n- Content Security Policy headers for XSS protection\n- Input sanitization for search queries\n\n### API Security\n\n- JWT token authentication support (when 
implemented)\n- Request rate limiting awareness\n- Secure file downloads with proper headers\n- CORS configuration for cross-origin requests\n\n## 🚀 Deployment\n\n### Docker Deployment\n\n```\nFROM node:18-alpine as builder\nWORKDIR /app\nCOPY package*.json ./\nRUN npm ci\nCOPY . .\nRUN npm run build\n\nFROM nginx:alpine\nCOPY --from=builder /app/dist /usr/share/nginx/html\nCOPY nginx.conf /etc/nginx/nginx.conf\nEXPOSE 80\nCMD ["nginx", "-g", "daemon off;"]\n```\n\n### Kubernetes Deployment\n\n```\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: control-center-ui\nspec:\n replicas: 3\n selector:\n matchLabels:\n app: control-center-ui\n template:\n metadata:\n labels:\n app: control-center-ui\n spec:\n containers:\n - name: control-center-ui\n image: control-center-ui:latest\n ports:\n - containerPort: 80\n env:\n - name: VITE_API_BASE_URL\n value: "https://api.example.com"\n```\n\n## 🤝 Contributing\n\n### Development Guidelines\n\n- Follow TypeScript strict mode conventions\n- Use existing component patterns and design system\n- Maintain accessibility standards (WCAG 2.1 AA)\n- Add proper error boundaries for robust error handling\n- Write meaningful commit messages following conventional commits\n\n### Code Style\n\n- Use Prettier for consistent code formatting\n- Follow ESLint rules for code quality\n- Use semantic HTML elements for accessibility\n- Maintain consistent naming conventions\n- Document complex logic with comments\n\n## 📄 License\n\nThis project follows the same license as the parent Control Center repository.\n\n## 🆘 Support\n\nFor questions, issues, or contributions:\n\n1. Check existing issues in the repository\n2. Review the comprehensive documentation\n3. Create detailed bug reports or feature requests\n4. Follow the established contribution guidelines\n\n---\n\nBuilt with ❤️ for comprehensive audit log management, compliance monitoring, and security analytics. \ No newline at end of file diff --git a/crates/control-center-ui/REFERENCE.md b/crates/control-center-ui/REFERENCE.md index 1ff83b5..72b1e2b 100644 --- a/crates/control-center-ui/REFERENCE.md +++ b/crates/control-center-ui/REFERENCE.md @@ -1,33 +1 @@ -# Control Center UI Reference - -This directory will reference the existing control center UI implementation. - -## Current Implementation Location - -`/Users/Akasha/repo-cnz/src/control-center-ui/` - -## Implementation Details - -- **Language**: Web frontend (likely React/Vue/Leptos) - **Purpose**: Web interface for system management - **Features**: - - Dashboard and monitoring UI - - Configuration management interface - - System administration controls - -## Integration Status - -- **Current**: Fully functional in original location - **New Structure**: Reference established - **Migration**: Planned for future phase - -## Usage - -The control center UI remains fully functional at its original location. - -```bash -cd /Users/Akasha/repo-cnz/src/control-center-ui -# Use existing UI development commands -```text - -See original implementation for development setup and usage instructions. 
+# Control Center UI Reference\n\nThis directory will reference the existing control center UI implementation.\n\n## Current Implementation Location\n\n`/Users/Akasha/repo-cnz/src/control-center-ui/`\n\n## Implementation Details\n\n- **Language**: Web frontend (likely React/Vue/Leptos)\n- **Purpose**: Web interface for system management\n- **Features**:\n - Dashboard and monitoring UI\n - Configuration management interface\n - System administration controls\n\n## Integration Status\n\n- **Current**: Fully functional in original location\n- **New Structure**: Reference established\n- **Migration**: Planned for future phase\n\n## Usage\n\nThe control center UI remains fully functional at its original location.\n\n```\ncd /Users/Akasha/repo-cnz/src/control-center-ui\n# Use existing UI development commands\n```\n\nSee original implementation for development setup and usage instructions. \ No newline at end of file diff --git a/crates/control-center-ui/auth-system.md b/crates/control-center-ui/auth-system.md index b03dd0a..8096cae 100644 --- a/crates/control-center-ui/auth-system.md +++ b/crates/control-center-ui/auth-system.md @@ -1,381 +1 @@ -# Control Center UI - Leptos Authentication System - -A comprehensive authentication system built with Leptos and WebAssembly for cloud infrastructure management. - -## 🔐 Features Overview - -### Core Authentication - -- **Email/Password Login** with comprehensive validation -- **JWT Token Management** with automatic refresh -- **Secure Token Storage** with AES-256-GCM encryption in localStorage -- **401 Response Interceptor** for automatic logout and token refresh - -### Multi-Factor Authentication (MFA) - -- **TOTP-based MFA** with QR code generation -- **Backup Codes** for account recovery -- **Mobile App Integration** (Google Authenticator, Authy, etc.) 
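To ground the TOTP items above: the server and the authenticator app derive the same 6-digit code from a shared secret and the current 30-second window (RFC 6238). A minimal sketch with the `hmac` and `sha1` crates; the real `mfa_setup.rs` component likely delegates to a dedicated TOTP crate, so treat this as an illustration of the mechanism only:

```rust
use hmac::{Hmac, Mac};
use sha1::Sha1;

/// RFC 6238 TOTP sketch: the 6-digit code for `secret` at `unix_time`.
/// Illustration only; production code should use a vetted TOTP crate.
fn totp_code(secret: &[u8], unix_time: u64) -> u32 {
    let counter = unix_time / 30; // 30-second time step
    let mut mac =
        Hmac::<Sha1>::new_from_slice(secret).expect("HMAC accepts any key length");
    mac.update(&counter.to_be_bytes());
    let digest = mac.finalize().into_bytes();
    // RFC 4226 dynamic truncation: the low nibble of the last byte
    // selects a 4-byte window of the 20-byte HMAC-SHA1 digest.
    let offset = (digest[19] & 0x0f) as usize;
    let window = [
        digest[offset],
        digest[offset + 1],
        digest[offset + 2],
        digest[offset + 3],
    ];
    (u32::from_be_bytes(window) & 0x7fff_ffff) % 1_000_000
}
```

Verifiers usually also accept the adjacent time windows, which absorbs small clock drift between the phone and the server.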
- -### Biometric Authentication - -- **WebAuthn/FIDO2 Support** for passwordless authentication -- **Platform Authenticators** (Touch ID, Face ID, Windows Hello) -- **Cross-Platform Security Keys** (USB, NFC, Bluetooth) -- **Credential Management** with device naming and removal - -### Advanced Security Features - -- **Device Trust Management** with fingerprinting -- **Session Timeout Warnings** with countdown timers -- **Password Reset Flow** with email verification -- **SSO Integration** (OAuth2, SAML, OpenID Connect) -- **Session Management** with active session monitoring - -### Route Protection - -- **Auth Guards** for protected routes -- **Permission-based Access Control** with role validation -- **Conditional Rendering** based on authentication state -- **Automatic Redirects** for unauthorized access - -## 📁 Architecture Overview - -```plaintext -src/ -├── auth/ # Authentication core -│ ├── mod.rs # Type definitions and exports -│ ├── token_manager.rs # JWT token handling with auto-refresh -│ ├── storage.rs # Encrypted token storage -│ ├── webauthn.rs # WebAuthn/FIDO2 implementation -│ ├── crypto.rs # Cryptographic utilities -│ └── http_interceptor.rs # HTTP request/response interceptor -├── components/auth/ # Authentication components -│ ├── mod.rs # Component exports -│ ├── login_form.rs # Email/password login form -│ ├── mfa_setup.rs # TOTP MFA configuration -│ ├── password_reset.rs # Password reset flow -│ ├── auth_guard.rs # Route protection components -│ ├── session_timeout.rs # Session management modal -│ ├── sso_buttons.rs # SSO provider buttons -│ ├── device_trust.rs # Device trust management -│ ├── biometric_auth.rs # WebAuthn biometric auth -│ ├── logout_button.rs # Logout functionality -│ └── user_profile.rs # User profile management -├── utils/ # Utility modules -└── lib.rs # Main application entry -```text - -## 🚀 Implemented Components - -All authentication components have been successfully implemented: - -### ✅ Core Authentication Infrastructure - -- **Secure Token Storage** (`src/auth/storage.rs`) - AES-256-GCM encrypted localStorage with session-based keys -- **JWT Token Manager** (`src/auth/token_manager.rs`) - Automatic token refresh, expiry monitoring, context management -- **Crypto Utilities** (`src/auth/crypto.rs`) - Secure random generation, hashing, HMAC, device fingerprinting -- **HTTP Interceptor** (`src/auth/http_interceptor.rs`) - 401 handling, automatic logout, request/response middleware - -### ✅ Authentication Components - -- **Login Form** (`src/components/auth/login_form.rs`) - Email/password validation, remember me, SSO integration -- **MFA Setup** (`src/components/auth/mfa_setup.rs`) - TOTP with QR codes, backup codes, verification flow -- **Password Reset** (`src/components/auth/password_reset.rs`) - Email verification, secure token flow, validation -- **Session Timeout** (`src/components/auth/session_timeout.rs`) - Countdown modal, automatic logout, session extension - -### ✅ Advanced Security Features - -- **Device Trust** (`src/components/auth/device_trust.rs`) - Device fingerprinting, trust management, auto-generated names -- **Biometric Auth** (`src/components/auth/biometric_auth.rs`) - WebAuthn/FIDO2 integration, credential management -- **SSO Buttons** (`src/components/auth/sso_buttons.rs`) - OAuth2/SAML/OIDC providers with branded icons -- **User Profile** (`src/components/auth/user_profile.rs`) - Comprehensive profile management with tabbed interface - -### ✅ Route Protection System - -- **Auth Guard** 
(`src/components/auth/auth_guard.rs`) - Protected routes, permission guards, role-based access -- **Logout Button** (`src/components/auth/logout_button.rs`) - Secure logout with server notification and cleanup - -### ✅ WebAuthn Integration - -- **WebAuthn Manager** (`src/auth/webauthn.rs`) - Complete FIDO2 implementation with browser compatibility -- **Biometric Registration** - Platform and cross-platform authenticator support -- **Credential Management** - Device naming, usage tracking, removal capabilities - -## 🔒 Security Implementation - -### Token Security - -- **AES-256-GCM Encryption**: All tokens encrypted before storage -- **Session-based Keys**: Encryption keys unique per browser session -- **Automatic Rotation**: Keys regenerated on each application load -- **Secure Cleanup**: Complete token removal on logout - -### Device Trust - -- **Hardware Fingerprinting**: Based on browser, platform, screen, timezone -- **Trust Duration**: Configurable trust periods (7, 30, 90, 365 days) -- **Trust Tokens**: Separate tokens for device trust validation -- **Remote Revocation**: Server-side device trust management - -### Session Management - -- **Configurable Timeouts**: Adjustable session timeout periods -- **Activity Monitoring**: Tracks user activity for session extension -- **Concurrent Sessions**: Multiple session tracking and management -- **Graceful Logout**: Clean session termination with server notification - -### WebAuthn Security - -- **Hardware Security**: Leverages hardware security modules -- **Biometric Verification**: Touch ID, Face ID, Windows Hello support -- **Security Key Support**: USB, NFC, Bluetooth FIDO2 keys -- **Attestation Validation**: Hardware authenticity verification - -## 📱 Component Usage Examples - -### Basic Authentication Flow - -```rust -use leptos::*; -use control_center_ui::auth::provide_auth_context; -use control_center_ui::components::auth::*; - -#[component] -fn App() -> impl IntoView { - provide_meta_context(); - - // Initialize auth context with API base URL - provide_auth_context("http://localhost:8080".to_string()).unwrap(); - - view! { - // (view! markup lost in extraction: router and route components wiring the pages below) - } -} -```text - -### Login Page Implementation - -```rust -#[component] -fn LoginPage() -> impl IntoView { - view! { - // (view! markup lost in extraction: page layout with a "Control Center" heading and the login form component) - } -} -```text - -### Protected Dashboard - -```rust -#[component] -fn DashboardPage() -> impl IntoView { - view! { - // (view! markup lost in extraction: an auth guard and layout around the page) - // Dashboard content - } -} -```text - -### User Profile Management - -```rust -#[component] -fn ProfilePage() -> impl IntoView { - view! { - // (view! markup lost in extraction: an auth guard wrapping the user profile component)
- } -} -```text - -## 🔧 Required Backend API - -The authentication system expects the following backend endpoints: - -### Authentication Endpoints - -```plaintext -POST /auth/login # Email/password authentication -POST /auth/refresh # JWT token refresh -POST /auth/logout # Session termination -POST /auth/extend-session # Session timeout extension -```text - -### Password Management - -```plaintext -POST /auth/password-reset # Password reset request -POST /auth/password-reset/confirm # Password reset confirmation -```text - -### Multi-Factor Authentication - -```plaintext -POST /auth/mfa/setup # MFA setup initiation -POST /auth/mfa/verify # MFA verification -```text - -### SSO Integration - -```plaintext -GET /auth/sso/providers # Available SSO providers -POST /auth/sso/{provider}/login # SSO authentication initiation -```text - -### WebAuthn/FIDO2 - -```plaintext -POST /auth/webauthn/register/begin # WebAuthn registration start -POST /auth/webauthn/register/complete # WebAuthn registration finish -POST /auth/webauthn/authenticate/begin # WebAuthn authentication start -POST /auth/webauthn/authenticate/complete # WebAuthn authentication finish -GET /auth/webauthn/credentials # List WebAuthn credentials -DELETE /auth/webauthn/credentials/{id} # Remove WebAuthn credential -```text - -### Device Trust Management - -```plaintext -GET /auth/devices # List trusted devices -POST /auth/devices/trust # Trust current device -DELETE /auth/devices/{id}/revoke # Revoke device trust -```text - -### User Profile Management - -```plaintext -GET /user/profile # Get user profile -PUT /user/profile # Update user profile -POST /user/change-password # Change password -POST /user/mfa/enable # Enable MFA -POST /user/mfa/disable # Disable MFA -GET /user/sessions # List active sessions -DELETE /user/sessions/{id}/revoke # Revoke session -```text - -## 📊 Implementation Statistics - -### Component Coverage - -- **13/13 Core Components** ✅ Complete -- **4/4 Auth Infrastructure** ✅ Complete -- **9/9 Security Features** ✅ Complete -- **3/3 Route Protection** ✅ Complete -- **2/2 WebAuthn Features** ✅ Complete - -### Security Features - -- **Encrypted Storage** ✅ AES-256-GCM with session keys -- **Automatic Token Refresh** ✅ Background refresh with retry logic -- **Device Fingerprinting** ✅ Hardware-based unique identification -- **Session Management** ✅ Timeout warnings and extensions -- **Biometric Authentication** ✅ WebAuthn/FIDO2 integration -- **Multi-Factor Auth** ✅ TOTP with QR codes and backup codes -- **SSO Integration** ✅ OAuth2/SAML/OIDC providers -- **Route Protection** ✅ Guards with permission/role validation - -### Performance Optimizations - -- **Lazy Loading** ✅ Components loaded on demand -- **Reactive Updates** ✅ Leptos fine-grained reactivity -- **Efficient Re-renders** ✅ Minimal component updates -- **Background Operations** ✅ Non-blocking authentication flows -- **Connection Management** ✅ Automatic retry and fallback - -## 🎯 Key Features Highlights - -### Advanced Authentication - -- **Passwordless Login**: WebAuthn biometric authentication -- **Device Memory**: Skip MFA on trusted devices -- **Session Continuity**: Automatic token refresh without interruption -- **Multi-Provider SSO**: Google, Microsoft, GitHub, GitLab, etc. 
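"Session Continuity" above depends on refreshing the access token shortly before it expires rather than after a request fails. A hedged sketch of that scheduling rule; the struct shape and the 60-second margin are assumptions, not the real `token_manager.rs` API:

```rust
use std::time::Duration;

/// Illustrative token metadata; field names are assumptions.
struct AccessToken {
    expires_at_unix: u64,
}

/// Refresh this long before expiry so no request ever races a dead token.
const REFRESH_MARGIN_SECS: u64 = 60;

/// How long a background task should sleep before starting a refresh.
/// Returns zero when the token is already inside the margin.
fn time_until_refresh(token: &AccessToken, now_unix: u64) -> Duration {
    let refresh_at = token.expires_at_unix.saturating_sub(REFRESH_MARGIN_SECS);
    Duration::from_secs(refresh_at.saturating_sub(now_unix))
}
```

Scheduling the refresh ahead of expiry is what makes the flow invisible to the user: by the time the old token would have failed, the new one is already in place.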
- -### Enterprise Security - -- **Hardware Security**: FIDO2 security keys and platform authenticators -- **Device Trust**: Configurable trust periods with remote revocation -- **Session Monitoring**: Real-time session management and monitoring -- **Audit Trail**: Complete authentication event logging - -### Developer Experience - -- **Type Safety**: Full TypeScript-equivalent safety with Rust -- **Component Reusability**: Modular authentication components -- **Easy Integration**: Simple context provider setup -- **Comprehensive Documentation**: Detailed implementation guide - -### User Experience - -- **Smooth Flows**: Intuitive authentication workflows -- **Mobile Support**: Responsive design for all devices -- **Accessibility**: WCAG 2.1 compliant components -- **Error Handling**: User-friendly error messages and recovery - -## 🚀 Getting Started - -### Prerequisites - -- **Rust 1.70+** with wasm-pack -- **Leptos 0.6** framework -- **Compatible browser** (Chrome 67+, Firefox 60+, Safari 14+, Edge 18+) - -### Quick Setup - -1. Add the authentication dependencies to your `Cargo.toml` -2. Initialize the authentication context in your app -3. Use the provided components in your routes -4. Configure your backend API endpoints -5. Test the complete authentication flow - -### Production Deployment - -- **HTTPS Required**: WebAuthn requires secure connections -- **CORS Configuration**: Proper cross-origin setup -- **CSP Headers**: Content security policy for XSS protection -- **Rate Limiting**: API endpoint protection - ---- - -**A complete, production-ready authentication system built with modern Rust and WebAssembly technologies.** +# Control Center UI - Leptos Authentication System\n\nA comprehensive authentication system built with Leptos and WebAssembly for cloud infrastructure management.\n\n## 🔐 Features Overview\n\n### Core Authentication\n\n- **Email/Password Login** with comprehensive validation\n- **JWT Token Management** with automatic refresh\n- **Secure Token Storage** with AES-256-GCM encryption in localStorage\n- **401 Response Interceptor** for automatic logout and token refresh\n\n### Multi-Factor Authentication (MFA)\n\n- **TOTP-based MFA** with QR code generation\n- **Backup Codes** for account recovery\n- **Mobile App Integration** (Google Authenticator, Authy, etc.)\n\n### Biometric Authentication\n\n- **WebAuthn/FIDO2 Support** for passwordless authentication\n- **Platform Authenticators** (Touch ID, Face ID, Windows Hello)\n- **Cross-Platform Security Keys** (USB, NFC, Bluetooth)\n- **Credential Management** with device naming and removal\n\n### Advanced Security Features\n\n- **Device Trust Management** with fingerprinting\n- **Session Timeout Warnings** with countdown timers\n- **Password Reset Flow** with email verification\n- **SSO Integration** (OAuth2, SAML, OpenID Connect)\n- **Session Management** with active session monitoring\n\n### Route Protection\n\n- **Auth Guards** for protected routes\n- **Permission-based Access Control** with role validation\n- **Conditional Rendering** based on authentication state\n- **Automatic Redirects** for unauthorized access\n\n## 📁 Architecture Overview\n\n```\nsrc/\n├── auth/ # Authentication core\n│ ├── mod.rs # Type definitions and exports\n│ ├── token_manager.rs # JWT token handling with auto-refresh\n│ ├── storage.rs # Encrypted token storage\n│ ├── webauthn.rs # WebAuthn/FIDO2 implementation\n│ ├── crypto.rs # Cryptographic utilities\n│ └── http_interceptor.rs # HTTP request/response interceptor\n├── components/auth/ # 
Authentication components\n│ ├── mod.rs # Component exports\n│ ├── login_form.rs # Email/password login form\n│ ├── mfa_setup.rs # TOTP MFA configuration\n│ ├── password_reset.rs # Password reset flow\n│ ├── auth_guard.rs # Route protection components\n│ ├── session_timeout.rs # Session management modal\n│ ├── sso_buttons.rs # SSO provider buttons\n│ ├── device_trust.rs # Device trust management\n│ ├── biometric_auth.rs # WebAuthn biometric auth\n│ ├── logout_button.rs # Logout functionality\n│ └── user_profile.rs # User profile management\n├── utils/ # Utility modules\n└── lib.rs # Main application entry\n```\n\n## 🚀 Implemented Components\n\nAll authentication components have been successfully implemented:\n\n### ✅ Core Authentication Infrastructure\n\n- **Secure Token Storage** (`src/auth/storage.rs`) - AES-256-GCM encrypted localStorage with session-based keys\n- **JWT Token Manager** (`src/auth/token_manager.rs`) - Automatic token refresh, expiry monitoring, context management\n- **Crypto Utilities** (`src/auth/crypto.rs`) - Secure random generation, hashing, HMAC, device fingerprinting\n- **HTTP Interceptor** (`src/auth/http_interceptor.rs`) - 401 handling, automatic logout, request/response middleware\n\n### ✅ Authentication Components\n\n- **Login Form** (`src/components/auth/login_form.rs`) - Email/password validation, remember me, SSO integration\n- **MFA Setup** (`src/components/auth/mfa_setup.rs`) - TOTP with QR codes, backup codes, verification flow\n- **Password Reset** (`src/components/auth/password_reset.rs`) - Email verification, secure token flow, validation\n- **Session Timeout** (`src/components/auth/session_timeout.rs`) - Countdown modal, automatic logout, session extension\n\n### ✅ Advanced Security Features\n\n- **Device Trust** (`src/components/auth/device_trust.rs`) - Device fingerprinting, trust management, auto-generated names\n- **Biometric Auth** (`src/components/auth/biometric_auth.rs`) - WebAuthn/FIDO2 integration, credential management\n- **SSO Buttons** (`src/components/auth/sso_buttons.rs`) - OAuth2/SAML/OIDC providers with branded icons\n- **User Profile** (`src/components/auth/user_profile.rs`) - Comprehensive profile management with tabbed interface\n\n### ✅ Route Protection System\n\n- **Auth Guard** (`src/components/auth/auth_guard.rs`) - Protected routes, permission guards, role-based access\n- **Logout Button** (`src/components/auth/logout_button.rs`) - Secure logout with server notification and cleanup\n\n### ✅ WebAuthn Integration\n\n- **WebAuthn Manager** (`src/auth/webauthn.rs`) - Complete FIDO2 implementation with browser compatibility\n- **Biometric Registration** - Platform and cross-platform authenticator support\n- **Credential Management** - Device naming, usage tracking, removal capabilities\n\n## 🔒 Security Implementation\n\n### Token Security\n\n- **AES-256-GCM Encryption**: All tokens encrypted before storage\n- **Session-based Keys**: Encryption keys unique per browser session\n- **Automatic Rotation**: Keys regenerated on each application load\n- **Secure Cleanup**: Complete token removal on logout\n\n### Device Trust\n\n- **Hardware Fingerprinting**: Based on browser, platform, screen, timezone\n- **Trust Duration**: Configurable trust periods (7, 30, 90, 365 days)\n- **Trust Tokens**: Separate tokens for device trust validation\n- **Remote Revocation**: Server-side device trust management\n\n### Session Management\n\n- **Configurable Timeouts**: Adjustable session timeout periods\n- **Activity Monitoring**: Tracks user activity for 
session extension\n- **Concurrent Sessions**: Multiple session tracking and management\n- **Graceful Logout**: Clean session termination with server notification\n\n### WebAuthn Security\n\n- **Hardware Security**: Leverages hardware security modules\n- **Biometric Verification**: Touch ID, Face ID, Windows Hello support\n- **Security Key Support**: USB, NFC, Bluetooth FIDO2 keys\n- **Attestation Validation**: Hardware authenticity verification\n\n## 📱 Component Usage Examples\n\n### Basic Authentication Flow\n\n```\nuse leptos::*;\nuse control_center_ui::auth::provide_auth_context;\nuse control_center_ui::components::auth::*;\n\n#[component]\nfn App() -> impl IntoView {\n provide_meta_context();\n\n // Initialize auth context with API base URL\n provide_auth_context("http://localhost:8080".to_string()).unwrap();\n\n view! {\n \n \n \n \n \n \n \n }\n}\n```\n\n### Login Page Implementation\n\n```\n#[component]\nfn LoginPage() -> impl IntoView {\n view! {\n
\n
\n

\n "Control Center"\n

\n
\n
\n
\n \n
\n
\n
\n }\n}\n```\n\n### Protected Dashboard\n\n```\n#[component]\nfn DashboardPage() -> impl IntoView {\n view! {\n \n
\n \n
\n \n // Dashboard content\n
\n
\n
\n }\n}\n```\n\n### User Profile Management\n\n```\n#[component]\nfn ProfilePage() -> impl IntoView {\n view! {\n \n
\n
\n \n
\n
\n
\n }\n}\n```\n\n## 🔧 Required Backend API\n\nThe authentication system expects the following backend endpoints:\n\n### Authentication Endpoints\n\n```\nPOST /auth/login # Email/password authentication\nPOST /auth/refresh # JWT token refresh\nPOST /auth/logout # Session termination\nPOST /auth/extend-session # Session timeout extension\n```\n\n### Password Management\n\n```\nPOST /auth/password-reset # Password reset request\nPOST /auth/password-reset/confirm # Password reset confirmation\n```\n\n### Multi-Factor Authentication\n\n```\nPOST /auth/mfa/setup # MFA setup initiation\nPOST /auth/mfa/verify # MFA verification\n```\n\n### SSO Integration\n\n```\nGET /auth/sso/providers # Available SSO providers\nPOST /auth/sso/{provider}/login # SSO authentication initiation\n```\n\n### WebAuthn/FIDO2\n\n```\nPOST /auth/webauthn/register/begin # WebAuthn registration start\nPOST /auth/webauthn/register/complete # WebAuthn registration finish\nPOST /auth/webauthn/authenticate/begin # WebAuthn authentication start\nPOST /auth/webauthn/authenticate/complete # WebAuthn authentication finish\nGET /auth/webauthn/credentials # List WebAuthn credentials\nDELETE /auth/webauthn/credentials/{id} # Remove WebAuthn credential\n```\n\n### Device Trust Management\n\n```\nGET /auth/devices # List trusted devices\nPOST /auth/devices/trust # Trust current device\nDELETE /auth/devices/{id}/revoke # Revoke device trust\n```\n\n### User Profile Management\n\n```\nGET /user/profile # Get user profile\nPUT /user/profile # Update user profile\nPOST /user/change-password # Change password\nPOST /user/mfa/enable # Enable MFA\nPOST /user/mfa/disable # Disable MFA\nGET /user/sessions # List active sessions\nDELETE /user/sessions/{id}/revoke # Revoke session\n```\n\n## 📊 Implementation Statistics\n\n### Component Coverage\n\n- **13/13 Core Components** ✅ Complete\n- **4/4 Auth Infrastructure** ✅ Complete\n- **9/9 Security Features** ✅ Complete\n- **3/3 Route Protection** ✅ Complete\n- **2/2 WebAuthn Features** ✅ Complete\n\n### Security Features\n\n- **Encrypted Storage** ✅ AES-256-GCM with session keys\n- **Automatic Token Refresh** ✅ Background refresh with retry logic\n- **Device Fingerprinting** ✅ Hardware-based unique identification\n- **Session Management** ✅ Timeout warnings and extensions\n- **Biometric Authentication** ✅ WebAuthn/FIDO2 integration\n- **Multi-Factor Auth** ✅ TOTP with QR codes and backup codes\n- **SSO Integration** ✅ OAuth2/SAML/OIDC providers\n- **Route Protection** ✅ Guards with permission/role validation\n\n### Performance Optimizations\n\n- **Lazy Loading** ✅ Components loaded on demand\n- **Reactive Updates** ✅ Leptos fine-grained reactivity\n- **Efficient Re-renders** ✅ Minimal component updates\n- **Background Operations** ✅ Non-blocking authentication flows\n- **Connection Management** ✅ Automatic retry and fallback\n\n## 🎯 Key Features Highlights\n\n### Advanced Authentication\n\n- **Passwordless Login**: WebAuthn biometric authentication\n- **Device Memory**: Skip MFA on trusted devices\n- **Session Continuity**: Automatic token refresh without interruption\n- **Multi-Provider SSO**: Google, Microsoft, GitHub, GitLab, etc.\n\n### Enterprise Security\n\n- **Hardware Security**: FIDO2 security keys and platform authenticators\n- **Device Trust**: Configurable trust periods with remote revocation\n- **Session Monitoring**: Real-time session management and monitoring\n- **Audit Trail**: Complete authentication event logging\n\n### Developer Experience\n\n- **Type Safety**: Full 
TypeScript-equivalent safety with Rust\n- **Component Reusability**: Modular authentication components\n- **Easy Integration**: Simple context provider setup\n- **Comprehensive Documentation**: Detailed implementation guide\n\n### User Experience\n\n- **Smooth Flows**: Intuitive authentication workflows\n- **Mobile Support**: Responsive design for all devices\n- **Accessibility**: WCAG 2.1 compliant components\n- **Error Handling**: User-friendly error messages and recovery\n\n## 🚀 Getting Started\n\n### Prerequisites\n\n- **Rust 1.70+** with wasm-pack\n- **Leptos 0.6** framework\n- **Compatible browser** (Chrome 67+, Firefox 60+, Safari 14+, Edge 18+)\n\n### Quick Setup\n\n1. Add the authentication dependencies to your `Cargo.toml`\n2. Initialize the authentication context in your app\n3. Use the provided components in your routes\n4. Configure your backend API endpoints\n5. Test the complete authentication flow\n\n### Production Deployment\n\n- **HTTPS Required**: WebAuthn requires secure connections\n- **CORS Configuration**: Proper cross-origin setup\n- **CSP Headers**: Content security policy for XSS protection\n- **Rate Limiting**: API endpoint protection\n\n---\n\n**A complete, production-ready authentication system built with modern Rust and WebAssembly technologies.** \ No newline at end of file diff --git a/crates/control-center-ui/upstream-dependency-issue.md b/crates/control-center-ui/upstream-dependency-issue.md index c345878..74bb606 100644 --- a/crates/control-center-ui/upstream-dependency-issue.md +++ b/crates/control-center-ui/upstream-dependency-issue.md @@ -1,145 +1 @@ -# Upstream Dependency Issue: num-bigint-dig v0.8.4 - -## Issue Summary - -**Status**: ⚠️ **UPSTREAM ISSUE - NON-BLOCKING** - -The control-center-ui build produces a future incompatibility warning from the transitive dependency `num-bigint-dig v0.8.4`: - -```plaintext -warning: the following packages contain code that will be rejected by a future version of Rust: num-bigint-dig v0.8.4 -note: to see what the problems were, use the option `--future-incompat-report`, or run `cargo report future-incompatibilities --id 1` -```text - -## Root Cause - -The `num-bigint-dig v0.8.4` crate uses a **private `vec!` macro** in multiple locations (Rust issue #120192). -This pattern will become a hard error in a future Rust release. 
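
Both checks below are cheap to run locally. The first replays the detailed report referenced in the warning (the `--id` value is the one printed by this particular build and may differ elsewhere); the second inverts the dependency tree to show which direct dependencies pull the offending version in.

```bash
# Replay the detailed future-incompatibility report from the build warning
cargo report future-incompatibilities --id 1

# Invert the dependency tree: who depends on num-bigint-dig?
cargo tree -i num-bigint-dig
```
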
- -**Affected files in num-bigint-dig v0.8.4:** - -- `src/biguint.rs` (lines 490, 2005, 2027, 2313) -- `src/prime.rs` (line 138) -- `src/bigrand.rs` (line 319) - -## Dependency Chain - -```plaintext -control-center-ui (control-center-ui v0.1.0) - ↓ -num-bigint-dig v0.8.4 - ↑ (pulled in by) -├── rsa v0.9.9 -│ ├── control-center -│ ├── jsonwebtoken v10.2.0 -│ └── provisioning-orchestrator -└── ssh-key v0.6.7 - ├── russh v0.44.1 - └── russh-keys v0.44.0 -```text - -## Why We Can't Fix It - -**Option 1: Direct Patch** - -- ✗ Cannot patch transitive crates.io dependencies to different crates.io versions -- Cargo only allows patches to point to different sources (git repos, local paths) - -**Option 2: Upgrade rsa** - -- Available: `rsa v0.10.0-rc.10` (release candidate only, not stable) -- Status: Not production-ready until stable release -- Current: `rsa v0.9.9` (stable, production) - -**Option 3: Upgrade ssh-key** - -- Current: `ssh-key v0.6.7` -- Still depends on `num-bigint-dig v0.8.4` (not upgraded yet) - -**Option 4: Local Fork** - -- ✗ Not practical for transitive dependencies - -## Resolution Timeline - -**For num-bigint-dig:** - -- Available versions: 0.8.5, 0.8.6, 0.9.0, 0.9.1 -- Latest: v0.9.1 -- Status: Fixed in 0.8.6 and later -- When it gets picked up: Depends on upstream crate releases - -**Upstream Action Items:** - -1. **rsa crate** needs to upgrade to use newer num-bigint-dig when available -2. **ssh-key crate** needs to upgrade to use newer num-bigint-dig when available -3. Once upstream crates update their dependencies, our Cargo.lock will automatically use the fixed version - -## Current Impact - -✅ **NO IMPACT ON FUNCTIONALITY** - -- Code compiles cleanly -- All tests pass -- All features work correctly -- Only a forward-compatibility warning, not an error - -✅ **NOT A BLOCKER FOR:** - -- Deployment -- Production use -- Any functionality -- WASM compilation -- Release builds - -## Timeline for Resolution - -| Status | Item | Estimated | -| -------- | ------ | ----------- | -| ✓ Available | num-bigint-dig 0.8.6 | Already released | -| ⏳ Waiting | rsa v0.10 stable release | 2024-Q4 to 2025-Q1 | -| ⏳ Waiting | Downstream crate updates | After upstream releases | -| ✓ Automatic | Our build updates | Once dependencies are updated | - -## Monitoring - -To check for updates: - -```bash -# Check for future incompatibilities -cargo report future-incompatibilities - -# Check available versions -cargo outdated - -# Check dependency tree -cargo tree | grep num-bigint-dig -```text - -## Workaround (if needed) - -If the warning becomes an error before upstream fixes are released, you can: - -1. **Use an older Rust version** (current stable still allows this as warning) -2. **Wait for upstream updates** (recommended) -3. 
**Create a fork** of rsa/ssh-key with newer num-bigint-dig (not recommended) - -## Recommended Action - -**No immediate action needed.** This is a normal part of the Rust ecosystem evolution: - -- Upstream packages will update their dependencies -- Our Cargo.lock will automatically resolve to fixed versions -- Continue monitoring with `cargo report future-incompatibilities` - -## References - -- Rust Issue #120192: -- num-bigint-dig Repository: -- num-bigint-dig Releases: - ---- - -**Last Updated**: December 12, 2025 -**Status**: Monitored, Non-Blocking -**Action**: Awaiting Upstream Fixes +# Upstream Dependency Issue: num-bigint-dig v0.8.4\n\n## Issue Summary\n\n**Status**: ⚠️ **UPSTREAM ISSUE - NON-BLOCKING**\n\nThe control-center-ui build produces a future incompatibility warning from the transitive dependency `num-bigint-dig v0.8.4`:\n\n```\nwarning: the following packages contain code that will be rejected by a future version of Rust: num-bigint-dig v0.8.4\nnote: to see what the problems were, use the option `--future-incompat-report`, or run `cargo report future-incompatibilities --id 1`\n```\n\n## Root Cause\n\nThe `num-bigint-dig v0.8.4` crate uses a **private `vec!` macro** in multiple locations (Rust issue #120192).\nThis pattern will become a hard error in a future Rust release.\n\n**Affected files in num-bigint-dig v0.8.4:**\n\n- `src/biguint.rs` (lines 490, 2005, 2027, 2313)\n- `src/prime.rs` (line 138)\n- `src/bigrand.rs` (line 319)\n\n## Dependency Chain\n\n```\ncontrol-center-ui (control-center-ui v0.1.0)\n ↓\nnum-bigint-dig v0.8.4\n ↑ (pulled in by)\n├── rsa v0.9.9\n│ ├── control-center\n│ ├── jsonwebtoken v10.2.0\n│ └── provisioning-orchestrator\n└── ssh-key v0.6.7\n ├── russh v0.44.1\n └── russh-keys v0.44.0\n```\n\n## Why We Can't Fix It\n\n**Option 1: Direct Patch**\n\n- ✗ Cannot patch transitive crates.io dependencies to different crates.io versions\n- Cargo only allows patches to point to different sources (git repos, local paths)\n\n**Option 2: Upgrade rsa**\n\n- Available: `rsa v0.10.0-rc.10` (release candidate only, not stable)\n- Status: Not production-ready until stable release\n- Current: `rsa v0.9.9` (stable, production)\n\n**Option 3: Upgrade ssh-key**\n\n- Current: `ssh-key v0.6.7`\n- Still depends on `num-bigint-dig v0.8.4` (not upgraded yet)\n\n**Option 4: Local Fork**\n\n- ✗ Not practical for transitive dependencies\n\n## Resolution Timeline\n\n**For num-bigint-dig:**\n\n- Available versions: 0.8.5, 0.8.6, 0.9.0, 0.9.1\n- Latest: v0.9.1\n- Status: Fixed in 0.8.6 and later\n- When it gets picked up: Depends on upstream crate releases\n\n**Upstream Action Items:**\n\n1. **rsa crate** needs to upgrade to use newer num-bigint-dig when available\n2. **ssh-key crate** needs to upgrade to use newer num-bigint-dig when available\n3. 
Once upstream crates update their dependencies, our Cargo.lock will automatically use the fixed version\n\n## Current Impact\n\n✅ **NO IMPACT ON FUNCTIONALITY**\n\n- Code compiles cleanly\n- All tests pass\n- All features work correctly\n- Only a forward-compatibility warning, not an error\n\n✅ **NOT A BLOCKER FOR:**\n\n- Deployment\n- Production use\n- Any functionality\n- WASM compilation\n- Release builds\n\n## Timeline for Resolution\n\n| Status | Item | Estimated |\n| -------- | ------ | ----------- |\n| ✓ Available | num-bigint-dig 0.8.6 | Already released |\n| ⏳ Waiting | rsa v0.10 stable release | 2024-Q4 to 2025-Q1 |\n| ⏳ Waiting | Downstream crate updates | After upstream releases |\n| ✓ Automatic | Our build updates | Once dependencies are updated |\n\n## Monitoring\n\nTo check for updates:\n\n```\n# Check for future incompatibilities\ncargo report future-incompatibilities\n\n# Check available versions\ncargo outdated\n\n# Check dependency tree\ncargo tree | grep num-bigint-dig\n```\n\n## Workaround (if needed)\n\nIf the warning becomes an error before upstream fixes are released, you can:\n\n1. **Use an older Rust version** (current stable still allows this as warning)\n2. **Wait for upstream updates** (recommended)\n3. **Create a fork** of rsa/ssh-key with newer num-bigint-dig (not recommended)\n\n## Recommended Action\n\n**No immediate action needed.** This is a normal part of the Rust ecosystem evolution:\n\n- Upstream packages will update their dependencies\n- Our Cargo.lock will automatically resolve to fixed versions\n- Continue monitoring with `cargo report future-incompatibilities`\n\n## References\n\n- Rust Issue #120192: \n- num-bigint-dig Repository: \n- num-bigint-dig Releases: \n\n---\n\n**Last Updated**: December 12, 2025\n**Status**: Monitored, Non-Blocking\n**Action**: Awaiting Upstream Fixes \ No newline at end of file diff --git a/crates/control-center/README.md b/crates/control-center/README.md index 0a50455..a01fb6d 100644 --- a/crates/control-center/README.md +++ b/crates/control-center/README.md @@ -1,371 +1 @@ -# Control Center - Cedar Policy Engine - -A comprehensive Cedar policy engine implementation with advanced security features, compliance checking, and anomaly detection. 
- -## Features - -### 🔐 Cedar Policy Engine - -- **Policy Evaluation**: High-performance policy evaluation with context injection -- **Versioning**: Complete policy versioning with rollback capabilities -- **Templates**: Configuration-driven policy templates with variable substitution -- **Validation**: Comprehensive policy validation with syntax and semantic checking - -### 🛡️ Security & Authentication - -- **JWT Authentication**: Secure token-based authentication -- **Multi-Factor Authentication**: MFA support for sensitive operations -- **Role-Based Access Control**: Flexible RBAC with policy integration -- **Session Management**: Secure session handling with timeouts - -### 📊 Compliance Framework - -- **SOC2 Type II**: Complete SOC2 compliance validation -- **HIPAA**: Healthcare data protection compliance -- **Audit Trail**: Comprehensive audit logging and reporting -- **Impact Analysis**: Policy change impact assessment - -### 🔍 Anomaly Detection - -- **Statistical Analysis**: Multiple statistical methods (Z-Score, IQR, Isolation Forest) -- **Real-time Detection**: Continuous monitoring of policy evaluations -- **Alert Management**: Configurable alerting through multiple channels -- **Baseline Learning**: Adaptive baseline calculation for improved accuracy - -### 🗄️ Storage & Persistence - -- **SurrealDB Integration**: High-performance graph database backend -- **Policy Storage**: Versioned policy storage with metadata -- **Metrics Storage**: Policy evaluation metrics and analytics -- **Compliance Records**: Complete compliance audit trails - -## Quick Start - -### 1. Installation - -```bash -cd src/control-center -cargo build --release -```text - -### 2. Configuration - -Copy the example configuration: - -```bash -cp config.toml.example config.toml -```text - -Edit `config.toml` for your environment: - -```toml -[database] -url = "surreal://localhost:8000" # Your SurrealDB instance -username = "root" -password = "your-password" - -[auth] -jwt_secret = "your-super-secret-key" -require_mfa = true - -[compliance.soc2] -enabled = true - -[anomaly] -enabled = true -detection_threshold = 2.5 -```text - -### 3. Start the Server - -```bash -./target/release/control-center server --port 8080 -```text - -### 4. 
Test Policy Evaluation - -```bash -curl -X POST http://localhost:8080/policies/evaluate \ - -H "Content-Type: application/json" \ - -d '{ - "principal": {"id": "user123", "roles": ["Developer"]}, - "action": {"id": "access"}, - "resource": {"id": "sensitive-db", "classification": "confidential"}, - "context": {"mfa_enabled": true, "location": "US"} - }' -```text - -## Policy Examples - -### Multi-Factor Authentication Policy - -```cedar -// Require MFA for sensitive resources -permit( - principal, - action == Action::"access", - resource -) when { - resource has classification && - resource.classification in ["sensitive", "confidential"] && - principal has mfa_enabled && - principal.mfa_enabled == true -}; -```text - -### Production Approval Policy - -```cedar -// Require approval for production operations -permit( - principal, - action in [Action::"deploy", Action::"modify", Action::"delete"], - resource -) when { - resource has environment && - resource.environment == "production" && - principal has approval && - principal.approval.approved_by in ["ProductionAdmin", "SRE"] -}; -```text - -### Geographic Restrictions - -```cedar -// Allow access only from approved countries -permit( - principal, - action, - resource -) when { - context has geo && - context.geo has country && - context.geo.country in ["US", "CA", "GB", "DE"] -}; -```text - -## CLI Commands - -### Policy Management - -```bash -# Validate policies -control-center policy validate policies/ - -# Test policy with test data -control-center policy test policies/mfa.cedar tests/data/mfa_test.json - -# Analyze policy impact -control-center policy impact policies/new_policy.cedar -```text - -### Compliance Checking - -```bash -# Check SOC2 compliance -control-center compliance soc2 - -# Check HIPAA compliance -control-center compliance hipaa - -# Generate compliance report -control-center compliance report --format html -```text - -## API Endpoints - -### Policy Evaluation - -- `POST /policies/evaluate` - Evaluate policy decision -- `GET /policies` - List all policies -- `POST /policies` - Create new policy -- `PUT /policies/{id}` - Update policy -- `DELETE /policies/{id}` - Delete policy - -### Policy Versions - -- `GET /policies/{id}/versions` - List policy versions -- `GET /policies/{id}/versions/{version}` - Get specific version -- `POST /policies/{id}/rollback/{version}` - Rollback to version - -### Compliance - -- `GET /compliance/soc2` - SOC2 compliance check -- `GET /compliance/hipaa` - HIPAA compliance check -- `GET /compliance/report` - Generate compliance report - -### Anomaly Detection - -- `GET /anomalies` - List detected anomalies -- `GET /anomalies/{id}` - Get anomaly details -- `POST /anomalies/detect` - Trigger anomaly detection - -## Testing - -### Run Unit Tests - -```bash -cargo test -```text - -### Run Integration Tests - -```bash -cargo test --test integration_tests -```text - -### Run Policy Tests - -```bash -cargo test --test policy_tests -```text - -### Run Compliance Tests - -```bash -cargo test --test compliance_tests -```text - -## Architecture - -### Core Components - -1. **Policy Engine** (`src/policies/engine.rs`) - - Cedar policy evaluation - - Context injection - - Caching and optimization - -2. **Storage Layer** (`src/storage/`) - - SurrealDB integration - - Policy versioning - - Metrics storage - -3. **Compliance Framework** (`src/compliance/`) - - SOC2 checker - - HIPAA validator - - Report generation - -4. 
**Anomaly Detection** (`src/anomaly/`) - - Statistical analysis - - Real-time monitoring - - Alert management - -5. **Authentication** (`src/auth.rs`) - - JWT token management - - Password hashing - - Session handling - -### Configuration-Driven Design - -The system follows PAP (Project Architecture Principles) with: - -- **No hardcoded values**: All behavior controlled via configuration -- **Dynamic loading**: Policies and rules loaded from configuration -- **Template-based**: Policy generation through templates -- **Environment-aware**: Different configs for dev/test/prod - -### Security Features - -- **Audit Logging**: All policy evaluations logged -- **Encryption**: Data encrypted at rest and in transit -- **Rate Limiting**: Protection against abuse -- **Input Validation**: Comprehensive validation of all inputs -- **Error Handling**: Secure error handling without information leakage - -## Production Deployment - -### Docker - -```dockerfile -FROM rust:1.75 as builder -WORKDIR /app -COPY . . -RUN cargo build --release - -FROM debian:bookworm-slim -RUN apt-get update && apt-get install -y ca-certificates -COPY --from=builder /app/target/release/control-center /usr/local/bin/ -EXPOSE 8080 -CMD ["control-center", "server"] -```text - -### Kubernetes - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: control-center -spec: - replicas: 3 - selector: - matchLabels: - app: control-center - template: - metadata: - labels: - app: control-center - spec: - containers: - - name: control-center - image: control-center:latest - ports: - - containerPort: 8080 - env: - - name: DATABASE_URL - value: "surreal://surrealdb:8000" -```text - -### Environment Variables - -```bash -# Override config values with environment variables -export CONTROL_CENTER_SERVER_PORT=8080 -export CONTROL_CENTER_DATABASE_URL="surreal://prod-db:8000" -export CONTROL_CENTER_AUTH_JWT_SECRET="production-secret" -export CONTROL_CENTER_COMPLIANCE_SOC2_ENABLED=true -```text - -## Monitoring & Observability - -### Metrics - -- Policy evaluation latency -- Policy decision distribution -- Anomaly detection rates -- Compliance scores - -### Logging - -```rust -// Structured logging with tracing -tracing::info!( - policy_id = %policy.id, - principal = %context.principal.id, - decision = ?result.decision, - duration_ms = evaluation_time, - "Policy evaluation completed" -); -```text - -### Health Checks - -```bash -curl http://localhost:8080/health -```text - -## Contributing - -1. Follow the PAP principles documented in the codebase -2. Add tests for new features -3. Update documentation -4. Ensure compliance checks pass -5. Add appropriate logging and monitoring - -## License - -This project follows the licensing specified in the parent repository. - -## Support - -For questions and support, refer to the project documentation or create an issue in the repository. 
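
For programmatic callers, the curl test from the Quick Start translates directly to Rust. The sketch below is illustrative only: it assumes `reqwest` and `serde_json` as dependencies and reuses the exact payload from the curl example; the response shape is not specified in this document, so it is left as untyped JSON.

```rust
use serde_json::{json, Value};

// Evaluate a policy decision via the documented POST /policies/evaluate
// route, using the same payload as the curl example above.
async fn evaluate_policy(base_url: &str) -> Result<Value, reqwest::Error> {
    let body = json!({
        "principal": { "id": "user123", "roles": ["Developer"] },
        "action":    { "id": "access" },
        "resource":  { "id": "sensitive-db", "classification": "confidential" },
        "context":   { "mfa_enabled": true, "location": "US" }
    });

    reqwest::Client::new()
        .post(format!("{base_url}/policies/evaluate"))
        .json(&body)
        .send()
        .await?
        .error_for_status()?
        .json::<Value>() // response shape undocumented; keep as raw JSON
        .await
}
```
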
+# Control Center - Cedar Policy Engine\n\nA comprehensive Cedar policy engine implementation with advanced security features, compliance checking, and anomaly detection.\n\n## Features\n\n### 🔐 Cedar Policy Engine\n\n- **Policy Evaluation**: High-performance policy evaluation with context injection\n- **Versioning**: Complete policy versioning with rollback capabilities\n- **Templates**: Configuration-driven policy templates with variable substitution\n- **Validation**: Comprehensive policy validation with syntax and semantic checking\n\n### 🛡️ Security & Authentication\n\n- **JWT Authentication**: Secure token-based authentication\n- **Multi-Factor Authentication**: MFA support for sensitive operations\n- **Role-Based Access Control**: Flexible RBAC with policy integration\n- **Session Management**: Secure session handling with timeouts\n\n### 📊 Compliance Framework\n\n- **SOC2 Type II**: Complete SOC2 compliance validation\n- **HIPAA**: Healthcare data protection compliance\n- **Audit Trail**: Comprehensive audit logging and reporting\n- **Impact Analysis**: Policy change impact assessment\n\n### 🔍 Anomaly Detection\n\n- **Statistical Analysis**: Multiple statistical methods (Z-Score, IQR, Isolation Forest)\n- **Real-time Detection**: Continuous monitoring of policy evaluations\n- **Alert Management**: Configurable alerting through multiple channels\n- **Baseline Learning**: Adaptive baseline calculation for improved accuracy\n\n### 🗄️ Storage & Persistence\n\n- **SurrealDB Integration**: High-performance graph database backend\n- **Policy Storage**: Versioned policy storage with metadata\n- **Metrics Storage**: Policy evaluation metrics and analytics\n- **Compliance Records**: Complete compliance audit trails\n\n## Quick Start\n\n### 1. Installation\n\n```\ncd src/control-center\ncargo build --release\n```\n\n### 2. Configuration\n\nCopy the example configuration:\n\n```\ncp config.toml.example config.toml\n```\n\nEdit `config.toml` for your environment:\n\n```\n[database]\nurl = "surreal://localhost:8000" # Your SurrealDB instance\nusername = "root"\npassword = "your-password"\n\n[auth]\njwt_secret = "your-super-secret-key"\nrequire_mfa = true\n\n[compliance.soc2]\nenabled = true\n\n[anomaly]\nenabled = true\ndetection_threshold = 2.5\n```\n\n### 3. Start the Server\n\n```\n./target/release/control-center server --port 8080\n```\n\n### 4. 
Test Policy Evaluation\n\n```\ncurl -X POST http://localhost:8080/policies/evaluate \\n -H "Content-Type: application/json" \\n -d '{\n "principal": {"id": "user123", "roles": ["Developer"]},\n "action": {"id": "access"},\n "resource": {"id": "sensitive-db", "classification": "confidential"},\n "context": {"mfa_enabled": true, "location": "US"}\n }'\n```\n\n## Policy Examples\n\n### Multi-Factor Authentication Policy\n\n```\n// Require MFA for sensitive resources\npermit(\n principal,\n action == Action::"access",\n resource\n) when {\n resource has classification &&\n resource.classification in ["sensitive", "confidential"] &&\n principal has mfa_enabled &&\n principal.mfa_enabled == true\n};\n```\n\n### Production Approval Policy\n\n```\n// Require approval for production operations\npermit(\n principal,\n action in [Action::"deploy", Action::"modify", Action::"delete"],\n resource\n) when {\n resource has environment &&\n resource.environment == "production" &&\n principal has approval &&\n principal.approval.approved_by in ["ProductionAdmin", "SRE"]\n};\n```\n\n### Geographic Restrictions\n\n```\n// Allow access only from approved countries\npermit(\n principal,\n action,\n resource\n) when {\n context has geo &&\n context.geo has country &&\n context.geo.country in ["US", "CA", "GB", "DE"]\n};\n```\n\n## CLI Commands\n\n### Policy Management\n\n```\n# Validate policies\ncontrol-center policy validate policies/\n\n# Test policy with test data\ncontrol-center policy test policies/mfa.cedar tests/data/mfa_test.json\n\n# Analyze policy impact\ncontrol-center policy impact policies/new_policy.cedar\n```\n\n### Compliance Checking\n\n```\n# Check SOC2 compliance\ncontrol-center compliance soc2\n\n# Check HIPAA compliance\ncontrol-center compliance hipaa\n\n# Generate compliance report\ncontrol-center compliance report --format html\n```\n\n## API Endpoints\n\n### Policy Evaluation\n\n- `POST /policies/evaluate` - Evaluate policy decision\n- `GET /policies` - List all policies\n- `POST /policies` - Create new policy\n- `PUT /policies/{id}` - Update policy\n- `DELETE /policies/{id}` - Delete policy\n\n### Policy Versions\n\n- `GET /policies/{id}/versions` - List policy versions\n- `GET /policies/{id}/versions/{version}` - Get specific version\n- `POST /policies/{id}/rollback/{version}` - Rollback to version\n\n### Compliance\n\n- `GET /compliance/soc2` - SOC2 compliance check\n- `GET /compliance/hipaa` - HIPAA compliance check\n- `GET /compliance/report` - Generate compliance report\n\n### Anomaly Detection\n\n- `GET /anomalies` - List detected anomalies\n- `GET /anomalies/{id}` - Get anomaly details\n- `POST /anomalies/detect` - Trigger anomaly detection\n\n## Testing\n\n### Run Unit Tests\n\n```\ncargo test\n```\n\n### Run Integration Tests\n\n```\ncargo test --test integration_tests\n```\n\n### Run Policy Tests\n\n```\ncargo test --test policy_tests\n```\n\n### Run Compliance Tests\n\n```\ncargo test --test compliance_tests\n```\n\n## Architecture\n\n### Core Components\n\n1. **Policy Engine** (`src/policies/engine.rs`)\n - Cedar policy evaluation\n - Context injection\n - Caching and optimization\n\n2. **Storage Layer** (`src/storage/`)\n - SurrealDB integration\n - Policy versioning\n - Metrics storage\n\n3. **Compliance Framework** (`src/compliance/`)\n - SOC2 checker\n - HIPAA validator\n - Report generation\n\n4. **Anomaly Detection** (`src/anomaly/`)\n - Statistical analysis\n - Real-time monitoring\n - Alert management\n\n5. 
**Authentication** (`src/auth.rs`)\n - JWT token management\n - Password hashing\n - Session handling\n\n### Configuration-Driven Design\n\nThe system follows PAP (Project Architecture Principles) with:\n\n- **No hardcoded values**: All behavior controlled via configuration\n- **Dynamic loading**: Policies and rules loaded from configuration\n- **Template-based**: Policy generation through templates\n- **Environment-aware**: Different configs for dev/test/prod\n\n### Security Features\n\n- **Audit Logging**: All policy evaluations logged\n- **Encryption**: Data encrypted at rest and in transit\n- **Rate Limiting**: Protection against abuse\n- **Input Validation**: Comprehensive validation of all inputs\n- **Error Handling**: Secure error handling without information leakage\n\n## Production Deployment\n\n### Docker\n\n```\nFROM rust:1.75 as builder\nWORKDIR /app\nCOPY . .\nRUN cargo build --release\n\nFROM debian:bookworm-slim\nRUN apt-get update && apt-get install -y ca-certificates\nCOPY --from=builder /app/target/release/control-center /usr/local/bin/\nEXPOSE 8080\nCMD ["control-center", "server"]\n```\n\n### Kubernetes\n\n```\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: control-center\nspec:\n replicas: 3\n selector:\n matchLabels:\n app: control-center\n template:\n metadata:\n labels:\n app: control-center\n spec:\n containers:\n - name: control-center\n image: control-center:latest\n ports:\n - containerPort: 8080\n env:\n - name: DATABASE_URL\n value: "surreal://surrealdb:8000"\n```\n\n### Environment Variables\n\n```\n# Override config values with environment variables\nexport CONTROL_CENTER_SERVER_PORT=8080\nexport CONTROL_CENTER_DATABASE_URL="surreal://prod-db:8000"\nexport CONTROL_CENTER_AUTH_JWT_SECRET="production-secret"\nexport CONTROL_CENTER_COMPLIANCE_SOC2_ENABLED=true\n```\n\n## Monitoring & Observability\n\n### Metrics\n\n- Policy evaluation latency\n- Policy decision distribution\n- Anomaly detection rates\n- Compliance scores\n\n### Logging\n\n```\n// Structured logging with tracing\ntracing::info!(\n policy_id = %policy.id,\n principal = %context.principal.id,\n decision = ?result.decision,\n duration_ms = evaluation_time,\n "Policy evaluation completed"\n);\n```\n\n### Health Checks\n\n```\ncurl http://localhost:8080/health\n```\n\n## Contributing\n\n1. Follow the PAP principles documented in the codebase\n2. Add tests for new features\n3. Update documentation\n4. Ensure compliance checks pass\n5. Add appropriate logging and monitoring\n\n## License\n\nThis project follows the licensing specified in the parent repository.\n\n## Support\n\nFor questions and support, refer to the project documentation or create an issue in the repository. \ No newline at end of file diff --git a/crates/control-center/docs/security-considerations.md b/crates/control-center/docs/security-considerations.md index 133878c..9026861 100644 --- a/crates/control-center/docs/security-considerations.md +++ b/crates/control-center/docs/security-considerations.md @@ -1,710 +1 @@ -# Security Considerations for Control Center Enhancements - -## Overview - -This document outlines the security architecture and considerations for the control-center enhancements, -including KMS SSH key management, mode-based RBAC, and platform service monitoring. - -## 1. 
SSH Key Management Security - -### 1.1 Key Storage Security - -**Implementation**: - -- Private keys encrypted at rest using AES-256-GCM in KMS -- Public keys stored in plaintext (as they are meant to be public) -- Private key material never exposed in API responses -- Key IDs used as references, not actual keys - -**Threat Mitigation**: - -- ✅ **Data at Rest**: All private keys encrypted with master encryption key -- ✅ **Key Exposure**: Private keys only decrypted in memory when needed -- ✅ **Key Leakage**: Zeroization of key material after use -- ✅ **Unauthorized Access**: KMS access controlled by RBAC - -**Best Practices**: - -```rust -// Good: Using key ID reference -let key_id = ssh_key_manager.store_ssh_key(name, private, public, purpose, tags).await?; - -// Bad: Never do this - exposing private key in logs -tracing::info!("Stored key: {}", private_key); // DON'T DO THIS -```text - -### 1.2 Key Rotation Security - -**Implementation**: - -- Configurable rotation intervals (default 90 days) -- Grace period for old key usage (default 7 days) -- Automatic rotation scheduling (if enabled) -- Manual rotation support with immediate effect - -**Threat Mitigation**: - -- ✅ **Key Compromise**: Regular rotation limits exposure window -- ✅ **Stale Keys**: Automated detection of keys due for rotation -- ✅ **Rotation Failures**: Graceful degradation with error logging - -**Rotation Policy**: - -```toml -[kms.ssh_keys] -rotation_enabled = true -rotation_interval_days = 90 # Enterprise: 30, Dev: 180 -grace_period_days = 7 # Time to update deployed keys -auto_rotate = false # Manual approval recommended -```text - -### 1.3 Audit Logging - -**Logged Events**: - -- SSH key creation (who, when, purpose) -- SSH key retrieval (who accessed, when) -- SSH key rotation (old key ID, new key ID) -- SSH key deletion (who deleted, when) -- Failed access attempts - -**Audit Entry Structure**: - -```rust -pub struct SshKeyAuditEntry { - pub timestamp: DateTime, - pub key_id: String, - pub action: SshKeyAction, - pub user: Option, // User who performed action - pub ip_address: Option, // Source IP - pub success: bool, - pub error_message: Option, -} -```text - -**Threat Mitigation**: - -- ✅ **Unauthorized Access**: Full audit trail for forensics -- ✅ **Insider Threats**: User attribution for all actions -- ✅ **Compliance**: GDPR/SOC2 audit log requirements met - -**Audit Log Retention**: - -- In-memory: Last 10,000 entries -- Persistent: SurrealDB with 1-year retention -- Compliance mode: 7-year retention (configurable) - -### 1.4 Key Fingerprinting - -**Implementation**: - -```rust -fn calculate_fingerprint(public_key: &[u8]) -> Result { - use sha2::{Sha256, Digest}; - let mut hasher = Sha256::new(); - hasher.update(public_key); - let result = hasher.finalize(); - Ok(format!("SHA256:{}", base64::encode(&result[..16]))) -} -```text - -**Security Benefits**: - -- Verify key integrity -- Detect key tampering -- Match deployed keys to KMS records - -## 2. 
RBAC Security - -### 2.1 Execution Modes - -**Security Model by Mode**: - -| Mode | Security Level | Use Case | Audit Required | -| ------ | --------------- | ---------- | ---------------- | -| Solo | Low | Single developer | No | -| MultiUser | Medium | Small teams | Optional | -| CICD | Medium | Automation | Yes | -| Enterprise | High | Production | Mandatory | - -**Mode-Specific Security**: - -#### Solo Mode - -```rust -// Solo mode: All users are admin -// Security: Trust-based, no RBAC checks -if mode == ExecutionMode::Solo { - return true; // Allow all operations -} -```text - -**Risks**: - -- No access control -- No audit trail -- Single point of failure - -**Mitigations**: - -- Only for development environments -- Network isolation required -- Regular backups - -#### MultiUser Mode - -```rust -// Multi-user: Role-based access control -let permissions = rbac_manager.get_user_permissions(&user).await; -if !permissions.contains(&required_permission) { - return Err(RbacError::PermissionDenied); -} -```text - -**Security Features**: - -- Role-based permissions -- Optional audit logging -- Session management - -#### CICD Mode - -```rust -// CICD: Service account focused -// All actions logged for automation tracking -if mode == ExecutionMode::CICD { - audit_log.log_automation_action(service_account, action).await; -} -```text - -**Security Features**: - -- Service account isolation -- Mandatory audit logging -- Token-based authentication -- Short-lived credentials - -#### Enterprise Mode - -```rust -// Enterprise: Full security -// - Mandatory audit logging -// - Stricter session timeouts -// - Compliance reports -if mode == ExecutionMode::Enterprise { - audit_log.log_with_compliance(user, action, compliance_tags).await; -} -```text - -**Security Features**: - -- Full RBAC enforcement -- Comprehensive audit logging -- Compliance reporting -- Role assignment approval workflow - -### 2.2 Permission System - -**Permission Levels**: - -```rust -Role::Admin => 100 // Full access -Role::Operator => 80 // Deploy & manage -Role::Developer => 60 // Read + dev deploy -Role::ServiceAccount => 50 // Automation -Role::Auditor => 40 // Read + audit -Role::Viewer => 20 // Read-only -```text - -**Action Security Levels**: - -```rust -Action::Delete => 100 // Destructive, admin only -Action::Manage => 80 // Service management -Action::Deploy => 80 // Deploy to production -Action::Create => 60 // Create resources -Action::Update => 60 // Modify resources -Action::Execute => 50 // Execute operations -Action::Audit => 40 // View audit logs -Action::Read => 20 // View resources -```text - -**Permission Check**: - -```rust -pub fn can_perform(&self, required_level: u8) -> bool { - self.permission_level() >= required_level -} -```text - -**Security Guarantees**: - -- ✅ Least privilege by default (Viewer role) -- ✅ Hierarchical permissions (higher roles include lower) -- ✅ Explicit deny for unknown resources -- ✅ No permission escalation without admin - -### 2.3 Session Security - -**Session Configuration**: - -```toml -[security] -session_timeout_minutes = 60 # Solo/MultiUser -session_timeout_minutes = 30 # Enterprise -max_sessions_per_user = 5 -failed_login_lockout_attempts = 5 -failed_login_lockout_duration_minutes = 15 -```text - -**Session Lifecycle**: - -1. User authenticates → JWT token issued -2. Token includes: user_id, role, issued_at, expires_at -3. Middleware validates token on each request -4. Session tracked in Redis/RocksDB -5. 
Session invalidated on logout or timeout - -**Security Features**: - -- JWT with RSA-2048 signatures -- Refresh token rotation -- Session fixation prevention -- Concurrent session limits - -**Threat Mitigation**: - -- ✅ **Session Hijacking**: Short-lived tokens (1 hour) -- ✅ **Token Replay**: One-time refresh tokens -- ✅ **Brute Force**: Account lockout after 5 failures -- ✅ **Session Fixation**: New session ID on login - -### 2.4 Middleware Security - -**RBAC Middleware Flow**: - -```plaintext -Request → Auth Middleware → RBAC Middleware → Handler - ↓ ↓ - Extract User Check Permission - from JWT Token (role + resource + action) - ↓ - Allow / Deny -```text - -**Middleware Implementation**: - -```rust -pub async fn check_permission( - State(state): State>, - resource: Resource, - action: Action, - mut req: Request, - next: Next, -) -> Result { - let user = req.extensions() - .get::() - .ok_or(RbacError::UserNotFound("No user in request".to_string()))?; - - if !state.rbac_manager.check_permission(&user, resource, action).await { - return Err(RbacError::PermissionDenied); - } - - Ok(next.run(req).await) -} -```text - -**Security Guarantees**: - -- ✅ All API endpoints protected by default -- ✅ Permission checked before handler execution -- ✅ User context available in handlers -- ✅ Failed checks logged for audit - -## 3. Platform Monitoring Security - -### 3.1 Service Access Security - -**Internal URLs Only**: - -```toml -[platform] -orchestrator_url = "http://localhost:9090" # Not exposed externally -coredns_url = "http://localhost:9153" -gitea_url = "http://localhost:3000" -oci_registry_url = "http://localhost:5000" -```text - -**Network Security**: - -- All services on localhost or internal network -- No external exposure of monitoring endpoints -- Firewall rules to prevent external access - -**Threat Mitigation**: - -- ✅ **External Scanning**: Services not reachable from internet -- ✅ **DDoS**: Internal-only access limits attack surface -- ✅ **Data Exfiltration**: Monitoring data not exposed externally - -### 3.2 Health Check Security - -**Timeout Protection**: - -```rust -let client = Client::builder() - .timeout(std::time::Duration::from_secs(5)) // Prevent hanging - .build() - .unwrap(); -```text - -**Error Handling**: - -```rust -// Never expose internal errors to users -Err(e) => { - // Log detailed error internally - tracing::error!("Health check failed for {}: {}", service, e); - - // Return generic error externally - ServiceStatus { - status: HealthStatus::Unhealthy, - error_message: Some("Service unavailable".to_string()), // Generic - .. 
- } -} -```text - -**Threat Mitigation**: - -- ✅ **Timeout Attacks**: 5-second timeout prevents resource exhaustion -- ✅ **Information Disclosure**: Error messages sanitized -- ✅ **Resource Exhaustion**: Parallel checks with concurrency limits - -### 3.3 Service Control Security - -**RBAC-Protected Service Control**: - -```rust -// Only Operator or Admin can start/stop services -#[axum::debug_handler] -pub async fn start_service( - State(state): State, - Extension(user): Extension, - Path(service_type): Path, -) -> Result { - // Check permission - if !rbac_manager.check_permission( - &user, - Resource::Service, - Action::Manage, - ).await { - return Err(ApiError::PermissionDenied); - } - - // Start service - service_manager.start_service(&service_type).await?; - - // Audit log - audit_log.log_service_action(user, service_type, "start").await; - - Ok(StatusCode::OK) -} -```text - -**Security Guarantees**: - -- ✅ Only authorized users can control services -- ✅ All service actions logged -- ✅ Graceful degradation on service failure - -## 4. Threat Model - -### 4.1 High-Risk Threats - -#### Threat: SSH Private Key Exposure - -**Attack Vector**: Attacker gains access to KMS database - -**Mitigations**: - -- Private keys encrypted at rest with master key -- Master key stored in hardware security module (HSM) or KMS -- Key access audited and rate-limited -- Zeroization of decrypted keys in memory - -**Detection**: - -- Audit log monitoring for unusual key access patterns -- Alerting on bulk key retrievals - -#### Threat: Privilege Escalation - -**Attack Vector**: Lower-privileged user attempts to gain admin access - -**Mitigations**: - -- Role assignment requires Admin role -- Mode switching requires Admin role -- Middleware enforces permissions on every request -- No client-side permission checks (server-side only) - -**Detection**: - -- Failed permission checks logged -- Alerting on repeated permission denials - -#### Threat: Session Hijacking - -**Attack Vector**: Attacker steals JWT token - -**Mitigations**: - -- Short-lived access tokens (1 hour) -- Refresh token rotation -- Secure HTTP-only cookies (recommended) -- IP address binding (optional) - -**Detection**: - -- Unusual login locations -- Concurrent sessions from different IPs - -### 4.2 Medium-Risk Threats - -#### Threat: Service Impersonation - -**Attack Vector**: Malicious service pretends to be legitimate platform service - -**Mitigations**: - -- Service URLs configured in config file (not dynamic) -- TLS certificate validation (if HTTPS) -- Service authentication tokens - -**Detection**: - -- Health check failures -- Metrics anomalies - -#### Threat: Audit Log Tampering - -**Attack Vector**: Attacker modifies audit logs to hide tracks - -**Mitigations**: - -- Audit logs write-only -- Logs stored in tamper-evident database (SurrealDB) -- Hash chain for log integrity -- Offsite log backup - -**Detection**: - -- Hash chain verification -- Log gap detection - -### 4.3 Low-Risk Threats - -#### Threat: Information Disclosure via Error Messages - -**Attack Vector**: Error messages leak internal information - -**Mitigations**: - -- Generic error messages for users -- Detailed errors only in server logs -- Error message sanitization - -**Detection**: - -- Code review for error handling -- Automated scanning for sensitive data in responses - -## 5. 
Compliance Considerations - -### 5.1 GDPR Compliance - -**Personal Data Handling**: - -- User information: username, email, IP addresses -- Retention: Audit logs kept for required period -- Right to erasure: User deletion deletes all associated data - -**Implementation**: - -```rust -// Delete user and all associated data -pub async fn delete_user(&self, user_id: &str) -> Result<(), RbacError> { - // Delete user SSH keys - for key in self.list_user_ssh_keys(user_id).await? { - self.delete_ssh_key(&key.key_id).await?; - } - - // Anonymize audit logs (retain for compliance, remove PII) - self.anonymize_user_audit_logs(user_id).await?; - - // Delete user record - self.delete_user_record(user_id).await?; - - Ok(()) -} -```text - -### 5.2 SOC 2 Compliance - -**Security Controls**: - -- ✅ Access control (RBAC) -- ✅ Audit logging (all actions logged) -- ✅ Encryption at rest (KMS) -- ✅ Encryption in transit (HTTPS recommended) -- ✅ Session management (timeout, MFA support) - -**Monitoring & Alerting**: - -- ✅ Service health monitoring -- ✅ Failed login tracking -- ✅ Permission denial alerting -- ✅ Unusual activity detection - -### 5.3 PCI DSS (if applicable) - -**Requirements**: - -- ✅ Encrypt cardholder data (use KMS for keys) -- ✅ Maintain access control (RBAC) -- ✅ Track and monitor access (audit logs) -- ✅ Regularly test security (integration tests) - -## 6. Security Best Practices - -### 6.1 Development - -**Code Review Checklist**: - -- [ ] All API endpoints have RBAC middleware -- [ ] No hardcoded secrets or keys -- [ ] Error messages don't leak sensitive info -- [ ] Audit logging for sensitive operations -- [ ] Input validation on all user inputs -- [ ] SQL injection prevention (use parameterized queries) -- [ ] XSS prevention (escape user inputs) - -**Testing**: - -- Unit tests for permission checks -- Integration tests for RBAC enforcement -- Penetration testing for production deployments - -### 6.2 Deployment - -**Production Checklist**: - -- [ ] Change default admin password -- [ ] Enable HTTPS with valid certificate -- [ ] Configure firewall rules (internal services only) -- [ ] Set appropriate execution mode (Enterprise for production) -- [ ] Enable audit logging -- [ ] Configure session timeout (30 minutes for Enterprise) -- [ ] Enable rate limiting -- [ ] Set up log monitoring and alerting -- [ ] Regular security updates -- [ ] Backup encryption keys - -### 6.3 Operations - -**Incident Response**: - -1. **Detection**: Monitor audit logs for anomalies -2. **Containment**: Revoke compromised credentials -3. **Eradication**: Rotate affected SSH keys -4. **Recovery**: Restore from backup if needed -5. **Lessons Learned**: Update security controls - -**Key Rotation Schedule**: - -- SSH keys: Every 90 days (Enterprise: 30 days) -- JWT signing keys: Every 180 days -- Master encryption key: Every 365 days -- Service account tokens: Every 30 days - -## 7. 
Security Metrics - -### 7.1 Monitoring Metrics - -**Authentication**: - -- Failed login attempts per user -- Concurrent sessions per user -- Session duration (average, p95, p99) - -**Authorization**: - -- Permission denials per user -- Permission denials per resource -- Role assignments per day - -**Audit**: - -- SSH key accesses per day -- SSH key rotations per month -- Audit log retention compliance - -**Services**: - -- Service health check success rate -- Service response times (p50, p95, p99) -- Service dependency failures - -### 7.2 Alerting Thresholds - -**Critical Alerts**: - -- Service health: >3 failures in 5 minutes -- Failed logins: >10 attempts in 1 minute -- Permission denials: >50 in 1 minute -- SSH key bulk retrieval: >10 keys in 1 minute - -**Warning Alerts**: - -- Service degraded: response time >1 second -- Session timeout rate: >10% of sessions -- Audit log storage: >80% capacity - -## 8. Security Roadmap - -### Phase 1 (Completed) - -- ✅ SSH key storage with encryption -- ✅ Mode-based RBAC -- ✅ Audit logging -- ✅ Platform monitoring - -### Phase 2 (In Progress) - -- 📋 API handlers with RBAC enforcement -- 📋 Integration tests for security -- 📋 Documentation - -### Phase 3 (Future) - -- Multi-factor authentication (MFA) -- Hardware security module (HSM) integration -- Advanced threat detection (ML-based) -- Automated security scanning -- Compliance report generation -- Security information and event management (SIEM) integration - -## References - -- **OWASP Top 10**: -- **NIST Cybersecurity Framework**: -- **CIS Controls**: -- **GDPR**: -- **SOC 2**: - ---- - -**Last Updated**: 2025-10-06 -**Review Cycle**: Quarterly -**Next Review**: 2026-01-06 +# Security Considerations for Control Center Enhancements\n\n## Overview\n\nThis document outlines the security architecture and considerations for the control-center enhancements,\nincluding KMS SSH key management, mode-based RBAC, and platform service monitoring.\n\n## 1. 
SSH Key Management Security\n\n### 1.1 Key Storage Security\n\n**Implementation**:\n\n- Private keys encrypted at rest using AES-256-GCM in KMS\n- Public keys stored in plaintext (as they are meant to be public)\n- Private key material never exposed in API responses\n- Key IDs used as references, not actual keys\n\n**Threat Mitigation**:\n\n- ✅ **Data at Rest**: All private keys encrypted with master encryption key\n- ✅ **Key Exposure**: Private keys only decrypted in memory when needed\n- ✅ **Key Leakage**: Zeroization of key material after use\n- ✅ **Unauthorized Access**: KMS access controlled by RBAC\n\n**Best Practices**:\n\n```\n// Good: Using key ID reference\nlet key_id = ssh_key_manager.store_ssh_key(name, private, public, purpose, tags).await?;\n\n// Bad: Never do this - exposing private key in logs\ntracing::info!("Stored key: {}", private_key); // DON'T DO THIS\n```\n\n### 1.2 Key Rotation Security\n\n**Implementation**:\n\n- Configurable rotation intervals (default 90 days)\n- Grace period for old key usage (default 7 days)\n- Automatic rotation scheduling (if enabled)\n- Manual rotation support with immediate effect\n\n**Threat Mitigation**:\n\n- ✅ **Key Compromise**: Regular rotation limits exposure window\n- ✅ **Stale Keys**: Automated detection of keys due for rotation\n- ✅ **Rotation Failures**: Graceful degradation with error logging\n\n**Rotation Policy**:\n\n```\n[kms.ssh_keys]\nrotation_enabled = true\nrotation_interval_days = 90 # Enterprise: 30, Dev: 180\ngrace_period_days = 7 # Time to update deployed keys\nauto_rotate = false # Manual approval recommended\n```\n\n### 1.3 Audit Logging\n\n**Logged Events**:\n\n- SSH key creation (who, when, purpose)\n- SSH key retrieval (who accessed, when)\n- SSH key rotation (old key ID, new key ID)\n- SSH key deletion (who deleted, when)\n- Failed access attempts\n\n**Audit Entry Structure**:\n\n```\npub struct SshKeyAuditEntry {\n pub timestamp: DateTime,\n pub key_id: String,\n pub action: SshKeyAction,\n pub user: Option, // User who performed action\n pub ip_address: Option, // Source IP\n pub success: bool,\n pub error_message: Option,\n}\n```\n\n**Threat Mitigation**:\n\n- ✅ **Unauthorized Access**: Full audit trail for forensics\n- ✅ **Insider Threats**: User attribution for all actions\n- ✅ **Compliance**: GDPR/SOC2 audit log requirements met\n\n**Audit Log Retention**:\n\n- In-memory: Last 10,000 entries\n- Persistent: SurrealDB with 1-year retention\n- Compliance mode: 7-year retention (configurable)\n\n### 1.4 Key Fingerprinting\n\n**Implementation**:\n\n```\nfn calculate_fingerprint(public_key: &[u8]) -> Result {\n use sha2::{Sha256, Digest};\n let mut hasher = Sha256::new();\n hasher.update(public_key);\n let result = hasher.finalize();\n Ok(format!("SHA256:{}", base64::encode(&result[..16])))\n}\n```\n\n**Security Benefits**:\n\n- Verify key integrity\n- Detect key tampering\n- Match deployed keys to KMS records\n\n## 2. 
RBAC Security\n\n### 2.1 Execution Modes\n\n**Security Model by Mode**:\n\n| Mode | Security Level | Use Case | Audit Required |\n| ------ | --------------- | ---------- | ---------------- |\n| Solo | Low | Single developer | No |\n| MultiUser | Medium | Small teams | Optional |\n| CICD | Medium | Automation | Yes |\n| Enterprise | High | Production | Mandatory |\n\n**Mode-Specific Security**:\n\n#### Solo Mode\n\n```\n// Solo mode: All users are admin\n// Security: Trust-based, no RBAC checks\nif mode == ExecutionMode::Solo {\n return true; // Allow all operations\n}\n```\n\n**Risks**:\n\n- No access control\n- No audit trail\n- Single point of failure\n\n**Mitigations**:\n\n- Only for development environments\n- Network isolation required\n- Regular backups\n\n#### MultiUser Mode\n\n```\n// Multi-user: Role-based access control\nlet permissions = rbac_manager.get_user_permissions(&user).await;\nif !permissions.contains(&required_permission) {\n return Err(RbacError::PermissionDenied);\n}\n```\n\n**Security Features**:\n\n- Role-based permissions\n- Optional audit logging\n- Session management\n\n#### CICD Mode\n\n```\n// CICD: Service account focused\n// All actions logged for automation tracking\nif mode == ExecutionMode::CICD {\n audit_log.log_automation_action(service_account, action).await;\n}\n```\n\n**Security Features**:\n\n- Service account isolation\n- Mandatory audit logging\n- Token-based authentication\n- Short-lived credentials\n\n#### Enterprise Mode\n\n```\n// Enterprise: Full security\n// - Mandatory audit logging\n// - Stricter session timeouts\n// - Compliance reports\nif mode == ExecutionMode::Enterprise {\n audit_log.log_with_compliance(user, action, compliance_tags).await;\n}\n```\n\n**Security Features**:\n\n- Full RBAC enforcement\n- Comprehensive audit logging\n- Compliance reporting\n- Role assignment approval workflow\n\n### 2.2 Permission System\n\n**Permission Levels**:\n\n```\nRole::Admin => 100 // Full access\nRole::Operator => 80 // Deploy & manage\nRole::Developer => 60 // Read + dev deploy\nRole::ServiceAccount => 50 // Automation\nRole::Auditor => 40 // Read + audit\nRole::Viewer => 20 // Read-only\n```\n\n**Action Security Levels**:\n\n```\nAction::Delete => 100 // Destructive, admin only\nAction::Manage => 80 // Service management\nAction::Deploy => 80 // Deploy to production\nAction::Create => 60 // Create resources\nAction::Update => 60 // Modify resources\nAction::Execute => 50 // Execute operations\nAction::Audit => 40 // View audit logs\nAction::Read => 20 // View resources\n```\n\n**Permission Check**:\n\n```\npub fn can_perform(&self, required_level: u8) -> bool {\n self.permission_level() >= required_level\n}\n```\n\n**Security Guarantees**:\n\n- ✅ Least privilege by default (Viewer role)\n- ✅ Hierarchical permissions (higher roles include lower)\n- ✅ Explicit deny for unknown resources\n- ✅ No permission escalation without admin\n\n### 2.3 Session Security\n\n**Session Configuration**:\n\n```\n[security]\nsession_timeout_minutes = 60 # Solo/MultiUser\nsession_timeout_minutes = 30 # Enterprise\nmax_sessions_per_user = 5\nfailed_login_lockout_attempts = 5\nfailed_login_lockout_duration_minutes = 15\n```\n\n**Session Lifecycle**:\n\n1. User authenticates → JWT token issued\n2. Token includes: user_id, role, issued_at, expires_at\n3. Middleware validates token on each request\n4. Session tracked in Redis/RocksDB\n5. 
Session invalidated on logout or timeout\n\n**Security Features**:\n\n- JWT with RSA-2048 signatures\n- Refresh token rotation\n- Session fixation prevention\n- Concurrent session limits\n\n**Threat Mitigation**:\n\n- ✅ **Session Hijacking**: Short-lived tokens (1 hour)\n- ✅ **Token Replay**: One-time refresh tokens\n- ✅ **Brute Force**: Account lockout after 5 failures\n- ✅ **Session Fixation**: New session ID on login\n\n### 2.4 Middleware Security\n\n**RBAC Middleware Flow**:\n\n```\nRequest → Auth Middleware → RBAC Middleware → Handler\n ↓ ↓\n Extract User Check Permission\n from JWT Token (role + resource + action)\n ↓\n Allow / Deny\n```\n\n**Middleware Implementation**:\n\n```\npub async fn check_permission(\n State(state): State>,\n resource: Resource,\n action: Action,\n mut req: Request,\n next: Next,\n) -> Result {\n let user = req.extensions()\n .get::()\n .ok_or(RbacError::UserNotFound("No user in request".to_string()))?;\n\n if !state.rbac_manager.check_permission(&user, resource, action).await {\n return Err(RbacError::PermissionDenied);\n }\n\n Ok(next.run(req).await)\n}\n```\n\n**Security Guarantees**:\n\n- ✅ All API endpoints protected by default\n- ✅ Permission checked before handler execution\n- ✅ User context available in handlers\n- ✅ Failed checks logged for audit\n\n## 3. Platform Monitoring Security\n\n### 3.1 Service Access Security\n\n**Internal URLs Only**:\n\n```\n[platform]\norchestrator_url = "http://localhost:9090" # Not exposed externally\ncoredns_url = "http://localhost:9153"\ngitea_url = "http://localhost:3000"\noci_registry_url = "http://localhost:5000"\n```\n\n**Network Security**:\n\n- All services on localhost or internal network\n- No external exposure of monitoring endpoints\n- Firewall rules to prevent external access\n\n**Threat Mitigation**:\n\n- ✅ **External Scanning**: Services not reachable from internet\n- ✅ **DDoS**: Internal-only access limits attack surface\n- ✅ **Data Exfiltration**: Monitoring data not exposed externally\n\n### 3.2 Health Check Security\n\n**Timeout Protection**:\n\n```\nlet client = Client::builder()\n .timeout(std::time::Duration::from_secs(5)) // Prevent hanging\n .build()\n .unwrap();\n```\n\n**Error Handling**:\n\n```\n// Never expose internal errors to users\nErr(e) => {\n // Log detailed error internally\n tracing::error!("Health check failed for {}: {}", service, e);\n\n // Return generic error externally\n ServiceStatus {\n status: HealthStatus::Unhealthy,\n error_message: Some("Service unavailable".to_string()), // Generic\n ..\n }\n}\n```\n\n**Threat Mitigation**:\n\n- ✅ **Timeout Attacks**: 5-second timeout prevents resource exhaustion\n- ✅ **Information Disclosure**: Error messages sanitized\n- ✅ **Resource Exhaustion**: Parallel checks with concurrency limits\n\n### 3.3 Service Control Security\n\n**RBAC-Protected Service Control**:\n\n```\n// Only Operator or Admin can start/stop services\n#[axum::debug_handler]\npub async fn start_service(\n State(state): State,\n Extension(user): Extension,\n Path(service_type): Path,\n) -> Result {\n // Check permission\n if !rbac_manager.check_permission(\n &user,\n Resource::Service,\n Action::Manage,\n ).await {\n return Err(ApiError::PermissionDenied);\n }\n\n // Start service\n service_manager.start_service(&service_type).await?;\n\n // Audit log\n audit_log.log_service_action(user, service_type, "start").await;\n\n Ok(StatusCode::OK)\n}\n```\n\n**Security Guarantees**:\n\n- ✅ Only authorized users can control services\n- ✅ All service actions logged\n- ✅ Graceful 
degradation on service failure\n\n## 4. Threat Model\n\n### 4.1 High-Risk Threats\n\n#### Threat: SSH Private Key Exposure\n\n**Attack Vector**: Attacker gains access to KMS database\n\n**Mitigations**:\n\n- Private keys encrypted at rest with master key\n- Master key stored in hardware security module (HSM) or KMS\n- Key access audited and rate-limited\n- Zeroization of decrypted keys in memory\n\n**Detection**:\n\n- Audit log monitoring for unusual key access patterns\n- Alerting on bulk key retrievals\n\n#### Threat: Privilege Escalation\n\n**Attack Vector**: Lower-privileged user attempts to gain admin access\n\n**Mitigations**:\n\n- Role assignment requires Admin role\n- Mode switching requires Admin role\n- Middleware enforces permissions on every request\n- No client-side permission checks (server-side only)\n\n**Detection**:\n\n- Failed permission checks logged\n- Alerting on repeated permission denials\n\n#### Threat: Session Hijacking\n\n**Attack Vector**: Attacker steals JWT token\n\n**Mitigations**:\n\n- Short-lived access tokens (1 hour)\n- Refresh token rotation\n- Secure HTTP-only cookies (recommended)\n- IP address binding (optional)\n\n**Detection**:\n\n- Unusual login locations\n- Concurrent sessions from different IPs\n\n### 4.2 Medium-Risk Threats\n\n#### Threat: Service Impersonation\n\n**Attack Vector**: Malicious service pretends to be legitimate platform service\n\n**Mitigations**:\n\n- Service URLs configured in config file (not dynamic)\n- TLS certificate validation (if HTTPS)\n- Service authentication tokens\n\n**Detection**:\n\n- Health check failures\n- Metrics anomalies\n\n#### Threat: Audit Log Tampering\n\n**Attack Vector**: Attacker modifies audit logs to hide tracks\n\n**Mitigations**:\n\n- Audit logs write-only\n- Logs stored in tamper-evident database (SurrealDB)\n- Hash chain for log integrity\n- Offsite log backup\n\n**Detection**:\n\n- Hash chain verification\n- Log gap detection\n\n### 4.3 Low-Risk Threats\n\n#### Threat: Information Disclosure via Error Messages\n\n**Attack Vector**: Error messages leak internal information\n\n**Mitigations**:\n\n- Generic error messages for users\n- Detailed errors only in server logs\n- Error message sanitization\n\n**Detection**:\n\n- Code review for error handling\n- Automated scanning for sensitive data in responses\n\n## 5. Compliance Considerations\n\n### 5.1 GDPR Compliance\n\n**Personal Data Handling**:\n\n- User information: username, email, IP addresses\n- Retention: Audit logs kept for required period\n- Right to erasure: User deletion deletes all associated data\n\n**Implementation**:\n\n```\n// Delete user and all associated data\npub async fn delete_user(&self, user_id: &str) -> Result<(), RbacError> {\n // Delete user SSH keys\n for key in self.list_user_ssh_keys(user_id).await? 
{\n self.delete_ssh_key(&key.key_id).await?;\n }\n\n // Anonymize audit logs (retain for compliance, remove PII)\n self.anonymize_user_audit_logs(user_id).await?;\n\n // Delete user record\n self.delete_user_record(user_id).await?;\n\n Ok(())\n}\n```\n\n### 5.2 SOC 2 Compliance\n\n**Security Controls**:\n\n- ✅ Access control (RBAC)\n- ✅ Audit logging (all actions logged)\n- ✅ Encryption at rest (KMS)\n- ✅ Encryption in transit (HTTPS recommended)\n- ✅ Session management (timeout, MFA support)\n\n**Monitoring & Alerting**:\n\n- ✅ Service health monitoring\n- ✅ Failed login tracking\n- ✅ Permission denial alerting\n- ✅ Unusual activity detection\n\n### 5.3 PCI DSS (if applicable)\n\n**Requirements**:\n\n- ✅ Encrypt cardholder data (use KMS for keys)\n- ✅ Maintain access control (RBAC)\n- ✅ Track and monitor access (audit logs)\n- ✅ Regularly test security (integration tests)\n\n## 6. Security Best Practices\n\n### 6.1 Development\n\n**Code Review Checklist**:\n\n- [ ] All API endpoints have RBAC middleware\n- [ ] No hardcoded secrets or keys\n- [ ] Error messages don't leak sensitive info\n- [ ] Audit logging for sensitive operations\n- [ ] Input validation on all user inputs\n- [ ] SQL injection prevention (use parameterized queries)\n- [ ] XSS prevention (escape user inputs)\n\n**Testing**:\n\n- Unit tests for permission checks\n- Integration tests for RBAC enforcement\n- Penetration testing for production deployments\n\n### 6.2 Deployment\n\n**Production Checklist**:\n\n- [ ] Change default admin password\n- [ ] Enable HTTPS with valid certificate\n- [ ] Configure firewall rules (internal services only)\n- [ ] Set appropriate execution mode (Enterprise for production)\n- [ ] Enable audit logging\n- [ ] Configure session timeout (30 minutes for Enterprise)\n- [ ] Enable rate limiting\n- [ ] Set up log monitoring and alerting\n- [ ] Regular security updates\n- [ ] Backup encryption keys\n\n### 6.3 Operations\n\n**Incident Response**:\n\n1. **Detection**: Monitor audit logs for anomalies\n2. **Containment**: Revoke compromised credentials\n3. **Eradication**: Rotate affected SSH keys\n4. **Recovery**: Restore from backup if needed\n5. **Lessons Learned**: Update security controls\n\n**Key Rotation Schedule**:\n\n- SSH keys: Every 90 days (Enterprise: 30 days)\n- JWT signing keys: Every 180 days\n- Master encryption key: Every 365 days\n- Service account tokens: Every 30 days\n\n## 7. Security Metrics\n\n### 7.1 Monitoring Metrics\n\n**Authentication**:\n\n- Failed login attempts per user\n- Concurrent sessions per user\n- Session duration (average, p95, p99)\n\n**Authorization**:\n\n- Permission denials per user\n- Permission denials per resource\n- Role assignments per day\n\n**Audit**:\n\n- SSH key accesses per day\n- SSH key rotations per month\n- Audit log retention compliance\n\n**Services**:\n\n- Service health check success rate\n- Service response times (p50, p95, p99)\n- Service dependency failures\n\n### 7.2 Alerting Thresholds\n\n**Critical Alerts**:\n\n- Service health: >3 failures in 5 minutes\n- Failed logins: >10 attempts in 1 minute\n- Permission denials: >50 in 1 minute\n- SSH key bulk retrieval: >10 keys in 1 minute\n\n**Warning Alerts**:\n\n- Service degraded: response time >1 second\n- Session timeout rate: >10% of sessions\n- Audit log storage: >80% capacity\n\n## 8. 
Security Roadmap\n\n### Phase 1 (Completed)\n\n- ✅ SSH key storage with encryption\n- ✅ Mode-based RBAC\n- ✅ Audit logging\n- ✅ Platform monitoring\n\n### Phase 2 (In Progress)\n\n- 📋 API handlers with RBAC enforcement\n- 📋 Integration tests for security\n- 📋 Documentation\n\n### Phase 3 (Future)\n\n- Multi-factor authentication (MFA)\n- Hardware security module (HSM) integration\n- Advanced threat detection (ML-based)\n- Automated security scanning\n- Compliance report generation\n- Security information and event management (SIEM) integration\n\n## References\n\n- **OWASP Top 10**: \n- **NIST Cybersecurity Framework**: \n- **CIS Controls**: \n- **GDPR**: \n- **SOC 2**: \n\n---\n\n**Last Updated**: 2025-10-06\n**Review Cycle**: Quarterly\n**Next Review**: 2026-01-06 \ No newline at end of file diff --git a/crates/control-center/src/kms/README.md b/crates/control-center/src/kms/README.md index 5a4e677..aa07bb0 100644 --- a/crates/control-center/src/kms/README.md +++ b/crates/control-center/src/kms/README.md @@ -1,453 +1 @@ -# Hybrid Key Management System (KMS) - -A comprehensive hybrid KMS system built for the control center, supporting local/remote/hybrid modes -with intelligent caching, failover, and advanced security features. - -## Architecture Overview - -### Core Components - -1. **KMS Backends** - - **Local Backend**: SQLite with AES-256-GCM encryption - - **Remote Backend**: Cosmian KMS client integration - - **Hybrid Backend**: Intelligent combination of local and remote - -2. **Caching System** - - **Memory Cache**: In-memory LRU cache with TTL - - **Redis Cache**: Distributed caching with automatic failover - - **Local File Cache**: Persistent file-based cache for offline scenarios - -3. **Security Features** - - **Encryption**: AES-256-GCM for local storage - - **Key Derivation**: HKDF-SHA256 for master key derivation - - **Authentication**: Multiple auth methods (certificate, token, basic, OAuth) - - **Audit Logging**: Comprehensive audit trail for all operations - -4. 
**Advanced Features** - - **Credential Management**: Automatic injection for cloud providers - - **Key Rotation**: Configurable automatic key rotation - - **HSM Integration**: Hardware Security Module support - - **Zero-Knowledge Proofs**: For sensitive operations - - **Migration Tools**: Backend-to-backend key migration - -## Configuration - -### TOML Configuration Example - -```toml -[kms] -# Operation mode: local, remote, or hybrid -mode = "hybrid" - -# Local SQLite backend configuration -[kms.local] -database_path = "./data/kms.db" - -[kms.local.master_key] -derivation_method = "hkdf" -source = "generated" -iterations = 100000 - -[kms.local.encryption] -default_algorithm = "AES-256-GCM" -key_size_bits = 256 -authenticated = true - -[kms.local.backup] -enabled = true -backup_dir = "./backups/kms" -interval_hours = 24 -max_backups = 7 -compress = true -encrypt = true - -# Remote Cosmian KMS configuration -[kms.remote] -server_url = "https://kms.example.com:9998" -auth_method = "certificate" -client_cert_path = "./certs/client.crt" -client_key_path = "./certs/client.key" -ca_cert_path = "./certs/ca.crt" -timeout_seconds = 30 -verify_ssl = true - -[kms.remote.retry] -max_attempts = 3 -initial_delay_ms = 1000 -max_delay_ms = 30000 -backoff_multiplier = 2.0 - -# Cache configuration -[kms.cache] -enabled = true -backend = "memory" -default_ttl_seconds = 3600 -max_size_bytes = 104857600 # 100MB -local_dir = "./cache/kms" - -# Credential management -[kms.credentials] -enabled = true - -[kms.credentials.storage] -storage_type = "sqlite" -encryption_key_id = "credential_encryption_key" -database_path = "./data/credentials.db" - -[kms.credentials.providers.aws] -name = "AWS" -provider_type = "aws" -refresh_interval_seconds = 3600 -expiry_warning_seconds = 300 -regions = ["us-east-1", "eu-west-1"] - -[kms.credentials.providers.upcloud] -name = "UpCloud" -provider_type = "upcloud" -refresh_interval_seconds = 3600 -expiry_warning_seconds = 300 -regions = ["fi-hel1", "us-nyc1"] - -# Key rotation configuration -[kms.rotation] -enabled = true -interval_seconds = 2592000 # 30 days -max_age_seconds = 7776000 # 90 days -notice_seconds = 604800 # 7 days -schedule = "0 2 * * 0" # Every Sunday at 2 AM - -# Audit configuration -[kms.audit] -enabled = true -backend = "file" -retention_days = 90 -log_level = "info" -include_data = false -max_file_size_mb = 100 -format = "json" - -# HSM configuration -[kms.hsm] -enabled = false -hsm_type = "pkcs11" -pkcs11_library = "/usr/lib/libpkcs11.so" -slot_id = 0 - -# Zero-knowledge proof configuration -[kms.zkp] -enabled = false -proof_system = "groth16" -setup_params_path = "./zkp/setup.params" - -# Security policy -[kms.security] -require_strong_auth = true -min_key_length_bits = 256 -max_key_age_days = 90 -enable_pfs = true -allowed_algorithms = ["AES-256-GCM", "ChaCha20Poly1305", "RSA-4096", "ECDSA-P384"] -blocked_algorithms = ["DES", "3DES", "RC4", "MD5"] -policy_enforcement = "strict" -```text - -## Usage Examples - -### Basic KMS Operations - -```rust -use control_center::kms::{KmsManager, KmsConfig, KeyData, KeyType, KeyAlgorithm, KeyUsage}; - -// Initialize KMS manager -let config = KmsConfig::load_from_file("kms.toml").await?; -let mut kms = KmsManager::new(&config).await?; -kms.initialize().await?; - -// Create a new encryption key -let key_data = KeyData { - key_id: "my-encryption-key".to_string(), - key_type: KeyType::Symmetric, - algorithm: KeyAlgorithm::Aes256Gcm, - usage: KeyUsage { - encrypt: true, - decrypt: true, - ..Default::default() - }, - key_size: 
256, - key_material: SecretBytes::new(generate_random_key(32)), - metadata: KeyMetadata { - name: Some("Application Encryption Key".to_string()), - description: Some("Key for encrypting application data".to_string()), - owner: Some("app-service".to_string()), - environment: Some("production".to_string()), - ..Default::default() - }, - created_at: Utc::now(), - last_accessed: None, - expires_at: Some(Utc::now() + chrono::Duration::days(90)), - status: KeyStatus::Active, - tags: HashMap::from([ - ("purpose".to_string(), "encryption".to_string()), - ("service".to_string(), "app".to_string()), - ]), -}; - -// Store the key -let stored_key_id = kms.store_key(key_data).await?; -println!("Key stored with ID: {}", stored_key_id); - -// Encrypt data -let plaintext = b"sensitive data to encrypt"; -let context = HashMap::from([ - ("service".to_string(), "app".to_string()), - ("version".to_string(), "1.0".to_string()), -]); -let ciphertext = kms.encrypt(&stored_key_id, plaintext, Some(context.clone())).await?; - -// Decrypt data -let decrypted = kms.decrypt(&stored_key_id, &ciphertext, Some(context)).await?; -assert_eq!(plaintext, decrypted.as_slice()); - -// Get key information -if let Some(key_info) = kms.get_key(&stored_key_id).await? { - println!("Key algorithm: {:?}", key_info.algorithm); - println!("Key status: {:?}", key_info.status); - println!("Created: {}", key_info.created_at); -} -```text - -### Provider Credential Management - -```rust -use control_center::kms::{ProviderCredentials, CredentialType}; - -// Store AWS credentials -let aws_creds = ProviderCredentials { - provider: "aws".to_string(), - credential_type: CredentialType::AccessKey, - access_key: "AKIAIOSFODNN7EXAMPLE".to_string(), - secret_key: SecretBytes::new(b"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_vec()), - session_token: None, - region: Some("us-east-1".to_string()), - config: HashMap::new(), - expires_at: Some(Utc::now() + chrono::Duration::hours(12)), - created_at: Utc::now(), -}; - -kms.store_provider_credentials("aws", aws_creds).await?; - -// Retrieve credentials for automatic injection -if let Some(creds) = kms.get_provider_credentials("aws").await? 
{ - println!("AWS Access Key: {}", creds.access_key); - // Credentials are automatically injected into environment variables - // or configuration files based on the injection configuration -} -```text - -### Health Monitoring - -```rust -// Check system health -let health = kms.health_check().await?; -println!("KMS Health: {}", health.overall); -println!("Backend Status: {}", health.backend.healthy); -println!("Rotation Status: {}", health.rotation.healthy); -println!("Credentials Status: {}", health.credentials.healthy); - -// Get cache statistics -let cache_stats = kms.cache.stats().await; -println!("Cache hit rate: {:.2}%", cache_stats.hit_rate() * 100.0); -println!("Cache entries: {}", cache_stats.entry_count); -```text - -## Integration with Existing System - -### Environment Variable Integration - -The KMS system integrates with the existing environment-based configuration: - -```bash -# Set KMS configuration environment -export PROVISIONING_KMS_MODE=hybrid -export PROVISIONING_KMS_LOCAL_DATABASE_PATH=/var/lib/provisioning/kms.db -export PROVISIONING_KMS_REMOTE_SERVER_URL=https://kms.example.com:9998 -export PROVISIONING_KMS_CACHE_ENABLED=true -```text - -### TOML Configuration Integration - -Add KMS configuration to your existing `config.defaults.toml`: - -```toml -# Extend existing configuration with KMS section -[kms] -mode = "local" # Start with local mode -local.database_path = "{{paths.base}}/data/kms.db" -cache.enabled = true -cache.local_dir = "{{paths.base}}/cache/kms" -audit.enabled = true -```text - -### Nushell Integration - -```nushell -# Load KMS configuration -def kms_config [] { - let config_path = $"($env.PROVISIONING_BASE)/config.toml" - open $config_path | get kms -} - -# Check KMS health -def kms_health [] { - http get http://localhost:8080/kms/health | from json -} - -# List available keys -def kms_keys [] { - http get http://localhost:8080/kms/keys | from json -} -```text - -## Security Considerations - -### Key Storage Security - -1. **Local Storage**: Keys are encrypted using AES-256-GCM with keys derived from a master key using HKDF -2. **Remote Storage**: Relies on Cosmian KMS security guarantees -3. **Cache**: Cached keys have TTL and are encrypted in transit -4. **Memory**: Key material is zeroed on drop using the `zeroize` crate - -### Access Control - -1. **Authentication**: Multiple authentication methods supported -2. **Authorization**: Key usage permissions enforced -3. **Audit**: All operations logged with detailed context -4. **Network**: TLS encryption for all remote communications - -### Compliance Features - -1. **Audit Trail**: Comprehensive logging of all KMS operations -2. **Key Rotation**: Automatic rotation with configurable policies -3. **Retention**: Configurable key and audit log retention -4. **Export Controls**: Keys can be marked as non-exportable - -## Deployment Guide - -### Production Deployment - -1. **Configuration**: - - ```toml - [kms] - mode = "hybrid" - - [kms.security] - policy_enforcement = "strict" - require_strong_auth = true - - [kms.audit] - enabled = true - backend = "database" - retention_days = 365 - ``` - -2. **Infrastructure**: - - Dedicated KMS database server - - Redis cluster for caching - - HSM for high-security keys - - Backup and disaster recovery - -3. **Monitoring**: - - Health check endpoints - - Metrics collection - - Alert on key expiration - - Audit log monitoring - -### Development Setup - -1. 
**Local Development**: - - ```toml - [kms] - mode = "local" - - [kms.security] - policy_enforcement = "permissive" - - [kms.audit] - enabled = true - backend = "stdout" - ``` - -2. **Testing**: - - ```bash - # Run with test configuration - PROVISIONING_ENV=test ./control-center - ``` - -## Performance Optimization - -### Caching Strategy - -1. **Multi-level Caching**: Memory → Redis → Local File -2. **TTL Management**: Configurable per key type -3. **Cache Warming**: Preload frequently used keys -4. **Eviction Policies**: LRU with size-based eviction - -### Connection Pooling - -1. **Database Connections**: Configurable pool size -2. **HTTP Connections**: Keep-alive and connection reuse -3. **Batch Operations**: Bulk key operations where supported - -## Troubleshooting - -### Common Issues - -1. **Connection Failures**: Check network connectivity and certificates -2. **Permission Errors**: Verify file system permissions for local storage -3. **Cache Misses**: Monitor cache hit rates and adjust TTL -4. **Key Rotation**: Check rotation scheduler logs - -### Debug Mode - -```bash -# Enable debug logging -export PROVISIONING_DEBUG=true -export PROVISIONING_LOG_LEVEL=debug - -# Run with verbose output -./control-center --debug -```text - -### Health Checks - -```bash -# Check KMS health -curl http://localhost:8080/kms/health - -# Check individual components -curl http://localhost:8080/kms/health/backend -curl http://localhost:8080/kms/health/cache -curl http://localhost:8080/kms/health/rotation -```text - -## Future Enhancements - -### Planned Features - -1. **Multi-Region Support**: Cross-region key replication -2. **Key Versioning**: Multiple versions per key with rollback -3. **Policy Engine**: Fine-grained access control policies -4. **Metrics Dashboard**: Web UI for monitoring and management -5. **Integration APIs**: REST and gRPC APIs for external systems - -### Experimental Features - -1. **Zero-Knowledge Proofs**: For privacy-preserving operations -2. **Quantum-Resistant Algorithms**: Post-quantum cryptography support -3. **Federated KMS**: Multi-organization key sharing -4. **Blockchain Integration**: Immutable audit trails - -This hybrid KMS system provides a solid foundation for secure key management in the control center architecture, -with room for future enhancements and customization based on specific requirements. +# Hybrid Key Management System (KMS)\n\nA comprehensive hybrid KMS system built for the control center, supporting local/remote/hybrid modes\nwith intelligent caching, failover, and advanced security features.\n\n## Architecture Overview\n\n### Core Components\n\n1. **KMS Backends**\n - **Local Backend**: SQLite with AES-256-GCM encryption\n - **Remote Backend**: Cosmian KMS client integration\n - **Hybrid Backend**: Intelligent combination of local and remote\n\n2. **Caching System**\n - **Memory Cache**: In-memory LRU cache with TTL\n - **Redis Cache**: Distributed caching with automatic failover\n - **Local File Cache**: Persistent file-based cache for offline scenarios\n\n3. **Security Features**\n - **Encryption**: AES-256-GCM for local storage\n - **Key Derivation**: HKDF-SHA256 for master key derivation\n - **Authentication**: Multiple auth methods (certificate, token, basic, OAuth)\n - **Audit Logging**: Comprehensive audit trail for all operations\n\n4. 
**Advanced Features**\n - **Credential Management**: Automatic injection for cloud providers\n - **Key Rotation**: Configurable automatic key rotation\n - **HSM Integration**: Hardware Security Module support\n - **Zero-Knowledge Proofs**: For sensitive operations\n - **Migration Tools**: Backend-to-backend key migration\n\n## Configuration\n\n### TOML Configuration Example\n\n```\n[kms]\n# Operation mode: local, remote, or hybrid\nmode = "hybrid"\n\n# Local SQLite backend configuration\n[kms.local]\ndatabase_path = "./data/kms.db"\n\n[kms.local.master_key]\nderivation_method = "hkdf"\nsource = "generated"\niterations = 100000\n\n[kms.local.encryption]\ndefault_algorithm = "AES-256-GCM"\nkey_size_bits = 256\nauthenticated = true\n\n[kms.local.backup]\nenabled = true\nbackup_dir = "./backups/kms"\ninterval_hours = 24\nmax_backups = 7\ncompress = true\nencrypt = true\n\n# Remote Cosmian KMS configuration\n[kms.remote]\nserver_url = "https://kms.example.com:9998"\nauth_method = "certificate"\nclient_cert_path = "./certs/client.crt"\nclient_key_path = "./certs/client.key"\nca_cert_path = "./certs/ca.crt"\ntimeout_seconds = 30\nverify_ssl = true\n\n[kms.remote.retry]\nmax_attempts = 3\ninitial_delay_ms = 1000\nmax_delay_ms = 30000\nbackoff_multiplier = 2.0\n\n# Cache configuration\n[kms.cache]\nenabled = true\nbackend = "memory"\ndefault_ttl_seconds = 3600\nmax_size_bytes = 104857600 # 100MB\nlocal_dir = "./cache/kms"\n\n# Credential management\n[kms.credentials]\nenabled = true\n\n[kms.credentials.storage]\nstorage_type = "sqlite"\nencryption_key_id = "credential_encryption_key"\ndatabase_path = "./data/credentials.db"\n\n[kms.credentials.providers.aws]\nname = "AWS"\nprovider_type = "aws"\nrefresh_interval_seconds = 3600\nexpiry_warning_seconds = 300\nregions = ["us-east-1", "eu-west-1"]\n\n[kms.credentials.providers.upcloud]\nname = "UpCloud"\nprovider_type = "upcloud"\nrefresh_interval_seconds = 3600\nexpiry_warning_seconds = 300\nregions = ["fi-hel1", "us-nyc1"]\n\n# Key rotation configuration\n[kms.rotation]\nenabled = true\ninterval_seconds = 2592000 # 30 days\nmax_age_seconds = 7776000 # 90 days\nnotice_seconds = 604800 # 7 days\nschedule = "0 2 * * 0" # Every Sunday at 2 AM\n\n# Audit configuration\n[kms.audit]\nenabled = true\nbackend = "file"\nretention_days = 90\nlog_level = "info"\ninclude_data = false\nmax_file_size_mb = 100\nformat = "json"\n\n# HSM configuration\n[kms.hsm]\nenabled = false\nhsm_type = "pkcs11"\npkcs11_library = "/usr/lib/libpkcs11.so"\nslot_id = 0\n\n# Zero-knowledge proof configuration\n[kms.zkp]\nenabled = false\nproof_system = "groth16"\nsetup_params_path = "./zkp/setup.params"\n\n# Security policy\n[kms.security]\nrequire_strong_auth = true\nmin_key_length_bits = 256\nmax_key_age_days = 90\nenable_pfs = true\nallowed_algorithms = ["AES-256-GCM", "ChaCha20Poly1305", "RSA-4096", "ECDSA-P384"]\nblocked_algorithms = ["DES", "3DES", "RC4", "MD5"]\npolicy_enforcement = "strict"\n```\n\n## Usage Examples\n\n### Basic KMS Operations\n\n```\nuse control_center::kms::{KmsManager, KmsConfig, KeyData, KeyType, KeyAlgorithm, KeyUsage};\n\n// Initialize KMS manager\nlet config = KmsConfig::load_from_file("kms.toml").await?;\nlet mut kms = KmsManager::new(&config).await?;\nkms.initialize().await?;\n\n// Create a new encryption key\nlet key_data = KeyData {\n key_id: "my-encryption-key".to_string(),\n key_type: KeyType::Symmetric,\n algorithm: KeyAlgorithm::Aes256Gcm,\n usage: KeyUsage {\n encrypt: true,\n decrypt: true,\n ..Default::default()\n },\n key_size: 256,\n 
key_material: SecretBytes::new(generate_random_key(32)),\n metadata: KeyMetadata {\n name: Some("Application Encryption Key".to_string()),\n description: Some("Key for encrypting application data".to_string()),\n owner: Some("app-service".to_string()),\n environment: Some("production".to_string()),\n ..Default::default()\n },\n created_at: Utc::now(),\n last_accessed: None,\n expires_at: Some(Utc::now() + chrono::Duration::days(90)),\n status: KeyStatus::Active,\n tags: HashMap::from([\n ("purpose".to_string(), "encryption".to_string()),\n ("service".to_string(), "app".to_string()),\n ]),\n};\n\n// Store the key\nlet stored_key_id = kms.store_key(key_data).await?;\nprintln!("Key stored with ID: {}", stored_key_id);\n\n// Encrypt data\nlet plaintext = b"sensitive data to encrypt";\nlet context = HashMap::from([\n ("service".to_string(), "app".to_string()),\n ("version".to_string(), "1.0".to_string()),\n]);\nlet ciphertext = kms.encrypt(&stored_key_id, plaintext, Some(context.clone())).await?;\n\n// Decrypt data\nlet decrypted = kms.decrypt(&stored_key_id, &ciphertext, Some(context)).await?;\nassert_eq!(plaintext, decrypted.as_slice());\n\n// Get key information\nif let Some(key_info) = kms.get_key(&stored_key_id).await? {\n println!("Key algorithm: {:?}", key_info.algorithm);\n println!("Key status: {:?}", key_info.status);\n println!("Created: {}", key_info.created_at);\n}\n```\n\n### Provider Credential Management\n\n```\nuse control_center::kms::{ProviderCredentials, CredentialType};\n\n// Store AWS credentials\nlet aws_creds = ProviderCredentials {\n provider: "aws".to_string(),\n credential_type: CredentialType::AccessKey,\n access_key: "AKIAIOSFODNN7EXAMPLE".to_string(),\n secret_key: SecretBytes::new(b"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY".to_vec()),\n session_token: None,\n region: Some("us-east-1".to_string()),\n config: HashMap::new(),\n expires_at: Some(Utc::now() + chrono::Duration::hours(12)),\n created_at: Utc::now(),\n};\n\nkms.store_provider_credentials("aws", aws_creds).await?;\n\n// Retrieve credentials for automatic injection\nif let Some(creds) = kms.get_provider_credentials("aws").await? 
{\n println!("AWS Access Key: {}", creds.access_key);\n // Credentials are automatically injected into environment variables\n // or configuration files based on the injection configuration\n}\n```\n\n### Health Monitoring\n\n```\n// Check system health\nlet health = kms.health_check().await?;\nprintln!("KMS Health: {}", health.overall);\nprintln!("Backend Status: {}", health.backend.healthy);\nprintln!("Rotation Status: {}", health.rotation.healthy);\nprintln!("Credentials Status: {}", health.credentials.healthy);\n\n// Get cache statistics\nlet cache_stats = kms.cache.stats().await;\nprintln!("Cache hit rate: {:.2}%", cache_stats.hit_rate() * 100.0);\nprintln!("Cache entries: {}", cache_stats.entry_count);\n```\n\n## Integration with Existing System\n\n### Environment Variable Integration\n\nThe KMS system integrates with the existing environment-based configuration:\n\n```\n# Set KMS configuration environment\nexport PROVISIONING_KMS_MODE=hybrid\nexport PROVISIONING_KMS_LOCAL_DATABASE_PATH=/var/lib/provisioning/kms.db\nexport PROVISIONING_KMS_REMOTE_SERVER_URL=https://kms.example.com:9998\nexport PROVISIONING_KMS_CACHE_ENABLED=true\n```\n\n### TOML Configuration Integration\n\nAdd KMS configuration to your existing `config.defaults.toml`:\n\n```\n# Extend existing configuration with KMS section\n[kms]\nmode = "local" # Start with local mode\nlocal.database_path = "{{paths.base}}/data/kms.db"\ncache.enabled = true\ncache.local_dir = "{{paths.base}}/cache/kms"\naudit.enabled = true\n```\n\n### Nushell Integration\n\n```\n# Load KMS configuration\ndef kms_config [] {\n let config_path = $"($env.PROVISIONING_BASE)/config.toml"\n open $config_path | get kms\n}\n\n# Check KMS health\ndef kms_health [] {\n http get http://localhost:8080/kms/health | from json\n}\n\n# List available keys\ndef kms_keys [] {\n http get http://localhost:8080/kms/keys | from json\n}\n```\n\n## Security Considerations\n\n### Key Storage Security\n\n1. **Local Storage**: Keys are encrypted using AES-256-GCM with keys derived from a master key using HKDF\n2. **Remote Storage**: Relies on Cosmian KMS security guarantees\n3. **Cache**: Cached keys have TTL and are encrypted in transit\n4. **Memory**: Key material is zeroed on drop using the `zeroize` crate\n\n### Access Control\n\n1. **Authentication**: Multiple authentication methods supported\n2. **Authorization**: Key usage permissions enforced\n3. **Audit**: All operations logged with detailed context\n4. **Network**: TLS encryption for all remote communications\n\n### Compliance Features\n\n1. **Audit Trail**: Comprehensive logging of all KMS operations\n2. **Key Rotation**: Automatic rotation with configurable policies\n3. **Retention**: Configurable key and audit log retention\n4. **Export Controls**: Keys can be marked as non-exportable\n\n## Deployment Guide\n\n### Production Deployment\n\n1. **Configuration**:\n\n ```toml\n [kms]\n mode = "hybrid"\n\n [kms.security]\n policy_enforcement = "strict"\n require_strong_auth = true\n\n [kms.audit]\n enabled = true\n backend = "database"\n retention_days = 365\n ```\n\n2. **Infrastructure**:\n - Dedicated KMS database server\n - Redis cluster for caching\n - HSM for high-security keys\n - Backup and disaster recovery\n\n3. **Monitoring**:\n - Health check endpoints\n - Metrics collection\n - Alert on key expiration\n - Audit log monitoring\n\n### Development Setup\n\n1. 
**Local Development**:\n\n ```toml\n [kms]\n mode = "local"\n\n [kms.security]\n policy_enforcement = "permissive"\n\n [kms.audit]\n enabled = true\n backend = "stdout"\n ```\n\n2. **Testing**:\n\n ```bash\n # Run with test configuration\n PROVISIONING_ENV=test ./control-center\n ```\n\n## Performance Optimization\n\n### Caching Strategy\n\n1. **Multi-level Caching**: Memory → Redis → Local File\n2. **TTL Management**: Configurable per key type\n3. **Cache Warming**: Preload frequently used keys\n4. **Eviction Policies**: LRU with size-based eviction\n\n### Connection Pooling\n\n1. **Database Connections**: Configurable pool size\n2. **HTTP Connections**: Keep-alive and connection reuse\n3. **Batch Operations**: Bulk key operations where supported\n\n## Troubleshooting\n\n### Common Issues\n\n1. **Connection Failures**: Check network connectivity and certificates\n2. **Permission Errors**: Verify file system permissions for local storage\n3. **Cache Misses**: Monitor cache hit rates and adjust TTL\n4. **Key Rotation**: Check rotation scheduler logs\n\n### Debug Mode\n\n```\n# Enable debug logging\nexport PROVISIONING_DEBUG=true\nexport PROVISIONING_LOG_LEVEL=debug\n\n# Run with verbose output\n./control-center --debug\n```\n\n### Health Checks\n\n```\n# Check KMS health\ncurl http://localhost:8080/kms/health\n\n# Check individual components\ncurl http://localhost:8080/kms/health/backend\ncurl http://localhost:8080/kms/health/cache\ncurl http://localhost:8080/kms/health/rotation\n```\n\n## Future Enhancements\n\n### Planned Features\n\n1. **Multi-Region Support**: Cross-region key replication\n2. **Key Versioning**: Multiple versions per key with rollback\n3. **Policy Engine**: Fine-grained access control policies\n4. **Metrics Dashboard**: Web UI for monitoring and management\n5. **Integration APIs**: REST and gRPC APIs for external systems\n\n### Experimental Features\n\n1. **Zero-Knowledge Proofs**: For privacy-preserving operations\n2. **Quantum-Resistant Algorithms**: Post-quantum cryptography support\n3. **Federated KMS**: Multi-organization key sharing\n4. **Blockchain Integration**: Immutable audit trails\n\nThis hybrid KMS system provides a solid foundation for secure key management in the control center architecture,\nwith room for future enhancements and customization based on specific requirements. \ No newline at end of file diff --git a/crates/control-center/web/README.md b/crates/control-center/web/README.md index 7166371..5368830 100644 --- a/crates/control-center/web/README.md +++ b/crates/control-center/web/README.md @@ -1,180 +1 @@ -# Control Center Web UI - -React/TypeScript frontend for the Control Center vault secrets management. 
-
-## Features
-
-- **Secrets List**: Browse and filter vault secrets
-- **Secret View**: View secret details with show/hide value toggle
-- **Secret Create/Edit**: Create new secrets or update existing ones
-- **Secret History**: View version history and restore previous versions
-- **Copy to Clipboard**: Easy copy functionality for secret values
-- **Responsive Design**: Works on desktop and mobile devices
-
-## Components
-
-### Core Components
-
-- **SecretsManager**: Main orchestrator component
-- **SecretsList**: List view with pagination and filtering
-- **SecretView**: Detailed secret view with metadata
-- **SecretCreate**: Create/edit form for secrets
-- **SecretHistory**: Version history with restore functionality
-
-### API Client
-
-- **secretsApi**: HTTP client for vault secrets endpoints
-- Type-safe request/response handling
-- Error handling with custom error types
-
-## Prerequisites
-
-- Node.js 18+
-- npm or yarn
-- Control Center backend running on http://localhost:8080
-
-## Installation
-
-```bash
-cd provisioning/platform/control-center/web
-npm install
-```
-
-## Development
-
-```bash
-# Start development server
-npm start
-
-# Build for production
-npm run build
-
-# Run tests
-npm test
-
-# Lint code
-npm run lint
-
-# Format code
-npm run format
-```
-
-## Environment Variables
-
-Create a `.env` file in the web directory:
-
-```bash
-REACT_APP_API_URL=http://localhost:8080
-```
-
-## Usage
-
-### Import and Use
-
-```typescript
-import { SecretsManager } from './components/secrets';
-
-function App() {
-  return (
-    <div className="App">
-      <SecretsManager />
-    </div>
-  );
-}
-```
-
-### API Client
-
-```typescript
-import { secretsApi } from './api/secrets';
-
-// Create a secret
-const secret = await secretsApi.createSecret({
-  path: 'database/prod/password',
-  value: 'my-secret-value',
-  context: 'production',
-  metadata: { description: 'Production DB password' },
-});
-
-// Get a secret
-const secretData = await secretsApi.getSecret('database/prod/password');
-
-// List secrets
-const { secrets, total } = await secretsApi.listSecrets({
-  prefix: 'database/',
-  limit: 50,
-  offset: 0,
-});
-
-// Update secret
-await secretsApi.updateSecret('database/prod/password', {
-  value: 'new-secret-value',
-});
-
-// Delete secret
-await secretsApi.deleteSecret('database/prod/password');
-
-// Get history
-const history = await secretsApi.getSecretHistory('database/prod/password');
-
-// Restore version
-await secretsApi.restoreSecretVersion('database/prod/password', 2);
-```
-
-## Architecture
-
-```plaintext
-SecretsManager (Orchestrator)
-  ├── SecretsList (Browse)
-  ├── SecretView (Detail)
-  ├── SecretCreate (Create/Edit)
-  └── SecretHistory (Versions)
-         ↓
-  secretsApi (HTTP Client)
-         ↓
-  Control Center Backend API
-         ↓
-  KMS Service (Encryption)
-         ↓
-  RustyVault (Storage)
-```
-
-## Security
-
-- **MFA Required**: All secret operations require MFA verification
-- **RBAC**: Role-based access control enforced by backend
-- **Encrypted Storage**: Values encrypted via KMS Service before storage
-- **Audit Trail**: All operations logged for compliance
-- **No Plaintext**: Values never stored unencrypted
-- **Context Encryption**: Optional AAD for additional security
-
-## TypeScript Types
-
-All components are fully typed. See `src/types/secrets.ts` for type definitions:
-
-- `Secret`: Secret metadata
-- `SecretWithValue`: Secret with decrypted value
-- `SecretVersion`: Version information
-- `SecretHistory`: Complete version history
-- `CreateSecretRequest`: Create request payload
-- `UpdateSecretRequest`: Update request payload
-- `ListSecretsQuery`: List query parameters
-- `ApiError`: Error response type
-
-## Styling
-
-Custom CSS in `src/components/secrets/secrets.css`. Modify to match your design system.
-
-## Browser Support
-
-- Chrome/Edge 90+
-- Firefox 88+
-- Safari 14+
-
-## License
-
-See project root LICENSE file.
-
-## Contributing
-
-See project root CONTRIBUTING.md for contribution guidelines.
+# Control Center Web UI\n\nReact/TypeScript frontend for the Control Center vault secrets management.\n\n## Features\n\n- **Secrets List**: Browse and filter vault secrets\n- **Secret View**: View secret details with show/hide value toggle\n- **Secret Create/Edit**: Create new secrets or update existing ones\n- **Secret History**: View version history and restore previous versions\n- **Copy to Clipboard**: Easy copy functionality for secret values\n- **Responsive Design**: Works on desktop and mobile devices\n\n## Components\n\n### Core Components\n\n- **SecretsManager**: Main orchestrator component\n- **SecretsList**: List view with pagination and filtering\n- **SecretView**: Detailed secret view with metadata\n- **SecretCreate**: Create/edit form for secrets\n- **SecretHistory**: Version history with restore functionality\n\n### API Client\n\n- **secretsApi**: HTTP client for vault secrets endpoints\n- Type-safe request/response handling\n- Error handling with custom error types\n\n## Prerequisites\n\n- Node.js 18+\n- npm or yarn\n- Control Center backend running on \n\n## Installation\n\n```\ncd provisioning/platform/control-center/web\nnpm install\n```\n\n## Development\n\n```\n# Start development server\nnpm start\n\n# Build for production\nnpm build\n\n# Run tests\nnpm test\n\n# Lint code\nnpm run lint\n\n# Format code\nnpm run format\n```\n\n## Environment Variables\n\nCreate a `.env` file in the web directory:\n\n```\nREACT_APP_API_URL=http://localhost:8080\n```\n\n## Usage\n\n### Import and Use\n\n```\nimport { SecretsManager } from './components/secrets';\n\nfunction App() {\n return (\n
\n \n
\n );\n}\n```\n\n### API Client\n\n```\nimport { secretsApi } from './api/secrets';\n\n// Create a secret\nconst secret = await secretsApi.createSecret({\n path: 'database/prod/password',\n value: 'my-secret-value',\n context: 'production',\n metadata: { description: 'Production DB password' },\n});\n\n// Get a secret\nconst secretData = await secretsApi.getSecret('database/prod/password');\n\n// List secrets\nconst { secrets, total } = await secretsApi.listSecrets({\n prefix: 'database/',\n limit: 50,\n offset: 0,\n});\n\n// Update secret\nawait secretsApi.updateSecret('database/prod/password', {\n value: 'new-secret-value',\n});\n\n// Delete secret\nawait secretsApi.deleteSecret('database/prod/password');\n\n// Get history\nconst history = await secretsApi.getSecretHistory('database/prod/password');\n\n// Restore version\nawait secretsApi.restoreSecretVersion('database/prod/password', 2);\n```\n\n## Architecture\n\n```\nSecretsManager (Orchestrator)\n ├── SecretsList (Browse)\n ├── SecretView (Detail)\n ├── SecretCreate (Create/Edit)\n └── SecretHistory (Versions)\n ↓\n secretsApi (HTTP Client)\n ↓\n Control Center Backend API\n ↓\n KMS Service (Encryption)\n ↓\n RustyVault (Storage)\n```\n\n## Security\n\n- **MFA Required**: All secret operations require MFA verification\n- **RBAC**: Role-based access control enforced by backend\n- **Encrypted Storage**: Values encrypted via KMS Service before storage\n- **Audit Trail**: All operations logged for compliance\n- **No Plaintext**: Values never stored unencrypted\n- **Context Encryption**: Optional AAD for additional security\n\n## TypeScript Types\n\nAll components are fully typed. See `src/types/secrets.ts` for type definitions:\n\n- `Secret`: Secret metadata\n- `SecretWithValue`: Secret with decrypted value\n- `SecretVersion`: Version information\n- `SecretHistory`: Complete version history\n- `CreateSecretRequest`: Create request payload\n- `UpdateSecretRequest`: Update request payload\n- `ListSecretsQuery`: List query parameters\n- `ApiError`: Error response type\n\n## Styling\n\nCustom CSS in `src/components/secrets/secrets.css`. Modify to match your design system.\n\n## Browser Support\n\n- Chrome/Edge 90+\n- Firefox 88+\n- Safari 14+\n\n## License\n\nSee project root LICENSE file.\n\n## Contributing\n\nSee project root CONTRIBUTING.md for contribution guidelines. \ No newline at end of file diff --git a/crates/extension-registry/API.md b/crates/extension-registry/API.md index 4d4711e..00a81d3 100644 --- a/crates/extension-registry/API.md +++ b/crates/extension-registry/API.md @@ -1,586 +1 @@ -# Extension Registry API Documentation - -Version: 1.0.0 -Base URL: `http://localhost:8082/api/v1` - -## Table of Contents - -- [Authentication](#authentication) -- [Extension Endpoints](#extension-endpoints) -- [System Endpoints](#system-endpoints) -- [Error Responses](#error-responses) -- [Data Models](#data-models) - -## Authentication - -The Extension Registry API does not require authentication for read operations. Backend authentication (Gitea/OCI) is handled server-side via -configuration. - -## Extension Endpoints - -### List Extensions - -Retrieve a list of available extensions with optional filtering and pagination. 
-
-**Endpoint**: `GET /extensions`
-
-**Query Parameters**:
-
-| Parameter | Type | Required | Description |
-| ----------- | ------ | ---------- | ------------- |
-| `type` | string | No | Filter by extension type: `provider`, `taskserv`, `cluster` |
-| `source` | string | No | Filter by source: `gitea`, `oci` |
-| `limit` | integer | No | Maximum results (default: 100, max: 1000) |
-| `offset` | integer | No | Pagination offset (default: 0) |
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/extensions?type=provider&limit=10"
-```
-
-**Example Response** (200 OK):
-
-```json
-[
-  {
-    "name": "aws",
-    "type": "provider",
-    "version": "1.2.0",
-    "description": "AWS provider for provisioning infrastructure",
-    "author": "provisioning-team",
-    "repository": "https://gitea.example.com/org/aws_prov",
-    "source": "gitea",
-    "published_at": "2025-10-06T12:00:00Z",
-    "download_url": "https://gitea.example.com/org/aws_prov/releases/download/1.2.0/aws_prov.tar.gz",
-    "checksum": "sha256:abc123...",
-    "size": 1024000,
-    "tags": ["cloud", "aws", "infrastructure"]
-  },
-  {
-    "name": "upcloud",
-    "type": "provider",
-    "version": "2.1.3",
-    "description": "UpCloud provider for European cloud infrastructure",
-    "author": "provisioning-team",
-    "repository": "https://gitea.example.com/org/upcloud_prov",
-    "source": "gitea",
-    "published_at": "2025-10-05T10:30:00Z",
-    "download_url": "https://gitea.example.com/org/upcloud_prov/releases/download/2.1.3/upcloud_prov.tar.gz",
-    "size": 890000
-  }
-]
-```
-
-----
-
-### Get Extension Metadata
-
-Retrieve detailed metadata for a specific extension.
-
-**Endpoint**: `GET /extensions/{type}/{name}`
-
-**Path Parameters**:
-
-| Parameter | Type | Required | Description |
-| ----------- | ------ | ---------- | ------------- |
-| `type` | string | Yes | Extension type: `provider`, `taskserv`, `cluster` |
-| `name` | string | Yes | Extension name |
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/extensions/provider/aws"
-```
-
-**Example Response** (200 OK):
-
-```json
-{
-  "name": "aws",
-  "type": "provider",
-  "version": "1.2.0",
-  "description": "AWS provider for provisioning infrastructure",
-  "author": "provisioning-team",
-  "repository": "https://gitea.example.com/org/aws_prov",
-  "source": "gitea",
-  "published_at": "2025-10-06T12:00:00Z",
-  "download_url": "https://gitea.example.com/org/aws_prov/releases/download/1.2.0/aws_prov.tar.gz",
-  "checksum": "sha256:abc123...",
-  "size": 1024000,
-  "tags": ["cloud", "aws", "infrastructure"]
-}
-```
-
-**Error Response** (404 Not Found):
-
-```json
-{
-  "error": "not_found",
-  "message": "Extension provider/nonexistent not found"
-}
-```
-
-----
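The metadata payload above maps cleanly onto a typed client model. A minimal Rust sketch using `reqwest` (with its `json` feature) and `serde`; the struct is mine, derived from the JSON shown above, and models only a subset of the fields:

```rust
use serde::Deserialize;

// Client-side view of the metadata response; fields mirror the JSON above.
#[derive(Debug, Deserialize)]
struct ExtensionMeta {
    name: String,
    #[serde(rename = "type")]
    kind: String,
    version: String,
    description: String,
    download_url: Option<String>,
    checksum: Option<String>,
    size: Option<u64>,
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    let meta: ExtensionMeta =
        reqwest::get("http://localhost:8082/api/v1/extensions/provider/aws")
            .await?
            .error_for_status()? // turns the 404 "not_found" case into an Err
            .json()
            .await?;
    println!("{} {} ({:?} bytes)", meta.name, meta.version, meta.size);
    Ok(())
}
```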
-
-### List Extension Versions
-
-Get all available versions for a specific extension.
-
-**Endpoint**: `GET /extensions/{type}/{name}/versions`
-
-**Path Parameters**:
-
-| Parameter | Type | Required | Description |
-| ----------- | ------ | ---------- | ------------- |
-| `type` | string | Yes | Extension type: `provider`, `taskserv`, `cluster` |
-| `name` | string | Yes | Extension name |
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/extensions/taskserv/kubernetes/versions"
-```
-
-**Example Response** (200 OK):
-
-```json
-[
-  {
-    "version": "1.28.0",
-    "published_at": "2025-10-06T12:00:00Z",
-    "download_url": "https://gitea.example.com/org/kubernetes_taskserv/releases/download/1.28.0/kubernetes_taskserv.tar.gz",
-    "checksum": "sha256:def456...",
-    "size": 2048000
-  },
-  {
-    "version": "1.27.5",
-    "published_at": "2025-09-15T10:30:00Z",
-    "download_url": "https://gitea.example.com/org/kubernetes_taskserv/releases/download/1.27.5/kubernetes_taskserv.tar.gz",
-    "checksum": "sha256:ghi789...",
-    "size": 1980000
-  },
-  {
-    "version": "1.27.4",
-    "published_at": "2025-08-20T08:15:00Z",
-    "download_url": "https://gitea.example.com/org/kubernetes_taskserv/releases/download/1.27.4/kubernetes_taskserv.tar.gz",
-    "size": 1950000
-  }
-]
-```
-
-----
-
-### Download Extension
-
-Download a specific version of an extension.
-
-**Endpoint**: `GET /extensions/{type}/{name}/{version}`
-
-**Path Parameters**:
-
-| Parameter | Type | Required | Description |
-| ----------- | ------ | ---------- | ------------- |
-| `type` | string | Yes | Extension type: `provider`, `taskserv`, `cluster` |
-| `name` | string | Yes | Extension name |
-| `version` | string | Yes | Extension version (e.g., `1.2.0`) |
-
-**Example Request**:
-
-```bash
-curl -OJ "http://localhost:8082/api/v1/extensions/provider/aws/1.2.0"
-```
-
-**Response**:
-
-- **Content-Type**: `application/octet-stream`
-- **Body**: Binary data (tarball or archive)
-
-**Error Response** (404 Not Found):
-
-```json
-{
-  "error": "not_found",
-  "message": "Extension provider/aws version 1.2.0 not found"
-}
-```
-
-----
-
-### Search Extensions
-
-Search for extensions by name or description.
-
-**Endpoint**: `GET /extensions/search`
-
-**Query Parameters**:
-
-| Parameter | Type | Required | Description |
-| ----------- | ------ | ---------- | ------------- |
-| `q` | string | Yes | Search query (case-insensitive) |
-| `type` | string | No | Filter by extension type |
-| `limit` | integer | No | Maximum results (default: 50, max: 100) |
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/extensions/search?q=kubernetes&type=taskserv&limit=5"
-```
-
-**Example Response** (200 OK):
-
-```json
-[
-  {
-    "name": "kubernetes",
-    "type": "taskserv",
-    "version": "1.28.0",
-    "description": "Kubernetes container orchestration platform",
-    "author": "provisioning-team",
-    "source": "gitea",
-    "published_at": "2025-10-06T12:00:00Z"
-  },
-  {
-    "name": "k3s",
-    "type": "taskserv",
-    "version": "1.27.5",
-    "description": "Lightweight Kubernetes distribution",
-    "author": "community",
-    "source": "oci",
-    "published_at": "2025-09-20T14:30:00Z"
-  }
-]
-```
-
-----
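Tying the endpoints above together: the download route returns raw bytes, and when the metadata carries a `sha256:`-prefixed checksum a client can verify the archive before using it. A sketch using `reqwest`, `sha2`, and `hex` (error handling kept minimal; the URL and expected checksum would come from the metadata and version responses):

```rust
use sha2::{Digest, Sha256};

// Download an extension archive and verify it against a "sha256:<hex>" string.
async fn download_verified(
    url: &str,
    expected: &str,
) -> Result<Vec<u8>, Box<dyn std::error::Error>> {
    let body = reqwest::get(url).await?.error_for_status()?.bytes().await?;
    let digest = hex::encode(Sha256::digest(&body));
    match expected.strip_prefix("sha256:") {
        Some(want) if want.eq_ignore_ascii_case(&digest) => Ok(body.to_vec()),
        _ => Err(format!("checksum mismatch: got sha256:{digest}").into()),
    }
}
```

This mirrors the manual `sha256sum` step shown in the complete workflow at the end of this document.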
-
-## System Endpoints
-
-### Health Check
-
-Check service health and backend status.
-
-**Endpoint**: `GET /health`
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/health"
-```
-
-**Example Response** (200 OK):
-
-```json
-{
-  "status": "healthy",
-  "version": "0.1.0",
-  "uptime": 3600,
-  "backends": {
-    "gitea": {
-      "enabled": true,
-      "healthy": true
-    },
-    "oci": {
-      "enabled": true,
-      "healthy": true,
-      "error": null
-    }
-  }
-}
-```
-
-**Degraded Status** (200 OK):
-
-```json
-{
-  "status": "degraded",
-  "version": "0.1.0",
-  "uptime": 7200,
-  "backends": {
-    "gitea": {
-      "enabled": true,
-      "healthy": false,
-      "error": "Connection timeout"
-    },
-    "oci": {
-      "enabled": true,
-      "healthy": true
-    }
-  }
-}
-```
-
----
-
-### Metrics
-
-Get Prometheus-formatted metrics.
-
-**Endpoint**: `GET /metrics`
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/metrics"
-```
-
-**Example Response** (200 OK):
-
-```plaintext
-# HELP http_requests_total Total HTTP requests
-# TYPE http_requests_total counter
-http_requests_total 1234
-
-# HELP http_request_duration_seconds HTTP request duration
-# TYPE http_request_duration_seconds histogram
-http_request_duration_seconds_bucket{le="0.005"} 100
-http_request_duration_seconds_bucket{le="0.01"} 200
-http_request_duration_seconds_bucket{le="0.025"} 300
-http_request_duration_seconds_sum 50.5
-http_request_duration_seconds_count 1234
-
-# HELP cache_hits_total Total cache hits
-# TYPE cache_hits_total counter
-cache_hits_total 987
-
-# HELP cache_misses_total Total cache misses
-# TYPE cache_misses_total counter
-cache_misses_total 247
-
-# HELP extensions_total Total extensions
-# TYPE extensions_total gauge
-extensions_total 45
-```
-
----
-
-### Cache Statistics
-
-Get cache performance statistics.
-
-**Endpoint**: `GET /cache/stats`
-
-**Example Request**:
-
-```bash
-curl "http://localhost:8082/api/v1/cache/stats"
-```
-
-**Example Response** (200 OK):
-
-```json
-{
-  "list_entries": 45,
-  "metadata_entries": 120,
-  "version_entries": 80,
-  "total_entries": 245
-}
-```
-
----
-
-## Error Responses
-
-All error responses follow this format:
-
-```json
-{
-  "error": "error_type",
-  "message": "Human-readable error message",
-  "details": "Optional additional details"
-}
-```
-
-### HTTP Status Codes
-
-| Status | Description |
-| -------- | ------------- |
-| 200 OK | Request successful |
-| 400 Bad Request | Invalid input (e.g., invalid extension type) |
-| 401 Unauthorized | Authentication failed |
-| 404 Not Found | Resource not found |
-| 429 Too Many Requests | Rate limit exceeded |
-| 500 Internal Server Error | Server error |
-| 503 Service Unavailable | Service temporarily unavailable |
-
-### Error Types
-
-| Error Type | HTTP Status | Description |
-| ------------ | ------------- | ------------- |
-| `not_found` | 404 | Extension or resource not found |
-| `invalid_type` | 400 | Invalid extension type provided |
-| `invalid_version` | 400 | Invalid version format |
-| `auth_error` | 401 | Authentication failed |
-| `rate_limit` | 429 | Too many requests |
-| `config_error` | 500 | Server configuration error |
-| `internal_error` | 500 | Internal server error |
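-
-Because all errors share one JSON shape, a client can branch on the HTTP status and fall back to the `message` field. A minimal sketch (assuming `jq`; the statuses follow the tables above):
-
-```bash
-# Fetch metadata and react to the documented status codes
-status=$(curl -s -o /tmp/resp.json -w "%{http_code}" \
-  "http://localhost:8082/api/v1/extensions/provider/aws")
-case "$status" in
-  200) jq -r '.version' /tmp/resp.json ;;
-  404) echo "not found: $(jq -r '.message' /tmp/resp.json)" >&2 ;;
-  429) echo "rate limited, retry later" >&2 ;;
-  *)   echo "unexpected status $status" >&2 ;;
-esac
-```
-
----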
-
-## Data Models
-
-### Extension
-
-```typescript
-interface Extension {
-  name: string;            // Extension name
-  type: ExtensionType;     // Extension type
-  version: string;         // Current version (semver)
-  description: string;     // Description
-  author?: string;         // Author/organization
-  repository?: string;     // Source repository URL
-  source: ExtensionSource; // Backend source
-  published_at: string;    // ISO 8601 timestamp
-  download_url?: string;   // Download URL
-  checksum?: string;       // Checksum (e.g., sha256:...)
-  size?: number;           // Size in bytes
-  tags?: string[];         // Tags
-}
-```
-
-### ExtensionType
-
-```typescript
-type ExtensionType = "provider" | "taskserv" | "cluster";
-```
-
-### ExtensionSource
-
-```typescript
-type ExtensionSource = "gitea" | "oci";
-```
-
-### ExtensionVersion
-
-```typescript
-interface ExtensionVersion {
-  version: string;         // Version string (semver)
-  published_at: string;    // ISO 8601 timestamp
-  download_url?: string;   // Download URL
-  checksum?: string;       // Checksum
-  size?: number;           // Size in bytes
-}
-```
-
-### HealthResponse
-
-```typescript
-interface HealthResponse {
-  status: string;           // "healthy" | "degraded"
-  version: string;          // Service version
-  uptime: number;           // Uptime in seconds
-  backends: BackendHealth;  // Backend health status
-}
-```
-
-### BackendHealth
-
-```typescript
-interface BackendHealth {
-  gitea: BackendStatus;
-  oci: BackendStatus;
-}
-```
-
-### BackendStatus
-
-```typescript
-interface BackendStatus {
-  enabled: boolean;  // Backend enabled
-  healthy: boolean;  // Backend healthy
-  error?: string;    // Error message if unhealthy
-}
-```
-
----
-
-## Rate Limiting
-
-Currently, the API does not enforce rate limiting. This may be added in future versions.
-
-For high-volume usage, consider:
-
-- Implementing client-side rate limiting
-- Using the cache effectively
-- Batching requests when possible
-
----
-
-## Caching Behavior
-
-The service implements LRU caching with TTL:
-
-- **Cache TTL**: Configurable (default: 5 minutes)
-- **Cache Capacity**: Configurable (default: 1000 entries)
-- **Cache Keys**:
-  - List: `list:{type}:{source}`
-  - Metadata: `{type}/{name}`
-  - Versions: `{type}/{name}/versions`
-
-Cache headers are not currently exposed. Future versions may include:
-
-- `X-Cache-Hit: true/false`
-- `X-Cache-TTL: {seconds}`
-
----
-
-## Versioning
-
-API version is specified in the URL path: `/api/v1/`
-
-Major version changes will be introduced in new paths (e.g., `/api/v2/`).
-
----
-
-## Examples
-
-### Complete Workflow
-
-```bash
-# 1. Search for Kubernetes extensions
-curl "http://localhost:8082/api/v1/extensions/search?q=kubernetes"
-
-# 2. Get extension metadata
-curl "http://localhost:8082/api/v1/extensions/taskserv/kubernetes"
-
-# 3. List available versions
-curl "http://localhost:8082/api/v1/extensions/taskserv/kubernetes/versions"
-
-# 4. Download specific version
-curl -OJ "http://localhost:8082/api/v1/extensions/taskserv/kubernetes/1.28.0"
-
-# 5. Verify checksum (if provided)
-sha256sum kubernetes_taskserv.tar.gz
-```
-
-### Pagination
-
-```bash
-# Get first page
-curl "http://localhost:8082/api/v1/extensions?limit=10&offset=0"
-
-# Get second page
-curl "http://localhost:8082/api/v1/extensions?limit=10&offset=10"
-
-# Get third page
-curl "http://localhost:8082/api/v1/extensions?limit=10&offset=20"
-```
-
-### Filtering
-
-```bash
-# Only providers from Gitea
-curl "http://localhost:8082/api/v1/extensions?type=provider&source=gitea"
-
-# Only taskservs from OCI
-curl "http://localhost:8082/api/v1/extensions?type=taskserv&source=oci"
-
-# All clusters
-curl "http://localhost:8082/api/v1/extensions?type=cluster"
-```
-
----
-
-## Support
-
-For issues and questions, see the main README or project documentation.
diff --git a/crates/extension-registry/README.md b/crates/extension-registry/README.md
index a0a14bb..4801e43 100644
--- a/crates/extension-registry/README.md
+++ b/crates/extension-registry/README.md
@@ -1,635 +1 @@
-# Extension Registry Service
-
-A high-performance Rust microservice that provides a unified REST API for extension discovery, versioning,
-and download from multiple sources (Gitea releases and OCI registries).
-
-## Features
-
-- **Multi-Backend Support**: Fetch extensions from Gitea releases and OCI registries
-- **Unified REST API**: Single API for all extension operations
-- **Smart Caching**: LRU cache with TTL to reduce backend API calls
-- **Prometheus Metrics**: Built-in metrics for monitoring
-- **Health Monitoring**: Health checks for all backends
-- **Type-Safe**: Strong typing for extension metadata
-- **Async/Await**: High-performance async operations with Tokio
-- **Docker Support**: Production-ready containerization
-
-## Architecture
-
-```plaintext
-┌──────────────────────────────────────────────────────────┐
-│                 Extension Registry API                   │
-│                        (axum)                            │
-├──────────────────────────────────────────────────────────┤
-│                                                          │
-│  ┌────────────────┐  ┌────────────────┐  ┌────────────┐  │
-│  │  Gitea Client  │  │   OCI Client   │  │ LRU Cache  │  │
-│  │   (reqwest)    │  │   (reqwest)    │  │ (parking)  │  │
-│  └────────────────┘  └────────────────┘  └────────────┘  │
-│          │                   │                  │        │
-└──────────┼───────────────────┼──────────────────┼────────┘
-           │                   │                  │
-           ▼                   ▼                  ▼
-     ┌──────────┐        ┌──────────┐       ┌──────────┐
-     │  Gitea   │        │   OCI    │       │  Memory  │
-     │ Releases │        │ Registry │       │          │
-     └──────────┘        └──────────┘       └──────────┘
-```
-
-## Installation
-
-### Building from Source
-
-```bash
-cd provisioning/platform/extension-registry
-cargo build --release
-```
-
-### Docker Build
-
-```bash
-docker build -t extension-registry:latest .
-```
-
-### Running with Cargo
-
-```bash
-cargo run -- --config config.toml --port 8082
-```
-
-### Running with Docker
-
-```bash
-docker run -d \
-  -p 8082:8082 \
-  -v $(pwd)/config.toml:/app/config.toml:ro \
-  -v $(pwd)/tokens:/app/tokens:ro \
-  extension-registry:latest
-```
-
-## Configuration
-
-Create a `config.toml` file (see `config.example.toml`):
-
-```toml
-[server]
-host = "0.0.0.0"
-port = 8082
-workers = 4
-enable_cors = true
-enable_compression = true
-
-# Gitea backend (optional)
-[gitea]
-url = "https://gitea.example.com"
-organization = "provisioning-extensions"
-token_path = "/path/to/gitea-token.txt"
-timeout_seconds = 30
-verify_ssl = true
-
-# OCI registry backend (optional)
-[oci]
-registry = "registry.example.com"
-namespace = "provisioning"
-auth_token_path = "/path/to/oci-token.txt"
-timeout_seconds = 30
-verify_ssl = true
-
-# Cache configuration
-[cache]
-capacity = 1000
-ttl_seconds = 300
-enable_metadata_cache = true
-enable_list_cache = true
-```
-
-**Note**: At least one backend (Gitea or OCI) must be configured.
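-
-With the file in place, a quick smoke test confirms the service starts and its configured backends report healthy (a sketch; the port and health path are the defaults used throughout this README):
-
-```bash
-# Start against the new config, then probe the health endpoint
-cargo run -- --config config.toml --port 8082 &
-sleep 2
-curl -s "http://localhost:8082/api/v1/health"
-```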
-
-## API Endpoints
-
-### Extension Operations
-
-#### List Extensions
-
-```bash
-GET /api/v1/extensions
-```
-
-Query parameters:
-
-- `type` (optional): Filter by extension type (`provider`, `taskserv`, `cluster`)
-- `source` (optional): Filter by source (`gitea`, `oci`)
-- `limit` (optional): Maximum results (default: 100)
-- `offset` (optional): Pagination offset (default: 0)
-
-Example (the URL is quoted so the shell does not interpret `&`):
-
-```bash
-curl "http://localhost:8082/api/v1/extensions?type=provider&limit=10"
-```
-
-Response:
-
-```json
-[
-  {
-    "name": "aws",
-    "type": "provider",
-    "version": "1.2.0",
-    "description": "AWS provider for provisioning",
-    "author": "provisioning-team",
-    "repository": "https://gitea.example.com/org/aws_prov",
-    "source": "gitea",
-    "published_at": "2025-10-06T12:00:00Z",
-    "download_url": "https://gitea.example.com/org/aws_prov/releases/download/1.2.0/aws_prov.tar.gz",
-    "size": 1024000
-  }
-]
-```
-
-#### Get Extension
-
-```bash
-GET /api/v1/extensions/{type}/{name}
-```
-
-Example:
-
-```bash
-curl http://localhost:8082/api/v1/extensions/provider/aws
-```
-
-#### List Versions
-
-```bash
-GET /api/v1/extensions/{type}/{name}/versions
-```
-
-Example:
-
-```bash
-curl http://localhost:8082/api/v1/extensions/provider/aws/versions
-```
-
-Response:
-
-```json
-[
-  {
-    "version": "1.2.0",
-    "published_at": "2025-10-06T12:00:00Z",
-    "download_url": "https://gitea.example.com/org/aws_prov/releases/download/1.2.0/aws_prov.tar.gz",
-    "size": 1024000
-  },
-  {
-    "version": "1.1.0",
-    "published_at": "2025-09-15T10:30:00Z",
-    "download_url": "https://gitea.example.com/org/aws_prov/releases/download/1.1.0/aws_prov.tar.gz",
-    "size": 980000
-  }
-]
-```
-
-#### Download Extension
-
-```bash
-GET /api/v1/extensions/{type}/{name}/{version}
-```
-
-Example:
-
-```bash
-curl -O http://localhost:8082/api/v1/extensions/provider/aws/1.2.0
-```
-
-Returns binary data with `Content-Type: application/octet-stream`.
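-
-A download is normally paired with an integrity check. A minimal sketch (assuming `jq` is available and that the extension's metadata carries a `checksum` field, which is optional):
-
-```bash
-# Download a release and compare its SHA-256 against the published checksum
-curl -o aws_prov.tar.gz "http://localhost:8082/api/v1/extensions/provider/aws/1.2.0"
-expected=$(curl -s "http://localhost:8082/api/v1/extensions/provider/aws" | jq -r '.checksum' | cut -d: -f2)
-actual=$(sha256sum aws_prov.tar.gz | awk '{print $1}')
-[ "$expected" = "$actual" ] && echo "checksum OK" || echo "checksum MISMATCH"
-```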
-
-#### Search Extensions
-
-```bash
-GET /api/v1/extensions/search?q={query}
-```
-
-Query parameters:
-
-- `q` (required): Search query
-- `type` (optional): Filter by extension type
-- `limit` (optional): Maximum results (default: 50)
-
-Example (quoted so the shell does not interpret `&`):
-
-```bash
-curl "http://localhost:8082/api/v1/extensions/search?q=kubernetes&type=taskserv"
-```
-
-### System Endpoints
-
-#### Health Check
-
-```bash
-GET /api/v1/health
-```
-
-Example:
-
-```bash
-curl http://localhost:8082/api/v1/health
-```
-
-Response:
-
-```json
-{
-  "status": "healthy",
-  "version": "0.1.0",
-  "uptime": 3600,
-  "backends": {
-    "gitea": {
-      "enabled": true,
-      "healthy": true
-    },
-    "oci": {
-      "enabled": true,
-      "healthy": true
-    }
-  }
-}
-```
-
-#### Metrics
-
-```bash
-GET /api/v1/metrics
-```
-
-Returns Prometheus-formatted metrics:
-
-```plaintext
-# HELP http_requests_total Total HTTP requests
-# TYPE http_requests_total counter
-http_requests_total 1234
-
-# HELP cache_hits_total Total cache hits
-# TYPE cache_hits_total counter
-cache_hits_total 567
-
-# HELP cache_misses_total Total cache misses
-# TYPE cache_misses_total counter
-cache_misses_total 123
-```
-
-#### Cache Statistics
-
-```bash
-GET /api/v1/cache/stats
-```
-
-Response:
-
-```json
-{
-  "list_entries": 45,
-  "metadata_entries": 120,
-  "version_entries": 80,
-  "total_entries": 245
-}
-```
-
-## Extension Naming Conventions
-
-### Gitea Repositories
-
-Extensions in Gitea follow specific naming patterns:
-
-- **Providers**: `{name}_prov` (e.g., `aws_prov`, `upcloud_prov`)
-- **Task Services**: `{name}_taskserv` (e.g., `kubernetes_taskserv`, `postgres_taskserv`)
-- **Clusters**: `{name}_cluster` (e.g., `buildkit_cluster`, `ci_cluster`)
-
-### OCI Artifacts
-
-Extensions in OCI registries follow these patterns:
-
-- **Providers**: `{namespace}/{name}-provider` (e.g., `provisioning/aws-provider`)
-- **Task Services**: `{namespace}/{name}-taskserv` (e.g., `provisioning/kubernetes-taskserv`)
-- **Clusters**: `{namespace}/{name}-cluster` (e.g., `provisioning/buildkit-cluster`)
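-
-Both conventions are mechanical, so clients can derive backend artifact names instead of hard-coding them. A small illustration (the helper function is ours, not part of the service):
-
-```bash
-# Map (type, name) to the Gitea repository naming convention
-gitea_repo_name() {
-  case "$1" in
-    provider) echo "${2}_prov" ;;
-    taskserv) echo "${2}_taskserv" ;;
-    cluster)  echo "${2}_cluster" ;;
-    *) echo "unknown extension type: $1" >&2; return 1 ;;
-  esac
-}
-
-gitea_repo_name provider aws   # -> aws_prov
-```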
-
-## Caching Strategy
-
-The service implements a multi-level LRU cache with TTL:
-
-1. **List Cache**: Caches extension lists (filtered by type/source)
-2. **Metadata Cache**: Caches individual extension metadata
-3. **Version Cache**: Caches version lists per extension
-
-Cache behavior:
-
-- **Capacity**: Configurable (default: 1000 entries)
-- **TTL**: Configurable (default: 5 minutes)
-- **Eviction**: LRU (Least Recently Used)
-- **Invalidation**: Automatic on TTL expiration
-
-Cache keys:
-
-- List: `list:{type}:{source}`
-- Metadata: `{type}/{name}`
-- Versions: `{type}/{name}/versions`
-
-## Error Handling
-
-The API uses standard HTTP status codes:
-
-- `200 OK`: Successful operation
-- `400 Bad Request`: Invalid input (e.g., invalid extension type)
-- `401 Unauthorized`: Authentication failed
-- `404 Not Found`: Extension not found
-- `429 Too Many Requests`: Rate limit exceeded
-- `500 Internal Server Error`: Server error
-
-Error response format:
-
-```json
-{
-  "error": "not_found",
-  "message": "Extension provider/nonexistent not found"
-}
-```
-
-## Metrics and Monitoring
-
-### Prometheus Metrics
-
-Available metrics:
-
-- `http_requests_total`: Total HTTP requests
-- `http_request_duration_seconds`: Request duration histogram
-- `cache_hits_total`: Total cache hits
-- `cache_misses_total`: Total cache misses
-- `extensions_total`: Total extensions served
-
-### Health Checks
-
-The health endpoint checks:
-
-- Service uptime
-- Gitea backend connectivity
-- OCI backend connectivity
-- Overall service status
-
-## Development
-
-### Project Structure
-
-```plaintext
-extension-registry/
-├── Cargo.toml              # Rust dependencies
-├── src/
-│   ├── main.rs             # Entry point
-│   ├── lib.rs              # Library exports
-│   ├── config.rs           # Configuration management
-│   ├── error.rs            # Error types
-│   ├── api/
-│   │   ├── handlers.rs     # HTTP handlers
-│   │   └── routes.rs       # Route definitions
-│   ├── gitea/
-│   │   ├── client.rs       # Gitea API client
-│   │   └── models.rs       # Gitea data models
-│   ├── oci/
-│   │   ├── client.rs       # OCI registry client
-│   │   └── models.rs       # OCI data models
-│   ├── cache/
-│   │   └── lru_cache.rs    # LRU caching
-│   └── models/
-│       └── extension.rs    # Extension models
-├── tests/
-│   └── integration_test.rs # Integration tests
-├── Dockerfile              # Docker build
-└── README.md               # This file
-```
-
-### Running Tests
-
-```bash
-# Run all tests
-cargo test
-
-# Run with output
-cargo test -- --nocapture
-
-# Run specific test
-cargo test test_health_check
-```
-
-### Code Quality
-
-```bash
-# Format code
-cargo fmt
-
-# Run clippy
-cargo clippy
-
-# Check for security vulnerabilities
-cargo audit
-```
-
-## Deployment
-
-### Systemd Service
-
-Create `/etc/systemd/system/extension-registry.service`:
-
-```ini
-[Unit]
-Description=Extension Registry Service
-After=network.target
-
-[Service]
-Type=simple
-User=registry
-WorkingDirectory=/opt/extension-registry
-ExecStart=/usr/local/bin/extension-registry --config /etc/extension-registry/config.toml
-Restart=on-failure
-RestartSec=5s
-
-[Install]
-WantedBy=multi-user.target
-```
-
-Enable and start:
-
-```bash
-sudo systemctl enable extension-registry
-sudo systemctl start extension-registry
-sudo systemctl status extension-registry
-```
-
-### Docker Compose
-
-```yaml
-version: '3.8'
-
-services:
-  extension-registry:
-    image: extension-registry:latest
-    ports:
-      - "8082:8082"
-    volumes:
-      - ./config.toml:/app/config.toml:ro
-      - ./tokens:/app/tokens:ro
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8082/api/v1/health"]
-      interval: 30s
-      timeout: 3s
-      retries: 3
-      start_period: 5s
-```
-
-### Kubernetes Deployment
-
-```yaml
-apiVersion: apps/v1
-kind: Deployment
-metadata:
-  name: extension-registry
-spec:
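-  # The fields below assume config is mounted from the extension-registry-config
-  # ConfigMap and backend tokens from the extension-registry-tokens Secret,
-  # both declared under volumes at the end of this manifest.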
-  replicas: 3
-  selector:
-    matchLabels:
-      app: extension-registry
-  template:
-    metadata:
-      labels:
-        app: extension-registry
-    spec:
-      containers:
-        - name: extension-registry
-          image: extension-registry:latest
-          ports:
-            - containerPort: 8082
-          volumeMounts:
-            - name: config
-              mountPath: /app/config.toml
-              subPath: config.toml
-            - name: tokens
-              mountPath: /app/tokens
-          livenessProbe:
-            httpGet:
-              path: /api/v1/health
-              port: 8082
-            initialDelaySeconds: 5
-            periodSeconds: 10
-          readinessProbe:
-            httpGet:
-              path: /api/v1/health
-              port: 8082
-            initialDelaySeconds: 5
-            periodSeconds: 5
-      volumes:
-        - name: config
-          configMap:
-            name: extension-registry-config
-        - name: tokens
-          secret:
-            secretName: extension-registry-tokens
----
-apiVersion: v1
-kind: Service
-metadata:
-  name: extension-registry
-spec:
-  selector:
-    app: extension-registry
-  ports:
-    - port: 8082
-      targetPort: 8082
-  type: ClusterIP
-```
-
-## Security
-
-### Authentication
-
-- Gitea: Token-based authentication via `token_path`
-- OCI: Optional token authentication via `auth_token_path`
-
-### Best Practices
-
-1. **Store tokens securely**: Use file permissions (600) for token files
-2. **Enable SSL verification**: Set `verify_ssl = true` in production
-3. **Use HTTPS**: Always use HTTPS for Gitea and OCI registries
-4. **Limit CORS**: Configure CORS appropriately for production
-5. **Rate limiting**: Consider adding rate limiting for public APIs
-6. **Network isolation**: Run service in isolated network segments
-
-## Performance
-
-### Benchmarks
-
-Typical performance characteristics:
-
-- **Cached requests**: <5ms response time
-- **Uncached requests**: 50-200ms (depends on backend latency)
-- **Cache hit ratio**: ~85-95% in typical workloads
-- **Throughput**: 1000+ req/s on modern hardware
-
-### Optimization Tips
-
-1. **Increase cache capacity**: For large extension catalogs
-2. **Tune TTL**: Balance freshness vs. performance
-3. **Use multiple workers**: Scale with CPU cores
-4. **Enable compression**: Reduce bandwidth usage
-5. **Connection pooling**: Reuse HTTP connections to backends
-
-## Troubleshooting
-
-### Common Issues
-
-#### Service won't start
-
-- Check configuration file syntax
-- Verify token files exist and are readable
-- Check network connectivity to backends
-
-#### Extensions not found
-
-- Verify backend configuration (URL, organization, namespace)
-- Check backend connectivity with health endpoint
-- Review logs for authentication errors
-
-#### Slow responses
-
-- Check backend latency
-- Increase cache capacity or TTL
-- Review Prometheus metrics for bottlenecks
-
-### Logging
-
-Enable debug logging:
-
-```bash
-extension-registry --log-level debug
-```
-
-Enable JSON logging for structured logs:
-
-```bash
-extension-registry --json-log
-```
-
-## License
-
-Part of the Provisioning Project.
-
-## Contributing
-
-See main project documentation for contribution guidelines.
-
-## Support
-
-For issues and questions, please refer to the main provisioning project documentation.
diff --git a/crates/mcp-server/README.md b/crates/mcp-server/README.md
index 6345d1f..0eeae94 100644
--- a/crates/mcp-server/README.md
+++ b/crates/mcp-server/README.md
@@ -1,135 +1 @@
-# Rust MCP Server for Infrastructure Automation
-
-## Overview
-
-A **Rust-native Model Context Protocol (MCP) server** for infrastructure automation and AI-assisted DevOps operations.
-This replaces the Python implementation, providing significant performance improvements and maintaining philosophical consistency
-with the Rust ecosystem approach.
-
-## ✅ Project Status: **PROOF OF CONCEPT COMPLETE**
-
-### 🎯 Achieved Goals
-
-- ✅ **Feasibility Analysis**: Rust MCP server is fully viable
-- ✅ **Functional Prototype**: All core features working
-- ✅ **Performance Benchmarks**: Microsecond-level latency achieved
-- ✅ **Integration**: Successfully integrates with existing provisioning system
-
-### 🚀 Performance Results
-
-```plaintext
-🚀 Rust MCP Server Performance Analysis
-==================================================
-
-📋 Server Parsing Performance:
-  • 31 chars: 0μs avg
-  • 67 chars: 0μs avg
-  • 65 chars: 0μs avg
-  • 58 chars: 0μs avg
-
-🤖 AI Status Performance:
-  • AI Status: 0μs avg (10000 iterations)
-
-💾 Memory Footprint:
-  • ServerConfig size: 80 bytes
-  • Config size: 272 bytes
-
-✅ Performance Summary:
-  • Server parsing: Sub-millisecond latency
-  • Configuration access: Microsecond latency
-  • Memory efficient: Small struct footprint
-  • Zero-copy string operations where possible
-```
-
-### 🏗️ Architecture
-
-```plaintext
-src/
-├── simple_main.rs       # Lightweight MCP server entry point
-├── main.rs              # Full MCP server (with SDK integration)
-├── lib.rs               # Library interface
-├── config.rs            # Configuration management
-├── provisioning.rs      # Core provisioning engine
-├── tools.rs             # AI-powered parsing tools
-├── errors.rs            # Error handling
-└── performance_test.rs  # Performance benchmarking
-```
-
-### 🎲 Key Features
-
-1. **AI-Powered Server Parsing**: Natural language to infrastructure config
-2. **Multi-Provider Support**: AWS, UpCloud, Local
-3. **Configuration Management**: TOML-based with environment overrides
-4. **Error Handling**: Comprehensive error types with recovery hints
-5. **Performance Monitoring**: Built-in benchmarking capabilities
-
-### 📊 Rust vs Python Comparison
-
-| Metric | Python MCP Server | Rust MCP Server | Improvement |
-| -------- | ------------------ | ----------------- | ------------- |
-| **Startup Time** | ~500ms | ~50ms | **10x faster** |
-| **Memory Usage** | ~50MB | ~5MB | **10x less** |
-| **Parsing Latency** | ~1ms | ~0.001ms | **1000x faster** |
-| **Binary Size** | Python + deps | ~15MB static | **Portable** |
-| **Type Safety** | Runtime errors | Compile-time | **Zero runtime errors** |
-
-### 🛠️ Usage
-
-```bash
-# Build and run
-cargo run --bin provisioning-mcp-server --release
-
-# Run with custom config
-PROVISIONING_PATH=/path/to/provisioning cargo run --bin provisioning-mcp-server -- --debug
-
-# Run tests
-cargo test
-
-# Run benchmarks
-cargo run --bin provisioning-mcp-server --release
-```
-
-### 🔧 Configuration
-
-Set via environment variables:
-
-```bash
-export PROVISIONING_PATH=/path/to/provisioning
-export PROVISIONING_AI_PROVIDER=openai
-export OPENAI_API_KEY=your-key
-export PROVISIONING_DEBUG=true
-```
-
-### 📈 Integration Benefits
-
-1. **Philosophical Consistency**: Rust throughout the stack
-2. **Performance**: Sub-millisecond response times
-3. **Memory Safety**: No segfaults, no memory leaks
-4. **Concurrency**: Native async/await support
-5. **Distribution**: Single static binary
-6. **Cross-compilation**: ARM64/x86_64 support
-
-### 🎪 Demo Integration
-
-This Rust MCP server is ready to be showcased at the **Rust Meetup 2025** as proof that:
-
-> **"A Rust-first approach to infrastructure automation delivers both performance and safety without compromising functionality."**
-
-### 🚧 Next Steps
-
-1. Full MCP SDK integration (schema definitions)
-2. WebSocket/TCP transport layer
-3. Plugin system for extensibility
-4. Metrics collection and monitoring
-5. Documentation and examples
-
-### 📝 Conclusion
-
-**The Rust MCP Server successfully demonstrates that replacing Python components with Rust provides:**
-
-- ⚡ **1000x performance improvement** in parsing operations
-- 🧠 **10x memory efficiency**
-- 🔒 **Compile-time safety** guarantees
-- 🎯 **Philosophical consistency** with the ecosystem approach
-
-This validates the **"Rust-first infrastructure automation"** approach for the meetup presentation.
diff --git a/crates/orchestrator/README.md b/crates/orchestrator/README.md
index e730330..4db02ae 100644
--- a/crates/orchestrator/README.md
+++ b/crates/orchestrator/README.md
@@ -1,515 +1 @@
-# Provisioning Orchestrator
-
-A Rust-based orchestrator service that coordinates infrastructure provisioning workflows with pluggable storage backends and comprehensive migration
-tools.
-
-## Architecture
-
-The orchestrator implements a hybrid multi-storage approach:
-
-- **Rust Orchestrator**: Handles coordination, queuing, and parallel execution
-- **Nushell Scripts**: Execute the actual provisioning logic
-- **Pluggable Storage**: Multiple storage backends with seamless migration
-- **REST API**: HTTP interface for workflow submission and monitoring
-
-## Features
-
-- **Multi-Storage Backends**: Filesystem, SurrealDB Embedded, and SurrealDB Server options
-- **Task Queue**: Priority-based task scheduling with retry logic
-- **Seamless Migration**: Move data between storage backends with zero downtime
-- **Feature Flags**: Compile-time backend selection for minimal dependencies
-- **Parallel Execution**: Multiple tasks can run concurrently
-- **Status Tracking**: Real-time task status and progress monitoring
-- **Advanced Features**: Authentication, audit logging, and metrics (SurrealDB)
-- **Nushell Integration**: Seamless execution of existing provisioning scripts
-- **RESTful API**: HTTP endpoints for workflow management
-- **Test Environment Service**: Automated containerized testing for taskservs, servers, and clusters
-- **Multi-Node Support**: Test complex topologies including Kubernetes and etcd clusters
-- **Docker Integration**: Automated container lifecycle management via Docker API
-
-## Quick Start
-
-### Build and Run
-
-**Default Build (Filesystem Only)**:
-
-```bash
-cd src/orchestrator
-cargo build --release
-cargo run -- --port 8080 --data-dir ./data
-```
-
-**With SurrealDB Support**:
-
-```bash
-cd src/orchestrator
-cargo build --release --features surrealdb
-
-# Run with SurrealDB embedded
-cargo run --features surrealdb -- --storage-type surrealdb-embedded --data-dir ./data
-
-# Run with SurrealDB server
-cargo run --features surrealdb -- --storage-type surrealdb-server \
-  --surrealdb-url ws://localhost:8000 \
-  --surrealdb-username admin --surrealdb-password secret
-```
-
-### Submit a Server Creation Workflow
-
-```bash
-curl -X POST http://localhost:8080/workflows/servers/create \
-  -H "Content-Type: application/json" \
-  -d '{
-    "infra": "production",
-    "settings": "./settings.yaml",
-    "servers": ["web-01", "web-02"],
-    "check_mode": false,
-    "wait": true
-  }'
-```
-
-### Check Task Status
-
-```bash
-curl http://localhost:8080/tasks/{task_id}
-```
-
-### List All Tasks
-
-```bash
-curl http://localhost:8080/tasks
-```
-
-## API Endpoints
-
-### Health Check
-
-- `GET /health` - Service health status
-
-### Task Management
-
-- `GET /tasks` - List all tasks
-- `GET /tasks/{id}` - Get specific task status
-
-### Workflows
-
-- `POST /workflows/servers/create` - Submit server creation workflow
-- `POST /workflows/taskserv/create` - Submit taskserv creation workflow
-- `POST /workflows/cluster/create` - Submit cluster creation workflow
-
-### Test Environments
-
-- `POST /test/environments/create` - Create test environment
-- `GET /test/environments` - List all test environments
-- `GET /test/environments/{id}` - Get environment details
-- `POST /test/environments/{id}/run` - Run tests in environment
-- `DELETE /test/environments/{id}` - Cleanup test environment
-- `GET /test/environments/{id}/logs` - Get environment logs
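-
-Together, the workflow and task endpoints support a simple submit-then-poll loop. A sketch (assuming `jq`, and assuming the submission response includes the task id in a `task_id` field, which the examples above do not show explicitly):
-
-```bash
-# Submit a server-creation workflow, then poll the task a few times
-task_id=$(curl -s -X POST http://localhost:8080/workflows/servers/create \
-  -H "Content-Type: application/json" \
-  -d '{"infra": "production", "settings": "./settings.yaml", "servers": ["web-01"], "check_mode": false, "wait": false}' \
-  | jq -r '.task_id')
-
-for _ in 1 2 3; do
-  curl -s "http://localhost:8080/tasks/${task_id}"
-  sleep 5
-done
-```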
-
-## Test Environment Service
-
-The orchestrator includes a comprehensive test environment service for automated containerized testing of taskservs, complete servers, and multi-node clusters.
-
-### Overview
-
-The Test Environment Service enables:
-
-- **Single Taskserv Testing**: Test individual taskservs in isolated containers
-- **Server Simulation**: Test complete server configurations with multiple taskservs
-- **Cluster Topologies**: Test multi-node clusters (Kubernetes, etcd, etc.)
-- **Automated Container Management**: No manual Docker management required
-- **Network Isolation**: Each test environment gets dedicated networks
-- **Resource Limits**: Configure CPU, memory, and disk limits per container
-
-### Test Environment Types
-
-#### 1. Single Taskserv
-
-Test individual taskserv in isolated container:
-
-```bash
-curl -X POST http://localhost:8080/test/environments/create \
-  -H "Content-Type: application/json" \
-  -d '{
-    "config": {
-      "type": "single_taskserv",
-      "taskserv": "kubernetes",
-      "base_image": "ubuntu:22.04",
-      "resources": {
-        "cpu_millicores": 2000,
-        "memory_mb": 4096
-      }
-    },
-    "auto_start": true,
-    "auto_cleanup": false
-  }'
-```
-
-#### 2. Server Simulation
-
-Simulate complete server with multiple taskservs:
-
-```bash
-curl -X POST http://localhost:8080/test/environments/create \
-  -H "Content-Type: application/json" \
-  -d '{
-    "config": {
-      "type": "server_simulation",
-      "server_name": "web-01",
-      "taskservs": ["containerd", "kubernetes", "cilium"],
-      "base_image": "ubuntu:22.04"
-    },
-    "infra": "prod-stack",
-    "auto_start": true
-  }'
-```
-
-#### 3. Cluster Topology
-
-Test multi-node cluster configurations:
-
-```bash
-curl -X POST http://localhost:8080/test/environments/create \
-  -H "Content-Type: application/json" \
-  -d '{
-    "config": {
-      "type": "cluster_topology",
-      "cluster_type": "kubernetes",
-      "topology": {
-        "nodes": [
-          {
-            "name": "cp-01",
-            "role": "controlplane",
-            "taskservs": ["etcd", "kubernetes", "containerd"],
-            "resources": {
-              "cpu_millicores": 2000,
-              "memory_mb": 4096
-            }
-          },
-          {
-            "name": "worker-01",
-            "role": "worker",
-            "taskservs": ["kubernetes", "containerd", "cilium"],
-            "resources": {
-              "cpu_millicores": 1000,
-              "memory_mb": 2048
-            }
-          }
-        ],
-        "network": {
-          "subnet": "172.30.0.0/16"
-        }
-      }
-    },
-    "auto_start": true
-  }'
-```
-
-### Nushell CLI Integration
-
-The test environment service is fully integrated with Nushell CLI:
-
-```nushell
-# Quick test (create, run, cleanup)
-provisioning test quick kubernetes
-
-# Single taskserv test
-provisioning test env single postgres --auto-start --auto-cleanup
-
-# Server simulation
-provisioning test env server web-01 [containerd kubernetes cilium] --auto-start
-
-# Cluster from template
-provisioning test topology load kubernetes_3node | test env cluster kubernetes
-
-# List environments
-provisioning test env list
-
-# Check status
-provisioning test env status
-
-# View logs
-provisioning test env logs
-
-# Cleanup
-provisioning test env cleanup
-```
-
-### Topology Templates
-
-Predefined multi-node cluster topologies are available in `provisioning/config/test-topologies.toml`:
-
-- **kubernetes_3node**: 3-node HA Kubernetes cluster (1 control plane + 2 workers)
-- **kubernetes_single**: All-in-one Kubernetes node
-- **etcd_cluster**: 3-member etcd cluster
-- **containerd_test**: Standalone containerd testing
-- **postgres_redis**: Database stack testing
-
-### Prerequisites
-
-1. **Docker Running**: The orchestrator requires Docker daemon to be running
-
-   ```bash
-   docker ps  # Should work without errors
-   ```
-
-2. **Orchestrator Running**: Start the orchestrator before using test environments
-
-   ```bash
-   ./scripts/start-orchestrator.nu --background
-   ```
-
-### Architecture
-
-```plaintext
-User Command (CLI/API)
-    ↓
-Test Orchestrator (Rust)
-    ↓
-Container Manager (bollard)
-    ↓
-Docker API
-    ↓
-Isolated Test Containers
-    • Dedicated networks
-    • Resource limits
-    • Volume mounts
-    • Multi-node support
-```
-
-### Key Components
-
-#### Rust Modules
-
-- `test_environment.rs` - Core types and configurations
-- `container_manager.rs` - Docker API integration (bollard)
-- `test_orchestrator.rs` - Orchestration logic
-
-#### Features
-
-- **Automated Lifecycle**: Create, start, stop, cleanup containers automatically
-- **Network Isolation**: Each environment gets isolated Docker network
-- **Resource Management**: CPU and memory limits per container
-- **Test Execution**: Run test scripts within containers
-- **Log Collection**: Capture and expose container logs
-- **Auto-Cleanup**: Optional automatic cleanup after tests
-
-### Use Cases
-
-1. **Taskserv Development**: Test new taskservs before deployment
-2. **Integration Testing**: Validate taskserv combinations
-3. **Cluster Validation**: Test multi-node cluster configurations
-4. **CI/CD Integration**: Automated testing in pipelines
-5. **Production Simulation**: Test production-like deployments safely
-
-### CI/CD Integration
-
-```yaml
-# GitLab CI example
-test-infrastructure:
-  stage: test
-  script:
-    - provisioning test quick kubernetes
-    - provisioning test quick postgres
-    - provisioning test quick redis
-```
-
-### Documentation
-
-For complete usage guide and examples, see:
-
-- **User Guide**: `docs/user/test-environment-guide.md`
-- **Usage Documentation**: `docs/user/test-environment-usage.md`
-- **Implementation Summary**: `provisioning/core/nulib/test_environments_summary.md`
-
-## Configuration
-
-### Core Options
-
-- `--port` - HTTP server port (default: 8080)
-- `--data-dir` - Data directory for storage (default: ./data)
-- `--storage-type` - Storage backend: filesystem, surrealdb-embedded, surrealdb-server
-- `--nu-path` - Path to Nushell executable (default: nu)
-- `--provisioning-path` - Path to provisioning script (default: ./core/nulib/provisioning)
-
-### SurrealDB Options (when `--features surrealdb` enabled)
-
-- `--surrealdb-url` - Server URL for surrealdb-server mode (e.g., ws://localhost:8000)
-- `--surrealdb-namespace` - Database namespace (default: orchestrator)
-- `--surrealdb-database` - Database name (default: tasks)
-- `--surrealdb-username` - Authentication username
-- `--surrealdb-password` - Authentication password
-
-### Storage Backend Comparison
-
-| Feature | Filesystem | SurrealDB Embedded | SurrealDB Server |
-| --------- | ------------ | ------------------- | ------------------ |
-| **Dependencies** | None | Local database | Remote server |
-| **Auth/RBAC** | Basic | Advanced | Advanced |
-| **Real-time** | No | Yes | Yes |
-| **Scalability** | Limited | Medium | High |
-| **Complexity** | Low | Medium | High |
-| **Best For** | Development | Production | Distributed |
-
-## Nushell Integration
-
-The orchestrator includes workflow wrappers in `core/nulib/workflows/server_create.nu`:
-
-```nushell
-# Submit workflow via Nushell
-use workflows/server_create.nu
-server_create_workflow "production" --settings "./settings.yaml" --wait
-
-# Check workflow status
-workflow status $task_id
-
-# List all workflows
-workflow list
-```
-
-## Task States
-
-- **Pending**: Queued for execution
-- **Running**: Currently executing
-- **Completed**: Finished successfully
-- **Failed**: Execution failed (will retry if under limit)
-- **Cancelled**: Manually cancelled
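
These states map naturally onto a Rust enum; a minimal sketch, with illustrative names rather than the actual definitions in `src/main.rs`:

```rust
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum TaskState {
    Pending,
    Running,
    Completed,
    Failed { attempts: u32 }, // retried while attempts stay under the limit
    Cancelled,
}

impl TaskState {
    /// A failed task is eligible for retry until it hits the limit
    /// (3 attempts, per Error Handling below).
    pub fn should_retry(&self, max_retries: u32) -> bool {
        matches!(self, TaskState::Failed { attempts } if *attempts < max_retries)
    }
}
```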
-
-## Storage Architecture
-
-### Multi-Backend Support
-
-The orchestrator uses a pluggable storage architecture with three backends:
-
-#### Filesystem (Default)
-
-- **Format**: JSON files in directory structure
-- **Location**: `{data_dir}/queue.rkvs/{tasks,queue}/`
-- **Features**: Basic task persistence, priority queuing
-- **Best For**: Development, simple deployments
-
-#### SurrealDB Embedded
-
-- **Format**: Local SurrealDB database with RocksDB engine
-- **Location**: `{data_dir}/orchestrator.db`
-- **Features**: ACID transactions, advanced queries, audit logging
-- **Best For**: Production single-node deployments
-
-#### SurrealDB Server
-
-- **Format**: Remote SurrealDB server connection
-- **Connection**: WebSocket or HTTP protocol
-- **Features**: Full multi-user, real-time subscriptions, horizontal scaling
-- **Best For**: Distributed production deployments
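
A pluggable design like this is usually expressed as an async trait with one implementation per backend. The sketch below is an assumption about the shape of that abstraction (trait and method names are invented); it shows only the filesystem case, with a SurrealDB implementation sitting behind `--features surrealdb`:

```rust
use async_trait::async_trait;

#[async_trait]
pub trait TaskStorage: Send + Sync {
    async fn save_task(&self, id: &str, payload: &[u8]) -> anyhow::Result<()>;
    async fn load_task(&self, id: &str) -> anyhow::Result<Option<Vec<u8>>>;
}

pub struct FilesystemStorage {
    pub data_dir: std::path::PathBuf,
}

#[async_trait]
impl TaskStorage for FilesystemStorage {
    async fn save_task(&self, id: &str, payload: &[u8]) -> anyhow::Result<()> {
        // JSON files under {data_dir}/queue.rkvs/tasks/, as described above.
        let path = self.data_dir.join("queue.rkvs").join("tasks").join(id);
        tokio::fs::create_dir_all(path.parent().unwrap()).await?;
        tokio::fs::write(path, payload).await?;
        Ok(())
    }

    async fn load_task(&self, id: &str) -> anyhow::Result<Option<Vec<u8>>> {
        let path = self.data_dir.join("queue.rkvs").join("tasks").join(id);
        match tokio::fs::read(path).await {
            Ok(bytes) => Ok(Some(bytes)),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(None),
            Err(e) => Err(e.into()),
        }
    }
}
```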
-
-### Data Migration
-
-Seamless migration between storage backends:
-
-```bash
-# Interactive migration wizard
-./scripts/migrate-storage.nu --interactive
-
-# Direct migration
-./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded \
-  --source-dir ./data --target-dir ./surrealdb-data
-
-# Validate migration setup
-./scripts/migrate-storage.nu validate --from filesystem --to surrealdb-server
-```
-
-## Error Handling
-
-- Failed tasks are automatically retried up to 3 times (see the sketch below)
-- Permanent failures are marked and logged
-- Service restart recovery loads tasks from persistent storage
-- API errors return structured JSON responses
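
An illustrative retry wrapper for the first point; only the 3-attempt limit comes from this README, while the delay values and backoff shape are assumptions:

```rust
use std::time::Duration;

async fn run_with_retries<F, Fut>(mut task: F) -> anyhow::Result<()>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<()>>,
{
    let max_retries = 3;
    let mut delay = Duration::from_secs(1);
    for attempt in 1..=max_retries {
        match task().await {
            Ok(()) => return Ok(()),
            Err(e) if attempt < max_retries => {
                tracing::warn!("attempt {attempt} failed: {e}; retrying");
                tokio::time::sleep(delay).await;
                delay *= 2; // simple exponential backoff
            }
            // Out of attempts: permanent failure, marked and logged upstream.
            Err(e) => return Err(e),
        }
    }
    unreachable!()
}
```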
-
-## Monitoring
-
-- Structured logging with tracing
-- Task execution metrics
-- Queue depth monitoring
-- Health check endpoint
-
-## Development
-
-### Dependencies
-
-**Core Dependencies** (always included):
-
-- **axum**: HTTP server framework
-- **tokio**: Async runtime
-- **serde**: Serialization
-- **tracing**: Structured logging
-- **async-trait**: Async trait support
-- **anyhow**: Error handling
-- **bollard**: Docker API client for container management
-
-**Optional Dependencies** (feature-gated):
-
-- **surrealdb**: Multi-model database (requires `--features surrealdb`)
-  - Embedded mode: RocksDB storage engine
-  - Server mode: WebSocket/HTTP client
-
-### Adding New Workflows
-
-1. Create workflow definition in `src/main.rs`
-2. Add API endpoint handler
-3. Create Nushell wrapper in `core/nulib/workflows/`
-4. Update existing code to use workflow bridge functions
-
-### Testing
-
-**Unit and Integration Tests**:
-
-```bash
-# Test with filesystem only (default)
-cargo test
-
-# Test all storage backends
-cargo test --features surrealdb
-
-# Test specific suites
-cargo test --test storage_integration
-cargo test --test migration_tests
-cargo test --test factory_tests
-```
-
-**Performance Benchmarks**:
-
-```bash
-# Benchmark storage performance
-cargo bench --bench storage_benchmarks
-
-# Benchmark migration performance
-cargo bench --bench migration_benchmarks
-
-# Generate HTML reports
-cargo bench --features surrealdb
-open target/criterion/reports/index.html
-```
-
-**Test Configuration**:
-
-```bash
-# Run with specific backend
-TEST_STORAGE=filesystem cargo test
-TEST_STORAGE=surrealdb-embedded cargo test --features surrealdb
-
-# Verbose testing
-cargo test -- --nocapture
-```
-
-## Migration from Deep Call Stack Issues
-
-This orchestrator solves the Nushell deep call stack limitations by:
-
-1. Moving coordination logic to Rust
-2. Executing individual Nushell commands at top level
-3. Managing parallel execution externally
-4. Preserving all existing business logic in Nushell
-
-The existing `on_create_servers` function can be replaced with `on_create_servers_workflow` for orchestrated execution while maintaining full compatibility.
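
A rough sketch of point 2: invoking one provisioning subcommand per step from Rust, so no deep call stack builds up inside Nushell. The function name is illustrative; the binary paths correspond to the `--nu-path` and `--provisioning-path` options above:

```rust
use tokio::process::Command;

async fn run_provisioning_step(
    nu_path: &str,
    provisioning_path: &str,
    args: &[&str],
) -> anyhow::Result<String> {
    // Each step is a fresh, top-level Nushell invocation.
    let output = Command::new(nu_path)
        .arg(provisioning_path)
        .args(args)
        .output()
        .await?;
    if !output.status.success() {
        anyhow::bail!(
            "provisioning step failed: {}",
            String::from_utf8_lossy(&output.stderr)
        );
    }
    Ok(String::from_utf8_lossy(&output.stdout).into_owned())
}
```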
diff --git a/crates/orchestrator/docs/dns-integration.md b/crates/orchestrator/docs/dns-integration.md
index a9f41cf..ee4753e 100644
--- a/crates/orchestrator/docs/dns-integration.md
+++ b/crates/orchestrator/docs/dns-integration.md
@@ -1,221 +1 @@
-# DNS Integration Guide
-
-## Overview
-
-The DNS integration module provides automatic DNS registration and management for provisioned servers through CoreDNS integration.
-
-## Architecture
-
-```plaintext
-┌─────────────────┐
-│  Orchestrator   │
-│    (Rust)       │
-└────────┬────────┘
-         │
-         ▼
-┌─────────────────┐
-│  DNS Manager    │
-│                 │
-│  - Auto-register│
-│  - TTL config   │
-│  - Verification │
-└────────┬────────┘
-         │
-         ▼
-┌─────────────────┐
-│ CoreDNS Client  │
-│   (HTTP API)    │
-└────────┬────────┘
-         │
-         ▼
-┌─────────────────┐
-│    CoreDNS      │
-│    Service      │
-└─────────────────┘
-```
-
-## Features
-
-### 1. Automatic DNS Registration
-
-When a server is created, the orchestrator automatically registers its DNS record:
-
-```rust
-// In server creation workflow
-let ip = server.get_ip_address();
-state.dns_manager.register_server_dns(&hostname, ip).await?;
-```
-
-### 2. DNS Record Types
-
-Supports multiple DNS record types:
-
-- **A** - IPv4 address
-- **AAAA** - IPv6 address
-- **CNAME** - Canonical name
-- **TXT** - Text record
-
-### 3. DNS Verification
-
-Verify DNS resolution after registration:
-
-```rust
-let verified = state.dns_manager.verify_dns_resolution("server.example.com").await?;
-if verified {
-    info!("DNS resolution verified");
-}
-```
-
-### 4. Automatic Cleanup
-
-When a server is deleted, DNS records are automatically removed:
-
-```rust
-state.dns_manager.unregister_server_dns(&hostname).await?;
-```
-
-## Configuration
-
-DNS settings in `config.defaults.toml`:
-
-```toml
-[orchestrator.dns]
-coredns_url = "http://localhost:53"
-auto_register = true
-ttl = 300
-```
-
-### Configuration Options
-
-- **coredns_url**: CoreDNS HTTP API endpoint
-- **auto_register**: Enable automatic DNS registration (default: true)
-- **ttl**: Default TTL for DNS records in seconds (default: 300)
-
-## API Endpoints
-
-### List DNS Records
-
-```http
-GET /api/v1/dns/records
-```
-
-**Response:**
-
-```json
-{
-  "success": true,
-  "data": [
-    {
-      "name": "web-01.example.com",
-      "record_type": "A",
-      "value": "192.168.1.10",
-      "ttl": 300
-    }
-  ]
-}
-```
-
-## Usage Examples
-
-### Register Server DNS
-
-```rust
-use std::net::IpAddr;
-
-let ip: IpAddr = "192.168.1.10".parse()?;
-dns_manager.register_server_dns("web-01.example.com", ip).await?;
-```
-
-### Unregister Server DNS
-
-```rust
-dns_manager.unregister_server_dns("web-01.example.com").await?;
-```
-
-### Update DNS Record
-
-```rust
-let new_ip: IpAddr = "192.168.1.20".parse()?;
-dns_manager.update_dns_record("web-01.example.com", new_ip).await?;
-```
-
-### List All Records
-
-```rust
-let records = dns_manager.list_records().await?;
-for record in records {
-    println!("{} -> {} ({})", record.name, record.value, record.record_type);
-}
-```
-
-## Integration with Workflows
-
-### Server Creation Workflow
-
-1. Create server via provider API
-2. Wait for server to be ready
-3. **Register DNS record** (automatic)
-4. Verify DNS resolution
-5. Continue with next steps
-
-### Server Deletion Workflow
-
-1. Stop services on server
-2. **Unregister DNS record** (automatic)
-3. Delete server via provider API
-4. Update inventory
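
Steps 3-4 of the creation workflow can be glued together as below. `DnsRegistrar` is a stand-in trait so the sketch is self-contained; only the method names (`register_server_dns`, `verify_dns_resolution`) come from this guide:

```rust
use std::net::IpAddr;

#[async_trait::async_trait]
trait DnsRegistrar {
    async fn register_server_dns(&self, hostname: &str, ip: IpAddr) -> anyhow::Result<()>;
    async fn verify_dns_resolution(&self, hostname: &str) -> anyhow::Result<bool>;
}

async fn dns_step(dns: &dyn DnsRegistrar, hostname: &str, ip: IpAddr) -> anyhow::Result<()> {
    dns.register_server_dns(hostname, ip).await?;      // step 3: register
    if !dns.verify_dns_resolution(hostname).await? {   // step 4: verify
        anyhow::bail!("DNS record for {hostname} did not resolve");
    }
    Ok(())
}
```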
-
-## Error Handling
-
-The DNS integration handles errors gracefully:
-
-- **Network errors**: Retries with exponential backoff
-- **DNS conflicts**: Reports error but continues workflow
-- **Invalid records**: Validates before sending to CoreDNS
-
-## Testing
-
-Run DNS integration tests:
-
-```bash
-cd provisioning/platform/orchestrator
-cargo test test_dns_integration
-```
-
-## Troubleshooting
-
-### DNS registration fails
-
-1. Check CoreDNS is running and accessible
-2. Verify `coredns_url` configuration
-3. Check network connectivity
-4. Review orchestrator logs
-
-### DNS records not resolving
-
-1. Verify record was registered (check logs)
-2. Query CoreDNS directly
-3. Check TTL settings
-4. Verify DNS resolver configuration
-
-## Best Practices
-
-1. **Use FQDN**: Always use fully-qualified domain names
-2. **Set appropriate TTL**: Lower TTL for dev, higher for prod
-3. **Enable auto-register**: Reduces manual operations
-4. **Monitor DNS health**: Check DNS resolution periodically
-
-## Security Considerations
-
-1. **Access Control**: Restrict access to CoreDNS API
-2. **Validation**: Validate hostnames and IP addresses
-3. **Audit**: Log all DNS operations
-4. **Rate Limiting**: Prevent DNS flooding
-
-## Future Enhancements
-
-- [ ] Support for SRV records
-- [ ] DNS zone management
-- [ ] DNSSEC integration
-- [ ] Multi-zone support
-- [ ] DNS caching layer
diff --git a/crates/orchestrator/docs/extension-loading.md b/crates/orchestrator/docs/extension-loading.md
index 0e3fa63..83846f8 100644
--- a/crates/orchestrator/docs/extension-loading.md
+++ b/crates/orchestrator/docs/extension-loading.md
@@ -1,376 +1 @@
-# Extension Loading Guide
-
-## Overview
-
-The extension loading module provides dynamic loading of providers, taskservs, and clusters through Nushell script integration.
-
-## Architecture
-
-```plaintext
-┌──────────────────┐
-│   Orchestrator   │
-│     (Rust)       │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│ Extension Manager│
-│                  │
-│  - Caching       │
-│  - Type safety   │
-│  - Validation    │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│ Extension Loader │
-│  (Nushell Call)  │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│ Nushell Scripts  │
-│  (module load)   │
-└──────────────────┘
-```
-
-## Extension Types
-
-### 1. Providers
-
-Cloud provider implementations (AWS, UpCloud, Local):
-
-```rust
-let provider = extension_manager.load_extension(
-    ExtensionType::Provider,
-    "aws".to_string(),
-    Some("2.0.0".to_string())
-).await?;
-```
-
-### 2. Taskservs
-
-Infrastructure service definitions (Kubernetes, PostgreSQL, etc.):
-
-```rust
-let taskserv = extension_manager.load_extension(
-    ExtensionType::Taskserv,
-    "kubernetes".to_string(),
-    None  // Load latest version
-).await?;
-```
-
-### 3. Clusters
-
-Complete cluster configurations (Buildkit, CI/CD, etc.):
-
-```rust
-let cluster = extension_manager.load_extension(
-    ExtensionType::Cluster,
-    "buildkit".to_string(),
-    Some("1.0.0".to_string())
-).await?;
-```
-
-## Features
-
-### LRU Caching
-
-Extensions are cached using LRU (Least Recently Used) strategy:
-
-- **Cache size**: 100 extensions
-- **Cache key**: `{type}:{name}:{version}`
-- **Automatic eviction**: Oldest entries removed when full
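
The keying and eviction behavior described above corresponds to what the `lru` crate provides; a minimal sketch, assuming that crate and using a placeholder `Extension` value:

```rust
use lru::LruCache;
use std::num::NonZeroUsize;

struct Extension; // placeholder for the struct shown under Type Safety

fn cache_key(ext_type: &str, name: &str, version: &str) -> String {
    format!("{ext_type}:{name}:{version}")
}

fn demo() {
    // Capacity 100, per the README; the oldest entry is evicted when full.
    let mut cache: LruCache<String, Extension> =
        LruCache::new(NonZeroUsize::new(100).unwrap());
    cache.put(cache_key("taskserv", "kubernetes", "1.28.0"), Extension);
    assert!(cache.get(&cache_key("taskserv", "kubernetes", "1.28.0")).is_some());
}
```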
-
-### Type Safety
-
-All extensions are strongly typed:
-
-```rust
-pub struct Extension {
-    pub metadata: ExtensionMetadata,
-    pub path: String,
-    pub loaded_at: chrono::DateTime<chrono::Utc>,
-}
-
-pub struct ExtensionMetadata {
-    pub name: String,
-    pub version: String,
-    pub description: String,
-    pub extension_type: ExtensionType,
-    pub dependencies: Vec<String>,
-    pub author: Option<String>,
-    pub repository: Option<String>,
-}
-```
-
-### Version Management
-
-Load specific versions or use latest:
-
-```rust
-// Load specific version
-let ext = extension_manager.load_extension(
-    ExtensionType::Taskserv,
-    "kubernetes".to_string(),
-    Some("1.28.0".to_string())
-).await?;
-
-// Load latest version
-let ext = extension_manager.load_extension(
-    ExtensionType::Taskserv,
-    "kubernetes".to_string(),
-    None
-).await?;
-```
-
-## Configuration
-
-Extension settings in `config.defaults.toml`:
-
-```toml
-[orchestrator.extensions]
-auto_load = true
-cache_dir = "{{orchestrator.paths.data_dir}}/extensions"
-```
-
-### Configuration Options
-
-- **auto_load**: Enable automatic extension loading (default: true)
-- **cache_dir**: Directory for caching extension artifacts
-
-## API Endpoints
-
-### List Loaded Extensions
-
-```http
-GET /api/v1/extensions/loaded
-```
-
-**Response:**
-
-```json
-{
-  "success": true,
-  "data": [
-    {
-      "metadata": {
-        "name": "kubernetes",
-        "version": "1.28.0",
-        "description": "Kubernetes container orchestrator",
-        "extension_type": "Taskserv",
-        "dependencies": ["containerd", "etcd"],
-        "author": "provisioning-team",
-        "repository": null
-      },
-      "path": "extensions/taskservs/kubernetes",
-      "loaded_at": "2025-10-06T12:30:00Z"
-    }
-  ]
-}
-```
-
-### Reload Extension
-
-```http
-POST /api/v1/extensions/reload
-Content-Type: application/json
-
-{
-  "extension_type": "taskserv",
-  "name": "kubernetes"
-}
-```
-
-**Response:**
-
-```json
-{
-  "success": true,
-  "data": "Extension kubernetes reloaded"
-}
-```
-
-## Usage Examples
-
-### Load Extension
-
-```rust
-use provisioning_orchestrator::extensions::{ExtensionManager, ExtensionType};
-
-let manager = ExtensionManager::new(
-    "/usr/local/bin/nu".to_string(),
-    "/usr/local/bin/provisioning".to_string(),
-);
-
-let extension = manager.load_extension(
-    ExtensionType::Taskserv,
-    "kubernetes".to_string(),
-    Some("1.28.0".to_string())
-).await?;
-
-println!("Loaded: {} v{}", extension.metadata.name, extension.metadata.version);
-```
-
-### List Loaded Extensions
-
-```rust
-let extensions = manager.list_loaded_extensions().await;
-for ext in extensions {
-    println!("{} ({}) - loaded at {}",
-        ext.metadata.name,
-        ext.metadata.extension_type,
-        ext.loaded_at
-    );
-}
-```
-
-### Reload Extension
-
-```rust
-let extension = manager.reload_extension(
-    ExtensionType::Taskserv,
-    "kubernetes".to_string()
-).await?;
-```
-
-### Check if Loaded
-
-```rust
-let is_loaded = manager.is_extension_loaded(
-    ExtensionType::Taskserv,
-    "kubernetes"
-).await;
-
-if !is_loaded {
-    // Load the extension
-    manager.load_extension(
-        ExtensionType::Taskserv,
-        "kubernetes".to_string(),
-        None
-    ).await?;
-}
-```
-
-### Clear Cache
-
-```rust
-manager.clear_cache().await;
-```
-
-## Integration with Workflows
-
-### Taskserv Installation Workflow
-
-1. **Load extension** (before installation)
-2. Validate dependencies
-3. Generate configuration
-4. Execute installation
-5. Verify installation
-
-```rust
-// Step 1: Load extension
-let extension = extension_manager.load_extension(
-    ExtensionType::Taskserv,
-    "kubernetes".to_string(),
-    Some("1.28.0".to_string())
-).await?;
-
-// Step 2: Validate dependencies
-for dep in &extension.metadata.dependencies {
-    ensure_dependency_installed(dep).await?;
-}
-
-// Continue with installation...
-```
-
-## Nushell Integration
-
-The extension loader calls Nushell commands:
-
-```bash
-# Load taskserv extension
-provisioning module load taskserv kubernetes --version 1.28.0
-
-# Discover available extensions
-provisioning module discover taskserv --output json
-
-# Get extension metadata
-provisioning module discover taskserv --name kubernetes --output json
-```
-
-## Error Handling
-
-The extension loader handles errors gracefully:
-
-- **Extension not found**: Returns clear error message
-- **Version mismatch**: Reports available versions
-- **Dependency errors**: Lists missing dependencies
-- **Load failures**: Logs detailed error information
-
-## Testing
-
-Run extension loading tests:
-
-```bash
-cd provisioning/platform/orchestrator
-cargo test test_extension_loading
-```
-
-## Troubleshooting
-
-### Extension load fails
-
-1. Check Nushell is installed and accessible
-2. Verify extension exists in expected location
-3. Check provisioning path configuration
-4. Review orchestrator logs
-
-### Cache issues
-
-1. Clear cache manually: `manager.clear_cache().await`
-2. Check cache directory permissions
-3. Verify disk space availability
-
-## Best Practices
-
-1. **Use versioning**: Always specify version for production
-2. **Cache management**: Clear cache periodically in dev environments
-3. **Dependency validation**: Check dependencies before loading
-4. **Error handling**: Always handle load failures gracefully
-
-## Security Considerations
-
-1. **Code execution**: Extensions execute Nushell code
-2. **Validation**: Verify extension metadata
-3. **Sandboxing**: Consider sandboxed execution
-4. **Audit**: Log all extension loading operations
-
-## Performance
-
-### Cache Hit Ratio
-
-Monitor cache effectiveness:
-
-```rust
-let total_loads = metrics.total_extension_loads;
-let cache_hits = metrics.cache_hits;
-let hit_ratio = cache_hits as f64 / total_loads as f64;
-println!("Cache hit ratio: {:.2}%", hit_ratio * 100.0);
-```
-
-### Loading Time
-
-Extension loading is optimized:
-
-- **Cached**: < 1ms
-- **Cold load**: 100-500ms (depends on extension size)
-- **With dependencies**: Variable (depends on dependency count)
-
-## Future Enhancements
-
-- [ ] Extension hot-reload without cache clear
-- [ ] Dependency graph visualization
-- [ ] Extension marketplace integration
-- [ ] Automatic version updates
-- [ ] Extension sandboxing
diff --git a/crates/orchestrator/docs/oci-integration.md b/crates/orchestrator/docs/oci-integration.md
index cc1f00c..29b142d 100644
--- a/crates/orchestrator/docs/oci-integration.md
+++ b/crates/orchestrator/docs/oci-integration.md
@@ -1,417 +1 @@
-# OCI Registry Integration Guide
-
-## Overview
-
-The OCI integration module provides OCI Distribution Spec v2 compliant registry integration for pulling KCL packages and extension artifacts.
-
-## Architecture
-
-```plaintext
-┌──────────────────┐
-│   Orchestrator   │
-│     (Rust)       │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│   OCI Manager    │
-│                  │
-│  - LRU caching   │
-│  - Pull artifacts│
-│  - List packages │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│   OCI Client     │
-│  (Distribution)  │
-└────────┬─────────┘
-         │
-         ▼
-┌──────────────────┐
-│   OCI Registry   │
-│  (HTTP API v2)   │
-└──────────────────┘
-```
-
-## Features
-
-### 1. KCL Package Management
-
-Pull KCL configuration packages from OCI registry:
-
-```rust
-let package_path = oci_manager.pull_kcl_package(
-    "provisioning-core",
-    "1.0.0"
-).await?;
-```
-
-### 2. Extension Artifacts
-
-Pull extension artifacts (providers, taskservs, clusters):
-
-```rust
-let artifact_path = oci_manager.pull_extension_artifact(
-    "taskserv",      // Extension type
-    "kubernetes",    // Extension name
-    "1.28.0"         // Version
-).await?;
-```
-
-### 3. Manifest Caching
-
-Manifests are cached using LRU strategy:
-
-- **Cache size**: 100 manifests
-- **Cache key**: `{name}:{version}`
-- **Automatic eviction**: Oldest entries removed when full
-
-### 4. Artifact Listing
-
-List all artifacts in a namespace:
-
-```rust
-let artifacts = oci_manager.list_oci_artifacts("kcl").await?;
-for artifact in artifacts {
-    println!("{} v{} ({})", artifact.name, artifact.version, artifact.size);
-}
-```
-
-## OCI Distribution Spec v2
-
-Implements OCI Distribution Specification v2:
-
-- **Manifest retrieval**: `GET /v2/{namespace}/{repository}/manifests/{reference}`
-- **Blob download**: `GET /v2/{namespace}/{repository}/blobs/{digest}`
-- **Tag listing**: `GET /v2/{namespace}/{repository}/tags/list`
-- **Artifact existence**: `HEAD /v2/{namespace}/{repository}/manifests/{reference}`
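
A minimal sketch of the manifest-retrieval endpoint above using reqwest (with its `json` feature); no authentication is attempted, and the URL pieces are illustrative:

```rust
async fn fetch_manifest(
    registry: &str,
    repo: &str,
    reference: &str,
) -> anyhow::Result<serde_json::Value> {
    // GET /v2/{repo}/manifests/{reference} with the OCI manifest media type.
    let url = format!("{registry}/v2/{repo}/manifests/{reference}");
    let manifest = reqwest::Client::new()
        .get(&url)
        .header(
            reqwest::header::ACCEPT,
            "application/vnd.oci.image.manifest.v1+json",
        )
        .send()
        .await?
        .error_for_status()?
        .json::<serde_json::Value>()
        .await?;
    Ok(manifest)
}
```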
-
-## Configuration
-
-OCI settings in `config.defaults.toml`:
-
-```toml
-[orchestrator.oci]
-registry_url = "http://localhost:5000"
-namespace = "provisioning-extensions"
-cache_dir = "{{orchestrator.paths.data_dir}}/oci-cache"
-```
-
-### Configuration Options
-
-- **registry_url**: OCI registry HTTP endpoint
-- **namespace**: Default namespace for artifacts
-- **cache_dir**: Local cache directory for downloaded artifacts
-
-## API Endpoints
-
-### List OCI Artifacts
-
-```http
-POST /api/v1/oci/artifacts
-Content-Type: application/json
-
-{
-  "namespace": "kcl"
-}
-```
-
-**Response:**
-
-```json
-{
-  "success": true,
-  "data": [
-    {
-      "name": "provisioning-core",
-      "version": "1.0.0",
-      "digest": "sha256:abc123...",
-      "size": 102400,
-      "media_type": "application/vnd.oci.image.manifest.v1+json",
-      "created_at": "2025-10-06T12:00:00Z"
-    }
-  ]
-}
-```
-
-## Usage Examples
-
-### Pull KCL Package
-
-```rust
-use provisioning_orchestrator::oci::OciManager;
-use std::path::PathBuf;
-
-let oci_manager = OciManager::new(
-    "http://localhost:5000".to_string(),
-    "provisioning-extensions".to_string(),
-    PathBuf::from("/tmp/oci-cache"),
-);
-
-// Pull KCL package
-let package_path = oci_manager.pull_kcl_package(
-    "provisioning-core",
-    "1.0.0"
-).await?;
-
-println!("Package downloaded to: {}", package_path.display());
-
-// Extract package
-// tar -xzf package_path
-```
-
-### Pull Extension Artifact
-
-```rust
-// Pull taskserv extension
-let artifact_path = oci_manager.pull_extension_artifact(
-    "taskserv",
-    "kubernetes",
-    "1.28.0"
-).await?;
-
-// Extract and install
-// tar -xzf artifact_path -C /target/path
-```
-
-### List Artifacts
-
-```rust
-let artifacts = oci_manager.list_oci_artifacts("kcl").await?;
-
-for artifact in artifacts {
-    println!("📦 {} v{}", artifact.name, artifact.version);
-    println!("   Size: {} bytes", artifact.size);
-    println!("   Digest: {}", artifact.digest);
-    println!();
-}
-```
-
-### Check Artifact Exists
-
-```rust
-let exists = oci_manager.artifact_exists(
-    "kcl/provisioning-core",
-    "1.0.0"
-).await?;
-
-if exists {
-    println!("Artifact exists in registry");
-} else {
-    println!("Artifact not found");
-}
-```
-
-### Get Manifest (with caching)
-
-```rust
-let manifest = oci_manager.get_manifest(
-    "kcl/provisioning-core",
-    "1.0.0"
-).await?;
-
-println!("Schema version: {}", manifest.schema_version);
-println!("Media type: {}", manifest.media_type);
-println!("Layers: {}", manifest.layers.len());
-```
-
-### Clear Manifest Cache
-
-```rust
-oci_manager.clear_cache().await;
-```
-
-## OCI Artifact Structure
-
-### Manifest Format
-
-```json
-{
-  "schemaVersion": 2,
-  "mediaType": "application/vnd.oci.image.manifest.v1+json",
-  "config": {
-    "mediaType": "application/vnd.oci.image.config.v1+json",
-    "digest": "sha256:abc123...",
-    "size": 1234
-  },
-  "layers": [
-    {
-      "mediaType": "application/vnd.oci.image.layer.v1.tar+gzip",
-      "digest": "sha256:def456...",
-      "size": 102400
-    }
-  ],
-  "annotations": {
-    "org.opencontainers.image.created": "2025-10-06T12:00:00Z",
-    "org.opencontainers.image.version": "1.0.0"
-  }
-}
-```
-
-## Integration with Workflows
-
-### Extension Installation with OCI
-
-1. **Check local cache**
-2. **Pull from OCI registry** (if not cached)
-3. Extract artifact
-4. Validate contents
-5. Install extension
-
-```rust
-// Workflow: Install taskserv from OCI
-async fn install_taskserv_from_oci(
-    oci_manager: &OciManager,
-    name: &str,
-    version: &str
-) -> Result<()> {
-    // Pull artifact
-    let artifact_path = oci_manager.pull_extension_artifact(
-        "taskserv",
-        name,
-        version
-    ).await?;
-
-    // Extract
-    extract_tarball(&artifact_path, &target_dir)?;
-
-    // Validate
-    validate_extension_structure(&target_dir)?;
-
-    // Install
-    install_extension(&target_dir)?;
-
-    Ok(())
-}
-```
-
-## Cache Management
-
-### Cache Directory Structure
-
-```plaintext
-/tmp/oci-cache/
-├── kcl/
-│   └── provisioning-core/
-│       └── 1.0.0/
-│           └── package.tar.gz
-├── extensions/
-│   ├── taskserv/
-│   │   └── kubernetes/
-│   │       └── 1.28.0/
-│   │           └── artifact.tar.gz
-│   └── provider/
-│       └── aws/
-│           └── 2.0.0/
-│               └── artifact.tar.gz
-```
-
-### Cache Cleanup
-
-Implement cache cleanup strategy:
-
-```rust
-// Clean old artifacts
-async fn cleanup_old_artifacts(cache_dir: &Path, max_age_days: u64) -> Result<()> {
-    let cutoff = Utc::now() - Duration::days(max_age_days as i64);
-
-    for entry in std::fs::read_dir(cache_dir)? {
-        let entry = entry?;
-        let metadata = entry.metadata()?;
-
-        if let Ok(modified) = metadata.modified() {
-            let modified: DateTime<Utc> = modified.into();
-            if modified < cutoff {
-                std::fs::remove_dir_all(entry.path())?;
-            }
-        }
-    }
-
-    Ok(())
-}
-```
-
-## Error Handling
-
-The OCI integration handles errors gracefully:
-
-- **Network errors**: Retries with exponential backoff
-- **Manifest not found**: Returns clear error message
-- **Corrupted downloads**: Validates digest before returning (see the sketch below)
-- **Disk full**: Reports storage error
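
The digest check can be sketched with the `sha2` and `hex` crates: recompute SHA-256 over the downloaded blob and compare it against the manifest's `sha256:<hex>` digest. The function name is illustrative:

```rust
use sha2::{Digest, Sha256};

fn verify_digest(blob: &[u8], expected: &str) -> bool {
    let hex = hex::encode(Sha256::digest(blob));
    expected == format!("sha256:{hex}")
}
```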
Review orchestrator logs
-
-### Digest mismatch
-
-1. Clear local cache
-2. Re-pull artifact
-3. Verify registry integrity
-4. Check for network corruption
-
-### Cache issues
-
-1. Check cache directory permissions
-2. Verify disk space
-3. Clear cache manually if corrupted
-
-## Best Practices
-
-1. **Use specific versions**: Always specify version for production
-2. **Verify digests**: Validate artifact integrity
-3. **Cache management**: Implement cleanup strategy
-4. **Error handling**: Handle network failures gracefully
-5. **Monitor downloads**: Track download times and failures
-
-## Security Considerations
-
-1. **TLS/HTTPS**: Use secure registry connections in production
-2. **Authentication**: Implement registry authentication
-3. **Digest verification**: Always verify artifact digests
-4. **Access control**: Restrict registry access
-5. **Audit logging**: Log all pull operations
-
-## Performance
-
-### Download Optimization
-
-- **Parallel layers**: Download layers in parallel
-- **Resume support**: Resume interrupted downloads
-- **Compression**: Use gzip for smaller transfers
-- **Local cache**: Cache frequently used artifacts
-
-### Metrics
-
-Track OCI operations:
-
-- **Pull count**: Number of artifact pulls
-- **Cache hits**: Percentage of cache hits
-- **Download time**: Average download duration
-- **Bandwidth usage**: Total bytes downloaded
-
-## Future Enhancements
-
-- [ ] Push artifacts to registry
-- [ ] Registry authentication (OAuth2, Basic Auth)
-- [ ] Multi-registry support
-- [ ] Mirror/proxy registry
-- [ ] Artifact signing and verification
-- [ ] Garbage collection for cache
+# OCI Registry Integration Guide
\ No newline at end of file
diff --git a/crates/orchestrator/docs/service-orchestration.md b/crates/orchestrator/docs/service-orchestration.md
index 3de22bf..ce4a10e 100644
--- a/crates/orchestrator/docs/service-orchestration.md
+++ b/crates/orchestrator/docs/service-orchestration.md
@@ -1,467 +1 @@
-# Service Orchestration Guide
-
-## Overview
-
-The service orchestration module manages platform services with dependency-based startup, health checking, and automatic service coordination.
-
-## Architecture
-
-```plaintext
-┌──────────────────────┐
-│ Orchestrator │
-│ (Rust) │
-└──────────┬───────────┘
- │
- ▼
-┌──────────────────────┐
-│ Service Orchestrator │
-│ │
-│ - Dependency graph │
-│ - Startup order │
-│ - Health checking │
-└──────────┬───────────┘
- │
- ▼
-┌──────────────────────┐
-│ Service Manager │
-│ (Nushell calls) │
-└──────────┬───────────┘
- │
- ▼
-┌──────────────────────┐
-│ Platform Services │
-│ (CoreDNS, OCI, etc) │
-└──────────────────────┘
-```text
-
-## Features
-
-### 1. Dependency Resolution
-
-Automatically resolve service startup order based on dependencies:
-
-```rust
-let order = service_orchestrator.resolve_startup_order(&[
-    "service-c".to_string()
-]).await?;
-
-// Returns: ["service-a", "service-b", "service-c"]
-```text
-
-### 2. Automatic Dependency Startup
-
-When enabled, dependencies are started automatically:
-
-```rust
-// Start service with dependencies
-service_orchestrator.start_service("web-app").await?;
-
-// Automatically starts: database -> cache -> web-app
-```text
-
-### 3. Health Checking
-
-Monitor service health with HTTP or process checks:
-
-```rust
-let health = service_orchestrator.check_service_health("web-app").await?;
-
-if health.healthy {
-    println!("Service is healthy: {}", health.message);
-}
-```text
-
-### 4. Service Status
-
-Get current status of any registered service:
-
-```rust
-let status = service_orchestrator.get_service_status("web-app").await?;
-
-match status {
-    ServiceStatus::Running => println!("Service is running"),
-    ServiceStatus::Stopped => println!("Service is stopped"),
-    ServiceStatus::Failed => println!("Service has failed"),
-    ServiceStatus::Unknown => println!("Service status unknown"),
-}
-```text
-
-## Service Definition
-
-### Service Structure
-
-```rust
-pub struct Service {
-    pub name: String,
-    pub description: String,
-    pub dependencies: Vec<String>,
-    pub start_command: String,
-    pub stop_command: String,
-    pub health_check_endpoint: Option<String>,
-}
-```text
-
-### Example Service Definition
-
-```rust
-let coredns_service = Service {
-    name: "coredns".to_string(),
-    description: "CoreDNS DNS server".to_string(),
-    dependencies: vec![], // No dependencies
-    start_command: "systemctl start coredns".to_string(),
-    stop_command: "systemctl stop coredns".to_string(),
-    health_check_endpoint: Some("http://localhost:53/health".to_string()),
-};
-```text
-
-### Service with Dependencies
-
-```rust
-let oci_registry = Service {
-    name: "oci-registry".to_string(),
-    description: "OCI distribution registry".to_string(),
-    dependencies: vec!["coredns".to_string()], // Depends on DNS
-    start_command: "systemctl start oci-registry".to_string(),
-    stop_command: "systemctl stop 
oci-registry".to_string(), - health_check_endpoint: Some("http://localhost:5000/v2/".to_string()), -}; - -orchestrator.register_service(oci).await; -```text - -### Start Service with Dependencies - -```rust -// This will automatically start coredns first, then oci-registry -orchestrator.start_service("oci-registry").await?; -```text - -### Resolve Startup Order - -```rust -let services = vec![ - "web-app".to_string(), - "api-server".to_string(), -]; - -let order = orchestrator.resolve_startup_order(&services).await?; - -println!("Startup order:"); -for (i, service) in order.iter().enumerate() { - println!("{}. {}", i + 1, service); -} -```text - -### Start All Services - -```rust -let started = orchestrator.start_all_services().await?; - -println!("Started {} services:", started.len()); -for service in started { - println!(" ✓ {}", service); -} -```text - -### Check Service Health - -```rust -let health = orchestrator.check_service_health("coredns").await?; - -if health.healthy { - println!("✓ {} is healthy", "coredns"); - println!(" Message: {}", health.message); - println!(" Last check: {}", health.last_check); -} else { - println!("✗ {} is unhealthy", "coredns"); - println!(" Message: {}", health.message); -} -```text - -## Dependency Graph Examples - -### Simple Chain - -```plaintext -A -> B -> C -```text - -Startup order: A, B, C - -```rust -let a = Service { name: "a".to_string(), dependencies: vec![], /* ... */ }; -let b = Service { name: "b".to_string(), dependencies: vec!["a".to_string()], /* ... */ }; -let c = Service { name: "c".to_string(), dependencies: vec!["b".to_string()], /* ... */ }; -```text - -### Diamond Dependency - -```plaintext - A - / \ - B C - \ / - D -```text - -Startup order: A, B, C, D (B and C can start in parallel) - -```rust -let a = Service { name: "a".to_string(), dependencies: vec![], /* ... */ }; -let b = Service { name: "b".to_string(), dependencies: vec!["a".to_string()], /* ... */ }; -let c = Service { name: "c".to_string(), dependencies: vec!["a".to_string()], /* ... */ }; -let d = Service { name: "d".to_string(), dependencies: vec!["b".to_string(), "c".to_string()], /* ... 
*/ }; -```text - -### Complex Dependency - -```plaintext - A - | - B - / \ - C D - | | - E F - \ / - G -```text - -Startup order: A, B, C, D, E, F, G - -## Integration with Platform Services - -### CoreDNS Service - -```rust -let coredns = Service { - name: "coredns".to_string(), - description: "CoreDNS DNS server for automatic DNS registration".to_string(), - dependencies: vec![], - start_command: "systemctl start coredns".to_string(), - stop_command: "systemctl stop coredns".to_string(), - health_check_endpoint: Some("http://localhost:53/health".to_string()), -}; -```text - -### OCI Registry Service - -```rust -let oci_registry = Service { - name: "oci-registry".to_string(), - description: "OCI distribution registry for artifacts".to_string(), - dependencies: vec!["coredns".to_string()], - start_command: "systemctl start oci-registry".to_string(), - stop_command: "systemctl stop oci-registry".to_string(), - health_check_endpoint: Some("http://localhost:5000/v2/".to_string()), -}; -```text - -### Orchestrator Service - -```rust -let orchestrator = Service { - name: "orchestrator".to_string(), - description: "Main orchestrator service".to_string(), - dependencies: vec!["coredns".to_string(), "oci-registry".to_string()], - start_command: "./scripts/start-orchestrator.nu --background".to_string(), - stop_command: "./scripts/start-orchestrator.nu --stop".to_string(), - health_check_endpoint: Some("http://localhost:9090/health".to_string()), -}; -```text - -## Error Handling - -The service orchestrator handles errors gracefully: - -- **Missing dependencies**: Reports missing services -- **Circular dependencies**: Detects and reports cycles -- **Start failures**: Continues with other services -- **Health check failures**: Marks service as unhealthy - -### Circular Dependency Detection - -```rust -// This would create a cycle: A -> B -> C -> A -let a = Service { name: "a".to_string(), dependencies: vec!["c".to_string()], /* ... */ }; -let b = Service { name: "b".to_string(), dependencies: vec!["a".to_string()], /* ... */ }; -let c = Service { name: "c".to_string(), dependencies: vec!["b".to_string()], /* ... */ }; - -// Error: Circular dependency detected -let result = orchestrator.resolve_startup_order(&["a".to_string()]).await; -assert!(result.is_err()); -```text - -## Testing - -Run service orchestration tests: - -```bash -cd provisioning/platform/orchestrator -cargo test test_service_orchestration -```text - -## Troubleshooting - -### Service fails to start - -1. Check service is registered -2. Verify dependencies are running -3. Review service start command -4. Check service logs -5. Verify permissions - -### Dependency resolution fails - -1. Check for circular dependencies -2. Verify all services are registered -3. Review dependency declarations - -### Health check fails - -1. Verify health endpoint is correct -2. Check service is actually running -3. Review network connectivity -4. Check health check timeout - -## Best Practices - -1. **Minimize dependencies**: Only declare necessary dependencies -2. **Health endpoints**: Implement health checks for all services -3. **Graceful shutdown**: Implement proper stop commands -4. **Idempotent starts**: Ensure services can be restarted safely -5. **Error logging**: Log all service operations - -## Security Considerations - -1. **Command injection**: Validate service commands -2. **Access control**: Restrict service management -3. **Audit logging**: Log all service operations -4. 
**Least privilege**: Run services with minimal permissions
-
-## Performance
-
-### Startup Optimization
-
-- **Parallel starts**: Services without dependencies start in parallel
-- **Dependency caching**: Cache dependency resolution
-- **Health check batching**: Batch health checks for efficiency
-
-### Monitoring
-
-Track service metrics:
-
-- **Start time**: Time to start each service
-- **Health check latency**: Health check response time
-- **Failure rate**: Percentage of failed starts
-- **Uptime**: Service availability percentage
-
-## Future Enhancements
-
-- [ ] Service restart policies
-- [ ] Graceful shutdown ordering
-- [ ] Service watchdog
-- [ ] Auto-restart on failure
-- [ ] Service templates
-- [ ] Container-based services
+# Service Orchestration Guide
\ No newline at end of file
diff --git a/crates/orchestrator/docs/ssh-key-management.md b/crates/orchestrator/docs/ssh-key-management.md
index 1ef59f0..b92fbe4 100644
--- a/crates/orchestrator/docs/ssh-key-management.md
+++ b/crates/orchestrator/docs/ssh-key-management.md
@@ -1,525 +1 @@
-# SSH Temporal Key Management System
-
-## Overview
-
-The SSH Temporal Key Management System provides automated generation, deployment, and cleanup of short-lived SSH keys
-for secure server access. It eliminates the need for static SSH keys by generating keys on-demand with automatic expiration.
-
-## Features
-
-### Core Features
-
-- **Short-Lived Keys**: Keys expire automatically after a configurable TTL (default: 1 hour)
-- **Multiple Key Types**:
-  - Dynamic Key Pairs (Ed25519)
-  - Vault OTP (One-Time Password)
-  - Vault CA-Signed Certificates
-- **Automatic Cleanup**: Background task removes expired keys from servers
-- **Audit Trail**: All key operations are logged
-- **REST API**: HTTP endpoints for integration
-- **Nushell CLI**: User-friendly command-line interface
-
-### Security Features
-
-- ✅ Ed25519 keys (modern, secure algorithm)
-- ✅ Automatic expiration and cleanup
-- ✅ Private keys never stored on disk (only in memory)
-- ✅ Vault integration for enterprise scenarios
-- ✅ SSH fingerprint tracking
-- ✅ Per-key audit logging
-
-## Architecture
-
-```plaintext
-┌─────────────────────────────────────────────────
-────────────┐
-│ SSH Key Manager │
-├─────────────────────────────────────────────────
-────────────┤
-│ │
-│ ┌──────────────┐ ┌──────────────┐
-┌──────────────┐ │
-│ │ Key Generator│ │ Key Deployer │ │ Temporal │ │
-│ │ (Ed25519) │ │ (SSH Deploy) │ │ Manager │ │
-│ └──────────────┘ └──────────────┘
-└──────────────┘ │
-│ │
-│ ┌──────────────┐ ┌──────────────┐ │
-│ │ Vault │ │ Authorized │ │
-│ │ SSH Engine │ │ Keys Manager │ │
-│ └──────────────┘ └──────────────┘ │
-│ │
-└─────────────────────────────────────────────────
-────────────┘
- │ │ │
- ▼ ▼ ▼
- REST API Nushell CLI Background Tasks
-```text
-
-## Key Types
-
-### 1. Dynamic Key Pairs (Default)
-
-Generated on-demand Ed25519 keys that are automatically deployed and cleaned up.
-
-**Use Case**: Quick SSH access without Vault infrastructure
-
-**Example**:
-
-```bash
-ssh generate-key server.example.com --user root --ttl 30min
-```text
-
-### 2. Vault OTP (One-Time Password)
-
-Vault generates a one-time password for SSH authentication.
-
-**Use Case**: Single-use SSH access with centralized authentication
-
-**Requirements**: Vault with SSH secrets engine in OTP mode
-
-**Example**:
-
-```bash
-ssh generate-key server.example.com --type otp --ip 192.168.1.100
-```text
-
-### 3. Vault CA-Signed Certificates
-
-Vault acts as SSH CA, signing user public keys with short TTL.
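For orientation, here is a minimal Rust sketch of requesting a CA-signed key through the orchestrator's REST API (the `POST /api/v1/ssh/generate` endpoint and port 9090 come from the doc quoted in this hunk; the `reqwest`, `tokio`, and `serde_json` dependencies and the exact response shape are assumptions, not part of the patch):

```rust
use reqwest::Client;
use serde_json::{json, Value};

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Ask the orchestrator for a Vault CA-signed certificate key.
    let resp: Value = Client::new()
        .post("http://localhost:9090/api/v1/ssh/generate")
        .json(&json!({
            "key_type": "certificate",
            "user": "root",
            "target_server": "server.example.com",
            "ttl_seconds": 3600,
            "principal": "admin"
        }))
        .send()
        .await?
        .error_for_status()?
        .json()
        .await?;

    // Per the doc, the private material is returned once and never
    // written to disk, so capture what you need immediately.
    println!("key id: {}", resp["data"]["id"]);
    Ok(())
}
```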
- -**Use Case**: Enterprise scenarios with SSH CA infrastructure - -**Requirements**: Vault with SSH secrets engine in CA mode - -**Example**: - -```bash -ssh generate-key server.example.com --type ca --principal admin --ttl 1hr -```text - -## REST API Endpoints - -Base URL: `http://localhost:9090` - -### Generate SSH Key - -```http -POST /api/v1/ssh/generate - -{ - "key_type": "dynamickeypair", // or "otp", "certificate" - "user": "root", - "target_server": "server.example.com", - "ttl_seconds": 3600, - "allowed_ip": "192.168.1.100", // optional, for OTP - "principal": "admin" // optional, for CA -} - -Response: -{ - "success": true, - "data": { - "id": "uuid", - "key_type": "dynamickeypair", - "public_key": "ssh-ed25519 AAAA...", - "private_key": "-----BEGIN OPENSSH PRIVATE KEY-----...", - "fingerprint": "SHA256:...", - "user": "root", - "target_server": "server.example.com", - "created_at": "2024-01-01T00:00:00Z", - "expires_at": "2024-01-01T01:00:00Z", - "deployed": false - } -} -```text - -### Deploy SSH Key - -```http -POST /api/v1/ssh/{key_id}/deploy - -Response: -{ - "success": true, - "data": { - "key_id": "uuid", - "server": "server.example.com", - "success": true, - "deployed_at": "2024-01-01T00:00:00Z" - } -} -```text - -### List SSH Keys - -```http -GET /api/v1/ssh/keys - -Response: -{ - "success": true, - "data": [ - { - "id": "uuid", - "key_type": "dynamickeypair", - "user": "root", - "target_server": "server.example.com", - "expires_at": "2024-01-01T01:00:00Z", - "deployed": true - } - ] -} -```text - -### Revoke SSH Key - -```http -POST /api/v1/ssh/{key_id}/revoke - -Response: -{ - "success": true, - "data": "Key uuid revoked successfully" -} -```text - -### Get SSH Key - -```http -GET /api/v1/ssh/{key_id} - -Response: -{ - "success": true, - "data": { - "id": "uuid", - "key_type": "dynamickeypair", - ... - } -} -```text - -### Cleanup Expired Keys - -```http -POST /api/v1/ssh/cleanup - -Response: -{ - "success": true, - "data": { - "cleaned_count": 5, - "cleaned_key_ids": ["uuid1", "uuid2", ...] - } -} -```text - -### Get Statistics - -```http -GET /api/v1/ssh/stats - -Response: -{ - "success": true, - "data": { - "total_generated": 42, - "active_keys": 10, - "expired_keys": 32, - "keys_by_type": { - "dynamic": 35, - "otp": 5, - "certificate": 2 - }, - "last_cleanup_count": 5, - "last_cleanup_at": "2024-01-01T00:00:00Z" - } -} -```text - -## Nushell CLI Commands - -### Generate Key - -```bash -ssh generate-key [options] - -Options: - --user SSH user (default: root) - --ttl Key lifetime (default: 1hr) - --type Key type (default: dynamic) - --ip
Allowed IP (OTP mode) - --principal Principal (CA mode) - -Examples: - ssh generate-key server.example.com - ssh generate-key server.example.com --user deploy --ttl 30min - ssh generate-key server.example.com --type ca --principal admin -```text - -### Deploy Key - -```bash -ssh deploy-key - -Example: - ssh deploy-key abc-123-def-456 -```text - -### List Keys - -```bash -ssh list-keys [--expired] - -Example: - ssh list-keys - ssh list-keys | where deployed == true -```text - -### Revoke Key - -```bash -ssh revoke-key - -Example: - ssh revoke-key abc-123-def-456 -```text - -### Connect with Auto-Generated Key - -```bash -ssh connect [options] - -Options: - --user SSH user (default: root) - --ttl Key lifetime (default: 1hr) - --type Key type (default: dynamic) - --keep Keep key after disconnect - -Example: - ssh connect server.example.com --user deploy -```text - -This command: - -1. Generates a temporal SSH key -2. Deploys it to the server -3. Opens SSH connection -4. Revokes the key after disconnect (unless --keep is used) - -### Show Statistics - -```bash -ssh stats - -Example output: - SSH Key Statistics: - Total generated: 42 - Active keys: 10 - Expired keys: 32 - - Keys by type: - dynamic: 35 - otp: 5 - certificate: 2 - - Last cleanup: 2024-01-01T00:00:00Z - Cleaned keys: 5 -```text - -### Manual Cleanup - -```bash -ssh cleanup - -Example output: - ✓ Cleaned up 5 expired keys - Cleaned key IDs: - - abc-123 - - def-456 - ... -```text - -## Configuration - -### Orchestrator Configuration - -Add to orchestrator startup: - -```rust -use provisioning_orchestrator::{SshKeyManager, SshConfig}; - -// Create SSH configuration -let ssh_config = SshConfig { - vault_enabled: false, // Enable Vault integration - vault_addr: None, // Vault address - vault_token: None, // Vault token - vault_mount_point: "ssh".to_string(), - vault_mode: "ca".to_string(), // "ca" or "otp" - default_ttl: Duration::hours(1), - cleanup_interval: Duration::minutes(5), - provisioning_key_path: Some("/path/to/provisioning/key".to_string()), -}; - -// Create SSH key manager -let ssh_manager = Arc::new(SshKeyManager::new(ssh_config).await?); - -// Start background cleanup task -Arc::clone(&ssh_manager).start_cleanup_task().await; -```text - -### Vault SSH Configuration - -#### OTP Mode - -```bash -# Enable SSH secrets engine -vault secrets enable ssh - -# Configure OTP role -vault write ssh/roles/otp_key_role \ - key_type=otp \ - default_user=root \ - cidr_list=0.0.0.0/0 -```text - -#### CA Mode - -```bash -# Enable SSH secrets engine -vault secrets enable ssh - -# Generate SSH CA -vault write ssh/config/ca generate_signing_key=true - -# Configure CA role -vault write ssh/roles/default \ - key_type=ca \ - ttl=1h \ - max_ttl=24h \ - allow_user_certificates=true \ - allowed_users="*" \ - default_extensions="permit-pty,permit-port-forwarding" - -# Get CA public key (add to servers' /etc/ssh/trusted-user-ca-keys.pem) -vault read -field=public_key ssh/config/ca -```text - -Server configuration (`/etc/ssh/sshd_config`): - -```plaintext -TrustedUserCAKeys /etc/ssh/trusted-user-ca-keys.pem -```text - -## Deployment - -### Prerequisites - -1. **Orchestrator**: Running on port 8080 -2. **SSH Access**: Provisioning key for deploying to servers -3. 
**Vault** (optional): For OTP or CA modes
-
-### Environment Variables
-
-```bash
-# Vault integration (optional)
-export VAULT_ADDR=https://vault.example.com:8200
-export VAULT_TOKEN=your-vault-token
-
-# Provisioning SSH key path
-export PROVISIONING_SSH_KEY=/path/to/provisioning/key
-```text
-
-### Integration with Workflows
-
-The SSH key manager integrates with existing workflows:
-
-```nushell
-# In server creation workflow
-let ssh_key = (ssh generate-key $server --ttl 30min)
-ssh deploy-key $ssh_key.id
-
-# Execute remote commands
-ssh root@$server "install-kubernetes.sh"
-
-# Auto-revoke after workflow
-ssh revoke-key $ssh_key.id
-```text
-
-## Security Considerations
-
-1. **Private Key Exposure**: Private keys are only shown once during generation
-2. **Key Storage**: Keys stored in memory only, not on disk
-3. **Cleanup**: Automatic cleanup removes expired keys from servers
-4. **Audit Logging**: All operations logged for security audit
-5. **Vault Integration**: Optional Vault integration for enterprise security
-6. **TTL Limits**: Enforce maximum TTL to prevent long-lived keys
-
-## Troubleshooting
-
-### Key Deployment Fails
-
-Check SSH connectivity:
-
-```bash
-ssh -i /path/to/provisioning/key root@server.example.com
-```text
-
-Verify SSH daemon is running:
-
-```bash
-systemctl status sshd
-```text
-
-### Cleanup Not Working
-
-Check orchestrator logs:
-
-```bash
-tail -f ./data/orchestrator.log | grep SSH
-```text
-
-Manual cleanup:
-
-```bash
-ssh cleanup
-```text
-
-### Vault Integration Issues
-
-Test Vault connectivity:
-
-```bash
-vault status
-vault token lookup
-```text
-
-Check SSH secrets engine:
-
-```bash
-vault secrets list
-vault read ssh/config/ca
-```text
-
-## Performance
-
-- **Key Generation**: <100ms (Ed25519)
-- **Key Deployment**: ~1s (depends on SSH latency)
-- **Cleanup Task**: Every 5 minutes (configurable)
-- **Concurrent Keys**: Unlimited (memory bound)
-
-## Future Enhancements
-
-- [ ] SSH certificate rotation
-- [ ] Integration with KMS for key encryption
-- [ ] WebSocket notifications for key expiration
-- [ ] Prometheus metrics export
-- [ ] SSH session recording
-- [ ] Role-based key generation policies
-
-## References
-
-- RFC 8709: Ed25519 and Ed448 Public Key Algorithms for SSH
-- Vault SSH Secrets Engine: 
-- OpenSSH Certificate Authentication: 
+# SSH Temporal Key Management System
\ No newline at end of file
diff --git a/crates/orchestrator/docs/storage-backends.md b/crates/orchestrator/docs/storage-backends.md
index 14b6d4f..88ab375 100644
--- a/crates/orchestrator/docs/storage-backends.md
+++ b/crates/orchestrator/docs/storage-backends.md
@@ -1,385 +1 @@
-# Storage Backends Guide
-
-This document provides comprehensive guidance on the orchestrator's storage backend options, configuration, and migration between them.
-
-## Overview
-
-The orchestrator supports three storage backends through a pluggable architecture:
-
-1. **Filesystem** - JSON file-based storage (default)
-2. **SurrealDB Embedded** - Local database with RocksDB engine
-3. **SurrealDB Server** - Remote SurrealDB server connection
-
-All backends implement the same `TaskStorage` trait, ensuring consistent behavior and seamless migration.
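The patch removes these docs without showing the trait itself; as a rough illustration of what such a pluggable storage trait looks like, here is a hypothetical sketch. The method names, the `Task` shape, and the `async_trait`/`anyhow` dependencies are assumptions, not the orchestrator's actual API:

```rust
use async_trait::async_trait;

// Hypothetical task record; the real type lives in the orchestrator crate.
pub struct Task {
    pub id: String,
    pub status: String,
}

// Illustrative shape of a pluggable storage trait: filesystem, embedded,
// and server backends would all implement the same interface, which is
// what makes behavior consistent and migration between them seamless.
#[async_trait]
pub trait TaskStorage: Send + Sync {
    async fn put_task(&self, task: Task) -> anyhow::Result<()>;
    async fn get_task(&self, id: &str) -> anyhow::Result<Option<Task>>;
    async fn list_tasks(&self) -> anyhow::Result<Vec<Task>>;
    async fn delete_task(&self, id: &str) -> anyhow::Result<()>;
}
```

Coding callers against the trait rather than a concrete backend is what lets the `--storage-type` flag swap implementations without touching orchestration logic.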
- -## Backend Comparison - -| Feature | Filesystem | SurrealDB Embedded | SurrealDB Server | -| --------- | ------------ | ------------------- | ------------------ | -| **Setup Complexity** | Minimal | Low | Medium | -| **External Dependencies** | None | None | SurrealDB Server | -| **Storage Format** | JSON Files | RocksDB | Remote DB | -| **ACID Transactions** | No | Yes | Yes | -| **Authentication/RBAC** | Basic | Advanced | Advanced | -| **Real-time Subscriptions** | No | Yes | Yes | -| **Audit Logging** | Manual | Automatic | Automatic | -| **Metrics Collection** | Basic | Advanced | Advanced | -| **Task Dependencies** | Simple | Graph-based | Graph-based | -| **Horizontal Scaling** | No | No | Yes | -| **Backup/Recovery** | File Copy | Database Backup | Server Backup | -| **Performance** | Good | Excellent | Variable | -| **Memory Usage** | Low | Medium | Low | -| **Disk Usage** | Medium | Optimized | Minimal | - -## 1. Filesystem Backend - -### Overview - -The default storage backend using JSON files for task persistence. Ideal for development and simple deployments. - -### Configuration - -```bash -# Default configuration -./orchestrator --storage-type filesystem --data-dir ./data - -# Custom data directory -./orchestrator --storage-type filesystem --data-dir /var/lib/orchestrator -```text - -### File Structure - -```plaintext -data/ -└── queue.rkvs/ - ├── tasks/ - │ ├── uuid1.json # Individual task records - │ ├── uuid2.json - │ └── ... - └── queue/ - ├── uuid1.json # Queue entries with priority - ├── uuid2.json - └── ... -```text - -### Features - -- ✅ **Simple Setup**: No external dependencies -- ✅ **Transparency**: Human-readable JSON files -- ✅ **Backup**: Standard file system tools -- ✅ **Debugging**: Direct file inspection -- ❌ **ACID**: No transaction guarantees -- ❌ **Concurrency**: Basic file locking -- ❌ **Advanced Features**: Limited auth/audit - -### Best Use Cases - -- Development environments -- Single-instance deployments -- Simple task orchestration -- Environments with strict dependency requirements - -## 2. SurrealDB Embedded - -### Overview - -Local SurrealDB database using RocksDB storage engine. Provides advanced database features without external dependencies. 
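As a sketch of what "embedded, no external dependencies" means in practice, opening a local RocksDB-backed instance with the `surrealdb` Rust SDK looks roughly like this. The path and namespace/database names are illustrative (they mirror the flags shown nearby), and the SDK's `kv-rocksdb` feature plus a `tokio` runtime are assumed:

```rust
use surrealdb::engine::local::RocksDb;
use surrealdb::Surreal;

#[tokio::main]
async fn main() -> Result<(), surrealdb::Error> {
    // Open (or create) the embedded RocksDB-backed database in-process;
    // no external server is involved.
    let db = Surreal::new::<RocksDb>("./data/orchestrator.db").await?;

    // Select the namespace/database the CLI flags refer to.
    db.use_ns("production").use_db("orchestrator").await?;

    // From here the orchestrator can issue queries with ACID guarantees.
    Ok(())
}
```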
- -### Configuration - -```bash -# Build with SurrealDB support -cargo build --features surrealdb - -# Run with embedded SurrealDB -./orchestrator --storage-type surrealdb-embedded --data-dir ./data -```text - -### Database Schema - -- **tasks**: Main task records with full metadata -- **task_queue**: Priority queue with scheduling info -- **users**: Authentication and RBAC -- **audit_log**: Complete operation history -- **metrics**: Performance and usage statistics -- **task_events**: Real-time event stream - -### Features - -- ✅ **ACID Transactions**: Reliable data consistency -- ✅ **Advanced Queries**: SQL-like syntax with graph support -- ✅ **Real-time Events**: Live query subscriptions -- ✅ **Built-in Auth**: User management and RBAC -- ✅ **Audit Logging**: Automatic operation tracking -- ✅ **No External Deps**: Self-contained database -- ❌ **Horizontal Scaling**: Single-node only - -### Configuration Options - -```bash -# Custom database location -./orchestrator --storage-type surrealdb-embedded \ - --data-dir /var/lib/orchestrator/db - -# With specific namespace/database -./orchestrator --storage-type surrealdb-embedded \ - --data-dir ./data \ - --surrealdb-namespace production \ - --surrealdb-database orchestrator -```text - -### Best Use Cases - -- Production single-node deployments -- Applications requiring ACID guarantees -- Advanced querying and analytics -- Real-time monitoring requirements -- Audit logging compliance - -## 3. SurrealDB Server - -### Overview - -Remote SurrealDB server connection providing full distributed database capabilities with horizontal scaling. - -### Prerequisites - -1. **SurrealDB Server**: Running instance accessible via network -2. **Authentication**: Valid credentials for database access -3. **Network**: Reliable connectivity to SurrealDB server - -### SurrealDB Server Setup - -```bash -# Install SurrealDB -curl -sSf https://install.surrealdb.com | sh - -# Start server -surreal start --log trace --user root --pass root memory - -# Or with file storage -surreal start --log trace --user root --pass root file:orchestrator.db - -# Or with TiKV (distributed) -surreal start --log trace --user root --pass root tikv://localhost:2379 -```text - -### Configuration - -```bash -# Basic server connection -./orchestrator --storage-type surrealdb-server \ - --surrealdb-url ws://localhost:8000 \ - --surrealdb-username admin \ - --surrealdb-password secret - -# Production configuration -./orchestrator --storage-type surrealdb-server \ - --surrealdb-url wss://surreal.production.com:8000 \ - --surrealdb-namespace prod \ - --surrealdb-database orchestrator \ - --surrealdb-username orchestrator-service \ - --surrealdb-password "$SURREALDB_PASSWORD" -```text - -### Features - -- ✅ **Distributed**: Multi-node clustering support -- ✅ **Horizontal Scaling**: Handle massive workloads -- ✅ **Multi-tenancy**: Namespace and database isolation -- ✅ **Real-time Collaboration**: Multiple orchestrator instances -- ✅ **Advanced Security**: Enterprise authentication -- ✅ **High Availability**: Fault-tolerant deployments -- ❌ **Complexity**: Requires server management -- ❌ **Network Dependency**: Requires reliable connectivity - -### Best Use Cases - -- Distributed production deployments -- Multiple orchestrator instances -- High availability requirements -- Large-scale task orchestration -- Multi-tenant environments - -## Migration Between Backends - -### Migration Tool - -Use the migration script to move data between any backend combination: - -```bash -# Interactive migration 
wizard -./scripts/migrate-storage.nu --interactive - -# Direct migration examples -./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded \ - --source-dir ./data --target-dir ./surrealdb-data - -./scripts/migrate-storage.nu --from surrealdb-embedded --to surrealdb-server \ - --source-dir ./data \ - --surrealdb-url ws://localhost:8000 \ - --username admin --password secret - -# Validation and dry-run -./scripts/migrate-storage.nu validate --from filesystem --to surrealdb-embedded -./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded --dry-run -```text - -### Migration Features - -- **Data Integrity**: Complete validation before and after migration -- **Progress Tracking**: Real-time progress with throughput metrics -- **Rollback Support**: Automatic rollback on failures -- **Selective Migration**: Filter by task status, date range, etc. -- **Batch Processing**: Configurable batch sizes for performance - -### Migration Scenarios - -#### Development to Production - -```bash -# Migrate from filesystem (dev) to SurrealDB embedded (production) -./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded \ - --source-dir ./dev-data --target-dir ./prod-data \ - --batch-size 100 --verify -```text - -#### Scaling Up - -```bash -# Migrate from embedded to server for distributed setup -./scripts/migrate-storage.nu --from surrealdb-embedded --to surrealdb-server \ - --source-dir ./data \ - --surrealdb-url ws://production-surreal:8000 \ - --username orchestrator --password "$PROD_PASSWORD" \ - --namespace production --database main -```text - -#### Disaster Recovery - -```bash -# Migrate from server back to filesystem for emergency backup -./scripts/migrate-storage.nu --from surrealdb-server --to filesystem \ - --surrealdb-url ws://failing-server:8000 \ - --username admin --password "$PASSWORD" \ - --target-dir ./emergency-backup -```text - -## Performance Considerations - -### Filesystem - -- **Strengths**: Low memory usage, simple debugging -- **Limitations**: File I/O bottlenecks, no concurrent writes -- **Optimization**: Fast SSD, regular cleanup of old tasks - -### SurrealDB Embedded - -- **Strengths**: Excellent single-node performance, ACID guarantees -- **Limitations**: Memory usage scales with data size -- **Optimization**: Adequate RAM, SSD storage, regular compaction - -### SurrealDB Server - -- **Strengths**: Horizontal scaling, shared state -- **Limitations**: Network latency, server dependency -- **Optimization**: Low-latency network, connection pooling, server tuning - -## Security Considerations - -### Filesystem - -- **File Permissions**: Restrict access to data directory -- **Backup Security**: Encrypt backup files -- **Network**: No network exposure - -### SurrealDB Embedded - -- **File Permissions**: Secure database files -- **Encryption**: Database-level encryption available -- **Access Control**: Built-in user management - -### SurrealDB Server - -- **Network Security**: Use TLS/WSS connections -- **Authentication**: Strong passwords, regular rotation -- **Authorization**: Role-based access control -- **Audit**: Complete operation logging - -## Troubleshooting - -### Common Issues - -#### Filesystem Backend - -```bash -# Permission issues -sudo chown -R $USER:$USER ./data -chmod -R 755 ./data - -# Corrupted JSON files -rm ./data/queue.rkvs/tasks/corrupted-file.json -```text - -#### SurrealDB Embedded - -```bash -# Database corruption -rm -rf ./data/orchestrator.db -# Restore from backup or re-initialize - -# Permission issues -sudo chown 
-R $USER:$USER ./data -```text - -#### SurrealDB Server - -```bash -# Connection issues -telnet surreal-server 8000 -# Check server status and network connectivity - -# Authentication failures -# Verify credentials and user permissions -```text - -### Debugging Commands - -```bash -# List available storage types -./orchestrator --help | grep storage-type - -# Validate configuration -./orchestrator --storage-type filesystem --data-dir ./data --dry-run - -# Test migration -./scripts/migrate-storage.nu validate --from filesystem --to surrealdb-embedded - -# Monitor migration progress -./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded --verbose -```text - -## Recommendations - -### Development - -- **Use**: Filesystem backend -- **Rationale**: Simple setup, easy debugging, no external dependencies - -### Single-Node Production - -- **Use**: SurrealDB Embedded -- **Rationale**: ACID guarantees, advanced features, no external dependencies - -### Distributed Production - -- **Use**: SurrealDB Server -- **Rationale**: Horizontal scaling, high availability, multi-instance support - -### Migration Path - -1. **Start**: Filesystem (development) -2. **Scale**: SurrealDB Embedded (single-node production) -3. **Distribute**: SurrealDB Server (multi-node production) - -This progressive approach allows teams to start simple and scale as requirements grow, with seamless migration between each stage. +# Storage Backends Guide\n\nThis document provides comprehensive guidance on the orchestrator's storage backend options, configuration, and migration between them.\n\n## Overview\n\nThe orchestrator supports three storage backends through a pluggable architecture:\n\n1. **Filesystem** - JSON file-based storage (default)\n2. **SurrealDB Embedded** - Local database with RocksDB engine\n3. **SurrealDB Server** - Remote SurrealDB server connection\n\nAll backends implement the same `TaskStorage` trait, ensuring consistent behavior and seamless migration.\n\n## Backend Comparison\n\n| Feature | Filesystem | SurrealDB Embedded | SurrealDB Server |\n| --------- | ------------ | ------------------- | ------------------ |\n| **Setup Complexity** | Minimal | Low | Medium |\n| **External Dependencies** | None | None | SurrealDB Server |\n| **Storage Format** | JSON Files | RocksDB | Remote DB |\n| **ACID Transactions** | No | Yes | Yes |\n| **Authentication/RBAC** | Basic | Advanced | Advanced |\n| **Real-time Subscriptions** | No | Yes | Yes |\n| **Audit Logging** | Manual | Automatic | Automatic |\n| **Metrics Collection** | Basic | Advanced | Advanced |\n| **Task Dependencies** | Simple | Graph-based | Graph-based |\n| **Horizontal Scaling** | No | No | Yes |\n| **Backup/Recovery** | File Copy | Database Backup | Server Backup |\n| **Performance** | Good | Excellent | Variable |\n| **Memory Usage** | Low | Medium | Low |\n| **Disk Usage** | Medium | Optimized | Minimal |\n\n## 1. Filesystem Backend\n\n### Overview\n\nThe default storage backend using JSON files for task persistence. 
Ideal for development and simple deployments.\n\n### Configuration\n\n```\n# Default configuration\n./orchestrator --storage-type filesystem --data-dir ./data\n\n# Custom data directory\n./orchestrator --storage-type filesystem --data-dir /var/lib/orchestrator\n```\n\n### File Structure\n\n```\ndata/\n└── queue.rkvs/\n ├── tasks/\n │ ├── uuid1.json # Individual task records\n │ ├── uuid2.json\n │ └── ...\n └── queue/\n ├── uuid1.json # Queue entries with priority\n ├── uuid2.json\n └── ...\n```\n\n### Features\n\n- ✅ **Simple Setup**: No external dependencies\n- ✅ **Transparency**: Human-readable JSON files\n- ✅ **Backup**: Standard file system tools\n- ✅ **Debugging**: Direct file inspection\n- ❌ **ACID**: No transaction guarantees\n- ❌ **Concurrency**: Basic file locking\n- ❌ **Advanced Features**: Limited auth/audit\n\n### Best Use Cases\n\n- Development environments\n- Single-instance deployments\n- Simple task orchestration\n- Environments with strict dependency requirements\n\n## 2. SurrealDB Embedded\n\n### Overview\n\nLocal SurrealDB database using RocksDB storage engine. Provides advanced database features without external dependencies.\n\n### Configuration\n\n```\n# Build with SurrealDB support\ncargo build --features surrealdb\n\n# Run with embedded SurrealDB\n./orchestrator --storage-type surrealdb-embedded --data-dir ./data\n```\n\n### Database Schema\n\n- **tasks**: Main task records with full metadata\n- **task_queue**: Priority queue with scheduling info\n- **users**: Authentication and RBAC\n- **audit_log**: Complete operation history\n- **metrics**: Performance and usage statistics\n- **task_events**: Real-time event stream\n\n### Features\n\n- ✅ **ACID Transactions**: Reliable data consistency\n- ✅ **Advanced Queries**: SQL-like syntax with graph support\n- ✅ **Real-time Events**: Live query subscriptions\n- ✅ **Built-in Auth**: User management and RBAC\n- ✅ **Audit Logging**: Automatic operation tracking\n- ✅ **No External Deps**: Self-contained database\n- ❌ **Horizontal Scaling**: Single-node only\n\n### Configuration Options\n\n```\n# Custom database location\n./orchestrator --storage-type surrealdb-embedded \\n --data-dir /var/lib/orchestrator/db\n\n# With specific namespace/database\n./orchestrator --storage-type surrealdb-embedded \\n --data-dir ./data \\n --surrealdb-namespace production \\n --surrealdb-database orchestrator\n```\n\n### Best Use Cases\n\n- Production single-node deployments\n- Applications requiring ACID guarantees\n- Advanced querying and analytics\n- Real-time monitoring requirements\n- Audit logging compliance\n\n## 3. SurrealDB Server\n\n### Overview\n\nRemote SurrealDB server connection providing full distributed database capabilities with horizontal scaling.\n\n### Prerequisites\n\n1. **SurrealDB Server**: Running instance accessible via network\n2. **Authentication**: Valid credentials for database access\n3. 
**Network**: Reliable connectivity to SurrealDB server\n\n### SurrealDB Server Setup\n\n```\n# Install SurrealDB\ncurl -sSf https://install.surrealdb.com | sh\n\n# Start server\nsurreal start --log trace --user root --pass root memory\n\n# Or with file storage\nsurreal start --log trace --user root --pass root file:orchestrator.db\n\n# Or with TiKV (distributed)\nsurreal start --log trace --user root --pass root tikv://localhost:2379\n```\n\n### Configuration\n\n```\n# Basic server connection\n./orchestrator --storage-type surrealdb-server \\n --surrealdb-url ws://localhost:8000 \\n --surrealdb-username admin \\n --surrealdb-password secret\n\n# Production configuration\n./orchestrator --storage-type surrealdb-server \\n --surrealdb-url wss://surreal.production.com:8000 \\n --surrealdb-namespace prod \\n --surrealdb-database orchestrator \\n --surrealdb-username orchestrator-service \\n --surrealdb-password "$SURREALDB_PASSWORD"\n```\n\n### Features\n\n- ✅ **Distributed**: Multi-node clustering support\n- ✅ **Horizontal Scaling**: Handle massive workloads\n- ✅ **Multi-tenancy**: Namespace and database isolation\n- ✅ **Real-time Collaboration**: Multiple orchestrator instances\n- ✅ **Advanced Security**: Enterprise authentication\n- ✅ **High Availability**: Fault-tolerant deployments\n- ❌ **Complexity**: Requires server management\n- ❌ **Network Dependency**: Requires reliable connectivity\n\n### Best Use Cases\n\n- Distributed production deployments\n- Multiple orchestrator instances\n- High availability requirements\n- Large-scale task orchestration\n- Multi-tenant environments\n\n## Migration Between Backends\n\n### Migration Tool\n\nUse the migration script to move data between any backend combination:\n\n```\n# Interactive migration wizard\n./scripts/migrate-storage.nu --interactive\n\n# Direct migration examples\n./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded \\n --source-dir ./data --target-dir ./surrealdb-data\n\n./scripts/migrate-storage.nu --from surrealdb-embedded --to surrealdb-server \\n --source-dir ./data \\n --surrealdb-url ws://localhost:8000 \\n --username admin --password secret\n\n# Validation and dry-run\n./scripts/migrate-storage.nu validate --from filesystem --to surrealdb-embedded\n./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded --dry-run\n```\n\n### Migration Features\n\n- **Data Integrity**: Complete validation before and after migration\n- **Progress Tracking**: Real-time progress with throughput metrics\n- **Rollback Support**: Automatic rollback on failures\n- **Selective Migration**: Filter by task status, date range, etc.\n- **Batch Processing**: Configurable batch sizes for performance\n\n### Migration Scenarios\n\n#### Development to Production\n\n```\n# Migrate from filesystem (dev) to SurrealDB embedded (production)\n./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded \\n --source-dir ./dev-data --target-dir ./prod-data \\n --batch-size 100 --verify\n```\n\n#### Scaling Up\n\n```\n# Migrate from embedded to server for distributed setup\n./scripts/migrate-storage.nu --from surrealdb-embedded --to surrealdb-server \\n --source-dir ./data \\n --surrealdb-url ws://production-surreal:8000 \\n --username orchestrator --password "$PROD_PASSWORD" \\n --namespace production --database main\n```\n\n#### Disaster Recovery\n\n```\n# Migrate from server back to filesystem for emergency backup\n./scripts/migrate-storage.nu --from surrealdb-server --to filesystem \\n --surrealdb-url 
ws://failing-server:8000 \\n --username admin --password "$PASSWORD" \\n --target-dir ./emergency-backup\n```\n\n## Performance Considerations\n\n### Filesystem\n\n- **Strengths**: Low memory usage, simple debugging\n- **Limitations**: File I/O bottlenecks, no concurrent writes\n- **Optimization**: Fast SSD, regular cleanup of old tasks\n\n### SurrealDB Embedded\n\n- **Strengths**: Excellent single-node performance, ACID guarantees\n- **Limitations**: Memory usage scales with data size\n- **Optimization**: Adequate RAM, SSD storage, regular compaction\n\n### SurrealDB Server\n\n- **Strengths**: Horizontal scaling, shared state\n- **Limitations**: Network latency, server dependency\n- **Optimization**: Low-latency network, connection pooling, server tuning\n\n## Security Considerations\n\n### Filesystem\n\n- **File Permissions**: Restrict access to data directory\n- **Backup Security**: Encrypt backup files\n- **Network**: No network exposure\n\n### SurrealDB Embedded\n\n- **File Permissions**: Secure database files\n- **Encryption**: Database-level encryption available\n- **Access Control**: Built-in user management\n\n### SurrealDB Server\n\n- **Network Security**: Use TLS/WSS connections\n- **Authentication**: Strong passwords, regular rotation\n- **Authorization**: Role-based access control\n- **Audit**: Complete operation logging\n\n## Troubleshooting\n\n### Common Issues\n\n#### Filesystem Backend\n\n```\n# Permission issues\nsudo chown -R $USER:$USER ./data\nchmod -R 755 ./data\n\n# Corrupted JSON files\nrm ./data/queue.rkvs/tasks/corrupted-file.json\n```\n\n#### SurrealDB Embedded\n\n```\n# Database corruption\nrm -rf ./data/orchestrator.db\n# Restore from backup or re-initialize\n\n# Permission issues\nsudo chown -R $USER:$USER ./data\n```\n\n#### SurrealDB Server\n\n```\n# Connection issues\ntelnet surreal-server 8000\n# Check server status and network connectivity\n\n# Authentication failures\n# Verify credentials and user permissions\n```\n\n### Debugging Commands\n\n```\n# List available storage types\n./orchestrator --help | grep storage-type\n\n# Validate configuration\n./orchestrator --storage-type filesystem --data-dir ./data --dry-run\n\n# Test migration\n./scripts/migrate-storage.nu validate --from filesystem --to surrealdb-embedded\n\n# Monitor migration progress\n./scripts/migrate-storage.nu --from filesystem --to surrealdb-embedded --verbose\n```\n\n## Recommendations\n\n### Development\n\n- **Use**: Filesystem backend\n- **Rationale**: Simple setup, easy debugging, no external dependencies\n\n### Single-Node Production\n\n- **Use**: SurrealDB Embedded\n- **Rationale**: ACID guarantees, advanced features, no external dependencies\n\n### Distributed Production\n\n- **Use**: SurrealDB Server\n- **Rationale**: Horizontal scaling, high availability, multi-instance support\n\n### Migration Path\n\n1. **Start**: Filesystem (development)\n2. **Scale**: SurrealDB Embedded (single-node production)\n3. **Distribute**: SurrealDB Server (multi-node production)\n\nThis progressive approach allows teams to start simple and scale as requirements grow, with seamless migration between each stage. 
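Under the hood, a backend-to-backend migration reduces to reading every task from the source and re-writing it to the target. A simplified sketch, mirroring the illustrative `TaskStorage` trait above; `list_tasks` is an assumed enumeration method, and the real migration tool additionally batches writes, reports progress, and can roll back on failure:

```rust
/// Copy all tasks from one backend to another, then verify the count.
async fn migrate(
    source: &dyn TaskStorage,
    target: &dyn TaskStorage,
) -> Result<usize, StorageError> {
    // Assumed: list_tasks() -> Result<Vec<(Task, u8)>, StorageError>
    let tasks = source.list_tasks().await?;
    let expected = tasks.len();
    for (task, priority) in tasks {
        target.enqueue(task, priority).await?;
    }
    // Post-migration verification: task counts must match.
    let migrated = target.list_tasks().await?.len();
    if migrated != expected {
        return Err(StorageError(format!(
            "verification failed: expected {expected} tasks, found {migrated}"
        )));
    }
    Ok(migrated)
}
```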
\ No newline at end of file
diff --git a/crates/orchestrator/scripts/migrate-storage.nu b/crates/orchestrator/scripts/migrate-storage.nu
old mode 100755
new mode 100644
diff --git a/crates/orchestrator/wrks/readme-testing.md b/crates/orchestrator/wrks/readme-testing.md
index eca6281..d4056eb 100644
--- a/crates/orchestrator/wrks/readme-testing.md
+++ b/crates/orchestrator/wrks/readme-testing.md
@@ -1,392 +1 @@
-# Testing Guide for Multi-Storage Orchestrator
-
-This document provides comprehensive guidance for testing the multi-storage orchestrator system,
-including unit tests, integration tests, benchmarks, and performance analysis.
-
-## Overview
-
-The orchestrator uses a multi-tiered testing approach:
-
-1. **Unit Tests**: Test individual components in isolation
-2. **Integration Tests**: Test complete workflows across storage backends
-3. **Migration Tests**: Validate data migration between backends
-4. **Factory Tests**: Test configuration and backend selection
-5. **Benchmarks**: Performance testing and regression detection
-
-## Test Structure
-
-```plaintext
-tests/
-├── helpers/mod.rs            # Test utilities and mock implementations
-├── storage_integration.rs    # Cross-backend integration tests
-├── migration_tests.rs        # Migration validation tests
-└── factory_tests.rs          # Factory and configuration tests
-
-benches/
-├── storage_benchmarks.rs     # Storage performance benchmarks
-└── migration_benchmarks.rs   # Migration performance benchmarks
-
-src/
-├── storage/                  # Unit tests embedded in modules
-├── migration/tests.rs        # Migration unit tests
-└── main.rs                   # Application integration tests
-```text
-
-## Running Tests
-
-### Basic Test Commands
-
-```bash
-# Run all tests (filesystem backend only)
-cargo test
-
-# Run all tests with SurrealDB backends
-cargo test --features surrealdb
-
-# Run specific test suites
-cargo test --test storage_integration
-cargo test --test migration_tests
-cargo test --test factory_tests
-
-# Run unit tests only
-cargo test --lib
-```text
-
-### Using Cargo Aliases
-
-The project includes convenient aliases (defined in `.cargo/config.toml`):
-
-```bash
-# Test all backends with all features
-cargo test-all
-
-# Test only filesystem backend
-cargo test-fs
-
-# Test with SurrealDB features
-cargo test-surrealdb
-
-# Test specific areas
-cargo test-integration
-cargo test-migration
-cargo test-factory
-cargo test-unit
-```text
-
-## Test Features and Backends
-
-### Backend Support
-
-- **Filesystem**: Always available, no additional dependencies
-- **SurrealDB Embedded**: Requires `--features surrealdb`
-- **SurrealDB Server**: Requires `--features surrealdb`
-
-### Feature-Gated Tests
-
-Tests automatically adapt to available features:
-
-```rust
-#[cfg(feature = "surrealdb")]
-#[tokio::test]
-async fn test_surrealdb_specific_feature() {
-    // This test only runs when SurrealDB feature is enabled
-}
-```text
-
-## Integration Tests
-
-### Storage Integration Tests
-
-Location: `tests/storage_integration.rs`
-
-These tests verify consistent behavior across all storage backends:
-
-```rust
-// Example: Test runs against all available backends
-test_all_backends!(test_basic_crud_operations, |storage, gen| async move {
-    let task = gen.workflow_task();
-    storage.enqueue(task.clone(), 1).await?;
-    // ... test implementation
-    Ok(())
-});
-```text
-
-**Key Test Scenarios:**
-
-- Basic CRUD operations
-- Queue management and priorities
-- Task status updates
-- Batch operations
-- Search and filtering
-- Concurrent operations
-- Error handling
-- Performance characteristics
-
-### Migration Tests
-
-Location: `tests/migration_tests.rs`
-
-Validates data migration between all backend combinations:
-
-```bash
-# Run migration tests
-cargo test --features surrealdb --test migration_tests
-
-# Test specific migration scenarios
-cargo test --features surrealdb test_filesystem_to_embedded_migration
-cargo test --features surrealdb test_large_dataset_migration_performance
-```text
-
-**Migration Test Coverage:**
-
-- Data integrity verification
-- Rollback functionality
-- Progress tracking
-- Error recovery
-- Performance scaling
-- Filtering and batch operations
-
-### Factory Tests
-
-Location: `tests/factory_tests.rs`
-
-Tests configuration validation and backend selection:
-
-```bash
-# Run factory tests
-cargo test --test factory_tests
-
-# Test configuration validation
-cargo test test_storage_config_validation_failures
-```text
-
-## Benchmarks
-
-### Storage Benchmarks
-
-Location: `benches/storage_benchmarks.rs`
-
-```bash
-# Run all storage benchmarks
-cargo bench-storage
-
-# Run specific backend benchmarks
-cargo bench-fs
-cargo bench-surrealdb  # Requires --features surrealdb
-
-# Run specific benchmark categories
-cargo bench -- single_enqueue
-cargo bench -- batch_operations
-cargo bench -- concurrent_operations
-```text
-
-**Benchmark Categories:**
-
-- Single operations (enqueue/dequeue)
-- Batch operations
-- Search and retrieval
-- Concurrent operations
-- Cleanup operations
-
-### Migration Benchmarks
-
-Location: `benches/migration_benchmarks.rs`
-
-```bash
-# Run migration benchmarks
-cargo bench-migration
-
-# Test migration performance
-cargo bench -- basic_migration
-cargo bench -- migration_batch_sizes
-```text
-
-**Migration Benchmarks:**
-
-- Basic migration throughput
-- Batch size optimization
-- Verification overhead
-- Progress tracking overhead
-- Dry run performance
-
-## Test Helpers and Utilities
-
-### TestDataGenerator
-
-Provides consistent test data across all tests:
-
-```rust
-use crate::helpers::TestDataGenerator;
-
-let gen = TestDataGenerator::new();
-let task = gen.workflow_task();
-let batch = gen.workflow_tasks_batch(10);
-```text
-
-### StorageTestRunner
-
-Runs tests against all available storage backends:
-
-```rust
-use crate::helpers::StorageTestRunner;
-
-let mut runner = StorageTestRunner::new();
-runner.run_against_all_backends(test_function).await;
-```text
-
-### MockStorage
-
-Mock implementation for testing migration scenarios:
-
-```rust
-use crate::helpers::MockStorage;
-
-let mock = MockStorage::new();
-mock.set_health(false); // Simulate failure
-```text
-
-## Performance Testing
-
-### Benchmark Configuration
-
-Benchmarks are configured with:
-
-- Small sample sizes for expensive operations
-- Throughput measurement for batch operations
-- Memory usage tracking
-- Concurrent operation testing
-
-### Performance Targets
-
-**Storage Operations:**
-
-- Single enqueue: < 1ms average
-- Batch enqueue (100 tasks): < 100ms average
-- Task retrieval: < 0.5ms average
-- Search operations: < 50ms average
-
-**Migration Operations:**
-
-- Small dataset (100 tasks): < 5 seconds
-- Large dataset (1000 tasks): < 30 seconds
-- Throughput: > 10 tasks/second
-
-## Continuous Integration
-
-### CI Test Matrix
-
-```yaml
-# Example CI configuration
-strategy: - matrix: - features: - - "" # Filesystem only - - "surrealdb" # All backends - rust: - - stable - - beta -```text - -### Test Commands for CI - -```bash -# Basic functionality tests -cargo test --no-default-features -cargo test --all-features - -# Documentation tests -cargo test --doc --all-features - -# Benchmark regression tests -cargo bench --all-features -- --test -```text - -## Debugging and Troubleshooting - -### Verbose Test Output - -```bash -# Enable detailed logging -RUST_LOG=debug cargo test --features surrealdb - -# Show test output -cargo test -- --nocapture - -# Run single test with full output -cargo test test_name -- --exact --nocapture -```text - -### Common Issues - -1. **SurrealDB tests failing**: Ensure `--features surrealdb` is specified -2. **Temporary directory errors**: Tests clean up automatically, but manual cleanup may be needed -3. **Port conflicts**: Tests use ephemeral ports, but conflicts can occur -4. **Timing issues**: Some tests use sleeps for async operations - -### Test Data Isolation - -- Each test uses unique temporary directories -- Mock storage is reset between tests -- Concurrent tests use separate data spaces -- Cleanup is automatic via `Drop` implementations - -## Coverage Analysis - -```bash -# Generate coverage report -cargo install cargo-tarpaulin -cargo test-coverage - -# View coverage report -open target/tarpaulin-report.html -```text - -## Performance Profiling - -```bash -# Profile storage operations -cargo bench --bench storage_benchmarks -- --profile-time=10 - -# Profile migration operations -cargo bench --bench migration_benchmarks -- --profile-time=10 - -# Generate flame graphs -cargo install flamegraph -cargo flamegraph --bench storage_benchmarks -```text - -## Best Practices - -### Writing Tests - -1. **Use descriptive test names** that explain what is being tested -2. **Test error conditions** as well as success paths -3. **Use feature gates** for backend-specific tests -4. **Clean up resources** using RAII patterns -5. **Test concurrency** where applicable - -### Test Data - -1. **Use test generators** for consistent data -2. **Test with realistic data sizes** -3. **Include edge cases** (empty data, large data, malformed data) -4. **Use deterministic data** where possible - -### Performance Testing - -1. **Set appropriate baselines** for performance regression -2. **Test with various data sizes** to understand scaling -3. **Include warmup iterations** for accurate measurements -4. **Document performance expectations** in code comments - -## Contributing - -When adding new features: - -1. Add unit tests for new components -2. Update integration tests for new storage methods -3. Add migration tests for new backends -4. Update benchmarks for performance-critical code -5. Document any new test utilities - -For more information on the storage architecture and API, see the main project documentation. +# Testing Guide for Multi-Storage Orchestrator\n\nThis document provides comprehensive guidance for testing the multi-storage orchestrator system,\nincluding unit tests, integration tests, benchmarks, and performance analysis.\n\n## Overview\n\nThe orchestrator uses a multi-tiered testing approach:\n\n1. **Unit Tests**: Test individual components in isolation\n2. **Integration Tests**: Test complete workflows across storage backends\n3. **Migration Tests**: Validate data migration between backends\n4. **Factory Tests**: Test configuration and backend selection\n5. 
**Benchmarks**: Performance testing and regression detection\n\n## Test Structure\n\n```\ntests/\n├── helpers/mod.rs # Test utilities and mock implementations\n├── storage_integration.rs # Cross-backend integration tests\n├── migration_tests.rs # Migration validation tests\n└── factory_tests.rs # Factory and configuration tests\n\nbenches/\n├── storage_benchmarks.rs # Storage performance benchmarks\n└── migration_benchmarks.rs # Migration performance benchmarks\n\nsrc/\n├── storage/ # Unit tests embedded in modules\n├── migration/tests.rs # Migration unit tests\n└── main.rs # Application integration tests\n```\n\n## Running Tests\n\n### Basic Test Commands\n\n```\n# Run all tests (filesystem backend only)\ncargo test\n\n# Run all tests with SurrealDB backends\ncargo test --features surrealdb\n\n# Run specific test suites\ncargo test --test storage_integration\ncargo test --test migration_tests\ncargo test --test factory_tests\n\n# Run unit tests only\ncargo test --lib\n```\n\n### Using Cargo Aliases\n\nThe project includes convenient aliases (defined in `.cargo/config.toml`):\n\n```\n# Test all backends with all features\ncargo test-all\n\n# Test only filesystem backend\ncargo test-fs\n\n# Test with SurrealDB features\ncargo test-surrealdb\n\n# Test specific areas\ncargo test-integration\ncargo test-migration\ncargo test-factory\ncargo test-unit\n```\n\n## Test Features and Backends\n\n### Backend Support\n\n- **Filesystem**: Always available, no additional dependencies\n- **SurrealDB Embedded**: Requires `--features surrealdb`\n- **SurrealDB Server**: Requires `--features surrealdb`\n\n### Feature-Gated Tests\n\nTests automatically adapt to available features:\n\n```\n#[cfg(feature = "surrealdb")]\n#[tokio::test]\nasync fn test_surrealdb_specific_feature() {\n // This test only runs when SurrealDB feature is enabled\n}\n```\n\n## Integration Tests\n\n### Storage Integration Tests\n\nLocation: `tests/storage_integration.rs`\n\nThese tests verify consistent behavior across all storage backends:\n\n```\n// Example: Test runs against all available backends\ntest_all_backends!(test_basic_crud_operations, |storage, gen| async move {\n let task = gen.workflow_task();\n storage.enqueue(task.clone(), 1).await?;\n // ... 
test implementation\n Ok(())\n});\n```\n\n**Key Test Scenarios:**\n\n- Basic CRUD operations\n- Queue management and priorities\n- Task status updates\n- Batch operations\n- Search and filtering\n- Concurrent operations\n- Error handling\n- Performance characteristics\n\n### Migration Tests\n\nLocation: `tests/migration_tests.rs`\n\nValidates data migration between all backend combinations:\n\n```\n# Run migration tests\ncargo test --features surrealdb --test migration_tests\n\n# Test specific migration scenarios\ncargo test --features surrealdb test_filesystem_to_embedded_migration\ncargo test --features surrealdb test_large_dataset_migration_performance\n```\n\n**Migration Test Coverage:**\n\n- Data integrity verification\n- Rollback functionality\n- Progress tracking\n- Error recovery\n- Performance scaling\n- Filtering and batch operations\n\n### Factory Tests\n\nLocation: `tests/factory_tests.rs`\n\nTests configuration validation and backend selection:\n\n```\n# Run factory tests\ncargo test --test factory_tests\n\n# Test configuration validation\ncargo test test_storage_config_validation_failures\n```\n\n## Benchmarks\n\n### Storage Benchmarks\n\nLocation: `benches/storage_benchmarks.rs`\n\n```\n# Run all storage benchmarks\ncargo bench-storage\n\n# Run specific backend benchmarks\ncargo bench-fs\ncargo bench-surrealdb # Requires --features surrealdb\n\n# Run specific benchmark categories\ncargo bench -- single_enqueue\ncargo bench -- batch_operations\ncargo bench -- concurrent_operations\n```\n\n**Benchmark Categories:**\n\n- Single operations (enqueue/dequeue)\n- Batch operations\n- Search and retrieval\n- Concurrent operations\n- Cleanup operations\n\n### Migration Benchmarks\n\nLocation: `benches/migration_benchmarks.rs`\n\n```\n# Run migration benchmarks\ncargo bench-migration\n\n# Test migration performance\ncargo bench -- basic_migration\ncargo bench -- migration_batch_sizes\n```\n\n**Migration Benchmarks:**\n\n- Basic migration throughput\n- Batch size optimization\n- Verification overhead\n- Progress tracking overhead\n- Dry run performance\n\n## Test Helpers and Utilities\n\n### TestDataGenerator\n\nProvides consistent test data across all tests:\n\n```\nuse crate::helpers::TestDataGenerator;\n\nlet gen = TestDataGenerator::new();\nlet task = gen.workflow_task();\nlet batch = gen.workflow_tasks_batch(10);\n```\n\n### StorageTestRunner\n\nRuns tests against all available storage backends:\n\n```\nuse crate::helpers::StorageTestRunner;\n\nlet mut runner = StorageTestRunner::new();\nrunner.run_against_all_backends(test_function).await;\n```\n\n### MockStorage\n\nMock implementation for testing migration scenarios:\n\n```\nuse crate::helpers::MockStorage;\n\nlet mock = MockStorage::new();\nmock.set_health(false); // Simulate failure\n```\n\n## Performance Testing\n\n### Benchmark Configuration\n\nBenchmarks are configured with:\n\n- Small sample sizes for expensive operations\n- Throughput measurement for batch operations\n- Memory usage tracking\n- Concurrent operation testing\n\n### Performance Targets\n\n**Storage Operations:**\n\n- Single enqueue: < 1ms average\n- Batch enqueue (100 tasks): < 100ms average\n- Task retrieval: < 0.5ms average\n- Search operations: < 50ms average\n\n**Migration Operations:**\n\n- Small dataset (100 tasks): < 5 seconds\n- Large dataset (1000 tasks): < 30 seconds\n- Throughput: > 10 tasks/second\n\n## Continuous Integration\n\n### CI Test Matrix\n\n```\n# Example CI configuration\nstrategy:\n matrix:\n features:\n - "" # Filesystem only\n - 
"surrealdb" # All backends\n rust:\n - stable\n - beta\n```\n\n### Test Commands for CI\n\n```\n# Basic functionality tests\ncargo test --no-default-features\ncargo test --all-features\n\n# Documentation tests\ncargo test --doc --all-features\n\n# Benchmark regression tests\ncargo bench --all-features -- --test\n```\n\n## Debugging and Troubleshooting\n\n### Verbose Test Output\n\n```\n# Enable detailed logging\nRUST_LOG=debug cargo test --features surrealdb\n\n# Show test output\ncargo test -- --nocapture\n\n# Run single test with full output\ncargo test test_name -- --exact --nocapture\n```\n\n### Common Issues\n\n1. **SurrealDB tests failing**: Ensure `--features surrealdb` is specified\n2. **Temporary directory errors**: Tests clean up automatically, but manual cleanup may be needed\n3. **Port conflicts**: Tests use ephemeral ports, but conflicts can occur\n4. **Timing issues**: Some tests use sleeps for async operations\n\n### Test Data Isolation\n\n- Each test uses unique temporary directories\n- Mock storage is reset between tests\n- Concurrent tests use separate data spaces\n- Cleanup is automatic via `Drop` implementations\n\n## Coverage Analysis\n\n```\n# Generate coverage report\ncargo install cargo-tarpaulin\ncargo test-coverage\n\n# View coverage report\nopen target/tarpaulin-report.html\n```\n\n## Performance Profiling\n\n```\n# Profile storage operations\ncargo bench --bench storage_benchmarks -- --profile-time=10\n\n# Profile migration operations\ncargo bench --bench migration_benchmarks -- --profile-time=10\n\n# Generate flame graphs\ncargo install flamegraph\ncargo flamegraph --bench storage_benchmarks\n```\n\n## Best Practices\n\n### Writing Tests\n\n1. **Use descriptive test names** that explain what is being tested\n2. **Test error conditions** as well as success paths\n3. **Use feature gates** for backend-specific tests\n4. **Clean up resources** using RAII patterns\n5. **Test concurrency** where applicable\n\n### Test Data\n\n1. **Use test generators** for consistent data\n2. **Test with realistic data sizes**\n3. **Include edge cases** (empty data, large data, malformed data)\n4. **Use deterministic data** where possible\n\n### Performance Testing\n\n1. **Set appropriate baselines** for performance regression\n2. **Test with various data sizes** to understand scaling\n3. **Include warmup iterations** for accurate measurements\n4. **Document performance expectations** in code comments\n\n## Contributing\n\nWhen adding new features:\n\n1. Add unit tests for new components\n2. Update integration tests for new storage methods\n3. Add migration tests for new backends\n4. Update benchmarks for performance-critical code\n5. Document any new test utilities\n\nFor more information on the storage architecture and API, see the main project documentation. \ No newline at end of file diff --git a/crates/vault-service/README.md b/crates/vault-service/README.md index bf30407..8607943 100644 --- a/crates/vault-service/README.md +++ b/crates/vault-service/README.md @@ -1,467 +1 @@ -# KMS Service - Key Management Service - -A unified Key Management Service for the Provisioning platform with support for multiple backends: **Age** (development), -**Cosmian KMS** (privacy-preserving), **RustyVault** (self-hosted), **AWS KMS** (cloud-native), and **HashiCorp Vault** (enterprise). 
- -## Features - -### Age Backend (Development) - -- ✅ Fast, offline encryption/decryption -- ✅ No server required - local key files -- ✅ Simple setup with age-keygen -- ✅ Perfect for development and testing -- ✅ Zero network dependency - -### RustyVault Backend (Self-hosted) ✨ NEW - -- ✅ Vault-compatible API (drop-in replacement) -- ✅ Pure Rust implementation -- ✅ Self-hosted secrets management -- ✅ Apache 2.0 license (OSI-approved) -- ✅ Transit secrets engine support -- ✅ Embeddable or standalone -- ✅ No vendor lock-in - -### Cosmian KMS Backend (Production) - -- ✅ Enterprise-grade key management -- ✅ Confidential computing support (SGX/SEV) -- ✅ Zero-knowledge architecture -- ✅ Server-side key rotation -- ✅ Audit logging and compliance -- ✅ Multi-tenant support - -### Security Features - -- ✅ TLS for all communications (Cosmian) -- ✅ Context-based encryption (AAD) -- ✅ Automatic key rotation (Cosmian) -- ✅ Data key generation (Cosmian) -- ✅ Health monitoring -- ✅ Operation metrics - -## Architecture - -```plaintext -┌───────────────────────────────────────────────── -────────┐ -│ KMS Service │ -├───────────────────────────────────────────────── -────────┤ -│ REST API (Axum) │ -│ ├─ /api/v1/kms/encrypt POST │ -│ ├─ /api/v1/kms/decrypt POST │ -│ ├─ /api/v1/kms/generate-key POST (Cosmian only) │ -│ ├─ /api/v1/kms/status GET │ -│ └─ /api/v1/kms/health GET │ -├───────────────────────────────────────────────── -────────┤ -│ Unified KMS Service Interface │ -│ ├─ encrypt(plaintext, context) -> ciphertext │ -│ ├─ decrypt(ciphertext, context) -> plaintext │ -│ ├─ generate_data_key(spec) -> DataKey │ -│ └─ health_check() -> bool │ -├───────────────────────────────────────────────── -────────┤ -│ Backend Implementations │ -│ ├─ Age Client │ -│ │ ├─ X25519 encryption │ -│ │ ├─ Local key files │ -│ │ └─ Offline operation │ -│ └─ Cosmian KMS Client │ -│ ├─ REST API integration │ -│ ├─ Zero-knowledge encryption │ -│ └─ Confidential computing │ -└───────────────────────────────────────────────── -────────┘ -```text - -## Installation - -### Prerequisites - -- Rust 1.70+ (for building) -- Age 1.2+ (for development backend) -- Cosmian KMS server (for production backend) -- Nushell 0.107+ (for CLI integration) - -### Build from Source - -```bash -cd provisioning/platform/kms-service -cargo build --release - -# Binary will be at: target/release/kms-service -```text - -## Configuration - -### Configuration File - -Create `provisioning/config/kms.toml`: - -```toml -[kms] -dev_backend = "age" -prod_backend = "cosmian" -environment = "${PROVISIONING_ENV:-dev}" - -[kms.age] -public_key_path = "~/.config/provisioning/age/public_key.txt" -private_key_path = "~/.config/provisioning/age/private_key.txt" - -[kms.cosmian] -server_url = "${COSMIAN_KMS_URL:-https://kms.example.com}" -api_key = "${COSMIAN_API_KEY}" -default_key_id = "provisioning-master-key" -tls_verify = true -```text - -### Environment Variables - -```bash -# Development with Age -export PROVISIONING_ENV=dev -# Age keys will be read from paths in config - -# Production with Cosmian -export PROVISIONING_ENV=prod -export COSMIAN_KMS_URL="https://kms.example.com" -export COSMIAN_API_KEY="your-api-key" -```text - -## Quick Start - -### Development Setup (Age) - -```bash -# 1. Generate Age keys -mkdir -p ~/.config/provisioning/age -age-keygen -o ~/.config/provisioning/age/private_key.txt -age-keygen -y ~/.config/provisioning/age/private_key.txt > ~/.config/provisioning/age/public_key.txt - -# 2. Set environment -export PROVISIONING_ENV=dev - -# 3. 
Start KMS service -cargo run --bin kms-service -```text - -### Production Setup (Cosmian) - -```bash -# 1. Set up Cosmian KMS server (or use hosted service) - -# 2. Create master key in Cosmian KMS -# (Use Cosmian KMS CLI or web interface) - -# 3. Set environment variables -export PROVISIONING_ENV=prod -export COSMIAN_KMS_URL=https://your-kms.example.com -export COSMIAN_API_KEY=your-api-key-here - -# 4. Start KMS service -cargo run --bin kms-service -```text - -## Usage - -### REST API Examples - -#### Encrypt Data - -```bash -curl -X POST http://localhost:8082/api/v1/kms/encrypt \ - -H "Content-Type: application/json" \ - -d '{ - "plaintext": "SGVsbG8sIFdvcmxkIQ==", - "context": "env=prod,service=api" - }' -```text - -#### Decrypt Data - -```bash -curl -X POST http://localhost:8082/api/v1/kms/decrypt \ - -H "Content-Type: application/json" \ - -d '{ - "ciphertext": "...", - "context": "env=prod,service=api" - }' -```text - -#### Generate Data Key (Cosmian only) - -```bash -curl -X POST http://localhost:8082/api/v1/kms/generate-key \ - -H "Content-Type: application/json" \ - -d '{ - "key_spec": "AES_256" - }' -```text - -#### Health Check - -```bash -curl http://localhost:8082/api/v1/kms/health -```text - -### Nushell CLI Integration - -```bash -# Load the KMS module -use provisioning/core/nulib/kms - -# Set service URL -export KMS_SERVICE_URL="http://localhost:8082" - -# Encrypt data -"secret-data" | kms encrypt -"api-key" | kms encrypt --context "env=prod,service=api" - -# Decrypt data -$ciphertext | kms decrypt -$ciphertext | kms decrypt --context "env=prod,service=api" - -# Generate data key (Cosmian only) -kms generate-key -kms generate-key --key-spec AES_128 - -# Check service status -kms status -kms health - -# Encrypt/decrypt files -kms encrypt-file config.yaml -kms encrypt-file secrets.json --output secrets.enc --context "env=prod" - -kms decrypt-file config.yaml.enc -kms decrypt-file secrets.enc --output secrets.json --context "env=prod" -```text - -## Backend Comparison - -| Feature | Age | RustyVault | Cosmian KMS | AWS KMS | Vault | -| --------- | ----- | ------------ | ------------- | --------- | ------- | -| **Setup** | Simple | Self-hosted | Server setup | AWS account | Enterprise | -| **Speed** | Very fast | Fast | Fast | Fast | Fast | -| **Network** | No | Yes | Yes | Yes | Yes | -| **Key Rotation** | Manual | Automatic | Automatic | Automatic | Automatic | -| **Data Keys** | No | Yes | Yes | Yes | Yes | -| **Audit Logging** | No | Yes | Full | Full | Full | -| **Confidential** | No | No | Yes (SGX/SEV) | No | No | -| **Multi-tenant** | No | Yes | Yes | Yes | Yes | -| **License** | MIT | Apache 2.0 | Proprietary | Proprietary | BSL/Enterprise | -| **Cost** | Free | Free | Paid | Paid | Paid | -| **Use Case** | Dev/Test | Self-hosted | Privacy | AWS Cloud | Enterprise | - -## Integration Points - -### 1. Config Encryption (SOPS Integration) - -```nushell -# Encrypt configuration files -kms encrypt-file workspace/config/secrets.yaml - -# SOPS can use KMS for key encryption -# Configure in .sops.yaml to use KMS endpoint -```text - -### 2. Dynamic Secrets (Provider API Keys) - -```rust -// Rust orchestrator can call KMS API -let encrypted_key = kms_client.encrypt(api_key.as_bytes(), &context).await?; -```text - -### 3. SSH Key Management - -```nushell -# Generate and encrypt temporal SSH keys -ssh-keygen -t ed25519 -f temp_key -N "" -kms encrypt-file temp_key --context "infra=prod,purpose=deployment" -```text - -### 4. 
Orchestrator (Workflow Data) - -```rust -// Encrypt sensitive workflow parameters -let encrypted_params = kms_service - .encrypt(params_json.as_bytes(), &workflow_context) - .await?; -```text - -### 5. Control Center (Audit Logs) - -- All KMS operations are logged -- Audit trail for compliance -- Integration with control center UI - -## Testing - -### Unit Tests - -```bash -cargo test -```text - -### Integration Tests - -```bash -# Age backend tests (no external dependencies) -cargo test age - -# Cosmian backend tests (requires Cosmian KMS server) -export COSMIAN_KMS_URL=http://localhost:9999 -export COSMIAN_API_KEY=test-key -cargo test cosmian -- --ignored -```text - -## Deployment - -### Docker - -```dockerfile -FROM rust:1.70 as builder -WORKDIR /app -COPY . . -RUN cargo build --release - -FROM debian:bookworm-slim -RUN apt-get update && \ - apt-get install -y ca-certificates && \ - rm -rf /var/lib/apt/lists/* -COPY --from=builder /app/target/release/kms-service /usr/local/bin/ -ENTRYPOINT ["kms-service"] -```text - -### Kubernetes (Production with Cosmian) - -```yaml -apiVersion: apps/v1 -kind: Deployment -metadata: - name: kms-service -spec: - replicas: 2 - template: - spec: - containers: - - name: kms-service - image: provisioning/kms-service:latest - env: - - name: PROVISIONING_ENV - value: "prod" - - name: COSMIAN_KMS_URL - value: "https://kms.example.com" - - name: COSMIAN_API_KEY - valueFrom: - secretKeyRef: - name: cosmian-api-key - key: api-key - ports: - - containerPort: 8082 -```text - -### systemd Service - -```ini -[Unit] -Description=KMS Service -After=network.target - -[Service] -Type=simple -User=kms-service -Environment="PROVISIONING_ENV=prod" -Environment="COSMIAN_KMS_URL=https://kms.example.com" -Environment="COSMIAN_API_KEY=your-api-key" -ExecStart=/usr/local/bin/kms-service -Restart=always - -[Install] -WantedBy=multi-user.target -```text - -## Security Best Practices - -1. **Development**: Use Age for dev/test only, never for production secrets -2. **Production**: Always use Cosmian KMS with TLS verification enabled -3. **API Keys**: Never hardcode Cosmian API keys, use environment variables -4. **Key Rotation**: Enable automatic rotation in Cosmian (90 days recommended) -5. **Context Encryption**: Always use encryption context (AAD) for additional security -6. **Network Access**: Restrict KMS service access with firewall rules -7. **Monitoring**: Enable health checks and monitor operation metrics - -## Migration from Vault/AWS KMS - -See [KMS_SIMPLIFICATION.md](../../docs/migration/KMS_SIMPLIFICATION.md) for migration guide. - -## Monitoring - -### Metrics Endpoints - -```bash -# Service status (includes operation count) -curl http://localhost:8082/api/v1/kms/status - -# Health check -curl http://localhost:8082/api/v1/kms/health -```text - -### Logs - -```bash -# Set log level -export RUST_LOG="kms_service=debug,tower_http=debug" - -# View logs -journalctl -u kms-service -f -```text - -## Troubleshooting - -### Age Backend Issues - -```bash -# Check keys exist -ls -la ~/.config/provisioning/age/ - -# Verify key format -cat ~/.config/provisioning/age/public_key.txt -# Should start with: age1... 
- -# Test encryption manually -echo "test" | age -r $(cat ~/.config/provisioning/age/public_key.txt) > test.enc -age -d -i ~/.config/provisioning/age/private_key.txt test.enc -```text - -### Cosmian KMS Issues - -```bash -# Check connectivity -curl https://kms.example.com/api/v1/health \ - -H "X-API-Key: $COSMIAN_API_KEY" - -# Verify API key -curl https://kms.example.com/api/v1/version \ - -H "X-API-Key: $COSMIAN_API_KEY" - -# Test encryption -curl -X POST https://kms.example.com/api/v1/encrypt \ - -H "X-API-Key: $COSMIAN_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{"keyId":"master-key","data":"SGVsbG8="}' -```text - -## License - -Copyright © 2024 Provisioning Team - -## References - -- [Age Encryption](https://github.com/FiloSottile/age) -- [Cosmian KMS](https://cosmian.com/kms/) -- [Axum Web Framework](https://docs.rs/axum/) -- [Confidential Computing](https://confidentialcomputing.io/) +# KMS Service - Key Management Service\n\nA unified Key Management Service for the Provisioning platform with support for multiple backends: **Age** (development),\n**Cosmian KMS** (privacy-preserving), **RustyVault** (self-hosted), **AWS KMS** (cloud-native), and **HashiCorp Vault** (enterprise).\n\n## Features\n\n### Age Backend (Development)\n\n- ✅ Fast, offline encryption/decryption\n- ✅ No server required - local key files\n- ✅ Simple setup with age-keygen\n- ✅ Perfect for development and testing\n- ✅ Zero network dependency\n\n### RustyVault Backend (Self-hosted) ✨ NEW\n\n- ✅ Vault-compatible API (drop-in replacement)\n- ✅ Pure Rust implementation\n- ✅ Self-hosted secrets management\n- ✅ Apache 2.0 license (OSI-approved)\n- ✅ Transit secrets engine support\n- ✅ Embeddable or standalone\n- ✅ No vendor lock-in\n\n### Cosmian KMS Backend (Production)\n\n- ✅ Enterprise-grade key management\n- ✅ Confidential computing support (SGX/SEV)\n- ✅ Zero-knowledge architecture\n- ✅ Server-side key rotation\n- ✅ Audit logging and compliance\n- ✅ Multi-tenant support\n\n### Security Features\n\n- ✅ TLS for all communications (Cosmian)\n- ✅ Context-based encryption (AAD)\n- ✅ Automatic key rotation (Cosmian)\n- ✅ Data key generation (Cosmian)\n- ✅ Health monitoring\n- ✅ Operation metrics\n\n## Architecture\n\n```\n┌─────────────────────────────────────────────────\n────────┐\n│ KMS Service │\n├─────────────────────────────────────────────────\n────────┤\n│ REST API (Axum) │\n│ ├─ /api/v1/kms/encrypt POST │\n│ ├─ /api/v1/kms/decrypt POST │\n│ ├─ /api/v1/kms/generate-key POST (Cosmian only) │\n│ ├─ /api/v1/kms/status GET │\n│ └─ /api/v1/kms/health GET │\n├─────────────────────────────────────────────────\n────────┤\n│ Unified KMS Service Interface │\n│ ├─ encrypt(plaintext, context) -> ciphertext │\n│ ├─ decrypt(ciphertext, context) -> plaintext │\n│ ├─ generate_data_key(spec) -> DataKey │\n│ └─ health_check() -> bool │\n├─────────────────────────────────────────────────\n────────┤\n│ Backend Implementations │\n│ ├─ Age Client │\n│ │ ├─ X25519 encryption │\n│ │ ├─ Local key files │\n│ │ └─ Offline operation │\n│ └─ Cosmian KMS Client │\n│ ├─ REST API integration │\n│ ├─ Zero-knowledge encryption │\n│ └─ Confidential computing │\n└─────────────────────────────────────────────────\n────────┘\n```\n\n## Installation\n\n### Prerequisites\n\n- Rust 1.70+ (for building)\n- Age 1.2+ (for development backend)\n- Cosmian KMS server (for production backend)\n- Nushell 0.107+ (for CLI integration)\n\n### Build from Source\n\n```\ncd provisioning/platform/kms-service\ncargo build --release\n\n# Binary will be at: 
target/release/kms-service\n```\n\n## Configuration\n\n### Configuration File\n\nCreate `provisioning/config/kms.toml`:\n\n```\n[kms]\ndev_backend = "age"\nprod_backend = "cosmian"\nenvironment = "${PROVISIONING_ENV:-dev}"\n\n[kms.age]\npublic_key_path = "~/.config/provisioning/age/public_key.txt"\nprivate_key_path = "~/.config/provisioning/age/private_key.txt"\n\n[kms.cosmian]\nserver_url = "${COSMIAN_KMS_URL:-https://kms.example.com}"\napi_key = "${COSMIAN_API_KEY}"\ndefault_key_id = "provisioning-master-key"\ntls_verify = true\n```\n\n### Environment Variables\n\n```\n# Development with Age\nexport PROVISIONING_ENV=dev\n# Age keys will be read from paths in config\n\n# Production with Cosmian\nexport PROVISIONING_ENV=prod\nexport COSMIAN_KMS_URL="https://kms.example.com"\nexport COSMIAN_API_KEY="your-api-key"\n```\n\n## Quick Start\n\n### Development Setup (Age)\n\n```\n# 1. Generate Age keys\nmkdir -p ~/.config/provisioning/age\nage-keygen -o ~/.config/provisioning/age/private_key.txt\nage-keygen -y ~/.config/provisioning/age/private_key.txt > ~/.config/provisioning/age/public_key.txt\n\n# 2. Set environment\nexport PROVISIONING_ENV=dev\n\n# 3. Start KMS service\ncargo run --bin kms-service\n```\n\n### Production Setup (Cosmian)\n\n```\n# 1. Set up Cosmian KMS server (or use hosted service)\n\n# 2. Create master key in Cosmian KMS\n# (Use Cosmian KMS CLI or web interface)\n\n# 3. Set environment variables\nexport PROVISIONING_ENV=prod\nexport COSMIAN_KMS_URL=https://your-kms.example.com\nexport COSMIAN_API_KEY=your-api-key-here\n\n# 4. Start KMS service\ncargo run --bin kms-service\n```\n\n## Usage\n\n### REST API Examples\n\n#### Encrypt Data\n\n```\ncurl -X POST http://localhost:8082/api/v1/kms/encrypt \\n -H "Content-Type: application/json" \\n -d '{\n "plaintext": "SGVsbG8sIFdvcmxkIQ==",\n "context": "env=prod,service=api"\n }'\n```\n\n#### Decrypt Data\n\n```\ncurl -X POST http://localhost:8082/api/v1/kms/decrypt \\n -H "Content-Type: application/json" \\n -d '{\n "ciphertext": "...",\n "context": "env=prod,service=api"\n }'\n```\n\n#### Generate Data Key (Cosmian only)\n\n```\ncurl -X POST http://localhost:8082/api/v1/kms/generate-key \\n -H "Content-Type: application/json" \\n -d '{\n "key_spec": "AES_256"\n }'\n```\n\n#### Health Check\n\n```\ncurl http://localhost:8082/api/v1/kms/health\n```\n\n### Nushell CLI Integration\n\n```\n# Load the KMS module\nuse provisioning/core/nulib/kms\n\n# Set service URL\nexport KMS_SERVICE_URL="http://localhost:8082"\n\n# Encrypt data\n"secret-data" | kms encrypt\n"api-key" | kms encrypt --context "env=prod,service=api"\n\n# Decrypt data\n$ciphertext | kms decrypt\n$ciphertext | kms decrypt --context "env=prod,service=api"\n\n# Generate data key (Cosmian only)\nkms generate-key\nkms generate-key --key-spec AES_128\n\n# Check service status\nkms status\nkms health\n\n# Encrypt/decrypt files\nkms encrypt-file config.yaml\nkms encrypt-file secrets.json --output secrets.enc --context "env=prod"\n\nkms decrypt-file config.yaml.enc\nkms decrypt-file secrets.enc --output secrets.json --context "env=prod"\n```\n\n## Backend Comparison\n\n| Feature | Age | RustyVault | Cosmian KMS | AWS KMS | Vault |\n| --------- | ----- | ------------ | ------------- | --------- | ------- |\n| **Setup** | Simple | Self-hosted | Server setup | AWS account | Enterprise |\n| **Speed** | Very fast | Fast | Fast | Fast | Fast |\n| **Network** | No | Yes | Yes | Yes | Yes |\n| **Key Rotation** | Manual | Automatic | Automatic | Automatic | Automatic |\n| **Data Keys** | No 
| Yes | Yes | Yes | Yes |\n| **Audit Logging** | No | Yes | Full | Full | Full |\n| **Confidential** | No | No | Yes (SGX/SEV) | No | No |\n| **Multi-tenant** | No | Yes | Yes | Yes | Yes |\n| **License** | MIT | Apache 2.0 | Proprietary | Proprietary | BSL/Enterprise |\n| **Cost** | Free | Free | Paid | Paid | Paid |\n| **Use Case** | Dev/Test | Self-hosted | Privacy | AWS Cloud | Enterprise |\n\n## Integration Points\n\n### 1. Config Encryption (SOPS Integration)\n\n```\n# Encrypt configuration files\nkms encrypt-file workspace/config/secrets.yaml\n\n# SOPS can use KMS for key encryption\n# Configure in .sops.yaml to use KMS endpoint\n```\n\n### 2. Dynamic Secrets (Provider API Keys)\n\n```\n// Rust orchestrator can call KMS API\nlet encrypted_key = kms_client.encrypt(api_key.as_bytes(), &context).await?;\n```\n\n### 3. SSH Key Management\n\n```\n# Generate and encrypt temporal SSH keys\nssh-keygen -t ed25519 -f temp_key -N ""\nkms encrypt-file temp_key --context "infra=prod,purpose=deployment"\n```\n\n### 4. Orchestrator (Workflow Data)\n\n```\n// Encrypt sensitive workflow parameters\nlet encrypted_params = kms_service\n .encrypt(params_json.as_bytes(), &workflow_context)\n .await?;\n```\n\n### 5. Control Center (Audit Logs)\n\n- All KMS operations are logged\n- Audit trail for compliance\n- Integration with control center UI\n\n## Testing\n\n### Unit Tests\n\n```\ncargo test\n```\n\n### Integration Tests\n\n```\n# Age backend tests (no external dependencies)\ncargo test age\n\n# Cosmian backend tests (requires Cosmian KMS server)\nexport COSMIAN_KMS_URL=http://localhost:9999\nexport COSMIAN_API_KEY=test-key\ncargo test cosmian -- --ignored\n```\n\n## Deployment\n\n### Docker\n\n```\nFROM rust:1.70 as builder\nWORKDIR /app\nCOPY . .\nRUN cargo build --release\n\nFROM debian:bookworm-slim\nRUN apt-get update && \\n apt-get install -y ca-certificates && \\n rm -rf /var/lib/apt/lists/*\nCOPY --from=builder /app/target/release/kms-service /usr/local/bin/\nENTRYPOINT ["kms-service"]\n```\n\n### Kubernetes (Production with Cosmian)\n\n```\napiVersion: apps/v1\nkind: Deployment\nmetadata:\n name: kms-service\nspec:\n replicas: 2\n template:\n spec:\n containers:\n - name: kms-service\n image: provisioning/kms-service:latest\n env:\n - name: PROVISIONING_ENV\n value: "prod"\n - name: COSMIAN_KMS_URL\n value: "https://kms.example.com"\n - name: COSMIAN_API_KEY\n valueFrom:\n secretKeyRef:\n name: cosmian-api-key\n key: api-key\n ports:\n - containerPort: 8082\n```\n\n### systemd Service\n\n```\n[Unit]\nDescription=KMS Service\nAfter=network.target\n\n[Service]\nType=simple\nUser=kms-service\nEnvironment="PROVISIONING_ENV=prod"\nEnvironment="COSMIAN_KMS_URL=https://kms.example.com"\nEnvironment="COSMIAN_API_KEY=your-api-key"\nExecStart=/usr/local/bin/kms-service\nRestart=always\n\n[Install]\nWantedBy=multi-user.target\n```\n\n## Security Best Practices\n\n1. **Development**: Use Age for dev/test only, never for production secrets\n2. **Production**: Always use Cosmian KMS with TLS verification enabled\n3. **API Keys**: Never hardcode Cosmian API keys, use environment variables\n4. **Key Rotation**: Enable automatic rotation in Cosmian (90 days recommended)\n5. **Context Encryption**: Always use encryption context (AAD) for additional security\n6. **Network Access**: Restrict KMS service access with firewall rules\n7. 
**Monitoring**: Enable health checks and monitor operation metrics\n\n## Migration from Vault/AWS KMS\n\nSee [KMS_SIMPLIFICATION.md](../../docs/migration/KMS_SIMPLIFICATION.md) for migration guide.\n\n## Monitoring\n\n### Metrics Endpoints\n\n```\n# Service status (includes operation count)\ncurl http://localhost:8082/api/v1/kms/status\n\n# Health check\ncurl http://localhost:8082/api/v1/kms/health\n```\n\n### Logs\n\n```\n# Set log level\nexport RUST_LOG="kms_service=debug,tower_http=debug"\n\n# View logs\njournalctl -u kms-service -f\n```\n\n## Troubleshooting\n\n### Age Backend Issues\n\n```\n# Check keys exist\nls -la ~/.config/provisioning/age/\n\n# Verify key format\ncat ~/.config/provisioning/age/public_key.txt\n# Should start with: age1...\n\n# Test encryption manually\necho "test" | age -r $(cat ~/.config/provisioning/age/public_key.txt) > test.enc\nage -d -i ~/.config/provisioning/age/private_key.txt test.enc\n```\n\n### Cosmian KMS Issues\n\n```\n# Check connectivity\ncurl https://kms.example.com/api/v1/health \\n -H "X-API-Key: $COSMIAN_API_KEY"\n\n# Verify API key\ncurl https://kms.example.com/api/v1/version \\n -H "X-API-Key: $COSMIAN_API_KEY"\n\n# Test encryption\ncurl -X POST https://kms.example.com/api/v1/encrypt \\n -H "X-API-Key: $COSMIAN_API_KEY" \\n -H "Content-Type: application/json" \\n -d '{"keyId":"master-key","data":"SGVsbG8="}'\n```\n\n## License\n\nCopyright © 2024 Provisioning Team\n\n## References\n\n- [Age Encryption](https://github.com/FiloSottile/age)\n- [Cosmian KMS](https://cosmian.com/kms/)\n- [Axum Web Framework](https://docs.rs/axum/)\n- [Confidential Computing](https://confidentialcomputing.io/) \ No newline at end of file diff --git a/crates/vault-service/scripts/start-kms.nu b/crates/vault-service/scripts/start-kms.nu old mode 100755 new mode 100644 diff --git a/docs/README.md b/docs/README.md index e8d363e..d3288ff 100644 --- a/docs/README.md +++ b/docs/README.md @@ -1,9 +1 @@ -# Provisioning Platform Documentation - -User guides, deployment documentation, and operational guides. - -## Structure - -- **deployment/** - Deployment guides, troubleshooting, and build documentation -- **guides/** - Quick start and user guides -- **architecture/** - Architecture and design documentation (if present) +# Provisioning Platform Documentation\n\nUser guides, deployment documentation, and operational guides.\n\n## Structure\n\n- **deployment/** - Deployment guides, troubleshooting, and build documentation\n- **guides/** - Quick start and user guides\n- **architecture/** - Architecture and design documentation (if present) \ No newline at end of file diff --git a/docs/deployment/deployment-guide.md b/docs/deployment/deployment-guide.md index 777a3d4..0ba600a 100644 --- a/docs/deployment/deployment-guide.md +++ b/docs/deployment/deployment-guide.md @@ -1,757 +1 @@ -# Provisioning Platform Deployment Guide - -**Version**: 3.0.0 -**Date**: 2025-10-06 -**Deployment Modes**: Solo, Multi-User, CI/CD, Enterprise - ---- - -## Table of Contents - -1. [Overview](#overview) -2. [Prerequisites](#prerequisites) -3. [Deployment Modes](#deployment-modes) -4. [Quick Start](#quick-start) -5. [Configuration](#configuration) -6. [Deployment Methods](#deployment-methods) -7. [Post-Deployment](#post-deployment) -8. 
[Troubleshooting](#troubleshooting) - ---- - -## Overview - -The Provisioning Platform is a comprehensive infrastructure automation system that can be deployed in four modes: - -- **Solo**: Single-user local development (minimal services) -- **Multi-User**: Team collaboration with source control -- **CI/CD**: Automated deployment pipelines -- **Enterprise**: Full production with monitoring, KMS, and audit logging - -### Architecture Components - -| Component | Solo | Multi-User | CI/CD | Enterprise | -| ----------- | ------ | ------------ | ------- | ------------ | -| Orchestrator | ✓ | ✓ | ✓ | ✓ | -| Control Center | ✓ | ✓ | ✓ | ✓ | -| CoreDNS | ✓ | ✓ | ✓ | ✓ | -| OCI Registry (Zot) | ✓ | ✓ | ✓ | ---- | -| Extension Registry | ✓ | ✓ | ✓ | ✓ | -| Gitea | ---- | ✓ | ✓ | ✓ | -| PostgreSQL | ---- | ✓ | ✓ | ✓ | -| API Server | ---- | - | ✓ | ✓ | -| Harbor | ---- | - | ---- | ✓ | -| Cosmian KMS | ---- | - | ---- | ✓ | -| Prometheus | ---- | - | ---- | ✓ | -| Grafana | ---- | - | ---- | ✓ | -| Loki + Promtail | ---- | - | ---- | ✓ | -| Elasticsearch + Kibana | ---- | - | ---- | ✓ | -| Nginx Reverse Proxy | ---- | - | ---- | ✓ | - ---- - -## Prerequisites - -### Required Software - -1. **Docker** (version 20.10+) - - ```bash - docker --version - # Docker version 20.10.0 or higher - ``` - -2. **Docker Compose** (version 2.0+) - - ```bash - docker-compose --version - # Docker Compose version 2.0.0 or higher - ``` - -3. **Nushell** (version 0.107.1+ for automation scripts) - - ```bash - nu --version - # 0.107.1 or higher - ``` - -### System Requirements - -#### Solo Mode - -- **CPU**: 2 cores -- **Memory**: 4GB RAM -- **Disk**: 20GB free space -- **Network**: Internet connection for pulling images - -#### Multi-User Mode - -- **CPU**: 4 cores -- **Memory**: 8GB RAM -- **Disk**: 50GB free space -- **Network**: Internet connection + internal network - -#### CI/CD Mode - -- **CPU**: 8 cores -- **Memory**: 16GB RAM -- **Disk**: 100GB free space -- **Network**: Internet + dedicated CI/CD network - -#### Enterprise Mode - -- **CPU**: 16 cores -- **Memory**: 32GB RAM -- **Disk**: 500GB free space (SSD recommended) -- **Network**: High-bandwidth, low-latency network - -### Optional Tools - -- **OpenSSL** (for generating secrets) -- **kubectl** (for Kubernetes deployment) -- **Helm** (for Kubernetes package management) - ---- - -## Deployment Modes - -### Solo Mode - -**Use Case**: Local development, testing, personal use - -**Features**: - -- Minimal resource usage -- No authentication required -- SQLite databases -- Local file storage - -**Limitations**: - -- Single user only -- No version control integration -- No audit logging - -### Multi-User Mode - -**Use Case**: Small team collaboration - -**Features**: - -- Multi-user authentication -- Gitea for source control -- PostgreSQL shared database -- User management - -**Limitations**: - -- No automated pipelines -- No advanced monitoring - -### CI/CD Mode - -**Use Case**: Automated deployment pipelines - -**Features**: - -- All Multi-User features -- Provisioning API Server -- Webhook support -- Jenkins/GitLab Runner integration - -**Limitations**: - -- Basic monitoring only - -### Enterprise Mode - -**Use Case**: Production deployments, compliance requirements - -**Features**: - -- All CI/CD features -- Harbor registry (enterprise OCI) -- Cosmian KMS (secret management) -- Full monitoring stack (Prometheus, Grafana) -- Log aggregation (Loki, Elasticsearch) -- Audit logging -- TLS/SSL encryption -- Nginx reverse proxy - ---- - -## Quick Start - -### 1. 
Clone Repository - -```bash -cd /opt -git clone https://github.com/your-org/project-provisioning.git -cd project-provisioning/provisioning/platform -```text - -### 2. Generate Secrets - -```bash -# Generate .env file with random secrets -./scripts/generate-secrets.nu - -# Or copy and edit manually -cp .env.example .env -nano .env -```text - -### 3. Choose Deployment Mode and Deploy - -#### Solo Mode - -```bash -./scripts/deploy-platform.nu --mode solo -```text - -#### Multi-User Mode - -```bash -# Generate secrets first -./scripts/generate-secrets.nu - -# Deploy -./scripts/deploy-platform.nu --mode multi-user -```text - -#### CI/CD Mode - -```bash -./scripts/deploy-platform.nu --mode cicd --build -```text - -#### Enterprise Mode - -```bash -# Full production deployment -./scripts/deploy-platform.nu --mode enterprise --build --wait 600 -```text - -### 4. Verify Deployment - -```bash -# Check all services -./scripts/health-check.nu - -# View logs -docker-compose logs -f -```text - -### 5. Access Services - -- **Orchestrator**: -- **Control Center**: -- **OCI Registry**: -- **Gitea** (Multi-User+): -- **Grafana** (Enterprise): - ---- - -## Configuration - -### Environment Variables - -The `.env` file controls all deployment settings. Key variables: - -#### Platform Configuration - -```bash -PROVISIONING_MODE=solo # solo, multi-user, cicd, enterprise -PLATFORM_ENVIRONMENT=development # development, staging, production -```text - -#### Service Ports - -```bash -ORCHESTRATOR_PORT=8080 -CONTROL_CENTER_PORT=8081 -GITEA_HTTP_PORT=3000 -OCI_REGISTRY_PORT=5000 -```text - -#### Security Settings - -```bash -# Generate with: openssl rand -base64 32 -CONTROL_CENTER_JWT_SECRET= -API_SERVER_JWT_SECRET= -POSTGRES_PASSWORD= -```text - -#### Resource Limits - -```bash -ORCHESTRATOR_CPU_LIMIT=2000m -ORCHESTRATOR_MEMORY_LIMIT=2048M -```text - -### Configuration Files - -#### Docker Compose - -- **Main**: `docker-compose.yaml` (base services) -- **Solo**: `infrastructure/docker/docker-compose.solo.yaml` -- **Multi-User**: `infrastructure/docker/docker-compose.multi-user.yaml` -- **CI/CD**: `infrastructure/docker/docker-compose.cicd.yaml` -- **Enterprise**: `infrastructure/docker/docker-compose.enterprise.yaml` - -#### Service Configurations - -- **Orchestrator**: `orchestrator/config.defaults.toml` -- **Control Center**: `control-center/config.defaults.toml` -- **CoreDNS**: `config/coredns/Corefile` -- **OCI Registry**: `infrastructure/oci-registry/config.json` -- **Nginx**: `infrastructure/nginx/nginx.conf` -- **Prometheus**: `infrastructure/monitoring/prometheus/prometheus.yml` - ---- - -## Deployment Methods - -### Method 1: Docker Compose (Recommended) - -#### Deploy - -```bash -# Solo mode -docker-compose -f docker-compose.yaml \ - -f infrastructure/docker/docker-compose.solo.yaml \ - up -d - -# Multi-user mode -docker-compose -f docker-compose.yaml \ - -f infrastructure/docker/docker-compose.multi-user.yaml \ - up -d - -# CI/CD mode -docker-compose -f docker-compose.yaml \ - -f infrastructure/docker/docker-compose.multi-user.yaml \ - -f infrastructure/docker/docker-compose.cicd.yaml \ - up -d - -# Enterprise mode -docker-compose -f docker-compose.yaml \ - -f infrastructure/docker/docker-compose.multi-user.yaml \ - -f infrastructure/docker/docker-compose.cicd.yaml \ - -f infrastructure/docker/docker-compose.enterprise.yaml \ - up -d -```text - -#### Manage Services - -```bash -# View logs -docker-compose logs -f [service-name] - -# Restart service -docker-compose restart orchestrator - -# Stop all services 
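-# ('down' removes containers and networks but preserves named volumes)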
-docker-compose down - -# Stop and remove volumes (WARNING: data loss) -docker-compose down --volumes -```text - -### Method 2: Systemd (Linux Production) - -#### Install Services - -```bash -cd systemd -sudo ./install-services.sh -```text - -#### Manage via systemd - -```bash -# Start platform -sudo systemctl start provisioning-platform - -# Enable auto-start on boot -sudo systemctl enable provisioning-platform - -# Check status -sudo systemctl status provisioning-platform - -# View logs -sudo journalctl -u provisioning-platform -f - -# Restart -sudo systemctl restart provisioning-platform - -# Stop -sudo systemctl stop provisioning-platform -```text - -### Method 3: Kubernetes - -See [KUBERNETES_DEPLOYMENT.md](./KUBERNETES_DEPLOYMENT.md) for detailed instructions. - -#### Quick Deploy - -```bash -# Create namespace -kubectl apply -f k8s/base/namespace.yaml - -# Deploy services -kubectl apply -f k8s/deployments/ -kubectl apply -f k8s/services/ -kubectl apply -f k8s/ingress/ - -# Check status -kubectl get pods -n provisioning -```text - -### Method 4: Automation Script (Nushell) - -```bash -# Deploy with options -./scripts/deploy-platform.nu --mode enterprise \ - --build \ - --wait 300 - -# Health check -./scripts/health-check.nu - -# Dry run (show what would be deployed) -./scripts/deploy-platform.nu --mode enterprise --dry-run -```text - ---- - -## Post-Deployment - -### 1. Verify Services - -```bash -# Quick health check -./scripts/health-check.nu - -# Detailed Docker status -docker-compose ps - -# Check individual service -curl http://localhost:9090/health -```text - -### 2. Initial Configuration - -#### Create Admin User (Multi-User+) - -Access Gitea at and complete setup wizard. - -#### Configure DNS (Optional) - -Add to `/etc/hosts` or configure local DNS: - -```plaintext -127.0.0.1 provisioning.local -127.0.0.1 gitea.provisioning.local -127.0.0.1 grafana.provisioning.local -```text - -#### Configure Monitoring (Enterprise) - -1. Access Grafana: -2. Login with credentials from `.env`: - - Username: `admin` - - Password: `${GRAFANA_ADMIN_PASSWORD}` -3. Dashboards are auto-provisioned from `infrastructure/monitoring/grafana/dashboards/` - -### 3. Load Extensions - -```bash -# List available extensions -curl http://localhost:8082/api/v1/extensions - -# Upload extension (example) -curl -X POST http://localhost:8082/api/v1/extensions/upload \ - -F "file=@my-extension.tar.gz" -```text - -### 4. Test Workflows - -```bash -# Create test server (via orchestrator API) -curl -X POST http://localhost:9090/workflows/servers/create \ - -H "Content-Type: application/json" \ - -d '{"name": "test-server", "plan": "1xCPU-2GB"}' - -# Check workflow status -curl http://localhost:9090/tasks/ -```text - ---- - -## Troubleshooting - -### Common Issues - -#### Services Not Starting - -**Symptom**: `docker-compose up` fails or services crash - -**Solutions**: - -1. Check Docker daemon: - - ```bash - systemctl status docker - ``` - -1. Check logs: - - ```bash - docker-compose logs orchestrator - ``` - -2. Check resource limits: - - ```bash - docker stats - ``` - -3. Increase Docker resources in Docker Desktop settings - -#### Port Conflicts - -**Symptom**: `Error: port is already allocated` - -**Solutions**: - -1. Find conflicting process: - - ```bash - lsof -i :9090 - ``` - -2. Change port in `.env`: - - ```bash - ORCHESTRATOR_PORT=9080 - ``` - -3. 
Restart deployment: - - ```bash - docker-compose down - docker-compose up -d - ``` - -#### Health Checks Failing - -**Symptom**: Health check script reports unhealthy services - -**Solutions**: - -1. Check service logs: - - ```bash - docker-compose logs -f - ``` - -2. Verify network connectivity: - - ```bash - docker network inspect provisioning-net - ``` - -3. Check firewall rules: - - ```bash - sudo ufw status - ``` - -4. Wait longer for services to start: - - ```bash - ./scripts/deploy-platform.nu --wait 600 - ``` - -#### Database Connection Errors - -**Symptom**: PostgreSQL connection refused - -**Solutions**: - -1. Check PostgreSQL health: - - ```bash - docker exec provisioning-postgres pg_isready - ``` - -2. Verify credentials in `.env`: - - ```bash - grep POSTGRES_ .env - ``` - -3. Check PostgreSQL logs: - - ```bash - docker-compose logs postgres - ``` - -4. Recreate database: - - ```bash - docker-compose down - docker volume rm provisioning_postgres-data - docker-compose up -d - ``` - -#### Out of Disk Space - -**Symptom**: No space left on device - -**Solutions**: - -1. Clean Docker volumes: - - ```bash - docker volume prune - ``` - -2. Clean Docker images: - - ```bash - docker image prune -a - ``` - -3. Check volume sizes: - - ```bash - docker system df -v - ``` - -### Getting Help - -- **Logs**: Always check logs first: `docker-compose logs -f` -- **Health**: Run health check: `./scripts/health-check.nu --json` -- **Documentation**: See `docs/` directory -- **Issues**: File bug reports at GitHub repository - ---- - -## Security Best Practices - -### 1. Secret Management - -- **Never commit** `.env` files to version control -- Use `./scripts/generate-secrets.nu` to generate strong secrets -- Rotate secrets regularly -- Use KMS in enterprise mode - -### 2. Network Security - -- Use TLS/SSL in production (enterprise mode) -- Configure firewall rules: - - ```bash - sudo ufw allow 80/tcp - sudo ufw allow 443/tcp - sudo ufw enable - ``` - -- Use private networks for backend services - -### 3. Access Control - -- Enable authentication in multi-user mode -- Use strong passwords (16+ characters) -- Configure API keys for CI/CD access -- Enable audit logging in enterprise mode - -### 4. Regular Updates - -```bash -# Pull latest images -docker-compose pull - -# Rebuild with updates -./scripts/deploy-platform.nu --pull --build -```text - ---- - -## Backup and Recovery - -### Backup - -```bash -# Backup volumes -docker run --rm -v provisioning_orchestrator-data:/data \ - -v $(pwd)/backup:/backup \ - alpine tar czf /backup/orchestrator-data.tar.gz -C /data . 
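-# (optional) verify the archive is readable before relying on it: tar tzf backup/orchestrator-data.tar.gz > /dev/null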
- -# Backup PostgreSQL -docker exec provisioning-postgres pg_dumpall -U provisioning > backup/postgres-backup.sql -```text - -### Restore - -```bash -# Restore volume -docker run --rm -v provisioning_orchestrator-data:/data \ - -v $(pwd)/backup:/backup \ - alpine tar xzf /backup/orchestrator-data.tar.gz -C /data - -# Restore PostgreSQL -docker exec -i provisioning-postgres psql -U provisioning < backup/postgres-backup.sql -```text - ---- - -## Maintenance - -### Updates - -```bash -# Pull latest images -docker-compose pull - -# Recreate containers -docker-compose up -d --force-recreate - -# Remove old images -docker image prune -```text - -### Monitoring - -- **Prometheus**: -- **Grafana**: -- **Logs**: `docker-compose logs -f` - -### Health Checks - -```bash -# Automated health check -./scripts/health-check.nu - -# Manual checks -curl http://localhost:9090/health -curl http://localhost:8081/health -```text - ---- - -## Next Steps - -- [Production Deployment Guide](./PRODUCTION_DEPLOYMENT.md) -- [Kubernetes Deployment Guide](./KUBERNETES_DEPLOYMENT.md) -- [Docker Compose Reference](./DOCKER_COMPOSE_REFERENCE.md) -- [Monitoring Setup](./MONITORING_SETUP.md) -- [Security Hardening](./SECURITY_HARDENING.md) - ---- - -**Documentation Version**: 1.0.0 -**Last Updated**: 2025-10-06 -**Maintained By**: Platform Team +# Provisioning Platform Deployment Guide\n\n**Version**: 3.0.0\n**Date**: 2025-10-06\n**Deployment Modes**: Solo, Multi-User, CI/CD, Enterprise\n\n---\n\n## Table of Contents\n\n1. [Overview](#overview)\n2. [Prerequisites](#prerequisites)\n3. [Deployment Modes](#deployment-modes)\n4. [Quick Start](#quick-start)\n5. [Configuration](#configuration)\n6. [Deployment Methods](#deployment-methods)\n7. [Post-Deployment](#post-deployment)\n8. [Troubleshooting](#troubleshooting)\n\n---\n\n## Overview\n\nThe Provisioning Platform is a comprehensive infrastructure automation system that can be deployed in four modes:\n\n- **Solo**: Single-user local development (minimal services)\n- **Multi-User**: Team collaboration with source control\n- **CI/CD**: Automated deployment pipelines\n- **Enterprise**: Full production with monitoring, KMS, and audit logging\n\n### Architecture Components\n\n| Component | Solo | Multi-User | CI/CD | Enterprise |\n| ----------- | ------ | ------------ | ------- | ------------ |\n| Orchestrator | ✓ | ✓ | ✓ | ✓ |\n| Control Center | ✓ | ✓ | ✓ | ✓ |\n| CoreDNS | ✓ | ✓ | ✓ | ✓ |\n| OCI Registry (Zot) | ✓ | ✓ | ✓ | ---- |\n| Extension Registry | ✓ | ✓ | ✓ | ✓ |\n| Gitea | ---- | ✓ | ✓ | ✓ |\n| PostgreSQL | ---- | ✓ | ✓ | ✓ |\n| API Server | ---- | - | ✓ | ✓ |\n| Harbor | ---- | - | ---- | ✓ |\n| Cosmian KMS | ---- | - | ---- | ✓ |\n| Prometheus | ---- | - | ---- | ✓ |\n| Grafana | ---- | - | ---- | ✓ |\n| Loki + Promtail | ---- | - | ---- | ✓ |\n| Elasticsearch + Kibana | ---- | - | ---- | ✓ |\n| Nginx Reverse Proxy | ---- | - | ---- | ✓ |\n\n---\n\n## Prerequisites\n\n### Required Software\n\n1. **Docker** (version 20.10+)\n\n ```bash\n docker --version\n # Docker version 20.10.0 or higher\n ```\n\n2. **Docker Compose** (version 2.0+)\n\n ```bash\n docker-compose --version\n # Docker Compose version 2.0.0 or higher\n ```\n\n3. 
**Nushell** (version 0.107.1+ for automation scripts)\n\n ```bash\n nu --version\n # 0.107.1 or higher\n ```\n\n### System Requirements\n\n#### Solo Mode\n\n- **CPU**: 2 cores\n- **Memory**: 4GB RAM\n- **Disk**: 20GB free space\n- **Network**: Internet connection for pulling images\n\n#### Multi-User Mode\n\n- **CPU**: 4 cores\n- **Memory**: 8GB RAM\n- **Disk**: 50GB free space\n- **Network**: Internet connection + internal network\n\n#### CI/CD Mode\n\n- **CPU**: 8 cores\n- **Memory**: 16GB RAM\n- **Disk**: 100GB free space\n- **Network**: Internet + dedicated CI/CD network\n\n#### Enterprise Mode\n\n- **CPU**: 16 cores\n- **Memory**: 32GB RAM\n- **Disk**: 500GB free space (SSD recommended)\n- **Network**: High-bandwidth, low-latency network\n\n### Optional Tools\n\n- **OpenSSL** (for generating secrets)\n- **kubectl** (for Kubernetes deployment)\n- **Helm** (for Kubernetes package management)\n\n---\n\n## Deployment Modes\n\n### Solo Mode\n\n**Use Case**: Local development, testing, personal use\n\n**Features**:\n\n- Minimal resource usage\n- No authentication required\n- SQLite databases\n- Local file storage\n\n**Limitations**:\n\n- Single user only\n- No version control integration\n- No audit logging\n\n### Multi-User Mode\n\n**Use Case**: Small team collaboration\n\n**Features**:\n\n- Multi-user authentication\n- Gitea for source control\n- PostgreSQL shared database\n- User management\n\n**Limitations**:\n\n- No automated pipelines\n- No advanced monitoring\n\n### CI/CD Mode\n\n**Use Case**: Automated deployment pipelines\n\n**Features**:\n\n- All Multi-User features\n- Provisioning API Server\n- Webhook support\n- Jenkins/GitLab Runner integration\n\n**Limitations**:\n\n- Basic monitoring only\n\n### Enterprise Mode\n\n**Use Case**: Production deployments, compliance requirements\n\n**Features**:\n\n- All CI/CD features\n- Harbor registry (enterprise OCI)\n- Cosmian KMS (secret management)\n- Full monitoring stack (Prometheus, Grafana)\n- Log aggregation (Loki, Elasticsearch)\n- Audit logging\n- TLS/SSL encryption\n- Nginx reverse proxy\n\n---\n\n## Quick Start\n\n### 1. Clone Repository\n\n```\ncd /opt\ngit clone https://github.com/your-org/project-provisioning.git\ncd project-provisioning/provisioning/platform\n```\n\n### 2. Generate Secrets\n\n```\n# Generate .env file with random secrets\n./scripts/generate-secrets.nu\n\n# Or copy and edit manually\ncp .env.example .env\nnano .env\n```\n\n### 3. Choose Deployment Mode and Deploy\n\n#### Solo Mode\n\n```\n./scripts/deploy-platform.nu --mode solo\n```\n\n#### Multi-User Mode\n\n```\n# Generate secrets first\n./scripts/generate-secrets.nu\n\n# Deploy\n./scripts/deploy-platform.nu --mode multi-user\n```\n\n#### CI/CD Mode\n\n```\n./scripts/deploy-platform.nu --mode cicd --build\n```\n\n#### Enterprise Mode\n\n```\n# Full production deployment\n./scripts/deploy-platform.nu --mode enterprise --build --wait 600\n```\n\n### 4. Verify Deployment\n\n```\n# Check all services\n./scripts/health-check.nu\n\n# View logs\ndocker-compose logs -f\n```\n\n### 5. Access Services\n\n- **Orchestrator**: \n- **Control Center**: \n- **OCI Registry**: \n- **Gitea** (Multi-User+): \n- **Grafana** (Enterprise): \n\n---\n\n## Configuration\n\n### Environment Variables\n\nThe `.env` file controls all deployment settings. 
Key variables:\n\n#### Platform Configuration\n\n```\nPROVISIONING_MODE=solo # solo, multi-user, cicd, enterprise\nPLATFORM_ENVIRONMENT=development # development, staging, production\n```\n\n#### Service Ports\n\n```\nORCHESTRATOR_PORT=8080\nCONTROL_CENTER_PORT=8081\nGITEA_HTTP_PORT=3000\nOCI_REGISTRY_PORT=5000\n```\n\n#### Security Settings\n\n```\n# Generate with: openssl rand -base64 32\nCONTROL_CENTER_JWT_SECRET=\nAPI_SERVER_JWT_SECRET=\nPOSTGRES_PASSWORD=\n```\n\n#### Resource Limits\n\n```\nORCHESTRATOR_CPU_LIMIT=2000m\nORCHESTRATOR_MEMORY_LIMIT=2048M\n```\n\n### Configuration Files\n\n#### Docker Compose\n\n- **Main**: `docker-compose.yaml` (base services)\n- **Solo**: `infrastructure/docker/docker-compose.solo.yaml`\n- **Multi-User**: `infrastructure/docker/docker-compose.multi-user.yaml`\n- **CI/CD**: `infrastructure/docker/docker-compose.cicd.yaml`\n- **Enterprise**: `infrastructure/docker/docker-compose.enterprise.yaml`\n\n#### Service Configurations\n\n- **Orchestrator**: `orchestrator/config.defaults.toml`\n- **Control Center**: `control-center/config.defaults.toml`\n- **CoreDNS**: `config/coredns/Corefile`\n- **OCI Registry**: `infrastructure/oci-registry/config.json`\n- **Nginx**: `infrastructure/nginx/nginx.conf`\n- **Prometheus**: `infrastructure/monitoring/prometheus/prometheus.yml`\n\n---\n\n## Deployment Methods\n\n### Method 1: Docker Compose (Recommended)\n\n#### Deploy\n\n```\n# Solo mode\ndocker-compose -f docker-compose.yaml \\n -f infrastructure/docker/docker-compose.solo.yaml \\n up -d\n\n# Multi-user mode\ndocker-compose -f docker-compose.yaml \\n -f infrastructure/docker/docker-compose.multi-user.yaml \\n up -d\n\n# CI/CD mode\ndocker-compose -f docker-compose.yaml \\n -f infrastructure/docker/docker-compose.multi-user.yaml \\n -f infrastructure/docker/docker-compose.cicd.yaml \\n up -d\n\n# Enterprise mode\ndocker-compose -f docker-compose.yaml \\n -f infrastructure/docker/docker-compose.multi-user.yaml \\n -f infrastructure/docker/docker-compose.cicd.yaml \\n -f infrastructure/docker/docker-compose.enterprise.yaml \\n up -d\n```\n\n#### Manage Services\n\n```\n# View logs\ndocker-compose logs -f [service-name]\n\n# Restart service\ndocker-compose restart orchestrator\n\n# Stop all services\ndocker-compose down\n\n# Stop and remove volumes (WARNING: data loss)\ndocker-compose down --volumes\n```\n\n### Method 2: Systemd (Linux Production)\n\n#### Install Services\n\n```\ncd systemd\nsudo ./install-services.sh\n```\n\n#### Manage via systemd\n\n```\n# Start platform\nsudo systemctl start provisioning-platform\n\n# Enable auto-start on boot\nsudo systemctl enable provisioning-platform\n\n# Check status\nsudo systemctl status provisioning-platform\n\n# View logs\nsudo journalctl -u provisioning-platform -f\n\n# Restart\nsudo systemctl restart provisioning-platform\n\n# Stop\nsudo systemctl stop provisioning-platform\n```\n\n### Method 3: Kubernetes\n\nSee [KUBERNETES_DEPLOYMENT.md](./KUBERNETES_DEPLOYMENT.md) for detailed instructions.\n\n#### Quick Deploy\n\n```\n# Create namespace\nkubectl apply -f k8s/base/namespace.yaml\n\n# Deploy services\nkubectl apply -f k8s/deployments/\nkubectl apply -f k8s/services/\nkubectl apply -f k8s/ingress/\n\n# Check status\nkubectl get pods -n provisioning\n```\n\n### Method 4: Automation Script (Nushell)\n\n```\n# Deploy with options\n./scripts/deploy-platform.nu --mode enterprise \\n --build \\n --wait 300\n\n# Health check\n./scripts/health-check.nu\n\n# Dry run (show what would be deployed)\n./scripts/deploy-platform.nu 
--mode enterprise --dry-run\n```\n\n---\n\n## Post-Deployment\n\n### 1. Verify Services\n\n```\n# Quick health check\n./scripts/health-check.nu\n\n# Detailed Docker status\ndocker-compose ps\n\n# Check individual service\ncurl http://localhost:9090/health\n```\n\n### 2. Initial Configuration\n\n#### Create Admin User (Multi-User+)\n\nAccess Gitea at and complete setup wizard.\n\n#### Configure DNS (Optional)\n\nAdd to `/etc/hosts` or configure local DNS:\n\n```\n127.0.0.1 provisioning.local\n127.0.0.1 gitea.provisioning.local\n127.0.0.1 grafana.provisioning.local\n```\n\n#### Configure Monitoring (Enterprise)\n\n1. Access Grafana: \n2. Login with credentials from `.env`:\n - Username: `admin`\n - Password: `${GRAFANA_ADMIN_PASSWORD}`\n3. Dashboards are auto-provisioned from `infrastructure/monitoring/grafana/dashboards/`\n\n### 3. Load Extensions\n\n```\n# List available extensions\ncurl http://localhost:8082/api/v1/extensions\n\n# Upload extension (example)\ncurl -X POST http://localhost:8082/api/v1/extensions/upload \\n -F "file=@my-extension.tar.gz"\n```\n\n### 4. Test Workflows\n\n```\n# Create test server (via orchestrator API)\ncurl -X POST http://localhost:9090/workflows/servers/create \\n -H "Content-Type: application/json" \\n -d '{"name": "test-server", "plan": "1xCPU-2GB"}'\n\n# Check workflow status\ncurl http://localhost:9090/tasks/\n```\n\n---\n\n## Troubleshooting\n\n### Common Issues\n\n#### Services Not Starting\n\n**Symptom**: `docker-compose up` fails or services crash\n\n**Solutions**:\n\n1. Check Docker daemon:\n\n ```bash\n systemctl status docker\n ```\n\n1. Check logs:\n\n ```bash\n docker-compose logs orchestrator\n ```\n\n2. Check resource limits:\n\n ```bash\n docker stats\n ```\n\n3. Increase Docker resources in Docker Desktop settings\n\n#### Port Conflicts\n\n**Symptom**: `Error: port is already allocated`\n\n**Solutions**:\n\n1. Find conflicting process:\n\n ```bash\n lsof -i :9090\n ```\n\n2. Change port in `.env`:\n\n ```bash\n ORCHESTRATOR_PORT=9080\n ```\n\n3. Restart deployment:\n\n ```bash\n docker-compose down\n docker-compose up -d\n ```\n\n#### Health Checks Failing\n\n**Symptom**: Health check script reports unhealthy services\n\n**Solutions**:\n\n1. Check service logs:\n\n ```bash\n docker-compose logs -f \n ```\n\n2. Verify network connectivity:\n\n ```bash\n docker network inspect provisioning-net\n ```\n\n3. Check firewall rules:\n\n ```bash\n sudo ufw status\n ```\n\n4. Wait longer for services to start:\n\n ```bash\n ./scripts/deploy-platform.nu --wait 600\n ```\n\n#### Database Connection Errors\n\n**Symptom**: PostgreSQL connection refused\n\n**Solutions**:\n\n1. Check PostgreSQL health:\n\n ```bash\n docker exec provisioning-postgres pg_isready\n ```\n\n2. Verify credentials in `.env`:\n\n ```bash\n grep POSTGRES_ .env\n ```\n\n3. Check PostgreSQL logs:\n\n ```bash\n docker-compose logs postgres\n ```\n\n4. Recreate database:\n\n ```bash\n docker-compose down\n docker volume rm provisioning_postgres-data\n docker-compose up -d\n ```\n\n#### Out of Disk Space\n\n**Symptom**: No space left on device\n\n**Solutions**:\n\n1. Clean Docker volumes:\n\n ```bash\n docker volume prune\n ```\n\n2. Clean Docker images:\n\n ```bash\n docker image prune -a\n ```\n\n3. 
Check volume sizes:\n\n ```bash\n docker system df -v\n ```\n\n### Getting Help\n\n- **Logs**: Always check logs first: `docker-compose logs -f`\n- **Health**: Run health check: `./scripts/health-check.nu --json`\n- **Documentation**: See `docs/` directory\n- **Issues**: File bug reports at GitHub repository\n\n---\n\n## Security Best Practices\n\n### 1. Secret Management\n\n- **Never commit** `.env` files to version control\n- Use `./scripts/generate-secrets.nu` to generate strong secrets\n- Rotate secrets regularly\n- Use KMS in enterprise mode\n\n### 2. Network Security\n\n- Use TLS/SSL in production (enterprise mode)\n- Configure firewall rules:\n\n ```bash\n sudo ufw allow 80/tcp\n sudo ufw allow 443/tcp\n sudo ufw enable\n ```\n\n- Use private networks for backend services\n\n### 3. Access Control\n\n- Enable authentication in multi-user mode\n- Use strong passwords (16+ characters)\n- Configure API keys for CI/CD access\n- Enable audit logging in enterprise mode\n\n### 4. Regular Updates\n\n```\n# Pull latest images\ndocker-compose pull\n\n# Rebuild with updates\n./scripts/deploy-platform.nu --pull --build\n```\n\n---\n\n## Backup and Recovery\n\n### Backup\n\n```\n# Backup volumes\ndocker run --rm -v provisioning_orchestrator-data:/data \\n -v $(pwd)/backup:/backup \\n alpine tar czf /backup/orchestrator-data.tar.gz -C /data .\n\n# Backup PostgreSQL\ndocker exec provisioning-postgres pg_dumpall -U provisioning > backup/postgres-backup.sql\n```\n\n### Restore\n\n```\n# Restore volume\ndocker run --rm -v provisioning_orchestrator-data:/data \\n -v $(pwd)/backup:/backup \\n alpine tar xzf /backup/orchestrator-data.tar.gz -C /data\n\n# Restore PostgreSQL\ndocker exec -i provisioning-postgres psql -U provisioning < backup/postgres-backup.sql\n```\n\n---\n\n## Maintenance\n\n### Updates\n\n```\n# Pull latest images\ndocker-compose pull\n\n# Recreate containers\ndocker-compose up -d --force-recreate\n\n# Remove old images\ndocker image prune\n```\n\n### Monitoring\n\n- **Prometheus**: \n- **Grafana**: \n- **Logs**: `docker-compose logs -f`\n\n### Health Checks\n\n```\n# Automated health check\n./scripts/health-check.nu\n\n# Manual checks\ncurl http://localhost:9090/health\ncurl http://localhost:8081/health\n```\n\n---\n\n## Next Steps\n\n- [Production Deployment Guide](./PRODUCTION_DEPLOYMENT.md)\n- [Kubernetes Deployment Guide](./KUBERNETES_DEPLOYMENT.md)\n- [Docker Compose Reference](./DOCKER_COMPOSE_REFERENCE.md)\n- [Monitoring Setup](./MONITORING_SETUP.md)\n- [Security Hardening](./SECURITY_HARDENING.md)\n\n---\n\n**Documentation Version**: 1.0.0\n**Last Updated**: 2025-10-06\n**Maintained By**: Platform Team \ No newline at end of file diff --git a/docs/deployment/guide.md b/docs/deployment/guide.md index 59aee60..eea4526 100644 --- a/docs/deployment/guide.md +++ b/docs/deployment/guide.md @@ -1,468 +1 @@ -# Provisioning Platform - Deployment Guide - -**Last Updated**: 2025-10-07 -**Platform**: macOS (Apple Silicon / Intel) + OrbStack/Docker - ---- - -## ✅ Fixed: Docker Builds - -Docker builds have been **fixed** to properly handle the Rust workspace structure. Both deployment methods (Native and Docker) are now fully -supported. - -**Note**: Docker builds use Rust nightly to support edition2024 (required by async-graphql 7.x from surrealdb). -RocksDB has been replaced with SurrealDB in-memory backend (kv-mem) to simplify Docker builds (no libclang requirement). 
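A quick way to sanity-check the kv-mem switch locally (a minimal sketch; it assumes the crate is named `control-center` and that surrealdb's `kv-mem` feature is enabled in the workspace):

```bash
# Sketch: a clean build should succeed without libclang once RocksDB is gone,
# since libclang was only needed by RocksDB's bindgen step.
# `-p control-center` is an assumed package name; adjust to the workspace layout.
unset LIBCLANG_PATH
cargo build --release -p control-center
```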
- ---- - -## 📦 Quick Start - -### Prerequisites - -**For Native Deployment:** - -- Rust 1.75+: `brew install rust` -- Nushell 0.107+: `brew install nushell` - -**For Docker Deployment:** - -- OrbStack (recommended): -- Or Docker Desktop: `brew install --cask docker` - ---- - -## 🚀 Deployment Methods - -### Method 1: Native Execution (Recommended for Development) - -**Fastest startup, easiest debugging, direct access to logs** - -```bash -cd provisioning/platform/scripts - -# 1. Build all services -nu run-native.nu build - -# 2. Start all services in background -nu run-native.nu start-all --background - -# 3. Check status -nu run-native.nu status - -# 4. View logs -nu run-native.nu logs orchestrator --follow - -# 5. Stop all -nu run-native.nu stop-all -```text - -**Services will run on:** - -- Orchestrator: -- Control Center: - -**Data stored in:** - -- `~/.provisioning-platform/data/` -- `~/.provisioning-platform/logs/` - ---- - -### Method 2: Docker Execution (Recommended for Production-like Testing) - -**Isolated environments, easy cleanup, supports all deployment modes** - -```bash -cd provisioning/platform/scripts - -# 1. Build Docker images (Solo mode) -nu run-docker.nu build solo - -# 2. Start services in background -nu run-docker.nu start solo --detach - -# 3. Check status -nu run-docker.nu status - -# 4. View logs -nu run-docker.nu logs orchestrator --follow - -# 5. Stop all -nu run-docker.nu stop -```text - -**Deployment Modes:** - -- `solo` - 2 CPU / 4GB RAM (dev/test) -- `multiuser` - 4 CPU / 8GB RAM (team) -- `cicd` - 8 CPU / 16GB RAM (automation) -- `enterprise` - 16 CPU / 32GB RAM (production + KMS) - ---- - -## 📋 Complete Command Reference - -### Native Execution (`run-native.nu`) - -| Command | Description | -| --------- | ------------- | -| `build` | Build all services | -| `start ` | Start orchestrator or control_center | -| `start-all` | Start all services | -| `stop ` | Stop a specific service | -| `stop-all` | Stop all services | -| `status` | Show service status | -| `logs ` | Show logs (add `--follow`) | -| `health` | Check service health | - -**Examples:** - -```bash -nu run-native.nu build -nu run-native.nu start orchestrator --background -nu run-native.nu start control_center --background -nu run-native.nu logs orchestrator --follow -nu run-native.nu health -nu run-native.nu stop-all -```text - ---- - -### Docker Execution (`run-docker.nu`) - -| Command | Description | -| --------- | ------------- | -| `build [mode]` | Build Docker images | -| `start [mode]` | Start services (add `--detach`) | -| `stop` | Stop all services (add `--volumes` to delete data) | -| `restart [mode]` | Restart services | -| `status` | Show container status | -| `logs ` | Show logs (add `--follow`) | -| `exec ` | Execute command in container | -| `stats` | Show resource usage | -| `health` | Check service health | -| `config [mode]` | Show docker-compose config | -| `clean` | Remove containers (add `--all` for images too) | - -**Examples:** - -```bash -# Solo mode (fastest) -nu run-docker.nu build solo -nu run-docker.nu start solo --detach - -# Enterprise mode (with KMS) -nu run-docker.nu build enterprise -nu run-docker.nu start enterprise --detach - -# Operations -nu run-docker.nu status -nu run-docker.nu logs control-center --follow -nu run-docker.nu exec orchestrator bash -nu run-docker.nu stats -nu run-docker.nu stop -```text - ---- - -## 🗄️ Database Information - -### Control-Center Database - -**Type**: SurrealDB with in-memory backend (kv-mem) -**Location**: In-memory (data 
persisted during container/process lifetime) -**Production Alternative**: SurrealDB with remote WebSocket connection for persistent storage - -**No separate database server required** - SurrealDB in-memory backend is embedded in the control-center process. - -### Orchestrator Storage - -**Type**: Filesystem queue (default) -**Location**: - -- Native: `~/.provisioning-platform/data/orchestrator/queue.rkvs` -- Docker: `/data/queue.rkvs` (inside container) - -**Production Option**: Switch to SurrealDB via config for distributed deployments. - ---- - -## ⚙️ Configuration Loading - -Services load configuration in this order (priority: low → high): - -1. **System Defaults** - `provisioning/config/config.defaults.toml` -2. **Service Defaults** - `provisioning/platform/{service}/config.defaults.toml` -3. **Workspace Config** - `workspace/{name}/config/provisioning.yaml` -4. **User Config** - `~/Library/Application Support/provisioning/user_config.yaml` -5. **Environment Variables** - `CONTROL_CENTER_*`, `ORCHESTRATOR_*` -6. **Runtime Overrides** - `--config` flag - -**See full documentation**: `docs/architecture/DATABASE_AND_CONFIG_ARCHITECTURE.md` - ---- - -## 🐛 Troubleshooting - -### Native Deployment Issues - -**Build fails:** - -```bash -# Clean and rebuild -cd provisioning/platform -cargo clean -cargo build --release -```text - -**Port already in use:** - -```bash -# Check what's using the port -lsof -i :8080 -lsof -i :8081 - -# Kill the process or use different ports via environment variables -export ORCHESTRATOR_SERVER_PORT=8090 -export CONTROL_CENTER_SERVER_PORT=8091 -```text - -**Service won't start:** - -```bash -# Check logs for errors -nu run-native.nu logs orchestrator - -# Run in foreground to see output -nu run-native.nu start orchestrator -```text - ---- - -### Docker Deployment Issues - -**Build fails with workspace errors:** - -- **Fixed!** Dockerfiles now properly handle workspace structure -- If still failing: `nu run-docker.nu build solo --no-cache` - -**Containers won't start:** - -```bash -# Check container logs -nu run-docker.nu logs orchestrator - -# Check Docker daemon -docker ps -docker info - -# Restart Docker/OrbStack -```text - -**Port conflicts:** - -```bash -# Check what's using ports -lsof -i :8080 -lsof -i :8081 - -# Stop conflicting services or modify docker-compose.yaml ports -```text - -**Out of resources:** - -```bash -# Check current usage -nu run-docker.nu stats - -# Clean up unused containers/images -docker system prune -a - -# Or use the script -nu run-docker.nu clean --all -```text - ---- - -## 🔐 KMS Integration (Enterprise Mode) - -Enterprise mode includes Cosmian KMS for production-grade secret management. 
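After the start commands below bring the stack up, a one-line reachability check (the port matches the KMS entry in this guide's health-check section):

```bash
# Smoke-test the KMS health endpoint; -f makes curl fail on HTTP errors.
curl -fsS http://localhost:9998/health && echo "KMS is up"
```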
- -**Start with KMS:** - -```bash -nu run-docker.nu build enterprise -nu run-docker.nu start enterprise --detach -```text - -**Access KMS:** - -- KMS API: -- KMS Health: - -**KMS Features:** - -- SSL certificate lifecycle management -- SSH private key rotation -- Cloud credential auto-refresh -- Audit trails -- Automatic key rotation - -**See full KMS documentation**: `provisioning/platform/control-center/src/kms/README.md` - ---- - -## 📊 Monitoring - -### Health Checks - -**Native:** - -```bash -nu run-native.nu health -```text - -**Docker:** - -```bash -nu run-docker.nu health -```text - -**Manual:** - -```bash -curl http://localhost:8080/health # Orchestrator -curl http://localhost:8081/health # Control Center -curl http://localhost:9998/health # KMS (enterprise only) -```text - -### Resource Usage - -**Docker:** - -```bash -nu run-docker.nu stats -```text - -**Native:** - -```bash -ps aux | grep -E "provisioning-orchestrator|control-center" -top -pid -```text - ---- - -## 🧪 Testing Both Methods - -### Test Native Deployment - -```bash -cd provisioning/platform/scripts - -# 1. Build -nu run-native.nu build - -# 2. Start services -nu run-native.nu start-all --background - -# 3. Verify -nu run-native.nu status -nu run-native.nu health - -# 4. Test API -curl http://localhost:8080/health -curl http://localhost:8081/health - -# 5. Clean up -nu run-native.nu stop-all -```text - -### Test Docker Deployment - -```bash -cd provisioning/platform/scripts - -# 1. Build -nu run-docker.nu build solo - -# 2. Start services -nu run-docker.nu start solo --detach - -# 3. Verify -nu run-docker.nu status -nu run-docker.nu health - -# 4. Test API -curl http://localhost:8080/health -curl http://localhost:8081/health - -# 5. Clean up -nu run-docker.nu stop --volumes -```text - ---- - -## 🎯 Best Practices - -### Development Workflow - -1. **Use Native for Active Development** - - Faster iteration (no Docker rebuild) - - Direct log access - - Easy debugging with IDE - -2. **Use Docker for Integration Testing** - - Test deployment configurations - - Verify Docker builds - - Simulate production environment - -### Production Deployment - -1. **Use Docker/Kubernetes** - - Isolated environments - - Easy scaling - - Standard deployment - -2. 
**Use Enterprise Mode** - - KMS for secret management - - Full monitoring stack - - High availability - ---- - -## 📚 Related Documentation - -- **Database Architecture**: `docs/architecture/DATABASE_AND_CONFIG_ARCHITECTURE.md` -- **KMS Integration**: `provisioning/platform/control-center/src/kms/README.md` -- **Configuration System**: `.claude/features/configuration-system.md` -- **Workspace Switching**: `.claude/features/workspace-switching.md` -- **Orchestrator Architecture**: `.claude/features/orchestrator-architecture.md` - ---- - -## ✅ Summary - -### Native Execution - -- ✅ **Fixed**: Workspace builds work correctly -- ✅ **Fast**: No container overhead -- ✅ **Simple**: Direct binary execution -- ✅ **Best for**: Development, debugging - -### Docker Execution - -- ✅ **Fixed**: Dockerfiles now workspace-aware -- ✅ **Isolated**: Clean environments -- ✅ **Flexible**: Multiple deployment modes -- ✅ **Best for**: Testing, production-like deployments - -**Both methods fully supported and tested!** - ---- - -**Quick Links:** - -- Native Script: `provisioning/platform/scripts/run-native.nu` -- Docker Script: `provisioning/platform/scripts/run-docker.nu` -- Docker Files: `provisioning/platform/docker-compose.yaml` + mode-specific overrides +# Provisioning Platform - Deployment Guide\n\n**Last Updated**: 2025-10-07\n**Platform**: macOS (Apple Silicon / Intel) + OrbStack/Docker\n\n---\n\n## ✅ Fixed: Docker Builds\n\nDocker builds have been **fixed** to properly handle the Rust workspace structure. Both deployment methods (Native and Docker) are now fully \nsupported.\n\n**Note**: Docker builds use Rust nightly to support edition2024 (required by async-graphql 7.x from surrealdb).\nRocksDB has been replaced with SurrealDB in-memory backend (kv-mem) to simplify Docker builds (no libclang requirement).\n\n---\n\n## 📦 Quick Start\n\n### Prerequisites\n\n**For Native Deployment:**\n\n- Rust 1.75+: `brew install rust`\n- Nushell 0.107+: `brew install nushell`\n\n**For Docker Deployment:**\n\n- OrbStack (recommended): \n- Or Docker Desktop: `brew install --cask docker`\n\n---\n\n## 🚀 Deployment Methods\n\n### Method 1: Native Execution (Recommended for Development)\n\n**Fastest startup, easiest debugging, direct access to logs**\n\n```\ncd provisioning/platform/scripts\n\n# 1. Build all services\nnu run-native.nu build\n\n# 2. Start all services in background\nnu run-native.nu start-all --background\n\n# 3. Check status\nnu run-native.nu status\n\n# 4. View logs\nnu run-native.nu logs orchestrator --follow\n\n# 5. Stop all\nnu run-native.nu stop-all\n```\n\n**Services will run on:**\n\n- Orchestrator: \n- Control Center: \n\n**Data stored in:**\n\n- `~/.provisioning-platform/data/`\n- `~/.provisioning-platform/logs/`\n\n---\n\n### Method 2: Docker Execution (Recommended for Production-like Testing)\n\n**Isolated environments, easy cleanup, supports all deployment modes**\n\n```\ncd provisioning/platform/scripts\n\n# 1. Build Docker images (Solo mode)\nnu run-docker.nu build solo\n\n# 2. Start services in background\nnu run-docker.nu start solo --detach\n\n# 3. Check status\nnu run-docker.nu status\n\n# 4. View logs\nnu run-docker.nu logs orchestrator --follow\n\n# 5. 
Stop all\nnu run-docker.nu stop\n```\n\n**Deployment Modes:**\n\n- `solo` - 2 CPU / 4GB RAM (dev/test)\n- `multiuser` - 4 CPU / 8GB RAM (team)\n- `cicd` - 8 CPU / 16GB RAM (automation)\n- `enterprise` - 16 CPU / 32GB RAM (production + KMS)\n\n---\n\n## 📋 Complete Command Reference\n\n### Native Execution (`run-native.nu`)\n\n| Command | Description |\n| --------- | ------------- |\n| `build` | Build all services |\n| `start ` | Start orchestrator or control_center |\n| `start-all` | Start all services |\n| `stop ` | Stop a specific service |\n| `stop-all` | Stop all services |\n| `status` | Show service status |\n| `logs ` | Show logs (add `--follow`) |\n| `health` | Check service health |\n\n**Examples:**\n\n```\nnu run-native.nu build\nnu run-native.nu start orchestrator --background\nnu run-native.nu start control_center --background\nnu run-native.nu logs orchestrator --follow\nnu run-native.nu health\nnu run-native.nu stop-all\n```\n\n---\n\n### Docker Execution (`run-docker.nu`)\n\n| Command | Description |\n| --------- | ------------- |\n| `build [mode]` | Build Docker images |\n| `start [mode]` | Start services (add `--detach`) |\n| `stop` | Stop all services (add `--volumes` to delete data) |\n| `restart [mode]` | Restart services |\n| `status` | Show container status |\n| `logs ` | Show logs (add `--follow`) |\n| `exec ` | Execute command in container |\n| `stats` | Show resource usage |\n| `health` | Check service health |\n| `config [mode]` | Show docker-compose config |\n| `clean` | Remove containers (add `--all` for images too) |\n\n**Examples:**\n\n```\n# Solo mode (fastest)\nnu run-docker.nu build solo\nnu run-docker.nu start solo --detach\n\n# Enterprise mode (with KMS)\nnu run-docker.nu build enterprise\nnu run-docker.nu start enterprise --detach\n\n# Operations\nnu run-docker.nu status\nnu run-docker.nu logs control-center --follow\nnu run-docker.nu exec orchestrator bash\nnu run-docker.nu stats\nnu run-docker.nu stop\n```\n\n---\n\n## 🗄️ Database Information\n\n### Control-Center Database\n\n**Type**: SurrealDB with in-memory backend (kv-mem)\n**Location**: In-memory (data persisted during container/process lifetime)\n**Production Alternative**: SurrealDB with remote WebSocket connection for persistent storage\n\n**No separate database server required** - SurrealDB in-memory backend is embedded in the control-center process.\n\n### Orchestrator Storage\n\n**Type**: Filesystem queue (default)\n**Location**:\n\n- Native: `~/.provisioning-platform/data/orchestrator/queue.rkvs`\n- Docker: `/data/queue.rkvs` (inside container)\n\n**Production Option**: Switch to SurrealDB via config for distributed deployments.\n\n---\n\n## ⚙️ Configuration Loading\n\nServices load configuration in this order (priority: low → high):\n\n1. **System Defaults** - `provisioning/config/config.defaults.toml`\n2. **Service Defaults** - `provisioning/platform/{service}/config.defaults.toml`\n3. **Workspace Config** - `workspace/{name}/config/provisioning.yaml`\n4. **User Config** - `~/Library/Application Support/provisioning/user_config.yaml`\n5. **Environment Variables** - `CONTROL_CENTER_*`, `ORCHESTRATOR_*`\n6. 
**Runtime Overrides** - `--config` flag\n\n**See full documentation**: `docs/architecture/DATABASE_AND_CONFIG_ARCHITECTURE.md`\n\n---\n\n## 🐛 Troubleshooting\n\n### Native Deployment Issues\n\n**Build fails:**\n\n```\n# Clean and rebuild\ncd provisioning/platform\ncargo clean\ncargo build --release\n```\n\n**Port already in use:**\n\n```\n# Check what's using the port\nlsof -i :8080\nlsof -i :8081\n\n# Kill the process or use different ports via environment variables\nexport ORCHESTRATOR_SERVER_PORT=8090\nexport CONTROL_CENTER_SERVER_PORT=8091\n```\n\n**Service won't start:**\n\n```\n# Check logs for errors\nnu run-native.nu logs orchestrator\n\n# Run in foreground to see output\nnu run-native.nu start orchestrator\n```\n\n---\n\n### Docker Deployment Issues\n\n**Build fails with workspace errors:**\n\n- **Fixed!** Dockerfiles now properly handle workspace structure\n- If still failing: `nu run-docker.nu build solo --no-cache`\n\n**Containers won't start:**\n\n```\n# Check container logs\nnu run-docker.nu logs orchestrator\n\n# Check Docker daemon\ndocker ps\ndocker info\n\n# Restart Docker/OrbStack\n```\n\n**Port conflicts:**\n\n```\n# Check what's using ports\nlsof -i :8080\nlsof -i :8081\n\n# Stop conflicting services or modify docker-compose.yaml ports\n```\n\n**Out of resources:**\n\n```\n# Check current usage\nnu run-docker.nu stats\n\n# Clean up unused containers/images\ndocker system prune -a\n\n# Or use the script\nnu run-docker.nu clean --all\n```\n\n---\n\n## 🔐 KMS Integration (Enterprise Mode)\n\nEnterprise mode includes Cosmian KMS for production-grade secret management.\n\n**Start with KMS:**\n\n```\nnu run-docker.nu build enterprise\nnu run-docker.nu start enterprise --detach\n```\n\n**Access KMS:**\n\n- KMS API: \n- KMS Health: \n\n**KMS Features:**\n\n- SSL certificate lifecycle management\n- SSH private key rotation\n- Cloud credential auto-refresh\n- Audit trails\n- Automatic key rotation\n\n**See full KMS documentation**: `provisioning/platform/control-center/src/kms/README.md`\n\n---\n\n## 📊 Monitoring\n\n### Health Checks\n\n**Native:**\n\n```\nnu run-native.nu health\n```\n\n**Docker:**\n\n```\nnu run-docker.nu health\n```\n\n**Manual:**\n\n```\ncurl http://localhost:8080/health # Orchestrator\ncurl http://localhost:8081/health # Control Center\ncurl http://localhost:9998/health # KMS (enterprise only)\n```\n\n### Resource Usage\n\n**Docker:**\n\n```\nnu run-docker.nu stats\n```\n\n**Native:**\n\n```\nps aux | grep -E "provisioning-orchestrator|control-center"\ntop -pid \n```\n\n---\n\n## 🧪 Testing Both Methods\n\n### Test Native Deployment\n\n```\ncd provisioning/platform/scripts\n\n# 1. Build\nnu run-native.nu build\n\n# 2. Start services\nnu run-native.nu start-all --background\n\n# 3. Verify\nnu run-native.nu status\nnu run-native.nu health\n\n# 4. Test API\ncurl http://localhost:8080/health\ncurl http://localhost:8081/health\n\n# 5. Clean up\nnu run-native.nu stop-all\n```\n\n### Test Docker Deployment\n\n```\ncd provisioning/platform/scripts\n\n# 1. Build\nnu run-docker.nu build solo\n\n# 2. Start services\nnu run-docker.nu start solo --detach\n\n# 3. Verify\nnu run-docker.nu status\nnu run-docker.nu health\n\n# 4. Test API\ncurl http://localhost:8080/health\ncurl http://localhost:8081/health\n\n# 5. Clean up\nnu run-docker.nu stop --volumes\n```\n\n---\n\n## 🎯 Best Practices\n\n### Development Workflow\n\n1. **Use Native for Active Development**\n - Faster iteration (no Docker rebuild)\n - Direct log access\n - Easy debugging with IDE\n\n2. 
**Use Docker for Integration Testing**\n - Test deployment configurations\n - Verify Docker builds\n - Simulate production environment\n\n### Production Deployment\n\n1. **Use Docker/Kubernetes**\n - Isolated environments\n - Easy scaling\n - Standard deployment\n\n2. **Use Enterprise Mode**\n - KMS for secret management\n - Full monitoring stack\n - High availability\n\n---\n\n## 📚 Related Documentation\n\n- **Database Architecture**: `docs/architecture/DATABASE_AND_CONFIG_ARCHITECTURE.md`\n- **KMS Integration**: `provisioning/platform/control-center/src/kms/README.md`\n- **Configuration System**: `.claude/features/configuration-system.md`\n- **Workspace Switching**: `.claude/features/workspace-switching.md`\n- **Orchestrator Architecture**: `.claude/features/orchestrator-architecture.md`\n\n---\n\n## ✅ Summary\n\n### Native Execution\n\n- ✅ **Fixed**: Workspace builds work correctly\n- ✅ **Fast**: No container overhead\n- ✅ **Simple**: Direct binary execution\n- ✅ **Best for**: Development, debugging\n\n### Docker Execution\n\n- ✅ **Fixed**: Dockerfiles now workspace-aware\n- ✅ **Isolated**: Clean environments\n- ✅ **Flexible**: Multiple deployment modes\n- ✅ **Best for**: Testing, production-like deployments\n\n**Both methods fully supported and tested!**\n\n---\n\n**Quick Links:**\n\n- Native Script: `provisioning/platform/scripts/run-native.nu`\n- Docker Script: `provisioning/platform/scripts/run-docker.nu`\n- Docker Files: `provisioning/platform/docker-compose.yaml` + mode-specific overrides \ No newline at end of file diff --git a/docs/deployment/known-issues.md b/docs/deployment/known-issues.md index c04ff2e..c800597 100644 --- a/docs/deployment/known-issues.md +++ b/docs/deployment/known-issues.md @@ -1,97 +1 @@ -# Known Issues - Provisioning Platform - -## Control-Center Requires Rust Nightly (Edition 2024 Dependency) - -**Status**: Resolved (using nightly) -**Severity**: Low -**Affects**: Docker deployment only -**Date Reported**: 2025-10-07 -**Date Resolved**: 2025-10-07 - -### Issue - -Control-center Docker builds fail with the following error: - -```plaintext -feature 'edition2024' is required -this Cargo does not support nightly features, but if you -switch to nightly channel you can add -`cargo-features = ["edition2024"]` to enable this feature -```text - -### Root Cause - -Dependency chain: - -```plaintext -control-center → surrealdb 2.3.10 → surrealdb-core 2.3.10 → async-graphql 7.0.17 -```text - -The `async-graphql-value` crate v7.0.17 requires Rust edition 2024, which is not yet stable in Rust 1.82. -Edition 2024 is currently only available in Rust nightly builds. - -### Resolution - -**Updated Dockerfiles to use Rust nightly** (2025-10-07): - -Both `orchestrator/Dockerfile` and `control-center/Dockerfile` now use: - -```dockerfile -FROM rustlang/rust:nightly-bookworm AS builder -```text - -This provides edition2024 support required by the surrealdb dependency chain. 
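The Production Considerations below recommend pinning a dated nightly for reproducible builds; a sketch of one way to do that with rustup (the date is illustrative, not a tested pin):

```bash
# Pin a specific nightly so every rebuild uses the same compiler.
rustup toolchain install nightly-2025-10-07
rustup override set nightly-2025-10-07   # or commit an equivalent rust-toolchain.toml
```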
- -### Production Considerations - -**Rust Nightly Stability**: - -- Nightly builds are generally stable for compilation -- The compiled binaries are production-ready -- Runtime behavior is not affected by nightly vs stable compilation -- Consider pinning to a specific nightly date for reproducible builds - -**Alternative**: Use native deployment with stable Rust if nightly is a concern: - -```bash -cd provisioning/platform/scripts -nu run-native.nu build -nu run-native.nu start-all --background -```text - -### Timeline - -- **Rust 1.85** (estimated Feb 2025): Expected to stabilize edition 2024 -- **SurrealDB 3.x**: May drop async-graphql dependency - -### Tracking - -- Rust Edition 2024 RFC: -- SurrealDB Issue: -- async-graphql Issue: - -### Related Files - -- `provisioning/platform/control-center/Dockerfile` -- `provisioning/platform/Cargo.toml` (workspace dependencies) -- `provisioning/platform/control-center/Cargo.toml` - ---- - -## RocksDB Build Takes Long Time - -**Status**: Known Limitation -**Severity**: Low -**Affects**: All builds - -### Issue - -RocksDB compilation takes 30-60 seconds during builds. - -### Workaround - -Use cached Docker layers or native builds with incremental compilation. - ---- - -**Last Updated**: 2025-10-07 +# Known Issues - Provisioning Platform\n\n## Control-Center Requires Rust Nightly (Edition 2024 Dependency)\n\n**Status**: Resolved (using nightly)\n**Severity**: Low\n**Affects**: Docker deployment only\n**Date Reported**: 2025-10-07\n**Date Resolved**: 2025-10-07\n\n### Issue\n\nControl-center Docker builds fail with the following error:\n\n```\nfeature 'edition2024' is required\nthis Cargo does not support nightly features, but if you\nswitch to nightly channel you can add\n`cargo-features = ["edition2024"]` to enable this feature\n```\n\n### Root Cause\n\nDependency chain:\n\n```\ncontrol-center → surrealdb 2.3.10 → surrealdb-core 2.3.10 → async-graphql 7.0.17\n```\n\nThe `async-graphql-value` crate v7.0.17 requires Rust edition 2024, which is not yet stable in Rust 1.82.\nEdition 2024 is currently only available in Rust nightly builds.\n\n### Resolution\n\n**Updated Dockerfiles to use Rust nightly** (2025-10-07):\n\nBoth `orchestrator/Dockerfile` and `control-center/Dockerfile` now use:\n\n```\nFROM rustlang/rust:nightly-bookworm AS builder\n```\n\nThis provides edition2024 support required by the surrealdb dependency chain.\n\n### Production Considerations\n\n**Rust Nightly Stability**:\n\n- Nightly builds are generally stable for compilation\n- The compiled binaries are production-ready\n- Runtime behavior is not affected by nightly vs stable compilation\n- Consider pinning to a specific nightly date for reproducible builds\n\n**Alternative**: Use native deployment with stable Rust if nightly is a concern:\n\n```\ncd provisioning/platform/scripts\nnu run-native.nu build\nnu run-native.nu start-all --background\n```\n\n### Timeline\n\n- **Rust 1.85** (estimated Feb 2025): Expected to stabilize edition 2024\n- **SurrealDB 3.x**: May drop async-graphql dependency\n\n### Tracking\n\n- Rust Edition 2024 RFC: \n- SurrealDB Issue: \n- async-graphql Issue: \n\n### Related Files\n\n- `provisioning/platform/control-center/Dockerfile`\n- `provisioning/platform/Cargo.toml` (workspace dependencies)\n- `provisioning/platform/control-center/Cargo.toml`\n\n---\n\n## RocksDB Build Takes Long Time\n\n**Status**: Known Limitation\n**Severity**: Low\n**Affects**: All builds\n\n### Issue\n\nRocksDB compilation takes 30-60 seconds during builds.\n\n### 
Workaround\n\nUse cached Docker layers or native builds with incremental compilation.\n\n---\n\n**Last Updated**: 2025-10-07 \ No newline at end of file diff --git a/docs/guides/quick-start.md b/docs/guides/quick-start.md index 2a47eaa..7980811 100644 --- a/docs/guides/quick-start.md +++ b/docs/guides/quick-start.md @@ -1,282 +1 @@ -# Provisioning Platform - Quick Start - -Fast deployment guide for all modes. - ---- - -## Prerequisites - -```bash -# Verify Docker is installed and running -docker --version # 20.10+ -docker-compose --version # 2.0+ -docker ps # Should work without errors -```text - ---- - -## 1. Solo Mode (Local Development) - -**Services**: Orchestrator, Control Center, CoreDNS, OCI Registry, Extension Registry - -**Resources**: 2 CPU cores, 4GB RAM, 20GB disk - -```bash -cd /Users/Akasha/project-provisioning/provisioning/platform - -# Generate secrets -./scripts/generate-secrets.nu - -# Deploy -./scripts/deploy-platform.nu --mode solo - -# Verify -./scripts/health-check.nu - -# Access -open http://localhost:8080 # Orchestrator -open http://localhost:8081 # Control Center -```text - -**Stop**: - -```bash -docker-compose down -```text - ---- - -## 2. Multi-User Mode (Team Collaboration) - -**Services**: Solo + Gitea, PostgreSQL - -**Resources**: 4 CPU cores, 8GB RAM, 50GB disk - -```bash -cd /Users/Akasha/project-provisioning/provisioning/platform - -# Generate secrets -./scripts/generate-secrets.nu - -# Deploy -./scripts/deploy-platform.nu --mode multi-user - -# Verify -./scripts/health-check.nu - -# Access -open http://localhost:3000 # Gitea -open http://localhost:8081 # Control Center -```text - -**Configure Gitea**: - -1. Visit -2. Complete initial setup wizard -3. Create admin account - ---- - -## 3. CI/CD Mode (Automated Pipelines) - -**Services**: Multi-User + API Server, Jenkins (optional), GitLab Runner (optional) - -**Resources**: 8 CPU cores, 16GB RAM, 100GB disk - -```bash -cd /Users/Akasha/project-provisioning/provisioning/platform - -# Generate secrets -./scripts/generate-secrets.nu - -# Deploy -./scripts/deploy-platform.nu --mode cicd --build - -# Verify -./scripts/health-check.nu - -# Access -open http://localhost:8083 # API Server -```text - ---- - -## 4. 
Enterprise Mode (Production) - -**Services**: Full stack (15+ services) - -**Resources**: 16 CPU cores, 32GB RAM, 500GB disk - -```bash -cd /Users/Akasha/project-provisioning/provisioning/platform - -# Generate production secrets -./scripts/generate-secrets.nu --output .env.production - -# Review and customize -nano .env.production - -# Deploy with build -./scripts/deploy-platform.nu --mode enterprise \ - --env-file .env.production \ - --build \ - --wait 600 - -# Verify -./scripts/health-check.nu - -# Access -open http://localhost:3001 # Grafana (admin / password from .env) -open http://localhost:9090 # Prometheus -open http://localhost:5601 # Kibana -``` - ---- - -## Common Commands - -### View Logs - -```bash -docker-compose logs -f -docker-compose logs -f orchestrator -docker-compose logs --tail=100 orchestrator -``` - -### Restart Services - -```bash -docker-compose restart orchestrator -docker-compose restart -``` - -### Update Platform - -```bash -docker-compose pull -./scripts/deploy-platform.nu --mode <mode> --pull -``` - -### Stop Platform - -```bash -docker-compose down -``` - -### Clean Everything (WARNING: data loss) - -```bash -docker-compose down --volumes -``` - ---- - -## Systemd (Linux Production) - -```bash -# Install services -cd systemd -sudo ./install-services.sh - -# Enable and start -sudo systemctl enable --now provisioning-platform - -# Check status -sudo systemctl status provisioning-platform - -# View logs -sudo journalctl -u provisioning-platform -f - -# Restart -sudo systemctl restart provisioning-platform - -# Stop -sudo systemctl stop provisioning-platform -``` - ---- - -## Troubleshooting - -### Services not starting - -```bash -# Check Docker -systemctl status docker - -# Check logs -docker-compose logs orchestrator - -# Check resources -docker stats -``` - -### Port conflicts - -```bash -# Find what's using port -lsof -i :8080 - -# Change port in .env -nano .env -# Set ORCHESTRATOR_PORT=9080 - -# Restart -docker-compose down && docker-compose up -d -``` - -### Health checks failing - -```bash -# Check individual service -curl http://localhost:8080/health - -# Wait longer -./scripts/deploy-platform.nu --wait 600 - -# Check networks -docker network inspect provisioning-net -``` - ---- - -## Access URLs - -### Solo Mode - -- Orchestrator: <http://localhost:8080> -- Control Center: <http://localhost:8081> -- OCI Registry: - -### Multi-User Mode - -- Gitea: <http://localhost:3000> -- PostgreSQL: localhost:5432 - -### CI/CD Mode - -- API Server: <http://localhost:8083> - -### Enterprise Mode - -- Prometheus: <http://localhost:9090> -- Grafana: <http://localhost:3001> -- Kibana: <http://localhost:5601> -- Nginx: - ---- - -## Next Steps - -- **Full Guide**: See `docs/deployment/deployment-guide.md` -- **Configuration**: Edit `.env` file for customization -- **Monitoring**: Access Grafana dashboards (enterprise mode) -- **API**: Use API Server for automation (CI/CD mode) - ---- - -**Need Help?** - -- Health Check: `./scripts/health-check.nu` -- Logs: `docker-compose logs -f` -- Documentation: `docs/deployment/` +# Provisioning Platform - Quick Start\n\nFast deployment guide for all modes.\n\n---\n\n## Prerequisites\n\n```\n# Verify Docker is installed and running\ndocker --version # 20.10+\ndocker-compose --version # 2.0+\ndocker ps # Should work without errors\n```\n\n---\n\n
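The three checks above can also be scripted as a single gate before any deployment; a minimal sketch mirroring the commands listed under Prerequisites:

```bash
# Fail fast if Docker or Compose is missing, or the daemon is unreachable
docker --version >/dev/null 2>&1 || { echo "docker not installed (need 20.10+)"; exit 1; }
docker-compose --version >/dev/null 2>&1 || { echo "docker-compose not installed (need 2.0+)"; exit 1; }
docker ps >/dev/null 2>&1 || { echo "docker daemon not reachable"; exit 1; }
echo "prerequisites OK"
```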
## 1. Solo Mode (Local Development)\n\n**Services**: Orchestrator, Control Center, CoreDNS, OCI Registry, Extension Registry\n\n**Resources**: 2 CPU cores, 4GB RAM, 20GB disk\n\n```\ncd /Users/Akasha/project-provisioning/provisioning/platform\n\n# Generate secrets\n./scripts/generate-secrets.nu\n\n# Deploy\n./scripts/deploy-platform.nu --mode solo\n\n# Verify\n./scripts/health-check.nu\n\n# Access\nopen http://localhost:8080 # Orchestrator\nopen http://localhost:8081 # Control Center\n```\n\n**Stop**:\n\n```\ndocker-compose down\n```\n\n---\n\n## 2. Multi-User Mode (Team Collaboration)\n\n**Services**: Solo + Gitea, PostgreSQL\n\n**Resources**: 4 CPU cores, 8GB RAM, 50GB disk\n\n```\ncd /Users/Akasha/project-provisioning/provisioning/platform\n\n# Generate secrets\n./scripts/generate-secrets.nu\n\n# Deploy\n./scripts/deploy-platform.nu --mode multi-user\n\n# Verify\n./scripts/health-check.nu\n\n# Access\nopen http://localhost:3000 # Gitea\nopen http://localhost:8081 # Control Center\n```\n\n**Configure Gitea**:\n\n1. Visit <http://localhost:3000>\n2. Complete initial setup wizard\n3. Create admin account\n\n---\n\n## 3. CI/CD Mode (Automated Pipelines)\n\n**Services**: Multi-User + API Server, Jenkins (optional), GitLab Runner (optional)\n\n**Resources**: 8 CPU cores, 16GB RAM, 100GB disk\n\n```\ncd /Users/Akasha/project-provisioning/provisioning/platform\n\n# Generate secrets\n./scripts/generate-secrets.nu\n\n# Deploy\n./scripts/deploy-platform.nu --mode cicd --build\n\n# Verify\n./scripts/health-check.nu\n\n# Access\nopen http://localhost:8083 # API Server\n```\n\n---\n\n## 4. Enterprise Mode (Production)\n\n**Services**: Full stack (15+ services)\n\n**Resources**: 16 CPU cores, 32GB RAM, 500GB disk\n\n```\ncd /Users/Akasha/project-provisioning/provisioning/platform\n\n# Generate production secrets\n./scripts/generate-secrets.nu --output .env.production\n\n# Review and customize\nnano .env.production\n\n# Deploy with build\n./scripts/deploy-platform.nu --mode enterprise \\n --env-file .env.production \\n --build \\n --wait 600\n\n# Verify\n./scripts/health-check.nu\n\n# Access\nopen http://localhost:3001 # Grafana (admin / password from .env)\nopen http://localhost:9090 # Prometheus\nopen http://localhost:5601 # Kibana\n```\n\n---\n\n## Common Commands\n\n### View Logs\n\n```\ndocker-compose logs -f\ndocker-compose logs -f orchestrator\ndocker-compose logs --tail=100 orchestrator\n```\n\n### Restart Services\n\n```\ndocker-compose restart orchestrator\ndocker-compose restart\n```\n\n### Update Platform\n\n```\ndocker-compose pull\n./scripts/deploy-platform.nu --mode <mode> --pull\n```\n\n### Stop Platform\n\n```\ndocker-compose down\n```\n\n### Clean Everything (WARNING: data loss)\n\n```\ndocker-compose down --volumes\n```\n\n---\n\n## Systemd (Linux Production)\n\n```\n# Install services\ncd systemd\nsudo ./install-services.sh\n\n# Enable and start\nsudo systemctl enable --now provisioning-platform\n\n# Check status\nsudo systemctl status provisioning-platform\n\n# View logs\nsudo journalctl -u provisioning-platform -f\n\n# Restart\nsudo systemctl restart provisioning-platform\n\n# Stop\nsudo systemctl stop provisioning-platform\n```\n\n---\n\n## Troubleshooting\n\n### Services not starting\n\n```\n# Check Docker\nsystemctl status docker\n\n# Check logs\ndocker-compose logs orchestrator\n\n# Check resources\ndocker stats\n```\n\n### Port conflicts\n\n```\n# Find what's using port\nlsof -i :8080\n\n# Change port in .env\nnano .env\n# Set ORCHESTRATOR_PORT=9080\n\n# Restart\ndocker-compose down && docker-compose up -d\n```\n\n### Health checks failing\n\n```\n# Check individual service\ncurl http://localhost:8080/health\n\n# Wait longer\n./scripts/deploy-platform.nu --wait 600\n\n# Check networks\ndocker network inspect provisioning-net\n```\n\n---\n\n## Access URLs\n\n### Solo Mode\n\n- Orchestrator: <http://localhost:8080>\n- Control Center: <http://localhost:8081>\n- OCI Registry: \n\n### Multi-User Mode\n\n- Gitea: <http://localhost:3000>\n- PostgreSQL: localhost:5432\n\n### CI/CD Mode\n\n- API Server: <http://localhost:8083>\n\n### Enterprise Mode\n\n- Prometheus: <http://localhost:9090>\n- Grafana: <http://localhost:3001>\n- Kibana: <http://localhost:5601>\n- Nginx: \n\n---\n\n## Next Steps\n\n- **Full Guide**: See `docs/deployment/deployment-guide.md`\n- **Configuration**: Edit `.env` file for customization\n- **Monitoring**: Access Grafana dashboards (enterprise mode)\n- **API**: Use API Server for automation (CI/CD mode)\n\n---\n\n**Need Help?**\n\n- Health Check: `./scripts/health-check.nu`\n- Logs: `docker-compose logs -f`\n- Documentation: `docs/deployment/` \ No newline at end of file diff --git a/infrastructure/README.md b/infrastructure/README.md index 732b31e..79be132 100644 --- a/infrastructure/README.md +++ b/infrastructure/README.md @@ -1,18 +1 @@ -# Infrastructure - -Deployment and operational configuration for the Provisioning Platform. - -## Structure - -- **kubernetes/** - Kubernetes manifests and configurations -- **docker/** - Docker Compose and Dockerfile definitions -- **nginx/** - Nginx reverse proxy and load balancer configs -- **systemd/** - Systemd service definitions -- **monitoring/** - Monitoring stack (Prometheus, Grafana, etc.) -- **oci-registry/** - OCI Container Registry configuration -- **api-gateway/** - API Gateway service definitions -- **integrations/** - Third-party service integrations - -## Quick Start - -Each subdirectory contains its own documentation. Review the specific service README for deployment instructions. +# Infrastructure\n\nDeployment and operational configuration for the Provisioning Platform.\n\n## Structure\n\n- **kubernetes/** - Kubernetes manifests and configurations\n- **docker/** - Docker Compose and Dockerfile definitions\n- **nginx/** - Nginx reverse proxy and load balancer configs\n- **systemd/** - Systemd service definitions\n- **monitoring/** - Monitoring stack (Prometheus, Grafana, etc.)\n- **oci-registry/** - OCI Container Registry configuration\n- **api-gateway/** - API Gateway service definitions\n- **integrations/** - Third-party service integrations\n\n## Quick Start\n\nEach subdirectory contains its own documentation. Review the specific service README for deployment instructions. \ No newline at end of file diff --git a/infrastructure/oci-registry/README.md b/infrastructure/oci-registry/README.md index 9ba2a3e..895d393 100644 --- a/infrastructure/oci-registry/README.md +++ b/infrastructure/oci-registry/README.md @@ -1,817 +1 @@ -# OCI Registry Service - -Comprehensive OCI (Open Container Initiative) registry deployment and management for the provisioning system. -Supports multiple registry implementations: **Zot** (lightweight), **Harbor** (full-featured), -and **Distribution** (OCI reference implementation).
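Whichever implementation is chosen, a push/pull round-trip is a quick end-to-end check; a minimal sketch using the default port and the public test namespace described later in this README (adjust both if your setup differs, and run `docker login localhost:5000` first if pushes require authentication):

```bash
# Smoke-test the registry with a throwaway image
docker pull alpine:3.20
docker tag alpine:3.20 localhost:5000/provisioning-test/smoke:0.1
docker push localhost:5000/provisioning-test/smoke:0.1

# Remove the local copy and pull it back to verify the round-trip
docker rmi localhost:5000/provisioning-test/smoke:0.1
docker pull localhost:5000/provisioning-test/smoke:0.1
```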
- -## Table of Contents - -- [Overview](#overview) -- [Registry Types](#registry-types) -- [Quick Start](#quick-start) -- [Installation](#installation) -- [Configuration](#configuration) -- [Management](#management) -- [Namespaces](#namespaces) -- [Access Control](#access-control) -- [Monitoring](#monitoring) -- [Troubleshooting](#troubleshooting) -- [Advanced Usage](#advanced-usage) - -## Overview - -The OCI registry service provides artifact storage and distribution for: - -- **Extension Packages**: Providers, taskservs, clusters -- **KCL Schemas**: Configuration schemas and modules -- **Platform Images**: Orchestrator, control-center, services -- **Test Artifacts**: Development and testing images - -### Features - -- **Multi-Registry Support**: Zot, Harbor, Distribution -- **Namespace Organization**: Logical separation of artifacts -- **Access Control**: RBAC, policies, authentication -- **Monitoring**: Prometheus metrics, health checks -- **Garbage Collection**: Automatic cleanup of unused artifacts -- **High Availability**: Optional HA configurations -- **TLS/SSL**: Secure communication -- **UI Interface**: Web-based management (Zot, Harbor) - -## Registry Types - -### Zot (Recommended for Development) - -**Lightweight, fast, OCI-native registry with search and UI.** - -**Pros:** - -- Fast startup and low resource usage -- Built-in UI and search -- Prometheus metrics -- Automatic garbage collection -- Good for development and small deployments - -**Cons:** - -- Less mature than Distribution -- Fewer enterprise features than Harbor - -**Use Cases:** - -- Development environments -- CI/CD pipelines -- Small to medium deployments -- Quick prototyping - -### Harbor (Recommended for Production) - -**Full-featured enterprise registry with replication, scanning, and RBAC.** - -**Pros:** - -- Enterprise-grade features -- Vulnerability scanning (Trivy) -- Replication and mirroring -- Advanced RBAC -- Webhooks and notifications -- Mature and battle-tested - -**Cons:** - -- Higher resource requirements -- More complex setup -- Heavier than Zot/Distribution - -**Use Cases:** - -- Production deployments -- Multi-tenant environments -- Security-critical applications -- Large-scale deployments - -### Distribution (OCI Reference) - -**Official OCI registry reference implementation.** - -**Pros:** - -- OCI standard compliance -- Lightweight and simple -- Well-documented -- Industry standard - -**Cons:** - -- No built-in UI -- No search functionality -- Manual garbage collection -- Basic feature set - -**Use Cases:** - -- OCI standard compliance required -- Minimal registry needs -- Custom integrations -- Educational purposes - -## Quick Start - -### Start Zot Registry (Default) - -```bash -# Start Zot in background -cd provisioning/platform/oci-registry/zot -docker-compose up -d - -# Initialize with namespaces and policies -nu ../scripts/init-registry.nu --registry-type zot - -# Check health -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry health" - -# Access UI -open http://localhost:5000 -```text - -### Start Harbor Registry - -```bash -# Start Harbor -cd provisioning/platform/oci-registry/harbor -docker-compose up -d - -# Wait for services to be ready (takes ~2 minutes) -sleep 120 - -# Initialize -nu ../scripts/init-registry.nu --registry-type harbor --admin-password Harbor12345 - -# Access UI -open http://localhost -# Login: admin / Harbor12345 -```text - -### Start Distribution Registry - -```bash -# Start Distribution with UI -cd 
provisioning/platform/oci-registry/distribution -docker-compose up -d - -# Initialize -nu ../scripts/init-registry.nu --registry-type distribution - -# Access UI (if included) -open http://localhost:8080 -```text - -## Installation - -### Prerequisites - -- **Docker** (20.10+) -- **Docker Compose** (2.0+) -- **Nushell** (0.107+) - -### Setup - -```bash -# Clone configurations (already included) -cd provisioning/platform/oci-registry - -# Choose registry type -REGISTRY_TYPE="zot" # or "harbor" or "distribution" - -# Generate TLS certificates (optional, for HTTPS) -./scripts/generate-certs.nu - -# Start registry -cd $REGISTRY_TYPE -docker-compose up -d - -# Initialize -nu ../scripts/init-registry.nu --registry-type $REGISTRY_TYPE - -# Verify -docker-compose ps -```text - -## Configuration - -### Zot Configuration - -**File**: `zot/config.json` - -Key settings: - -```json -{ - "storage": { - "rootDirectory": "/var/lib/registry", - "dedupe": true, - "gc": true, - "gcInterval": "24h" - }, - "http": { - "address": "0.0.0.0", - "port": "5000" - }, - "extensions": { - "search": {"enable": true}, - "metrics": {"enable": true}, - "ui": {"enable": true} - }, - "accessControl": { - "repositories": { - "provisioning-extensions/**": { - "policies": [ - { - "users": ["provisioning"], - "actions": ["read", "create", "update", "delete"] - } - ] - } - } - } -} -```text - -### Harbor Configuration - -**File**: `harbor/harbor.yml` - -Key settings: - -```yaml -hostname: harbor.provisioning.local -harbor_admin_password: Harbor12345 - -database: - password: root123 - -trivy: - ignore_unfixed: false - skip_update: false - -log: - level: info -```text - -### Distribution Configuration - -**File**: `distribution/config.yml` - -Key settings: - -```yaml -storage: - filesystem: - rootdirectory: /var/lib/registry - delete: - enabled: true - -http: - addr: :5000 - tls: - certificate: /etc/docker/registry/certs/cert.pem - key: /etc/docker/registry/certs/key.pem - -auth: - htpasswd: - realm: Registry - path: /etc/docker/registry/htpasswd -```text - -## Management - -### Using Nushell Commands - -```bash -# Start registry -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry start --type zot" - -# Stop registry -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry stop --type zot" - -# Check status -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry status --type zot" - -# View logs -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry logs --type zot --follow" - -# Health check -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry health --type zot" - -# Initialize -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry init --type zot" - -# List namespaces -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry namespaces" -```text - -### Using Docker Compose - -```bash -# Start -cd provisioning/platform/oci-registry/zot -docker-compose up -d - -# Stop -docker-compose down - -# View logs -docker-compose logs -f - -# Restart -docker-compose restart - -# Remove (including volumes) -docker-compose down -v -```text - -## Namespaces - -### Default Namespaces - -| Namespace | Description | Public | Retention | -| ----------- | ------------- | -------- | ----------- | -| `provisioning-extensions` | Extension packages | No | 10 tags, 90 days | -| `provisioning-kcl` | KCL schemas | No | 20 tags, 180 days | -| `provisioning-platform` | Platform images | 
No | 5 tags, 30 days | -| `provisioning-test` | Test artifacts | Yes | 3 tags, 7 days | - -### Manage Namespaces - -```bash -# Setup all namespaces -nu scripts/setup-namespaces.nu --registry-type zot - -# List namespaces -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry namespaces" - -# Create namespace -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; \ - oci-registry namespace create my-namespace --type zot" - -# Get namespace info -nu scripts/setup-namespaces.nu namespace info provisioning-extensions -```text - -## Access Control - -### Policies - -Default access policies: - -**provisioning-extensions:** - -- Authenticated: Read, Write, Delete -- Anonymous: None - -**provisioning-kcl:** - -- Authenticated: Read, Write -- Anonymous: None - -**provisioning-platform:** - -- Authenticated: Read only (except admin) -- Anonymous: None - -**provisioning-test:** - -- Authenticated: Read, Write, Delete -- Anonymous: Read only - -### Configure Policies - -```bash -# Apply all policies -nu scripts/configure-policies.nu --registry-type zot - -# Show policy for namespace -nu scripts/configure-policies.nu policy show provisioning-extensions - -# List all policies -nu scripts/configure-policies.nu policy list -```text - -### Authentication - -**Zot/Distribution (htpasswd):** - -```bash -# Create user -htpasswd -Bc htpasswd provisioning - -# Login -docker login localhost:5000 -```text - -**Harbor (Database):** - -```bash -# Login via UI or CLI -docker login localhost -# Username: admin -# Password: Harbor12345 - -# Create users via Harbor UI -# Admin → Users → New User -```text - -## Monitoring - -### Health Checks - -```bash -# Full health check -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; \ - oci-registry health --type zot" - -# API check -curl http://localhost:5000/v2/ - -# Catalog check -curl http://localhost:5000/v2/_catalog -```text - -### Metrics - -**Zot:** - -```bash -# Prometheus metrics -curl http://localhost:5000/metrics - -# Visualize with Prometheus -# Add to prometheus.yml: -# - targets: ['localhost:5000'] -```text - -**Distribution:** - -```bash -# Metrics on debug port -curl http://localhost:5001/metrics -```text - -**Harbor:** - -```bash -# Metrics endpoint -curl http://localhost:9090/metrics - -# View in Harbor UI -# Admin → System Settings → Metrics -```text - -### Logs - -```bash -# Zot logs -docker-compose logs -f zot - -# Harbor logs -docker-compose logs -f core registry nginx - -# Distribution logs -docker-compose logs -f registry - -# Nushell command -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; \ - oci-registry logs --type zot --follow --tail 100" -```text - -## Troubleshooting - -### Registry Not Starting - -```bash -# Check Docker daemon -docker ps - -# Check ports -lsof -i :5000 - -# View logs -docker-compose logs - -# Rebuild -docker-compose down -v -docker-compose up -d --build -```text - -### Cannot Push Images - -```bash -# Check authentication -docker login localhost:5000 - -# Check permissions -# Ensure user has write access to namespace - -# Check storage -df -h # Ensure disk space available - -# Check registry health -curl http://localhost:5000/v2/ -```text - -### Slow Performance - -```bash -# Enable deduplication (Zot) -# In config.json: "dedupe": true - -# Increase resources (Docker) -# Docker → Preferences → Resources - -# Run garbage collection -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry/service; \ - run-oci-registry-gc --type zot" -```text - -### 
TLS/Certificate Issues - -```bash -# Regenerate certificates -./scripts/generate-certs.nu - -# Trust certificate -# macOS: Add to Keychain Access -# Linux: Copy to /usr/local/share/ca-certificates/ - -# Allow plain-HTTP access (testing only): docker login has no --insecure flag; -# add the registry to "insecure-registries" in /etc/docker/daemon.json instead: -# { "insecure-registries": ["localhost:5000"] } -sudo systemctl restart docker -``` - -## Advanced Usage - -### High Availability (Harbor) - -```yaml -# harbor/docker-compose.yml -# Add multiple registry instances -registry-1: - image: goharbor/registry-photon:v2.9.0 - ... - -registry-2: - image: goharbor/registry-photon:v2.9.0 - ... - -# Add load balancer -nginx: - ... - depends_on: - - registry-1 - - registry-2 -``` - -### S3 Backend (Distribution) - -```yaml -# distribution/config.yml -storage: - s3: - accesskey: AKIAIOSFODNN7EXAMPLE - secretkey: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY - region: us-west-1 - bucket: my-registry-bucket - rootdirectory: /registry -``` - -### Replication (Harbor) - -```bash -# Harbor UI → Replications → New Replication Rule -# Source: Local registry -# Destination: Remote registry -# Trigger: Manual/Scheduled/Event-based -``` - -### Webhooks - -**Zot** (via config.json): - -```json -{ - "http": { - "notifications": { - "endpoints": [ - { - "name": "orchestrator", - "url": "http://orchestrator:8080/registry/events", - "headers": { - "Authorization": ["Bearer token"] - } - } - ] - } - } -} -``` - -**Harbor** (via scripts): - -```bash -nu scripts/configure-policies.nu --registry-type harbor -# Webhooks configured automatically -``` - -### Garbage Collection - -**Zot** (automatic): - -```json -{ - "storage": { - "gc": true, - "gcInterval": "24h" - } -} -``` - -**Distribution** (manual): - -```bash -# Run GC -docker-compose exec registry \ - registry garbage-collect /etc/docker/registry/config.yml - -# Or via Nushell -nu -c "use provisioning/core/nulib/lib_provisioning/oci_registry/service; \ - run-oci-registry-gc --type distribution" -``` - -**Harbor** (UI): - -```plaintext -Admin → System Settings → Garbage Collection → Run GC -``` - -## API Reference - -### OCI API (All Registries) - -```bash -# List repositories -curl http://localhost:5000/v2/_catalog - -# List tags -curl http://localhost:5000/v2/{repository}/tags/list - -# Get manifest -curl http://localhost:5000/v2/{repository}/manifests/{tag} - -# Delete image (requires delete enabled) -curl -X DELETE http://localhost:5000/v2/{repository}/manifests/{digest} -``` - -### Harbor API - -```bash -# List projects -curl -u admin:Harbor12345 \ - http://localhost/api/v2.0/projects - -# Create project -curl -X POST -u admin:Harbor12345 \ - -H "Content-Type: application/json" \ - -d '{"project_name":"test","metadata":{"public":"false"}}' \ - http://localhost/api/v2.0/projects - -# Scan image -curl -X POST -u admin:Harbor12345 \ - http://localhost/api/v2.0/projects/{project}/repositories/{repo}/artifacts/{tag}/scan -``` - -## Performance Tuning - -### Zot - -```json -{ - "storage": { - "dedupe": true, // Enable deduplication - "gc": true, // Enable GC - "gcInterval": "12h" // More frequent GC - }, - "http": { - "http2": true // Enable HTTP/2 - } -} -``` - -### Distribution - -```yaml -storage: - cache: - blobdescriptor: redis # Use Redis for caching - -redis: - addr: redis:6379 - pool: - maxidle: 16 - maxactive: 64 -``` - -### Harbor - -```yaml -jobservice: - max_job_workers: 20 # Increase concurrent jobs - -database: - max_idle_conns: 100 - max_open_conns: 900 # Increase DB connections -``` - -## Security Best Practices - -1. 
**Use TLS/SSL** for all connections -2. **Strong passwords** for admin accounts -3. **Regular updates** of registry software -4. **Scan images** for vulnerabilities (Harbor/Trivy) -5. **Least privilege** access control -6. **Network isolation** (Docker networks) -7. **Regular backups** of registry data -8. **Audit logging** enabled -9. **Rate limiting** for API access -10. **Secrets management** (not in configs) - -## Backup and Restore - -### Backup - -```bash -# Backup Zot -docker-compose stop zot -tar czf zot-backup-$(date +%Y%m%d).tar.gz \ - -C /var/lib/docker/volumes zot-data - -# Backup Harbor -docker-compose stop -tar czf harbor-backup-$(date +%Y%m%d).tar.gz \ - -C /var/lib/docker/volumes \ - harbor-registry harbor-database - -# Backup Distribution -docker-compose stop registry -tar czf dist-backup-$(date +%Y%m%d).tar.gz \ - -C /var/lib/docker/volumes registry-data -```text - -### Restore - -```bash -# Restore (example for Zot) -docker-compose down -v -tar xzf zot-backup-20250106.tar.gz -C /var/lib/docker/volumes -docker-compose up -d -```text - -## Migration Between Registries - -```bash -# Example: Zot → Harbor - -# 1. Export from Zot -for repo in $(curl http://localhost:5000/v2/_catalog | jq -r '.repositories[]'); do - for tag in $(curl http://localhost:5000/v2/$repo/tags/list | jq -r '.tags[]'); do - docker pull localhost:5000/$repo:$tag - docker tag localhost:5000/$repo:$tag harbor.local/$repo:$tag - docker push harbor.local/$repo:$tag - done -done - -# 2. Or use skopeo -skopeo sync --src docker --dest docker \ - localhost:5000/provisioning-extensions \ - harbor.local/provisioning-extensions -```text - -## References - -- **Zot**: -- **Harbor**: -- **Distribution**: -- **OCI Spec**: - -## Support - -For issues or questions: - -1. Check logs: `docker-compose logs` -2. Review this documentation -3. Check GitHub issues for respective registry -4. 
Contact provisioning team - ---- - -**Version**: 1.0.0 -**Last Updated**: 2025-01-06 -**Maintainer**: Provisioning Platform Team +# OCI Registry Service\n\nComprehensive OCI (Open Container Initiative) registry deployment and management for the provisioning system.\nSupports multiple registry implementations: **Zot** (lightweight), **Harbor** (full-featured),\nand **Distribution** (OCI reference implementation).\n\n## Table of Contents\n\n- [Overview](#overview)\n- [Registry Types](#registry-types)\n- [Quick Start](#quick-start)\n- [Installation](#installation)\n- [Configuration](#configuration)\n- [Management](#management)\n- [Namespaces](#namespaces)\n- [Access Control](#access-control)\n- [Monitoring](#monitoring)\n- [Troubleshooting](#troubleshooting)\n- [Advanced Usage](#advanced-usage)\n\n## Overview\n\nThe OCI registry service provides artifact storage and distribution for:\n\n- **Extension Packages**: Providers, taskservs, clusters\n- **KCL Schemas**: Configuration schemas and modules\n- **Platform Images**: Orchestrator, control-center, services\n- **Test Artifacts**: Development and testing images\n\n### Features\n\n- **Multi-Registry Support**: Zot, Harbor, Distribution\n- **Namespace Organization**: Logical separation of artifacts\n- **Access Control**: RBAC, policies, authentication\n- **Monitoring**: Prometheus metrics, health checks\n- **Garbage Collection**: Automatic cleanup of unused artifacts\n- **High Availability**: Optional HA configurations\n- **TLS/SSL**: Secure communication\n- **UI Interface**: Web-based management (Zot, Harbor)\n\n## Registry Types\n\n### Zot (Recommended for Development)\n\n**Lightweight, fast, OCI-native registry with search and UI.**\n\n**Pros:**\n\n- Fast startup and low resource usage\n- Built-in UI and search\n- Prometheus metrics\n- Automatic garbage collection\n- Good for development and small deployments\n\n**Cons:**\n\n- Less mature than Distribution\n- Fewer enterprise features than Harbor\n\n**Use Cases:**\n\n- Development environments\n- CI/CD pipelines\n- Small to medium deployments\n- Quick prototyping\n\n### Harbor (Recommended for Production)\n\n**Full-featured enterprise registry with replication, scanning, and RBAC.**\n\n**Pros:**\n\n- Enterprise-grade features\n- Vulnerability scanning (Trivy)\n- Replication and mirroring\n- Advanced RBAC\n- Webhooks and notifications\n- Mature and battle-tested\n\n**Cons:**\n\n- Higher resource requirements\n- More complex setup\n- Heavier than Zot/Distribution\n\n**Use Cases:**\n\n- Production deployments\n- Multi-tenant environments\n- Security-critical applications\n- Large-scale deployments\n\n### Distribution (OCI Reference)\n\n**Official OCI registry reference implementation.**\n\n**Pros:**\n\n- OCI standard compliance\n- Lightweight and simple\n- Well-documented\n- Industry standard\n\n**Cons:**\n\n- No built-in UI\n- No search functionality\n- Manual garbage collection\n- Basic feature set\n\n**Use Cases:**\n\n- OCI standard compliance required\n- Minimal registry needs\n- Custom integrations\n- Educational purposes\n\n## Quick Start\n\n### Start Zot Registry (Default)\n\n```\n# Start Zot in background\ncd provisioning/platform/oci-registry/zot\ndocker-compose up -d\n\n# Initialize with namespaces and policies\nnu ../scripts/init-registry.nu --registry-type zot\n\n# Check health\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry health"\n\n# Access UI\nopen http://localhost:5000\n```\n\n### Start Harbor Registry\n\n```\n# Start Harbor\ncd 
provisioning/platform/oci-registry/harbor\ndocker-compose up -d\n\n# Wait for services to be ready (takes ~2 minutes)\nsleep 120\n\n# Initialize\nnu ../scripts/init-registry.nu --registry-type harbor --admin-password Harbor12345\n\n# Access UI\nopen http://localhost\n# Login: admin / Harbor12345\n```\n\n### Start Distribution Registry\n\n```\n# Start Distribution with UI\ncd provisioning/platform/oci-registry/distribution\ndocker-compose up -d\n\n# Initialize\nnu ../scripts/init-registry.nu --registry-type distribution\n\n# Access UI (if included)\nopen http://localhost:8080\n```\n\n## Installation\n\n### Prerequisites\n\n- **Docker** (20.10+)\n- **Docker Compose** (2.0+)\n- **Nushell** (0.107+)\n\n### Setup\n\n```\n# Clone configurations (already included)\ncd provisioning/platform/oci-registry\n\n# Choose registry type\nREGISTRY_TYPE="zot" # or "harbor" or "distribution"\n\n# Generate TLS certificates (optional, for HTTPS)\n./scripts/generate-certs.nu\n\n# Start registry\ncd $REGISTRY_TYPE\ndocker-compose up -d\n\n# Initialize\nnu ../scripts/init-registry.nu --registry-type $REGISTRY_TYPE\n\n# Verify\ndocker-compose ps\n```\n\n## Configuration\n\n### Zot Configuration\n\n**File**: `zot/config.json`\n\nKey settings:\n\n```\n{\n "storage": {\n "rootDirectory": "/var/lib/registry",\n "dedupe": true,\n "gc": true,\n "gcInterval": "24h"\n },\n "http": {\n "address": "0.0.0.0",\n "port": "5000"\n },\n "extensions": {\n "search": {"enable": true},\n "metrics": {"enable": true},\n "ui": {"enable": true}\n },\n "accessControl": {\n "repositories": {\n "provisioning-extensions/**": {\n "policies": [\n {\n "users": ["provisioning"],\n "actions": ["read", "create", "update", "delete"]\n }\n ]\n }\n }\n }\n}\n```\n\n### Harbor Configuration\n\n**File**: `harbor/harbor.yml`\n\nKey settings:\n\n```\nhostname: harbor.provisioning.local\nharbor_admin_password: Harbor12345\n\ndatabase:\n password: root123\n\ntrivy:\n ignore_unfixed: false\n skip_update: false\n\nlog:\n level: info\n```\n\n### Distribution Configuration\n\n**File**: `distribution/config.yml`\n\nKey settings:\n\n```\nstorage:\n filesystem:\n rootdirectory: /var/lib/registry\n delete:\n enabled: true\n\nhttp:\n addr: :5000\n tls:\n certificate: /etc/docker/registry/certs/cert.pem\n key: /etc/docker/registry/certs/key.pem\n\nauth:\n htpasswd:\n realm: Registry\n path: /etc/docker/registry/htpasswd\n```\n\n## Management\n\n### Using Nushell Commands\n\n```\n# Start registry\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry start --type zot"\n\n# Stop registry\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry stop --type zot"\n\n# Check status\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry status --type zot"\n\n# View logs\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry logs --type zot --follow"\n\n# Health check\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry health --type zot"\n\n# Initialize\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry init --type zot"\n\n# List namespaces\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry namespaces"\n```\n\n### Using Docker Compose\n\n```\n# Start\ncd provisioning/platform/oci-registry/zot\ndocker-compose up -d\n\n# Stop\ndocker-compose down\n\n# View logs\ndocker-compose logs -f\n\n# Restart\ndocker-compose restart\n\n# Remove (including volumes)\ndocker-compose down -v\n```\n\n## 
Namespaces\n\n### Default Namespaces\n\n| Namespace | Description | Public | Retention |\n| ----------- | ------------- | -------- | ----------- |\n| `provisioning-extensions` | Extension packages | No | 10 tags, 90 days |\n| `provisioning-kcl` | KCL schemas | No | 20 tags, 180 days |\n| `provisioning-platform` | Platform images | No | 5 tags, 30 days |\n| `provisioning-test` | Test artifacts | Yes | 3 tags, 7 days |\n\n### Manage Namespaces\n\n```\n# Setup all namespaces\nnu scripts/setup-namespaces.nu --registry-type zot\n\n# List namespaces\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; oci-registry namespaces"\n\n# Create namespace\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; \\n oci-registry namespace create my-namespace --type zot"\n\n# Get namespace info\nnu scripts/setup-namespaces.nu namespace info provisioning-extensions\n```\n\n## Access Control\n\n### Policies\n\nDefault access policies:\n\n**provisioning-extensions:**\n\n- Authenticated: Read, Write, Delete\n- Anonymous: None\n\n**provisioning-kcl:**\n\n- Authenticated: Read, Write\n- Anonymous: None\n\n**provisioning-platform:**\n\n- Authenticated: Read only (except admin)\n- Anonymous: None\n\n**provisioning-test:**\n\n- Authenticated: Read, Write, Delete\n- Anonymous: Read only\n\n### Configure Policies\n\n```\n# Apply all policies\nnu scripts/configure-policies.nu --registry-type zot\n\n# Show policy for namespace\nnu scripts/configure-policies.nu policy show provisioning-extensions\n\n# List all policies\nnu scripts/configure-policies.nu policy list\n```\n\n### Authentication\n\n**Zot/Distribution (htpasswd):**\n\n```\n# Create user\nhtpasswd -Bc htpasswd provisioning\n\n# Login\ndocker login localhost:5000\n```\n\n**Harbor (Database):**\n\n```\n# Login via UI or CLI\ndocker login localhost\n# Username: admin\n# Password: Harbor12345\n\n# Create users via Harbor UI\n# Admin → Users → New User\n```\n\n## Monitoring\n\n### Health Checks\n\n```\n# Full health check\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; \\n oci-registry health --type zot"\n\n# API check\ncurl http://localhost:5000/v2/\n\n# Catalog check\ncurl http://localhost:5000/v2/_catalog\n```\n\n### Metrics\n\n**Zot:**\n\n```\n# Prometheus metrics\ncurl http://localhost:5000/metrics\n\n# Visualize with Prometheus\n# Add to prometheus.yml:\n# - targets: ['localhost:5000']\n```\n\n**Distribution:**\n\n```\n# Metrics on debug port\ncurl http://localhost:5001/metrics\n```\n\n**Harbor:**\n\n```\n# Metrics endpoint\ncurl http://localhost:9090/metrics\n\n# View in Harbor UI\n# Admin → System Settings → Metrics\n```\n\n### Logs\n\n```\n# Zot logs\ndocker-compose logs -f zot\n\n# Harbor logs\ndocker-compose logs -f core registry nginx\n\n# Distribution logs\ndocker-compose logs -f registry\n\n# Nushell command\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry; \\n oci-registry logs --type zot --follow --tail 100"\n```\n\n## Troubleshooting\n\n### Registry Not Starting\n\n```\n# Check Docker daemon\ndocker ps\n\n# Check ports\nlsof -i :5000\n\n# View logs\ndocker-compose logs\n\n# Rebuild\ndocker-compose down -v\ndocker-compose up -d --build\n```\n\n### Cannot Push Images\n\n```\n# Check authentication\ndocker login localhost:5000\n\n# Check permissions\n# Ensure user has write access to namespace\n\n# Check storage\ndf -h # Ensure disk space available\n\n# Check registry health\ncurl http://localhost:5000/v2/\n```\n\n### Slow Performance\n\n```\n# Enable deduplication (Zot)\n# In config.json: 
"dedupe": true\n\n# Increase resources (Docker)\n# Docker → Preferences → Resources\n\n# Run garbage collection\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry/service; \\n run-oci-registry-gc --type zot"\n```\n\n### TLS/Certificate Issues\n\n```\n# Regenerate certificates\n./scripts/generate-certs.nu\n\n# Trust certificate\n# macOS: Add to Keychain Access\n# Linux: Copy to /usr/local/share/ca-certificates/\n\n# Skip TLS verification (testing only)\ndocker login --insecure localhost:5000\n```\n\n## Advanced Usage\n\n### High Availability (Harbor)\n\n```\n# harbor/docker-compose.yml\n# Add multiple registry instances\nregistry-1:\n image: goharbor/registry-photon:v2.9.0\n ...\n\nregistry-2:\n image: goharbor/registry-photon:v2.9.0\n ...\n\n# Add load balancer\nnginx:\n ...\n depends_on:\n - registry-1\n - registry-2\n```\n\n### S3 Backend (Distribution)\n\n```\n# distribution/config.yml\nstorage:\n s3:\n accesskey: AKIAIOSFODNN7EXAMPLE\n secretkey: wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\n region: us-west-1\n bucket: my-registry-bucket\n rootdirectory: /registry\n```\n\n### Replication (Harbor)\n\n```\n# Harbor UI → Replications → New Replication Rule\n# Source: Local registry\n# Destination: Remote registry\n# Trigger: Manual/Scheduled/Event-based\n```\n\n### Webhooks\n\n**Zot** (via config.json):\n\n```\n{\n "http": {\n "notifications": {\n "endpoints": [\n {\n "name": "orchestrator",\n "url": "http://orchestrator:8080/registry/events",\n "headers": {\n "Authorization": ["Bearer token"]\n }\n }\n ]\n }\n }\n}\n```\n\n**Harbor** (via scripts):\n\n```\nnu scripts/configure-policies.nu --registry-type harbor\n# Webhooks configured automatically\n```\n\n### Garbage Collection\n\n**Zot** (automatic):\n\n```\n{\n "storage": {\n "gc": true,\n "gcInterval": "24h"\n }\n}\n```\n\n**Distribution** (manual):\n\n```\n# Run GC\ndocker-compose exec registry \\n registry garbage-collect /etc/docker/registry/config.yml\n\n# Or via Nushell\nnu -c "use provisioning/core/nulib/lib_provisioning/oci_registry/service; \\n run-oci-registry-gc --type distribution"\n```\n\n**Harbor** (UI):\n\n```\nAdmin → System Settings → Garbage Collection → Run GC\n```\n\n## API Reference\n\n### OCI API (All Registries)\n\n```\n# List repositories\ncurl http://localhost:5000/v2/_catalog\n\n# List tags\ncurl http://localhost:5000/v2/{repository}/tags/list\n\n# Get manifest\ncurl http://localhost:5000/v2/{repository}/manifests/{tag}\n\n# Delete image (requires delete enabled)\ncurl -X DELETE http://localhost:5000/v2/{repository}/manifests/{digest}\n```\n\n### Harbor API\n\n```\n# List projects\ncurl -u admin:Harbor12345 \\n http://localhost/api/v2.0/projects\n\n# Create project\ncurl -X POST -u admin:Harbor12345 \\n -H "Content-Type: application/json" \\n -d '{"project_name":"test","metadata":{"public":"false"}}' \\n http://localhost/api/v2.0/projects\n\n# Scan image\ncurl -X POST -u admin:Harbor12345 \\n http://localhost/api/v2.0/projects/{project}/repositories/{repo}/artifacts/{tag}/scan\n```\n\n## Performance Tuning\n\n### Zot\n\n```\n{\n "storage": {\n "dedupe": true, // Enable deduplication\n "gc": true, // Enable GC\n "gcInterval": "12h" // More frequent GC\n },\n "http": {\n "http2": true // Enable HTTP/2\n }\n}\n```\n\n### Distribution\n\n```\nstorage:\n cache:\n blobdescriptor: redis # Use Redis for caching\n\nredis:\n addr: redis:6379\n pool:\n maxidle: 16\n maxactive: 64\n```\n\n### Harbor\n\n```\njobservice:\n max_job_workers: 20 # Increase concurrent jobs\n\ndatabase:\n max_idle_conns: 100\n 
max_open_conns: 900 # Increase DB connections\n```\n\n## Security Best Practices\n\n1. **Use TLS/SSL** for all connections\n2. **Strong passwords** for admin accounts\n3. **Regular updates** of registry software\n4. **Scan images** for vulnerabilities (Harbor/Trivy)\n5. **Least privilege** access control\n6. **Network isolation** (Docker networks)\n7. **Regular backups** of registry data\n8. **Audit logging** enabled\n9. **Rate limiting** for API access\n10. **Secrets management** (not in configs)\n\n## Backup and Restore\n\n### Backup\n\n```\n# Backup Zot\ndocker-compose stop zot\ntar czf zot-backup-$(date +%Y%m%d).tar.gz \\n -C /var/lib/docker/volumes zot-data\n\n# Backup Harbor\ndocker-compose stop\ntar czf harbor-backup-$(date +%Y%m%d).tar.gz \\n -C /var/lib/docker/volumes \\n harbor-registry harbor-database\n\n# Backup Distribution\ndocker-compose stop registry\ntar czf dist-backup-$(date +%Y%m%d).tar.gz \\n -C /var/lib/docker/volumes registry-data\n```\n\n### Restore\n\n```\n# Restore (example for Zot)\ndocker-compose down -v\ntar xzf zot-backup-20250106.tar.gz -C /var/lib/docker/volumes\ndocker-compose up -d\n```\n\n## Migration Between Registries\n\n```\n# Example: Zot → Harbor\n\n# 1. Export from Zot\nfor repo in $(curl http://localhost:5000/v2/_catalog | jq -r '.repositories[]'); do\n for tag in $(curl http://localhost:5000/v2/$repo/tags/list | jq -r '.tags[]'); do\n docker pull localhost:5000/$repo:$tag\n docker tag localhost:5000/$repo:$tag harbor.local/$repo:$tag\n docker push harbor.local/$repo:$tag\n done\ndone\n\n# 2. Or use skopeo\nskopeo sync --src docker --dest docker \\n localhost:5000/provisioning-extensions \\n harbor.local/provisioning-extensions\n```\n\n## References\n\n- **Zot**: \n- **Harbor**: \n- **Distribution**: \n- **OCI Spec**: \n\n## Support\n\nFor issues or questions:\n\n1. Check logs: `docker-compose logs`\n2. Review this documentation\n3. Check GitHub issues for respective registry\n4. Contact provisioning team\n\n---\n\n**Version**: 1.0.0\n**Last Updated**: 2025-01-06\n**Maintainer**: Provisioning Platform Team \ No newline at end of file diff --git a/scripts/deploy-platform.nu b/scripts/deploy-platform.nu old mode 100755 new mode 100644 diff --git a/scripts/generate-infrastructure-configs.nu b/scripts/generate-infrastructure-configs.nu old mode 100755 new mode 100644 diff --git a/scripts/health-check.nu b/scripts/health-check.nu old mode 100755 new mode 100644 diff --git a/scripts/run-docker.nu b/scripts/run-docker.nu old mode 100755 new mode 100644 diff --git a/scripts/run-native.nu b/scripts/run-native.nu old mode 100755 new mode 100644 diff --git a/scripts/start-provisioning-daemon.nu b/scripts/start-provisioning-daemon.nu old mode 100755 new mode 100644 diff --git a/scripts/validate-configs.nu b/scripts/validate-configs.nu old mode 100755 new mode 100644 diff --git a/scripts/validate-infrastructure.nu b/scripts/validate-infrastructure.nu old mode 100755 new mode 100644