name: Health Check & Monitoring

on:
  schedule:
    - cron: '*/15 * * * *'  # Every 15 minutes
    - cron: '0 */6 * * *'   # Every 6 hours
  workflow_dispatch:
    inputs:
      target:
        description: 'Health check target'
        required: true
        default: 'kubernetes'
        type: choice
        options:
          - docker
          - kubernetes
          - both
      count:
        description: 'Number of checks to perform'
        required: false
        default: '1'
        type: string
      interval:
        description: 'Interval between checks (seconds)'
        required: false
        default: '30'
        type: string

concurrency:
  group: health-check-${{ github.event_name }}
  cancel-in-progress: false

jobs:
  health-check:
    name: Health Check - ${{ inputs.target || 'kubernetes' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
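
      # The health-check script is written in Nushell; build the nu binary from
      # crates.io with the Rust toolchain preinstalled on ubuntu-latest runners.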
      - name: Install Nushell
        run: |
          cargo install nu --locked
          nu --version

      - name: Install kubectl
        # Scheduled runs carry no inputs, so fall back to the kubernetes default.
        if: ${{ (inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both' }}
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'
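
      # Kubeconfig comes base64-encoded from repository secrets: prefer the CI
      # cluster and fall back to staging.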
      - name: Configure kubeconfig
        if: ${{ (inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both' }}
        continue-on-error: true
        run: |
          mkdir -p ~/.kube
          # Try the CI cluster first, fall back to staging
          if [ -n "${{ secrets.KUBE_CONFIG_CI }}" ]; then
            echo "${{ secrets.KUBE_CONFIG_CI }}" | base64 -d > ~/.kube/config
          elif [ -n "${{ secrets.KUBE_CONFIG_STAGING }}" ]; then
            echo "${{ secrets.KUBE_CONFIG_STAGING }}" | base64 -d > ~/.kube/config
          else
            echo "Error: no kubeconfig available"
            exit 1
          fi
          chmod 600 ~/.kube/config
          kubectl cluster-info

      - name: Create health check directory
        run: mkdir -p health-check-reports
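
      # Both check steps call provisioning/scripts/health-check.nu and tee their
      # output into health-check-reports/ for the artifact upload below.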
      - name: Run health check (Docker)
        if: ${{ inputs.target == 'docker' || inputs.target == 'both' }}
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target docker \
            --count ${{ inputs.count || '1' }} \
            --interval ${{ inputs.interval || '30' }} \
            2>&1 | tee ../health-check-reports/docker-health.log

      - name: Run health check (Kubernetes)
        if: ${{ (inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both' }}
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target kubernetes \
            --count ${{ inputs.count || '1' }} \
            --interval ${{ inputs.interval || '30' }} \
            2>&1 | tee ../health-check-reports/k8s-health.log
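
      # Diagnostics are collected with always() so the artifact still has
      # context even when the health checks above fail.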
      - name: Collect Kubernetes diagnostics
        if: ${{ always() && ((inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both') }}
        continue-on-error: true
        run: |
          echo "=== VAPORA Namespace ===" > health-check-reports/k8s-diagnostics.log
          kubectl get all -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Deployment Details ===" >> health-check-reports/k8s-diagnostics.log
          kubectl describe deployments -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Pod Events ===" >> health-check-reports/k8s-diagnostics.log
          kubectl get events -n vapora --sort-by='.lastTimestamp' >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Resource Usage ===" >> health-check-reports/k8s-diagnostics.log
          kubectl top pods -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1 || echo "metrics-server not available"
          cat health-check-reports/k8s-diagnostics.log

      - name: Collect Docker diagnostics
        if: ${{ always() && (inputs.target == 'docker' || inputs.target == 'both') }}
        continue-on-error: true
        run: |
          echo "=== Docker Services ===" > health-check-reports/docker-diagnostics.log
          docker ps -a >> health-check-reports/docker-diagnostics.log 2>&1 || echo "Docker daemon not accessible"
          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Networks ===" >> health-check-reports/docker-diagnostics.log
          docker network ls >> health-check-reports/docker-diagnostics.log 2>&1 || true
          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Volumes ===" >> health-check-reports/docker-diagnostics.log
          docker volume ls >> health-check-reports/docker-diagnostics.log 2>&1 || true
          cat health-check-reports/docker-diagnostics.log
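
      # Summarize the run into a Markdown report that ships alongside the raw logs.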
      - name: Generate health report
        if: always()
        run: |
          # Write the timestamp outside the heredoc; the quoted 'EOF' keeps the
          # shell from expanding backticks and $ inside the Markdown body.
          {
            echo "# VAPORA Health Check Report"
            echo ""
            echo "**Report Time**: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
          } > health-check-reports/HEALTH_REPORT.md
          cat >> health-check-reports/HEALTH_REPORT.md << 'EOF'
          **Triggered By**: ${{ github.event_name }}
          **Workflow Run**: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

          ## Summary

          Health check executed for target: **${{ inputs.target || 'kubernetes' }}**

          - Check Count: ${{ inputs.count || '1' }}
          - Check Interval: ${{ inputs.interval || '30' }}s

          ## Results

          ### Docker Status
          See `docker-health.log` and `docker-diagnostics.log` for details.

          ### Kubernetes Status
          See `k8s-health.log` and `k8s-diagnostics.log` for details.

          ## Files in This Report

          - `HEALTH_REPORT.md` - This report
          - `docker-health.log` - Docker health check output
          - `docker-diagnostics.log` - Docker system diagnostics
          - `k8s-health.log` - Kubernetes health check output
          - `k8s-diagnostics.log` - Kubernetes system diagnostics
          EOF
          cat health-check-reports/HEALTH_REPORT.md

      - name: Upload health check reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: health-check-${{ inputs.target || 'kubernetes' }}-${{ github.run_id }}
          path: health-check-reports/
          retention-days: 30
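
      # NOTE: assumes health-check.nu prints "✅ All services healthy" on success;
      # adjust the grep pattern if the script's output format changes.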
      - name: Check health check success
        continue-on-error: true
        run: |
          if grep -q "✅ All services healthy" health-check-reports/docker-health.log 2>/dev/null || \
             grep -q "✅ All services healthy" health-check-reports/k8s-health.log 2>/dev/null; then
            echo "✅ Health check passed"
            exit 0
          else
            echo "⚠️ Health check warnings detected"
            exit 0  # Don't fail, just report
          fi
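
      # Scheduled runs against the Kubernetes target open a tracking issue on
      # failure; manual (workflow_dispatch) runs only report via logs and Slack.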
      - name: Create issue on health failure
        if: |
          failure() &&
          github.event_name == 'schedule' &&
          (contains(fromJson('["kubernetes", "both"]'), inputs.target) || inputs.target == null)
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `🚨 Health Check Failed - ${new Date().toISOString()}`,
              body: `Health check failed at ${new Date().toISOString()}\n\nSee workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`,
              labels: ['monitoring', 'health-check', 'critical']
            });
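
      # Slack notifications are best-effort: they require the SLACK_WEBHOOK
      # (success) and SLACK_WEBHOOK_ALERTS (failure) secrets to be configured.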
      - name: Notify Slack - Success
        if: success()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          fields: repo,message
          text: |
            ✅ VAPORA Health Check Passed
            Target: ${{ inputs.target || 'kubernetes' }}
            Checks: ${{ inputs.count || '1' }}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}

      - name: Notify Slack - Failure
        if: failure()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          fields: repo,message,commit
          text: |
            ❌ VAPORA Health Check Failed
            Target: ${{ inputs.target || 'kubernetes' }}
            Check workflow logs for details
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_ALERTS }}