---
# VAPORA health-check workflow: runs Nushell-based health probes against
# Docker and/or Kubernetes, collects diagnostics, uploads a report artifact,
# and alerts (GitHub issue + Slack) on failure.
name: Health Check & Monitoring

on:
  schedule:
    - cron: '*/15 * * * *'  # Every 15 minutes
    - cron: '0 */6 * * *'   # Every 6 hours
  workflow_dispatch:
    inputs:
      target:
        description: 'Health check target'
        required: true
        default: 'kubernetes'
        type: choice
        options:
          - docker
          - kubernetes
          - both
      count:
        description: 'Number of checks to perform'
        required: false
        default: '1'
        type: string
      interval:
        description: 'Interval between checks (seconds)'
        required: false
        default: '30'
        type: string

concurrency:
  group: health-check-${{ github.event_name }}
  cancel-in-progress: false

env:
  # Scheduled runs carry no inputs, so every `inputs.*` expression is empty
  # there. Resolve the effective values once; step-level `if:` and `with:`
  # can read the env context, which keeps scheduled runs working.
  TARGET: ${{ inputs.target || 'kubernetes' }}
  CHECK_COUNT: ${{ inputs.count || '1' }}
  CHECK_INTERVAL: ${{ inputs.interval || '30' }}

jobs:
  health-check:
    # NOTE: the env context is not available in job names, so the default
    # is repeated here instead of using env.TARGET.
    name: Health Check - ${{ inputs.target || 'kubernetes' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Nushell
        # NOTE(review): building nu from source on every run is slow;
        # consider a prebuilt-binary action or cargo cache — TODO confirm.
        run: |
          cargo install nu --locked
          nu --version

      - name: Install kubectl
        if: env.TARGET == 'kubernetes' || env.TARGET == 'both'
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'

      - name: Configure kubeconfig
        if: env.TARGET == 'kubernetes' || env.TARGET == 'both'
        continue-on-error: true
        env:
          # Pass secrets through env vars so their contents are never spliced
          # into the generated shell script (avoids quoting/injection issues).
          KUBE_CONFIG_CI: ${{ secrets.KUBE_CONFIG_CI }}
          KUBE_CONFIG_STAGING: ${{ secrets.KUBE_CONFIG_STAGING }}
        run: |
          mkdir -p ~/.kube
          # Try to use CI cluster first, fall back to staging
          if [ -n "$KUBE_CONFIG_CI" ]; then
            echo "$KUBE_CONFIG_CI" | base64 -d > ~/.kube/config
          elif [ -n "$KUBE_CONFIG_STAGING" ]; then
            echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config
          else
            echo "Warning: No kubeconfig available"
            exit 1
          fi
          chmod 600 ~/.kube/config
          kubectl cluster-info

      - name: Create health check directory
        run: mkdir -p health-check-reports

      - name: Run health check (Docker)
        if: env.TARGET == 'docker' || env.TARGET == 'both'
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target docker \
            --count "$CHECK_COUNT" \
            --interval "$CHECK_INTERVAL" \
            2>&1 | tee ../health-check-reports/docker-health.log

      - name: Run health check (Kubernetes)
        if: env.TARGET == 'kubernetes' || env.TARGET == 'both'
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target kubernetes \
            --count "$CHECK_COUNT" \
            --interval "$CHECK_INTERVAL" \
            2>&1 | tee ../health-check-reports/k8s-health.log

      - name: Collect Kubernetes diagnostics
        if: always() && (env.TARGET == 'kubernetes' || env.TARGET == 'both')
        continue-on-error: true
        run: |
          echo "=== VAPORA Namespace ===" >> health-check-reports/k8s-diagnostics.log
          kubectl get all -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Deployment Details ===" >> health-check-reports/k8s-diagnostics.log
          kubectl describe deployments -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Pod Events ===" >> health-check-reports/k8s-diagnostics.log
          kubectl get events -n vapora --sort-by='.lastTimestamp' >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Resource Usage ===" >> health-check-reports/k8s-diagnostics.log
          kubectl top pods -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1 || echo "metrics-server not available"
          cat health-check-reports/k8s-diagnostics.log

      - name: Collect Docker diagnostics
        if: always() && (env.TARGET == 'docker' || env.TARGET == 'both')
        continue-on-error: true
        run: |
          echo "=== Docker Services ===" > health-check-reports/docker-diagnostics.log
          docker ps -a >> health-check-reports/docker-diagnostics.log 2>&1 || echo "Docker daemon not accessible"
          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Networks ===" >> health-check-reports/docker-diagnostics.log
          docker network ls >> health-check-reports/docker-diagnostics.log 2>&1 || true
          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Volumes ===" >> health-check-reports/docker-diagnostics.log
          docker volume ls >> health-check-reports/docker-diagnostics.log 2>&1 || true
          cat health-check-reports/docker-diagnostics.log

      - name: Generate health report
        if: always()
        # Heredoc delimiter is intentionally UNQUOTED so $(date ...) and
        # ${TARGET}/${CHECK_*} expand; a quoted 'EOF' would emit them verbatim.
        run: |
          cat > health-check-reports/HEALTH_REPORT.md << EOF
          # VAPORA Health Check Report

          **Report Time**: $(date -u +'%Y-%m-%dT%H:%M:%SZ')
          **Triggered By**: ${{ github.event_name }}
          **Workflow Run**: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

          ## Summary

          Health check executed for target: **${TARGET}**

          - Check Count: ${CHECK_COUNT}
          - Check Interval: ${CHECK_INTERVAL}s

          ## Results

          ### Docker Status
          See \`docker-health.log\` and \`docker-diagnostics.log\` for details.

          ### Kubernetes Status
          See \`k8s-health.log\` and \`k8s-diagnostics.log\` for details.

          ## Files in This Report

          - \`HEALTH_REPORT.md\` - This report
          - \`docker-health.log\` - Docker health check output
          - \`docker-diagnostics.log\` - Docker system diagnostics
          - \`k8s-health.log\` - Kubernetes health check output
          - \`k8s-diagnostics.log\` - Kubernetes system diagnostics
          EOF
          cat health-check-reports/HEALTH_REPORT.md

      - name: Upload health check reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: health-check-${{ env.TARGET }}-${{ github.run_id }}
          path: health-check-reports/
          retention-days: 30

      - name: Check health check success
        continue-on-error: true
        # Deliberately never fails the job; surfaces a warning in the log only.
        run: |
          if grep -q "✅ All services healthy" health-check-reports/docker-health.log 2>/dev/null || \
             grep -q "✅ All services healthy" health-check-reports/k8s-health.log 2>/dev/null; then
            echo "✅ Health check passed"
            exit 0
          else
            echo "⚠️ Health check warnings detected"
            exit 0 # Don't fail, just report
          fi

      - name: Create issue on health failure
        # Only open issues for scheduled runs against a Kubernetes-facing target.
        if: |
          failure() &&
          github.event_name == 'schedule' &&
          contains(fromJson('["kubernetes", "both"]'), env.TARGET)
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `🚨 Health Check Failed - ${new Date().toISOString()}`,
              body: `Health check failed at ${new Date().toISOString()}\n\nSee workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`,
              labels: ['monitoring', 'health-check', 'critical']
            });

      - name: Notify Slack - Success
        if: success()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          text: |
            ✅ VAPORA Health Check Passed
            Target: ${{ env.TARGET }}
            Checks: ${{ env.CHECK_COUNT }}
          webhook_url: ${{ secrets.SLACK_WEBHOOK }}
          fields: repo,message

      - name: Notify Slack - Failure
        if: failure()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          text: |
            ❌ VAPORA Health Check Failed
            Target: ${{ env.TARGET }}
            Check workflow logs for details
          webhook_url: ${{ secrets.SLACK_WEBHOOK_ALERTS }}
          fields: repo,message,commit