---
# Health Check & Monitoring workflow.
# Runs VAPORA health checks against Docker and/or Kubernetes targets, collects
# diagnostics, uploads a report artifact, and notifies Slack / opens an issue
# on scheduled failures.
name: Health Check & Monitoring

on:
  schedule:
    - cron: '*/15 * * * *'  # Every 15 minutes
    - cron: '0 */6 * * *'   # Every 6 hours
  workflow_dispatch:
    inputs:
      target:
        description: 'Health check target'
        required: true
        default: 'kubernetes'
        type: choice
        options:
          - docker
          - kubernetes
          - both
      count:
        description: 'Number of checks to perform'
        required: false
        default: '1'
        type: string
      interval:
        description: 'Interval between checks (seconds)'
        required: false
        default: '30'
        type: string

concurrency:
  group: health-check-${{ github.event_name }}
  cancel-in-progress: false

jobs:
  health-check:
    name: Health Check - ${{ inputs.target || 'kubernetes' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install Nushell
        run: |
          cargo install nu --locked
          nu --version

      # NOTE: on scheduled runs `inputs.target` is empty, so every target
      # guard below falls back to the workflow_dispatch default
      # ('kubernetes') before comparing — otherwise cron runs would skip
      # every health-check step.
      - name: Install kubectl
        if: ${{ contains(fromJson('["kubernetes", "both"]'), inputs.target || 'kubernetes') }}
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'

      - name: Configure kubeconfig
        if: ${{ contains(fromJson('["kubernetes", "both"]'), inputs.target || 'kubernetes') }}
        continue-on-error: true
        run: |
          mkdir -p ~/.kube
          # Try to use CI cluster first, fall back to staging
          if [ -n "${{ secrets.KUBE_CONFIG_CI }}" ]; then
            echo "${{ secrets.KUBE_CONFIG_CI }}" | base64 -d > ~/.kube/config
          elif [ -n "${{ secrets.KUBE_CONFIG_STAGING }}" ]; then
            echo "${{ secrets.KUBE_CONFIG_STAGING }}" | base64 -d > ~/.kube/config
          else
            echo "Warning: No kubeconfig available"
            exit 1
          fi
          chmod 600 ~/.kube/config
          kubectl cluster-info

      - name: Create health check directory
        run: mkdir -p health-check-reports

      - name: Run health check (Docker)
        if: ${{ contains(fromJson('["docker", "both"]'), inputs.target || 'kubernetes') }}
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target docker \
            --count ${{ inputs.count || '1' }} \
            --interval ${{ inputs.interval || '30' }} \
            2>&1 | tee ../health-check-reports/docker-health.log

      - name: Run health check (Kubernetes)
        if: ${{ contains(fromJson('["kubernetes", "both"]'), inputs.target || 'kubernetes') }}
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target kubernetes \
            --count ${{ inputs.count || '1' }} \
            --interval ${{ inputs.interval || '30' }} \
            2>&1 | tee ../health-check-reports/k8s-health.log

      - name: Collect Kubernetes diagnostics
        if: ${{ always() && contains(fromJson('["kubernetes", "both"]'), inputs.target || 'kubernetes') }}
        continue-on-error: true
        run: |
          # '>' truncates on the first write so re-runs on a reused runner
          # don't accumulate stale output (matches the Docker step below).
          echo "=== VAPORA Namespace ===" > health-check-reports/k8s-diagnostics.log
          kubectl get all -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1

          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Deployment Details ===" >> health-check-reports/k8s-diagnostics.log
          kubectl describe deployments -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1

          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Pod Events ===" >> health-check-reports/k8s-diagnostics.log
          kubectl get events -n vapora --sort-by='.lastTimestamp' >> health-check-reports/k8s-diagnostics.log 2>&1

          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Resource Usage ===" >> health-check-reports/k8s-diagnostics.log
          kubectl top pods -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1 || echo "metrics-server not available"

          cat health-check-reports/k8s-diagnostics.log

      - name: Collect Docker diagnostics
        if: ${{ always() && contains(fromJson('["docker", "both"]'), inputs.target || 'kubernetes') }}
        continue-on-error: true
        run: |
          echo "=== Docker Services ===" > health-check-reports/docker-diagnostics.log
          docker ps -a >> health-check-reports/docker-diagnostics.log 2>&1 || echo "Docker daemon not accessible"

          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Networks ===" >> health-check-reports/docker-diagnostics.log
          docker network ls >> health-check-reports/docker-diagnostics.log 2>&1 || true

          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Volumes ===" >> health-check-reports/docker-diagnostics.log
          docker volume ls >> health-check-reports/docker-diagnostics.log 2>&1 || true

          cat health-check-reports/docker-diagnostics.log

      - name: Generate health report
        if: always()
        run: |
          # The heredoc delimiter stays quoted so the markdown backticks are
          # not executed as command substitutions; a quoted heredoc also
          # suppresses $(date ...), so the timestamp goes in as a placeholder
          # and is substituted afterwards (the ISO format contains no sed
          # delimiter characters).
          cat > health-check-reports/HEALTH_REPORT.md << 'EOF'
          # VAPORA Health Check Report

          **Report Time**: @REPORT_TIME@
          **Triggered By**: ${{ github.event_name }}
          **Workflow Run**: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

          ## Summary

          Health check executed for target: **${{ inputs.target || 'kubernetes' }}**
          - Check Count: ${{ inputs.count || '1' }}
          - Check Interval: ${{ inputs.interval || '30' }}s

          ## Results

          ### Docker Status
          See `docker-health.log` and `docker-diagnostics.log` for details.

          ### Kubernetes Status
          See `k8s-health.log` and `k8s-diagnostics.log` for details.

          ## Files in This Report

          - `HEALTH_REPORT.md` - This report
          - `docker-health.log` - Docker health check output
          - `docker-diagnostics.log` - Docker system diagnostics
          - `k8s-health.log` - Kubernetes health check output
          - `k8s-diagnostics.log` - Kubernetes system diagnostics

          EOF
          sed -i "s/@REPORT_TIME@/$(date -u +'%Y-%m-%dT%H:%M:%SZ')/" health-check-reports/HEALTH_REPORT.md
          cat health-check-reports/HEALTH_REPORT.md

      - name: Upload health check reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: health-check-${{ inputs.target || 'kubernetes' }}-${{ github.run_id }}
          path: health-check-reports/
          retention-days: 30

      - name: Check health check success
        continue-on-error: true
        run: |
          if grep -q "✅ All services healthy" health-check-reports/docker-health.log 2>/dev/null || \
             grep -q "✅ All services healthy" health-check-reports/k8s-health.log 2>/dev/null; then
            echo "✅ Health check passed"
          else
            echo "⚠️ Health check warnings detected"
          fi
          exit 0  # Don't fail, just report

      # Scheduled runs always use the default target, so the event check
      # alone is sufficient here.
      - name: Create issue on health failure
        if: failure() && github.event_name == 'schedule'
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `🚨 Health Check Failed - ${new Date().toISOString()}`,
              body: `Health check failed at ${new Date().toISOString()}\n\nSee workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`,
              labels: ['monitoring', 'health-check', 'critical']
            });

      - name: Notify Slack - Success
        if: success()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          text: |
            ✅ VAPORA Health Check Passed
            Target: ${{ inputs.target || 'kubernetes' }}
            Checks: ${{ inputs.count || '1' }}
          webhook_url: ${{ secrets.SLACK_WEBHOOK }}
          fields: repo,message

      - name: Notify Slack - Failure
        if: failure()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          text: |
            ❌ VAPORA Health Check Failed
            Target: ${{ inputs.target || 'kubernetes' }}
            Check workflow logs for details
          webhook_url: ${{ secrets.SLACK_WEBHOOK_ALERTS }}
          fields: repo,message,commit