name: Health Check & Monitoring

on:
  schedule:
    - cron: '*/15 * * * *'  # Every 15 minutes
    - cron: '0 */6 * * *'   # Every 6 hours
  workflow_dispatch:
    inputs:
      target:
        description: 'Health check target'
        required: true
        default: 'kubernetes'
        type: choice
        options:
          - docker
          - kubernetes
          - both
      count:
        description: 'Number of checks to perform'
        required: false
        default: '1'
        type: string
      interval:
        description: 'Interval between checks (seconds)'
        required: false
        default: '30'
        type: string

concurrency:
  group: health-check-${{ github.event_name }}
  cancel-in-progress: false

jobs:
  health-check:
    name: Health Check - ${{ inputs.target || 'kubernetes' }}
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v4
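
      # The health-check script is written in Nushell; build the nu binary from
      # crates.io with the Rust toolchain preinstalled on ubuntu-latest runners.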
      - name: Install Nushell
        run: |
          cargo install nu --locked
          nu --version

      - name: Install kubectl
        # Scheduled runs carry no inputs, so fall back to the kubernetes default.
        if: ${{ (inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both' }}
        uses: azure/setup-kubectl@v3
        with:
          version: 'latest'
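
      # Kubeconfig comes base64-encoded from repository secrets: prefer the CI
      # cluster and fall back to staging.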
      - name: Configure kubeconfig
        if: ${{ (inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both' }}
        continue-on-error: true
        run: |
          mkdir -p ~/.kube
          # Try the CI cluster first, fall back to staging
          if [ -n "${{ secrets.KUBE_CONFIG_CI }}" ]; then
            echo "${{ secrets.KUBE_CONFIG_CI }}" | base64 -d > ~/.kube/config
          elif [ -n "${{ secrets.KUBE_CONFIG_STAGING }}" ]; then
            echo "${{ secrets.KUBE_CONFIG_STAGING }}" | base64 -d > ~/.kube/config
          else
            echo "Error: no kubeconfig available"
            exit 1
          fi
          chmod 600 ~/.kube/config
          kubectl cluster-info

      - name: Create health check directory
        run: mkdir -p health-check-reports
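
      # Both check steps call provisioning/scripts/health-check.nu and tee their
      # output into health-check-reports/ for the artifact upload below.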
      - name: Run health check (Docker)
        if: ${{ inputs.target == 'docker' || inputs.target == 'both' }}
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target docker \
            --count ${{ inputs.count || '1' }} \
            --interval ${{ inputs.interval || '30' }} \
            2>&1 | tee ../health-check-reports/docker-health.log

      - name: Run health check (Kubernetes)
        if: ${{ (inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both' }}
        continue-on-error: true
        run: |
          cd provisioning
          nu scripts/health-check.nu \
            --target kubernetes \
            --count ${{ inputs.count || '1' }} \
            --interval ${{ inputs.interval || '30' }} \
            2>&1 | tee ../health-check-reports/k8s-health.log
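
      # Diagnostics are collected with always() so the artifact still has
      # context even when the health checks above fail.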
      - name: Collect Kubernetes diagnostics
        if: ${{ always() && ((inputs.target || 'kubernetes') == 'kubernetes' || inputs.target == 'both') }}
        continue-on-error: true
        run: |
          echo "=== VAPORA Namespace ===" > health-check-reports/k8s-diagnostics.log
          kubectl get all -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Deployment Details ===" >> health-check-reports/k8s-diagnostics.log
          kubectl describe deployments -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Pod Events ===" >> health-check-reports/k8s-diagnostics.log
          kubectl get events -n vapora --sort-by='.lastTimestamp' >> health-check-reports/k8s-diagnostics.log 2>&1
          echo "" >> health-check-reports/k8s-diagnostics.log
          echo "=== Resource Usage ===" >> health-check-reports/k8s-diagnostics.log
          kubectl top pods -n vapora >> health-check-reports/k8s-diagnostics.log 2>&1 || echo "metrics-server not available"
          cat health-check-reports/k8s-diagnostics.log

      - name: Collect Docker diagnostics
        if: ${{ always() && (inputs.target == 'docker' || inputs.target == 'both') }}
        continue-on-error: true
        run: |
          echo "=== Docker Services ===" > health-check-reports/docker-diagnostics.log
          docker ps -a >> health-check-reports/docker-diagnostics.log 2>&1 || echo "Docker daemon not accessible"
          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Networks ===" >> health-check-reports/docker-diagnostics.log
          docker network ls >> health-check-reports/docker-diagnostics.log 2>&1 || true
          echo "" >> health-check-reports/docker-diagnostics.log
          echo "=== Docker Volumes ===" >> health-check-reports/docker-diagnostics.log
          docker volume ls >> health-check-reports/docker-diagnostics.log 2>&1 || true
          cat health-check-reports/docker-diagnostics.log
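
      # Summarize the run into a Markdown report that ships alongside the raw logs.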
      - name: Generate health report
        if: always()
        run: |
          # Write the timestamp outside the heredoc; the quoted 'EOF' keeps the
          # shell from expanding backticks and $ inside the Markdown body.
          {
            echo "# VAPORA Health Check Report"
            echo ""
            echo "**Report Time**: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
          } > health-check-reports/HEALTH_REPORT.md
          cat >> health-check-reports/HEALTH_REPORT.md << 'EOF'
          **Triggered By**: ${{ github.event_name }}
          **Workflow Run**: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}

          ## Summary

          Health check executed for target: **${{ inputs.target || 'kubernetes' }}**

          - Check Count: ${{ inputs.count || '1' }}
          - Check Interval: ${{ inputs.interval || '30' }}s

          ## Results

          ### Docker Status
          See `docker-health.log` and `docker-diagnostics.log` for details.

          ### Kubernetes Status
          See `k8s-health.log` and `k8s-diagnostics.log` for details.

          ## Files in This Report

          - `HEALTH_REPORT.md` - This report
          - `docker-health.log` - Docker health check output
          - `docker-diagnostics.log` - Docker system diagnostics
          - `k8s-health.log` - Kubernetes health check output
          - `k8s-diagnostics.log` - Kubernetes system diagnostics
          EOF
          cat health-check-reports/HEALTH_REPORT.md

      - name: Upload health check reports
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: health-check-${{ inputs.target || 'kubernetes' }}-${{ github.run_id }}
          path: health-check-reports/
          retention-days: 30
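
      # NOTE: assumes health-check.nu prints "✅ All services healthy" on success;
      # adjust the grep pattern if the script's output format changes.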
      - name: Check health check success
        continue-on-error: true
        run: |
          if grep -q "✅ All services healthy" health-check-reports/docker-health.log 2>/dev/null || \
             grep -q "✅ All services healthy" health-check-reports/k8s-health.log 2>/dev/null; then
            echo "✅ Health check passed"
            exit 0
          else
            echo "⚠️ Health check warnings detected"
            exit 0  # Don't fail, just report
          fi
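
      # Scheduled runs against the Kubernetes target open a tracking issue on
      # failure; manual (workflow_dispatch) runs only report via logs and Slack.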
      - name: Create issue on health failure
        if: |
          failure() &&
          github.event_name == 'schedule' &&
          (contains(fromJson('["kubernetes", "both"]'), inputs.target) || inputs.target == null)
        continue-on-error: true
        uses: actions/github-script@v7
        with:
          script: |
            await github.rest.issues.create({
              owner: context.repo.owner,
              repo: context.repo.repo,
              title: `🚨 Health Check Failed - ${new Date().toISOString()}`,
              body: `Health check failed at ${new Date().toISOString()}\n\nSee workflow run: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}`,
              labels: ['monitoring', 'health-check', 'critical']
            });
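
      # Slack notifications are best-effort: they require the SLACK_WEBHOOK
      # (success) and SLACK_WEBHOOK_ALERTS (failure) secrets to be configured.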
      - name: Notify Slack - Success
        if: success()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          fields: repo,message
          text: |
            ✅ VAPORA Health Check Passed
            Target: ${{ inputs.target || 'kubernetes' }}
            Checks: ${{ inputs.count || '1' }}
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK }}

      - name: Notify Slack - Failure
        if: failure()
        continue-on-error: true
        uses: 8398a7/action-slack@v3
        with:
          status: ${{ job.status }}
          fields: repo,message,commit
          text: |
            ❌ VAPORA Health Check Failed
            Target: ${{ inputs.target || 'kubernetes' }}
            Check workflow logs for details
        env:
          SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_ALERTS }}