# Vapora/provisioning/.woodpecker/health-check.yml

# 338 lines
# 12 KiB
# YAML
# Raw Permalink Normal View History

# 2026-01-12 03:36:55 +00:00
# VAPORA Woodpecker Pipeline - Health Check & Monitoring
# Continuous health monitoring for Docker and Kubernetes deployments
# Triggers on: cron schedule, manual promotion
# Events that start this pipeline: scheduled (cron) runs and manual promotion.
trigger:
  event:
    - cron
    - promote
  cron:
    # Every 15 minutes - quick check
    - '*/15 * * * *'
    # Every 6 hours - comprehensive diagnostics
    - '0 */6 * * *'
# Shared settings referenced as ${NAME} by the step commands below.
# ARTIFACTS_DIR is not referenced anywhere in the visible part of this file.
# NOTE(review): confirm how the runner exposes these values to step commands
# (pre-processing vs. environment variables); the commands rely on them.
variables:
  ARTIFACTS_DIR: 'provisioning/artifacts'
  LOGS_DIR: 'provisioning/logs'
  VAPORA_NAMESPACE: 'vapora'
stages:
# Stage: setup
# Creates the health-check log directory and prints a banner with the current
# UTC timestamp and the event that started the pipeline.
setup:
  steps:
    - name: prepare
      image: alpine:latest
      commands:
        - mkdir -p ${LOGS_DIR}/health-checks
        - echo "🏥 VAPORA Health Check Pipeline"
        # The next two commands contain ": ". Written as plain scalars, YAML
        # parses them as one-key mappings instead of command strings, so they
        # are given as a block scalar / quoted scalar.
        - |
          echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
        - 'echo "Event: ${CI_PIPELINE_EVENT}"'
# Stage: install_dependencies
# Installs curl, jq, yq, nushell, jinja2-cli and kubectl inside a rust:latest
# container, then prints their versions as a smoke test.
# NOTE(review): on runners that start a fresh container per step, tools
# installed here are not visible to later steps - confirm whether this stage
# is meant to be more than a tooling smoke test.
install_dependencies:
  steps:
    - name: install_tools
      image: rust:latest
      commands:
        # python3-venv is installed explicitly: the step must not assume that
        # the image already ships pip.
        - apt-get update && apt-get install -y curl jq yq python3-venv
        - cargo install nu --locked
        # jinja2-cli goes into its own virtualenv so the install does not
        # depend on the distribution allowing system-wide pip installs.
        - python3 -m venv /opt/jinja2-cli
        - /opt/jinja2-cli/bin/pip install jinja2-cli
        - ln -s /opt/jinja2-cli/bin/jinja2 /usr/local/bin/jinja2
        # -f makes curl fail on HTTP errors instead of saving an error page
        # under the name "kubectl".
        - curl -fsSLO "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
        - chmod +x kubectl && mv kubectl /usr/local/bin/
        - nu --version
        - kubectl version --client
        - docker --version || echo "Docker not available in this runner"
# Stage: configure_kubernetes
# Decodes the base64 kubeconfig from KUBE_CONFIG_STAGING into ~/.kube/config
# and verifies that the cluster API answers.
# NOTE(review): the `when.evaluate` expression is kept verbatim; confirm that
# the runner's expression engine accepts `return` and `build.Health_Target`.
configure_kubernetes:
  depends_on: [install_dependencies]
  steps:
    - name: setup_kubeconfig_staging
      image: alpine:latest
      environment:
        KUBE_CONFIG_STAGING: ${KUBE_CONFIG_STAGING}
      commands:
        # This step runs in alpine:latest and previously called kubectl
        # without installing it; fetch it the same way install_tools does.
        - apk add --no-cache curl
        - curl -fsSLO "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
        - chmod +x kubectl && mv kubectl /usr/local/bin/
        # Fail early with a clear message rather than writing an empty
        # kubeconfig and failing later inside kubectl.
        - '[ -n "$KUBE_CONFIG_STAGING" ] || { echo "KUBE_CONFIG_STAGING is empty" >&2; exit 1; }'
        - mkdir -p ~/.kube
        - echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config
        - chmod 600 ~/.kube/config
        - kubectl cluster-info
        - echo "✓ Kubernetes staging configured"
      when:
        evaluate: 'return build.Health_Target == "kubernetes" || build.Health_Target == ""'
# Stage: health_check_docker
# Uses the mounted Docker socket to record container/network status, probes
# the service HTTP endpoints, and captures Docker disk/resource diagnostics.
# Each step writes a log file under ${LOGS_DIR}/health-checks.
# NOTE(review): this stage waits on configure_kubernetes although its steps
# only use docker and curl - confirm the dependency is intended.
health_check_docker:
  depends_on: [configure_kubernetes]
  steps:
    - name: check_docker_containers
      image: docker:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
      commands:
        - |
          echo "🐳 Docker Health Check"
          echo "---"
          mkdir -p ${LOGS_DIR}/health-checks
          {
            echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
            echo ""
            echo "Container Status:"
            docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
            echo ""
            echo "Network Status:"
            docker network ls
          } | tee ${LOGS_DIR}/health-checks/docker-containers.log

    - name: check_docker_endpoints
      image: docker:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
      commands:
        - apk add --no-cache curl
        - |
          echo "🔍 Docker Endpoint Health Checks"
          mkdir -p ${LOGS_DIR}/health-checks
          # Start from an empty log; the summary step counts "⚠️" lines in it.
          > ${LOGS_DIR}/health-checks/docker-endpoints.log

          # check_endpoint NAME URL
          # Logs "✓ NAME healthy" when URL answers with a success status,
          # otherwise "⚠️ NAME unreachable". Never fails the step.
          check_endpoint() {
            # Quoted so that names containing spaces ("LLM Router") are not
            # word-split by shells that expand `local` arguments normally.
            local name="$1"
            local url="$2"
            echo "Checking $name: $url" | tee -a ${LOGS_DIR}/health-checks/docker-endpoints.log
            # Bounded timeouts keep one hung service from stalling the run.
            if curl -sf --connect-timeout 5 --max-time 10 "$url" > /dev/null; then
              echo "✓ $name healthy" | tee -a ${LOGS_DIR}/health-checks/docker-endpoints.log
            else
              echo "⚠️ $name unreachable" | tee -a ${LOGS_DIR}/health-checks/docker-endpoints.log
            fi
          }

          # NOTE(review): "localhost" is resolved from inside this step's
          # container - confirm the services are reachable from there.
          check_endpoint "Backend" "http://localhost:8001/health"
          check_endpoint "Frontend" "http://localhost:3000"
          check_endpoint "Agents" "http://localhost:8002/health"
          check_endpoint "LLM Router" "http://localhost:8003/health"
          check_endpoint "SurrealDB" "http://localhost:8000/health"

    - name: collect_docker_diagnostics
      image: docker:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
      commands:
        - apk add --no-cache curl jq
        - |
          echo "📊 Docker Diagnostics"
          mkdir -p ${LOGS_DIR}/health-checks
          {
            echo "Docker System Info:"
            docker system df
            echo ""
            echo "Docker Resource Usage:"
            docker stats --no-stream --all
            echo ""
            echo "Docker Volume Status:"
            docker volume ls
          } | tee ${LOGS_DIR}/health-checks/docker-diagnostics.log
# Stage: health_check_kubernetes
# Records deployment/pod/service state, recent events, node and pod resource
# usage, and the last 100 log lines of the VAPORA deployments. Each step
# writes its output under ${LOGS_DIR}/health-checks.
#
# Every step runs in alpine:latest, which previously called kubectl without
# installing it. The shared bootstrap command (anchor `kube_bootstrap`)
# installs kubectl when missing and, when KUBE_CONFIG_STAGING is provided and
# no kubeconfig exists yet, decodes it into ~/.kube/config.
health_check_kubernetes:
  depends_on: [configure_kubernetes]
  steps:
    - name: check_k8s_deployments
      image: alpine:latest
      environment: &kube_env
        KUBE_CONFIG_STAGING: ${KUBE_CONFIG_STAGING}
      commands:
        - &kube_bootstrap |
          apk add --no-cache curl
          if ! command -v kubectl > /dev/null 2>&1; then
            curl -fsSLO "https://dl.k8s.io/release/$(curl -fsSL https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
            chmod +x kubectl && mv kubectl /usr/local/bin/
          fi
          if [ -n "$KUBE_CONFIG_STAGING" ] && [ ! -s ~/.kube/config ]; then
            mkdir -p ~/.kube
            echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config
            chmod 600 ~/.kube/config
          fi
        - |
          echo "☸️ Kubernetes Deployment Health Check"
          echo "---"
          mkdir -p ${LOGS_DIR}/health-checks
          {
            echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
            echo ""
            echo "Deployment Status:"
            kubectl get deployments -n ${VAPORA_NAMESPACE} -o wide
            echo ""
            echo "Pod Status:"
            kubectl get pods -n ${VAPORA_NAMESPACE} -o wide
            echo ""
            echo "Pod Details:"
            kubectl get pods -n ${VAPORA_NAMESPACE} -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
          } | tee ${LOGS_DIR}/health-checks/k8s-deployments.log

    - name: check_k8s_services
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kube_bootstrap
        - |
          echo "🔍 Kubernetes Service Health Check"
          mkdir -p ${LOGS_DIR}/health-checks
          {
            echo "Services:"
            kubectl get services -n ${VAPORA_NAMESPACE} -o wide
            echo ""
            echo "Endpoints:"
            kubectl get endpoints -n ${VAPORA_NAMESPACE}
            echo ""
            echo "ConfigMap:"
            kubectl get configmap -n ${VAPORA_NAMESPACE} -o yaml | head -30
          } | tee ${LOGS_DIR}/health-checks/k8s-services.log

    - name: check_k8s_events
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kube_bootstrap
        - |
          echo "📋 Recent Kubernetes Events"
          mkdir -p ${LOGS_DIR}/health-checks
          kubectl get events -n ${VAPORA_NAMESPACE} --sort-by='.lastTimestamp' | tail -50 | tee ${LOGS_DIR}/health-checks/k8s-events.log

    - name: collect_k8s_diagnostics
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kube_bootstrap
        - |
          echo "📊 Kubernetes Diagnostics"
          mkdir -p ${LOGS_DIR}/health-checks
          {
            echo "Cluster Info:"
            kubectl cluster-info
            echo ""
            echo "Nodes:"
            kubectl get nodes -o wide
            echo ""
            echo "Resource Usage (if metrics available):"
            kubectl top nodes 2>/dev/null || echo "Metrics server not available"
            echo ""
            echo "Pod Resource Usage:"
            kubectl top pods -n ${VAPORA_NAMESPACE} 2>/dev/null || echo "Pod metrics not available"
          } | tee ${LOGS_DIR}/health-checks/k8s-diagnostics.log

    - name: collect_pod_logs
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kube_bootstrap
        - |
          echo "📝 Collecting Pod Logs"
          mkdir -p ${LOGS_DIR}/health-checks/pods
          # Best effort per deployment: a missing or crashing deployment must
          # not prevent the logs of the remaining ones from being collected.
          for component in backend agents llm-router; do
            kubectl logs -n ${VAPORA_NAMESPACE} "deployment/vapora-$component" --tail=100 > "${LOGS_DIR}/health-checks/pods/$component.log" 2>&1 \
              || echo "⚠️ Could not collect logs for vapora-$component"
          done
          ls -lah ${LOGS_DIR}/health-checks/pods/
# Stage: analyze_health
# Writes HEALTH_REPORT.md (an index of the log files produced by the check
# stages) and prints a summary with the number of Docker endpoint warnings
# and Kubernetes failure markers found in the logs. The summary is
# informational only: it does not fail the pipeline.
analyze_health:
  depends_on: [health_check_docker, health_check_kubernetes]
  steps:
    - name: generate_health_report
      image: alpine:latest
      commands:
        - |
          mkdir -p ${LOGS_DIR}/health-checks
          # The heredoc delimiter is unquoted so that $(date ...) and the
          # ${...} references are expanded by the shell; with a quoted
          # delimiter the report contained the literal "$(date ...)" text.
          # Markdown backticks are escaped (\`) so they are not run as
          # command substitutions.
          cat > ${LOGS_DIR}/health-checks/HEALTH_REPORT.md << EOF
          # VAPORA Health Check Report

          **Report Time**: $(date -u +'%Y-%m-%dT%H:%M:%SZ')
          **Pipeline**: ${CI_BUILD_LINK}

          ## Summary
          Health check completed for VAPORA services

          ## Docker Status
          - Check logs: \`${LOGS_DIR}/health-checks/docker-containers.log\`
          - Endpoint checks: \`${LOGS_DIR}/health-checks/docker-endpoints.log\`
          - System diagnostics: \`${LOGS_DIR}/health-checks/docker-diagnostics.log\`

          ## Kubernetes Status
          - Deployment status: \`${LOGS_DIR}/health-checks/k8s-deployments.log\`
          - Service status: \`${LOGS_DIR}/health-checks/k8s-services.log\`
          - Recent events: \`${LOGS_DIR}/health-checks/k8s-events.log\`
          - System diagnostics: \`${LOGS_DIR}/health-checks/k8s-diagnostics.log\`
          - Pod logs: \`${LOGS_DIR}/health-checks/pods/\`

          ## Diagnostics
          Review the following for detailed information:
          1. **Docker Health**
             - Container status and uptime
             - Endpoint responsiveness (8001, 8002, 8003, 3000, 8000)
             - Resource allocation and usage
          2. **Kubernetes Health**
             - Deployment replica status
             - Pod readiness conditions
             - Service endpoint availability
             - Recent cluster events
             - Node resource availability

          ## Action Required
          If any services are down or unhealthy:
          1. Review pod logs in \`pods/\` directory
          2. Check recent events in \`k8s-events.log\`
          3. Investigate resource constraints
          4. Check configuration in ConfigMap
          5. Consider rollback if recent deployment

          ## Next Check
          Next automatic health check scheduled per cron configuration
          EOF
          cat ${LOGS_DIR}/health-checks/HEALTH_REPORT.md

    - name: check_health_status
      image: alpine:latest
      commands:
        - |
          echo "📊 Health Check Summary"
          echo "---"
          # Count issues.
          # grep -c prints "0" AND exits non-zero when nothing matches, so
          # "$(grep -c ... || echo 0)" produced the two-line value "0\n0" and
          # broke the numeric test below. The fallback is applied to the
          # assignment instead; it also covers a missing log file.
          DOCKER_DOWN=$(grep -c "⚠️" ${LOGS_DIR}/health-checks/docker-endpoints.log 2>/dev/null) || DOCKER_DOWN=0
          K8S_DOWN=$(grep -E -c "CrashLoopBackOff|Error|Failed" ${LOGS_DIR}/health-checks/k8s-deployments.log 2>/dev/null) || K8S_DOWN=0
          echo "Docker issues: $DOCKER_DOWN"
          echo "Kubernetes issues: $K8S_DOWN"
          if [ "$DOCKER_DOWN" -gt 0 ] || [ "$K8S_DOWN" -gt 0 ]; then
            echo "⚠️ Issues detected - may require attention"
          else
            echo "✓ All checks passed"
          fi
# Stage: publish
# Lists the generated health-check files and, when SLACK_WEBHOOK is set,
# posts a completion message to Slack.
# NOTE(review): the Slack message always reports "Completed" with a ✅,
# regardless of the issue counts printed by analyze_health - confirm this is
# the intended notification.
publish:
  depends_on: [analyze_health]
  steps:
    - name: publish_reports
      image: alpine:latest
      commands:
        - echo "📦 Health check reports published"
        - ls -lah ${LOGS_DIR}/health-checks/
        - echo ""
        - du -sh ${LOGS_DIR}/health-checks/

    - name: notify_slack_success
      image: alpine:latest
      environment:
        SLACK_WEBHOOK: ${SLACK_WEBHOOK}
      commands:
        - |
          if [ -n "$SLACK_WEBHOOK" ]; then
            apk add --no-cache curl jq
            # The report path is passed to jq as an argument: inside the
            # former single-quoted JSON literal the shell never expanded it.
            report="${LOGS_DIR}/health-checks/HEALTH_REPORT.md"
            # jq builds the payload so every value is JSON-escaped. Slack
            # mrkdwn marks bold with single asterisks, not "**".
            payload=$(jq -n --arg report "$report" '{
              text: "✅ VAPORA Health Check Completed",
              blocks: [
                {
                  type: "section",
                  text: {
                    type: "mrkdwn",
                    text: "✅ *VAPORA Health Check Completed*\n\n*Systems Monitored:*\n• Docker (containers, endpoints)\n• Kubernetes (deployments, pods, services)"
                  }
                },
                {
                  type: "context",
                  elements: [
                    {
                      type: "mrkdwn",
                      text: "*Report Location*: `\($report)`"
                    }
                  ]
                }
              ]
            }')
            # -f surfaces HTTP errors from Slack; the webhook URL is quoted
            # so it is passed to curl as a single argument.
            curl -fsS -X POST "$SLACK_WEBHOOK" \
              -H 'Content-Type: application/json' \
              -d "$payload"
          fi