# VAPORA Woodpecker Pipeline - Health Check & Monitoring
# Continuous health monitoring for Docker and Kubernetes deployments
# Triggers on: cron schedule, manual promotion
|
||
|
|
|
||
|
|
# Events that start this pipeline: scheduled (cron) runs and manual promotions.
trigger:
  event:
    - cron
    - promote
  cron:
    # Every 15 minutes - quick check
    - '*/15 * * * *'
    # Every 6 hours - comprehensive diagnostics
    - '0 */6 * * *'
|
||
|
|
|
||
|
|
# Pipeline-wide values referenced as ${NAME} from the stage commands.
variables:
  # Not referenced by any step in this pipeline file.
  ARTIFACTS_DIR: 'provisioning/artifacts'
  # Root directory; every step writes its logs under <LOGS_DIR>/health-checks.
  LOGS_DIR: 'provisioning/logs'
  # Kubernetes namespace passed to every `kubectl -n` call.
  VAPORA_NAMESPACE: 'vapora'
|
||
|
|
|
||
|
|
stages:
|
||
|
|
# Stage: create the health-check log directory and print a run banner.
setup:
  steps:
    - name: prepare
      image: alpine:latest
      commands:
        # A literal block scalar is required here: written as plain list items,
        # the echo lines containing ": " ("Timestamp: ...", "Event: ...") are
        # parsed by YAML as single-key mappings instead of command strings.
        - |
          mkdir -p "${LOGS_DIR}/health-checks"
          echo "🏥 VAPORA Health Check Pipeline"
          echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
          echo "Event: ${CI_PIPELINE_EVENT}"
|
||
|
|
|
||
|
|
# Stage: install CLI tooling (curl, jq, yq, nushell, jinja2-cli, kubectl) in the
# rust:latest step container and print the installed versions.
# NOTE(review): every later step declares its own image (alpine/docker), so
# presumably nothing installed here is visible to them - confirm whether this
# stage is still needed.
install_dependencies:
  steps:
    - name: install_tools
      image: rust:latest
      commands:
        - apt-get update && apt-get install -y curl jq yq
        - cargo install nu --locked
        # NOTE(review): confirm that rust:latest actually provides `pip`;
        # this command fails the step if it does not.
        - pip install jinja2-cli
        - |
          # --fail (-f) makes curl exit non-zero on an HTTP error instead of
          # saving the error page as the "kubectl" binary.
          kubectl_version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
          curl -fsSLO "https://dl.k8s.io/release/$kubectl_version/bin/linux/amd64/kubectl"
        - chmod +x kubectl && mv kubectl /usr/local/bin/
        - nu --version
        - kubectl version --client
        # Docker is optional here; report its absence without failing the step.
        - docker --version || echo "Docker not available in this runner"
|
||
|
|
|
||
|
|
# Stage: decode the base64 staging kubeconfig into ~/.kube/config and verify
# that the cluster is reachable.
configure_kubernetes:
  depends_on: [install_dependencies]
  steps:
    - name: setup_kubeconfig_staging
      image: alpine:latest
      environment:
        KUBE_CONFIG_STAGING: ${KUBE_CONFIG_STAGING}
      commands:
        # alpine:latest ships neither curl nor kubectl, so fetch kubectl here
        # before `kubectl cluster-info` is called below.
        - |
          apk add --no-cache curl
          kubectl_version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
          curl -fsSLO "https://dl.k8s.io/release/$kubectl_version/bin/linux/amd64/kubectl"
          chmod +x kubectl && mv kubectl /usr/local/bin/
        - |
          # Fail with a clear message instead of writing an empty kubeconfig.
          if [ -z "$KUBE_CONFIG_STAGING" ]; then
            echo "KUBE_CONFIG_STAGING is empty - cannot configure kubectl" >&2
            exit 1
          fi
          mkdir -p ~/.kube
          # Restrictive umask so the credentials are never world-readable,
          # not even between creation and chmod.
          (umask 077 && echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config)
          chmod 600 ~/.kube/config
          # NOTE(review): ~/.kube/config lives in this step's container;
          # confirm that later kubectl steps can actually see it.
          kubectl cluster-info
          echo "✓ Kubernetes staging configured"
      when:
        # Runs only when the target is Kubernetes or no target was given.
        evaluate: 'return build.Health_Target == "kubernetes" || build.Health_Target == ""'
|
||
|
|
|
||
|
|
# Stage: inspect the Docker deployment through the host Docker socket and write
# one log file per step under ${LOGS_DIR}/health-checks.
health_check_docker:
  depends_on: [configure_kubernetes]
  steps:
    # Lists all containers (running or not) and the Docker networks.
    - name: check_docker_containers
      image: docker:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
      commands:
        - |
          echo "🐳 Docker Health Check"
          echo "---"
          mkdir -p "${LOGS_DIR}/health-checks"
          {
            echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
            echo ""
            echo "Container Status:"
            docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
            echo ""
            echo "Network Status:"
            docker network ls
          } | tee "${LOGS_DIR}/health-checks/docker-containers.log"

    # Probes each service's HTTP endpoint and records healthy/unreachable.
    # NOTE(review): "localhost" resolves inside this step's container; confirm
    # the services are reachable from there (e.g. host networking).
    - name: check_docker_endpoints
      image: docker:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
      commands:
        - apk add --no-cache curl
        - |
          echo "🔍 Docker Endpoint Health Checks"
          mkdir -p "${LOGS_DIR}/health-checks"
          log_file="${LOGS_DIR}/health-checks/docker-endpoints.log"
          # Start every run with an empty log.
          : > "$log_file"

          # check_endpoint NAME URL
          # Appends "✓ NAME healthy" or "⚠️ NAME unreachable" to the log.
          # Never fails the step; the "⚠️" marker is what the
          # check_health_status step counts later.
          check_endpoint() {
            local name="$1"
            local url="$2"
            echo "Checking $name: $url" | tee -a "$log_file"
            # --max-time bounds each probe so one hung service cannot stall
            # the whole health check.
            if curl -sf --max-time 10 "$url" > /dev/null; then
              echo "✓ $name healthy" | tee -a "$log_file"
            else
              echo "⚠️ $name unreachable" | tee -a "$log_file"
            fi
          }

          check_endpoint "Backend" "http://localhost:8001/health"
          check_endpoint "Frontend" "http://localhost:3000"
          check_endpoint "Agents" "http://localhost:8002/health"
          check_endpoint "LLM Router" "http://localhost:8003/health"
          check_endpoint "SurrealDB" "http://localhost:8000/health"

    # Disk usage, per-container resource stats and volumes.
    - name: collect_docker_diagnostics
      image: docker:latest
      volumes:
        - /var/run/docker.sock:/var/run/docker.sock
      commands:
        - apk add --no-cache curl jq
        - |
          echo "📊 Docker Diagnostics"
          mkdir -p "${LOGS_DIR}/health-checks"
          {
            echo "Docker System Info:"
            docker system df
            echo ""
            echo "Docker Resource Usage:"
            docker stats --no-stream --all
            echo ""
            echo "Docker Volume Status:"
            docker volume ls
          } | tee "${LOGS_DIR}/health-checks/docker-diagnostics.log"
|
||
|
|
|
||
|
|
# Stage: inspect the ${VAPORA_NAMESPACE} namespace with kubectl and write one
# log file per step under ${LOGS_DIR}/health-checks.
#
# Every step uses alpine:latest, which contains no kubectl, so each step first
# runs the shared bootstrap (YAML anchor &kubectl_bootstrap) that downloads
# kubectl and, when KUBE_CONFIG_STAGING is set, writes ~/.kube/config.
# NOTE(review): assumes each step runs in its own fresh container, so nothing
# installed or written under ~ by earlier steps is available - confirm.
health_check_kubernetes:
  depends_on: [configure_kubernetes]
  steps:
    # Deployment list, pod list and per-pod phase/readiness.
    - name: check_k8s_deployments
      image: alpine:latest
      environment: &kube_env
        KUBE_CONFIG_STAGING: ${KUBE_CONFIG_STAGING}
      commands:
        - &kubectl_bootstrap |
          apk add --no-cache curl
          kubectl_version="$(curl -fsSL https://dl.k8s.io/release/stable.txt)"
          curl -fsSLO "https://dl.k8s.io/release/$kubectl_version/bin/linux/amd64/kubectl"
          chmod +x kubectl && mv kubectl /usr/local/bin/
          if [ -n "$KUBE_CONFIG_STAGING" ]; then
            mkdir -p ~/.kube
            (umask 077 && echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config)
          fi
        - |
          echo "☸️ Kubernetes Deployment Health Check"
          echo "---"
          mkdir -p "${LOGS_DIR}/health-checks"
          {
            echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
            echo ""
            echo "Deployment Status:"
            kubectl get deployments -n "${VAPORA_NAMESPACE}" -o wide
            echo ""
            echo "Pod Status:"
            kubectl get pods -n "${VAPORA_NAMESPACE}" -o wide
            echo ""
            echo "Pod Details:"
            # One line per pod: name, phase, Ready condition status.
            kubectl get pods -n "${VAPORA_NAMESPACE}" -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
          } | tee "${LOGS_DIR}/health-checks/k8s-deployments.log"

    # Services, endpoints and the first 30 lines of the ConfigMap dump.
    - name: check_k8s_services
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kubectl_bootstrap
        - |
          echo "🔍 Kubernetes Service Health Check"
          mkdir -p "${LOGS_DIR}/health-checks"
          {
            echo "Services:"
            kubectl get services -n "${VAPORA_NAMESPACE}" -o wide
            echo ""
            echo "Endpoints:"
            kubectl get endpoints -n "${VAPORA_NAMESPACE}"
            echo ""
            echo "ConfigMap:"
            kubectl get configmap -n "${VAPORA_NAMESPACE}" -o yaml | head -30
          } | tee "${LOGS_DIR}/health-checks/k8s-services.log"

    # The 50 most recent events in the namespace.
    - name: check_k8s_events
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kubectl_bootstrap
        - |
          echo "📋 Recent Kubernetes Events"
          mkdir -p "${LOGS_DIR}/health-checks"
          kubectl get events -n "${VAPORA_NAMESPACE}" --sort-by='.lastTimestamp' | tail -50 | tee "${LOGS_DIR}/health-checks/k8s-events.log"

    # Cluster info, nodes and (if a metrics server exists) resource usage.
    - name: collect_k8s_diagnostics
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kubectl_bootstrap
        - |
          echo "📊 Kubernetes Diagnostics"
          mkdir -p "${LOGS_DIR}/health-checks"
          {
            echo "Cluster Info:"
            kubectl cluster-info
            echo ""
            echo "Nodes:"
            kubectl get nodes -o wide
            echo ""
            echo "Resource Usage (if metrics available):"
            kubectl top nodes 2>/dev/null || echo "Metrics server not available"
            echo ""
            echo "Pod Resource Usage:"
            kubectl top pods -n "${VAPORA_NAMESPACE}" 2>/dev/null || echo "Pod metrics not available"
          } | tee "${LOGS_DIR}/health-checks/k8s-diagnostics.log"

    # Last 100 log lines of the backend, agents and llm-router deployments.
    - name: collect_pod_logs
      image: alpine:latest
      environment: *kube_env
      commands:
        - *kubectl_bootstrap
        - |
          echo "📝 Collecting Pod Logs"
          mkdir -p "${LOGS_DIR}/health-checks/pods"
          # Best effort: a deployment whose logs cannot be read must not stop
          # collection for the remaining ones (kubectl's error text is already
          # captured in the per-service log via 2>&1).
          for svc in backend agents llm-router; do
            kubectl logs -n "${VAPORA_NAMESPACE}" "deployment/vapora-$svc" --tail=100 \
              > "${LOGS_DIR}/health-checks/pods/$svc.log" 2>&1 \
              || echo "⚠️ Could not collect logs for vapora-$svc"
          done
          ls -lah "${LOGS_DIR}/health-checks/pods/"
|
||
|
|
|
||
|
|
# Stage: write a Markdown summary pointing at the collected logs, then count
# the problems recorded in the Docker endpoint and Kubernetes deployment logs.
analyze_health:
  depends_on: [health_check_docker, health_check_kubernetes]
  steps:
    # Writes ${LOGS_DIR}/health-checks/HEALTH_REPORT.md and prints it.
    # NOTE(review): CI_BUILD_LINK looks like a legacy variable name (newer
    # Woodpecker releases appear to expose CI_PIPELINE_URL) - confirm against
    # the server version in use.
    - name: generate_health_report
      image: alpine:latest
      commands:
        - |
          mkdir -p "${LOGS_DIR}/health-checks"
          # The heredoc delimiter is deliberately unquoted so the timestamp
          # command substitution and the variables are expanded; with a quoted
          # delimiter they were written to the report literally. Markdown
          # backticks are escaped so they are not run as commands.
          cat > "${LOGS_DIR}/health-checks/HEALTH_REPORT.md" << EOF
          # VAPORA Health Check Report

          **Report Time**: $(date -u +'%Y-%m-%dT%H:%M:%SZ')
          **Pipeline**: ${CI_BUILD_LINK}

          ## Summary

          Health check completed for VAPORA services

          ## Docker Status

          - Check logs: \`${LOGS_DIR}/health-checks/docker-containers.log\`
          - Endpoint checks: \`${LOGS_DIR}/health-checks/docker-endpoints.log\`
          - System diagnostics: \`${LOGS_DIR}/health-checks/docker-diagnostics.log\`

          ## Kubernetes Status

          - Deployment status: \`${LOGS_DIR}/health-checks/k8s-deployments.log\`
          - Service status: \`${LOGS_DIR}/health-checks/k8s-services.log\`
          - Recent events: \`${LOGS_DIR}/health-checks/k8s-events.log\`
          - System diagnostics: \`${LOGS_DIR}/health-checks/k8s-diagnostics.log\`
          - Pod logs: \`${LOGS_DIR}/health-checks/pods/\`

          ## Diagnostics

          Review the following for detailed information:

          1. **Docker Health**
             - Container status and uptime
             - Endpoint responsiveness (8001, 8002, 8003, 3000, 8000)
             - Resource allocation and usage

          2. **Kubernetes Health**
             - Deployment replica status
             - Pod readiness conditions
             - Service endpoint availability
             - Recent cluster events
             - Node resource availability

          ## Action Required

          If any services are down or unhealthy:
          1. Review pod logs in \`pods/\` directory
          2. Check recent events in \`k8s-events.log\`
          3. Investigate resource constraints
          4. Check configuration in ConfigMap
          5. Consider rollback if recent deployment

          ## Next Check

          Next automatic health check scheduled per cron configuration

          EOF
          cat "${LOGS_DIR}/health-checks/HEALTH_REPORT.md"

    # Prints how many log lines indicate a problem. Informational only: the
    # step does not fail when issues are found.
    - name: check_health_status
      image: alpine:latest
      commands:
        - |
          echo "📊 Health Check Summary"
          echo "---"

          # Count issues.
          # grep -c prints "0" AND exits 1 when nothing matches, so the former
          # `$(grep -c ... || echo 0)` produced "0<newline>0" and broke the
          # numeric test below. Ignore the exit status and default only when
          # grep printed nothing at all (log file missing).
          DOCKER_DOWN=$(grep -c "⚠️" "${LOGS_DIR}/health-checks/docker-endpoints.log" 2>/dev/null) || true
          K8S_DOWN=$(grep -cE "CrashLoopBackOff|Error|Failed" "${LOGS_DIR}/health-checks/k8s-deployments.log" 2>/dev/null) || true
          [ -n "$DOCKER_DOWN" ] || DOCKER_DOWN=0
          [ -n "$K8S_DOWN" ] || K8S_DOWN=0

          echo "Docker issues: $DOCKER_DOWN"
          echo "Kubernetes issues: $K8S_DOWN"

          if [ "$DOCKER_DOWN" -gt 0 ] || [ "$K8S_DOWN" -gt 0 ]; then
            echo "⚠️ Issues detected - may require attention"
          else
            echo "✓ All checks passed"
          fi
|
||
|
|
|
||
|
|
# Stage: list the generated health-check files and, when a webhook is
# configured, post a completion message to Slack.
publish:
  depends_on: [analyze_health]
  steps:
    # Shows what was produced and how much space it takes.
    - name: publish_reports
      image: alpine:latest
      commands:
        - echo "📦 Health check reports published"
        - ls -lah "${LOGS_DIR}/health-checks/"
        - echo ""
        - du -sh "${LOGS_DIR}/health-checks/"

    # Posts a Block Kit message to SLACK_WEBHOOK; skipped when it is unset.
    - name: notify_slack_success
      image: alpine:latest
      environment:
        SLACK_WEBHOOK: ${SLACK_WEBHOOK}
      commands:
        - |
          if [ -z "$SLACK_WEBHOOK" ]; then
            echo "SLACK_WEBHOOK not set - skipping Slack notification"
            exit 0
          fi
          apk add --no-cache curl jq

          # Built with an unquoted heredoc so the log directory variable is
          # expanded (it stayed literal inside the former single-quoted -d
          # argument). Backticks are escaped; "\n" stays a JSON escape.
          # Slack mrkdwn uses *single* asterisks for bold.
          payload=$(cat << EOF
          {
            "text": "✅ VAPORA Health Check Completed",
            "blocks": [
              {
                "type": "section",
                "text": {
                  "type": "mrkdwn",
                  "text": "✅ *VAPORA Health Check Completed*\n\n*Systems Monitored:*\n• Docker (containers, endpoints)\n• Kubernetes (deployments, pods, services)"
                }
              },
              {
                "type": "context",
                "elements": [
                  {
                    "type": "mrkdwn",
                    "text": "*Report Location*: \`${LOGS_DIR}/health-checks/HEALTH_REPORT.md\`"
                  }
                ]
              }
            ]
          }
          EOF
          )

          # -f: an HTTP error from Slack fails the step instead of passing
          # silently; -sS: no progress meter, but still print errors.
          curl -fsS -X POST "$SLACK_WEBHOOK" \
            -H 'Content-Type: application/json' \
            -d "$payload"
|