---
# VAPORA Woodpecker Pipeline - Health Check & Monitoring
# Continuous health monitoring for Docker and Kubernetes deployments
# Triggers on: cron schedule, manual promotion
#
# Stage graph (via depends_on):
#   install_dependencies -> configure_kubernetes
#     -> health_check_docker + health_check_kubernetes
#     -> analyze_health -> publish
# The `setup` stage declares no dependencies.
#
# Every health-check step writes its output under ${LOGS_DIR}/health-checks so
# that analyze_health and publish can read it afterwards.

trigger:
  event: [cron, promote]
  cron:
    - "*/15 * * * *"  # Every 15 minutes - quick check
    - "0 */6 * * *"  # Every 6 hours - comprehensive diagnostics

# Shared paths and names, referenced as ${NAME} by the step commands below.
variables:
  ARTIFACTS_DIR: provisioning/artifacts
  LOGS_DIR: provisioning/logs
  VAPORA_NAMESPACE: vapora

stages:
  # Creates the log directory and prints a banner for the run.
  setup:
    steps:
      - name: prepare
        image: alpine:latest
        commands:
          - mkdir -p ${LOGS_DIR}/health-checks
          - echo "🏥 VAPORA Health Check Pipeline"
          # These two commands contain ": ", which YAML would read as a
          # mapping entry in a plain scalar, so they are single-quoted.
          - 'echo "Timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)"'
          - 'echo "Event: ${CI_PIPELINE_EVENT}"'

  # Installs the CLI tooling (curl, jq, yq, nu, jinja2-cli, kubectl).
  # NOTE(review): these tools are installed into this step's rust:latest
  # container; later steps use other images - confirm they can see them.
  # NOTE(review): confirm `pip` is available in rust:latest.
  install_dependencies:
    steps:
      - name: install_tools
        image: rust:latest
        commands:
          - apt-get update && apt-get install -y curl jq yq
          - cargo install nu --locked
          - pip install jinja2-cli
          # Folded scalar: the two lines below form one curl command.
          - >-
            curl -LO
            "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl"
          - chmod +x kubectl && mv kubectl /usr/local/bin/
          - nu --version
          - kubectl version --client
          - docker --version || echo "Docker not available in this runner"

  # Decodes the base64 kubeconfig from KUBE_CONFIG_STAGING into ~/.kube/config.
  # NOTE(review): this step runs kubectl on alpine:latest without installing
  # it, and writes the kubeconfig to the step's home directory - confirm both
  # are available to the health_check_kubernetes steps.
  configure_kubernetes:
    depends_on: [install_dependencies]
    steps:
      - name: setup_kubeconfig_staging
        image: alpine:latest
        environment:
          KUBE_CONFIG_STAGING: "${KUBE_CONFIG_STAGING}"
        commands:
          - mkdir -p ~/.kube
          - echo "$KUBE_CONFIG_STAGING" | base64 -d > ~/.kube/config
          - chmod 600 ~/.kube/config
          - kubectl cluster-info
          - echo "✓ Kubernetes staging configured"
        # Runs only when Health_Target is "kubernetes" or empty.
        when:
          evaluate: 'return build.Health_Target == "kubernetes" || build.Health_Target == ""'

  # Docker checks talk to the daemon through the mounted docker.sock.
  health_check_docker:
    depends_on: [configure_kubernetes]
    steps:
      # Snapshot of containers and networks -> docker-containers.log
      - name: check_docker_containers
        image: docker:latest
        volumes:
          - /var/run/docker.sock:/var/run/docker.sock
        commands:
          - |
            echo "🐳 Docker Health Check"
            echo "---"
            mkdir -p ${LOGS_DIR}/health-checks

            {
              echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
              echo ""
              echo "Container Status:"
              docker ps -a --format "table {{.Names}}\t{{.Status}}\t{{.Ports}}"
              echo ""
              echo "Network Status:"
              docker network ls
            } | tee ${LOGS_DIR}/health-checks/docker-containers.log

      # Probes each service URL; results -> docker-endpoints.log.
      # An unreachable endpoint is logged with a "⚠️" marker (counted later by
      # check_health_status) and does not fail the step.
      # NOTE(review): the URLs use localhost, which inside this step's
      # container may not be the Docker host - confirm network mode.
      - name: check_docker_endpoints
        image: docker:latest
        volumes:
          - /var/run/docker.sock:/var/run/docker.sock
        commands:
          - apk add --no-cache curl
          - |
            echo "🔍 Docker Endpoint Health Checks"
            mkdir -p ${LOGS_DIR}/health-checks

            endpoint_log=${LOGS_DIR}/health-checks/docker-endpoints.log
            # Start from an empty log on every run.
            : > "$endpoint_log"

            # check_endpoint NAME URL
            # Logs whether URL answers with a successful HTTP status.
            check_endpoint() {
              local name="$1"
              local url="$2"
              echo "Checking $name: $url" | tee -a "$endpoint_log"
              # --max-time keeps a hung service from stalling the whole check.
              if curl -sf --max-time 10 "$url" > /dev/null; then
                echo "✓ $name healthy" | tee -a "$endpoint_log"
              else
                echo "⚠️ $name unreachable" | tee -a "$endpoint_log"
              fi
            }

            check_endpoint "Backend" "http://localhost:8001/health"
            check_endpoint "Frontend" "http://localhost:3000"
            check_endpoint "Agents" "http://localhost:8002/health"
            check_endpoint "LLM Router" "http://localhost:8003/health"
            check_endpoint "SurrealDB" "http://localhost:8000/health"

      # Disk usage, per-container stats and volumes -> docker-diagnostics.log
      - name: collect_docker_diagnostics
        image: docker:latest
        volumes:
          - /var/run/docker.sock:/var/run/docker.sock
        commands:
          - apk add --no-cache curl jq
          - |
            echo "📊 Docker Diagnostics"
            mkdir -p ${LOGS_DIR}/health-checks

            {
              echo "Docker System Info:"
              docker system df
              echo ""
              echo "Docker Resource Usage:"
              docker stats --no-stream --all
              echo ""
              echo "Docker Volume Status:"
              docker volume ls
            } | tee ${LOGS_DIR}/health-checks/docker-diagnostics.log

  # Kubernetes checks, all scoped to ${VAPORA_NAMESPACE}.
  # NOTE(review): each step below calls kubectl but only installs curl on
  # alpine:latest - confirm kubectl and a kubeconfig are present at runtime.
  health_check_kubernetes:
    depends_on: [configure_kubernetes]
    steps:
      # Deployments, pods and per-pod Ready condition -> k8s-deployments.log
      - name: check_k8s_deployments
        image: alpine:latest
        commands:
          - apk add --no-cache curl
          - |
            echo "☸️ Kubernetes Deployment Health Check"
            echo "---"
            mkdir -p ${LOGS_DIR}/health-checks

            {
              echo "Timestamp: $(date -u +'%Y-%m-%dT%H:%M:%SZ')"
              echo ""
              echo "Deployment Status:"
              kubectl get deployments -n ${VAPORA_NAMESPACE} -o wide
              echo ""
              echo "Pod Status:"
              kubectl get pods -n ${VAPORA_NAMESPACE} -o wide
              echo ""
              echo "Pod Details:"
              # One line per pod: name, phase, Ready condition status.
              kubectl get pods -n ${VAPORA_NAMESPACE} -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\t"}{.status.conditions[?(@.type=="Ready")].status}{"\n"}{end}'
            } | tee ${LOGS_DIR}/health-checks/k8s-deployments.log

      # Services, endpoints and the first 30 ConfigMap lines -> k8s-services.log
      - name: check_k8s_services
        image: alpine:latest
        commands:
          - apk add --no-cache curl
          - |
            echo "🔍 Kubernetes Service Health Check"
            mkdir -p ${LOGS_DIR}/health-checks

            {
              echo "Services:"
              kubectl get services -n ${VAPORA_NAMESPACE} -o wide
              echo ""
              echo "Endpoints:"
              kubectl get endpoints -n ${VAPORA_NAMESPACE}
              echo ""
              echo "ConfigMap:"
              kubectl get configmap -n ${VAPORA_NAMESPACE} -o yaml | head -30
            } | tee ${LOGS_DIR}/health-checks/k8s-services.log

      # Last 50 events sorted by lastTimestamp -> k8s-events.log
      - name: check_k8s_events
        image: alpine:latest
        commands:
          - apk add --no-cache curl
          - |
            echo "📋 Recent Kubernetes Events"
            mkdir -p ${LOGS_DIR}/health-checks

            kubectl get events -n ${VAPORA_NAMESPACE} --sort-by='.lastTimestamp' | tail -50 | tee ${LOGS_DIR}/health-checks/k8s-events.log

      # Cluster info, nodes and resource usage -> k8s-diagnostics.log
      # `kubectl top` failures are replaced by a "not available" message.
      - name: collect_k8s_diagnostics
        image: alpine:latest
        commands:
          - apk add --no-cache curl
          - |
            echo "📊 Kubernetes Diagnostics"
            mkdir -p ${LOGS_DIR}/health-checks

            {
              echo "Cluster Info:"
              kubectl cluster-info
              echo ""
              echo "Nodes:"
              kubectl get nodes -o wide
              echo ""
              echo "Resource Usage (if metrics available):"
              kubectl top nodes 2>/dev/null || echo "Metrics server not available"
              echo ""
              echo "Pod Resource Usage:"
              kubectl top pods -n ${VAPORA_NAMESPACE} 2>/dev/null || echo "Pod metrics not available"
            } | tee ${LOGS_DIR}/health-checks/k8s-diagnostics.log

      # Last 100 log lines of each VAPORA deployment -> pods/<component>.log
      - name: collect_pod_logs
        image: alpine:latest
        commands:
          - apk add --no-cache curl
          - |
            echo "📝 Collecting Pod Logs"
            mkdir -p ${LOGS_DIR}/health-checks/pods

            pod_log_dir=${LOGS_DIR}/health-checks/pods
            # Best effort: a deployment whose logs cannot be fetched must not
            # prevent collecting the others. kubectl's error text still lands
            # in the per-component file through 2>&1.
            for component in backend agents llm-router; do
              kubectl logs -n ${VAPORA_NAMESPACE} "deployment/vapora-$component" --tail=100 > "$pod_log_dir/$component.log" 2>&1 \
                || echo "⚠️ Could not collect logs for vapora-$component"
            done

            ls -lah ${LOGS_DIR}/health-checks/pods/

  # Turns the collected logs into a Markdown report and a pass/warn summary.
  analyze_health:
    depends_on: [health_check_docker, health_check_kubernetes]
    steps:
      # Writes HEALTH_REPORT.md and prints it.
      # NOTE(review): CI_BUILD_LINK is used here while the setup stage uses
      # CI_PIPELINE_EVENT - confirm which naming the runner provides.
      - name: generate_health_report
        image: alpine:latest
        commands:
          - |
            mkdir -p ${LOGS_DIR}/health-checks

            # The heredoc delimiter is unquoted so that $(date ...) is expanded
            # when the report is written; Markdown backticks are therefore
            # escaped to stop the shell treating them as command substitution.
            cat > ${LOGS_DIR}/health-checks/HEALTH_REPORT.md << EOF
            # VAPORA Health Check Report

            **Report Time**: $(date -u +'%Y-%m-%dT%H:%M:%SZ')
            **Pipeline**: ${CI_BUILD_LINK}

            ## Summary

            Health check completed for VAPORA services

            ## Docker Status

            - Check logs: \`${LOGS_DIR}/health-checks/docker-containers.log\`
            - Endpoint checks: \`${LOGS_DIR}/health-checks/docker-endpoints.log\`
            - System diagnostics: \`${LOGS_DIR}/health-checks/docker-diagnostics.log\`

            ## Kubernetes Status

            - Deployment status: \`${LOGS_DIR}/health-checks/k8s-deployments.log\`
            - Service status: \`${LOGS_DIR}/health-checks/k8s-services.log\`
            - Recent events: \`${LOGS_DIR}/health-checks/k8s-events.log\`
            - System diagnostics: \`${LOGS_DIR}/health-checks/k8s-diagnostics.log\`
            - Pod logs: \`${LOGS_DIR}/health-checks/pods/\`

            ## Diagnostics

            Review the following for detailed information:

            1. **Docker Health**
               - Container status and uptime
               - Endpoint responsiveness (8001, 8002, 8003, 3000, 8000)
               - Resource allocation and usage

            2. **Kubernetes Health**
               - Deployment replica status
               - Pod readiness conditions
               - Service endpoint availability
               - Recent cluster events
               - Node resource availability

            ## Action Required

            If any services are down or unhealthy:

            1. Review pod logs in \`pods/\` directory
            2. Check recent events in \`k8s-events.log\`
            3. Investigate resource constraints
            4. Check configuration in ConfigMap
            5. Consider rollback if recent deployment

            ## Next Check

            Next automatic health check scheduled per cron configuration
            EOF

            cat ${LOGS_DIR}/health-checks/HEALTH_REPORT.md

      # Counts warning markers in the collected logs and prints a summary.
      # The step only reports; it does not fail when issues are found.
      - name: check_health_status
        image: alpine:latest
        commands:
          - |
            echo "📊 Health Check Summary"
            echo "---"

            # Count issues.
            # grep -c prints the count itself (including "0") but exits
            # non-zero when nothing matches, so appending `|| echo 0` would
            # produce "0" twice and break the numeric tests below. Ignore the
            # exit status instead, and treat an empty result (log file
            # missing) as 0.
            DOCKER_DOWN=$(grep -c "⚠️" ${LOGS_DIR}/health-checks/docker-endpoints.log 2>/dev/null || true)
            [ -n "$DOCKER_DOWN" ] || DOCKER_DOWN=0
            K8S_DOWN=$(grep -c "CrashLoopBackOff\|Error\|Failed" ${LOGS_DIR}/health-checks/k8s-deployments.log 2>/dev/null || true)
            [ -n "$K8S_DOWN" ] || K8S_DOWN=0

            echo "Docker issues: $DOCKER_DOWN"
            echo "Kubernetes issues: $K8S_DOWN"

            if [ "$DOCKER_DOWN" -gt 0 ] || [ "$K8S_DOWN" -gt 0 ]; then
              echo "⚠️ Issues detected - may require attention"
            else
              echo "✓ All checks passed"
            fi

  # Lists the produced reports and optionally notifies Slack.
  publish:
    depends_on: [analyze_health]
    steps:
      - name: publish_reports
        image: alpine:latest
        commands:
          - echo "📦 Health check reports published"
          - ls -lah ${LOGS_DIR}/health-checks/
          - echo ""
          - du -sh ${LOGS_DIR}/health-checks/

      # Posts a completion message to Slack; skipped when SLACK_WEBHOOK is
      # empty.
      - name: notify_slack_success
        image: alpine:latest
        environment:
          SLACK_WEBHOOK: "${SLACK_WEBHOOK}"
        commands:
          - |
            if [ -n "$SLACK_WEBHOOK" ]; then
              apk add --no-cache curl jq

              # Build the payload with jq so the report path is interpolated
              # and JSON-escaped (a shell variable inside a single-quoted JSON
              # literal is never expanded). Slack mrkdwn bold is *text*.
              payload=$(jq -n --arg report "${LOGS_DIR}/health-checks/HEALTH_REPORT.md" '{
                "text": "✅ VAPORA Health Check Completed",
                "blocks": [
                  {
                    "type": "section",
                    "text": {
                      "type": "mrkdwn",
                      "text": "✅ *VAPORA Health Check Completed*\n\n*Systems Monitored:*\n• Docker (containers, endpoints)\n• Kubernetes (deployments, pods, services)"
                    }
                  },
                  {
                    "type": "context",
                    "elements": [
                      {
                        "type": "mrkdwn",
                        "text": "*Report Location*: `\($report)`"
                      }
                    ]
                  }
                ]
              }')

              # -f makes curl exit non-zero on an HTTP error from Slack.
              curl -fsS -X POST \
                -H 'Content-Type: application/json' \
                -d "$payload" \
                "$SLACK_WEBHOOK"
            fi