provisioning/docs/book/user/troubleshooting-guide.html
Jesús Pérez 6a59d34bb1
chore: update provisioning configuration and documentation
Update configuration files, templates, and internal documentation
for the provisioning repository system.

Configuration Updates:
- KMS configuration modernization
- Plugin system settings
- Service port mappings
- Test cluster topologies
- Installation configuration examples
- VM configuration defaults
- Cedar authorization policies

Documentation Updates:
- Library module documentation
- Extension API guides
- AI system documentation
- Service management guides
- Test environment setup
- Plugin usage guides
- Validator configuration documentation

All changes are backward compatible.
2025-12-11 21:50:42 +00:00

1083 lines
41 KiB
HTML
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<!DOCTYPE HTML>
<html lang="en" class="ayu sidebar-visible" dir="ltr">
<head>
<!-- Book generated using mdBook -->
<meta charset="UTF-8">
<title>Troubleshooting Guide - Provisioning Platform Documentation</title>
<!-- Custom HTML head -->
<meta name="description" content="Complete documentation for the Provisioning Platform - Infrastructure automation with Nushell, KCL, and Rust">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="theme-color" content="#ffffff">
<link rel="icon" href="../favicon.svg">
<link rel="shortcut icon" href="../favicon.png">
<link rel="stylesheet" href="../css/variables.css">
<link rel="stylesheet" href="../css/general.css">
<link rel="stylesheet" href="../css/chrome.css">
<link rel="stylesheet" href="../css/print.css" media="print">
<!-- Fonts -->
<link rel="stylesheet" href="../FontAwesome/css/font-awesome.css">
<link rel="stylesheet" href="../fonts/fonts.css">
<!-- Highlight.js Stylesheets -->
<link rel="stylesheet" id="highlight-css" href="../highlight.css">
<link rel="stylesheet" id="tomorrow-night-css" href="../tomorrow-night.css">
<link rel="stylesheet" id="ayu-highlight-css" href="../ayu-highlight.css">
<!-- Custom theme stylesheets -->
<!-- Provide site root and default themes to javascript -->
<script>
// Relative URL prefix from this page to the site root; the stylesheet and
// script links on this page use the same "../" prefix.
const path_to_root = "../";
// Theme names used as fallbacks when no theme is stored: the theme script
// further down picks the dark one when the prefers-color-scheme media query
// matches "dark", otherwise the light one.
const default_light_theme = "ayu";
const default_dark_theme = "navy";
</script>
<!-- Start loading toc.js asap -->
<script src="../toc.js"></script>
</head>
<body>
<div id="mdbook-help-container">
<div id="mdbook-help-popup">
<h2 class="mdbook-help-title">Keyboard shortcuts</h2>
<div>
<p>Press <kbd>←</kbd> or <kbd>→</kbd> to navigate between chapters</p>
<p>Press <kbd>S</kbd> or <kbd>/</kbd> to search in the book</p>
<p>Press <kbd>?</kbd> to show this help</p>
<p>Press <kbd>Esc</kbd> to hide this help</p>
</div>
</div>
</div>
<div id="body-container">
<!-- Work around some values being stored in localStorage wrapped in quotes -->
<script>
// Some values may have been stored in localStorage wrapped in double quotes.
// Strip one surrounding pair so later scripts see the bare value.
//
// Each key is processed independently: localStorage.getItem() returns null
// for a key that was never set, and a missing theme entry must not prevent
// the sidebar entry from being unwrapped (or vice versa).
try {
    for (const key of ['mdbook-theme', 'mdbook-sidebar']) {
        const stored = localStorage.getItem(key);
        // Require at least two characters so a lone '"' is not treated as
        // both the opening and the closing quote.
        if (stored !== null && stored.length >= 2 && stored.startsWith('"') && stored.endsWith('"')) {
            localStorage.setItem(key, stored.slice(1, stored.length - 1));
        }
    }
} catch (e) {
    // Best effort only: storage access can fail, and the page must still load.
}
</script>
<!-- Set the theme before any content is loaded, prevents flash -->
<script>
// Fallback theme: the dark default when the prefers-color-scheme media query
// matches "dark", otherwise the light default.
const default_theme = window.matchMedia("(prefers-color-scheme: dark)").matches ? default_dark_theme : default_light_theme;
let theme;
// Storage access is guarded; if it throws, `theme` stays undefined and the
// fallback on the next line applies.
try { theme = localStorage.getItem('mdbook-theme'); } catch(e) { }
if (theme === null || theme === undefined) { theme = default_theme; }
const html = document.documentElement;
// Swap the 'ayu' class that is hard-coded on the <html> tag for the resolved theme.
html.classList.remove('ayu')
html.classList.add(theme);
// Flag the document as having JavaScript running.
// NOTE(review): presumably consumed by the stylesheets — confirm in the CSS.
html.classList.add("js");
</script>
<input type="checkbox" id="sidebar-toggle-anchor" class="hidden">
<!-- Hide / unhide sidebar before it is displayed -->
<script>
// Resolve the initial sidebar state and apply it before the sidebar is shown.
let sidebar = null;
const sidebar_toggle = document.getElementById("sidebar-toggle-anchor");
// At body widths of 1080px or more, use the stored state (defaulting to
// 'visible' when nothing is stored or storage access throws); narrower
// viewports always start with the sidebar hidden.
if (document.body.clientWidth >= 1080) {
try { sidebar = localStorage.getItem('mdbook-sidebar'); } catch(e) { }
sidebar = sidebar || 'visible';
} else {
sidebar = 'hidden';
}
// Keep the hidden checkbox in sync with the resolved state.
sidebar_toggle.checked = sidebar === 'visible';
// Swap the 'sidebar-visible' class that is hard-coded on the <html> tag for
// "sidebar-<state>". `html` is declared by the preceding theme script.
html.classList.remove('sidebar-visible');
html.classList.add("sidebar-" + sidebar);
</script>
<nav id="sidebar" class="sidebar" aria-label="Table of contents">
<!-- populated by js -->
<mdbook-sidebar-scrollbox class="sidebar-scrollbox"></mdbook-sidebar-scrollbox>
<noscript>
<iframe class="sidebar-iframe-outer" src="../toc.html"></iframe>
</noscript>
<div id="sidebar-resize-handle" class="sidebar-resize-handle">
<div class="sidebar-resize-indicator"></div>
</div>
</nav>
<div id="page-wrapper" class="page-wrapper">
<div class="page">
<div id="menu-bar-hover-placeholder"></div>
<div id="menu-bar" class="menu-bar sticky">
<div class="left-buttons">
<label id="sidebar-toggle" class="icon-button" for="sidebar-toggle-anchor" title="Toggle Table of Contents" aria-label="Toggle Table of Contents" aria-controls="sidebar">
<i class="fa fa-bars"></i>
</label>
<button id="theme-toggle" class="icon-button" type="button" title="Change theme" aria-label="Change theme" aria-haspopup="true" aria-expanded="false" aria-controls="theme-list">
<i class="fa fa-paint-brush"></i>
</button>
<ul id="theme-list" class="theme-popup" aria-label="Themes" role="menu">
<li role="none"><button role="menuitem" class="theme" id="default_theme">Auto</button></li>
<li role="none"><button role="menuitem" class="theme" id="light">Light</button></li>
<li role="none"><button role="menuitem" class="theme" id="rust">Rust</button></li>
<li role="none"><button role="menuitem" class="theme" id="coal">Coal</button></li>
<li role="none"><button role="menuitem" class="theme" id="navy">Navy</button></li>
<li role="none"><button role="menuitem" class="theme" id="ayu">Ayu</button></li>
</ul>
<button id="search-toggle" class="icon-button" type="button" title="Search (`/`)" aria-label="Toggle Searchbar" aria-expanded="false" aria-keyshortcuts="/ s" aria-controls="searchbar">
<i class="fa fa-search"></i>
</button>
</div>
<h1 class="menu-title">Provisioning Platform Documentation</h1>
<div class="right-buttons">
<a href="../print.html" title="Print this book" aria-label="Print this book">
<i id="print-button" class="fa fa-print"></i>
</a>
<a href="https://github.com/provisioning/provisioning-platform" title="Git repository" aria-label="Git repository">
<i id="git-repository-button" class="fa fa-github"></i>
</a>
<a href="https://github.com/provisioning/provisioning-platform/edit/main/provisioning/docs/src/user/troubleshooting-guide.md" title="Suggest an edit" aria-label="Suggest an edit">
<i id="git-edit-button" class="fa fa-edit"></i>
</a>
</div>
</div>
<div id="search-wrapper" class="hidden">
<form id="searchbar-outer" class="searchbar-outer">
<input type="search" id="searchbar" name="searchbar" placeholder="Search this book ..." aria-controls="searchresults-outer" aria-describedby="searchresults-header">
</form>
<div id="searchresults-outer" class="searchresults-outer hidden">
<div id="searchresults-header" class="searchresults-header"></div>
<ul id="searchresults">
</ul>
</div>
</div>
<!-- Apply ARIA attributes after the sidebar and the sidebar toggle button are added to the DOM -->
<script>
// Mirror the sidebar state (the global `sidebar` set by an earlier script)
// into ARIA attributes: the toggle reports aria-expanded, the sidebar itself
// reports aria-hidden.
document.getElementById('sidebar-toggle').setAttribute('aria-expanded', sidebar === 'visible');
document.getElementById('sidebar').setAttribute('aria-hidden', sidebar !== 'visible');
// Sidebar links get tabIndex 0 while the sidebar is visible and -1 while it is hidden.
// NOTE(review): only links already present under #sidebar at this point are
// affected; the scrollbox is marked "populated by js" — confirm ordering.
Array.from(document.querySelectorAll('#sidebar a')).forEach(function(link) {
link.setAttribute('tabIndex', sidebar === 'visible' ? 0 : -1);
});
</script>
<div id="content" class="content">
<main>
<h1 id="troubleshooting-guide"><a class="header" href="#troubleshooting-guide">Troubleshooting Guide</a></h1>
<p>This comprehensive troubleshooting guide helps you diagnose and resolve common issues with Infrastructure Automation.</p>
<h2 id="what-youll-learn"><a class="header" href="#what-youll-learn">What You’ll Learn</a></h2>
<ul>
<li>Common issues and their solutions</li>
<li>Diagnostic commands and techniques</li>
<li>Error message interpretation</li>
<li>Performance optimization</li>
<li>Recovery procedures</li>
<li>Prevention strategies</li>
</ul>
<h2 id="general-troubleshooting-approach"><a class="header" href="#general-troubleshooting-approach">General Troubleshooting Approach</a></h2>
<h3 id="1-identify-the-problem"><a class="header" href="#1-identify-the-problem">1. Identify the Problem</a></h3>
<pre><code class="language-bash"># Check overall system status
provisioning env
provisioning validate config
# Check specific component status
provisioning show servers --infra my-infra
provisioning taskserv list --infra my-infra --installed
</code></pre>
<h3 id="2-gather-information"><a class="header" href="#2-gather-information">2. Gather Information</a></h3>
<pre><code class="language-bash"># Enable debug mode for detailed output
provisioning --debug &lt;command&gt;
# Check logs and errors
provisioning show logs --infra my-infra
</code></pre>
<h3 id="3-use-diagnostic-commands"><a class="header" href="#3-use-diagnostic-commands">3. Use Diagnostic Commands</a></h3>
<pre><code class="language-bash"># Validate configuration
provisioning validate config --detailed
# Test connectivity
provisioning provider test aws
provisioning network test --infra my-infra
</code></pre>
<h2 id="installation-and-setup-issues"><a class="header" href="#installation-and-setup-issues">Installation and Setup Issues</a></h2>
<h3 id="issue-installation-fails"><a class="header" href="#issue-installation-fails">Issue: Installation Fails</a></h3>
<p><strong>Symptoms:</strong></p>
<ul>
<li>Installation script errors</li>
<li>Missing dependencies</li>
<li>Permission denied errors</li>
</ul>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check system requirements
uname -a
df -h
whoami
# Check permissions
ls -la /usr/local/
sudo -l
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="permission-issues"><a class="header" href="#permission-issues">Permission Issues</a></h4>
<pre><code class="language-bash"># Run installer with sudo
sudo ./install-provisioning
# Or install to user directory
./install-provisioning --prefix=$HOME/provisioning
export PATH="$HOME/provisioning/bin:$PATH"
</code></pre>
<h4 id="missing-dependencies"><a class="header" href="#missing-dependencies">Missing Dependencies</a></h4>
<pre><code class="language-bash"># Ubuntu/Debian
sudo apt update
sudo apt install -y curl wget tar build-essential
# RHEL/CentOS
sudo dnf install -y curl wget tar gcc make
</code></pre>
<h4 id="architecture-issues"><a class="header" href="#architecture-issues">Architecture Issues</a></h4>
<pre><code class="language-bash"># Check architecture
uname -m
# Download correct architecture package
# x86_64: Intel/AMD 64-bit
# arm64 / aarch64: ARM 64-bit (macOS on Apple Silicon reports arm64; Linux reports aarch64)
wget https://releases.example.com/provisioning-linux-x86_64.tar.gz
</code></pre>
<h3 id="issue-command-not-found"><a class="header" href="#issue-command-not-found">Issue: Command Not Found</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>bash: provisioning: command not found
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check if provisioning is installed
which provisioning
ls -la /usr/local/bin/provisioning
# Check PATH
echo $PATH
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Add to PATH
export PATH="/usr/local/bin:$PATH"
# Make permanent (add to shell profile)
echo 'export PATH="/usr/local/bin:$PATH"' &gt;&gt; ~/.bashrc
source ~/.bashrc
# Create symlink if missing
sudo ln -sf /usr/local/provisioning/core/nulib/provisioning /usr/local/bin/provisioning
</code></pre>
<h3 id="issue-nushell-plugin-errors"><a class="header" href="#issue-nushell-plugin-errors">Issue: Nushell Plugin Errors</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Plugin not found: nu_plugin_kcl
Plugin registration failed
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check Nushell version
nu --version
# Check KCL installation (required for nu_plugin_kcl)
kcl version
# Check plugin registration
nu -c "version | get installed_plugins"
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Install KCL CLI (required for nu_plugin_kcl)
# Download from: https://github.com/kcl-lang/cli/releases
# Re-register plugins
nu -c "plugin add /usr/local/provisioning/plugins/nu_plugin_kcl"
nu -c "plugin add /usr/local/provisioning/plugins/nu_plugin_tera"
# Restart Nushell after plugin registration
</code></pre>
<h2 id="configuration-issues"><a class="header" href="#configuration-issues">Configuration Issues</a></h2>
<h3 id="issue-configuration-not-found"><a class="header" href="#issue-configuration-not-found">Issue: Configuration Not Found</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Configuration file not found
Failed to load configuration
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check configuration file locations
provisioning env | grep config
# Check if files exist
ls -la ~/.config/provisioning/
ls -la /usr/local/provisioning/config.defaults.toml
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Initialize user configuration
provisioning init config
# Create missing directories
mkdir -p ~/.config/provisioning
# Copy template
cp /usr/local/provisioning/config-examples/config.user.toml ~/.config/provisioning/config.toml
# Verify configuration
provisioning validate config
</code></pre>
<h3 id="issue-configuration-validation-errors"><a class="header" href="#issue-configuration-validation-errors">Issue: Configuration Validation Errors</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Configuration validation failed
Invalid configuration value
Missing required field
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Detailed validation
provisioning validate config --detailed
# Check specific sections
provisioning config show --section paths
provisioning config show --section providers
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="path-configuration-issues"><a class="header" href="#path-configuration-issues">Path Configuration Issues</a></h4>
<pre><code class="language-bash"># Check base path exists
ls -la /path/to/provisioning
# Update configuration
nano ~/.config/provisioning/config.toml
# Fix paths section
[paths]
base = "/correct/path/to/provisioning"
</code></pre>
<h4 id="provider-configuration-issues"><a class="header" href="#provider-configuration-issues">Provider Configuration Issues</a></h4>
<pre><code class="language-bash"># Test provider connectivity
provisioning provider test aws
# Check credentials
aws configure list # For AWS
upcloud-cli config # For UpCloud
# Update provider configuration
[providers.aws]
interface = "CLI" # or "API"
</code></pre>
<h3 id="issue-interpolation-failures"><a class="header" href="#issue-interpolation-failures">Issue: Interpolation Failures</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Interpolation pattern not resolved: {{env.VARIABLE}}
Template rendering failed
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Test interpolation
provisioning validate interpolation test
# Check environment variables
env | grep VARIABLE
# Debug interpolation
provisioning --debug validate interpolation validate
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Set missing environment variables
export MISSING_VARIABLE="value"
# Use fallback values in configuration
config_value = "{{env.VARIABLE || 'default_value'}}"
# Check interpolation syntax
# Correct: {{env.HOME}}
# Incorrect: ${HOME} or $HOME
</code></pre>
<h2 id="server-management-issues"><a class="header" href="#server-management-issues">Server Management Issues</a></h2>
<h3 id="issue-server-creation-fails"><a class="header" href="#issue-server-creation-fails">Issue: Server Creation Fails</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Failed to create server
Provider API error
Insufficient quota
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check provider status
provisioning provider status aws
# Test connectivity
ping api.provider.com
curl -I https://api.provider.com
# Check quota
provisioning provider quota --infra my-infra
# Debug server creation
provisioning --debug server create web-01 --infra my-infra --check
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="api-authentication-issues"><a class="header" href="#api-authentication-issues">API Authentication Issues</a></h4>
<pre><code class="language-bash"># AWS
aws configure list
aws sts get-caller-identity
# UpCloud
upcloud-cli account show
# Update credentials
aws configure # For AWS
export UPCLOUD_USERNAME="your-username"
export UPCLOUD_PASSWORD="your-password"
</code></pre>
<h4 id="quotalimit-issues"><a class="header" href="#quotalimit-issues">Quota/Limit Issues</a></h4>
<pre><code class="language-bash"># Check current usage
provisioning show costs --infra my-infra
# Request quota increase from provider
# Or reduce resource requirements
# Use smaller instance types
# Reduce number of servers
</code></pre>
<h4 id="networkconnectivity-issues"><a class="header" href="#networkconnectivity-issues">Network/Connectivity Issues</a></h4>
<pre><code class="language-bash"># Test network connectivity
curl -v https://api.aws.amazon.com
curl -v https://api.upcloud.com
# Check DNS resolution
nslookup api.aws.amazon.com
# Check firewall rules
# Ensure outbound HTTPS (port 443) is allowed
</code></pre>
<h3 id="issue-ssh-access-fails"><a class="header" href="#issue-ssh-access-fails">Issue: SSH Access Fails</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Connection refused
Permission denied
Host key verification failed
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check server status
provisioning server list --infra my-infra
# Test SSH manually
ssh -v user@server-ip
# Check SSH configuration
provisioning show servers web-01 --infra my-infra
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="connection-issues"><a class="header" href="#connection-issues">Connection Issues</a></h4>
<pre><code class="language-bash"># Wait for server to be fully ready
provisioning server list --infra my-infra --status
# Check security groups/firewall
# Ensure SSH (port 22) is allowed
# Use correct IP address
provisioning show servers web-01 --infra my-infra | grep ip
</code></pre>
<h4 id="authentication-issues"><a class="header" href="#authentication-issues">Authentication Issues</a></h4>
<pre><code class="language-bash"># Check SSH key
ls -la ~/.ssh/
ssh-add -l
# Generate new key if needed
ssh-keygen -t ed25519 -f ~/.ssh/provisioning_key
# Use specific key
provisioning server ssh web-01 --key ~/.ssh/provisioning_key --infra my-infra
</code></pre>
<h4 id="host-key-issues"><a class="header" href="#host-key-issues">Host Key Issues</a></h4>
<pre><code class="language-bash"># Remove old host key
ssh-keygen -R server-ip
# Accept new host key
ssh -o StrictHostKeyChecking=accept-new user@server-ip
</code></pre>
<h2 id="task-service-issues"><a class="header" href="#task-service-issues">Task Service Issues</a></h2>
<h3 id="issue-service-installation-fails"><a class="header" href="#issue-service-installation-fails">Issue: Service Installation Fails</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Service installation failed
Package not found
Dependency conflicts
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check service prerequisites
provisioning taskserv check kubernetes --infra my-infra
# Debug installation
provisioning --debug taskserv create kubernetes --infra my-infra --check
# Check server resources
provisioning server ssh web-01 --command "free -h &amp;&amp; df -h" --infra my-infra
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="resource-issues"><a class="header" href="#resource-issues">Resource Issues</a></h4>
<pre><code class="language-bash"># Check available resources
provisioning server ssh web-01 --command "
echo 'Memory:' &amp;&amp; free -h
echo 'Disk:' &amp;&amp; df -h
echo 'CPU:' &amp;&amp; nproc
" --infra my-infra
# Upgrade server if needed
provisioning server resize web-01 --plan larger-plan --infra my-infra
</code></pre>
<h4 id="package-repository-issues"><a class="header" href="#package-repository-issues">Package Repository Issues</a></h4>
<pre><code class="language-bash"># Update package lists
provisioning server ssh web-01 --command "
sudo apt update &amp;&amp; sudo apt upgrade -y
" --infra my-infra
# Check repository connectivity
provisioning server ssh web-01 --command "
curl -I https://download.docker.com/linux/ubuntu/
" --infra my-infra
</code></pre>
<h4 id="dependency-issues"><a class="header" href="#dependency-issues">Dependency Issues</a></h4>
<pre><code class="language-bash"># Install missing dependencies
provisioning taskserv create containerd --infra my-infra
# Then install dependent service
provisioning taskserv create kubernetes --infra my-infra
</code></pre>
<h3 id="issue-service-not-running"><a class="header" href="#issue-service-not-running">Issue: Service Not Running</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Service status: failed
Service not responding
Health check failures
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check service status
provisioning taskserv status kubernetes --infra my-infra
# Check service logs
provisioning taskserv logs kubernetes --infra my-infra
# SSH and check manually
provisioning server ssh web-01 --command "
sudo systemctl status kubernetes
sudo journalctl -u kubernetes --no-pager -n 50
" --infra my-infra
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="configuration-issues-1"><a class="header" href="#configuration-issues-1">Configuration Issues</a></h4>
<pre><code class="language-bash"># Reconfigure service
provisioning taskserv configure kubernetes --infra my-infra
# Reset to defaults
provisioning taskserv reset kubernetes --infra my-infra
</code></pre>
<h4 id="port-conflicts"><a class="header" href="#port-conflicts">Port Conflicts</a></h4>
<pre><code class="language-bash"># Check port usage
provisioning server ssh web-01 --command "
sudo netstat -tulpn | grep :6443
sudo ss -tulpn | grep :6443
" --infra my-infra
# Change port configuration or stop conflicting service
</code></pre>
<h4 id="permission-issues-1"><a class="header" href="#permission-issues-1">Permission Issues</a></h4>
<pre><code class="language-bash"># Fix permissions
provisioning server ssh web-01 --command "
sudo chown -R kubernetes:kubernetes /var/lib/kubernetes
sudo chmod 600 /etc/kubernetes/admin.conf
" --infra my-infra
</code></pre>
<h2 id="cluster-management-issues"><a class="header" href="#cluster-management-issues">Cluster Management Issues</a></h2>
<h3 id="issue-cluster-deployment-fails"><a class="header" href="#issue-cluster-deployment-fails">Issue: Cluster Deployment Fails</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Cluster deployment failed
Pod creation errors
Service unavailable
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check cluster status
provisioning cluster status web-cluster --infra my-infra
# Check Kubernetes cluster
provisioning server ssh master-01 --command "
kubectl get nodes
kubectl get pods --all-namespaces
" --infra my-infra
# Check cluster logs
provisioning cluster logs web-cluster --infra my-infra
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="node-issues"><a class="header" href="#node-issues">Node Issues</a></h4>
<pre><code class="language-bash"># Check node status
provisioning server ssh master-01 --command "
kubectl describe nodes
" --infra my-infra
# Drain and rejoin problematic nodes
provisioning server ssh master-01 --command "
kubectl drain worker-01 --ignore-daemonsets
kubectl delete node worker-01
" --infra my-infra
# Rejoin node
provisioning taskserv configure kubernetes --infra my-infra --servers worker-01
</code></pre>
<h4 id="resource-constraints"><a class="header" href="#resource-constraints">Resource Constraints</a></h4>
<pre><code class="language-bash"># Check resource usage
provisioning server ssh master-01 --command "
kubectl top nodes
kubectl top pods --all-namespaces
" --infra my-infra
# Scale down or add more nodes
provisioning cluster scale web-cluster --replicas 3 --infra my-infra
provisioning server create worker-04 --infra my-infra
</code></pre>
<h4 id="network-issues"><a class="header" href="#network-issues">Network Issues</a></h4>
<pre><code class="language-bash"># Check network plugin
provisioning server ssh master-01 --command "
kubectl get pods -n kube-system | grep cilium
" --infra my-infra
# Restart network plugin
provisioning taskserv restart cilium --infra my-infra
</code></pre>
<h2 id="performance-issues"><a class="header" href="#performance-issues">Performance Issues</a></h2>
<h3 id="issue-slow-operations"><a class="header" href="#issue-slow-operations">Issue: Slow Operations</a></h3>
<p><strong>Symptoms:</strong></p>
<ul>
<li>Commands take very long to complete</li>
<li>Timeouts during operations</li>
<li>High CPU/memory usage</li>
</ul>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check system resources
top
htop
free -h
df -h
# Check network latency
ping api.aws.amazon.com
traceroute api.aws.amazon.com
# Profile command execution
time provisioning server list --infra my-infra
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="local-system-issues"><a class="header" href="#local-system-issues">Local System Issues</a></h4>
<pre><code class="language-bash"># Close unnecessary applications
# Upgrade system resources
# Use SSD storage if available
# Increase timeout values
export PROVISIONING_TIMEOUT=600 # 10 minutes
</code></pre>
<h4 id="network-issues-1"><a class="header" href="#network-issues-1">Network Issues</a></h4>
<pre><code class="language-bash"># Use region closer to your location
[providers.aws]
region = "us-west-1" # Closer region
# Enable connection pooling/caching
[cache]
enabled = true
</code></pre>
<h4 id="large-infrastructure-issues"><a class="header" href="#large-infrastructure-issues">Large Infrastructure Issues</a></h4>
<pre><code class="language-bash"># Use parallel operations
provisioning server create --infra my-infra --parallel 4
# Filter results
provisioning server list --infra my-infra --filter "status == 'running'"
</code></pre>
<h3 id="issue-high-memory-usage"><a class="header" href="#issue-high-memory-usage">Issue: High Memory Usage</a></h3>
<p><strong>Symptoms:</strong></p>
<ul>
<li>System becomes unresponsive</li>
<li>Out of memory errors</li>
<li>Swap usage high</li>
</ul>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check memory usage
free -h
ps aux --sort=-%mem | head
# Check for memory leaks
valgrind provisioning server list --infra my-infra
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Increase system memory
# Close other applications
# Use streaming operations for large datasets
# Enable garbage collection
export PROVISIONING_GC_ENABLED=true
# Reduce concurrent operations
export PROVISIONING_MAX_PARALLEL=2
</code></pre>
<h2 id="network-and-connectivity-issues"><a class="header" href="#network-and-connectivity-issues">Network and Connectivity Issues</a></h2>
<h3 id="issue-api-connectivity-problems"><a class="header" href="#issue-api-connectivity-problems">Issue: API Connectivity Problems</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Connection timeout
DNS resolution failed
SSL certificate errors
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Test basic connectivity
ping 8.8.8.8
curl -I https://api.aws.amazon.com
nslookup api.upcloud.com
# Check SSL certificates
openssl s_client -connect api.aws.amazon.com:443 -servername api.aws.amazon.com
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="dns-issues"><a class="header" href="#dns-issues">DNS Issues</a></h4>
<pre><code class="language-bash"># Use alternative DNS (note: this overwrites /etc/resolv.conf; back it up first)
echo 'nameserver 8.8.8.8' | sudo tee /etc/resolv.conf
# Clear DNS cache
sudo systemctl restart systemd-resolved # Ubuntu
sudo dscacheutil -flushcache # macOS
</code></pre>
<h4 id="proxyfirewall-issues"><a class="header" href="#proxyfirewall-issues">Proxy/Firewall Issues</a></h4>
<pre><code class="language-bash"># Configure proxy if needed
export HTTP_PROXY=http://proxy.company.com:9090
export HTTPS_PROXY=http://proxy.company.com:9090
# Check firewall rules
sudo ufw status # Ubuntu
sudo firewall-cmd --list-all # RHEL/CentOS
</code></pre>
<h4 id="certificate-issues"><a class="header" href="#certificate-issues">Certificate Issues</a></h4>
<pre><code class="language-bash"># Update CA certificates
sudo apt update &amp;&amp; sudo apt install ca-certificates # Ubuntu
brew install ca-certificates # macOS
# Skip SSL verification (temporary and insecure: for debugging only, unset it afterwards)
export PROVISIONING_SKIP_SSL_VERIFY=true
</code></pre>
<h2 id="security-and-encryption-issues"><a class="header" href="#security-and-encryption-issues">Security and Encryption Issues</a></h2>
<h3 id="issue-sops-decryption-fails"><a class="header" href="#issue-sops-decryption-fails">Issue: SOPS Decryption Fails</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>SOPS decryption failed
Age key not found
Invalid key format
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check SOPS configuration
provisioning sops config
# Test SOPS manually
sops -d encrypted-file.k
# Check Age keys
ls -la ~/.config/sops/age/keys.txt
age-keygen -y ~/.config/sops/age/keys.txt
</code></pre>
<p><strong>Solutions:</strong></p>
<h4 id="missing-keys"><a class="header" href="#missing-keys">Missing Keys</a></h4>
<pre><code class="language-bash"># Generate new Age key
age-keygen -o ~/.config/sops/age/keys.txt
# Update SOPS configuration
provisioning sops config --key-file ~/.config/sops/age/keys.txt
</code></pre>
<h4 id="key-permissions"><a class="header" href="#key-permissions">Key Permissions</a></h4>
<pre><code class="language-bash"># Fix key file permissions
chmod 600 ~/.config/sops/age/keys.txt
chown $(whoami) ~/.config/sops/age/keys.txt
</code></pre>
<h4 id="configuration-issues-2"><a class="header" href="#configuration-issues-2">Configuration Issues</a></h4>
<pre><code class="language-toml"># Update SOPS configuration in ~/.config/provisioning/config.toml
[sops]
use_sops = true
key_search_paths = [
"~/.config/sops/age/keys.txt",
"/path/to/your/key.txt"
]
</code></pre>
<h3 id="issue-access-denied-errors"><a class="header" href="#issue-access-denied-errors">Issue: Access Denied Errors</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>Permission denied
Access denied
Insufficient privileges
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check user permissions
id
groups
# Check file permissions
ls -la ~/.config/provisioning/
ls -la /usr/local/provisioning/
# Test with sudo
sudo provisioning env
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Fix file ownership
sudo chown -R "$(id -un):$(id -gn)" ~/.config/provisioning/ # id -gn = your primary group (macOS has no per-user group)
# Fix permissions
chmod -R 755 ~/.config/provisioning/
chmod 600 ~/.config/provisioning/config.toml
# Add user to required groups
sudo usermod -a -G docker $(whoami) # For Docker access
</code></pre>
<h2 id="data-and-storage-issues"><a class="header" href="#data-and-storage-issues">Data and Storage Issues</a></h2>
<h3 id="issue-disk-space-problems"><a class="header" href="#issue-disk-space-problems">Issue: Disk Space Problems</a></h3>
<p><strong>Symptoms:</strong></p>
<pre><code>No space left on device
Write failed
Disk full
</code></pre>
<p><strong>Diagnosis:</strong></p>
<pre><code class="language-bash"># Check disk usage
df -h
du -sh ~/.config/provisioning/
du -sh /usr/local/provisioning/
# Find large files
find /usr/local/provisioning -type f -size +100M
</code></pre>
<p><strong>Solutions:</strong></p>
<pre><code class="language-bash"># Clean up cache files
rm -rf ~/.config/provisioning/cache/*
rm -rf /usr/local/provisioning/.cache/*
# Clean up logs
find /usr/local/provisioning -name "*.log" -mtime +30 -delete
# Clean up temporary files
rm -rf /tmp/provisioning-*
# Compress old backups
gzip ~/.config/provisioning/backups/*.yaml
</code></pre>
<h2 id="recovery-procedures"><a class="header" href="#recovery-procedures">Recovery Procedures</a></h2>
<h3 id="configuration-recovery"><a class="header" href="#configuration-recovery">Configuration Recovery</a></h3>
<pre><code class="language-bash"># Restore from backup
provisioning config restore --backup latest
# Reset to defaults
provisioning config reset
# Recreate configuration
provisioning init config --force
</code></pre>
<h3 id="infrastructure-recovery"><a class="header" href="#infrastructure-recovery">Infrastructure Recovery</a></h3>
<pre><code class="language-bash"># Check infrastructure status
provisioning show servers --infra my-infra
# Recover failed servers
provisioning server create failed-server --infra my-infra
# Restore from backup
provisioning restore --backup latest --infra my-infra
</code></pre>
<h3 id="service-recovery"><a class="header" href="#service-recovery">Service Recovery</a></h3>
<pre><code class="language-bash"># Restart failed services
provisioning taskserv restart kubernetes --infra my-infra
# Reinstall corrupted services
provisioning taskserv delete kubernetes --infra my-infra
provisioning taskserv create kubernetes --infra my-infra
</code></pre>
<h2 id="prevention-strategies"><a class="header" href="#prevention-strategies">Prevention Strategies</a></h2>
<h3 id="regular-maintenance"><a class="header" href="#regular-maintenance">Regular Maintenance</a></h3>
<pre><code class="language-bash">#!/bin/bash
# Weekly maintenance script
# Update system
provisioning update --check
# Validate configuration
provisioning validate config
# Check for service updates
provisioning taskserv check-updates
# Clean up old files
provisioning cleanup --older-than 30d
# Create backup
provisioning backup create --name "weekly-$(date +%Y%m%d)"
</code></pre>
<h3 id="monitoring-setup"><a class="header" href="#monitoring-setup">Monitoring Setup</a></h3>
<pre><code class="language-bash"># Set up health monitoring with cron
# Add the entries below to your crontab (edit it with: crontab -e)
# Check system health every hour
0 * * * * /usr/local/bin/provisioning health check || echo "Health check failed" | mail -s "Provisioning Alert" admin@company.com
# Weekly cost reports
0 9 * * 1 /usr/local/bin/provisioning show costs --all | mail -s "Weekly Cost Report" finance@company.com
</code></pre>
<h3 id="best-practices"><a class="header" href="#best-practices">Best Practices</a></h3>
<ol>
<li>
<p><strong>Configuration Management</strong></p>
<ul>
<li>Version control all configuration files</li>
<li>Use check mode before applying changes</li>
<li>Regular validation and testing</li>
</ul>
</li>
<li>
<p><strong>Security</strong></p>
<ul>
<li>Regular key rotation</li>
<li>Principle of least privilege</li>
<li>Audit logs review</li>
</ul>
</li>
<li>
<p><strong>Backup Strategy</strong></p>
<ul>
<li>Automated daily backups</li>
<li>Test restore procedures</li>
<li>Off-site backup storage</li>
</ul>
</li>
<li>
<p><strong>Documentation</strong></p>
<ul>
<li>Document custom configurations</li>
<li>Keep troubleshooting logs</li>
<li>Share knowledge with team</li>
</ul>
</li>
</ol>
<h2 id="getting-additional-help"><a class="header" href="#getting-additional-help">Getting Additional Help</a></h2>
<h3 id="debug-information-collection"><a class="header" href="#debug-information-collection">Debug Information Collection</a></h3>
<pre><code class="language-bash">#!/bin/bash
# Collect debug information
echo "Collecting provisioning debug information..."
mkdir -p /tmp/provisioning-debug
cd /tmp/provisioning-debug
# System information
uname -a &gt; system-info.txt
free -h &gt;&gt; system-info.txt
df -h &gt;&gt; system-info.txt
# Provisioning information
provisioning --version &gt; provisioning-info.txt
provisioning env &gt;&gt; provisioning-info.txt
provisioning validate config --detailed &gt; config-validation.txt 2&gt;&amp;1
# Configuration files
cp ~/.config/provisioning/config.toml user-config.toml 2&gt;/dev/null || echo "No user config" &gt; user-config.toml
# Logs
provisioning show logs &gt; system-logs.txt 2&gt;&amp;1
# Create archive
cd /tmp
archive="provisioning-debug-$(date +%Y%m%d_%H%M%S).tar.gz"
tar czf "$archive" provisioning-debug/
echo "Debug information collected in: /tmp/$archive"
</code></pre>
<h3 id="support-channels"><a class="header" href="#support-channels">Support Channels</a></h3>
<ol>
<li>
<p><strong>Built-in Help</strong></p>
<pre><code class="language-bash">provisioning help
provisioning help &lt;command&gt;
</code></pre>
</li>
<li>
<p><strong>Documentation</strong></p>
<ul>
<li>User guides in <code>docs/user/</code></li>
<li>CLI reference: <code>docs/user/cli-reference.md</code></li>
<li>Configuration guide: <code>docs/user/configuration.md</code></li>
</ul>
</li>
<li>
<p><strong>Community Resources</strong></p>
<ul>
<li>Project repository issues</li>
<li>Community forums</li>
<li>Documentation wiki</li>
</ul>
</li>
<li>
<p><strong>Enterprise Support</strong></p>
<ul>
<li>Professional services</li>
<li>Priority support</li>
<li>Custom development</li>
</ul>
</li>
</ol>
<p>Remember: When reporting issues, always include the debug information collected above and specific error messages.</p>
</main>
<nav class="nav-wrapper" aria-label="Page navigation">
<!-- Mobile navigation buttons -->
<a rel="prev" href="../user/test-environment-usage.html" class="mobile-nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="../user/AUTHENTICATION_LAYER_GUIDE.html" class="mobile-nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
<div style="clear: both"></div>
</nav>
</div>
</div>
<nav class="nav-wide-wrapper" aria-label="Page navigation">
<a rel="prev" href="../user/test-environment-usage.html" class="nav-chapters previous" title="Previous chapter" aria-label="Previous chapter" aria-keyshortcuts="Left">
<i class="fa fa-angle-left"></i>
</a>
<a rel="next prefetch" href="../user/AUTHENTICATION_LAYER_GUIDE.html" class="nav-chapters next" title="Next chapter" aria-label="Next chapter" aria-keyshortcuts="Right">
<i class="fa fa-angle-right"></i>
</a>
</nav>
</div>
<!-- Livereload script (if served using the cli tool) -->
<script>
// Live-reload client: connects back to the host that served this page and
// refreshes the page whenever that host sends the text message "reload".

// Use a secure WebSocket only when the page itself was loaded over HTTPS.
const wsProtocol = (location.protocol !== 'https:') ? 'ws:' : 'wss:';
const wsAddress = `${wsProtocol}//${location.host}/__livereload`;
const socket = new WebSocket(wsAddress);

socket.onmessage = (message) => {
    // Ignore every payload other than the exact "reload" signal.
    if (message.data !== "reload") {
        return;
    }
    // Close the connection first, then reload the current document.
    socket.close();
    location.reload();
};

// Close the socket when the page is about to be unloaded.
window.onbeforeunload = () => {
    socket.close();
};
</script>
<script>
// Global flag set on window before the scripts below are loaded; presumably
// read by one of them (e.g. ../book.js) to enable copy buttons on code
// blocks — TODO confirm against that script.
window.playground_copyable = true;
</script>
<script src="../elasticlunr.min.js"></script>
<script src="../mark.min.js"></script>
<script src="../searcher.js"></script>
<script src="../clipboard.min.js"></script>
<script src="../highlight.js"></script>
<script src="../book.js"></script>
<!-- Custom JS scripts -->
</div>
</body>
</html>