Files
loc_az_hci/scripts/troubleshooting/diagnose-vm-issues.sh
defiQUG c39465c2bd
Some checks failed
Test / test (push) Has been cancelled
Initial commit: loc_az_hci (smom-dbis-138 excluded via .gitignore)
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-08 09:04:46 -08:00

159 lines
4.5 KiB
Bash
Executable File

#!/bin/bash
# The invoking user's ~/.bashrc is sourced before `set -e` is enabled below.
# NOTE(review): sourcing an interactive rc file makes the script depend on the
# caller's personal shell configuration — confirm this is intended.
source ~/.bashrc
# Diagnose VM Issues
# Comprehensive diagnosis of VM problems
#
# Optional settings are read from "$PROJECT_ROOT/.env", where PROJECT_ROOT is
# two directories above the directory that contains this script.
# From here on, abort on the first unhandled command failure.
set -e
# Absolute path of the directory holding this script, and of the project root.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Load environment variables
# `set -a` marks every variable assigned while sourcing for export; `set +a`
# switches that off again. The filter pipeline drops lines beginning with '#',
# drops empty lines, strips everything from the first '#' to the end of each
# remaining line, and keeps only lines that still contain '='.
# NOTE(review): the '#' stripping also truncates values that legitimately
# contain '#' (for example a password) — confirm this is acceptable.
if [ -f "$PROJECT_ROOT/.env" ]; then
set -a
source <(grep -v '^#' "$PROJECT_ROOT/.env" | grep -v '^$' | sed 's/#.*$//' | grep '=')
set +a
fi
# Colors
# ANSI colour sequences for the log helpers. They are stored as literal
# backslash text and only turned into escape characters when printed.
NC="\033[0m"
BLUE="\033[0;34m"
YELLOW="\033[1;33m"
GREEN="\033[0;32m"
RED="\033[0;31m"

# Proxmox connection settings. Each falls back to the default shown here when
# the corresponding variable is unset or empty.
: "${PVE_USERNAME:=root@pam}"
: "${PROXMOX_NODE:=pve}"
PROXMOX_URL=${PROXMOX_ML110_URL:-https://192.168.1.206:8006}
# The password is taken from PVE_ROOT_PASS; empty when that is not provided.
PVE_PASSWORD=${PVE_ROOT_PASS-}
# Print an informational line on stdout: "[INFO]" wrapped in the GREEN/NC
# colour sequences, followed by the message.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_info() {
  printf '%b\n' "${GREEN}[INFO]${NC} $1"
}
# Print a warning line on stdout: "[WARN]" wrapped in the YELLOW/NC colour
# sequences, followed by the message.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_warn() {
  printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}
# Print an error line on stdout: "[ERROR]" wrapped in the RED/NC colour
# sequences, followed by the message.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_error() {
  printf '%b\n' "${RED}[ERROR]${NC} $1"
}
# Print a detected-problem line on stdout: "[ISSUE]" wrapped in the RED/NC
# colour sequences, followed by the message.
# Arguments: $1 - message text (backslash escapes in it are interpreted)
log_issue() {
  printf '%b\n' "${RED}[ISSUE]${NC} $1"
}
#######################################
# Log in to the Proxmox API and print the session credentials.
# Globals:   PVE_USERNAME, PVE_PASSWORD, PROXMOX_URL (read)
# Arguments: none
# Outputs:   "<ticket>|<CSRFPreventionToken>" on stdout, or an empty line
#            when no ticket was obtained (connection failure, rejected
#            credentials, unexpected reply).
# Returns:   0 in every case; callers must test the output for emptiness.
#######################################
get_api_token() {
  local response ticket csrf_token

  # --data-urlencode keeps credentials containing '&', '+', '%' or '=' intact;
  # a plain -d would send them unescaped and corrupt the form body.
  # -k disables TLS certificate verification.
  # `|| true` keeps a curl failure from aborting the caller under `set -e`;
  # the failure is detected below through the missing ticket.
  response=$(curl -s -k --connect-timeout 10 --max-time 15 \
    --data-urlencode "username=$PVE_USERNAME" \
    --data-urlencode "password=$PVE_PASSWORD" \
    "$PROXMOX_URL/api2/json/access/ticket" 2>&1) || true

  # grep yields `"ticket":"VALUE`; splitting on '"' puts VALUE in field 4.
  ticket=$(printf '%s\n' "$response" \
    | grep -o '"ticket":"[^"]*' | cut -d'"' -f4)
  csrf_token=$(printf '%s\n' "$response" \
    | grep -o '"CSRFPreventionToken":"[^"]*' | cut -d'"' -f4)

  # A rejected login still contains a "data" key (with a null value), so the
  # presence of an actual ticket is the only reliable success signal.
  if [ -n "$ticket" ]; then
    printf '%s|%s\n' "$ticket" "$csrf_token"
  else
    echo ""
  fi
}
#######################################
# Check whether the template VM looks like it has an operating system on it,
# judged by the size of its scsi0 disk.
# Globals:   PROXMOX_URL, PROXMOX_NODE (read)
# Arguments: $1 - VMID of the template (optional, defaults to 9000)
# Outputs:   log lines on stdout
# Returns:   0 when no problem was detected (or the disk size could not be
#            determined), 1 when the disk is 600M or when authentication
#            against the API failed.
#######################################
diagnose_template() {
  local template_id=${1:-9000}
  local tokens ticket csrf_token config disk size

  log_info "Diagnosing template VM $template_id..."

  # `|| true` on the lookups keeps a failing helper from aborting the whole
  # script under `set -e`; failures are detected through empty values instead.
  tokens=$(get_api_token) || true
  ticket=${tokens%%|*}
  csrf_token=${tokens#*|}

  # Without a ticket the config request cannot succeed; report that instead
  # of silently treating the template as healthy.
  if [ -z "$ticket" ]; then
    log_error "Failed to authenticate with Proxmox API at $PROXMOX_URL"
    return 1
  fi

  config=$(curl -s -k -H "Cookie: PVEAuthCookie=$ticket" \
    -H "CSRFPreventionToken: $csrf_token" \
    "$PROXMOX_URL/api2/json/nodes/$PROXMOX_NODE/qemu/$template_id/config") || true

  # Pull the scsi0 definition out of the JSON reply, then its size= attribute.
  disk=$(printf '%s\n' "$config" \
    | python3 -c "import sys, json; d=json.load(sys.stdin).get('data', {}); print(d.get('scsi0', ''))" 2>/dev/null) || true
  size=$(printf '%s\n' "$disk" | grep -o 'size=[^,]*' | cut -d'=' -f2)

  if [ "$size" = "600M" ]; then
    log_issue "Template has only 600M disk - likely no OS installed"
    log_warn "Template may need OS installation before cloning"
    return 1
  fi

  if [ -z "$size" ]; then
    log_warn "Could not determine scsi0 disk size for template VM $template_id"
  fi
  return 0
}
#######################################
# Run the per-VM checks: Proxmox status, QEMU guest agent, ping, SSH port.
# Globals:   PROXMOX_URL, PROXMOX_NODE (read)
# Arguments: $1 - VMID
#            $2 - VM name (used in log output only)
#            $3 - IP address probed with ping and a TCP connect to port 22
# Outputs:   log lines on stdout
# Returns:   0; findings are reported through the log output only.
#######################################
diagnose_vm() {
  local vmid=$1
  local name=$2
  local ip=$3
  local tokens ticket csrf_token
  local status="" agent_check=""

  log_info "Diagnosing VM $vmid ($name)..."

  # `|| true` on the lookups keeps a failing helper from aborting the whole
  # script under `set -e`; failures show up as empty values instead.
  tokens=$(get_api_token) || true
  ticket=${tokens%%|*}
  csrf_token=${tokens#*|}

  if [ -z "$ticket" ]; then
    # The network checks below do not need the API, so carry on with them.
    log_warn "Proxmox API authentication failed; skipping API-based checks for VM $vmid"
  else
    # Check VM status
    status=$(curl -s -k -H "Cookie: PVEAuthCookie=$ticket" \
      -H "CSRFPreventionToken: $csrf_token" \
      "$PROXMOX_URL/api2/json/nodes/$PROXMOX_NODE/qemu/$vmid/status/current" \
      | python3 -c "import sys, json; print(json.load(sys.stdin).get('data', {}).get('status', 'unknown'))" 2>/dev/null) || true

    # Check QEMU Guest Agent
    agent_check=$(curl -s -k -H "Cookie: PVEAuthCookie=$ticket" \
      -H "CSRFPreventionToken: $csrf_token" \
      "$PROXMOX_URL/api2/json/nodes/$PROXMOX_NODE/qemu/$vmid/agent/network-get-interfaces" 2>&1) || true
  fi

  # An unparsable or missing reply leaves status empty; show that as unknown.
  echo " Status: ${status:-unknown}"

  if printf '%s\n' "$agent_check" | grep -q "not running"; then
    log_issue "QEMU Guest Agent not running - OS may not be installed or agent not installed"
  fi

  # Check network connectivity
  if ping -c 1 -W 2 "$ip" &>/dev/null; then
    log_info " Network: ✓ Reachable"
  else
    log_issue " Network: ✗ Not reachable"
    log_warn " Possible causes:"
    log_warn " - OS not installed"
    log_warn " - Cloud-init not installed"
    log_warn " - Network configuration failed"
    log_warn " - VM stuck in boot"
  fi

  # Check SSH
  # The address is passed as a positional argument so it is never parsed as
  # shell code by the inner bash.
  if timeout 3 bash -c 'cat < /dev/null > "/dev/tcp/$1/22"' _ "$ip" 2>/dev/null; then
    log_info " SSH: ✓ Port 22 open"
  else
    log_issue " SSH: ✗ Port 22 closed"
  fi
}
#######################################
# Entry point: diagnose the template, then every VM in the built-in list.
# Arguments: none are read
# Outputs:   log lines on stdout
# Returns:   0 once all checks have been attempted
#######################################
main() {
  local vm_spec vmid name ip
  # One entry per VM: "<vmid> <name> <ip>"
  local -a vms=(
    "100 cloudflare-tunnel 192.168.1.60"
    "101 k3s-master 192.168.1.188"
    "102 git-server 192.168.1.121"
    "103 observability 192.168.1.82"
  )

  log_info "VM Issue Diagnosis"
  echo ""

  # Diagnose template
  # diagnose_template returns non-zero when it FINDS a problem. Under
  # `set -e` a bare call would end the script right there, before any VM is
  # examined, so the status is deliberately absorbed; the function has already
  # logged what it found.
  diagnose_template || true
  echo ""

  # Diagnose VMs
  for vm_spec in "${vms[@]}"; do
    read -r vmid name ip <<< "$vm_spec"
    # Keep going with the remaining VMs even if one diagnosis breaks off.
    diagnose_vm "$vmid" "$name" "$ip" \
      || log_warn "Diagnosis of VM $vmid ($name) did not complete"
    echo ""
  done

  log_info "Diagnosis complete"
  log_warn "If template has no OS, VMs need manual OS installation via Proxmox console"
}
# Script entry point: run main with the script's command-line arguments
# (main does not currently read them).
main "$@"