#!/usr/bin/env bash
# Diagnose container startup failures on r630-02
# Usage: ./scripts/diagnose-r630-02-startup-failures.sh
#
# Connects to the node over SSH as root and, for each container ID listed
# below, prints its `pct list` status, whether its config file exists, its
# rootfs line, matching logical volumes, and (for the startup-failure group)
# the output of an actual `pct start` attempt.
#
# NOTE: the startup-failure section really runs `pct start` on the node, so a
# container that is able to start WILL be started by this script.

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck disable=SC2034  # kept for parity with sibling scripts; unused here
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Configuration
NODE_IP="192.168.11.12"
NODE_NAME="r630-02"

# SSH options shared by every remote call.
SSH_OPTS=(-o ConnectTimeout=5 -o StrictHostKeyChecking=no)

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
# Prints a horizontal rule only; takes no arguments.
log_section() { echo -e "\n${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"; }
log_subsection() { echo -e "\n${CYAN} $1${NC}"; }

# Failed containers from the report
LV_ERROR_CONTAINERS=(3000 3001 3002 3003 3500 3501 6000 6400)
STARTUP_ERROR_CONTAINERS=(5200 10000 10001 10020 10030 10040 10050 10060 10070 10080 10090 10091 10092 10100 10101 10120 10130 10150 10151 10200 10201 10202 10210 10230)
LOCK_ERROR_CONTAINERS=(10232)
ALL_FAILED=("${LV_ERROR_CONTAINERS[@]}" "${STARTUP_ERROR_CONTAINERS[@]}" "${LOCK_ERROR_CONTAINERS[@]}")

#######################################
# Run a command string on the node over SSH as root.
# Globals:   SSH_OPTS, NODE_IP (read)
# Arguments: $@ - command (and arguments) passed to ssh for remote execution
# Outputs:   whatever the remote command writes
# Returns:   ssh's exit status (the remote command's status, or 255 on
#            connection failure)
#######################################
remote() {
  ssh "${SSH_OPTS[@]}" "root@${NODE_IP}" "$@"
}

echo ""
log_section
log_info " DIAGNOSING CONTAINER STARTUP FAILURES ON $NODE_NAME"
log_section
echo ""

# Check SSH access
log_info "Checking SSH access to $NODE_NAME ($NODE_IP)..."
if ! remote "echo 'SSH OK'" &>/dev/null; then
  log_error "Cannot access $NODE_NAME via SSH"
  exit 1
fi
log_success "SSH access confirmed"

#######################################
# Print the second column of the `pct list` row whose first column equals
# the given VMID, or "notfound" when there is no such row or SSH fails.
# Arguments: $1 - container VMID
# Outputs:   status word or "notfound" on stdout
#######################################
check_container_status() {
  local vmid=$1
  local status
  # awk exits 0 even when nothing matches, so an empty result (not the exit
  # status) is what signals "no such container".
  status=$(remote "pct list 2>/dev/null | awk -v id='${vmid}' '\$1 == id {print \$2}'") || status=""
  printf '%s\n' "${status:-notfound}"
}

#######################################
# Report whether /etc/pve/lxc/<vmid>.conf exists on the node.
# Arguments: $1 - container VMID
# Outputs:   "exists", "missing", or "error" (SSH failure) on stdout
#######################################
check_config_exists() {
  local vmid=$1
  remote "test -f /etc/pve/lxc/${vmid}.conf && echo 'exists' || echo 'missing'" \
    || echo "error"
}

#######################################
# Print the `rootfs:` line from `pct config <vmid>`.
# Arguments: $1 - container VMID
# Outputs:   the rootfs line, or "notfound" if absent or on any failure
#######################################
get_storage_config() {
  local vmid=$1
  remote "pct config ${vmid} 2>/dev/null | grep '^rootfs:'" || echo "notfound"
}

#######################################
# Report whether `lvs` lists a volume named vm-<vmid>-disk-<n>.
# Arguments: $1 - container VMID
#            $2 - disk number (default: 1)
# Outputs:   "exists", "missing", or "error" (SSH failure) on stdout
#######################################
check_lv_exists() {
  local vmid=$1
  local disk_num=${2:-1}
  # -w keeps disk-1 from matching disk-10 and vm-300 from matching xvm-300.
  remote "lvs 2>/dev/null | grep -qw -- \"vm-${vmid}-disk-${disk_num}\" && echo 'exists' || echo 'missing'" \
    || echo "error"
}

#######################################
# Attempt `pct start <vmid>` on the node and print its combined output.
# This is a real start attempt, not a dry run.
# Arguments: $1 - container VMID
# Outputs:   stdout+stderr of `pct start`
# Returns:   always 0 (failure is the expected case being diagnosed)
#######################################
get_startup_error() {
  local vmid=$1
  remote "pct start ${vmid} 2>&1" || true
}

#######################################
# List entries under /var/lock/qemu-server/ whose name contains
# lock-<vmid> or lxc-<vmid>.
# NOTE(review): this inspects the qemu-server lock directory even though the
# IDs are containers; a `lock:` entry in the container config would not show
# up here — confirm this is the intended place to look.
# Arguments: $1 - container VMID
# Outputs:   matching `ls -la` lines, "nolock" if none, "error" on SSH failure
#######################################
check_locks() {
  local vmid=$1
  remote "ls -la /var/lock/qemu-server/ 2>/dev/null | grep -E 'lock-${vmid}|lxc-${vmid}' || echo 'nolock'" \
    || echo "error"
}

# System information
log_section
log_info " SYSTEM INFORMATION"
log_section

log_subsection "Storage Status"
remote "pvesm status" || log_error "Failed to get storage status"
echo ""

log_subsection "Volume Groups"
remote "vgs" || log_error "Failed to get VG status"
echo ""

log_subsection "Logical Volumes (relevant)"
# Build the alternation from ALL_FAILED so every diagnosed container is
# covered; the trailing group stops vm-3000 from matching vm-30001.
failed_ids=$(IFS='|'; printf '%s' "${ALL_FAILED[*]}")
lv_pattern="vm-(${failed_ids})([^0-9]|\$)"
remote "lvs 2>/dev/null | grep -E '${lv_pattern}' || echo 'No matching volumes found'" \
  || log_error "Failed to get LV status"
echo ""

log_subsection "System Resources"
remote "free -h | head -2" || log_error "Failed to get memory info"
remote "df -h / | tail -1" || log_error "Failed to get disk info"
echo ""

# Diagnose Logical Volume Errors
log_section
log_info " CATEGORY 1: LOGICAL VOLUME ERRORS"
log_section

for vmid in "${LV_ERROR_CONTAINERS[@]}"; do
  log_subsection "CT $vmid - Missing Logical Volume"

  status=$(check_container_status "$vmid")
  config=$(check_config_exists "$vmid")
  storage=$(get_storage_config "$vmid")

  echo " Status: $status"
  echo " Config: $config"
  echo " Storage: $storage"

  # Extract disk number from storage config
  if [[ "$storage" != "notfound" && -n "$storage" ]]; then
    # Pull <n> out of "vm-<vmid>-disk-<n>"; fall back to 1 when the rootfs
    # line does not follow that naming scheme.
    if [[ "$storage" =~ vm-${vmid}-disk-([0-9]+) ]]; then
      disk_num=${BASH_REMATCH[1]}
    else
      disk_num=1
    fi
    lv_status=$(check_lv_exists "$vmid" "$disk_num")
    echo " Logical Volume: $lv_status"

    # Check all possible disk numbers
    for d in 0 1 2; do
      lv_check=$(check_lv_exists "$vmid" "$d")
      if [[ "$lv_check" == "exists" ]]; then
        echo " Found: vm-${vmid}-disk-${d}"
      fi
    done
  fi

  # Check if volume exists with different name
  echo " All volumes for this VMID:"
  # Best effort: an SSH failure here should not abort the remaining checks.
  remote "lvs 2>/dev/null | grep -E 'vm-${vmid}([^0-9]|\$)' || echo ' None found'" || true
  echo ""
done

# Diagnose Startup Failures
log_section
log_info " CATEGORY 2: STARTUP FAILURES"
log_section

for vmid in "${STARTUP_ERROR_CONTAINERS[@]}"; do
  log_subsection "CT $vmid - Startup Failed"

  status=$(check_container_status "$vmid")
  config=$(check_config_exists "$vmid")
  storage=$(get_storage_config "$vmid")

  echo " Status: $status"
  echo " Config: $config"
  echo " Storage: $storage"

  if [[ "$config" == "missing" ]]; then
    log_error " ⚠️ Config file missing!"
  fi

  # Try to get detailed error
  echo " Attempting startup to capture error..."
  error_output=$(get_startup_error "$vmid" 2>&1 | head -10)
  if [[ -n "$error_output" ]]; then
    echo " Error output:"
    echo "$error_output" | sed 's/^/ /'
  fi

  # Check for locks
  locks=$(check_locks "$vmid")
  if [[ "$locks" != "nolock" && "$locks" != "error" ]]; then
    log_warn " ⚠️ Lock detected: $locks"
  fi
  echo ""
done

# Diagnose Lock Error
log_section
log_info " CATEGORY 3: LOCK ERROR"
log_section

for vmid in "${LOCK_ERROR_CONTAINERS[@]}"; do
  log_subsection "CT $vmid - Locked (create)"

  status=$(check_container_status "$vmid")
  config=$(check_config_exists "$vmid")
  locks=$(check_locks "$vmid")

  echo " Status: $status"
  echo " Config: $config"
  echo " Locks: $locks"

  # Check Proxmox task queue
  # NOTE(review): despite the label, this only looks the VMID up in
  # `qm list`; it does not query any task queue — confirm intent.
  echo " Task queue:"
  # Match on the VMID column only so PIDs or sizes containing the same
  # digits do not produce false hits. Best effort: SSH failure is ignored.
  remote "qm list 2>/dev/null | awk -v id='${vmid}' '\$1 == id {found=1; print} END {exit !found}' || echo ' Not in QEMU list'" || true
  echo ""
done

# Summary
log_section
log_info " DIAGNOSTIC SUMMARY"
log_section
echo ""

log_info "Total failed containers: ${#ALL_FAILED[@]}"
log_info " - Logical volume errors: ${#LV_ERROR_CONTAINERS[@]}"
log_info " - Startup failures: ${#STARTUP_ERROR_CONTAINERS[@]}"
log_info " - Lock errors: ${#LOCK_ERROR_CONTAINERS[@]}"
echo ""

log_info "Next steps:"
echo " 1. Review diagnostic output above"
echo " 2. Run fix script: ./scripts/fix-r630-02-startup-failures.sh"
echo " 3. Manually resolve remaining issues based on findings"
echo ""

log_success "Diagnostic complete!"