Files
proxmox/scripts/diagnose-r630-02-startup-failures.sh.bak

235 lines
7.5 KiB
Bash
Raw Normal View History

#!/usr/bin/env bash
#
# Diagnose container startup failures on r630-02.
#
# Usage: ./scripts/diagnose-r630-02-startup-failures.sh
#
# Optional environment overrides:
#   NODE_IP    address used for every SSH connection (default: 192.168.11.12)
#   NODE_NAME  node name shown in log messages       (default: r630-02)
#
# NOTE: the startup-failure pass runs `pct start` on the node to capture the
# error text, so this script is not strictly read-only.
set -euo pipefail

# Directory holding this script, and the directory one level above it.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd -- "$SCRIPT_DIR/.." && pwd)"

# Configuration: the defaults match the original hard-coded values; export
# NODE_IP / NODE_NAME before running to reach the node through another address.
NODE_IP="${NODE_IP:-192.168.11.12}"
NODE_NAME="${NODE_NAME:-r630-02}"

# Colors: stored as backslash escape sequences; the log helpers interpret them
# when printing.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
# Logging helpers. Each prints one line to stdout; backslash escapes in the
# color variables (and in the message itself) are interpreted on output.

# Informational message with a blue [INFO] tag.
log_info() {
  local message=$1
  printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}

# Success message with a green check-mark tag.
log_success() {
  local message=$1
  printf '%b\n' "${GREEN}[✓]${NC} ${message}"
}

# Warning message with a yellow [WARN] tag.
log_warn() {
  local message=$1
  printf '%b\n' "${YELLOW}[WARN]${NC} ${message}"
}

# Error message with a red [ERROR] tag.
log_error() {
  local message=$1
  printf '%b\n' "${RED}[ERROR]${NC} ${message}"
}

# Blank line followed by a cyan horizontal rule; takes no arguments.
log_section() {
  printf '%b\n' "\n${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
}

# Blank line followed by a cyan sub-heading.
log_subsection() {
  local title=$1
  printf '%b\n' "\n${CYAN} ${title}${NC}"
}
# Container IDs taken from the failure report, grouped by failure category.

# Containers reported with a logical-volume error.
LV_ERROR_CONTAINERS=(
  3000 3001 3002 3003
  3500 3501
  6000 6400
)

# Containers reported with a startup failure.
STARTUP_ERROR_CONTAINERS=(
  5200
  10000 10001 10020 10030 10040 10050 10060 10070 10080
  10090 10091 10092
  10100 10101 10120 10130 10150 10151
  10200 10201 10202 10210 10230
)

# Containers reported with a lock error.
LOCK_ERROR_CONTAINERS=(
  10232
)

# All of the above, concatenated in category order.
ALL_FAILED=(
  "${LV_ERROR_CONTAINERS[@]}"
  "${STARTUP_ERROR_CONTAINERS[@]}"
  "${LOCK_ERROR_CONTAINERS[@]}"
)
# Banner
printf '\n'
log_section
log_info " DIAGNOSING CONTAINER STARTUP FAILURES ON $NODE_NAME"
log_section
printf '\n'

# Pre-flight: every later check runs over SSH, so abort early if a trivial
# remote command cannot be executed. All output of the probe is discarded.
log_info "Checking SSH access to $NODE_NAME ($NODE_IP)..."
ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" "echo 'SSH OK'" &>/dev/null || {
  log_error "Cannot access $NODE_NAME via SSH"
  exit 1
}
log_success "SSH access confirmed"
#######################################
# Print the status column of `pct list` for one container.
# Globals:   NODE_IP (read)
# Arguments: $1 - container ID (VMID)
# Outputs:   the second column of the matching `pct list` row, or "notfound"
#            when no row has that VMID or the SSH command fails
# Returns:   0 in all cases
#######################################
check_container_status() {
  local vmid=$1
  # awk exits 0 even when no row matches, so the "notfound" marker for an
  # absent container has to come from the END rule; the trailing `|| echo`
  # only covers a failing SSH connection.
  ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" \
    "pct list 2>/dev/null | awk -v id='${vmid}' '\$1 == id { print \$2; found = 1 } END { if (found == 0) print \"notfound\" }'" \
    || echo "notfound"
}
#######################################
# Report whether the container's config file is present on the node.
# Globals:   NODE_IP (read)
# Arguments: $1 - container ID (VMID)
# Outputs:   "exists" or "missing" as printed by the remote test of
#            /etc/pve/lxc/<vmid>.conf; "error" when the SSH command fails
# Returns:   0 in all cases
#######################################
check_config_exists() {
  local vmid=$1
  local -a ssh_opts=(-o ConnectTimeout=5 -o StrictHostKeyChecking=no)
  local remote_cmd="test -f /etc/pve/lxc/${vmid}.conf && echo 'exists' || echo 'missing'"

  if ! ssh "${ssh_opts[@]}" "root@${NODE_IP}" "$remote_cmd"; then
    echo "error"
  fi
}
#######################################
# Print the rootfs line of a container's configuration.
# Globals:   NODE_IP (read)
# Arguments: $1 - container ID (VMID)
# Outputs:   the line(s) of `pct config <vmid>` starting with "rootfs:";
#            "notfound" when the remote pipeline exits non-zero (no such
#            line, or the SSH command itself failed)
# Returns:   0 in all cases
#######################################
get_storage_config() {
  local vmid=$1
  local -a ssh_opts=(-o ConnectTimeout=5 -o StrictHostKeyChecking=no)
  local remote_cmd="pct config $vmid 2>/dev/null | grep '^rootfs:'"

  if ! ssh "${ssh_opts[@]}" "root@${NODE_IP}" "$remote_cmd"; then
    echo "notfound"
  fi
}
#######################################
# Report whether the logical volume vm-<vmid>-disk-<n> appears in `lvs`.
# Globals:   NODE_IP (read)
# Arguments: $1 - container ID (VMID)
#            $2 - disk number (optional, default 1)
# Outputs:   "exists" or "missing"; "error" when the SSH command fails
# Returns:   0 in all cases
#######################################
check_lv_exists() {
  local vmid=$1
  local disk_num=${2:-1}
  # -w (whole word) keeps "disk-1" from matching "disk-10", "disk-11", ...;
  # -F treats the volume name as a literal string rather than a regex.
  ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" \
    "lvs 2>/dev/null | grep -qwF -- 'vm-${vmid}-disk-${disk_num}' && echo 'exists' || echo 'missing'" \
    || echo "error"
}
#######################################
# Run `pct start` for a container on the node and relay whatever it prints.
# This really attempts to start the container; the remote stderr is merged
# into stdout so the error text can be captured by the caller.
# Globals:   NODE_IP (read)
# Arguments: $1 - container ID (VMID)
# Outputs:   combined stdout/stderr of the remote `pct start`
# Returns:   0 in all cases (a failing start is the expected outcome here)
#######################################
get_startup_error() {
  local vmid=$1
  local -a ssh_opts=(-o ConnectTimeout=5 -o StrictHostKeyChecking=no)
  local remote_cmd="pct start $vmid 2>&1"

  if ! ssh "${ssh_opts[@]}" "root@${NODE_IP}" "$remote_cmd"; then
    return 0
  fi
}
#######################################
# Look for lock files that mention a container ID.
# Lists /var/lock/qemu-server/ on the node and keeps entries matching
# "lock-<vmid>" or "lxc-<vmid>".
# NOTE(review): the directory name suggests QEMU VM locks; confirm this is
# the right location for LXC container locks.
# Globals:   NODE_IP (read)
# Arguments: $1 - container ID (VMID)
# Outputs:   matching `ls -la` lines; "nolock" when nothing matches;
#            "error" when the SSH command fails
# Returns:   0 in all cases
#######################################
check_locks() {
  local vmid=$1
  local -a ssh_opts=(-o ConnectTimeout=5 -o StrictHostKeyChecking=no)
  local remote_cmd="ls -la /var/lock/qemu-server/ 2>/dev/null | grep -E 'lock-${vmid}|lxc-${vmid}' || echo 'nolock'"

  if ! ssh "${ssh_opts[@]}" "root@${NODE_IP}" "$remote_cmd"; then
    echo "error"
  fi
}
# System information: a node-wide snapshot printed before the per-container
# checks. Each remote command is best-effort; a failure is logged and the
# script carries on.
log_section
log_info " SYSTEM INFORMATION"
log_section

log_subsection "Storage Status"
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" "pvesm status"; then
  log_error "Failed to get storage status"
fi
printf '\n'

log_subsection "Volume Groups"
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" "vgs"; then
  log_error "Failed to get VG status"
fi
printf '\n'

# Only a fixed subset of the failed container IDs is listed in this pattern.
log_subsection "Logical Volumes (relevant)"
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" \
  "lvs 2>/dev/null | grep -E 'vm-3000|vm-3001|vm-3002|vm-3003|vm-3500|vm-3501|vm-6000|vm-6400|vm-5200|vm-10000|vm-10001|vm-10020' || echo 'No matching volumes found'"; then
  log_error "Failed to get LV status"
fi
printf '\n'

# Memory: first two lines of `free -h`; disk: last line of `df -h /`.
log_subsection "System Resources"
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" "free -h | head -2"; then
  log_error "Failed to get memory info"
fi
if ! ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" "df -h / | tail -1"; then
  log_error "Failed to get disk info"
fi
printf '\n'
# Diagnose Logical Volume Errors
# For every container in LV_ERROR_CONTAINERS: print its status, config
# presence and rootfs line, then check which vm-<vmid>-disk-<n> volumes are
# actually present on the node.
log_section
log_info " CATEGORY 1: LOGICAL VOLUME ERRORS"
log_section
for vmid in "${LV_ERROR_CONTAINERS[@]}"; do
  log_subsection "CT $vmid - Missing Logical Volume"
  status=$(check_container_status "$vmid")
  config=$(check_config_exists "$vmid")
  storage=$(get_storage_config "$vmid")
  echo " Status: $status"
  echo " Config: $config"
  echo " Storage: $storage"

  if [[ "$storage" != "notfound" ]] && [[ -n "$storage" ]]; then
    # Extract the disk number referenced by the rootfs line. The VMID has to
    # be expanded inside the pattern (a single-quoted grep pattern leaves
    # ${vmid} literal and never matches), so match with the shell's own
    # regex operator; fall back to disk 1 when the line names no such volume.
    if [[ "$storage" =~ vm-${vmid}-disk-([0-9]+) ]]; then
      disk_num=${BASH_REMATCH[1]}
    else
      disk_num=1
    fi
    lv_status=$(check_lv_exists "$vmid" "$disk_num")
    echo " Logical Volume: $lv_status"

    # Also probe the usual low disk numbers in case the volume exists under
    # a different index than the one the config points at.
    for d in 0 1 2; do
      lv_check=$(check_lv_exists "$vmid" "$d")
      if [[ "$lv_check" == "exists" ]]; then
        echo " Found: vm-${vmid}-disk-${d}"
      fi
    done
  fi

  # Show every volume whose name mentions this VMID (best-effort: an SSH
  # failure here is deliberately ignored).
  echo " All volumes for this VMID:"
  ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" \
    "lvs 2>/dev/null | grep \"vm-${vmid}\" || echo ' None found'" || true
  echo ""
done
# Diagnose Startup Failures
# For every container in STARTUP_ERROR_CONTAINERS: print status, config
# presence and rootfs line, attempt a start to capture the error text, and
# report any lock entries found.
log_section
log_info " CATEGORY 2: STARTUP FAILURES"
log_section
for vmid in "${STARTUP_ERROR_CONTAINERS[@]}"; do
  log_subsection "CT $vmid - Startup Failed"

  status=$(check_container_status "$vmid")
  config=$(check_config_exists "$vmid")
  storage=$(get_storage_config "$vmid")
  printf ' Status: %s\n Config: %s\n Storage: %s\n' "$status" "$config" "$storage"

  case "$config" in
    missing) log_error " ⚠️ Config file missing!" ;;
  esac

  # The start attempt is real; only the first 10 lines of its output are kept.
  printf '%s\n' " Attempting startup to capture error..."
  error_output=$(get_startup_error "$vmid" 2>&1 | head -10)
  if [[ -n "$error_output" ]]; then
    printf '%s\n' " Error output:"
    sed 's/^/ /' <<<"$error_output"
  fi

  # Anything other than the two sentinel values is a listing of lock entries.
  locks=$(check_locks "$vmid")
  case "$locks" in
    nolock | error) ;;
    *) log_warn " ⚠️ Lock detected: $locks" ;;
  esac
  printf '\n'
done
# Diagnose Lock Error
# For every container in LOCK_ERROR_CONTAINERS: print status, config presence
# and the lock listing, then look the ID up in the QEMU VM list.
log_section
log_info " CATEGORY 3: LOCK ERROR"
log_section
for vmid in "${LOCK_ERROR_CONTAINERS[@]}"; do
  log_subsection "CT $vmid - Locked (create)"

  status=$(check_container_status "$vmid")
  config=$(check_config_exists "$vmid")
  locks=$(check_locks "$vmid")
  printf ' Status: %s\n Config: %s\n Locks: %s\n' "$status" "$config" "$locks"

  # Printed under the "Task queue:" heading, but the remote command filters
  # the output of `qm list` for the VMID. Best-effort: an SSH failure is
  # deliberately ignored.
  printf '%s\n' " Task queue:"
  ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "root@${NODE_IP}" \
    "qm list 2>/dev/null | grep $vmid || echo ' Not in QEMU list'" || :
  printf '\n'
done
# Summary: per-category counts taken from the container arrays, followed by
# suggested next steps.
log_section
log_info " DIAGNOSTIC SUMMARY"
log_section
printf '\n'

log_info "Total failed containers: ${#ALL_FAILED[@]}"
log_info " - Logical volume errors: ${#LV_ERROR_CONTAINERS[@]}"
log_info " - Startup failures: ${#STARTUP_ERROR_CONTAINERS[@]}"
log_info " - Lock errors: ${#LOCK_ERROR_CONTAINERS[@]}"
printf '\n'

log_info "Next steps:"
printf '%s\n' \
  " 1. Review diagnostic output above" \
  " 2. Run fix script: ./scripts/fix-r630-02-startup-failures.sh" \
  " 3. Manually resolve remaining issues based on findings"
printf '\n'

log_success "Diagnostic complete!"