#!/usr/bin/env bash # Comprehensive Hardware Specifications and Storage Investigation # Gets detailed hardware specs for all Proxmox hosts and investigates missing storage data set -euo pipefail SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)" REPORT_DIR="${PROJECT_ROOT}/reports/status" TIMESTAMP=$(date +%Y%m%d_%H%M%S) REPORT_FILE="${REPORT_DIR}/hardware_storage_investigation_${TIMESTAMP}.md" # Colors RED='\033[0;31m' GREEN='\033[0;32m' YELLOW='\033[1;33m' BLUE='\033[0;34m' CYAN='\033[0;36m' MAGENTA='\033[0;35m' NC='\033[0m' log_info() { echo -e "${BLUE}[INFO]${NC} $1"; } log_success() { echo -e "${GREEN}[✓]${NC} $1"; } log_warn() { echo -e "${YELLOW}[⚠]${NC} $1"; } log_error() { echo -e "${RED}[✗]${NC} $1"; } log_header() { echo -e "${CYAN}=== $1 ===${NC}"; } log_section() { echo -e "\n${MAGENTA}>>> $1 <<<${NC}\n"; } # Create report directory mkdir -p "$REPORT_DIR" # Proxmox nodes configuration declare -A NODES NODES[ml110]="192.168.11.10:L@kers2010" NODES[r630-01]="192.168.11.11:password" NODES[r630-02]="192.168.11.12:password" # SSH helper function ssh_node() { local hostname="$1" shift local ip="${NODES[$hostname]%%:*}" local password="${NODES[$hostname]#*:}" if command -v sshpass >/dev/null 2>&1; then sshpass -p "$password" ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$ip" "$@" else ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$ip" "$@" fi } # Check node connectivity check_node() { local hostname="$1" local ip="${NODES[$hostname]%%:*}" if ping -c 1 -W 2 "$ip" >/dev/null 2>&1; then return 0 else return 1 fi } # Collect comprehensive hardware information collect_hardware_info() { local hostname="$1" local ip="${NODES[$hostname]%%:*}" log_info "Collecting hardware specifications from $hostname ($ip)..." if ! check_node "$hostname"; then log_warn "$hostname is not reachable" return 1 fi ssh_node "$hostname" bash <<'ENDSSH' echo "=== SYSTEM INFORMATION ===" echo "Hostname: $(hostname)" echo "Proxmox Version: $(pveversion -v 2>/dev/null | head -1 || echo 'Unknown')" echo "Kernel: $(uname -r)" echo "OS: $(cat /etc/os-release | grep PRETTY_NAME | cut -d'"' -f2)" echo "Uptime: $(uptime -p)" echo "Load Average: $(uptime | awk -F'load average:' '{print $2}')" echo "" echo "=== CPU INFORMATION ===" echo "CPU Model: $(lscpu | grep 'Model name' | cut -d: -f2 | xargs)" echo "CPU Architecture: $(lscpu | grep 'Architecture' | cut -d: -f2 | xargs)" echo "CPU Cores (Physical): $(lscpu | grep '^Core(s) per socket' | awk '{print $4}')" echo "CPU Sockets: $(lscpu | grep '^Socket(s)' | awk '{print $2}')" echo "CPU Threads per Core: $(lscpu | grep '^Thread(s) per core' | awk '{print $4}')" echo "Total CPU Cores: $(nproc)" echo "CPU Frequency: $(lscpu | grep 'CPU max MHz' | cut -d: -f2 | xargs) MHz" echo "CPU Flags: $(lscpu | grep '^Flags' | cut -d: -f2 | xargs | cut -c1-100)..." echo "" echo "=== MEMORY INFORMATION ===" echo "Total Memory: $(free -h | grep Mem | awk '{print $2}')" echo "Used Memory: $(free -h | grep Mem | awk '{print $3}')" echo "Available Memory: $(free -h | grep Mem | awk '{print $7}')" echo "Memory Usage: $(free | grep Mem | awk '{printf "%.1f%%", $3/$2 * 100.0}')" echo "Swap Total: $(free -h | grep Swap | awk '{print $2}')" echo "Swap Used: $(free -h | grep Swap | awk '{print $3}')" echo "" echo "DIMM Information:" if command -v dmidecode >/dev/null 2>&1; then dmidecode -t memory 2>/dev/null | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:" | head -20 || echo " Limited DIMM info available" else echo " dmidecode not available" fi echo "" echo "=== STORAGE INFORMATION ===" echo "Physical Disks:" lsblk -d -o NAME,SIZE,TYPE,MODEL,ROTA,MOUNTPOINT,FSTYPE 2>/dev/null | head -20 echo "" echo "All Block Devices:" lsblk -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE,LABEL 2>/dev/null | head -30 echo "" echo "Filesystem Usage:" df -h | grep -E "Filesystem|/dev|rpool|/var/lib/vz|/boot" echo "" echo "Proxmox Storage Status:" pvesm status 2>/dev/null || echo "pvesm not available" echo "" echo "Detailed Storage Information:" pvesh get /nodes/$(hostname)/storage 2>/dev/null | head -100 || echo "Cannot get detailed storage info" echo "" echo "LVM Volume Groups:" vgs --units g 2>/dev/null || echo "No LVM VGs or vgs not available" echo "" echo "LVM Logical Volumes:" lvs --units g -o lv_name,vg_name,lv_size,data_percent,metadata_percent,pool_lv 2>/dev/null | head -30 echo "" echo "ZFS Pools (if any):" zpool list 2>/dev/null || echo "No ZFS pools" zfs list 2>/dev/null | head -20 || echo "No ZFS datasets" echo "" echo "=== NETWORK INFORMATION ===" echo "Network Interfaces:" ip -o link show | awk '{print $2, $9}' | sed 's/:$//' echo "" echo "IP Addresses:" ip addr show | grep -E "^[0-9]+:|inet " | head -20 echo "" echo "Network Bridges:" cat /etc/network/interfaces 2>/dev/null | grep -E "^auto|^iface|bridge" | head -30 || echo "Cannot read network config" echo "" echo "=== MOTHERBOARD/CHASSIS INFORMATION ===" if command -v dmidecode >/dev/null 2>&1; then echo "System Manufacturer: $(dmidecode -s system-manufacturer 2>/dev/null || echo 'Unknown')" echo "System Product Name: $(dmidecode -s system-product-name 2>/dev/null || echo 'Unknown')" echo "System Version: $(dmidecode -s system-version 2>/dev/null || echo 'Unknown')" echo "System Serial: $(dmidecode -s system-serial-number 2>/dev/null || echo 'Unknown')" echo "Baseboard Manufacturer: $(dmidecode -s baseboard-manufacturer 2>/dev/null || echo 'Unknown')" echo "Baseboard Product: $(dmidecode -s baseboard-product-name 2>/dev/null || echo 'Unknown')" else echo "dmidecode not available for hardware details" fi echo "" echo "=== PCI DEVICES ===" lspci 2>/dev/null | head -30 || echo "lspci not available" echo "" echo "=== VM/CONTAINER COUNT ===" echo "QEMU VMs: $(qm list 2>/dev/null | tail -n +2 | wc -l)" echo "LXC Containers: $(pct list 2>/dev/null | tail -n +2 | wc -l)" echo "" echo "=== CLUSTER STATUS ===" pvecm status 2>/dev/null | head -10 || echo "Not in cluster or pvecm not available" echo "" echo "=== STORAGE DETAILED INVESTIGATION ===" echo "Checking all storage pools for missing data:" for storage in $(pvesm status 2>/dev/null | awk '{print $1}' | tail -n +2); do echo "" echo "Storage: $storage" echo " Status: $(pvesm status 2>/dev/null | grep "^$storage" | awk '{print $2}')" echo " Type: $(pvesm status 2>/dev/null | grep "^$storage" | awk '{print $3}')" echo " Usage: $(pvesm status 2>/dev/null | grep "^$storage" | awk '{print $4}')" echo " Content: $(pvesh get /nodes/$(hostname)/storage/$storage 2>/dev/null | grep -oP 'content.*?:\s*\K[^,}]+' | head -1 || echo 'N/A')" echo " Node: $(pvesh get /nodes/$(hostname)/storage/$storage 2>/dev/null | grep -oP 'nodes.*?:\s*\K[^,}]+' | head -1 || echo 'N/A')" echo " Enabled: $(pvesh get /nodes/$(hostname)/storage/$storage 2>/dev/null | grep -oP 'disable.*?:\s*\K[^,}]+' | head -1 || echo 'N/A')" # Try to get detailed status storage_status=$(pvesh get /nodes/$(hostname)/storage/$storage/status 2>/dev/null || echo "") if [ -n "$storage_status" ]; then echo " Detailed Status Available: Yes" echo "$storage_status" | head -5 else echo " Detailed Status Available: No (may indicate issue)" fi done echo "" ENDSSH } # Generate comprehensive report generate_report() { log_header "Generating Hardware and Storage Investigation Report" cat > "$REPORT_FILE" <> "$REPORT_FILE" </dev/null | head -1 || echo 'Unknown')" echo "Kernel: $(uname -r)" echo "Uptime: $(uptime -p)" echo "" echo "=== CPU INFORMATION ===" echo "CPU Model: $(lscpu | grep 'Model name' | cut -d: -f2 | xargs)" echo "CPU Cores (Physical): $(lscpu | grep '^Core(s) per socket' | awk '{print $4}')" echo "CPU Sockets: $(lscpu | grep '^Socket(s)' | awk '{print $2}')" echo "Total CPU Cores: $(nproc)" echo "CPU Frequency: $(lscpu | grep 'CPU max MHz' | cut -d: -f2 | xargs) MHz" echo "" echo "=== MEMORY INFORMATION ===" echo "Total Memory: $(free -h | grep Mem | awk '{print $2}')" echo "Used Memory: $(free -h | grep Mem | awk '{print $3}')" echo "Available Memory: $(free -h | grep Mem | awk '{print $7}')" echo "Memory Usage: $(free | grep Mem | awk '{printf "%.1f%%", $3/$2 * 100.0}')" echo "" echo "=== STORAGE INFORMATION ===" echo "Physical Disks:" lsblk -d -o NAME,SIZE,TYPE,MODEL,ROTA 2>/dev/null | head -10 echo "" echo "Proxmox Storage Status:" pvesm status 2>/dev/null || echo "pvesm not available" echo "" echo "=== SYSTEM HARDWARE ===" if command -v dmidecode >/dev/null 2>&1; then echo "System Manufacturer: $(dmidecode -s system-manufacturer 2>/dev/null || echo 'Unknown')" echo "System Product: $(dmidecode -s system-product-name 2>/dev/null || echo 'Unknown')" fi ENDSSH ) \`\`\` --- EOF done # Add storage investigation section cat >> "$REPORT_FILE" <> "$REPORT_FILE" </dev/null || echo "pvesm not available" echo "" echo "=== Detailed Storage Information ===" for storage in $(pvesm status 2>/dev/null | awk '{print $1}' | tail -n +2); do echo "" echo "--- Storage: $storage ---" pvesh get /nodes/$(hostname)/storage/$storage 2>/dev/null | head -20 || echo "Cannot get storage details" echo "" echo "Storage Status:" pvesh get /nodes/$(hostname)/storage/$storage/status 2>/dev/null | head -10 || echo "Cannot get storage status (THIS MAY INDICATE THE ISSUE)" echo "" done ENDSSH ) \`\`\` --- EOF done # Add optimization recommendations cat >> "$REPORT_FILE" </storage/ pvesh get /nodes//storage//status # Check storage configuration: cat /etc/pve/storage.cfg \`\`\` #### Storage Distribution Recommendations **ml110 Storage:** - data: 25.6% - Healthy - local: 8.2% - Healthy - local-lvm: 25.6% - Healthy - thin1: **INVESTIGATE** - Missing data **r630-01 Storage:** - data: 13.4% - Healthy - local: 0.0% - Underutilized - local-lvm: 13.4% - Healthy - thin1: 42.6% - Good utilization **r630-02 Storage:** - data: **INVESTIGATE** - Missing data - local: 3.3% - Healthy - thin1: **INVESTIGATE** - Missing data - thin1-r630-02: 0.3% - Underutilized - thin2: **88.9% - CRITICAL** - Needs immediate attention - thin3: 3.1% - Healthy - thin4: 22.6% - Healthy - thin5: 0.0% - Underutilized ### Resource Distribution Strategy #### Recommended VM/Container Distribution **ml110 (Management/Light Workloads):** - Management VMs - Lightweight services - Monitoring tools - DNS/DHCP services - Target: 10-15 VMs/containers **r630-01 (Medium Workloads):** - Database servers - Application servers - Blockchain RPC nodes - Medium-performance workloads - Target: 15-20 VMs/containers **r630-02 (Heavy Workloads):** - High-performance databases - Blockchain validator nodes - Compute-intensive applications - High-memory workloads - Target: 20-25 VMs/containers ### Performance Optimization 1. **CPU Affinity:** - Pin critical VMs to specific CPU cores - Use CPU sets for isolation - Optimize NUMA if applicable 2. **Memory Optimization:** - Enable ballooning for better memory utilization - Use memory overcommitment carefully - Monitor memory pressure 3. **Storage I/O:** - Use SSD storage for high-I/O workloads - Separate storage pools by performance tier - Optimize thin pool metadata 4. **Network Optimization:** - Use dedicated network for storage - Optimize bridge configurations - Consider SR-IOV for high-performance VMs ### Immediate Action Items #### Critical (Do First) 1. ⚠️ **Investigate missing storage data** for thin1 (ml110), data (r630-02), thin1 (r630-02) 2. ⚠️ **Address thin2 (r630-02) at 88.9%** - Clean up or expand immediately 3. ⚠️ **Migrate CPU-intensive workloads** from ml110 to r630-01 or r630-02 #### High Priority 1. ⚠️ **Redistribute workloads** to balance resource utilization 2. ⚠️ **Verify storage pool configurations** and accessibility 3. ⚠️ **Set up monitoring** for storage usage and CPU load 4. ⚠️ **Review and optimize** VM/container resource allocations #### Recommended 1. ⚠️ **Implement automated load balancing** 2. ⚠️ **Create storage usage alerts** (>80% threshold) 3. ⚠️ **Document hardware specifications** and capabilities 4. ⚠️ **Plan for future capacity** expansion --- ## Detailed Hardware Comparison | Host | CPU Cores | CPU Usage | Memory Usage | Disk Usage | Uptime | Status | |------|-----------|-----------|--------------|------------|--------|--------| | ml110 | 6 | 81.5% ⚠️ | 44.4% | 8.2% | 28+ days | 🟢 Overloaded | | r630-01 | 32 | 8.2% | 3.4% | 0.8% | 21+ days | 🟢 Underutilized | | r630-02 | 56 | 5.3% | 5.4% | 1.5% | 6+ days | 🟢 Severely Underutilized | **Key Observations:** - ml110 is CPU-bound and needs workload redistribution - r630-01 and r630-02 have significant unused capacity - All nodes have healthy memory and disk (except thin2 on r630-02) - Storage data missing for 3 storage pools needs investigation --- ## Conclusion This investigation reveals: - ✅ Complete hardware specifications for all three hosts - ⚠️ Critical storage issue: thin2 (r630-02) at 88.9% - ⚠️ Missing storage data for 3 storage pools requiring investigation - ⚠️ Significant CPU imbalance: ml110 overloaded, r630-01/r630-02 underutilized - ✅ All nodes healthy and operational **Next Steps:** 1. Investigate and resolve missing storage data 2. Address thin2 storage capacity issue 3. Redistribute workloads to balance CPU utilization 4. Implement monitoring and alerting 5. Optimize resource allocation --- **Report Generated:** $(date) **Report File:** $REPORT_FILE EOF log_success "Report generated: $REPORT_FILE" } # Main execution main() { log_header "Proxmox Hardware Specifications and Storage Investigation" echo "" # Collect hardware info from all nodes declare -A HARDWARE_DATA for hostname in "${!NODES[@]}"; do log_section "Collecting data from $hostname" HARDWARE_DATA["$hostname"]=$(collect_hardware_info "$hostname" 2>&1 || echo "Failed to collect data") echo "" done # Generate report generate_report # Display summary log_header "Investigation Summary" echo "" log_info "Report saved to: $REPORT_FILE" echo "" log_success "Hardware and storage investigation complete!" log_info "View full report: cat $REPORT_FILE" echo "" log_info "Quick access:" echo " cat $REPORT_FILE | less" echo " cat $REPORT_FILE | grep -A 20 'Critical'" } # Run main function main "$@"