Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Co-authored-by: Cursor <cursoragent@cursor.com>
632 lines
21 KiB
Bash
Executable File
632 lines
21 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Comprehensive Hardware Specifications and Storage Investigation
#
# Gets detailed hardware specs for all Proxmox hosts and investigates missing
# storage data. The result is written as a Markdown report to
# ${PROJECT_ROOT}/reports/status/.

set -euo pipefail

# Resolve the script and project locations once, before anything can change
# the working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load IP configuration. The file is optional: when it is missing the
# built-in default addresses in NODES are used instead.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Report location; the timestamp keeps successive runs from overwriting
# each other.
REPORT_DIR="${PROJECT_ROOT}/reports/status"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
REPORT_FILE="${REPORT_DIR}/hardware_storage_investigation_${TIMESTAMP}.md"

# Colors
#
# ANSI escape sequences stored in their backslash form; the log helpers
# expand them at print time. NC resets the terminal to its default color.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
NC='\033[0m'

# Logging helpers: each prints one colorized, prefixed message to stdout.
# Globals:   BLUE, GREEN, YELLOW, RED, CYAN, MAGENTA, NC (read)
# Arguments: $1 - message text; printed literally (only the color variables
#                 are escape-expanded), empty when omitted
# log_section additionally surrounds its line with blank lines.
log_info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1:-}"; }
log_success() { printf '%b[✓]%b %s\n' "$GREEN" "$NC" "${1:-}"; }
log_warn() { printf '%b[⚠]%b %s\n' "$YELLOW" "$NC" "${1:-}"; }
log_error() { printf '%b[✗]%b %s\n' "$RED" "$NC" "${1:-}"; }
log_header() { printf '%b=== %s ===%b\n' "$CYAN" "${1:-}" "$NC"; }
log_section() { printf '\n%b>>> %s <<<%b\n\n' "$MAGENTA" "${1:-}" "$NC"; }

# Create report directory (no-op when it already exists)
mkdir -p -- "$REPORT_DIR"

# Proxmox nodes configuration
#
# Maps node name -> "<ip>:<root password>". ssh_node and check_node split each
# entry at the first colon, so the address part must not contain one.
#
# Addresses come from PROXMOX_HOST_* and passwords from PROXMOX_PASS_*, each
# falling back to the value that was previously built in.
# SECURITY / TODO: the fallback passwords are stored here in plain text.
# Supply PROXMOX_PASS_* from the environment (or switch to SSH keys) and
# remove the defaults once every caller does so.
declare -A NODES
NODES[ml110]="${PROXMOX_HOST_ML110:-192.168.11.10}:${PROXMOX_PASS_ML110:-L@kers2010}"
NODES[r630-01]="${PROXMOX_HOST_R630_01:-192.168.11.11}:${PROXMOX_PASS_R630_01:-password}"
NODES[r630-02]="${PROXMOX_HOST_R630_02:-192.168.11.12}:${PROXMOX_PASS_R630_02:-password}"

# SSH helper function
#
# Runs a command as root on one of the configured Proxmox nodes.
# Globals:   NODES (read) - "<ip>:<password>" entries keyed by node name
# Arguments: $1   - node name (a key of NODES)
#            $2.. - command and arguments to run remotely
# Inputs:    stdin is passed through to ssh (callers feed scripts this way)
# Returns:   1 when the node name is unknown, otherwise the exit status of
#            ssh / sshpass
ssh_node() {
  local hostname="$1"
  shift

  # Look the node up defensively: an unknown key would otherwise abort the
  # whole script under `set -u`.
  local entry="${NODES[$hostname]:-}"
  if [[ -z "$entry" ]]; then
    printf 'ssh_node: unknown node: %s\n' "$hostname" >&2
    return 1
  fi

  local ip="${entry%%:*}"
  local password="${entry#*:}"
  # NOTE(review): StrictHostKeyChecking=no disables host key verification;
  # kept as in the original behavior.
  local -a ssh_opts=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

  if command -v sshpass >/dev/null 2>&1; then
    # Hand the password over through the environment (-e reads SSHPASS)
    # instead of -p, which would expose it in the process argument list.
    SSHPASS="$password" sshpass -e ssh "${ssh_opts[@]}" "root@${ip}" "$@"
  else
    # Without sshpass, rely on key-based (or interactive) authentication.
    ssh "${ssh_opts[@]}" "root@${ip}" "$@"
  fi
}

# Check node connectivity
#
# Sends a single ICMP echo request (2 second timeout) to the node's address.
# Globals:   NODES (read)
# Arguments: $1 - node name (a key of NODES)
# Returns:   0 when the node answers, 1 when it does not or is not configured
check_node() {
  local hostname="$1"
  local entry="${NODES[$hostname]:-}"

  # An unknown node is simply "not reachable" rather than a fatal error.
  [[ -n "$entry" ]] || return 1

  local ip="${entry%%:*}"
  ping -c 1 -W 2 "$ip" >/dev/null 2>&1 || return 1
}

# Collect comprehensive hardware information
#
# Prints a multi-section plain-text inventory (system, CPU, memory, storage,
# network, chassis, PCI, guests, cluster, per-storage details) for one node.
# The probe script is sent to a remote `bash` over stdin via ssh_node; its
# quoted heredoc means nothing in it is expanded locally.
# Globals:   NODES (read)
# Arguments: $1 - node name (a key of NODES)
# Outputs:   progress messages and the inventory on stdout
# Returns:   1 when the node does not answer ping, otherwise the status of
#            the remote session
collect_hardware_info() {
  local hostname="$1"
  local ip="${NODES[$hostname]%%:*}"

  log_info "Collecting hardware specifications from $hostname ($ip)..."

  if ! check_node "$hostname"; then
    log_warn "$hostname is not reachable"
    return 1
  fi

  ssh_node "$hostname" bash <<'ENDSSH'
# Print at most $1 lines of the output of the command given in $3.., or the
# fallback text $2 when that command fails. The output is captured first
# because in `cmd | head || echo fallback` the status tested is head's, so
# the fallback could never be printed.
limited() {
  local max_lines="$1" fallback="$2" out
  shift 2
  if out="$("$@" 2>/dev/null)"; then
    [ -n "$out" ] && printf '%s\n' "$out" | head -n "$max_lines"
    return 0
  fi
  echo "$fallback"
}

# Node name used in the pvesh API paths below.
node="$(hostname)"

echo "=== SYSTEM INFORMATION ==="
echo "Hostname: $node"
pve_version="$(pveversion -v 2>/dev/null | head -1)"
echo "Proxmox Version: ${pve_version:-Unknown}"
echo "Kernel: $(uname -r)"
echo "OS: $(grep PRETTY_NAME /etc/os-release | cut -d'"' -f2)"
echo "Uptime: $(uptime -p)"
echo "Load Average: $(uptime | awk -F'load average:' '{print $2}')"
echo ""

echo "=== CPU INFORMATION ==="
cpu_info="$(lscpu)"
echo "CPU Model: $(grep 'Model name' <<<"$cpu_info" | cut -d: -f2 | xargs)"
echo "CPU Architecture: $(grep 'Architecture' <<<"$cpu_info" | cut -d: -f2 | xargs)"
echo "CPU Cores (Physical): $(grep '^Core(s) per socket' <<<"$cpu_info" | awk '{print $4}')"
echo "CPU Sockets: $(grep '^Socket(s)' <<<"$cpu_info" | awk '{print $2}')"
echo "CPU Threads per Core: $(grep '^Thread(s) per core' <<<"$cpu_info" | awk '{print $4}')"
echo "Total CPU Cores: $(nproc)"
echo "CPU Frequency: $(grep 'CPU max MHz' <<<"$cpu_info" | cut -d: -f2 | xargs) MHz"
echo "CPU Flags: $(grep '^Flags' <<<"$cpu_info" | cut -d: -f2 | xargs | cut -c1-100)..."
echo ""

echo "=== MEMORY INFORMATION ==="
mem_human="$(free -h)"
echo "Total Memory: $(grep Mem <<<"$mem_human" | awk '{print $2}')"
echo "Used Memory: $(grep Mem <<<"$mem_human" | awk '{print $3}')"
echo "Available Memory: $(grep Mem <<<"$mem_human" | awk '{print $7}')"
echo "Memory Usage: $(free | grep Mem | awk '{printf "%.1f%%", $3/$2 * 100.0}')"
echo "Swap Total: $(grep Swap <<<"$mem_human" | awk '{print $2}')"
echo "Swap Used: $(grep Swap <<<"$mem_human" | awk '{print $3}')"
echo ""
echo "DIMM Information:"
if command -v dmidecode >/dev/null 2>&1; then
  if dimm_info="$(dmidecode -t memory 2>/dev/null)"; then
    printf '%s\n' "$dimm_info" | grep -E "Size:|Speed:|Type:|Manufacturer:|Part Number:" | head -20
  else
    echo " Limited DIMM info available"
  fi
else
  echo " dmidecode not available"
fi
echo ""

echo "=== STORAGE INFORMATION ==="
echo "Physical Disks:"
lsblk -d -o NAME,SIZE,TYPE,MODEL,ROTA,MOUNTPOINT,FSTYPE 2>/dev/null | head -20
echo ""
echo "All Block Devices:"
lsblk -o NAME,SIZE,TYPE,MOUNTPOINT,FSTYPE,LABEL 2>/dev/null | head -30
echo ""
echo "Filesystem Usage:"
df -h | grep -E "Filesystem|/dev|rpool|/var/lib/vz|/boot"
echo ""
echo "Proxmox Storage Status:"
pvesm status 2>/dev/null || echo "pvesm not available"
echo ""
echo "Detailed Storage Information:"
limited 100 "Cannot get detailed storage info" pvesh get "/nodes/$node/storage"
echo ""
echo "LVM Volume Groups:"
vgs --units g 2>/dev/null || echo "No LVM VGs or vgs not available"
echo ""
echo "LVM Logical Volumes:"
lvs --units g -o lv_name,vg_name,lv_size,data_percent,metadata_percent,pool_lv 2>/dev/null | head -30
echo ""
echo "ZFS Pools (if any):"
zpool list 2>/dev/null || echo "No ZFS pools"
limited 20 "No ZFS datasets" zfs list
echo ""

echo "=== NETWORK INFORMATION ==="
echo "Network Interfaces:"
ip -o link show | awk '{print $2, $9}' | sed 's/:$//'
echo ""
echo "IP Addresses:"
ip addr show | grep -E "^[0-9]+:|inet " | head -20
echo ""
echo "Network Bridges:"
if [ -r /etc/network/interfaces ]; then
  grep -E "^auto|^iface|bridge" /etc/network/interfaces | head -30
else
  echo "Cannot read network config"
fi
echo ""

echo "=== MOTHERBOARD/CHASSIS INFORMATION ==="
if command -v dmidecode >/dev/null 2>&1; then
  echo "System Manufacturer: $(dmidecode -s system-manufacturer 2>/dev/null || echo 'Unknown')"
  echo "System Product Name: $(dmidecode -s system-product-name 2>/dev/null || echo 'Unknown')"
  echo "System Version: $(dmidecode -s system-version 2>/dev/null || echo 'Unknown')"
  echo "System Serial: $(dmidecode -s system-serial-number 2>/dev/null || echo 'Unknown')"
  echo "Baseboard Manufacturer: $(dmidecode -s baseboard-manufacturer 2>/dev/null || echo 'Unknown')"
  echo "Baseboard Product: $(dmidecode -s baseboard-product-name 2>/dev/null || echo 'Unknown')"
else
  echo "dmidecode not available for hardware details"
fi
echo ""

echo "=== PCI DEVICES ==="
limited 30 "lspci not available" lspci
echo ""

echo "=== VM/CONTAINER COUNT ==="
echo "QEMU VMs: $(qm list 2>/dev/null | tail -n +2 | wc -l)"
echo "LXC Containers: $(pct list 2>/dev/null | tail -n +2 | wc -l)"
echo ""

echo "=== CLUSTER STATUS ==="
limited 10 "Not in cluster or pvecm not available" pvecm status
echo ""

echo "=== STORAGE DETAILED INVESTIGATION ==="
echo "Checking all storage pools for missing data:"
# `pvesm status` prints: Name Type Status Total Used Available %.
# Query it once and read whole rows, so each storage is matched exactly
# (a prefix grep for "thin1" would also hit "thin1-r630-02") and each
# column ends up under the right label.
storage_table="$(pvesm status 2>/dev/null | tail -n +2)"
# The rows are read from fd 3 so commands inside the loop keep the script's
# own stdin untouched.
while read -r storage st_type st_state _total _used _avail st_pct <&3; do
  [ -n "$storage" ] || continue
  echo ""
  echo "Storage: $storage"
  echo " Status: $st_state"
  echo " Type: $st_type"
  echo " Usage: ${st_pct:-N/A}"

  # Fetch the API entry once; empty matches fall back to N/A explicitly.
  storage_cfg="$(pvesh get "/nodes/$node/storage/$storage" 2>/dev/null)"
  st_content="$(printf '%s\n' "$storage_cfg" | grep -oP 'content.*?:\s*\K[^,}]+' | head -1)"
  st_nodes="$(printf '%s\n' "$storage_cfg" | grep -oP 'nodes.*?:\s*\K[^,}]+' | head -1)"
  st_disable="$(printf '%s\n' "$storage_cfg" | grep -oP 'disable.*?:\s*\K[^,}]+' | head -1)"
  echo " Content: ${st_content:-N/A}"
  echo " Node: ${st_nodes:-N/A}"
  # NOTE(review): this prints the value matched for "disable" under the
  # label "Enabled" - confirm the intended meaning.
  echo " Enabled: ${st_disable:-N/A}"

  # Try to get detailed status
  storage_status="$(pvesh get "/nodes/$node/storage/$storage/status" 2>/dev/null || echo "")"
  if [ -n "$storage_status" ]; then
    echo " Detailed Status Available: Yes"
    echo "$storage_status" | head -5
  else
    echo " Detailed Status Available: No (may indicate issue)"
  fi
done 3<<<"$storage_table"
echo ""
ENDSSH
}

# Generate comprehensive report

# Prints a condensed hardware summary (system, CPU, memory, disks, NICs) for
# node $1 by running a probe script on it through ssh_node.
_remote_hardware_summary() {
  ssh_node "$1" bash <<'ENDSSH'
echo "=== SYSTEM INFORMATION ==="
echo "Hostname: $(hostname)"
pve_version="$(pveversion -v 2>/dev/null | head -1)"
echo "Proxmox Version: ${pve_version:-Unknown}"
echo "Kernel: $(uname -r)"
echo "Uptime: $(uptime -p)"
echo ""
echo "=== CPU INFORMATION ==="
echo "CPU Model: $(lscpu | grep 'Model name' | cut -d: -f2 | xargs)"
echo "CPU Cores (Physical): $(lscpu | grep '^Core(s) per socket' | awk '{print $4}')"
echo "CPU Sockets: $(lscpu | grep '^Socket(s)' | awk '{print $2}')"
echo "Total CPU Cores: $(nproc)"
echo "CPU Frequency: $(lscpu | grep 'CPU max MHz' | cut -d: -f2 | xargs) MHz"
echo ""
echo "=== MEMORY INFORMATION ==="
echo "Total Memory: $(free -h | grep Mem | awk '{print $2}')"
echo "Used Memory: $(free -h | grep Mem | awk '{print $3}')"
echo "Available Memory: $(free -h | grep Mem | awk '{print $7}')"
echo "Memory Usage: $(free | grep Mem | awk '{printf "%.1f%%", $3/$2 * 100.0}')"
echo ""
echo "=== STORAGE INFORMATION ==="
echo "Physical Disks:"
lsblk -d -o NAME,SIZE,TYPE,MODEL,ROTA 2>/dev/null | head -10
echo ""
echo "Proxmox Storage Status:"
pvesm status 2>/dev/null || echo "pvesm not available"
echo ""
echo "=== SYSTEM HARDWARE ==="
if command -v dmidecode >/dev/null 2>&1; then
  echo "System Manufacturer: $(dmidecode -s system-manufacturer 2>/dev/null || echo 'Unknown')"
  echo "System Product: $(dmidecode -s system-product-name 2>/dev/null || echo 'Unknown')"
fi
echo ""
echo "=== NIC MODELS (lspci) ==="
lspci -nn 2>/dev/null | grep -iE 'ethernet|network' || echo "No NICs found"
echo ""
echo "=== NIC INTERFACES (physical) ==="
for i in nic0 nic1 nic2 nic3; do
  [ -d "/sys/class/net/$i" ] || continue
  echo "--- $i ---"
  ethtool "$i" 2>/dev/null | grep -E 'Speed|Link detected|Driver' || echo " (ethtool unavailable)"
done
ENDSSH
}

# Prints `pvesm status` plus the pvesh entry and status of every storage
# known to node $1, flagging storages whose data cannot be fetched.
_remote_storage_details() {
  ssh_node "$1" bash <<'ENDSSH'
node="$(hostname)"
echo "=== All Storage Pools ==="
pvesm status 2>/dev/null || echo "pvesm not available"
echo ""
echo "=== Detailed Storage Information ==="
for storage in $(pvesm status 2>/dev/null | awk '{print $1}' | tail -n +2); do
  echo ""
  echo "--- Storage: $storage ---"
  # Capture before truncating: with `pvesh ... | head || echo ...` the
  # status tested is head's, so the failure messages could never appear.
  if details="$(pvesh get "/nodes/$node/storage/$storage" 2>/dev/null)"; then
    printf '%s\n' "$details" | head -20
  else
    echo "Cannot get storage details"
  fi
  echo ""
  echo "Storage Status:"
  if status_out="$(pvesh get "/nodes/$node/storage/$storage/status" 2>/dev/null)"; then
    printf '%s\n' "$status_out" | head -10
  else
    echo "Cannot get storage status (THIS MAY INDICATE THE ISSUE)"
  fi
  echo ""
done
ENDSSH
}

# Appends the fixed recommendation, comparison and conclusion sections
# (plus the generation footer) to $REPORT_FILE.
_append_report_recommendations() {
  cat >> "$REPORT_FILE" <<EOF

## Hardware Optimization Recommendations

### CPU Optimization

#### ml110 (6 cores, 81.5% CPU usage)
**Current State:**
- High CPU utilization (81.5%)
- Limited CPU cores (6 total)
- Likely older/slower CPU architecture

**Recommendations:**
1. **Immediate Actions:**
- Identify CPU-intensive VMs/containers
- Consider migrating heavy workloads to r630-01 or r630-02
- Review and optimize running services
- Check for runaway processes

2. **Workload Distribution:**
- Move database workloads to r630-01 or r630-02
- Keep lightweight management VMs on ml110
- Distribute blockchain nodes across all three hosts

3. **Long-term:**
- Consider CPU upgrade if possible
- Evaluate if ml110 should be used primarily for management/light workloads

#### r630-01 (32 cores, 8.2% CPU usage)
**Current State:**
- Very low CPU utilization (8.2%)
- High core count (32 cores)
- Underutilized resource

**Recommendations:**
1. **Immediate Actions:**
- Migrate CPU-intensive VMs from ml110 to r630-01
- Consider consolidating workloads
- Enable CPU-intensive services

2. **Optimization:**
- Use for database servers
- Host blockchain validator nodes
- Run compute-intensive applications

#### r630-02 (56 cores, 5.3% CPU usage)
**Current State:**
- Extremely low CPU utilization (5.3%)
- Highest core count (56 cores)
- Severely underutilized

**Recommendations:**
1. **Immediate Actions:**
- Migrate heavy workloads from ml110
- Distribute blockchain nodes
- Host high-performance VMs

2. **Optimization:**
- Primary host for compute-intensive workloads
- Database cluster nodes
- High-performance application servers

### Memory Optimization

#### Current Memory Usage
- **ml110:** 44.4% - Moderate usage
- **r630-01:** 3.4% - Very low usage
- **r630-02:** 5.4% - Very low usage

**Recommendations:**
1. **Memory Distribution:**
- r630-01 and r630-02 have significant unused memory
- Consider increasing memory allocation to VMs
- Enable memory-intensive services

2. **Optimization:**
- Use r630-01 and r630-02 for memory-intensive workloads
- Consider in-memory databases
- Enable caching services

### Storage Optimization

#### Critical Issues

1. **thin2 (r630-02) at 88.9%**
- **Action Required:** Immediate cleanup or expansion
- **Recommendations:**
- Clean up unused snapshots
- Remove old backups
- Migrate VMs to other storage pools
- Expand storage if possible

2. **Missing Storage Data**
- **thin1 (ml110)** - No usage data
- **data (r630-02)** - No usage data
- **thin1 (r630-02)** - No usage data

**Possible Causes:**
- Storage pool not properly configured
- Storage pool disabled
- Network storage not accessible
- Storage pool on different node
- API/permission issues

**Investigation Steps:**
\`\`\`bash
# On each node, check:
pvesm status
pvesh get /nodes/<node>/storage/<storage-name>
pvesh get /nodes/<node>/storage/<storage-name>/status

# Check storage configuration:
cat /etc/pve/storage.cfg
\`\`\`

#### Storage Distribution Recommendations

**ml110 Storage:**
- data: 25.6% - Healthy
- local: 8.2% - Healthy
- local-lvm: 25.6% - Healthy
- thin1: **INVESTIGATE** - Missing data

**r630-01 Storage:**
- data: 13.4% - Healthy
- local: 0.0% - Underutilized
- local-lvm: 13.4% - Healthy
- thin1: 42.6% - Good utilization

**r630-02 Storage:**
- data: **INVESTIGATE** - Missing data
- local: 3.3% - Healthy
- thin1: **INVESTIGATE** - Missing data
- thin1-r630-02: 0.3% - Underutilized
- thin2: **88.9% - CRITICAL** - Needs immediate attention
- thin3: 3.1% - Healthy
- thin4: 22.6% - Healthy
- thin5: 0.0% - Underutilized

### Resource Distribution Strategy

#### Recommended VM/Container Distribution

**ml110 (Management/Light Workloads):**
- Management VMs
- Lightweight services
- Monitoring tools
- DNS/DHCP services
- Target: 10-15 VMs/containers

**r630-01 (Medium Workloads):**
- Database servers
- Application servers
- Blockchain RPC nodes
- Medium-performance workloads
- Target: 15-20 VMs/containers

**r630-02 (Heavy Workloads):**
- High-performance databases
- Blockchain validator nodes
- Compute-intensive applications
- High-memory workloads
- Target: 20-25 VMs/containers

### Performance Optimization

1. **CPU Affinity:**
- Pin critical VMs to specific CPU cores
- Use CPU sets for isolation
- Optimize NUMA if applicable

2. **Memory Optimization:**
- Enable ballooning for better memory utilization
- Use memory overcommitment carefully
- Monitor memory pressure

3. **Storage I/O:**
- Use SSD storage for high-I/O workloads
- Separate storage pools by performance tier
- Optimize thin pool metadata

4. **Network Optimization:**
- Use dedicated network for storage
- Optimize bridge configurations
- Consider SR-IOV for high-performance VMs

### Immediate Action Items

#### Critical (Do First)
1. ⚠️ **Investigate missing storage data** for thin1 (ml110), data (r630-02), thin1 (r630-02)
2. ⚠️ **Address thin2 (r630-02) at 88.9%** - Clean up or expand immediately
3. ⚠️ **Migrate CPU-intensive workloads** from ml110 to r630-01 or r630-02

#### High Priority
1. ⚠️ **Redistribute workloads** to balance resource utilization
2. ⚠️ **Verify storage pool configurations** and accessibility
3. ⚠️ **Set up monitoring** for storage usage and CPU load
4. ⚠️ **Review and optimize** VM/container resource allocations

#### Recommended
1. ⚠️ **Implement automated load balancing**
2. ⚠️ **Create storage usage alerts** (>80% threshold)
3. ⚠️ **Document hardware specifications** and capabilities
4. ⚠️ **Plan for future capacity** expansion

---

## Detailed Hardware Comparison

| Host | CPU Cores | CPU Usage | Memory Usage | Disk Usage | Uptime | Status |
|------|-----------|-----------|--------------|------------|--------|--------|
| ml110 | 6 | 81.5% ⚠️ | 44.4% | 8.2% | 28+ days | 🟢 Overloaded |
| r630-01 | 32 | 8.2% | 3.4% | 0.8% | 21+ days | 🟢 Underutilized |
| r630-02 | 56 | 5.3% | 5.4% | 1.5% | 6+ days | 🟢 Severely Underutilized |

**Key Observations:**
- ml110 is CPU-bound and needs workload redistribution
- r630-01 and r630-02 have significant unused capacity
- All nodes have healthy memory and disk (except thin2 on r630-02)
- Storage data missing for 3 storage pools needs investigation

---

## Conclusion

This investigation reveals:
- ✅ Complete hardware specifications for all three hosts
- ⚠️ Critical storage issue: thin2 (r630-02) at 88.9%
- ⚠️ Missing storage data for 3 storage pools requiring investigation
- ⚠️ Significant CPU imbalance: ml110 overloaded, r630-01/r630-02 underutilized
- ✅ All nodes healthy and operational

**Next Steps:**
1. Investigate and resolve missing storage data
2. Address thin2 storage capacity issue
3. Redistribute workloads to balance CPU utilization
4. Implement monitoring and alerting
5. Optimize resource allocation

---

**Report Generated:** $(date)
**Report File:** $REPORT_FILE

EOF
}

# Writes the Markdown investigation report to $REPORT_FILE: a header, one
# live hardware summary per node, one live storage investigation per
# reachable node, and the fixed recommendation sections.
# Globals:   NODES (read), REPORT_FILE (read; the file is overwritten)
# Arguments: none
# Outputs:   progress messages on stdout; the report on disk
generate_report() {
  local -a hosts=()
  local host ip hw_summary storage_details

  log_header "Generating Hardware and Storage Investigation Report"

  # Sort the node names so report sections appear in a stable order instead
  # of the associative array's internal order.
  if (( ${#NODES[@]} > 0 )); then
    mapfile -t hosts < <(printf '%s\n' "${!NODES[@]}" | sort)
  fi

  cat > "$REPORT_FILE" <<EOF
# Proxmox VE Hardware Specifications and Storage Investigation

**Date:** $(date)
**Report Generated:** $(date -u +"%Y-%m-%d %H:%M:%S UTC")
**Investigation Scope:** All three Proxmox hosts (ml110, r630-01, r630-02)

---

## Executive Summary

This report provides:
- ✅ Complete hardware specifications for all three hosts
- ✅ Detailed storage investigation including missing data analysis
- ✅ Hardware optimization recommendations
- ✅ Storage optimization recommendations
- ✅ Resource distribution analysis

---

## Hardware Specifications

EOF

  # Process each node
  for host in "${hosts[@]}"; do
    ip="${NODES[$host]%%:*}"

    log_info "Processing hardware data for $host..."

    if check_node "$host"; then
      # A failed or partial remote run must not abort the report under
      # `set -e`; whatever was printed is still included.
      hw_summary="$(_remote_hardware_summary "$host")" || true
    else
      log_warn "$host is not reachable; hardware section left without data"
      hw_summary="Node unreachable - no data collected"
    fi

    cat >> "$REPORT_FILE" <<EOF

### $host ($ip)

\`\`\`
$hw_summary
\`\`\`

---

EOF
  done

  # Add storage investigation section
  cat >> "$REPORT_FILE" <<EOF

## Storage Investigation - Missing Data Analysis

### Storage Pools with Missing Usage Data

Based on the screenshot analysis, the following storage pools show missing disk usage data:

1. **thin1 (ml110)** - No disk usage data
2. **data (r630-02)** - No disk usage data
3. **thin1 (r630-02)** - No disk usage data

### Investigation Results

EOF

  # Investigate each problematic storage (unreachable nodes are skipped)
  for host in "${hosts[@]}"; do
    if ! check_node "$host"; then
      continue
    fi

    storage_details="$(_remote_storage_details "$host")" || true

    cat >> "$REPORT_FILE" <<EOF

#### $host Storage Investigation

\`\`\`
$storage_details
\`\`\`

---

EOF
  done

  # Add optimization recommendations
  _append_report_recommendations

  log_success "Report generated: $REPORT_FILE"
}

# Main execution
#
# Probes every configured node, generates the report and prints a summary
# with pointers to the report file.
# Globals:   NODES (read), REPORT_FILE (read)
# Arguments: ignored
main() {
  local node collected

  log_header "Proxmox Hardware Specifications and Storage Investigation"
  echo ""

  # Collect hardware info from all nodes
  # NOTE(review): nothing in this file reads HARDWARE_DATA afterwards;
  # generate_report queries the nodes again on its own.
  local -A HARDWARE_DATA=()
  for node in "${!NODES[@]}"; do
    log_section "Collecting data from $node"
    if collected="$(collect_hardware_info "$node" 2>&1)"; then
      HARDWARE_DATA["$node"]="$collected"
    else
      HARDWARE_DATA["$node"]="${collected}${collected:+$'\n'}Failed to collect data"
      # The collector's own messages are captured above, so report the
      # failure to the user explicitly.
      log_warn "Failed to collect data from $node"
    fi
    echo ""
  done

  # Generate report
  generate_report

  # Display summary
  log_header "Investigation Summary"
  echo ""
  log_info "Report saved to: $REPORT_FILE"
  echo ""
  log_success "Hardware and storage investigation complete!"
  log_info "View full report: cat $REPORT_FILE"
  echo ""
  log_info "Quick access:"
  echo " cat $REPORT_FILE | less"
  echo " cat $REPORT_FILE | grep -A 20 'Critical'"
}

# Run main function, forwarding the script's command-line arguments
main "$@"