#!/usr/bin/env bash
# proxmox/scripts/review-all-storage.sh.bak
# Comprehensive Proxmox Storage Review and Recommendations
# Reviews all storage across all Proxmox nodes and provides recommendations

# Strict mode: abort on unhandled errors, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit
set -o nounset
set -o pipefail

# Absolute directory holding this script, and the project root one level up.
SCRIPT_DIR="$(
    cd "$(dirname "${BASH_SOURCE[0]}")" && pwd
)"
PROJECT_ROOT="$(
    cd "${SCRIPT_DIR}/.." && pwd
)"

# One timestamped Markdown report per run, under <project>/reports/storage.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
REPORT_DIR="${PROJECT_ROOT}/reports/storage"
REPORT_FILE="${REPORT_DIR}/storage_review_${TIMESTAMP}.md"

# Colors
# Stored as literal backslash sequences; `echo -e` in the helpers below turns
# them into real ANSI escapes when a line is printed.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Print "<color><tag><reset> <message>" to stdout.
#   $1 - color sequence, $2 - tag text, $3 - message
# Backslash escapes inside the message are interpreted as well (echo -e).
_log_tagged() {
    local color="$1" tag="$2" text="$3"
    echo -e "${color}${tag}${NC} ${text}"
}

# Level-specific wrappers; each takes the message as $1.
log_info() { _log_tagged "$BLUE" "[INFO]" "$1"; }
log_success() { _log_tagged "$GREEN" "[✓]" "$1"; }
log_warn() { _log_tagged "$YELLOW" "[⚠]" "$1"; }
log_error() { _log_tagged "$RED" "[✗]" "$1"; }

# Section banner: the whole "=== title ===" line is colored.
log_header() {
    local title="$1"
    echo -e "${CYAN}=== ${title} ===${NC}"
}

# Make sure the report output directory exists before anything writes to it
if [[ ! -d "$REPORT_DIR" ]]; then
    mkdir -p "$REPORT_DIR"
fi

# Proxmox nodes configuration
# Each entry is "ip:password" keyed by node name. Consumers take the text
# before the first ':' as the IP and everything after it as the password.
#
# SECURITY: root passwords must not live in source control. Each password can
# be supplied through the matching PROXMOX_PASS_* environment variable; the
# literals below remain only as fallbacks so existing invocations keep
# working. Rotate these passwords and remove the fallbacks (or switch to SSH
# keys) as soon as the environment variables are in place.
declare -A NODES
NODES[ml110]="192.168.11.10:${PROXMOX_PASS_ML110:-L@kers2010}"
NODES[r630-01]="192.168.11.11:${PROXMOX_PASS_R630_01:-password}"
NODES[r630-02]="192.168.11.12:${PROXMOX_PASS_R630_02:-password}"
NODES[r630-03]="192.168.11.13:${PROXMOX_PASS_R630_03:-L@kers2010}"
NODES[r630-04]="192.168.11.14:${PROXMOX_PASS_R630_04:-L@kers2010}"

# Storage data collection
# Per-node results keyed "<node>_<field>", where field is one of:
# storage, vgs, lvs, disks, vms, cts, mem, cpu.
declare -A STORAGE_DATA

# SSH helper function
# Run a command on a node as root over SSH.
# Globals:   NODES (read) - "ip:password" entries keyed by node name
# Arguments: $1 - node name (key in NODES); remaining args - remote command
# Outputs:   whatever the remote command / ssh writes to stdout and stderr
# Returns:   exit status of sshpass/ssh; 1 when the node name is not in NODES
ssh_node() {
    local hostname="$1"
    shift

    # Fail with a clear message instead of tripping `set -u` on a bad key.
    if [[ -z "${NODES[$hostname]+set}" ]]; then
        echo "ssh_node: unknown node '$hostname'" >&2
        return 1
    fi

    local ip="${NODES[$hostname]%%:*}"
    local password="${NODES[$hostname]#*:}"
    local -a ssh_cmd=(
        ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@${ip}" "$@"
    )

    if command -v sshpass >/dev/null 2>&1; then
        # -e makes sshpass read the password from $SSHPASS, so it never
        # appears on the command line (where `ps` could show it).
        SSHPASS="$password" sshpass -e "${ssh_cmd[@]}"
    else
        # No sshpass available: rely on key-based (or interactive) auth.
        "${ssh_cmd[@]}"
    fi
}

# Check node connectivity
# Sends one ping (-c 1, -W 2) to the IP part of the node's NODES entry.
# Arguments: $1 - node name (key in NODES)
# Returns:   0 when ping succeeds, 1 otherwise; ping's output is discarded
check_node() {
    local node="$1"
    local addr="${NODES[$node]%%:*}"

    ping -c 1 -W 2 "$addr" >/dev/null 2>&1 || return 1
    return 0
}

# Collect storage information from a node
# Gathers storage, LVM, disk, guest-count and resource details over SSH and
# stores them in STORAGE_DATA under "<node>_<field>" keys.
# Globals:   NODES (read), STORAGE_DATA (written)
# Arguments: $1 - node name (key in NODES)
# Outputs:   progress and warnings via log_info / log_warn / log_success
# Returns:   0 when data was stored; 1 when the node does not answer ping or
#            SSH access fails (nothing is stored in either case)
collect_storage_info() {
    local hostname="$1"
    local ip="${NODES[$hostname]%%:*}"

    log_info "Collecting storage information from $hostname ($ip)..."

    if ! check_node "$hostname"; then
        log_warn "$hostname is not reachable"
        return 1
    fi

    # Every query below falls back to an empty/zero value on failure, so a
    # node that answers ping but rejects SSH would otherwise be recorded as
    # "collected" with all-zero data. Probe SSH once and bail out instead.
    if ! ssh_node "$hostname" true >/dev/null; then
        log_warn "$hostname responds to ping but SSH access failed"
        return 1
    fi

    local storage_status vgs_info lvs_info disk_info
    local vm_count ct_count mem_info cpu_info

    # Collect storage status
    storage_status=$(ssh_node "$hostname" 'pvesm status 2>/dev/null' || echo "")

    # Collect LVM information
    vgs_info=$(ssh_node "$hostname" 'vgs --units g --noheadings -o vg_name,vg_size,vg_free 2>/dev/null' || echo "")
    lvs_info=$(ssh_node "$hostname" 'lvs --units g --noheadings -o lv_name,vg_name,lv_size,data_percent,metadata_percent,pool_lv 2>/dev/null | grep -E "(thin|data)"' || echo "")

    # Collect disk information
    disk_info=$(ssh_node "$hostname" 'lsblk -d -o NAME,SIZE,TYPE,MOUNTPOINT 2>/dev/null' || echo "")

    # Collect VM/container count (list output minus its header line)
    vm_count=$(ssh_node "$hostname" 'qm list 2>/dev/null | tail -n +2 | wc -l' || echo "0")
    ct_count=$(ssh_node "$hostname" 'pct list 2>/dev/null | tail -n +2 | wc -l' || echo "0")

    # Collect system resources (columns 2, 3 and 7 of the `free -h` Mem row)
    mem_info=$(ssh_node "$hostname" 'free -h | grep Mem | awk "{print \$2,\$3,\$7}"' || echo "")
    cpu_info=$(ssh_node "$hostname" 'nproc' || echo "0")

    # Store data
    STORAGE_DATA["${hostname}_storage"]="$storage_status"
    STORAGE_DATA["${hostname}_vgs"]="$vgs_info"
    STORAGE_DATA["${hostname}_lvs"]="$lvs_info"
    STORAGE_DATA["${hostname}_disks"]="$disk_info"
    STORAGE_DATA["${hostname}_vms"]="$vm_count"
    STORAGE_DATA["${hostname}_cts"]="$ct_count"
    STORAGE_DATA["${hostname}_mem"]="$mem_info"
    STORAGE_DATA["${hostname}_cpu"]="$cpu_info"

    log_success "Collected data from $hostname"
}

# Generate storage report

# Succeed when any line of the given `pvesm status` text carries a usage
# percentage whose integer part is at or above the threshold.
# Arguments: $1 - status text, $2 - threshold in percent (default 80)
# Returns:   0 when such a line exists, 1 otherwise
_storage_usage_at_least() {
    local status_text="$1"
    local threshold="${2:-80}"
    # A number (optionally with decimals) directly followed by '%', starting
    # a whitespace-separated field; group 2 is the integer part.
    local pct_re='(^|[[:space:]])([0-9]+)(\.[0-9]+)?%'
    local row

    while IFS= read -r row; do
        # 10# forces base 10 so a value such as "08" is not read as octal.
        if [[ $row =~ $pct_re ]] && (( 10#${BASH_REMATCH[2]} >= threshold )); then
            return 0
        fi
    done <<<"$status_text"
    return 1
}

# Write the Markdown storage review to $REPORT_FILE (the file is overwritten).
# Globals:   NODES (read), STORAGE_DATA (read), REPORT_FILE (read)
# Calls:     check_node once per node for the reachability line; log_header
#            and log_success for progress output
# Arguments: none
generate_report() {
    log_header "Generating Storage Review Report"

    local hostname ip reach_label storage_status

    cat > "$REPORT_FILE" <<EOF
# Proxmox Storage Comprehensive Review

**Date:** $(date)
**Report Generated:** $(date -u +"%Y-%m-%d %H:%M:%S UTC")
**Review Scope:** All Proxmox nodes and storage configurations

---

## Executive Summary

This report provides a comprehensive review of all storage configurations across all Proxmox nodes, including:
- Current storage status and usage
- Storage type analysis
- Performance recommendations
- Capacity planning
- Optimization suggestions

---

## Node Overview

EOF

    # Process each node
    for hostname in "${!NODES[@]}"; do
        ip="${NODES[$hostname]%%:*}"

        if check_node "$hostname"; then
            reach_label="✅ Reachable"
        else
            reach_label="❌ Not Reachable"
        fi

        # Fields missing from STORAGE_DATA (node skipped or collection
        # failed) fall back to the placeholder after ':-'.
        cat >> "$REPORT_FILE" <<EOF

### $hostname ($ip)

**Status:** $reach_label

**System Resources:**
- CPU Cores: ${STORAGE_DATA[${hostname}_cpu]:-Unknown}
- Memory: ${STORAGE_DATA[${hostname}_mem]:-Unknown}
- VMs: ${STORAGE_DATA[${hostname}_vms]:-0}
- Containers: ${STORAGE_DATA[${hostname}_cts]:-0}

**Storage Status:**
\`\`\`
${STORAGE_DATA[${hostname}_storage]:-No storage data available}
\`\`\`

**Volume Groups:**
\`\`\`
${STORAGE_DATA[${hostname}_vgs]:-No volume groups found}
\`\`\`

**Thin Pools:**
\`\`\`
${STORAGE_DATA[${hostname}_lvs]:-No thin pools found}
\`\`\`

**Physical Disks:**
\`\`\`
${STORAGE_DATA[${hostname}_disks]:-No disk information available}
\`\`\`

---

EOF
    done

    # Add recommendations section
    cat >> "$REPORT_FILE" <<EOF

## Storage Analysis and Recommendations

### 1. Storage Type Analysis

#### Local Storage (Directory-based)
- **Purpose:** ISO images, container templates, backups
- **Performance:** Good for read-heavy workloads
- **Recommendation:** Use for templates and ISOs, not for VM disks

#### LVM Thin Storage
- **Purpose:** VM/container disk images
- **Performance:** Excellent with thin provisioning
- **Benefits:** Space efficiency, snapshots, cloning
- **Recommendation:** ✅ **Preferred for VM/container disks**

#### ZFS Storage
- **Purpose:** High-performance VM storage
- **Performance:** Excellent with compression and deduplication
- **Benefits:** Data integrity, snapshots, clones
- **Recommendation:** Consider for high-performance workloads

### 2. Critical Issues and Fixes

EOF

    # Analyze each node and add recommendations
    for hostname in "${!NODES[@]}"; do
        storage_status="${STORAGE_DATA[${hostname}_storage]:-}"
        ip="${NODES[$hostname]%%:*}"

        # Nothing to analyze for nodes without collected storage status.
        if [[ -z "$storage_status" ]]; then
            continue
        fi

        cat >> "$REPORT_FILE" <<EOF

#### $hostname Storage Issues

EOF

        # Check for disabled storage.
        # Here-strings rather than `echo | grep -q`: under `pipefail` an
        # early grep exit could otherwise make the pipeline report failure.
        if grep -q "disabled\|inactive" <<<"$storage_status"; then
            cat >> "$REPORT_FILE" <<EOF
⚠️ **Issue:** Some storage pools are disabled or inactive

**Action Required:**
\`\`\`bash
ssh root@${ip}
pvesm status
# Enable disabled storage:
pvesm set <storage-name> --disable 0
\`\`\`

EOF
        fi

        # Check for high usage: compare the numeric percentage instead of
        # pattern-matching digits, which flagged any two-digit value.
        if _storage_usage_at_least "$storage_status" 80; then
            cat >> "$REPORT_FILE" <<EOF
⚠️ **Issue:** Storage usage is high (>80%)

**Recommendation:**
- Monitor storage usage closely
- Plan for expansion or cleanup
- Consider migrating VMs to other nodes

EOF
        fi

        # Check for missing LVM thin storage
        if ! grep -qE "lvmthin|thin" <<<"$storage_status"; then
            cat >> "$REPORT_FILE" <<EOF
⚠️ **Issue:** No LVM thin storage configured

**Recommendation:**
- Configure LVM thin storage for better performance
- Use thin provisioning for space efficiency
- Enable snapshots and cloning capabilities

EOF
        fi
    done

    # Add general recommendations
    cat >> "$REPORT_FILE" <<EOF

### 3. Performance Optimization Recommendations

#### Storage Performance Best Practices

1. **Use LVM Thin for VM Disks**
   - Better performance than directory storage
   - Thin provisioning saves space
   - Enables snapshots and cloning

2. **Monitor Thin Pool Metadata Usage**
   - Thin pools require metadata space
   - Monitor metadata_percent in lvs output
   - Expand metadata if >80% used

3. **Storage Distribution**
   - Distribute VMs across multiple nodes
   - Balance storage usage across nodes
   - Avoid overloading single node

4. **Backup Storage Strategy**
   - Use separate storage for backups
   - Consider NFS or Ceph for shared backups
   - Implement backup rotation policies

### 4. Capacity Planning

#### Current Storage Distribution

EOF

    # TODO: no per-node totals are computed yet, so the "Current Storage
    # Distribution" heading above is followed directly by the static advice.
    cat >> "$REPORT_FILE" <<EOF

**Recommendations:**
- Monitor storage growth trends
- Plan for 20-30% headroom
- Set alerts at 80% usage
- Consider storage expansion before reaching capacity

### 5. Storage Type Recommendations by Use Case

| Use Case | Recommended Storage Type | Reason |
|----------|-------------------------|--------|
| VM/Container Disks | LVM Thin (lvmthin) | Best performance, thin provisioning |
| ISO Images | Directory (dir) | Read-only, no performance impact |
| Container Templates | Directory (dir) | Templates are read-only |
| Backups | Directory or NFS | Separate from production storage |
| High-Performance VMs | ZFS or LVM Thin | Best I/O performance |
| Development/Test | LVM Thin | Space efficient with cloning |

### 6. Security Recommendations

1. **Storage Access Control**
   - Review storage.cfg node restrictions
   - Ensure proper node assignments
   - Verify storage permissions

2. **Backup Security**
   - Encrypt backups if containing sensitive data
   - Store backups off-site
   - Test backup restoration regularly

### 7. Monitoring Recommendations

1. **Set Up Storage Monitoring**
   - Monitor storage usage (>80% alert)
   - Monitor thin pool metadata usage
   - Track storage growth trends

2. **Performance Monitoring**
   - Monitor I/O latency
   - Track storage throughput
   - Identify bottlenecks

3. **Automated Alerts**
   - Storage usage >80%
   - Thin pool metadata >80%
   - Storage errors or failures

### 8. Migration Recommendations

#### Workload Distribution

**Current State:**
- ml110: Hosting all VMs (overloaded)
- r630-01/r630-02: Underutilized

**Recommended Distribution:**
- **ml110:** Keep management/lightweight VMs (10-15 VMs)
- **r630-01:** Migrate medium workload VMs (10-15 VMs)
- **r630-02:** Migrate heavy workload VMs (10-15 VMs)

**Benefits:**
- Better performance (ml110 CPU is slower)
- Better resource utilization
- Improved redundancy
- Better storage distribution

### 9. Immediate Action Items

#### Critical (Do First)
1. ✅ Review storage status on all nodes
2. ⚠️ Enable disabled storage pools
3. ⚠️ Verify storage node restrictions in storage.cfg
4. ⚠️ Check for storage errors or warnings

#### High Priority
1. ⚠️ Configure LVM thin storage where missing
2. ⚠️ Set up storage monitoring and alerts
3. ⚠️ Plan VM migration for better distribution
4. ⚠️ Review and optimize storage.cfg

#### Recommended
1. ⚠️ Implement backup storage strategy
2. ⚠️ Consider shared storage (NFS/Ceph) for HA
3. ⚠️ Optimize storage performance settings
4. ⚠️ Document storage procedures

---

## Detailed Storage Commands Reference

### Check Storage Status
\`\`\`bash
# On any Proxmox node
pvesm status
pvesm list <storage-name>
\`\`\`

### Enable Disabled Storage
\`\`\`bash
pvesm set <storage-name> --disable 0
\`\`\`

### Check LVM Configuration
\`\`\`bash
vgs                    # List volume groups
lvs                    # List logical volumes
lvs -o +data_percent,metadata_percent  # Check thin pool usage
\`\`\`

### Check Disk Usage
\`\`\`bash
df -h                  # Filesystem usage
lsblk                  # Block devices
\`\`\`

### Storage Performance Testing
\`\`\`bash
# Test storage I/O
fio --name=test --ioengine=libaio --iodepth=16 --rw=randwrite --bs=4k --size=1G --runtime=60
\`\`\`

---

## Conclusion

This comprehensive storage review provides:
- ✅ Current storage status across all nodes
- ✅ Detailed analysis of storage configurations
- ✅ Performance optimization recommendations
- ✅ Capacity planning guidance
- ✅ Security and monitoring recommendations
- ✅ Migration and distribution strategies

**Next Steps:**
1. Review this report
2. Address critical issues first
3. Implement high-priority recommendations
4. Plan for long-term optimizations

---

**Report Generated:** $(date)
**Report File:** $REPORT_FILE

EOF

    log_success "Report generated: $REPORT_FILE"
}

# Main execution
# Collects data from every node in NODES, writes the report, then prints a
# short per-node summary of VM and container counts.
main() {
    log_header "Proxmox Storage Comprehensive Review"
    echo

    # Gather per-node data; a failing node is logged and the run continues.
    for hostname in "${!NODES[@]}"; do
        if ! collect_storage_info "$hostname"; then
            log_warn "Failed to collect data from $hostname"
        fi
        echo
    done

    # Write the Markdown report from whatever was collected.
    generate_report

    # Console summary
    log_header "Review Summary"
    echo
    log_info "Report saved to: $REPORT_FILE"
    echo
    log_info "Quick Summary:"

    for hostname in "${!NODES[@]}"; do
        if ! check_node "$hostname"; then
            echo "  $hostname: Not reachable"
            continue
        fi

        # Counts default to 0 when nothing was stored for the node.
        local vm_total="${STORAGE_DATA["${hostname}_vms"]:-0}"
        local ct_total="${STORAGE_DATA["${hostname}_cts"]:-0}"
        echo "  $hostname: $vm_total VMs, $ct_total Containers"
    done

    echo
    log_success "Storage review complete!"
    log_info "View full report: cat $REPORT_FILE"
}

# Script entry point: run main, forwarding every command-line argument
# unchanged.
main "$@"