Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
- ADD_CHAIN138_TO_LEDGER_LIVE: Ledger form done; public code review repo bis-innovations/LedgerLive; init/push commands - CONTRACT_DEPLOYMENT_RUNBOOK: Chain 138 gas price 1 gwei, 36-addr check, TransactionMirror workaround - CONTRACT_*: AddressMapper, MirrorManager deployed 2026-02-12; 36-address on-chain check - NEXT_STEPS_FOR_YOU: Ledger done; steps completable now (no LAN); run-completable-tasks-from-anywhere - MASTER_INDEX, OPERATOR_OPTIONAL, SMART_CONTRACTS_INVENTORY_SIMPLE: updates - LEDGER_BLOCKCHAIN_INTEGRATION_COMPLETE: bis-innovations/LedgerLive reference Co-authored-by: Cursor <cursoragent@cursor.com>
416 lines
12 KiB
Bash
Executable File
416 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Expand RAID 10 on the target node from 4 disks to 6 disks.
#
# WARNING: This requires stopping the RAID array and rebuilding it.
# This will cause downtime and requires data backup/restore.

# Strict mode: exit on error, error on unset variables, fail pipelines
# on any stage. (The original also ran a separate 'set -u', which was
# redundant — '-u' is already enabled here.)
set -euo pipefail
# --- Target node configuration ---
# Each value can be overridden from the environment; the literals below
# are only defaults, so existing behavior is unchanged.
TARGET_NODE="${TARGET_NODE:-r630-01}"
TARGET_NODE_IP="${TARGET_NODE_IP:-192.168.11.11}"
# SECURITY: hardcoded default password. Prefer exporting TARGET_NODE_PASS
# (or switching to SSH key-based auth) rather than relying on this default.
TARGET_NODE_PASS="${TARGET_NODE_PASS:-password}"

# ANSI color codes for log output
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset / no color
# Logging helpers: print a colored, tagged message.
# printf '%b\n' interprets the \033 escapes exactly like 'echo -e'.
log_info()    { printf '%b\n' "${BLUE}[INFO]${NC} $1"; }
log_success() { printf '%b\n' "${GREEN}[✓]${NC} $1"; }
log_error()   { printf '%b\n' "${RED}[✗]${NC} $1"; }
log_warn()    { printf '%b\n' "${YELLOW}[⚠]${NC} $1"; }
# Run a command on the target node as root over SSH (password auth via
# sshpass).
# Arguments: the remote command (passed straight through to ssh).
# Outputs:  remote stdout AND stderr, merged onto stdout by the trailing
#           2>&1 — note that callers capturing output with $(...) will
#           therefore also capture remote error text.
# Returns:  the ssh exit status (remote command status, or 255 on
#           connection failure).
# NOTE(review): sshpass exposes the password on the process list; SSH
# key authentication would be safer — confirm before hardening.
ssh_r630_01() {
  sshpass -p "$TARGET_NODE_PASS" ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$TARGET_NODE_IP" "$@" 2>&1
}
# Verify the node is in a state where the rebuild can proceed.
# Returns: 0 when the new disks exist and are unused, 1 otherwise.
check_prerequisites() {
  log_info "Checking prerequisites..."

  # The RAID may already be stopped by a previous (partial) run — not fatal.
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 not found (may have been stopped already, continuing...)"
  fi

  # Both disks being added must exist as block devices.
  if ! ssh_r630_01 "test -b /dev/sdc && test -b /dev/sdd"; then
    log_error "sdc and/or sdd not found"
    return 1
  fi

  # Neither new disk may be an LVM PV or mounted (same check for each;
  # the original duplicated this stanza verbatim for sdc and sdd).
  local disk
  for disk in sdc sdd; do
    if ssh_r630_01 "pvs 2>/dev/null | grep -q /dev/$disk || mount | grep -q /dev/$disk"; then
      log_error "$disk is still in use"
      return 1
    fi
  done

  log_success "Prerequisites check passed"
  return 0
}
# Back up the 'pve' volume group metadata so it can be restored after
# the RAID array is rebuilt. Best-effort: always returns 0.
backup_lvm_config() {
  log_info "Backing up LVM configuration..."

  # Only back up when the VG still exists.
  if ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    ssh_r630_01 "vgcfgbackup pve" || {
      log_warn "vgcfgbackup failed, but continuing..."
    }
    log_success "LVM configuration backed up"
  else
    log_info "VG pve not found, checking for existing backups..."
    # Declare and assign separately so a failing remote command is not
    # masked by 'local' (SC2155); treat failure as "no backup".
    local backup_file
    backup_file=$(ssh_r630_01 "ls -t /etc/lvm/backup/pve* 2>/dev/null | head -1") || backup_file=""
    if [ -n "$backup_file" ]; then
      log_info "Found existing backup: $backup_file"
    else
      log_warn "No VG and no backup found - LVM restoration may not work"
    fi
  fi

  return 0
}
# Stop every running LXC container and QEMU VM on the node so the LVM
# stack can be deactivated safely. Best-effort: individual failures are
# warned about, not fatal; always returns 0.
stop_containers_vms() {
  log_info "Stopping all containers and VMs..."

  # Declare and assign separately (SC2155), and fall back to empty on
  # failure: ssh_r630_01 merges stderr into stdout, so a failed remote
  # call would otherwise have its error text parsed as a VMID list.
  local running_containers running_vms
  running_containers=$(ssh_r630_01 "pct list | awk 'NR>1 && \$2==\"running\" {print \$1}'") || running_containers=""
  running_vms=$(ssh_r630_01 "qm list | awk 'NR>1 && \$2==\"running\" {print \$1}'") || running_vms=""

  local vmid
  # Stop containers first.
  if [ -n "$running_containers" ]; then
    log_info "Stopping containers: $running_containers"
    for vmid in $running_containers; do
      log_info "Stopping container $vmid..."
      ssh_r630_01 "pct stop $vmid" || log_warn "Failed to stop container $vmid"
    done
    sleep 5
  fi

  # Then VMs: graceful shutdown first, hard stop as fallback.
  if [ -n "$running_vms" ]; then
    log_info "Stopping VMs: $running_vms"
    for vmid in $running_vms; do
      log_info "Stopping VM $vmid..."
      ssh_r630_01 "qm shutdown $vmid" || ssh_r630_01 "qm stop $vmid" || log_warn "Failed to stop VM $vmid"
    done
    sleep 10
  fi

  # Extra settle time for guests still flushing to disk.
  log_info "Waiting for all containers/VMs to stop..."
  sleep 10

  log_success "Containers and VMs stopped"
  return 0
}
# Deactivate all logical volumes in the 'pve' VG (best effort; always
# returns 0 so the caller's error path is driven by later steps).
deactivate_lvm() {
  log_info "Deactivating LVM volumes on pve VG..."

  # Nothing to do when the VG is already gone.
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found (may have been removed already)"
    return 0
  fi

  # Plain deactivate first; escalate to --force, then give up quietly.
  if ! ssh_r630_01 "vgchange -an pve"; then
    log_warn "Normal deactivate failed, trying force..."
    if ! ssh_r630_01 "vgchange -an --force pve"; then
      log_warn "VG may already be deactivated or removed"
    fi
  fi

  log_success "LVM volumes deactivated"
  return 0
}
# Detach the RAID physical volume (/dev/md0) from the 'pve' VG so the
# array can be stopped. Best effort; always returns 0.
remove_pv_from_vg() {
  log_info "Removing RAID PV from pve VG..."

  # Skip when the VG is already gone.
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found, skipping PV removal"
    return 0
  fi

  # Skip when the RAID device itself no longer exists.
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 not found, PV may already be removed"
    return 0
  fi

  # vgreduce releases the device from the VG; tolerate "already removed".
  if ! ssh_r630_01 "vgreduce pve /dev/md0"; then
    log_warn "Failed to remove PV from VG, may already be removed"
  fi

  log_success "PV removed from VG"
  return 0
}
# Tear down /dev/md0: clear device-mapper entries, unmount, kill users
# of the device, then stop the array (force as fallback). Best effort;
# always returns 0.
stop_raid() {
  log_info "Stopping RAID array /dev/md0..."

  # Already stopped? Nothing to do.
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 already stopped or doesn't exist"
    return 0
  fi

  # Clear device-mapper entries that may hold the device open.
  log_info "Removing device mapper entries..."
  ssh_r630_01 "dmsetup remove_all --force 2>/dev/null" || true
  sleep 2

  # Unmount anything still mounted from the array.
  ssh_r630_01 "umount /dev/md0* 2>/dev/null" || true

  # Kill remaining processes that keep md0 busy.
  log_info "Checking for processes using md0..."
  ssh_r630_01 "fuser -km /dev/md0 2>/dev/null" || true
  sleep 2

  # Stop the array; escalate to --force, then tolerate failure.
  if ! ssh_r630_01 "mdadm --stop /dev/md0"; then
    log_warn "Normal stop failed, trying with --force..."
    if ! ssh_r630_01 "mdadm --stop --force /dev/md0"; then
      log_warn "Force stop failed, RAID may already be stopped"
    fi
  fi

  log_success "RAID array stopped"
  return 0
}
# Clear mdadm superblocks so the disks can join a new array.
# Arguments: optional list of disk names (e.g. "sdc sdd"); defaults to
#            the full 6-disk set sdc-sdh, preserving original behavior.
# Returns:   always 0 (a disk without a superblock only warns).
wipe_raid_superblocks() {
  local disks=("$@")
  if [ ${#disks[@]} -eq 0 ]; then
    disks=(sdc sdd sde sdf sdg sdh)
  fi

  log_info "Wiping RAID superblocks from disks..."

  local disk
  for disk in "${disks[@]}"; do
    log_info "Wiping superblock from /dev/$disk..."
    ssh_r630_01 "mdadm --zero-superblock /dev/$disk 2>/dev/null" || {
      log_warn "Failed to wipe superblock from $disk (may not have one)"
    }
  done

  log_success "Superblocks wiped"
  return 0
}
# Build a new 6-disk RAID 10 (/dev/md0) from sdc-sdh, wait for the
# initial sync (up to 3 hours), and persist the configuration.
# Returns: 0 on success (even if sync is still running at timeout),
#          1 if array creation fails.
create_6disk_raid10() {
  log_info "Creating RAID 10 with all 6 disks (sdc-sdh)..."

  # Drop stale md0 entries so the --detail --scan append below does not
  # leave duplicate definitions in mdadm.conf.
  ssh_r630_01 "sed -i '/md0/d' /etc/mdadm/mdadm.conf" || true

  # Old superblocks must be gone before --create will use the disks.
  wipe_raid_superblocks

  # 'echo y' auto-confirms mdadm's "device contains a filesystem" prompt.
  log_info "Creating RAID 10 array..."
  ssh_r630_01 "echo y | mdadm --create /dev/md0 --level=10 --raid-devices=6 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh --bitmap=internal" || {
    log_error "Failed to create RAID 10 with 6 disks"
    return 1
  }

  log_success "RAID 10 created with all 6 disks"

  # Poll /proc/mdstat until all six members show up ([UUUUUU]) or we
  # give up after max_wait seconds.
  log_info "Waiting for RAID array to synchronize (this may take 1-2 hours)..."
  local max_wait=10800 # 3 hours max
  local waited=0
  local status progress

  while [ $waited -lt $max_wait ]; do
    # Assigned (not declared) here so an ssh failure is not masked by
    # 'local' (SC2155); treat failure as "no status yet".
    status=$(ssh_r630_01 "cat /proc/mdstat 2>/dev/null | grep -A 2 md0 | tail -1") || status=""

    if echo "$status" | grep -q "\[UUUUUU\]"; then
      log_success "RAID array is fully synchronized"
      break
    elif echo "$status" | grep -q "recovery\|resync"; then
      progress=$(echo "$status" | grep -oP '\d+\.\d+%' || echo "in progress")
      if [ $((waited % 300)) -eq 0 ]; then # Log every 5 minutes
        log_info "RAID sync progress: $progress (elapsed: $((waited/60)) minutes)"
      fi
      sleep 30
      waited=$((waited + 30))
    else
      # No sync line yet (array settling) — poll faster.
      sleep 10
      waited=$((waited + 10))
    fi
  done

  if [ $waited -ge $max_wait ]; then
    log_warn "RAID sync may still be in progress. Check manually: cat /proc/mdstat"
  fi

  # Persist the new array so it assembles on boot.
  log_info "Saving RAID configuration..."
  ssh_r630_01 "mdadm --detail --scan >> /etc/mdadm/mdadm.conf" || {
    log_warn "Failed to save to mdadm.conf"
  }

  ssh_r630_01 "update-initramfs -u" || true

  return 0
}
# Recreate the LVM stack (PV -> VG) on the rebuilt /dev/md0 and activate
# the 'pve' volume group.
# Returns: 0 on success, 1 when the PV cannot be created or the VG
#          cannot be restored/activated.
restore_lvm() {
  log_info "Restoring LVM on new RAID..."

  # Locate the newest VG metadata backup (SC2155: decl/assign split so
  # an ssh failure is not masked; treat failure as "no backup").
  local backup_file
  backup_file=$(ssh_r630_01 "ls -t /etc/lvm/backup/pve* 2>/dev/null | head -1") || backup_file=""

  if [ -n "$backup_file" ]; then
    log_info "Found LVM backup: $backup_file"
    log_info "Restoring VG metadata..."
    ssh_r630_01 "vgcfgrestore -f $backup_file pve" || {
      log_warn "vgcfgrestore failed, trying alternative method..."
    }
  else
    log_warn "No LVM backup found, will need to recreate"
  fi

  # Create a PV on the new RAID only when one is not already present.
  # BUGFIX: the original ran 'pvcreate --uuid $(ssh ...)' where the
  # nested substitution expanded locally and could be empty, producing
  # the invalid command 'pvcreate --uuid /dev/md0'.
  log_info "Creating physical volume on new RAID..."
  if ssh_r630_01 "pvdisplay /dev/md0 >/dev/null 2>&1"; then
    log_info "PV already exists on /dev/md0, skipping pvcreate"
  else
    ssh_r630_01 "pvcreate /dev/md0" || {
      log_error "Failed to create PV"
      return 1
    }
  fi

  # Restore the VG if the earlier vgcfgrestore did not bring it back.
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "Restoring volume group..."
    if [ -n "$backup_file" ]; then
      ssh_r630_01 "vgcfgrestore -f $backup_file pve" || {
        log_error "Failed to restore VG"
        return 1
      }
    else
      log_error "Cannot restore VG without backup"
      return 1
    fi
  fi

  # Activate the VG so LVs become usable again.
  log_info "Activating volume group..."
  ssh_r630_01 "vgchange -ay pve" || {
    log_error "Failed to activate VG"
    return 1
  }

  log_success "LVM restored and activated on new RAID"
  return 0
}
# Dump RAID and LVM state for operator verification.
show_status() {
  log_info "=== RAID Status ==="
  ssh_r630_01 "cat /proc/mdstat"
  printf '\n'
  ssh_r630_01 "mdadm --detail /dev/md0"
  printf '\n'
  log_info "=== LVM Status ==="
  ssh_r630_01 "vgs pve"
  ssh_r630_01 "pvs | grep pve"
}
# Orchestrate the full RAID 10 expansion:
# warn -> check -> backup LVM -> stop guests -> deactivate LVM ->
# remove PV -> stop RAID -> create 6-disk array -> restore LVM -> status.
# Exits non-zero on any unrecoverable step; attempts to re-activate LVM
# before aborting where that makes sense.
main() {
  echo ""
  log_warn "=== WARNING: RAID 10 Expansion to 6 Disks ==="
  log_warn ""
  log_warn "This script will:"
  log_warn "1. STOP the current RAID 10 array"
  log_warn "2. CREATE a new RAID 10 with all 6 disks"
  log_warn "3. Attempt to restore LVM configuration"
  log_warn ""
  log_warn "IMPORTANT:"
  log_warn "- This will cause DOWNTIME"
  log_warn "- All containers/VMs will be unavailable"
  log_warn "- LVM volumes may need manual restoration"
  log_warn "- Data backup is STRONGLY recommended"
  log_warn ""
  log_warn "This is a DESTRUCTIVE operation!"
  echo ""
  # NOTE(review): there is NO interactive confirmation — the script
  # proceeds after a 3-second pause. Confirm this is intended for a
  # destructive operation.
  log_warn "Auto-confirming and proceeding with expansion..."
  log_warn "This is a destructive operation - all containers/VMs will be unavailable during this process"
  sleep 3

  # Abort early if target disks are missing or still in use.
  if ! check_prerequisites; then
    exit 1
  fi

  # Best-effort VG metadata backup (needed later by restore_lvm).
  backup_lvm_config

  # Guest shutdown is best-effort; later steps force-release the device.
  if ! stop_containers_vms; then
    log_warn "Some containers/VMs may not have stopped, continuing anyway..."
  fi

  # If LVM cannot be deactivated, re-activate and bail out — the array
  # must not be stopped while LVs are live.
  if ! deactivate_lvm; then
    log_error "Failed to deactivate LVM volumes"
    log_warn "Attempting to reactivate VG..."
    ssh_r630_01 "vgchange -ay pve" || true
    exit 1
  fi

  # PV removal failure is tolerated; stop_raid force-releases the device.
  if ! remove_pv_from_vg; then
    log_warn "Failed to remove PV, continuing anyway..."
  fi

  # On failure, try to roll back: re-attach the PV and re-activate LVM.
  if ! stop_raid; then
    log_error "Failed to stop RAID array"
    log_warn "Attempting to reactivate LVM..."
    ssh_r630_01 "vgextend pve /dev/md0 2>/dev/null || true"
    ssh_r630_01 "vgchange -ay pve" || true
    exit 1
  fi

  # Give the kernel a moment to fully release /dev/md0.
  sleep 3

  # Point of no return: old array data is gone once this succeeds.
  if ! create_6disk_raid10; then
    log_error "Failed to create 6-disk RAID"
    log_warn "You may need to manually recover"
    exit 1
  fi

  # LVM restore issues are reported but non-fatal — status is shown so
  # the operator can finish recovery by hand.
  if ! restore_lvm; then
    log_error "LVM restoration had issues"
    log_warn "You may need to manually restore LVM volumes"
    log_warn "Check: vgcfgrestore -l pve"
  fi

  show_status

  log_success "RAID 10 expansion completed!"
  log_info ""
  log_info "RAID Device: /dev/md0"
  log_info "Capacity: ~700GB (RAID 10 with 6 disks)"
  log_info "Performance: Maximum (6x read, 3x write)"
  log_info "Redundancy: Can survive 1-3 disk failures"
  log_info ""
  log_warn "IMPORTANT: Verify all containers/VMs are accessible"
  log_warn "You may need to manually restore LVM volumes if restoration failed"
}
# Entry point — forward any CLI arguments (currently unused by main).
main "$@"