# NOTE(review): web-viewer paste residue ("Files", size, "Raw Normal View
# History") converted to comments so the file parses as shell.
# Source: proxmox/scripts/expand-raid10-to-6disk.sh.bak (416 lines, 12 KiB, Bash)
# The '#!/usr/bin/env bash' shebang below should be the first line of the file.
#!/usr/bin/env bash
# Expand the RAID 10 array on r630-01 from 4 disks to 6 disks.
# WARNING: This requires stopping the RAID array and rebuilding it.
# This will cause downtime and requires data backup/restore.
set -euo pipefail

# Target node connection details. Overridable via environment so credentials
# need not live in the script. NOTE(review): sshpass exposes the password in
# `ps` output on this host — prefer SSH keys where possible.
TARGET_NODE="${TARGET_NODE:-r630-01}"
TARGET_NODE_IP="${TARGET_NODE_IP:-192.168.11.11}"
TARGET_NODE_PASS="${TARGET_NODE_PASS:-password}"
readonly TARGET_NODE TARGET_NODE_IP TARGET_NODE_PASS

# ANSI colors for log output.
readonly GREEN='\033[0;32m'
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m'

# Leveled loggers: colored tag plus message, written to stdout.
log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_error() { echo -e "${RED}[✗]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[⚠]${NC} $1"; }

# Run a command on the target node as root. stderr is merged into stdout so
# callers that capture output also see remote error text.
ssh_r630_01() {
sshpass -p "$TARGET_NODE_PASS" ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$TARGET_NODE_IP" "$@" 2>&1
}
# Verify the node is in a state where the expansion can begin.
# Returns 0 when sdc/sdd exist and are unused; non-zero otherwise.
check_prerequisites() {
  log_info "Checking prerequisites..."

  # The RAID device may legitimately be absent if an earlier run already
  # stopped it, so a missing /dev/md0 is informational only.
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 not found (may have been stopped already, continuing...)"
  fi

  # Both new disks must be present as block devices on the target node.
  if ! ssh_r630_01 "test -b /dev/sdc && test -b /dev/sdd"; then
    log_error "sdc and/or sdd not found"
    return 1
  fi

  # Neither new disk may be claimed by LVM or mounted anywhere.
  local disk
  for disk in sdc sdd; do
    if ssh_r630_01 "pvs 2>/dev/null | grep -q /dev/$disk || mount | grep -q /dev/$disk"; then
      log_error "$disk is still in use"
      return 1
    fi
  done

  log_success "Prerequisites check passed"
  return 0
}
# Snapshot the pve VG metadata via vgcfgbackup, or — when the VG is already
# gone — look for a previously written backup file. Always returns 0; a
# failed/missing backup is only warned about.
backup_lvm_config() {
  log_info "Backing up LVM configuration..."

  # When the VG no longer exists, fall back to hunting for an older backup.
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found, checking for existing backups..."
    local existing
    existing=$(ssh_r630_01 "ls -t /etc/lvm/backup/pve* 2>/dev/null | head -1") || true
    if [ -n "$existing" ]; then
      log_info "Found existing backup: $existing"
    else
      log_warn "No VG and no backup found - LVM restoration may not work"
    fi
    return 0
  fi

  # VG present: write a fresh metadata backup; tolerate failure.
  if ! ssh_r630_01 "vgcfgbackup pve"; then
    log_warn "vgcfgbackup failed, but continuing..."
  fi
  log_success "LVM configuration backed up"
  return 0
}
# Shut down every running LXC container and QEMU VM on the node so their
# volumes can be deactivated. Best-effort: individual failures only warn.
stop_containers_vms() {
  log_info "Stopping all containers and VMs..."

  # Collect IDs of running guests (column 2 of pct/qm list is the status).
  local cts vms id
  cts=$(ssh_r630_01 "pct list | awk 'NR>1 && \$2==\"running\" {print \$1}'") || true
  vms=$(ssh_r630_01 "qm list | awk 'NR>1 && \$2==\"running\" {print \$1}'") || true

  if [ -n "$cts" ]; then
    log_info "Stopping containers: $cts"
    for id in $cts; do
      log_info "Stopping container $id..."
      ssh_r630_01 "pct stop $id" || log_warn "Failed to stop container $id"
    done
    sleep 5
  fi

  if [ -n "$vms" ]; then
    log_info "Stopping VMs: $vms"
    for id in $vms; do
      log_info "Stopping VM $id..."
      # Graceful shutdown first, hard stop as fallback.
      ssh_r630_01 "qm shutdown $id" || ssh_r630_01 "qm stop $id" || log_warn "Failed to stop VM $id"
    done
    sleep 10
  fi

  # Grace period for guests to finish shutting down.
  log_info "Waiting for all containers/VMs to stop..."
  sleep 10
  log_success "Containers and VMs stopped"
  return 0
}
# Deactivate all logical volumes in the pve VG so the underlying PV can be
# released. Escalates to --force if needed; always returns 0.
deactivate_lvm() {
  log_info "Deactivating LVM volumes on pve VG..."

  # Nothing to do when the VG is already gone.
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found (may have been removed already)"
    return 0
  fi

  # Plain deactivation first; escalate to --force when something still holds
  # a volume open, and tolerate the already-deactivated case.
  if ! ssh_r630_01 "vgchange -an pve"; then
    log_warn "Normal deactivate failed, trying force..."
    if ! ssh_r630_01 "vgchange -an --force pve"; then
      log_warn "VG may already be deactivated or removed"
    fi
  fi

  log_success "LVM volumes deactivated"
  return 0
}
# Detach /dev/md0 from the pve VG so the RAID device can be stopped.
# Skips cleanly when the VG or the RAID device is already gone; returns 0.
remove_pv_from_vg() {
  log_info "Removing RAID PV from pve VG..."

  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found, skipping PV removal"
    return 0
  fi

  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 not found, PV may already be removed"
    return 0
  fi

  # Releasing the PV frees /dev/md0 for mdadm --stop.
  if ! ssh_r630_01 "vgreduce pve /dev/md0"; then
    log_warn "Failed to remove PV from VG, may already be removed"
  fi

  log_success "PV removed from VG"
  return 0
}
# Stop the /dev/md0 RAID array, forcibly releasing anything still holding it.
# Destructive teardown: kills processes with md0 open (fuser -km) and removes
# device-mapper entries node-wide. Always returns 0 — an already-stopped
# array is treated as success.
stop_raid() {
log_info "Stopping RAID array /dev/md0..."
# Check if RAID exists
if ! ssh_r630_01 "test -b /dev/md0"; then
log_info "RAID /dev/md0 already stopped or doesn't exist"
return 0
fi
# Remove device mapper entries
# NOTE(review): remove_all tears down EVERY dm device on the node, not just
# those backed by md0 — acceptable only because all guests were stopped first.
log_info "Removing device mapper entries..."
ssh_r630_01 "dmsetup remove_all --force 2>/dev/null" || true
sleep 2
# Unmount any filesystems
ssh_r630_01 "umount /dev/md0* 2>/dev/null" || true
# Try to stop processes using md0
# fuser -km sends SIGKILL to any process with md0 open.
log_info "Checking for processes using md0..."
ssh_r630_01 "fuser -km /dev/md0 2>/dev/null" || true
sleep 2
# Stop the array
ssh_r630_01 "mdadm --stop /dev/md0" || {
log_warn "Normal stop failed, trying with --force..."
ssh_r630_01 "mdadm --stop --force /dev/md0" || {
log_warn "Force stop failed, RAID may already be stopped"
}
}
log_success "RAID array stopped"
return 0
}
# Wipe md RAID superblocks from the given disks so a fresh array can be
# created. Disks may be passed as arguments ("sdc" "sdd" ...); with no
# arguments the original hard-coded six-disk set is used, so existing callers
# are unaffected. Per-disk failures only warn (the disk may have no
# superblock); always returns 0.
wipe_raid_superblocks() {
  local disks=("$@")
  # Default to the six members of the target array when called with no args.
  if [ ${#disks[@]} -eq 0 ]; then
    disks=(sdc sdd sde sdf sdg sdh)
  fi
  log_info "Wiping RAID superblocks from disks..."
  local disk
  for disk in "${disks[@]}"; do
    log_info "Wiping superblock from /dev/$disk..."
    ssh_r630_01 "mdadm --zero-superblock /dev/$disk 2>/dev/null" || {
      log_warn "Failed to wipe superblock from $disk (may not have one)"
    }
  done
  log_success "Superblocks wiped"
  return 0
}
# Build a fresh 6-disk RAID 10 array on sdc-sdh, wait for the initial sync
# (bounded at 3 hours), then persist the layout to mdadm.conf and rebuild the
# initramfs. Destructive: wipes existing RAID superblocks on all six disks.
# Returns 1 only when mdadm --create itself fails.
create_6disk_raid10() {
log_info "Creating RAID 10 with all 6 disks (sdc-sdh)..."
# Remove old RAID from mdadm.conf
ssh_r630_01 "sed -i '/md0/d' /etc/mdadm/mdadm.conf" || true
# Wipe old superblocks
wipe_raid_superblocks
# Create new RAID 10 with 6 disks (with auto-confirm for bitmap)
log_info "Creating RAID 10 array..."
ssh_r630_01 "echo y | mdadm --create /dev/md0 --level=10 --raid-devices=6 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh --bitmap=internal" || {
log_error "Failed to create RAID 10 with 6 disks"
return 1
}
log_success "RAID 10 created with all 6 disks"
# Wait for sync
log_info "Waiting for RAID array to synchronize (this may take 1-2 hours)..."
local max_wait=10800 # 3 hours max
local waited=0
while [ $waited -lt $max_wait ]; do
# "[UUUUUU]" in /proc/mdstat means all six members are up and in sync.
local status=$(ssh_r630_01 "cat /proc/mdstat 2>/dev/null | grep -A 2 md0 | tail -1")
if echo "$status" | grep -q "\[UUUUUU\]"; then
log_success "RAID array is fully synchronized"
break
elif echo "$status" | grep -q "recovery\|resync"; then
# NOTE(review): grep -P (PCRE) is GNU-specific — fine on Proxmox/Debian.
local progress=$(echo "$status" | grep -oP '\d+\.\d+%' || echo "in progress")
if [ $((waited % 300)) -eq 0 ]; then # Log every 5 minutes
log_info "RAID sync progress: $progress (elapsed: $((waited/60)) minutes)"
fi
sleep 30
waited=$((waited + 30))
else
# Unexpected/empty status line: poll faster until it becomes readable.
sleep 10
waited=$((waited + 10))
fi
done
if [ $waited -ge $max_wait ]; then
log_warn "RAID sync may still be in progress. Check manually: cat /proc/mdstat"
fi
# Save configuration
# NOTE(review): --detail --scan emits entries for ALL arrays; the sed above
# prevents md0 duplicates, but other arrays (if any) could be duplicated.
log_info "Saving RAID configuration..."
ssh_r630_01 "mdadm --detail --scan >> /etc/mdadm/mdadm.conf" || {
log_warn "Failed to save to mdadm.conf"
}
ssh_r630_01 "update-initramfs -u" || true
return 0
}
# Recreate the LVM stack on the rebuilt array: make a PV on /dev/md0 (reusing
# the old PV UUID from the metadata backup when available so vgcfgrestore can
# match it), restore the pve VG from backup if missing, then activate it.
# Returns 1 when the PV cannot be created, the VG cannot be restored, or
# activation fails.
#
# Fixes vs. original: the old code ran a nested $(ssh ...) that was expanded
# by the LOCAL shell with mangled quoting, queried pvdisplay on a PV that
# cannot exist yet (the array was just recreated), and called pvcreate --uuid
# without the --restorefile LVM requires. It also ran vgcfgrestore before any
# PV existed on the device.
restore_lvm() {
  log_info "Restoring LVM on new RAID..."

  # Most recent LVM metadata backup for the pve VG on the node, if any.
  local backup_file
  backup_file=$(ssh_r630_01 "ls -t /etc/lvm/backup/pve* 2>/dev/null | head -1") || backup_file=""
  if [ -n "$backup_file" ]; then
    log_info "Found LVM backup: $backup_file"
  else
    log_warn "No LVM backup found, will need to recreate"
  fi

  # Create the PV. With a backup, reuse the old PV UUID recorded under the
  # physical_volumes/pv0 stanza so the restored metadata matches the device.
  log_info "Creating physical volume on new RAID..."
  local old_uuid=""
  if [ -n "$backup_file" ]; then
    old_uuid=$(ssh_r630_01 "awk '/pv0/{inpv=1} inpv && /id =/{gsub(/\"/,\"\"); print \$3; exit}' $backup_file") || old_uuid=""
  fi
  if [ -n "$old_uuid" ]; then
    # pvcreate --uuid requires --restorefile (or --norestorefile).
    ssh_r630_01 "pvcreate --uuid $old_uuid --restorefile $backup_file /dev/md0" || {
      log_warn "pvcreate with old UUID failed, creating a fresh PV..."
      ssh_r630_01 "pvcreate /dev/md0" || {
        log_error "Failed to create PV"
        return 1
      }
    }
  else
    ssh_r630_01 "pvcreate /dev/md0" || {
      log_error "Failed to create PV"
      return 1
    }
  fi

  # Restore VG metadata if the VG is not already present.
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "Restoring volume group..."
    if [ -z "$backup_file" ]; then
      log_error "Cannot restore VG without backup"
      return 1
    fi
    ssh_r630_01 "vgcfgrestore -f $backup_file pve" || {
      log_error "Failed to restore VG"
      return 1
    }
  fi

  # Bring the restored VG online.
  log_info "Activating volume group..."
  ssh_r630_01 "vgchange -ay pve" || {
    log_error "Failed to activate VG"
    return 1
  }

  log_success "LVM restored and activated on new RAID"
  return 0
}
# Dump RAID (/proc/mdstat, mdadm --detail) and LVM (vgs/pvs) state for
# post-expansion verification. Output goes to stdout; failures here propagate
# via set -e since no command is error-guarded.
show_status() {
log_info "=== RAID Status ==="
ssh_r630_01 "cat /proc/mdstat"
echo ""
ssh_r630_01 "mdadm --detail /dev/md0"
echo ""
log_info "=== LVM Status ==="
ssh_r630_01 "vgs pve"
ssh_r630_01 "pvs | grep pve"
}
# Orchestrate the full expansion: warn, check prerequisites, stop guests,
# tear down LVM and the old array, build the 6-disk array, restore LVM, and
# report final status. Exits non-zero on any unrecoverable step, attempting
# to roll the node back to a usable state where possible.
# NOTE(review): there is NO interactive confirmation — the script proceeds
# after a 3-second pause despite being destructive.
main() {
echo ""
log_warn "=== WARNING: RAID 10 Expansion to 6 Disks ==="
log_warn ""
log_warn "This script will:"
log_warn "1. STOP the current RAID 10 array"
log_warn "2. CREATE a new RAID 10 with all 6 disks"
log_warn "3. Attempt to restore LVM configuration"
log_warn ""
log_warn "IMPORTANT:"
log_warn "- This will cause DOWNTIME"
log_warn "- All containers/VMs will be unavailable"
log_warn "- LVM volumes may need manual restoration"
log_warn "- Data backup is STRONGLY recommended"
log_warn ""
log_warn "This is a DESTRUCTIVE operation!"
echo ""
log_warn "Auto-confirming and proceeding with expansion..."
log_warn "This is a destructive operation - all containers/VMs will be unavailable during this process"
sleep 3
# Check prerequisites
if ! check_prerequisites; then
exit 1
fi
# Backup LVM config
backup_lvm_config
# Stop all containers/VMs
if ! stop_containers_vms; then
log_warn "Some containers/VMs may not have stopped, continuing anyway..."
fi
# Deactivate LVM volumes
# On failure, try to reactivate the VG so the node stays usable before exiting.
if ! deactivate_lvm; then
log_error "Failed to deactivate LVM volumes"
log_warn "Attempting to reactivate VG..."
ssh_r630_01 "vgchange -ay pve" || true
exit 1
fi
# Remove PV from VG
if ! remove_pv_from_vg; then
log_warn "Failed to remove PV, continuing anyway..."
fi
# Stop RAID
# On failure, re-attach the PV and reactivate so the old array keeps serving.
if ! stop_raid; then
log_error "Failed to stop RAID array"
log_warn "Attempting to reactivate LVM..."
ssh_r630_01 "vgextend pve /dev/md0 2>/dev/null || true"
ssh_r630_01 "vgchange -ay pve" || true
exit 1
fi
# Wait a moment for device to be fully released
sleep 3
# Create 6-disk RAID
# Point of no return: the old array is gone once superblocks are wiped.
if ! create_6disk_raid10; then
log_error "Failed to create 6-disk RAID"
log_warn "You may need to manually recover"
exit 1
fi
# Restore LVM
if ! restore_lvm; then
log_error "LVM restoration had issues"
log_warn "You may need to manually restore LVM volumes"
log_warn "Check: vgcfgrestore -l pve"
fi
# Show status
show_status
log_success "RAID 10 expansion completed!"
log_info ""
log_info "RAID Device: /dev/md0"
log_info "Capacity: ~700GB (RAID 10 with 6 disks)"
log_info "Performance: Maximum (6x read, 3x write)"
log_info "Redundancy: Can survive 1-3 disk failures"
log_info ""
log_warn "IMPORTANT: Verify all containers/VMs are accessible"
log_warn "You may need to manually restore LVM volumes if restoration failed"
}
main "$@"