#!/usr/bin/env bash
# Script to expand RAID 10 from 4 disks to 6 disks
# WARNING: This requires stopping the RAID array and rebuilding it
# This will cause downtime and requires data backup/restore
#
# Destructive workflow (see main): stop guests -> deactivate LVM -> stop
# /dev/md0 -> wipe superblocks -> recreate RAID 10 over sdc..sdh -> restore LVM.
set -euo pipefail

# Target host. Credentials may be overridden from the environment so the
# password does not have to live in this file.
# NOTE(review): the hardcoded default password is kept only for backward
# compatibility; prefer exporting TARGET_NODE_PASS. Also note sshpass -p
# exposes the password on the remote of `ps` — consider SSHPASS env/keys.
TARGET_NODE="${TARGET_NODE:-r630-01}"
TARGET_NODE_IP="${TARGET_NODE_IP:-192.168.11.11}"
TARGET_NODE_PASS="${TARGET_NODE_PASS:-password}"

# Colors
GREEN='\033[0;32m'
RED='\033[0;31m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

log_info()    { echo -e "${BLUE}[INFO]${NC} $1"; }
log_success() { echo -e "${GREEN}[✓]${NC} $1"; }
log_error()   { echo -e "${RED}[✗]${NC} $1"; }
log_warn()    { echo -e "${YELLOW}[⚠]${NC} $1"; }

#######################################
# Run a command on the target node as root over SSH.
# Globals:   TARGET_NODE_PASS, TARGET_NODE_IP (read)
# Arguments: the remote command (passed through to ssh)
# Outputs:   remote stdout AND stderr on stdout (2>&1), so callers capturing
#            with $(...) may also pick up ssh/remote diagnostics — beware when
#            parsing the result.
# Returns:   ssh/remote exit status
#######################################
ssh_r630_01() {
  sshpass -p "$TARGET_NODE_PASS" ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$TARGET_NODE_IP" "$@" 2>&1
}

#######################################
# Sanity checks before doing anything destructive: md0 may already be gone
# (non-fatal), but sdc/sdd must exist and must not be claimed by LVM or a
# mount.
# Returns: 0 when safe to proceed, 1 otherwise
#######################################
check_prerequisites() {
  log_info "Checking prerequisites..."

  # Check if RAID exists (it may have been stopped already)
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 not found (may have been stopped already, continuing...)"
  fi

  # Check if sdc/sdd are available
  if ! ssh_r630_01 "test -b /dev/sdc && test -b /dev/sdd"; then
    log_error "sdc and/or sdd not found"
    return 1
  fi

  # Check if sdc/sdd are in use (as an LVM PV or mounted).
  # NOTE(review): grep -q /dev/sdc also matches partitions like /dev/sdc1 —
  # intentionally conservative.
  if ssh_r630_01 "pvs 2>/dev/null | grep -q /dev/sdc || mount | grep -q /dev/sdc"; then
    log_error "sdc is still in use"
    return 1
  fi
  if ssh_r630_01 "pvs 2>/dev/null | grep -q /dev/sdd || mount | grep -q /dev/sdd"; then
    log_error "sdd is still in use"
    return 1
  fi

  log_success "Prerequisites check passed"
  return 0
}

#######################################
# Back up LVM metadata for VG "pve" (vgcfgbackup), or — when the VG is
# already gone — verify a previous backup file exists for later restore.
# Always returns 0; failures only warn.
#######################################
backup_lvm_config() {
  log_info "Backing up LVM configuration..."

  # Check if VG exists before backing up
  if ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    ssh_r630_01 "vgcfgbackup pve" || {
      log_warn "vgcfgbackup failed, but continuing..."
    }
    log_success "LVM configuration backed up"
  else
    log_info "VG pve not found, checking for existing backups..."
    # Split declaration from assignment so a failed ssh doesn't get masked
    # by `local`; `|| true` preserves best-effort behavior under set -e.
    local backup_file
    backup_file=$(ssh_r630_01 "ls -t /etc/lvm/backup/pve* 2>/dev/null | head -1") || true
    if [ -n "$backup_file" ]; then
      log_info "Found existing backup: $backup_file"
    else
      log_warn "No VG and no backup found - LVM restoration may not work"
    fi
  fi
  return 0
}

#######################################
# Stop every running container (pct) and VM (qm) on the node.
# Best effort: individual stop failures only warn. Always returns 0.
#######################################
stop_containers_vms() {
  log_info "Stopping all containers and VMs..."

  # Get list of running containers/VMs. Column 2 of pct/qm list is the
  # status; NR>1 skips the header row.
  local running_containers running_vms
  running_containers=$(ssh_r630_01 "pct list | awk 'NR>1 && \$2==\"running\" {print \$1}'") || true
  running_vms=$(ssh_r630_01 "qm list | awk 'NR>1 && \$2==\"running\" {print \$1}'") || true

  # Stop containers
  if [ -n "$running_containers" ]; then
    log_info "Stopping containers: $running_containers"
    for vmid in $running_containers; do
      log_info "Stopping container $vmid..."
      ssh_r630_01 "pct stop $vmid" || log_warn "Failed to stop container $vmid"
    done
    sleep 5
  fi

  # Stop VMs: try a clean guest shutdown first, then hard stop.
  if [ -n "$running_vms" ]; then
    log_info "Stopping VMs: $running_vms"
    for vmid in $running_vms; do
      log_info "Stopping VM $vmid..."
      ssh_r630_01 "qm shutdown $vmid" || ssh_r630_01 "qm stop $vmid" || log_warn "Failed to stop VM $vmid"
    done
    sleep 10
  fi

  # Wait for all to stop
  log_info "Waiting for all containers/VMs to stop..."
  sleep 10

  log_success "Containers and VMs stopped"
  return 0
}

#######################################
# Deactivate all LVs in VG "pve" so /dev/md0 can be released.
# Tolerates a missing VG (already removed). Always returns 0.
#######################################
deactivate_lvm() {
  log_info "Deactivating LVM volumes on pve VG..."

  # Check if VG exists
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found (may have been removed already)"
    return 0
  fi

  # Force deactivate (may still have some mounts)
  ssh_r630_01 "vgchange -an pve" || {
    log_warn "Normal deactivate failed, trying force..."
    ssh_r630_01 "vgchange -an --force pve" || {
      log_warn "VG may already be deactivated or removed"
    }
  }

  log_success "LVM volumes deactivated"
  return 0
}

#######################################
# Remove the /dev/md0 PV from VG "pve" (vgreduce) so mdadm can stop the
# array. Tolerates missing VG or missing md0. Always returns 0.
#######################################
remove_pv_from_vg() {
  log_info "Removing RAID PV from pve VG..."

  # Check if VG exists
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "VG pve not found, skipping PV removal"
    return 0
  fi

  # Check if RAID exists
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 not found, PV may already be removed"
    return 0
  fi

  # Remove the PV from VG (this should release the device)
  ssh_r630_01 "vgreduce pve /dev/md0" || {
    log_warn "Failed to remove PV from VG, may already be removed"
  }

  log_success "PV removed from VG"
  return 0
}

#######################################
# Stop /dev/md0: clear device-mapper entries, unmount, kill users of the
# device, then mdadm --stop. Best effort at each step. Always returns 0.
#######################################
stop_raid() {
  log_info "Stopping RAID array /dev/md0..."

  # Check if RAID exists
  if ! ssh_r630_01 "test -b /dev/md0"; then
    log_info "RAID /dev/md0 already stopped or doesn't exist"
    return 0
  fi

  # Remove device mapper entries (intentionally broad; errors ignored
  # because entries may simply not exist).
  log_info "Removing device mapper entries..."
  ssh_r630_01 "dmsetup remove_all --force 2>/dev/null" || true
  sleep 2

  # Unmount any filesystems
  ssh_r630_01 "umount /dev/md0* 2>/dev/null" || true

  # Try to stop processes using md0
  log_info "Checking for processes using md0..."
  ssh_r630_01 "fuser -km /dev/md0 2>/dev/null" || true
  sleep 2

  # Stop the array
  ssh_r630_01 "mdadm --stop /dev/md0" || {
    log_warn "Normal stop failed, trying with --force..."
    ssh_r630_01 "mdadm --stop --force /dev/md0" || {
      log_warn "Force stop failed, RAID may already be stopped"
    }
  }

  log_success "RAID array stopped"
  return 0
}

#######################################
# Zero the md superblock on all six member disks so a fresh array can be
# created. Per-disk failures only warn (a disk may have no superblock).
#######################################
wipe_raid_superblocks() {
  log_info "Wiping RAID superblocks from disks..."

  for disk in sdc sdd sde sdf sdg sdh; do
    log_info "Wiping superblock from /dev/$disk..."
    ssh_r630_01 "mdadm --zero-superblock /dev/$disk 2>/dev/null" || {
      log_warn "Failed to wipe superblock from $disk (may not have one)"
    }
  done

  log_success "Superblocks wiped"
  return 0
}

#######################################
# Create a new 6-disk RAID 10 on /dev/md0, wait (up to 3h) for the initial
# sync, then persist the config to mdadm.conf and the initramfs.
# Returns: 0 on success, 1 when mdadm --create fails
#######################################
create_6disk_raid10() {
  log_info "Creating RAID 10 with all 6 disks (sdc-sdh)..."

  # Remove old RAID from mdadm.conf
  ssh_r630_01 "sed -i '/md0/d' /etc/mdadm/mdadm.conf" || true

  # Wipe old superblocks
  wipe_raid_superblocks

  # Create new RAID 10 with 6 disks ("echo y" auto-confirms mdadm's
  # interactive prompt).
  log_info "Creating RAID 10 array..."
  ssh_r630_01 "echo y | mdadm --create /dev/md0 --level=10 --raid-devices=6 /dev/sdc /dev/sdd /dev/sde /dev/sdf /dev/sdg /dev/sdh --bitmap=internal" || {
    log_error "Failed to create RAID 10 with 6 disks"
    return 1
  }
  log_success "RAID 10 created with all 6 disks"

  # Wait for sync: poll /proc/mdstat until all 6 members show up ([UUUUUU]).
  log_info "Waiting for RAID array to synchronize (this may take 1-2 hours)..."
  local max_wait=10800 # 3 hours max
  local waited=0
  local status progress
  while [ $waited -lt $max_wait ]; do
    status=$(ssh_r630_01 "cat /proc/mdstat 2>/dev/null | grep -A 2 md0 | tail -1") || true
    if echo "$status" | grep -q "\[UUUUUU\]"; then
      log_success "RAID array is fully synchronized"
      break
    elif echo "$status" | grep -q "recovery\|resync"; then
      # grep -oP (PCRE) extracts e.g. "42.7%" — requires GNU grep on the
      # remote host.
      progress=$(echo "$status" | grep -oP '\d+\.\d+%' || echo "in progress")
      if [ $((waited % 300)) -eq 0 ]; then # Log every 5 minutes
        log_info "RAID sync progress: $progress (elapsed: $((waited/60)) minutes)"
      fi
      sleep 30
      waited=$((waited + 30))
    else
      sleep 10
      waited=$((waited + 10))
    fi
  done

  if [ $waited -ge $max_wait ]; then
    log_warn "RAID sync may still be in progress. Check manually: cat /proc/mdstat"
  fi

  # Save configuration
  log_info "Saving RAID configuration..."
  ssh_r630_01 "mdadm --detail --scan >> /etc/mdadm/mdadm.conf" || {
    log_warn "Failed to save to mdadm.conf"
  }
  ssh_r630_01 "update-initramfs -u" || true

  return 0
}

#######################################
# Recreate the PV on the new /dev/md0 and restore/activate VG "pve" from the
# newest metadata backup.
# Returns: 0 on success, 1 when PV creation / VG restore / activation fails
#######################################
restore_lvm() {
  log_info "Restoring LVM on new RAID..."

  # First, check if we need to restore the VG metadata
  local backup_file
  backup_file=$(ssh_r630_01 "ls -t /etc/lvm/backup/pve* 2>/dev/null | head -1") || true
  if [ -n "$backup_file" ]; then
    log_info "Found LVM backup: $backup_file"
    log_info "Restoring VG metadata..."
    # Restore VG metadata
    ssh_r630_01 "vgcfgrestore -f $backup_file pve" || {
      log_warn "vgcfgrestore failed, trying alternative method..."
    }
  else
    log_warn "No LVM backup found, will need to recreate"
  fi

  # Create PV on new RAID. Reuse the old PV UUID when one is still readable
  # so restored VG metadata keeps matching; otherwise create a fresh PV.
  # (pvcreate --uuid requires --restorefile or --norestorefile.)
  log_info "Creating physical volume on new RAID..."
  local pv_uuid
  pv_uuid=$(ssh_r630_01 "pvdisplay /dev/md0 2>/dev/null | awk '/PV UUID/ {print \$3}'") || pv_uuid=""
  if [ -n "$pv_uuid" ]; then
    ssh_r630_01 "pvcreate --uuid $pv_uuid --norestorefile /dev/md0 2>/dev/null || pvcreate /dev/md0" || {
      log_error "Failed to create PV"
      return 1
    }
  else
    ssh_r630_01 "pvcreate /dev/md0" || {
      log_error "Failed to create PV"
      return 1
    }
  fi

  # Restore VG if needed
  if ! ssh_r630_01 "vgs pve >/dev/null 2>&1"; then
    log_info "Restoring volume group..."
    if [ -n "$backup_file" ]; then
      ssh_r630_01 "vgcfgrestore -f $backup_file pve" || {
        log_error "Failed to restore VG"
        return 1
      }
    else
      log_error "Cannot restore VG without backup"
      return 1
    fi
  fi

  # Activate VG
  log_info "Activating volume group..."
  ssh_r630_01 "vgchange -ay pve" || {
    log_error "Failed to activate VG"
    return 1
  }

  log_success "LVM restored and activated on new RAID"
  return 0
}

#######################################
# Print RAID and LVM status for the operator to verify the result.
#######################################
show_status() {
  log_info "=== RAID Status ==="
  ssh_r630_01 "cat /proc/mdstat"
  echo ""
  ssh_r630_01 "mdadm --detail /dev/md0"
  echo ""
  log_info "=== LVM Status ==="
  ssh_r630_01 "vgs pve"
  ssh_r630_01 "pvs | grep pve"
}

#######################################
# Orchestrate the full destructive expansion. Prints loud warnings, then
# auto-confirms after a short pause (no interactive prompt). On mid-flight
# failures, attempts to roll the LVM state back before exiting.
#######################################
main() {
  echo ""
  log_warn "=== WARNING: RAID 10 Expansion to 6 Disks ==="
  log_warn ""
  log_warn "This script will:"
  log_warn "1. STOP the current RAID 10 array"
  log_warn "2. CREATE a new RAID 10 with all 6 disks"
  log_warn "3. Attempt to restore LVM configuration"
  log_warn ""
  log_warn "IMPORTANT:"
  log_warn "- This will cause DOWNTIME"
  log_warn "- All containers/VMs will be unavailable"
  log_warn "- LVM volumes may need manual restoration"
  log_warn "- Data backup is STRONGLY recommended"
  log_warn ""
  log_warn "This is a DESTRUCTIVE operation!"
  echo ""
  log_warn "Auto-confirming and proceeding with expansion..."
  log_warn "This is a destructive operation - all containers/VMs will be unavailable during this process"
  sleep 3

  # Check prerequisites
  if ! check_prerequisites; then
    exit 1
  fi

  # Backup LVM config
  backup_lvm_config

  # Stop all containers/VMs
  if ! stop_containers_vms; then
    log_warn "Some containers/VMs may not have stopped, continuing anyway..."
  fi

  # Deactivate LVM volumes
  if ! deactivate_lvm; then
    log_error "Failed to deactivate LVM volumes"
    log_warn "Attempting to reactivate VG..."
    ssh_r630_01 "vgchange -ay pve" || true
    exit 1
  fi

  # Remove PV from VG
  if ! remove_pv_from_vg; then
    log_warn "Failed to remove PV, continuing anyway..."
  fi

  # Stop RAID
  if ! stop_raid; then
    log_error "Failed to stop RAID array"
    log_warn "Attempting to reactivate LVM..."
    ssh_r630_01 "vgextend pve /dev/md0 2>/dev/null || true"
    ssh_r630_01 "vgchange -ay pve" || true
    exit 1
  fi

  # Wait a moment for device to be fully released
  sleep 3

  # Create 6-disk RAID
  if ! create_6disk_raid10; then
    log_error "Failed to create 6-disk RAID"
    log_warn "You may need to manually recover"
    exit 1
  fi

  # Restore LVM
  if ! restore_lvm; then
    log_error "LVM restoration had issues"
    log_warn "You may need to manually restore LVM volumes"
    log_warn "Check: vgcfgrestore -l pve"
  fi

  # Show status
  show_status

  log_success "RAID 10 expansion completed!"
  log_info ""
  log_info "RAID Device: /dev/md0"
  log_info "Capacity: ~700GB (RAID 10 with 6 disks)"
  log_info "Performance: Maximum (6x read, 3x write)"
  log_info "Redundancy: Can survive 1-3 disk failures"
  log_info ""
  log_warn "IMPORTANT: Verify all containers/VMs are accessible"
  log_warn "You may need to manually restore LVM volumes if restoration failed"
}

main "$@"