Files
proxmox/scripts/perform-immediate-actions.sh

335 lines
12 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
# Perform Immediate Actions - Actually Execute the Fixes
# 1. Activate storage pools with underlying LVM
# 2. Investigate and clean thin2 capacity
# 3. Migrate CPU-intensive workloads
#
# NOTE(review): despite the title, the functions visible in this file only
# collect diagnostics over SSH and write a list of migration commands to the
# report directory; confirm whether anything is expected to be changed on the
# nodes by this script.
#
# Strict mode: exit on unhandled errors (-e), on use of unset variables (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Load IP configuration
# The paths are derived from this script's own location so the script can be
# started from any working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# The configuration file is optional: errors from `source` are discarded and
# a missing file is ignored.
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# NOTE(review): both paths are computed a second time after sourcing the
# configuration - presumably so that values assigned by the sourced file
# cannot leak into the rest of the script. Confirm before removing either copy.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# All output of one run goes to a timestamped log file in the report directory.
REPORT_DIR="${PROJECT_ROOT}/reports/status"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
ACTION_LOG="${REPORT_DIR}/perform_actions_${TIMESTAMP}.log"
# Colors
# ANSI escape sequences stored as literal backslash text; they are turned into
# real escape characters by `echo -e` inside the log_* helpers.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
NC='\033[0m'
# Logging helpers.
# Each helper prints one decorated message to stdout and appends exactly the
# same bytes (color escape sequences included) to $ACTION_LOG.
# Globals:   ACTION_LOG, RED, GREEN, YELLOW, BLUE, CYAN, MAGENTA, NC (read)
# Arguments: $1 - message text

# Shared sink: interpret backslash escapes in $1, then tee it into the log.
_log_emit() {
  echo -e "$1" | tee -a "$ACTION_LOG"
}

log_info() {
  _log_emit "${BLUE}[INFO]${NC} $1"
}

log_success() {
  _log_emit "${GREEN}[✓]${NC} $1"
}

log_warn() {
  _log_emit "${YELLOW}[⚠]${NC} $1"
}

log_error() {
  _log_emit "${RED}[✗]${NC} $1"
}

log_header() {
  _log_emit "${CYAN}=== $1 ===${NC}"
}

# Section titles are surrounded by blank lines to separate them visually.
log_section() {
  _log_emit "\n${MAGENTA}>>> $1 <<<${NC}\n"
}
# Create the report directory up front: every log_* helper appends to
# $ACTION_LOG, which lives inside it.
mkdir -p "$REPORT_DIR"
# Proxmox nodes configuration
# Each entry is "<ip>:<root password>", keyed by node name. Consumers split on
# the FIRST colon, so the password itself may contain colons.
#
# Both halves can be overridden from the environment (or from the sourced
# ip-addresses.conf): PROXMOX_HOST_<NODE> for the address and
# PROXMOX_PASS_<NODE> for the password.
#
# SECURITY NOTE(review): the literal password defaults below are credentials
# committed to source control. They are kept only so existing invocations keep
# working; rotate them and supply the real values through PROXMOX_PASS_* (or
# switch to SSH keys), then drop the defaults.
declare -A NODES
NODES[ml110]="${PROXMOX_HOST_ML110:-192.168.11.10}:${PROXMOX_PASS_ML110:-L@kers2010}"
NODES[r630-01]="${PROXMOX_HOST_R630_01:-192.168.11.11}:${PROXMOX_PASS_R630_01:-password}"
NODES[r630-02]="${PROXMOX_HOST_R630_02:-192.168.11.12}:${PROXMOX_PASS_R630_02:-password}"
#######################################
# Run a command on a Proxmox node as root over SSH.
# Globals:   NODES (read) - "<ip>:<password>" entries keyed by node name
# Arguments: $1   - node name (a key of NODES)
#            $2.. - remote command and its arguments, passed to ssh verbatim
# Stdin:     inherited by ssh, so callers can feed a remote script through a
#            here-document
# Returns:   exit status of sshpass/ssh; 1 if the node name is not in NODES
#######################################
ssh_node() {
  local hostname="$1"
  shift

  # Fail with a clear message instead of an unbound-variable abort (set -u)
  # or an ssh attempt against "root@".
  if [[ -z "${NODES[$hostname]+set}" ]]; then
    printf 'ssh_node: unknown node: %s\n' "$hostname" >&2
    return 1
  fi

  local entry="${NODES[$hostname]}"
  local ip="${entry%%:*}"      # text before the first colon
  local password="${entry#*:}" # everything after the first colon

  # NOTE(review): StrictHostKeyChecking=no accepts any host key, which leaves
  # the connection open to man-in-the-middle attacks; kept because callers
  # rely on non-interactive connections.
  local -a ssh_opts=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

  if command -v sshpass >/dev/null 2>&1; then
    # Hand the password over through the environment (sshpass -e reads
    # SSHPASS) rather than with -p, so it does not appear in the process
    # argument list that other local users can read via ps.
    SSHPASS="$password" sshpass -e ssh "${ssh_opts[@]}" "root@${ip}" "$@"
  else
    # Without sshpass, fall back to whatever authentication ssh itself offers.
    ssh "${ssh_opts[@]}" "root@${ip}" "$@"
  fi
}
#######################################
# Probe a node with a single ICMP echo request (2 second wait).
# Globals:   NODES (read)
# Arguments: $1 - node name (a key of NODES)
# Returns:   exit status of ping (0 when the node answered)
#######################################
check_node() {
  local node_name="$1"
  local node_entry="${NODES[$node_name]}"
  # The address is the part of the entry in front of the first colon.
  ping -c 1 -W 2 "${node_entry%%:*}" &>/dev/null
}
# Action 1: Activate storage pools on r630-02
#######################################
# Collect LVM and storage.cfg diagnostics from r630-02 and copy them to
# stdout and the action log. Despite its name, this function only gathers
# information; it does not change anything on the node.
# Globals:   ACTION_LOG (appended)
# Arguments: none
# Returns:   1 if r630-02 does not answer ping. A failing SSH command is
#            reported with a warning and does not abort the function.
#######################################
activate_r630_02_storage() {
  log_section "Activating Storage Pools on r630-02"
  local hostname="r630-02"
  if ! check_node "$hostname"; then
    log_error "$hostname is not reachable"
    return 1
  fi

  # Check if thin1 VG exists and has a thin pool
  log_info "Checking LVM setup for data and thin1 storage..."
  # Declared separately from the assignment so the SSH exit status is not
  # swallowed by `local`; a failure is reported instead of silently printing
  # an empty section.
  local lvm_info
  lvm_info=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Volume Groups ==="
vgs 2>/dev/null || echo "No VGs"
echo ""
echo "=== Logical Volumes ==="
# Capture the listing first: in "lvs | head || echo" the fallback can never
# run because the pipeline status is the status of head.
if lv_list=$(lvs 2>/dev/null) && [ -n "$lv_list" ]; then
  printf '%s\n' "$lv_list" | head -20
else
  echo "No LVs"
fi
echo ""
echo "=== Storage Configuration ==="
# Same reasoning: test readability explicitly so the fallback is reachable.
if [ -r /etc/pve/storage.cfg ]; then
  grep -A 10 -E "(data|thin1)" /etc/pve/storage.cfg | head -30
else
  echo "Cannot read storage.cfg"
fi
ENDSSH
  ) || log_warn "Could not collect LVM information from $hostname (SSH command failed)"
  echo "$lvm_info" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  # Try to scan for storage
  # NOTE(review): `pvesm scan lvmthin` appears to expect a volume group
  # argument; confirm against the pvesm manual. Its output (including any
  # usage error) is captured via 2>&1 and logged as-is.
  log_info "Scanning for storage pools..."
  local scan_result
  scan_result=$(ssh_node "$hostname" "pvesm scan lvmthin 2>&1") \
    || log_warn "Storage scan on $hostname returned a non-zero status"
  log_info "Storage scan result:"
  echo "$scan_result" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  # Check storage.cfg to see node restrictions
  # NOTE(review): in the pattern below the alternation binds loosely, so it
  # matches either "^(lvmthin|dir).*data" or "thin1" anywhere on a line.
  # Left unchanged because the broader match may be intentional.
  log_info "Checking storage.cfg for node restrictions..."
  local storage_cfg
  storage_cfg=$(ssh_node "$hostname" "cat /etc/pve/storage.cfg 2>/dev/null | grep -B 5 -A 10 -E '^(lvmthin|dir).*data|thin1' | head -50") \
    || log_warn "Could not read storage.cfg from $hostname (SSH command failed)"
  echo "$storage_cfg" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  log_warn "Storage pools may be configured for other nodes. Review storage.cfg above."
  log_success "Storage activation investigation complete."
}
# Action 2: Investigate thin2 usage in detail
#######################################
# Report what uses the "thin2" storage on r630-02: the backing volume group,
# its logical volumes, and every VM disk / container rootfs that references
# thin2. Read-only; the report goes to stdout and the action log.
# Globals:   ACTION_LOG (appended)
# Arguments: none
# Returns:   1 if r630-02 does not answer ping. A failing SSH command is
#            reported with a warning and does not abort the function.
#######################################
investigate_thin2_detail() {
  log_section "Detailed Investigation of thin2 Usage"
  local hostname="r630-02"
  if ! check_node "$hostname"; then
    log_error "$hostname is not reachable"
    return 1
  fi

  log_info "Getting detailed LVM information for thin2..."
  # Declared separately from the assignment so the SSH exit status is not
  # swallowed by `local`.
  local thin2_detail
  thin2_detail=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== thin2 Volume Group and Logical Volumes ==="
# Find which VG thin2 uses
# NOTE(review): confirm that this pvesh endpoint actually reports a vgname
# field in its default output format.
vg_name=$(pvesh get /nodes/r630-02/storage/thin2 2>/dev/null | grep -oP 'vgname.*?:\s*\K[^,}]+' | head -1)
# The pattern can capture surrounding quotes or blanks; strip them so the
# name can be passed as one quoted argument.
vg_name=$(printf '%s' "$vg_name" | tr -d '"[:space:]')
if [ -n "$vg_name" ]; then
  echo "Volume Group: $vg_name"
  echo ""
  echo "Logical Volumes in $vg_name:"
  lvs "$vg_name" 2>/dev/null || echo "Cannot list LVs"
  echo ""
  echo "Thin Pool Details:"
  lvs -o +data_percent,metadata_percent "$vg_name" 2>/dev/null | grep -E "thin|LV|pool" || echo "No thin pools found"
else
  echo "Could not determine VG name for thin2"
fi
echo ""
echo "=== All Logical Volumes on thin2 storage ==="
# Check all VMs/containers that might reference thin2
for vmid in $(qm list 2>/dev/null | tail -n +2 | awk '{print $1}'); do
  # Read whole lines: a config line such as "scsi0: thin2:vm-1-disk-0,size=8G"
  # contains a blank, so iterating over words would report it in two pieces.
  qm config "$vmid" 2>/dev/null \
    | grep -E "^(scsi|virtio|ide|sata)[0-9]+:" \
    | grep "thin2" \
    | while IFS= read -r disk; do
        echo "VM $vmid uses thin2: $disk"
      done
done
for vmid in $(pct list 2>/dev/null | tail -n +2 | awk '{print $1}'); do
  rootfs=$(pct config "$vmid" 2>/dev/null | grep "^rootfs:" | grep "thin2" || true)
  if [ -n "$rootfs" ]; then
    echo "CT $vmid uses thin2: $rootfs"
  fi
done
echo ""
echo "=== Checking for orphaned volumes ==="
# Look for volumes that might not be attached to VMs
# NOTE(review): this lists every vm-*/vzdump volume, attached or not; compare
# with the VM/CT lines above to spot real orphans.
# Captured first so the fallback message is reachable (a trailing
# "| head || echo" never runs the echo).
vm_volumes=$(lvs 2>/dev/null | grep -E "vm-|vzdump" | head -20)
if [ -n "$vm_volumes" ]; then
  printf '%s\n' "$vm_volumes"
else
  echo "No orphaned volumes found"
fi
ENDSSH
  ) || log_warn "Could not collect thin2 details from $hostname (SSH command failed)"
  echo "$thin2_detail" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"
  log_success "thin2 detailed investigation complete."
}
# Action 3: Create migration plan for CPU-intensive workloads
#######################################
# Print the migration plan and, for each candidate container that is running
# on ml110, its key configuration lines and rootfs storage.
# Globals:   ACTION_LOG (appended)
# Arguments: none
# Returns:   1 if ml110 does not answer ping. A failing SSH command is
#            reported with a warning and does not abort the function.
#
# NOTE(review): the plan text is static (the figures are hard-coded, not
# measured at run time), and three containers listed under "Medium CPU Usage
# Containers (50-80% CPU)" carry figures above 80%. Confirm the numbers
# before relying on them.
#######################################
create_migration_plan() {
  log_section "Creating Migration Plan for CPU-Intensive Workloads"
  local hostname="ml110"
  if ! check_node "$hostname"; then
    log_error "$hostname is not reachable"
    return 1
  fi

  log_info "Analyzing workloads and creating migration plan..."
  # Get detailed container info with CPU usage
  # Declared separately from the assignment so the SSH exit status is not
  # swallowed by `local`.
  local migration_plan
  migration_plan=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Migration Plan ==="
echo ""
echo "High CPU Usage Containers (>80% CPU):"
echo " - besu-validator-4 (95.2% CPU, 4 cores, 8GB RAM)"
echo " - besu-sentry-4 (96.8% CPU, 2 cores, 4GB RAM)"
echo " - besu-sentry-ali (94.1% CPU, 2 cores, 4GB RAM)"
echo " - besu-rpc-ali-0x8a (93.3% CPU, 4 cores, 16GB RAM)"
echo " - besu-rpc-thirdweb-0x8a-1 (94.1% CPU, 4 cores, 16GB RAM)"
echo ""
echo "Medium CPU Usage Containers (50-80% CPU):"
echo " - besu-validator-1 (83.9% CPU, 4 cores, 8GB RAM)"
echo " - besu-validator-2 (74.6% CPU, 4 cores, 8GB RAM)"
echo " - besu-sentry-1 (90.3% CPU, 2 cores, 4GB RAM)"
echo " - besu-sentry-2 (70.0% CPU, 2 cores, 4GB RAM)"
echo " - besu-sentry-3 (91.2% CPU, 2 cores, 4GB RAM)"
echo " - besu-rpc-core-1 (72.6% CPU, 4 cores, 16GB RAM)"
echo " - besu-rpc-public-1 (80.0% CPU, 4 cores, 16GB RAM)"
echo ""
echo "=== Recommended Migration Strategy ==="
echo ""
echo "Migrate to r630-01 (32 cores, 3.4% CPU usage, 486GB available):"
echo " - besu-validator-1,2,3 (12 cores total, 24GB RAM)"
echo " - besu-sentry-1,2,3 (6 cores total, 12GB RAM)"
echo " - besu-rpc-core-1 (4 cores, 16GB RAM)"
echo " Total: 22 cores, 52GB RAM"
echo ""
echo "Migrate to r630-02 (56 cores, 5.3% CPU usage, 238GB available):"
echo " - besu-validator-4,5 (8 cores total, 16GB RAM)"
echo " - besu-sentry-4,ali (4 cores total, 8GB RAM)"
echo " - besu-rpc-public-1 (4 cores, 16GB RAM)"
echo " - besu-rpc-ali-0x8a (4 cores, 16GB RAM)"
echo " - besu-rpc-thirdweb-0x8a-1 (4 cores, 16GB RAM)"
echo " Total: 24 cores, 72GB RAM"
echo ""
echo "Keep on ml110 (lightweight):"
echo " - besu-validator-5 (if not migrated)"
echo " - besu-rpc-ali-0x1, luis, putu (lower CPU usage)"
echo " - thirdweb-rpc-1 and other thirdweb nodes"
ENDSSH
  ) || log_warn "Could not produce the migration plan on $hostname (SSH command failed)"
  echo "$migration_plan" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  # Get container details for migration
  log_info "Getting container details for migration..."
  local container_details
  container_details=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Container Details for Migration ==="
for vmid in 1000 1001 1002 1003 1004 1500 1501 1502 1503 1504 2101 2201 2303 2401; do
  if pct status "$vmid" 2>/dev/null | grep -q "status: running"; then
    echo ""
    echo "Container $vmid:"
    # Read the configuration once and reuse it for both reports below.
    ct_config=$(pct config "$vmid" 2>/dev/null)
    printf '%s\n' "$ct_config" | grep -E "^(hostname|cores|memory|rootfs|net0):" | head -5
    # The rootfs line has the form "rootfs: <storage>:<volume>,<options>";
    # there is no "storage=" key, so take the text before the first colon of
    # the value.
    storage=$(printf '%s\n' "$ct_config" | sed -n 's/^rootfs:[[:space:]]*\([^:,]*\):.*/\1/p' | head -1)
    echo " Storage: ${storage:-unknown}"
  fi
done
ENDSSH
  ) || log_warn "Could not collect container details from $hostname (SSH command failed)"
  echo "$container_details" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"
  log_success "Migration plan created."
}
# Action 4: Perform actual migrations (with confirmation)
#######################################
# Write the list of migration commands to
# ${REPORT_DIR}/migration_commands_${TIMESTAMP}.sh, make that file executable,
# and copy its content to stdout and the action log.
# Globals:   REPORT_DIR, TIMESTAMP (read); ACTION_LOG (appended)
# Arguments: none
#
# NOTE(review): nothing is migrated here and no confirmation is requested.
# The generated file only echoes the `pct migrate` lines - running it prints
# the commands (followed by "=== Migration Complete ===") without executing
# them. Confirm that this matches the "To execute migrations" hint logged
# below.
#######################################
perform_migrations() {
  log_section "Performing Workload Migrations"
  log_warn "Migrations require careful planning. Showing migration commands..."

  # Create migration script
  local plan_file="${REPORT_DIR}/migration_commands_${TIMESTAMP}.sh"
  # Quoted delimiter: the text below is written out literally, unexpanded.
  cat <<'MIGRATION_EOF' > "$plan_file"
#!/usr/bin/env bash
# Migration Commands for CPU-Intensive Workloads
# Review and execute manually or with confirmation
# Migrate to r630-01
echo "=== Migrating to r630-01 ==="
echo "# Validators"
echo "pct migrate 1000 r630-01 --restart"
echo "pct migrate 1001 r630-01 --restart"
echo "pct migrate 1002 r630-01 --restart"
echo ""
echo "# Sentries"
echo "pct migrate 1500 r630-01 --restart"
echo "pct migrate 1501 r630-01 --restart"
echo "pct migrate 1502 r630-01 --restart"
echo ""
echo "# RPC Core"
echo "pct migrate 2101 r630-01 --restart"
echo ""
# Migrate to r630-02
echo "=== Migrating to r630-02 ==="
echo "# Validators"
echo "pct migrate 1003 r630-02 --restart"
echo "pct migrate 1004 r630-02 --restart"
echo ""
echo "# Sentries"
echo "pct migrate 1503 r630-02 --restart"
echo "pct migrate 1504 r630-02 --restart"
echo ""
echo "# RPC Nodes"
echo "pct migrate 2201 r630-02 --restart"
echo "pct migrate 2303 r630-02 --restart"
echo "pct migrate 2401 r630-02 --restart"
echo ""
echo "=== Migration Complete ==="
echo "Monitor the migrations and verify containers are running on target nodes"
MIGRATION_EOF
  chmod +x "$plan_file"

  log_info "Migration script created: $plan_file"
  log_warn "Review the migration script before executing migrations."
  log_info "To execute migrations, run: bash $plan_file"

  # Show the script content
  tee -a "$ACTION_LOG" < "$plan_file"
  echo "" | tee -a "$ACTION_LOG"
  log_success "Migration commands prepared."
}
# Main execution
#######################################
# Run every action in order and print the closing summary.
# Globals:   ACTION_LOG (appended)
# Arguments: none used
# Returns:   0 when every action succeeded, 1 when at least one failed
#
# The actions are independent of each other (different nodes, or purely
# local), so a failure in one is recorded and the remaining ones still run.
# Calling them as bare commands would let `set -e` stop the script at the
# first unreachable node, skipping the rest and the summary.
# Note that running an action inside `if` suspends `set -e` within it.
#######################################
main() {
  log_header "Performing Immediate Actions"
  echo "Log file: $ACTION_LOG" | tee -a "$ACTION_LOG"
  echo "Timestamp: $(date)" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  local -a failed_actions=()
  local action
  for action in \
    activate_r630_02_storage \
    investigate_thin2_detail \
    create_migration_plan \
    perform_migrations; do
    if ! "$action"; then
      failed_actions+=("$action")
      log_error "Action failed: $action (continuing with remaining actions)"
    fi
  done

  log_header "Immediate Actions Performance Complete"
  log_info "Full log saved to: $ACTION_LOG"
  log_info ""
  log_info "Next Steps:"
  log_info "1. Review storage.cfg to understand why data/thin1 are inactive on r630-02"
  log_info "2. Investigate what's using thin2 storage (may be orphaned volumes)"
  log_info "3. Review migration plan and execute migrations when ready"
  log_info "4. Monitor system after migrations"
  echo ""

  if (( ${#failed_actions[@]} > 0 )); then
    log_error "Completed with failed actions: ${failed_actions[*]}"
    return 1
  fi
  log_success "All immediate action investigations complete!"
}
# Entry point: invoke main with the script's command-line arguments.
main "$@"