#!/usr/bin/env bash
#
# Perform Immediate Actions
#
# Collects the information needed for three follow-up actions on the Proxmox
# cluster and writes everything to a timestamped log under reports/status/:
#   1. Inspect storage pools / underlying LVM on r630-02
#   2. Investigate thin2 capacity usage on r630-02
#   3. Print a migration plan for CPU-intensive workloads on ml110 and write a
#      helper script that lists the corresponding `pct migrate` commands
#
# Every remote command issued here is an inspection command (vgs, lvs,
# pvesm scan, qm/pct list|config|status, grep/awk on storage.cfg). The only
# files written are the local log and the local migration command listing.
#
# Optional environment overrides for the node root passwords:
#   PVE_PASS_ML110, PVE_PASS_R630_01, PVE_PASS_R630_02

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPORT_DIR="${PROJECT_ROOT}/reports/status"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
ACTION_LOG="${REPORT_DIR}/perform_actions_${TIMESTAMP}.log"
readonly SCRIPT_DIR PROJECT_ROOT REPORT_DIR TIMESTAMP ACTION_LOG

# Colors (escape sequences are expanded by printf's %b at print time)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly CYAN='\033[0;36m'
readonly MAGENTA='\033[0;35m'
readonly NC='\033[0m'

# Print "<colour><tag><reset> <message>" to stdout and append it to the log.
#   $1 - colour escape, $2 - tag text, $3 - message
_log() {
    printf '%b%s%b %s\n' "$1" "$2" "$NC" "$3" | tee -a "$ACTION_LOG"
}

log_info()    { _log "$BLUE" "[INFO]" "${1-}"; }
log_success() { _log "$GREEN" "[✓]" "${1-}"; }
log_warn()    { _log "$YELLOW" "[⚠]" "${1-}"; }
log_error()   { _log "$RED" "[✗]" "${1-}"; }
log_header()  { printf '%b=== %s ===%b\n' "$CYAN" "${1-}" "$NC" | tee -a "$ACTION_LOG"; }
log_section() { printf '\n%b>>> %s <<<%b\n\n' "$MAGENTA" "${1-}" "$NC" | tee -a "$ACTION_LOG"; }

# Print a block of captured output followed by one blank line, to stdout and
# to the log.
#   $1 - text to print verbatim
log_block() {
    printf '%s\n\n' "${1-}" | tee -a "$ACTION_LOG"
}

mkdir -p "$REPORT_DIR"

# Proxmox nodes configuration: NODES[<name>]="<ip>:<root password>"
# NOTE(security): the fallback passwords below are hard-coded in the source.
# Prefer exporting PVE_PASS_* (or using SSH keys) and remove the literals.
declare -A NODES
NODES[ml110]="192.168.11.10:${PVE_PASS_ML110:-L@kers2010}"
NODES[r630-01]="192.168.11.11:${PVE_PASS_R630_01:-password}"
NODES[r630-02]="192.168.11.12:${PVE_PASS_R630_02:-password}"

# Run a command on a node as root over SSH.
#   $1   - node name (key of NODES)
#   $2.. - command/arguments handed to ssh; stdin is passed through, so a
#          script can be fed to a remote `bash` via a here-document
# Returns the exit status of ssh (i.e. of the remote command, or 255 on
# connection failure).
ssh_node() {
    local hostname="$1"
    shift
    local entry="${NODES[$hostname]}"
    local ip="${entry%%:*}"
    # Strip only up to the FIRST colon so passwords may themselves contain ':'.
    local password="${entry#*:}"
    # NOTE(security): StrictHostKeyChecking=no disables host key verification.
    local -a ssh_opts=(-o StrictHostKeyChecking=no -o ConnectTimeout=10)

    if command -v sshpass >/dev/null 2>&1; then
        # -e reads the password from $SSHPASS instead of argv, so it does not
        # show up in the process list the way `sshpass -p <password>` does.
        SSHPASS="$password" sshpass -e ssh "${ssh_opts[@]}" "root@${ip}" "$@"
    else
        ssh "${ssh_opts[@]}" "root@${ip}" "$@"
    fi
}

# Return 0 if the node answers a single ping within 2 seconds.
#   $1 - node name (key of NODES)
check_node() {
    local hostname="$1"
    local ip="${NODES[$hostname]%%:*}"
    ping -c 1 -W 2 "$ip" >/dev/null 2>&1
}

# Action 1: inspect storage pools on r630-02.
# Prints VGs, LVs, the data/thin1 parts of storage.cfg and the thin pools
# Proxmox can see in each VG. Returns 1 if the node does not answer ping.
activate_r630_02_storage() {
    log_section "Activating Storage Pools on r630-02"

    local hostname="r630-02"
    if ! check_node "$hostname"; then
        log_error "$hostname is not reachable"
        return 1
    fi

    # Check if thin1 VG exists and has a thin pool
    log_info "Checking LVM setup for data and thin1 storage..."
    local lvm_info
    lvm_info=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Volume Groups ==="
vgs 2>/dev/null || echo "No VGs"
echo ""
echo "=== Logical Volumes ==="
# Test lvs itself: in "lvs | head || echo" the fallback is tied to head's
# exit status and therefore never runs.
if lv_list=$(lvs 2>/dev/null); then
    printf '%s\n' "$lv_list" | head -20
else
    echo "No LVs"
fi
echo ""
echo "=== Storage Configuration ==="
if [ -r /etc/pve/storage.cfg ]; then
    grep -A 10 -E "(data|thin1)" /etc/pve/storage.cfg | head -30
else
    echo "Cannot read storage.cfg"
fi
ENDSSH
    ) || log_warn "LVM query on $hostname exited with an error; output may be incomplete"
    log_block "$lvm_info"

    # Try to scan for storage
    log_info "Scanning for storage pools..."
    local scan_result
    scan_result=$(ssh_node "$hostname" bash <<'ENDSSH'
# `pvesm scan lvmthin` needs a volume group argument, so scan each VG in turn.
vg_list=$(vgs --noheadings -o vg_name 2>/dev/null)
if [ -z "$vg_list" ]; then
    echo "No volume groups found"
fi
# VG names cannot contain whitespace, so plain word splitting is safe here.
for vg in $vg_list; do
    echo "--- Thin pools in VG $vg ---"
    pvesm scan lvmthin "$vg" 2>&1
done
ENDSSH
    ) || log_warn "Storage scan on $hostname exited with an error; output may be incomplete"
    log_info "Storage scan result:"
    log_block "$scan_result"

    # Check storage.cfg to see node restrictions
    log_info "Checking storage.cfg for node restrictions..."
    local storage_cfg
    # The alternation is grouped so that only lvmthin/dir section headers
    # mentioning data or thin1 are matched (ungrouped, "|thin1" matched any line).
    storage_cfg=$(ssh_node "$hostname" \
        "grep -B 5 -A 10 -E '^(lvmthin|dir):.*(data|thin1)' /etc/pve/storage.cfg 2>/dev/null | head -50") \
        || log_warn "Could not read storage.cfg from $hostname"
    log_block "$storage_cfg"

    log_warn "Storage pools may be configured for other nodes. Review storage.cfg above."
    log_success "Storage activation investigation complete."
}

# Action 2: investigate thin2 usage in detail on r630-02.
# Prints the VG backing thin2, its LVs and thin pool fill levels, the guests
# whose disks reference thin2, and vm-*/vzdump LVs. Returns 1 if the node does
# not answer ping.
investigate_thin2_detail() {
    log_section "Detailed Investigation of thin2 Usage"

    local hostname="r630-02"
    if ! check_node "$hostname"; then
        log_error "$hostname is not reachable"
        return 1
    fi

    log_info "Getting detailed LVM information for thin2..."
    local thin2_detail
    thin2_detail=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== thin2 Volume Group and Logical Volumes ==="
# Find which VG thin2 uses: locate the "<type>: thin2" section in storage.cfg
# and read its vgname property.
vg_name=$(awk '
    /^[^[:space:]#]+:[[:space:]]/ { in_thin2 = ($2 == "thin2") }
    in_thin2 && $1 == "vgname"    { print $2; exit }
' /etc/pve/storage.cfg 2>/dev/null)

if [ -n "$vg_name" ]; then
    echo "Volume Group: $vg_name"
    echo ""
    echo "Logical Volumes in $vg_name:"
    lvs "$vg_name" 2>/dev/null || echo "Cannot list LVs"
    echo ""
    echo "Thin Pool Details:"
    lvs -o +data_percent,metadata_percent "$vg_name" 2>/dev/null \
        | grep -E "thin|LV|pool" || echo "No thin pools found"
else
    echo "Could not determine VG name for thin2"
fi

echo ""
echo "=== All Logical Volumes on thin2 storage ==="
# Check all VMs/containers that might reference thin2
for vmid in $(qm list 2>/dev/null | awk 'NR > 1 {print $1}'); do
    # Read whole lines: a config line contains a space after the key, so
    # word splitting would report each disk as two separate entries.
    qm config "$vmid" 2>/dev/null \
        | grep -E "^(scsi|virtio|ide|sata)[0-9]+:" \
        | grep "thin2" \
        | while IFS= read -r disk; do
              echo "VM $vmid uses thin2: $disk"
          done
done

for vmid in $(pct list 2>/dev/null | awk 'NR > 1 {print $1}'); do
    rootfs=$(pct config "$vmid" 2>/dev/null | grep "^rootfs:" | grep "thin2" || true)
    if [ -n "$rootfs" ]; then
        echo "CT $vmid uses thin2: $rootfs"
    fi
done

echo ""
echo "=== Checking for orphaned volumes ==="
# Lists guest/backup LVs as orphan candidates; they still have to be compared
# by hand against the attachments printed above.
candidates=$(lvs 2>/dev/null | grep -E "vm-|vzdump" | head -20)
if [ -n "$candidates" ]; then
    printf '%s\n' "$candidates"
else
    echo "No orphaned volumes found"
fi
ENDSSH
    ) || log_warn "thin2 investigation on $hostname exited with an error; output may be incomplete"
    log_block "$thin2_detail"

    log_success "thin2 detailed investigation complete."
}

# Action 3: print the migration plan and the current configuration of the
# containers named in it. Returns 1 if ml110 does not answer ping.
create_migration_plan() {
    log_section "Creating Migration Plan for CPU-Intensive Workloads"

    local hostname="ml110"
    if ! check_node "$hostname"; then
        log_error "$hostname is not reachable"
        return 1
    fi

    log_info "Analyzing workloads and creating migration plan..."

    # The plan text is static (nothing in it is measured at run time), so it
    # is produced locally instead of being echoed back over SSH.
    # NOTE(review): some entries listed under "Medium (50-80% CPU)" show
    # values above 80% — confirm the intended grouping.
    local migration_plan
    migration_plan=$(cat <<'PLAN'
=== Migration Plan ===

High CPU Usage Containers (>80% CPU):
 - besu-validator-4 (95.2% CPU, 4 cores, 8GB RAM)
 - besu-sentry-4 (96.8% CPU, 2 cores, 4GB RAM)
 - besu-sentry-ali (94.1% CPU, 2 cores, 4GB RAM)
 - besu-rpc-ali-0x8a (93.3% CPU, 4 cores, 16GB RAM)
 - besu-rpc-thirdweb-0x8a-1 (94.1% CPU, 4 cores, 16GB RAM)

Medium CPU Usage Containers (50-80% CPU):
 - besu-validator-1 (83.9% CPU, 4 cores, 8GB RAM)
 - besu-validator-2 (74.6% CPU, 4 cores, 8GB RAM)
 - besu-sentry-1 (90.3% CPU, 2 cores, 4GB RAM)
 - besu-sentry-2 (70.0% CPU, 2 cores, 4GB RAM)
 - besu-sentry-3 (91.2% CPU, 2 cores, 4GB RAM)
 - besu-rpc-core-1 (72.6% CPU, 4 cores, 16GB RAM)
 - besu-rpc-public-1 (80.0% CPU, 4 cores, 16GB RAM)

=== Recommended Migration Strategy ===

Migrate to r630-01 (32 cores, 3.4% CPU usage, 486GB available):
 - besu-validator-1,2,3 (12 cores total, 24GB RAM)
 - besu-sentry-1,2,3 (6 cores total, 12GB RAM)
 - besu-rpc-core-1 (4 cores, 16GB RAM)
 Total: 22 cores, 52GB RAM

Migrate to r630-02 (56 cores, 5.3% CPU usage, 238GB available):
 - besu-validator-4,5 (8 cores total, 16GB RAM)
 - besu-sentry-4,ali (4 cores total, 8GB RAM)
 - besu-rpc-public-1 (4 cores, 16GB RAM)
 - besu-rpc-ali-0x8a (4 cores, 16GB RAM)
 - besu-rpc-thirdweb-0x8a-1 (4 cores, 16GB RAM)
 Total: 24 cores, 72GB RAM

Keep on ml110 (lightweight):
 - besu-validator-5 (if not migrated)
 - besu-rpc-ali-0x1, luis, putu (lower CPU usage)
 - thirdweb-rpc-1 and other thirdweb nodes
PLAN
    )
    log_block "$migration_plan"

    # Get container details for migration
    log_info "Getting container details for migration..."
    local container_details
    container_details=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Container Details for Migration ==="
for vmid in 1000 1001 1002 1003 1004 1500 1501 1502 1503 1504 2101 2201 2303 2401; do
    if pct status "$vmid" 2>/dev/null | grep -q "status: running"; then
        echo ""
        echo "Container $vmid:"
        cfg=$(pct config "$vmid" 2>/dev/null)
        printf '%s\n' "$cfg" | grep -E "^(hostname|cores|memory|rootfs|net0):" | head -5
        # rootfs is "<storage>:<volume>,size=..." (optionally "volume=" first);
        # the storage ID is the text before the first colon of the volume.
        storage=$(printf '%s\n' "$cfg" \
            | sed -n 's/^rootfs:[[:space:]]*\(volume=\)\{0,1\}\([^:,]*\):.*/\2/p' \
            | head -1)
        echo " Storage: ${storage:-unknown}"
    fi
done
ENDSSH
    ) || log_warn "Container query on $hostname exited with an error; output may be incomplete"
    log_block "$container_details"

    log_success "Migration plan created."
}

# Action 4: write a helper script listing the `pct migrate` commands.
# Nothing is migrated here: the generated file only PRINTS the commands so
# they can be reviewed and run by hand. Returns 1 if the file cannot be
# written.
perform_migrations() {
    log_section "Performing Workload Migrations"

    log_warn "Migrations require careful planning. Showing migration commands..."

    # Create migration script
    local migration_script="${REPORT_DIR}/migration_commands_${TIMESTAMP}.sh"

    if ! cat > "$migration_script" <<'MIGRATION_EOF'
#!/bin/bash
# Migration Commands for CPU-Intensive Workloads
# Review and execute manually or with confirmation
# This file only prints the commands below; it does not run them.

# Migrate to r630-01
echo "=== Migrating to r630-01 ==="
echo "# Validators"
echo "pct migrate 1000 r630-01 --restart"
echo "pct migrate 1001 r630-01 --restart"
echo "pct migrate 1002 r630-01 --restart"
echo ""
echo "# Sentries"
echo "pct migrate 1500 r630-01 --restart"
echo "pct migrate 1501 r630-01 --restart"
echo "pct migrate 1502 r630-01 --restart"
echo ""
echo "# RPC Core"
echo "pct migrate 2101 r630-01 --restart"
echo ""

# Migrate to r630-02
echo "=== Migrating to r630-02 ==="
echo "# Validators"
echo "pct migrate 1003 r630-02 --restart"
echo "pct migrate 1004 r630-02 --restart"
echo ""
echo "# Sentries"
echo "pct migrate 1503 r630-02 --restart"
echo "pct migrate 1504 r630-02 --restart"
echo ""
echo "# RPC Nodes"
echo "pct migrate 2201 r630-02 --restart"
echo "pct migrate 2303 r630-02 --restart"
echo "pct migrate 2401 r630-02 --restart"
echo ""
echo "=== End of Migration Commands ==="
echo "After running them, monitor the migrations and verify containers are running on target nodes"
MIGRATION_EOF
    then
        log_error "Could not write migration script: $migration_script"
        return 1
    fi

    chmod +x "$migration_script"

    log_info "Migration script created: $migration_script"
    log_warn "Review the migration script before executing migrations."
    log_info "To list the migration commands, run: bash $migration_script"
    log_info "Run the printed 'pct migrate' commands on the node currently hosting the containers."

    # Show the script content
    tee -a "$ACTION_LOG" < "$migration_script"
    echo "" | tee -a "$ACTION_LOG"

    log_success "Migration commands prepared."
}

# Main execution: run every step, even if an earlier one fails (the steps
# target different nodes and do not depend on each other's output), then
# print the follow-up checklist. Returns 1 if any step failed.
main() {
    log_header "Performing Immediate Actions"
    echo "Log file: $ACTION_LOG" | tee -a "$ACTION_LOG"
    echo "Timestamp: $(date)" | tee -a "$ACTION_LOG"
    echo "" | tee -a "$ACTION_LOG"

    local step
    local failed=0
    for step in \
        activate_r630_02_storage \
        investigate_thin2_detail \
        create_migration_plan \
        perform_migrations; do
        if ! "$step"; then
            log_warn "Step '$step' did not complete; continuing with remaining steps"
            failed=$((failed + 1))
        fi
    done

    log_header "Immediate Actions Performance Complete"
    log_info "Full log saved to: $ACTION_LOG"
    log_info ""
    log_info "Next Steps:"
    log_info "1. Review storage.cfg to understand why data/thin1 are inactive on r630-02"
    log_info "2. Investigate what's using thin2 storage (may be orphaned volumes)"
    log_info "3. Review migration plan and execute migrations when ready"
    log_info "4. Monitor system after migrations"
    echo ""

    if (( failed > 0 )); then
        log_warn "$failed step(s) did not complete; review the log above."
        return 1
    fi
    log_success "All immediate action investigations complete!"
}

main "$@"