329 lines
12 KiB
Bash
329 lines
12 KiB
Bash
|
|
#!/usr/bin/env bash
#
# Perform Immediate Actions - Actually Execute the Fixes
#
# Investigates and prepares (but does not blindly execute) three fixes on a
# Proxmox cluster:
#
# 1. Activate storage pools with underlying LVM
# 2. Investigate and clean thin2 capacity
# 3. Migrate CPU-intensive workloads
#
# All output is mirrored to a timestamped log under reports/status/.

# Fail fast: -e exits on unhandled errors, -u errors on unset variables,
# pipefail makes a pipeline fail if any stage fails.
set -euo pipefail
|
|
# --- Paths and log destination ------------------------------------------
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
REPORT_DIR="${PROJECT_ROOT}/reports/status"
TIMESTAMP=$(date +%Y%m%d_%H%M%S)
ACTION_LOG="${REPORT_DIR}/perform_actions_${TIMESTAMP}.log"

# --- ANSI color escapes (expanded by echo -e at print time) -------------
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
NC='\033[0m'

# Shared sink for all log helpers: expand escapes to stdout and append the
# same line to the action log.
_emit() { echo -e "$1" | tee -a "$ACTION_LOG"; }

log_info()    { _emit "${BLUE}[INFO]${NC} $1"; }
log_success() { _emit "${GREEN}[✓]${NC} $1"; }
log_warn()    { _emit "${YELLOW}[⚠]${NC} $1"; }
log_error()   { _emit "${RED}[✗]${NC} $1"; }
log_header()  { _emit "${CYAN}=== $1 ===${NC}"; }
log_section() { _emit "\n${MAGENTA}>>> $1 <<<${NC}\n"; }

# Make sure the log directory exists before the first tee.
mkdir -p "$REPORT_DIR"
|
|
# Proxmox nodes configuration: NODES[<hostname>]="<ip>:<root password>".
#
# SECURITY: root passwords were previously hardcoded here. They can now be
# supplied via the environment (ML110_PASSWORD, R630_01_PASSWORD,
# R630_02_PASSWORD); the literal values remain only as backward-compatible
# fallbacks and should be rotated, then removed from source.
declare -A NODES
NODES[ml110]="192.168.11.10:${ML110_PASSWORD:-L@kers2010}"
NODES[r630-01]="192.168.11.11:${R630_01_PASSWORD:-password}"
NODES[r630-02]="192.168.11.12:${R630_02_PASSWORD:-password}"
|
|
#######################################
# Run a command on a Proxmox node over SSH as root.
# Globals:   NODES (read) - map of hostname -> "ip:password"
# Arguments: $1 - node hostname (key into NODES); remaining arguments are
#            passed through to ssh as the remote command
# Outputs:   whatever the remote command prints
# Returns:   ssh's exit status
#######################################
ssh_node() {
  local hostname="$1"
  shift
  local ip="${NODES[$hostname]%%:*}"
  local password="${NODES[$hostname]#*:}"

  if command -v sshpass >/dev/null 2>&1; then
    # Pass the password via the SSHPASS environment variable (sshpass -e)
    # instead of -p so it is not visible to other users in `ps` output.
    SSHPASS="$password" sshpass -e ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$ip" "$@"
  else
    # No sshpass installed: fall back to key-based/interactive auth.
    ssh -o StrictHostKeyChecking=no -o ConnectTimeout=10 root@"$ip" "$@"
  fi
}
||
|
|
# Return success iff the node answers a single ICMP echo within 2 seconds.
# $1 is a hostname key into NODES; only the IP part of its entry is used.
check_node() {
  local node="$1"
  local addr
  addr="${NODES[$node]%%:*}"
  ping -c 1 -W 2 "$addr" >/dev/null 2>&1
}
||
|
|
# Action 1: Activate storage pools on r630-02
#
# Inspects the LVM state and Proxmox storage configuration on r630-02 to
# work out why the data/thin1 pools are inactive. This is investigation
# only — it does not modify the node.
# Returns: 0 on completion, 1 if the node is unreachable.
activate_r630_02_storage() {
  log_section "Activating Storage Pools on r630-02"

  local hostname="r630-02"
  if ! check_node "$hostname"; then
    log_error "$hostname is not reachable"
    return 1
  fi

  # Check if thin1 VG exists and has a thin pool
  log_info "Checking LVM setup for data and thin1 storage..."

  # Declaration split from assignment so a failed ssh is not silently
  # masked by `local` (ShellCheck SC2155). The report is best-effort, so
  # explicitly keep going with whatever output was captured.
  local lvm_info
  lvm_info=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Volume Groups ==="
vgs 2>/dev/null || echo "No VGs"
echo ""
echo "=== Logical Volumes ==="
lvs 2>/dev/null | head -20 || echo "No LVs"
echo ""
echo "=== Storage Configuration ==="
grep -A 10 -E "(data|thin1)" /etc/pve/storage.cfg 2>/dev/null | head -30 || echo "Cannot read storage.cfg"
ENDSSH
  ) || true

  echo "$lvm_info" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  # Try to scan for storage
  log_info "Scanning for storage pools..."
  local scan_result
  scan_result=$(ssh_node "$hostname" "pvesm scan lvmthin 2>&1" || echo "")
  log_info "Storage scan result:"
  echo "$scan_result" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  # Check storage.cfg to see node restrictions
  log_info "Checking storage.cfg for node restrictions..."
  local storage_cfg
  storage_cfg=$(ssh_node "$hostname" "grep -B 5 -A 10 -E '^(lvmthin|dir).*data|thin1' /etc/pve/storage.cfg 2>/dev/null | head -50" || echo "")
  echo "$storage_cfg" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  log_warn "Storage pools may be configured for other nodes. Review storage.cfg above."
  log_success "Storage activation investigation complete."
}
||
|
|
# Action 2: Investigate thin2 usage in detail
#
# Identifies the VG backing the thin2 storage, lists its logical volumes
# and thin-pool fill levels, and reports every VM/CT whose disks reference
# thin2, plus candidate orphaned volumes. Investigation only.
# Returns: 0 on completion, 1 if the node is unreachable.
investigate_thin2_detail() {
  log_section "Detailed Investigation of thin2 Usage"

  local hostname="r630-02"
  if ! check_node "$hostname"; then
    log_error "$hostname is not reachable"
    return 1
  fi

  log_info "Getting detailed LVM information for thin2..."

  # Declaration split from assignment so ssh failures are not masked by
  # `local` (ShellCheck SC2155); best-effort, so continue on failure.
  local thin2_detail
  thin2_detail=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== thin2 Volume Group and Logical Volumes ==="
# Find which VG thin2 uses
vg_name=$(pvesh get /nodes/r630-02/storage/thin2 2>/dev/null | grep -oP 'vgname.*?:\s*\K[^,}]+' | head -1 || echo "")
if [ -n "$vg_name" ]; then
  echo "Volume Group: $vg_name"
  echo ""
  echo "Logical Volumes in $vg_name:"
  lvs "$vg_name" 2>/dev/null || echo "Cannot list LVs"
  echo ""
  echo "Thin Pool Details:"
  lvs -o +data_percent,metadata_percent "$vg_name" 2>/dev/null | grep -E "thin|LV|pool" || echo "No thin pools found"
else
  echo "Could not determine VG name for thin2"
fi

echo ""
echo "=== All Logical Volumes on thin2 storage ==="
# Check all VMs/containers that might reference thin2.
# Read matching config lines with `read` so each *line* (not each
# whitespace-separated word) is reported once.
for vmid in $(qm list 2>/dev/null | tail -n +2 | awk '{print $1}'); do
  qm config "$vmid" 2>/dev/null | grep -E "^(scsi|virtio|ide|sata)[0-9]+:" | grep "thin2" | while IFS= read -r disk; do
    echo "VM $vmid uses thin2: $disk"
  done
done

for vmid in $(pct list 2>/dev/null | tail -n +2 | awk '{print $1}'); do
  rootfs=$(pct config "$vmid" 2>/dev/null | grep "^rootfs:" | grep "thin2" || true)
  if [ -n "$rootfs" ]; then
    echo "CT $vmid uses thin2: $rootfs"
  fi
done

echo ""
echo "=== Checking for orphaned volumes ==="
# Look for volumes that might not be attached to VMs
lvs 2>/dev/null | grep -E "vm-|vzdump" | head -20 || echo "No orphaned volumes found"
ENDSSH
  ) || true

  echo "$thin2_detail" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  log_success "thin2 detailed investigation complete."
}
|
||
|
|
# Action 3: Create migration plan for CPU-intensive workloads
#
# Prints a migration plan for moving besu containers off ml110 and then
# dumps the live config of each migration-candidate container.
# NOTE(review): the CPU/RAM figures in the plan below are hardcoded
# snapshots, not live measurements — confirm against current data before
# acting on them.
# Returns: 0 on completion, 1 if ml110 is unreachable.
create_migration_plan() {
  log_section "Creating Migration Plan for CPU-Intensive Workloads"

  local hostname="ml110"
  if ! check_node "$hostname"; then
    log_error "$hostname is not reachable"
    return 1
  fi

  log_info "Analyzing workloads and creating migration plan..."

  # Get detailed container info with CPU usage.
  # Declaration split from assignment so ssh failures are not masked by
  # `local` (ShellCheck SC2155); best-effort, so continue on failure.
  local migration_plan
  migration_plan=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Migration Plan ==="
echo ""
echo "High CPU Usage Containers (>80% CPU):"
echo " - besu-validator-4 (95.2% CPU, 4 cores, 8GB RAM)"
echo " - besu-sentry-4 (96.8% CPU, 2 cores, 4GB RAM)"
echo " - besu-sentry-ali (94.1% CPU, 2 cores, 4GB RAM)"
echo " - besu-rpc-ali-0x8a (93.3% CPU, 4 cores, 16GB RAM)"
echo " - besu-rpc-thirdweb-0x8a-1 (94.1% CPU, 4 cores, 16GB RAM)"
echo ""
echo "Medium CPU Usage Containers (50-80% CPU):"
echo " - besu-validator-1 (83.9% CPU, 4 cores, 8GB RAM)"
echo " - besu-validator-2 (74.6% CPU, 4 cores, 8GB RAM)"
echo " - besu-sentry-1 (90.3% CPU, 2 cores, 4GB RAM)"
echo " - besu-sentry-2 (70.0% CPU, 2 cores, 4GB RAM)"
echo " - besu-sentry-3 (91.2% CPU, 2 cores, 4GB RAM)"
echo " - besu-rpc-core-1 (72.6% CPU, 4 cores, 16GB RAM)"
echo " - besu-rpc-public-1 (80.0% CPU, 4 cores, 16GB RAM)"
echo ""
echo "=== Recommended Migration Strategy ==="
echo ""
echo "Migrate to r630-01 (32 cores, 3.4% CPU usage, 486GB available):"
echo " - besu-validator-1,2,3 (12 cores total, 24GB RAM)"
echo " - besu-sentry-1,2,3 (6 cores total, 12GB RAM)"
echo " - besu-rpc-core-1 (4 cores, 16GB RAM)"
echo " Total: 22 cores, 52GB RAM"
echo ""
echo "Migrate to r630-02 (56 cores, 5.3% CPU usage, 238GB available):"
echo " - besu-validator-4,5 (8 cores total, 16GB RAM)"
echo " - besu-sentry-4,ali (4 cores total, 8GB RAM)"
echo " - besu-rpc-public-1 (4 cores, 16GB RAM)"
echo " - besu-rpc-ali-0x8a (4 cores, 16GB RAM)"
echo " - besu-rpc-thirdweb-0x8a-1 (4 cores, 16GB RAM)"
echo " Total: 24 cores, 72GB RAM"
echo ""
echo "Keep on ml110 (lightweight):"
echo " - besu-validator-5 (if not migrated)"
echo " - besu-rpc-ali-0x1, luis, putu (lower CPU usage)"
echo " - thirdweb-rpc-1 and other thirdweb nodes"
ENDSSH
  ) || true

  echo "$migration_plan" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  # Get container details for migration
  log_info "Getting container details for migration..."

  local container_details
  container_details=$(ssh_node "$hostname" bash <<'ENDSSH'
echo "=== Container Details for Migration ==="
for vmid in 1000 1001 1002 1003 1004 1500 1501 1502 1503 1504 2101 2201 2303 2401; do
  if pct status "$vmid" 2>/dev/null | grep -q "status: running"; then
    echo ""
    echo "Container $vmid:"
    pct config "$vmid" 2>/dev/null | grep -E "^(hostname|cores|memory|rootfs|net0):" | head -5
    storage=$(pct config "$vmid" 2>/dev/null | grep "^rootfs:" | grep -o "storage=[^,]*" | cut -d= -f2 || echo "unknown")
    echo " Storage: $storage"
  fi
done
ENDSSH
  ) || true

  echo "$container_details" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  log_success "Migration plan created."
}
||
|
|
# Action 4: Perform actual migrations (with confirmation)
#
# Does NOT migrate anything itself: it writes a reviewable script of
# `pct migrate` commands to the report directory, shows it, and tells the
# operator how to run it. The generated script also only *echoes* the
# migrate commands, so even running it is non-destructive as written.
# Returns: 0 on completion.
perform_migrations() {
  log_section "Performing Workload Migrations"

  log_warn "Migrations require careful planning. Showing migration commands..."

  # Create migration script (quoted heredoc: content is written literally).
  local migration_script="${REPORT_DIR}/migration_commands_${TIMESTAMP}.sh"

  cat > "$migration_script" <<'MIGRATION_EOF'
#!/bin/bash
# Migration Commands for CPU-Intensive Workloads
# Review and execute manually or with confirmation

# Migrate to r630-01
echo "=== Migrating to r630-01 ==="
echo "# Validators"
echo "pct migrate 1000 r630-01 --restart"
echo "pct migrate 1001 r630-01 --restart"
echo "pct migrate 1002 r630-01 --restart"
echo ""
echo "# Sentries"
echo "pct migrate 1500 r630-01 --restart"
echo "pct migrate 1501 r630-01 --restart"
echo "pct migrate 1502 r630-01 --restart"
echo ""
echo "# RPC Core"
echo "pct migrate 2101 r630-01 --restart"
echo ""

# Migrate to r630-02
echo "=== Migrating to r630-02 ==="
echo "# Validators"
echo "pct migrate 1003 r630-02 --restart"
echo "pct migrate 1004 r630-02 --restart"
echo ""
echo "# Sentries"
echo "pct migrate 1503 r630-02 --restart"
echo "pct migrate 1504 r630-02 --restart"
echo ""
echo "# RPC Nodes"
echo "pct migrate 2201 r630-02 --restart"
echo "pct migrate 2303 r630-02 --restart"
echo "pct migrate 2401 r630-02 --restart"
echo ""

echo "=== Migration Complete ==="
echo "Monitor the migrations and verify containers are running on target nodes"
MIGRATION_EOF

  chmod +x "$migration_script"

  log_info "Migration script created: $migration_script"
  log_warn "Review the migration script before executing migrations."
  log_info "To execute migrations, run: bash $migration_script"

  # Show the script content (redirect instead of a useless `cat | tee`).
  tee -a "$ACTION_LOG" < "$migration_script"
  echo "" | tee -a "$ACTION_LOG"

  log_success "Migration commands prepared."
}
|
||
|
|
# Main execution: header, the four immediate actions in order, then a
# summary with next steps. Under `set -e`, a failing action (e.g. an
# unreachable node) aborts the run.
main() {
  log_header "Performing Immediate Actions"
  echo "Log file: $ACTION_LOG" | tee -a "$ACTION_LOG"
  echo "Timestamp: $(date)" | tee -a "$ACTION_LOG"
  echo "" | tee -a "$ACTION_LOG"

  activate_r630_02_storage
  investigate_thin2_detail
  create_migration_plan
  perform_migrations

  log_header "Immediate Actions Performance Complete"
  log_info "Full log saved to: $ACTION_LOG"
  log_info ""
  log_info "Next Steps:"
  local step
  for step in \
    "1. Review storage.cfg to understand why data/thin1 are inactive on r630-02" \
    "2. Investigate what's using thin2 storage (may be orphaned volumes)" \
    "3. Review migration plan and execute migrations when ready" \
    "4. Monitor system after migrations"; do
    log_info "$step"
  done
  echo ""
  log_success "All immediate action investigations complete!"
}

main "$@"