Files
proxmox/scripts/health/check-rpc-vms-health.sh
defiQUG b3a8fe4496
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
chore: sync all changes to Gitea
- Config, docs, scripts, and backup manifests
- Submodule refs unchanged (m = modified content in submodules)

Made-with: Cursor
2026-03-02 11:37:34 -08:00

93 lines
3.1 KiB
Bash

#!/usr/bin/env bash
#
# Health check limited to the RPC node VMs: container status, besu-rpc
# service state and the block reported over RPC.
# Every probe goes through SSH to the Proxmox hosts. Run from project root.
#
# Usage: ./scripts/health/check-rpc-vms-health.sh
set -euo pipefail

# Work from the project root, which sits two directories above this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
cd "$PROJECT_ROOT"

# Optional address overrides. Problems while loading the file are ignored on
# purpose, so the built-in defaults below still apply.
if [ -f config/ip-addresses.conf ]; then
  source config/ip-addresses.conf 2>/dev/null || true
fi

# Shell logins need a plain user name. PROXMOX_USER in .env may be an API
# identity such as root@pam, so anything containing "@" falls back to root.
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-${PROXMOX_USER:-root}}"
case "$PROXMOX_SSH_USER" in
  *@*) PROXMOX_SSH_USER="root" ;;
esac
PROXMOX_USER="$PROXMOX_SSH_USER"

# Proxmox hosts (defaults apply when the PROXMOX_HOST_* values are not set).
ML110="${PROXMOX_HOST_ML110:-192.168.11.10}"
R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"

# Node table, one "VMID:host:service" entry per RPC node
# (same mapping as review-sentry-and-rpc-nodes.sh).
RPC_NODES=()
for node_host in \
  "2101:${R630_01}" \
  "2201:${R630_02}" \
  "2301:${ML110}" \
  "2303:${R630_02}" \
  "2304:${ML110}" \
  "2305:${ML110}" \
  "2306:${ML110}" \
  "2307:${ML110}" \
  "2308:${ML110}" \
  "2400:${ML110}" \
  "2401:${R630_02}" \
  "2402:${ML110}" \
  "2403:${ML110}"; do
  RPC_NODES+=("${node_host}:besu-rpc")
done
unset node_host

# ANSI colour escapes (interpreted by the output commands further down).
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
CYAN='\033[0;36m'

# Options passed to every ssh invocation.
SSH_OPTS="-o ConnectTimeout=5 -o StrictHostKeyChecking=no"
# ---------------------------------------------------------------------------
# Health sweep over every entry of RPC_NODES ("vmid:host:service").
#
# Per node, in order:
#   1. `pct status <vmid>` on the Proxmox host must report "running".
#   2. `systemctl is-active <service>` inside the container must print
#      "active".
#   3. Informational only: query eth_blockNumber on http://<ip>:8545 and show
#      the height. This step never marks a node as failed.
#
# Exit status: number of nodes failing step 1 or 2 (0 = all healthy),
# capped at 255 so a large count cannot wrap around to 0.
# ---------------------------------------------------------------------------

# Heights below this value are flagged "(behind)". The default is the value
# that used to be hard-coded; override with RPC_MIN_BLOCK_HEIGHT if needed.
min_block_height="${RPC_MIN_BLOCK_HEIGHT:-2050000}"
if ! [[ "$min_block_height" =~ ^[0-9]+$ ]]; then
  min_block_height=2050000
fi

# SSH_OPTS is a whitespace-separated option string. Split it once into an
# array so each ssh call can pass the options fully quoted.
read -r -a ssh_opts <<< "$SSH_OPTS"

printf '%b=== RPC Node VMs Health ===%b\n' "$CYAN" "$NC"
printf '\n'

ok=0
fail=0
for entry in "${RPC_NODES[@]}"; do
  IFS=: read -r vmid host service <<< "$entry"
  ssh_target="${PROXMOX_USER}@${host}"

  # Step 1: container state. `pct status` prints "status: <state>"; awk keeps
  # the state. A failed ssh/pct call or empty output is reported as "unknown".
  if ! ct_status=$(ssh "${ssh_opts[@]}" "$ssh_target" "pct status $vmid 2>/dev/null" \
    | awk '{print $2}'); then
    ct_status="unknown"
  fi
  ct_status="${ct_status:-unknown}"
  if [[ "$ct_status" != "running" ]]; then
    printf ' VMID %s: container %b%s%b (host %s)\n' \
      "$vmid" "$RED" "$ct_status" "$NC" "$host"
    fail=$((fail + 1))
    continue
  fi

  # Step 2: service state. `systemctl is-active` exits non-zero for every
  # state other than "active" while still printing that state, so the exit
  # code is ignored and only missing output is mapped to "unknown".
  service_status=$(ssh "${ssh_opts[@]}" "$ssh_target" \
    "pct exec $vmid -- systemctl is-active $service 2>/dev/null") || true
  service_status="${service_status:-unknown}"
  if [[ "$service_status" != "active" ]]; then
    printf ' VMID %s: container running, %b%s %s%b (host %s)\n' \
      "$vmid" "$YELLOW" "$service" "$service_status" "$NC" "$host"
    fail=$((fail + 1))
    continue
  fi

  # Step 3: best-effort block height via JSON-RPC on the container's first IP.
  ip=$(ssh "${ssh_opts[@]}" "$ssh_target" \
    "pct exec $vmid -- hostname -I 2>/dev/null | awk '{print \$1}'" 2>/dev/null) || true
  block_info=""
  if [[ -n "$ip" ]]; then
    resp=$(curl -s -m 3 -X POST -H "Content-Type: application/json" \
      -d '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' \
      "http://$ip:8545" 2>/dev/null) || true
    if ! grep -q '"result"' <<< "$resp"; then
      block_info=" → ${YELLOW}RPC no response${NC}"
    else
      # A missing jq or malformed JSON must not abort the sweep under set -e.
      block_hex=$(jq -r '.result // empty' <<< "$resp" 2>/dev/null) || block_hex=""
      # Only a strict 0x-prefixed hex literal reaches arithmetic expansion:
      # the value comes from the network, and $(( )) would otherwise evaluate
      # arbitrary expressions. 15 digits keeps it inside a signed 64-bit int.
      if [[ "$block_hex" =~ ^0x[0-9a-fA-F]{1,15}$ ]]; then
        block_dec=$((block_hex))
        # 10# forces base 10 so a threshold with leading zeros is not octal.
        if ((block_dec < 10#$min_block_height)); then
          block_info=" → block ${YELLOW}$block_dec (behind)${NC}"
        else
          block_info=" → block ${GREEN}$block_dec${NC}"
        fi
      else
        block_info=" → ${YELLOW}RPC result unparseable${NC}"
      fi
    fi
  fi

  printf ' VMID %s: container running, %b%s active%b (%s)%b\n' \
    "$vmid" "$GREEN" "$service" "$NC" "$host" "$block_info"
  ok=$((ok + 1))
done

printf '\n'
printf '%bSummary: %b%s healthy%b, %b%s with issues%b (total %s RPC nodes)\n' \
  "$CYAN" "$GREEN" "$ok" "$NC" "$RED" "$fail" "$NC" "${#RPC_NODES[@]}"

# Exit statuses are taken modulo 256; cap the count so failures never read as 0.
exit "$((fail > 255 ? 255 : fail))"