- Update dbis_core, cross-chain-pmm-lps, explorer-monorepo, metamask-integration, pr-workspace/chains - Omit embedded publish git dirs and empty placeholders from index Made-with: Cursor
235 lines
9.4 KiB
Bash
Executable File
235 lines
9.4 KiB
Bash
Executable File
#!/usr/bin/env bash
# Comprehensive Blockchain Health Monitoring Script
# Monitors block production, transaction inclusion, and node health

# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# Resolve this script's own directory, then the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Load IP configuration.
# Best effort on purpose: a missing or failing config file is ignored (errors
# are silenced and the status is forced to success) so the script can continue
# with values taken from the environment or from its own defaults.
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Endpoint, account and host settings. Each keeps a value that is already set
# (environment or sourced config) and otherwise falls back to the literal
# default shown here.
RPC_CORE_1="${RPC_CORE_1:-192.168.11.211}"
RPC_URL="${RPC_URL:-http://${RPC_CORE_1}:8545}"
DEPLOYER="${DEPLOYER:-0x4A666F96fC8764181194447A7dFdb7d471b301C8}"
PROXMOX_ML110="${PROXMOX_ML110:-${PROXMOX_HOST_ML110:-192.168.11.10}}"
PROXMOX_R630="${PROXMOX_R630:-${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}}"
R630_03="${PROXMOX_R630_03:-${PROXMOX_HOST_R630_03:-192.168.11.13}}"
# Seconds to wait between the two block-height samples.
BLOCK_SAMPLE_SEC="${BLOCK_PRODUCTION_SAMPLE_SEC:-12}"

# Proxmox shell SSH must be root@host — not API-style root@pam from .env (see ip-addresses.conf / clear-all-transaction-pools.sh).
PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}"
# A user containing "@" (e.g. root@pam) is not a valid SSH login name: fall back to root.
if [[ "$PROXMOX_SSH_USER" == *"@"* ]]; then
  PROXMOX_SSH_USER="root"
fi
PVE_SSH_USER="$PROXMOX_SSH_USER"

# Colors
# ANSI escape sequences kept as literal backslash text (single-quoted); they
# only become real escape bytes when printed with escape expansion enabled.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # reset / "no colour"

# log_info MESSAGE
# Prints MESSAGE on stdout behind a "[INFO]" tag wrapped in $BLUE / $NC.
# Only the tag goes through %b (escape expansion, needed for the colour codes);
# MESSAGE is printed verbatim with %s so backslashes in it are not reinterpreted.
log_info() { printf '%b %s\n' "${BLUE}[INFO]${NC}" "$1"; }
# log_success MESSAGE
# Prints MESSAGE on stdout behind a "[✓]" tag wrapped in $GREEN / $NC.
# Only the tag goes through %b (escape expansion, needed for the colour codes);
# MESSAGE is printed verbatim with %s so backslashes in it are not reinterpreted.
log_success() { printf '%b %s\n' "${GREEN}[✓]${NC}" "$1"; }
# log_warn MESSAGE
# Prints MESSAGE on stdout behind a "[⚠]" tag wrapped in $YELLOW / $NC.
# Only the tag goes through %b (escape expansion, needed for the colour codes);
# MESSAGE is printed verbatim with %s so backslashes in it (for example in
# text captured from a remote command) are not reinterpreted.
log_warn() { printf '%b %s\n' "${YELLOW}[⚠]${NC}" "$1"; }
# log_error MESSAGE
# Prints MESSAGE on stdout behind a "[✗]" tag wrapped in $RED / $NC.
# Only the tag goes through %b (escape expansion, needed for the colour codes);
# MESSAGE is printed verbatim with %s so backslashes in it are not reinterpreted.
log_error() { printf '%b %s\n' "${RED}[✗]${NC}" "$1"; }
# log_section TITLE
# Prints a section banner on stdout: a blank line, a horizontal rule, TITLE,
# the same rule again, and a trailing blank line, all coloured with $CYAN / $NC.
# The colour codes are expanded with %b; TITLE is printed verbatim with %s so
# backslashes in it are not reinterpreted.
log_section() {
  local rule="${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  printf '\n%b\n%b%s%b\n%b\n\n' "$rule" "$CYAN" "$1" "$NC" "$rule"
}

# Report header: title, local wall-clock time of this run, then a blank line.
printf '%s\n' "=== Blockchain Health Monitor ==="
printf 'Timestamp: %s\n' "$(date '+%Y-%m-%d %H:%M:%S')"
printf '\n'

# Check RPC connectivity
log_section "RPC Node Status"
# A single time-bounded chain-id call is both the reachability probe and the
# value that gets reported, so no second, unbounded call can hang the monitor.
if CHAIN_ID=$(timeout 5 cast chain-id --rpc-url "$RPC_URL" 2>/dev/null); then
  # A failed height read must not abort the script silently under `set -e`;
  # it is reported as a warning and shown as block 0.
  BLOCK_NUM=$(timeout 5 cast block-number --rpc-url "$RPC_URL" 2>/dev/null) || BLOCK_NUM=""
  BLOCK_DEC=0
  if [[ -n "$BLOCK_NUM" ]]; then
    BLOCK_DEC=$(cast --to-dec "$BLOCK_NUM" 2>/dev/null || echo "0")
  fi
  log_success "RPC accessible"
  echo " Chain ID: $CHAIN_ID"
  echo " Latest block: $BLOCK_DEC ($BLOCK_NUM)"
  if [[ -z "$BLOCK_NUM" ]]; then
    log_warn "Could not read latest block number from RPC"
  fi
else
  # Nothing else in this script can work without the RPC endpoint.
  log_error "RPC not accessible"
  exit 1
fi

# Check block production (longer window reduces false positives after restarts / ~2s block time)
log_section "Block Production"
# Sample the chain height twice, BLOCK_SAMPLE_SEC seconds apart. A failed read
# is kept as an empty value instead of aborting the script under `set -e`.
BLOCK1=$(cast block-number --rpc-url "$RPC_URL" 2>/dev/null) || BLOCK1=""
sleep "$BLOCK_SAMPLE_SEC"
BLOCK2=$(cast block-number --rpc-url "$RPC_URL" 2>/dev/null) || BLOCK2=""

BLOCK1_DEC=0
BLOCK2_DEC=0
if [[ -n "$BLOCK1" ]]; then
  BLOCK1_DEC=$(cast --to-dec "$BLOCK1" 2>/dev/null || echo "0")
fi
if [[ -n "$BLOCK2" ]]; then
  BLOCK2_DEC=$(cast --to-dec "$BLOCK2" 2>/dev/null || echo "0")
fi
if [[ -z "$BLOCK1" || -z "$BLOCK2" ]]; then
  log_warn "Could not read block height from RPC during the sample window"
fi
BLOCK_DIFF=$((BLOCK2_DEC - BLOCK1_DEC))

if [ "$BLOCK_DIFF" -gt 0 ]; then
  log_success "Blocks being produced ($BLOCK_DIFF blocks in ${BLOCK_SAMPLE_SEC}s)"
else
  log_error "Block production stalled (no new blocks in ${BLOCK_SAMPLE_SEC}s)"
  # If validators are all active, they may still be syncing (QBFT does not produce until sync completes)
  # Count matching lines in the last 30 journal lines of the besu-validator
  # unit in container 1000. BatchMode=yes keeps ssh from blocking on a password
  # prompt; any failure leaves the count at 0 ("|| true" is deliberate).
  SYNC_HINT=$(ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no "${PVE_SSH_USER}@${PROXMOX_R630}" \
    "pct exec 1000 -- journalctl -u besu-validator --no-pager -n 30 2>/dev/null" 2>/dev/null \
    | grep -c "Full sync\|initial sync in progress\|QBFT mining coordinator not starting" || true)
  if [ "${SYNC_HINT:-0}" -gt 0 ]; then
    echo " → Validators may be syncing; block production will resume when sync completes (see docs/06-besu/CRITICAL_ISSUE_BLOCK_PRODUCTION_STOPPED.md)."
  fi
fi

# Check transaction inclusion
log_section "Transaction Inclusion"
# Walk back from the second height sample (BLOCK2_DEC) over at most six blocks
# and add up their transaction counts.
TX_COUNT_TOTAL=0
EMPTY_BLOCKS=0
SAMPLED_BLOCKS=0
for i in 0 1 2 3 4 5; do
  BLOCK_NUM=$((BLOCK2_DEC - i))
  # On a chain shorter than six blocks the walk would go below genesis; a
  # negative number would be formatted by printf %x as a huge hex value.
  if [ "$BLOCK_NUM" -lt 0 ]; then
    break
  fi
  SAMPLED_BLOCKS=$((SAMPLED_BLOCKS + 1))
  BLOCK_HEX=$(printf '0x%x' "$BLOCK_NUM")
  # A failed RPC call is recorded as an empty answer instead of aborting the
  # whole script (the pipeline status would otherwise trip `set -e`).
  TX_COUNT=$(cast rpc eth_getBlockTransactionCountByNumber "$BLOCK_HEX" --rpc-url "$RPC_URL" 2>/dev/null | tr -d '"') || TX_COUNT=""
  TX_COUNT_DEC=0
  if [ -n "$TX_COUNT" ]; then
    TX_COUNT_DEC=$(cast --to-dec "$TX_COUNT" 2>/dev/null || echo "0")
  fi
  TX_COUNT_TOTAL=$((TX_COUNT_TOTAL + TX_COUNT_DEC))
  if [ "$TX_COUNT_DEC" -eq 0 ]; then
    EMPTY_BLOCKS=$((EMPTY_BLOCKS + 1))
  fi
done

# SAMPLED_BLOCKS is 6 except on a chain shorter than six blocks.
if [ "$TX_COUNT_TOTAL" -gt 0 ]; then
  log_success "Transactions being included ($TX_COUNT_TOTAL txs in last $SAMPLED_BLOCKS blocks)"
elif [ "$BLOCK_DIFF" -gt 0 ]; then
  log_info "Last $SAMPLED_BLOCKS sampled blocks had no user txs ($EMPTY_BLOCKS empty); chain is advancing (normal on quiet periods)"
else
  log_warn "No transactions in last $SAMPLED_BLOCKS blocks ($EMPTY_BLOCKS empty blocks) and no height gain in sample window"
fi

# Check pending transactions
log_section "Pending Transactions"
# eth_getTransactionCount yields the account's next nonce: at "latest" it
# counts mined transactions, at "pending" it also counts queued ones. The
# difference is the number of DEPLOYER transactions still waiting.
# A failed call is kept as an empty value instead of aborting under `set -e`.
LATEST_HEX=$(cast rpc eth_getTransactionCount "$DEPLOYER" latest --rpc-url "$RPC_URL" 2>/dev/null | tr -d '"') || LATEST_HEX=""
PENDING_HEX=$(cast rpc eth_getTransactionCount "$DEPLOYER" pending --rpc-url "$RPC_URL" 2>/dev/null | tr -d '"') || PENDING_HEX=""
LATEST_DEC=0
PENDING_DEC=0
PENDING_COUNT=0

if [ -z "$LATEST_HEX" ] || [ -z "$PENDING_HEX" ]; then
  # Comparing one real value against a defaulted 0 would invent a backlog.
  log_warn "Could not read deployer nonce from RPC; pending transaction check skipped"
else
  LATEST_DEC=$(cast --to-dec "$LATEST_HEX" 2>/dev/null || echo "0")
  PENDING_DEC=$(cast --to-dec "$PENDING_HEX" 2>/dev/null || echo "0")
  PENDING_COUNT=$((PENDING_DEC - LATEST_DEC))
  # The two reads are not atomic; never report a negative backlog.
  if [ "$PENDING_COUNT" -lt 0 ]; then
    PENDING_COUNT=0
  fi

  if [ "$PENDING_COUNT" -eq 0 ]; then
    log_success "No pending transactions"
  else
    # Nonces are zero-based: with LATEST_DEC mined, the waiting ones run from
    # LATEST_DEC up to PENDING_DEC - 1.
    log_warn "$PENDING_COUNT pending transactions (nonces $LATEST_DEC-$((PENDING_DEC - 1)))"
  fi
fi

# Check validator status (1000–1002 on r630-01; 1003–1004 on r630-03)
log_section "Validator Status"
# Each entry is "<container id>:<Proxmox host that runs it>".
VALIDATORS=(
  "1000:$PROXMOX_R630"
  "1001:$PROXMOX_R630"
  "1002:$PROXMOX_R630"
  "1003:$R630_03"
  "1004:$R630_03"
)
EXPECTED_VALIDATORS=${#VALIDATORS[@]}

# Probe SSH to both hosts once, non-interactively (BatchMode), so the
# per-validator checks can skip hosts that are unreachable.
SKIP_VALIDATOR_SSH=false
SSH_R630_01_OK=0
SSH_R630_03_OK=0
if ssh -o ConnectTimeout=4 -o BatchMode=yes -o StrictHostKeyChecking=no "${PVE_SSH_USER}@${PROXMOX_R630}" "true" 2>/dev/null; then
  SSH_R630_01_OK=1
fi
if ssh -o ConnectTimeout=4 -o BatchMode=yes -o StrictHostKeyChecking=no "${PVE_SSH_USER}@${R630_03}" "true" 2>/dev/null; then
  SSH_R630_03_OK=1
fi
if [[ "$SSH_R630_01_OK" -eq 0 && "$SSH_R630_03_OK" -eq 0 ]]; then
  log_warn "Proxmox SSH unavailable for validator hosts — skipping validator CT checks (run from LAN)"
  SKIP_VALIDATOR_SSH=true
fi

# host_ssh_ok HOST
# Returns 0 when HOST equals $PROXMOX_R630 and SSH_R630_01_OK is 1, or HOST
# equals $R630_03 and SSH_R630_03_OK is 1; returns 1 for anything else.
# Globals read: PROXMOX_R630, R630_03, SSH_R630_01_OK, SSH_R630_03_OK.
# The two checks run in sequence (not either/or), so a HOST matching both
# addresses passes if either flag is set.
host_ssh_ok() {
  local h="$1"
  if [[ "$h" == "$PROXMOX_R630" && "$SSH_R630_01_OK" -eq 1 ]]; then
    return 0
  fi
  if [[ "$h" == "$R630_03" && "$SSH_R630_03_OK" -eq 1 ]]; then
    return 0
  fi
  return 1
}

# Count validators whose besu-validator systemd unit reports "active".
# ACTIVE_COUNT      - validators found active
# REACHABLE_EXPECTED - validators whose Proxmox host answered the SSH probe
ACTIVE_COUNT=0
REACHABLE_EXPECTED=0
if [[ "$SKIP_VALIDATOR_SSH" == true ]]; then
  # Nothing could be checked: both counters are set to the expected total.
  ACTIVE_COUNT=$EXPECTED_VALIDATORS
  REACHABLE_EXPECTED=$EXPECTED_VALIDATORS
else
  for validator in "${VALIDATORS[@]}"; do
    IFS=':' read -r VMID HOST <<< "$validator"
    if ! host_ssh_ok "$HOST"; then
      log_warn "Validator $VMID: skipped (SSH to $HOST unavailable)"
      continue
    fi
    REACHABLE_EXPECTED=$((REACHABLE_EXPECTED + 1))
    SSH_TARGET="${PVE_SSH_USER}@${HOST}"
    # Inside the container, try both unit spellings and print "active" for the
    # first one systemctl reports as active, otherwise "inactive". The last
    # output line (carriage returns stripped) is the status. BatchMode=yes
    # matches the host probes and keeps ssh from waiting on a password prompt.
    STATUS=$(ssh -o ConnectTimeout=5 -o BatchMode=yes -o StrictHostKeyChecking=no "$SSH_TARGET" \
      "pct exec $VMID -- bash -c 'for u in besu-validator besu-validator.service; do s=\$(systemctl is-active \"\$u\" 2>/dev/null || true); [ \"\$s\" = active ] && { echo active; exit 0; }; done; echo inactive'" 2>/dev/null | tr -d '\r' | tail -1) \
      || STATUS=""
    # No output at all (ssh or pct failed) is reported as "unknown".
    STATUS="${STATUS:-unknown}"

    if [ "$STATUS" = "active" ]; then
      ACTIVE_COUNT=$((ACTIVE_COUNT + 1))
      echo " Validator $VMID: $STATUS"
    else
      log_warn "Validator $VMID: $STATUS"
    fi
  done
fi

# Section verdict; stays silent when the checks were skipped entirely.
if [[ "$SKIP_VALIDATOR_SSH" == true ]]; then
  :
elif [[ "$REACHABLE_EXPECTED" -eq 0 ]]; then
  log_warn "No validator hosts reachable for systemd checks"
elif [ "$ACTIVE_COUNT" -eq "$REACHABLE_EXPECTED" ]; then
  log_success "All $ACTIVE_COUNT/$REACHABLE_EXPECTED reachable validators active"
else
  log_error "Only $ACTIVE_COUNT/$REACHABLE_EXPECTED reachable validators active"
fi

# Check peer connections
log_section "Peer Connections"
# admin_peers is expected to return a JSON array; its length is the peer count.
# Any other JSON type yields no output, and a failed call yields an empty value.
PEER_COUNT=$(cast rpc admin_peers --rpc-url "$RPC_URL" 2>/dev/null \
  | jq -r 'if type == "array" then length else empty end' 2>/dev/null) || PEER_COUNT=""
# Only a plain non-negative integer may reach the numeric comparison below;
# everything else (empty, multi-line, error text) is reported as N/A.
if [[ ! "$PEER_COUNT" =~ ^[0-9]+$ ]]; then
  PEER_COUNT="N/A"
fi
if [ "$PEER_COUNT" != "N/A" ] && [ "$PEER_COUNT" -ge 5 ]; then
  log_success "RPC has $PEER_COUNT peer connections"
else
  log_warn "RPC has $PEER_COUNT peer connections (expected >= 5)"
fi

# Summary
# Re-evaluates the values gathered by the sections above, counts problems in
# ISSUES and exits 0 when there are none, 1 otherwise.
log_section "Health Summary"
ISSUES=0

# Any non-positive height delta counts as stalled, the same rule the Block
# Production section applies (it only reports success for a delta above 0).
if [ "$BLOCK_DIFF" -le 0 ]; then
  log_error "❌ Block production stalled"
  ISSUES=$((ISSUES + 1))
  # Hint when validators are syncing (all active but no new blocks yet)
  if [ "$ACTIVE_COUNT" -eq "$REACHABLE_EXPECTED" ] && [[ "$REACHABLE_EXPECTED" -gt 0 ]]; then
    echo " (If validators recently restarted: they are likely in full sync; blocks will resume when sync completes.)"
  fi
else
  log_success "✓ Block production active"
fi

# A backlog with no transactions in the sampled blocks means nothing is being included.
if [ "$TX_COUNT_TOTAL" -eq 0 ] && [ "$PENDING_COUNT" -gt 0 ]; then
  log_error "❌ Transactions not being included"
  ISSUES=$((ISSUES + 1))
elif [ "$TX_COUNT_TOTAL" -gt 0 ]; then
  log_success "✓ Transactions being included"
elif [ "$BLOCK_DIFF" -gt 0 ]; then
  log_success "✓ No tx backlog signal (quiet blocks while height advances)"
fi

# Validators: skipped checks only warn; with no reachable validator nothing is printed.
if [[ "$SKIP_VALIDATOR_SSH" == true ]]; then
  log_warn "⚠ Validator systemd checks skipped (no SSH)"
elif [[ "$REACHABLE_EXPECTED" -gt 0 ]]; then
  if [ "$ACTIVE_COUNT" -lt "$REACHABLE_EXPECTED" ]; then
    log_error "❌ Not all reachable validators active"
    ISSUES=$((ISSUES + 1))
  else
    log_success "✓ All reachable validators active"
  fi
fi

# More than 10 waiting transactions is counted as an issue of its own.
if [ "$PENDING_COUNT" -gt 10 ]; then
  log_warn "⚠ High number of pending transactions ($PENDING_COUNT)"
  ISSUES=$((ISSUES + 1))
fi

echo ""
if [ "$ISSUES" -eq 0 ]; then
  log_success "Overall Status: HEALTHY"
  exit 0
else
  log_error "Overall Status: $ISSUES issue(s) detected"
  exit 1
fi