Files
proxmox/scripts/monitoring/monitor-blockchain-health.sh
defiQUG dbd517b279 Sync workspace: config, docs, scripts, CI, operator rules, and submodule pointers.
- Update dbis_core, cross-chain-pmm-lps, explorer-monorepo, metamask-integration, pr-workspace/chains
- Omit embedded publish git dirs and empty placeholders from index

Made-with: Cursor
2026-04-12 06:12:20 -07:00

235 lines
9.4 KiB
Bash
Executable File
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Comprehensive Blockchain Health Monitoring Script
# Monitors block production, transaction inclusion, and node health
#
# Settings are resolved in this order: values already present in the
# environment or set by the sourced config file win; the literals below are
# only fallbacks.
set -euo pipefail

# Load IP configuration (two directories above this script). A missing or
# unreadable file is tolerated on purpose: the defaults below still apply.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# RPC endpoint and the account whose nonces are inspected later.
: "${RPC_CORE_1:=192.168.11.211}"
: "${RPC_URL:=http://${RPC_CORE_1}:8545}"
: "${DEPLOYER:=0x4A666F96fC8764181194447A7dFdb7d471b301C8}"

# Proxmox hosts; each accepts several alternative variable names.
: "${PROXMOX_ML110:=${PROXMOX_HOST_ML110:-192.168.11.10}}"
: "${PROXMOX_R630:=${PROXMOX_R630_01:-${PROXMOX_HOST_R630_01:-192.168.11.11}}}"
R630_03="${PROXMOX_R630_03:-${PROXMOX_HOST_R630_03:-192.168.11.13}}"

# Seconds to wait between the two block-height samples.
BLOCK_SAMPLE_SEC="${BLOCK_PRODUCTION_SAMPLE_SEC:-12}"

# Proxmox shell SSH must be root@host — not API-style root@pam from .env (see ip-addresses.conf / clear-all-transaction-pools.sh).
# Any user value containing '@' is therefore replaced by plain "root".
case "${PROXMOX_SSH_USER:-root}" in
  *@*) PROXMOX_SSH_USER="root" ;;
  *)   PROXMOX_SSH_USER="${PROXMOX_SSH_USER:-root}" ;;
esac
PVE_SSH_USER="$PROXMOX_SSH_USER"
# Colors (kept as literal backslash sequences; `echo -e` expands them on output)
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Print one tagged line to stdout.
#   $1 - color escape, $2 - tag text (e.g. "[INFO]"), $3 - message
# The message goes through `echo -e`, so backslash escapes in it are expanded.
_log_tagged() {
  local color="$1" tag="$2" message="$3"
  echo -e "${color}${tag}${NC} ${message}"
}

# Severity helpers: each takes the message as $1 and writes to stdout.
log_info() {
  _log_tagged "$BLUE" "[INFO]" "$1"
}

log_success() {
  _log_tagged "$GREEN" "[✓]" "$1"
}

log_warn() {
  _log_tagged "$YELLOW" "[⚠]" "$1"
}

log_error() {
  _log_tagged "$RED" "[✗]" "$1"
}

# Print a section heading ($1) framed by two rules, with a blank line before
# the top rule and after the bottom rule.
log_section() {
  local rule="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
  echo -e "\n${CYAN}${rule}${NC}"
  echo -e "${CYAN}$1${NC}"
  echo -e "${CYAN}${rule}${NC}\n"
}
echo "=== Blockchain Health Monitor ==="
echo "Timestamp: $(date '+%Y-%m-%d %H:%M:%S')"
echo ""

# Check RPC connectivity
# The chain id is fetched once, bounded by a 5s timeout, and the answer from
# that same probe is what gets printed. The run stops here with exit 1 when
# the endpoint does not answer, since every later check needs it.
log_section "RPC Node Status"
if CHAIN_ID=$(timeout 5 cast chain-id --rpc-url "$RPC_URL" 2>/dev/null); then
  # Guarded and time-bounded: under `set -e` a bare failing substitution would
  # end the script without any message, and an unbounded call could hang.
  BLOCK_NUM=$(timeout 5 cast block-number --rpc-url "$RPC_URL" 2>/dev/null) || BLOCK_NUM=""
  # Falls back to 0 when the height is missing or not convertible.
  BLOCK_DEC=$(cast --to-dec "$BLOCK_NUM" 2>/dev/null || echo "0")
  log_success "RPC accessible"
  echo " Chain ID: $CHAIN_ID"
  echo " Latest block: $BLOCK_DEC ($BLOCK_NUM)"
else
  log_error "RPC not accessible"
  exit 1
fi
# Check block production (longer window reduces false positives after restarts / ~2s block time)
# Samples the height twice, BLOCK_SAMPLE_SEC seconds apart, and publishes:
#   BLOCK2_DEC - second sample (0 when unreadable)
#   BLOCK_DIFF - height gained between samples, never negative
log_section "Block Production"
# Each read is guarded so that an RPC error is reported as a stall below
# instead of ending the script silently through `set -e`.
BLOCK1=$(cast block-number --rpc-url "$RPC_URL" 2>/dev/null) || BLOCK1=""
sleep "$BLOCK_SAMPLE_SEC"
BLOCK2=$(cast block-number --rpc-url "$RPC_URL" 2>/dev/null) || BLOCK2=""
BLOCK1_DEC=$(cast --to-dec "$BLOCK1" 2>/dev/null || echo "0")
BLOCK2_DEC=$(cast --to-dec "$BLOCK2" 2>/dev/null || echo "0")
BLOCK_DIFF=$((BLOCK2_DEC - BLOCK1_DEC))
# A negative delta (second sample unreadable, or the height went backwards)
# means no progress. Clamp it so later `-eq 0` checks agree with this one.
if [ "$BLOCK_DIFF" -lt 0 ]; then
  BLOCK_DIFF=0
fi
if [ "$BLOCK_DIFF" -gt 0 ]; then
  log_success "Blocks being produced ($BLOCK_DIFF blocks in ${BLOCK_SAMPLE_SEC}s)"
else
  log_error "Block production stalled (no new blocks in ${BLOCK_SAMPLE_SEC}s)"
  # If validators are all active, they may still be syncing (QBFT does not produce until sync completes)
  # BatchMode=yes keeps ssh from waiting on a password prompt when key auth is
  # unavailable; the trailing `|| true` makes this hint strictly best-effort.
  SYNC_HINT=$(ssh -o ConnectTimeout=3 -o BatchMode=yes -o StrictHostKeyChecking=no "${PVE_SSH_USER}@${PROXMOX_R630}" \
    "pct exec 1000 -- journalctl -u besu-validator --no-pager -n 30 2>/dev/null" 2>/dev/null \
    | grep -c "Full sync\|initial sync in progress\|QBFT mining coordinator not starting" || true)
  if [ "${SYNC_HINT:-0}" -gt 0 ]; then
    echo " → Validators may be syncing; block production will resume when sync completes (see docs/06-besu/CRITICAL_ISSUE_BLOCK_PRODUCTION_STOPPED.md)."
  fi
fi
# Check transaction inclusion
# Sums the transaction counts of the six heights ending at BLOCK2_DEC and
# publishes TX_COUNT_TOTAL and EMPTY_BLOCKS for the summary.
log_section "Transaction Inclusion"
TX_COUNT_TOTAL=0
EMPTY_BLOCKS=0
for i in 0 1 2 3 4 5; do
  BLOCK_NUM=$((BLOCK2_DEC - i))
  # Heights below zero do not exist (chain shorter than the window, or the
  # height sample fell back to 0), so there is nothing to query.
  if [ "$BLOCK_NUM" -lt 0 ]; then
    continue
  fi
  BLOCK_HEX=$(printf '0x%x' "$BLOCK_NUM")
  # Guarded: with `pipefail` a failing `cast` fails the whole pipeline, and
  # under `set -e` the bare assignment would end the script without a message.
  TX_COUNT=$(cast rpc eth_getBlockTransactionCountByNumber "$BLOCK_HEX" --rpc-url "$RPC_URL" 2>/dev/null | tr -d '"') || TX_COUNT=""
  # An unreadable or non-numeric count is treated as 0 (counted as empty).
  TX_COUNT_DEC=$(cast --to-dec "$TX_COUNT" 2>/dev/null || echo "0")
  TX_COUNT_TOTAL=$((TX_COUNT_TOTAL + TX_COUNT_DEC))
  if [ "$TX_COUNT_DEC" -eq 0 ]; then
    EMPTY_BLOCKS=$((EMPTY_BLOCKS + 1))
  fi
done
if [ "$TX_COUNT_TOTAL" -gt 0 ]; then
  log_success "Transactions being included ($TX_COUNT_TOTAL txs in last 6 blocks)"
elif [ "$BLOCK_DIFF" -gt 0 ]; then
  log_info "Last 6 sampled blocks had no user txs ($EMPTY_BLOCKS empty); chain is advancing (normal on quiet periods)"
else
  log_warn "No transactions in last 6 blocks ($EMPTY_BLOCKS empty blocks) and no height gain in sample window"
fi
# Check pending transactions
# Compares the DEPLOYER account's transaction count at the `latest` and
# `pending` block tags and publishes PENDING_COUNT (never negative) for the
# summary.
log_section "Pending Transactions"
# Guarded: with `pipefail` a failing `cast` fails the pipeline, and under
# `set -e` the bare assignment would end the script without a message.
LATEST_HEX=$(cast rpc eth_getTransactionCount "$DEPLOYER" latest --rpc-url "$RPC_URL" 2>/dev/null | tr -d '"') || LATEST_HEX=""
PENDING_HEX=$(cast rpc eth_getTransactionCount "$DEPLOYER" pending --rpc-url "$RPC_URL" 2>/dev/null | tr -d '"') || PENDING_HEX=""
LATEST_DEC=$(cast --to-dec "$LATEST_HEX" 2>/dev/null || echo "0")
PENDING_DEC=$(cast --to-dec "$PENDING_HEX" 2>/dev/null || echo "0")
PENDING_COUNT=$((PENDING_DEC - LATEST_DEC))
# A negative difference only arises from an unreadable or stale `pending`
# value; it is not a real backlog, so it is not reported as one.
if [ "$PENDING_COUNT" -lt 0 ]; then
  PENDING_COUNT=0
fi
if [ -z "$LATEST_HEX" ] || [ -z "$PENDING_HEX" ]; then
  log_warn "Could not read transaction counts for $DEPLOYER"
elif [ "$PENDING_COUNT" -eq 0 ]; then
  log_success "No pending transactions"
else
  # The transaction count is the next unused nonce, so the in-flight nonces
  # run from the `latest` count through the `pending` count minus one.
  log_warn "$PENDING_COUNT pending transactions (nonces ${LATEST_DEC}-$((PENDING_DEC - 1)))"
fi
# Check validator status (1000-1002 on r630-01; 1003-1004 on r630-03)
log_section "Validator Status"

# Validator containers as "VMID:proxmox-host" pairs.
VALIDATORS=(
  "1000:${PROXMOX_R630}"
  "1001:${PROXMOX_R630}"
  "1002:${PROXMOX_R630}"
  "1003:${R630_03}"
  "1004:${R630_03}"
)
EXPECTED_VALIDATORS="${#VALIDATORS[@]}"

# Probe each Proxmox host once (non-interactive, 4s connect timeout) and
# remember which ones accept SSH; 1 means reachable.
SKIP_VALIDATOR_SSH=false
SSH_R630_01_OK=0
SSH_R630_03_OK=0
if ssh -o ConnectTimeout=4 -o BatchMode=yes -o StrictHostKeyChecking=no "${PVE_SSH_USER}@${PROXMOX_R630}" "true" 2>/dev/null; then
  SSH_R630_01_OK=1
fi
if ssh -o ConnectTimeout=4 -o BatchMode=yes -o StrictHostKeyChecking=no "${PVE_SSH_USER}@${R630_03}" "true" 2>/dev/null; then
  SSH_R630_03_OK=1
fi

# With neither host reachable the per-container checks are skipped entirely.
if (( SSH_R630_01_OK == 0 && SSH_R630_03_OK == 0 )); then
  log_warn "Proxmox SSH unavailable for validator hosts — skipping validator CT checks (run from LAN)"
  SKIP_VALIDATOR_SSH=true
fi
# Report whether the SSH probe for a validator host succeeded.
# Globals:   PROXMOX_R630, R630_03 (read) - the two known host addresses
#            SSH_R630_01_OK, SSH_R630_03_OK (read) - probe results, 1 = reachable
# Arguments: $1 - host address to look up
# Returns:   0 when $1 is one of the two hosts and its probe flag is 1,
#            1 otherwise (including hosts that are not known)
host_ssh_ok() {
  local target="$1"
  if [[ "$target" == "$PROXMOX_R630" && "$SSH_R630_01_OK" -eq 1 ]]; then
    return 0
  fi
  if [[ "$target" == "$R630_03" && "$SSH_R630_03_OK" -eq 1 ]]; then
    return 0
  fi
  return 1
}
# Tally validator service state.
#   REACHABLE_EXPECTED - validators whose host accepted the SSH probe
#   ACTIVE_COUNT       - of those, how many report an active besu-validator unit
# When SSH checks are skipped, both counters are set to the expected total.
ACTIVE_COUNT=0
REACHABLE_EXPECTED=0
if [[ "$SKIP_VALIDATOR_SSH" != true ]]; then
  for validator in "${VALIDATORS[@]}"; do
    IFS=':' read -r VMID HOST <<< "$validator"
    if host_ssh_ok "$HOST"; then
      REACHABLE_EXPECTED=$((REACHABLE_EXPECTED + 1))
    else
      log_warn "Validator $VMID: skipped (SSH to $HOST unavailable)"
      continue
    fi

    SSH_TARGET="${PVE_SSH_USER}@${HOST}"
    # The remote snippet tries both unit spellings and prints "active" or
    # "inactive"; the last output line is the verdict. Any failure of the
    # pipeline leaves STATUS empty, which is reported as "unknown".
    STATUS=$(ssh -o ConnectTimeout=5 -o StrictHostKeyChecking=no "$SSH_TARGET" \
      "pct exec $VMID -- bash -c 'for u in besu-validator besu-validator.service; do s=\$(systemctl is-active \"\$u\" 2>/dev/null || true); [ \"\$s\" = active ] && { echo active; exit 0; }; done; echo inactive'" 2>/dev/null | tr -d '\r' | tail -1) \
      || STATUS=""
    STATUS="${STATUS:-unknown}"

    case "$STATUS" in
      active)
        ACTIVE_COUNT=$((ACTIVE_COUNT + 1))
        echo " Validator $VMID: $STATUS"
        ;;
      *)
        log_warn "Validator $VMID: $STATUS"
        ;;
    esac
  done

  # Report the tally; nothing is printed here when the checks were skipped.
  if (( REACHABLE_EXPECTED == 0 )); then
    log_warn "No validator hosts reachable for systemd checks"
  elif (( ACTIVE_COUNT == REACHABLE_EXPECTED )); then
    log_success "All $ACTIVE_COUNT/$REACHABLE_EXPECTED reachable validators active"
  else
    log_error "Only $ACTIVE_COUNT/$REACHABLE_EXPECTED reachable validators active"
  fi
else
  ACTIVE_COUNT=$EXPECTED_VALIDATORS
  REACHABLE_EXPECTED=$EXPECTED_VALIDATORS
fi
# Check peer connections
# Counts the entries returned by the `admin_peers` RPC call and compares the
# count with MIN_PEER_COUNT (default 5, overridable from the environment).
log_section "Peer Connections"
MIN_PEER_COUNT="${MIN_PEER_COUNT:-5}"
PEER_COUNT=$(cast rpc admin_peers --rpc-url "$RPC_URL" 2>/dev/null | jq '. | length' 2>/dev/null || echo "N/A")
# Anything other than a plain non-negative integer (empty output, several
# lines, stray text) is treated as "N/A" so the numeric test below never
# receives a non-number.
[[ "$PEER_COUNT" =~ ^[0-9]+$ ]] || PEER_COUNT="N/A"
if [ "$PEER_COUNT" != "N/A" ] && [ "$PEER_COUNT" -ge "$MIN_PEER_COUNT" ]; then
  log_success "RPC has $PEER_COUNT peer connections"
else
  log_warn "RPC has $PEER_COUNT peer connections (expected >= $MIN_PEER_COUNT)"
fi
# Summary
# Re-evaluates the values gathered above (BLOCK_DIFF, TX_COUNT_TOTAL,
# PENDING_COUNT, ACTIVE_COUNT, REACHABLE_EXPECTED, SKIP_VALIDATOR_SSH), counts
# the problems in ISSUES, and exits 0 when there are none, 1 otherwise.
log_section "Health Summary"
ISSUES=0

# "No height gain" is any delta that is not positive. This mirrors the
# `-gt 0` test used when the samples were taken, so a negative delta is
# reported as a stall here too instead of as active.
if [ "$BLOCK_DIFF" -le 0 ]; then
  log_error "❌ Block production stalled"
  ISSUES=$((ISSUES + 1))
  # Hint when validators are syncing (all active but no new blocks yet)
  if [ "$ACTIVE_COUNT" -eq "$REACHABLE_EXPECTED" ] && [[ "$REACHABLE_EXPECTED" -gt 0 ]]; then
    echo " (If validators recently restarted: they are likely in full sync; blocks will resume when sync completes.)"
  fi
else
  log_success "✓ Block production active"
fi

# A backlog with no inclusions is an issue; quiet blocks on an advancing
# chain are not.
if [ "$TX_COUNT_TOTAL" -eq 0 ] && [ "$PENDING_COUNT" -gt 0 ]; then
  log_error "❌ Transactions not being included"
  ISSUES=$((ISSUES + 1))
elif [ "$TX_COUNT_TOTAL" -gt 0 ]; then
  log_success "✓ Transactions being included"
elif [ "$BLOCK_DIFF" -gt 0 ]; then
  log_success "✓ No tx backlog signal (quiet blocks while height advances)"
fi

# Skipped validator checks are only a warning and do not count as an issue.
if [[ "$SKIP_VALIDATOR_SSH" == true ]]; then
  log_warn "⚠ Validator systemd checks skipped (no SSH)"
elif [[ "$REACHABLE_EXPECTED" -gt 0 ]] && [ "$ACTIVE_COUNT" -lt "$REACHABLE_EXPECTED" ]; then
  log_error "❌ Not all reachable validators active"
  ISSUES=$((ISSUES + 1))
elif [[ "$REACHABLE_EXPECTED" -gt 0 ]]; then
  log_success "✓ All reachable validators active"
fi

if [ "$PENDING_COUNT" -gt 10 ]; then
  log_warn "⚠ High number of pending transactions ($PENDING_COUNT)"
  ISSUES=$((ISSUES + 1))
fi

echo ""
if [ "$ISSUES" -eq 0 ]; then
  log_success "Overall Status: HEALTHY"
  exit 0
else
  log_error "Overall Status: $ISSUES issue(s) detected"
  exit 1
fi