From ee1625a79bd89c33018a78b0193e22d3b1998426 Mon Sep 17 00:00:00 2001
From: defiQUG
Date: Mon, 13 Apr 2026 21:41:35 -0700
Subject: [PATCH] Add surgical Besu validator operator helpers

---
 .../fix-block-production-staggered-restart.sh | 32 ++++----
 ...grate-besu-validator-r630-01-to-r630-04.sh | 65 +++++++++++++++++
 .../operator/restart-besu-validator-single.sh | 68 +++++++++++++++++
 .../revert-besu-cgroup-caps-r630-01.sh        | 73 +++++++++++++++++++
 scripts/verify/report-besu-host-cpu-load.sh   | 36 +++++++++
 5 files changed, 261 insertions(+), 13 deletions(-)
 create mode 100755 scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
 create mode 100755 scripts/operator/restart-besu-validator-single.sh
 create mode 100755 scripts/operator/revert-besu-cgroup-caps-r630-01.sh
 create mode 100755 scripts/verify/report-besu-host-cpu-load.sh

diff --git a/scripts/maintenance/fix-block-production-staggered-restart.sh b/scripts/maintenance/fix-block-production-staggered-restart.sh
index 4c47ce6d..b84b9002 100755
--- a/scripts/maintenance/fix-block-production-staggered-restart.sh
+++ b/scripts/maintenance/fix-block-production-staggered-restart.sh
@@ -5,13 +5,15 @@
 # the rest stay at head so the restarted node syncs quickly and consensus can continue.
 #
 # Usage: ./scripts/maintenance/fix-block-production-staggered-restart.sh [--dry-run]
-# Requires: SSH to Proxmox hosts (192.168.11.10 ML110, 192.168.11.11 R630-01, 192.168.11.12 R630-02)
+# Requires: SSH to Proxmox hosts; VMID→host from scripts/lib/load-project-env.sh get_host_for_vmid
+#   (live: 1000–1002 r630-01, 1003–1004 r630-03 — not ML110).
 
 set -euo pipefail
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
-source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
 
 DRY_RUN=false
 [[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true
@@ -25,15 +27,19 @@ log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
 log_ok() { echo -e "${GREEN}[✓]${NC} $1"; }
 log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
 
-# Order: restart one at a time; wait between so restarted node can sync from others
-# VMID : host
-VALIDATORS=(
-  "1004:${PROXMOX_HOST_ML110:-192.168.11.10}"
-  "1003:${PROXMOX_HOST_ML110:-192.168.11.10}"
-  "1002:${PROXMOX_HOST_R630_01:-192.168.11.11}"
-  "1001:${PROXMOX_HOST_R630_01:-192.168.11.11}"
-  "1000:${PROXMOX_HOST_R630_01:-192.168.11.11}"
-)
+# Order: off r630-03 first, then r630-01 (runbook: spread restarts; last is 1000 on .11)
+RESTART_ORDER=(1004 1003 1002 1001 1000)
+
+build_validators() {
+  VALIDATORS=()
+  local v h
+  for v in "${RESTART_ORDER[@]}"; do
+    h="$(get_host_for_vmid "$v")"
+    VALIDATORS+=("${v}:${h}")
+  done
+}
+
+build_validators
 
 WAIT_BETWEEN=90
 RPC="${RPC_URL_138:-http://192.168.11.211:8545}"
@@ -55,10 +61,10 @@ for entry in "${VALIDATORS[@]}"; do
   IFS=: read -r vmid host <<< "$entry"
   log_info "Restarting validator $vmid on $host..."
   if $DRY_RUN; then
-    echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator'"
+    echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator.service'"
   else
     # Allow up to 120s for restart (Besu stop/start can take 1-2 min)
-    if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator" 2>/dev/null; then
+    if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator.service" 2>/dev/null; then
       log_ok " $vmid restarted"
     else
       log_warn " $vmid restart timed out or failed (node may still be restarting)"
diff --git a/scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh b/scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
new file mode 100755
index 00000000..da724106
--- /dev/null
+++ b/scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+# Offload one Besu validator LXC from r630-01 to r630-04 to reduce *real* CPU contention on the
+# source host (same JVM work, fewer co-scheduled Besu processes per NUMA/socket).
+#
+# Default VMID 1001 (historically problematic when co-packed with 1000/1002); override with --vmid.
+# Does not change Besu *allocation* inside the guest — it spreads physical load across nodes.
+#
+# Usage:
+#   bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
+#   bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --apply
+#   bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --vmid 1002 --apply
+#
+# Requires: cluster membership, r630-04 online, storage (default local-lvm on target).
+# Optional: PROXMOX_OPS_ALLOWED_VMIDS, PROXMOX_OPS_APPLY (see proxmox-production-guard.sh).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
+
+SRC="${PROXMOX_HOST_R630_01:-192.168.11.11}"
+DST_NODE="${BESU_VALIDATOR_MIGRATE_TARGET_NODE:-r630-04}"
+STORE="${BESU_VALIDATOR_MIGRATE_TARGET_STORAGE:-local-lvm}"
+VMID="${BESU_VALIDATOR_MIGRATE_VMID:-1001}"
+SSH_OPTS=(-o ConnectTimeout=20 -o BatchMode=yes -o StrictHostKeyChecking=no)
+
+APPLY=false
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --apply) APPLY=true ;;
+    --vmid)
+      [[ $# -ge 2 ]] || { echo "--vmid requires a value" >&2; exit 2; }
+      VMID="$2"
+      shift 2
+      continue
+      ;;
+    -h|--help) sed -n '2,14p' "$0"; exit 0 ;; # header comment only (file lines 2-14)
+    *) echo "Unknown: $1" >&2; exit 2 ;;
+  esac
+  shift
+done
+
+# VMID is interpolated into a command executed as root on the source node; accept digits only.
+[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
+
+# PVE 9+: --target-storage (see migrate-ml110-besu-rpc-to-r630-02-03.sh). Older clusters may use --storage.
+CMD="pct migrate ${VMID} ${DST_NODE} --target-storage ${STORE} --restart 1"
+
+if ! pguard_require_apply_flag "$APPLY"; then
+  echo "[dry-run] From source node (run as root on node that currently owns the CT):"
+  echo " ssh root@${SRC} \"$CMD\""
+  echo ""
+  echo "After migrate, update get_host_for_vmid in scripts/lib/load-project-env.sh for VMID ${VMID}."
+  exit 0
+fi
+
+pguard_vmid_allowed "$VMID" || exit 1
+
+echo "[apply] ssh root@${SRC} \"$CMD\""
+ssh "${SSH_OPTS[@]}" "root@${SRC}" "$CMD"
+echo "[apply] Done. Update scripts/lib/load-project-env.sh get_host_for_vmid for ${VMID} -> ${DST_NODE} IP (${PROXMOX_HOST_R630_04:-192.168.11.14})."
diff --git a/scripts/operator/restart-besu-validator-single.sh b/scripts/operator/restart-besu-validator-single.sh
new file mode 100755
index 00000000..2ad2c7ee
--- /dev/null
+++ b/scripts/operator/restart-besu-validator-single.sh
@@ -0,0 +1,68 @@
+#!/usr/bin/env bash
+# Surgical: restart exactly one Besu validator LXC service (default VMID 1001 — stuck participant).
+# Resolves PVE host via get_host_for_vmid (scripts/lib/load-project-env.sh). No other CTs touched.
+#
+# Usage:
+#   bash scripts/operator/restart-besu-validator-single.sh --dry-run
+#   PROXMOX_OPS_APPLY=1 PROXMOX_OPS_ALLOWED_VMIDS=1001 bash scripts/operator/restart-besu-validator-single.sh --vmid 1001 --apply
+#
+# Requires: LAN SSH to Proxmox. Mutations require --apply or PROXMOX_OPS_APPLY=1 and (if set) allowlist.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
+
+VMID="${BESU_SURGICAL_RESTART_VMID:-1001}"
+APPLY=false
+DRY=false
+SSH_OPTS=(-o ConnectTimeout=15 -o BatchMode=yes -o StrictHostKeyChecking=no)
+
+# Print the header comment (file lines 2-9) as help text.
+usage() {
+  sed -n '2,9p' "$0"
+}
+
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --vmid) [[ $# -ge 2 ]] || { echo "--vmid requires a value" >&2; exit 2; }; VMID="$2"; shift 2 ;;
+    --apply) APPLY=true; shift ;;
+    --dry-run) DRY=true; shift ;;
+    -h|--help) usage; exit 0 ;;
+    *) echo "Unknown: $1" >&2; usage >&2; exit 2 ;;
+  esac
+done
+
+[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
+
+host="$(get_host_for_vmid "$VMID")"
+unit="besu-validator.service"
+
+if $DRY || ! pguard_require_apply_flag "$APPLY"; then
+  echo "[dry-run] ssh root@${host} pct exec ${VMID} -- systemctl restart ${unit}"
+  echo "[dry-run] Then: cast block-number --rpc-url \${RPC_URL_138:-http://192.168.11.211:8545} (repeat)"
+  exit 0
+fi
+
+pguard_vmid_allowed "$VMID" || exit 1
+
+echo "[apply] VMID ${VMID} on ${host}: systemctl restart ${unit}"
+if ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl restart ${unit}"; then
+  echo "[apply] restart command returned 0"
+else
+  echo "[apply] restart failed (exit $?)" >&2
+  exit 1
+fi
+
+sleep 5
+# Compare the reported state exactly: a substring match on "active" also accepts "inactive".
+state="$(ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl is-active ${unit}" 2>/dev/null || true)"
+if [[ "$state" == "active" ]]; then
+  echo "[apply] ${unit} is active"
+else
+  echo "[apply] WARN: service may not be active yet; check journal on CT ${VMID}" >&2
+fi
diff --git a/scripts/operator/revert-besu-cgroup-caps-r630-01.sh b/scripts/operator/revert-besu-cgroup-caps-r630-01.sh
new file mode 100755
index 00000000..7cd69f5c
--- /dev/null
+++ b/scripts/operator/revert-besu-cgroup-caps-r630-01.sh
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+# Revert Proxmox cgroup CPU caps (cores/cpulimit) applied on r630-01 for Besu validators,
+# core RPC, and sentries. Those caps throttle *allocation*; they do not reduce Besu's real
+# work — use migration / JVM tuning / fewer co-located JVMs to lower *measured* host load.
+#
+# Restores:
+#   1000–1002: cores 4, cpulimit removed (was 2/1)
+#   2101: cores 4, cpulimit removed (was 2/2)
+#   1500–1502: cores 2, cpulimit removed (was 2/1)
+#
+# Usage:
+#   bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh
+#   bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh --apply
+#
+# Requires: PROXMOX_OPS_APPLY=1 or --apply (see scripts/lib/proxmox-production-guard.sh).
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
+
+HOST="${PROXMOX_HOST_R630_01:-192.168.11.11}"
+SSH_OPTS=(-o ConnectTimeout=12 -o BatchMode=yes -o StrictHostKeyChecking=no)
+
+APPLY=false
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --apply) APPLY=true ;;
+    -h|--help) sed -n '2,15p' "$0"; exit 0 ;; # header comment only (file lines 2-15)
+    *) echo "Unknown: $1" >&2; exit 2 ;;
+  esac
+  shift
+done
+
+# Dry-run only: print the pct command that restores CT $1 to 4 cores with no cpulimit.
+revert_validator() {
+  local vmid="$1"
+  echo " pct set $vmid --cores 4 --delete cpulimit"
+}
+
+# Dry-run only: print the pct command that restores CT $1 to 2 cores with no cpulimit.
+revert_sentry() {
+  local vmid="$1"
+  echo " pct set $vmid --cores 2 --delete cpulimit"
+}
+
+if ! pguard_require_apply_flag "$APPLY"; then
+  echo "[dry-run] On root@${HOST}:"
+  for v in 1000 1001 1002; do revert_validator "$v"; done
+  echo " pct set 2101 --cores 4 --delete cpulimit"
+  for v in 1500 1501 1502; do revert_sentry "$v"; done
+  exit 0
+fi
+
+for v in 1000 1001 1002; do
+  pguard_vmid_allowed "$v" || continue
+  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set '$v' --cores 4 --delete cpulimit"
+done
+# 2101 (core RPC) follows the same policy as the loops: skip when not allow-listed, abort if pct set fails.
+if pguard_vmid_allowed "2101"; then
+  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set 2101 --cores 4 --delete cpulimit"
+fi
+for v in 1500 1501 1502; do
+  pguard_vmid_allowed "$v" || continue
+  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set '$v' --cores 2 --delete cpulimit"
+done
+
+echo "--- post (pct + host cgroup sample) ---"
+ssh "${SSH_OPTS[@]}" "root@${HOST}" "for v in 1000 2101 1500; do echo \"=== \$v ===\"; pct config \$v | grep -E '^(cores|cpulimit):' || true; echo -n \" cgroup cpu.max: \"; cat /sys/fs/cgroup/lxc/\$v/cpu.max; echo; done; uptime"
diff --git a/scripts/verify/report-besu-host-cpu-load.sh b/scripts/verify/report-besu-host-cpu-load.sh
new file mode 100755
index 00000000..615488e8
--- /dev/null
+++ b/scripts/verify/report-besu-host-cpu-load.sh
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+# Read-only: Proxmox host load vs logical CPUs + count of Besu-named LXCs per node.
+# Rule of thumb for "under 50% capacity": load average (1 min) / nproc < 0.5.
+#
+# Env: PROXMOX_BESU_LOAD_HOSTS — space-separated host IPs (default: ml110 + r630-01..04)
+#
+# Usage: bash scripts/verify/report-besu-host-cpu-load.sh
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
+# shellcheck source=/dev/null
+source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
+
+H="${PROXMOX_BESU_LOAD_HOSTS:-${PROXMOX_HOST_ML110:-192.168.11.10} ${PROXMOX_HOST_R630_01:-192.168.11.11} ${PROXMOX_HOST_R630_02:-192.168.11.12} ${PROXMOX_HOST_R630_03:-192.168.11.13} ${PROXMOX_HOST_R630_04:-192.168.11.14}}"
+SSH_OPTS=(-o ConnectTimeout=10 -o BatchMode=yes -o StrictHostKeyChecking=no)
+
+echo "Besu-related host load (1m_load / nproc — target < 0.50)"
+echo "-------------------------------------------------------------------"
+
+for ip in $H; do
+  # grep -c already prints 0 when nothing matches (and exits 1); '|| true' only neutralises that status.
+  line=$(ssh "${SSH_OPTS[@]}" "root@${ip}" "printf '%s|%s|%s|%s\n' \"\$(hostname 2>/dev/null)\" \"\$(nproc 2>/dev/null)\" \"\$(cut -f1 -d' ' /proc/loadavg 2>/dev/null)\" \"\$(pct list 2>/dev/null | grep -ci besu || true)\"" 2>/dev/null) || line=""
+  if [[ -z "$line" ]]; then
+    printf "%-16s UNREACHABLE\n" "$ip"
+    continue
+  fi
+  IFS='|' read -r hn np l1 nb <<<"$line"
+  np="${np:-1}"
+  l1="${l1:-0}"
+  ratio=$(awk -v L="$l1" -v N="$np" 'BEGIN { if (N+0>0) printf "%.2f", L/N; else print "?" }')
+  flag="warn"
+  awk -v r="$ratio" 'BEGIN{exit !(r+0<0.5)}' 2>/dev/null && flag="ok" || true
+  printf "%-15s %-28s nproc=%-2s 1m=%-8s ratio=%-5s besu_cts=%-3s %s\n" "$ip" "$hn" "$np" "$l1" "$ratio" "$nb" "$flag"
+done