Add surgical Besu validator operator helpers

This commit is contained in:
defiQUG
2026-04-13 21:41:35 -07:00
parent b7eebb87b3
commit ee1625a79b
5 changed files with 249 additions and 13 deletions

View File

@@ -5,13 +5,15 @@
# the rest stay at head so the restarted node syncs quickly and consensus can continue.
#
# Usage: ./scripts/maintenance/fix-block-production-staggered-restart.sh [--dry-run]
# Requires: SSH to Proxmox hosts (192.168.11.10 ML110, 192.168.11.11 R630-01, 192.168.11.12 R630-02)
# Requires: SSH to Proxmox hosts; VMID→host from scripts/lib/load-project-env.sh get_host_for_vmid
# (live: 1000-1002 on r630-01, 1003-1004 on r630-03 — not ML110).
# Abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, then the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Optional address overrides: a missing or failing file is ignored on purpose.
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# Required helper library; the header comment names it as the home of get_host_for_vmid.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
# Dry-run is enabled only when the FIRST argument is exactly --dry-run.
DRY_RUN=false
[[ "${1:-}" == "--dry-run" ]] && DRY_RUN=true
@@ -25,15 +27,19 @@ log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
# Print a success line: GREEN "[✓]" tag, colour reset, then the message ($1).
# echo -e is kept so escape sequences stored in GREEN/NC are interpreted.
log_ok() {
  local msg="$1"
  echo -e "${GREEN}[✓]${NC} ${msg}"
}
# Print a warning line: YELLOW "[WARN]" tag, colour reset, then the message ($1).
# echo -e is kept so escape sequences stored in YELLOW/NC are interpreted.
log_warn() {
  local msg="$1"
  echo -e "${YELLOW}[WARN]${NC} ${msg}"
}
# Order: restart one at a time; wait between so restarted node can sync from others
# VMID : host
# NOTE(review): this static list is discarded as soon as build_validators runs
# (it resets VALIDATORS and refills it from RESTART_ORDER), and it still maps
# 1003/1004 to the ML110 address. Presumably a leftover — confirm and remove.
VALIDATORS=(
"1004:${PROXMOX_HOST_ML110:-192.168.11.10}"
"1003:${PROXMOX_HOST_ML110:-192.168.11.10}"
"1002:${PROXMOX_HOST_R630_01:-192.168.11.11}"
"1001:${PROXMOX_HOST_R630_01:-192.168.11.11}"
"1000:${PROXMOX_HOST_R630_01:-192.168.11.11}"
)
# Order: off r630-03 first, then r630-01 (runbook: spread restarts; last is 1000 on .11)
# VMIDs in the exact order they are restarted; build_validators resolves each to a host.
RESTART_ORDER=(1004 1003 1002 1001 1000)
#######################################
# Rebuild VALIDATORS as "vmid:host" entries, in RESTART_ORDER order.
# Globals:   RESTART_ORDER (read), VALIDATORS (written)
# Arguments: none
# Outputs:   an error line on stderr when a VMID cannot be mapped to a host
# Returns:   0 on success; non-zero when get_host_for_vmid fails or yields an
#            empty host. Failing here, before any restart, avoids a later
#            "ssh root@" against an empty host halfway through the sequence.
#######################################
build_validators() {
  VALIDATORS=()
  local vmid host
  for vmid in "${RESTART_ORDER[@]}"; do
    # Propagate a lookup failure explicitly instead of relying on set -e.
    host="$(get_host_for_vmid "$vmid")" || return
    if [[ -z "$host" ]]; then
      echo "build_validators: no Proxmox host resolved for VMID ${vmid}" >&2
      return 1
    fi
    VALIDATORS+=("${vmid}:${host}")
  done
}
# Fill VALIDATORS before the restart loop iterates over it.
build_validators
# Pause between consecutive restarts — presumably seconds, consumed by the
# restart loop outside this excerpt; TODO confirm.
WAIT_BETWEEN=90
# JSON-RPC endpoint; RPC_URL_138 overrides the default address.
RPC="${RPC_URL_138:-http://192.168.11.211:8545}"
@@ -55,10 +61,10 @@ for entry in "${VALIDATORS[@]}"; do
IFS=: read -r vmid host <<< "$entry"
log_info "Restarting validator $vmid on $host..."
if $DRY_RUN; then
echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator'"
echo " Would: ssh root@$host 'pct exec $vmid -- systemctl restart besu-validator.service'"
else
# Allow up to 120s for restart (Besu stop/start can take 1-2 min)
if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator" 2>/dev/null; then
if timeout 120 ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$host" "pct exec $vmid -- systemctl restart besu-validator.service" 2>/dev/null; then
log_ok " $vmid restarted"
else
log_warn " $vmid restart timed out or failed (node may still be restarting)"

View File

@@ -0,0 +1,62 @@
#!/usr/bin/env bash
# Offload one Besu validator LXC from r630-01 to r630-04 to reduce *real* CPU contention on the
# source host (same JVM work, fewer co-scheduled Besu processes per NUMA/socket).
#
# Default VMID 1001 (historically problematic when co-packed with 1000/1002); override with --vmid.
# Does not change Besu *allocation* inside the guest — it spreads physical load across nodes.
#
# Usage:
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --apply
# bash scripts/operator/migrate-besu-validator-r630-01-to-r630-04.sh --vmid 1002 --apply
#
# Requires: cluster membership, r630-04 online, storage (default local-lvm on target).
# Optional: PROXMOX_OPS_ALLOWED_VMIDS, PROXMOX_OPS_APPLY (see proxmox-production-guard.sh).
# Abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, then the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Optional address overrides: a missing or failing file is ignored on purpose.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# Required: expected to define the pguard_* helpers called further down.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
# Source host that receives the ssh command (address of r630-01 unless overridden).
SRC="${PROXMOX_HOST_R630_01:-192.168.11.11}"
# Destination node name and storage passed to `pct migrate`; both overridable via env.
DST_NODE="${BESU_VALIDATOR_MIGRATE_TARGET_NODE:-r630-04}"
STORE="${BESU_VALIDATOR_MIGRATE_TARGET_STORAGE:-local-lvm}"
# Container to move; the env default can be overridden again by --vmid.
VMID="${BESU_VALIDATOR_MIGRATE_VMID:-1001}"
# BatchMode=yes: never prompt for a password; fail instead.
SSH_OPTS=(-o ConnectTimeout=20 -o BatchMode=yes -o StrictHostKeyChecking=no)
# Set to true by --apply.
APPLY=false
# Print the leading comment header (purpose + usage) of a script and stop at the
# first non-comment line, so help output never spills into code the way a fixed
# line range does once the header changes length.
# Arguments: $1 - script path (defaults to this script)
print_help() {
  awk 'NR == 1 && /^#!/ { next } !/^#/ { exit } { print }' "${1:-$0}"
}

# Parse command-line flags.
# Globals:   APPLY (written by --apply), VMID (written by --vmid <id>)
# Arguments: the script's positional parameters
# Exits:     0 after -h/--help; 2 on an unknown flag or when --vmid has no value
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --apply)
        APPLY=true
        ;;
      --vmid)
        # Say why we stop instead of exiting silently.
        if [[ $# -lt 2 ]]; then
          echo "--vmid requires a value" >&2
          exit 2
        fi
        VMID="$2"
        shift
        ;;
      -h|--help)
        print_help
        exit 0
        ;;
      *)
        echo "Unknown: $1" >&2
        exit 2
        ;;
    esac
    shift
  done
}
parse_args "$@"
# VMID comes from the environment or --vmid and is pasted into a command line
# that runs as root on the source node, so accept a plain numeric CT id only.
[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
# PVE 9+: --target-storage (see migrate-ml110-besu-rpc-to-r630-02-03.sh). Older clusters may use --storage.
CMD="pct migrate ${VMID} ${DST_NODE} --target-storage ${STORE} --restart 1"
# Without the apply gate (see proxmox-production-guard.sh) only print what would run.
if ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] From source node (run as root on node that currently owns the CT):"
  echo " ssh root@${SRC} \"$CMD\""
  echo ""
  echo "After migrate, update get_host_for_vmid in scripts/lib/load-project-env.sh for VMID ${VMID}."
  exit 0
fi
# Allowlist check happens before anything is changed.
pguard_vmid_allowed "$VMID" || exit 1
echo "[apply] ssh root@${SRC} \"$CMD\""
# Under set -e a failed migrate stops here, so "Done" is printed only on success.
ssh "${SSH_OPTS[@]}" "root@${SRC}" "$CMD"
echo "[apply] Done. Update scripts/lib/load-project-env.sh get_host_for_vmid for ${VMID} -> ${DST_NODE} IP (${PROXMOX_HOST_R630_04:-192.168.11.14})."

View File

@@ -0,0 +1,65 @@
#!/usr/bin/env bash
# Surgical: restart exactly one Besu validator LXC service (default VMID 1001 — stuck participant).
# Resolves PVE host via get_host_for_vmid (scripts/lib/load-project-env.sh). No other CTs touched.
#
# Usage:
# bash scripts/operator/restart-besu-validator-single.sh --dry-run
# PROXMOX_OPS_APPLY=1 PROXMOX_OPS_ALLOWED_VMIDS=1001 bash scripts/operator/restart-besu-validator-single.sh --vmid 1001 --apply
#
# Requires: LAN SSH to Proxmox. Mutations require --apply or PROXMOX_OPS_APPLY=1 and (if set) allowlist.
# Abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, then the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Required: per the header comment this library provides get_host_for_vmid.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/load-project-env.sh"
# Required: expected to define the pguard_* helpers called further down.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
# Container whose service is restarted; the env default can be overridden by --vmid.
VMID="${BESU_SURGICAL_RESTART_VMID:-1001}"
# APPLY is set by --apply, DRY by --dry-run; DRY wins when both are given
# (the dry-run branch below is checked first).
APPLY=false
DRY=false
# BatchMode=yes: never prompt for a password; fail instead.
SSH_OPTS=(-o ConnectTimeout=15 -o BatchMode=yes -o StrictHostKeyChecking=no)
#######################################
# Print the script's leading comment header (purpose, usage, requirements).
# Output stops at the first non-comment line, so help text never includes code;
# the previous fixed range (lines 1-18) ran past the header into the script body.
# Arguments: $1 - script path to read (optional, defaults to this script)
# Outputs:   header comment lines on stdout, without the shebang
#######################################
usage() {
  local script="${1:-$0}"
  awk 'NR == 1 && /^#!/ { next } !/^#/ { exit } { print }' "$script"
}
# Parse command-line flags.
# Globals:   VMID (written by --vmid <id>), APPLY (--apply), DRY (--dry-run)
# Arguments: the script's positional parameters
# Exits:     0 after -h/--help; 2 on an unknown flag or when --vmid has no value
#            (previously a bare "--vmid" died with an "unbound variable" error).
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --vmid)
        if [[ $# -lt 2 ]]; then
          echo "--vmid requires a value" >&2
          usage >&2
          exit 2
        fi
        VMID="$2"
        shift 2
        ;;
      --apply)
        APPLY=true
        shift
        ;;
      --dry-run)
        DRY=true
        shift
        ;;
      -h|--help)
        usage
        exit 0
        ;;
      *)
        echo "Unknown: $1" >&2
        usage >&2
        exit 2
        ;;
    esac
  done
}
parse_args "$@"
# VMID is pasted into a remote root command line: accept a plain numeric CT id only.
[[ "$VMID" =~ ^[0-9]+$ ]] || { echo "Bad vmid: $VMID" >&2; exit 2; }
# Resolve the Proxmox host that owns the CT; refuse to continue without one.
host="$(get_host_for_vmid "$VMID")"
[[ -n "$host" ]] || { echo "No Proxmox host resolved for VMID ${VMID}" >&2; exit 1; }
unit="besu-validator.service"
# --dry-run, or a closed apply gate, only prints what would be executed.
if $DRY || ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] ssh root@${host} pct exec ${VMID} -- systemctl restart ${unit}"
  echo "[dry-run] Then: cast block-number --rpc-url \${RPC_URL_138:-http://192.168.11.211:8545} (repeat)"
  exit 0
fi
# Allowlist check happens before anything is changed.
pguard_vmid_allowed "$VMID" || exit 1
echo "[apply] VMID ${VMID} on ${host}: systemctl restart ${unit}"
if ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl restart ${unit}"; then
  echo "[apply] restart command returned 0"
else
  rc=$?
  echo "[apply] restart failed (exit ${rc})" >&2
  exit 1
fi
# Give the unit a moment before asking for its state.
sleep 5
# Compare the reported state exactly: a substring match on "active" would also
# accept "inactive", "activating" and "deactivating". is-active exits non-zero
# for those states, hence the "|| true" to keep the printed state under set -e.
state="$(ssh "${SSH_OPTS[@]}" "root@${host}" "pct exec ${VMID} -- systemctl is-active ${unit}" 2>/dev/null || true)"
if [[ "$state" == "active" ]]; then
  echo "[apply] ${unit} is active"
else
  echo "[apply] WARN: service may not be active yet; check journal on CT ${VMID}" >&2
fi

View File

@@ -0,0 +1,68 @@
#!/usr/bin/env bash
# Revert Proxmox cgroup CPU caps (cores/cpulimit) applied on r630-01 for Besu validators,
# core RPC, and sentries. Those caps throttle *allocation*; they do not reduce Besu's real
# work — use migration / JVM tuning / fewer co-located JVMs to lower *measured* host load.
#
# Restores:
# 1000-1002: cores 4, cpulimit removed (was 2/1)
# 2101: cores 4, cpulimit removed (was 2/2)
# 1500-1502: cores 2, cpulimit removed (was 2/1)
#
# Usage:
# bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh
# bash scripts/operator/revert-besu-cgroup-caps-r630-01.sh --apply
#
# Requires: PROXMOX_OPS_APPLY=1 or --apply (see scripts/lib/proxmox-production-guard.sh).
# Abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, then the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Optional address overrides: a missing or failing file is ignored on purpose.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# Required: expected to define the pguard_* helpers called further down.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/scripts/lib/proxmox-production-guard.sh"
# Every `pct set` below is sent to this single host (r630-01 unless overridden).
HOST="${PROXMOX_HOST_R630_01:-192.168.11.11}"
# BatchMode=yes: never prompt for a password; fail instead.
SSH_OPTS=(-o ConnectTimeout=12 -o BatchMode=yes -o StrictHostKeyChecking=no)
# Set to true by --apply.
APPLY=false
# Print the leading comment header (purpose + usage) of a script and stop at the
# first non-comment line, so help output never spills into code the way a fixed
# line range does once the header changes length.
# Arguments: $1 - script path (defaults to this script)
print_help() {
  awk 'NR == 1 && /^#!/ { next } !/^#/ { exit } { print }' "${1:-$0}"
}

# Parse command-line flags.
# Globals:   APPLY (written by --apply)
# Arguments: the script's positional parameters
# Exits:     0 after -h/--help; 2 on an unknown flag
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --apply)
        APPLY=true
        ;;
      -h|--help)
        print_help
        exit 0
        ;;
      *)
        echo "Unknown: $1" >&2
        exit 2
        ;;
    esac
    shift
  done
}
parse_args "$@"
# Print (never execute) the `pct set` line that gives a CT 4 cores and drops its cpulimit.
# Arguments: $1 - container VMID
revert_validator() {
  local ct_id="$1"
  printf ' pct set %s --cores 4 --delete cpulimit\n' "$ct_id"
}
# Print (never execute) the `pct set` line that gives a CT 2 cores and drops its cpulimit.
# Arguments: $1 - container VMID
revert_sentry() {
  local ct_id="$1"
  printf ' pct set %s --cores 2 --delete cpulimit\n' "$ct_id"
}
# Without the apply gate (see proxmox-production-guard.sh) only list the commands.
if ! pguard_require_apply_flag "$APPLY"; then
  echo "[dry-run] On root@${HOST}:"
  for v in 1000 1001 1002; do revert_validator "$v"; done
  echo " pct set 2101 --cores 4 --delete cpulimit"
  for v in 1500 1501 1502; do revert_sentry "$v"; done
  exit 0
fi

# Restore the core count of one CT and remove its cpulimit.
# CTs rejected by the allowlist are skipped; an ssh/pct failure aborts the run
# (set -e) for every CT alike. Previously the 2101 call used "A && B || true",
# which silently swallowed a failed `pct set` for that one CT only.
# Arguments: $1 - VMID, $2 - core count to restore
apply_revert() {
  local vmid="$1" cores="$2"
  pguard_vmid_allowed "$vmid" || return 0
  ssh "${SSH_OPTS[@]}" "root@${HOST}" "pct set '${vmid}' --cores ${cores} --delete cpulimit"
}

# Same order as before: validators, core RPC (2101), then sentries.
for v in 1000 1001 1002; do
  apply_revert "$v" 4
done
apply_revert 2101 4
for v in 1500 1501 1502; do
  apply_revert "$v" 2
done

# Spot-check one CT of each group: configured cores/cpulimit plus the host-side cgroup cap.
echo "--- post (pct + host cgroup sample) ---"
ssh "${SSH_OPTS[@]}" "root@${HOST}" "for v in 1000 2101 1500; do echo \"=== \$v ===\"; pct config \$v | grep -E '^(cores|cpulimit):' || true; echo -n \" cgroup cpu.max: \"; cat /sys/fs/cgroup/lxc/\$v/cpu.max; echo; done; uptime"

View File

@@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Read-only: Proxmox host load vs logical CPUs + count of Besu-named LXCs per node.
# Rule of thumb for "under 50% capacity": load average (1 min) / nproc < 0.5.
#
# Env: PROXMOX_BESU_LOAD_HOSTS — space-separated host IPs (default: ml110 + r630-01..04)
#
# Usage: bash scripts/verify/report-besu-host-cpu-load.sh
# Abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail
# Absolute directory of this script, then the repository root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
# Optional address overrides: a missing or failing file is ignored on purpose.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true
# Space-separated host list: PROXMOX_BESU_LOAD_HOSTS replaces the whole default,
# otherwise each default address can be overridden individually.
H="${PROXMOX_BESU_LOAD_HOSTS:-${PROXMOX_HOST_ML110:-192.168.11.10} ${PROXMOX_HOST_R630_01:-192.168.11.11} ${PROXMOX_HOST_R630_02:-192.168.11.12} ${PROXMOX_HOST_R630_03:-192.168.11.13} ${PROXMOX_HOST_R630_04:-192.168.11.14}}"
# BatchMode=yes: never prompt for a password; an unreachable host just fails.
SSH_OPTS=(-o ConnectTimeout=10 -o BatchMode=yes -o StrictHostKeyChecking=no)
# Report header.
echo "Besu-related host load (1m_load / nproc — target < 0.50)"
echo "-------------------------------------------------------------------"
# Print load/cpus with two decimals, or "?" when the CPU count is not positive.
# LC_ALL=C keeps the decimal separator a dot regardless of the caller's locale.
# Arguments: $1 - 1-minute load average, $2 - logical CPU count
besu_load_ratio() {
  LC_ALL=C awk -v L="$1" -v N="$2" 'BEGIN { if (N+0>0) printf "%.2f", L/N; else print "?" }'
}

# Print "ok" when the ratio is a number below 0.5, otherwise "warn".
# A non-numeric ratio such as "?" is treated as "warn": awk would coerce it to 0
# and report an unknown host as healthy.
# Arguments: $1 - ratio as printed by besu_load_ratio
besu_load_flag() {
  local ratio="$1"
  if [[ "$ratio" =~ ^[0-9]+(\.[0-9]+)?$ ]] \
    && LC_ALL=C awk -v r="$ratio" 'BEGIN { exit !(r+0 < 0.5) }'; then
    printf 'ok\n'
  else
    printf 'warn\n'
  fi
}

# $H is deliberately unquoted: it is a space-separated list of host addresses.
for ip in $H; do
  # One round trip per host: hostname|nproc|1-minute load|rows of `pct list` mentioning "besu".
  # grep -c already prints 0 when nothing matches (and exits 1), so only its exit
  # status is neutralised; "|| echo 0" used to emit a second "0" on its own line.
  line=$(ssh "${SSH_OPTS[@]}" "root@${ip}" "printf '%s|%s|%s|%s\n' \"\$(hostname 2>/dev/null)\" \"\$(nproc 2>/dev/null)\" \"\$(cut -f1 -d' ' /proc/loadavg 2>/dev/null)\" \"\$(pct list 2>/dev/null | grep -ci besu || true)\"" 2>/dev/null) || line=""
  if [[ -z "$line" ]]; then
    printf "%-16s UNREACHABLE\n" "$ip"
    continue
  fi
  IFS='|' read -r hn np l1 nb <<<"$line"
  # Fall back to neutral values when a remote probe printed nothing.
  np="${np:-1}"
  l1="${l1:-0}"
  nb="${nb:-0}"
  ratio="$(besu_load_ratio "$l1" "$np")"
  flag="$(besu_load_flag "$ratio")"
  printf "%-15s %-28s nproc=%-2s 1m=%-8s ratio=%-5s besu_cts=%-3s %s\n" "$ip" "$hn" "$np" "$l1" "$ratio" "$nb" "$flag"
done