Files
proxmox/scripts/remediate-proxmox-rpc-stability.sh
defiQUG cb47cce074 Complete markdown files cleanup and organization
- Organized 252 files across project
- Root directory: 187 → 2 files (98.9% reduction)
- Moved configuration guides to docs/04-configuration/
- Moved troubleshooting guides to docs/09-troubleshooting/
- Moved quick start guides to docs/01-getting-started/
- Moved reports to reports/ directory
- Archived temporary files
- Generated comprehensive reports and documentation
- Created maintenance scripts and guides

All files organized according to established standards.
2026-01-06 01:46:25 -08:00

303 lines
9.4 KiB
Bash
Executable File

#!/usr/bin/env bash
# Idempotent remediation for RPC node stability on Proxmox.
#
# What it fixes (optionally):
# 1) Storage node restriction mismatch:
# - Ensures the storage backing RPC VMID rootfs (e.g., local-lvm) is allowed on the node
# where the VMID is running (prevents "storage 'local-lvm' is not available on node ..." failures).
# 2) Besu heap oversizing:
# - Ensures BESU_OPTS (-Xms/-Xmx) in /etc/systemd/system/besu-rpc.service is sized to container memory.
#
# Safety:
# - Default is DRY-RUN (no changes).
# - Use --apply to perform changes.
# - Service restarts are opt-in via --restart-besu.
#
# Usage:
# PROXMOX_HOST=192.168.11.10 ./scripts/remediate-proxmox-rpc-stability.sh
# PROXMOX_HOST=192.168.11.10 ./scripts/remediate-proxmox-rpc-stability.sh --apply --restart-besu
#
# Options:
#   --apply          Apply changes (otherwise dry-run)
#   --restart-besu   Restart besu-rpc inside affected VMIDs (only with --apply)
#   --only-storage   Only apply storage.cfg remediation
#   --only-heap      Only apply heap remediation
#   --vmids "..."    Override VMID list (space-separated)
#   -h, --help       Show this help and exit
# Strict mode: abort on command errors, unset variables and failed pipeline stages.
set -euo pipefail

# Proxmox host to SSH into; a non-empty PROXMOX_HOST from the environment wins.
: "${PROXMOX_HOST:=192.168.11.10}"

# Option flags (0 = off, 1 = on).
APPLY=0 RESTART_BESU=0 ONLY_STORAGE=0 ONLY_HEAP=0

# Default VMIDs to inspect (2400-2402 and 2500-2508); VMIDS is the working list.
VMIDS_DEFAULT=({2400..2402} {2500..2508})
VMIDS=("${VMIDS_DEFAULT[@]}")
#######################################
# Print the script's leading comment block as help text.
# Only the contiguous run of '#' lines at the top of the file is printed
# (the shebang is skipped), so no code ever leaks into the help output.
# Arguments: $1 - optional path of the script to read (defaults to $0)
# Outputs:   header comment lines on stdout with the leading "# " removed
#######################################
usage() {
  local script="${1:-$0}"
  local line
  local first=1
  while IFS= read -r line || [[ -n "$line" ]]; do
    if (( first )); then
      first=0
      # The shebang is not part of the help text.
      if [[ "$line" == '#!'* ]]; then
        continue
      fi
    fi
    # The header ends at the first line that is not a comment.
    [[ "$line" == '#'* ]] || break
    line="${line#\#}"
    printf '%s\n' "${line# }"
  done < "$script"
}
# Write a message to stdout, prefixed with an ISO-8601 timestamp in brackets.
# All arguments are joined with single spaces into one line.
log() {
  printf '[%s] %s\n' "$(date -Is)" "$*"
}
# Report a fatal error on stderr (prefixed with "ERROR: ") and exit with status 1.
die() {
  printf '%s\n' "ERROR: $*" >&2
  exit 1
}
#######################################
# Replace the global VMIDS array with the IDs from a whitespace-separated list.
# Every entry must consist of digits only: the IDs are later interpolated into
# shell commands executed as root on the Proxmox host, so anything else is
# rejected instead of being passed through.
# Globals:   VMIDS (written only on success)
# Arguments: $1 - whitespace-separated VMID list
# Returns:   0 on success; 1 if the list is empty or contains a non-numeric entry
#######################################
set_vmids() {
  local -a parsed=()
  local id
  # read -a splits on whitespace without pathname expansion (an unquoted array
  # assignment would glob). -d '' consumes the whole value including newlines;
  # read then reports EOF, hence the "|| true".
  read -r -d '' -a parsed <<< "$1" || true
  (( ${#parsed[@]} > 0 )) || return 1
  for id in "${parsed[@]}"; do
    [[ "$id" =~ ^[0-9]+$ ]] || return 1
  done
  VMIDS=("${parsed[@]}")
}

# Command-line parsing: each recognised flag sets its global; anything
# unrecognised is fatal.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --apply) APPLY=1; shift ;;
    --restart-besu) RESTART_BESU=1; shift ;;
    --only-storage) ONLY_STORAGE=1; shift ;;
    --only-heap) ONLY_HEAP=1; shift ;;
    --vmids)
      shift
      [[ $# -gt 0 ]] || die "--vmids requires a value"
      set_vmids "$1" || die "--vmids expects a space-separated list of numeric VMIDs (got: '$1')"
      shift
      ;;
    -h|--help) usage; exit 0 ;;
    *) die "Unknown arg: $1 (use --help)" ;;
  esac
done

# The two --only-* switches are mutually exclusive.
if [[ $ONLY_STORAGE -eq 1 && $ONLY_HEAP -eq 1 ]]; then
  die "Choose at most one of --only-storage / --only-heap"
fi
# Restarting services makes no sense in dry-run mode.
if [[ $RESTART_BESU -eq 1 && $APPLY -ne 1 ]]; then
  die "--restart-besu requires --apply"
fi
#######################################
# Run a command on the Proxmox host as root over non-interactive SSH.
# Globals:   PROXMOX_HOST (read)
# Arguments: the remote command, forwarded to ssh unchanged
# NOTE(review): StrictHostKeyChecking=no turns off host-key verification for a
# root session; confirm this is acceptable on the management network.
#######################################
ssh_pve() {
  local -a ssh_opts=(
    -o StrictHostKeyChecking=no
    -o BatchMode=yes
    -o ConnectTimeout=6
  )
  ssh "${ssh_opts[@]}" "root@${PROXMOX_HOST}" "$@"
}
# Ask the Proxmox host for its hostname; this doubles as the SSH connectivity
# check. A failed ssh is tolerated here and caught by the emptiness test below.
remote_node="$(ssh_pve "hostname" 2>/dev/null)" || true
if [[ -z "${remote_node}" ]]; then
  die "Unable to SSH to root@${PROXMOX_HOST}"
fi

# Human-readable label for the run mode shown in the banner.
if [[ $APPLY -eq 1 ]]; then
  mode_label="APPLY"
else
  mode_label="DRY-RUN"
fi

log "Proxmox host: ${PROXMOX_HOST} (node name: ${remote_node})"
log "Mode: ${mode_label}"
log "VMIDs: ${VMIDS[*]}"
echo
#######################################
# Recommend JVM heap sizes for a container of the given memory size.
# Arguments: $1 - container memory in MB
# Outputs:   "<Xms> <Xmx>" on stdout (values suitable for -Xms/-Xmx)
#            non-numeric or missing input yields the fallback "1g 2g"
# Returns:   0
#######################################
recommend_heap() {
  local mem_mb="${1:-}"
  if ! [[ "$mem_mb" =~ ^[0-9]+$ ]]; then
    echo "1g 2g"
    return 0
  fi
  # Force base 10: a leading zero would otherwise make bash read the value as
  # octal ("010000" -> 4096) or fail outright on digits 8/9 ("08192").
  mem_mb=$((10#$mem_mb))
  if (( mem_mb >= 16384 )); then
    echo "8g 8g"
  elif (( mem_mb >= 6144 )); then
    # Everything from 6144 MB up to (but excluding) 16384 MB gets the same sizing.
    echo "2g 4g"
  elif (( mem_mb >= 4096 )); then
    echo "1g 2g"
  else
    echo "512m 1g"
  fi
}
#######################################
# Read one field from a container's `pct config` output on the Proxmox host.
# Usage:     get_vmid_field <vmid> <field>   (e.g. memory, swap, rootfs, hostname)
# Outputs:   the first "<field>: " value on stdout with carriage returns removed;
#            nothing if the field is absent
#######################################
get_vmid_field() {
  local ct_id="$1" key="$2"
  # The whole pipeline runs remotely; only the final value crosses the SSH link.
  local remote_cmd="pct config ${ct_id} 2>/dev/null | sed -n 's/^${key}: //p' | head -1"
  ssh_pve "${remote_cmd}" 2>/dev/null | tr -d '\r'
}
#######################################
# Report a container's state as printed by `pct status` on the Proxmox host.
# Arguments: $1 - VMID
# Outputs:   the text after "status: " on stdout, carriage returns removed;
#            nothing if no status line is returned
#######################################
vmid_status() {
  local ct_id="$1"
  local remote_cmd="pct status ${ct_id} 2>/dev/null | sed -n 's/^status: //p'"
  ssh_pve "${remote_cmd}" 2>/dev/null | tr -d '\r'
}
########################################
# 1) Storage remediation (storage.cfg)
########################################
# For every storage that backs a listed VMID's rootfs, make sure a "nodes"
# restriction in /etc/pve/storage.cfg (if any) includes the node we are
# connected to. In dry-run mode the required changes are only reported.
storage_changes=0

#######################################
# Test whether a node is an exact member of a storage.cfg "nodes" list.
# Arguments: $1 - comma-separated node list (spaces around names are tolerated)
#            $2 - node name to look for
# Returns:   0 if present, 1 otherwise
#######################################
node_in_list() {
  local list="$1" node="$2" entry
  local -a entries=()
  IFS=', ' read -r -a entries <<< "$list" || true
  for entry in "${entries[@]}"; do
    # Quoted right-hand side: literal comparison, so '.' in a node name is not
    # treated as a wildcard.
    if [[ "$entry" == "$node" ]]; then
      return 0
    fi
  done
  return 1
}

if [[ $ONLY_HEAP -ne 1 ]]; then
  log "Storage remediation: scanning VMID rootfs storages vs storage.cfg node allowlist"
  storages_needed=()
  for vmid in "${VMIDS[@]}"; do
    st="$(vmid_status "$vmid" || true)"
    rootfs="$(get_vmid_field "$vmid" "rootfs" || true)"
    [[ -n "${rootfs}" ]] || continue
    # rootfs looks like "<storage>:<volume>,..."; keep the storage ID only.
    storage="${rootfs%%:*}"
    if [[ -n "${storage}" ]]; then
      storages_needed+=("${storage}")
    fi
    log " VMID ${vmid}: status=${st:-?} rootfs=${rootfs}"
  done

  # Unique storages. Guarded because printf with an empty array would still
  # emit one empty line and produce a bogus empty storage name.
  unique_storages=()
  if (( ${#storages_needed[@]} > 0 )); then
    mapfile -t unique_storages < <(printf '%s\n' "${storages_needed[@]}" | sort -u)
  fi

  if [[ ${#unique_storages[@]} -eq 0 ]]; then
    log " No storages detected from VMID rootfs; skipping storage remediation."
  else
    log " Storages referenced by VMID rootfs: ${unique_storages[*]}"
  fi

  for storage in "${unique_storages[@]}"; do
    # Read the "nodes" restriction of this storage, if any. Without a nodes
    # line the storage is available cluster-wide. The storage ID is passed as
    # an argument rather than spliced into the Python source.
    allowed_nodes=$(
      ssh_pve "python3 - ${storage@Q}" 2>/dev/null <<'PY' | tr -d '\r'
import sys
from pathlib import Path

storage = sys.argv[1]
in_section = False
nodes = ''
for line in Path('/etc/pve/storage.cfg').read_text(encoding='utf-8').splitlines():
    # Section headers are unindented "<type>: <id>" lines of any storage type;
    # properties are indented beneath them.
    if line and not line[0].isspace() and line[0] != '#' and ':' in line:
        if in_section:
            break
        in_section = line.split(':', 1)[1].strip() == storage
        continue
    if in_section:
        fields = line.split(None, 1)
        if len(fields) == 2 and fields[0] == 'nodes':
            nodes = fields[1].strip()
            break
print(nodes)
PY
    ) || die "Unable to read /etc/pve/storage.cfg on ${PROXMOX_HOST} (storage '${storage}')"

    if [[ -z "${allowed_nodes}" ]]; then
      log " Storage '${storage}': no nodes restriction found (OK)"
      continue
    fi
    if node_in_list "${allowed_nodes}" "${remote_node}"; then
      log " Storage '${storage}': node '${remote_node}' already allowed (OK)"
      continue
    fi

    storage_changes=$((storage_changes+1))
    log " Storage '${storage}': node '${remote_node}' NOT allowed (nodes=${allowed_nodes})"
    if [[ $APPLY -eq 1 ]]; then
      log " Applying: add '${remote_node}' to storage.cfg for ${storage}"
      # Back up storage.cfg first, then append the node to the matching
      # section's nodes line. The file is written only if something changed.
      ssh_pve "cp -a /etc/pve/storage.cfg /root/storage.cfg.bak.\$(date +%Y%m%d_%H%M%S) && python3 - ${storage@Q} ${remote_node@Q}" <<'PY'
import sys
from pathlib import Path

storage, node = sys.argv[1], sys.argv[2]
cfg = Path('/etc/pve/storage.cfg')
out = []
in_section = False
updated = False
for line in cfg.read_text(encoding='utf-8').splitlines(True):
    if line and not line[0].isspace() and line[0] != '#' and ':' in line:
        # Any unindented "<type>: <id>" line starts a new section.
        in_section = line.split(':', 1)[1].strip() == storage
    elif in_section:
        fields = line.split(None, 1)
        if fields and fields[0] == 'nodes':
            indent = line[: len(line) - len(line.lstrip())]
            current = fields[1] if len(fields) > 1 else ''
            parts = [p.strip() for p in current.split(',') if p.strip()]
            if node not in parts:
                parts.append(node)
                updated = True
                line = indent + 'nodes ' + ','.join(parts) + '\n'
    out.append(line)
if updated:
    cfg.write_text(''.join(out), encoding='utf-8')
print('updated' if updated else 'no_change')
PY
    else
      log " DRY-RUN: would add '${remote_node}' to storage.cfg nodes= for storage '${storage}'"
    fi
  done
  echo
fi
########################################
# 2) Heap remediation (BESU_OPTS)
########################################
# For every running VMID, compare the -Xms/-Xmx values found on the BESU_OPTS
# line of the besu-rpc unit file with the sizes recommend_heap derives from the
# container's memory. With --apply the unit file is backed up and updated;
# besu-rpc is restarted only when --restart-besu was given.
heap_changes=0
if [[ $ONLY_STORAGE -ne 1 ]]; then
  log "Besu heap remediation: scanning BESU_OPTS vs container memory"
  UNIT="/etc/systemd/system/besu-rpc.service"
  for vmid in "${VMIDS[@]}"; do
    st="$(vmid_status "$vmid" || true)"
    mem="$(get_vmid_field "$vmid" "memory" || true)"
    hostn="$(get_vmid_field "$vmid" "hostname" || true)"

    if [[ "${st}" != "running" ]]; then
      log " VMID ${vmid} (${hostn:-?}): status=${st:-?} -> skipping heap check"
      continue
    fi
    # Never size a heap from an unknown memory value: that would silently
    # shrink the JVM to the smallest tier.
    if ! [[ "${mem}" =~ ^[0-9]+$ ]]; then
      log " VMID ${vmid} (${hostn:-?}): container memory unknown -> skipping heap check"
      continue
    fi

    rec="$(recommend_heap "${mem}")"
    xms="${rec%% *}"
    xmx="${rec##* }"

    # First line mentioning BESU_OPTS in the unit file (prefixed "N:" by grep -n).
    current_line="$(ssh_pve "pct exec ${vmid} -- bash -lc \"grep -n 'BESU_OPTS' ${UNIT} 2>/dev/null | head -1\"" 2>/dev/null | tr -d '\r' || true)"
    if [[ -z "${current_line}" ]]; then
      log " VMID ${vmid} (${hostn:-?} mem=${mem}MB): BESU_OPTS line missing -> skipping"
      continue
    fi

    if [[ "${current_line}" == *"-Xms${xms}"* && "${current_line}" == *"-Xmx${xmx}"* ]]; then
      log " VMID ${vmid} (${hostn:-?} mem=${mem}MB): OK (${current_line})"
      continue
    fi

    heap_changes=$((heap_changes+1))
    log " VMID ${vmid} (${hostn:-?} mem=${mem}MB): needs heap update -> -Xms${xms} -Xmx${xmx}"
    log " current: ${current_line}"

    if [[ $APPLY -eq 1 ]]; then
      ts="$(date +%Y%m%d_%H%M%S)"
      log " Applying: update ${UNIT} (backup .bak.${ts})"
      if [[ "${current_line}" =~ -Xms[0-9]+ && "${current_line}" =~ -Xmx[0-9]+ ]]; then
        # Both flags exist: rewrite only those two tokens so any other JVM
        # options sharing BESU_OPTS are preserved.
        sed_exprs="-e '/^Environment=.*BESU_OPTS=/s/-Xms[0-9]+[kKmMgGtT]?/-Xms${xms}/' -e '/^Environment=.*BESU_OPTS=/s/-Xmx[0-9]+[kKmMgGtT]?/-Xmx${xmx}/'"
      else
        # A flag is missing: fall back to replacing the whole BESU_OPTS value.
        sed_exprs="-e 's/^Environment=\\\"BESU_OPTS=.*/Environment=\\\"BESU_OPTS=-Xms${xms} -Xmx${xmx}\\\"/'"
      fi
      ssh_pve "pct exec ${vmid} -- bash -lc \"set -e; cp -a ${UNIT} ${UNIT}.bak.${ts}; sed -i -E ${sed_exprs} ${UNIT}; grep -n 'BESU_OPTS' ${UNIT}\""
      if [[ $RESTART_BESU -eq 1 ]]; then
        log " Restarting besu-rpc"
        ssh_pve "pct exec ${vmid} -- bash -lc \"set -e; systemctl daemon-reload; systemctl restart besu-rpc\""
      else
        log " NOTE: besu-rpc not restarted (use --restart-besu)"
      fi
    else
      log " DRY-RUN: would set BESU_OPTS=-Xms${xms} -Xmx${xmx} and optionally restart"
    fi
  done
  echo
fi
# Final report: number of storages and VMIDs that needed (or received) a change.
log "Done."
log "Planned/applied changes summary:"
log " storage adjustments needed: ${storage_changes}"
log " heap adjustments needed: ${heap_changes}"

# In dry-run mode, remind the operator how to enforce the changes.
if ! (( APPLY )); then
  log "Run again with --apply (and optionally --restart-besu) to enforce changes."
fi