Files
proxmox/scripts/maintenance/daily-weekly-checks.sh
defiQUG bea1903ac9
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Sync all local changes: docs, config, scripts, submodule refs, verification evidence
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 15:46:06 -08:00

242 lines
9.8 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
# Maintenance checks (ALL_IMPROVEMENTS 135-139). Run daily (135-136) or weekly (137-138).
# Explorer: hardened to FAIL when API unreachable; indexer lag check (fail if >500 blocks behind).
# Usage: ./scripts/maintenance/daily-weekly-checks.sh [daily|weekly|all]
# Cron: 0 8 * * * /path/to/daily-weekly-checks.sh daily
# Set EXPLORER_FAIL_WHEN_UNREACHABLE=0 to keep legacy SKIP when explorer unreachable (e.g. off-LAN).
set -euo pipefail

# Locate this script's directory and the project root two levels above it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Optional shared address config; a missing or failing file is tolerated.
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# First positional argument selects the check set; defaults to "daily".
MODE="${1:-daily}"

# Endpoint defaults (each can be overridden from the sourced config or the environment).
IP_RPC_2201="${RPC_2201:-192.168.11.221}"
: "${IP_BLOCKSCOUT:=192.168.11.140}"
: "${BLOCKSCOUT_API_PORT:=4000}"
: "${DBIS_API_URL:=https://dbis-api.d-bis.org}"
PROXMOX_R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"
PROXMOX_R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
PROXMOX_ML110="${PROXMOX_HOST_ML110:-${PROXMOX_ML110:-192.168.11.10}}"

# 1 = an unreachable explorer API counts as a failure; 0 = legacy SKIP (e.g. when off-LAN).
: "${EXPLORER_FAIL_WHEN_UNREACHABLE:=1}"

# Indexer lag limit: fail when the explorer is more than this many blocks behind the RPC head.
# Set 1500 temporarily if the indexer is catching up after a restart (~50 min at 2s/block).
: "${EXPLORER_INDEXER_LAG_THRESHOLD:=500}"

# Metric file for alerting (FAILED count and timestamp).
: "${MAINTENANCE_METRIC_FILE:=$PROJECT_ROOT/logs/maintenance-checks.metric}"

# Run-wide accumulators: number of failed checks and highest storage usage percentage seen.
FAILED=0
STORAGE_MAX_PCT=0
# [136] RPC liveness: POST eth_blockNumber to the node on port 8545 and look for a
# "result" member in the reply. Prints OK or FAIL after the label; FAIL bumps FAILED.
check_rpc() {
  local endpoint="http://${IP_RPC_2201}:8545"
  local payload='{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}'
  printf '%s' "[136] RPC (${IP_RPC_2201}:8545)... "
  if ! curl -sf --max-time 10 -X POST -H "Content-Type: application/json" \
    -d "$payload" "$endpoint" | grep -q '"result"'; then
    echo "FAIL"
    FAILED=$((FAILED + 1))
    return 0
  fi
  echo "OK"
}
# Get RPC chain head block number (decimal). Empty on failure.
# Globals:  IP_RPC_2201 (read) - host queried on port 8545.
# Outputs:  the eth_blockNumber result converted from hex to decimal, or nothing.
# Returns:  always 0, so callers running under `set -e` are not aborted by an
#           unreachable node or an unparseable reply.
get_rpc_block_number() {
  local response hex
  # Whitespace around the colon is accepted so pretty-printed JSON replies parse too;
  # at least one hex digit is required so a bare "0x" never reaches the arithmetic below.
  local result_re='"result"[[:space:]]*:[[:space:]]*"(0x[0-9a-fA-F]+)"'
  response=$(curl -sf --max-time 10 -X POST -H "Content-Type: application/json" \
    -d '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' \
    "http://${IP_RPC_2201}:8545" 2>/dev/null) || return 0
  if [[ "$response" =~ $result_re ]]; then
    hex="${BASH_REMATCH[1]}"
    # Arithmetic expansion understands the 0x prefix and prints the decimal value.
    echo $((hex))
  fi
  return 0
}
# Print the explorer's block figure: total_blocks from /api/v2/stats (quoted or bare
# number), otherwise a "height" value from /api/v2/blocks?page_size=1.
# Prints nothing when neither endpoint yields a number.
get_explorer_block_number() {
  local api="http://${IP_BLOCKSCOUT}:${BLOCKSCOUT_API_PORT}/api/v2"
  local payload height=""

  payload=$(curl -sf --max-time 10 "${api}/stats" 2>/dev/null || true)
  if [[ -n "$payload" ]] && grep -qE '"total_blocks"|"total_transactions"' <<<"$payload"; then
    # Try the string form ("total_blocks":"123") first, then the numeric form.
    height=$(sed -n 's/.*"total_blocks"\s*:\s*"\([0-9]*\)".*/\1/p' <<<"$payload" | head -1)
    if [[ -z "$height" ]]; then
      height=$(sed -n 's/.*"total_blocks"\s*:\s*\([0-9]*\).*/\1/p' <<<"$payload" | head -1)
    fi
    if [[ -n "$height" ]]; then
      echo "$height"
      return
    fi
  fi

  # Stats gave no usable figure: fall back to the block list endpoint.
  payload=$(curl -sf --max-time 10 "${api}/blocks?page_size=1" 2>/dev/null || true)
  if [[ -n "$payload" ]]; then
    sed -n 's/.*"height"\s*:\s*\([0-9]*\).*/\1/p' <<<"$payload" | head -1
  fi
}
# [135] Explorer reachability. Probes, in order:
#   1. LAN /api/v2/stats   (expects total_blocks, total_transactions or indexer in the body)
#   2. LAN legacy /api?module=stats&action=eth_price
#   3. public https://explorer.d-bis.org/api/v2/stats (certificate check disabled via -k)
# The first probe that answers prints OK (or "OK (public)") and ends the check.
# If none answers: FAIL and bump FAILED when EXPLORER_FAIL_WHEN_UNREACHABLE is "1", else SKIP.
check_explorer_sync() {
  local lan_base="http://${IP_BLOCKSCOUT}:${BLOCKSCOUT_API_PORT}"
  printf '%s' "[135] Explorer indexer (${IP_BLOCKSCOUT}:${BLOCKSCOUT_API_PORT})... "

  if curl -sf --max-time 10 "${lan_base}/api/v2/stats" 2>/dev/null \
    | grep -qE '"total_blocks"|"total_transactions"|"indexer"'; then
    echo "OK"
    return
  fi
  if curl -sf --max-time 10 "${lan_base}/api?module=stats&action=eth_price" 2>/dev/null \
    | grep -qE '"result"|"eth_price"'; then
    echo "OK"
    return
  fi

  # Try public URL (in case we're off-LAN and only the NPMplus path works)
  if curl -sf --max-time 10 -k "https://explorer.d-bis.org/api/v2/stats" 2>/dev/null \
    | grep -qE '"total_blocks"|"total_transactions"'; then
    echo "OK (public)"
    return
  fi

  if [[ "${EXPLORER_FAIL_WHEN_UNREACHABLE}" != "1" ]]; then
    echo "SKIP (Blockscout unreachable; run from LAN or set EXPLORER_FAIL_WHEN_UNREACHABLE=1)"
    return
  fi
  echo "FAIL (Blockscout unreachable)"
  FAILED=$((FAILED + 1))
}
# [135b] Indexer lag: fail if explorer block is more than EXPLORER_INDEXER_LAG_THRESHOLD behind RPC head.
# Globals:  EXPLORER_INDEXER_LAG_THRESHOLD (read; 500 if unset), FAILED (incremented on FAIL).
# Outputs:  one status line: OK / FAIL / SKIP.
#   FAIL - threshold is not a non-negative integer, or lag exceeds the threshold.
#   SKIP - either block number is missing or not a plain decimal number.
#   OK   - explorer is at or ahead of the RPC head, or lag is within the threshold.
check_explorer_indexer_lag() {
  echo -n "[135b] Explorer indexer lag (RPC vs Blockscout)... "
  local rpc_block explorer_block lag
  local threshold="${EXPLORER_INDEXER_LAG_THRESHOLD:-500}"
  local decimal_re='^[0-9]+$'

  # A malformed threshold must not quietly turn into "OK": a failed integer comparison
  # would otherwise be indistinguishable from "lag is within limits".
  if ! [[ "$threshold" =~ $decimal_re ]]; then
    echo "FAIL (invalid EXPLORER_INDEXER_LAG_THRESHOLD '${threshold}'; expected a non-negative integer)"
    FAILED=$((FAILED + 1))
    return 0
  fi

  rpc_block=$(get_rpc_block_number) || true
  explorer_block=$(get_explorer_block_number) || true
  if [ -z "$rpc_block" ] || [ -z "$explorer_block" ]; then
    echo "SKIP (RPC or Blockscout unreachable)"
    return 0
  fi
  if ! [[ "$rpc_block" =~ $decimal_re && "$explorer_block" =~ $decimal_re ]]; then
    echo "SKIP (non-numeric block number from RPC or Blockscout)"
    return 0
  fi

  # 10# forces base 10 so a value with leading zeros is never read as octal.
  lag=$((10#$rpc_block - 10#$explorer_block))
  if [ "$lag" -le 0 ]; then
    echo "OK (explorer caught up)"
  elif [ "$lag" -gt "$((10#$threshold))" ]; then
    echo "FAIL (lag ${lag} > ${threshold})"
    FAILED=$((FAILED + 1))
  else
    echo "OK (lag ${lag})"
  fi
  return 0
}
# [137] Config API: GET ${DBIS_API_URL}/health and look for 200 in the reported HTTP code.
# Informational only: a miss prints "SKIP or FAIL" and does not touch FAILED.
check_config_api() {
  local health_url="${DBIS_API_URL}/health"
  printf '%s' "[137] Config API (${DBIS_API_URL})... "
  if ! curl -sf --max-time 10 -o /dev/null -w "%{http_code}" "$health_url" 2>/dev/null | grep -q 200; then
    echo "SKIP or FAIL (external URL; may be unreachable off-LAN)"
    return 0
  fi
  echo "OK"
}
# [138a] Weekly: thin pool / storage usage on one host. WARN at >=85%, FAIL at >=95%.
# Usage: check_thin_pool_one_host <ip> <label>
# Globals:  FAILED (incremented on FAIL),
#           STORAGE_MAX_PCT (raised to the highest integer percentage seen so far).
# Outputs:  one status line: OK / WARN / FAIL / SKIP.
check_thin_pool_one_host() {
  local ip="$1" label="$2"
  echo -n "[138a] Storage $label ($ip)... "
  local out pct pct_int
  # Best effort: an SSH or remote-command failure leaves $out empty and leads to SKIP.
  out=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$ip" "pvesm status 2>/dev/null; lvs --noheadings -o lv_name,data_percent 2>/dev/null | awk '\$2!=\"\"'" 2>/dev/null || true)
  if [ -z "$out" ]; then
    echo "SKIP (SSH failed or no storage)"
    return 0
  fi
  # Highest usage figure, read only from fields that are percentages:
  #   - a last column of the form "12.34%" (the pvesm status part of the output)
  #   - the second field of a two-field "<lv_name> <data_percent>" line (the lvs part)
  # Sizes, VM ids and digits inside LV names (e.g. vm-100-disk-0) are never treated as usage.
  pct=$(printf '%s\n' "$out" | awk '
    {
      v = ""
      if ($NF ~ /^[0-9]+(\.[0-9]+)?%$/) {
        v = substr($NF, 1, length($NF) - 1)
      } else if (NF == 2 && $2 ~ /^[0-9]+(\.[0-9]+)?$/) {
        v = $2
      }
      if (v != "" && (!seen || v + 0 > max)) { max = v + 0; seen = 1 }
    }
    END { if (seen) printf "%.2f\n", max }
  ')
  if [ -z "$pct" ]; then
    echo "SKIP (no usage percentage found in output)"
    return 0
  fi
  # Integer part drives the thresholds; the two-decimal value is shown in messages.
  pct_int="${pct%%.*}"
  if [ "$pct_int" -gt "${STORAGE_MAX_PCT:-0}" ] 2>/dev/null; then
    STORAGE_MAX_PCT=$pct_int
  fi
  if [ "$pct_int" -ge 100 ]; then
    echo "FAIL (storage 100% full)"
    FAILED=$((FAILED + 1))
  elif [ "$pct_int" -ge 95 ]; then
    echo "FAIL (usage ${pct}% >= 95%)"
    FAILED=$((FAILED + 1))
  elif [ "$pct_int" -ge 85 ]; then
    echo "WARN (usage ${pct}% >= 85%)"
  else
    echo "OK"
  fi
  return 0
}
# [138a] Weekly: run the per-host storage check on every Proxmox host, in the order
# r630-02, r630-01, ml110. (The function name is historical; it covers all three.)
check_thin_pool_r630_02() {
  local -a addrs=("${PROXMOX_R630_02}" "${PROXMOX_R630_01}" "${PROXMOX_ML110}")
  local -a names=("r630-02" "r630-01" "ml110")
  local i
  for i in 0 1 2; do
    check_thin_pool_one_host "${addrs[i]}" "${names[i]}"
  done
}
# Publish the run result for alerting: two lines (failed-check count, epoch timestamp)
# written to MAINTENANCE_METRIC_FILE. Does nothing when that variable is empty.
# The content is staged in "<file>.<pid>" and then moved over the target.
write_metric_file() {
  if [[ -z "${MAINTENANCE_METRIC_FILE}" ]]; then
    return
  fi
  local staging="${MAINTENANCE_METRIC_FILE}.$$"
  mkdir -p "$(dirname "$MAINTENANCE_METRIC_FILE")"
  {
    echo "maintenance_checks_failed ${FAILED}"
    echo "maintenance_checks_timestamp $(date +%s)"
  } > "$staging"
  mv "$staging" "${MAINTENANCE_METRIC_FILE}"
}
# [A7] Storage metric for external alerting: two lines (highest storage usage
# percentage recorded in STORAGE_MAX_PCT, epoch timestamp) written to STORAGE_METRIC_FILE.
# Default location is under the project's logs/storage-growth directory.
: "${STORAGE_METRIC_FILE:=$PROJECT_ROOT/logs/storage-growth/last_run.metric}"
write_storage_metric_file() {
  if [[ -z "${STORAGE_MAX_PCT}" ]]; then
    return
  fi
  # Stage in "<file>.<pid>" and move over the target afterwards.
  local staging="${STORAGE_METRIC_FILE}.$$"
  mkdir -p "$(dirname "$STORAGE_METRIC_FILE")"
  {
    echo "storage_max_pct ${STORAGE_MAX_PCT}"
    echo "storage_metric_timestamp $(date +%s)"
  } > "$staging"
  mv "$staging" "$STORAGE_METRIC_FILE"
}
# ---- Entry point: run the checks selected by MODE, publish metrics, set the exit status. ----
echo "=== Maintenance checks ($MODE) $(date -Iseconds) ==="
CHECK_DISK_SCRIPT="${PROJECT_ROOT}/scripts/maintenance/check-disk-all-vmids.sh"

# Run an optional helper script, echoing its combined stdout/stderr with a one-space indent.
# Usage: run_optional_script <heading> <script-path>
# A script that is missing or not executable is skipped without output.
# A non-zero pipeline status is recorded in FAILED instead of being left to `set -e`:
# with `pipefail` active, an unguarded failing helper would end the run right here,
# before the metric files and the "Done" summary are written.
run_optional_script() {
  local heading="$1" script="$2" rc=0 line
  [ -x "$script" ] || return 0
  echo "$heading"
  bash "$script" 2>&1 | while IFS= read -r line; do echo " $line"; done || rc=$?
  if [ "$rc" -ne 0 ]; then
    echo " FAIL (${script} exited with status ${rc})"
    FAILED=$((FAILED + 1))
  fi
  return 0
}

case "$MODE" in
  daily)
    check_explorer_sync
    check_explorer_indexer_lag
    check_rpc
    # [A5] In-CT disk check (WARN 85%, CRIT 95% on root /)
    run_optional_script "[138b] In-CT disk (root /)..." "$CHECK_DISK_SCRIPT"
    ;;
  weekly)
    check_config_api
    check_thin_pool_r630_02
    # [A8] Weekly fstrim in running CTs (reclaim thin pool space)
    FSTRIM_SCRIPT="${PROJECT_ROOT}/scripts/maintenance/fstrim-all-running-ct.sh"
    run_optional_script "[138c] fstrim running CTs..." "$FSTRIM_SCRIPT"
    # [A10] Journal vacuum (keep last 7d) in key CTs
    JOURNAL_SCRIPT="${PROJECT_ROOT}/scripts/maintenance/journal-vacuum-key-ct.sh"
    run_optional_script "[138d] journal vacuum key CTs..." "$JOURNAL_SCRIPT"
    echo "[138] Review explorer logs: pct exec 5000 -- journalctl -u blockscout -n 200 --no-pager (from root@${PROXMOX_R630_02})"
    ;;
  all)
    check_explorer_sync
    check_explorer_indexer_lag
    check_rpc
    check_config_api
    check_thin_pool_r630_02
    echo "[138] Review explorer logs manually; [139] update token list as needed (token-list.json / explorer config)."
    ;;
  *)
    echo "Usage: $0 [daily|weekly|all]"
    exit 1
    ;;
esac

write_metric_file
# The storage metric is only meaningful for modes that ran the storage checks.
if [ "$MODE" = "weekly" ] || [ "$MODE" = "all" ]; then
  write_storage_metric_file
fi
echo "=== Done (failed: $FAILED) ==="
# Exit status of the script: 0 only when no check failed.
[[ $FAILED -eq 0 ]]