#!/usr/bin/env bash
# Maintenance checks (ALL_IMPROVEMENTS 135–139). Run daily (135–136) or weekly (137–138).
# Explorer: hardened to FAIL when API unreachable; indexer lag check (fail if >500 blocks behind).
# Usage: ./scripts/maintenance/daily-weekly-checks.sh [daily|weekly|all]
# Cron: 0 8 * * * /path/to/daily-weekly-checks.sh daily
# Set EXPLORER_FAIL_WHEN_UNREACHABLE=0 to keep legacy SKIP when explorer unreachable (e.g. off-LAN).

set -euo pipefail

# Resolve the repository root relative to this script (two directories up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Optional site overrides. A missing file is tolerated because every value
# below falls back to a built-in default.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Run mode: daily (default), weekly or all.
MODE="${1:-daily}"

# Defaults (override via config or env)
IP_RPC_2201="${RPC_2201:-192.168.11.221}"
IP_BLOCKSCOUT="${IP_BLOCKSCOUT:-192.168.11.140}"
BLOCKSCOUT_API_PORT="${BLOCKSCOUT_API_PORT:-4000}"
DBIS_API_URL="${DBIS_API_URL:-https://dbis-api.d-bis.org}"
PROXMOX_R630_02="${PROXMOX_HOST_R630_02:-192.168.11.12}"
PROXMOX_R630_01="${PROXMOX_HOST_R630_01:-192.168.11.11}"
PROXMOX_ML110="${PROXMOX_HOST_ML110:-${PROXMOX_ML110:-192.168.11.10}}"

# Fail daily run when explorer API unreachable (set 0 to preserve legacy SKIP when off-LAN)
EXPLORER_FAIL_WHEN_UNREACHABLE="${EXPLORER_FAIL_WHEN_UNREACHABLE:-1}"

# Indexer lag: fail if explorer block is more than this many blocks behind RPC head.
# Set 1500 temporarily if indexer is catching up after restart (~50 min at 2s/block).
EXPLORER_INDEXER_LAG_THRESHOLD="${EXPLORER_INDEXER_LAG_THRESHOLD:-500}"

# Optional: write metric file for alerting (FAILED count and timestamp)
MAINTENANCE_METRIC_FILE="${MAINTENANCE_METRIC_FILE:-$PROJECT_ROOT/logs/maintenance-checks.metric}"

# Run-wide state updated by the check functions:
#   FAILED          - number of checks that reported FAIL
#   STORAGE_MAX_PCT - highest storage usage percentage seen (integer)
FAILED=0
STORAGE_MAX_PCT=0

#######################################
# [136] RPC liveness: POST eth_blockNumber to the RPC node on port 8545 and
# report OK when the reply contains a "result" member, FAIL otherwise.
# Globals:   IP_RPC_2201 (read), FAILED (incremented on failure)
# Outputs:   one status line on stdout
# Returns:   0 always; failures are reported through FAILED
#######################################
check_rpc() {
  echo -n "[136] RPC (${IP_RPC_2201}:8545)... "
  local reply
  # Capture the reply rather than piping curl into `grep -q`, so the verdict
  # depends only on the response text and not on pipeline exit statuses.
  reply=$(curl -sf --max-time 10 -X POST -H "Content-Type: application/json" \
    -d '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' \
    "http://${IP_RPC_2201}:8545" || true)
  if [[ "$reply" == *'"result"'* ]]; then
    echo "OK"
  else
    echo "FAIL"
    FAILED=$((FAILED + 1))
  fi
}

#######################################
# Get RPC chain head block number (decimal). Empty on failure.
# Sends eth_blockNumber to the RPC node and converts the hex "result" to
# decimal.
# Globals:   IP_RPC_2201 (read)
# Outputs:   decimal block number on stdout, or nothing when the request
#            fails or the reply carries no usable hex result
# Returns:   0 always, so callers running under `set -e` are never aborted
#######################################
get_rpc_block_number() {
  local reply hex
  # Requires at least one hex digit after 0x: a bare "0x" would otherwise
  # reach the arithmetic expansion below and raise an error.
  local -r result_re='"result"[[:space:]]*:[[:space:]]*"(0x[0-9a-fA-F]+)"'

  reply=$(curl -sf --max-time 10 -X POST -H "Content-Type: application/json" \
    -d '{"jsonrpc":"2.0","method":"eth_blockNumber","params":[],"id":1}' \
    "http://${IP_RPC_2201}:8545" 2>/dev/null) || return 0

  if [[ "$reply" =~ $result_re ]]; then
    hex=${BASH_REMATCH[1]}
    echo "$((hex))"
  fi
}

#######################################
# Get Blockscout last indexed block. Empty on failure.
# Tries "total_blocks" from /api/v2/stats first (the value may be a JSON
# string or a number), then falls back to the first "height" in
# /api/v2/blocks.
# Globals:   IP_BLOCKSCOUT, BLOCKSCOUT_API_PORT (read)
# Outputs:   block number on stdout, or nothing when neither endpoint
#            yields a number
# Returns:   0 always
#######################################
get_explorer_block_number() {
  local base="http://${IP_BLOCKSCOUT}:${BLOCKSCOUT_API_PORT}"
  local body
  local -r total_re='"total_blocks"[[:space:]]*:[[:space:]]*"?([0-9]+)'
  local -r height_re='"height"[[:space:]]*:[[:space:]]*"?([0-9]+)'

  body=$(curl -sf --max-time 10 "${base}/api/v2/stats" 2>/dev/null || true)
  if [[ "$body" =~ $total_re ]]; then
    echo "${BASH_REMATCH[1]}"
    return 0
  fi

  # Fallback: first block from /api/v2/blocks. The regex match is leftmost,
  # so when the page lists several blocks the first listed height is used
  # rather than the last one on the line.
  body=$(curl -sf --max-time 10 "${base}/api/v2/blocks?page_size=1" 2>/dev/null || true)
  if [[ "$body" =~ $height_re ]]; then
    echo "${BASH_REMATCH[1]}"
  fi
}

#######################################
# [135] Explorer: the Blockscout API must answer with recognisable stats.
# Probes, in order:
#   1. LAN  /api/v2/stats                       -> "OK"
#   2. LAN  /api?module=stats&action=eth_price  -> "OK"
#   3. public https://explorer.d-bis.org/api/v2/stats -> "OK (public)"
# When none answers: FAIL if EXPLORER_FAIL_WHEN_UNREACHABLE=1, else SKIP.
# Globals:   IP_BLOCKSCOUT, BLOCKSCOUT_API_PORT,
#            EXPLORER_FAIL_WHEN_UNREACHABLE (read); FAILED (incremented)
# Outputs:   one status line on stdout
# Returns:   0 always; failures are reported through FAILED
#######################################
check_explorer_sync() {
  echo -n "[135] Explorer indexer (${IP_BLOCKSCOUT}:${BLOCKSCOUT_API_PORT})... "
  local base="http://${IP_BLOCKSCOUT}:${BLOCKSCOUT_API_PORT}"
  local body
  local -r stats_re='"total_blocks"|"total_transactions"|"indexer"'
  local -r legacy_re='"result"|"eth_price"'
  local -r public_re='"total_blocks"|"total_transactions"'

  # Each response is captured first and matched afterwards, so the verdict
  # depends only on the response text and not on pipeline exit statuses.
  body=$(curl -sf --max-time 10 "${base}/api/v2/stats" 2>/dev/null || true)
  if [[ "$body" =~ $stats_re ]]; then
    echo "OK"
    return 0
  fi

  body=$(curl -sf --max-time 10 "${base}/api?module=stats&action=eth_price" 2>/dev/null || true)
  if [[ "$body" =~ $legacy_re ]]; then
    echo "OK"
    return 0
  fi

  # Try public URL (in case we're off-LAN and only NPMplus path works).
  # NOTE(review): -k disables TLS certificate verification for this probe.
  body=$(curl -sf --max-time 10 -k "https://explorer.d-bis.org/api/v2/stats" 2>/dev/null || true)
  if [[ "$body" =~ $public_re ]]; then
    echo "OK (public)"
    return 0
  fi

  if [ "${EXPLORER_FAIL_WHEN_UNREACHABLE}" = "1" ]; then
    echo "FAIL (Blockscout unreachable)"
    FAILED=$((FAILED + 1))
  else
    echo "SKIP (Blockscout unreachable; run from LAN or set EXPLORER_FAIL_WHEN_UNREACHABLE=1)"
  fi
}

#######################################
# [135b] Indexer lag: fail if the explorer block is more than
# EXPLORER_INDEXER_LAG_THRESHOLD behind the RPC head.
# Block numbers come from get_rpc_block_number and
# get_explorer_block_number. Missing or non-numeric values produce SKIP
# rather than a misleading OK.
# Globals:   EXPLORER_INDEXER_LAG_THRESHOLD (read); FAILED (incremented)
# Outputs:   one status line on stdout
# Returns:   0 always; failures are reported through FAILED
#######################################
check_explorer_indexer_lag() {
  echo -n "[135b] Explorer indexer lag (RPC vs Blockscout)... "
  local rpc_block explorer_block lag
  local threshold="${EXPLORER_INDEXER_LAG_THRESHOLD}"
  local -r int_re='^[0-9]+$'

  rpc_block=$(get_rpc_block_number) || rpc_block=""
  explorer_block=$(get_explorer_block_number) || explorer_block=""

  if [[ -z "$rpc_block" || -z "$explorer_block" ]]; then
    echo "SKIP (RPC or Blockscout unreachable)"
    return 0
  fi
  # A non-numeric value would make the comparison itself error out; report
  # it explicitly instead of treating the explorer as caught up.
  if [[ ! "$rpc_block" =~ $int_re || ! "$explorer_block" =~ $int_re ]]; then
    echo "SKIP (could not parse block numbers)"
    return 0
  fi
  if [[ ! "$threshold" =~ $int_re ]]; then
    echo "FAIL (invalid EXPLORER_INDEXER_LAG_THRESHOLD: ${threshold})"
    FAILED=$((FAILED + 1))
    return 0
  fi

  # 10# forces base 10 so a leading zero is never read as octal.
  if ((10#$rpc_block <= 10#$explorer_block)); then
    echo "OK (explorer caught up)"
    return 0
  fi

  lag=$((10#$rpc_block - 10#$explorer_block))
  if ((lag > 10#$threshold)); then
    echo "FAIL (lag ${lag} > ${threshold})"
    FAILED=$((FAILED + 1))
  else
    echo "OK (lag ${lag})"
  fi
}

#######################################
# [137] Config API: GET ${DBIS_API_URL}/health and report OK on HTTP 200.
# Any other outcome prints a "SKIP or FAIL" line and does NOT increment
# FAILED; the message itself notes the URL may be unreachable off-LAN.
# Globals:   DBIS_API_URL (read)
# Outputs:   one status line on stdout
# Returns:   0 always
#######################################
check_config_api() {
  echo -n "[137] Config API (${DBIS_API_URL})... "
  local status
  # A failing curl clears the status so a partial transfer never counts as OK.
  status=$(curl -sf --max-time 10 -o /dev/null -w "%{http_code}" \
    "${DBIS_API_URL}/health" 2>/dev/null) || status=""
  if [[ "$status" == "200" ]]; then
    echo "OK"
  else
    echo "SKIP or FAIL (external URL; may be unreachable off-LAN)"
  fi
}

#######################################
# [138a] Weekly: thin pool / storage usage on one host.
# WARN at >=85%, FAIL at >=95%, FAIL "100% full" at >=100%.
# Usage: check_thin_pool_one_host <ip> <label>
#
# Runs `pvesm status` and `lvs -o lv_name,data_percent` over SSH as root and
# takes the highest usage figure found. Only two kinds of field are read as
# percentages:
#   - a last column of the form "NN.NN%"
#   - the second column of a two-column row when it is a plain number
# Sizes, byte counts and digits inside storage names are ignored.
#
# Globals:   STORAGE_MAX_PCT (raised to the integer part of the highest
#            percentage seen), FAILED (incremented on FAIL)
# Arguments: $1 - host IP, $2 - label used in the status line
# Outputs:   one status line on stdout
# Returns:   0 always; failures are reported through FAILED
#######################################
check_thin_pool_one_host() {
  local ip="$1" label="$2"
  echo -n "[138a] Storage $label ($ip)... "
  local out pct pct_int

  # NOTE(review): StrictHostKeyChecking=no skips host-key verification.
  out=$(ssh -o ConnectTimeout=10 -o StrictHostKeyChecking=no root@"$ip" "pvesm status 2>/dev/null; lvs --noheadings -o lv_name,data_percent 2>/dev/null | awk '\$2!=\"\"'" 2>/dev/null || true)
  if [ -z "$out" ]; then
    echo "SKIP (SSH failed or no storage)"
    return 0
  fi

  # Highest usage figure across both command outputs, printed as found
  # (e.g. "87.50"); empty when no row carries one.
  pct=$(awk '
    { v = "" }
    $NF ~ /^[0-9]+(\.[0-9]+)?%$/ { v = substr($NF, 1, length($NF) - 1) }
    NF == 2 && $2 ~ /^[0-9]+(\.[0-9]+)?$/ { v = $2 }
    v != "" && (max == "" || v + 0 > max + 0) { max = v }
    END { if (max != "") print max }
  ' <<<"$out")

  if [[ -z "$pct" ]]; then
    echo "SKIP (no usage percentage found)"
    return 0
  fi

  # Integer part only; 10# keeps a leading zero from being read as octal.
  pct_int=$((10#${pct%%.*}))
  if ((pct_int > ${STORAGE_MAX_PCT:-0})); then
    STORAGE_MAX_PCT=$pct_int
  fi

  if ((pct_int >= 100)); then
    echo "FAIL (storage 100% full)"
    FAILED=$((FAILED + 1))
  elif ((pct_int >= 95)); then
    echo "FAIL (usage ${pct}% >= 95%)"
    FAILED=$((FAILED + 1))
  elif ((pct_int >= 85)); then
    echo "WARN (usage ${pct}% >= 85%)"
  else
    echo "OK"
  fi
}

#######################################
# [138a] Weekly: thin pool usage on all Proxmox hosts (r630-02, r630-01,
# ml110). The function name is historical; it checks all three hosts by
# calling check_thin_pool_one_host once per host, in that order.
# Globals:   PROXMOX_R630_02, PROXMOX_R630_01, PROXMOX_ML110 (read)
# Outputs:   one status line per host (from check_thin_pool_one_host)
#######################################
check_thin_pool_r630_02() {
  check_thin_pool_one_host "${PROXMOX_R630_02}" "r630-02"
  check_thin_pool_one_host "${PROXMOX_R630_01}" "r630-01"
  check_thin_pool_one_host "${PROXMOX_ML110}" "ml110"
}

#######################################
# Write metric file for alerting (FAILED count, timestamp). Optional: does
# nothing when MAINTENANCE_METRIC_FILE is empty.
# The content goes to a PID-suffixed temp file next to the target and is
# then renamed over it, so a reader never sees a half-written file.
# Globals:   MAINTENANCE_METRIC_FILE, FAILED (read)
# Outputs:   a diagnostic on stderr when the file cannot be written
# Returns:   0 on success or when disabled; non-zero when writing fails
#            (the temp file is removed first)
#######################################
write_metric_file() {
  [ -n "${MAINTENANCE_METRIC_FILE}" ] || return 0
  local tmp="${MAINTENANCE_METRIC_FILE}.$$"

  if mkdir -p "$(dirname "$MAINTENANCE_METRIC_FILE")" &&
    printf 'maintenance_checks_failed %s\nmaintenance_checks_timestamp %s\n' \
      "${FAILED}" "$(date +%s)" >"$tmp" &&
    mv "$tmp" "${MAINTENANCE_METRIC_FILE}"; then
    return 0
  fi

  rm -f -- "$tmp"
  echo "ERROR: could not write metric file ${MAINTENANCE_METRIC_FILE}" >&2
  return 1
}

# [A7] Storage metric file (max thin pool % and timestamp) for external alerting.
STORAGE_METRIC_FILE="${STORAGE_METRIC_FILE:-$PROJECT_ROOT/logs/storage-growth/last_run.metric}"

#######################################
# Write STORAGE_MAX_PCT and a timestamp to STORAGE_METRIC_FILE. Does
# nothing when STORAGE_MAX_PCT is empty.
# The content goes to a PID-suffixed temp file next to the target and is
# then renamed over it, so a reader never sees a half-written file.
# Globals:   STORAGE_METRIC_FILE, STORAGE_MAX_PCT (read)
# Outputs:   a diagnostic on stderr when the file cannot be written
# Returns:   0 on success or when skipped; non-zero when writing fails
#            (the temp file is removed first)
#######################################
write_storage_metric_file() {
  [ -n "${STORAGE_MAX_PCT}" ] || return 0
  local tmp="${STORAGE_METRIC_FILE}.$$"

  if mkdir -p "$(dirname "$STORAGE_METRIC_FILE")" &&
    printf 'storage_max_pct %s\nstorage_metric_timestamp %s\n' \
      "${STORAGE_MAX_PCT}" "$(date +%s)" >"$tmp" &&
    mv "$tmp" "$STORAGE_METRIC_FILE"; then
    return 0
  fi

  rm -f -- "$tmp"
  echo "ERROR: could not write storage metric file ${STORAGE_METRIC_FILE}" >&2
  return 1
}

#######################################
# Run an optional helper script and indent its combined stdout/stderr by
# one space. Does nothing when the script is missing or not executable.
# A non-zero exit from the helper is counted in FAILED rather than aborting
# the whole run, so the remaining checks, the metric files and the final
# summary still happen.
# Globals:   FAILED (incremented when the helper exits non-zero)
# Arguments: $1 - heading line to print, $2 - path to the helper script
# Returns:   0 always
#######################################
run_helper_script() {
  local heading="$1" script="$2" line
  [ -x "$script" ] || return 0
  echo "$heading"
  # With pipefail the pipeline status reflects the helper's exit status.
  # `|| [ -n "$line" ]` keeps a final line that lacks a trailing newline.
  if ! bash "$script" 2>&1 |
    while IFS= read -r line || [ -n "$line" ]; do echo " $line"; done; then
    echo " (helper exited non-zero: $script)"
    FAILED=$((FAILED + 1))
  fi
}

echo "=== Maintenance checks ($MODE) $(date -Iseconds) ==="
CHECK_DISK_SCRIPT="${PROJECT_ROOT}/scripts/maintenance/check-disk-all-vmids.sh"

case "$MODE" in
  daily)
    check_explorer_sync
    check_explorer_indexer_lag
    check_rpc
    # [A5] In-CT disk check (WARN 85%, CRIT 95% on root /)
    run_helper_script "[138b] In-CT disk (root /)..." "$CHECK_DISK_SCRIPT"
    ;;
  weekly)
    check_config_api
    check_thin_pool_r630_02
    # [A8] Weekly fstrim in running CTs (reclaim thin pool space)
    FSTRIM_SCRIPT="${PROJECT_ROOT}/scripts/maintenance/fstrim-all-running-ct.sh"
    run_helper_script "[138c] fstrim running CTs..." "$FSTRIM_SCRIPT"
    # [A10] Journal vacuum (keep last 7d) in key CTs
    JOURNAL_SCRIPT="${PROJECT_ROOT}/scripts/maintenance/journal-vacuum-key-ct.sh"
    run_helper_script "[138d] journal vacuum key CTs..." "$JOURNAL_SCRIPT"
    echo "[138] Review explorer logs: pct exec 5000 -- journalctl -u blockscout -n 200 --no-pager (from root@${PROXMOX_R630_02})"
    ;;
  all)
    check_explorer_sync
    check_explorer_indexer_lag
    check_rpc
    check_config_api
    check_thin_pool_r630_02
    echo "[138] Review explorer logs manually; [139] update token list as needed (token-list.json / explorer config)."
    ;;
  *)
    echo "Usage: $0 [daily|weekly|all]"
    exit 1
    ;;
esac

write_metric_file
# The storage metric is only meaningful after the thin pool checks have run.
if [[ "$MODE" == "weekly" || "$MODE" == "all" ]]; then
  write_storage_metric_file
fi

echo "=== Done (failed: $FAILED) ==="
# Exit status mirrors the result: 0 only when no check failed.
[[ $FAILED -eq 0 ]]