Files
proxmox/scripts/monitoring/master-stability-monitor.sh
defiQUG fbda1b4beb
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
docs: Ledger Live integration, contract deploy learnings, NEXT_STEPS updates
- ADD_CHAIN138_TO_LEDGER_LIVE: Ledger form done; public code review repo bis-innovations/LedgerLive; init/push commands
- CONTRACT_DEPLOYMENT_RUNBOOK: Chain 138 gas price 1 gwei, 36-addr check, TransactionMirror workaround
- CONTRACT_*: AddressMapper, MirrorManager deployed 2026-02-12; 36-address on-chain check
- NEXT_STEPS_FOR_YOU: Ledger done; steps completable now (no LAN); run-completable-tasks-from-anywhere
- MASTER_INDEX, OPERATOR_OPTIONAL, SMART_CONTRACTS_INVENTORY_SIMPLE: updates
- LEDGER_BLOCKCHAIN_INTEGRATION_COMPLETE: bis-innovations/LedgerLive reference

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-12 15:46:57 -08:00

141 lines
3.8 KiB
Bash
Executable File

#!/usr/bin/env bash
# Master Stability Monitor
# Orchestrates all monitoring and recovery operations
#
# Usage: master-stability-monitor.sh [--once]
#   --once   run a single monitoring cycle; without it the script loops.
set -euo pipefail

# Resolve the script location once. PROJECT_ROOT is two levels above this
# script's directory (SCRIPT_DIR/../..).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Load IP configuration (optional). Look under the project root first and
# fall back to the directory one level above the script, which is the only
# place that was consulted previously. The first file found wins; a missing
# or failing config is tolerated on purpose (best-effort load).
for _ip_conf in \
  "$PROJECT_ROOT/config/ip-addresses.conf" \
  "$SCRIPT_DIR/../config/ip-addresses.conf"; do
  if [ -f "$_ip_conf" ]; then
    # shellcheck disable=SC1090
    source "$_ip_conf" 2>/dev/null || true
    break
  fi
done
unset _ip_conf

# Load environment (optional). errexit is switched off while sourcing so a
# failing line inside .env cannot terminate the monitor.
if [ -f "$PROJECT_ROOT/smom-dbis-138/.env" ]; then
  set +e
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/smom-dbis-138/.env" 2>/dev/null || true
  set -e
fi
# Colors: ANSI escape sequences kept in their backslash form; they are
# expanded at print time by the helpers below.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'

# Logging helpers. Each prints one coloured, tagged line to stdout and takes
# the message as $1. Backslash escapes in the message are expanded
# (printf %b expands them the same way echo -e does).
log_info() {
  printf '%b\n' "${BLUE}[INFO]${NC} $1"
}

log_success() {
  printf '%b\n' "${GREEN}[✓]${NC} $1"
}

log_warn() {
  printf '%b\n' "${YELLOW}[⚠]${NC} $1"
}

log_error() {
  printf '%b\n' "${RED}[✗]${NC} $1"
}

# Prints a banner: blank line, rule, title ($1), rule, blank line.
log_section() {
  local rule="${CYAN}━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━${NC}"
  printf '%b\n' "\n${rule}"
  printf '%b\n' "${CYAN}$1${NC}"
  printf '%b\n' "${rule}\n"
}
# Configuration
# Each setting keeps its previous value as the default but can now be
# overridden from the environment (or from a file sourced earlier in this
# script) without editing the script.
CHECK_INTERVAL="${CHECK_INTERVAL:-120}" # Seconds between cycles (default: every 2 minutes)
AUTO_FIX="${AUTO_FIX:-true}"            # "true" runs the auto-fix script after a failed health check
# NOTE(review): AUTO_RESTART is not referenced anywhere else in this script as
# shown — confirm whether another component reads it before removing.
AUTO_RESTART="${AUTO_RESTART:-true}"
# Runs check-validator-health.sh from SCRIPT_DIR with stderr merged into
# stdout and logs the outcome.
# Globals: SCRIPT_DIR (read)
# Returns: 0 when the script exits successfully, 1 for any other exit status
run_health_check() {
  log_section "Running Health Check"

  local checker="${SCRIPT_DIR}/check-validator-health.sh"
  if ! bash "$checker" 2>&1; then
    log_error "Health check failed"
    return 1
  fi

  log_success "Health check passed"
  return 0
}
# Runs auto-fix-validator-config.sh from SCRIPT_DIR with stderr merged into
# stdout and logs the outcome.
# Globals: SCRIPT_DIR (read)
# Returns: 0 when the script exits successfully, 1 for any other exit status
run_auto_fix() {
  log_section "Running Auto-Fix"

  local fixer="${SCRIPT_DIR}/auto-fix-validator-config.sh"
  if ! bash "$fixer" 2>&1; then
    log_warn "Auto-fix had issues"
    return 1
  fi

  log_success "Auto-fix completed"
  return 0
}
# Samples the chain's block number twice, 10 seconds apart, via
# `cast block-number` and reports whether it changed.
# Globals: RPC_URL_138 (read, preferred endpoint),
#          RPC_CORE_1  (read, host used for http://<host>:8545 when
#                       RPC_URL_138 is unset or empty)
# Returns: 0 when both samples are valid and differ; 1 when the block number
#          did not change, a sample could not be read, or no endpoint is
#          configured.
check_block_production() {
  log_section "Checking Block Production"

  # Resolve the endpoint without tripping `set -u` when neither variable is
  # defined (e.g. the IP configuration file was not found).
  local rpc_url="${RPC_URL_138:-}"
  if [ -z "$rpc_url" ]; then
    if [ -z "${RPC_CORE_1:-}" ]; then
      log_error "Block production check failed: no RPC endpoint configured (set RPC_URL_138 or RPC_CORE_1)"
      return 1
    fi
    rpc_url="http://${RPC_CORE_1}:8545"
  fi

  # Declaration is kept separate from assignment so a failing cast is not
  # masked by `local`; a failed query leaves the sample empty.
  local block1 block2
  block1=$(cast block-number --rpc-url "$rpc_url" 2>/dev/null) || block1=""
  sleep 10
  block2=$(cast block-number --rpc-url "$rpc_url" 2>/dev/null) || block2=""

  # Both samples must be plain decimal numbers; otherwise a failed first
  # query followed by a successful one would look like progress.
  if ! [[ "$block1" =~ ^[0-9]+$ && "$block2" =~ ^[0-9]+$ ]]; then
    log_error "Block production check failed: could not read block number from $rpc_url"
    return 1
  fi

  if [ "$block1" != "$block2" ]; then
    log_success "Block production active ($block1 → $block2)"
    return 0
  fi

  log_error "Block production STALLED (block: $block1)"
  return 1
}
# Runs one monitoring cycle: health check (with an optional auto-fix and
# re-check), block production check, then a summary.
# Globals: AUTO_FIX (read) - "true" enables the auto-fix attempt
# Returns: 0 when everything is healthy,
#          2 when block production is not confirmed (critical),
#          1 when only the health check is failing.
# The status is returned rather than passed to `exit`, so a caller that
# loops can run further cycles; a bare call under `set -e` still ends the
# script with the same status on failure.
main() {
  log_section "Master Stability Monitor"
  log_info "Starting comprehensive stability monitoring..."
  echo ""

  local health_ok=true
  local blocks_ok=true

  # Run health check
  if ! run_health_check; then
    health_ok=false

    # Auto-fix if enabled
    if [ "$AUTO_FIX" = true ]; then
      log_warn "Attempting automatic fix..."
      # run_auto_fix already logs its own failure. Its status is ignored so
      # that `set -e` cannot abort the cycle before the re-check and summary.
      run_auto_fix || true

      # Re-check health after giving the fix time to take effect
      sleep 30
      if run_health_check; then
        log_success "Auto-fix resolved issues"
        health_ok=true
      else
        log_error "Auto-fix did not resolve issues"
      fi
    fi
  fi

  # Check block production
  if ! check_block_production; then
    blocks_ok=false
    log_error "CRITICAL: Block production stalled"
  fi

  # Summary
  log_section "Monitoring Summary"
  if [ "$health_ok" = true ] && [ "$blocks_ok" = true ]; then
    log_success "All systems operational"
    return 0
  elif [ "$blocks_ok" = false ]; then
    log_error "CRITICAL: Block production issue detected"
    return 2
  else
    log_warn "Non-critical issues detected"
    return 1
  fi
}
# Entry point.
#   --once : run a single cycle and exit with main's status.
#   (none) : run cycles forever, pausing CHECK_INTERVAL seconds between them.
# main's status is captured with `|| ...` in both modes: a bare call would
# let `set -e` end the script as soon as main returns non-zero, which would
# stop a continuous monitor on the first unhealthy cycle.
if [ "${1:-}" = "--once" ]; then
  main_status=0
  main || main_status=$?
  exit "$main_status"
else
  while true; do
    main_status=0
    main || main_status=$?
    if [ "$main_status" -ne 0 ]; then
      log_warn "Monitoring cycle finished with status $main_status; next check in ${CHECK_INTERVAL}s"
    fi
    sleep "$CHECK_INTERVAL"
  done
fi