Files
proxmox/scripts/npmplus/monitor-ha-status.sh
defiQUG bea1903ac9
Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Sync all local changes: docs, config, scripts, submodule refs, verification evidence
Co-authored-by: Cursor <cursoragent@cursor.com>
2026-02-21 15:46:06 -08:00

115 lines
4.7 KiB
Bash
Executable File

#!/usr/bin/env bash
# Monitor HA status and send alerts if needed.
# Optional: ALERT_EMAIL (mail) or ALERT_WEBHOOK (Slack/Discord/Teams JSON) for alerts.
set -euo pipefail

# SCRIPT_DIR is the directory containing this script; PROJECT_ROOT is the
# directory two levels above it (where .env is looked up).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Load IP configuration (optional). The file is looked up one level above the
# script first, then under PROJECT_ROOT; the first one found is sourced.
# Strict mode is relaxed while sourcing so that an unset variable or a failing
# command inside the config file cannot abort the monitor.
for _ip_conf in \
  "$SCRIPT_DIR/../config/ip-addresses.conf" \
  "$PROJECT_ROOT/config/ip-addresses.conf"; do
  if [ -f "$_ip_conf" ]; then
    set +euo pipefail
    # shellcheck source=/dev/null
    source "$_ip_conf" 2>/dev/null || true
    set -euo pipefail
    break
  fi
done
unset _ip_conf

# Load optional environment overrides, again with strict mode relaxed.
if [ -f "$PROJECT_ROOT/.env" ]; then
  set +euo pipefail
  # shellcheck source=/dev/null
  source "$PROJECT_ROOT/.env" 2>/dev/null || true
  set -euo pipefail
fi
# Runtime settings. A value already present in the environment (non-empty)
# wins; otherwise the fallback below is assigned.
# VIP falls back to IP_NPMPLUS_ETH0 when that is set, else to a fixed address.
: "${VIP:=${IP_NPMPLUS_ETH0:-192.168.11.166}}"
: "${PRIMARY_HOST:=192.168.11.11}"
: "${SECONDARY_HOST:=192.168.11.12}"
: "${LOG_FILE:=/tmp/npmplus-ha-monitor.log}"
# ANSI color escape sequences; they are interpreted by `echo -e` below.
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m'

# _log_tagged COLOR TAG MESSAGE
# Print "[TAG] MESSAGE" to stdout with the tag wrapped in COLOR.
_log_tagged() {
  echo -e "${1}[${2}]${NC} ${3}"
}

# Leveled log helpers; each takes the message as its first argument.
log_info() {
  _log_tagged "$BLUE" "INFO" "$1"
}
log_warn() {
  _log_tagged "$YELLOW" "WARN" "$1"
}
log_error() {
  _log_tagged "$RED" "ERROR" "$1"
}
log_success() {
  _log_tagged "$GREEN" "SUCCESS" "$1"
}
# Single timestamp shared by every log line of this run.
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# addr_output_has_ip ADDR_OUTPUT IP
# Succeeds when `ip addr` output ADDR_OUTPUT lists IP as a configured address.
# The match is literal and bounded by the "inet "/"inet6 " keyword on the left
# and by "/" (prefix length) or a space on the right, so 192.168.11.1 does not
# match 192.168.11.11 and dots are not treated as regex wildcards.
addr_output_has_ip() {
  local addr_output=$1 ip=$2 needle
  [[ -n "$ip" ]] || return 1
  for needle in "inet ${ip}/" "inet ${ip} " "inet6 ${ip}/"; do
    if [[ "$addr_output" == *"$needle"* ]]; then
      return 0
    fi
  done
  return 1
}

# host_holds_vip HOST
# Succeeds when HOST is reachable over SSH and its vmbr0 interface carries $VIP.
# The address list is fetched first and matched locally, so $VIP is never
# interpolated into the remote command line.
host_holds_vip() {
  local host=$1 addr_output
  addr_output=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 root@"$host" \
    "ip addr show vmbr0 2>/dev/null" 2>/dev/null) || return 1
  addr_output_has_ip "$addr_output" "$VIP"
}

# Check who owns VIP (primary is probed first, then secondary).
VIP_OWNER="UNKNOWN"
if host_holds_vip "$PRIMARY_HOST"; then
  VIP_OWNER="$PRIMARY_HOST"
elif host_holds_vip "$SECONDARY_HOST"; then
  VIP_OWNER="$SECONDARY_HOST"
fi
# Append to the log file; fall back to stdout if the file cannot be written.
echo "[$TIMESTAMP] VIP $VIP owner: $VIP_OWNER" >> "$LOG_FILE" 2>&1 || echo "[$TIMESTAMP] VIP $VIP owner: $VIP_OWNER"
# keepalived_state HOST
# Print the keepalived unit state reported by `systemctl is-active` on HOST
# (e.g. "active", "inactive", "failed"), or "unknown" when nothing could be
# read (SSH failure, systemctl missing).
# `systemctl is-active` exits non-zero for every non-active state while still
# printing it, so the exit status is ignored and the fallback is applied only
# when the output is empty; otherwise the result would be "inactive" followed
# by a second "unknown" line.
keepalived_state() {
  local host=$1 state
  state=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 root@"$host" \
    "systemctl is-active keepalived 2>/dev/null") || true
  # Keep only the first line so the value always fits on one log line.
  state=${state%%$'\n'*}
  printf '%s\n' "${state:-unknown}"
}

# Check Keepalived status on both hosts
PRIMARY_STATUS=$(keepalived_state "$PRIMARY_HOST")
SECONDARY_STATUS=$(keepalived_state "$SECONDARY_HOST")
# Append to the log file; fall back to stdout if the file cannot be written.
echo "[$TIMESTAMP] Primary Keepalived: $PRIMARY_STATUS" >> "$LOG_FILE" 2>&1 || echo "[$TIMESTAMP] Primary Keepalived: $PRIMARY_STATUS"
echo "[$TIMESTAMP] Secondary Keepalived: $SECONDARY_STATUS" >> "$LOG_FILE" 2>&1 || echo "[$TIMESTAMP] Secondary Keepalived: $SECONDARY_STATUS"
# json_escape STRING
# Print STRING with backslash, double quote, newline, carriage return and tab
# escaped so it can be embedded inside a JSON string literal.
json_escape() {
  local s=$1
  s=${s//\\/\\\\}
  s=${s//\"/\\\"}
  s=${s//$'\n'/\\n}
  s=${s//$'\r'/\\r}
  s=${s//$'\t'/\\t}
  printf '%s' "$s"
}

# append_log LINE
# Append LINE to $LOG_FILE; if the file cannot be written, print LINE to
# stdout instead so the information is not lost.
append_log() {
  printf '%s\n' "$1" >> "$LOG_FILE" 2>&1 || printf '%s\n' "$1"
}

# raise_alert MESSAGE
# Record MESSAGE in the log, print it via log_error, and deliver it through
# every configured channel:
#   ALERT_EMAIL   - sent with mail(1), subject "NPMplus HA Alert"
#   ALERT_WEBHOOK - POSTed as JSON {"text": MESSAGE}
# Delivery is best-effort: a failing mail/curl never aborts the monitor.
raise_alert() {
  local message=$1
  append_log "$message"
  log_error "$message"
  if [ -n "${ALERT_EMAIL:-}" ]; then
    printf '%s\n' "$message" | mail -s "NPMplus HA Alert" "$ALERT_EMAIL" 2>/dev/null || true
  fi
  if [ -n "${ALERT_WEBHOOK:-}" ]; then
    # --max-time keeps an unresponsive webhook from stalling the whole check.
    curl -s --max-time 10 -X POST "$ALERT_WEBHOOK" \
      -H "Content-Type: application/json" \
      -d "{\"text\":\"$(json_escape "$message")\"}" 2>/dev/null || true
  fi
}

# Alert if both are down
if [ "$PRIMARY_STATUS" != "active" ] && [ "$SECONDARY_STATUS" != "active" ]; then
  ALERT_MSG="[$TIMESTAMP] ALERT: Both Keepalived instances are down! HA unavailable."
  raise_alert "$ALERT_MSG"
fi

# Alert if VIP is not owned by either host
if [ "$VIP_OWNER" = "UNKNOWN" ]; then
  ALERT_MSG="[$TIMESTAMP] ALERT: VIP $VIP is not owned by any host!"
  raise_alert "$ALERT_MSG"
fi

# Check NPMplus container status on owner
if [ "$VIP_OWNER" != "UNKNOWN" ]; then
  # Pick the container ID that belongs to whichever host currently holds the VIP.
  if [ "$VIP_OWNER" = "$PRIMARY_HOST" ]; then
    NPMPLUS_VMID="${PRIMARY_VMID:-10233}"
  else
    NPMPLUS_VMID="${SECONDARY_VMID:-10234}"
  fi
  # Remote side reduces `pct status` output to running/stopped/unknown; the
  # local fallback covers an SSH failure.
  CONTAINER_STATUS=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 root@"$VIP_OWNER" \
    "pct status $NPMPLUS_VMID 2>/dev/null | grep -o 'running\|stopped' || echo 'unknown'" || echo "unknown")
  if [ "$CONTAINER_STATUS" != "running" ]; then
    ALERT_MSG="[$TIMESTAMP] ALERT: NPMplus container on $VIP_OWNER (VMID $NPMPLUS_VMID) is $CONTAINER_STATUS"
    raise_alert "$ALERT_MSG"
  fi
fi

append_log "[$TIMESTAMP] HA status check complete"