#!/usr/bin/env bash
#
# Monitor HA status and send alerts if needed.
#
# What one run does:
#   1. Asks PRIMARY_HOST, then SECONDARY_HOST, over ssh whether the VIP is
#      configured on interface vmbr0; the first host that has it is the owner.
#   2. Reads `systemctl is-active keepalived` from both hosts.
#   3. Alerts when neither Keepalived is "active", when no host owns the VIP,
#      or when the NPMplus container on the VIP owner is not "running".
#
# Every status line and alert is appended to LOG_FILE (echoed to stdout when
# the file cannot be written). Alerts are additionally printed via log_error
# and sent by mail / webhook when configured.
#
# Optional environment (may also come from config/ip-addresses.conf or .env):
#   VIP, IP_NPMPLUS_ETH0   virtual IP to look for (default 192.168.11.166)
#   PRIMARY_HOST           default 192.168.11.11
#   SECONDARY_HOST         default 192.168.11.12
#   PRIMARY_VMID           default 10233
#   SECONDARY_VMID         default 10234
#   LOG_FILE               default /tmp/npmplus-ha-monitor.log
#   ALERT_EMAIL            recipient for `mail` alerts
#   ALERT_WEBHOOK          URL receiving a {"text": "..."} JSON POST
#                          (Slack/Discord/Teams style)

set -euo pipefail

# Load IP configuration. The config file is looked up one directory above the
# script; a missing or failing file is tolerated on purpose.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
# shellcheck disable=SC1091
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# The .env file is looked up two directories above the script, so PROJECT_ROOT
# is deliberately recomputed here with a different depth than above.
# NOTE(review): the two lookups use different roots ("..", "../..") - confirm
# this matches the repository layout.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
if [ -f "$PROJECT_ROOT/.env" ]; then
  # Strict mode is relaxed while sourcing so that unset variables or failing
  # commands inside .env do not abort the monitor.
  set +euo pipefail
  # shellcheck disable=SC1091
  source "$PROJECT_ROOT/.env" 2>/dev/null || true
  set -euo pipefail
fi

VIP="${VIP:-${IP_NPMPLUS_ETH0:-192.168.11.166}}"
PRIMARY_HOST="${PRIMARY_HOST:-192.168.11.11}"
SECONDARY_HOST="${SECONDARY_HOST:-192.168.11.12}"
LOG_FILE="${LOG_FILE:-/tmp/npmplus-ha-monitor.log}"

# Options shared by every ssh call.
# NOTE(review): StrictHostKeyChecking=no disables host key verification;
# kept as in the original behaviour - confirm this is acceptable here.
SSH_OPTS=(-o StrictHostKeyChecking=no -o ConnectTimeout=3)

# Colors
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m'

log_info() { echo -e "${BLUE}[INFO]${NC} $1"; }
log_warn() { echo -e "${YELLOW}[WARN]${NC} $1"; }
log_error() { echo -e "${RED}[ERROR]${NC} $1"; }
log_success() { echo -e "${GREEN}[SUCCESS]${NC} $1"; }

#######################################
# Append one line to LOG_FILE; print it to stdout if the append fails.
# Globals:   LOG_FILE (read)
# Arguments: $1 - line to record
#######################################
log_line() {
  printf '%s\n' "$1" >> "$LOG_FILE" 2>&1 || printf '%s\n' "$1"
}

#######################################
# Escape a string for use inside a double-quoted JSON string value.
# Handles backslash, double quote, newline, carriage return and tab.
# Arguments: $1 - raw text
# Outputs:   escaped text on stdout (no trailing newline)
#######################################
json_escape() {
  local text=$1
  # Backslashes first, otherwise the escapes added below get doubled.
  text=${text//\\/\\\\}
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Record an alert and deliver it through every configured channel.
# Mail and webhook delivery are best-effort: failures are ignored so a broken
# channel never aborts the monitor.
# Globals:   ALERT_EMAIL, ALERT_WEBHOOK (read, both optional)
# Arguments: $1 - alert message
#######################################
send_alert() {
  local msg=$1

  log_line "$msg"
  log_error "$msg"

  if [ -n "${ALERT_EMAIL:-}" ]; then
    printf '%s\n' "$msg" \
      | mail -s "NPMplus HA Alert" "$ALERT_EMAIL" 2>/dev/null || true
  fi

  if [ -n "${ALERT_WEBHOOK:-}" ]; then
    # The message is JSON-escaped so quotes or backslashes coming from
    # configured values cannot produce an invalid payload.
    curl -s -X POST "$ALERT_WEBHOOK" \
      -H "Content-Type: application/json" \
      -d "{\"text\":\"$(json_escape "$msg")\"}" 2>/dev/null || true
  fi
}

#######################################
# Test whether the VIP is configured on vmbr0 of a host.
# Globals:   VIP, SSH_OPTS (read)
# Arguments: $1 - host to query as root over ssh
# Returns:   0 if the VIP is listed, non-zero otherwise (including ssh errors)
#######################################
host_has_vip() {
  local host=$1 quoted_vip
  # Quote the value for the remote command line (assumes the remote login
  # shell understands bash-style quoting).
  printf -v quoted_vip '%q' "$VIP"
  # -F: match the address literally (dots are not wildcards).
  # -w: whole-word match, so 192.168.11.16 does not match 192.168.11.166.
  ssh "${SSH_OPTS[@]}" root@"$host" \
    "ip addr show vmbr0 2>/dev/null | grep -qwF -- $quoted_vip" 2>/dev/null
}

#######################################
# Report the Keepalived unit state of a host.
# Arguments: $1 - host to query as root over ssh
# Outputs:   the text printed by `systemctl is-active`, or "unknown" when
#            nothing could be read (e.g. ssh failed)
#######################################
keepalived_status() {
  local host=$1 state
  # `systemctl is-active` prints the state but exits non-zero for anything
  # other than "active"; the exit status is therefore ignored and only an
  # empty answer is mapped to "unknown".
  state=$(ssh "${SSH_OPTS[@]}" root@"$host" \
    "systemctl is-active keepalived 2>/dev/null") || true
  printf '%s' "${state:-unknown}"
}

#######################################
# Report the state of a container as seen by `pct status` on a host.
# Arguments: $1 - host to query as root over ssh
#            $2 - container VMID
# Outputs:   "running", "stopped" or "unknown"
#######################################
container_status() {
  local host=$1 vmid=$2 quoted_vmid state
  printf -v quoted_vmid '%q' "$vmid"
  state=$(ssh "${SSH_OPTS[@]}" root@"$host" \
    "pct status $quoted_vmid 2>/dev/null | grep -o 'running\|stopped' || echo 'unknown'") || true
  printf '%s' "${state:-unknown}"
}

TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

# Check who owns VIP (primary is asked first).
VIP_OWNER="UNKNOWN"
if host_has_vip "$PRIMARY_HOST"; then
  VIP_OWNER="$PRIMARY_HOST"
elif host_has_vip "$SECONDARY_HOST"; then
  VIP_OWNER="$SECONDARY_HOST"
fi

log_line "[$TIMESTAMP] VIP $VIP owner: $VIP_OWNER"

# Check Keepalived status on both hosts
PRIMARY_STATUS=$(keepalived_status "$PRIMARY_HOST")
SECONDARY_STATUS=$(keepalived_status "$SECONDARY_HOST")

log_line "[$TIMESTAMP] Primary Keepalived: $PRIMARY_STATUS"
log_line "[$TIMESTAMP] Secondary Keepalived: $SECONDARY_STATUS"

# Alert if both are down (any state other than "active" counts as down,
# including "unknown").
if [ "$PRIMARY_STATUS" != "active" ] && [ "$SECONDARY_STATUS" != "active" ]; then
  send_alert "[$TIMESTAMP] ALERT: Both Keepalived instances are down! HA unavailable."
fi

# Alert if VIP is not owned by either host
if [ "$VIP_OWNER" = "UNKNOWN" ]; then
  send_alert "[$TIMESTAMP] ALERT: VIP $VIP is not owned by any host!"
fi

# Check NPMplus container status on owner
if [ "$VIP_OWNER" != "UNKNOWN" ]; then
  if [ "$VIP_OWNER" = "$PRIMARY_HOST" ]; then
    NPMPLUS_VMID="${PRIMARY_VMID:-10233}"
  else
    NPMPLUS_VMID="${SECONDARY_VMID:-10234}"
  fi

  CONTAINER_STATUS=$(container_status "$VIP_OWNER" "$NPMPLUS_VMID")

  if [ "$CONTAINER_STATUS" != "running" ]; then
    send_alert "[$TIMESTAMP] ALERT: NPMplus container on $VIP_OWNER (VMID $NPMPLUS_VMID) is $CONTAINER_STATUS"
  fi
fi

log_line "[$TIMESTAMP] HA status check complete"