Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Co-authored-by: Cursor <cursoragent@cursor.com>
115 lines
4.7 KiB
Bash
Executable File
115 lines
4.7 KiB
Bash
Executable File
#!/usr/bin/env bash
# Monitor HA status and send alerts if needed.
# Optional: ALERT_EMAIL (mail) or ALERT_WEBHOOK (Slack/Discord/Teams JSON) for alerts.

set -euo pipefail

# Directory containing this script; resolved once and reused below.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Load IP configuration.
# NOTE(review): the IP configuration is looked up ONE level above the script
# directory, while .env is looked up TWO levels above it. Both locations are
# kept exactly as they were; confirm which one is the real project root.
IP_CONFIG_FILE="$(cd "$SCRIPT_DIR/.." && pwd)/config/ip-addresses.conf"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Both files are optional and sourced best-effort. Strict mode is relaxed while
# they are read: under `set -u` a single reference to an unset variable inside
# a sourced file terminates the whole script, and because stderr is discarded
# here that failure would be completely silent.
for optional_file in "$IP_CONFIG_FILE" "$PROJECT_ROOT/.env"; do
  if [[ -f "$optional_file" ]]; then
    set +euo pipefail
    # shellcheck disable=SC1090
    source "$optional_file" 2>/dev/null || true
    set -euo pipefail
  fi
done
unset optional_file

# Runtime settings. A non-empty value already present in the environment (or
# set by a sourced config file) wins; otherwise the default below is assigned.
# VIP falls back to IP_NPMPLUS_ETH0 when that is set, else to 192.168.11.166.
: "${VIP:=${IP_NPMPLUS_ETH0:-192.168.11.166}}"
: "${PRIMARY_HOST:=192.168.11.11}"
: "${SECONDARY_HOST:=192.168.11.12}"
: "${LOG_FILE:=/tmp/npmplus-ha-monitor.log}"

# Colors (ANSI escape sequences; stored with a literal backslash and expanded
# by printf's %b when a line is printed).
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
RED='\033[0;31m'
BLUE='\033[0;34m'
NC='\033[0m'

#######################################
# Print one colour-tagged log line to stdout.
# Globals:   NC (read)
# Arguments: $1 - colour escape sequence
#            $2 - tag shown in brackets
#            $3 - message (optional)
# Outputs:   "<colour>[TAG]<reset> message" on stdout
#######################################
_log_line() {
  # Only the colour codes go through %b. The message is printed with %s so any
  # backslashes it contains are emitted literally instead of being interpreted
  # as escape sequences.
  printf '%b[%s]%b %s\n' "$1" "$2" "$NC" "${3-}"
}

# Public helpers. "${1-}" keeps a call without arguments from aborting the
# script under `set -u`.
log_info() { _log_line "$BLUE" "INFO" "${1-}"; }
log_warn() { _log_line "$YELLOW" "WARN" "${1-}"; }
log_error() { _log_line "$RED" "ERROR" "${1-}"; }
log_success() { _log_line "$GREEN" "SUCCESS" "${1-}"; }

# One timestamp is taken up front and reused for every log line of this run.
TIMESTAMP=$(date '+%Y-%m-%d %H:%M:%S')

#######################################
# Test whether a host currently carries an address on interface vmbr0.
# Arguments: $1 - host to query over ssh (as root)
#            $2 - IP address to look for
# Returns:   0 if the address is listed on vmbr0; non-zero if it is not,
#            or if the host could not be reached.
#######################################
host_holds_vip() {
  local host=$1 vip=$2 needle remote_cmd
  # Quote the address for the remote shell, then match it as a fixed string
  # (-F) on word boundaries (-w). A plain regex substring match would let
  # 192.168.11.16 match 192.168.11.166 and treat every dot as a wildcard.
  printf -v needle '%q' "$vip"
  remote_cmd="ip addr show vmbr0 2>/dev/null | grep -qwF -- ${needle}"
  # NOTE(review): StrictHostKeyChecking=no disables host-key verification;
  # kept as-is, confirm this is acceptable on the management network.
  ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 "root@${host}" "$remote_cmd" 2>/dev/null
}

# Check who owns VIP. The primary is asked first; the secondary is only asked
# when the primary does not report the address.
VIP_OWNER="UNKNOWN"
if host_holds_vip "$PRIMARY_HOST" "$VIP"; then
  VIP_OWNER="$PRIMARY_HOST"
elif host_holds_vip "$SECONDARY_HOST" "$VIP"; then
  VIP_OWNER="$SECONDARY_HOST"
fi

# Append to the log file; fall back to stdout when the file cannot be written.
printf '%s\n' "[$TIMESTAMP] VIP $VIP owner: $VIP_OWNER" >> "$LOG_FILE" 2>&1 \
  || printf '%s\n' "[$TIMESTAMP] VIP $VIP owner: $VIP_OWNER"

#######################################
# Report the systemd state of keepalived on a remote host.
# Arguments: $1 - host to query over ssh (as root)
# Outputs:   the first line printed by `systemctl is-active keepalived`
#            (e.g. "active", "inactive"), or "unknown" when nothing came back.
#######################################
keepalived_state() {
  local host=$1 state
  # `systemctl is-active` prints the state AND exits non-zero when the unit is
  # not active, so the exit status must not trigger the fallback. Appending
  # "unknown" on any non-zero status would turn a stopped service into the
  # two-line value "inactive<newline>unknown".
  state=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 "root@${host}" \
    "systemctl is-active keepalived 2>/dev/null") || true
  state=${state%%$'\n'*}
  printf '%s\n' "${state:-unknown}"
}

# Check Keepalived status on both hosts
PRIMARY_STATUS=$(keepalived_state "$PRIMARY_HOST")
SECONDARY_STATUS=$(keepalived_state "$SECONDARY_HOST")

# Append to the log file; fall back to stdout when the file cannot be written.
printf '%s\n' "[$TIMESTAMP] Primary Keepalived: $PRIMARY_STATUS" >> "$LOG_FILE" 2>&1 \
  || printf '%s\n' "[$TIMESTAMP] Primary Keepalived: $PRIMARY_STATUS"
printf '%s\n' "[$TIMESTAMP] Secondary Keepalived: $SECONDARY_STATUS" >> "$LOG_FILE" 2>&1 \
  || printf '%s\n' "[$TIMESTAMP] Secondary Keepalived: $SECONDARY_STATUS"

#######################################
# Escape a string for use inside a JSON string literal.
# Arguments: $1 - raw text
# Outputs:   text with backslash, double quote, newline, carriage return and
#            tab escaped; no trailing newline.
#######################################
json_escape() {
  local text=${1-}
  text=${text//\\/\\\\} # backslashes first so later escapes are not doubled
  text=${text//\"/\\\"}
  text=${text//$'\n'/\\n}
  text=${text//$'\r'/\\r}
  text=${text//$'\t'/\\t}
  printf '%s' "$text"
}

#######################################
# Record an alert and deliver it through every configured channel.
# Globals:   LOG_FILE (read), ALERT_EMAIL (read, optional),
#            ALERT_WEBHOOK (read, optional)
# Arguments: $1 - alert message
# Outputs:   appends to LOG_FILE (stdout if that fails) and prints via log_error
# Returns:   0; delivery failures are deliberately ignored (best effort)
#######################################
send_alert() {
  local msg=$1 payload
  printf '%s\n' "$msg" >> "$LOG_FILE" 2>&1 || printf '%s\n' "$msg"
  log_error "$msg"

  if [[ -n "${ALERT_EMAIL:-}" ]]; then
    printf '%s\n' "$msg" | mail -s "NPMplus HA Alert" "$ALERT_EMAIL" 2>/dev/null || true
  fi

  if [[ -n "${ALERT_WEBHOOK:-}" ]]; then
    # Escape the message so a quote or backslash cannot produce invalid JSON.
    payload="{\"text\":\"$(json_escape "$msg")\"}"
    # --max-time keeps an unresponsive webhook from hanging the monitor.
    curl -s --max-time 10 -X POST "$ALERT_WEBHOOK" \
      -H "Content-Type: application/json" \
      -d "$payload" 2>/dev/null || true
  fi
}

# Alert if both are down
if [[ "$PRIMARY_STATUS" != "active" && "$SECONDARY_STATUS" != "active" ]]; then
  ALERT_MSG="[$TIMESTAMP] ALERT: Both Keepalived instances are down! HA unavailable."
  send_alert "$ALERT_MSG"
fi

if [[ "$VIP_OWNER" == "UNKNOWN" ]]; then
  # Alert if VIP is not owned by either host
  ALERT_MSG="[$TIMESTAMP] ALERT: VIP $VIP is not owned by any host!"
  send_alert "$ALERT_MSG"
else
  # Check NPMplus container status on owner. Each host has its own container
  # ID, overridable through PRIMARY_VMID / SECONDARY_VMID.
  if [[ "$VIP_OWNER" == "$PRIMARY_HOST" ]]; then
    NPMPLUS_VMID="${PRIMARY_VMID:-10233}"
  else
    NPMPLUS_VMID="${SECONDARY_VMID:-10234}"
  fi

  # The remote side prints "running", "stopped" or "unknown". An empty result
  # (the ssh call itself failed) is also reported as "unknown".
  CONTAINER_STATUS=$(ssh -o StrictHostKeyChecking=no -o ConnectTimeout=3 "root@${VIP_OWNER}" \
    "pct status ${NPMPLUS_VMID} 2>/dev/null | grep -o 'running\|stopped' || echo 'unknown'") || true
  CONTAINER_STATUS=${CONTAINER_STATUS:-unknown}

  if [[ "$CONTAINER_STATUS" != "running" ]]; then
    ALERT_MSG="[$TIMESTAMP] ALERT: NPMplus container on $VIP_OWNER (VMID $NPMPLUS_VMID) is $CONTAINER_STATUS"
    send_alert "$ALERT_MSG"
  fi
fi

# Append to the log file; fall back to stdout when the file cannot be written.
printf '%s\n' "[$TIMESTAMP] HA status check complete" >> "$LOG_FILE" 2>&1 \
  || printf '%s\n' "[$TIMESTAMP] HA status check complete"