Some checks failed
Deploy to Phoenix / deploy (push) Has been cancelled
Co-authored-by: Cursor <cursoragent@cursor.com>
328 lines
10 KiB
Bash
Executable File
328 lines
10 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Proxmox Storage Monitoring Script with Alerts
#
# Monitors storage usage across all Proxmox nodes and sends alerts.
#
# Usage: <this script> [check|status|alerts]
#
# Optional environment:
#   ALERT_EMAIL    set to a recipient address to send alerts through mail(1)
#   ALERT_WEBHOOK  set to a Slack/Discord/Teams webhook URL (JSON payload)

# Abort on unhandled errors, unset variables and failing pipeline stages.
set -euo pipefail

# Locate the project root relative to this script so the config can be found.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Load IP configuration. The file is optional: when it is missing or fails to
# load, the per-node fallback addresses defined further down are used.
# shellcheck source=/dev/null
source "${PROJECT_ROOT}/config/ip-addresses.conf" 2>/dev/null || true

# Re-derive the script paths after sourcing the configuration.
# NOTE(review): presumably this guards against the sourced file overwriting
# SCRIPT_DIR / PROJECT_ROOT - confirm, and drop the duplication if it cannot.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"

# Per-day log files, all kept under one directory in the project tree.
LOG_DIR="${PROJECT_ROOT}/logs/storage-monitoring"
ALERT_LOG="${LOG_DIR}/storage_alerts_$(date +%Y%m%d).log"
STATUS_LOG="${LOG_DIR}/storage_status_$(date +%Y%m%d).log"

# Alert thresholds
# Storage usage thresholds are percentages of capacity (alert when usage >= value).
WARNING_THRESHOLD=80
CRITICAL_THRESHOLD=90
# Volume group thresholds are free space in GB (alert when free <= value).
VG_FREE_WARNING=10 # GB
VG_FREE_CRITICAL=5 # GB

# Colors
# ANSI escape sequences kept as literal backslash text; they only become real
# escape characters when printed with escape interpretation enabled.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m' # reset ("no color")

# Console logging helpers.
# Arguments: $1 - message text (an omitted message prints as empty)
# Outputs:   one line on stdout: a colored tag followed by the message
# The color variables go through %b so their \033 escapes are interpreted,
# while the message goes through %s so backslashes in it are printed
# literally instead of being treated as escape sequences.
log_info() { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "${1:-}"; }
log_success() { printf '%b[✓]%b %s\n' "$GREEN" "$NC" "${1:-}"; }
log_warn() { printf '%b[⚠]%b %s\n' "$YELLOW" "$NC" "${1:-}"; }
log_error() { printf '%b[✗]%b %s\n' "$RED" "$NC" "${1:-}"; }
log_alert() { printf '%b[ALERT]%b %s\n' "$RED" "$NC" "${1:-}"; }

# Create log directory (no-op when it already exists)
mkdir -p -- "$LOG_DIR"

# Proxmox nodes configuration
# Each entry is "ip:password"; consumers split on the FIRST colon, so the
# password part may itself contain colons.
#
# Addresses come from the environment / sourced IP configuration, with the
# literal fallbacks below.
#
# SECURITY(review): root passwords were hard-coded in this file. They can now
# be supplied through the PROXMOX_PASS_* variables; the literals remain only
# as backward-compatible defaults and should be rotated and removed.
declare -A NODES
NODES[ml110]="${PROXMOX_HOST_ML110:-192.168.11.10}:${PROXMOX_PASS_ML110:-L@kers2010}"
NODES[r630-01]="${PROXMOX_HOST_R630_01:-192.168.11.11}:${PROXMOX_PASS_R630_01:-password}"
NODES[r630-02]="${PROXMOX_HOST_R630_02:-192.168.11.12}:${PROXMOX_PASS_R630_02:-password}"
NODES[r630-03]="${IP_SERVICE_13:-192.168.11.13}:${PROXMOX_PASS_R630_03:-L@kers2010}"
NODES[r630-04]="${IP_DEVICE_14:-192.168.11.14}:${PROXMOX_PASS_R630_04:-L@kers2010}"

# Alert tracking
# Initialized explicitly (not just declared) so that expanding the empty
# array is safe under `set -u`.
declare -a ALERTS=()

# SSH helper function
#
# Runs a command as root on a configured node and relays its stdout.
# Globals:   NODES (read) - entries formatted as "ip:password"
# Arguments: $1 - node name (a key of NODES); remaining args - remote command
# Outputs:   the remote command's stdout; stderr is discarded
# Returns:   the exit status of ssh/sshpass, or 1 when the node is unknown.
#            Callers that only care about the output should append `|| true`.
ssh_node() {
  local hostname="${1:-}"

  # Reject empty/unknown names up front: expanding a missing key would abort
  # the whole script under `set -u`.
  [[ -n "$hostname" ]] || return 1
  [[ -n "${NODES[$hostname]+set}" ]] || return 1
  shift

  local entry="${NODES[$hostname]}"
  local ip="${entry%%:*}"      # text before the first colon
  local password="${entry#*:}" # text after the first colon
  # NOTE(review): StrictHostKeyChecking=no accepts any host key, so the
  # connection is not protected against impersonation of the node.
  local -a ssh_opts=(-o StrictHostKeyChecking=no -o ConnectTimeout=5)

  if command -v sshpass >/dev/null 2>&1; then
    # Hand the password over through the SSHPASS environment variable
    # (sshpass -e) rather than argv, where it would show up in `ps` output.
    SSHPASS="$password" sshpass -e ssh "${ssh_opts[@]}" "root@${ip}" "$@" 2>/dev/null
  else
    ssh "${ssh_opts[@]}" "root@${ip}" "$@" 2>/dev/null
  fi
}

# Check node connectivity
#
# Sends a single ping (2 second wait) to the node's address.
# Globals:   NODES (read) - entries formatted as "ip:password"
# Arguments: $1 - node name (a key of NODES)
# Returns:   0 when the ping succeeds; non-zero when it fails or when the
#            node name is empty/unknown
check_node() {
  local hostname="${1:-}"

  # An unknown key would otherwise abort the script under `set -u`.
  [[ -n "$hostname" ]] || return 1
  [[ -n "${NODES[$hostname]+set}" ]] || return 1

  local ip="${NODES[$hostname]%%:*}"

  ping -c 1 -W 2 "$ip" >/dev/null 2>&1
}

# Parse storage usage percentage
#
# Converts a percentage string such as "45.32%" to its integer part.
# Arguments: $1 - percentage text
# Outputs:   the truncated integer on stdout; text that does not start with
#            a number (including the empty string) yields 0
parse_storage_percent() {
  local percent_str="${1:-}"

  # A here-string replaces `echo`, which mishandles values such as "-n".
  # Drop the first '%' on the line, then truncate the first field.
  awk '{ sub(/%/, ""); print int($1) }' <<< "$percent_str"
}

# Check storage usage
#
# Evaluates one storage status line against the usage thresholds and records
# an alert when a threshold is reached.
# Globals:   WARNING_THRESHOLD, CRITICAL_THRESHOLD (read); ALERTS (appended)
# Arguments: $1 - node name (used in messages)
#            $2 - whitespace-separated line with the fields
#                 name type status total used available percent
# Returns:   2 on critical, 1 on warning, 0 otherwise (including skipped
#            lines). Callers running under `set -e` must guard the call.
check_storage_usage() {
  local hostname="$1"
  local storage_line="$2"
  local storage_name status available percent_str percent

  # Split the line once with the shell; only the fields that are actually
  # used are kept, the rest are read into the throwaway variable `_`.
  read -r storage_name _ status _ _ available percent_str _ <<< "$storage_line"

  # Skip if disabled or inactive
  if [[ "$status" == "disabled" || "$status" == "inactive" || "$percent_str" == "N/A" ]]; then
    return 0
  fi

  percent=$(parse_storage_percent "$percent_str") || return 0

  # Anything that is not a plain non-negative integer cannot be compared.
  [[ "$percent" =~ ^[0-9]+$ ]] || return 0
  percent=$((10#$percent)) # force base 10 even with leading zeros

  if [[ "$percent" -eq 0 ]]; then
    return 0
  fi

  # Check thresholds
  if [[ "$percent" -ge "$CRITICAL_THRESHOLD" ]]; then
    ALERTS+=("CRITICAL: $hostname:$storage_name is at ${percent}% capacity (${available} available)")
    log_alert "CRITICAL: $hostname:$storage_name is at ${percent}% capacity"
    return 2
  elif [[ "$percent" -ge "$WARNING_THRESHOLD" ]]; then
    ALERTS+=("WARNING: $hostname:$storage_name is at ${percent}% capacity (${available} available)")
    log_warn "WARNING: $hostname:$storage_name is at ${percent}% capacity"
    return 1
  fi

  return 0
}

# Check volume group free space
#
# Evaluates one volume group line against the free-space thresholds and
# records an alert when free space is at or below a threshold.
# Globals:   VG_FREE_WARNING, VG_FREE_CRITICAL (read, GB); ALERTS (appended)
# Arguments: $1 - node name (used in messages)
#            $2 - whitespace-separated line: vg_name vg_size vg_free, sizes
#                 in gigabytes with a 'g' suffix (e.g. "pve 930.51g <16.00g")
# Returns:   2 on critical, 1 on warning, 0 otherwise (including lines whose
#            free-space field cannot be parsed). Callers running under
#            `set -e` must guard the call.
check_vg_free_space() {
  local hostname="$1"
  local vg_line="$2"
  local vg_name vg_free free free_gb

  read -r vg_name _ vg_free _ <<< "$vg_line"

  # LVM marks sizes that were rounded for display with a leading '<';
  # strip it together with the unit suffix before parsing.
  free="${vg_free#<}"
  free="${free%[gG]}"

  # Skip only values that are genuinely unparseable. A real "0" must fall
  # through: a volume group with no free space is the most critical case.
  [[ "$free" =~ ^[0-9]+([.,][0-9]+)?$ ]] || return 0

  # Keep the integer part, forcing base 10 in case of leading zeros.
  free_gb="${free%%[.,]*}"
  free_gb=$((10#$free_gb))

  if [[ "$free_gb" -le "$VG_FREE_CRITICAL" ]]; then
    ALERTS+=("CRITICAL: $hostname:$vg_name volume group has only ${free_gb}GB free space")
    log_alert "CRITICAL: $hostname:$vg_name VG has only ${free_gb}GB free"
    return 2
  elif [[ "$free_gb" -le "$VG_FREE_WARNING" ]]; then
    ALERTS+=("WARNING: $hostname:$vg_name volume group has only ${free_gb}GB free space")
    log_warn "WARNING: $hostname:$vg_name VG has only ${free_gb}GB free"
    return 1
  fi

  return 0
}

# Monitor a single node
#
# Collects storage and volume group information from one node, runs the
# threshold checks on every line and appends the raw output to the status log.
# Globals:   STATUS_LOG (appended to); ALERTS (appended through the checks)
# Arguments: $1 - node name (a key of NODES)
# Returns:   1 when the node is unreachable or reports no storage status,
#            0 otherwise (alerts do not affect the return value)
monitor_node() {
  local hostname="$1"
  local storage_status vgs_info line

  if ! check_node "$hostname"; then
    log_warn "$hostname is not reachable"
    return 1
  fi

  log_info "Monitoring $hostname..."

  # Get storage status. Only the output matters here; a failed connection
  # simply shows up as empty output.
  storage_status=$(ssh_node "$hostname" 'pvesm status 2>/dev/null') || true

  if [[ -z "$storage_status" ]]; then
    log_warn "Could not get storage status from $hostname"
    return 1
  fi

  # Process each storage line (skip header).
  # The loop reads from a redirection rather than a pipe so that it runs in
  # the current shell: alerts appended to ALERTS by the check must survive.
  # The check's non-zero "warning"/"critical" status is informational, so it
  # is neutralized to keep `set -e` from aborting the run.
  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    check_storage_usage "$hostname" "$line" || true
  done < <(tail -n +2 <<< "$storage_status")

  # Check volume groups
  vgs_info=$(ssh_node "$hostname" 'vgs --units g --noheadings -o vg_name,vg_size,vg_free 2>/dev/null') || true

  if [[ -n "$vgs_info" ]]; then
    while IFS= read -r line; do
      [[ -n "$line" ]] || continue
      check_vg_free_space "$hostname" "$line" || true
    done <<< "$vgs_info"
  fi

  # Log storage status
  {
    echo "=== $hostname Storage Status $(date) ==="
    echo "$storage_status"
    echo ""
    echo "=== Volume Groups ==="
    echo "$vgs_info"
    echo ""
  } >> "$STATUS_LOG"

  return 0
}

# Send alerts (can be extended to email, Slack, etc.)
#
# Appends the collected alerts to the alert log, prints them, and optionally
# forwards them by email and/or webhook. Delivery is best-effort: a failed
# delivery is reported as a warning and never aborts the run.
# Globals:   ALERTS, ALERT_LOG (read); ALERT_EMAIL, ALERT_WEBHOOK (optional)
# Outputs:   status messages and the alerts themselves on stdout
# Returns:   0
send_alerts() {
  local alert_text payload escaped

  if [[ ${#ALERTS[@]} -eq 0 ]]; then
    log_success "No storage alerts"
    return 0
  fi

  log_warn "Found ${#ALERTS[@]} storage alert(s)"

  {
    echo "=== Storage Alerts $(date) ==="
    printf '%s\n' "${ALERTS[@]}"
    echo ""
  } >> "$ALERT_LOG"

  # Print alerts
  printf '%s\n' "${ALERTS[@]}"

  # Email: set ALERT_EMAIL in env (e.g. in .env or cron). Requires mail(1) or sendmail.
  if [[ -n "${ALERT_EMAIL:-}" ]]; then
    {
      echo "Subject: Proxmox Storage Alert"
      echo ""
      printf '%s\n' "${ALERTS[@]}"
    } | mail -s "Proxmox Storage Alert" "$ALERT_EMAIL" 2>/dev/null \
      || log_warn "Could not send alert email to $ALERT_EMAIL"
  fi

  # Webhook: set ALERT_WEBHOOK to Slack/Discord/Teams URL. JSON payload with text.
  if [[ -n "${ALERT_WEBHOOK:-}" ]]; then
    alert_text=$(printf '%s\n' "${ALERTS[@]}")

    if command -v jq >/dev/null 2>&1; then
      # jq turns the raw text into a correctly escaped JSON string.
      payload="{\"text\":$(printf '%s' "$alert_text" | jq -Rs .)}"
    else
      # Fallback without jq: escape backslashes and double quotes, then
      # join the lines with literal "\n" sequences.
      escaped=$(printf '%s\n' "$alert_text" \
        | sed 's/\\/\\\\/g; s/"/\\"/g' \
        | awk '{printf "%s\\n", $0}' ORS='')
      payload="{\"text\":\"${escaped}\"}"
    fi

    # --max-time keeps an unresponsive endpoint from hanging the run;
    # -f makes HTTP error responses count as failures.
    curl -sf --max-time 10 -X POST "${ALERT_WEBHOOK}" \
      -H "Content-Type: application/json" -d "$payload" >/dev/null 2>&1 \
      || log_warn "Could not deliver alerts to the webhook"
  fi

  return 0
}

# Generate summary report
#
# Writes (overwriting) the per-day summary file: reachability of every
# configured node, the collected alerts and the active thresholds.
# Globals:   LOG_DIR, NODES, ALERTS, WARNING_THRESHOLD, CRITICAL_THRESHOLD,
#            VG_FREE_WARNING, VG_FREE_CRITICAL (all read)
# Outputs:   an info message naming the summary file
generate_summary() {
  # Loop variables are local so they do not leak into (or clobber) globals.
  local summary_file hostname alert
  summary_file="${LOG_DIR}/storage_summary_$(date +%Y%m%d).txt"

  {
    echo "=== Proxmox Storage Summary $(date) ==="
    echo ""
    echo "Nodes Monitored:"
    for hostname in "${!NODES[@]}"; do
      # Reachability is probed again here, at report time.
      if check_node "$hostname"; then
        echo " ✅ $hostname"
      else
        echo " ❌ $hostname (not reachable)"
      fi
    done
    echo ""
    echo "Alerts: ${#ALERTS[@]}"
    if [[ ${#ALERTS[@]} -gt 0 ]]; then
      echo ""
      for alert in "${ALERTS[@]}"; do
        echo " - $alert"
      done
    fi
    echo ""
    echo "Thresholds:"
    echo " Storage Usage Warning: ${WARNING_THRESHOLD}%"
    echo " Storage Usage Critical: ${CRITICAL_THRESHOLD}%"
    echo " Volume Group Free Warning: ${VG_FREE_WARNING}GB"
    echo " Volume Group Free Critical: ${VG_FREE_CRITICAL}GB"
  } > "$summary_file"

  log_info "Summary saved to: $summary_file"
}

# Main monitoring function
|
|
main() {
|
|
local mode="${1:-check}"
|
|
|
|
case "$mode" in
|
|
check)
|
|
echo "=== Proxmox Storage Monitoring ==="
|
|
echo "Date: $(date)"
|
|
echo ""
|
|
|
|
# Monitor all nodes
|
|
for hostname in "${!NODES[@]}"; do
|
|
monitor_node "$hostname"
|
|
done
|
|
|
|
# Send alerts
|
|
send_alerts
|
|
|
|
# Generate summary
|
|
generate_summary
|
|
|
|
echo ""
|
|
log_info "Monitoring complete. Check logs in: $LOG_DIR"
|
|
;;
|
|
status)
|
|
# Show current status
|
|
echo "=== Current Storage Status ==="
|
|
for hostname in "${!NODES[@]}"; do
|
|
if check_node "$hostname"; then
|
|
echo ""
|
|
echo "--- $hostname ---"
|
|
ssh_node "$hostname" 'pvesm status 2>/dev/null' || echo "Could not get status"
|
|
fi
|
|
done
|
|
;;
|
|
alerts)
|
|
# Show recent alerts
|
|
if [ -f "$ALERT_LOG" ]; then
|
|
tail -50 "$ALERT_LOG"
|
|
else
|
|
echo "No alerts found"
|
|
fi
|
|
;;
|
|
*)
|
|
echo "Usage: $0 [check|status|alerts]"
|
|
echo " check - Run full monitoring check (default)"
|
|
echo " status - Show current storage status"
|
|
echo " alerts - Show recent alerts"
|
|
exit 1
|
|
;;
|
|
esac
|
|
}
|
|
|
|
# Run main function
main "$@"