#!/usr/bin/env bash
#
# Proxmox Storage Monitoring Script with Alerts
#
# Monitors storage usage across all Proxmox nodes and sends alerts.
#
# Usage: storage monitor script [check|status|alerts]
#   check  - run the full monitoring pass (default)
#   status - print `pvesm status` for every reachable node
#   alerts - show the tail of today's alert log

set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
LOG_DIR="${PROJECT_ROOT}/logs/storage-monitoring"

# Evaluate the date stamp once so both log names agree even if the script
# happens to start right at midnight.
TODAY="$(date +%Y%m%d)"
ALERT_LOG="${LOG_DIR}/storage_alerts_${TODAY}.log"
STATUS_LOG="${LOG_DIR}/storage_status_${TODAY}.log"

# Alert thresholds
readonly WARNING_THRESHOLD=80   # storage usage, percent
readonly CRITICAL_THRESHOLD=90  # storage usage, percent
readonly VG_FREE_WARNING=10     # GB
readonly VG_FREE_CRITICAL=5     # GB

# Colors
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
# shellcheck disable=SC2034  # kept for parity with the other colour constants
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'

# Logging helpers: print a coloured tag followed by the message ($1) on stdout.
# The message is printed verbatim via %s; only the colour codes are expanded.
log_info()    { printf '%b[INFO]%b %s\n' "$BLUE" "$NC" "$1"; }
log_success() { printf '%b[✓]%b %s\n' "$GREEN" "$NC" "$1"; }
log_warn()    { printf '%b[⚠]%b %s\n' "$YELLOW" "$NC" "$1"; }
log_error()   { printf '%b[✗]%b %s\n' "$RED" "$NC" "$1"; }
log_alert()   { printf '%b[ALERT]%b %s\n' "$RED" "$NC" "$1"; }

# Create log directory
mkdir -p "$LOG_DIR"

# Proxmox nodes configuration: NODES[<name>]="<ip>:<root password>"
# NOTE(security): credentials are stored here in plain text. Prefer SSH keys
# or a root-only credentials file, and rotate these passwords if this file has
# ever been shared or committed.
declare -A NODES
NODES[ml110]="192.168.11.10:L@kers2010"
NODES[r630-01]="192.168.11.11:password"
NODES[r630-02]="192.168.11.12:password"
NODES[r630-03]="192.168.11.13:L@kers2010"
NODES[r630-04]="192.168.11.14:L@kers2010"

# Alert tracking. Initialised explicitly so that expanding the empty array is
# safe under `set -u` on every bash version.
declare -a ALERTS=()

#######################################
# Run a command on a node over SSH as root.
# Globals:   NODES (read)
# Arguments: $1 - node name (key of NODES); remaining args - remote command
# Outputs:   remote stdout; a single empty line if the SSH call fails
# Returns:   0 always (failures are reported as empty output)
#######################################
ssh_node() {
  local hostname="$1"
  shift
  local ip="${NODES[$hostname]%%:*}"
  local password="${NODES[$hostname]#*:}"
  # NOTE(security): host key checking is disabled, as in the original script.
  local -a ssh_cmd=(
    ssh -o StrictHostKeyChecking=no -o ConnectTimeout=5 "root@${ip}" "$@"
  )

  if command -v sshpass >/dev/null 2>&1; then
    # Pass the password through the environment (-e reads $SSHPASS) instead
    # of -p, so it does not show up in the process list.
    SSHPASS="$password" sshpass -e "${ssh_cmd[@]}" 2>/dev/null || echo ""
  else
    "${ssh_cmd[@]}" 2>/dev/null || echo ""
  fi
}

#######################################
# Check node connectivity with a single ping (2 second wait).
# Globals:   NODES (read)
# Arguments: $1 - node name (key of NODES)
# Returns:   0 if the node answered, non-zero otherwise
#######################################
check_node() {
  local hostname="$1"
  local ip="${NODES[$hostname]%%:*}"
  ping -c 1 -W 2 "$ip" >/dev/null 2>&1
}

#######################################
# Parse a storage usage percentage.
# Arguments: $1 - percentage string such as "45.32%"
# Outputs:   the value truncated to an integer (0 for non-numeric input)
#######################################
parse_storage_percent() {
  local percent_str="$1"
  # Drop the % sign, then let awk truncate the number to an integer.
  awk '{print int($1)}' <<< "${percent_str/\%/}"
}

#######################################
# Evaluate one line of storage status output against the usage thresholds.
# The line is split on whitespace; fields 1, 3, 6 and 7 are used as
# name, status, available and usage percent.
# Globals:   ALERTS (appended), WARNING_THRESHOLD, CRITICAL_THRESHOLD (read)
# Arguments: $1 - node name, $2 - one status line
# Returns:   0 ok/skipped, 1 warning, 2 critical. The non-zero codes are a
#            severity, not an error: guard calls when errexit is active.
#######################################
check_storage_usage() {
  local hostname="$1"
  local storage_line="$2"
  local storage_name status available percent_str percent

  # Single read instead of one awk process per field; `_` discards the
  # fields that are not needed (type, total, used, anything trailing).
  read -r storage_name _ status _ _ available percent_str _ <<< "$storage_line"

  # Skip if disabled or inactive
  if [[ "$status" == "disabled" || "$status" == "inactive" \
    || "$percent_str" == "N/A" ]]; then
    return 0
  fi

  percent=$(parse_storage_percent "$percent_str")

  # 0 also covers values that could not be parsed as a number.
  if [[ -z "$percent" ]] || (( percent == 0 )); then
    return 0
  fi

  # Check thresholds
  if (( percent >= CRITICAL_THRESHOLD )); then
    ALERTS+=("CRITICAL: $hostname:$storage_name is at ${percent}% capacity (${available} available)")
    log_alert "CRITICAL: $hostname:$storage_name is at ${percent}% capacity"
    return 2
  elif (( percent >= WARNING_THRESHOLD )); then
    ALERTS+=("WARNING: $hostname:$storage_name is at ${percent}% capacity (${available} available)")
    log_warn "WARNING: $hostname:$storage_name is at ${percent}% capacity"
    return 1
  fi

  return 0
}

#######################################
# Evaluate one volume group line against the free-space thresholds.
# The line is split on whitespace into name, size and free space.
# Globals:   ALERTS (appended), VG_FREE_WARNING, VG_FREE_CRITICAL (read)
# Arguments: $1 - node name, $2 - one volume group line
# Returns:   0 ok/skipped, 1 warning, 2 critical (severity, not an error)
#######################################
check_vg_free_space() {
  local hostname="$1"
  local vg_line="$2"
  local vg_name vg_free free_gb

  read -r vg_name _ vg_free _ <<< "$vg_line"

  # Sizes may carry a leading '<' (e.g. "<8.00g"); without stripping it the
  # value would parse as 0 and the volume group would be skipped silently.
  vg_free=${vg_free#'<'}

  # Extract numeric value (remove 'g' suffix) and truncate to whole GB.
  free_gb=$(awk '{print int($1)}' <<< "${vg_free/g/}")

  # NOTE(review): as in the original, a value that truncates to 0 GB is
  # skipped. That hides unparsable input but also a volume group with less
  # than 1 GB free - confirm whether that case should alert instead.
  if [[ -z "$free_gb" ]] || (( free_gb == 0 )); then
    return 0
  fi

  if (( free_gb <= VG_FREE_CRITICAL )); then
    ALERTS+=("CRITICAL: $hostname:$vg_name volume group has only ${free_gb}GB free space")
    log_alert "CRITICAL: $hostname:$vg_name VG has only ${free_gb}GB free"
    return 2
  elif (( free_gb <= VG_FREE_WARNING )); then
    ALERTS+=("WARNING: $hostname:$vg_name volume group has only ${free_gb}GB free space")
    log_warn "WARNING: $hostname:$vg_name VG has only ${free_gb}GB free"
    return 1
  fi

  return 0
}

#######################################
# Monitor a single node: collect storage and volume group status, raise
# alerts, and append the raw output to the status log.
# Globals:   ALERTS (appended via the check functions), STATUS_LOG (read)
# Arguments: $1 - node name (key of NODES)
# Returns:   0 on success, 1 if the node is unreachable or returned no
#            storage status
#######################################
monitor_node() {
  local hostname="$1"
  local storage_status vgs_info line

  if ! check_node "$hostname"; then
    log_warn "$hostname is not reachable"
    return 1
  fi

  log_info "Monitoring $hostname..."

  # Get storage status
  storage_status=$(ssh_node "$hostname" 'pvesm status 2>/dev/null') \
    || storage_status=""

  if [[ -z "$storage_status" ]]; then
    log_warn "Could not get storage status from $hostname"
    return 1
  fi

  # Process each storage line (skip header). The loop reads from a
  # redirection rather than a pipeline so that it runs in the current shell
  # and the ALERTS entries added by the check survive the loop.
  while IFS= read -r line; do
    [[ -n "$line" ]] || continue
    # The return code is a severity (1/2), not a failure; do not let
    # errexit abort the run because an alert was raised.
    check_storage_usage "$hostname" "$line" || true
  done < <(tail -n +2 <<< "$storage_status")

  # Check volume groups
  vgs_info=$(ssh_node "$hostname" \
    'vgs --units g --noheadings -o vg_name,vg_size,vg_free 2>/dev/null') \
    || vgs_info=""

  if [[ -n "$vgs_info" ]]; then
    while IFS= read -r line; do
      [[ -n "$line" ]] || continue
      check_vg_free_space "$hostname" "$line" || true
    done <<< "$vgs_info"
  fi

  # Log storage status
  {
    echo "=== $hostname Storage Status $(date) ==="
    echo "$storage_status"
    echo ""
    echo "=== Volume Groups ==="
    echo "$vgs_info"
    echo ""
  } >> "$STATUS_LOG"

  return 0
}

# Send alerts (can be extended to email, Slack, etc.)
#######################################
# Report collected alerts: print them and append them to the alert log.
# Globals:   ALERTS (read), ALERT_LOG (read)
# Outputs:   one line per alert on stdout, plus a summary log line
# Returns:   0
#######################################
send_alerts() {
  local alert

  if (( ${#ALERTS[@]} == 0 )); then
    log_success "No storage alerts"
    return 0
  fi

  log_warn "Found ${#ALERTS[@]} storage alert(s)"

  {
    echo "=== Storage Alerts $(date) ==="
    for alert in "${ALERTS[@]}"; do
      echo "$alert"
    done
    echo ""
  } >> "$ALERT_LOG"

  # Print alerts
  for alert in "${ALERTS[@]}"; do
    echo "$alert"
  done

  # TODO: Add email/Slack/webhook notifications here
  # Example:
  # send_email "Storage Alerts" "$(printf '%s\n' "${ALERTS[@]}")"
  # send_slack_webhook "${ALERTS[@]}"
}

#######################################
# Generate summary report: node reachability, alert list and thresholds.
# Globals:   NODES, ALERTS, LOG_DIR and the four threshold constants (read)
# Outputs:   overwrites ${LOG_DIR}/storage_summary_<YYYYMMDD>.txt and logs
#            its path
#######################################
generate_summary() {
  local summary_file hostname alert
  # Declared separately from the assignment so a failing `date` is not
  # masked by `local`.
  summary_file="${LOG_DIR}/storage_summary_$(date +%Y%m%d).txt"

  {
    echo "=== Proxmox Storage Summary $(date) ==="
    echo ""
    echo "Nodes Monitored:"
    for hostname in "${!NODES[@]}"; do
      if check_node "$hostname"; then
        echo "  ✅ $hostname"
      else
        echo "  ❌ $hostname (not reachable)"
      fi
    done
    echo ""
    echo "Alerts: ${#ALERTS[@]}"
    if (( ${#ALERTS[@]} > 0 )); then
      echo ""
      for alert in "${ALERTS[@]}"; do
        echo "  - $alert"
      done
    fi
    echo ""
    echo "Thresholds:"
    echo "  Storage Usage Warning: ${WARNING_THRESHOLD}%"
    echo "  Storage Usage Critical: ${CRITICAL_THRESHOLD}%"
    echo "  Volume Group Free Warning: ${VG_FREE_WARNING}GB"
    echo "  Volume Group Free Critical: ${VG_FREE_CRITICAL}GB"
  } > "$summary_file"

  log_info "Summary saved to: $summary_file"
}

#######################################
# Main monitoring function.
# Arguments: $1 - mode: check (default), status or alerts
# Returns:   exits 1 with a usage message for an unknown mode
#######################################
main() {
  local mode="${1:-check}"
  local hostname status_output

  case "$mode" in
    check)
      echo "=== Proxmox Storage Monitoring ==="
      echo "Date: $(date)"
      echo ""

      # Monitor all nodes. monitor_node returns non-zero for a node it
      # could not check and logs the reason itself; without the guard,
      # errexit would abort the whole run at the first such node and the
      # remaining nodes, alerts and summary would never be processed.
      for hostname in "${!NODES[@]}"; do
        monitor_node "$hostname" || true
      done

      # Send alerts
      send_alerts

      # Generate summary
      generate_summary

      echo ""
      log_info "Monitoring complete. Check logs in: $LOG_DIR"
      ;;
    status)
      # Show current status
      echo "=== Current Storage Status ==="
      for hostname in "${!NODES[@]}"; do
        if check_node "$hostname"; then
          echo ""
          echo "--- $hostname ---"
          # ssh_node reports failure as empty output with exit status 0,
          # so test the captured text rather than the exit status.
          status_output=$(ssh_node "$hostname" 'pvesm status 2>/dev/null') \
            || status_output=""
          if [[ -n "$status_output" ]]; then
            printf '%s\n' "$status_output"
          else
            echo "Could not get status"
          fi
        fi
      done
      ;;
    alerts)
      # Show recent alerts
      if [[ -f "$ALERT_LOG" ]]; then
        tail -50 "$ALERT_LOG"
      else
        echo "No alerts found"
      fi
      ;;
    *)
      echo "Usage: $0 [check|status|alerts]"
      echo "  check  - Run full monitoring check (default)"
      echo "  status - Show current storage status"
      echo "  alerts - Show recent alerts"
      exit 1
      ;;
  esac
}

# Run main function
main "$@"