Sankofa/scripts/setup-monitoring.sh

#!/bin/bash
# setup-monitoring.sh
# Sets up Prometheus scraping and Grafana dashboards for Proxmox.
#
# Applies a ServiceMonitor and a PrometheusRule with kubectl, then prints the
# manual steps for the static scrape config, the Grafana dashboards and the
# Grafana datasource.
#
# Requires: kubectl on PATH with access to the target cluster.
# Environment overrides (defaults in parentheses):
#   PROMETHEUS_NAMESPACE  (monitoring)
#   GRAFANA_NAMESPACE     (monitoring)
#   DASHBOARD_DIR         (./infrastructure/monitoring/dashboards)

# Strict mode: stop on command failure, on use of an unset variable, and when
# any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# ANSI color escape sequences used by the logging helpers.
# They are stored with a literal backslash; the helpers expand the escapes
# when the message is printed.
NC="\033[0m"          # reset to the terminal default
RED="\033[0;31m"      # error()
GREEN="\033[0;32m"    # log()
YELLOW="\033[1;33m"   # warn()
BLUE="\033[0;34m"     # info()

# Configuration: each value may be supplied through the environment; an unset
# or empty variable falls back to the default shown here.
: "${PROMETHEUS_NAMESPACE:=monitoring}"
: "${GRAFANA_NAMESPACE:=monitoring}"
: "${DASHBOARD_DIR:=./infrastructure/monitoring/dashboards}"

# Print a progress message to stdout, prefixed with a green
# [YYYY-MM-DD HH:MM:SS] timestamp.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   GREEN, NC (read)
log() {
    local message=$1
    printf '%b\n' "${GREEN}[$(date +'%Y-%m-%d %H:%M:%S')]${NC} ${message}"
}

# Print a red [ERROR] message to stderr and terminate the script.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   RED, NC (read)
# Exits with status 1; it never returns to the caller.
error() {
    local message=$1
    printf '%b\n' "${RED}[ERROR]${NC} ${message}" >&2
    exit 1
}

# Print a yellow [WARN] message without stopping the script.
# The message goes to stderr, like error(), so warnings stay visible and do
# not end up in captured or piped stdout.
# Arguments: $1 - message text (backslash escapes in it are expanded)
# Globals:   YELLOW, NC (read)
warn() {
    local message=$1
    printf '%b\n' "${YELLOW}[WARN]${NC} ${message}" >&2
}

# Print a blue [INFO] message to stdout.
# Arguments: $1 - message text; may be empty (backslash escapes are expanded)
# Globals:   BLUE, NC (read)
info() {
    local message=$1
    printf '%b\n' "${BLUE}[INFO]${NC} ${message}"
}

# Verify that the tooling this script depends on is usable.
# Checks, in order, that kubectl can be found and that
# `kubectl cluster-info` succeeds. The first failed check calls error(),
# which terminates the script.
check_prerequisites() {
    log "Checking prerequisites..."

    command -v kubectl > /dev/null 2>&1 \
        || error "kubectl is required but not installed"

    # Output is discarded; only the exit status matters here.
    kubectl cluster-info > /dev/null 2>&1 \
        || error "Cannot connect to Kubernetes cluster"

    log "✓ Prerequisites check passed"
}

# Apply a ServiceMonitor named "proxmox-exporters" in PROMETHEUS_NAMESPACE.
# It selects Services labelled app=proxmox-exporter and scrapes their
# "metrics" port over HTTP at /metrics every 30s.
# Globals: PROMETHEUS_NAMESPACE (read)
# NOTE(review): whether Prometheus picks this ServiceMonitor up depends on the
# operator's selector configuration, which is not visible here — confirm.
create_prometheus_service_monitor() {
    log "Creating Prometheus ServiceMonitor for Proxmox exporters..."

    local manifest
    manifest="apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: proxmox-exporters
  namespace: ${PROMETHEUS_NAMESPACE}
  labels:
    app: proxmox
spec:
  selector:
    matchLabels:
      app: proxmox-exporter
  endpoints:
  - port: metrics
    interval: 30s
    path: /metrics
    scheme: http"

    # The here-string appends the final newline the manifest needs.
    kubectl apply -f - <<< "$manifest"

    log "✓ ServiceMonitor created"
}

# Print a static Prometheus scrape job for the Proxmox metrics endpoints.
# Nothing is applied to the cluster: the snippet is written to stdout so it
# can be added to the Prometheus configuration by hand.
create_prometheus_scrape_config() {
    log "Creating Prometheus scrape configuration..."

    # This would be added to the Prometheus ConfigMap.
    info "Add the following to your Prometheus configuration:"
    printf '%s\n' \
        "" \
        "  - job_name: 'proxmox'" \
        "    scrape_interval: 30s" \
        "    static_configs:" \
        "      - targets:" \
        "          - 'ml110-01-metrics.sankofa.nexus:9221'" \
        "          - 'r630-01-metrics.sankofa.nexus:9221'" \
        "        labels:" \
        "          cluster: 'proxmox'"
}

# Report which of the expected Proxmox dashboard files exist in DASHBOARD_DIR
# and print how to import them into Grafana.
# Nothing is uploaded: the function only checks for the files and prints
# instructions. A missing directory or file produces a warning, not an error.
# Globals: DASHBOARD_DIR, GRAFANA_NAMESPACE (read)
# Returns: 0 in all cases handled here.
import_grafana_dashboards() {
    log "Importing Grafana dashboards..."

    if [[ ! -d "$DASHBOARD_DIR" ]]; then
        warn "Dashboard directory not found: ${DASHBOARD_DIR}"
        return 0
    fi

    local -a dashboards=(
        "proxmox-cluster.json"
        "proxmox-vms.json"
        "proxmox-node.json"
    )
    # Declared local so the loop variable does not leak into global scope.
    local dashboard dashboard_file
    local found=0

    for dashboard in "${dashboards[@]}"; do
        dashboard_file="${DASHBOARD_DIR}/${dashboard}"
        if [[ -f "$dashboard_file" ]]; then
            info "Dashboard file found: ${dashboard}"
            found=$((found + 1))
        else
            warn "Dashboard file not found: ${dashboard_file}"
        fi
    done

    # The import steps are the same for every dashboard, so print them once
    # and only when there is something to import.
    if (( found > 0 )); then
        info "Import via Grafana UI or API:"
        info "  kubectl port-forward -n ${GRAFANA_NAMESPACE} svc/grafana 3000:80"
        info "  Then import: http://localhost:3000/dashboard/import"
    fi
}

# Print the settings for the Prometheus datasource in Grafana.
# Nothing is created in Grafana: the datasource URL (derived from
# PROMETHEUS_NAMESPACE) and access mode are printed for manual configuration.
# Globals: PROMETHEUS_NAMESPACE (read)
create_grafana_datasource() {
    log "Creating Grafana datasource configuration..."

    local prometheus_url="http://prometheus.${PROMETHEUS_NAMESPACE}.svc.cluster.local:9090"
    local -a hints=(
        "Prometheus datasource should be configured in Grafana:"
        "  URL: ${prometheus_url}"
        "  Access: Server (default)"
        ""
        "Configure via Grafana UI or API"
    )

    local hint
    for hint in "${hints[@]}"; do
        info "$hint"
    done
}

# Apply a PrometheusRule named "proxmox-alerts" in PROMETHEUS_NAMESPACE.
# Rules defined (group "proxmox", evaluated every 30s):
#   ProxmoxNodeDown    up{job="proxmox"} == 0      for 5m   critical
#   ProxmoxHighCPU     pve_node_cpu_usage > 90     for 10m  warning
#   ProxmoxHighMemory  pve_node_memory_usage > 90  for 10m  warning
#   ProxmoxStorageFull pve_storage_usage > 90      for 5m   critical
# Globals: PROMETHEUS_NAMESPACE (read)
# NOTE(review): the pve_* metric names and the job="proxmox" label must match
# what the deployed exporter and scrape job actually expose — confirm.
create_alerts() {
    log "Creating Prometheus alert rules..."

    {
        # Only the metadata header needs shell expansion (the namespace).
        cat <<EOF
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: proxmox-alerts
  namespace: ${PROMETHEUS_NAMESPACE}
EOF
        # Quoted delimiter: the {{ \$labels... }} / {{ \$value }} template
        # placeholders below must reach Prometheus unexpanded by the shell.
        cat <<'EOF'
  labels:
    app: proxmox
spec:
  groups:
  - name: proxmox
    interval: 30s
    rules:
    - alert: ProxmoxNodeDown
      expr: up{job="proxmox"} == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Proxmox node is down"
        description: "Proxmox node {{ $labels.instance }} has been down for more than 5 minutes"

    - alert: ProxmoxHighCPU
      expr: pve_node_cpu_usage > 90
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Proxmox node CPU usage is high"
        description: "Node {{ $labels.node }} CPU usage is {{ $value }}%"

    - alert: ProxmoxHighMemory
      expr: pve_node_memory_usage > 90
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Proxmox node memory usage is high"
        description: "Node {{ $labels.node }} memory usage is {{ $value }}%"

    - alert: ProxmoxStorageFull
      expr: pve_storage_usage > 90
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Proxmox storage is nearly full"
        description: "Storage {{ $labels.storage }} on node {{ $labels.node }} is {{ $value }}% full"
EOF
    } | kubectl apply -f -

    log "✓ Alert rules created"
}

# Entry point: print the banner, run every setup step in order (each followed
# by a blank line), then list the manual follow-up actions.
# Positional arguments are accepted but not used.
# Globals: PROMETHEUS_NAMESPACE (read)
main() {
    printf '%s\n' \
        "" \
        "╔══════════════════════════════════════════════════════════════╗" \
        "║          Proxmox Monitoring Setup                           ║" \
        "╚══════════════════════════════════════════════════════════════╝" \
        ""

    local -a steps=(
        check_prerequisites
        create_prometheus_service_monitor
        create_prometheus_scrape_config
        create_alerts
        import_grafana_dashboards
        create_grafana_datasource
    )
    local step
    for step in "${steps[@]}"; do
        "$step"
        echo ""
    done

    log "Monitoring setup complete!"
    echo ""

    local -a next_steps=(
        "Next steps:"
        "1. Verify Prometheus is scraping: kubectl port-forward -n ${PROMETHEUS_NAMESPACE} svc/prometheus 9090:9090"
        "2. Import Grafana dashboards via UI"
        "3. Configure alert notifications"
        "4. Verify metrics are being collected"
    )
    local hint
    for hint in "${next_steps[@]}"; do
        info "$hint"
    done
}

# Run the setup, forwarding the command-line arguments to main
# (main does not currently read them).
main "$@"