smom-dbis-138/scripts/deployment/monitor-deployment-consolidated.sh

#!/usr/bin/env bash

# Consolidated Deployment Monitor
# Replaces: monitor-deployment.sh, monitor-and-complete.sh, monitor-and-fix.sh,
#           monitor-continuous.sh, monitor-deployment-live.sh, live-monitor.sh,
#           continuous-monitor.sh, monitor-36-region-deployment.sh, deployment-dashboard.sh
#
# Usage:
#   monitor-deployment-consolidated.sh [--mode MODE] [--interval SECONDS] [--max-checks N]
#   Modes: status, continuous, live, complete, fix, dashboard

# Abort on the first unhandled command failure.
set -e

# Resolve this script's directory so the shared library can be sourced
# regardless of the caller's working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$SCRIPT_DIR/../lib/init.sh"

# Metadata. NOTE(review): presumably consumed by handle_help from lib/init.sh
# (not visible in this file) — confirm the expected format there.
# The usage/options strings list every option the parser below accepts.
SCRIPT_NAME="monitor-deployment-consolidated.sh"
SCRIPT_DESC="Unified deployment monitoring tool supporting modes: status|continuous|live|complete|fix|dashboard"
SCRIPT_USAGE="${SCRIPT_NAME} [--mode {status|continuous|live|complete|fix|dashboard}] [--interval SECONDS] [--max-checks N] [--help]"
SCRIPT_OPTIONS="--mode <value>        Run specific monitor mode (default: status)\n--interval <seconds>  Check interval in seconds (default: 30)\n--max-checks <n>      Maximum number of checks (default: 120)\n--help                Show this help"
SCRIPT_REQUIREMENTS="Azure CLI (ensure_azure_cli), access to /tmp logs, scripts/lib/*"
handle_help "${1:-}"

# Default settings: a value already present in the environment wins;
# otherwise (unset or empty) the built-in default is assigned.
: "${MODE:=status}"
: "${MONITOR_INTERVAL:=30}"
: "${MAX_CHECKS:=120}"

# Parse arguments
#
# Reads command-line options into the globals MODE, MONITOR_INTERVAL and
# MAX_CHECKS.
# Arguments: the script's command-line arguments ("$@")
# Exits:     0 after printing help for --help;
#            1 on an unknown option, an option missing its value, or a
#              non-numeric --max-checks value.
_monitor_parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --mode | --interval | --max-checks)
                # Without this guard, `shift 2` fails when the value is
                # missing: a silent abort under `set -e`, an endless loop
                # without it.
                if [[ $# -lt 2 ]]; then
                    log_error "Option $1 requires a value"
                    exit 1
                fi
                case "$1" in
                    --mode)
                        MODE="$2"
                        ;;
                    --interval)
                        MONITOR_INTERVAL="$2"
                        ;;
                    --max-checks)
                        # MAX_CHECKS is later compared with `-lt`, which
                        # only accepts integers.
                        if ! [[ "$2" =~ ^[0-9]+$ ]]; then
                            log_error "--max-checks expects a non-negative integer, got: $2"
                            exit 1
                        fi
                        MAX_CHECKS="$2"
                        ;;
                esac
                shift 2
                ;;
            --help)
                cat << EOF
Consolidated Deployment Monitor

Usage: $0 [OPTIONS]

Options:
  --mode MODE         Monitor mode (status|continuous|live|complete|fix|dashboard)
                      Default: status
  --interval SECONDS  Check interval in seconds (continuous/live modes)
                      Default: 30
  --max-checks N      Maximum number of checks (continuous mode)
                      Default: 120
  --help              Show this help message

Modes:
  status      - One-time status check
  continuous  - Continuous monitoring with auto-completion
  live        - Live monitoring with real-time updates
  complete    - Monitor and automatically proceed when complete
  fix         - Monitor and attempt fixes on errors
  dashboard   - Dashboard view of all deployment status

Examples:
  $0                          # Quick status check
  $0 --mode continuous        # Continuous monitoring
  $0 --mode live --interval 10 # Live updates every 10 seconds
  $0 --mode dashboard         # Dashboard view
EOF
                exit 0
                ;;
            *)
                log_error "Unknown option: $1"
                exit 1
                ;;
        esac
    done
}

_monitor_parse_args "$@"

# Log files to report on. Each entry is "path:label"; show_status splits the
# entry on the colon to obtain the file path and its display name.
LOG_FILES=()
LOG_FILES+=("/tmp/complete-deployment.log:Main Deployment Log")
LOG_FILES+=("/tmp/terraform-plan-phase1.log:Phase 1 Plan (Key Vaults)")
LOG_FILES+=("/tmp/terraform-apply-phase1.log:Phase 1 Apply (Key Vaults)")
LOG_FILES+=("/tmp/store-secrets.log:Phase 2 (Store Secrets)")
LOG_FILES+=("/tmp/terraform-plan-phase3.log:Phase 3 Plan (AKS)")
LOG_FILES+=("/tmp/terraform-apply-phase3.log:Phase 3 Apply (AKS)")

# Report whether a process whose command line matches
# "complete-all-deployment.sh" is running.
# Outputs: a success message with the first matching PID, or a warning.
# Returns: 0 if a matching process was found, 1 otherwise.
check_deployment_process() {
    local deploy_pid
    # `|| true`: the lookup itself must never fail the function; an empty
    # result is what signals "not running".
    deploy_pid=$(pgrep -f "complete-all-deployment.sh" | head -1) || true

    if [ -z "$deploy_pid" ]; then
        log_warn "Deployment process not found"
        return 1
    fi

    log_success "Deployment process running (PID: $deploy_pid)"
    return 0
}

# Print a short summary (path, size, line count, modification time) of a log.
# Arguments: $1 - path of the log file
#            $2 - display name used in the messages
# Returns:   0 if the file exists, 1 if it has not been created yet.
check_log_file() {
    local path="$1"
    local label="$2"

    if [ ! -f "$path" ]; then
        log_info "⏳ $label: Not yet created"
        return 1
    fi

    # Each lookup tries the `stat -f` form first and falls back to the
    # `stat -c` form, then to a placeholder value if both fail.
    local byte_count line_count mtime
    byte_count=$(stat -f%z "$path" 2>/dev/null || stat -c%s "$path" 2>/dev/null || echo "0")
    line_count=$(wc -l < "$path" 2>/dev/null || echo "0")
    mtime=$(stat -f%Sm "$path" 2>/dev/null || stat -c%y "$path" 2>/dev/null | cut -d' ' -f1-2 || echo "unknown")

    log_success "$label"
    printf '   File: %s\n' "$path"
    printf '   Size: %s bytes, Lines: %s\n' "$byte_count" "$line_count"
    printf '   Modified: %s\n' "$mtime"
    return 0
}

# Count the lines of a log that look like errors.
# A line counts when it contains "error" or "failed" (case-insensitive) or
# the "❌" marker.
# Arguments: $1 - path of the log file
# Outputs:   the number of matching lines on stdout ("0" if the file is
#            missing or unreadable)
check_log_errors() {
    local log_file="$1"
    local error_count=0

    if [ -f "$log_file" ]; then
        # -E: alternation with `|` is standard in extended regexes, whereas
        # `\|` in a basic regex is a GNU extension.
        # `|| true`: grep -c still prints the count but exits 1 when the
        # count is zero; that must not be treated as a failure.
        error_count=$(grep -ciE 'error|failed|❌' -- "$log_file" 2>/dev/null) || true
    fi

    printf '%s\n' "${error_count:-0}"
}

# Classify a Terraform phase from its plan and apply logs.
# Arguments: $1 - phase label (accepted for call-site readability; not used)
#            $2 - path of the plan log
#            $3 - path of the apply log
# Outputs:   a one-line status message
# Returns:   0 - apply log contains "Apply complete!"
#            2 - apply log contains "Error" or "failed" (case-insensitive)
#            1 - anything else (not started, starting, or in progress)
check_phase_status() {
    local plan_log="$2"
    local apply_log="$3"
    local apply_bytes

    if [ -f "$apply_log" ]; then
        apply_bytes=$(stat -f%z "$apply_log" 2>/dev/null || stat -c%s "$apply_log" 2>/dev/null || echo "0")

        # Fewer than 100 bytes logged: treat the apply as only just started.
        if [ "$apply_bytes" -lt 100 ]; then
            echo "⏳ Starting..."
            return 1
        fi

        # The completion marker is checked before the error patterns.
        if grep -q "Apply complete!" "$apply_log" 2>/dev/null; then
            log_success "COMPLETE"
            return 0
        fi

        if grep -qi "Error\|failed" "$apply_log" 2>/dev/null; then
            log_error "ERROR DETECTED"
            return 2
        fi

        echo "⏳ IN PROGRESS ($apply_bytes bytes logged)"
        return 1
    fi

    # No apply log yet: distinguish "planned" from "not started".
    if [ -f "$plan_log" ]; then
        echo "⏳ Plan complete, waiting for apply..."
    else
        echo "⏳ Waiting to start..."
    fi
    return 1
}

# Show status (one-time check)
#
# Prints, in order: whether the deployment process is running, a summary of
# every entry in LOG_FILES, an error count for the main and apply logs, and
# the last 20 lines of the main deployment log.
# Globals: LOG_FILES (read)
# Returns: 0; missing processes or log files are reported, not fatal.
show_status() {
    log_section "DEPLOYMENT MONITOR - STATUS CHECK"

    # Purely informational: a non-zero return here must not abort the
    # script under `set -e`.
    check_deployment_process || true

    log_subsection "LOG FILE STATUS"
    local entry log_file log_name
    for entry in "${LOG_FILES[@]}"; do
        # Entries are "path:label".
        log_file="${entry%%:*}"
        log_name="${entry#*:}"
        # A log that does not exist yet is reported by check_log_file
        # itself; keep going with the remaining entries.
        check_log_file "$log_file" "$log_name" || true
    done

    log_subsection "ERROR CHECK"
    local total_errors=0
    local errors
    for log_file in /tmp/complete-deployment.log /tmp/terraform-apply-phase1.log /tmp/terraform-apply-phase3.log; do
        [ -f "$log_file" ] || continue
        errors=$(check_log_errors "$log_file") || errors=0
        # Numeric comparison. Inside `[ ]` a bare `>` is a redirection,
        # which would create a file named "0" and always test true.
        if [ "${errors:-0}" -gt 0 ]; then
            log_warn "Found $errors potential errors in $(basename "$log_file")"
            total_errors=$((total_errors + errors))
        fi
    done

    if [ "$total_errors" -eq 0 ]; then
        log_success "No errors detected in logs"
    fi

    if [ -f "/tmp/complete-deployment.log" ]; then
        log_subsection "RECENT OUTPUT (Last 20 lines)"
        tail -20 /tmp/complete-deployment.log
    fi
}

# Continuous monitoring with auto-completion
#
# Polls the three deployment phases in order (Phase 1 -> Phase 2 -> Phase 3)
# up to MAX_CHECKS times, sleeping MONITOR_INTERVAL seconds between checks.
# Globals: MAX_CHECKS, MONITOR_INTERVAL (read)
# Exits:   0 when all phases are complete, or when the check budget is used
#            up (the deployment may still be running);
#          1 as soon as any phase reports a failure.
monitor_continuous() {
    log_section "CONTINUOUS MONITORING - AUTO-COMPLETION"

    local secrets_log="/tmp/store-secrets.log"
    local activity_log="/tmp/complete-deployment.log"
    local check_count=0
    local phase1_complete=false
    local phase2_complete=false
    local phase3_complete=false
    local status rc size

    while [ "$check_count" -lt "$MAX_CHECKS" ]; do
        check_count=$((check_count + 1))

        log_subsection "CHECK #$check_count - $(date '+%Y-%m-%d %H:%M:%S')"

        # Check Phase 1
        if [ "$phase1_complete" = false ]; then
            # Capture the exit status explicitly. `local var=$(cmd)` would
            # report the status of `local` (always 0), and a bare failing
            # assignment would abort under `set -e`.
            rc=0
            status=$(check_phase_status "Phase 1" "/tmp/terraform-plan-phase1.log" "/tmp/terraform-apply-phase1.log") || rc=$?
            echo "Phase 1: $status"
            case "$rc" in
                0)
                    phase1_complete=true
                    # Short pause carried over from the original code
                    # before looking at Phase 2.
                    sleep 5
                    ;;
                2)
                    log_error "Phase 1 failed. Check logs."
                    exit 1
                    ;;
            esac
        else
            log_success "Phase 1: Already complete"
        fi

        # Check Phase 2
        if [ "$phase1_complete" = false ]; then
            echo "⏳ Phase 2: Waiting for Phase 1..."
        elif [ "$phase2_complete" = true ]; then
            log_success "Phase 2: Already complete"
        elif [ ! -f "$secrets_log" ]; then
            echo "⏳ Phase 2: Waiting to start..."
        else
            size=$(stat -f%z "$secrets_log" 2>/dev/null || stat -c%s "$secrets_log" 2>/dev/null || echo "0")
            if [ "$size" -gt 100 ]; then
                # Any error marker wins over a success marker.
                if grep -qi "error\|failed" "$secrets_log" 2>/dev/null; then
                    log_error "Phase 2 failed. Check logs."
                    exit 1
                elif grep -qi "complete\|success\|stored" "$secrets_log" 2>/dev/null; then
                    log_success "Phase 2: COMPLETE"
                    phase2_complete=true
                else
                    echo "⏳ Phase 2: IN PROGRESS ($size bytes logged)"
                fi
            else
                echo "⏳ Phase 2: Starting..."
            fi
        fi

        # Check Phase 3
        if [ "$phase2_complete" = false ]; then
            echo "⏳ Phase 3: Waiting for Phase 2..."
        elif [ "$phase3_complete" = true ]; then
            log_success "Phase 3: Already complete"
        else
            rc=0
            status=$(check_phase_status "Phase 3" "/tmp/terraform-plan-phase3.log" "/tmp/terraform-apply-phase3.log") || rc=$?
            echo "Phase 3: $status"
            case "$rc" in
                0)
                    phase3_complete=true
                    ;;
                2)
                    log_error "Phase 3 failed. Check logs."
                    exit 1
                    ;;
            esac
        fi

        # Check if all phases complete
        if [ "$phase1_complete" = true ] && [ "$phase2_complete" = true ] && [ "$phase3_complete" = true ]; then
            log_section "ALL PHASES COMPLETE!"
            log_success "Phase 1: Key Vaults deployed"
            log_success "Phase 2: Node secrets stored"
            log_success "Phase 3: AKS clusters deployed"
            log_info "Next steps:"
            echo "  1. Update enode URLs with actual node IP addresses"
            echo "  2. Deploy Besu validator pods"
            exit 0
        fi

        # Show recent activity
        if [ -f "$activity_log" ]; then
            echo "Recent activity (last 3 lines):"
            tail -3 "$activity_log" | sed 's/^/  /'
        fi

        echo "Next check in ${MONITOR_INTERVAL} seconds..."
        sleep "$MONITOR_INTERVAL"
    done

    log_warn "Monitoring timeout reached (${MAX_CHECKS} checks)"
    log_info "Deployment may still be in progress. Continue monitoring manually."
    exit 0
}

# Live monitoring (real-time updates)
#
# Loops forever: clears the screen, reports whether the deployment process is
# running, prints the lines added to the main deployment log since the
# previous iteration, then sleeps MONITOR_INTERVAL seconds.
# Globals: MONITOR_INTERVAL (read)
# Returns: never returns on its own; stop it with Ctrl-C.
monitor_live() {
    log_section "LIVE MONITORING - REAL-TIME UPDATES"

    local activity_log="/tmp/complete-deployment.log"
    local last_output_lines=0
    local current_lines
    while true; do
        # A failing `clear` must not end the monitor under `set -e`.
        clear || true
        log_section "LIVE DEPLOYMENT MONITOR - $(date '+%Y-%m-%d %H:%M:%S')"

        # Informational only: keep monitoring even when the deployment
        # process is not (or no longer) running.
        check_deployment_process || true

        # Show latest log output
        if [ -f "$activity_log" ]; then
            current_lines=$(wc -l < "$activity_log" 2>/dev/null || echo "0")
            # The log shrank (truncated or recreated): start over from its
            # first line instead of waiting for it to outgrow the old count.
            if [ "$current_lines" -lt "$last_output_lines" ]; then
                last_output_lines=0
            fi
            if [ "$current_lines" -gt "$last_output_lines" ]; then
                tail -n "$((current_lines - last_output_lines))" "$activity_log" || true
                last_output_lines=$current_lines
            fi
        fi

        sleep "$MONITOR_INTERVAL"
    done
}

# Dashboard view
#
# Prints a one-shot overview: deployment process state, one status line per
# phase, a (currently empty) resource section, and the last 10 lines of the
# main deployment log.
# A phase is only inspected once the previous one is complete:
#   Phase 1 - complete when check_phase_status returns 0
#   Phase 2 - complete when the secrets log contains "complete" or "success"
# Returns: 0; missing processes or logs are reported, not fatal.
show_dashboard() {
    log_section "DEPLOYMENT DASHBOARD"

    # Informational only: must not abort the dashboard under `set -e`.
    check_deployment_process || true

    log_subsection "PHASE STATUS"

    local secrets_log="/tmp/store-secrets.log"
    local activity_log="/tmp/complete-deployment.log"

    # Capture the exit status explicitly; `local var=$(cmd)` followed by
    # `$?` would yield the status of `local`, which is always 0.
    local p1_status
    local p1_result=0
    p1_status=$(check_phase_status "Phase 1" "/tmp/terraform-plan-phase1.log" "/tmp/terraform-apply-phase1.log") || p1_result=$?
    echo "Phase 1 (Key Vaults): $p1_status"

    local p2_complete=false
    if [ "$p1_result" -ne 0 ]; then
        echo "Phase 2 (Secrets): ⏳ WAITING FOR PHASE 1"
    elif [ ! -f "$secrets_log" ]; then
        echo "Phase 2 (Secrets): ⏳ WAITING"
    elif grep -qi "complete\|success" "$secrets_log" 2>/dev/null; then
        p2_complete=true
        echo "Phase 2 (Secrets): ✓ COMPLETE"
    else
        echo "Phase 2 (Secrets): ⏳ IN PROGRESS"
    fi

    if [ "$p2_complete" = true ]; then
        local p3_status
        # Only the message is shown here; the status code is not needed.
        p3_status=$(check_phase_status "Phase 3" "/tmp/terraform-plan-phase3.log" "/tmp/terraform-apply-phase3.log") || true
        echo "Phase 3 (AKS): $p3_status"
    else
        echo "Phase 3 (AKS): ⏳ WAITING FOR PHASE 2"
    fi

    log_subsection "RESOURCE STATUS"
    # Add Azure resource status checks here if needed

    log_subsection "RECENT ACTIVITY"
    if [ -f "$activity_log" ]; then
        tail -10 "$activity_log"
    else
        echo "No activity log found"
    fi
}

# Main execution: run the monitor that corresponds to MODE.
case "$MODE" in
    status) show_status ;;
    dashboard) show_dashboard ;;
    live) monitor_live ;;
    continuous | complete | fix)
        # "complete" (auto-proceed) and "fix" (error fixing) have no logic of
        # their own yet; both run the continuous monitor.
        monitor_continuous
        ;;
    *)
        log_error "Invalid mode: $MODE"
        log_info "Valid modes: status, continuous, live, complete, fix, dashboard"
        exit 1
        ;;
esac