#!/usr/bin/env bash
#
# Comprehensive fix for deployment issues.
#
# Deletes AKS clusters whose name contains 'az-p-' and whose provisioning
# state is Failed or Canceled, waits for those deletions to finish, lists the
# cluster resources tracked in Terraform state, re-runs `terraform apply`,
# and prints a summary of the resulting cluster provisioning states.
#
# Requires: az (already logged in), terraform.
#
# Optional environment:
#   DELETE_TIMEOUT_SECONDS  max seconds to wait for one cluster deletion
#                           (default: 1800)
#   POLL_INTERVAL_SECONDS   seconds between deletion polls (default: 5)
#
# Exit status: 0 on success; non-zero when a prerequisite is missing, an
# Azure query fails, a deletion times out, or terraform init/apply fails.

set -euo pipefail

readonly SUBSCRIPTION_ID="fc08d829-4f14-413d-ab27-ce024425db0b"
PROJECT_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)"
readonly PROJECT_ROOT
readonly TERRAFORM_DIR="$PROJECT_ROOT/terraform/well-architected/cloud-sovereignty"
readonly TERRAFORM_LOG="/tmp/terraform-apply-fixed.log"
readonly CLUSTER_NAME_MARKER="az-p-"
readonly DELETE_TIMEOUT_SECONDS="${DELETE_TIMEOUT_SECONDS:-1800}"
readonly POLL_INTERVAL_SECONDS="${POLL_INTERVAL_SECONDS:-5}"
readonly RULE="━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
# Printed as a hint in two places; kept in one variable so both stay identical.
readonly MONITOR_CMD="az aks list --subscription $SUBSCRIPTION_ID --query \"[?contains(name, '${CLUSTER_NAME_MARKER}')].{name:name, state:provisioningState}\" -o table"

#######################################
# Print a message to stderr and exit with status 1.
# Arguments: message words
#######################################
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

#######################################
# Abort unless the given command is available on PATH.
# Arguments: $1 - command name
#######################################
require_cmd() {
  command -v "$1" >/dev/null 2>&1 || die "Required command not found: $1"
}

#######################################
# Print a step heading framed by two rule lines, followed by a blank line.
# Arguments: $1 - heading text
#######################################
print_step() {
  echo "$RULE"
  echo "$1"
  echo "$RULE"
  echo ""
}

#######################################
# Count the lines of a text that are exactly equal to a given string.
# Arguments: $1 - string to match, $2 - newline-separated text
# Outputs:   the count on stdout
#######################################
count_exact_lines() {
  # grep -c exits 1 when the count is 0; that is not an error here.
  grep -cxF -- "$1" <<<"$2" || true
}

#######################################
# Poll `az aks show` until it stops succeeding for the cluster, or until
# DELETE_TIMEOUT_SECONDS have elapsed.
# NOTE: any failure of `az aks show` is taken to mean the cluster is gone.
# Arguments: $1 - resource group, $2 - cluster name
# Returns:   0 once the cluster no longer shows, 1 on timeout
#######################################
wait_for_cluster_gone() {
  local rg=$1
  local name=$2
  local deadline=$((SECONDS + DELETE_TIMEOUT_SECONDS))

  printf ' Waiting for %s...' "$name"
  while az aks show --resource-group "$rg" --name "$name" \
    --subscription "$SUBSCRIPTION_ID" </dev/null >/dev/null 2>&1; do
    if ((SECONDS >= deadline)); then
      echo " ❌ Timed out"
      return 1
    fi
    printf '.'
    sleep "$POLL_INTERVAL_SECONDS"
  done
  echo " ✅ Deleted"
}

#######################################
# Delete every matching cluster in the given provisioning state and wait
# until each deletion that was successfully initiated has completed.
# Arguments: $1 - provisioningState value (e.g. Failed)
#            $2 - lowercase word used in messages (e.g. failed)
#######################################
delete_clusters_in_state() {
  local state=$1
  local label=$2
  local listing line rg name output
  local -a targets=()
  local -a pending=()

  # One "<resource group><TAB><name>" line per cluster.
  listing=$(az aks list --subscription "$SUBSCRIPTION_ID" \
    --query "[?contains(name, '${CLUSTER_NAME_MARKER}') && provisioningState == '${state}'].[resourceGroup, name]" \
    -o tsv </dev/null) || die "Failed to list $label clusters"

  while IFS= read -r line; do
    if [[ -n "$line" ]]; then
      targets+=("$line")
    fi
  done <<<"$listing"

  echo "Found ${#targets[@]} $label clusters to delete"
  echo ""

  if ((${#targets[@]} == 0)); then
    echo "No $label clusters to delete"
    return 0
  fi

  for line in "${targets[@]}"; do
    IFS=$'\t' read -r rg name <<<"$line"
    echo "Deleting $label cluster: $name (RG: $rg)"
    # Capture the output so the exit status of az itself can be checked.
    if output=$(az aks delete --resource-group "$rg" --name "$name" \
      --subscription "$SUBSCRIPTION_ID" --yes --no-wait </dev/null 2>&1); then
      pending+=("$line")
      if [[ -n "$output" ]]; then
        printf '%s\n' "$output" | grep -v '^$' || true
      fi
      echo " ✅ Deletion initiated"
    else
      if [[ -n "$output" ]]; then
        printf '%s\n' "$output" | grep -v '^$' >&2 || true
      fi
      echo " ❌ Deletion could not be initiated" >&2
    fi
    echo ""
  done

  # Nothing to wait for when every delete request was rejected.
  if ((${#pending[@]} == 0)); then
    return 0
  fi

  echo "Waiting for $label cluster deletions to complete..."
  sleep 30

  for line in "${pending[@]}"; do
    IFS=$'\t' read -r rg name <<<"$line"
    wait_for_cluster_gone "$rg" "$name" \
      || die "Timed out after ${DELETE_TIMEOUT_SECONDS}s waiting for $name to be deleted"
  done
}

#######################################
# Entry point: cleanup, Terraform re-deployment, verification.
#######################################
main() {
  local state_clusters resource init_output states
  local ready_count failed_count creating_count
  local apply_status=0

  # Verify prerequisites before anything destructive happens.
  require_cmd az
  require_cmd terraform
  [[ -d "$TERRAFORM_DIR" ]] || die "Terraform directory not found: $TERRAFORM_DIR"

  echo "╔════════════════════════════════════════════════════════════════╗"
  echo "║ DEPLOYMENT FIX - COMPREHENSIVE CLEANUP & REDEPLOYMENT ║"
  echo "╚════════════════════════════════════════════════════════════════╝"
  echo ""

  print_step "Step 1: Delete Failed Clusters (7)"
  delete_clusters_in_state "Failed" "failed"
  echo ""

  print_step "Step 2: Delete Canceled Clusters (16)"
  delete_clusters_in_state "Canceled" "canceled"
  echo ""

  print_step "Step 3: Clean Terraform State"
  cd "$TERRAFORM_DIR" || die "Cannot change directory to $TERRAFORM_DIR"
  echo "Removing deleted clusters from Terraform state..."
  echo ""

  # Best-effort listing: `terraform init` has not run yet at this point, so
  # a failing `state list` (or no match) is deliberately treated as empty.
  state_clusters=$(terraform state list 2>/dev/null \
    | grep "azurerm_kubernetes_cluster" || true)

  if [[ -n "$state_clusters" ]]; then
    echo "Checking state for cluster resources..."
    while IFS= read -r resource; do
      echo " Checking: $resource"
      echo " Resource in state: $resource"
    done <<<"$state_clusters"
  else
    echo "No cluster resources found in Terraform state"
  fi
  echo ""
  echo "Note: Terraform will automatically handle state cleanup during apply"
  echo ""

  print_step "Step 4: Re-run Terraform Deployment"
  echo "Initializing Terraform..."
  # Output stays hidden on success, but a failed init stops the run here
  # instead of surfacing later as a confusing apply error.
  if ! init_output=$(terraform init -upgrade 2>&1); then
    printf '%s\n' "$init_output" >&2
    die "terraform init failed; aborting before apply"
  fi
  echo ""
  echo "Re-running Terraform deployment..."
  echo "This will recreate all deleted clusters with proper configuration"
  echo ""
  echo "⚠️ This may take 15-30 minutes depending on region availability"
  echo ""

  # Run Terraform apply with maximum parallelism. With pipefail the pipeline
  # reports terraform's failure even though tee succeeds; the status is kept
  # so the verification summary below still runs.
  terraform apply -parallelism=128 -auto-approve 2>&1 \
    | tee "$TERRAFORM_LOG" || apply_status=$?
  echo ""

  print_step "Step 5: Verify Deployment"
  echo "Waiting 30 seconds for clusters to stabilize..."
  sleep 30
  echo ""
  echo "Checking cluster status..."
  echo ""

  # Single snapshot of all matching clusters, counted locally per state.
  states=$(az aks list --subscription "$SUBSCRIPTION_ID" \
    --query "[?contains(name, '${CLUSTER_NAME_MARKER}')].provisioningState" \
    -o tsv </dev/null) || die "Failed to query cluster status"
  ready_count=$(count_exact_lines "Succeeded" "$states")
  failed_count=$(count_exact_lines "Failed" "$states")
  creating_count=$(count_exact_lines "Creating" "$states")

  echo "📊 Deployment Status:"
  echo " ✅ Ready (Succeeded): $ready_count"
  echo " ❌ Failed: $failed_count"
  echo " ⏳ Creating: $creating_count"
  echo ""

  if ((creating_count > 0)); then
    echo "⚠️ Some clusters are still creating. Monitor with:"
    echo " $MONITOR_CMD"
  fi

  if ((failed_count > 0)); then
    echo "⚠️ Some clusters failed. Check logs:"
    echo " tail -100 $TERRAFORM_LOG"
    echo " ./scripts/azure/analyze-deployment-failures.sh"
  fi
  echo ""

  if ((apply_status != 0)); then
    echo "❌ Terraform apply failed (exit code $apply_status). See $TERRAFORM_LOG" >&2
    exit "$apply_status"
  fi

  echo "✅ Fix process complete!"
  echo ""
  echo "📝 Logs:"
  echo " • Terraform: $TERRAFORM_LOG"
  echo " • This script: Check output above"
  echo ""
  echo "🎯 Next Steps:"
  echo " 1. Monitor cluster creation: $MONITOR_CMD"
  echo " 2. Once ready, run: ./scripts/deployment/wait-and-run-all-next-steps.sh"
  echo ""
}

main "$@"