Files
smom-dbis-138/scripts/azure/analyze-deployment-failures.sh

127 lines
7.0 KiB
Bash
Raw Permalink Normal View History

#!/usr/bin/env bash
#
# Azure deployment failure analysis.
#
# Reports on AKS clusters in a failed or canceled provisioning state, lists
# recent activity-log errors, and summarizes a Terraform apply log so the two
# sources can be read side by side.

# Stop at the first command that exits non-zero.
set -o errexit

# Subscription that every `az` invocation in this script is scoped to.
SUBSCRIPTION_ID='fc08d829-4f14-413d-ab27-ce024425db0b'

# Absolute path of the directory two levels above this script's directory.
PROJECT_ROOT="$(
  cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd
)"
echo "╔════════════════════════════════════════════════════════════════╗"
echo "║ AZURE DEPLOYMENT FAILURE ANALYSIS ║"
echo "╚════════════════════════════════════════════════════════════════╝"
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Part 1: Failed Clusters Analysis"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""

# Part 1: for every AKS cluster whose name contains 'az-p-' and whose
# provisioningState is 'Failed', print its details and up to five recent
# failed/error events from the activity log.
FAILED_CLUSTERS=$(az aks list --subscription "$SUBSCRIPTION_ID" \
  --query "[?contains(name, 'az-p-') && provisioningState == 'Failed'].{name:name, rg:resourceGroup}" -o json)
FAILED_COUNT=$(echo "$FAILED_CLUSTERS" | jq '. | length')
echo "Found $FAILED_COUNT failed clusters"
echo ""

# Default to 0 so an empty count does not make the numeric test itself error.
if [ "${FAILED_COUNT:-0}" -gt 0 ]; then
  echo "$FAILED_CLUSTERS" | jq -r '.[] | "\(.rg)|\(.name)"' | while IFS='|' read -r rg name; do
    echo "Cluster: $name"
    echo "Resource Group: $rg"
    echo ""

    # Get cluster details.
    # az's stderr is deliberately NOT merged into stdout: any warning text in
    # the stream would make jq's JSON parse fail and report a false error.
    echo "Cluster Details:"
    if ! cluster_details=$(az aks show --resource-group "$rg" --name "$name" --subscription "$SUBSCRIPTION_ID" \
      --query "{provisioningState:provisioningState, powerState:powerState.code, createdTime:createdAt, kubernetesVersion:kubernetesVersion}" -o json) ||
      ! jq '.' <<<"$cluster_details"; then
      echo " Error retrieving details"
    fi
    echo ""

    # Get activity log errors.
    # The pipeline's exit status is head's (always 0), so a trailing
    # `|| echo ...` can never fire; capture the output and test for emptiness
    # instead so the fallback line is actually printed.
    echo "Recent Errors from Activity Log:"
    recent_errors=$(az monitor activity-log list --subscription "$SUBSCRIPTION_ID" \
      --resource-group "$rg" \
      --resource-id "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$rg/providers/Microsoft.ContainerService/managedClusters/$name" \
      --max-events 20 \
      --query "[?status.value == 'Failed' || level == 'Error'].{time:eventTimestamp, operation:operationName.localValue, status:status.value, message:statusMessage.message, error:properties.statusMessage}" -o json |
      jq -r '.[] | " [\(.time)] \(.operation): \(.message // .error)"' |
      head -5) || recent_errors=""
    if [ -n "$recent_errors" ]; then
      printf '%s\n' "$recent_errors"
    else
      echo " No errors found"
    fi
    echo ""
    echo "---"
    echo ""
  done
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Part 2: Canceled Clusters Analysis"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""

# Part 2: count every AKS cluster whose name contains 'az-p-' and whose
# provisioningState is 'Canceled'; show details and recent activity for at
# most the first five of them.
CANCELED_CLUSTERS=$(az aks list --subscription "$SUBSCRIPTION_ID" \
  --query "[?contains(name, 'az-p-') && provisioningState == 'Canceled'].{name:name, rg:resourceGroup}" -o json)
CANCELED_COUNT=$(echo "$CANCELED_CLUSTERS" | jq '. | length')
echo "Found $CANCELED_COUNT canceled clusters"
echo ""

# Default to 0 so an empty count does not make the numeric test itself error.
if [ "${CANCELED_COUNT:-0}" -gt 0 ]; then
  # `.[:5]` caps the detailed report at five clusters.
  echo "$CANCELED_CLUSTERS" | jq -r '.[:5][] | "\(.rg)|\(.name)"' | while IFS='|' read -r rg name; do
    echo "Cluster: $name"
    echo "Resource Group: $rg"
    echo ""

    # Get cluster details.
    # az's stderr is deliberately NOT merged into stdout: any warning text in
    # the stream would make jq's JSON parse fail and report a false error.
    if ! cluster_details=$(az aks show --resource-group "$rg" --name "$name" --subscription "$SUBSCRIPTION_ID" \
      --query "{provisioningState:provisioningState, powerState:powerState.code, createdTime:createdAt}" -o json) ||
      ! jq '.' <<<"$cluster_details"; then
      echo " Error retrieving details"
    fi
    echo ""

    # Get activity log.
    # The pipeline's exit status is head's (always 0), so a trailing
    # `|| echo ...` can never fire; capture the output and test for emptiness
    # instead so the fallback line is actually printed.
    echo "Recent Activity:"
    recent_activity=$(az monitor activity-log list --subscription "$SUBSCRIPTION_ID" \
      --resource-group "$rg" \
      --resource-id "/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$rg/providers/Microsoft.ContainerService/managedClusters/$name" \
      --max-events 10 \
      --query "[].{time:eventTimestamp, operation:operationName.localValue, status:status.value}" -o json |
      jq -r '.[] | " [\(.time)] \(.operation): \(.status)"' |
      head -5) || recent_activity=""
    if [ -n "$recent_activity" ]; then
      printf '%s\n' "$recent_activity"
    else
      echo " No activity found"
    fi
    echo ""
  done
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Part 3: Recent Errors Across Subscription"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""
echo "Checking recent errors for AKS clusters..."

# Part 3: list failed/error activity-log events for managed clusters, one
# line per event as "time | <parent>/<name> | operation | status | message",
# sorted in reverse order (timestamp is the leading field) and capped at 15.
#
# az's stderr is deliberately NOT merged into stdout: any warning text in the
# stream would make jq's JSON parse fail.
#
# The pipeline's exit status is head's (always 0), so a trailing `|| echo ...`
# can never fire; capture the output and test for emptiness instead so the
# fallback line is actually printed.
subscription_errors=$(az monitor activity-log list --subscription "$SUBSCRIPTION_ID" \
  --resource-type "Microsoft.ContainerService/managedClusters" \
  --max-events 30 \
  --query "[?status.value == 'Failed' || level == 'Error'].{time:eventTimestamp, resource:resourceId, operation:operationName.localValue, status:status.value, message:statusMessage.message, error:properties.statusMessage}" -o json |
  jq -r '.[] | "\(.time) | \(.resource | split("/") | .[-2] + "/" + .[-1]) | \(.operation) | \(.status) | \(.message // .error)"' |
  sort -r |
  head -15) || subscription_errors=""

if [ -n "$subscription_errors" ]; then
  printf '%s\n' "$subscription_errors"
else
  echo "No errors found"
fi
echo ""
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo "Part 4: Terraform Log Analysis"
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
echo ""

#######################################
# Summarize a Terraform apply log.
# Arguments:
#   $1 - path to the log file
# Outputs:
#   The path that was inspected, the number of lines containing "error"
#   (case-insensitive), and up to five lines that mention one of the known
#   failure phrases, each prefixed with a space. Prints
#   "Terraform log not found" when $1 is not a regular file.
# Returns:
#   0 in both cases.
#######################################
analyze_terraform_log() {
  local log_file=$1

  if [ ! -f "$log_file" ]; then
    echo "Terraform log not found"
    return 0
  fi

  # Report the path actually inspected rather than a hard-coded string, so
  # the message cannot drift from the file the check above looked at.
  echo "Terraform Log: $log_file"
  # grep -c still prints 0 when nothing matches; its non-zero exit status is
  # irrelevant here because it runs inside echo's argument.
  echo "Errors found: $(grep -ci -- "error" "$log_file")"
  echo ""
  echo "Key Error Messages:"
  # -E alternation is used instead of the `\|` extension to basic regexes.
  grep -iE -- "stopped state|operation not allowed|already exists" "$log_file" |
    head -5 |
    sed 's/^/ /'
}

analyze_terraform_log "$PROJECT_ROOT/tmp/terraform-apply-unlocked.log"
# Part 5: closing banner. No comparison is computed here; this section only
# prints the heading and the completion message.
printf '%s\n' \
  "" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "Part 5: Comparison Summary" \
  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━" \
  "" \
  "Comparing Terraform logs with Azure logs..." \
  "✅ Analysis complete - See details above"