@@ -26,6 +26,7 @@ import (
2626 "github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3"
2727 "github.com/google/uuid"
2828 corev1 "k8s.io/api/core/v1"
29+ apierrors "k8s.io/apimachinery/pkg/api/errors"
2930 metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031 "k8s.io/apimachinery/pkg/util/wait"
3132 "k8s.io/client-go/tools/clientcmd"
@@ -353,24 +354,35 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName
353354
354355func waitUntilClusterReady (ctx context.Context , name , location string ) (* armcontainerservice.ManagedCluster , error ) {
355356 var cluster armcontainerservice.ManagedClustersClientGetResponse
357+ var clusterDeleted bool
356358 err := wait .PollUntilContextCancel (ctx , time .Second , true , func (ctx context.Context ) (bool , error ) {
357359 var err error
358360 cluster , err = config .Azure .AKS .Get (ctx , config .ResourceGroupName (location ), name , nil )
359361 if err != nil {
362+ var azErr * azcore.ResponseError
363+ if errors .As (err , & azErr ) && azErr .StatusCode == 404 {
364+ clusterDeleted = true
365+ return true , nil
366+ }
360367 return false , err
361368 }
362369 switch * cluster .ManagedCluster .Properties .ProvisioningState {
363370 case "Succeeded" :
364371 return true , nil
365- case "Updating" , "Assigned" , "Creating" :
372+ case "Updating" , "Assigned" , "Creating" , "Deleting" , "Canceling" :
366373 return false , nil
374+ case "Canceled" :
375+ return false , fmt .Errorf ("cluster %s is in state %s, won't retry" , name , * cluster .ManagedCluster .Properties .ProvisioningState )
367376 default :
368- return false , fmt .Errorf ("cluster %s is in state %s" , name , * cluster .ManagedCluster .Properties .ProvisioningState )
377+ return false , fmt .Errorf ("cluster %s is in state %s, won't retry " , name , * cluster .ManagedCluster .Properties .ProvisioningState )
369378 }
370379 })
371380 if err != nil {
372381 return nil , fmt .Errorf ("failed to wait for cluster %s to be ready: %w" , name , err )
373382 }
383+ if clusterDeleted {
384+ return nil , nil
385+ }
374386 return & cluster .ManagedCluster , nil
375387}
376388
@@ -538,13 +550,30 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC
538550 }
539551
540552 var bastionSubnetID string
541- bastionSubnet , subnetGetErr := config .Azure .Subnet .Get (ctx , nodeRG , vnet .name , bastionSubnetName , nil )
542- if subnetGetErr != nil {
553+ var bastionSubnet armnetwork.SubnetsClientGetResponse
554+ var subnetGetErr error
555+ // Retry the subnet GET with a per-call timeout to tolerate ARM hangs.
556+ // Without this, a single unresponsive GET consumes the entire 20-minute cluster prep budget.
557+ err = wait .PollUntilContextTimeout (ctx , 5 * time .Second , 2 * time .Minute , true , func (ctx context.Context ) (bool , error ) {
558+ callCtx , cancel := context .WithTimeout (ctx , 30 * time .Second )
559+ defer cancel ()
560+ bastionSubnet , subnetGetErr = config .Azure .Subnet .Get (callCtx , nodeRG , vnet .name , bastionSubnetName , nil )
561+ if subnetGetErr == nil {
562+ return true , nil
563+ }
543564 var subnetAzErr * azcore.ResponseError
544- if ! errors .As (subnetGetErr , & subnetAzErr ) || subnetAzErr .StatusCode ! = http .StatusNotFound {
545- return nil , fmt . Errorf ( "get subnet %q in vnet %q rg %q: %w" , bastionSubnetName , vnet . name , nodeRG , subnetGetErr )
565+ if errors .As (subnetGetErr , & subnetAzErr ) && subnetAzErr .StatusCode = = http .StatusNotFound {
566+ return true , nil // 404 is expected — will create below
546567 }
568+ toolkit .Logf (ctx , "transient error getting subnet %q (retrying): %v" , bastionSubnetName , subnetGetErr )
569+ return false , nil
570+ })
571+ if err != nil {
572+ return nil , fmt .Errorf ("get subnet %q in vnet %q rg %q: retries exhausted: %w (last subnet error: %v)" , bastionSubnetName , vnet .name , nodeRG , err , subnetGetErr )
573+ }
547574
575+ if subnetGetErr != nil {
576+ // 404 — need to create
548577 toolkit .Logf (ctx , "creating subnet %s in VNet %s (rg %s)" , bastionSubnetName , vnet .name , nodeRG )
549578 subnetParams := armnetwork.Subnet {
550579 Properties : & armnetwork.SubnetPropertiesFormat {
@@ -733,28 +762,30 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
733762 defer toolkit .LogStepCtx (ctx , "collecting garbage VMSS" )()
734763 rg := * cluster .Properties .NodeResourceGroup
735764
736- // Build a set of all existing VMSS names while deleting old ones.
737- existingVMSS := map [string ]struct {}{}
765+ // Build a set of VMSS names that should be kept — exclude VMSS that are
766+ // being deleted so their stale K8s nodes can be cleaned up in the same pass.
767+ keptVMSS := map [string ]struct {}{}
738768 pager := config .Azure .VMSS .NewListPager (rg , nil )
739769 for pager .More () {
740770 page , err := pager .NextPage (ctx )
741771 if err != nil {
742772 return fmt .Errorf ("failed to get next page of VMSS: %w" , err )
743773 }
744774 for _ , vmss := range page .Value {
745- existingVMSS [* vmss .Name ] = struct {}{}
746-
747775 if _ , ok := vmss .Tags ["KEEP_VMSS" ]; ok {
776+ keptVMSS [* vmss .Name ] = struct {}{}
748777 continue
749778 }
750779 // don't delete managed pools
751780 if _ , ok := vmss .Tags ["aks-managed-poolName" ]; ok {
781+ keptVMSS [* vmss .Name ] = struct {}{}
752782 continue
753783 }
754784
755785 // don't delete VMSS created in the last hour. They might be currently used in tests
756786 // extra 10 minutes is a buffer for test cleanup, clock drift and timeout adjustments
757787 if config .Config .TestTimeout == 0 || time .Since (* vmss .Properties .TimeCreated ) < config .Config .TestTimeout + 10 * time .Minute {
788+ keptVMSS [* vmss .Name ] = struct {}{}
758789 continue
759790 }
760791
@@ -763,13 +794,16 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
763794 })
764795 if err != nil {
765796 toolkit .Logf (ctx , "failed to delete vmss %q: %s" , * vmss .Name , err )
797+ // Keep in map so we don't try to delete its nodes while VMSS is still around
798+ keptVMSS [* vmss .Name ] = struct {}{}
766799 continue
767800 }
768801 toolkit .Logf (ctx , "deleted vmss %q (age: %v)" , * vmss .ID , time .Since (* vmss .Properties .TimeCreated ))
802+ // Don't add to keptVMSS — nodes from this VMSS should be cleaned up
769803 }
770804 }
771805
772- if err := collectGarbageNodes (ctx , kube , existingVMSS ); err != nil {
806+ if err := collectGarbageNodes (ctx , kube , keptVMSS ); err != nil {
773807 return fmt .Errorf ("failed to collect garbage K8s nodes: %w" , err )
774808 }
775809 return nil
@@ -779,15 +813,15 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
779813// longer exists. This prevents stale nodes from accumulating in the cluster
780814// and overwhelming the cloud-provider-azure route controller with perpetual
781815// "instance not found" failures.
782- func collectGarbageNodes (ctx context.Context , kube * Kubeclient , existingVMSS map [string ]struct {}) error {
816+ func collectGarbageNodes (ctx context.Context , kube * Kubeclient , keptVMSS map [string ]struct {}) error {
783817 defer toolkit .LogStepCtx (ctx , "collecting garbage K8s nodes" )()
784818
785819 nodes , err := kube .Typed .CoreV1 ().Nodes ().List (ctx , metav1.ListOptions {})
786820 if err != nil {
787821 return fmt .Errorf ("listing K8s nodes for garbage collection: %w" , err )
788822 }
789823
790- var deleteErrors [] error
824+ var deleted , failed int
791825 for _ , node := range nodes .Items {
792826 // skip managed pool nodes (system nodepool)
793827 if strings .HasPrefix (node .Name , "aks-" ) {
@@ -800,19 +834,26 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map
800834 }
801835 vmssName := node .Name [:len (node .Name )- 6 ]
802836
803- if _ , exists := existingVMSS [vmssName ]; exists {
837+ if _ , exists := keptVMSS [vmssName ]; exists {
804838 continue
805839 }
806840
807841 if err := kube .Typed .CoreV1 ().Nodes ().Delete (ctx , node .Name , metav1.DeleteOptions {}); err != nil {
808- deleteErrors = append (deleteErrors , fmt .Errorf ("deleting stale node %q: %w" , node .Name , err ))
842+ if apierrors .IsNotFound (err ) {
843+ toolkit .Logf (ctx , "stale K8s node %q already gone" , node .Name )
844+ deleted ++
845+ continue
846+ }
847+ toolkit .Logf (ctx , "warning: failed to delete stale K8s node %q: %v" , node .Name , err )
848+ failed ++
809849 continue
810850 }
811851 toolkit .Logf (ctx , "deleted stale K8s node %q (VMSS %q not found)" , node .Name , vmssName )
852+ deleted ++
812853 }
813854
814- if len ( deleteErrors ) > 0 {
815- return fmt .Errorf ("failed to delete %d stale nodes, first error: %w " , len ( deleteErrors ), deleteErrors [ 0 ] )
855+ if failed > 0 && deleted == 0 {
856+ return fmt .Errorf ("failed to delete any of %d stale nodes" , failed )
816857 }
817858 return nil
818859}
0 commit comments