Skip to content

Commit d6e0e58

Browse files
r2k1CopilotCopilotCopilot
authored
fix(e2e): improve infra setup reliability with retries and tolerant GC (#8488)
Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: r2k1 <2599261+r2k1@users.noreply.github.com>
1 parent 008b20d commit d6e0e58

1 file changed

Lines changed: 58 additions & 17 deletions

File tree

e2e/cluster.go

Lines changed: 58 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
"github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/resources/armresources/v3"
2727
"github.com/google/uuid"
2828
corev1 "k8s.io/api/core/v1"
29+
apierrors "k8s.io/apimachinery/pkg/api/errors"
2930
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3031
"k8s.io/apimachinery/pkg/util/wait"
3132
"k8s.io/client-go/tools/clientcmd"
@@ -353,24 +354,35 @@ func waitForClusterDeletion(ctx context.Context, clusterName, resourceGroupName
353354

354355
func waitUntilClusterReady(ctx context.Context, name, location string) (*armcontainerservice.ManagedCluster, error) {
355356
var cluster armcontainerservice.ManagedClustersClientGetResponse
357+
var clusterDeleted bool
356358
err := wait.PollUntilContextCancel(ctx, time.Second, true, func(ctx context.Context) (bool, error) {
357359
var err error
358360
cluster, err = config.Azure.AKS.Get(ctx, config.ResourceGroupName(location), name, nil)
359361
if err != nil {
362+
var azErr *azcore.ResponseError
363+
if errors.As(err, &azErr) && azErr.StatusCode == 404 {
364+
clusterDeleted = true
365+
return true, nil
366+
}
360367
return false, err
361368
}
362369
switch *cluster.ManagedCluster.Properties.ProvisioningState {
363370
case "Succeeded":
364371
return true, nil
365-
case "Updating", "Assigned", "Creating":
372+
case "Updating", "Assigned", "Creating", "Deleting", "Canceling":
366373
return false, nil
374+
case "Canceled":
375+
return false, fmt.Errorf("cluster %s is in state %s, won't retry", name, *cluster.ManagedCluster.Properties.ProvisioningState)
367376
default:
368-
return false, fmt.Errorf("cluster %s is in state %s", name, *cluster.ManagedCluster.Properties.ProvisioningState)
377+
return false, fmt.Errorf("cluster %s is in state %s, won't retry", name, *cluster.ManagedCluster.Properties.ProvisioningState)
369378
}
370379
})
371380
if err != nil {
372381
return nil, fmt.Errorf("failed to wait for cluster %s to be ready: %w", name, err)
373382
}
383+
if clusterDeleted {
384+
return nil, nil
385+
}
374386
return &cluster.ManagedCluster, nil
375387
}
376388

@@ -538,13 +550,30 @@ func createNewBastion(ctx context.Context, cluster *armcontainerservice.ManagedC
538550
}
539551

540552
var bastionSubnetID string
541-
bastionSubnet, subnetGetErr := config.Azure.Subnet.Get(ctx, nodeRG, vnet.name, bastionSubnetName, nil)
542-
if subnetGetErr != nil {
553+
var bastionSubnet armnetwork.SubnetsClientGetResponse
554+
var subnetGetErr error
555+
// Retry the subnet GET with a per-call timeout to tolerate ARM hangs.
556+
// Without this, a single unresponsive GET consumes the entire 20-minute cluster prep budget.
557+
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, 2*time.Minute, true, func(ctx context.Context) (bool, error) {
558+
callCtx, cancel := context.WithTimeout(ctx, 30*time.Second)
559+
defer cancel()
560+
bastionSubnet, subnetGetErr = config.Azure.Subnet.Get(callCtx, nodeRG, vnet.name, bastionSubnetName, nil)
561+
if subnetGetErr == nil {
562+
return true, nil
563+
}
543564
var subnetAzErr *azcore.ResponseError
544-
if !errors.As(subnetGetErr, &subnetAzErr) || subnetAzErr.StatusCode != http.StatusNotFound {
545-
return nil, fmt.Errorf("get subnet %q in vnet %q rg %q: %w", bastionSubnetName, vnet.name, nodeRG, subnetGetErr)
565+
if errors.As(subnetGetErr, &subnetAzErr) && subnetAzErr.StatusCode == http.StatusNotFound {
566+
return true, nil // 404 is expected — will create below
546567
}
568+
toolkit.Logf(ctx, "transient error getting subnet %q (retrying): %v", bastionSubnetName, subnetGetErr)
569+
return false, nil
570+
})
571+
if err != nil {
572+
return nil, fmt.Errorf("get subnet %q in vnet %q rg %q: retries exhausted: %w (last subnet error: %v)", bastionSubnetName, vnet.name, nodeRG, err, subnetGetErr)
573+
}
547574

575+
if subnetGetErr != nil {
576+
// 404 — need to create
548577
toolkit.Logf(ctx, "creating subnet %s in VNet %s (rg %s)", bastionSubnetName, vnet.name, nodeRG)
549578
subnetParams := armnetwork.Subnet{
550579
Properties: &armnetwork.SubnetPropertiesFormat{
@@ -733,28 +762,30 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
733762
defer toolkit.LogStepCtx(ctx, "collecting garbage VMSS")()
734763
rg := *cluster.Properties.NodeResourceGroup
735764

736-
// Build a set of all existing VMSS names while deleting old ones.
737-
existingVMSS := map[string]struct{}{}
765+
// Build a set of VMSS names that should be kept — exclude VMSS that are
766+
// being deleted so their stale K8s nodes can be cleaned up in the same pass.
767+
keptVMSS := map[string]struct{}{}
738768
pager := config.Azure.VMSS.NewListPager(rg, nil)
739769
for pager.More() {
740770
page, err := pager.NextPage(ctx)
741771
if err != nil {
742772
return fmt.Errorf("failed to get next page of VMSS: %w", err)
743773
}
744774
for _, vmss := range page.Value {
745-
existingVMSS[*vmss.Name] = struct{}{}
746-
747775
if _, ok := vmss.Tags["KEEP_VMSS"]; ok {
776+
keptVMSS[*vmss.Name] = struct{}{}
748777
continue
749778
}
750779
// don't delete managed pools
751780
if _, ok := vmss.Tags["aks-managed-poolName"]; ok {
781+
keptVMSS[*vmss.Name] = struct{}{}
752782
continue
753783
}
754784

755785
// don't delete VMSS created in the last hour. They might be currently used in tests
756786
// extra 10 minutes is a buffer for test cleanup, clock drift and timeout adjustments
757787
if config.Config.TestTimeout == 0 || time.Since(*vmss.Properties.TimeCreated) < config.Config.TestTimeout+10*time.Minute {
788+
keptVMSS[*vmss.Name] = struct{}{}
758789
continue
759790
}
760791

@@ -763,13 +794,16 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
763794
})
764795
if err != nil {
765796
toolkit.Logf(ctx, "failed to delete vmss %q: %s", *vmss.Name, err)
797+
// Keep in map so we don't try to delete its nodes while VMSS is still around
798+
keptVMSS[*vmss.Name] = struct{}{}
766799
continue
767800
}
768801
toolkit.Logf(ctx, "deleted vmss %q (age: %v)", *vmss.ID, time.Since(*vmss.Properties.TimeCreated))
802+
// Don't add to keptVMSS — nodes from this VMSS should be cleaned up
769803
}
770804
}
771805

772-
if err := collectGarbageNodes(ctx, kube, existingVMSS); err != nil {
806+
if err := collectGarbageNodes(ctx, kube, keptVMSS); err != nil {
773807
return fmt.Errorf("failed to collect garbage K8s nodes: %w", err)
774808
}
775809
return nil
@@ -779,15 +813,15 @@ func collectGarbageVMSS(ctx context.Context, cluster *armcontainerservice.Manage
779813
// longer exists. This prevents stale nodes from accumulating in the cluster
780814
// and overwhelming the cloud-provider-azure route controller with perpetual
781815
// "instance not found" failures.
782-
func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map[string]struct{}) error {
816+
func collectGarbageNodes(ctx context.Context, kube *Kubeclient, keptVMSS map[string]struct{}) error {
783817
defer toolkit.LogStepCtx(ctx, "collecting garbage K8s nodes")()
784818

785819
nodes, err := kube.Typed.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
786820
if err != nil {
787821
return fmt.Errorf("listing K8s nodes for garbage collection: %w", err)
788822
}
789823

790-
var deleteErrors []error
824+
var deleted, failed int
791825
for _, node := range nodes.Items {
792826
// skip managed pool nodes (system nodepool)
793827
if strings.HasPrefix(node.Name, "aks-") {
@@ -800,19 +834,26 @@ func collectGarbageNodes(ctx context.Context, kube *Kubeclient, existingVMSS map
800834
}
801835
vmssName := node.Name[:len(node.Name)-6]
802836

803-
if _, exists := existingVMSS[vmssName]; exists {
837+
if _, exists := keptVMSS[vmssName]; exists {
804838
continue
805839
}
806840

807841
if err := kube.Typed.CoreV1().Nodes().Delete(ctx, node.Name, metav1.DeleteOptions{}); err != nil {
808-
deleteErrors = append(deleteErrors, fmt.Errorf("deleting stale node %q: %w", node.Name, err))
842+
if apierrors.IsNotFound(err) {
843+
toolkit.Logf(ctx, "stale K8s node %q already gone", node.Name)
844+
deleted++
845+
continue
846+
}
847+
toolkit.Logf(ctx, "warning: failed to delete stale K8s node %q: %v", node.Name, err)
848+
failed++
809849
continue
810850
}
811851
toolkit.Logf(ctx, "deleted stale K8s node %q (VMSS %q not found)", node.Name, vmssName)
852+
deleted++
812853
}
813854

814-
if len(deleteErrors) > 0 {
815-
return fmt.Errorf("failed to delete %d stale nodes, first error: %w", len(deleteErrors), deleteErrors[0])
855+
if failed > 0 && deleted == 0 {
856+
return fmt.Errorf("failed to delete any of %d stale nodes", failed)
816857
}
817858
return nil
818859
}

0 commit comments

Comments
 (0)