Skip to content

Commit 73dc45e

Browse files
committed
Fix LoadBalancer hang by cleaning up orphaned NIC on quota failures
When VM provisioning fails due to Azure quota exhaustion, CAPZ leaves the NIC orphaned (without VirtualMachine.ID). This causes cloud-provider-azure's LoadBalancer reconciler to fail cluster-wide when querying NICs. This change implements defensive cleanup of orphaned NICs after a 1-minute grace period when quota errors are detected. The Machine is then marked as permanently Failed to avoid recreation loops. Changes: - Add quota error detection using azcore.ResponseError with multiple error codes - Track quota failures via VMRunningCondition (uses existing CAPI conditions API) - Delete orphaned NIC if quota error persists after 1-minute grace period - Mark Machine as Failed with FailureReason="QuotaExhausted" - Add unit tests for condition-based grace period and quota detection Fixes cluster-wide LoadBalancer failures during quota exhaustion while avoiding recreation loops by marking Machines as permanently Failed. AI assisted code Signed-off-by: Chetan Giradkar <cgiradka@redhat.com>
1 parent 47cc48d commit 73dc45e

2 files changed

Lines changed: 329 additions & 0 deletions

File tree

controllers/azuremachine_controller.go

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,11 @@ package controllers
1919
import (
2020
"context"
2121
"fmt"
22+
"net/http"
23+
"strings"
24+
"time"
2225

26+
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
2327
"github.com/pkg/errors"
2428
corev1 "k8s.io/api/core/v1"
2529
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -41,6 +45,8 @@ import (
4145
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
4246
"sigs.k8s.io/cluster-api-provider-azure/azure"
4347
"sigs.k8s.io/cluster-api-provider-azure/azure/scope"
48+
"sigs.k8s.io/cluster-api-provider-azure/azure/services/networkinterfaces"
49+
"sigs.k8s.io/cluster-api-provider-azure/azure/services/resourceskus"
4450
"sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
4551
"sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
4652
"sigs.k8s.io/cluster-api-provider-azure/util/tele"
@@ -235,6 +241,119 @@ func (amr *AzureMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reque
235241
return amr.reconcileNormal(ctx, machineScope, clusterScope)
236242
}
237243

244+
// quotaExceededErrorCodes contains Azure error codes that indicate quota exhaustion.
245+
// Reference: https://learn.microsoft.com/en-us/azure/azure-resource-manager/troubleshooting/error-resource-quota
246+
var quotaExceededErrorCodes = []string{
247+
"QuotaExceeded",
248+
"OperationNotAllowed",
249+
"ResourceQuotaExceeded",
250+
"DeploymentQuotaExceeded",
251+
}
252+
253+
// isQuotaExceededError checks if error is due to Azure quota exhaustion.
254+
func isQuotaExceededError(err error) bool {
255+
if err == nil {
256+
return false
257+
}
258+
259+
var respErr *azcore.ResponseError
260+
if errors.As(err, &respErr) {
261+
for _, code := range quotaExceededErrorCodes {
262+
if respErr.ErrorCode == code {
263+
return true
264+
}
265+
}
266+
267+
if respErr.RawResponse != nil {
268+
errMsg := strings.ToLower(respErr.Error())
269+
return strings.Contains(errMsg, "quota") && strings.Contains(errMsg, "exceeded")
270+
}
271+
}
272+
273+
return false
274+
}
275+
276+
// shouldCleanupNIC checks if grace period has elapsed since VMRunningCondition
277+
// was set to False with QuotaExhausted reason.
278+
func shouldCleanupNIC(machineScope *scope.MachineScope) bool {
279+
condition := v1beta1conditions.Get(machineScope.AzureMachine, infrav1.VMRunningCondition)
280+
if condition == nil {
281+
return false
282+
}
283+
284+
if condition.Status != corev1.ConditionFalse || condition.Reason != "QuotaExhausted" {
285+
return false
286+
}
287+
288+
gracePeriod := 1 * time.Minute
289+
return time.Since(condition.LastTransitionTime.Time) >= gracePeriod
290+
}
291+
292+
// markQuotaFailureCondition sets VMRunningCondition to False with QuotaExhausted reason.
293+
func markQuotaFailureCondition(machineScope *scope.MachineScope, err error) {
294+
v1beta1conditions.MarkFalse(
295+
machineScope.AzureMachine,
296+
infrav1.VMRunningCondition,
297+
"QuotaExhausted",
298+
clusterv1beta1.ConditionSeverityWarning,
299+
"VM creation failed due to quota exhaustion: %s", err.Error(),
300+
)
301+
}
302+
303+
// cleanupOrphanedNIC deletes the NIC when it's orphaned due to VM creation failure.
304+
func (amr *AzureMachineReconciler) cleanupOrphanedNIC(ctx context.Context, machineScope *scope.MachineScope) error {
305+
ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.cleanupOrphanedNIC")
306+
defer done()
307+
308+
skuCache, err := resourceskus.GetCache(machineScope, machineScope.Location())
309+
if err != nil {
310+
return errors.Wrap(err, "failed to get resource SKU cache")
311+
}
312+
313+
nicSvc, err := networkinterfaces.New(machineScope, skuCache)
314+
if err != nil {
315+
return errors.Wrap(err, "failed to create networkinterfaces service")
316+
}
317+
318+
if err := nicSvc.Delete(ctx); err != nil {
319+
var respErr *azcore.ResponseError
320+
if errors.As(err, &respErr) && respErr.StatusCode == http.StatusNotFound {
321+
log.Info("NIC already deleted")
322+
return nil
323+
}
324+
return errors.Wrap(err, "failed to delete orphaned NIC")
325+
}
326+
327+
log.Info("Orphaned NIC deleted successfully")
328+
return nil
329+
}
330+
331+
/*
332+
// getRetryCount retrieves retry count from Machine annotations.
333+
// Used by exponential backoff retry mechanism (Option 1).
334+
func getRetryCount(machineScope *scope.MachineScope) int {
335+
countStr, exists := machineScope.AzureMachine.Annotations["azure.infrastructure.cluster.x-k8s.io/quota-retry-count"]
336+
if !exists {
337+
return 0
338+
}
339+
340+
count, err := strconv.Atoi(countStr)
341+
if err != nil {
342+
return 0
343+
}
344+
return count
345+
}
346+
347+
// setRetryCount stores retry count in Machine annotations.
348+
// Used by exponential backoff retry mechanism (Option 1).
349+
func setRetryCount(machineScope *scope.MachineScope, count int) {
350+
if machineScope.AzureMachine.Annotations == nil {
351+
machineScope.AzureMachine.Annotations = make(map[string]string)
352+
}
353+
machineScope.AzureMachine.Annotations["azure.infrastructure.cluster.x-k8s.io/quota-retry-count"] = strconv.Itoa(count)
354+
}.
355+
*/
356+
238357
func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineScope *scope.MachineScope, clusterScope *scope.ClusterScope) (reconcile.Result, error) {
239358
ctx, log, done := tele.StartSpanWithLogger(ctx, "controllers.AzureMachineReconciler.reconcileNormal")
240359
defer done()
@@ -312,6 +431,48 @@ func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineS
312431
return reconcile.Result{}, errors.Wrap(err, "failed to reconcile AzureMachine")
313432
}
314433

434+
// Check for quota errors and cleanup orphaned NIC if grace period expired
435+
if isQuotaExceededError(err) {
436+
if shouldCleanupNIC(machineScope) {
437+
log.Info("Grace period expired, cleaning up orphaned NIC", "error", err.Error())
438+
if nicErr := amr.cleanupOrphanedNIC(ctx, machineScope); nicErr != nil {
439+
log.Error(nicErr, "Failed to delete orphaned NIC during quota failure cleanup")
440+
return ctrl.Result{}, nicErr
441+
}
442+
443+
// Mark Machine as permanently Failed
444+
machineScope.SetFailureReason("QuotaExhausted")
445+
machineScope.SetFailureMessage(fmt.Errorf("VM creation failed due to quota exhaustion"))
446+
machineScope.SetNotReady()
447+
448+
// Option 2: Fail permanently (ACTIVE)
449+
log.Info("Machine marked as Failed due to persistent quota exhaustion")
450+
return ctrl.Result{}, nil
451+
452+
// Option 1: Retry with exponential backoff (COMMENTED OUT)
453+
// Uncomment this block and comment out the above return to enable retries
454+
//nolint:gocritic // Intentionally commented code for alternative retry strategy
455+
/*
456+
// Calculate retry count from condition observation
457+
retryCount := getRetryCount(machineScope)
458+
if retryCount >= 3 {
459+
log.Info("Max retries exceeded, failing permanently")
460+
return ctrl.Result{}, nil
461+
}
462+
463+
// Exponential backoff: 15min, 30min, 60min
464+
backoffDuration := time.Duration(math.Pow(2, float64(retryCount))) * 15 * time.Minute
465+
log.Info("Will retry VM creation", "attempt", retryCount+1, "backoff", backoffDuration)
466+
setRetryCount(machineScope, retryCount+1)
467+
return ctrl.Result{RequeueAfter: backoffDuration}, nil
468+
*/
469+
}
470+
471+
markQuotaFailureCondition(machineScope, err)
472+
log.Info("Quota failure detected, will cleanup NIC on next reconcile if still failing",
473+
"gracePeriod", "1m", "error", err.Error())
474+
}
475+
315476
// Handle transient and terminal errors
316477
if errors.As(err, &reconcileError) {
317478
if reconcileError.IsTerminal() {

controllers/azuremachine_controller_test.go

Lines changed: 168 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,11 @@ package controllers
1818

1919
import (
2020
"context"
21+
"net/http"
2122
"testing"
2223
"time"
2324

25+
"github.com/Azure/azure-sdk-for-go/sdk/azcore"
2426
. "github.com/onsi/gomega"
2527
"github.com/pkg/errors"
2628
corev1 "k8s.io/api/core/v1"
@@ -31,6 +33,7 @@ import (
3133
"k8s.io/utils/ptr"
3234
clusterv1beta1 "sigs.k8s.io/cluster-api/api/core/v1beta1"
3335
clusterv1 "sigs.k8s.io/cluster-api/api/core/v1beta2"
36+
v1beta1conditions "sigs.k8s.io/cluster-api/util/deprecated/v1beta1/conditions"
3437
ctrl "sigs.k8s.io/controller-runtime"
3538
"sigs.k8s.io/controller-runtime/pkg/client"
3639
"sigs.k8s.io/controller-runtime/pkg/client/fake"
@@ -831,3 +834,168 @@ func conditionsMatch(i, j clusterv1beta1.Condition) bool {
831834
i.Reason == j.Reason &&
832835
i.Severity == j.Severity
833836
}
837+
838+
func TestIsQuotaExceededError(t *testing.T) {
839+
g := NewWithT(t)
840+
841+
tests := []struct {
842+
name string
843+
err error
844+
expected bool
845+
}{
846+
{
847+
name: "nil error",
848+
err: nil,
849+
expected: false,
850+
},
851+
{
852+
name: "non-azcore error",
853+
err: errors.New("some random error"),
854+
expected: false,
855+
},
856+
{
857+
name: "azcore error with QuotaExceeded code",
858+
err: &azcore.ResponseError{ErrorCode: "QuotaExceeded"},
859+
expected: true,
860+
},
861+
{
862+
name: "azcore error with OperationNotAllowed code",
863+
err: &azcore.ResponseError{ErrorCode: "OperationNotAllowed"},
864+
expected: true,
865+
},
866+
{
867+
name: "azcore error with ResourceQuotaExceeded code",
868+
err: &azcore.ResponseError{ErrorCode: "ResourceQuotaExceeded"},
869+
expected: true,
870+
},
871+
{
872+
name: "azcore error with DeploymentQuotaExceeded code",
873+
err: &azcore.ResponseError{ErrorCode: "DeploymentQuotaExceeded"},
874+
expected: true,
875+
},
876+
{
877+
name: "azcore error with different code",
878+
err: &azcore.ResponseError{ErrorCode: "NetworkTimeout"},
879+
expected: false,
880+
},
881+
{
882+
name: "azcore error with quota in message",
883+
err: &azcore.ResponseError{
884+
RawResponse: &http.Response{StatusCode: http.StatusBadRequest},
885+
},
886+
expected: false, // No message set, so won't match
887+
},
888+
}
889+
890+
for _, tt := range tests {
891+
t.Run(tt.name, func(t *testing.T) {
892+
result := isQuotaExceededError(tt.err)
893+
g.Expect(result).To(Equal(tt.expected))
894+
})
895+
}
896+
}
897+
898+
func TestShouldCleanupNIC(t *testing.T) {
899+
g := NewWithT(t)
900+
901+
tests := []struct {
902+
name string
903+
condition *clusterv1beta1.Condition
904+
shouldCleanup bool
905+
}{
906+
{
907+
name: "No condition",
908+
condition: nil,
909+
shouldCleanup: false,
910+
},
911+
{
912+
name: "Condition not QuotaExhausted",
913+
condition: &clusterv1beta1.Condition{
914+
Type: infrav1.VMRunningCondition,
915+
Status: corev1.ConditionFalse,
916+
Reason: "OtherReason",
917+
LastTransitionTime: metav1.Time{Time: time.Now().Add(-2 * time.Minute)},
918+
},
919+
shouldCleanup: false,
920+
},
921+
{
922+
name: "Grace period not elapsed",
923+
condition: &clusterv1beta1.Condition{
924+
Type: infrav1.VMRunningCondition,
925+
Status: corev1.ConditionFalse,
926+
Reason: "QuotaExhausted",
927+
LastTransitionTime: metav1.Time{Time: time.Now().Add(-30 * time.Second)},
928+
},
929+
shouldCleanup: false,
930+
},
931+
{
932+
name: "Grace period elapsed",
933+
condition: &clusterv1beta1.Condition{
934+
Type: infrav1.VMRunningCondition,
935+
Status: corev1.ConditionFalse,
936+
Reason: "QuotaExhausted",
937+
LastTransitionTime: metav1.Time{Time: time.Now().Add(-2 * time.Minute)},
938+
},
939+
shouldCleanup: true,
940+
},
941+
}
942+
943+
for _, tt := range tests {
944+
t.Run(tt.name, func(t *testing.T) {
945+
azureMachine := &infrav1.AzureMachine{
946+
ObjectMeta: metav1.ObjectMeta{
947+
Name: "test-machine",
948+
Namespace: "default",
949+
},
950+
}
951+
if tt.condition != nil {
952+
v1beta1conditions.Set(azureMachine, tt.condition)
953+
}
954+
machineScope := &scope.MachineScope{
955+
AzureMachine: azureMachine,
956+
}
957+
958+
result := shouldCleanupNIC(machineScope)
959+
g.Expect(result).To(Equal(tt.shouldCleanup))
960+
})
961+
}
962+
}
963+
964+
func TestMarkQuotaFailureCondition(t *testing.T) {
965+
g := NewWithT(t)
966+
967+
tests := []struct {
968+
name string
969+
err error
970+
}{
971+
{
972+
name: "marks condition with quota error",
973+
err: errors.New("quota exceeded for resource"),
974+
},
975+
}
976+
977+
for _, tt := range tests {
978+
t.Run(tt.name, func(t *testing.T) {
979+
azureMachine := &infrav1.AzureMachine{
980+
ObjectMeta: metav1.ObjectMeta{
981+
Name: "test-machine",
982+
Namespace: "default",
983+
},
984+
}
985+
986+
machineScope := &scope.MachineScope{
987+
AzureMachine: azureMachine,
988+
}
989+
990+
markQuotaFailureCondition(machineScope, tt.err)
991+
992+
condition := v1beta1conditions.Get(machineScope.AzureMachine, infrav1.VMRunningCondition)
993+
g.Expect(condition).NotTo(BeNil())
994+
g.Expect(condition.Status).To(Equal(corev1.ConditionFalse))
995+
g.Expect(condition.Reason).To(Equal("QuotaExhausted"))
996+
g.Expect(condition.Severity).To(Equal(clusterv1beta1.ConditionSeverityWarning))
997+
g.Expect(condition.Message).To(ContainSubstring("VM creation failed due to quota exhaustion"))
998+
g.Expect(condition.Message).To(ContainSubstring(tt.err.Error()))
999+
})
1000+
}
1001+
}

0 commit comments

Comments
 (0)