@@ -19,7 +19,11 @@ package controllers
1919import (
2020 "context"
2121 "fmt"
22+ "net/http"
23+ "strings"
24+ "time"
2225
26+ "github.com/Azure/azure-sdk-for-go/sdk/azcore"
2327 "github.com/pkg/errors"
2428 corev1 "k8s.io/api/core/v1"
2529 apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -41,6 +45,8 @@ import (
4145 infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1beta1"
4246 "sigs.k8s.io/cluster-api-provider-azure/azure"
4347 "sigs.k8s.io/cluster-api-provider-azure/azure/scope"
48+ "sigs.k8s.io/cluster-api-provider-azure/azure/services/networkinterfaces"
49+ "sigs.k8s.io/cluster-api-provider-azure/azure/services/resourceskus"
4450 "sigs.k8s.io/cluster-api-provider-azure/pkg/coalescing"
4551 "sigs.k8s.io/cluster-api-provider-azure/util/reconciler"
4652 "sigs.k8s.io/cluster-api-provider-azure/util/tele"
@@ -235,6 +241,119 @@ func (amr *AzureMachineReconciler) Reconcile(ctx context.Context, req ctrl.Reque
235241 return amr .reconcileNormal (ctx , machineScope , clusterScope )
236242}
237243
// quotaExceededErrorCodes lists the Azure error codes that are matched against
// a response's ErrorCode to recognise quota exhaustion.
// Reference: https://learn.microsoft.com/en-us/azure/azure-resource-manager/troubleshooting/error-resource-quota
var quotaExceededErrorCodes = []string{
	"QuotaExceeded",
	// NOTE(review): this code looks generic rather than quota-specific —
	// confirm that matching on it alone cannot produce false positives.
	"OperationNotAllowed",
	"ResourceQuotaExceeded",
	"DeploymentQuotaExceeded",
}
252+
253+ // isQuotaExceededError checks if error is due to Azure quota exhaustion.
254+ func isQuotaExceededError (err error ) bool {
255+ if err == nil {
256+ return false
257+ }
258+
259+ var respErr * azcore.ResponseError
260+ if errors .As (err , & respErr ) {
261+ for _ , code := range quotaExceededErrorCodes {
262+ if respErr .ErrorCode == code {
263+ return true
264+ }
265+ }
266+
267+ if respErr .RawResponse != nil {
268+ errMsg := strings .ToLower (respErr .Error ())
269+ return strings .Contains (errMsg , "quota" ) && strings .Contains (errMsg , "exceeded" )
270+ }
271+ }
272+
273+ return false
274+ }
275+
276+ // shouldCleanupNIC checks if grace period has elapsed since VMRunningCondition
277+ // was set to False with QuotaExhausted reason.
278+ func shouldCleanupNIC (machineScope * scope.MachineScope ) bool {
279+ condition := v1beta1conditions .Get (machineScope .AzureMachine , infrav1 .VMRunningCondition )
280+ if condition == nil {
281+ return false
282+ }
283+
284+ if condition .Status != corev1 .ConditionFalse || condition .Reason != "QuotaExhausted" {
285+ return false
286+ }
287+
288+ gracePeriod := 1 * time .Minute
289+ return time .Since (condition .LastTransitionTime .Time ) >= gracePeriod
290+ }
291+
292+ // markQuotaFailureCondition sets VMRunningCondition to False with QuotaExhausted reason.
293+ func markQuotaFailureCondition (machineScope * scope.MachineScope , err error ) {
294+ v1beta1conditions .MarkFalse (
295+ machineScope .AzureMachine ,
296+ infrav1 .VMRunningCondition ,
297+ "QuotaExhausted" ,
298+ clusterv1beta1 .ConditionSeverityWarning ,
299+ "VM creation failed due to quota exhaustion: %s" , err .Error (),
300+ )
301+ }
302+
303+ // cleanupOrphanedNIC deletes the NIC when it's orphaned due to VM creation failure.
304+ func (amr * AzureMachineReconciler ) cleanupOrphanedNIC (ctx context.Context , machineScope * scope.MachineScope ) error {
305+ ctx , log , done := tele .StartSpanWithLogger (ctx , "controllers.AzureMachineReconciler.cleanupOrphanedNIC" )
306+ defer done ()
307+
308+ skuCache , err := resourceskus .GetCache (machineScope , machineScope .Location ())
309+ if err != nil {
310+ return errors .Wrap (err , "failed to get resource SKU cache" )
311+ }
312+
313+ nicSvc , err := networkinterfaces .New (machineScope , skuCache )
314+ if err != nil {
315+ return errors .Wrap (err , "failed to create networkinterfaces service" )
316+ }
317+
318+ if err := nicSvc .Delete (ctx ); err != nil {
319+ var respErr * azcore.ResponseError
320+ if errors .As (err , & respErr ) && respErr .StatusCode == http .StatusNotFound {
321+ log .Info ("NIC already deleted" )
322+ return nil
323+ }
324+ return errors .Wrap (err , "failed to delete orphaned NIC" )
325+ }
326+
327+ log .Info ("Orphaned NIC deleted successfully" )
328+ return nil
329+ }
330+
331+ /*
332+ // getRetryCount retrieves the quota retry count from the AzureMachine's annotations.
333+ // Used by exponential backoff retry mechanism (Option 1).
334+ func getRetryCount(machineScope *scope.MachineScope) int {
335+ countStr, exists := machineScope.AzureMachine.Annotations["azure.infrastructure.cluster.x-k8s.io/quota-retry-count"]
336+ if !exists {
337+ return 0
338+ }
339+
340+ count, err := strconv.Atoi(countStr)
341+ if err != nil {
342+ return 0
343+ }
344+ return count
345+ }
346+
347+ // setRetryCount stores the quota retry count in the AzureMachine's annotations.
348+ // Used by exponential backoff retry mechanism (Option 1).
349+ func setRetryCount(machineScope *scope.MachineScope, count int) {
350+ if machineScope.AzureMachine.Annotations == nil {
351+ machineScope.AzureMachine.Annotations = make(map[string]string)
352+ }
353+ machineScope.AzureMachine.Annotations["azure.infrastructure.cluster.x-k8s.io/quota-retry-count"] = strconv.Itoa(count)
354+ }
355+ */
356+
238357func (amr * AzureMachineReconciler ) reconcileNormal (ctx context.Context , machineScope * scope.MachineScope , clusterScope * scope.ClusterScope ) (reconcile.Result , error ) {
239358 ctx , log , done := tele .StartSpanWithLogger (ctx , "controllers.AzureMachineReconciler.reconcileNormal" )
240359 defer done ()
@@ -312,6 +431,48 @@ func (amr *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineS
312431 return reconcile.Result {}, errors .Wrap (err , "failed to reconcile AzureMachine" )
313432 }
314433
434+ // Check for quota errors and cleanup orphaned NIC if grace period expired
435+ if isQuotaExceededError (err ) {
436+ if shouldCleanupNIC (machineScope ) {
437+ log .Info ("Grace period expired, cleaning up orphaned NIC" , "error" , err .Error ())
438+ if nicErr := amr .cleanupOrphanedNIC (ctx , machineScope ); nicErr != nil {
439+ log .Error (nicErr , "Failed to delete orphaned NIC during quota failure cleanup" )
440+ return ctrl.Result {}, nicErr
441+ }
442+
443+ // Mark Machine as permanently Failed
444+ machineScope .SetFailureReason ("QuotaExhausted" )
445+ machineScope .SetFailureMessage (fmt .Errorf ("VM creation failed due to quota exhaustion" ))
446+ machineScope .SetNotReady ()
447+
448+ // Option 2: Fail permanently (ACTIVE)
449+ log .Info ("Machine marked as Failed due to persistent quota exhaustion" )
450+ return ctrl.Result {}, nil
451+
452+ // Option 1: Retry with exponential backoff (COMMENTED OUT)
453+ // Uncomment this block and comment out the above return to enable retries
454+ //nolint:gocritic // Intentionally commented code for alternative retry strategy
455+ /*
456+ // Calculate retry count from condition observation
457+ retryCount := getRetryCount(machineScope)
458+ if retryCount >= 3 {
459+ log.Info("Max retries exceeded, failing permanently")
460+ return ctrl.Result{}, nil
461+ }
462+
463+ // Exponential backoff: 15min, 30min, 60min
464+ backoffDuration := time.Duration(math.Pow(2, float64(retryCount))) * 15 * time.Minute
465+ log.Info("Will retry VM creation", "attempt", retryCount+1, "backoff", backoffDuration)
466+ setRetryCount(machineScope, retryCount+1)
467+ return ctrl.Result{RequeueAfter: backoffDuration}, nil
468+ */
469+ }
470+
471+ markQuotaFailureCondition (machineScope , err )
472+ log .Info ("Quota failure detected, will cleanup NIC on next reconcile if still failing" ,
473+ "gracePeriod" , "1m" , "error" , err .Error ())
474+ }
475+
315476 // Handle transient and terminal errors
316477 if errors .As (err , & reconcileError ) {
317478 if reconcileError .IsTerminal () {
0 commit comments