88 "io"
99 "log"
1010 "net/http"
11+ "strings"
1112 "time"
1213
1314 jsonpatch "github.com/evanphx/json-patch"
@@ -17,6 +18,7 @@ import (
1718 "github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
1819 "github.com/hashicorp/terraform-plugin-sdk/v2/helper/validation"
1920 "github.com/mitchellh/mapstructure"
21+ "k8s.io/apimachinery/pkg/util/wait"
2022
2123 "github.com/castai/terraform-provider-castai/castai/sdk"
2224 "github.com/castai/terraform-provider-castai/castai/types"
@@ -805,23 +807,73 @@ func updateAutoscalerPolicies(ctx context.Context, data *schema.ResourceData, me
805807 return nil
806808 }
807809
808- policies , err := getChangedPolicies (ctx , data , meta , clusterId )
809- if err != nil {
810- return err
810+ // Define the update operation that will be executed with retry logic
811+ updatePolicies := func () error {
812+ policies , err := getChangedPolicies (ctx , data , meta , clusterId )
813+ if err != nil {
814+ return err
815+ }
816+
817+ if policies == nil {
818+ log .Printf ("[DEBUG] changed policies json not calculated. Skipping autoscaler policies changes" )
819+ return nil
820+ }
821+
822+ changedPoliciesJSON := string (policies )
823+ if changedPoliciesJSON == "" {
824+ log .Printf ("[DEBUG] changed policies json not found. Skipping autoscaler policies changes" )
825+ return nil
826+ }
827+
828+ return upsertPolicies (ctx , meta , clusterId , changedPoliciesJSON )
811829 }
812830
813- if policies == nil {
814- log .Printf ("[DEBUG] changed policies json not calculated. Skipping autoscaler policies changes" )
831+ // Try to update policies immediately without any delay.
832+ err := updatePolicies ()
833+ if err == nil {
815834 return nil
816835 }
817836
818- changedPoliciesJSON := string (policies )
819- if changedPoliciesJSON == "" {
820- log .Printf ("[DEBUG] changed policies json not found. Skipping autoscaler policies changes" )
821- return nil
837+ // Check if error is retryable (node template version conflict)
838+ if ! isNodeTemplateVersionConflict (err ) {
839+ return err // Non-retryable error
840+ }
841+
842+ // Fall back to exponential backoff retry only if version conflict occurred.
843+ log .Printf ("[INFO] Node template version conflict detected, will retry with exponential backoff: %v" , err )
844+
845+ // Exponential backoff configuration
846+ backoff := wait.Backoff {
847+ Duration : 100 * time .Millisecond ,
848+ Factor : 2.0 ,
849+ Jitter : 0.1 ,
850+ Steps : 5 ,
851+ Cap : 2 * time .Second ,
822852 }
823853
824- return upsertPolicies (ctx , meta , clusterId , changedPoliciesJSON )
854+ retryErr := wait .ExponentialBackoffWithContext (ctx , backoff , func (ctx context.Context ) (done bool , err error ) {
855+ err = updatePolicies ()
856+ if err == nil {
857+ return true , nil // Success - stop retrying
858+ }
859+
860+ // Check if error is retryable
861+ if ! isNodeTemplateVersionConflict (err ) {
862+ return false , err // Non-retryable error - stop with error
863+ }
864+
865+ log .Printf ("[DEBUG] Retry failed with version conflict: %v" , err )
866+ return false , nil // Retryable error - continue retrying
867+ })
868+
869+ if retryErr != nil {
870+ if wait .Interrupted (retryErr ) {
871+ return fmt .Errorf ("timeout waiting for autoscaler policy update after version conflicts: %w" , err )
872+ }
873+ return retryErr
874+ }
875+
876+ return nil
825877}
826878
827879func upsertPolicies (ctx context.Context , meta interface {}, clusterId string , changedPoliciesJSON string ) error {
@@ -835,6 +887,15 @@ func upsertPolicies(ctx context.Context, meta interface{}, clusterId string, cha
835887 return nil
836888}
837889
// isNodeTemplateVersionConflict reports whether err's message contains one of
// the phrases this file treats as a node template version conflict:
// "template has changed" or "refetch the policies". A nil error never matches.
// Detection is a case-sensitive substring match on err.Error().
func isNodeTemplateVersionConflict(err error) bool {
	if err == nil {
		return false
	}

	text := err.Error()
	for _, marker := range []string{"template has changed", "refetch the policies"} {
		if strings.Contains(text, marker) {
			return true
		}
	}
	return false
}
898+
838899func readAutoscalerPolicies (ctx context.Context , data * schema.ResourceData , meta interface {}) error {
839900 log .Printf ("[INFO] AUTOSCALER policies get call start" )
840901 defer log .Printf ("[INFO] AUTOSCALER policies get call end" )
0 commit comments