@@ -13,7 +13,6 @@ import (
1313 tftypes "github.com/hashicorp/terraform-plugin-framework/types"
1414 "github.com/hashicorp/terraform-plugin-log/tflog"
1515 "github.com/pkg/errors"
16- "golang.org/x/exp/slices"
1716 "google.golang.org/genproto/googleapis/api/httpbody"
1817 "google.golang.org/grpc/codes"
1918 "google.golang.org/grpc/status"
@@ -204,7 +203,7 @@ func (r *AkpClusterResource) upsert(ctx context.Context, diagnostics *diag.Diagn
204203 if diagnostics .HasError () {
205204 return nil , nil
206205 }
207- result , err := r .applyInstance (ctx , plan , apiReq , isCreate , r .akpCli .Cli .ApplyInstance , r .upsertKubeConfig )
206+ result , err := r .applyInstance (ctx , plan , apiReq , isCreate , r .akpCli .Cli .ApplyInstance , r .upsertKubeConfig , r . waitForReconciliation , r . waitForHealth )
208207 // Always refresh cluster state to ensure we have consistent state, even if kubeconfig application failed
209208 if result != nil {
210209 if akiSanitized {
@@ -219,7 +218,16 @@ func (r *AkpClusterResource) upsert(ctx context.Context, diagnostics *diag.Diagn
219218 return result , err
220219}
221220
222- func (r * AkpClusterResource ) applyInstance (ctx context.Context , plan * types.Cluster , apiReq * argocdv1.ApplyInstanceRequest , isCreate bool , applyInstance func (context.Context , * argocdv1.ApplyInstanceRequest ) (* argocdv1.ApplyInstanceResponse , error ), upsertKubeConfig func (ctx context.Context , plan * types.Cluster ) error ) (* types.Cluster , error ) {
221+ func (r * AkpClusterResource ) applyInstance (
222+ ctx context.Context ,
223+ plan * types.Cluster ,
224+ apiReq * argocdv1.ApplyInstanceRequest ,
225+ isCreate bool ,
226+ applyInstance func (context.Context , * argocdv1.ApplyInstanceRequest ) (* argocdv1.ApplyInstanceResponse , error ),
227+ upsertKubeConfig func (ctx context.Context , plan * types.Cluster ) error ,
228+ waitForReconciliation func (ctx context.Context , plan * types.Cluster ) error ,
229+ waitForHealth func (ctx context.Context , plan * types.Cluster ) error ,
230+ ) (* types.Cluster , error ) {
223231 kubeconfig := plan .Kubeconfig
224232 plan .Kubeconfig = nil
225233 tflog .Debug (ctx , fmt .Sprintf ("Apply cluster request: %s" , apiReq ))
@@ -241,14 +249,25 @@ func (r *AkpClusterResource) applyInstance(ctx context.Context, plan *types.Clus
241249 return nil , fmt .Errorf ("unable to create Argo CD instance: %s" , err )
242250 }
243251
252+ if err := waitForReconciliation (ctx , plan ); err != nil {
253+ if isCreate {
254+ tflog .Warn (ctx , fmt .Sprintf ("Cluster reconciliation failed during create, cleaning up cluster %s" , plan .Name .ValueString ()))
255+ cleanupErr := r .deleteCluster (ctx , plan , plan .RemoveAgentResourcesOnDestroy .ValueBool (), true )
256+ if cleanupErr != nil {
257+ tflog .Error (ctx , fmt .Sprintf ("Failed to clean up dangling cluster %s: %v" , plan .Name .ValueString (), cleanupErr ))
258+ return nil , fmt .Errorf ("cluster reconciliation failed: %s (and failed to clean up cluster: %s)" , err , cleanupErr )
259+ }
260+ tflog .Info (ctx , fmt .Sprintf ("Successfully cleaned up dangling cluster %s" , plan .Name .ValueString ()))
261+ }
262+ return nil , fmt .Errorf ("cluster reconciliation failed: %w" , err )
263+ }
264+
244265 if kubeconfig != nil {
245266 plan .Kubeconfig = kubeconfig
246267 shouldApply := isCreate || plan .ReapplyManifestsOnUpdate .ValueBool ()
247268 if shouldApply {
248269 err = upsertKubeConfig (ctx , plan )
249270 if err != nil {
250- // If this is a create operation and kubeconfig application fails,
251- // clean up the dangling cluster from the API
252271 if isCreate {
253272 tflog .Warn (ctx , fmt .Sprintf ("Kubeconfig application failed during create, cleaning up cluster %s" , plan .Name .ValueString ()))
254273 cleanupErr := r .deleteCluster (ctx , plan , plan .RemoveAgentResourcesOnDestroy .ValueBool (), true )
@@ -259,10 +278,20 @@ func (r *AkpClusterResource) applyInstance(ctx context.Context, plan *types.Clus
259278 tflog .Info (ctx , fmt .Sprintf ("Successfully cleaned up dangling cluster %s" , plan .Name .ValueString ()))
260279 return nil , fmt .Errorf ("unable to apply manifests: %s" , err )
261280 }
262- // For updates, just ensure kubeconfig won't be committed to state by setting it to nil
263281 plan .Kubeconfig = nil
264282 return plan , fmt .Errorf ("unable to apply manifests: %s" , err )
265283 }
284+ } else {
285+ if err := waitForHealth (ctx , plan ); err != nil {
286+ return plan , fmt .Errorf ("cluster health check failed: %w" , err )
287+ }
288+ }
289+ } else {
290+ agentWillAutoUpdate := ! isCreate && plan .Spec != nil && ! plan .Spec .Data .AutoUpgradeDisabled .ValueBool ()
291+ if agentWillAutoUpdate {
292+ if err := waitForHealth (ctx , plan ); err != nil {
293+ return plan , fmt .Errorf ("cluster health check failed: %w" , err )
294+ }
266295 }
267296 }
268297
@@ -287,11 +316,19 @@ func (r *AkpClusterResource) upsertKubeConfig(ctx context.Context, plan *types.C
287316 if err != nil {
288317 return err
289318 }
290- return waitClusterHealthStatus (ctx , r .akpCli .Cli , r .akpCli .OrgId , plan )
319+ return waitClusterHealthStatus (ctx , r .akpCli .Cli , r .akpCli .OrgId , plan , plan . EnsureHealthy . ValueBool () )
291320 }
292321 return nil
293322}
294323
324+ func (r * AkpClusterResource ) waitForReconciliation (ctx context.Context , plan * types.Cluster ) error {
325+ return waitForClusterReconciliation (ctx , r .akpCli .Cli , r .akpCli .OrgId , plan )
326+ }
327+
328+ func (r * AkpClusterResource ) waitForHealth (ctx context.Context , plan * types.Cluster ) error {
329+ return waitClusterHealthStatus (ctx , r .akpCli .Cli , r .akpCli .OrgId , plan , plan .EnsureHealthy .ValueBool ())
330+ }
331+
295332func refreshClusterState (ctx context.Context , diagnostics * diag.Diagnostics , client argocdv1.ArgoCDServiceGatewayClient , cluster * types.Cluster ,
296333 orgID string , plan * types.Cluster ,
297334) error {
@@ -437,53 +474,145 @@ func deleteManifests(ctx context.Context, manifests string, cfg *rest.Config) er
437474 return nil
438475}
439476
440- func waitClusterHealthStatus (ctx context.Context , client argocdv1.ArgoCDServiceGatewayClient , orgID string , c * types.Cluster ) error {
441- cluster := & argocdv1.Cluster {}
442- healthStatus := cluster .GetHealthStatus ()
443- breakStatusesHealth := []healthv1.StatusCode {healthv1 .StatusCode_STATUS_CODE_HEALTHY , healthv1 .StatusCode_STATUS_CODE_DEGRADED }
444-
445- for ! slices .Contains (breakStatusesHealth , healthStatus .GetCode ()) {
446- time .Sleep (1 * time .Second )
447- apiResp , err := retryWithBackoff (ctx , func (ctx context.Context ) (* argocdv1.GetInstanceClusterResponse , error ) {
448- return client .GetInstanceCluster (ctx , & argocdv1.GetInstanceClusterRequest {
449- OrganizationId : orgID ,
450- InstanceId : c .InstanceID .ValueString (),
451- Id : c .Name .ValueString (),
452- IdType : idv1 .Type_NAME ,
453- })
454- }, "GetInstanceCluster" )
455- if err != nil {
456- return err
477+ func waitClusterHealthStatus (ctx context.Context , client argocdv1.ArgoCDServiceGatewayClient , orgID string , c * types.Cluster , ensureHealthy bool ) error {
478+ const (
479+ healthStatusTimeout = 10 * time .Minute
480+ healthStatusPollInterval = 5 * time .Second
481+ )
482+
483+ targetStatuses := []healthv1.StatusCode {
484+ healthv1 .StatusCode_STATUS_CODE_HEALTHY ,
485+ }
486+ if ! ensureHealthy {
487+ return nil
488+ }
489+
490+ var lastHealthMessage string
491+ if err := waitForStatus (
492+ ctx ,
493+ func (ctx context.Context ) (* argocdv1.Cluster , error ) {
494+ resp , err := retryWithBackoff (ctx , func (ctx context.Context ) (* argocdv1.GetInstanceClusterResponse , error ) {
495+ return client .GetInstanceCluster (ctx , & argocdv1.GetInstanceClusterRequest {
496+ OrganizationId : orgID ,
497+ InstanceId : c .InstanceID .ValueString (),
498+ Id : c .Name .ValueString (),
499+ IdType : idv1 .Type_NAME ,
500+ })
501+ }, "GetInstanceCluster" )
502+ if err != nil {
503+ return nil , err
504+ }
505+ return resp .GetCluster (), nil
506+ },
507+ func (cluster * argocdv1.Cluster ) healthv1.StatusCode {
508+ healthStatus := cluster .GetHealthStatus ()
509+ if msg := healthStatus .GetMessage (); msg != "" {
510+ lastHealthMessage = msg
511+ }
512+ return healthStatus .GetCode ()
513+ },
514+ targetStatuses ,
515+ healthStatusPollInterval ,
516+ healthStatusTimeout ,
517+ c .Name .ValueString (),
518+ "health" ,
519+ ); err != nil {
520+ var errMsg strings.Builder
521+ errMsg .WriteString (err .Error ())
522+
523+ if lastHealthMessage != "" {
524+ errMsg .WriteString (fmt .Sprintf ("\n \n Health status message: %s" , lastHealthMessage ))
457525 }
458- cluster = apiResp .GetCluster ()
459- healthStatus = cluster .GetHealthStatus ()
460- tflog .Debug (ctx , fmt .Sprintf ("Cluster health status: %s" , healthStatus .String ()))
526+
527+ errMsg .WriteString ("\n \n Troubleshooting steps:" )
528+ errMsg .WriteString ("\n 1. Check the cluster health in the Akuity Console" )
529+ errMsg .WriteString ("\n 2. Verify the Akuity agent is running in the cluster (for example in akuity namespace):" )
530+ errMsg .WriteString ("\n kubectl get pods -n akuity" )
531+ errMsg .WriteString ("\n 3. Check agent logs for errors:" )
532+ errMsg .WriteString ("\n kubectl logs -n akuity -l app.kubernetes.io/name=akuity-agent --tail=100" )
533+ errMsg .WriteString ("\n 4. Ensure the cluster has network connectivity to the Akuity Platform" )
534+
535+ return errors .New (errMsg .String ())
461536 }
537+
462538 return nil
463539}
464540
465541func waitClusterReconStatus (ctx context.Context , client argocdv1.ArgoCDServiceGatewayClient , cluster * argocdv1.Cluster , orgId , instanceId string ) (* argocdv1.Cluster , error ) {
466- reconStatus := cluster .GetReconciliationStatus ()
467- breakStatusesRecon := []reconv1.StatusCode {reconv1 .StatusCode_STATUS_CODE_SUCCESSFUL , reconv1 .StatusCode_STATUS_CODE_FAILED }
468-
469- for ! slices .Contains (breakStatusesRecon , reconStatus .GetCode ()) {
470- time .Sleep (1 * time .Second )
471- apiResp , err := retryWithBackoff (ctx , func (ctx context.Context ) (* argocdv1.GetInstanceClusterResponse , error ) {
472- return client .GetInstanceCluster (ctx , & argocdv1.GetInstanceClusterRequest {
473- OrganizationId : orgId ,
474- InstanceId : instanceId ,
475- Id : cluster .Id ,
476- IdType : idv1 .Type_ID ,
477- })
478- }, "GetInstanceCluster" )
479- if err != nil {
480- return nil , err
542+ const (
543+ reconStatusTimeout = 10 * time .Minute
544+ reconStatusPollInterval = 5 * time .Second
545+ )
546+
547+ targetStatuses := []reconv1.StatusCode {
548+ reconv1 .StatusCode_STATUS_CODE_SUCCESSFUL ,
549+ reconv1 .StatusCode_STATUS_CODE_FAILED ,
550+ }
551+
552+ var finalCluster * argocdv1.Cluster
553+
554+ err := waitForStatus (
555+ ctx ,
556+ func (ctx context.Context ) (* argocdv1.Cluster , error ) {
557+ resp , err := retryWithBackoff (ctx , func (ctx context.Context ) (* argocdv1.GetInstanceClusterResponse , error ) {
558+ return client .GetInstanceCluster (ctx , & argocdv1.GetInstanceClusterRequest {
559+ OrganizationId : orgId ,
560+ InstanceId : instanceId ,
561+ Id : cluster .Id ,
562+ IdType : idv1 .Type_ID ,
563+ })
564+ }, "GetInstanceCluster" )
565+ if err != nil {
566+ return nil , err
567+ }
568+ finalCluster = resp .GetCluster ()
569+ return finalCluster , nil
570+ },
571+ func (cluster * argocdv1.Cluster ) reconv1.StatusCode {
572+ return cluster .GetReconciliationStatus ().GetCode ()
573+ },
574+ targetStatuses ,
575+ reconStatusPollInterval ,
576+ reconStatusTimeout ,
577+ cluster .Name ,
578+ "reconciliation" ,
579+ )
580+ if err != nil {
581+ return nil , err
582+ }
583+
584+ return finalCluster , nil
585+ }
586+
587+ func waitForClusterReconciliation (ctx context.Context , client argocdv1.ArgoCDServiceGatewayClient , orgID string , plan * types.Cluster ) error {
588+ clusterReq := & argocdv1.GetInstanceClusterRequest {
589+ OrganizationId : orgID ,
590+ InstanceId : plan .InstanceID .ValueString (),
591+ Id : plan .Name .ValueString (),
592+ IdType : idv1 .Type_NAME ,
593+ }
594+
595+ clusterResp , err := retryWithBackoff (ctx , func (ctx context.Context ) (* argocdv1.GetInstanceClusterResponse , error ) {
596+ return client .GetInstanceCluster (ctx , clusterReq )
597+ }, "GetInstanceCluster" )
598+ if err != nil {
599+ return errors .Wrap (err , "unable to get cluster for reconciliation check" )
600+ }
601+
602+ finalCluster , err := waitClusterReconStatus (ctx , client , clusterResp .GetCluster (), orgID , plan .InstanceID .ValueString ())
603+ if err != nil {
604+ return errors .Wrap (err , "unable to wait for cluster reconciliation" )
605+ }
606+
607+ if finalCluster .GetReconciliationStatus ().GetCode () == reconv1 .StatusCode_STATUS_CODE_FAILED {
608+ msg := finalCluster .GetReconciliationStatus ().GetMessage ()
609+ if msg == "" {
610+ msg = "cluster reconciliation failed"
481611 }
482- cluster = apiResp .GetCluster ()
483- reconStatus = cluster .GetReconciliationStatus ()
484- tflog .Debug (ctx , fmt .Sprintf ("Cluster recon status: %s" , reconStatus .String ()))
612+ return errors .New (msg )
485613 }
486- return cluster , nil
614+
615+ return nil
487616}
488617
489618func readStream (resChan <- chan * httpbody.HttpBody , errChan <- chan error ) ([]byte , error ) {
0 commit comments