
Commit 1dd4505

Automatic Terraform Provider Sync (#436)
Automatic sync from Akuity Platform
1 parent 8a95477 commit 1dd4505

12 files changed: +312 -113 lines changed

akp/data_source_akp_cluster_schema.go

Lines changed: 4 additions & 0 deletions
@@ -61,6 +61,10 @@ func getAKPClusterDataSourceAttributes() map[string]schema.Attribute {
 			MarkdownDescription: "Whether to reapply manifests on update",
 			Computed: true,
 		},
+		"ensure_healthy": schema.BoolAttribute{
+			MarkdownDescription: "If true, terraform apply will fail if the cluster agent becomes degraded or does not become healthy within the timeout period. When false (default), terraform will not wait for the resource status to be reported.",
+			Computed: true,
+		},
 	}
 }
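For context, the data source exposes the new attribute as a computed value that a configuration could read back. The snippet below is an illustrative Go test-fixture string, not code from this commit; the data source arguments are placeholders and may not match the real akp_cluster data source schema.

```go
// Hypothetical fixture reading the computed attribute through the data source;
// instance_id and name are placeholder arguments.
const exampleClusterDataSourceConfig = `
data "akp_cluster" "example" {
  instance_id = akp_instance.example.id # placeholder
  name        = "example-cluster"       # placeholder
}

output "cluster_ensure_healthy" {
  value = data.akp_cluster.example.ensure_healthy
}
`
```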

akp/resource_akp_cluster.go

Lines changed: 175 additions & 46 deletions
@@ -13,7 +13,6 @@ import (
 	tftypes "github.com/hashicorp/terraform-plugin-framework/types"
 	"github.com/hashicorp/terraform-plugin-log/tflog"
 	"github.com/pkg/errors"
-	"golang.org/x/exp/slices"
 	"google.golang.org/genproto/googleapis/api/httpbody"
 	"google.golang.org/grpc/codes"
 	"google.golang.org/grpc/status"
@@ -204,7 +203,7 @@ func (r *AkpClusterResource) upsert(ctx context.Context, diagnostics *diag.Diagn
 	if diagnostics.HasError() {
 		return nil, nil
 	}
-	result, err := r.applyInstance(ctx, plan, apiReq, isCreate, r.akpCli.Cli.ApplyInstance, r.upsertKubeConfig)
+	result, err := r.applyInstance(ctx, plan, apiReq, isCreate, r.akpCli.Cli.ApplyInstance, r.upsertKubeConfig, r.waitForReconciliation, r.waitForHealth)
 	// Always refresh cluster state to ensure we have consistent state, even if kubeconfig application failed
 	if result != nil {
 		if akiSanitized {
@@ -219,7 +218,16 @@ func (r *AkpClusterResource) upsert(ctx context.Context, diagnostics *diag.Diagn
 	return result, err
 }
 
-func (r *AkpClusterResource) applyInstance(ctx context.Context, plan *types.Cluster, apiReq *argocdv1.ApplyInstanceRequest, isCreate bool, applyInstance func(context.Context, *argocdv1.ApplyInstanceRequest) (*argocdv1.ApplyInstanceResponse, error), upsertKubeConfig func(ctx context.Context, plan *types.Cluster) error) (*types.Cluster, error) {
+func (r *AkpClusterResource) applyInstance(
+	ctx context.Context,
+	plan *types.Cluster,
+	apiReq *argocdv1.ApplyInstanceRequest,
+	isCreate bool,
+	applyInstance func(context.Context, *argocdv1.ApplyInstanceRequest) (*argocdv1.ApplyInstanceResponse, error),
+	upsertKubeConfig func(ctx context.Context, plan *types.Cluster) error,
+	waitForReconciliation func(ctx context.Context, plan *types.Cluster) error,
+	waitForHealth func(ctx context.Context, plan *types.Cluster) error,
+) (*types.Cluster, error) {
 	kubeconfig := plan.Kubeconfig
 	plan.Kubeconfig = nil
 	tflog.Debug(ctx, fmt.Sprintf("Apply cluster request: %s", apiReq))
@@ -241,14 +249,25 @@ func (r *AkpClusterResource) applyInstance(ctx context.Context, plan *types.Clus
 		return nil, fmt.Errorf("unable to create Argo CD instance: %s", err)
 	}
 
+	if err := waitForReconciliation(ctx, plan); err != nil {
+		if isCreate {
+			tflog.Warn(ctx, fmt.Sprintf("Cluster reconciliation failed during create, cleaning up cluster %s", plan.Name.ValueString()))
+			cleanupErr := r.deleteCluster(ctx, plan, plan.RemoveAgentResourcesOnDestroy.ValueBool(), true)
+			if cleanupErr != nil {
+				tflog.Error(ctx, fmt.Sprintf("Failed to clean up dangling cluster %s: %v", plan.Name.ValueString(), cleanupErr))
+				return nil, fmt.Errorf("cluster reconciliation failed: %s (and failed to clean up cluster: %s)", err, cleanupErr)
+			}
+			tflog.Info(ctx, fmt.Sprintf("Successfully cleaned up dangling cluster %s", plan.Name.ValueString()))
+		}
+		return nil, fmt.Errorf("cluster reconciliation failed: %w", err)
+	}
+
 	if kubeconfig != nil {
 		plan.Kubeconfig = kubeconfig
 		shouldApply := isCreate || plan.ReapplyManifestsOnUpdate.ValueBool()
 		if shouldApply {
 			err = upsertKubeConfig(ctx, plan)
 			if err != nil {
-				// If this is a create operation and kubeconfig application fails,
-				// clean up the dangling cluster from the API
 				if isCreate {
 					tflog.Warn(ctx, fmt.Sprintf("Kubeconfig application failed during create, cleaning up cluster %s", plan.Name.ValueString()))
 					cleanupErr := r.deleteCluster(ctx, plan, plan.RemoveAgentResourcesOnDestroy.ValueBool(), true)
@@ -259,10 +278,20 @@ func (r *AkpClusterResource) applyInstance(ctx context.Context, plan *types.Clus
 					tflog.Info(ctx, fmt.Sprintf("Successfully cleaned up dangling cluster %s", plan.Name.ValueString()))
 					return nil, fmt.Errorf("unable to apply manifests: %s", err)
 				}
-				// For updates, just ensure kubeconfig won't be committed to state by setting it to nil
 				plan.Kubeconfig = nil
 				return plan, fmt.Errorf("unable to apply manifests: %s", err)
 			}
+		} else {
+			if err := waitForHealth(ctx, plan); err != nil {
+				return plan, fmt.Errorf("cluster health check failed: %w", err)
+			}
+		}
+	} else {
+		agentWillAutoUpdate := !isCreate && plan.Spec != nil && !plan.Spec.Data.AutoUpgradeDisabled.ValueBool()
+		if agentWillAutoUpdate {
+			if err := waitForHealth(ctx, plan); err != nil {
+				return plan, fmt.Errorf("cluster health check failed: %w", err)
+			}
 		}
 	}
 
@@ -287,11 +316,19 @@ func (r *AkpClusterResource) upsertKubeConfig(ctx context.Context, plan *types.C
 		if err != nil {
 			return err
 		}
-		return waitClusterHealthStatus(ctx, r.akpCli.Cli, r.akpCli.OrgId, plan)
+		return waitClusterHealthStatus(ctx, r.akpCli.Cli, r.akpCli.OrgId, plan, plan.EnsureHealthy.ValueBool())
 	}
 	return nil
 }
 
+func (r *AkpClusterResource) waitForReconciliation(ctx context.Context, plan *types.Cluster) error {
+	return waitForClusterReconciliation(ctx, r.akpCli.Cli, r.akpCli.OrgId, plan)
+}
+
+func (r *AkpClusterResource) waitForHealth(ctx context.Context, plan *types.Cluster) error {
+	return waitClusterHealthStatus(ctx, r.akpCli.Cli, r.akpCli.OrgId, plan, plan.EnsureHealthy.ValueBool())
+}
+
 func refreshClusterState(ctx context.Context, diagnostics *diag.Diagnostics, client argocdv1.ArgoCDServiceGatewayClient, cluster *types.Cluster,
 	orgID string, plan *types.Cluster,
 ) error {
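Every GetInstanceCluster call in the wait helpers below is wrapped in the provider's pre-existing retryWithBackoff helper, which is defined outside this diff. Judging only from the call sites (a context, a request closure returning a typed response, and an operation name), it appears to be a generic retry wrapper. The sketch below is a guess at that shape; the attempt limit, delays, and error wording are invented for illustration, the real implementation almost certainly differs, and the file's existing context, fmt, and time imports are assumed.

```go
// Hypothetical shape of the existing retryWithBackoff helper, inferred from its
// call sites only; not code from this commit.
func retryWithBackoff[T any](ctx context.Context, call func(context.Context) (T, error), opName string) (T, error) {
	var zero T
	delay := 500 * time.Millisecond // assumed initial delay
	for attempt := 1; ; attempt++ {
		resp, err := call(ctx)
		if err == nil {
			return resp, nil
		}
		if attempt >= 5 { // assumed attempt limit
			return zero, fmt.Errorf("%s failed after %d attempts: %w", opName, attempt, err)
		}
		select {
		case <-ctx.Done():
			return zero, ctx.Err()
		case <-time.After(delay):
			delay *= 2 // exponential backoff between attempts
		}
	}
}
```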
@@ -437,53 +474,145 @@ func deleteManifests(ctx context.Context, manifests string, cfg *rest.Config) er
 	return nil
 }
 
-func waitClusterHealthStatus(ctx context.Context, client argocdv1.ArgoCDServiceGatewayClient, orgID string, c *types.Cluster) error {
-	cluster := &argocdv1.Cluster{}
-	healthStatus := cluster.GetHealthStatus()
-	breakStatusesHealth := []healthv1.StatusCode{healthv1.StatusCode_STATUS_CODE_HEALTHY, healthv1.StatusCode_STATUS_CODE_DEGRADED}
-
-	for !slices.Contains(breakStatusesHealth, healthStatus.GetCode()) {
-		time.Sleep(1 * time.Second)
-		apiResp, err := retryWithBackoff(ctx, func(ctx context.Context) (*argocdv1.GetInstanceClusterResponse, error) {
-			return client.GetInstanceCluster(ctx, &argocdv1.GetInstanceClusterRequest{
-				OrganizationId: orgID,
-				InstanceId: c.InstanceID.ValueString(),
-				Id: c.Name.ValueString(),
-				IdType: idv1.Type_NAME,
-			})
-		}, "GetInstanceCluster")
-		if err != nil {
-			return err
+func waitClusterHealthStatus(ctx context.Context, client argocdv1.ArgoCDServiceGatewayClient, orgID string, c *types.Cluster, ensureHealthy bool) error {
+	const (
+		healthStatusTimeout = 10 * time.Minute
+		healthStatusPollInterval = 5 * time.Second
+	)
+
+	targetStatuses := []healthv1.StatusCode{
+		healthv1.StatusCode_STATUS_CODE_HEALTHY,
+	}
+	if !ensureHealthy {
+		return nil
+	}
+
+	var lastHealthMessage string
+	if err := waitForStatus(
+		ctx,
+		func(ctx context.Context) (*argocdv1.Cluster, error) {
+			resp, err := retryWithBackoff(ctx, func(ctx context.Context) (*argocdv1.GetInstanceClusterResponse, error) {
+				return client.GetInstanceCluster(ctx, &argocdv1.GetInstanceClusterRequest{
+					OrganizationId: orgID,
+					InstanceId: c.InstanceID.ValueString(),
+					Id: c.Name.ValueString(),
+					IdType: idv1.Type_NAME,
+				})
+			}, "GetInstanceCluster")
+			if err != nil {
+				return nil, err
+			}
+			return resp.GetCluster(), nil
+		},
+		func(cluster *argocdv1.Cluster) healthv1.StatusCode {
+			healthStatus := cluster.GetHealthStatus()
+			if msg := healthStatus.GetMessage(); msg != "" {
+				lastHealthMessage = msg
+			}
+			return healthStatus.GetCode()
+		},
+		targetStatuses,
+		healthStatusPollInterval,
+		healthStatusTimeout,
+		c.Name.ValueString(),
+		"health",
+	); err != nil {
+		var errMsg strings.Builder
+		errMsg.WriteString(err.Error())
+
+		if lastHealthMessage != "" {
+			errMsg.WriteString(fmt.Sprintf("\n\nHealth status message: %s", lastHealthMessage))
 		}
-		cluster = apiResp.GetCluster()
-		healthStatus = cluster.GetHealthStatus()
-		tflog.Debug(ctx, fmt.Sprintf("Cluster health status: %s", healthStatus.String()))
+
+		errMsg.WriteString("\n\nTroubleshooting steps:")
+		errMsg.WriteString("\n 1. Check the cluster health in the Akuity Console")
+		errMsg.WriteString("\n 2. Verify the Akuity agent is running in the cluster (for example in akuity namespace):")
+		errMsg.WriteString("\n kubectl get pods -n akuity")
+		errMsg.WriteString("\n 3. Check agent logs for errors:")
+		errMsg.WriteString("\n kubectl logs -n akuity -l app.kubernetes.io/name=akuity-agent --tail=100")
+		errMsg.WriteString("\n 4. Ensure the cluster has network connectivity to the Akuity Platform")
+
+		return errors.New(errMsg.String())
 	}
+
 	return nil
 }
 
 func waitClusterReconStatus(ctx context.Context, client argocdv1.ArgoCDServiceGatewayClient, cluster *argocdv1.Cluster, orgId, instanceId string) (*argocdv1.Cluster, error) {
-	reconStatus := cluster.GetReconciliationStatus()
-	breakStatusesRecon := []reconv1.StatusCode{reconv1.StatusCode_STATUS_CODE_SUCCESSFUL, reconv1.StatusCode_STATUS_CODE_FAILED}
-
-	for !slices.Contains(breakStatusesRecon, reconStatus.GetCode()) {
-		time.Sleep(1 * time.Second)
-		apiResp, err := retryWithBackoff(ctx, func(ctx context.Context) (*argocdv1.GetInstanceClusterResponse, error) {
-			return client.GetInstanceCluster(ctx, &argocdv1.GetInstanceClusterRequest{
-				OrganizationId: orgId,
-				InstanceId: instanceId,
-				Id: cluster.Id,
-				IdType: idv1.Type_ID,
-			})
-		}, "GetInstanceCluster")
-		if err != nil {
-			return nil, err
+	const (
+		reconStatusTimeout = 10 * time.Minute
+		reconStatusPollInterval = 5 * time.Second
+	)
+
+	targetStatuses := []reconv1.StatusCode{
+		reconv1.StatusCode_STATUS_CODE_SUCCESSFUL,
+		reconv1.StatusCode_STATUS_CODE_FAILED,
+	}
+
+	var finalCluster *argocdv1.Cluster
+
+	err := waitForStatus(
+		ctx,
+		func(ctx context.Context) (*argocdv1.Cluster, error) {
+			resp, err := retryWithBackoff(ctx, func(ctx context.Context) (*argocdv1.GetInstanceClusterResponse, error) {
+				return client.GetInstanceCluster(ctx, &argocdv1.GetInstanceClusterRequest{
+					OrganizationId: orgId,
+					InstanceId: instanceId,
+					Id: cluster.Id,
+					IdType: idv1.Type_ID,
+				})
+			}, "GetInstanceCluster")
+			if err != nil {
+				return nil, err
+			}
+			finalCluster = resp.GetCluster()
+			return finalCluster, nil
+		},
+		func(cluster *argocdv1.Cluster) reconv1.StatusCode {
+			return cluster.GetReconciliationStatus().GetCode()
+		},
+		targetStatuses,
+		reconStatusPollInterval,
+		reconStatusTimeout,
+		cluster.Name,
+		"reconciliation",
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	return finalCluster, nil
+}
+
+func waitForClusterReconciliation(ctx context.Context, client argocdv1.ArgoCDServiceGatewayClient, orgID string, plan *types.Cluster) error {
+	clusterReq := &argocdv1.GetInstanceClusterRequest{
+		OrganizationId: orgID,
+		InstanceId: plan.InstanceID.ValueString(),
+		Id: plan.Name.ValueString(),
+		IdType: idv1.Type_NAME,
+	}
+
+	clusterResp, err := retryWithBackoff(ctx, func(ctx context.Context) (*argocdv1.GetInstanceClusterResponse, error) {
+		return client.GetInstanceCluster(ctx, clusterReq)
+	}, "GetInstanceCluster")
+	if err != nil {
+		return errors.Wrap(err, "unable to get cluster for reconciliation check")
+	}
+
+	finalCluster, err := waitClusterReconStatus(ctx, client, clusterResp.GetCluster(), orgID, plan.InstanceID.ValueString())
+	if err != nil {
+		return errors.Wrap(err, "unable to wait for cluster reconciliation")
+	}
+
+	if finalCluster.GetReconciliationStatus().GetCode() == reconv1.StatusCode_STATUS_CODE_FAILED {
+		msg := finalCluster.GetReconciliationStatus().GetMessage()
+		if msg == "" {
+			msg = "cluster reconciliation failed"
 		}
-		cluster = apiResp.GetCluster()
-		reconStatus = cluster.GetReconciliationStatus()
-		tflog.Debug(ctx, fmt.Sprintf("Cluster recon status: %s", reconStatus.String()))
+		return errors.New(msg)
 	}
-	return cluster, nil
+
+	return nil
 }
 
 func readStream(resChan <-chan *httpbody.HttpBody, errChan <-chan error) ([]byte, error) {
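Both rewritten pollers above delegate their wait loop to a shared waitForStatus helper that is not shown in this excerpt; it presumably lives in one of the other changed files. As a reading aid only, here is a minimal sketch of what such a generic poller could look like. The parameter names, generic constraints, timeout message, and the assumption that it simply polls until a target status or timeout are inferred from the call sites, not taken from the actual implementation (which, per the ensure_healthy description, likely also treats a degraded agent as an immediate failure). The file's existing context, fmt, and time imports are assumed.

```go
// Hypothetical sketch of the generic poller used by the wait helpers above.
// It fetches an object, extracts a status code, and returns once that code is
// one of the target statuses, or fails on timeout or context cancellation.
func waitForStatus[T any, S comparable](
	ctx context.Context,
	get func(ctx context.Context) (T, error),
	statusOf func(T) S,
	targetStatuses []S,
	pollInterval time.Duration,
	timeout time.Duration,
	name string,
	statusKind string,
) error {
	deadline := time.Now().Add(timeout)
	for {
		obj, err := get(ctx)
		if err != nil {
			return err
		}
		code := statusOf(obj)
		for _, target := range targetStatuses {
			if code == target {
				return nil // reached a terminal status
			}
		}
		if time.Now().After(deadline) {
			return fmt.Errorf("timed out after %s waiting for cluster %s %s status", timeout, name, statusKind)
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(pollInterval):
			// poll again
		}
	}
}
```

A shape like this would serve both the healthv1 and reconv1 status enums with one loop, which is consistent with the commit dropping the golang.org/x/exp/slices import that the old hand-rolled loops needed.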

akp/resource_akp_cluster_schema.go

Lines changed: 6 additions & 0 deletions
@@ -119,6 +119,12 @@ func getAKPClusterAttributes() map[string]schema.Attribute {
 				boolplanmodifier.UseStateForUnknown(),
 			},
 		},
+		"ensure_healthy": schema.BoolAttribute{
+			MarkdownDescription: "If true, terraform apply will fail if the cluster agent becomes degraded or does not become healthy within the timeout period. When false (default), terraform will not wait for the resource status to be reported.",
+			Optional: true,
+			Computed: true,
+			Default: booldefault.StaticBool(false),
+		},
 	}
 }
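In practitioner configuration, the new attribute is opt-in. The snippet below is an illustrative Go test-fixture string, not code from this commit; the akp_cluster arguments other than ensure_healthy are placeholders and may not reflect the resource's full required schema.

```go
// Hypothetical fixture enabling the new attribute on a cluster; everything
// except ensure_healthy is a placeholder.
const exampleEnsureHealthyConfig = `
resource "akp_cluster" "example" {
  instance_id = akp_instance.example.id # placeholder
  name        = "example-cluster"       # placeholder
  namespace   = "akuity"                # placeholder

  # Fail terraform apply if the agent does not become healthy within the timeout.
  ensure_healthy = true
}
`
```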

0 commit comments
