Skip to content
2 changes: 2 additions & 0 deletions docs/data-sources/private_endpoint_service.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,5 @@ data "couchbase-capella_private_endpoint_service" "service_status" {
### Read-Only

- `enabled` (Boolean) - Returns true if private endpoint is enabled
- `status` (String) - Lifecycle status of the private endpoint service.
  - **Valid Values**: `idle`, `unknown`, `enabling`, `enabled`, `enableFailed`, `disabling`, `disabled`, `disableFailed`
5 changes: 5 additions & 0 deletions docs/resources/private_endpoint_service.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,11 @@ resource "couchbase-capella_private_endpoint_service" "new_service" {
- `organization_id` (String) The GUID4 ID of the organization.
- `project_id` (String) The GUID4 ID of the project.

### Read-Only

- `status` (String) - Lifecycle status of the private endpoint service.
  - **Valid Values**: `idle`, `unknown`, `enabling`, `enabled`, `enableFailed`, `disabling`, `disabled`, `disableFailed`

## Import

Import is supported using the following syntax:
Expand Down
9 changes: 8 additions & 1 deletion internal/api/private_endpoint_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@ package api
// GetPrivateEndpointServiceStatusResponse is the response received from the Capella V4 Public API
// when getting private endpoint service status.
type GetPrivateEndpointServiceStatusResponse struct {
	// Enabled is decoded from the "enabled" JSON key and reports whether the
	// private endpoint service is enabled. It is the fallback signal when
	// Status is absent.
	Enabled bool `json:"enabled"`

	// Status is the lifecycle state of the private endpoint service derived from
	// the most recent enable/disable/update operation (for example "enableFailed"
	// or "enabling"). It is optional and best-effort: older control planes omit it,
	// in which case callers fall back to the Enabled boolean. A nil pointer means
	// the key was not present in the response.
	Status *string `json:"status,omitempty"`

	// PrivateDns is decoded from the "privateDns" JSON key.
	PrivateDns string `json:"privateDns"`
}
4 changes: 4 additions & 0 deletions internal/datasources/private_endpoint_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,10 @@ func (p *PrivateEndpointService) Read(ctx context.Context, req datasource.ReadRe
}

state.Enabled = types.BoolValue(privateEndpointServiceStatus.Enabled)
state.Status = types.StringNull()
if privateEndpointServiceStatus.Status != nil {
state.Status = types.StringValue(*privateEndpointServiceStatus.Status)
}
diags = resp.State.Set(ctx, &state)
resp.Diagnostics.Append(diags...)
if resp.Diagnostics.HasError() {
Expand Down
1 change: 1 addition & 0 deletions internal/datasources/private_endpoint_service_schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ func PrivateEndpointServiceSchema() schema.Schema {
capellaschema.AddAttr(attrs, "project_id", privateEndpointServiceBuilder, requiredString())
capellaschema.AddAttr(attrs, "cluster_id", privateEndpointServiceBuilder, requiredString())
capellaschema.AddAttr(attrs, "enabled", privateEndpointServiceBuilder, computedBool())
capellaschema.AddAttr(attrs, "status", privateEndpointServiceBuilder, computedString())

return schema.Schema{
MarkdownDescription: "The data source to retrieve private endpoint service information for a cluster.",
Expand Down
8 changes: 8 additions & 0 deletions internal/errors/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -243,6 +243,14 @@ var (

ErrPrivateEndpointServiceTimeout = errors.New("changing private endpoint service status timed out after initiation")

// ErrPrivateEndpointServiceEnableFailed is returned when the backend reports a terminal
// enableFailed state for the private endpoint service. This is not retried by polling.
ErrPrivateEndpointServiceEnableFailed = errors.New("private endpoint service enablement failed")

// ErrPrivateEndpointServiceDisableFailed is returned when the backend reports a terminal
// disableFailed state for the private endpoint service. This is not retried by polling.
ErrPrivateEndpointServiceDisableFailed = errors.New("private endpoint service disable failed")

ErrBucketCreationStatusTimeout = errors.New("bucket backup creation status transition timed out after initiation")

ErrSnapshotBackupCreationStatusTimeout = errors.New("snapshot backup creation status transition timed out after initiation")
Expand Down
204 changes: 199 additions & 5 deletions internal/resources/private_endpoint_service.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@ package resources
import (
"context"
"encoding/json"
stderrors "errors"
"fmt"
"net/http"
"time"

"github.com/hashicorp/terraform-plugin-framework/diag"
"github.com/hashicorp/terraform-plugin-framework/path"
"github.com/hashicorp/terraform-plugin-framework/resource"
"github.com/hashicorp/terraform-plugin-framework/tfsdk"
"github.com/hashicorp/terraform-plugin-framework/types"
"github.com/hashicorp/terraform-plugin-log/tflog"

Expand All @@ -28,6 +31,28 @@ const (
errorMessageWhileEnablingPrivateEndpointService = "There is an error while enabling private endpoint service. Please check in Capella to see if there are any hanging resources\" +\n\t\" have been created, unexpected error: "
Comment thread
cloudy-vishnu marked this conversation as resolved.
Outdated
)

// Lifecycle states reported by the private endpoint service GET status API,
// grouped by how the polling code treats them.
const (
	// Resolved states: the last operation (if any) has finished. idle means no
	// operation has run.
	statusIdle     = "idle"
	statusEnabled  = "enabled"
	statusDisabled = "disabled"

	// Transient states: an operation is still in flight, so polling continues.
	statusEnabling  = "enabling"
	statusDisabling = "disabling"
	statusUnknown   = "unknown"

	// Terminal failure states: these are not retried automatically.
	statusEnableFailed  = "enableFailed"
	statusDisableFailed = "disableFailed"
)

// Timing knobs for status polling.
const (
	// pollInterval is the delay between two consecutive status polls while
	// waiting for a transition. The overall 60-minute timeout remains the backstop.
	pollInterval = 30 * time.Second

	// cleanupTimeout caps the wait for the backend to tear down a failed enable
	// before giving up and surfacing an escalation error.
	cleanupTimeout = 15 * time.Minute
)

// PrivateEndpointService is the scope resource implementation.
type PrivateEndpointService struct {
*providerschema.Data
Expand Down Expand Up @@ -103,10 +128,17 @@ func (p *PrivateEndpointService) Create(ctx context.Context, req resource.Create

err = p.waitUntilStatusChanges(ctx, true, organizationId, projectId, clusterId)
if err != nil {
// Terminal enableFailed: clean up the orphaned infra and remove the
// resource so the next apply performs a clean re-create.
if stderrors.Is(err, errors.ErrPrivateEndpointServiceEnableFailed) {
p.handleFailedEnable(ctx, &resp.State, &resp.Diagnostics, organizationId, projectId, clusterId, err)
return
}
Comment thread
cloudy-vishnu marked this conversation as resolved.
resp.Diagnostics.AddError(
"Error could not enable private endpoint service",
"Error could not enable private endpoint service, unexpected error: "+err.Error(),
)
return
}

refreshedState, err := p.getServiceState(ctx, organizationId, projectId, clusterId)
Expand Down Expand Up @@ -166,6 +198,15 @@ func (p *PrivateEndpointService) Read(ctx context.Context, req resource.ReadRequ
return
}

// A terminally failed enable is treated like a missing resource: remove it
// from state so drift detection recreates it on the next apply instead of
// leaving a wedged resource that never converges.
Comment thread
cloudy-vishnu marked this conversation as resolved.
Outdated
if refreshedState.Status.ValueString() == statusEnableFailed {
Comment thread
stanleefdz marked this conversation as resolved.
tflog.Info(ctx, "private endpoint service is in enableFailed state; removing from state to force re-create")
resp.State.RemoveResource(ctx)
return
}

diags = resp.State.Set(ctx, &refreshedState)
Comment thread
cloudy-vishnu marked this conversation as resolved.
Outdated
resp.Diagnostics.Append(diags...)
if resp.Diagnostics.HasError() {
Expand Down Expand Up @@ -219,6 +260,16 @@ func (p *PrivateEndpointService) Update(ctx context.Context, req resource.Update
config.ProjectId.ValueString(),
config.ClusterId.ValueString())
if err != nil {
// An enable-flavored update that fails terminally is recovered the same
// way as Create: clean up and remove from state for a clean retry.
if config.Enabled.ValueBool() && stderrors.Is(err, errors.ErrPrivateEndpointServiceEnableFailed) {
p.handleFailedEnable(ctx, &resp.State, &resp.Diagnostics,
config.OrganizationId.ValueString(),
config.ProjectId.ValueString(),
config.ClusterId.ValueString(),
err)
return
}
resp.Diagnostics.AddError(
"Error "+status+" private endpoint service",
"Error "+status+"private endpoint service, unexpected error: "+err.Error(),
Expand Down Expand Up @@ -300,12 +351,26 @@ func (p *PrivateEndpointService) Delete(ctx context.Context, req resource.Delete

err = p.waitUntilStatusChanges(ctx, false, organizationId, projectId, clusterId)
if err != nil {
// On a terminal disableFailed we fail fast and keep the resource in state
// (Terraform retains a resource whose Delete errors). The correct retry
// for a failed disable is another destroy, which Terraform performs
// naturally on the next run.
if stderrors.Is(err, errors.ErrPrivateEndpointServiceDisableFailed) {
resp.Diagnostics.AddError(
"Private endpoint service disable failed",
fmt.Sprintf(
"Disable failed for cluster %s: %s. The resource has been kept in state; "+
"re-run terraform destroy to retry, and contact Couchbase Capella Support if it persists.",
clusterId, err.Error(),
),
)
return
}
resp.Diagnostics.AddError(
"Error could not disable private endpoint service",
"Error could not disable private endpoint service, unexpected error: "+err.Error(),
)
}

}

// Configure It adds the provider configured api to the private endpoint service resource.
Expand Down Expand Up @@ -353,16 +418,26 @@ func initializePrivateEndpointServicePlan(plan providerschema.PrivateEndpointSer
if plan.Enabled.IsNull() || plan.Enabled.IsUnknown() {
plan.Enabled = types.BoolNull()
}
// status is computed; never persist an unknown value to state.
if plan.Status.IsNull() || plan.Status.IsUnknown() {
plan.Status = types.StringNull()
}
return plan
}

// waitUntilStatusChanges waits until the service reaches the desired state on the
// cluster. When the API reports an explicit lifecycle status it is used to fail
// fast on terminal states (enableFailed/disableFailed) and to keep polling on
// transient ones (enabling/disabling/unknown). When the status is absent (an
// older control plane) it falls back to the Enabled boolean, preserving the
// previous behavior. The 60-minute timeout remains as a backstop.
func (p *PrivateEndpointService) waitUntilStatusChanges(ctx context.Context, finalState bool, organizationId, projectId, clusterId string) error {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Minute*60)
defer cancel()

timer := time.NewTimer(time.Minute * 1)
timer := time.NewTimer(pollInterval)
Comment thread
cloudy-vishnu marked this conversation as resolved.
Outdated
defer timer.Stop()

for {
select {
Expand All @@ -375,14 +450,129 @@ func (p *PrivateEndpointService) waitUntilStatusChanges(ctx context.Context, fin
return err
}

if response.Enabled == finalState {
// Older control planes do not report a status: fall back to the
// Enabled boolean, preserving the previous behavior.
if response.Status == nil {
if response.Enabled == finalState {
return nil
}
timer.Reset(pollInterval)
continue
}

switch *response.Status {
case statusEnableFailed:
return errors.ErrPrivateEndpointServiceEnableFailed
case statusDisableFailed:
return errors.ErrPrivateEndpointServiceDisableFailed
case statusEnabling, statusDisabling, statusUnknown:
// Transient states: keep polling.
case statusEnabled, statusDisabled, statusIdle:
// Resolved states: confirm against the requested final state.
if response.Enabled == finalState {
return nil
}
}
timer.Reset(pollInterval)
}
}
}

// waitUntilCleanedUp waits for the backend to finish tearing down a failed enable
// after a disable (DELETE) has been issued. It succeeds once the service reaches
// disabled/idle (or reports disabled via the boolean on an older API) and fails
// fast if the teardown itself reports disableFailed. It is bounded by
// cleanupTimeout so a stuck cleanup does not block apply indefinitely.
func (p *PrivateEndpointService) waitUntilCleanedUp(ctx context.Context, organizationId, projectId, clusterId string) error {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, cleanupTimeout)
defer cancel()

timer := time.NewTimer(pollInterval)
defer timer.Stop()

for {
select {
case <-ctx.Done():
return errors.ErrPrivateEndpointServiceTimeout

case <-timer.C:
response, err := p.getServiceStatus(ctx, organizationId, projectId, clusterId)
if err != nil {
return err
}

if response.Status != nil {
Comment thread
cloudy-vishnu marked this conversation as resolved.
switch *response.Status {
case statusDisableFailed:
return errors.ErrPrivateEndpointServiceDisableFailed
case statusDisabled, statusIdle:
return nil
}
// enableFailed/disabling/etc: cleanup still in progress, keep polling.
} else if !response.Enabled {
return nil
}
timer.Reset(time.Minute * 1)
timer.Reset(pollInterval)
}
}
}

// cleanupFailedEnable asks the backend to tear down a failed enable by sending a
// disable (DELETE) request for the cluster's private endpoint service, then
// blocks until waitUntilCleanedUp reports the outcome. The backend permits a
// disable from the enableFailed state precisely so the orphaned infra can be
// removed. A failure to issue the DELETE is returned wrapped; otherwise the
// result of waitUntilCleanedUp is returned as-is.
func (p *PrivateEndpointService) cleanupFailedEnable(ctx context.Context, organizationId, projectId, clusterId string) error {
	deleteCfg := api.EndpointCfg{
		Url: fmt.Sprintf(
			"%s/v4/organizations/%s/projects/%s/clusters/%s/privateEndpointService",
			p.HostURL, organizationId, projectId, clusterId,
		),
		Method:        http.MethodDelete,
		SuccessStatus: http.StatusAccepted,
	}

	_, err := p.ClientV1.ExecuteWithRetry(ctx, deleteCfg, nil, p.Token, nil)
	if err != nil {
		return fmt.Errorf("could not trigger cleanup of failed enable: %w", err)
	}

	return p.waitUntilCleanedUp(ctx, organizationId, projectId, clusterId)
}

// handleFailedEnable performs the Option A recovery for a terminal enableFailed:
Comment thread
cloudy-vishnu marked this conversation as resolved.
Outdated
// it triggers backend cleanup of the orphaned resources, removes the resource
// from state so the next apply performs a clean re-create, and surfaces an
// actionable error. State is removed even when cleanup itself fails, because
// leaving a permanently-failed resource in state recreates the stuck-pipeline
// problem; the loud error directs escalation for the rare orphaned-infra case.
func (p *PrivateEndpointService) handleFailedEnable(ctx context.Context, state *tfsdk.State, diags *diag.Diagnostics, organizationId, projectId, clusterId string, cause error) {
tflog.Error(ctx, "private endpoint service enablement failed; triggering cleanup and removing from state")

cleanupErr := p.cleanupFailedEnable(ctx, organizationId, projectId, clusterId)
state.RemoveResource(ctx)

if cleanupErr != nil {
diags.AddError(
"Private endpoint service enablement failed and automatic cleanup did not complete",
fmt.Sprintf(
"Enablement failed for cluster %s: %s. Automatic cleanup of the failed resources did not complete: %s. "+
"There may be orphaned resources in your cloud account; please contact Couchbase Capella Support. "+
"The resource has been removed from state; re-run terraform apply to retry enablement.",
clusterId, cause.Error(), cleanupErr.Error(),
),
)
return
}

diags.AddError(
"Private endpoint service enablement failed",
fmt.Sprintf(
"Enablement failed for cluster %s: %s. The failed resources were cleaned up automatically and the resource "+
"has been removed from state; re-run terraform apply to retry enablement.",
clusterId, cause.Error(),
),
)
}

// getServiceStatus retrieves current private endpoint service status.
func (p *PrivateEndpointService) getServiceStatus(ctx context.Context, organizationId, projectId, clusterId string) (*api.GetPrivateEndpointServiceStatusResponse, error) {
url := fmt.Sprintf("%s/v4/organizations/%s/projects/%s/clusters/%s/privateEndpointService", p.HostURL, organizationId, projectId, clusterId)
Expand Down Expand Up @@ -419,6 +609,10 @@ func (p *PrivateEndpointService) getServiceState(ctx context.Context, organizati
ProjectId: types.StringValue(projectId),
ClusterId: types.StringValue(clusterId),
Enabled: types.BoolValue(response.Enabled),
Status: types.StringNull(),
}
if response.Status != nil {
state.Status = types.StringValue(*response.Status)
}

return &state, nil
Expand Down
4 changes: 4 additions & 0 deletions internal/resources/private_endpoint_service_schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@ func PrivateEndpointServiceSchema() schema.Schema {
PlanModifiers: []planmodifier.Bool{custommodifier.BlockCreateWhenEnabledSetToFalse()},
})

capellaschema.AddAttr(attrs, "status", privateEndpointServiceBuilder, &schema.StringAttribute{
Computed: true,
})

return schema.Schema{
MarkdownDescription: "This resource allows you to manage the private endpoint service for an operational cluster. The private endpoint service must be enabled before you can create private endpoints to connect your Cloud Service Provider's private network (VPC/VNET) to your operational cluster. This enables secure access to your cluster without exposing traffic to the public internet.",
Attributes: attrs,
Expand Down
7 changes: 5 additions & 2 deletions internal/resources/private_endpoints.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,8 +157,11 @@ func (p *PrivateEndpoint) Read(ctx context.Context, req resource.ReadRequest, re
return
}

if refreshedState.Status.ValueString() == "rejected" {
tflog.Info(ctx, "private endpoint association is rejected; removing from state to force re-association")
// Both rejected and failed associations are terminal and cannot recover in
// place, so remove them from state to force a clean re-association on the
// next apply rather than leaving a stuck resource.
if s := refreshedState.Status.ValueString(); s == "rejected" || s == "failed" {
tflog.Info(ctx, "private endpoint association is "+s+"; removing from state to force re-association")
resp.State.RemoveResource(ctx)
return
}
Comment thread
cloudy-vishnu marked this conversation as resolved.
Expand Down
Loading
Loading