Skip to content

Commit d4923dd

Browse files
authored
CLOUDP-400938: fix(atlasdeployment): stop perpetual reconcile loop on terminationProtection and processArgs (#3334)
* fix(deployment): always send terminationProtectionEnabled in PATCH MakePtrOrNil dropped false from the create/update body. When Atlas had true (e.g. set via UI), the operator could never push false, causing a perpetual reconcile loop. Refs #3142 * feat(deployment): add ProcessArgsEqual comparator Mirrors PATCH-body omitempty semantics: AKO-side zero values are treated as "no opinion" and skipped in the comparison. Will replace the reflect.DeepEqual call at the reconcile site in a follow-up commit. Refs #3142 * fix(atlasdeployment): use ProcessArgsEqual instead of reflect.DeepEqual reflect.DeepEqual treated Atlas-returned server defaults as a diff against CR-unset fields, triggering UpdateProcessArgs every reconcile. The PATCH body omits the same zero values, so Atlas never converged. Refs #3142 * fix(deployment): preserve explicit zero on *int64 processArgs fields OplogSizeMB / SampleSizeBIConnector / SampleRefreshIntervalBIConnector went through MakePtrOrNil(int(GetOrDefault(*int64, 0))), which collapsed both 'unset' and 'explicitly zero' into nil — the same bug class as terminationProtectionEnabled. Replaced with a nil-preserving helper so *int64(0) survives the PATCH conversion and the comparator can drive convergence in one round. Per review feedback on PR #3334. Refs #3142. * docs(deployment): explain ProcessArgsEqual per-field check rationale Per review feedback on PR #3334. * fix(deployment): keep pause-only PATCHes free of other fields Switching TerminationProtectionEnabled to MakePtr (issue #3142) broke the implicit contract of the pause special-case in ComputeChanges: that path had relied on MakePtrOrNil silently dropping zero-value fields so the PATCH body contained only Paused. Adding TerminationProtectionEnabled as an explicit value caused Atlas to reject the request with HTTP 409 CANNOT_UPDATE_AND_PAUSE_CLUSTER, surfacing as a 30-minute timeout in the 'Should Succeed (AWS)' integration test. Make the contract explicit: ComputeChanges marks pause-only changes with a pauseOnly flag, and clusterUpdateToAtlas builds a body containing only Paused when set. Refs #3142
1 parent 82fe0f0 commit d4923dd

5 files changed

Lines changed: 351 additions & 7 deletions

File tree

internal/controller/atlasdeployment/advanced_deployment.go

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ package atlasdeployment
1717
import (
1818
"errors"
1919
"fmt"
20-
"reflect"
2120
"strings"
2221

2322
v1 "k8s.io/api/core/v1"
@@ -245,7 +244,7 @@ func (r *AtlasDeploymentReconciler) ensureAdvancedOptions(ctx *workflow.Context,
245244
if deploymentInAKO.ProcessArgs.FailIndexKeyTooLong != nil {
246245
ctx.Log.Warn("Process Arg FailIndexKeyTooLong is no longer available in Atlas. Setting this will have no effect.")
247246
}
248-
if !reflect.DeepEqual(deploymentInAKO.ProcessArgs, deploymentInAtlas.ProcessArgs) {
247+
if !deployment.ProcessArgsEqual(deploymentInAKO.ProcessArgs, deploymentInAtlas.ProcessArgs) {
249248
err = deploymentService.UpdateProcessArgs(ctx.Context, deploymentInAKO)
250249
if err != nil {
251250
return r.transitionFromLegacy(ctx, deploymentService, deploymentInAKO.GetProjectID(), deploymentInAKO.GetCustomResource(), err)

internal/translation/deployment/compare.go

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ func ComputeChanges(desired, current *Cluster) (*Cluster, bool) {
3131
Name: desired.Name,
3232
Paused: new(pointer.GetOrDefault(desired.Paused, false)),
3333
},
34+
pauseOnly: true,
3435
}, true
3536
}
3637

@@ -388,3 +389,61 @@ func areEqual[T comparable](desired, current *T) bool {
388389

389390
return val1 == val2
390391
}
392+
393+
// ProcessArgsEqual reports whether the AKO-side processArgs and the
394+
// Atlas-side processArgs are semantically equivalent for reconcile purposes.
395+
//
396+
// Fields that are the zero value on the AKO side are treated as "no opinion"
397+
// and not compared. This mirrors processArgsToAtlas, which omits zero values
398+
// from the PATCH body via MakePtrOrNil — if a field would not be sent over
399+
// the wire, it must not drive an update either, or the reconciler loops on
400+
// Atlas-populated server defaults (issue #3142).
401+
func ProcessArgsEqual(ako, atlas *akov2.ProcessArgs) bool {
402+
if ako == nil {
403+
return true
404+
}
405+
if atlas == nil {
406+
atlas = &akov2.ProcessArgs{}
407+
}
408+
409+
// Each check below mirrors how processArgsToAtlas serializes the field:
410+
//
411+
// *bool / *int64 fields → check `!= nil` (a non-nil pointer is sent,
412+
// even if it points to the
413+
// zero value)
414+
// string fields → check `!= ""` (empty string is dropped via
415+
// MakePtrOrNil and not sent)
416+
//
417+
// Keep the per-field style aligned with the converter — collapsing the two
418+
// styles will reintroduce issue #3142.
419+
//
420+
// Fields intentionally absent below: they are not part of the SDK PATCH
421+
// body and therefore cannot drive an update:
422+
// - DefaultReadConcern
423+
// - FailIndexKeyTooLong (deprecated by Atlas; warned on at the call site)
424+
if ako.DefaultWriteConcern != "" && ako.DefaultWriteConcern != atlas.DefaultWriteConcern {
425+
return false
426+
}
427+
if ako.MinimumEnabledTLSProtocol != "" && ako.MinimumEnabledTLSProtocol != atlas.MinimumEnabledTLSProtocol {
428+
return false
429+
}
430+
if ako.OplogMinRetentionHours != "" && ako.OplogMinRetentionHours != atlas.OplogMinRetentionHours {
431+
return false
432+
}
433+
if ako.JavascriptEnabled != nil && !areEqual(ako.JavascriptEnabled, atlas.JavascriptEnabled) {
434+
return false
435+
}
436+
if ako.NoTableScan != nil && !areEqual(ako.NoTableScan, atlas.NoTableScan) {
437+
return false
438+
}
439+
if ako.OplogSizeMB != nil && !areEqual(ako.OplogSizeMB, atlas.OplogSizeMB) {
440+
return false
441+
}
442+
if ako.SampleSizeBIConnector != nil && !areEqual(ako.SampleSizeBIConnector, atlas.SampleSizeBIConnector) {
443+
return false
444+
}
445+
if ako.SampleRefreshIntervalBIConnector != nil && !areEqual(ako.SampleRefreshIntervalBIConnector, atlas.SampleRefreshIntervalBIConnector) {
446+
return false
447+
}
448+
return true
449+
}

internal/translation/deployment/compare_test.go

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ import (
2222

2323
akov2 "github.com/mongodb/mongodb-atlas-kubernetes/v2/api/v1"
2424
"github.com/mongodb/mongodb-atlas-kubernetes/v2/api/v1/common"
25+
"github.com/mongodb/mongodb-atlas-kubernetes/v2/internal/pointer"
2526
)
2627

2728
func TestComputeChanges(t *testing.T) {
@@ -56,6 +57,7 @@ func TestComputeChanges(t *testing.T) {
5657
Name: "cluster0",
5758
Paused: new(true),
5859
},
60+
pauseOnly: true,
5961
},
6062
changed: true,
6163
},
@@ -2413,3 +2415,98 @@ func TestAreEqual(t *testing.T) {
24132415
}
24142416
})
24152417
}
2418+
2419+
// Regression coverage for https://github.com/mongodb/mongodb-atlas-kubernetes/issues/3142.
2420+
// ProcessArgsEqual must mirror the omitempty PATCH-body semantics of
2421+
// processArgsToAtlas: a field that is the zero value on the AKO side cannot
2422+
// drive an update (it would not be sent over the wire), so it must not be
2423+
// considered a diff.
2424+
func TestProcessArgsEqual(t *testing.T) {
2425+
tests := map[string]struct {
2426+
ako *akov2.ProcessArgs
2427+
atlas *akov2.ProcessArgs
2428+
want bool
2429+
}{
2430+
"both nil": {
2431+
ako: nil, atlas: nil, want: true,
2432+
},
2433+
"ako nil, atlas populated": {
2434+
ako: nil,
2435+
atlas: &akov2.ProcessArgs{DefaultWriteConcern: "majority"},
2436+
want: true,
2437+
},
2438+
"ako unset string, atlas server default": {
2439+
ako: &akov2.ProcessArgs{},
2440+
atlas: &akov2.ProcessArgs{DefaultWriteConcern: "majority"},
2441+
want: true,
2442+
},
2443+
"ako unset oplog, atlas server default": {
2444+
ako: &akov2.ProcessArgs{},
2445+
atlas: &akov2.ProcessArgs{OplogSizeMB: pointer.MakePtr[int64](990)},
2446+
want: true,
2447+
},
2448+
"ako sets javascriptEnabled true, atlas true": {
2449+
ako: &akov2.ProcessArgs{JavascriptEnabled: pointer.MakePtr(true)},
2450+
atlas: &akov2.ProcessArgs{JavascriptEnabled: pointer.MakePtr(true)},
2451+
want: true,
2452+
},
2453+
"ako sets javascriptEnabled false, atlas true": {
2454+
ako: &akov2.ProcessArgs{JavascriptEnabled: pointer.MakePtr(false)},
2455+
atlas: &akov2.ProcessArgs{JavascriptEnabled: pointer.MakePtr(true)},
2456+
want: false,
2457+
},
2458+
"ako sets defaultWriteConcern differently": {
2459+
ako: &akov2.ProcessArgs{DefaultWriteConcern: "majority"},
2460+
atlas: &akov2.ProcessArgs{DefaultWriteConcern: "local"},
2461+
want: false,
2462+
},
2463+
"ako sets oplogSizeMB differently": {
2464+
ako: &akov2.ProcessArgs{OplogSizeMB: pointer.MakePtr[int64](990)},
2465+
atlas: &akov2.ProcessArgs{OplogSizeMB: pointer.MakePtr[int64](1000)},
2466+
want: false,
2467+
},
2468+
"ako sets minTLS differently": {
2469+
ako: &akov2.ProcessArgs{MinimumEnabledTLSProtocol: "TLS1_2"},
2470+
atlas: &akov2.ProcessArgs{MinimumEnabledTLSProtocol: "TLS1_1"},
2471+
want: false,
2472+
},
2473+
"ako sets noTableScan true, atlas false": {
2474+
ako: &akov2.ProcessArgs{NoTableScan: pointer.MakePtr(true)},
2475+
atlas: &akov2.ProcessArgs{NoTableScan: pointer.MakePtr(false)},
2476+
want: false,
2477+
},
2478+
"identical fully populated": {
2479+
ako: &akov2.ProcessArgs{
2480+
DefaultWriteConcern: "majority",
2481+
MinimumEnabledTLSProtocol: "TLS1_2",
2482+
JavascriptEnabled: pointer.MakePtr(true),
2483+
NoTableScan: pointer.MakePtr(false),
2484+
OplogSizeMB: pointer.MakePtr[int64](990),
2485+
},
2486+
atlas: &akov2.ProcessArgs{
2487+
DefaultWriteConcern: "majority",
2488+
MinimumEnabledTLSProtocol: "TLS1_2",
2489+
JavascriptEnabled: pointer.MakePtr(true),
2490+
NoTableScan: pointer.MakePtr(false),
2491+
OplogSizeMB: pointer.MakePtr[int64](990),
2492+
},
2493+
want: true,
2494+
},
2495+
"ako sample size differs": {
2496+
ako: &akov2.ProcessArgs{SampleSizeBIConnector: pointer.MakePtr[int64](100)},
2497+
atlas: &akov2.ProcessArgs{SampleSizeBIConnector: pointer.MakePtr[int64](200)},
2498+
want: false,
2499+
},
2500+
"ako oplog min retention differs": {
2501+
ako: &akov2.ProcessArgs{OplogMinRetentionHours: "24"},
2502+
atlas: &akov2.ProcessArgs{OplogMinRetentionHours: "48"},
2503+
want: false,
2504+
},
2505+
}
2506+
2507+
for name, tt := range tests {
2508+
t.Run(name, func(t *testing.T) {
2509+
assert.Equal(t, tt.want, ProcessArgsEqual(tt.ako, tt.atlas))
2510+
})
2511+
}
2512+
}

internal/translation/deployment/conversion.go

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ type Cluster struct {
6262
computeAutoscalingEnabled bool
6363
instanceSizeOverride string
6464
isTenant bool
65+
66+
// pauseOnly marks a Cluster produced by ComputeChanges' pause special-case.
67+
// clusterUpdateToAtlas must build a body containing only Paused for these,
68+
// because Atlas rejects PATCHes that combine pause with any other field
69+
// change (HTTP 409 CANNOT_UPDATE_AND_PAUSE_CLUSTER).
70+
pauseOnly bool
6571
}
6672

6773
func (c *Cluster) GetName() string {
@@ -583,12 +589,19 @@ func clusterCreateToAtlas(cluster *Cluster) *admin.ClusterDescription20240805 {
583589
ReplicationSpecs: replicationSpecToAtlas(cluster.ReplicationSpecs, cluster.ClusterType, cluster.DiskSizeGB),
584590
RootCertType: pointer.MakePtrOrNil(cluster.RootCertType),
585591
Tags: tag.ToAtlas(cluster.Tags),
586-
TerminationProtectionEnabled: pointer.MakePtrOrNil(cluster.TerminationProtectionEnabled),
592+
TerminationProtectionEnabled: pointer.MakePtr(cluster.TerminationProtectionEnabled),
587593
ConfigServerManagementMode: pointer.MakePtrOrNil(cluster.ConfigServerManagementMode),
588594
}
589595
}
590596

591597
func clusterUpdateToAtlas(cluster *Cluster) *admin.ClusterDescription20240805 {
598+
// Pause-only updates must not include any other field, or Atlas rejects
599+
// with HTTP 409 CANNOT_UPDATE_AND_PAUSE_CLUSTER.
600+
if cluster.pauseOnly {
601+
return &admin.ClusterDescription20240805{
602+
Paused: cluster.Paused,
603+
}
604+
}
592605
return &admin.ClusterDescription20240805{
593606
ClusterType: pointer.MakePtrOrNil(cluster.ClusterType),
594607
MongoDBMajorVersion: pointer.MakePtrOrNil(cluster.MongoDBMajorVersion),
@@ -602,7 +615,7 @@ func clusterUpdateToAtlas(cluster *Cluster) *admin.ClusterDescription20240805 {
602615
ReplicationSpecs: replicationSpecToAtlas(cluster.ReplicationSpecs, cluster.ClusterType, cluster.DiskSizeGB),
603616
RootCertType: pointer.MakePtrOrNil(cluster.RootCertType),
604617
Tags: tag.ToAtlas(cluster.Tags),
605-
TerminationProtectionEnabled: pointer.MakePtrOrNil(cluster.TerminationProtectionEnabled),
618+
TerminationProtectionEnabled: pointer.MakePtr(cluster.TerminationProtectionEnabled),
606619
ConfigServerManagementMode: pointer.MakePtrOrNil(cluster.ConfigServerManagementMode),
607620
}
608621
}
@@ -950,6 +963,22 @@ func replicationSpecToAtlas(replicationSpecs []*akov2.AdvancedReplicationSpec, c
950963
return &specs
951964
}
952965

966+
// int64PtrToIntPtr converts *int64 to *int while preserving nil-ness.
967+
// Use this instead of MakePtrOrNil(int(GetOrDefault(p, 0))) when the SDK
968+
// field is *int and an explicit zero is a meaningful user value that must
969+
// be sent over the wire (issue #3142).
970+
func int64PtrToIntPtr(p *int64) *int {
971+
if p == nil {
972+
return nil
973+
}
974+
v := int(*p)
975+
return &v
976+
}
977+
978+
// processArgsToAtlas builds the PATCH body. When adding or removing a field
979+
// here, mirror the change in deployment.ProcessArgsEqual — the comparator
980+
// must skip exactly the fields this function omits, or the reconciler loops
981+
// (issue #3142).
953982
func processArgsToAtlas(config *akov2.ProcessArgs) (*admin.ClusterDescriptionProcessArgs20240805, error) {
954983
var oplogMinRetentionHours *float64
955984
if config.OplogMinRetentionHours != "" {
@@ -966,9 +995,9 @@ func processArgsToAtlas(config *akov2.ProcessArgs) (*admin.ClusterDescriptionPro
966995
MinimumEnabledTlsProtocol: pointer.MakePtrOrNil(config.MinimumEnabledTLSProtocol),
967996
JavascriptEnabled: config.JavascriptEnabled,
968997
NoTableScan: config.NoTableScan,
969-
OplogSizeMB: pointer.MakePtrOrNil(int(pointer.GetOrDefault(config.OplogSizeMB, 0))),
970-
SampleSizeBIConnector: pointer.MakePtrOrNil(int(pointer.GetOrDefault(config.SampleSizeBIConnector, 0))),
971-
SampleRefreshIntervalBIConnector: pointer.MakePtrOrNil(int(pointer.GetOrDefault(config.SampleRefreshIntervalBIConnector, 0))),
998+
OplogSizeMB: int64PtrToIntPtr(config.OplogSizeMB),
999+
SampleSizeBIConnector: int64PtrToIntPtr(config.SampleSizeBIConnector),
1000+
SampleRefreshIntervalBIConnector: int64PtrToIntPtr(config.SampleRefreshIntervalBIConnector),
9721001
OplogMinRetentionHours: oplogMinRetentionHours,
9731002
}, nil
9741003
}

0 commit comments

Comments
 (0)