diff --git a/internal/controller/datadogagent/controller_experiment_integration_test.go b/internal/controller/datadogagent/controller_experiment_integration_test.go index e30ad5b499..f324df5fd2 100644 --- a/internal/controller/datadogagent/controller_experiment_integration_test.go +++ b/internal/controller/datadogagent/controller_experiment_integration_test.go @@ -56,6 +56,168 @@ func reconcileN(t *testing.T, r *Reconciler, ns, name string, n int) { } } +// Test_Experiment_RollbackPersistsInSingleReconcile verifies that the spec +// rollback and the terminal phase write both land in the same reconcile — +// i.e. the status update uses the post-rollback ResourceVersion rather than +// 409'ing and deferring the phase write. Regression guard: without +// ResourceVersion propagation, reconcile 2 could observe a post-rollback spec +// matching an annotated, freshly-timestamped revision and skip the timeout +// check, leaving phase=running. +func Test_Experiment_RollbackPersistsInSingleReconcile(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + const timeout = 50 * time.Millisecond + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, timeout) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{ + Phase: v2alpha1.ExperimentPhaseRunning, + ID: "exp-1", + } + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + time.Sleep(2 * timeout) + + // Exactly one reconcile — both spec rollback and phase=timeout must persist. + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec must be rolled back in the same reconcile") + assert.NotNil(t, dda.Status.Experiment) + assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase, "phase=timeout must persist in the same reconcile as the rollback") +} + +// Test_Experiment_RollbackPreservesConcurrentStatusWrite verifies that the +// targeted phase patch in restorePreviousSpec does not mutate the caller's +// instance.ResourceVersion. A downstream writer (simulated by mutating the +// stored status after the rollback has run but before updateStatusIfNeededV2 +// would fire) must retain 409 protection on the full-status replace. +// Regression guard: if patchExperimentPhase were to patch the caller's +// instance in-place, the client would decode the server response onto it and +// advance ResourceVersion, causing the downstream full-status update to use a +// fresh RV and silently overwrite any concurrent status write. +func Test_Experiment_RollbackPreservesConcurrentStatusWrite(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + // Fetch the instance as the reconcile would, and capture its RV. + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped, ID: "exp-1"} + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Snapshot what a reconcile loop would hold in memory at this point: + // the RV prior to invoking restorePreviousSpec. + snapshot := &v2alpha1.DatadogAgent{} + assert.NoError(t, r.client.Get(context.TODO(), nsName, snapshot)) + rvBefore := snapshot.ResourceVersion + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: snapshot.Status.Experiment.DeepCopy()} + revisions := mustListRevisions(t, r, snapshot) + assert.NoError(t, r.restorePreviousSpec(context.TODO(), snapshot, newStatus, revisions, v2alpha1.ExperimentPhaseRollback)) + + // The caller's in-memory instance RV must not have been advanced by the + // targeted status patch — otherwise a subsequent full-status write would + // skip its 409 protection. + assert.Equal(t, rvBefore, snapshot.ResourceVersion, "restorePreviousSpec must not mutate caller's ResourceVersion") + + // Phase must still have landed on the server via the targeted patch. + after := &v2alpha1.DatadogAgent{} + assert.NoError(t, r.client.Get(context.TODO(), nsName, after)) + assert.Equal(t, v2alpha1.ExperimentPhaseRollback, after.Status.Experiment.Phase) +} + +// Test_Experiment_RollbackDoesNotClobberConcurrentPhaseWrite verifies that +// when a concurrent writer (e.g. the RC daemon accepting a promotion) has +// moved `status.experiment.phase` between the reconcile's initial fetch and +// the restorePreviousSpec patch, the targeted JSON patch's `test` op fails +// and the concurrent phase is left intact — rather than silently overwritten +// by our stale terminal-phase decision. +func Test_Experiment_RollbackDoesNotClobberConcurrentPhaseWrite(t *testing.T) { + const ns, name = "default", "test-dda" + const uid = types.UID("uid-1") + nsName := types.NamespacedName{Namespace: ns, Name: name} + + r := newExperimentIntegrationReconciler(t, 0) + + dda := baseDDA(ns, name, uid) + createAndReconcile(t, r, dda) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Spec.Global.Site = ptr.To("datadoghq.eu") + assert.NoError(t, r.client.Update(context.TODO(), dda)) + reconcileN(t, r, ns, name, 1) + + assert.NoError(t, r.client.Get(context.TODO(), nsName, dda)) + dda.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped, ID: "exp-1"} + assert.NoError(t, r.client.Status().Update(context.TODO(), dda)) + + // Snapshot the observed state — as if a reconcile had just fetched. + snapshot := &v2alpha1.DatadogAgent{} + assert.NoError(t, r.client.Get(context.TODO(), nsName, snapshot)) + + // Simulate a concurrent writer moving the phase to promoted *after* the + // operator fetched its view but *before* restorePreviousSpec runs. + concurrent := &v2alpha1.DatadogAgent{} + assert.NoError(t, r.client.Get(context.TODO(), nsName, concurrent)) + concurrent.Status.Experiment.Phase = v2alpha1.ExperimentPhasePromoted + assert.NoError(t, r.client.Status().Update(context.TODO(), concurrent)) + + // Identify the experiment revision for later annotation assertions. + var experimentRevName string + maxRev := int64(0) + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Revision > maxRev { + maxRev = rev.Revision + experimentRevName = rev.Name + } + } + + newStatus := &v2alpha1.DatadogAgentStatus{Experiment: snapshot.Status.Experiment.DeepCopy()} + revisions := mustListRevisions(t, r, snapshot) + assert.NoError(t, r.restorePreviousSpec(context.TODO(), snapshot, newStatus, revisions, v2alpha1.ExperimentPhaseRollback)) + + // The targeted patch's test op must have rejected our stale decision — + // the concurrent writer's promoted phase survives. + after := &v2alpha1.DatadogAgent{} + assert.NoError(t, r.client.Get(context.TODO(), nsName, after)) + assert.Equal(t, v2alpha1.ExperimentPhasePromoted, after.Status.Experiment.Phase, + "concurrent phase write must not be overwritten by a stale targeted patch") + + // The rollback annotation must still have landed on the experiment + // revision. Annotation is unconditional — it is NOT gated on the phase + // patch succeeding — so even when the test op rejects our stale phase + // decision, the annotation protects a future re-apply of the same spec + // from the stale CreationTimestamp path. + var annotated bool + for _, rev := range listOwnedRevisions(t, r.client, ns, uid) { + if rev.Name == experimentRevName && rev.Annotations[annotationExperimentRollback] == "true" { + annotated = true + } + } + assert.True(t, annotated, + "experiment revision must be annotated regardless of phase-patch outcome") +} + // Test_Experiment_StoppedRollback verifies that when RC writes phase=stopped, // the operator restores the previous spec and sets phase=rollback. func Test_Experiment_StoppedRollback(t *testing.T) { diff --git a/internal/controller/datadogagent/experiment.go b/internal/controller/datadogagent/experiment.go index 4e7e581e17..c14ac34981 100644 --- a/internal/controller/datadogagent/experiment.go +++ b/internal/controller/datadogagent/experiment.go @@ -133,7 +133,7 @@ func (r *Reconciler) handleRollback( // stopped signal from RC: restore DDA spec and ack by setting phase to rollback. case phase == v2alpha1.ExperimentPhaseStopped: logger.Info("Experiment stopped, rolling back") - return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) + return r.restorePreviousSpec(ctx, instance, newStatus, revisions, v2alpha1.ExperimentPhaseRollback) case phase == v2alpha1.ExperimentPhaseRunning: rev := findMostRecentMatchingRevision(revisions, instance) if rev == nil && len(revisions) >= 2 { @@ -153,7 +153,7 @@ func (r *Reconciler) handleRollback( elapsed := now.Sub(rev.CreationTimestamp.Time) if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) { logger.Info("Experiment timed out, rolling back", "elapsed", elapsed.String()) - return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseTimeout) + return r.restorePreviousSpec(ctx, instance, newStatus, revisions, v2alpha1.ExperimentPhaseTimeout) } } } @@ -166,20 +166,38 @@ func (r *Reconciler) handleRollback( // experiment revision (the highest-numbered, non-rollback-target revision) with // the rollback annotation so its stale CreationTimestamp doesn't cause an // immediate timeout if the same spec is re-applied later. +// +// The terminal phase is also persisted via a targeted status subresource patch +// so the phase transition lands in the same reconcile as the spec rollback, +// without racing the full-status write in updateStatusIfNeededV2 (which could +// otherwise 409 on the ResourceVersion bumped by the spec Update, deferring +// the phase write to the next reconcile). func (r *Reconciler) restorePreviousSpec( ctx context.Context, - instanceMeta metav1.ObjectMeta, + instance *v2alpha1.DatadogAgent, newStatus *v2alpha1.DatadogAgentStatus, revisions []appsv1.ControllerRevision, terminalPhase v2alpha1.ExperimentPhase, ) error { rollbackTarget := findRollbackTarget(revisions) - if err := r.rollback(ctx, instanceMeta, rollbackTarget); err != nil { + // Capture the observed phase before mutating newStatus. This is the phase + // our rollback decision is based on; if a concurrent writer has since + // changed it, our decision is outdated and the patch must not apply. + observedPhase := instance.Status.Experiment.Phase + if err := r.rollback(ctx, instance.ObjectMeta, rollbackTarget); err != nil { return err } newStatus.Experiment.Phase = terminalPhase - // Mark the experiment revision (highest-numbered) so its stale timestamp - // doesn't cause an immediate timeout if the same spec is re-applied. + // Order matters: annotate the experiment revision BEFORE persisting the + // terminal phase. If the operator crashes between these two writes, the + // phase is still non-terminal on restart, handleRollback re-fires, and + // annotateRevision is idempotent — eventually both writes succeed. If the + // order were reversed, a crash between them would leave a terminal phase + // without the rollback annotation, and since handleRollback ignores + // terminal phases the revision would never get annotated, causing a + // future re-apply of the same spec to fire an immediate false timeout on + // the stale CreationTimestamp. + // // Only annotate the highest revision rather than all non-rollback-target // revisions: if GC failed on a prior reconcile there may be 3+ revisions, // and annotating old baselines would cause needless delete+recreate in @@ -187,9 +205,38 @@ func (r *Reconciler) restorePreviousSpec( if rev := highestRevision(revisions); rev != nil && rev.Name != rollbackTarget { r.annotateRevision(ctx, rev, annotationExperimentRollback) } + r.patchExperimentPhase(ctx, instance, observedPhase, terminalPhase) return nil } +// patchExperimentPhase writes the terminal experiment phase via a targeted +// status subresource JSON patch. This is narrower than updateStatusIfNeededV2's +// full-status replace: concurrent writes to other status fields (e.g. by the +// RC daemon) retain their own 409 protection on the full-status write, while +// the terminal phase is guaranteed to land in the same reconcile as the +// rollback. +// +// A `test` op guards the `replace`: the patch only applies if the server's +// current phase still matches the phase the rollback decision was based on. +// If another writer (e.g. the RC daemon promoting the experiment) has moved +// the phase in between, the test op fails, the patch is rejected, and we +// leave their decision untouched — mirroring the 409-and-retry semantics of +// a full-status Update without the RV plumbing. +// +// The patch is applied to a DeepCopy so the client's response decoding does +// not advance instance.ResourceVersion; the subsequent full-status write +// retains its own 409 protection for every other status field. +// +// Best-effort: any error here (test failure or transient) is logged at V(1) +// and the next reconcile will re-compute and retry. +func (r *Reconciler) patchExperimentPhase(ctx context.Context, instance *v2alpha1.DatadogAgent, expectedCurrent, newPhase v2alpha1.ExperimentPhase) { + patch := fmt.Appendf(nil, `[{"op":"test","path":"/status/experiment/phase","value":%q},{"op":"replace","path":"/status/experiment/phase","value":%q}]`, expectedCurrent, newPhase) + scratch := instance.DeepCopy() + if err := r.client.Status().Patch(ctx, scratch, client.RawPatch(types.JSONPatchType, patch)); err != nil { + ctrl.LoggerFrom(ctx).V(1).Info("Experiment phase patch skipped (concurrent write or transient error), will retry on next reconcile", "error", err.Error(), "expectedCurrent", expectedCurrent, "new", newPhase) + } +} + // rollback restores the DDA spec from the named ControllerRevision. func (r *Reconciler) rollback( ctx context.Context, diff --git a/internal/controller/datadogagent/experiment_test.go b/internal/controller/datadogagent/experiment_test.go index 541d8ee12f..ae8314bb31 100644 --- a/internal/controller/datadogagent/experiment_test.go +++ b/internal/controller/datadogagent/experiment_test.go @@ -196,19 +196,20 @@ func TestRestorePreviousSpec_PhaseSetOnlyOnSuccess(t *testing.T) { instanceB := newRevisionTestOwner("test-dda", "default") instanceB.Spec = v2alpha1.DatadogAgentSpec{Global: &v2alpha1.GlobalConfig{}} + instanceB.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} require.NoError(t, r.manageRevision(context.Background(), instanceB, mustListRevisions(t, r, instanceB), nil)) newStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped}} // rollback requires the DDA to exist in the fake client; don't create it so it errors. - err := r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) + err := r.restorePreviousSpec(context.Background(), instanceB, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) require.Error(t, err) // Phase must NOT have been set since rollback failed. assert.Equal(t, v2alpha1.ExperimentPhaseStopped, newStatus.Experiment.Phase) // Now create the DDA so rollback can succeed. require.NoError(t, c.Create(context.Background(), instanceB)) - err = r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) + err = r.restorePreviousSpec(context.Background(), instanceB, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback) require.NoError(t, err) assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase) } @@ -421,7 +422,7 @@ func TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest(t *testing.T) { // Trigger rollback via phase=stopped. instanceC.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped} newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceC.Status.Experiment.DeepCopy()} - require.NoError(t, r.restorePreviousSpec(context.Background(), instanceC.ObjectMeta, newStatus, revList, v2alpha1.ExperimentPhaseRollback)) + require.NoError(t, r.restorePreviousSpec(context.Background(), instanceC, newStatus, revList, v2alpha1.ExperimentPhaseRollback)) assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase) // Verify: only rev3 (experiment, highest) is annotated.