Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,46 @@ func reconcileN(t *testing.T, r *Reconciler, ns, name string, n int) {
}
}

// Test_Experiment_RollbackPersistsInSingleReconcile verifies that the spec
// rollback and the terminal phase write both land in the same reconcile —
// i.e. the status update uses the post-rollback ResourceVersion rather than
// 409'ing and deferring the phase write. Regression guard: without
// ResourceVersion propagation, reconcile 2 could observe a post-rollback spec
// matching an annotated, freshly-timestamped revision and skip the timeout
// check, leaving phase=running.
func Test_Experiment_RollbackPersistsInSingleReconcile(t *testing.T) {
const ns, name = "default", "test-dda"
const uid = types.UID("uid-1")
const timeout = 50 * time.Millisecond
nsName := types.NamespacedName{Namespace: ns, Name: name}

r := newExperimentIntegrationReconciler(t, timeout)

dda := baseDDA(ns, name, uid)
createAndReconcile(t, r, dda)

assert.NoError(t, r.client.Get(context.TODO(), nsName, dda))
dda.Spec.Global.Site = ptr.To("datadoghq.eu")
assert.NoError(t, r.client.Update(context.TODO(), dda))
reconcileN(t, r, ns, name, 1)

assert.NoError(t, r.client.Get(context.TODO(), nsName, dda))
dda.Status.Experiment = &v2alpha1.ExperimentStatus{
Phase: v2alpha1.ExperimentPhaseRunning,
ID: "exp-1",
}
assert.NoError(t, r.client.Status().Update(context.TODO(), dda))
time.Sleep(2 * timeout)

// Exactly one reconcile — both spec rollback and phase=timeout must persist.
reconcileN(t, r, ns, name, 1)

assert.NoError(t, r.client.Get(context.TODO(), nsName, dda))
assert.Equal(t, "datadoghq.com", *dda.Spec.Global.Site, "spec must be rolled back in the same reconcile")
assert.NotNil(t, dda.Status.Experiment)
assert.Equal(t, v2alpha1.ExperimentPhaseTimeout, dda.Status.Experiment.Phase, "phase=timeout must persist in the same reconcile as the rollback")
}

// Test_Experiment_StoppedRollback verifies that when RC writes phase=stopped,
// the operator restores the previous spec and sets phase=rollback.
func Test_Experiment_StoppedRollback(t *testing.T) {
Expand Down
29 changes: 25 additions & 4 deletions internal/controller/datadogagent/experiment.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,7 @@ func (r *Reconciler) handleRollback(
// stopped signal from RC: restore DDA spec and ack by setting phase to rollback.
case phase == v2alpha1.ExperimentPhaseStopped:
logger.Info("Experiment stopped, rolling back")
return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseRollback)
return r.restorePreviousSpec(ctx, instance, newStatus, revisions, v2alpha1.ExperimentPhaseRollback)
case phase == v2alpha1.ExperimentPhaseRunning:
rev := findMostRecentMatchingRevision(revisions, instance)
if rev == nil && len(revisions) >= 2 {
Expand All @@ -153,7 +153,7 @@ func (r *Reconciler) handleRollback(
elapsed := now.Sub(rev.CreationTimestamp.Time)
if elapsed >= getExperimentTimeout(r.options.ExperimentTimeout) {
logger.Info("Experiment timed out, rolling back", "elapsed", elapsed.String())
return r.restorePreviousSpec(ctx, instance.ObjectMeta, newStatus, revisions, v2alpha1.ExperimentPhaseTimeout)
return r.restorePreviousSpec(ctx, instance, newStatus, revisions, v2alpha1.ExperimentPhaseTimeout)
}
}
}
Expand All @@ -166,18 +166,25 @@ func (r *Reconciler) handleRollback(
// experiment revision (the highest-numbered, non-rollback-target revision) with
// the rollback annotation so its stale CreationTimestamp doesn't cause an
// immediate timeout if the same spec is re-applied later.
//
// The terminal phase is also persisted via a targeted status subresource patch
// so the phase transition lands in the same reconcile as the spec rollback,
// without racing the full-status write in updateStatusIfNeededV2 (which could
// otherwise 409 on the ResourceVersion bumped by the spec Update, deferring
// the phase write to the next reconcile).
func (r *Reconciler) restorePreviousSpec(
ctx context.Context,
instanceMeta metav1.ObjectMeta,
instance *v2alpha1.DatadogAgent,
newStatus *v2alpha1.DatadogAgentStatus,
revisions []appsv1.ControllerRevision,
terminalPhase v2alpha1.ExperimentPhase,
) error {
rollbackTarget := findRollbackTarget(revisions)
if err := r.rollback(ctx, instanceMeta, rollbackTarget); err != nil {
if err := r.rollback(ctx, instance.ObjectMeta, rollbackTarget); err != nil {
return err
}
newStatus.Experiment.Phase = terminalPhase
r.patchExperimentPhase(ctx, instance, terminalPhase)
// Mark the experiment revision (highest-numbered) so its stale timestamp
// doesn't cause an immediate timeout if the same spec is re-applied.
// Only annotate the highest revision rather than all non-rollback-target
Expand All @@ -190,6 +197,20 @@ func (r *Reconciler) restorePreviousSpec(
return nil
}

// patchExperimentPhase writes the terminal experiment phase via a targeted
// status subresource merge patch. This is narrower than updateStatusIfNeededV2's
// full-status replace: concurrent writes to other status fields (e.g. by the
// RC daemon) retain their own 409 protection on the full-status write, while
// the terminal phase is guaranteed to land in the same reconcile as the
// rollback. Best-effort: a transient failure here just means the next
// reconcile will re-compute and retry.
func (r *Reconciler) patchExperimentPhase(ctx context.Context, instance *v2alpha1.DatadogAgent, phase v2alpha1.ExperimentPhase) {
patch := fmt.Appendf(nil, `{"status":{"experiment":{"phase":%q}}}`, phase)
if err := r.client.Status().Patch(ctx, instance, client.RawPatch(types.MergePatchType, patch)); err != nil {
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Avoid refreshing the status RV before the full status update

When this raw status patch succeeds it decodes the server response back into instance, advancing instance.ResourceVersion even though newStatus was computed from the object fetched at the start of reconcile. If Fleet Automation/RC updates any status field (for example a new experiment ID or phase) between that initial fetch and this patch, the later updateStatusIfNeededV2 full Status().Update will use the patched RV and replace status with stale newStatus instead of 409'ing as before, clobbering the concurrent status write. Patch into a scratch object or refetch/merge before the full status update so other status fields keep their conflict protection.

Useful? React with 👍 / 👎.

ctrl.LoggerFrom(ctx).Error(err, "Failed to patch experiment phase, will retry on next reconcile", "phase", phase)
}
}

// rollback restores the DDA spec from the named ControllerRevision.
func (r *Reconciler) rollback(
ctx context.Context,
Expand Down
6 changes: 3 additions & 3 deletions internal/controller/datadogagent/experiment_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,14 +201,14 @@ func TestRestorePreviousSpec_PhaseSetOnlyOnSuccess(t *testing.T) {
newStatus := &v2alpha1.DatadogAgentStatus{Experiment: &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped}}

// rollback requires the DDA to exist in the fake client; don't create it so it errors.
err := r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback)
err := r.restorePreviousSpec(context.Background(), instanceB, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback)
require.Error(t, err)
// Phase must NOT have been set since rollback failed.
assert.Equal(t, v2alpha1.ExperimentPhaseStopped, newStatus.Experiment.Phase)

// Now create the DDA so rollback can succeed.
require.NoError(t, c.Create(context.Background(), instanceB))
err = r.restorePreviousSpec(context.Background(), instanceB.ObjectMeta, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback)
err = r.restorePreviousSpec(context.Background(), instanceB, newStatus, mustListRevisions(t, r, instanceB), v2alpha1.ExperimentPhaseRollback)
require.NoError(t, err)
assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase)
}
Expand Down Expand Up @@ -421,7 +421,7 @@ func TestRestorePreviousSpec_ThreeRevisions_AnnotatesOnlyHighest(t *testing.T) {
// Trigger rollback via phase=stopped.
instanceC.Status.Experiment = &v2alpha1.ExperimentStatus{Phase: v2alpha1.ExperimentPhaseStopped}
newStatus := &v2alpha1.DatadogAgentStatus{Experiment: instanceC.Status.Experiment.DeepCopy()}
require.NoError(t, r.restorePreviousSpec(context.Background(), instanceC.ObjectMeta, newStatus, revList, v2alpha1.ExperimentPhaseRollback))
require.NoError(t, r.restorePreviousSpec(context.Background(), instanceC, newStatus, revList, v2alpha1.ExperimentPhaseRollback))
assert.Equal(t, v2alpha1.ExperimentPhaseRollback, newStatus.Experiment.Phase)

// Verify: only rev3 (experiment, highest) is annotated.
Expand Down
Loading