Skip to content

Commit dee0e05

Browse files
committed
Patch MCPServer spec instead of Update
Fixes #4767. The controller writes finalizers, finalizer removal, and the restart-processed annotation via r.Update. Update is a full PUT, so any spec field the operator does not track — most importantly spec.authzConfig, which an external operator will soon own via server-side apply — is zeroed on every reconcile. Replace the three Update call sites with an optimistic-lock merge patch. The merge-patch body carries only fields the caller changed, so untouched fields never hit the wire and cannot be clobbered. MergeFromWithOptimisticLock sends resourceVersion as a precondition, giving 409-on-collision semantics for concurrent writers and defending metadata.finalizers (which has no array-merge semantics under merge-patch) against wholesale replacement when another controller is mid-flight adding its own entry. Tests: - Envtest suite writes spec.authzConfig out-of-band and asserts it survives both the finalizer-add reconcile and the restart-annotation reconcile. - Unit suite uses a patch-recording client to assert each migrated call site emits a body carrying the resourceVersion precondition — a deterministic wire-level signal that MergeFromWithOptimisticLock is in effect. A regression to plain MergeFrom would drop the precondition and fail the assertion independent of the higher-level survival test. Also: - .claude/rules/operator.md: new "Spec / metadata patching" section documenting the pattern for future CR writes. Status patching is a separate follow-up (#4633). - Rename the mock-client flag failOnMCPServerUpdate → failOnMCPServerWrite; it now intercepts both Update and Patch on MCPServer, so the name matches reality.
1 parent cfd835c commit dee0e05

5 files changed

Lines changed: 465 additions & 17 deletions

File tree

.claude/rules/operator.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,3 +55,36 @@ task operator-test # Run unit tests
5555
task operator-e2e-test # Run e2e tests
5656
task crdref-gen # Generate CRD API docs (run from cmd/thv-operator/)
5757
```
58+
59+
## Spec / metadata patching
60+
61+
Never use `r.Update` on a CR spec or metadata: `Update` is a full PUT,
62+
so any field our local copy does not track (e.g. `spec.authzConfig`
63+
written by an external operator via server-side apply) gets zeroed on
64+
every reconcile.
65+
66+
Use an **optimistic-lock merge patch** instead. The merge-patch body
67+
only contains fields the caller changed, and `MergeFromWithOptimisticLock`
68+
sends `resourceVersion` as a precondition so concurrent writers get a
69+
409 and a clean requeue. This also defends `metadata.finalizers`:
70+
merge-patch has no array-merge semantics, so without the lock a write
71+
including the `finalizers` array would replace it wholesale.
72+
73+
```go
74+
original := mcpServer.DeepCopy()
75+
controllerutil.AddFinalizer(mcpServer, "mcpserver.toolhive.stacklok.dev/finalizer")
76+
if err := r.Patch(ctx, mcpServer,
77+
client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})); err != nil {
78+
return ctrl.Result{}, err
79+
}
80+
```
81+
82+
Do not put unrelated work between the `DeepCopy` and the `Patch`: the
83+
wire diff is computed from that snapshot, so anything you mutate in
84+
between leaks into the patch body.
85+
86+
Expect 409s as routine log noise once external SSA writers land — the
87+
guard doing its job, not a bug.
88+
89+
Status-subresource patching follows the same "never `Update`" rule and
90+
is covered separately once the shared helper lands (#4633).

cmd/thv-operator/controllers/mcpserver_controller.go

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -187,9 +187,14 @@ func (r *MCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
187187
return ctrl.Result{}, err
188188
}
189189

190+
// Use an optimistic-lock merge patch rather than Update so we do not
191+
// silently overwrite spec fields (e.g. spec.authzConfig) that are owned
192+
// by other actors via server-side apply. The resourceVersion is sent,
193+
// so concurrent writers cause a 409 Conflict and a clean requeue.
194+
original := mcpServer.DeepCopy()
190195
controllerutil.RemoveFinalizer(mcpServer, "mcpserver.toolhive.stacklok.dev/finalizer")
191-
err := r.Update(ctx, mcpServer)
192-
if err != nil {
196+
if err := r.Patch(ctx, mcpServer,
197+
client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})); err != nil {
193198
return ctrl.Result{}, err
194199
}
195200
}
@@ -198,9 +203,14 @@ func (r *MCPServerReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
198203

199204
// Add finalizer for this CR
200205
if !controllerutil.ContainsFinalizer(mcpServer, "mcpserver.toolhive.stacklok.dev/finalizer") {
206+
// Use an optimistic-lock merge patch rather than Update so we do not
207+
// silently overwrite spec fields (e.g. spec.authzConfig) that are owned
208+
// by other actors via server-side apply. The resourceVersion is sent,
209+
// so concurrent writers cause a 409 Conflict and a clean requeue.
210+
original := mcpServer.DeepCopy()
201211
controllerutil.AddFinalizer(mcpServer, "mcpserver.toolhive.stacklok.dev/finalizer")
202-
err = r.Update(ctx, mcpServer)
203-
if err != nil {
212+
if err := r.Patch(ctx, mcpServer,
213+
client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})); err != nil {
204214
return ctrl.Result{}, err
205215
}
206216
}
@@ -745,13 +755,18 @@ func (r *MCPServerReconciler) handleRestartAnnotation(ctx context.Context, mcpSe
745755
return false, fmt.Errorf("failed to perform restart: %w", err)
746756
}
747757

748-
// Update the last processed restart timestamp in annotations
758+
// Update the last processed restart timestamp in annotations.
759+
// Use an optimistic-lock merge patch rather than Update so we do not
760+
// silently overwrite spec fields (e.g. spec.authzConfig) that are owned
761+
// by other actors via server-side apply. The resourceVersion is sent,
762+
// so concurrent writers cause a 409 Conflict and a clean requeue.
763+
original := mcpServer.DeepCopy()
749764
if mcpServer.Annotations == nil {
750765
mcpServer.Annotations = make(map[string]string)
751766
}
752767
mcpServer.Annotations[LastProcessedRestartAnnotationKey] = currentRestartedAt
753-
err = r.Update(ctx, mcpServer)
754-
if err != nil {
768+
if err := r.Patch(ctx, mcpServer,
769+
client.MergeFromWithOptions(original, client.MergeFromWithOptimisticLock{})); err != nil {
755770
return false, fmt.Errorf("failed to update MCPServer with last processed restart annotation: %w", err)
756771
}
757772

cmd/thv-operator/controllers/mcpserver_restart_test.go

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -564,10 +564,10 @@ func TestHandleRestartAnnotation_ErrorPaths(t *testing.T) {
564564
testCtx.createDeployment()
565565
testCtx.setRestartAnnotation("2023-01-01T12:00:00Z", "rolling")
566566

567-
// Mock a client that fails only on MCPServer Update operations
567+
// Mock a client that fails only on MCPServer write operations
568568
mockClient := &mockFailingClient{
569-
Client: testCtx.client,
570-
failOnMCPServerUpdate: true,
569+
Client: testCtx.client,
570+
failOnMCPServerWrite: true,
571571
}
572572
testCtx.reconciler.Client = mockClient
573573

@@ -698,14 +698,20 @@ func TestReconcile_HandleRestartAnnotation_ErrorPaths(t *testing.T) {
698698
})
699699
}
700700

701-
// mockFailingClient is a test helper that wraps a real client and can be configured to fail on specific operations
701+
// mockFailingClient is a test helper that wraps a real client and can be configured to fail on specific operations.
702+
//
703+
// failOnMCPServerWrite triggers a mock error on any write (Update or Patch)
704+
// whose target is a *mcpv1alpha1.MCPServer. "Write" is used because the
705+
// #4767 migration replaced MCPServer spec Updates with optimistic-lock
706+
// Patches, so a single flag covers both code paths that can mutate the
707+
// resource.
702708
type mockFailingClient struct {
703709
client.Client
704-
failOnGet bool
705-
failOnList bool
706-
failOnUpdate bool
707-
failOnDelete bool
708-
failOnMCPServerUpdate bool
710+
failOnGet bool
711+
failOnList bool
712+
failOnUpdate bool
713+
failOnDelete bool
714+
failOnMCPServerWrite bool
709715
}
710716

711717
func (m *mockFailingClient) Get(ctx context.Context, key client.ObjectKey, obj client.Object, opts ...client.GetOption) error {
@@ -726,7 +732,7 @@ func (m *mockFailingClient) Update(ctx context.Context, obj client.Object, opts
726732
if m.failOnUpdate {
727733
return fmt.Errorf("mock error: Update operation failed")
728734
}
729-
if m.failOnMCPServerUpdate {
735+
if m.failOnMCPServerWrite {
730736
// Check if the object being updated is an MCPServer
731737
if _, isMCPServer := obj.(*mcpv1alpha1.MCPServer); isMCPServer {
732738
return fmt.Errorf("mock error: MCPServer Update operation failed")
@@ -735,6 +741,17 @@ func (m *mockFailingClient) Update(ctx context.Context, obj client.Object, opts
735741
return m.Client.Update(ctx, obj, opts...)
736742
}
737743

744+
func (m *mockFailingClient) Patch(
745+
ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption,
746+
) error {
747+
if m.failOnMCPServerWrite {
748+
if _, isMCPServer := obj.(*mcpv1alpha1.MCPServer); isMCPServer {
749+
return fmt.Errorf("mock error: MCPServer Patch operation failed")
750+
}
751+
}
752+
return m.Client.Patch(ctx, obj, patch, opts...)
753+
}
754+
738755
func (m *mockFailingClient) Delete(ctx context.Context, obj client.Object, opts ...client.DeleteOption) error {
739756
if m.failOnDelete {
740757
return fmt.Errorf("mock error: Delete operation failed")
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// SPDX-FileCopyrightText: Copyright 2025 Stacklok, Inc.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
package controllers
5+
6+
import (
7+
"context"
8+
"strings"
9+
"sync"
10+
"testing"
11+
12+
"github.com/stretchr/testify/assert"
13+
"github.com/stretchr/testify/require"
14+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
15+
"k8s.io/apimachinery/pkg/types"
16+
ctrl "sigs.k8s.io/controller-runtime"
17+
"sigs.k8s.io/controller-runtime/pkg/client"
18+
"sigs.k8s.io/controller-runtime/pkg/client/fake"
19+
20+
mcpv1alpha1 "github.com/stacklok/toolhive/cmd/thv-operator/api/v1alpha1"
21+
"github.com/stacklok/toolhive/pkg/container/kubernetes"
22+
)
23+
24+
// patchRecordingClient wraps a client.Client and records the marshaled body
25+
// of every Patch call. Tests use it to assert the wire-level flavor of a
26+
// patch — in particular, an optimistic-lock merge patch stamps the
27+
// resourceVersion into the body, so its presence in the recorded body is a
28+
// deterministic signal that MergeFromWithOptimisticLock was in effect.
29+
//
30+
// Patches issued via .Status().Patch do not pass through this wrapper:
31+
// controller-runtime's subresource client is obtained from the embedded
32+
// client.Client and has its own Patch implementation, so the recorder only
33+
// observes spec/metadata patches on the root client.
34+
type patchRecordingClient struct {
35+
client.Client
36+
mu sync.Mutex
37+
patches []recordedPatch
38+
}
39+
40+
type recordedPatch struct {
41+
obj client.Object
42+
body string
43+
}
44+
45+
func (c *patchRecordingClient) Patch(
46+
ctx context.Context, obj client.Object, patch client.Patch, opts ...client.PatchOption,
47+
) error {
48+
if data, err := patch.Data(obj); err == nil {
49+
c.mu.Lock()
50+
c.patches = append(c.patches, recordedPatch{
51+
obj: obj.DeepCopyObject().(client.Object),
52+
body: string(data),
53+
})
54+
c.mu.Unlock()
55+
}
56+
return c.Client.Patch(ctx, obj, patch, opts...)
57+
}
58+
59+
// lastMCPServerPatchBody returns the body of the most recent recorded
60+
// Patch call whose target was an *mcpv1alpha1.MCPServer. Returns empty
61+
// string if none was recorded.
62+
func (c *patchRecordingClient) lastMCPServerPatchBody() string {
63+
c.mu.Lock()
64+
defer c.mu.Unlock()
65+
for i := len(c.patches) - 1; i >= 0; i-- {
66+
if _, ok := c.patches[i].obj.(*mcpv1alpha1.MCPServer); ok {
67+
return c.patches[i].body
68+
}
69+
}
70+
return ""
71+
}
72+
73+
// TestMCPServerSpecPatchesAreOptimisticLock asserts that each of the three
74+
// MCPServer spec Patch call sites introduced in #4767 emits a merge-patch
75+
// whose body carries the resourceVersion precondition. A regression from
76+
// client.MergeFromWithOptions(orig, client.MergeFromWithOptimisticLock{})
77+
// to plain client.MergeFrom(orig) would drop the precondition and fail
78+
// these assertions, independent of whether the higher-level field-
79+
// clobber survival test still passes.
80+
func TestMCPServerSpecPatchesAreOptimisticLock(t *testing.T) {
81+
t.Parallel()
82+
83+
const (
84+
namespace = "default"
85+
finalizerName = "mcpserver.toolhive.stacklok.dev/finalizer"
86+
)
87+
88+
tests := []struct {
89+
name string
90+
// seed returns the MCPServer fixture placed in the fake client
91+
// before the action runs. Returning a distinct name per case
92+
// keeps parallel subtests from colliding on the shared fake.
93+
seed func() *mcpv1alpha1.MCPServer
94+
// action triggers the reconcile path that should emit the
95+
// optimistic-lock Patch under test. It is invoked with a
96+
// recorder-backed reconciler.
97+
action func(t *testing.T, r *MCPServerReconciler, key types.NamespacedName)
98+
}{
99+
{
100+
name: "AddFinalizer",
101+
seed: func() *mcpv1alpha1.MCPServer {
102+
s := createTestMCPServer("optlock-add", namespace)
103+
// No finalizer yet — Reconcile should add it.
104+
return s
105+
},
106+
action: func(t *testing.T, r *MCPServerReconciler, key types.NamespacedName) {
107+
t.Helper()
108+
_, _ = r.Reconcile(context.TODO(), ctrl.Request{NamespacedName: key})
109+
},
110+
},
111+
{
112+
name: "RemoveFinalizer",
113+
seed: func() *mcpv1alpha1.MCPServer {
114+
s := createTestMCPServer("optlock-remove", namespace)
115+
s.Finalizers = []string{finalizerName}
116+
// DeletionTimestamp forces Reconcile into the
117+
// finalize branch. The fake client accepts an
118+
// already-set timestamp on created objects.
119+
now := metav1.Now()
120+
s.DeletionTimestamp = &now
121+
return s
122+
},
123+
action: func(t *testing.T, r *MCPServerReconciler, key types.NamespacedName) {
124+
t.Helper()
125+
_, _ = r.Reconcile(context.TODO(), ctrl.Request{NamespacedName: key})
126+
},
127+
},
128+
{
129+
name: "RestartAnnotation",
130+
seed: func() *mcpv1alpha1.MCPServer {
131+
s := createTestMCPServer("optlock-restart", namespace)
132+
s.Finalizers = []string{finalizerName}
133+
if s.Annotations == nil {
134+
s.Annotations = map[string]string{}
135+
}
136+
s.Annotations[RestartedAtAnnotationKey] = "2026-01-01T00:00:00Z"
137+
s.Annotations[RestartStrategyAnnotationKey] = "immediate"
138+
return s
139+
},
140+
action: func(t *testing.T, r *MCPServerReconciler, key types.NamespacedName) {
141+
t.Helper()
142+
got := &mcpv1alpha1.MCPServer{}
143+
require.NoError(t, r.Get(context.TODO(), key, got))
144+
// handleRestartAnnotation is the innermost
145+
// function that issues the Patch under test;
146+
// calling it directly avoids exercising the
147+
// rest of Reconcile, which would issue many
148+
// unrelated writes.
149+
_, _ = r.handleRestartAnnotation(context.TODO(), got)
150+
},
151+
},
152+
}
153+
154+
for _, tc := range tests {
155+
t.Run(tc.name, func(t *testing.T) {
156+
t.Parallel()
157+
158+
seeded := tc.seed()
159+
testScheme := createTestScheme()
160+
fakeClient := fake.NewClientBuilder().
161+
WithScheme(testScheme).
162+
WithObjects(seeded).
163+
WithStatusSubresource(&mcpv1alpha1.MCPServer{}).
164+
Build()
165+
recorder := &patchRecordingClient{Client: fakeClient}
166+
reconciler := newTestMCPServerReconciler(
167+
recorder, testScheme, kubernetes.PlatformKubernetes)
168+
169+
tc.action(t, reconciler, types.NamespacedName{
170+
Name: seeded.Name,
171+
Namespace: namespace,
172+
})
173+
174+
body := recorder.lastMCPServerPatchBody()
175+
require.NotEmpty(t, body,
176+
"no MCPServer Patch was recorded; the reconcile path did not emit the expected write")
177+
assert.True(t,
178+
strings.Contains(body, `"resourceVersion"`),
179+
"MCPServer spec patch body did not include a resourceVersion precondition; "+
180+
"MergeFromWithOptimisticLock regression? body=%s", body)
181+
})
182+
}
183+
}

0 commit comments

Comments
 (0)