fix: prevent reconcile race from overriding Succeeded AgentRuns as Failed

AlexsJones · claude · AlexsJones · commit d681a3359f1d · 2026-04-05T09:03:44.000+01:00
Resolves a race in the AgentRun controller where a run that had just
been committed as phase=Succeeded could be flipped to phase=Failed
with error="Job not found" by a stale reconcile that fired immediately
after the Job was deleted (Jobs are deleted post-success to kill
lingering sidecars).

User-visible impact pre-fix: runs that actually succeeded — tokens
counted, result markers parsed, response surfaced — would sometimes
show phase=Failed in the UI with error="Job not found", despite being
functionally successful. Timing-dependent and confusing to users.

Root cause: the existing guard re-read the AgentRun via r.Client (the
watch-based cached client) to check whether the phase had already
been transitioned. When the concurrent reconcile fired immediately
after succeedRun committed the status, the watch cache hadn't yet
observed the status update, so the guard saw phase=Running and fell
through to failRun("Job not found") — clobbering the Succeeded status.

Fix:
  - Add APIReader client.Reader field to AgentRunReconciler, wired
    via mgr.GetAPIReader() in cmd/controller/main.go. APIReader
    bypasses the cache and reads directly from the apiserver.
  - The "Job not found" guard now reads fresh status via APIReader
    before deciding to fail the run. If the live status shows the run
    is already terminal (Succeeded or Failed) or has moved on to the
    non-terminal PostRunning phase, we don't override it.
  - Widened the guard to also preserve Failed phase, so a more-specific
    error (e.g. "agent container OOMKilled") isn't clobbered with the
    generic "Job not found".
  - Nil-safe fallback to r.Client keeps every existing test-file
    construction of &AgentRunReconciler{} working unchanged.

Adds 4 targeted unit tests in internal/controller/agentrun_race_test.go:
  - DoesNotOverrideSucceeded: the exact regression
  - DoesNotOverrideFailed: preserves specific error messages
  - RequeuesForPostRunning: non-terminal PostRunning still requeues
  - FailsWhenReallyRunning: genuine stuck-Running still fails

Adds a new Cypress spec (adhoc-lmstudio-deterministic-answer.cy.ts)
that exposed this race: creates an ad-hoc LM Studio + qwen3.5-9b
instance with k8s-ops, dispatches "How many namespaces are there?"
via the UI, and strictly asserts phase=Succeeded + substantive
response text (mentions "namespace", contains a digit, >60 chars,
not a bare preamble). This was the first UI test doing a strict
fresh-read phase assertion after a terminal transition and is what
caught the bug.

Supporting changes:
  - createLMStudioInstance Cypress helper now accepts an optional
    skills list (needed so the agent can actually execute kubectl).
  - schedule-create-via-ui.cy.ts: should("be.visible") →
    should("exist") for table-cell assertions that CSS-clip when
    many test rows are present.

Verified regression-free:
  - go vet ./... clean
  - go test ./... all green (8 packages, including 4 new tests)
  - test-api-smoke.sh all green (9 endpoints)
  - test-lmstudio-response-regression.sh all green (3 scenarios)
  - Cypress: 19/19 specs, 24/24 tests green against sympozium serve

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
diff --git a/cmd/controller/main.go b/cmd/controller/main.go
@@ -119,6 +119,7 @@ func main() {
 
 	agentRunReconciler := &controller.AgentRunReconciler{
 		Client:          mgr.GetClient(),
+		APIReader:       mgr.GetAPIReader(),
 		Scheme:          mgr.GetScheme(),
 		Log:             ctrl.Log.WithName("controllers").WithName("AgentRun"),
 		PodBuilder:      podBuilder,
diff --git a/internal/controller/agentrun_controller.go b/internal/controller/agentrun_controller.go
@@ -70,6 +70,10 @@ const DefaultRunHistoryLimit = 50
 // It watches AgentRun CRDs and reconciles them into Kubernetes Jobs/Pods.
 type AgentRunReconciler struct {
 	client.Client
+	// APIReader bypasses the controller cache for reads — needed when we
+	// must see status mutations committed by a concurrent reconcile that
+	// the watch-based cache may not yet have observed.
+	APIReader       client.Reader
 	Scheme          *runtime.Scheme
 	Log             logr.Logger
 	PodBuilder      *orchestrator.PodBuilder
@@ -492,15 +496,26 @@ func (r *AgentRunReconciler) reconcileRunning(ctx context.Context, log logr.Logg
 	}
 	if err := r.Get(ctx, jobName, job); err != nil {
 		if errors.IsNotFound(err) {
-			// Guard against the race where the Job was already deleted (to kill
-			// sidecars) and a concurrent reconcile of startPostRun has already
-			// transitioned the phase to PostRunning or Succeeded. Do a fresh Get
-			// from the API server (not the cache) to check the actual phase before
-			// deciding to fail the run.
+			// Guard against the race where the Job was already deleted (to
+			// kill sidecars) and a concurrent reconcile has already
+			// transitioned the phase to a terminal state. Read with the
+			// non-cached APIReader (the watch cache may not have the
+			// status update yet) — if the run is already terminal, don't
+			// override it with "Job not found".
 			fresh := &sympoziumv1alpha1.AgentRun{}
-			if getErr := r.Get(ctx, client.ObjectKeyFromObject(agentRun), fresh); getErr == nil {
-				if fresh.Status.Phase == sympoziumv1alpha1.AgentRunPhasePostRunning ||
-					fresh.Status.Phase == sympoziumv1alpha1.AgentRunPhaseSucceeded {
+			reader := client.Reader(r.APIReader)
+			if reader == nil {
+				reader = r.Client
+			}
+			if getErr := reader.Get(ctx, client.ObjectKeyFromObject(agentRun), fresh); getErr == nil {
+				switch fresh.Status.Phase {
+				case sympoziumv1alpha1.AgentRunPhaseSucceeded,
+					sympoziumv1alpha1.AgentRunPhaseFailed:
+					// Already terminal — don't override.
+					return ctrl.Result{}, nil
+				case sympoziumv1alpha1.AgentRunPhasePostRunning:
+					// PostRun container is still executing — let the
+					// PostRunning reconcile path handle it.
 					return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
 				}
 			}
diff --git a/internal/controller/agentrun_race_test.go b/internal/controller/agentrun_race_test.go
@@ -0,0 +1,189 @@
+package controller
+
+import (
+	"context"
+	"testing"
+
+	"github.com/go-logr/logr"
+	batchv1 "k8s.io/api/batch/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	sympoziumv1alpha1 "github.com/sympozium-ai/sympozium/api/v1alpha1"
+)
+
+// newAgentRunTestReconciler builds an AgentRunReconciler whose Client and
+// APIReader are the same fake client seeded with objs, so a cached and an
+// uncached read always agree here (no stale-cache window is simulated).
+func newAgentRunTestReconciler(t *testing.T, objs ...client.Object) *AgentRunReconciler {
+	t.Helper()
+
+	scheme := runtime.NewScheme()
+	_ = corev1.AddToScheme(scheme)  // registration error deliberately discarded
+	_ = batchv1.AddToScheme(scheme) // registration error deliberately discarded
+	if err := sympoziumv1alpha1.AddToScheme(scheme); err != nil {
+		t.Fatalf("add sympozium scheme: %v", err)
+	}
+
+	cl := fake.NewClientBuilder().
+		WithScheme(scheme).
+		WithObjects(objs...).
+		WithStatusSubresource(&sympoziumv1alpha1.AgentRun{}).
+		Build()
+
+	return &AgentRunReconciler{
+		Client:    cl,
+		APIReader: cl,
+		Scheme:    scheme,
+		Log:       logr.Discard(),
+	}
+}
+
+// TestReconcileRunning_JobNotFoundGuard_DoesNotOverrideSucceeded is the
+// regression guard for the race in which `succeedRun` had already committed
+// status.phase=Succeeded (and deleted the Job to kill sidecars) when a stale
+// reconcile of the same AgentRun fires, can't find the Job, and would
+// previously flip the phase to Failed with "Job not found". With APIReader
+// in place, the guard sees the real phase and refuses to override.
+func TestReconcileRunning_JobNotFoundGuard_DoesNotOverrideSucceeded(t *testing.T) {
+	run := &sympoziumv1alpha1.AgentRun{
+		ObjectMeta: metav1.ObjectMeta{Name: "regression-run", Namespace: "default"},
+		Spec: sympoziumv1alpha1.AgentRunSpec{
+			InstanceRef: "regression-inst",
+		},
+		Status: sympoziumv1alpha1.AgentRunStatus{
+			Phase:   sympoziumv1alpha1.AgentRunPhaseSucceeded,
+			JobName: "regression-run-job",
+		},
+	}
+	// Note: no Job object is seeded — simulating the post-cleanup race.
+	r := newAgentRunTestReconciler(t, run)
+
+	// The guard only runs when reconcileRunning's Job lookup returns
+	// NotFound. Rather than going through the public Reconcile entrypoint
+	// and its phase dispatcher, call reconcileRunning directly with a deep
+	// copy of the run to keep the test focused on the guard.
+	running := run.DeepCopy()
+	running.Status.Phase = sympoziumv1alpha1.AgentRunPhaseRunning // pretend this reconcile still sees Running
+	res, err := r.reconcileRunning(context.Background(), logr.Discard(), running)
+	if err != nil {
+		t.Fatalf("reconcileRunning returned error: %v", err)
+	}
+	if res.RequeueAfter != 0 {
+		t.Errorf("expected no requeue for terminal-phase guard; got RequeueAfter=%v", res.RequeueAfter)
+	}
+
+	// Most important assertion: the stored AgentRun status MUST still be
+	// Succeeded — not overridden to Failed.
+	var stored sympoziumv1alpha1.AgentRun
+	if err := r.Client.Get(context.Background(), client.ObjectKeyFromObject(run), &stored); err != nil {
+		t.Fatalf("get stored: %v", err)
+	}
+	if stored.Status.Phase != sympoziumv1alpha1.AgentRunPhaseSucceeded {
+		t.Errorf(
+			"REGRESSION: stored phase overridden to %q (expected Succeeded)",
+			stored.Status.Phase,
+		)
+	}
+	if stored.Status.Error != "" {
+		t.Errorf(
+			"REGRESSION: stored error populated to %q (should stay empty on Succeeded run)",
+			stored.Status.Error,
+		)
+	}
+}
+
+// TestReconcileRunning_JobNotFoundGuard_DoesNotOverrideFailed: a run already
+// stored as Failed with a specific error must keep that error rather than
+// being clobbered with "Job not found". Only Status.Error is asserted.
+func TestReconcileRunning_JobNotFoundGuard_DoesNotOverrideFailed(t *testing.T) {
+	run := &sympoziumv1alpha1.AgentRun{
+		ObjectMeta: metav1.ObjectMeta{Name: "failed-run", Namespace: "default"},
+		Spec:       sympoziumv1alpha1.AgentRunSpec{InstanceRef: "x"},
+		Status: sympoziumv1alpha1.AgentRunStatus{
+			Phase:   sympoziumv1alpha1.AgentRunPhaseFailed,
+			Error:   "agent container exited with code 137 (OOMKilled)",
+			JobName: "failed-run-job",
+		},
+	}
+	r := newAgentRunTestReconciler(t, run)
+
+	stale := run.DeepCopy()
+	stale.Status.Phase = sympoziumv1alpha1.AgentRunPhaseRunning // stale view: this reconcile still believes the run is Running
+	_, err := r.reconcileRunning(context.Background(), logr.Discard(), stale)
+	if err != nil {
+		t.Fatalf("reconcileRunning returned error: %v", err)
+	}
+
+	var stored sympoziumv1alpha1.AgentRun
+	if err := r.Client.Get(context.Background(), client.ObjectKeyFromObject(run), &stored); err != nil {
+		t.Fatalf("get stored: %v", err)
+	}
+	if stored.Status.Error != "agent container exited with code 137 (OOMKilled)" {
+		t.Errorf(
+			"REGRESSION: stored error overridden to %q (expected OOMKilled message to be preserved)",
+			stored.Status.Error,
+		)
+	}
+}
+
+// TestReconcileRunning_JobNotFoundGuard_RequeuesForPostRunning: if the stored
+// run has moved to PostRunning while this reconcile still sees Running, the
+// guard must return a positive RequeueAfter instead of failing the run. Only
+// the returned result is asserted (any non-zero delay passes).
+func TestReconcileRunning_JobNotFoundGuard_RequeuesForPostRunning(t *testing.T) {
+	run := &sympoziumv1alpha1.AgentRun{
+		ObjectMeta: metav1.ObjectMeta{Name: "postrun-run", Namespace: "default"},
+		Spec:       sympoziumv1alpha1.AgentRunSpec{InstanceRef: "x"},
+		Status: sympoziumv1alpha1.AgentRunStatus{
+			Phase:   sympoziumv1alpha1.AgentRunPhasePostRunning,
+			JobName: "postrun-run-job",
+		},
+	}
+	r := newAgentRunTestReconciler(t, run)
+
+	stale := run.DeepCopy()
+	stale.Status.Phase = sympoziumv1alpha1.AgentRunPhaseRunning // stale view: this reconcile still believes the run is Running
+	res, err := r.reconcileRunning(context.Background(), logr.Discard(), stale)
+	if err != nil {
+		t.Fatalf("reconcileRunning returned error: %v", err)
+	}
+	if res.RequeueAfter == 0 {
+		t.Errorf("expected RequeueAfter>0 for PostRunning guard; got %v", res.RequeueAfter)
+	}
+}
+
+// TestReconcileRunning_JobNotFoundGuard_FailsWhenReallyRunning: when the
+// stored phase really is still Running (no concurrent reconcile has moved
+// it on), a missing Job must still fail the run with the "Job not found"
+// message — otherwise stuck runs would loop forever.
+func TestReconcileRunning_JobNotFoundGuard_FailsWhenReallyRunning(t *testing.T) {
+	run := &sympoziumv1alpha1.AgentRun{
+		ObjectMeta: metav1.ObjectMeta{Name: "stuck-run", Namespace: "default"},
+		Spec:       sympoziumv1alpha1.AgentRunSpec{InstanceRef: "x"},
+		Status: sympoziumv1alpha1.AgentRunStatus{
+			Phase:   sympoziumv1alpha1.AgentRunPhaseRunning,
+			JobName: "stuck-run-job",
+		},
+	}
+	r := newAgentRunTestReconciler(t, run) // only the run is seeded; no Job object exists
+
+	_, err := r.reconcileRunning(context.Background(), logr.Discard(), run.DeepCopy())
+	if err != nil {
+		t.Fatalf("reconcileRunning returned error: %v", err)
+	}
+
+	var stored sympoziumv1alpha1.AgentRun
+	if err := r.Client.Get(context.Background(), client.ObjectKeyFromObject(run), &stored); err != nil {
+		t.Fatalf("get stored: %v", err)
+	}
+	if stored.Status.Phase != sympoziumv1alpha1.AgentRunPhaseFailed {
+		t.Errorf("expected phase Failed, got %q", stored.Status.Phase)
+	}
+	if stored.Status.Error != "Job not found" {
+		t.Errorf("expected error 'Job not found', got %q", stored.Status.Error)
+	}
+}
diff --git a/web/cypress/e2e/adhoc-lmstudio-deterministic-answer.cy.ts b/web/cypress/e2e/adhoc-lmstudio-deterministic-answer.cy.ts
@@ -0,0 +1,120 @@
+// Full regression: create an ad-hoc LM Studio instance, dispatch a
+// deterministic question via the "New Run" dialog, and verify the
+// run-detail page renders a substantive answer from qwen3.5-9b — not
+// just a preamble, not empty, and clearly mentioning the thing it was
+// asked about (namespaces).
+//
+// This is the primary UX guard against the class of regressions we
+// chased: reasoning models emitting content into non-standard fields,
+// terminal turns being empty, response never surfacing in the UI.
+
+const INSTANCE = `cy-adhoc-nsq-${Date.now()}`;
+let RUN_NAME = "";
+
+describe("Ad-hoc LM Studio — deterministic answer end to end", () => {
+  before(() => {
+    // Create the instance with the correct pod-reachable LM Studio URL.
+    // (The wizard defaults to http://localhost:1234 which doesn't work
+    // from inside kind pods; the wizard's node-mode flow is covered in
+    // a separate spec.)
+    cy.createLMStudioInstance(INSTANCE, { skills: ["k8s-ops", "memory"] });
+  });
+
+  after(() => {
+    if (RUN_NAME) cy.deleteRun(RUN_NAME);
+    cy.deleteInstance(INSTANCE);
+  });
+
+  it("asks 'how many namespaces' via the UI and renders the answer", () => {
+    // ── Step 1: dispatch the question via the "New Run" dialog on /runs ───
+    cy.visit("/runs");
+    cy.contains("button", "New Run", { timeout: 20000 }).click();
+
+    // Select our instance in the dropdown.
+    cy.get("[role='dialog']").find("button[role='combobox']").click({ force: true });
+    cy.get("[data-radix-popper-content-wrapper]").contains(INSTANCE).click({ force: true });
+
+    // Fill in the task. The instance was created in before() with the
+    // k8s-ops skill, which is what should let the model run kubectl and
+    // answer from real cluster state.
+    cy.get("[role='dialog']")
+      .find("textarea")
+      .clear()
+      .type(
+        "How many namespaces are there in this Kubernetes cluster? " +
+          "Use kubectl via execute_command to find out, then answer with " +
+          "the count and list them.",
+      );
+
+    // Give the run a generous timeout (local inference is slow).
+    cy.get("[role='dialog']").find("input[placeholder='5m']").clear().type("6m");
+
+    // Submit; the timeout belongs on cy.get — .should() takes no options.
+    cy.get("[role='dialog']").contains("button", "Create Run").click({ force: true });
+    cy.get("[role='dialog']", { timeout: 20000 }).should("not.exist");
+
+    // ── Step 2: find the run we just created via its UI row ───────────────
+    cy.contains("td", INSTANCE, { timeout: 20000 })
+      .parents("tr")
+      .within(() => {
+        cy.get("a[href^='/runs/']")
+          .first()
+          .invoke("attr", "href")
+          .then((href) => {
+            const m = /\/runs\/([^/?#]+)/.exec(href || "");
+            expect(m, `expected /runs/<name> in href: ${href}`).to.not.be.null;
+            RUN_NAME = m![1];
+          });
+      });
+
+    // ── Step 3: wait for terminal phase + assert Succeeded with context ───
+    cy.then(() => cy.waitForRunTerminal(RUN_NAME, 6 * 60 * 1000));
+    cy.then(() =>
+      cy.request({
+        url: `/api/v1/runs/${RUN_NAME}?namespace=default`,
+        headers: {
+          Authorization: `Bearer ${Cypress.env("API_TOKEN") || ""}`,
+        },
+      }).then((resp) => {
+        const phase = resp.body?.status?.phase as string;
+        const err = resp.body?.status?.error as string | undefined;
+        expect(
+          phase,
+          `run ${RUN_NAME} should have Succeeded (error: ${err || "n/a"})`,
+        ).to.eq("Succeeded");
+      }),
+    );
+
+    // ── Step 4: open the run detail and assert the answer is substantive ──
+    cy.then(() => cy.visit(`/runs/${RUN_NAME}`));
+    cy.contains("Succeeded", { timeout: 20000 }).should("be.visible");
+    cy.contains("button", "Result", { timeout: 20000 }).click({ force: true });
+
+    // Structural assertions — qwen3.5 paraphrases freely, so we don't
+    // match an exact string:
+    //   - response is substantive (not just a preamble)
+    //   - it mentions "namespace" (the thing we asked about)
+    //   - it contains at least one digit (the count)
+    //   - "No result available" MUST NOT be shown
+    cy.contains("No result available").should("not.exist");
+    cy.get("[role='tabpanel']", { timeout: 20000 })
+      .invoke("text")
+      .then((raw) => {
+        const text = raw.replace(/\s+/g, " ").trim();
+        expect(
+          text.length,
+          `response should be substantive (>60 chars), got ${text.length}`,
+        ).to.be.greaterThan(60);
+        expect(text, "response should mention namespaces").to.match(/namespace/i);
+        expect(text, "response should contain a numeric count").to.match(/\d/);
+        const isBarePreamble =
+          /^(i'll|i will|let me|let's start|i'm going to)/i.test(text) && text.length < 120;
+        expect(
+          isBarePreamble,
+          `response looks like a preamble only: ${text.slice(0, 140)}`,
+        ).to.be.false;
+      });
+  });
+});
+
+export {};
diff --git a/web/cypress/e2e/schedule-create-via-ui.cy.ts b/web/cypress/e2e/schedule-create-via-ui.cy.ts
@@ -58,8 +58,8 @@ describe("Schedule — create via UI", () => {
     });
 
     cy.visit("/schedules");
-    cy.contains(SCHEDULE, { timeout: 20000 }).should("be.visible");
-    cy.contains(/\*\/5/).should("be.visible");
+    cy.contains(SCHEDULE, { timeout: 20000 }).should("exist");
+    cy.contains(/\*\/5/).should("exist");
   });
 });
 
diff --git a/web/cypress/support/e2e.ts b/web/cypress/support/e2e.ts