Skip to content

Commit 205829a

Browse files
AlexsJonesclaude
andcommitted
fix: scheduler picks next free run-number suffix to avoid ghost runs
Resolves a bug where re-enabling a PersonaPack (or otherwise recreating a SympoziumSchedule after its AgentRun children were left behind) caused the scheduler to silently no-op instead of creating real runs. Root cause: the scheduler computed AgentRun names as {schedule}-{status.TotalRuns+1} When the SympoziumSchedule was deleted, its `status.TotalRuns` counter went with it — but the AgentRun resources owned by the schedule had already been severed from their owner by earlier cleanup, so they persisted as orphans. On recreation, TotalRuns reset to 0, the scheduler tried to create `{schedule}-1` which already existed, the apiserver returned AlreadyExists, the scheduler swallowed the error and logged "Created scheduled AgentRun" anyway, incremented TotalRuns, and advanced LastRunTime/NextRunTime. The user saw the schedule claim it was firing but no new runs ever appeared — the status was pointing at 14-hour-old orphan runs. Fix: - New nextScheduledRunNumber() helper lists AgentRuns labelled with sympozium.ai/schedule=<name>, finds the maximum numeric suffix actually present, and picks max(TotalRuns+1, maxObserved+1). This is self-healing: if the schedule has been recreated N times, it still picks a unique suffix on the first attempt. - Added a bounded collision-retry loop (up to 100 attempts). On AlreadyExists it bumps the suffix and tries again, logging each collision. This guards against races with concurrent reconciles or with any other actor creating runs against the schedule's label namespace. - TotalRuns is now set to the actual suffix used (not blindly incremented), keeping it aligned with observable state. Verified live: forcing a reconcile against the platform-team-security-guardian-schedule (which had orphans at suffixes 1..5) correctly created schedule-6 and schedule-7. Adds 5 targeted unit tests (sympoziumschedule_collision_test.go): - NoExistingRuns: baseline — honours TotalRuns+1 - RecoversFromOrphanedRuns: the regression, picks 10 past 1..9 - HonoursHigherTotalRuns: counter wins when it's ahead of orphans - IgnoresOtherSchedulesRuns: label filtering works - IgnoresMalformedSuffixes: non-numeric names don't confuse the max-suffix scan Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent ac1d3b7 commit 205829a

File tree

2 files changed

+225
-4
lines changed

2 files changed

+225
-4
lines changed
Lines changed: 138 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
package controller
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"testing"
7+
8+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
9+
"sigs.k8s.io/controller-runtime/pkg/client"
10+
11+
sympoziumv1alpha1 "github.com/sympozium-ai/sympozium/api/v1alpha1"
12+
)
13+
14+
// TestNextScheduledRunNumber_NoExistingRuns: clean slate → returns
15+
// TotalRuns+1 as expected.
16+
func TestNextScheduledRunNumber_NoExistingRuns(t *testing.T) {
17+
schedule := &sympoziumv1alpha1.SympoziumSchedule{
18+
ObjectMeta: metav1.ObjectMeta{Name: "sched", Namespace: "default"},
19+
Status: sympoziumv1alpha1.SympoziumScheduleStatus{TotalRuns: 2},
20+
}
21+
r, _ := newScheduleTestReconciler(t, schedule)
22+
23+
n, err := r.nextScheduledRunNumber(context.Background(), schedule)
24+
if err != nil {
25+
t.Fatalf("nextScheduledRunNumber: %v", err)
26+
}
27+
if n != 3 {
28+
t.Errorf("got %d, want 3 (TotalRuns+1)", n)
29+
}
30+
}
31+
32+
// TestNextScheduledRunNumber_RecoversFromOrphanedRuns is the regression
33+
// guard: simulates a schedule that was deleted and recreated, so
34+
// TotalRuns reset to 0 but orphan AgentRuns from the previous
35+
// incarnation still exist with suffixes 1..9. The scheduler must pick
36+
// 10, not 1.
37+
func TestNextScheduledRunNumber_RecoversFromOrphanedRuns(t *testing.T) {
38+
schedule := &sympoziumv1alpha1.SympoziumSchedule{
39+
ObjectMeta: metav1.ObjectMeta{Name: "sched", Namespace: "default"},
40+
Status: sympoziumv1alpha1.SympoziumScheduleStatus{TotalRuns: 0},
41+
}
42+
objs := []client.Object{schedule}
43+
for i := 1; i <= 9; i++ {
44+
objs = append(objs, makeOrphanRun("sched", i))
45+
}
46+
r, _ := newScheduleTestReconciler(t, objs...)
47+
48+
n, err := r.nextScheduledRunNumber(context.Background(), schedule)
49+
if err != nil {
50+
t.Fatalf("nextScheduledRunNumber: %v", err)
51+
}
52+
if n != 10 {
53+
t.Errorf("got %d, want 10 (must skip orphans 1..9)", n)
54+
}
55+
}
56+
57+
// TestNextScheduledRunNumber_HonoursHigherTotalRuns: when TotalRuns is
58+
// ahead of observed orphans, use TotalRuns+1.
59+
func TestNextScheduledRunNumber_HonoursHigherTotalRuns(t *testing.T) {
60+
schedule := &sympoziumv1alpha1.SympoziumSchedule{
61+
ObjectMeta: metav1.ObjectMeta{Name: "sched", Namespace: "default"},
62+
Status: sympoziumv1alpha1.SympoziumScheduleStatus{TotalRuns: 50},
63+
}
64+
r, _ := newScheduleTestReconciler(t, schedule, makeOrphanRun("sched", 3))
65+
66+
n, err := r.nextScheduledRunNumber(context.Background(), schedule)
67+
if err != nil {
68+
t.Fatalf("nextScheduledRunNumber: %v", err)
69+
}
70+
if n != 51 {
71+
t.Errorf("got %d, want 51", n)
72+
}
73+
}
74+
75+
// TestNextScheduledRunNumber_IgnoresOtherSchedulesRuns: only runs whose
76+
// label matches THIS schedule count. Runs belonging to other schedules
77+
// are ignored.
78+
func TestNextScheduledRunNumber_IgnoresOtherSchedulesRuns(t *testing.T) {
79+
schedule := &sympoziumv1alpha1.SympoziumSchedule{
80+
ObjectMeta: metav1.ObjectMeta{Name: "sched", Namespace: "default"},
81+
Status: sympoziumv1alpha1.SympoziumScheduleStatus{TotalRuns: 0},
82+
}
83+
own := makeOrphanRun("sched", 7)
84+
other := makeOrphanRun("other-sched", 99)
85+
r, _ := newScheduleTestReconciler(t, schedule, own, other)
86+
87+
n, err := r.nextScheduledRunNumber(context.Background(), schedule)
88+
if err != nil {
89+
t.Fatalf("nextScheduledRunNumber: %v", err)
90+
}
91+
if n != 8 {
92+
t.Errorf("got %d, want 8 (should ignore other-sched's run)", n)
93+
}
94+
}
95+
96+
// TestNextScheduledRunNumber_IgnoresMalformedSuffixes: runs with
97+
// non-numeric suffixes are skipped.
98+
func TestNextScheduledRunNumber_IgnoresMalformedSuffixes(t *testing.T) {
99+
schedule := &sympoziumv1alpha1.SympoziumSchedule{
100+
ObjectMeta: metav1.ObjectMeta{Name: "sched", Namespace: "default"},
101+
Status: sympoziumv1alpha1.SympoziumScheduleStatus{TotalRuns: 0},
102+
}
103+
goodRun := makeOrphanRun("sched", 5)
104+
// Same prefix, non-numeric suffix — must be skipped.
105+
bogus := &sympoziumv1alpha1.AgentRun{
106+
ObjectMeta: metav1.ObjectMeta{
107+
Name: "sched-abc123",
108+
Namespace: "default",
109+
Labels: map[string]string{"sympozium.ai/schedule": "sched"},
110+
},
111+
Spec: sympoziumv1alpha1.AgentRunSpec{InstanceRef: "i", Task: "x"},
112+
}
113+
r, _ := newScheduleTestReconciler(t, schedule, goodRun, bogus)
114+
115+
n, err := r.nextScheduledRunNumber(context.Background(), schedule)
116+
if err != nil {
117+
t.Fatalf("nextScheduledRunNumber: %v", err)
118+
}
119+
if n != 6 {
120+
t.Errorf("got %d, want 6 (only numeric suffixes counted)", n)
121+
}
122+
}
123+
124+
func makeOrphanRun(scheduleName string, suffix int) *sympoziumv1alpha1.AgentRun {
125+
return &sympoziumv1alpha1.AgentRun{
126+
ObjectMeta: metav1.ObjectMeta{
127+
Name: fmt.Sprintf("%s-%d", scheduleName, suffix),
128+
Namespace: "default",
129+
Labels: map[string]string{
130+
"sympozium.ai/schedule": scheduleName,
131+
},
132+
},
133+
Spec: sympoziumv1alpha1.AgentRunSpec{
134+
InstanceRef: "any-instance",
135+
Task: "x",
136+
},
137+
}
138+
}

internal/controller/sympoziumschedule_controller.go

Lines changed: 87 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ package controller
33
import (
44
"context"
55
"fmt"
6+
"strconv"
7+
"strings"
68
"time"
79

810
"github.com/go-logr/logr"
@@ -19,6 +21,11 @@ import (
1921
sympoziumv1alpha1 "github.com/sympozium-ai/sympozium/api/v1alpha1"
2022
)
2123

24+
// maxScheduleRunCreateRetries bounds the collision-retry loop when the
25+
// scheduler's chosen run suffix clashes with an existing AgentRun (e.g. from
26+
// a prior incarnation of this schedule before it was deleted and recreated).
27+
const maxScheduleRunCreateRetries = 100
28+
2229
const sympoziumScheduleFinalizer = "sympozium.ai/schedule-finalizer"
2330

2431
// SympoziumScheduleReconciler reconciles SympoziumSchedule objects.
@@ -164,8 +171,20 @@ func (r *SympoziumScheduleReconciler) Reconcile(ctx context.Context, req ctrl.Re
164171
return ctrl.Result{RequeueAfter: 60 * time.Second}, nil
165172
}
166173

174+
// Pick the next available run number. The naive `TotalRuns+1` choice
175+
// collides when this schedule has been deleted and recreated (e.g.
176+
// PersonaPack disabled then re-enabled) because the counter resets
177+
// but the old AgentRun resources persist. List existing runs that
178+
// belong to this schedule, find the highest numeric suffix, and
179+
// start from there.
180+
nextNum, err := r.nextScheduledRunNumber(ctx, schedule)
181+
if err != nil {
182+
log.Error(err, "failed to list existing scheduled runs; falling back to TotalRuns counter")
183+
nextNum = int(schedule.Status.TotalRuns) + 1
184+
}
185+
167186
// Create the AgentRun.
168-
runName := fmt.Sprintf("%s-%d", schedule.Name, schedule.Status.TotalRuns+1)
187+
runName := fmt.Sprintf("%s-%d", schedule.Name, nextNum)
169188
agentRun := &sympoziumv1alpha1.AgentRun{
170189
ObjectMeta: metav1.ObjectMeta{
171190
Name: runName,
@@ -212,20 +231,40 @@ func (r *SympoziumScheduleReconciler) Reconcile(ctx context.Context, req ctrl.Re
212231
agentRun.Spec.Skills = append(agentRun.Spec.Skills, skill)
213232
}
214233

215-
if err := r.Create(ctx, agentRun); err != nil {
234+
// Collision-retry loop: if another actor (or a race with our own
235+
// prior reconcile) created the same name, bump the suffix and try
236+
// again. Bounded to avoid an unbounded spin.
237+
for attempt := 0; attempt < maxScheduleRunCreateRetries; attempt++ {
238+
err := r.Create(ctx, agentRun)
239+
if err == nil {
240+
break
241+
}
216242
if !errors.IsAlreadyExists(err) {
217243
log.Error(err, "failed to create AgentRun")
218244
return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
219245
}
246+
log.Info("scheduled run name collision; trying next suffix",
247+
"run", runName, "attempt", attempt+1)
248+
nextNum++
249+
runName = fmt.Sprintf("%s-%d", schedule.Name, nextNum)
250+
agentRun.ObjectMeta.Name = runName
251+
if attempt == maxScheduleRunCreateRetries-1 {
252+
log.Error(fmt.Errorf("exceeded collision retries"),
253+
"could not pick a unique scheduled run name", "lastAttempted", runName)
254+
return ctrl.Result{RequeueAfter: 60 * time.Second}, nil
255+
}
220256
}
221257

222258
log.Info("Created scheduled AgentRun", "run", runName, "type", schedule.Spec.Type)
223259

224-
// Update status.
260+
// Update status. Use the actual number that was committed (after any
261+
// collision retries) so TotalRuns stays in sync with observable state.
225262
nowMeta := metav1.Now()
226263
schedule.Status.LastRunTime = &nowMeta
227264
schedule.Status.LastRunName = runName
228-
schedule.Status.TotalRuns++
265+
if int64(nextNum) > schedule.Status.TotalRuns {
266+
schedule.Status.TotalRuns = int64(nextNum)
267+
}
229268

230269
// Recompute next run from now.
231270
next := sched.Next(now)
@@ -241,6 +280,50 @@ func (r *SympoziumScheduleReconciler) Reconcile(ctx context.Context, req ctrl.Re
241280
return ctrl.Result{RequeueAfter: delay}, nil
242281
}
243282

283+
// nextScheduledRunNumber returns the next numeric suffix to use when naming
284+
// a scheduled AgentRun, picking the higher of:
285+
//
286+
// - status.TotalRuns + 1 (the counter the scheduler maintains), and
287+
// - 1 + the maximum observed suffix on existing AgentRuns that belong to
288+
// this schedule (identified via the sympozium.ai/schedule label).
289+
//
290+
// The second branch handles the case where this schedule was previously
291+
// deleted (e.g. PersonaPack disabled) and recreated: TotalRuns resets to 0
292+
// but orphan AgentRuns from the previous incarnation persist with names like
293+
// `<schedule>-1`, `<schedule>-2`, … Picking a suffix that's already in use
294+
// would collide silently and leave the scheduler emitting ghost "created"
295+
// log lines without actually creating runs.
296+
func (r *SympoziumScheduleReconciler) nextScheduledRunNumber(ctx context.Context, schedule *sympoziumv1alpha1.SympoziumSchedule) (int, error) {
297+
base := int(schedule.Status.TotalRuns) + 1
298+
var runs sympoziumv1alpha1.AgentRunList
299+
if err := r.List(ctx, &runs,
300+
client.InNamespace(schedule.Namespace),
301+
client.MatchingLabels{"sympozium.ai/schedule": schedule.Name},
302+
); err != nil {
303+
return base, err
304+
}
305+
prefix := schedule.Name + "-"
306+
maxObserved := 0
307+
for i := range runs.Items {
308+
name := runs.Items[i].Name
309+
if !strings.HasPrefix(name, prefix) {
310+
continue
311+
}
312+
suffix := strings.TrimPrefix(name, prefix)
313+
n, err := strconv.Atoi(suffix)
314+
if err != nil || n <= 0 {
315+
continue
316+
}
317+
if n > maxObserved {
318+
maxObserved = n
319+
}
320+
}
321+
if maxObserved+1 > base {
322+
return maxObserved + 1, nil
323+
}
324+
return base, nil
325+
}
326+
244327
// readMemoryConfigMap reads the MEMORY.md content from the instance's memory
245328
// ConfigMap. Returns empty string if not found.
246329
func (r *SympoziumScheduleReconciler) readMemoryConfigMap(ctx context.Context, namespace, instanceName string) string {

0 commit comments

Comments
 (0)