Skip to content

Commit e8a1e68

Browse files
authored
fix(controller): cascade-delete instances when agent is deleted (#116)
* fix(controller): cascade-delete instances when agent is deleted Set an OwnerReference on each instance ConfigMap pointing to its agent ConfigMap during reconciliation. Kubernetes garbage collection then cascade-deletes instances automatically when the agent is removed. Previously, deleting an agent left its instances orphaned — the `humr.ai/agent` label pointed at a non-existent agent and the instance's StatefulSet kept running. Using owner references (instead of explicit cascade in the Delete handler) means the cleanup works even when the controller is down, has no race window against concurrent instance creation, and is visible via `kubectl describe`. Existing instances pick up the owner reference on their next reconcile (backfill is automatic). PVC cleanup still fires via the informer's DeleteFunc → instanceReconciler.Delete(). Signed-off-by: Radek Ježek <radek.jezek@ibm.com> * feat(ui): warn about instance data loss on agent delete Show a formatted confirmation when deleting an agent that has instances, highlighting the instance count and persistent data loss in danger red. The warning is suppressed when the agent has no instances. Widens the dialog's `message` field from `string` to `ReactNode` so callers can use JSX for line breaks and bold highlights. Existing string callers continue to work unchanged. Signed-off-by: Radek Ježek <radek.jezek@ibm.com> --------- Signed-off-by: Radek Ježek <radek.jezek@ibm.com>
1 parent 9222814 commit e8a1e68

7 files changed

Lines changed: 134 additions & 15 deletions

File tree

packages/controller/pkg/reconciler/agent.go

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,16 +28,22 @@ func NewAgentResolver(getter AgentGetter) *AgentResolver {
2828
return &AgentResolver{getter: getter}
2929
}
3030

31-
func (r *AgentResolver) Resolve(name string) (*types.AgentSpec, error) {
31+
// Resolve returns the agent's ConfigMap (for owner-reference metadata) and
32+
// its parsed spec.
33+
func (r *AgentResolver) Resolve(name string) (*corev1.ConfigMap, *types.AgentSpec, error) {
3234
cm, err := r.getter.Get(name)
3335
if err != nil {
34-
return nil, fmt.Errorf("agent %q not found: %w", name, err)
36+
return nil, nil, fmt.Errorf("agent %q not found: %w", name, err)
3537
}
3638
specYAML, ok := cm.Data["spec.yaml"]
3739
if !ok {
38-
return nil, fmt.Errorf("agent %q has no spec.yaml", name)
40+
return nil, nil, fmt.Errorf("agent %q has no spec.yaml", name)
3941
}
40-
return types.ParseAgentSpec(specYAML)
42+
spec, err := types.ParseAgentSpec(specYAML)
43+
if err != nil {
44+
return nil, nil, err
45+
}
46+
return cm, spec, nil
4147
}
4248

4349
// AgentTokenSecretName returns the K8s Secret name that stores the OneCLI access token for an agent.

packages/controller/pkg/reconciler/agent_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,22 @@ func (f *fakeGetter) Get(name string) (*corev1.ConfigMap, error) {
5454
func TestResolveAgent(t *testing.T) {
5555
getter := &fakeGetter{cms: map[string]*corev1.ConfigMap{
5656
"code-guardian": {
57-
ObjectMeta: metav1.ObjectMeta{Name: "code-guardian", Namespace: "test-agents"},
57+
ObjectMeta: metav1.ObjectMeta{Name: "code-guardian", Namespace: "test-agents", UID: "agent-uid"},
5858
Data: map[string]string{"spec.yaml": fixtureAgentYAML},
5959
},
6060
}}
6161
resolver := NewAgentResolver(getter)
62-
spec, err := resolver.Resolve("code-guardian")
62+
cm, spec, err := resolver.Resolve("code-guardian")
6363
require.NoError(t, err)
64+
assert.Equal(t, "code-guardian", cm.Name)
65+
assert.EqualValues(t, "agent-uid", cm.UID)
6466
assert.Equal(t, "ghcr.io/myorg/code-guardian:latest", spec.Image)
6567
assert.Len(t, spec.Mounts, 3)
6668
}
6769

6870
func TestResolveAgent_NotFound(t *testing.T) {
6971
resolver := NewAgentResolver(&fakeGetter{cms: map[string]*corev1.ConfigMap{}})
70-
_, err := resolver.Resolve("missing")
72+
_, _, err := resolver.Resolve("missing")
7173
assert.Error(t, err)
7274
assert.Contains(t, err.Error(), "not found")
7375
}
@@ -80,7 +82,7 @@ func TestResolveAgent_NoSpecYAML(t *testing.T) {
8082
},
8183
}}
8284
resolver := NewAgentResolver(getter)
83-
_, err := resolver.Resolve("bad-agent")
85+
_, _, err := resolver.Resolve("bad-agent")
8486
assert.Error(t, err)
8587
assert.Contains(t, err.Error(), "no spec.yaml")
8688
}

packages/controller/pkg/reconciler/instance.go

Lines changed: 41 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,18 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, cm *corev1.ConfigMap
4343
if agentName == "" {
4444
agentName = instanceSpec.AgentName
4545
}
46-
agentSpec, err := r.resolver.Resolve(agentName)
46+
agentCM, agentSpec, err := r.resolver.Resolve(agentName)
4747
if err != nil {
4848
return r.setError(ctx, name, err.Error())
4949
}
5050

51+
// Ensure the instance CM has an OwnerReference to its agent CM so that
52+
// K8s garbage collection cascade-deletes orphaned instances when the
53+
// agent is removed. Idempotent — skips if already set.
54+
if err := r.ensureAgentOwnerReference(ctx, cm, agentCM); err != nil {
55+
return r.setError(ctx, name, fmt.Sprintf("setting agent owner reference: %v", err))
56+
}
57+
5158
// Build desired resources
5259
ss := BuildStatefulSet(name, instanceSpec, agentSpec, r.config, agentName, cm)
5360
svc := BuildService(name, r.config, cm)
@@ -70,6 +77,39 @@ func (r *InstanceReconciler) Reconcile(ctx context.Context, cm *corev1.ConfigMap
7077
return WriteInstanceStatus(ctx, r.client, r.config.Namespace, name, types.NewInstanceStatus(state, ""))
7178
}
7279

80+
// ensureAgentOwnerReference adds a non-controller OwnerReference from the
81+
// instance CM to the agent CM if one is not already present. This lets K8s
82+
// garbage collection cascade-delete instances when their agent is removed.
83+
// It is safe to leave BlockOwnerDeletion=false and Controller=false — other
84+
// OwnerReferences on the instance CM (if any) are preserved.
85+
func (r *InstanceReconciler) ensureAgentOwnerReference(ctx context.Context, instanceCM, agentCM *corev1.ConfigMap) error {
86+
for _, ref := range instanceCM.OwnerReferences {
87+
if ref.UID == agentCM.UID {
88+
return nil
89+
}
90+
}
91+
desired := metav1.OwnerReference{
92+
APIVersion: "v1",
93+
Kind: "ConfigMap",
94+
Name: agentCM.Name,
95+
UID: agentCM.UID,
96+
}
97+
return retry.RetryOnConflict(retry.DefaultRetry, func() error {
98+
current, err := r.client.CoreV1().ConfigMaps(r.config.Namespace).Get(ctx, instanceCM.Name, metav1.GetOptions{})
99+
if err != nil {
100+
return err
101+
}
102+
for _, ref := range current.OwnerReferences {
103+
if ref.UID == agentCM.UID {
104+
return nil
105+
}
106+
}
107+
current.OwnerReferences = append(current.OwnerReferences, desired)
108+
_, err = r.client.CoreV1().ConfigMaps(r.config.Namespace).Update(ctx, current, metav1.UpdateOptions{})
109+
return err
110+
})
111+
}
112+
73113
func (r *InstanceReconciler) Delete(ctx context.Context, name string) {
74114
// Owner references handle cascade deletion of StatefulSet, Service, NetworkPolicy.
75115
// OneCLI agent cleanup is handled by AgentReconciler.

packages/controller/pkg/reconciler/instance_test.go

Lines changed: 54 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func setupReconciler(t *testing.T, agents map[string]*corev1.ConfigMap, objects
3535
func agentCM() *corev1.ConfigMap {
3636
return &corev1.ConfigMap{
3737
ObjectMeta: metav1.ObjectMeta{
38-
Name: "code-guardian", Namespace: "test-agents",
38+
Name: "code-guardian", Namespace: "test-agents", UID: "agent-uid",
3939
Labels: map[string]string{"humr.ai/type": "agent"},
4040
},
4141
Data: map[string]string{"spec.yaml": fixtureAgentYAML},
@@ -156,6 +156,59 @@ func TestReconcile_Idempotent(t *testing.T) {
156156
require.NoError(t, err)
157157
}
158158

159+
func TestReconcile_SetsAgentOwnerReference(t *testing.T) {
160+
cm := instanceCM("running")
161+
r, client := setupReconciler(t,
162+
map[string]*corev1.ConfigMap{"code-guardian": agentCM()},
163+
cm,
164+
)
165+
166+
require.NoError(t, r.Reconcile(context.Background(), cm))
167+
168+
updated, err := client.CoreV1().ConfigMaps("test-agents").Get(context.Background(), "my-instance", metav1.GetOptions{})
169+
require.NoError(t, err)
170+
require.Len(t, updated.OwnerReferences, 1)
171+
ref := updated.OwnerReferences[0]
172+
assert.Equal(t, "ConfigMap", ref.Kind)
173+
assert.Equal(t, "code-guardian", ref.Name)
174+
assert.EqualValues(t, "agent-uid", ref.UID)
175+
}
176+
177+
func TestReconcile_OwnerReferenceIdempotent(t *testing.T) {
178+
cm := instanceCM("running")
179+
r, client := setupReconciler(t,
180+
map[string]*corev1.ConfigMap{"code-guardian": agentCM()},
181+
cm,
182+
)
183+
184+
require.NoError(t, r.Reconcile(context.Background(), cm))
185+
require.NoError(t, r.Reconcile(context.Background(), cm))
186+
187+
updated, err := client.CoreV1().ConfigMaps("test-agents").Get(context.Background(), "my-instance", metav1.GetOptions{})
188+
require.NoError(t, err)
189+
assert.Len(t, updated.OwnerReferences, 1, "second reconcile must not duplicate owner reference")
190+
}
191+
192+
func TestReconcile_PreservesExistingOwnerReferences(t *testing.T) {
193+
cm := instanceCM("running")
194+
cm.OwnerReferences = []metav1.OwnerReference{
195+
{APIVersion: "v1", Kind: "ConfigMap", Name: "other-owner", UID: "other-uid"},
196+
}
197+
r, client := setupReconciler(t,
198+
map[string]*corev1.ConfigMap{"code-guardian": agentCM()},
199+
cm,
200+
)
201+
202+
require.NoError(t, r.Reconcile(context.Background(), cm))
203+
204+
updated, err := client.CoreV1().ConfigMaps("test-agents").Get(context.Background(), "my-instance", metav1.GetOptions{})
205+
require.NoError(t, err)
206+
require.Len(t, updated.OwnerReferences, 2)
207+
uids := []string{string(updated.OwnerReferences[0].UID), string(updated.OwnerReferences[1].UID)}
208+
assert.Contains(t, uids, "other-uid")
209+
assert.Contains(t, uids, "agent-uid")
210+
}
211+
159212
func TestDelete_CleansPVCs(t *testing.T) {
160213
cm := instanceCM("running")
161214
// Pre-create PVCs that would have been created by the StatefulSet controller

packages/ui/src/components/DialogOverlay.tsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ export function DialogOverlay() {
3636
</div>
3737
<div className="flex-1 min-w-0">
3838
<h3 className="text-[15px] font-bold text-text mb-1">{dialog.title}</h3>
39-
<p className="text-[13px] text-text-secondary leading-relaxed">{dialog.message}</p>
39+
<div className="text-[13px] text-text-secondary leading-relaxed">{dialog.message}</div>
4040
</div>
4141
</div>
4242
<div className="flex justify-end gap-2 pt-1">

packages/ui/src/store.ts

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import { create } from "zustand";
2+
import type { ReactNode } from "react";
23
import { platform } from "./platform.js";
34
import type {
45
TemplateView,
@@ -28,15 +29,15 @@ type Theme = "light" | "dark" | "system";
2829
export interface DialogState {
2930
type: "alert" | "confirm";
3031
title: string;
31-
message: string;
32+
message: ReactNode;
3233
resolve: (ok: boolean) => void;
3334
}
3435

3536
export interface HumrStore {
3637
// Dialog
3738
dialog: DialogState | null;
38-
showAlert: (message: string, title?: string) => Promise<void>;
39-
showConfirm: (message: string, title?: string) => Promise<boolean>;
39+
showAlert: (message: ReactNode, title?: string) => Promise<void>;
40+
showConfirm: (message: ReactNode, title?: string) => Promise<boolean>;
4041
closeDialog: (ok: boolean) => void;
4142

4243
// Theme

packages/ui/src/views/ListView.tsx

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,24 @@ export function ListView() {
199199
<Plus size={12} /> Instance
200200
</button>
201201
<button
202-
onClick={async () => { if (!await showConfirm(`Delete agent "${agent.name}"?`, "Delete Agent")) return; setDelAgent(agent.id); await deleteAgent(agent.id); setDelAgent(null); }}
202+
onClick={async () => {
203+
const n = insts.length;
204+
const msg = n === 0 ? (
205+
<>Delete agent <strong className="text-text">"{agent.name}"</strong>?</>
206+
) : (
207+
<div className="space-y-2">
208+
<p>Delete agent <strong className="text-text">"{agent.name}"</strong>?</p>
209+
<p className="text-danger">
210+
This will also delete <strong>{n} {n === 1 ? "instance" : "instances"}</strong> and <strong>all their persistent data</strong>.
211+
</p>
212+
<p className="text-text-muted text-[12px]">This cannot be undone.</p>
213+
</div>
214+
);
215+
if (!await showConfirm(msg, "Delete Agent")) return;
216+
setDelAgent(agent.id);
217+
await deleteAgent(agent.id);
218+
setDelAgent(null);
219+
}}
203220
disabled={delAgent === agent.id}
204221
className="btn-brutal h-8 w-8 rounded-lg border-2 border-border-light bg-surface flex items-center justify-center text-text-muted hover:text-danger hover:border-danger disabled:opacity-40"
205222
style={{ boxShadow: "var(--shadow-brutal-sm)" }}

0 commit comments

Comments
 (0)