3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/code-guardian-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.codeGuardianTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/default-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.defaultTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/google-workspace-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.googleWorkspaceTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/pi-agent-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.piAgentTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
7 changes: 7 additions & 0 deletions deploy/helm/humr/values.yaml
@@ -321,6 +321,9 @@ defaultTemplate:
tag: ""
pullPolicy: IfNotPresent
description: "Default Claude Code agent"
# -- PVC size for the persisted /home/agent mount. Empty = controller
# default (10Gi). See issue #244.
homeMountSize: "5Gi"
resources:
requests:
cpu: "250m"
@@ -338,6 +341,7 @@ googleWorkspaceTemplate:
tag: ""
pullPolicy: IfNotPresent
description: "Google Workspace agent with Drive and Gmail via gws CLI"
homeMountSize: "2Gi"
resources:
requests:
cpu: "250m"
@@ -355,6 +359,7 @@ piAgentTemplate:
tag: ""
pullPolicy: IfNotPresent
description: "Pi coding agent with multi-LLM support"
homeMountSize: "2Gi"
resources:
requests:
cpu: "250m"
@@ -374,6 +379,8 @@ codeGuardianTemplate:
description: "PR code review agent (gh + Claude Code)"
# -- owner/repo slug the agent reviews. Empty = detect via `gh repo view`.
githubRepo: ""
# Code Guardian clones real repos — keep the larger default.
homeMountSize: "10Gi"
resources:
requests:
cpu: "250m"
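A quick way to sanity-check the new value is to render the chart offline; a minimal sketch, assuming helm 3 and that the chart renders standalone with default values:

```sh
# Sketch: confirm homeMountSize reaches the rendered template (helm 3 assumed;
# the release name "humr" is illustrative).
helm template humr deploy/helm/humr \
  --set defaultTemplate.homeMountSize=20Gi | grep -A3 'path: /home/agent'
# expected under the /home/agent mount:   size: "20Gi"
```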
6 changes: 6 additions & 0 deletions deploy/lima-k3s-test.yaml
@@ -12,6 +12,12 @@ provision:
- mode: system
script: |
#!/bin/sh
mkdir -p /etc/rancher/k3s
cat > /etc/rancher/k3s/config.yaml << 'EOF'
kubelet-arg:
- "container-log-max-size=10Mi"
- "container-log-max-files=3"
EOF
if [ ! -d /var/lib/rancher/k3s ]; then
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --https-listen-port 16445" sh -
fi
9 changes: 9 additions & 0 deletions deploy/lima-k3s.yaml
@@ -24,6 +24,15 @@ provision:
DEBIAN_FRONTEND=noninteractive apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nfs-common
echo "[humr-provision] mount.nfs4: $(command -v mount.nfs4 || echo MISSING)"
# Cap kubelet container-log usage at ~30 MiB per container (10Mi x 3 files)
# so a chatty/looping pod can't fill the dev VM disk (issue #244).
# cluster:install also ensures this file exists on existing VMs.
mkdir -p /etc/rancher/k3s
cat > /etc/rancher/k3s/config.yaml << 'EOF'
kubelet-arg:
- "container-log-max-size=10Mi"
- "container-log-max-files=3"
EOF
if [ ! -d /var/lib/rancher/k3s ]; then
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --https-listen-port 16443" sh -
fi
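To confirm the flags actually reach kubelet after provisioning, both the config file and the k3s journal can be checked; a sketch, assuming the `humr-k3s` instance name used by `deploy/tasks.toml`:

```sh
# Sketch: verify the rotation flags inside the dev VM (lima instance name
# "humr-k3s" assumed; k3s normally echoes the kubelet invocation to its journal).
limactl shell humr-k3s -- sudo cat /etc/rancher/k3s/config.yaml
limactl shell humr-k3s -- sudo journalctl -u k3s --no-pager | grep -m1 container-log-max
```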
105 changes: 105 additions & 0 deletions deploy/tasks.toml
@@ -137,6 +137,31 @@ else
fi
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s

# 1b. Ensure k3s container-log rotation is configured (issue #244 upgrade
# path for VMs created before these flags were added). New VMs already have
# /etc/rancher/k3s/config.yaml from the lima provision script.
K3S_CONFIG_YAML='kubelet-arg:
- "container-log-max-size=10Mi"
- "container-log-max-files=3"
'
if [ -n "${IS_SANDBOX:-}" ]; then
if ! sudo grep -q container-log-max-size /etc/rancher/k3s/config.yaml 2>/dev/null; then
echo "Configuring k3s container log rotation (issue #244)..."
sudo mkdir -p /etc/rancher/k3s
printf '%s' "$K3S_CONFIG_YAML" | sudo tee /etc/rancher/k3s/config.yaml >/dev/null
sudo systemctl restart k3s
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
fi
else
if ! limactl shell "$LIMA_INSTANCE" sudo grep -q container-log-max-size /etc/rancher/k3s/config.yaml 2>/dev/null; then
echo "Configuring k3s container log rotation (issue #244)..."
limactl shell "$LIMA_INSTANCE" sudo mkdir -p /etc/rancher/k3s
printf '%s' "$K3S_CONFIG_YAML" | limactl shell "$LIMA_INSTANCE" sudo tee /etc/rancher/k3s/config.yaml >/dev/null
limactl shell "$LIMA_INSTANCE" sudo systemctl restart k3s
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
fi
fi

# 2. Install cert-manager
if ! kubectl --kubeconfig="$KUBECONFIG" get ns cert-manager >/dev/null 2>&1; then
echo "Installing cert-manager..."
@@ -190,6 +215,16 @@ fi
echo "Waiting for deployments to be ready..."
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Available deployment --all --timeout=10m

# 7. Prune image layers no longer referenced by any pod (issue #244). After
# repeated reinstalls, old `:latest` layers accumulate in containerd; this
# drops anything not pinned by a running container.
echo "Pruning unused images..."
if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo ""
echo "=== Cluster ready ==="
echo "KUBECONFIG=$KUBECONFIG"
@@ -224,6 +259,13 @@ echo "Restarting apiserver pod..."
kubectl --kubeconfig="$KUBECONFIG" rollout restart deployment humr-apiserver
kubectl --kubeconfig="$KUBECONFIG" rollout status deployment humr-apiserver --timeout=60s

# Drop image layers no longer pinned by a running container (issue #244).
if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo "Done."
'''

@@ -254,6 +296,12 @@ echo "Restarting UI pod..."
kubectl --kubeconfig="$KUBECONFIG" rollout restart deployment humr-ui
kubectl --kubeconfig="$KUBECONFIG" rollout status deployment humr-ui --timeout=60s

if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo "Done."
'''

@@ -283,6 +331,15 @@ rm -f "$tar"
echo "Restarting agent pods..."
kubectl --kubeconfig="$KUBECONFIG" delete pods -n humr-agents --all 2>/dev/null || true

# Best-effort: prune unreferenced image layers. Running this immediately after
# `delete pods` may briefly skip layers still pinned to terminating containers;
# the next install/build sweep will catch them. (issue #244)
if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo "Done. Agent pods will restart with new image."
'''

@@ -382,6 +439,54 @@ else
fi
'''

["cluster:reclaim"]
description = "Reclaim disk in the dev cluster: prune unused images, delete completed/failed pods, surface orphan PVCs."
dir = "{{config_root}}"
run = '''
#!/usr/bin/env bash
set -eo pipefail
if [ -n "${IS_SANDBOX:-}" ]; then
KUBECONFIG="/etc/rancher/k3s/k3s.yaml"
RUN_IN_VM="sudo"
else
LIMA_INSTANCE="humr-k3s"
KUBECONFIG="$HOME/.lima/$LIMA_INSTANCE/copied-from-guest/kubeconfig.yaml"
RUN_IN_VM="limactl shell $LIMA_INSTANCE sudo"
fi

echo "[1/4] Pruning unreferenced images from containerd..."
$RUN_IN_VM k3s crictl rmi --prune || true

echo "[2/4] Deleting Succeeded/Failed pods cluster-wide..."
kubectl --kubeconfig="$KUBECONFIG" delete pod -A --field-selector=status.phase=Succeeded --ignore-not-found
kubectl --kubeconfig="$KUBECONFIG" delete pod -A --field-selector=status.phase=Failed --ignore-not-found

echo "[3/4] Checking for orphan agent PVCs..."
# Cross-reference humr.ai/instance label on each PVC against existing instance
# ConfigMaps. The controller GCs orphans every 10m; this just surfaces any that
# look stuck. Forced deletion is left to the operator.
ORPHANS=()
while IFS='|' read -r ns pvc inst; do
[ -z "$pvc" ] && continue
if ! kubectl --kubeconfig="$KUBECONFIG" -n "$ns" get configmap "$inst" >/dev/null 2>&1; then
ORPHANS+=("$ns/$pvc (instance: $inst)")
fi
done < <(
kubectl --kubeconfig="$KUBECONFIG" get pvc -A -l humr.ai/instance \
-o jsonpath='{range .items[*]}{.metadata.namespace}|{.metadata.name}|{.metadata.labels.humr\.ai/instance}{"\n"}{end}'
)
if [ "${#ORPHANS[@]}" -eq 0 ]; then
echo " no orphan PVCs found"
else
echo " ${#ORPHANS[@]} orphan PVC(s):"
printf ' %s\n' "${ORPHANS[@]}"
echo " controller will GC them automatically; or delete manually with: kubectl delete pvc -n <ns> <name>"
fi

echo "[4/4] Disk usage on the cluster VM:"
$RUN_IN_VM df -h /
'''

["cluster:status"]
description = "Show cluster and pod status. Options: --watch (continuous refresh every 2s)"
raw = true
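For reference, the new task runs like the existing `cluster:*` tasks; a sketch, assuming `mise` is the runner consuming `deploy/tasks.toml`:

```sh
# Sketch: invoke the reclaim task from the deploy directory (mise assumed as
# the task runner; the task prints VM disk usage as its final step).
cd deploy && mise run cluster:reclaim
```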
19 changes: 19 additions & 0 deletions packages/controller/main.go
@@ -115,6 +115,11 @@ func run(ctx context.Context, client kubernetes.Interface, restCfg *rest.Config,
idleChecker := reconciler.NewIdleChecker(client, cfg)
go idleChecker.RunLoop(ctx)

// Periodic GC for PVCs whose instance ConfigMap has been removed
// out-of-band (issue #244). The Delete event handler covers the
// happy path; this catches crashes mid-delete and direct kubectl removals.
go runOrphanPVCSweep(ctx, instanceReconciler, 10*time.Minute)

queue := workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[string]())
defer queue.ShutDown()

@@ -208,6 +213,20 @@ func run(ctx context.Context, client kubernetes.Interface, restCfg *rest.Config,
}
}

func runOrphanPVCSweep(ctx context.Context, r *reconciler.InstanceReconciler, interval time.Duration) {
r.ReconcileOrphanPVCs(ctx)
t := time.NewTicker(interval)
defer t.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C:
r.ReconcileOrphanPVCs(ctx)
}
}
}

func keyName(key string) string {
for i := len(key) - 1; i >= 0; i-- {
if key[i] == '/' {
45 changes: 43 additions & 2 deletions packages/controller/pkg/reconciler/instance.go
@@ -185,16 +185,57 @@ func (r *InstanceReconciler) deletePVCs(ctx context.Context, instanceName string
metav1.ListOptions{LabelSelector: "humr.ai/instance=" + instanceName},
)
if err != nil {
fmt.Printf("WARN: failed to list PVCs for instance %s: %v\n", instanceName, err)
slog.Warn("listing PVCs for instance", "instance", instanceName, "error", err)
return
}
for _, pvc := range pvcs.Items {
if err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); err != nil {
fmt.Printf("WARN: failed to delete PVC %s for instance %s: %v\n", pvc.Name, instanceName, err)
slog.Warn("deleting PVC", "pvc", pvc.Name, "instance", instanceName, "error", err)
}
}
}

// ReconcileOrphanPVCs deletes any PVC labeled `humr.ai/instance=<name>` whose
// instance ConfigMap no longer exists. Covers two leak modes (issue #244):
// the controller crashing between StatefulSet teardown and PVC deletion, and
// users removing the instance ConfigMap out-of-band (e.g. via kubectl).
//
// Safe against the create-PVC-before-finalize race because we re-read the
// ConfigMap from the API server (not the informer cache) before deleting.
func (r *InstanceReconciler) ReconcileOrphanPVCs(ctx context.Context) {
pvcs, err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).List(ctx,
metav1.ListOptions{LabelSelector: "humr.ai/instance"},
)
if err != nil {
slog.Warn("orphan PVC GC: listing PVCs failed", "error", err)
return
}
deleted := 0
for _, pvc := range pvcs.Items {
instanceName := pvc.Labels["humr.ai/instance"]
if instanceName == "" {
continue
}
_, err := r.client.CoreV1().ConfigMaps(r.config.Namespace).Get(ctx, instanceName, metav1.GetOptions{})
if err == nil {
continue
}
if !errors.IsNotFound(err) {
slog.Warn("orphan PVC GC: API lookup failed", "instance", instanceName, "error", err)
continue
}
if err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); err != nil {
slog.Warn("orphan PVC GC: delete failed", "pvc", pvc.Name, "instance", instanceName, "error", err)
continue
}
slog.Info("orphan PVC GC: deleted PVC for missing instance", "pvc", pvc.Name, "instance", instanceName)
deleted++
}
if deleted > 0 {
slog.Info("orphan PVC GC: sweep complete", "deleted", deleted, "scanned", len(pvcs.Items))
}
}

func (r *InstanceReconciler) setError(ctx context.Context, name, msg string) error {
WriteInstanceStatus(ctx, r.client, r.config.Namespace, name, types.NewInstanceStatus("error", msg))
return fmt.Errorf("instance %s: %s", name, msg)
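The sweep can be exercised end to end on a dev cluster by reproducing the out-of-band leak while the controller is down; the startup call in `runOrphanPVCSweep` then removes the PVC on restart. A sketch; the deployment name, namespace, and instance name are all assumptions:

```sh
# Sketch: simulate the kubectl-removal leak ReconcileOrphanPVCs covers.
# "humr-controller", "humr-agents", and "my-instance" are hypothetical names.
kubectl scale deployment humr-controller --replicas=0             # stop the controller
kubectl -n humr-agents delete configmap my-instance               # out-of-band removal
kubectl -n humr-agents get pvc -l humr.ai/instance=my-instance    # PVC is now orphaned
kubectl scale deployment humr-controller --replicas=1
# the sweep runs once at startup, so the PVC should be deleted shortly after
```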
34 changes: 34 additions & 0 deletions packages/controller/pkg/reconciler/instance_test.go
@@ -243,6 +243,40 @@ func TestDelete_CleansPVCs(t *testing.T) {
assert.Empty(t, pvcs.Items)
}

func TestReconcileOrphanPVCs(t *testing.T) {
// orphan: PVC labeled for an instance whose ConfigMap is gone
orphan := &corev1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: "home-agent-deleted-instance-0",
Namespace: "test-agents",
Labels: map[string]string{"humr.ai/instance": "deleted-instance"},
},
}
// live: PVC labeled for an instance that still has a ConfigMap
live := &corev1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: "home-agent-my-instance-0",
Namespace: "test-agents",
Labels: map[string]string{"humr.ai/instance": "my-instance"},
},
}
liveCM := instanceCM("running") // name = "my-instance"
r, client := setupReconciler(t,
map[string]*corev1.ConfigMap{"claude-code": agentCM()},
liveCM, orphan, live,
)

r.ReconcileOrphanPVCs(context.Background())

// orphan removed
_, err := client.CoreV1().PersistentVolumeClaims("test-agents").Get(context.Background(), orphan.Name, metav1.GetOptions{})
assert.Error(t, err, "orphan PVC should be deleted")

// live retained
_, err = client.CoreV1().PersistentVolumeClaims("test-agents").Get(context.Background(), live.Name, metav1.GetOptions{})
assert.NoError(t, err, "live instance PVC must be retained")
}

func TestEnvMappingsToEnvVars(t *testing.T) {
t.Run("empty", func(t *testing.T) {
assert.Nil(t, envMappingsToEnvVars(nil))