diff --git a/deploy/helm/humr/templates/code-guardian-template.yaml b/deploy/helm/humr/templates/code-guardian-template.yaml
index 0535ce53..627b0536 100644
--- a/deploy/helm/humr/templates/code-guardian-template.yaml
+++ b/deploy/helm/humr/templates/code-guardian-template.yaml
@@ -15,6 +15,9 @@ data:
     mounts:
       - path: /home/agent
         persist: true
+        {{- with .Values.codeGuardianTemplate.homeMountSize }}
+        size: {{ . | quote }}
+        {{- end }}
       - path: /tmp
         persist: false
     init: |
diff --git a/deploy/helm/humr/templates/default-template.yaml b/deploy/helm/humr/templates/default-template.yaml
index 040339a6..25441f52 100644
--- a/deploy/helm/humr/templates/default-template.yaml
+++ b/deploy/helm/humr/templates/default-template.yaml
@@ -15,6 +15,9 @@ data:
     mounts:
       - path: /home/agent
         persist: true
+        {{- with .Values.defaultTemplate.homeMountSize }}
+        size: {{ . | quote }}
+        {{- end }}
       - path: /tmp
         persist: false
     init: |
diff --git a/deploy/helm/humr/templates/google-workspace-template.yaml b/deploy/helm/humr/templates/google-workspace-template.yaml
index 911173ac..8a26bd8b 100644
--- a/deploy/helm/humr/templates/google-workspace-template.yaml
+++ b/deploy/helm/humr/templates/google-workspace-template.yaml
@@ -15,6 +15,9 @@ data:
     mounts:
      - path: /home/agent
        persist: true
+       {{- with .Values.googleWorkspaceTemplate.homeMountSize }}
+       size: {{ . | quote }}
+       {{- end }}
      - path: /tmp
        persist: false
     init: |
diff --git a/deploy/helm/humr/templates/pi-agent-template.yaml b/deploy/helm/humr/templates/pi-agent-template.yaml
index c8c9e710..56f0e693 100644
--- a/deploy/helm/humr/templates/pi-agent-template.yaml
+++ b/deploy/helm/humr/templates/pi-agent-template.yaml
@@ -15,6 +15,9 @@ data:
     mounts:
      - path: /home/agent
        persist: true
+       {{- with .Values.piAgentTemplate.homeMountSize }}
+       size: {{ . | quote }}
+       {{- end }}
      - path: /tmp
        persist: false
     init: |
diff --git a/deploy/helm/humr/values.yaml b/deploy/helm/humr/values.yaml
index 723aaeb2..92b55f73 100644
--- a/deploy/helm/humr/values.yaml
+++ b/deploy/helm/humr/values.yaml
@@ -411,6 +411,9 @@ defaultTemplate:
     tag: ""
     pullPolicy: IfNotPresent
   description: "Default Claude Code agent"
+  # -- PVC size for the persisted /home/agent mount. Empty = controller
+  # default (10Gi). See issue #244.
+  homeMountSize: "5Gi"
   resources:
     requests:
       cpu: "250m"
@@ -428,6 +431,7 @@ googleWorkspaceTemplate:
     tag: ""
     pullPolicy: IfNotPresent
   description: "Google Workspace agent with Drive and Gmail via gws CLI"
+  homeMountSize: "2Gi"
   resources:
     requests:
       cpu: "250m"
@@ -445,6 +449,7 @@ piAgentTemplate:
     tag: ""
     pullPolicy: IfNotPresent
   description: "Pi coding agent with multi-LLM support"
+  homeMountSize: "2Gi"
   resources:
     requests:
       cpu: "250m"
@@ -464,6 +469,8 @@ codeGuardianTemplate:
   description: "PR code review agent (gh + Claude Code)"
   # -- owner/repo slug the agent reviews. Empty = detect via `gh repo view`.
   githubRepo: ""
+  # Code Guardian clones real repos — keep the larger default.
+  homeMountSize: "10Gi"
   resources:
     requests:
       cpu: "250m"
diff --git a/deploy/lima-k3s-test.yaml b/deploy/lima-k3s-test.yaml
index 65e39462..a78fde7d 100644
--- a/deploy/lima-k3s-test.yaml
+++ b/deploy/lima-k3s-test.yaml
@@ -26,6 +26,12 @@ provision:
       # /sbin/mount.<type> helper program."
       DEBIAN_FRONTEND=noninteractive apt-get update
       DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nfs-common
+      mkdir -p /etc/rancher/k3s
+      cat > /etc/rancher/k3s/config.yaml << 'EOF'
+      kubelet-arg:
+        - "container-log-max-size=10Mi"
+        - "container-log-max-files=3"
+      EOF
       if [ ! -d /var/lib/rancher/k3s ]; then
        curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --https-listen-port 16445" sh -
       fi
diff --git a/deploy/lima-k3s.yaml b/deploy/lima-k3s.yaml
index 6798a0e4..bedc5d14 100644
--- a/deploy/lima-k3s.yaml
+++ b/deploy/lima-k3s.yaml
@@ -24,6 +24,15 @@ provision:
       DEBIAN_FRONTEND=noninteractive apt-get update
       DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nfs-common
       echo "[humr-provision] mount.nfs4: $(command -v mount.nfs4 || echo MISSING)"
+      # Bound kubelet container-log usage at ~30 MiB per container so a chatty/looping
+      # pod can't fill the dev VM disk (issue #244). cluster:install also
+      # ensures this file exists on existing VMs.
+      mkdir -p /etc/rancher/k3s
+      cat > /etc/rancher/k3s/config.yaml << 'EOF'
+      kubelet-arg:
+        - "container-log-max-size=10Mi"
+        - "container-log-max-files=3"
+      EOF
       if [ ! -d /var/lib/rancher/k3s ]; then
        curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --https-listen-port 16443" sh -
       fi
diff --git a/deploy/tasks.toml b/deploy/tasks.toml
index f994cd98..98cd90de 100644
--- a/deploy/tasks.toml
+++ b/deploy/tasks.toml
@@ -137,6 +137,31 @@ else
 fi
 kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
 
+# 1b. Ensure k3s container-log rotation is configured (issue #244 upgrade
+# path for VMs created before this flag was added). New VMs already have
+# /etc/rancher/k3s/config.yaml from the lima provision script.
+K3S_CONFIG_YAML='kubelet-arg:
+  - "container-log-max-size=10Mi"
+  - "container-log-max-files=3"
+'
+if [ -n "${IS_SANDBOX:-}" ]; then
+  if ! sudo grep -q container-log-max-size /etc/rancher/k3s/config.yaml 2>/dev/null; then
+    echo "Configuring k3s container log rotation (issue #244)..."
+    sudo mkdir -p /etc/rancher/k3s
+    printf '%s' "$K3S_CONFIG_YAML" | sudo tee /etc/rancher/k3s/config.yaml >/dev/null
+    sudo systemctl restart k3s
+    kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
+  fi
+else
+  if ! limactl shell "$LIMA_INSTANCE" sudo grep -q container-log-max-size /etc/rancher/k3s/config.yaml 2>/dev/null; then
+    echo "Configuring k3s container log rotation (issue #244)..."
+    limactl shell "$LIMA_INSTANCE" sudo mkdir -p /etc/rancher/k3s
+    printf '%s' "$K3S_CONFIG_YAML" | limactl shell "$LIMA_INSTANCE" sudo tee /etc/rancher/k3s/config.yaml >/dev/null
+    limactl shell "$LIMA_INSTANCE" sudo systemctl restart k3s
+    kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
+  fi
+fi
+
 # 2. Install cert-manager
 if ! kubectl --kubeconfig="$KUBECONFIG" get ns cert-manager >/dev/null 2>&1; then
   echo "Installing cert-manager..."
@@ -190,6 +215,16 @@ fi
 
 echo "Waiting for deployments to be ready..."
 kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Available deployment --all --timeout=10m
 
+# 7. Prune image layers no longer referenced by any pod (issue #244). After
+# repeated reinstalls, old `:latest` layers accumulate in containerd; this
+# drops anything not pinned by a running container.
+echo "Pruning unused images..."
+if [ -n "${IS_SANDBOX:-}" ]; then
+  sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+else
+  limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+fi
+
 echo ""
 echo "=== Cluster ready ==="
 echo "KUBECONFIG=$KUBECONFIG"
@@ -224,6 +259,13 @@
 echo "Restarting apiserver pod..."
 kubectl --kubeconfig="$KUBECONFIG" rollout restart deployment humr-apiserver
 kubectl --kubeconfig="$KUBECONFIG" rollout status deployment humr-apiserver --timeout=60s
+# Drop image layers no longer pinned by a running container (issue #244).
+if [ -n "${IS_SANDBOX:-}" ]; then
+  sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+else
+  limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+fi
+
 echo "Done."
 '''
 
@@ -254,6 +296,12 @@
 echo "Restarting UI pod..."
 kubectl --kubeconfig="$KUBECONFIG" rollout restart deployment humr-ui
 kubectl --kubeconfig="$KUBECONFIG" rollout status deployment humr-ui --timeout=60s
+if [ -n "${IS_SANDBOX:-}" ]; then
+  sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+else
+  limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+fi
+
 echo "Done."
 '''
 
@@ -283,6 +331,15 @@
 rm -f "$tar"
 echo "Restarting agent pods..."
 kubectl --kubeconfig="$KUBECONFIG" delete pods -n humr-agents --all 2>/dev/null || true
+# Best-effort: prune unreferenced image layers. Running this immediately after
+# `delete pods` may briefly skip layers still pinned to terminating containers;
+# the next install/build sweep will catch them. (issue #244)
+if [ -n "${IS_SANDBOX:-}" ]; then
+  sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+else
+  limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
+fi
+
 echo "Done. Agent pods will restart with new image."
 '''
 
@@ -382,6 +439,54 @@ else
 fi
 '''
 
+["cluster:reclaim"]
+description = "Reclaim disk in the dev cluster: prune unused images, delete completed/failed pods, surface orphan PVCs."
+dir = "{{config_root}}"
+run = '''
+#!/usr/bin/env bash
+set -eo pipefail
+if [ -n "${IS_SANDBOX:-}" ]; then
+  KUBECONFIG="/etc/rancher/k3s/k3s.yaml"
+  RUN_IN_VM="sudo"
+else
+  LIMA_INSTANCE="humr-k3s"
+  KUBECONFIG="$HOME/.lima/$LIMA_INSTANCE/copied-from-guest/kubeconfig.yaml"
+  RUN_IN_VM="limactl shell $LIMA_INSTANCE sudo"
+fi
+
+echo "[1/4] Pruning unreferenced images from containerd..."
+$RUN_IN_VM k3s crictl rmi --prune || true
+
+echo "[2/4] Deleting Succeeded/Failed pods cluster-wide..."
+kubectl --kubeconfig="$KUBECONFIG" delete pod -A --field-selector=status.phase=Succeeded --ignore-not-found
+kubectl --kubeconfig="$KUBECONFIG" delete pod -A --field-selector=status.phase=Failed --ignore-not-found
+
+echo "[3/4] Checking for orphan agent PVCs..."
+# Cross-reference humr.ai/instance label on each PVC against existing instance
+# ConfigMaps. The controller GCs orphans every 10m; this just surfaces any that
+# look stuck. Forced deletion is left to the operator.
+ORPHANS=()
+while IFS='|' read -r ns pvc inst; do
+  [ -z "$pvc" ] && continue
+  if ! kubectl --kubeconfig="$KUBECONFIG" -n "$ns" get configmap "$inst" >/dev/null 2>&1; then
+    ORPHANS+=("$ns/$pvc (instance: $inst)")
+  fi
+done < <(
+  kubectl --kubeconfig="$KUBECONFIG" get pvc -A -l humr.ai/instance \
+    -o jsonpath='{range .items[*]}{.metadata.namespace}|{.metadata.name}|{.metadata.labels.humr\.ai/instance}{"\n"}{end}'
+)
+if [ "${#ORPHANS[@]}" -eq 0 ]; then
+  echo "  no orphan PVCs found"
+else
+  echo "  ${#ORPHANS[@]} orphan PVC(s):"
+  printf '    %s\n' "${ORPHANS[@]}"
+  echo "  controller will GC them automatically; or delete manually with: kubectl delete pvc -n <namespace> <pvc-name>"
+fi
+
+echo "[4/4] Disk usage on the cluster VM:"
+$RUN_IN_VM df -h /
+'''
+
 ["cluster:status"]
 description = "Show cluster and pod status. Options: --watch (continuous refresh every 2s)"
 raw = true
diff --git a/packages/controller/main.go b/packages/controller/main.go
index 5139d908..9d27cb95 100644
--- a/packages/controller/main.go
+++ b/packages/controller/main.go
@@ -121,6 +121,11 @@ func run(ctx context.Context, client kubernetes.Interface, dynClient dynamic.Int
 	idleChecker := reconciler.NewIdleChecker(client, cfg)
 	go idleChecker.RunLoop(ctx)
 
+	// Periodic GC for PVCs whose instance ConfigMap has been removed
+	// out-of-band (issue #244). The Delete event handler covers the
+	// happy path; this catches crashes mid-delete and direct kubectl removals.
+	go runOrphanPVCSweep(ctx, instanceReconciler, 10*time.Minute)
+
 	queue := workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[string]())
 	defer queue.ShutDown()
 
@@ -214,6 +219,20 @@ func run(ctx context.Context, client kubernetes.Interface, dynClient dynamic.Int
 	}
 }
 
+func runOrphanPVCSweep(ctx context.Context, r *reconciler.InstanceReconciler, interval time.Duration) {
+	r.ReconcileOrphanPVCs(ctx)
+	t := time.NewTicker(interval)
+	defer t.Stop()
+	for {
+		select {
+		case <-ctx.Done():
+			return
+		case <-t.C:
+			r.ReconcileOrphanPVCs(ctx)
+		}
+	}
+}
+
 func keyName(key string) string {
 	for i := len(key) - 1; i >= 0; i-- {
 		if key[i] == '/' {
diff --git a/packages/controller/pkg/reconciler/instance.go b/packages/controller/pkg/reconciler/instance.go
index e73f1703..ba53e51c 100644
--- a/packages/controller/pkg/reconciler/instance.go
+++ b/packages/controller/pkg/reconciler/instance.go
@@ -234,16 +234,57 @@ func (r *InstanceReconciler) deletePVCs(ctx context.Context, instanceName string
 		metav1.ListOptions{LabelSelector: "humr.ai/instance=" + instanceName},
 	)
 	if err != nil {
-		fmt.Printf("WARN: failed to list PVCs for instance %s: %v\n", instanceName, err)
+		slog.Warn("listing PVCs for instance", "instance", instanceName, "error", err)
 		return
 	}
 	for _, pvc := range pvcs.Items {
 		if err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); err != nil {
-			fmt.Printf("WARN: failed to delete PVC %s for instance %s: %v\n", pvc.Name, instanceName, err)
+			slog.Warn("deleting PVC", "pvc", pvc.Name, "instance", instanceName, "error", err)
 		}
 	}
 }
 
+// ReconcileOrphanPVCs deletes any PVC labeled `humr.ai/instance=<name>` whose
+// instance ConfigMap no longer exists. Covers two leak modes (issue #244):
+// the controller crashing between StatefulSet teardown and PVC deletion, and
+// users removing the instance ConfigMap out-of-band (e.g. via kubectl).
+//
+// Safe against the create-PVC-before-finalize race because we re-read the
+// ConfigMap from the API server (not the informer cache) before deleting.
+func (r *InstanceReconciler) ReconcileOrphanPVCs(ctx context.Context) {
+	pvcs, err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).List(ctx,
+		metav1.ListOptions{LabelSelector: "humr.ai/instance"},
+	)
+	if err != nil {
+		slog.Warn("orphan PVC GC: listing PVCs failed", "error", err)
+		return
+	}
+	deleted := 0
+	for _, pvc := range pvcs.Items {
+		instanceName := pvc.Labels["humr.ai/instance"]
+		if instanceName == "" {
+			continue
+		}
+		_, err := r.client.CoreV1().ConfigMaps(r.config.Namespace).Get(ctx, instanceName, metav1.GetOptions{})
+		if err == nil {
+			continue
+		}
+		if !errors.IsNotFound(err) {
+			slog.Warn("orphan PVC GC: API lookup failed", "instance", instanceName, "error", err)
+			continue
+		}
+		if err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); err != nil {
+			slog.Warn("orphan PVC GC: delete failed", "pvc", pvc.Name, "instance", instanceName, "error", err)
+			continue
+		}
+		slog.Info("orphan PVC GC: deleted PVC for missing instance", "pvc", pvc.Name, "instance", instanceName)
+		deleted++
+	}
+	if deleted > 0 {
+		slog.Info("orphan PVC GC: sweep complete", "deleted", deleted, "scanned", len(pvcs.Items))
+	}
+}
+
 func (r *InstanceReconciler) setError(ctx context.Context, name, msg string) error {
 	WriteInstanceStatus(ctx, r.client, r.config.Namespace, name, types.NewInstanceStatus("error", msg))
 	return fmt.Errorf("instance %s: %s", name, msg)
diff --git a/packages/controller/pkg/reconciler/instance_test.go b/packages/controller/pkg/reconciler/instance_test.go
index f5886fe6..71c3cc76 100644
--- a/packages/controller/pkg/reconciler/instance_test.go
+++ b/packages/controller/pkg/reconciler/instance_test.go
@@ -243,6 +243,40 @@ func TestDelete_CleansPVCs(t *testing.T) {
 	assert.Empty(t, pvcs.Items)
 }
 
+func TestReconcileOrphanPVCs(t *testing.T) {
+	// orphan: PVC labeled for an instance whose ConfigMap is gone
+	orphan := &corev1.PersistentVolumeClaim{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "home-agent-deleted-instance-0",
+			Namespace: "test-agents",
+			Labels:    map[string]string{"humr.ai/instance": "deleted-instance"},
+		},
+	}
+	// live: PVC labeled for an instance that still has a ConfigMap
+	live := &corev1.PersistentVolumeClaim{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "home-agent-my-instance-0",
+			Namespace: "test-agents",
+			Labels:    map[string]string{"humr.ai/instance": "my-instance"},
+		},
+	}
+	liveCM := instanceCM("running") // name = "my-instance"
+	r, client := setupReconciler(t,
+		map[string]*corev1.ConfigMap{"claude-code": agentCM()},
+		liveCM, orphan, live,
+	)
+
+	r.ReconcileOrphanPVCs(context.Background())
+
+	// orphan removed
+	_, err := client.CoreV1().PersistentVolumeClaims("test-agents").Get(context.Background(), orphan.Name, metav1.GetOptions{})
+	assert.Error(t, err, "orphan PVC should be deleted")
+
+	// live retained
+	_, err = client.CoreV1().PersistentVolumeClaims("test-agents").Get(context.Background(), live.Name, metav1.GetOptions{})
+	assert.NoError(t, err, "live instance PVC must be retained")
+}
+
 func TestEnvMappingsToEnvVars(t *testing.T) {
 	t.Run("empty", func(t *testing.T) {
 		assert.Nil(t, envMappingsToEnvVars(nil))
diff --git a/packages/controller/pkg/reconciler/resources.go b/packages/controller/pkg/reconciler/resources.go
index e8462263..d381cf54 100644
--- a/packages/controller/pkg/reconciler/resources.go
+++ b/packages/controller/pkg/reconciler/resources.go
@@ -99,7 +99,13 @@ func BuildStatefulSet(name string, instance *types.InstanceSpec, agentSpec *type
 			Name:      volName,
 			MountPath: m.Path,
 		})
 		if m.Persist {
-			storageSize := cfg.AgentStorageSize
+			// Per-mount `size:` (issue #244) wins over the cluster-wide
+			// AgentStorageSize default; both fall back to 10Gi.
+			// Validation happens in ParseAgentSpec, so MustParse here is safe.
+			storageSize := m.Size
+			if storageSize == "" {
+				storageSize = cfg.AgentStorageSize
+			}
 			if storageSize == "" {
 				storageSize = "10Gi"
 			}
diff --git a/packages/controller/pkg/reconciler/resources_test.go b/packages/controller/pkg/reconciler/resources_test.go
index a034880a..aab10d30 100644
--- a/packages/controller/pkg/reconciler/resources_test.go
+++ b/packages/controller/pkg/reconciler/resources_test.go
@@ -186,6 +186,30 @@ func TestBuildStatefulSet_Volumes(t *testing.T) {
 	assert.Equal(t, "ca-cert", mountPaths["/etc/humr/ca"])
 }
 
+func TestBuildStatefulSet_PVCSize(t *testing.T) {
+	// Mount with explicit size renders a PVC sized accordingly; mount without
+	// size falls back to the historical 10Gi default. (issue #244)
+	agent := types.AgentSpec{
+		Image: "humr-test:latest",
+		Mounts: []types.Mount{
+			{Path: "/home/agent", Persist: true, Size: "2Gi"},
+			{Path: "/cache", Persist: true},
+		},
+	}
+	instance := &types.InstanceSpec{DesiredState: "running"}
+	ss := BuildStatefulSet("my-instance", instance, &agent, testConfig, "my-agent", testOwnerCM, nil, nil)
+
+	require.Len(t, ss.Spec.VolumeClaimTemplates, 2)
+	byName := map[string]corev1.PersistentVolumeClaim{}
+	for _, pvc := range ss.Spec.VolumeClaimTemplates {
+		byName[pvc.Name] = pvc
+	}
+	home := byName["home-agent"].Spec.Resources.Requests[corev1.ResourceStorage]
+	cache := byName["cache"].Spec.Resources.Requests[corev1.ResourceStorage]
+	assert.Equal(t, "2Gi", home.String())
+	assert.Equal(t, "10Gi", cache.String())
+}
+
 func TestBuildStatefulSet_AgentStorageClass(t *testing.T) {
 	cfg := *testConfig
 	cfg.AgentStorageClass = "humr-rwx"
diff --git a/packages/controller/pkg/types/types.go b/packages/controller/pkg/types/types.go
index 0b71f858..d44a4c81 100644
--- a/packages/controller/pkg/types/types.go
+++ b/packages/controller/pkg/types/types.go
@@ -9,6 +9,7 @@ import (
 	"github.com/robfig/cron/v3"
 	"github.com/teambition/rrule-go"
 	"gopkg.in/yaml.v3"
+	"k8s.io/apimachinery/pkg/api/resource"
 )
 
 var quietHoursTimeRE = regexp.MustCompile(`^([01][0-9]|2[0-3]):[0-5][0-9]$`)
@@ -33,6 +34,10 @@ type AgentSpec struct {
 type Mount struct {
 	Path    string `yaml:"path"`
 	Persist bool   `yaml:"persist"`
+	// Size is an optional K8s resource Quantity (e.g. "2Gi") for a persisted
+	// mount's PVC. When empty the controller defaults to 10Gi to match the
+	// pre-issue-#244 behavior. Ignored when Persist is false.
+	Size string `yaml:"size,omitempty"`
 }
 
 type EnvVar struct {
@@ -174,6 +179,11 @@ func ParseAgentSpec(data string) (*AgentSpec, error) {
 		if !strings.HasPrefix(m.Path, "/") {
 			return nil, fmt.Errorf("agent spec: mount path %q must be absolute", m.Path)
 		}
+		if m.Size != "" {
+			if _, err := resource.ParseQuantity(m.Size); err != nil {
+				return nil, fmt.Errorf("agent spec: mount %q size %q is not a valid K8s quantity: %w", m.Path, m.Size, err)
+			}
+		}
 	}
 	return &spec, nil
 }
diff --git a/packages/controller/pkg/types/types_test.go b/packages/controller/pkg/types/types_test.go
index 9859deb5..bb10cebb 100644
--- a/packages/controller/pkg/types/types_test.go
+++ b/packages/controller/pkg/types/types_test.go
@@ -85,6 +85,32 @@ mounts:
 	assert.Contains(t, err.Error(), "must be absolute")
 }
 
+func TestParseAgentSpec_MountSize(t *testing.T) {
+	spec, err := ParseAgentSpec(`version: humr.ai/v1
+image: foo
+mounts:
+  - path: /home/agent
+    persist: true
+    size: 2Gi
+  - path: /tmp
+    persist: false`)
+	require.NoError(t, err)
+	require.Len(t, spec.Mounts, 2)
+	assert.Equal(t, "2Gi", spec.Mounts[0].Size)
+	assert.Empty(t, spec.Mounts[1].Size)
+}
+
+func TestParseAgentSpec_MountSizeInvalid(t *testing.T) {
+	_, err := ParseAgentSpec(`version: humr.ai/v1
+image: foo
+mounts:
+  - path: /home/agent
+    persist: true
+    size: notaquantity`)
+	assert.Error(t, err)
+	assert.Contains(t, err.Error(), "valid K8s quantity")
+}
+
 // --- Instance ---
 
 func TestParseInstanceSpec(t *testing.T) {