3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/code-guardian-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.codeGuardianTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/default-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.defaultTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/google-workspace-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.googleWorkspaceTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
3 changes: 3 additions & 0 deletions deploy/helm/humr/templates/pi-agent-template.yaml
@@ -15,6 +15,9 @@ data:
mounts:
- path: /home/agent
persist: true
{{- with .Values.piAgentTemplate.homeMountSize }}
size: {{ . | quote }}
{{- end }}
- path: /tmp
persist: false
init: |
7 changes: 7 additions & 0 deletions deploy/helm/humr/values.yaml
@@ -321,6 +321,9 @@ defaultTemplate:
tag: ""
pullPolicy: IfNotPresent
description: "Default Claude Code agent"
# -- PVC size for the persisted /home/agent mount. Empty = controller
# default (10Gi). See issue #244.
homeMountSize: "5Gi"
resources:
requests:
cpu: "250m"
@@ -338,6 +341,7 @@ googleWorkspaceTemplate:
tag: ""
pullPolicy: IfNotPresent
description: "Google Workspace agent with Drive and Gmail via gws CLI"
homeMountSize: "2Gi"
resources:
requests:
cpu: "250m"
@@ -355,6 +359,7 @@ piAgentTemplate:
tag: ""
pullPolicy: IfNotPresent
description: "Pi coding agent with multi-LLM support"
homeMountSize: "2Gi"
resources:
requests:
cpu: "250m"
@@ -374,6 +379,8 @@ codeGuardianTemplate:
description: "PR code review agent (gh + Claude Code)"
# -- owner/repo slug the agent reviews. Empty = detect via `gh repo view`.
githubRepo: ""
# Code Guardian clones real repos — keep the larger default.
homeMountSize: "10Gi"
resources:
requests:
cpu: "250m"
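A quick way to sanity-check the new value is to render the chart offline; a minimal sketch, assuming helm 3 and that the chart renders standalone with default values:

```sh
# Sketch: confirm homeMountSize reaches the rendered template (helm 3 assumed;
# the release name "humr" is illustrative).
helm template humr deploy/helm/humr \
  --set defaultTemplate.homeMountSize=20Gi | grep -A3 'path: /home/agent'
# expected under the /home/agent mount:   size: "20Gi"
```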
6 changes: 6 additions & 0 deletions deploy/lima-k3s-test.yaml
@@ -12,6 +12,12 @@ provision:
- mode: system
script: |
#!/bin/sh
mkdir -p /etc/rancher/k3s
cat > /etc/rancher/k3s/config.yaml << 'EOF'
kubelet-arg:
- "container-log-max-size=10Mi"
- "container-log-max-files=3"
EOF
if [ ! -d /var/lib/rancher/k3s ]; then
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --https-listen-port 16445" sh -
fi
9 changes: 9 additions & 0 deletions deploy/lima-k3s.yaml
@@ -24,6 +24,15 @@ provision:
DEBIAN_FRONTEND=noninteractive apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends nfs-common
echo "[humr-provision] mount.nfs4: $(command -v mount.nfs4 || echo MISSING)"
# Cap kubelet container-log usage at ~30 MiB per container (10Mi x 3 files)
# so a chatty/looping pod can't fill the dev VM disk (issue #244).
# cluster:install also ensures this file exists on existing VMs.
mkdir -p /etc/rancher/k3s
cat > /etc/rancher/k3s/config.yaml << 'EOF'
kubelet-arg:
- "container-log-max-size=10Mi"
- "container-log-max-files=3"
EOF
if [ ! -d /var/lib/rancher/k3s ]; then
curl -sfL https://get.k3s.io | INSTALL_K3S_EXEC="server --write-kubeconfig-mode 644 --https-listen-port 16443" sh -
fi
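To confirm the flags actually reach kubelet after provisioning, both the config file and the k3s journal can be checked; a sketch, assuming the `humr-k3s` instance name used by `deploy/tasks.toml`:

```sh
# Sketch: verify the rotation flags inside the dev VM (lima instance name
# "humr-k3s" assumed; k3s normally echoes the kubelet invocation to its journal).
limactl shell humr-k3s -- sudo cat /etc/rancher/k3s/config.yaml
limactl shell humr-k3s -- sudo journalctl -u k3s --no-pager | grep -m1 container-log-max
```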
105 changes: 105 additions & 0 deletions deploy/tasks.toml
@@ -137,6 +137,31 @@ else
fi
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s

# 1b. Ensure k3s container-log rotation is configured (issue #244 upgrade
# path for VMs created before these flags were added). New VMs already have
# /etc/rancher/k3s/config.yaml from the lima provision script.
K3S_CONFIG_YAML='kubelet-arg:
- "container-log-max-size=10Mi"
- "container-log-max-files=3"
'
if [ -n "${IS_SANDBOX:-}" ]; then
if ! sudo grep -q container-log-max-size /etc/rancher/k3s/config.yaml 2>/dev/null; then
echo "Configuring k3s container log rotation (issue #244)..."
sudo mkdir -p /etc/rancher/k3s
printf '%s' "$K3S_CONFIG_YAML" | sudo tee /etc/rancher/k3s/config.yaml >/dev/null
sudo systemctl restart k3s
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
fi
else
if ! limactl shell "$LIMA_INSTANCE" sudo grep -q container-log-max-size /etc/rancher/k3s/config.yaml 2>/dev/null; then
echo "Configuring k3s container log rotation (issue #244)..."
limactl shell "$LIMA_INSTANCE" sudo mkdir -p /etc/rancher/k3s
printf '%s' "$K3S_CONFIG_YAML" | limactl shell "$LIMA_INSTANCE" sudo tee /etc/rancher/k3s/config.yaml >/dev/null
limactl shell "$LIMA_INSTANCE" sudo systemctl restart k3s
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Ready node --all --timeout=120s
fi
fi

# 2. Install cert-manager
if ! kubectl --kubeconfig="$KUBECONFIG" get ns cert-manager >/dev/null 2>&1; then
echo "Installing cert-manager..."
@@ -190,6 +215,16 @@ fi
echo "Waiting for deployments to be ready..."
kubectl --kubeconfig="$KUBECONFIG" wait --for=condition=Available deployment --all --timeout=10m

# 7. Prune image layers no longer referenced by any pod (issue #244). After
# repeated reinstalls, old `:latest` layers accumulate in containerd; this
# drops anything not pinned by a running container.
echo "Pruning unused images..."
if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo ""
echo "=== Cluster ready ==="
echo "KUBECONFIG=$KUBECONFIG"
@@ -224,6 +259,13 @@ echo "Restarting apiserver pod..."
kubectl --kubeconfig="$KUBECONFIG" rollout restart deployment humr-apiserver
kubectl --kubeconfig="$KUBECONFIG" rollout status deployment humr-apiserver --timeout=60s

# Drop image layers no longer pinned by a running container (issue #244).
if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo "Done."
'''

@@ -254,6 +296,12 @@ echo "Restarting UI pod..."
kubectl --kubeconfig="$KUBECONFIG" rollout restart deployment humr-ui
kubectl --kubeconfig="$KUBECONFIG" rollout status deployment humr-ui --timeout=60s

if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo "Done."
'''

@@ -283,6 +331,15 @@ rm -f "$tar"
echo "Restarting agent pods..."
kubectl --kubeconfig="$KUBECONFIG" delete pods -n humr-agents --all 2>/dev/null || true

# Best-effort: prune unreferenced image layers. Running this immediately after
# `delete pods` may briefly skip layers still pinned to terminating containers;
# the next install/build sweep will catch them. (issue #244)
if [ -n "${IS_SANDBOX:-}" ]; then
sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
else
limactl shell "$LIMA_INSTANCE" sudo k3s crictl rmi --prune >/dev/null 2>&1 || true
fi

echo "Done. Agent pods will restart with new image."
'''

@@ -382,6 +439,54 @@ else
fi
'''

["cluster:reclaim"]
description = "Reclaim disk in the dev cluster: prune unused images, delete completed/failed pods, surface orphan PVCs."
dir = "{{config_root}}"
run = '''
#!/usr/bin/env bash
set -eo pipefail
if [ -n "${IS_SANDBOX:-}" ]; then
KUBECONFIG="/etc/rancher/k3s/k3s.yaml"
RUN_IN_VM="sudo"
else
LIMA_INSTANCE="humr-k3s"
KUBECONFIG="$HOME/.lima/$LIMA_INSTANCE/copied-from-guest/kubeconfig.yaml"
RUN_IN_VM="limactl shell $LIMA_INSTANCE sudo"
fi

echo "[1/4] Pruning unreferenced images from containerd..."
$RUN_IN_VM k3s crictl rmi --prune || true

echo "[2/4] Deleting Succeeded/Failed pods cluster-wide..."
kubectl --kubeconfig="$KUBECONFIG" delete pod -A --field-selector=status.phase=Succeeded --ignore-not-found
kubectl --kubeconfig="$KUBECONFIG" delete pod -A --field-selector=status.phase=Failed --ignore-not-found

echo "[3/4] Checking for orphan agent PVCs..."
# Cross-reference humr.ai/instance label on each PVC against existing instance
# ConfigMaps. The controller GCs orphans every 10m; this just surfaces any that
# look stuck. Forced deletion is left to the operator.
ORPHANS=()
while IFS='|' read -r ns pvc inst; do
[ -z "$pvc" ] && continue
if ! kubectl --kubeconfig="$KUBECONFIG" -n "$ns" get configmap "$inst" >/dev/null 2>&1; then
ORPHANS+=("$ns/$pvc (instance: $inst)")
fi
done < <(
kubectl --kubeconfig="$KUBECONFIG" get pvc -A -l humr.ai/instance \
-o jsonpath='{range .items[*]}{.metadata.namespace}|{.metadata.name}|{.metadata.labels.humr\.ai/instance}{"\n"}{end}'
)
if [ "${#ORPHANS[@]}" -eq 0 ]; then
echo " no orphan PVCs found"
else
echo " ${#ORPHANS[@]} orphan PVC(s):"
printf ' %s\n' "${ORPHANS[@]}"
echo " controller will GC them automatically; or delete manually with: kubectl delete pvc -n <ns> <name>"
fi

echo "[4/4] Disk usage on the cluster VM:"
$RUN_IN_VM df -h /
'''

["cluster:status"]
description = "Show cluster and pod status. Options: --watch (continuous refresh every 2s)"
raw = true
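For reference, the new task runs like the existing `cluster:*` tasks; a sketch, assuming `mise` is the runner consuming `deploy/tasks.toml`:

```sh
# Sketch: invoke the reclaim task from the deploy directory (mise assumed as
# the task runner; the task prints VM disk usage as its final step).
cd deploy && mise run cluster:reclaim
```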
19 changes: 19 additions & 0 deletions packages/controller/main.go
@@ -115,6 +115,11 @@ func run(ctx context.Context, client kubernetes.Interface, restCfg *rest.Config,
idleChecker := reconciler.NewIdleChecker(client, cfg)
go idleChecker.RunLoop(ctx)

// Periodic GC for PVCs whose instance ConfigMap has been removed
// out-of-band (issue #244). The Delete event handler covers the
// happy path; this catches crashes mid-delete and direct kubectl removals.
go runOrphanPVCSweep(ctx, instanceReconciler, 10*time.Minute)

queue := workqueue.NewTypedRateLimitingQueue(workqueue.DefaultTypedControllerRateLimiter[string]())
defer queue.ShutDown()

@@ -208,6 +213,20 @@ func run(ctx context.Context, client kubernetes.Interface, restCfg *rest.Config,
}
}

func runOrphanPVCSweep(ctx context.Context, r *reconciler.InstanceReconciler, interval time.Duration) {
r.ReconcileOrphanPVCs(ctx)
t := time.NewTicker(interval)
defer t.Stop()
for {
select {
case <-ctx.Done():
return
case <-t.C:
r.ReconcileOrphanPVCs(ctx)
}
}
}

func keyName(key string) string {
for i := len(key) - 1; i >= 0; i-- {
if key[i] == '/' {
45 changes: 43 additions & 2 deletions packages/controller/pkg/reconciler/instance.go
@@ -185,16 +185,57 @@ func (r *InstanceReconciler) deletePVCs(ctx context.Context, instanceName string
metav1.ListOptions{LabelSelector: "humr.ai/instance=" + instanceName},
)
if err != nil {
fmt.Printf("WARN: failed to list PVCs for instance %s: %v\n", instanceName, err)
slog.Warn("listing PVCs for instance", "instance", instanceName, "error", err)
return
}
for _, pvc := range pvcs.Items {
if err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); err != nil {
fmt.Printf("WARN: failed to delete PVC %s for instance %s: %v\n", pvc.Name, instanceName, err)
slog.Warn("deleting PVC", "pvc", pvc.Name, "instance", instanceName, "error", err)
}
}
}

// ReconcileOrphanPVCs deletes any PVC labeled `humr.ai/instance=<name>` whose
// instance ConfigMap no longer exists. Covers two leak modes (issue #244):
// the controller crashing between StatefulSet teardown and PVC deletion, and
// users removing the instance ConfigMap out-of-band (e.g. via kubectl).
//
// Safe against the create-PVC-before-finalize race because we re-read the
// ConfigMap from the API server (not the informer cache) before deleting.
func (r *InstanceReconciler) ReconcileOrphanPVCs(ctx context.Context) {
pvcs, err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).List(ctx,
metav1.ListOptions{LabelSelector: "humr.ai/instance"},
)
if err != nil {
slog.Warn("orphan PVC GC: listing PVCs failed", "error", err)
return
}
deleted := 0
for _, pvc := range pvcs.Items {
instanceName := pvc.Labels["humr.ai/instance"]
if instanceName == "" {
continue
}
_, err := r.client.CoreV1().ConfigMaps(r.config.Namespace).Get(ctx, instanceName, metav1.GetOptions{})
if err == nil {
continue
}
if !errors.IsNotFound(err) {
slog.Warn("orphan PVC GC: API lookup failed", "instance", instanceName, "error", err)
continue
}
if err := r.client.CoreV1().PersistentVolumeClaims(r.config.Namespace).Delete(ctx, pvc.Name, metav1.DeleteOptions{}); err != nil {
slog.Warn("orphan PVC GC: delete failed", "pvc", pvc.Name, "instance", instanceName, "error", err)
continue
}
slog.Info("orphan PVC GC: deleted PVC for missing instance", "pvc", pvc.Name, "instance", instanceName)
deleted++
}
if deleted > 0 {
slog.Info("orphan PVC GC: sweep complete", "deleted", deleted, "scanned", len(pvcs.Items))
}
}

func (r *InstanceReconciler) setError(ctx context.Context, name, msg string) error {
WriteInstanceStatus(ctx, r.client, r.config.Namespace, name, types.NewInstanceStatus("error", msg))
return fmt.Errorf("instance %s: %s", name, msg)
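The sweep can be exercised end to end on a dev cluster by reproducing the out-of-band leak while the controller is down; the startup call in `runOrphanPVCSweep` then removes the PVC on restart. A sketch; the deployment name, namespace, and instance name are all assumptions:

```sh
# Sketch: simulate the kubectl-removal leak ReconcileOrphanPVCs covers.
# "humr-controller", "humr-agents", and "my-instance" are hypothetical names.
kubectl scale deployment humr-controller --replicas=0             # stop the controller
kubectl -n humr-agents delete configmap my-instance               # out-of-band removal
kubectl -n humr-agents get pvc -l humr.ai/instance=my-instance    # PVC is now orphaned
kubectl scale deployment humr-controller --replicas=1
# the sweep runs once at startup, so the PVC should be deleted shortly after
```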
34 changes: 34 additions & 0 deletions packages/controller/pkg/reconciler/instance_test.go
@@ -243,6 +243,40 @@ func TestDelete_CleansPVCs(t *testing.T) {
assert.Empty(t, pvcs.Items)
}

func TestReconcileOrphanPVCs(t *testing.T) {
// orphan: PVC labeled for an instance whose ConfigMap is gone
orphan := &corev1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: "home-agent-deleted-instance-0",
Namespace: "test-agents",
Labels: map[string]string{"humr.ai/instance": "deleted-instance"},
},
}
// live: PVC labeled for an instance that still has a ConfigMap
live := &corev1.PersistentVolumeClaim{
ObjectMeta: metav1.ObjectMeta{
Name: "home-agent-my-instance-0",
Namespace: "test-agents",
Labels: map[string]string{"humr.ai/instance": "my-instance"},
},
}
liveCM := instanceCM("running") // name = "my-instance"
r, client := setupReconciler(t,
map[string]*corev1.ConfigMap{"claude-code": agentCM()},
liveCM, orphan, live,
)

r.ReconcileOrphanPVCs(context.Background())

// orphan removed
_, err := client.CoreV1().PersistentVolumeClaims("test-agents").Get(context.Background(), orphan.Name, metav1.GetOptions{})
assert.Error(t, err, "orphan PVC should be deleted")

// live retained
_, err = client.CoreV1().PersistentVolumeClaims("test-agents").Get(context.Background(), live.Name, metav1.GetOptions{})
assert.NoError(t, err, "live instance PVC must be retained")
}

func TestEnvMappingsToEnvVars(t *testing.T) {
t.Run("empty", func(t *testing.T) {
assert.Nil(t, envMappingsToEnvVars(nil))