From 983c3df9764c2b8bc463fc6b3728eb845af3c11f Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Mon, 8 Jun 2026 21:51:24 +0000
Subject: [PATCH 1/6] webhook: inject CUDA_DEVICE_MEMORY_LIMIT from gpu-memory
 annotation

The KAI binder sets the gpu-memory annotation (MiB) on shared pods but never
passes CUDA_DEVICE_MEMORY_LIMIT, which HAMi-core (libvgpu) reads to enforce the
per-pod GPU memory cap. As a result, on KAI fractional-sharing pods libvgpu
loads via ld.so.preload but enforces nothing (nvidia-smi shows full device
memory).

This patch makes the mutating webhook translate the gpu-memory annotation into
CUDA_DEVICE_MEMORY_LIMIT=<value>m on every container and init container. It
skips containers that already set the variable and creates the env array when a
container has none, so libvgpu enforces the requested cap. gpu-fraction carries
no absolute memory value and is left untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
Signed-off-by: limes22 <limes22@users.noreply.github.com>
---
 cmd/webhook/main.go | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
index edf8ad8..05d79f3 100644
--- a/cmd/webhook/main.go
+++ b/cmd/webhook/main.go
@@ -28,6 +28,10 @@ const (
 	injectAnnotationKey = "kai-resource-isolator.io/inject"
 	gpuFractionKey      = "gpu-fraction"
 	gpuMemoryKey        = "gpu-memory"
+	// cudaMemLimitEnv is the HAMi-core (libvgpu) memory-limit env var. KAI sets
+	// the gpu-memory annotation (MiB) but does not pass this env, so libvgpu would
+	// not enforce any cap. We translate gpu-memory -> CUDA_DEVICE_MEMORY_LIMIT=<v>m.
+	cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT"
 )
 
 func main() {
@@ -225,12 +229,49 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
 		}
 	}
 
+	// Translate the KAI gpu-memory annotation (MiB) into the HAMi-core memory
+	// limit env so libvgpu actually enforces the per-pod cap. gpu-fraction (compute
+	// share) carries no absolute memory value, so we only act on gpu-memory.
+	if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" {
+		limitValue := memMiB + "m"
+		for i := range pod.Spec.InitContainers {
+			ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue)
+		}
+		for i := range pod.Spec.Containers {
+			ops = appendMemLimitEnvOp(ops, &pod.Spec.Containers[i], "containers", i, limitValue)
+		}
+	}
+
 	if len(ops) == 0 {
 		return nil, nil
 	}
 	return json.Marshal(ops)
 }
 
+// appendMemLimitEnvOp adds a JSON patch op setting CUDA_DEVICE_MEMORY_LIMIT on the
+// container, unless it is already present (user-set values win). It handles the
+// case where the container has no env array yet (add the array, not append to it).
+func appendMemLimitEnvOp(ops []map[string]interface{}, c *corev1.Container, field string, i int, limitValue string) []map[string]interface{} {
+	for _, e := range c.Env {
+		if e.Name == cudaMemLimitEnv {
+			return ops
+		}
+	}
+	envVar := map[string]interface{}{"name": cudaMemLimitEnv, "value": limitValue}
+	if len(c.Env) == 0 {
+		return append(ops, map[string]interface{}{
+			"op":    "add",
+			"path":  fmt.Sprintf("/spec/%s/%d/env", field, i),
+			"value": []map[string]interface{}{envVar},
+		})
+	}
+	return append(ops, map[string]interface{}{
+		"op":    "add",
+		"path":  fmt.Sprintf("/spec/%s/%d/env/-", field, i),
+		"value": envVar,
+	})
+}
+
 func podNeedsInjection(pod *corev1.Pod) bool {
 	if pod.Annotations == nil {
 		return false

From d668af9cdcf91f9401c35462804e5843105a20cf Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Wed, 10 Jun 2026 06:06:37 +0000
Subject: [PATCH 2/6] webhook: enforce gpu-fraction VRAM cap via autodetected
 per-GPU VRAM

Builds on the gpu-memory injection: gpu-fraction pods loaded libvgpu but
received no CUDA_DEVICE_MEMORY_LIMIT, so nvidia-smi showed full device
memory. Translate gpu-fraction into an absolute cap = fraction x per-GPU
VRAM.

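Per-GPU VRAM is autodetected from the nvidia.com/gpu.memory node label; in a
heterogeneous cluster the minimum across GPU nodes is used. A valid
PER_GPU_VRAM_MIB env value overrides autodetection. If the basis cannot be
determined, gpu-fraction caps are skipped rather than guessed. gpu-memory
handling is unchanged. Adds a ClusterRole and ClusterRoleBinding granting the
webhook's service account read access (get/list/watch) to nodes.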

Signed-off-by: limes22 <limes22@users.noreply.github.com>
---
 .../templates/webhook-clusterrole.yaml        |  31 ++++++
 cmd/webhook/main.go                           |  53 ++++++++-
 cmd/webhook/vram.go                           | 101 ++++++++++++++++++
 cmd/webhook/vram_test.go                      |  50 +++++++++
 go.mod                                        |  22 +++-
 go.sum                                        |  63 +++++++++++
 6 files changed, 315 insertions(+), 5 deletions(-)
 create mode 100644 chart/kai-resource-isolator/templates/webhook-clusterrole.yaml
 create mode 100644 cmd/webhook/vram.go
 create mode 100644 cmd/webhook/vram_test.go

diff --git a/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml b/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml
new file mode 100644
index 0000000..a059e24
--- /dev/null
+++ b/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml
@@ -0,0 +1,31 @@
+{{/*
+The webhook autodetects per-GPU VRAM from node labels (nvidia.com/gpu.memory) to
+translate gpu-fraction into an absolute HAMi-core memory cap. That requires
+read access to nodes. Not needed when PER_GPU_VRAM_MIB is set explicitly, but
+harmless (read-only) to grant either way.
+*/}}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect
+  labels:
+    {{- include "kai-resource-isolator.labels" . | nindent 4 }}
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect
+  labels:
+    {{- include "kai-resource-isolator.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "kai-resource-isolator.webhook.fullname" . }}
+    namespace: {{ .Release.Namespace }}
diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
index 05d79f3..c14c870 100644
--- a/cmd/webhook/main.go
+++ b/cmd/webhook/main.go
@@ -7,6 +7,7 @@ SPDX-License-Identifier: Apache-2.0
 package main
 
 import (
+	"context"
 	"encoding/json"
 	"flag"
 	"fmt"
@@ -14,7 +15,9 @@ import (
 	"log"
 	"net/http"
 	"os"
+	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"
 
 	admissionv1 "k8s.io/api/admission/v1"
@@ -34,6 +37,13 @@ const (
 	cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT"
 )
 
+// perGpuVramMiB is the per-GPU VRAM (MiB) used to translate a gpu-fraction into an
+// absolute HAMi-core memory cap. It comes from the PER_GPU_VRAM_MIB env
+// (authoritative) or is autodetected from node labels and refreshed in the
+// background, so access is atomic. Zero means "unknown" — gpu-fraction caps are
+// then skipped rather than guessed (gpu-memory pods are unaffected).
+var perGpuVramMiB atomic.Int64
+
 func main() {
 	certFile := flag.String("tls-cert-file", "/etc/tls/tls.crt", "TLS certificate")
 	keyFile := flag.String("tls-private-key-file", "/etc/tls/tls.key", "TLS private key")
@@ -41,6 +51,28 @@ func main() {
 	containerMount := flag.String("container-vgpu-mount", getenv("CONTAINER_VGPU_MOUNT", "/usr/local/vgpu"), "Mount path inside the pod for the node vgpu directory (must match DaemonSet install path and ld.so.preload)")
 	flag.Parse()
 
+	// Per-GPU VRAM basis for translating gpu-fraction into a HAMi-core memory cap.
+	// Precedence: explicit PER_GPU_VRAM_MIB env (authoritative) > autodetect from
+	// node labels. No hardcoded default — unknown basis means gpu-fraction caps are
+	// skipped (see buildJSONPatch).
+	envOverride := false
+	if v := os.Getenv("PER_GPU_VRAM_MIB"); v != "" {
+		if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 {
+			perGpuVramMiB.Store(n)
+			envOverride = true
+			log.Printf("per-GPU VRAM basis = %d MiB (from PER_GPU_VRAM_MIB)", n)
+		} else {
+			log.Printf("ignoring invalid PER_GPU_VRAM_MIB=%q", v)
+		}
+	}
+	if !envOverride {
+		if cs, err := newInClusterClientset(); err != nil {
+			log.Printf("per-GPU VRAM autodetect unavailable (%v); set PER_GPU_VRAM_MIB to enable gpu-fraction caps", err)
+		} else {
+			startVramAutodetect(context.Background(), cs)
+		}
+	}
+
 	mux := http.NewServeMux()
 	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@@ -229,11 +261,24 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
 		}
 	}
 
-	// Translate the KAI gpu-memory annotation (MiB) into the HAMi-core memory
-	// limit env so libvgpu actually enforces the per-pod cap. gpu-fraction (compute
-	// share) carries no absolute memory value, so we only act on gpu-memory.
+	// Translate the KAI resource share into the HAMi-core memory-limit env so
+	// libvgpu actually enforces the per-pod VRAM cap. KAI sets the annotation but
+	// passes no such env. gpu-memory carries an absolute MiB; gpu-fraction carries
+	// a share that we multiply by the per-GPU VRAM. The two are mutually exclusive.
+	limitValue := ""
 	if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" {
-		limitValue := memMiB + "m"
+		limitValue = memMiB + "m"
+	} else if fracStr, ok := pod.Annotations[gpuFractionKey]; ok && fracStr != "" {
+		basis := perGpuVramMiB.Load()
+		if frac, err := strconv.ParseFloat(fracStr, 64); err != nil || frac <= 0 {
+			log.Printf("gpu-fraction %q unparseable; skipping memory-limit injection", fracStr)
+		} else if basis <= 0 {
+			log.Printf("per-GPU VRAM unknown (no PER_GPU_VRAM_MIB env and no %s node label); skipping gpu-fraction memory cap", gpuMemoryNodeLabel)
+		} else if limitMiB := int64(frac * float64(basis)); limitMiB > 0 {
+			limitValue = strconv.FormatInt(limitMiB, 10) + "m"
+		}
+	}
+	if limitValue != "" {
 		for i := range pod.Spec.InitContainers {
 			ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue)
 		}
diff --git a/cmd/webhook/vram.go b/cmd/webhook/vram.go
new file mode 100644
index 0000000..5488b27
--- /dev/null
+++ b/cmd/webhook/vram.go
@@ -0,0 +1,101 @@
+/*
+Copyright The HAMi Authors.
+SPDX-License-Identifier: Apache-2.0
+*/
+
+package main
+
+import (
+	"context"
+	"log"
+	"strconv"
+	"time"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/rest"
+)
+
+// gpuMemoryNodeLabel is the per-GPU VRAM (MiB) advertised on GPU nodes by NVIDIA
+// GPU Feature Discovery / Node Feature Discovery.
+const gpuMemoryNodeLabel = "nvidia.com/gpu.memory"
+
+// vramAutodetectInterval is how often the basis is refreshed (GPU nodes may join
+// or change after startup).
+const vramAutodetectInterval = 10 * time.Minute
+
+// newInClusterClientset builds a clientset from the pod's service account.
+func newInClusterClientset() (kubernetes.Interface, error) {
+	cfg, err := rest.InClusterConfig()
+	if err != nil {
+		return nil, err
+	}
+	return kubernetes.NewForConfig(cfg)
+}
+
+// detectPerGpuVramMiB returns the per-GPU VRAM basis (MiB) for gpu-fraction caps,
+// read from node labels. A homogeneous cluster yields that single value; a
+// heterogeneous cluster yields the minimum across GPU nodes — the cap that holds
+// on whichever GPU a pod lands on, since the target GPU is unknown at admission —
+// and logs the spread. Returns 0 when no GPU node advertises the label.
+func detectPerGpuVramMiB(ctx context.Context, cs kubernetes.Interface) (int64, error) {
+	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return 0, err
+	}
+	var minMiB int64
+	seen := map[int64]int{}
+	for i := range nodes.Items {
+		raw := nodes.Items[i].Labels[gpuMemoryNodeLabel]
+		if raw == "" {
+			continue
+		}
+		n, perr := strconv.ParseInt(raw, 10, 64)
+		if perr != nil || n <= 0 {
+			continue
+		}
+		seen[n]++
+		if minMiB == 0 || n < minMiB {
+			minMiB = n
+		}
+	}
+	if len(seen) > 1 {
+		log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB for gpu-fraction caps (set PER_GPU_VRAM_MIB to override)", seen, minMiB)
+	}
+	return minMiB, nil
+}
+
+// startVramAutodetect sets perGpuVramMiB from node labels immediately and then
+// refreshes it periodically in the background.
+func startVramAutodetect(ctx context.Context, cs kubernetes.Interface) {
+	refresh := func() {
+		v, err := detectPerGpuVramMiB(ctx, cs)
+		if err != nil {
+			log.Printf("per-GPU VRAM autodetect failed: %v (keeping %d MiB)", err, perGpuVramMiB.Load())
+			return
+		}
+		if v > 0 {
+			perGpuVramMiB.Store(v)
+		}
+	}
+
+	refresh()
+	if got := perGpuVramMiB.Load(); got > 0 {
+		log.Printf("per-GPU VRAM basis = %d MiB (autodetected from %q node labels)", got, gpuMemoryNodeLabel)
+	} else {
+		log.Printf("per-GPU VRAM not detected from node labels; gpu-fraction caps disabled until PER_GPU_VRAM_MIB is set or %q appears", gpuMemoryNodeLabel)
+	}
+
+	go func() {
+		t := time.NewTicker(vramAutodetectInterval)
+		defer t.Stop()
+		for {
+			select {
+			case <-ctx.Done():
+				return
+			case <-t.C:
+				refresh()
+			}
+		}
+	}()
+}
diff --git a/cmd/webhook/vram_test.go b/cmd/webhook/vram_test.go
new file mode 100644
index 0000000..cfd88ec
--- /dev/null
+++ b/cmd/webhook/vram_test.go
@@ -0,0 +1,50 @@
+/*
+Copyright The HAMi Authors.
+SPDX-License-Identifier: Apache-2.0
+*/
+
+package main
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"testing"
+	"time"
+
+	"k8s.io/client-go/kubernetes"
+	"k8s.io/client-go/tools/clientcmd"
+)
+
+// TestDetectPerGpuVramMiBLive verifies per-GPU VRAM autodetection against the
+// current kube context. It is skipped when no kubeconfig/cluster is reachable, so
+// it is safe in CI; run it against a GPU cluster to validate label detection.
+func TestDetectPerGpuVramMiBLive(t *testing.T) {
+	kubeconfig := os.Getenv("KUBECONFIG")
+	if kubeconfig == "" {
+		home, err := os.UserHomeDir()
+		if err != nil {
+			t.Skipf("no home dir: %v", err)
+		}
+		kubeconfig = filepath.Join(home, ".kube", "config")
+	}
+	cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig)
+	if err != nil {
+		t.Skipf("no usable kubeconfig (%s): %v", kubeconfig, err)
+	}
+	cs, err := kubernetes.NewForConfig(cfg)
+	if err != nil {
+		t.Skipf("clientset: %v", err)
+	}
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+
+	v, err := detectPerGpuVramMiB(ctx, cs)
+	if err != nil {
+		t.Skipf("cluster unreachable: %v", err)
+	}
+	t.Logf("autodetected per-GPU VRAM = %d MiB", v)
+	if v <= 0 {
+		t.Fatalf("expected a positive per-GPU VRAM basis from node labels, got %d", v)
+	}
+}
diff --git a/go.mod b/go.mod
index bff0cec..145a55a 100644
--- a/go.mod
+++ b/go.mod
@@ -5,24 +5,44 @@ go 1.25.0
 require (
 	k8s.io/api v0.35.3
 	k8s.io/apimachinery v0.35.3
+	k8s.io/client-go v0.35.3
 )
 
 require (
+	github.com/davecgh/go-spew v1.1.1 // indirect
+	github.com/emicklei/go-restful/v3 v3.12.2 // indirect
 	github.com/fxamacker/cbor/v2 v2.9.0 // indirect
 	github.com/go-logr/logr v1.4.3 // indirect
+	github.com/go-openapi/jsonpointer v0.21.0 // indirect
+	github.com/go-openapi/jsonreference v0.20.2 // indirect
+	github.com/go-openapi/swag v0.23.0 // indirect
+	github.com/google/gnostic-models v0.7.0 // indirect
+	github.com/google/uuid v1.6.0 // indirect
+	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/kr/text v0.2.0 // indirect
+	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
+	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/spf13/pflag v1.0.9 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	go.yaml.in/yaml/v2 v2.4.3 // indirect
+	go.yaml.in/yaml/v3 v3.0.4 // indirect
 	golang.org/x/net v0.47.0 // indirect
+	golang.org/x/oauth2 v0.30.0 // indirect
+	golang.org/x/sys v0.38.0 // indirect
+	golang.org/x/term v0.37.0 // indirect
 	golang.org/x/text v0.31.0 // indirect
+	golang.org/x/time v0.9.0 // indirect
+	google.golang.org/protobuf v1.36.8 // indirect
+	gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
 	k8s.io/klog/v2 v2.130.1 // indirect
 	k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect
 	k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect
 	sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect
 	sigs.k8s.io/randfill v1.0.0 // indirect
 	sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect
+	sigs.k8s.io/yaml v1.6.0 // indirect
 )
diff --git a/go.sum b/go.sum
index 4dbbd40..05604fc 100644
--- a/go.sum
+++ b/go.sum
@@ -1,26 +1,59 @@
+github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0=
+github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM=
 github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
 github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
 github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
 github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU=
+github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc=
 github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM=
 github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ=
 github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI=
 github.com/go-logr/logr v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY=
+github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs=
+github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ=
+github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY=
+github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE=
+github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k=
+github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14=
+github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE=
+github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ=
+github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI=
+github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8=
+github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo=
+github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ=
 github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8=
 github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU=
 github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg=
+github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8=
+github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA=
+github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
+github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
+github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY=
+github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y=
 github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM=
 github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo=
+github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
 github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE=
 github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk=
+github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
+github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
 github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
 github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
+github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0=
+github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc=
 github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
 github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
 github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
 github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8=
 github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
+github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
+github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns=
+github.com/onsi/ginkgo/v2 v2.27.2/go.mod h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo=
+github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A=
+github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k=
 github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
@@ -28,28 +61,58 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7
 github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY=
 github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg=
 github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
+github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
+github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
+github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
 github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
+github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
+github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
 github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
 go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
 go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
+go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=
+go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg=
+golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA=
+golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w=
 golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY=
 golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU=
+golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI=
+golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU=
+golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I=
+golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI=
+golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc=
+golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks=
+golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU=
+golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254=
 golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM=
 golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM=
+golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY=
+golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM=
+golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ=
+golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs=
+google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc=
+google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU=
 gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
 gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
+gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo=
+gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M=
 gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc=
 gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
 gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
 k8s.io/api v0.35.3 h1:pA2fiBc6+N9PDf7SAiluKGEBuScsTzd2uYBkA5RzNWQ=
 k8s.io/api v0.35.3/go.mod h1:9Y9tkBcFwKNq2sxwZTQh1Njh9qHl81D0As56tu42GA4=
 k8s.io/apimachinery v0.35.3 h1:MeaUwQCV3tjKP4bcwWGgZ/cp/vpsRnQzqO6J6tJyoF8=
 k8s.io/apimachinery v0.35.3/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns=
+k8s.io/client-go v0.35.3 h1:s1lZbpN4uI6IxeTM2cpdtrwHcSOBML1ODNTCCfsP1pg=
+k8s.io/client-go v0.35.3/go.mod h1:RzoXkc0mzpWIDvBrRnD+VlfXP+lRzqQjCmKtiwZ8Q9c=
 k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk=
 k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE=
 k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE=

From 746930f45d556bae3047251b657f8e5e2863df42 Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Sun, 14 Jun 2026 11:49:01 +0000
Subject: [PATCH 3/6] webhook: refresh per-GPU VRAM via Node informer instead
 of polling

Replace the 10-minute time.Ticker poll with a client-go SharedInformer on
Nodes: recompute the gpu-fraction VRAM basis on node add/update/delete events,
reading from the informer cache (lister) instead of repeated List calls.
Blocks until cache sync at startup. Fraction-cap math and gpu-memory path
unchanged.

Signed-off-by: limes22 <limes22@users.noreply.github.com>
---
 cmd/webhook/vram.go | 111 ++++++++++++++++++++++++++------------------
 go.mod              |   2 +
 go.sum              |   2 +
 3 files changed, 70 insertions(+), 45 deletions(-)

diff --git a/cmd/webhook/vram.go b/cmd/webhook/vram.go
index 5488b27..2de2a62 100644
--- a/cmd/webhook/vram.go
+++ b/cmd/webhook/vram.go
@@ -9,21 +9,20 @@ import (
 	"context"
 	"log"
 	"strconv"
-	"time"
 
+	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/client-go/informers"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
+	"k8s.io/client-go/tools/cache"
 )
 
 // gpuMemoryNodeLabel is the per-GPU VRAM (MiB) advertised on GPU nodes by NVIDIA
 // GPU Feature Discovery / Node Feature Discovery.
 const gpuMemoryNodeLabel = "nvidia.com/gpu.memory"
 
-// vramAutodetectInterval is how often the basis is refreshed (GPU nodes may join
-// or change after startup).
-const vramAutodetectInterval = 10 * time.Minute
-
 // newInClusterClientset builds a clientset from the pod's service account.
 func newInClusterClientset() (kubernetes.Interface, error) {
 	cfg, err := rest.InClusterConfig()
@@ -33,69 +32,91 @@ func newInClusterClientset() (kubernetes.Interface, error) {
 	return kubernetes.NewForConfig(cfg)
 }
 
-// detectPerGpuVramMiB returns the per-GPU VRAM basis (MiB) for gpu-fraction caps,
-// read from node labels. A homogeneous cluster yields that single value; a
-// heterogeneous cluster yields the minimum across GPU nodes — the cap that holds
-// on whichever GPU a pod lands on, since the target GPU is unknown at admission —
-// and logs the spread. Returns 0 when no GPU node advertises the label.
-func detectPerGpuVramMiB(ctx context.Context, cs kubernetes.Interface) (int64, error) {
-	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
-	if err != nil {
-		return 0, err
-	}
+// gpuMemoryMiBFromNodes returns the per-GPU VRAM basis in MiB (minimum across GPU
+// nodes, the cap that holds on whichever GPU a pod lands on since the target is
+// unknown at admission) and a map of each distinct value seen to its node count (for
+// heterogeneity logging). Returns 0 when no node has a valid, positive label value.
+func gpuMemoryMiBFromNodes(nodes []*corev1.Node) (int64, map[int64]int) {
 	var minMiB int64
 	seen := map[int64]int{}
-	for i := range nodes.Items {
-		raw := nodes.Items[i].Labels[gpuMemoryNodeLabel]
+	for _, n := range nodes {
+		raw := n.Labels[gpuMemoryNodeLabel]
 		if raw == "" {
 			continue
 		}
-		n, perr := strconv.ParseInt(raw, 10, 64)
-		if perr != nil || n <= 0 {
+		v, err := strconv.ParseInt(raw, 10, 64)
+		if err != nil || v <= 0 {
 			continue
 		}
-		seen[n]++
-		if minMiB == 0 || n < minMiB {
-			minMiB = n
+		seen[v]++
+		if minMiB == 0 || v < minMiB {
+			minMiB = v
 		}
 	}
+	return minMiB, seen
+}
+
+// detectPerGpuVramMiB reads the per-GPU VRAM basis directly via the API (one List).
+// Kept for the standalone live test; the running webhook uses the informer below.
+func detectPerGpuVramMiB(ctx context.Context, cs kubernetes.Interface) (int64, error) {
+	nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
+	if err != nil {
+		return 0, err
+	}
+	ptrs := make([]*corev1.Node, 0, len(nodes.Items))
+	for i := range nodes.Items {
+		ptrs = append(ptrs, &nodes.Items[i])
+	}
+	minMiB, seen := gpuMemoryMiBFromNodes(ptrs)
 	if len(seen) > 1 {
-		log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB for gpu-fraction caps (set PER_GPU_VRAM_MIB to override)", seen, minMiB)
+		log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB (set PER_GPU_VRAM_MIB to override)", seen, minMiB)
 	}
 	return minMiB, nil
 }
 
-// startVramAutodetect sets perGpuVramMiB from node labels immediately and then
-// refreshes it periodically in the background.
+// startVramAutodetect keeps perGpuVramMiB up to date from node labels using a
+// Node informer (watch). It recomputes the basis on node add/update/delete events
+// instead of polling, reading from the informer's in-memory cache (no repeated
+// List calls). Blocks until the cache has synced, then returns.
 func startVramAutodetect(ctx context.Context, cs kubernetes.Interface) {
+	factory := informers.NewSharedInformerFactory(cs, 0) // 0 = event-driven, no periodic resync
+	nodeInformer := factory.Core().V1().Nodes()
+	lister := nodeInformer.Lister()
+
 	refresh := func() {
-		v, err := detectPerGpuVramMiB(ctx, cs)
+		nodes, err := lister.List(labels.Everything())
 		if err != nil {
-			log.Printf("per-GPU VRAM autodetect failed: %v (keeping %d MiB)", err, perGpuVramMiB.Load())
+			log.Printf("per-GPU VRAM refresh failed: %v (keeping %d MiB)", err, perGpuVramMiB.Load())
 			return
 		}
-		if v > 0 {
-			perGpuVramMiB.Store(v)
+		minMiB, seen := gpuMemoryMiBFromNodes(nodes)
+		if minMiB > 0 {
+			perGpuVramMiB.Store(minMiB)
+		}
+		if len(seen) > 1 {
+			log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB (set PER_GPU_VRAM_MIB to override)", seen, minMiB)
 		}
 	}
 
-	refresh()
+	if _, err := nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc:    func(interface{}) { refresh() },
+		UpdateFunc: func(interface{}, interface{}) { refresh() }, // labels can change
+		DeleteFunc: func(interface{}) { refresh() },
+	}); err != nil {
+		log.Printf("failed to register node informer handler: %v; set PER_GPU_VRAM_MIB to enable gpu-fraction caps", err)
+		return
+	}
+
+	factory.Start(ctx.Done())
+	if !cache.WaitForCacheSync(ctx.Done(), nodeInformer.Informer().HasSynced) {
+		log.Printf("node informer cache sync failed; gpu-fraction caps unset until PER_GPU_VRAM_MIB set or nodes sync")
+		return
+	}
+	refresh() // initial value after sync
+
 	if got := perGpuVramMiB.Load(); got > 0 {
-		log.Printf("per-GPU VRAM basis = %d MiB (autodetected from %q node labels)", got, gpuMemoryNodeLabel)
+		log.Printf("per-GPU VRAM basis = %d MiB (autodetected via node informer on %q)", got, gpuMemoryNodeLabel)
 	} else {
-		log.Printf("per-GPU VRAM not detected from node labels; gpu-fraction caps disabled until PER_GPU_VRAM_MIB is set or %q appears", gpuMemoryNodeLabel)
+		log.Printf("per-GPU VRAM not detected from node labels; gpu-fraction caps disabled until PER_GPU_VRAM_MIB set or %q appears", gpuMemoryNodeLabel)
 	}
-
-	go func() {
-		t := time.NewTicker(vramAutodetectInterval)
-		defer t.Stop()
-		for {
-			select {
-			case <-ctx.Done():
-				return
-			case <-t.C:
-				refresh()
-			}
-		}
-	}()
 }
diff --git a/go.mod b/go.mod
index 145a55a..60cd837 100644
--- a/go.mod
+++ b/go.mod
@@ -17,6 +17,7 @@ require (
 	github.com/go-openapi/jsonreference v0.20.2 // indirect
 	github.com/go-openapi/swag v0.23.0 // indirect
 	github.com/google/gnostic-models v0.7.0 // indirect
+	github.com/google/go-cmp v0.7.0 // indirect
 	github.com/google/uuid v1.6.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
@@ -24,6 +25,7 @@ require (
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
+	github.com/pmezard/go-difflib v1.0.0 // indirect
 	github.com/spf13/pflag v1.0.9 // indirect
 	github.com/x448/float16 v0.8.4 // indirect
 	go.yaml.in/yaml/v2 v2.4.3 // indirect
diff --git a/go.sum b/go.sum
index 05604fc..b54a164 100644
--- a/go.sum
+++ b/go.sum
@@ -73,6 +73,8 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu
 github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
 github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM=
 github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg=
+go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto=
+go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE=
 go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0=
 go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8=
 go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc=

From 4eb59aca475d65fd86fb75361ba9832d3a25f156 Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Mon, 15 Jun 2026 05:13:56 +0000
Subject: [PATCH 4/6] chart: one-shot defaults (memlimit4 image, GPU
 nodeSelector, nodes RBAC, PER_GPU_VRAM_MIB override)

Single 'helm install' works with no manual --set or separate kubectl apply.
Chart 0.1.0 -> 0.2.0.

Signed-off-by: limes22 <limes22@users.noreply.github.com>
---
 chart/kai-resource-isolator/Chart.yaml             |  6 +++---
 .../templates/webhook-deployment.yaml              |  4 ++++
 chart/kai-resource-isolator/values.yaml            | 14 ++++++++++----
 3 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/chart/kai-resource-isolator/Chart.yaml b/chart/kai-resource-isolator/Chart.yaml
index b2b7117..264ca43 100644
--- a/chart/kai-resource-isolator/Chart.yaml
+++ b/chart/kai-resource-isolator/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: kai-resource-isolator
-description: DaemonSet to sync HAMi libvgpu to GPU nodes and mutating webhook to inject into GPU-sharing pods.
+description: HAMi libvgpu sync DaemonSet + mutating webhook that enforces GPU VRAM soft-quota (gpu-memory & gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT) for KAI fractional-sharing pods. Self-contained one-shot install (cert-gen job + nodes-autodetect RBAC bundled).
 type: application
-version: 0.1.0
-appVersion: "0.1.0"
+version: 0.2.0
+appVersion: "v1.0.0-memlimit4"
diff --git a/chart/kai-resource-isolator/templates/webhook-deployment.yaml b/chart/kai-resource-isolator/templates/webhook-deployment.yaml
index c54c9a0..d3dfcce 100644
--- a/chart/kai-resource-isolator/templates/webhook-deployment.yaml
+++ b/chart/kai-resource-isolator/templates/webhook-deployment.yaml
@@ -47,6 +47,10 @@ spec:
               value: {{ .Values.paths.containerVgpuMount | quote }}
             - name: GPU_SHARE_RESOURCES
               value: {{ .Values.webhook.gpuShareResources | quote }}
+            {{- with .Values.image.perGpuVramMiB }}
+            - name: PER_GPU_VRAM_MIB
+              value: {{ . | quote }}
+            {{- end }}
           ports:
             - name: https
               containerPort: 8443
diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml
index 42757d4..741e1ac 100644
--- a/chart/kai-resource-isolator/values.yaml
+++ b/chart/kai-resource-isolator/values.yaml
@@ -9,9 +9,14 @@ global:
 
 image:
   registry: docker.io
-  repository: projecthami/kai-resource-isolator
-  tag: latest
+  ## Patched isolator: gpu-memory + gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT,
+  ## per-GPU VRAM autodetected from node labels via a Node informer.
+  repository: howdi2000/kai-resource-isolator
+  tag: v1.0.0-memlimit4
   pullPolicy: IfNotPresent
+  ## Optional: override per-GPU VRAM (MiB) instead of node-label autodetect.
+  ## Leave empty to autodetect from nvidia.com/gpu.memory.
+  perGpuVramMiB: ""
 
 paths:
   ## Host directory base; the chart installs libraries under {hostInstallBase}/vgpu/
@@ -20,8 +25,9 @@ paths:
   containerVgpuMount: /usr/local/vgpu
 
 librarySync:
-  ## Node labels for GPU nodes (empty = all nodes)
-  nodeSelector: {}
+  ## Node labels for GPU nodes — libvgpu is synced only to GPU nodes.
+  nodeSelector:
+    nvidia.com/gpu.present: "true"
   tolerations:
     - operator: Exists
       effect: NoSchedule

From 439cb849b91a7b147c29d3cde5ec52f03a9a1449 Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Mon, 15 Jun 2026 07:06:32 +0000
Subject: [PATCH 5/6] webhook: guard against NVIDIA_VISIBLE_DEVICES env-bypass
 (off/audit/enforce)

ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true (needed for KAI
fractional sharing) lets any unprivileged GPU-runtime pod grab GPUs via the
NVIDIA_VISIBLE_DEVICES env, bypassing allocation. The webhook now neutralizes
it (=void) on pods that use a GPU runtimeClass but are not authorized. A pod
counts as authorized if it has a KAI share annotation, the kai.scheduler/queue
label, app.kubernetes.io/managed-by=gpu-operator, or runs in a trusted
namespace. Mode is off/audit/enforce, default audit (log-only), so a wrong
allowlist can't break system pods before it's observed.

Signed-off-by: limes22 <limes22@users.noreply.github.com>
---
 .../templates/webhook-deployment.yaml         |   6 +
 chart/kai-resource-isolator/values.yaml       |   7 +
 cmd/webhook/main.go                           |  22 ++-
 cmd/webhook/nvdguard.go                       | 158 ++++++++++++++++++
 4 files changed, 191 insertions(+), 2 deletions(-)
 create mode 100644 cmd/webhook/nvdguard.go

diff --git a/chart/kai-resource-isolator/templates/webhook-deployment.yaml b/chart/kai-resource-isolator/templates/webhook-deployment.yaml
index d3dfcce..340832b 100644
--- a/chart/kai-resource-isolator/templates/webhook-deployment.yaml
+++ b/chart/kai-resource-isolator/templates/webhook-deployment.yaml
@@ -51,6 +51,12 @@ spec:
             - name: PER_GPU_VRAM_MIB
               value: {{ . | quote }}
             {{- end }}
+            - name: NVIDIA_VISIBLE_DEVICES_GUARD
+              value: {{ .Values.webhook.nvidiaVisibleDevicesGuard | quote }}
+            {{- with .Values.webhook.guardAllowedNamespaces }}
+            - name: GUARD_ALLOWED_NAMESPACES
+              value: {{ . | quote }}
+            {{- end }}
           ports:
             - name: https
               containerPort: 8443
diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml
index 741e1ac..68fd260 100644
--- a/chart/kai-resource-isolator/values.yaml
+++ b/chart/kai-resource-isolator/values.yaml
@@ -51,6 +51,13 @@ webhook:
   ## Comma-separated extended resources that trigger injection (HAMi vGPU sharing)
   gpuShareResources: "nvidia.com/gpu,nvidia.com/gpumem,nvidia.com/gpucores"
   failurePolicy: Ignore
+  ## NVIDIA_VISIBLE_DEVICES env-bypass guard: off | audit | enforce.
+  ## audit (default) logs unauthorized GPU-runtime pods without mutating — observe
+  ## logs to confirm no system pods are flagged, then switch to enforce.
+  nvidiaVisibleDevicesGuard: "audit"
+  ## Optional: override trusted namespaces (comma-separated). Empty = built-in
+  ## default (gpu-operator,kube-system,kai-scheduler,kai-resource-reservation,nvidia-network-operator).
+  guardAllowedNamespaces: ""
 
 tls:
   ## When cert-manager is enabled, set patch.enabled to false.
diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
index c14c870..389336e 100644
--- a/cmd/webhook/main.go
+++ b/cmd/webhook/main.go
@@ -73,6 +73,9 @@ func main() {
 		}
 	}
 
+	// NVIDIA_VISIBLE_DEVICES env-bypass guard (off/audit/enforce, default audit).
+	initNvdGuard()
+
 	mux := http.NewServeMux()
 	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@@ -185,13 +188,28 @@ func writeAdmission(w http.ResponseWriter, review *admissionv1.AdmissionReview,
 	}
 }
 
-func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
-	if !podNeedsInjection(pod) {
+// marshalOps returns the JSON-encoded patch, or (nil, nil) when there are no ops.
+func marshalOps(ops []map[string]interface{}) ([]byte, error) {
+	if len(ops) == 0 {
 		return nil, nil
 	}
+	return json.Marshal(ops)
+}
 
+func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
 	var ops []map[string]interface{}
 
+	// Security guard: neutralize NVIDIA_VISIBLE_DEVICES on pods that use a GPU
+	// runtimeClass but are not authorized GPU workloads (closes the env bypass
+	// enabled by ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true).
+	// Runs for every pod, independent of libvgpu injection below.
+	ops = append(ops, nvidiaVisibleDevicesGuardOps(pod)...)
+
+	// libvgpu mount + CUDA_DEVICE_MEMORY_LIMIT injection — only for KAI share pods.
+	if !podNeedsInjection(pod) {
+		return marshalOps(ops)
+	}
+
 	hasVol := false
 	for _, v := range pod.Spec.Volumes {
 		if v.Name == volumeName {
diff --git a/cmd/webhook/nvdguard.go b/cmd/webhook/nvdguard.go
new file mode 100644
index 0000000..b192ba8
--- /dev/null
+++ b/cmd/webhook/nvdguard.go
@@ -0,0 +1,158 @@
+/*
+Copyright The HAMi Authors.
+SPDX-License-Identifier: Apache-2.0
+*/
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"strings"
+
+	corev1 "k8s.io/api/core/v1"
+)
+
+const nvidiaVisibleDevicesEnv = "NVIDIA_VISIBLE_DEVICES"
+
+// guardMode controls the NVIDIA_VISIBLE_DEVICES guard: "off" | "audit" | "enforce".
+// Default "audit" (log only, no mutation) so a misconfigured allowlist can never
+// break workloads before the logs are observed. Set via NVIDIA_VISIBLE_DEVICES_GUARD.
+var guardMode = "audit"
+
+// guardAllowedNamespaces: namespaces whose GPU-runtime pods are always trusted —
+// system/infra components legitimately use runtimeClass=nvidia without the KAI
+// sharing annotations. Override via GUARD_ALLOWED_NAMESPACES (comma-separated).
+var guardAllowedNamespaces = map[string]bool{
+	"gpu-operator":             true,
+	"kube-system":              true,
+	"kai-scheduler":            true,
+	"kai-resource-reservation": true,
+	"nvidia-network-operator":  true,
+}
+
+// initNvdGuard loads guard config from env (called once from main).
+func initNvdGuard() {
+	if m := strings.ToLower(strings.TrimSpace(os.Getenv("NVIDIA_VISIBLE_DEVICES_GUARD"))); m != "" {
+		switch m {
+		case "off", "audit", "enforce":
+			guardMode = m
+		default:
+			log.Printf("invalid NVIDIA_VISIBLE_DEVICES_GUARD=%q, keeping %q", m, guardMode)
+		}
+	}
+	if ns := strings.TrimSpace(os.Getenv("GUARD_ALLOWED_NAMESPACES")); ns != "" {
+		guardAllowedNamespaces = map[string]bool{}
+		for _, n := range strings.Split(ns, ",") {
+			if n = strings.TrimSpace(n); n != "" {
+				guardAllowedNamespaces[n] = true
+			}
+		}
+	}
+	log.Printf("NVIDIA_VISIBLE_DEVICES guard: mode=%s allowedNamespaces=%v", guardMode, allowedNamespaceList())
+}
+
+func allowedNamespaceList() []string {
+	out := make([]string, 0, len(guardAllowedNamespaces))
+	for k := range guardAllowedNamespaces {
+		out = append(out, k)
+	}
+	return out
+}
+
+// isGpuRuntimeClass reports whether the runtimeClass routes through the NVIDIA
+// runtime that honors NVIDIA_VISIBLE_DEVICES (the only pods that can bypass).
+func isGpuRuntimeClass(rc string) bool {
+	switch rc {
+	case "nvidia", "nvidia-cdi", "nvidia-legacy":
+		return true
+	}
+	return false
+}
+
+// isAuthorizedGpuPod reports whether a GPU-runtime pod is a legitimate GPU user:
+// a KAI share pod, KAI-managed, a GPU-operator system component, or in a trusted ns.
+func isAuthorizedGpuPod(pod *corev1.Pod) bool {
+	if a := pod.Annotations; a != nil && (a[gpuFractionKey] != "" || a[gpuMemoryKey] != "") {
+		return true // KAI fractional-sharing pod
+	}
+	if l := pod.Labels; l != nil {
+		if _, ok := l["kai.scheduler/queue"]; ok {
+			return true // KAI-managed
+		}
+		if l["app.kubernetes.io/managed-by"] == "gpu-operator" {
+			return true // GPU operator system component (device-plugin, dcgm, GFD, ...)
+		}
+	}
+	return guardAllowedNamespaces[pod.Namespace]
+}
+
+// nvidiaVisibleDevicesGuardOps returns JSON-patch ops that neutralize
+// NVIDIA_VISIBLE_DEVICES (=void) on unauthorized GPU-runtime pods, closing the env
+// bypass enabled by ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true.
+// Returns nil when guard is off, the pod uses no GPU runtimeClass, the pod is
+// authorized, or in audit mode (logs only).
+func nvidiaVisibleDevicesGuardOps(pod *corev1.Pod) []map[string]interface{} {
+	if guardMode == "off" {
+		return nil
+	}
+	rc := ""
+	if pod.Spec.RuntimeClassName != nil {
+		rc = *pod.Spec.RuntimeClassName
+	}
+	if !isGpuRuntimeClass(rc) {
+		return nil // default runtime ignores NVIDIA_VISIBLE_DEVICES — no bypass possible
+	}
+	if isAuthorizedGpuPod(pod) {
+		return nil
+	}
+	if guardMode == "audit" {
+		log.Printf("[nvd-guard][audit] WOULD neutralize NVIDIA_VISIBLE_DEVICES: ns=%s pod=%s runtimeClass=%s", pod.Namespace, podDisplayName(pod), rc)
+		return nil
+	}
+	log.Printf("[nvd-guard][enforce] neutralizing NVIDIA_VISIBLE_DEVICES=void: ns=%s pod=%s runtimeClass=%s", pod.Namespace, podDisplayName(pod), rc)
+	var ops []map[string]interface{}
+	for i := range pod.Spec.InitContainers {
+		ops = append(ops, overrideEnvOps(&pod.Spec.InitContainers[i], "initContainers", i, nvidiaVisibleDevicesEnv, "void")...)
+	}
+	for i := range pod.Spec.Containers {
+		ops = append(ops, overrideEnvOps(&pod.Spec.Containers[i], "containers", i, nvidiaVisibleDevicesEnv, "void")...)
+	}
+	return ops
+}
+
+func podDisplayName(pod *corev1.Pod) string {
+	if pod.Name != "" {
+		return pod.Name
+	}
+	return pod.GenerateName + "<generated>"
+}
+
+// overrideEnvOps sets env name=value, replacing the first existing entry of that name
+// (including a valueFrom) so an attacker-set NVIDIA_VISIBLE_DEVICES=all is overridden,
+// and handling the empty-env-array case. NOTE(review): later duplicates are not replaced.
+func overrideEnvOps(c *corev1.Container, field string, i int, name, value string) []map[string]interface{} {
+	envVar := map[string]interface{}{"name": name, "value": value}
+	for j := range c.Env {
+		if c.Env[j].Name == name {
+			return []map[string]interface{}{{
+				"op":    "replace",
+				"path":  fmt.Sprintf("/spec/%s/%d/env/%d", field, i, j),
+				"value": envVar,
+			}}
+		}
+	}
+	if len(c.Env) == 0 {
+		return []map[string]interface{}{{
+			"op":    "add",
+			"path":  fmt.Sprintf("/spec/%s/%d/env", field, i),
+			"value": []map[string]interface{}{envVar},
+		}}
+	}
+	return []map[string]interface{}{{
+		"op":    "add",
+		"path":  fmt.Sprintf("/spec/%s/%d/env/-", field, i),
+		"value": envVar,
+	}}
+}

From 8c627e65e4e839120cbbce5d905388924b2313a8 Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Mon, 15 Jun 2026 07:28:11 +0000
Subject: [PATCH 6/6] chart: restore upstream image defaults for contribution

Revert deployment-specific defaults (personal Docker Hub image, GPU-node
nodeSelector) back to upstream values, and set appVersion to "latest" to match
the default image tag. The new feature options (perGpuVramMiB,
nvidiaVisibleDevicesGuard, nodes-autodetect RBAC) remain with neutral defaults
so the chart stays generic.

Signed-off-by: limes22 <limes22@users.noreply.github.com>
---
 chart/kai-resource-isolator/Chart.yaml  |  2 +-
 chart/kai-resource-isolator/values.yaml | 14 ++++++--------
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/chart/kai-resource-isolator/Chart.yaml b/chart/kai-resource-isolator/Chart.yaml
index 264ca43..d6cfa13 100644
--- a/chart/kai-resource-isolator/Chart.yaml
+++ b/chart/kai-resource-isolator/Chart.yaml
@@ -3,4 +3,4 @@ name: kai-resource-isolator
 description: HAMi libvgpu sync DaemonSet + mutating webhook that enforces GPU VRAM soft-quota (gpu-memory & gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT) for KAI fractional-sharing pods. Self-contained one-shot install (cert-gen job + nodes-autodetect RBAC bundled).
 type: application
 version: 0.2.0
-appVersion: "v1.0.0-memlimit4"
+appVersion: "latest"
diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml
index 68fd260..0b91d96 100644
--- a/chart/kai-resource-isolator/values.yaml
+++ b/chart/kai-resource-isolator/values.yaml
@@ -9,13 +9,11 @@ global:
 
 image:
   registry: docker.io
-  ## Patched isolator: gpu-memory + gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT,
-  ## per-GPU VRAM autodetected from node labels via a Node informer.
-  repository: howdi2000/kai-resource-isolator
-  tag: v1.0.0-memlimit4
+  repository: projecthami/kai-resource-isolator
+  tag: latest
   pullPolicy: IfNotPresent
   ## Optional: override per-GPU VRAM (MiB) instead of node-label autodetect.
-  ## Leave empty to autodetect from nvidia.com/gpu.memory.
+  ## Leave empty to autodetect from the nvidia.com/gpu.memory node label.
   perGpuVramMiB: ""
 
 paths:
@@ -25,9 +23,9 @@ paths:
   containerVgpuMount: /usr/local/vgpu
 
 librarySync:
-  ## Node labels for GPU nodes — libvgpu is synced only to GPU nodes.
-  nodeSelector:
-    nvidia.com/gpu.present: "true"
+  ## Node labels for GPU nodes (empty = all nodes; set e.g.
+  ## nvidia.com/gpu.present: "true" to sync libvgpu only to GPU nodes).
+  nodeSelector: {}
   tolerations:
     - operator: Exists
       effect: NoSchedule