Project-HAMi · limes22 · Jun 8, 2026 · Jun 10, 2026 · Jun 14, 2026 · Jun 15, 2026
diff --git a/chart/kai-resource-isolator/Chart.yaml b/chart/kai-resource-isolator/Chart.yaml
@@ -1,6 +1,6 @@
 apiVersion: v2
 name: kai-resource-isolator
-description: DaemonSet to sync HAMi libvgpu to GPU nodes and mutating webhook to inject into GPU-sharing pods.
+description: HAMi libvgpu sync DaemonSet + mutating webhook that enforces GPU VRAM soft-quota (gpu-memory & gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT) for KAI fractional-sharing pods. Self-contained one-shot install (cert-gen job + nodes-autodetect RBAC bundled).
 type: application
-version: 0.1.0
-appVersion: "0.1.0"
+version: 0.2.0
+appVersion: "latest"
diff --git a/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml b/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml
@@ -0,0 +1,31 @@
+{{/*
+The webhook autodetects per-GPU VRAM from node labels (nvidia.com/gpu.memory) to
+translate gpu-fraction into an absolute HAMi-core memory cap. That requires
+read access to nodes. Not needed when PER_GPU_VRAM_MIB is set explicitly, but
+harmless (read-only) to grant either way.
+*/}}
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect
+  labels:
+    {{- include "kai-resource-isolator.labels" . | nindent 4 }}
+rules:
+  - apiGroups: [""]
+    resources: ["nodes"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect
+  labels:
+    {{- include "kai-resource-isolator.labels" . | nindent 4 }}
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect
+subjects:
+  - kind: ServiceAccount
+    name: {{ include "kai-resource-isolator.webhook.fullname" . }}
+    namespace: {{ .Release.Namespace }}
diff --git a/chart/kai-resource-isolator/templates/webhook-deployment.yaml b/chart/kai-resource-isolator/templates/webhook-deployment.yaml
@@ -47,6 +47,16 @@ spec:
               value: {{ .Values.paths.containerVgpuMount | quote }}
             - name: GPU_SHARE_RESOURCES
               value: {{ .Values.webhook.gpuShareResources | quote }}
+            {{- with .Values.image.perGpuVramMiB }}
+            - name: PER_GPU_VRAM_MIB
+              value: {{ . | quote }}
+            {{- end }}
+            - name: NVIDIA_VISIBLE_DEVICES_GUARD
+              value: {{ .Values.webhook.nvidiaVisibleDevicesGuard | quote }}
+            {{- with .Values.webhook.guardAllowedNamespaces }}
+            - name: GUARD_ALLOWED_NAMESPACES
+              value: {{ . | quote }}
+            {{- end }}
           ports:
             - name: https
               containerPort: 8443

diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml
@@ -12,6 +12,9 @@ image:
   repository: projecthami/kai-resource-isolator
   tag: latest
   pullPolicy: IfNotPresent
+  ## Optional: override per-GPU VRAM (MiB) instead of node-label autodetect.
+  ## Leave empty to autodetect from the nvidia.com/gpu.memory node label.
+  perGpuVramMiB: ""
 
 paths:
   ## Host directory base; the chart installs libraries under {hostInstallBase}/vgpu/
@@ -20,7 +23,8 @@ paths:
   containerVgpuMount: /usr/local/vgpu
 
 librarySync:
-  ## Node labels for GPU nodes (empty = all nodes)
+  ## Node labels for GPU nodes (empty = all nodes; set e.g.
+  ## nvidia.com/gpu.present: "true" to sync libvgpu only to GPU nodes).
   nodeSelector: {}
   tolerations:
     - operator: Exists
@@ -45,6 +49,13 @@ webhook:
   ## Comma-separated extended resources that trigger injection (HAMi vGPU sharing)
   gpuShareResources: "nvidia.com/gpu,nvidia.com/gpumem,nvidia.com/gpucores"
   failurePolicy: Ignore
+  ## NVIDIA_VISIBLE_DEVICES env-bypass guard: off | audit | enforce.
+  ## audit (default) logs unauthorized GPU-runtime pods without mutating — observe
+  ## logs to confirm no system pods are flagged, then switch to enforce.
+  nvidiaVisibleDevicesGuard: "audit"
+  ## Optional: override trusted namespaces (comma-separated). Empty = built-in
+  ## default (gpu-operator,kube-system,kai-scheduler,kai-resource-reservation,nvidia-network-operator).
+  guardAllowedNamespaces: ""
 
 tls:
   ## When cert-manager is enabled, set patch.enabled to false.

diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
@@ -7,14 +7,17 @@ SPDX-License-Identifier: Apache-2.0
 package main
 
 import (
+	"context"
 	"encoding/json"
 	"flag"
 	"fmt"
 	"io"
 	"log"
 	"net/http"
 	"os"
+	"strconv"
 	"strings"
+	"sync/atomic"
 	"time"
 
 	admissionv1 "k8s.io/api/admission/v1"
@@ -28,15 +31,51 @@ const (
 	injectAnnotationKey = "kai-resource-isolator.io/inject"
 	gpuFractionKey      = "gpu-fraction"
 	gpuMemoryKey        = "gpu-memory"
+	// cudaMemLimitEnv is the HAMi-core (libvgpu) memory-limit env var. KAI sets
+	// the gpu-memory annotation (MiB) but does not pass this env, so libvgpu would
+	// not enforce any cap. We translate gpu-memory -> CUDA_DEVICE_MEMORY_LIMIT=<v>m.
+	cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT"
 )
 
+// perGpuVramMiB is the per-GPU VRAM (MiB) used to translate a gpu-fraction into an
+// absolute HAMi-core memory cap. It comes from the PER_GPU_VRAM_MIB env
+// (authoritative) or is autodetected from node labels and refreshed in the
+// background, so access is atomic. Zero means "unknown" — gpu-fraction caps are
+// then skipped rather than guessed (gpu-memory pods are unaffected).
+var perGpuVramMiB atomic.Int64
+
 func main() {
 	certFile := flag.String("tls-cert-file", "/etc/tls/tls.crt", "TLS certificate")
 	keyFile := flag.String("tls-private-key-file", "/etc/tls/tls.key", "TLS private key")
 	listen := flag.String("listen", defaultListen, "Listen address")
 	containerMount := flag.String("container-vgpu-mount", getenv("CONTAINER_VGPU_MOUNT", "/usr/local/vgpu"), "Mount path inside the pod for the node vgpu directory (must match DaemonSet install path and ld.so.preload)")
 	flag.Parse()
 
+	// Per-GPU VRAM basis for translating gpu-fraction into a HAMi-core memory cap.
+	// Precedence: explicit PER_GPU_VRAM_MIB env (authoritative) > autodetect from
+	// node labels. No hardcoded default — unknown basis means gpu-fraction caps are
+	// skipped (see buildJSONPatch).
+	envOverride := false
+	if v := os.Getenv("PER_GPU_VRAM_MIB"); v != "" {
+		if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 {
+			perGpuVramMiB.Store(n)
+			envOverride = true
+			log.Printf("per-GPU VRAM basis = %d MiB (from PER_GPU_VRAM_MIB)", n)
+		} else {
+			log.Printf("ignoring invalid PER_GPU_VRAM_MIB=%q", v)
+		}
+	}
+	if !envOverride {
+		if cs, err := newInClusterClientset(); err != nil {
+			log.Printf("per-GPU VRAM autodetect unavailable (%v); set PER_GPU_VRAM_MIB to enable gpu-fraction caps", err)
+		} else {
+			startVramAutodetect(context.Background(), cs)
+		}
+	}
+
+	// NVIDIA_VISIBLE_DEVICES env-bypass guard (off/audit/enforce, default audit).
+	initNvdGuard()
+
 	mux := http.NewServeMux()
 	mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) {
 		w.WriteHeader(http.StatusOK)
@@ -149,13 +188,28 @@ func writeAdmission(w http.ResponseWriter, review *admissionv1.AdmissionReview,
 	}
 }
 
-func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
-	if !podNeedsInjection(pod) {
+// marshalOps encodes ops as a JSON patch document; no ops yields (nil, nil).
+func marshalOps(ops []map[string]interface{}) ([]byte, error) {
+	if len(ops) > 0 {
+		return json.Marshal(ops)
+	}
+	return nil, nil
+}
 
+func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
 	var ops []map[string]interface{}
 
+	// Security guard: neutralize NVIDIA_VISIBLE_DEVICES on pods that use a GPU
+	// runtimeClass but are not authorized GPU workloads (closes the env bypass
+	// enabled by ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true).
+	// Runs for every pod, independent of libvgpu injection below.
+	ops = append(ops, nvidiaVisibleDevicesGuardOps(pod)...)
+
+	// libvgpu mount + CUDA_DEVICE_MEMORY_LIMIT injection — only for KAI share pods.
+	if !podNeedsInjection(pod) {
+		return marshalOps(ops)
+	}
+
 	hasVol := false
 	for _, v := range pod.Spec.Volumes {
 		if v.Name == volumeName {
@@ -225,12 +279,62 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
 		}
 	}
 
+	// Translate the KAI resource share into the HAMi-core memory-limit env so
+	// libvgpu actually enforces the per-pod VRAM cap. KAI sets the annotation but
+	// passes no such env. gpu-memory is an absolute MiB; gpu-fraction is a share we
+	// multiply by per-GPU VRAM. They are mutually exclusive; gpu-memory wins if both.
+	limitValue := ""
+	if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" {
+		limitValue = memMiB + "m"
+	} else if fracStr, ok := pod.Annotations[gpuFractionKey]; ok && fracStr != "" {
+		basis := perGpuVramMiB.Load()
+		if frac, err := strconv.ParseFloat(fracStr, 64); err != nil || frac <= 0 {
+			log.Printf("gpu-fraction %q unparseable; skipping memory-limit injection", fracStr)
+		} else if basis <= 0 {
+			log.Printf("per-GPU VRAM unknown (no PER_GPU_VRAM_MIB env and no %s node label); skipping gpu-fraction memory cap", gpuMemoryNodeLabel)
+		} else if limitMiB := int64(frac * float64(basis)); limitMiB > 0 {
+			limitValue = strconv.FormatInt(limitMiB, 10) + "m"
+		}
+	}
+	if limitValue != "" {
+		for i := range pod.Spec.InitContainers {
+			ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue)
+		}
+		for i := range pod.Spec.Containers {
+			ops = appendMemLimitEnvOp(ops, &pod.Spec.Containers[i], "containers", i, limitValue)
+		}
+	}
+
 	if len(ops) == 0 {
 		return nil, nil
 	}
 	return json.Marshal(ops)
 }
 
+// appendMemLimitEnvOp returns ops extended with a JSON patch "add" op that sets
+// the cudaMemLimitEnv variable to limitValue on container c, addressed in the
+// patch as /spec/<field>/<i>. A container that already declares the variable is
+// skipped. With no env entries the op adds the env array; otherwise it appends.
+// NOTE(review): c.Env is the pod as passed in; it does not see env entries added
+// by ops already in the slice — confirm no earlier op targets this container.
+func appendMemLimitEnvOp(ops []map[string]interface{}, c *corev1.Container, field string, i int, limitValue string) []map[string]interface{} {
+	for idx := range c.Env {
+		if c.Env[idx].Name == cudaMemLimitEnv {
+			return ops
+		}
+	}
+	entry := map[string]interface{}{"name": cudaMemLimitEnv, "value": limitValue}
+	op := map[string]interface{}{"op": "add"}
+	if len(c.Env) > 0 {
+		op["path"] = fmt.Sprintf("/spec/%s/%d/env/-", field, i)
+		op["value"] = entry
+	} else {
+		op["path"] = fmt.Sprintf("/spec/%s/%d/env", field, i)
+		op["value"] = []map[string]interface{}{entry}
+	}
+	return append(ops, op)
+}
+
 func podNeedsInjection(pod *corev1.Pod) bool {
 	if pod.Annotations == nil {
 		return false