From a8e7c74ca89cdf85ce8145d295754ddb29132a04 Mon Sep 17 00:00:00 2001
From: limes22 <limes22@users.noreply.github.com>
Date: Mon, 8 Jun 2026 21:51:24 +0000
Subject: [PATCH] webhook: inject CUDA_DEVICE_MEMORY_LIMIT from gpu-memory
 annotation

KAI binder sets the gpu-memory annotation (MiB) on shared pods but never passes
CUDA_DEVICE_MEMORY_LIMIT, which HAMi-core (libvgpu) reads to enforce the per-pod
GPU memory cap. As a result libvgpu loads via ld.so.preload but enforces nothing
(nvidia-smi shows full device memory) on KAI fractional-sharing pods.

This makes the mutating webhook translate the gpu-memory annotation into
CUDA_DEVICE_MEMORY_LIMIT=<value>m on every (init)container (skipping containers
that already set it, and handling the empty-env case), so libvgpu enforces the
requested cap. gpu-fraction carries no absolute memory value and is left untouched.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 cmd/webhook/main.go | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go
index edf8ad8..05d79f3 100644
--- a/cmd/webhook/main.go
+++ b/cmd/webhook/main.go
@@ -28,6 +28,10 @@ const (
 	injectAnnotationKey = "kai-resource-isolator.io/inject"
 	gpuFractionKey      = "gpu-fraction"
 	gpuMemoryKey        = "gpu-memory"
+	// cudaMemLimitEnv is the HAMi-core (libvgpu) memory-limit env var. KAI sets
+	// the gpu-memory annotation (MiB) but does not pass this env, so libvgpu would
+	// not enforce any cap. We translate gpu-memory -> CUDA_DEVICE_MEMORY_LIMIT=<v>m.
+	cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT"
 )
 
 func main() {
@@ -225,12 +229,49 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) {
 		}
 	}
 
+	// Translate the KAI gpu-memory annotation (MiB) into the HAMi-core memory
+	// limit env so libvgpu actually enforces the per-pod cap. gpu-fraction is a
+	// relative share with no absolute memory value, so we only act on gpu-memory.
+	if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" {
+		limitValue := memMiB + "m"
+		for i := range pod.Spec.InitContainers {
+			ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue)
+		}
+		for i := range pod.Spec.Containers {
+			ops = appendMemLimitEnvOp(ops, &pod.Spec.Containers[i], "containers", i, limitValue)
+		}
+	}
+
 	if len(ops) == 0 {
 		return nil, nil
 	}
 	return json.Marshal(ops)
 }
 
+// appendMemLimitEnvOp returns ops extended by one JSON Patch "add" op that sets
+// cudaMemLimitEnv=limitValue on container c, addressed as /spec/<field>/<i>. ops is
+// returned unchanged when c.Env already names the variable, so an existing value wins.
+func appendMemLimitEnvOp(ops []map[string]interface{}, c *corev1.Container, field string, i int, limitValue string) []map[string]interface{} {
+	for _, e := range c.Env {
+		if e.Name == cudaMemLimitEnv {
+			return ops
+		}
+	}
+	envVar := map[string]interface{}{"name": cudaMemLimitEnv, "value": limitValue}
+	if len(c.Env) == 0 { // c has no env entries: the op must create the array itself
+		return append(ops, map[string]interface{}{
+			"op":    "add",
+			"path":  fmt.Sprintf("/spec/%s/%d/env", field, i), // NOTE(review): c.Env is pre-patch state; "add" replaces an existing member, so confirm no earlier op creates env
+			"value": []map[string]interface{}{envVar},
+		})
+	}
+	return append(ops, map[string]interface{}{
+		"op":    "add",
+		"path":  fmt.Sprintf("/spec/%s/%d/env/-", field, i), // trailing "-" appends to the existing array
+		"value": envVar,
+	})
+}
+
 func podNeedsInjection(pod *corev1.Pod) bool {
 	if pod.Annotations == nil {
 		return false