From a8e7c74ca89cdf85ce8145d295754ddb29132a04 Mon Sep 17 00:00:00 2001 From: limes22 Date: Mon, 8 Jun 2026 21:51:24 +0000 Subject: [PATCH] webhook: inject CUDA_DEVICE_MEMORY_LIMIT from gpu-memory annotation KAI binder sets the gpu-memory annotation (MiB) on shared pods but never passes CUDA_DEVICE_MEMORY_LIMIT, which HAMi-core (libvgpu) reads to enforce the per-pod GPU memory cap. As a result libvgpu loads via ld.so.preload but enforces nothing (nvidia-smi shows full device memory) on KAI fractional-sharing pods. This makes the mutating webhook translate the gpu-memory annotation into CUDA_DEVICE_MEMORY_LIMIT=Nm (N being the annotation's MiB value) on every (init)container (skipping containers that already set it, and handling the empty-env case), so libvgpu enforces the requested cap. gpu-fraction carries no absolute memory value and is left untouched. Co-Authored-By: Claude Opus 4.8 (1M context) --- cmd/webhook/main.go | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index edf8ad8..05d79f3 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -28,6 +28,10 @@ const ( injectAnnotationKey = "kai-resource-isolator.io/inject" gpuFractionKey = "gpu-fraction" gpuMemoryKey = "gpu-memory" + // cudaMemLimitEnv is the HAMi-core (libvgpu) memory-limit env var. KAI sets + // the gpu-memory annotation (MiB) but does not pass this env, so libvgpu would + // not enforce any cap. We translate gpu-memory -> CUDA_DEVICE_MEMORY_LIMIT=Nm (N = annotation value, MiB). + cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT" ) func main() { @@ -225,12 +229,49 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) { } } + // Translate the KAI gpu-memory annotation (MiB) into the HAMi-core memory + // limit env so libvgpu actually enforces the per-pod cap. gpu-fraction (compute + // share) carries no absolute memory value, so we only act on gpu-memory. 
+ if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" { + limitValue := memMiB + "m" + for i := range pod.Spec.InitContainers { + ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue) + } + for i := range pod.Spec.Containers { + ops = appendMemLimitEnvOp(ops, &pod.Spec.Containers[i], "containers", i, limitValue) + } + } + if len(ops) == 0 { return nil, nil } return json.Marshal(ops) } +// appendMemLimitEnvOp appends one JSON patch "add" op that sets CUDA_DEVICE_MEMORY_LIMIT to limitValue +// on container c (index i of /spec/{field}); ops is returned unchanged when c.Env already names it, so user-set values win. +// An empty c.Env gets a new one-element env array rather than an append to env/-. NOTE(review): c.Env comes from the pod, not from ops; if an earlier op already created this env array, this add would replace it -- confirm. +func appendMemLimitEnvOp(ops []map[string]interface{}, c *corev1.Container, field string, i int, limitValue string) []map[string]interface{} { + for _, e := range c.Env { + if e.Name == cudaMemLimitEnv { + return ops + } + } + envVar := map[string]interface{}{"name": cudaMemLimitEnv, "value": limitValue} + if len(c.Env) == 0 { + return append(ops, map[string]interface{}{ + "op": "add", + "path": fmt.Sprintf("/spec/%s/%d/env", field, i), + "value": []map[string]interface{}{envVar}, + }) + } + return append(ops, map[string]interface{}{ + "op": "add", + "path": fmt.Sprintf("/spec/%s/%d/env/-", field, i), + "value": envVar, + }) +} + func podNeedsInjection(pod *corev1.Pod) bool { if pod.Annotations == nil { return false