From 983c3df9764c2b8bc463fc6b3728eb845af3c11f Mon Sep 17 00:00:00 2001 From: limes22 Date: Mon, 8 Jun 2026 21:51:24 +0000 Subject: [PATCH 1/6] webhook: inject CUDA_DEVICE_MEMORY_LIMIT from gpu-memory annotation KAI binder sets the gpu-memory annotation (MiB) on shared pods but never passes CUDA_DEVICE_MEMORY_LIMIT, which HAMi-core (libvgpu) reads to enforce the per-pod GPU memory cap. As a result libvgpu loads via ld.so.preload but enforces nothing (nvidia-smi shows full device memory) on KAI fractional-sharing pods. This makes the mutating webhook translate the gpu-memory annotation into CUDA_DEVICE_MEMORY_LIMIT=<MiB>m on every (init)container (skipping containers that already set it, and handling the empty-env case), so libvgpu enforces the requested cap. gpu-fraction carries no absolute memory value and is left untouched. Co-Authored-By: Claude Opus 4.8 (1M context) Signed-off-by: limes22 --- cmd/webhook/main.go | 41 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index edf8ad8..05d79f3 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -28,6 +28,10 @@ const ( injectAnnotationKey = "kai-resource-isolator.io/inject" gpuFractionKey = "gpu-fraction" gpuMemoryKey = "gpu-memory" + // cudaMemLimitEnv is the HAMi-core (libvgpu) memory-limit env var. KAI sets + // the gpu-memory annotation (MiB) but does not pass this env, so libvgpu would + // not enforce any cap. We translate gpu-memory -> CUDA_DEVICE_MEMORY_LIMIT=m. + cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT" ) func main() { @@ -225,12 +229,49 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) { } } + // Translate the KAI gpu-memory annotation (MiB) into the HAMi-core memory + // limit env so libvgpu actually enforces the per-pod cap. gpu-fraction (compute + // share) carries no absolute memory value, so we only act on gpu-memory.
+ if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" { + limitValue := memMiB + "m" + for i := range pod.Spec.InitContainers { + ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue) + } + for i := range pod.Spec.Containers { + ops = appendMemLimitEnvOp(ops, &pod.Spec.Containers[i], "containers", i, limitValue) + } + } + if len(ops) == 0 { return nil, nil } return json.Marshal(ops) } +// appendMemLimitEnvOp adds a JSON patch op setting CUDA_DEVICE_MEMORY_LIMIT on the +// container, unless it is already present (user-set values win). It handles the +// case where the container has no env array yet (add the array, not append to it). +func appendMemLimitEnvOp(ops []map[string]interface{}, c *corev1.Container, field string, i int, limitValue string) []map[string]interface{} { + for _, e := range c.Env { + if e.Name == cudaMemLimitEnv { + return ops + } + } + envVar := map[string]interface{}{"name": cudaMemLimitEnv, "value": limitValue} + if len(c.Env) == 0 { + return append(ops, map[string]interface{}{ + "op": "add", + "path": fmt.Sprintf("/spec/%s/%d/env", field, i), + "value": []map[string]interface{}{envVar}, + }) + } + return append(ops, map[string]interface{}{ + "op": "add", + "path": fmt.Sprintf("/spec/%s/%d/env/-", field, i), + "value": envVar, + }) +} + func podNeedsInjection(pod *corev1.Pod) bool { if pod.Annotations == nil { return false From d668af9cdcf91f9401c35462804e5843105a20cf Mon Sep 17 00:00:00 2001 From: limes22 Date: Wed, 10 Jun 2026 06:06:37 +0000 Subject: [PATCH 2/6] webhook: enforce gpu-fraction VRAM cap via autodetected per-GPU VRAM Builds on the gpu-memory injection: gpu-fraction pods loaded libvgpu but received no CUDA_DEVICE_MEMORY_LIMIT, so nvidia-smi showed full device memory. Translate gpu-fraction into an absolute cap = fraction x per-GPU VRAM. 
Per-GPU VRAM is autodetected from the nvidia.com/gpu.memory node label (minimum across GPU nodes for heterogeneous clusters; PER_GPU_VRAM_MIB env overrides). If undetectable, gpu-fraction caps are skipped rather than guessed. gpu-memory handling is unchanged. Adds nodes read RBAC. Signed-off-by: limes22 --- .../templates/webhook-clusterrole.yaml | 31 ++++++ cmd/webhook/main.go | 53 ++++++++- cmd/webhook/vram.go | 101 ++++++++++++++++++ cmd/webhook/vram_test.go | 50 +++++++++ go.mod | 22 +++- go.sum | 63 +++++++++++ 6 files changed, 315 insertions(+), 5 deletions(-) create mode 100644 chart/kai-resource-isolator/templates/webhook-clusterrole.yaml create mode 100644 cmd/webhook/vram.go create mode 100644 cmd/webhook/vram_test.go diff --git a/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml b/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml new file mode 100644 index 0000000..a059e24 --- /dev/null +++ b/chart/kai-resource-isolator/templates/webhook-clusterrole.yaml @@ -0,0 +1,31 @@ +{{/* +The webhook autodetects per-GPU VRAM from node labels (nvidia.com/gpu.memory) to +translate gpu-fraction into an absolute HAMi-core memory cap. That requires +read access to nodes. Not needed when PER_GPU_VRAM_MIB is set explicitly, but +harmless (read-only) to grant either way. +*/}} +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect + labels: + {{- include "kai-resource-isolator.labels" . | nindent 4 }} +rules: + - apiGroups: [""] + resources: ["nodes"] + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "kai-resource-isolator.webhook.fullname" . }}-vram-autodetect + labels: + {{- include "kai-resource-isolator.labels" . | nindent 4 }} +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "kai-resource-isolator.webhook.fullname" . 
}}-vram-autodetect +subjects: + - kind: ServiceAccount + name: {{ include "kai-resource-isolator.webhook.fullname" . }} + namespace: {{ .Release.Namespace }} diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index 05d79f3..c14c870 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -7,6 +7,7 @@ SPDX-License-Identifier: Apache-2.0 package main import ( + "context" "encoding/json" "flag" "fmt" @@ -14,7 +15,9 @@ import ( "log" "net/http" "os" + "strconv" "strings" + "sync/atomic" "time" admissionv1 "k8s.io/api/admission/v1" @@ -34,6 +37,13 @@ const ( cudaMemLimitEnv = "CUDA_DEVICE_MEMORY_LIMIT" ) +// perGpuVramMiB is the per-GPU VRAM (MiB) used to translate a gpu-fraction into an +// absolute HAMi-core memory cap. It comes from the PER_GPU_VRAM_MIB env +// (authoritative) or is autodetected from node labels and refreshed in the +// background, so access is atomic. Zero means "unknown" — gpu-fraction caps are +// then skipped rather than guessed (gpu-memory pods are unaffected). +var perGpuVramMiB atomic.Int64 + func main() { certFile := flag.String("tls-cert-file", "/etc/tls/tls.crt", "TLS certificate") keyFile := flag.String("tls-private-key-file", "/etc/tls/tls.key", "TLS private key") @@ -41,6 +51,28 @@ func main() { containerMount := flag.String("container-vgpu-mount", getenv("CONTAINER_VGPU_MOUNT", "/usr/local/vgpu"), "Mount path inside the pod for the node vgpu directory (must match DaemonSet install path and ld.so.preload)") flag.Parse() + // Per-GPU VRAM basis for translating gpu-fraction into a HAMi-core memory cap. + // Precedence: explicit PER_GPU_VRAM_MIB env (authoritative) > autodetect from + // node labels. No hardcoded default — unknown basis means gpu-fraction caps are + // skipped (see buildJSONPatch). 
+ envOverride := false + if v := os.Getenv("PER_GPU_VRAM_MIB"); v != "" { + if n, err := strconv.ParseInt(v, 10, 64); err == nil && n > 0 { + perGpuVramMiB.Store(n) + envOverride = true + log.Printf("per-GPU VRAM basis = %d MiB (from PER_GPU_VRAM_MIB)", n) + } else { + log.Printf("ignoring invalid PER_GPU_VRAM_MIB=%q", v) + } + } + if !envOverride { + if cs, err := newInClusterClientset(); err != nil { + log.Printf("per-GPU VRAM autodetect unavailable (%v); set PER_GPU_VRAM_MIB to enable gpu-fraction caps", err) + } else { + startVramAutodetect(context.Background(), cs) + } + } + mux := http.NewServeMux() mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -229,11 +261,24 @@ func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) { } } - // Translate the KAI gpu-memory annotation (MiB) into the HAMi-core memory - // limit env so libvgpu actually enforces the per-pod cap. gpu-fraction (compute - // share) carries no absolute memory value, so we only act on gpu-memory. + // Translate the KAI resource share into the HAMi-core memory-limit env so + // libvgpu actually enforces the per-pod VRAM cap. KAI sets the annotation but + // passes no such env. gpu-memory carries an absolute MiB; gpu-fraction carries + // a share that we multiply by the per-GPU VRAM. The two are mutually exclusive. 
+ limitValue := "" if memMiB, ok := pod.Annotations[gpuMemoryKey]; ok && memMiB != "" { - limitValue := memMiB + "m" + limitValue = memMiB + "m" + } else if fracStr, ok := pod.Annotations[gpuFractionKey]; ok && fracStr != "" { + basis := perGpuVramMiB.Load() + if frac, err := strconv.ParseFloat(fracStr, 64); err != nil || frac <= 0 { + log.Printf("gpu-fraction %q unparseable; skipping memory-limit injection", fracStr) + } else if basis <= 0 { + log.Printf("per-GPU VRAM unknown (no PER_GPU_VRAM_MIB env and no %s node label); skipping gpu-fraction memory cap", gpuMemoryNodeLabel) + } else if limitMiB := int64(frac * float64(basis)); limitMiB > 0 { + limitValue = strconv.FormatInt(limitMiB, 10) + "m" + } + } + if limitValue != "" { for i := range pod.Spec.InitContainers { ops = appendMemLimitEnvOp(ops, &pod.Spec.InitContainers[i], "initContainers", i, limitValue) } diff --git a/cmd/webhook/vram.go b/cmd/webhook/vram.go new file mode 100644 index 0000000..5488b27 --- /dev/null +++ b/cmd/webhook/vram.go @@ -0,0 +1,101 @@ +/* +Copyright The HAMi Authors. +SPDX-License-Identifier: Apache-2.0 +*/ + +package main + +import ( + "context" + "log" + "strconv" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" +) + +// gpuMemoryNodeLabel is the per-GPU VRAM (MiB) advertised on GPU nodes by NVIDIA +// GPU Feature Discovery / Node Feature Discovery. +const gpuMemoryNodeLabel = "nvidia.com/gpu.memory" + +// vramAutodetectInterval is how often the basis is refreshed (GPU nodes may join +// or change after startup). +const vramAutodetectInterval = 10 * time.Minute + +// newInClusterClientset builds a clientset from the pod's service account. 
+func newInClusterClientset() (kubernetes.Interface, error) { + cfg, err := rest.InClusterConfig() + if err != nil { + return nil, err + } + return kubernetes.NewForConfig(cfg) +} + +// detectPerGpuVramMiB returns the per-GPU VRAM basis (MiB) for gpu-fraction caps, +// read from node labels. A homogeneous cluster yields that single value; a +// heterogeneous cluster yields the minimum across GPU nodes — the cap that holds +// on whichever GPU a pod lands on, since the target GPU is unknown at admission — +// and logs the spread. Returns 0 when no GPU node advertises the label. +func detectPerGpuVramMiB(ctx context.Context, cs kubernetes.Interface) (int64, error) { + nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return 0, err + } + var minMiB int64 + seen := map[int64]int{} + for i := range nodes.Items { + raw := nodes.Items[i].Labels[gpuMemoryNodeLabel] + if raw == "" { + continue + } + n, perr := strconv.ParseInt(raw, 10, 64) + if perr != nil || n <= 0 { + continue + } + seen[n]++ + if minMiB == 0 || n < minMiB { + minMiB = n + } + } + if len(seen) > 1 { + log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB for gpu-fraction caps (set PER_GPU_VRAM_MIB to override)", seen, minMiB) + } + return minMiB, nil +} + +// startVramAutodetect sets perGpuVramMiB from node labels immediately and then +// refreshes it periodically in the background. 
+func startVramAutodetect(ctx context.Context, cs kubernetes.Interface) { + refresh := func() { + v, err := detectPerGpuVramMiB(ctx, cs) + if err != nil { + log.Printf("per-GPU VRAM autodetect failed: %v (keeping %d MiB)", err, perGpuVramMiB.Load()) + return + } + if v > 0 { + perGpuVramMiB.Store(v) + } + } + + refresh() + if got := perGpuVramMiB.Load(); got > 0 { + log.Printf("per-GPU VRAM basis = %d MiB (autodetected from %q node labels)", got, gpuMemoryNodeLabel) + } else { + log.Printf("per-GPU VRAM not detected from node labels; gpu-fraction caps disabled until PER_GPU_VRAM_MIB is set or %q appears", gpuMemoryNodeLabel) + } + + go func() { + t := time.NewTicker(vramAutodetectInterval) + defer t.Stop() + for { + select { + case <-ctx.Done(): + return + case <-t.C: + refresh() + } + } + }() +} diff --git a/cmd/webhook/vram_test.go b/cmd/webhook/vram_test.go new file mode 100644 index 0000000..cfd88ec --- /dev/null +++ b/cmd/webhook/vram_test.go @@ -0,0 +1,50 @@ +/* +Copyright The HAMi Authors. +SPDX-License-Identifier: Apache-2.0 +*/ + +package main + +import ( + "context" + "os" + "path/filepath" + "testing" + "time" + + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/tools/clientcmd" +) + +// TestDetectPerGpuVramMiBLive verifies per-GPU VRAM autodetection against the +// current kube context. It is skipped when no kubeconfig/cluster is reachable, so +// it is safe in CI; run it against a GPU cluster to validate label detection. 
+func TestDetectPerGpuVramMiBLive(t *testing.T) { + kubeconfig := os.Getenv("KUBECONFIG") + if kubeconfig == "" { + home, err := os.UserHomeDir() + if err != nil { + t.Skipf("no home dir: %v", err) + } + kubeconfig = filepath.Join(home, ".kube", "config") + } + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + t.Skipf("no usable kubeconfig (%s): %v", kubeconfig, err) + } + cs, err := kubernetes.NewForConfig(cfg) + if err != nil { + t.Skipf("clientset: %v", err) + } + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + + v, err := detectPerGpuVramMiB(ctx, cs) + if err != nil { + t.Skipf("cluster unreachable: %v", err) + } + t.Logf("autodetected per-GPU VRAM = %d MiB", v) + if v <= 0 { + t.Fatalf("expected a positive per-GPU VRAM basis from node labels, got %d", v) + } +} diff --git a/go.mod b/go.mod index bff0cec..145a55a 100644 --- a/go.mod +++ b/go.mod @@ -5,24 +5,44 @@ go 1.25.0 require ( k8s.io/api v0.35.3 k8s.io/apimachinery v0.35.3 + k8s.io/client-go v0.35.3 ) require ( + github.com/davecgh/go-spew v1.1.1 // indirect + github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect github.com/go-logr/logr v1.4.3 // indirect + github.com/go-openapi/jsonpointer v0.21.0 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect + github.com/go-openapi/swag v0.23.0 // indirect + github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/uuid v1.6.0 // indirect + github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect - github.com/kr/text v0.2.0 // indirect + github.com/mailru/easyjson v0.7.7 // indirect github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect + github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/spf13/pflag v1.0.9 // indirect 
github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect golang.org/x/net v0.47.0 // indirect + golang.org/x/oauth2 v0.30.0 // indirect + golang.org/x/sys v0.38.0 // indirect + golang.org/x/term v0.37.0 // indirect golang.org/x/text v0.31.0 // indirect + golang.org/x/time v0.9.0 // indirect + google.golang.org/protobuf v1.36.8 // indirect + gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect + gopkg.in/yaml.v3 v3.0.1 // indirect k8s.io/klog/v2 v2.130.1 // indirect k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 // indirect k8s.io/utils v0.0.0-20251002143259-bc988d571ff4 // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/randfill v1.0.0 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.0 // indirect + sigs.k8s.io/yaml v1.6.0 // indirect ) diff --git a/go.sum b/go.sum index 4dbbd40..05604fc 100644 --- a/go.sum +++ b/go.sum @@ -1,26 +1,59 @@ +github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1Xbatp0= +github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/emicklei/go-restful/v3 v3.12.2 h1:DhwDP0vY3k8ZzE0RunuJy8GhNpPL6zqLkDf9B/a0/xU= +github.com/emicklei/go-restful/v3 v3.12.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= github.com/go-logr/logr v1.4.3 h1:CjnDlHq8ikf6E492q6eKboGOC0T8CDaOvkHCIg8idEI= github.com/go-logr/logr 
v1.4.3/go.mod h1:9T104GzyrTigFIr8wt5mBrctHMim0Nb2HLGrmQ40KvY= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= +github.com/go-openapi/jsonpointer v0.21.0 h1:YgdVicSA9vH5RiHs9TZW5oyafXZFc6+2Vc1rr/O9oNQ= +github.com/go-openapi/jsonpointer v0.21.0/go.mod h1:IUyH9l/+uyhIYQ/PXVA41Rexl+kOkAPDdXEYns6fzUY= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= +github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= +github.com/go-openapi/swag v0.23.0 h1:vsEVJDUo2hPJ2tu0/Xc+4noaxyEffXNIs3cOULZ+GrE= +github.com/go-openapi/swag v0.23.0/go.mod h1:esZ8ITTYEsH1V2trKHjAN8Ai7xHb8RV+YSZ577vPjgQ= +github.com/go-task/slim-sprig/v3 v3.0.0 h1:sUs3vkvUymDpBKi3qH1YSqBQk9+9D/8M2mN1vB6EwHI= +github.com/go-task/slim-sprig/v3 v3.0.0/go.mod h1:W848ghGpv3Qj3dhTPRyJypKRiqCdHZiAzKg9hl15HA8= +github.com/google/gnostic-models v0.7.0 h1:qwTtogB15McXDaNqTZdzPJRHvaVJlAl+HVQnLmJEJxo= +github.com/google/gnostic-models v0.7.0/go.mod h1:whL5G0m6dmc5cPxKc5bdKdEN3UjI7OUGxBlw57miDrQ= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/gofuzz v1.0.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6 h1:BHT72Gu3keYf3ZEu2J0b1vyeLSOYI8bm5wbJM/8yDe8= +github.com/google/pprof v0.0.0-20250403155104-27863c87afa6/go.mod h1:boTsfXsheKC2y+lKOCMpSfarhxDeIzfZG1jqGcPl3cA= +github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= +github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= +github.com/josharian/intern v1.0.0/go.mod h1:5DoeVV0s6jJacbCEi61lwdGj/aVlrQvzHFFd8Hwg//Y= 
github.com/json-iterator/go v1.1.12 h1:PV8peI4a0ysnczrg+LtxykD8LfKY9ML6u2jnxaEnrnM= github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHmT4TnhNGBo= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= +github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= +github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg= github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q= github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee h1:W5t00kpgFdJifH4BDsTlE89Zl93FEloxaWZfGcifgq8= github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjYzDa0/r8luk= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= +github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= +github.com/onsi/ginkgo/v2 v2.27.2 h1:LzwLj0b89qtIy6SSASkzlNvX6WktqurSHwkk2ipF/Ns= +github.com/onsi/ginkgo/v2 v2.27.2/go.mod 
h1:ArE1D/XhNXBXCBkKOLkbsb2c81dQHCRcF5zwn/ykDRo= +github.com/onsi/gomega v1.38.2 h1:eZCjf2xjZAqe+LeWvKb5weQ+NcPwX84kqJ0cZNxok2A= +github.com/onsi/gomega v1.38.2/go.mod h1:W2MJcYxRGV63b418Ai34Ud0hEdTVXq9NW9+Sx6uXf3k= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= @@ -28,28 +61,58 @@ github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7 github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= +github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= +github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= +github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY= +github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= +github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod 
h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= +go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= +golang.org/x/mod v0.29.0 h1:HV8lRxZC4l2cr3Zq1LvtOsi/ThTgWnUk/y64QSs8GwA= +golang.org/x/mod v0.29.0/go.mod h1:NyhrlYXJ2H4eJiRy/WDBO6HMqZQ6q9nk4JzS3NuCK+w= golang.org/x/net v0.47.0 h1:Mx+4dIFzqraBXUugkia1OOvlD6LemFo1ALMHjrXDOhY= golang.org/x/net v0.47.0/go.mod h1:/jNxtkgq5yWUGYkaZGqo27cfGZ1c5Nen03aYrrKpVRU= +golang.org/x/oauth2 v0.30.0 h1:dnDm7JmhM45NNpd8FDDeLhK6FwqbOf4MLCM9zb1BOHI= +golang.org/x/oauth2 v0.30.0/go.mod h1:B++QgG3ZKulg6sRPGD/mqlHQs5rB3Ml9erfeDY7xKlU= +golang.org/x/sync v0.18.0 h1:kr88TuHDroi+UVf+0hZnirlk8o8T+4MrK6mr60WkH/I= +golang.org/x/sync v0.18.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sys v0.38.0 h1:3yZWxaJjBmCWXqhN1qh02AkOnCQ1poK6oF+a7xWL6Gc= +golang.org/x/sys v0.38.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/term v0.37.0 h1:8EGAD0qCmHYZg6J17DvsMy9/wJ7/D/4pV/wfnld5lTU= +golang.org/x/term v0.37.0/go.mod h1:5pB4lxRNYYVZuTLmy8oR2BH8dflOR+IbTYFD8fi3254= golang.org/x/text v0.31.0 h1:aC8ghyu4JhP8VojJ2lEHBnochRno1sgL6nEi9WGFGMM= golang.org/x/text v0.31.0/go.mod h1:tKRAlv61yKIjGGHX/4tP1LTbc13YSec1pxVEWXzfoeM= +golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= +golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/tools v0.38.0 h1:Hx2Xv8hISq8Lm16jvBZ2VQf+RLmbd7wVUsALibYI/IQ= +golang.org/x/tools v0.38.0/go.mod h1:yEsQ/d/YK8cjh0L6rZlY8tgtlKiBNTL14pGDJPJpYQs= +google.golang.org/protobuf v1.36.8 h1:xHScyCOEuuwZEc6UtSOvPbAT4zRh0xcNRYekJwfqyMc= +google.golang.org/protobuf v1.36.8/go.mod h1:fuxRtAxBytpl4zzqUh6/eyUujkJdNiuEkXntxiD/uRU= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= 
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= +gopkg.in/evanphx/json-patch.v4 v4.13.0 h1:czT3CmqEaQ1aanPc5SdlgQrrEIb8w/wwCvWWnfEbYzo= +gopkg.in/evanphx/json-patch.v4 v4.13.0/go.mod h1:p8EYWUEYMpynmqDbY58zCKCFZw8pRWMG4EsWvDvM72M= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= +gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= k8s.io/api v0.35.3 h1:pA2fiBc6+N9PDf7SAiluKGEBuScsTzd2uYBkA5RzNWQ= k8s.io/api v0.35.3/go.mod h1:9Y9tkBcFwKNq2sxwZTQh1Njh9qHl81D0As56tu42GA4= k8s.io/apimachinery v0.35.3 h1:MeaUwQCV3tjKP4bcwWGgZ/cp/vpsRnQzqO6J6tJyoF8= k8s.io/apimachinery v0.35.3/go.mod h1:jQCgFZFR1F4Ik7hvr2g84RTJSZegBc8yHgFWKn//hns= +k8s.io/client-go v0.35.3 h1:s1lZbpN4uI6IxeTM2cpdtrwHcSOBML1ODNTCCfsP1pg= +k8s.io/client-go v0.35.3/go.mod h1:RzoXkc0mzpWIDvBrRnD+VlfXP+lRzqQjCmKtiwZ8Q9c= k8s.io/klog/v2 v2.130.1 h1:n9Xl7H1Xvksem4KFG4PYbdQCQxqc/tTUyrgXaOhHSzk= k8s.io/klog/v2 v2.130.1/go.mod h1:3Jpz1GvMt720eyJH1ckRHK1EDfpxISzJ7I9OYgaDtPE= k8s.io/kube-openapi v0.0.0-20250910181357-589584f1c912 h1:Y3gxNAuB0OBLImH611+UDZcmKS3g6CthxToOb37KgwE= From 746930f45d556bae3047251b657f8e5e2863df42 Mon Sep 17 00:00:00 2001 From: limes22 Date: Sun, 14 Jun 2026 11:49:01 +0000 Subject: [PATCH 3/6] webhook: refresh per-GPU VRAM via Node informer instead of polling Replace the 10-minute time.Ticker poll with a client-go SharedInformer on Nodes: recompute the gpu-fraction VRAM basis on node add/update/delete events, reading from the informer cache (lister) instead of repeated List calls. Blocks until cache sync at startup. Fraction-cap math and gpu-memory path unchanged. 
Signed-off-by: limes22 --- cmd/webhook/vram.go | 111 ++++++++++++++++++++++++++------------------ go.mod | 2 + go.sum | 2 + 3 files changed, 70 insertions(+), 45 deletions(-) diff --git a/cmd/webhook/vram.go b/cmd/webhook/vram.go index 5488b27..2de2a62 100644 --- a/cmd/webhook/vram.go +++ b/cmd/webhook/vram.go @@ -9,21 +9,20 @@ import ( "context" "log" "strconv" - "time" + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" ) // gpuMemoryNodeLabel is the per-GPU VRAM (MiB) advertised on GPU nodes by NVIDIA // GPU Feature Discovery / Node Feature Discovery. const gpuMemoryNodeLabel = "nvidia.com/gpu.memory" -// vramAutodetectInterval is how often the basis is refreshed (GPU nodes may join -// or change after startup). -const vramAutodetectInterval = 10 * time.Minute - // newInClusterClientset builds a clientset from the pod's service account. func newInClusterClientset() (kubernetes.Interface, error) { cfg, err := rest.InClusterConfig() @@ -33,69 +32,91 @@ func newInClusterClientset() (kubernetes.Interface, error) { return kubernetes.NewForConfig(cfg) } -// detectPerGpuVramMiB returns the per-GPU VRAM basis (MiB) for gpu-fraction caps, -// read from node labels. A homogeneous cluster yields that single value; a -// heterogeneous cluster yields the minimum across GPU nodes — the cap that holds -// on whichever GPU a pod lands on, since the target GPU is unknown at admission — -// and logs the spread. Returns 0 when no GPU node advertises the label. 
-func detectPerGpuVramMiB(ctx context.Context, cs kubernetes.Interface) (int64, error) { - nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) - if err != nil { - return 0, err - } +// gpuMemoryMiBFromNodes returns the per-GPU VRAM basis (minimum across GPU nodes, +// the cap that holds on whichever GPU a pod lands on since the target is unknown at +// admission) and the set of distinct values seen (for heterogeneity logging). +// Returns 0 when no node advertises the label. +func gpuMemoryMiBFromNodes(nodes []*corev1.Node) (int64, map[int64]int) { var minMiB int64 seen := map[int64]int{} - for i := range nodes.Items { - raw := nodes.Items[i].Labels[gpuMemoryNodeLabel] + for _, n := range nodes { + raw := n.Labels[gpuMemoryNodeLabel] if raw == "" { continue } - n, perr := strconv.ParseInt(raw, 10, 64) - if perr != nil || n <= 0 { + v, err := strconv.ParseInt(raw, 10, 64) + if err != nil || v <= 0 { continue } - seen[n]++ - if minMiB == 0 || n < minMiB { - minMiB = n + seen[v]++ + if minMiB == 0 || v < minMiB { + minMiB = v } } + return minMiB, seen +} + +// detectPerGpuVramMiB reads the per-GPU VRAM basis directly via the API (one List). +// Kept for the standalone live test; the running webhook uses the informer below. 
+func detectPerGpuVramMiB(ctx context.Context, cs kubernetes.Interface) (int64, error) { + nodes, err := cs.CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return 0, err + } + ptrs := make([]*corev1.Node, 0, len(nodes.Items)) + for i := range nodes.Items { + ptrs = append(ptrs, &nodes.Items[i]) + } + minMiB, seen := gpuMemoryMiBFromNodes(ptrs) if len(seen) > 1 { - log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB for gpu-fraction caps (set PER_GPU_VRAM_MIB to override)", seen, minMiB) + log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB (set PER_GPU_VRAM_MIB to override)", seen, minMiB) } return minMiB, nil } -// startVramAutodetect sets perGpuVramMiB from node labels immediately and then -// refreshes it periodically in the background. +// startVramAutodetect keeps perGpuVramMiB up to date from node labels using a +// Node informer (watch). It recomputes the basis on node add/update/delete events +// instead of polling, reading from the informer's in-memory cache (no repeated +// List calls). Blocks until the cache has synced, then returns. 
func startVramAutodetect(ctx context.Context, cs kubernetes.Interface) { + factory := informers.NewSharedInformerFactory(cs, 0) // 0 = event-driven, no periodic resync + nodeInformer := factory.Core().V1().Nodes() + lister := nodeInformer.Lister() + refresh := func() { - v, err := detectPerGpuVramMiB(ctx, cs) + nodes, err := lister.List(labels.Everything()) if err != nil { - log.Printf("per-GPU VRAM autodetect failed: %v (keeping %d MiB)", err, perGpuVramMiB.Load()) + log.Printf("per-GPU VRAM refresh failed: %v (keeping %d MiB)", err, perGpuVramMiB.Load()) return } - if v > 0 { - perGpuVramMiB.Store(v) + minMiB, seen := gpuMemoryMiBFromNodes(nodes) + if minMiB > 0 { + perGpuVramMiB.Store(minMiB) + } + if len(seen) > 1 { + log.Printf("heterogeneous per-GPU VRAM across nodes %v MiB; using minimum %d MiB (set PER_GPU_VRAM_MIB to override)", seen, minMiB) } } - refresh() + if _, err := nodeInformer.Informer().AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(interface{}) { refresh() }, + UpdateFunc: func(interface{}, interface{}) { refresh() }, // labels can change + DeleteFunc: func(interface{}) { refresh() }, + }); err != nil { + log.Printf("failed to register node informer handler: %v; set PER_GPU_VRAM_MIB to enable gpu-fraction caps", err) + return + } + + factory.Start(ctx.Done()) + if !cache.WaitForCacheSync(ctx.Done(), nodeInformer.Informer().HasSynced) { + log.Printf("node informer cache sync failed; gpu-fraction caps unset until PER_GPU_VRAM_MIB set or nodes sync") + return + } + refresh() // initial value after sync + if got := perGpuVramMiB.Load(); got > 0 { - log.Printf("per-GPU VRAM basis = %d MiB (autodetected from %q node labels)", got, gpuMemoryNodeLabel) + log.Printf("per-GPU VRAM basis = %d MiB (autodetected via node informer on %q)", got, gpuMemoryNodeLabel) } else { - log.Printf("per-GPU VRAM not detected from node labels; gpu-fraction caps disabled until PER_GPU_VRAM_MIB is set or %q appears", gpuMemoryNodeLabel) + 
log.Printf("per-GPU VRAM not detected from node labels; gpu-fraction caps disabled until PER_GPU_VRAM_MIB set or %q appears", gpuMemoryNodeLabel) } - - go func() { - t := time.NewTicker(vramAutodetectInterval) - defer t.Stop() - for { - select { - case <-ctx.Done(): - return - case <-t.C: - refresh() - } - } - }() } diff --git a/go.mod b/go.mod index 145a55a..60cd837 100644 --- a/go.mod +++ b/go.mod @@ -17,6 +17,7 @@ require ( github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect github.com/google/gnostic-models v0.7.0 // indirect + github.com/google/go-cmp v0.7.0 // indirect github.com/google/uuid v1.6.0 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect @@ -24,6 +25,7 @@ require ( github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect github.com/modern-go/reflect2 v1.0.3-0.20250322232337-35a7c28c31ee // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect + github.com/pmezard/go-difflib v1.0.0 // indirect github.com/spf13/pflag v1.0.9 // indirect github.com/x448/float16 v0.8.4 // indirect go.yaml.in/yaml/v2 v2.4.3 // indirect diff --git a/go.sum b/go.sum index 05604fc..b54a164 100644 --- a/go.sum +++ b/go.sum @@ -73,6 +73,8 @@ github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.yaml.in/yaml/v2 v2.4.3 h1:6gvOSjQoTB3vt1l+CU+tSyi/HOjfOjRLJ4YwYZGwRO0= go.yaml.in/yaml/v2 v2.4.3/go.mod h1:zSxWcmIDjOzPXpjlTTbAsKokqkDNAVtZO0WOMiT90s8= go.yaml.in/yaml/v3 v3.0.4 
h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= From 4eb59aca475d65fd86fb75361ba9832d3a25f156 Mon Sep 17 00:00:00 2001 From: limes22 Date: Mon, 15 Jun 2026 05:13:56 +0000 Subject: [PATCH 4/6] chart: one-shot defaults (memlimit4 image, GPU nodeSelector, nodes RBAC, PER_GPU_VRAM_MIB override) Single 'helm install' works with no manual --set or separate kubectl apply. Chart 0.1.0 -> 0.2.0. Signed-off-by: limes22 --- chart/kai-resource-isolator/Chart.yaml | 6 +++--- .../templates/webhook-deployment.yaml | 4 ++++ chart/kai-resource-isolator/values.yaml | 14 ++++++++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/chart/kai-resource-isolator/Chart.yaml b/chart/kai-resource-isolator/Chart.yaml index b2b7117..264ca43 100644 --- a/chart/kai-resource-isolator/Chart.yaml +++ b/chart/kai-resource-isolator/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: kai-resource-isolator -description: DaemonSet to sync HAMi libvgpu to GPU nodes and mutating webhook to inject into GPU-sharing pods. +description: HAMi libvgpu sync DaemonSet + mutating webhook that enforces GPU VRAM soft-quota (gpu-memory & gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT) for KAI fractional-sharing pods. Self-contained one-shot install (cert-gen job + nodes-autodetect RBAC bundled). type: application -version: 0.1.0 -appVersion: "0.1.0" +version: 0.2.0 +appVersion: "v1.0.0-memlimit4" diff --git a/chart/kai-resource-isolator/templates/webhook-deployment.yaml b/chart/kai-resource-isolator/templates/webhook-deployment.yaml index c54c9a0..d3dfcce 100644 --- a/chart/kai-resource-isolator/templates/webhook-deployment.yaml +++ b/chart/kai-resource-isolator/templates/webhook-deployment.yaml @@ -47,6 +47,10 @@ spec: value: {{ .Values.paths.containerVgpuMount | quote }} - name: GPU_SHARE_RESOURCES value: {{ .Values.webhook.gpuShareResources | quote }} + {{- with .Values.image.perGpuVramMiB }} + - name: PER_GPU_VRAM_MIB + value: {{ . 
| quote }} + {{- end }} ports: - name: https containerPort: 8443 diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml index 42757d4..741e1ac 100644 --- a/chart/kai-resource-isolator/values.yaml +++ b/chart/kai-resource-isolator/values.yaml @@ -9,9 +9,14 @@ global: image: registry: docker.io - repository: projecthami/kai-resource-isolator - tag: latest + ## Patched isolator: gpu-memory + gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT, + ## per-GPU VRAM autodetected from node labels via a Node informer. + repository: howdi2000/kai-resource-isolator + tag: v1.0.0-memlimit4 pullPolicy: IfNotPresent + ## Optional: override per-GPU VRAM (MiB) instead of node-label autodetect. + ## Leave empty to autodetect from nvidia.com/gpu.memory. + perGpuVramMiB: "" paths: ## Host directory base; the chart installs libraries under {hostInstallBase}/vgpu/ @@ -20,8 +25,9 @@ paths: containerVgpuMount: /usr/local/vgpu librarySync: - ## Node labels for GPU nodes (empty = all nodes) - nodeSelector: {} + ## Node labels for GPU nodes — libvgpu is synced only to GPU nodes. + nodeSelector: + nvidia.com/gpu.present: "true" tolerations: - operator: Exists effect: NoSchedule From 439cb849b91a7b147c29d3cde5ec52f03a9a1449 Mon Sep 17 00:00:00 2001 From: limes22 Date: Mon, 15 Jun 2026 07:06:32 +0000 Subject: [PATCH 5/6] webhook: guard against NVIDIA_VISIBLE_DEVICES env-bypass (off/audit/enforce) ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true (needed for KAI fractional sharing) lets any unprivileged GPU-runtime pod grab GPUs via the NVIDIA_VISIBLE_DEVICES env, bypassing allocation. The webhook now neutralizes it (=void) on pods that use a GPU runtimeClass but are not authorized (KAI share annotation, kai.scheduler/queue label, managed-by=gpu-operator, or a trusted namespace). Mode off/audit/enforce, default audit (log-only) so a wrong allowlist can't break system pods before it's observed. 
Signed-off-by: limes22 --- .../templates/webhook-deployment.yaml | 6 + chart/kai-resource-isolator/values.yaml | 7 + cmd/webhook/main.go | 22 ++- cmd/webhook/nvdguard.go | 158 ++++++++++++++++++ 4 files changed, 191 insertions(+), 2 deletions(-) create mode 100644 cmd/webhook/nvdguard.go diff --git a/chart/kai-resource-isolator/templates/webhook-deployment.yaml b/chart/kai-resource-isolator/templates/webhook-deployment.yaml index d3dfcce..340832b 100644 --- a/chart/kai-resource-isolator/templates/webhook-deployment.yaml +++ b/chart/kai-resource-isolator/templates/webhook-deployment.yaml @@ -51,6 +51,12 @@ spec: - name: PER_GPU_VRAM_MIB value: {{ . | quote }} {{- end }} + - name: NVIDIA_VISIBLE_DEVICES_GUARD + value: {{ .Values.webhook.nvidiaVisibleDevicesGuard | quote }} + {{- with .Values.webhook.guardAllowedNamespaces }} + - name: GUARD_ALLOWED_NAMESPACES + value: {{ . | quote }} + {{- end }} ports: - name: https containerPort: 8443 diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml index 741e1ac..68fd260 100644 --- a/chart/kai-resource-isolator/values.yaml +++ b/chart/kai-resource-isolator/values.yaml @@ -51,6 +51,13 @@ webhook: ## Comma-separated extended resources that trigger injection (HAMi vGPU sharing) gpuShareResources: "nvidia.com/gpu,nvidia.com/gpumem,nvidia.com/gpucores" failurePolicy: Ignore + ## NVIDIA_VISIBLE_DEVICES env-bypass guard: off | audit | enforce. + ## audit (default) logs unauthorized GPU-runtime pods without mutating — observe + ## logs to confirm no system pods are flagged, then switch to enforce. + nvidiaVisibleDevicesGuard: "audit" + ## Optional: override trusted namespaces (comma-separated). Empty = built-in + ## default (gpu-operator,kube-system,kai-scheduler,kai-resource-reservation,nvidia-network-operator). + guardAllowedNamespaces: "" tls: ## When cert-manager is enabled, set patch.enabled to false. 
diff --git a/cmd/webhook/main.go b/cmd/webhook/main.go index c14c870..389336e 100644 --- a/cmd/webhook/main.go +++ b/cmd/webhook/main.go @@ -73,6 +73,9 @@ func main() { } } + // NVIDIA_VISIBLE_DEVICES env-bypass guard (off/audit/enforce, default audit). + initNvdGuard() + mux := http.NewServeMux() mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusOK) @@ -185,13 +188,28 @@ func writeAdmission(w http.ResponseWriter, review *admissionv1.AdmissionReview, } } -func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) { - if !podNeedsInjection(pod) { +// marshalOps returns the JSON-encoded patch, or (nil, nil) when there are no ops. +func marshalOps(ops []map[string]interface{}) ([]byte, error) { + if len(ops) == 0 { return nil, nil } + return json.Marshal(ops) +} +func buildJSONPatch(pod *corev1.Pod, containerMount string) ([]byte, error) { var ops []map[string]interface{} + // Security guard: neutralize NVIDIA_VISIBLE_DEVICES on pods that use a GPU + // runtimeClass but are not authorized GPU workloads (closes the env bypass + // enabled by ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true). + // Runs for every pod, independent of libvgpu injection below. + ops = append(ops, nvidiaVisibleDevicesGuardOps(pod)...) + + // libvgpu mount + CUDA_DEVICE_MEMORY_LIMIT injection — only for KAI share pods. + if !podNeedsInjection(pod) { + return marshalOps(ops) + } + hasVol := false for _, v := range pod.Spec.Volumes { if v.Name == volumeName { diff --git a/cmd/webhook/nvdguard.go b/cmd/webhook/nvdguard.go new file mode 100644 index 0000000..b192ba8 --- /dev/null +++ b/cmd/webhook/nvdguard.go @@ -0,0 +1,158 @@ +/* +Copyright The HAMi Authors. 
+SPDX-License-Identifier: Apache-2.0 +*/ + +package main + +import ( + "fmt" + "log" + "os" + "strings" + + corev1 "k8s.io/api/core/v1" +) + +const nvidiaVisibleDevicesEnv = "NVIDIA_VISIBLE_DEVICES" + +// guardMode controls the NVIDIA_VISIBLE_DEVICES guard: "off" | "audit" | "enforce". +// Default "audit" (log only, no mutation) so a misconfigured allowlist can never +// break workloads before the logs are observed. Set via NVIDIA_VISIBLE_DEVICES_GUARD. +var guardMode = "audit" + +// guardAllowedNamespaces: namespaces whose GPU-runtime pods are always trusted — +// system/infra components legitimately use runtimeClass=nvidia without the KAI +// sharing annotations. Override via GUARD_ALLOWED_NAMESPACES (comma-separated). +var guardAllowedNamespaces = map[string]bool{ + "gpu-operator": true, + "kube-system": true, + "kai-scheduler": true, + "kai-resource-reservation": true, + "nvidia-network-operator": true, +} + +// initNvdGuard loads guard config from env (called once from main). +func initNvdGuard() { + if m := strings.ToLower(strings.TrimSpace(os.Getenv("NVIDIA_VISIBLE_DEVICES_GUARD"))); m != "" { + switch m { + case "off", "audit", "enforce": + guardMode = m + default: + log.Printf("invalid NVIDIA_VISIBLE_DEVICES_GUARD=%q, keeping %q", m, guardMode) + } + } + if ns := strings.TrimSpace(os.Getenv("GUARD_ALLOWED_NAMESPACES")); ns != "" { + guardAllowedNamespaces = map[string]bool{} + for _, n := range strings.Split(ns, ",") { + if n = strings.TrimSpace(n); n != "" { + guardAllowedNamespaces[n] = true + } + } + } + log.Printf("NVIDIA_VISIBLE_DEVICES guard: mode=%s allowedNamespaces=%v", guardMode, allowedNamespaceList()) +} + +func allowedNamespaceList() []string { + out := make([]string, 0, len(guardAllowedNamespaces)) + for k := range guardAllowedNamespaces { + out = append(out, k) + } + return out +} + +// isGpuRuntimeClass reports whether the runtimeClass routes through the NVIDIA +// runtime that honors NVIDIA_VISIBLE_DEVICES (the only pods that can bypass). 
+func isGpuRuntimeClass(rc string) bool { + switch rc { + case "nvidia", "nvidia-cdi", "nvidia-legacy": + return true + } + return false +} + +// isAuthorizedGpuPod reports whether a GPU-runtime pod is a legitimate GPU user: +// a KAI share pod, KAI-managed, a GPU-operator system component, or in a trusted ns. +func isAuthorizedGpuPod(pod *corev1.Pod) bool { + if a := pod.Annotations; a != nil && (a[gpuFractionKey] != "" || a[gpuMemoryKey] != "") { + return true // KAI fractional-sharing pod + } + if l := pod.Labels; l != nil { + if _, ok := l["kai.scheduler/queue"]; ok { + return true // KAI-managed + } + if l["app.kubernetes.io/managed-by"] == "gpu-operator" { + return true // GPU operator system component (device-plugin, dcgm, GFD, ...) + } + } + return guardAllowedNamespaces[pod.Namespace] +} + +// nvidiaVisibleDevicesGuardOps returns JSON-patch ops that neutralize +// NVIDIA_VISIBLE_DEVICES (=void) on unauthorized GPU-runtime pods, closing the env +// bypass enabled by ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED=true. +// Returns nil when guard is off, the pod uses no GPU runtimeClass, the pod is +// authorized, or in audit mode (logs only). 
+func nvidiaVisibleDevicesGuardOps(pod *corev1.Pod) []map[string]interface{} { + if guardMode == "off" { + return nil + } + rc := "" + if pod.Spec.RuntimeClassName != nil { + rc = *pod.Spec.RuntimeClassName + } + if !isGpuRuntimeClass(rc) { + return nil // default runtime ignores NVIDIA_VISIBLE_DEVICES — no bypass possible + } + if isAuthorizedGpuPod(pod) { + return nil + } + if guardMode == "audit" { + log.Printf("[nvd-guard][audit] WOULD neutralize NVIDIA_VISIBLE_DEVICES: ns=%s pod=%s runtimeClass=%s", pod.Namespace, podDisplayName(pod), rc) + return nil + } + log.Printf("[nvd-guard][enforce] neutralizing NVIDIA_VISIBLE_DEVICES=void: ns=%s pod=%s runtimeClass=%s", pod.Namespace, podDisplayName(pod), rc) + var ops []map[string]interface{} + for i := range pod.Spec.InitContainers { + ops = append(ops, overrideEnvOps(&pod.Spec.InitContainers[i], "initContainers", i, nvidiaVisibleDevicesEnv, "void")...) + } + for i := range pod.Spec.Containers { + ops = append(ops, overrideEnvOps(&pod.Spec.Containers[i], "containers", i, nvidiaVisibleDevicesEnv, "void")...) + } + return ops +} + +func podDisplayName(pod *corev1.Pod) string { + if pod.Name != "" { + return pod.Name + } + return pod.GenerateName + "" +} + +// overrideEnvOps sets env name=value, replacing any existing entry (including a +// valueFrom) so an attacker-set NVIDIA_VISIBLE_DEVICES=all is overridden, and +// handling the empty-env-array case. 
+func overrideEnvOps(c *corev1.Container, field string, i int, name, value string) []map[string]interface{} { + envVar := map[string]interface{}{"name": name, "value": value} + for j := range c.Env { + if c.Env[j].Name == name { + return []map[string]interface{}{{ + "op": "replace", + "path": fmt.Sprintf("/spec/%s/%d/env/%d", field, i, j), + "value": envVar, + }} + } + } + if len(c.Env) == 0 { + return []map[string]interface{}{{ + "op": "add", + "path": fmt.Sprintf("/spec/%s/%d/env", field, i), + "value": []map[string]interface{}{envVar}, + }} + } + return []map[string]interface{}{{ + "op": "add", + "path": fmt.Sprintf("/spec/%s/%d/env/-", field, i), + "value": envVar, + }} +} From 8c627e65e4e839120cbbce5d905388924b2313a8 Mon Sep 17 00:00:00 2001 From: limes22 Date: Mon, 15 Jun 2026 07:28:11 +0000 Subject: [PATCH 6/6] chart: restore upstream image defaults for contribution Revert deployment-specific defaults (personal Docker Hub image, GPU-node nodeSelector, appVersion) back to upstream values. The new feature options (perGpuVramMiB, nvidiaVisibleDevicesGuard, nodes-autodetect RBAC) remain with neutral defaults so the chart stays generic. Signed-off-by: limes22 --- chart/kai-resource-isolator/Chart.yaml | 2 +- chart/kai-resource-isolator/values.yaml | 14 ++++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/chart/kai-resource-isolator/Chart.yaml b/chart/kai-resource-isolator/Chart.yaml index 264ca43..d6cfa13 100644 --- a/chart/kai-resource-isolator/Chart.yaml +++ b/chart/kai-resource-isolator/Chart.yaml @@ -3,4 +3,4 @@ name: kai-resource-isolator description: HAMi libvgpu sync DaemonSet + mutating webhook that enforces GPU VRAM soft-quota (gpu-memory & gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT) for KAI fractional-sharing pods. Self-contained one-shot install (cert-gen job + nodes-autodetect RBAC bundled). 
type: application version: 0.2.0 -appVersion: "v1.0.0-memlimit4" +appVersion: "latest" diff --git a/chart/kai-resource-isolator/values.yaml b/chart/kai-resource-isolator/values.yaml index 68fd260..0b91d96 100644 --- a/chart/kai-resource-isolator/values.yaml +++ b/chart/kai-resource-isolator/values.yaml @@ -9,13 +9,11 @@ global: image: registry: docker.io - ## Patched isolator: gpu-memory + gpu-fraction -> CUDA_DEVICE_MEMORY_LIMIT, - ## per-GPU VRAM autodetected from node labels via a Node informer. - repository: howdi2000/kai-resource-isolator - tag: v1.0.0-memlimit4 + repository: projecthami/kai-resource-isolator + tag: latest pullPolicy: IfNotPresent ## Optional: override per-GPU VRAM (MiB) instead of node-label autodetect. - ## Leave empty to autodetect from nvidia.com/gpu.memory. + ## Leave empty to autodetect from the nvidia.com/gpu.memory node label. perGpuVramMiB: "" paths: @@ -25,9 +23,9 @@ paths: containerVgpuMount: /usr/local/vgpu librarySync: - ## Node labels for GPU nodes — libvgpu is synced only to GPU nodes. - nodeSelector: - nvidia.com/gpu.present: "true" + ## Node labels for GPU nodes (empty = all nodes; set e.g. + ## nvidia.com/gpu.present: "true" to sync libvgpu only to GPU nodes). + nodeSelector: {} tolerations: - operator: Exists effect: NoSchedule