From b53e354e95ca803f19fab2d351610d89b6b10769 Mon Sep 17 00:00:00 2001
From: Swati Gupta <swatig@nvidia.com>
Date: Wed, 23 Apr 2025 18:21:45 +0000
Subject: [PATCH 1/4] Add gpu mutating webhook

---
 .../admission_controller.go                   | 144 ++++++++++++++++
 cmd/gpu-mutating-webhook/main.go              | 162 ++++++++++++++++++
 2 files changed, 306 insertions(+)
 create mode 100644 cmd/gpu-mutating-webhook/admission_controller.go
 create mode 100644 cmd/gpu-mutating-webhook/main.go

diff --git a/cmd/gpu-mutating-webhook/admission_controller.go b/cmd/gpu-mutating-webhook/admission_controller.go
new file mode 100644
index 000000000..f1ac407d5
--- /dev/null
+++ b/cmd/gpu-mutating-webhook/admission_controller.go
@@ -0,0 +1,144 @@
+/**
+# Copyright 2024 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package main
+
+import (
+	"encoding/json"
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"log"
+	"net/http"
+
+	admissionv1 "k8s.io/api/admission/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/runtime"
+	"k8s.io/apimachinery/pkg/runtime/serializer"
+)
+
+const (
+	jsonContentType = `application/json`
+)
+
+var (
+	universalDeserializer = serializer.NewCodecFactory(runtime.NewScheme()).UniversalDeserializer()
+)
+
+type patchOperation struct {
+	Op    string      `json:"op"`
+	Path  string      `json:"path"`
+	Value interface{} `json:"value,omitempty"`
+}
+
+type admitFunc func(*admissionv1.AdmissionRequest) ([]patchOperation, error)
+
+func isKubeNamespace(ns string) bool {
+	return (ns == metav1.NamespacePublic || ns == metav1.NamespaceSystem)
+}
+
+func doServeAdmitFunc(w http.ResponseWriter, r *http.Request, admit admitFunc) ([]byte, error) {
+	// Request validation. Only handle POST requests with a body and json content type.
+	if r.Method != http.MethodPost {
+		w.WriteHeader(http.StatusMethodNotAllowed)
+		return nil, fmt.Errorf("invalid method %s, only POST is allowed", r.Method)
+	}
+
+	body, err := ioutil.ReadAll(r.Body)
+	if err != nil {
+		w.WriteHeader(http.StatusBadRequest)
+		return nil, fmt.Errorf("could not read request body: %v", err)
+	}
+
+	if ct := r.Header.Get("Content-Type"); ct != jsonContentType {
+		w.WriteHeader(http.StatusBadRequest)
+		return nil, fmt.Errorf("unsupported content type %s, only %s is supported", ct, jsonContentType)
+	}
+
+	// Parse the AdmissionReview request.
+	var admissionReviewReq admissionv1.AdmissionReview
+	if _, _, err := universalDeserializer.Decode(body, nil, &admissionReviewReq); err != nil {
+		w.WriteHeader(http.StatusBadRequest)
+		return nil, fmt.Errorf("could not deserialize AdmissionReview: %v", err)
+	} else if admissionReviewReq.Request == nil {
+		w.WriteHeader(http.StatusBadRequest)
+		return nil, errors.New("malformed admission review: Request is nil")
+	}
+
+	// Build the response
+	admissionReviewResp := admissionv1.AdmissionReview{
+		TypeMeta: admissionReviewReq.TypeMeta,
+		Response: &admissionv1.AdmissionResponse{
+			UID: admissionReviewReq.Request.UID,
+		},
+	}
+
+	// Skip k8s namespaces
+	var patchOps []patchOperation
+	if !isKubeNamespace(admissionReviewReq.Request.Namespace) {
+		patchOps, err = admit(admissionReviewReq.Request)
+	}
+
+	if err != nil {
+		admissionReviewResp.Response.Allowed = false
+		admissionReviewResp.Response.Result = &metav1.Status{
+			Message: err.Error(),
+		}
+	} else {
+		patchBytes, err := json.Marshal(patchOps)
+		if err != nil {
+			w.WriteHeader(http.StatusInternalServerError)
+			return nil, fmt.Errorf("could not marshal JSON patch: %v", err)
+		}
+		admissionReviewResp.Response.Allowed = true
+		admissionReviewResp.Response.Patch = patchBytes
+
+		pt := admissionv1.PatchTypeJSONPatch
+		admissionReviewResp.Response.PatchType = &pt
+	}
+
+	respBytes, err := json.Marshal(admissionReviewResp)
+	if err != nil {
+		return nil, fmt.Errorf("could not marshal AdmissionReview response: %v", err)
+	}
+	return respBytes, nil
+}
+
+// serveAdmitFunc is a wrapper that handles HTTP, calls doServeAdmitFunc, and writes the result.
+func serveAdmitFunc(w http.ResponseWriter, r *http.Request, admit admitFunc) {
+	log.Print("Handling webhook request ...")
+
+	respBytes, err := doServeAdmitFunc(w, r, admit)
+	if err != nil {
+		log.Printf("Error handling webhook request: %v", err)
+		w.WriteHeader(http.StatusInternalServerError)
+		_, _ = w.Write([]byte(err.Error()))
+		return
+	}
+
+	log.Print("Webhook request handled successfully")
+	_, writeErr := w.Write(respBytes)
+	if writeErr != nil {
+		log.Printf("Could not write response: %v", writeErr)
+	}
+}
+
+// admitFuncHandler converts an admitFunc into an http.Handler
+func admitFuncHandler(admit admitFunc) http.Handler {
+	return http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		serveAdmitFunc(w, r, admit)
+	})
+}
diff --git a/cmd/gpu-mutating-webhook/main.go b/cmd/gpu-mutating-webhook/main.go
new file mode 100644
index 000000000..a7d7c3f92
--- /dev/null
+++ b/cmd/gpu-mutating-webhook/main.go
@@ -0,0 +1,162 @@
+/**
+# Copyright 2025 NVIDIA CORPORATION
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+**/
+
+package main
+
+import (
+	"fmt"
+	"log"
+	"net/http"
+	"path/filepath"
+	"strings"
+
+	admissionv1 "k8s.io/api/admission/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+const (
+	tlsDir      = `/etc/webhook/tls`
+	tlsCertFile = `tls.crt`
+	tlsKeyFile  = `tls.key`
+)
+
+var (
+	podResource     = metav1.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"}
+	gpuClaimName    = "nvidia-gpu-resourceclaim"
+	gpuTemplateName = "nvidia-gpu-resourceclaim-template"
+)
+
+func applyGPUMutation(req *admissionv1.AdmissionRequest) ([]patchOperation, error) {
+	// Only mutate if the incoming resource is a Pod CREATE request.
+	if req.Resource != podResource {
+		log.Printf("applyGPUMutation invoked for a non-Pod resource: %v", req.Resource)
+		return nil, nil
+	}
+	if req.Operation != admissionv1.Create {
+		log.Printf("applyGPUMutation invoked for operation %s, ignoring", req.Operation)
+		return nil, nil
+	}
+
+	raw := req.Object.Raw
+	var pod corev1.Pod
+	if _, _, err := universalDeserializer.Decode(raw, nil, &pod); err != nil {
+		return nil, fmt.Errorf("could not deserialize pod object: %v", err)
+	}
+
+	var patches []patchOperation
+
+	// Check if the Pod already has a resource claim
+	hasGPUClaim := false
+	for _, rc := range pod.Spec.ResourceClaims {
+		if rc.Name == gpuClaimName {
+			hasGPUClaim = true
+			break
+		}
+	}
+
+	// Escape "nvidia.com/gpu" for JSON Patch
+	escapedGPUKey := strings.ReplaceAll(strings.ReplaceAll("nvidia.com/gpu", "~", "~0"), "/", "~1")
+
+	for i, c := range pod.Spec.Containers {
+		foundGPU := false
+
+		if _, ok := c.Resources.Requests["nvidia.com/gpu"]; ok {
+			foundGPU = true
+			patches = append(patches, patchOperation{
+				Op:   "remove",
+				Path: fmt.Sprintf("/spec/containers/%d/resources/requests/%s", i, escapedGPUKey),
+			})
+		}
+
+		if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok {
+			foundGPU = true
+			patches = append(patches, patchOperation{
+				Op:   "remove",
+				Path: fmt.Sprintf("/spec/containers/%d/resources/limits/%s", i, escapedGPUKey),
+			})
+		}
+
+		if foundGPU {
+			gpuClaimPresent := false
+			for _, claimRef := range c.Resources.Claims {
+				if claimRef.Name == gpuClaimName {
+					gpuClaimPresent = true
+					break
+				}
+			}
+			if !gpuClaimPresent {
+				if c.Resources.Claims == nil {
+					patches = append(patches, patchOperation{
+						Op:   "add",
+						Path: fmt.Sprintf("/spec/containers/%d/resources/claims", i),
+						Value: []map[string]string{
+							{"name": gpuClaimName},
+						},
+					})
+				} else {
+					patches = append(patches, patchOperation{
+						Op:    "add",
+						Path:  fmt.Sprintf("/spec/containers/%d/resources/claims/-", i),
+						Value: map[string]string{"name": gpuClaimName},
+					})
+				}
+			}
+		}
+	}
+
+	if len(patches) > 0 && !hasGPUClaim {
+		newClaim := map[string]string{
+			"name":                      gpuClaimName,
+			"resourceClaimTemplateName": gpuTemplateName,
+		}
+
+		if pod.Spec.ResourceClaims == nil {
+			patches = append(patches, patchOperation{
+				Op:   "add",
+				Path: "/spec/resourceClaims",
+				Value: []map[string]string{
+					newClaim,
+				},
+			})
+		} else {
+			patches = append(patches, patchOperation{
+				Op:    "add",
+				Path:  "/spec/resourceClaims/-",
+				Value: newClaim,
+			})
+		}
+		log.Printf("Added ResourceClaim %q referencing template %q to Pod %q",
+			gpuClaimName, gpuTemplateName, pod.Name)
+	}
+
+	return patches, nil
+}
+
+func main() {
+	certPath := filepath.Join(tlsDir, tlsCertFile)
+	keyPath := filepath.Join(tlsDir, tlsKeyFile)
+
+	mux := http.NewServeMux()
+	mux.Handle("/mutate", admitFuncHandler(applyGPUMutation))
+
+	server := &http.Server{
+		Addr:    ":8443",
+		Handler: mux,
+	}
+	log.Printf("Starting webhook server on %s", server.Addr)
+	log.Fatal(server.ListenAndServeTLS(certPath, keyPath))
+}

From 13cdba2a6c7947a890d59a71e9b7d9debffd28e9 Mon Sep 17 00:00:00 2001
From: Swati Gupta <swatig@nvidia.com>
Date: Wed, 23 Apr 2025 18:22:05 +0000
Subject: [PATCH 2/4] Vendor update

---
 vendor/modules.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vendor/modules.txt b/vendor/modules.txt
index 8bf3eb5e0..026967ff4 100644
--- a/vendor/modules.txt
+++ b/vendor/modules.txt
@@ -362,6 +362,7 @@ gopkg.in/inf.v0
 gopkg.in/yaml.v3
 # k8s.io/api v0.32.0
 ## explicit; go 1.23.0
+k8s.io/api/admission/v1
 k8s.io/api/admissionregistration/v1
 k8s.io/api/admissionregistration/v1alpha1
 k8s.io/api/admissionregistration/v1beta1

From ec7c440de26ca65df34a5320540512f7d4318f10 Mon Sep 17 00:00:00 2001
From: Swati Gupta <swatig@nvidia.com>
Date: Wed, 23 Apr 2025 19:02:00 +0000
Subject: [PATCH 3/4] Add helm deployment

---
 deployments/container/Dockerfile              |   1 +
 .../nvidia-dra-driver-gpu/generate-certs.sh   | 120 ++++++++++++++++++
 .../templates/gpumutatingwebhook.yaml         |  48 +++++++
 templates/gpu-claim-template.tmpl.yaml        |  11 ++
 4 files changed, 180 insertions(+)
 create mode 100755 deployments/helm/nvidia-dra-driver-gpu/generate-certs.sh
 create mode 100644 deployments/helm/nvidia-dra-driver-gpu/templates/gpumutatingwebhook.yaml
 create mode 100644 templates/gpu-claim-template.tmpl.yaml

diff --git a/deployments/container/Dockerfile b/deployments/container/Dockerfile
index 295d24619..597be64be 100644
--- a/deployments/container/Dockerfile
+++ b/deployments/container/Dockerfile
@@ -67,4 +67,5 @@ RUN mkdir /licenses && mv /NGC-DL-CONTAINER-LICENSE /licenses/NGC-DL-CONTAINER-L
 COPY --from=build /artifacts/compute-domain-controller     /usr/bin/compute-domain-controller
 COPY --from=build /artifacts/compute-domain-kubelet-plugin /usr/bin/compute-domain-kubelet-plugin
 COPY --from=build /artifacts/gpu-kubelet-plugin            /usr/bin/gpu-kubelet-plugin
+COPY --from=build /artifacts/gpu-mutating-webhook          /usr/bin/gpu-mutating-webhook
 COPY --from=build /build/templates                         /templates
diff --git a/deployments/helm/nvidia-dra-driver-gpu/generate-certs.sh b/deployments/helm/nvidia-dra-driver-gpu/generate-certs.sh
new file mode 100755
index 000000000..ce0602b8e
--- /dev/null
+++ b/deployments/helm/nvidia-dra-driver-gpu/generate-certs.sh
@@ -0,0 +1,120 @@
+#!/bin/bash
+set -e
+
+mkdir -p certs
+cd certs
+
+SERVICE=gpu-mutating-webhook
+NAMESPACE=nvidia-dra-driver-gpu
+SECRET_NAME=webhook-tls
+
+# Generate the CA key and certificate
+openssl genrsa -out ca.key 2048
+openssl req -new -x509 -days 365 -key ca.key -subj "/CN=Kubernetes CA" -out ca.crt
+
+# Generate the server key
+openssl genrsa -out server.key 2048
+
+# Generate a Certificate Signing Request
+cat > csr.conf << EOF
+[req]
+req_extensions = v3_req
+distinguished_name = req_distinguished_name
+
+[req_distinguished_name]
+[ v3_req ]
+basicConstraints = CA:FALSE
+keyUsage = nonRepudiation, digitalSignature, keyEncipherment
+extendedKeyUsage = serverAuth
+subjectAltName = @alt_names
+
+[alt_names]
+DNS.1 = ${SERVICE}
+DNS.2 = ${SERVICE}.${NAMESPACE}
+DNS.3 = ${SERVICE}.${NAMESPACE}.svc
+EOF
+
+openssl req -new -key server.key -subj "/CN=${SERVICE}.${NAMESPACE}.svc" -out server.csr -config csr.conf
+
+# Sign the certificate
+cat > cert.conf << EOF
+[auth_ext]
+authorityKeyIdentifier=keyid,issuer
+basicConstraints=CA:FALSE
+keyUsage = digitalSignature, nonRepudiation, keyEncipherment, dataEncipherment
+extendedKeyUsage = serverAuth
+subjectAltName = @alt_names
+
+[alt_names]
+DNS.1 = ${SERVICE}
+DNS.2 = ${SERVICE}.${NAMESPACE}
+DNS.3 = ${SERVICE}.${NAMESPACE}.svc
+EOF
+
+openssl x509 -req -in server.csr -CA ca.crt -CAkey ca.key -CAcreateserial -out server.crt -days 365 -extfile cert.conf -extensions auth_ext
+
+# Base64 encode the certificates
+CA_BUNDLE=$(openssl base64 -A < ca.crt)
+TLS_CERT=$(openssl base64 -A < server.crt)
+TLS_KEY=$(openssl base64 -A < server.key)
+
+# Create the Secret YAML
+cat > webhook-secret.yaml << EOF
+apiVersion: v1
+kind: Secret
+metadata:
+  name: ${SECRET_NAME}
+  namespace: ${NAMESPACE}
+type: kubernetes.io/tls
+data:
+  tls.crt: ${TLS_CERT}
+  tls.key: ${TLS_KEY}
+EOF
+
+# Create the MutatingWebhookConfiguration; service/namespace come from the same variables as the cert SANs
+cat > mutatingwebhook.yaml << EOF
+apiVersion: admissionregistration.k8s.io/v1
+kind: MutatingWebhookConfiguration
+metadata:
+  name: gpu-mutating-webhook
+webhooks:
+- name: gpu.mutating.k8s.io
+  admissionReviewVersions: ["v1"]
+  sideEffects: None
+  failurePolicy: Ignore
+  clientConfig:
+    service:
+      name: ${SERVICE}
+      namespace: ${NAMESPACE}
+      path: "/mutate"
+    caBundle: ${CA_BUNDLE}
+  rules:
+  - apiGroups: [""]
+    apiVersions: ["v1"]
+    operations: ["CREATE", "UPDATE"]
+    resources: ["pods"]
+  namespaceSelector:
+    matchExpressions:
+    - key: kubernetes.io/metadata.name
+      operator: NotIn
+      values: ["kube-system", "${NAMESPACE}"]
+EOF
+
+# Create the ResourceClaimTemplate (deviceClassName is a field of the "gpu" request entry)
+cat > gpuresourceclaim.yaml << EOF
+apiVersion: resource.k8s.io/v1beta1
+kind: ResourceClaimTemplate
+metadata:
+  name: nvidia-gpu-resourceclaim-template
+spec:
+  spec:
+    devices:
+      requests:
+      - name: gpu
+        deviceClassName: gpu.nvidia.com
+EOF
+
+echo "Generated TLS certificates and secret successfully"
+echo "Apply the secret with: kubectl apply -f webhook-secret.yaml"
+echo "Apply the webhook configuration with: kubectl apply -f mutatingwebhook.yaml"
+echo "Apply the resourceclaimtemplate with: kubectl apply -f gpuresourceclaim.yaml"
diff --git a/deployments/helm/nvidia-dra-driver-gpu/templates/gpumutatingwebhook.yaml b/deployments/helm/nvidia-dra-driver-gpu/templates/gpumutatingwebhook.yaml
new file mode 100644
index 000000000..7ff1d8f1d
--- /dev/null
+++ b/deployments/helm/nvidia-dra-driver-gpu/templates/gpumutatingwebhook.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gpu-mutating-webhook
+  namespace: nvidia-dra-driver-gpu
+  labels:
+    app: gpu-mutating-webhook
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gpu-mutating-webhook
+  template:
+    metadata:
+      labels:
+        app: gpu-mutating-webhook
+    spec:
+      nodeSelector:
+        kubernetes.io/hostname: sc-starwars-mab9-b00
+      serviceAccountName: nvidia-dra-driver-gpu-service-account
+      containers:
+      - name: webhook
+        image: localhost:5001/mutating-webhook:7.0
+        command: ["/usr/bin/gpu-mutating-webhook"]
+        imagePullPolicy: IfNotPresent
+        ports:
+        - containerPort: 8443
+          name: webhook-api
+        volumeMounts:
+        - name: webhook-tls
+          mountPath: /etc/webhook/tls
+          readOnly: true
+      volumes:
+      - name: webhook-tls
+        secret:
+          secretName: webhook-tls
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-mutating-webhook
+  namespace: nvidia-dra-driver-gpu
+spec:
+  selector:
+    app: gpu-mutating-webhook
+  ports:
+  - port: 443
+    targetPort: webhook-api
diff --git a/templates/gpu-claim-template.tmpl.yaml b/templates/gpu-claim-template.tmpl.yaml
new file mode 100644
index 000000000..7a9c1fab2
--- /dev/null
+++ b/templates/gpu-claim-template.tmpl.yaml
@@ -0,0 +1,11 @@
+---
+apiVersion: resource.k8s.io/v1beta1
+kind: ResourceClaimTemplate
+metadata:
+  name: nvidia-gpu-resourceclaim-template
+spec:
+  spec:
+    devices:
+      requests:
+      - name: gpu
+        deviceClassName: gpu.nvidia.com

From 463045609865b57ab4c618121e920e68bcf592f4 Mon Sep 17 00:00:00 2001
From: Swati Gupta <swatig@nvidia.com>
Date: Thu, 1 May 2025 23:18:07 +0000
Subject: [PATCH 4/4] Refactor code into helper functions and add klog

Signed-off-by: Swati Gupta <swatig@nvidia.com>
---
 .../admission_controller.go                   |   1 +
 cmd/gpu-mutating-webhook/main.go              | 213 ++++++++++--------
 2 files changed, 124 insertions(+), 90 deletions(-)

diff --git a/cmd/gpu-mutating-webhook/admission_controller.go b/cmd/gpu-mutating-webhook/admission_controller.go
index f1ac407d5..05cf98835 100644
--- a/cmd/gpu-mutating-webhook/admission_controller.go
+++ b/cmd/gpu-mutating-webhook/admission_controller.go
@@ -46,6 +46,7 @@ type patchOperation struct {
 
 type admitFunc func(*admissionv1.AdmissionRequest) ([]patchOperation, error)
 
+// TODO(Swati): skip the nvidia-dra-driver-gpu namespace as well
 func isKubeNamespace(ns string) bool {
 	return (ns == metav1.NamespacePublic || ns == metav1.NamespaceSystem)
 }
diff --git a/cmd/gpu-mutating-webhook/main.go b/cmd/gpu-mutating-webhook/main.go
index a7d7c3f92..a06f14eab 100644
--- a/cmd/gpu-mutating-webhook/main.go
+++ b/cmd/gpu-mutating-webhook/main.go
@@ -26,126 +26,155 @@ import (
 	admissionv1 "k8s.io/api/admission/v1"
 	corev1 "k8s.io/api/core/v1"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/klog/v2"
 )
 
 const (
-	tlsDir      = `/etc/webhook/tls`
-	tlsCertFile = `tls.crt`
-	tlsKeyFile  = `tls.key`
+	tlsDir          = `/etc/webhook/tls`
+	tlsCertFile     = `tls.crt`
+	tlsKeyFile      = `tls.key`
+	gpuResourceName = "nvidia.com/gpu"
+	gpuClaimName    = "nvidia-gpu-resourceclaim"
+	gpuTemplateName = "nvidia-gpu-resourceclaim-template"
 )
 
 var (
-	podResource     = metav1.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"}
-	gpuClaimName    = "nvidia-gpu-resourceclaim"
-	gpuTemplateName = "nvidia-gpu-resourceclaim-template"
+	podResource = metav1.GroupVersionResource{Version: "v1", Resource: "pods"}
 )
 
 func applyGPUMutation(req *admissionv1.AdmissionRequest) ([]patchOperation, error) {
-	// Only mutate if the incoming resource is a Pod CREATE request.
-	if req.Resource != podResource {
-		log.Printf("applyGPUMutation invoked for a non-Pod resource: %v", req.Resource)
-		return nil, nil
-	}
-	if req.Operation != admissionv1.Create {
-		log.Printf("applyGPUMutation invoked for operation %s, ignoring", req.Operation)
+	// Only mutate Pod CREATE
+	// TODO(Swati): maybe handle UPDATE as well
+	if req.Resource != podResource || req.Operation != admissionv1.Create {
+		klog.Infof("skip mutation for %v/%v", req.Resource, req.Operation)
 		return nil, nil
 	}
 
-	raw := req.Object.Raw
 	var pod corev1.Pod
-	if _, _, err := universalDeserializer.Decode(raw, nil, &pod); err != nil {
-		return nil, fmt.Errorf("could not deserialize pod object: %v", err)
+	if _, _, err := universalDeserializer.Decode(req.Object.Raw, nil, &pod); err != nil {
+		klog.Errorf("failed to decode Pod: %v", err)
+		return nil, fmt.Errorf("could not deserialize pod: %w", err)
 	}
 
+	key := escapeJSONPointer(gpuResourceName)
 	var patches []patchOperation
-
-	// Check if the Pod already has a resource claim
-	hasGPUClaim := false
-	for _, rc := range pod.Spec.ResourceClaims {
-		if rc.Name == gpuClaimName {
-			hasGPUClaim = true
-			break
+	var ctrGPUResourceClaims []string
+
+	// Iterate over all containers and check for "nvidia.com/gpu" limits,
+	// preferring limits over requests according to the rules described below.
+	// GPUs are only supposed to be specified in the limits section, meaning
+	// - GPU limits can be specified without requests; the limit is then used as the request value by default
+	// - GPUs can be specified in both limits and requests, but the two values must be equal
+	// - GPU requests cannot be specified without limits
+	// refer: https://kubernetes.io/docs/tasks/manage-gpus/scheduling-gpus/#using-device-plugins
+	for ci, ctr := range pod.Spec.Containers {
+		ctrName := ctr.Name
+		limitCount, limitOk := ctr.Resources.Limits[gpuResourceName]
+
+		// skip if no GPUs in limits
+		if !limitOk || limitCount.Value() < 1 {
+			continue
 		}
-	}
-
-	// Escape "nvidia.com/gpu" for JSON Patch
-	escapedGPUKey := strings.ReplaceAll(strings.ReplaceAll("nvidia.com/gpu", "~", "~0"), "/", "~1")
-
-	for i, c := range pod.Spec.Containers {
-		foundGPU := false
-
-		if _, ok := c.Resources.Requests["nvidia.com/gpu"]; ok {
-			foundGPU = true
-			patches = append(patches, patchOperation{
-				Op:   "remove",
-				Path: fmt.Sprintf("/spec/containers/%d/resources/requests/%s", i, escapedGPUKey),
-			})
+		gpuCount := limitCount.Value()
+
+		// check any GPUs in requests
+		// it must be equal to limits
+		if reqCount, reqOK := ctr.Resources.Requests[gpuResourceName]; reqOK {
+			if reqCount.Value() != gpuCount {
+				klog.Warningf("container[%q]: gpu request (%d) != limit (%d), skipping mutation", ctrName, reqCount.Value(), gpuCount)
+				continue
+			}
+			reqPatch := removeResourceRequest(ci, "requests", key)
+			patches = append(patches, reqPatch)
+			klog.Infof("removed container[%q].Resources.Requests: %v", ctrName, reqPatch)
 		}
-
-		if _, ok := c.Resources.Limits["nvidia.com/gpu"]; ok {
-			foundGPU = true
-			patches = append(patches, patchOperation{
-				Op:   "remove",
-				Path: fmt.Sprintf("/spec/containers/%d/resources/limits/%s", i, escapedGPUKey),
-			})
+		limitPatch := removeResourceRequest(ci, "limits", key)
+		patches = append(patches, limitPatch)
+		klog.Infof("removed container[%q].Resources.Limits: %v", ctrName, limitPatch)
+
+		// ensure the container-claims array exists:
+		// JSON Patch can only append with "-" to an array that already exists, so create the field first if it is missing
+		if len(ctr.Resources.Claims) == 0 {
+			createPatch := createClaimPatch(fmt.Sprintf("/spec/containers/%d/resources/claims", ci))
+			patches = append(patches, createPatch)
+			klog.Infof("created container[%q] empty claims array: %v", ctrName, createPatch)
 		}
 
-		if foundGPU {
-			gpuClaimPresent := false
-			for _, claimRef := range c.Resources.Claims {
-				if claimRef.Name == gpuClaimName {
-					gpuClaimPresent = true
-					break
-				}
-			}
-			if !gpuClaimPresent {
-				if c.Resources.Claims == nil {
-					patches = append(patches, patchOperation{
-						Op:   "add",
-						Path: fmt.Sprintf("/spec/containers/%d/resources/claims", i),
-						Value: []map[string]string{
-							{"name": gpuClaimName},
-						},
-					})
-				} else {
-					patches = append(patches, patchOperation{
-						Op:    "add",
-						Path:  fmt.Sprintf("/spec/containers/%d/resources/claims/-", i),
-						Value: map[string]string{"name": gpuClaimName},
-					})
-				}
-			}
+		// append one claim per GPU; the container index is part of the name so claims stay unique across containers
+		for i := int64(0); i < gpuCount; i++ {
+			claimName := fmt.Sprintf("%s-%d-%d", gpuClaimName, ci, i)
+			ctrGPUResourceClaims = append(ctrGPUResourceClaims, claimName)
+			appendPatch := appendClaimPatch(
+				fmt.Sprintf("/spec/containers/%d/resources/claims", ci),
+				map[string]string{"name": claimName},
+			)
+			patches = append(patches, appendPatch)
+			klog.Infof("added to container[%q].Resources.Claims: %v", ctrName, appendPatch)
+		}
 	}
 
-	if len(patches) > 0 && !hasGPUClaim {
-		newClaim := map[string]string{
-			"name":                      gpuClaimName,
-			"resourceClaimTemplateName": gpuTemplateName,
+	// Add claims pod-level
+	podName := pod.Name
+	if len(ctrGPUResourceClaims) > 0 {
+		// ensure pod-claims slice exists
+		if len(pod.Spec.ResourceClaims) == 0 {
+			createPatch := createClaimPatch("/spec/resourceClaims")
+			patches = append(patches, createPatch)
+			klog.Infof("created pod[%q] empty claims array: %v", podName, createPatch)
 		}
 
-		if pod.Spec.ResourceClaims == nil {
-			patches = append(patches, patchOperation{
-				Op:   "add",
-				Path: "/spec/resourceClaims",
-				Value: []map[string]string{
-					newClaim,
+		// append each container GPU claim at pod-level
+		for _, name := range ctrGPUResourceClaims {
+			appendPatch := appendClaimPatch(
+				"/spec/resourceClaims",
+				map[string]string{
+					"name":                      name,
+					"resourceClaimTemplateName": gpuTemplateName,
 				},
-			})
-		} else {
-			patches = append(patches, patchOperation{
-				Op:    "add",
-				Path:  "/spec/resourceClaims/-",
-				Value: newClaim,
-			})
+			)
+			patches = append(patches, appendPatch)
+			klog.Infof("added ResourceClaim %q (template=%q) to %q: %v", name, gpuTemplateName, podName, appendPatch)
 		}
-		log.Printf("Added ResourceClaim %q referencing template %q to Pod %q",
-			gpuClaimName, gpuTemplateName, pod.Name)
 	}
 
 	return patches, nil
 }
 
+// escapeJSONPointer escapes s for use as a single JSON Pointer reference token (RFC 6901, section 3):
+// "~" becomes "~0" first, then "/" becomes "~1"; this order keeps the "~" of a produced "~1" from being re-escaped.
+// needed for "nvidia.com/gpu". otherwise JSON will treat "/" as a path delimiter and treat "gpu" as new field
+func escapeJSONPointer(s string) string {
+	return strings.ReplaceAll(strings.ReplaceAll(s, "~", "~0"), "/", "~1")
+}
+
+// removeResourceRequest builds a "remove" op for key under container ci's resources.<field> ("requests" or "limits"); key is inserted as-is, so it must already be JSON-Pointer escaped
+func removeResourceRequest(ci int, field, key string) patchOperation {
+	return patchOperation{
+		Op:   "remove",
+		Path: fmt.Sprintf("/spec/containers/%d/resources/%s/%s", ci, field, key),
+	}
+}
+
+// createClaimPatch builds an "add" op whose value is an empty array at the given path, so entries can be appended to it afterwards
+func createClaimPatch(path string) patchOperation {
+	return patchOperation{
+		Op:    "add",
+		Path:  path,
+		Value: []map[string]string{},
+	}
+}
+
+// appendClaimPatch builds an "add" op that appends entry to the array at path.
+// In JSON Patch, a trailing "-" refers to the position after the last array element, so no index is needed.
+// refer: https://datatracker.ietf.org/doc/html/rfc6902
+func appendClaimPatch(path string, entry map[string]string) patchOperation {
+	return patchOperation{
+		Op:    "add",
+		Path:  path + "/-",
+		Value: entry,
+	}
+}
+
 func main() {
 	certPath := filepath.Join(tlsDir, tlsCertFile)
 	keyPath := filepath.Join(tlsDir, tlsKeyFile)
@@ -157,6 +186,10 @@ func main() {
 		Addr:    ":8443",
 		Handler: mux,
 	}
-	log.Printf("Starting webhook server on %s", server.Addr)
-	log.Fatal(server.ListenAndServeTLS(certPath, keyPath))
+
+	// ListenAndServeTLS blocks while serving and always returns a non-nil error, so log before calling it.
+	klog.Infof("Starting gpu-mutating-webhook server at %s", server.Addr)
+	if err := server.ListenAndServeTLS(certPath, keyPath); err != nil {
+		log.Fatalf("Failed to start server: %v", err) // Swati: need better error handling here
+	}
 }