From b81e5192fa62032c2895e8f107df359a51b81c47 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Wed, 11 Mar 2026 16:02:31 +0000 Subject: [PATCH 01/25] feat: add GPU-specific agents for NVIDIA and AMD with NFD-based deployment Replace generic agent with GPU-vendor-specific agents that deploy based on hardware detection. This enables hybrid clusters with both NVIDIA and AMD GPUs to run optimized agents with appropriate runtime libraries. Changes: - Add Containerfile.gkm-agent-nvidia (CUDA 12.6.3 base with NVML) - Add Containerfile.gkm-agent-amd (ROCm 6.3.1 with AMD SMI libraries) - Remove generic Containerfile.gkm-agent - Add DaemonSet manifests with PCI vendor ID-based node selectors: * gkm-agent-nvidia.yaml (nodeSelector: pci-10de.present) * gkm-agent-amd.yaml (nodeSelector: pci-1002.present) - Remove generic gkm-agent.yaml - Add Node Feature Discovery (NFD) deployment configuration - Update Makefile with GPU-specific build/push targets: * build-image-agent-nvidia, build-image-agent-amd * build-image-agents (builds all agent variants) * push-images-agents - Add mcv dependencies: go-nvlib v0.9.0, amdsmi (amd-staging) - Add comprehensive documentation for multi-GPU deployment The operator and CSI plugin remain unchanged and work with both agent types. NFD automatically labels nodes with GPU vendor information, enabling declarative GPU-specific agent deployment without manual intervention. 
Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- ...e.gkm-agent => Containerfile.gkm-agent-amd | 28 +-- Containerfile.gkm-agent-nogpu | 65 ++++++ Containerfile.gkm-agent-nvidia | 68 +++++++ Makefile | 33 ++- config/agent/README.md | 192 ++++++++++++++++++ .../{gkm-agent.yaml => gkm-agent-amd.yaml} | 11 +- config/agent/gkm-agent-nogpu.yaml | 91 +++++++++ config/agent/gkm-agent-nvidia.yaml | 88 ++++++++ config/agent/kustomization.yaml | 19 +- config/nfd/README.md | 167 +++++++++++++++ config/nfd/kustomization.yaml | 12 ++ config/nfd/nfd-worker-conf.yaml | 33 +++ mcv/go.mod | 2 + mcv/go.sum | 4 + 14 files changed, 781 insertions(+), 32 deletions(-) rename Containerfile.gkm-agent => Containerfile.gkm-agent-amd (63%) create mode 100644 Containerfile.gkm-agent-nogpu create mode 100644 Containerfile.gkm-agent-nvidia create mode 100644 config/agent/README.md rename config/agent/{gkm-agent.yaml => gkm-agent-amd.yaml} (86%) create mode 100644 config/agent/gkm-agent-nogpu.yaml create mode 100644 config/agent/gkm-agent-nvidia.yaml create mode 100644 config/nfd/README.md create mode 100644 config/nfd/kustomization.yaml create mode 100644 config/nfd/nfd-worker-conf.yaml diff --git a/Containerfile.gkm-agent b/Containerfile.gkm-agent-amd similarity index 63% rename from Containerfile.gkm-agent rename to Containerfile.gkm-agent-amd index 2214838e6..a59daee88 100644 --- a/Containerfile.gkm-agent +++ b/Containerfile.gkm-agent-amd @@ -58,25 +58,17 @@ RUN apt-get update && \ libseccomp2 && \ apt-get clean -ARG NO_GPU=false -ARG ROCM_VERSION=7.0.1 -ARG AMDGPU_VERSION=7.0.1.70001 -ARG OPT_ROCM_VERSION=7.0.1 +ARG ROCM_VERSION=6.3.1 +ARG AMDGPU_VERSION=6.3.60301 +ARG OPT_ROCM_VERSION=6.3.1 -# Conditionally install ROCm packages based on NO_GPU flag -RUN if [ "$NO_GPU" = "false" ]; then \ - wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ - apt install -y ./*.deb && \ - apt update && 
DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi; \ - else \ - echo "NO_GPU=true, skipping ROCm installation"; \ - fi - -# Set NO_GPU environment variable -ENV NO_GPU=${NO_GPU} +# Install ROCm packages for AMD GPU support +RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ + apt install -y ./*.deb && \ + apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi # Run as non-root user USER 65532:65532 diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu new file mode 100644 index 000000000..869108e09 --- /dev/null +++ b/Containerfile.gkm-agent-nogpu @@ -0,0 +1,65 @@ +# Build the agent binary +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# Use minimal Ubuntu base image for no-GPU environments +FROM public.ecr.aws/docker/library/ubuntu:24.04 + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install required runtime libraries 
for CGO +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agent-nvidia b/Containerfile.gkm-agent-nvidia new file mode 100644 index 000000000..28e6b836a --- /dev/null +++ b/Containerfile.gkm-agent-nvidia @@ -0,0 +1,68 @@ +# Build the agent binary +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# Use NVIDIA CUDA runtime base image for GPU support +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install required runtime libraries for CGO +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# The NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional NVIDIA packages needed + +# Run as non-root 
user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Makefile b/Makefile index 5a4e0bf21..3283494db 100644 --- a/Makefile +++ b/Makefile @@ -209,25 +209,44 @@ run: manifests generate fmt vet ## Run a controller from your host. build-image-operator: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-operator -t ${OPERATOR_IMG} . -.PHONY: build-image-agent -build-image-agent: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --build-arg NO_GPU=$(NO_GPU_BUILD) --progress=plain --load -f Containerfile.gkm-agent -t ${AGENT_IMG} . - .PHONY: build-image-gkm-extract build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . +.PHONY: build-image-agent-nvidia +build-image-agent-nvidia: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t $(REPO)/agent-nvidia:$(IMAGE_TAG) . + +.PHONY: build-image-agent-amd +build-image-agent-amd: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t $(REPO)/agent-amd:$(IMAGE_TAG) . + +.PHONY: build-image-agent-nogpu +build-image-agent-nogpu: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t $(REPO)/agent-nogpu:$(IMAGE_TAG) . + +.PHONY: build-image-agents +build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) + # If you wish to build the operator image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: build-images -build-images: build-image-operator build-image-agent build-image-gkm-extract ## Build all container images. 
+build-images: build-image-operator build-image-agents build-image-gkm-extract ## Build all container images. .PHONY: push-images -push-images: ## Push all container image. +push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} - $(CONTAINER_TOOL) push ${AGENT_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} + $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + +.PHONY: push-images-agents +push-images-agents: ## Push all agent images + $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) # Mapping old commands after rename .PHONY: docker-build diff --git a/config/agent/README.md b/config/agent/README.md new file mode 100644 index 000000000..b15567441 --- /dev/null +++ b/config/agent/README.md @@ -0,0 +1,192 @@ +# Multi-GPU Agent Deployment + +This directory contains configuration for deploying GPU-specific GKM agents that support both NVIDIA and AMD GPUs in heterogeneous clusters. + +## Overview + +GKM now supports deploying different agent containers based on the GPU hardware present on each node: + +- **`gkm-agent-nvidia`**: For nodes with NVIDIA GPUs +- **`gkm-agent-amd`**: For nodes with AMD ROCm GPUs +- **`gkm-agent`**: Legacy generic agent (deprecated) + +## Architecture + +Each GPU-specific agent uses: +- **Different base images** with appropriate GPU runtime libraries +- **Node selectors** to deploy only on compatible hardware +- **Automatic node labeling** via Node Feature Discovery (NFD) + +## Prerequisites + +### 1. 
Node Feature Discovery (NFD) + +NFD must be deployed to automatically label nodes with their PCI device information: + +```bash +# Deploy NFD +kubectl apply -k config/nfd + +# Verify NFD is running +kubectl get pods -n node-feature-discovery + +# Check node labels (should see pci-* labels) +kubectl get nodes -o json | jq '.items[].metadata.labels' | grep pci +``` + +NFD will automatically add labels like: +- `feature.node.kubernetes.io/pci-10de.present=true` (NVIDIA, vendor ID: 0x10de) +- `feature.node.kubernetes.io/pci-1002.present=true` (AMD, vendor ID: 0x1002) + +### 2. GPU Device Plugins + +Ensure appropriate GPU device plugins are installed: + +**For NVIDIA:** +```bash +kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml +``` + +**For AMD:** +```bash +kubectl apply -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml +``` + +## Building GPU-Specific Agent Images + +### Build All GPU Agents +```bash +make build-image-agents +``` + +### Build Individual Agents +```bash +# NVIDIA agent +make build-image-agent-nvidia + +# AMD agent +make build-image-agent-amd +``` + +### Push Images to Registry +```bash +# Set your registry +export QUAY_USER=your-org + +# Push GPU-specific agents +make push-images-agents +``` + +## Deployment + +### Deploy with Kustomize +```bash +kubectl apply -k config/agent +``` + +This will deploy: +- `agent-nvidia` DaemonSet → Only on nodes with `feature.node.kubernetes.io/pci-10de.present=true` +- `agent-amd` DaemonSet → Only on nodes with `feature.node.kubernetes.io/pci-1002.present=true` + +### Verify Deployment +```bash +# Check which agents are running +kubectl get ds -n gkm-system + +# Check agent pods and their node placement +kubectl get pods -n gkm-system -o wide + +# Verify agents are on correct nodes +kubectl get pods -n gkm-system -l gpu-vendor=nvidia -o wide +kubectl get pods -n gkm-system -l gpu-vendor=amd -o 
wide +``` + +## Containerfiles + +### NVIDIA Agent ([Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia)) +- Base image: `nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04` +- Includes: NVIDIA CUDA runtime with NVML libraries +- Requires: NVIDIA driver on host + +### AMD Agent ([Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd)) +- Base image: `ubuntu:24.04` +- Includes: ROCm libraries (`amd-smi-lib`, `rocm-smi-lib`) +- Requires: AMD GPU driver on host + +## Node Selectors + +The DaemonSets use PCI vendor ID-based node selectors: + +```yaml +# NVIDIA nodes +nodeSelector: + feature.node.kubernetes.io/pci-10de.present: "true" + +# AMD nodes +nodeSelector: + feature.node.kubernetes.io/pci-1002.present: "true" +``` + +## Hybrid GPU Clusters + +In clusters with both NVIDIA and AMD nodes: + +1. **NFD labels all nodes** with their PCI device information +2. **NVIDIA agent** deploys only to NVIDIA nodes +3. **AMD agent** deploys only to AMD nodes +4. **Operator** works with whichever agent is present on each node + +## Troubleshooting + +### NFD Not Labeling Nodes + +```bash +# Check NFD worker logs +kubectl logs -n node-feature-discovery -l app=nfd-worker + +# Manually verify PCI devices +lspci | grep -i vga +lspci -n | grep -E "0300|0302" +``` + +### Agent Not Scheduling + +```bash +# Check node labels +kubectl describe node | grep feature.node.kubernetes.io/pci + +# Check DaemonSet events +kubectl describe ds agent-nvidia -n gkm-system +kubectl describe ds agent-amd -n gkm-system +``` + +### GPU Libraries Not Found + +```bash +# Check NVIDIA driver +nvidia-smi + +# Check AMD driver +rocm-smi + +# Verify libraries in container +kubectl exec -it <agent-pod> -n gkm-system -- ls -la /usr/lib/x86_64-linux-gnu/ | grep -E "nvidia|amd" +``` + +## Migration from Generic Agent + +To migrate from the legacy generic agent: + +1. Deploy NFD: `kubectl apply -k config/nfd` +2. Build GPU-specific agents: `make build-image-agents` +3. 
Update manifests to use new agent DaemonSets +4. Deploy: `kubectl apply -k config/agent` +5. Remove old generic agent: `kubectl delete ds agent -n gkm-system` + +## Related Files + +- [gkm-agent-nvidia.yaml](gkm-agent-nvidia.yaml) - NVIDIA DaemonSet +- [gkm-agent-amd.yaml](gkm-agent-amd.yaml) - AMD DaemonSet +- [kustomization.yaml](kustomization.yaml) - Kustomize configuration +- [../../Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia) - NVIDIA Containerfile +- [../../Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd) - AMD Containerfile diff --git a/config/agent/gkm-agent.yaml b/config/agent/gkm-agent-amd.yaml similarity index 86% rename from config/agent/gkm-agent.yaml rename to config/agent/gkm-agent-amd.yaml index b108bbcd6..ef1a623eb 100644 --- a/config/agent/gkm-agent.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -1,23 +1,29 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: agent + name: agent-amd namespace: gkm-system labels: app: gkm-agent + gpu-vendor: amd spec: selector: matchLabels: app: gkm-agent + gpu-vendor: amd template: metadata: labels: app: gkm-agent + gpu-vendor: amd spec: serviceAccountName: gkm-agent + # Deploy only on nodes with AMD GPUs + nodeSelector: + feature.node.kubernetes.io/pci-1002.present: "true" # AMD vendor ID containers: - name: gkm-agent - image: quay.io/gkm/agent:latest + image: quay.io/gkm/agent-amd:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 @@ -50,6 +56,7 @@ spec: limits: memory: "128Mi" cpu: "100m" + amd.com/gpu: "0" # Request 0 GPUs (agent only monitors, doesn't use GPU) volumeMounts: - name: gkm-state mountPath: /var/lib/gkm diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml new file mode 100644 index 000000000..7500293ba --- /dev/null +++ b/config/agent/gkm-agent-nogpu.yaml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: agent-nogpu + namespace: gkm-system + labels: + app: gkm-agent + gpu-vendor: none 
+spec: + selector: + matchLabels: + app: gkm-agent + gpu-vendor: none + template: + metadata: + labels: + app: gkm-agent + gpu-vendor: none + spec: + serviceAccountName: gkm-agent + # Deploy on nodes without GPUs (nodes that don't have NVIDIA or AMD PCI labels) + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-1002.present + operator: DoesNotExist + containers: + - name: gkm-agent + image: quay.io/gkm/agent-nogpu:latest + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 0 + privileged: true + capabilities: + add: ["CAP_DAC_OVERRIDE", "CAP_FOWNER"] + seccompProfile: + type: Unconfined + env: + - name: NO_GPU + value: "true" + - name: GO_LOG + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.agent.log.level + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: "128Mi" + cpu: "100m" + volumeMounts: + - name: gkm-state + mountPath: /var/lib/gkm + mountPropagation: Bidirectional + - name: gkm-runtime + mountPath: /run/gkm + mountPropagation: Bidirectional + - name: sys + mountPath: /sys + readOnly: true + - name: dev + mountPath: /dev + + volumes: + # This volume is the GKM State directory. This is where GPU Kernel Cache + # will be extracted. + - name: gkm-state + hostPath: + path: /var/lib/gkm + type: DirectoryOrCreate + # This volume is the GKM Runtime directory. This is where the Usage data + # will tracked which pods are using which cache. 
+ - name: gkm-runtime + hostPath: + path: /run/gkm + type: DirectoryOrCreate + - name: sys + hostPath: + path: /sys + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml new file mode 100644 index 000000000..1cfc92af4 --- /dev/null +++ b/config/agent/gkm-agent-nvidia.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: agent-nvidia + namespace: gkm-system + labels: + app: gkm-agent + gpu-vendor: nvidia +spec: + selector: + matchLabels: + app: gkm-agent + gpu-vendor: nvidia + template: + metadata: + labels: + app: gkm-agent + gpu-vendor: nvidia + spec: + serviceAccountName: gkm-agent + # Deploy only on nodes with NVIDIA GPUs + nodeSelector: + feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID + containers: + - name: gkm-agent + image: quay.io/gkm/agent-nvidia:latest + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 0 + privileged: true + capabilities: + add: ["CAP_DAC_OVERRIDE", "CAP_FOWNER"] + seccompProfile: + type: Unconfined + env: + - name: NO_GPU + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.nogpu + - name: GO_LOG + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.agent.log.level + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: "128Mi" + cpu: "100m" + nvidia.com/gpu: "0" # Request 0 GPUs (agent only monitors, doesn't use GPU) + volumeMounts: + - name: gkm-state + mountPath: /var/lib/gkm + mountPropagation: Bidirectional + - name: gkm-runtime + mountPath: /run/gkm + mountPropagation: Bidirectional + - name: sys + mountPath: /sys + readOnly: true + - name: dev + mountPath: /dev + + volumes: + # This volume is the GKM State directory. This is where GPU Kernel Cache + # will be extracted. + - name: gkm-state + hostPath: + path: /var/lib/gkm + type: DirectoryOrCreate + # This volume is the GKM Runtime directory. 
This is where the Usage data + # will tracked which pods are using which cache. + - name: gkm-runtime + hostPath: + path: /run/gkm + type: DirectoryOrCreate + - name: sys + hostPath: + path: /sys + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index 07e67158f..47a1d4be1 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -1,11 +1,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + +# Deploy GPU-specific agents based on node hardware +# Requires Node Feature Discovery (NFD) to label nodes resources: -- gkm-agent.yaml +- gkm-agent-nvidia.yaml # NVIDIA GPU nodes +- gkm-agent-amd.yaml # AMD GPU nodes +- gkm-agent-nogpu.yaml # Nodes without GPUs + images: -- name: agent - newName: quay.io/gkm/agent +- name: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/agent-nvidia newTag: latest -- name: quay.io/gkm/agent - newName: quay.io/gkm/agent +- name: quay.io/gkm/agent-amd + newName: quay.io/gkm/agent-amd + newTag: latest +- name: quay.io/gkm/agent-nogpu + newName: quay.io/gkm/agent-nogpu newTag: latest diff --git a/config/nfd/README.md b/config/nfd/README.md new file mode 100644 index 000000000..bc6ce2db5 --- /dev/null +++ b/config/nfd/README.md @@ -0,0 +1,167 @@ +# Node Feature Discovery (NFD) Configuration + +This directory contains the configuration for deploying [Node Feature Discovery](https://kubernetes-sigs.github.io/node-feature-discovery/) to automatically label nodes with hardware features, particularly GPU vendor information. + +## What is NFD? + +Node Feature Discovery is a Kubernetes add-on that detects hardware features available on each node and advertises those features using node labels. 
+ +For GKM, NFD automatically labels nodes with PCI device vendor IDs, enabling GPU-specific agent deployment: + +- **NVIDIA GPUs**: `feature.node.kubernetes.io/pci-10de.present=true` (vendor ID: 0x10de) +- **AMD GPUs**: `feature.node.kubernetes.io/pci-1002.present=true` (vendor ID: 0x1002) + +## Deployment + +### Deploy NFD +```bash +kubectl apply -k config/nfd +``` + +### Verify NFD is Running +```bash +# Check NFD pods +kubectl get pods -n node-feature-discovery + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# nfd-master-xxxxx 1/1 Running 0 1m +# nfd-worker-xxxxx 1/1 Running 0 1m +# nfd-worker-yyyyy 1/1 Running 0 1m +``` + +### Check Node Labels +```bash +# View all NFD labels on a node +kubectl get node -o json | jq '.metadata.labels | with_entries(select(.key | startswith("feature.node.kubernetes.io")))' + +# Check for GPU vendor labels specifically +kubectl get nodes -L feature.node.kubernetes.io/pci-10de.present,feature.node.kubernetes.io/pci-1002.present +``` + +## How It Works + +1. **NFD Master**: Runs as a deployment, manages feature labeling +2. **NFD Worker**: Runs as a DaemonSet on each node, detects features +3. **Worker scans PCI devices** and creates labels for vendor IDs +4. **Labels are applied** to nodes automatically + +## Configuration + +### Default Configuration + +The default NFD configuration (via [kustomization.yaml](kustomization.yaml)) deploys NFD from the official upstream repository. 
+ +### Custom Configuration (Optional) + +To customize NFD behavior, uncomment the patch in `kustomization.yaml` and modify [nfd-worker-conf.yaml](nfd-worker-conf.yaml): + +```yaml +# In kustomization.yaml +patchesStrategicMerge: + - nfd-worker-conf.yaml +``` + +The custom configuration enables: +- **PCI device detection** with focus on display controllers (GPUs) +- **Vendor ID labeling** for automatic GPU vendor identification +- **Configurable scan interval** (default: 60s) + +## Verification + +### Manual PCI Device Check + +On each node, you can manually verify GPU devices: + +```bash +# List all VGA/Display controllers +lspci | grep -i vga + +# Show vendor IDs numerically +lspci -n | grep -E "0300|0302" + +# Example outputs: +# NVIDIA: 01:00.0 0300: 10de:1b80 (rev a1) +# AMD: 01:00.0 0300: 1002:67df (rev c7) +``` + +### Verify Label Creation + +```bash +# List nodes with NVIDIA GPUs +kubectl get nodes -l feature.node.kubernetes.io/pci-10de.present=true + +# List nodes with AMD GPUs +kubectl get nodes -l feature.node.kubernetes.io/pci-1002.present=true +``` + +## Integration with GKM Agents + +NFD labels are used by GKM agent DaemonSets to deploy GPU-specific agents: + +```yaml +# From config/agent/gkm-agent-nvidia.yaml +nodeSelector: + feature.node.kubernetes.io/pci-10de.present: "true" + +# From config/agent/gkm-agent-amd.yaml +nodeSelector: + feature.node.kubernetes.io/pci-1002.present: "true" +``` + +This ensures: +- NVIDIA agents only run on NVIDIA GPU nodes +- AMD agents only run on AMD GPU nodes +- No manual node labeling required + +## Troubleshooting + +### NFD Not Detecting GPUs + +1. **Check NFD worker logs:** + ```bash + kubectl logs -n node-feature-discovery -l app=nfd-worker + ``` + +2. **Verify PCI devices are present:** + ```bash + # SSH to node + lspci | grep -i vga + ``` + +3. **Check NFD configuration:** + ```bash + kubectl get cm -n node-feature-discovery nfd-worker-conf -o yaml + ``` + +### Labels Not Appearing + +1. 
**Restart NFD worker:** + ```bash + kubectl rollout restart daemonset/nfd-worker -n node-feature-discovery + ``` + +2. **Force re-labeling:** + ```bash + kubectl delete pod -n node-feature-discovery -l app=nfd-worker + ``` + +### Wrong Vendor ID + +Common PCI vendor IDs: +- **NVIDIA**: `10de` +- **AMD**: `1002` +- **Intel**: `8086` + +If using a different GPU vendor, update the node selectors in the agent DaemonSets. + +## Resources + +- [NFD GitHub](https://github.com/kubernetes-sigs/node-feature-discovery) +- [NFD Documentation](https://kubernetes-sigs.github.io/node-feature-discovery/) +- [PCI Vendor IDs Database](https://pci-ids.ucw.cz/) + +## Files + +- [kustomization.yaml](kustomization.yaml) - Main NFD deployment configuration +- [nfd-worker-conf.yaml](nfd-worker-conf.yaml) - Optional custom NFD worker configuration diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml new file mode 100644 index 000000000..6c050e49c --- /dev/null +++ b/config/nfd/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Deploy Node Feature Discovery from official Helm chart +# This will automatically label nodes with GPU vendor information +resources: + - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.7 + +# Optional: Add custom NFD configuration +# Uncomment if you need to customize NFD behavior +# patchesStrategicMerge: +# - nfd-worker-conf.yaml diff --git a/config/nfd/nfd-worker-conf.yaml b/config/nfd/nfd-worker-conf.yaml new file mode 100644 index 000000000..c183e0cb0 --- /dev/null +++ b/config/nfd/nfd-worker-conf.yaml @@ -0,0 +1,33 @@ +# Optional NFD Worker Configuration +# This file customizes NFD to ensure PCI device detection is enabled +# Uncomment in kustomization.yaml to use + +apiVersion: v1 +kind: ConfigMap +metadata: + name: nfd-worker-conf + namespace: node-feature-discovery +data: + nfd-worker.conf: | + core: + labelWhiteList: [".*"] # Enable 
 all labels + noPublish: false + sleepInterval: 60s + # NOTE: the source-name list below is commented out because a second top-level + # `sources:` key would duplicate the mapping that follows (invalid YAML; the + # list form is also not the NFD schema). NFD enables all feature sources by default. + # - pci # Ensure PCI device detection is enabled + # - cpu + # - kernel + # - memory + # - network + # - storage + # - system + # - usb + sources: + pci: + deviceClassWhitelist: + - "03" # Display controllers (GPUs are in this class) + - "0300" # VGA compatible controller + - "0301" # XGA compatible controller + - "0302" # 3D controller + deviceLabelFields: + - vendor # Will create labels like feature.node.kubernetes.io/pci-10de.present diff --git a/mcv/go.mod b/mcv/go.mod index 5bbba7590..be7bc23e7 100644 --- a/mcv/go.mod +++ b/mcv/go.mod @@ -26,6 +26,8 @@ require ( github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/BurntSushi/toml v1.5.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/NVIDIA/go-nvlib v0.9.0 // indirect + github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/VividCortex/ewma v1.2.0 // indirect github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect diff --git a/mcv/go.sum b/mcv/go.sum index 2a1f1c71d..308ce4245 100644 --- a/mcv/go.sum +++ b/mcv/go.sum @@ -12,8 +12,12 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/NVIDIA/go-nvlib v0.9.0 h1:GKLIvLJ0uhCtTLLZp2Q8QIDRxOYH45MM4Y5OO3U5Rho= +github.com/NVIDIA/go-nvlib v0.9.0/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= +github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69 
h1:0Sl/RcyHZvSstVPIbdF0D/sdj8ZJd+xBxkCy5M8/aCI= +github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69/go.mod h1:c2lzyLAghhTO+y/c3JjKl59JHJliIHwNZOroUfmBQxc= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow= From 88ae647b7b8667208740752f5f31cef83df0ff01 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 10:59:31 +0000 Subject: [PATCH 02/25] feat: enhance deployment automation with NFD and Kyverno integration - Add deploy-nfd and undeploy-nfd targets for automated Node Feature Discovery deployment - Integrate NFD deployment into main deploy target for GPU detection - Add deploy-kyverno-production for non-Kind cluster Kyverno deployment - Add deploy-kyverno-with-policies combined target - Update deploy target to conditionally deploy Kyverno based on KYVERNO_ENABLED flag - Update undeploy target to clean up NFD and Kyverno when KYVERNO_ENABLED=true - Update prepare-deploy to configure all three agent image variants (NVIDIA, AMD, no-GPU) This enables 'make deploy' to automatically deploy a complete GKM stack including GPU detection (NFD) and optional image verification (Kyverno) on production clusters. Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3283494db..972a654f5 100644 --- a/Makefile +++ b/Makefile @@ -311,10 +311,28 @@ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - ##@ Deployment + +.PHONY: deploy-nfd +deploy-nfd: kustomize ## Deploy Node Feature Discovery for GPU detection + @echo "Deploying Node Feature Discovery (NFD)..." 
+ $(KUSTOMIZE) build config/nfd | $(KUBECTL) apply -f - + @echo "Waiting for NFD to be ready..." + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n node-feature-discovery deployment/nfd-master || true + @echo "NFD deployed successfully." + +.PHONY: undeploy-nfd +undeploy-nfd: kustomize ## Undeploy Node Feature Discovery + @echo "Undeploying Node Feature Discovery..." + $(KUSTOMIZE) build config/nfd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + @echo "NFD undeployed." + .PHONY: prepare-deploy prepare-deploy: cd config/operator && $(KUSTOMIZE) edit set image quay.io/gkm/operator=${OPERATOR_IMG} - cd config/agent && $(KUSTOMIZE) edit set image quay.io/gkm/agent=${AGENT_IMG} + cd config/agent && $(KUSTOMIZE) edit set image \ + quay.io/gkm/agent-nvidia=$(REPO)/agent-nvidia:$(IMAGE_TAG) \ + quay.io/gkm/agent-amd=$(REPO)/agent-amd:$(IMAGE_TAG) \ + quay.io/gkm/agent-nogpu=$(REPO)/agent-nogpu:$(IMAGE_TAG) ifdef NO_GPU cd config/configMap && \ $(SED) \ @@ -337,7 +355,11 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy -deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +ifeq ($(KYVERNO_ENABLED),true) + @echo "Deploying Kyverno (KYVERNO_ENABLED=true)..." + $(MAKE) deploy-kyverno-with-policies +endif .PHONY: redeploy redeploy: ## Redeploy controller and agent to the K8s cluster after deploy and undeploy have been called. Skips some onetime steps in deploy. 
@@ -352,6 +374,13 @@ undeploy: kustomize delete-webhook-secret-file ## Undeploy operator and agent fr exit 1; \ fi $(KUSTOMIZE) build $(DEPLOY_PATH) | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - +ifeq ($(KYVERNO_ENABLED),true) + @echo "Undeploying Kyverno (KYVERNO_ENABLED=true)..." + -$(MAKE) undeploy-kyverno-policies + -$(MAKE) undeploy-kyverno-production +endif + @echo "Undeploying NFD..." + -$(MAKE) undeploy-nfd @echo "Undeployment from $(DEPLOY_PATH) completed." .PHONY: undeploy-force @@ -486,6 +515,24 @@ else endif @echo "Kyverno deployed successfully to $(KIND_CLUSTER_NAME)." +.PHONY: deploy-kyverno-production +deploy-kyverno-production: helm ## Deploy Kyverno for production clusters (no Kind context) + @echo "Installing Kyverno..." + $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ + --repo https://kyverno.github.io/kyverno/ kyverno \ + --values config/kyverno/values.yaml \ + --wait + @echo "Waiting for Kyverno to be ready..." + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @echo "Kyverno deployed successfully." + +.PHONY: deploy-kyverno-with-policies +deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies + @echo "Restarting Kyverno to discover GKM CRDs..." + @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @echo "Kyverno and policies deployed successfully." + .PHONY: deploy-kyverno-policies deploy-kyverno-policies: kustomize ## Deploy Kyverno ClusterPolicies for GKMCache image verification @echo "Deploying Kyverno policies for GKMCache image verification..." 
@@ -507,6 +554,13 @@ undeploy-kyverno: ## Undeploy Kyverno $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) @echo "Kyverno undeployed from $(KIND_CLUSTER_NAME)." +.PHONY: undeploy-kyverno-production +undeploy-kyverno-production: ## Undeploy Kyverno from production cluster + @echo "Uninstalling Kyverno..." + $(HELM) uninstall kyverno --namespace kyverno --ignore-not-found || true + $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) + @echo "Kyverno undeployed." + ##@ Kind Cluster Management .PHONY: setup-kind From d667e8c0610672745002745dc5066842a081169d Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 11:18:24 +0000 Subject: [PATCH 03/25] fix: conditionally build agents based on NO_GPU_BUILD flag - When NO_GPU_BUILD=true, only build and push no-GPU agent - When NO_GPU_BUILD=false (default), build and push all three agents (NVIDIA, AMD, no-GPU) - This avoids unnecessary builds of GPU-specific agents for Kind/test clusters Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Makefile b/Makefile index 972a654f5..fee02fd37 100644 --- a/Makefile +++ b/Makefile @@ -226,7 +226,11 @@ build-image-agent-nogpu: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t $(REPO)/agent-nogpu:$(IMAGE_TAG) . .PHONY: build-image-agents +ifeq ($(NO_GPU_BUILD),true) +build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) +else build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) +endif # If you wish to build the operator image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. 
@@ -238,15 +242,24 @@ build-images: build-image-operator build-image-agents build-image-gkm-extract ## push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} +ifeq ($(NO_GPU_BUILD),true) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +else $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +endif .PHONY: push-images-agents +ifeq ($(NO_GPU_BUILD),true) +push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +else push-images-agents: ## Push all agent images $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +endif # Mapping old commands after rename .PHONY: docker-build From 7dfd7e2a0772438093da1b46ab0f52ecc46f077b Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 11:44:13 +0000 Subject: [PATCH 04/25] feat: add individual agent image variables for flexible deployment Add AGENT_NVIDIA_IMG, AGENT_AMD_IMG, and AGENT_NOGPU_IMG variables to allow individual override of agent images. This enables deploying with custom image names/tags without requiring the default naming scheme. 
Example usage: make deploy \ OPERATOR_IMG=quay.io/user/gkm:operator \ EXTRACT_IMG=quay.io/user/gkm:extract \ AGENT_NVIDIA_IMG=quay.io/user/gkm:agent-nvidia \ AGENT_AMD_IMG=quay.io/user/gkm:agent-amd \ AGENT_NOGPU_IMG=quay.io/user/gkm:agent-no-gpu Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index fee02fd37..c4c5fc4ec 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,9 @@ REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) +AGENT_NVIDIA_IMG ?= $(REPO)/agent-nvidia:$(IMAGE_TAG) +AGENT_AMD_IMG ?= $(REPO)/agent-amd:$(IMAGE_TAG) +AGENT_NOGPU_IMG ?= $(REPO)/agent-nogpu:$(IMAGE_TAG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 @@ -215,15 +218,15 @@ build-image-gkm-extract: .PHONY: build-image-agent-nvidia build-image-agent-nvidia: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t $(REPO)/agent-nvidia:$(IMAGE_TAG) . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t ${AGENT_NVIDIA_IMG} . .PHONY: build-image-agent-amd build-image-agent-amd: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t $(REPO)/agent-amd:$(IMAGE_TAG) . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t ${AGENT_AMD_IMG} . .PHONY: build-image-agent-nogpu build-image-agent-nogpu: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t $(REPO)/agent-nogpu:$(IMAGE_TAG) . 
+ $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t ${AGENT_NOGPU_IMG} . .PHONY: build-image-agents ifeq ($(NO_GPU_BUILD),true) @@ -243,22 +246,22 @@ push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} ifeq ($(NO_GPU_BUILD),true) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else - $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} + $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} endif .PHONY: push-images-agents ifeq ($(NO_GPU_BUILD),true) push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else push-images-agents: ## Push all agent images - $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} + $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} endif # Mapping old commands after rename @@ -343,9 +346,9 @@ undeploy-nfd: kustomize ## Undeploy Node Feature Discovery prepare-deploy: cd config/operator && $(KUSTOMIZE) edit set image quay.io/gkm/operator=${OPERATOR_IMG} cd config/agent && $(KUSTOMIZE) edit set image \ - quay.io/gkm/agent-nvidia=$(REPO)/agent-nvidia:$(IMAGE_TAG) \ - quay.io/gkm/agent-amd=$(REPO)/agent-amd:$(IMAGE_TAG) \ - quay.io/gkm/agent-nogpu=$(REPO)/agent-nogpu:$(IMAGE_TAG) + quay.io/gkm/agent-nvidia=${AGENT_NVIDIA_IMG} \ + quay.io/gkm/agent-amd=${AGENT_AMD_IMG} \ + quay.io/gkm/agent-nogpu=${AGENT_NOGPU_IMG} ifdef NO_GPU cd config/configMap && \ $(SED) \ From 
019d4fb06c3eadf01de9f32d047a457960fa2f32 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:24:27 +0000 Subject: [PATCH 05/25] fix: GPU agent scheduling with NFD PCI class code labels The GPU agents were not being scheduled on nodes with GPUs because NFD creates labels with PCI class codes (e.g., pci-0302_10de for NVIDIA 3D controllers), but agents were using simple nodeSelectors looking for vendor ID only (pci-10de). Changes: - Update NVIDIA agent to use nodeAffinity matching class codes 0300 and 0302 - Update AMD agent to use nodeAffinity matching class codes 0300, 0302, and 0380 - Upgrade NFD to v0.17.2 to fix deprecated node-role.kubernetes.io/master label - Replace wget with curl in Makefile for macOS compatibility Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 8 +++++++- config/agent/gkm-agent-amd.yaml | 17 +++++++++++++++-- config/agent/gkm-agent-nvidia.yaml | 14 ++++++++++++-- config/nfd/kustomization.yaml | 2 +- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index c4c5fc4ec..1d023c639 100644 --- a/Makefile +++ b/Makefile @@ -148,6 +148,12 @@ vendors: ## Refresh vendors directory. @echo "### Checking vendors" go mod tidy && go mod vendor +.PHONY: install-deps +install-deps: ## Install all dependencies (go, podman, kubectl, and build dependencies). + @echo "### Installing GKM dependencies" + @chmod +x hack/install_deps.sh + @./hack/install_deps.sh + .PHONY: explain explain: ## Run "kubectl explain" on all CRDs. CRD_1="ClusterGKMCache" CRD_2="GKMCache" CRD_3="ClusterGKMCacheNode" CRD_4="GKMCacheNode" OUTPUT_DIR="../docs/crds" ./hack/crd_explain_txt.sh @@ -693,7 +699,7 @@ kind-gpu-sim-script: $(KIND_GPU_SIM_SCRIPT) ## Download kind-gpu-sim-script loc $(KIND_GPU_SIM_SCRIPT): $(LOCALBIN) if [ ! 
-f $(KIND_GPU_SIM_SCRIPT) ]; then \ echo "Downloading $(KIND_GPU_SIM_SCRIPT)"; \ - wget -P $(LOCALBIN) $(KIND_GPU_SIM_SCRIPT_URL); \ + curl -L -o $(KIND_GPU_SIM_SCRIPT) $(KIND_GPU_SIM_SCRIPT_URL); \ chmod +x $(KIND_GPU_SIM_SCRIPT); \ fi diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml index ef1a623eb..95f717adc 100644 --- a/config/agent/gkm-agent-amd.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -19,8 +19,21 @@ spec: spec: serviceAccountName: gkm-agent # Deploy only on nodes with AMD GPUs - nodeSelector: - feature.node.kubernetes.io/pci-1002.present: "true" # AMD vendor ID + # AMD vendor ID is 1002, with class codes: + # 0300: VGA controller, 0302: 3D controller, 0380: Display controller + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: Exists containers: - name: gkm-agent image: quay.io/gkm/agent-amd:latest diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml index 1cfc92af4..1f6670a6b 100644 --- a/config/agent/gkm-agent-nvidia.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -19,8 +19,18 @@ spec: spec: serviceAccountName: gkm-agent # Deploy only on nodes with NVIDIA GPUs - nodeSelector: - feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID + # NVIDIA vendor ID is 10de, with class codes: + # 0300: VGA controller, 0302: 3D controller + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: 
gkm-agent image: quay.io/gkm/agent-nvidia:latest diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml index 6c050e49c..684f558c6 100644 --- a/config/nfd/kustomization.yaml +++ b/config/nfd/kustomization.yaml @@ -4,7 +4,7 @@ kind: Kustomization # Deploy Node Feature Discovery from official Helm chart # This will automatically label nodes with GPU vendor information resources: - - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.7 + - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.2 # Optional: Add custom NFD configuration # Uncomment if you need to customize NFD behavior From cb0df30296643aee2d8bf1d4b88d046e91aa506b Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:26:46 +0000 Subject: [PATCH 06/25] fix: exclude control-plane nodes from nogpu agent deployment In multi-node clusters, the nogpu agent should not run on control-plane nodes. The GPU detection labels were also updated to use the PCI class code format, consistent with the GPU agents.
Changes: - Add nodeAffinity to exclude nodes with node-role.kubernetes.io/control-plane label - Update GPU detection to use PCI class codes (0300, 0302, 0380) instead of vendor ID only - Ensures nogpu agent only runs on non-GPU worker nodes Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- config/agent/gkm-agent-nogpu.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml index 7500293ba..8b7715104 100644 --- a/config/agent/gkm-agent-nogpu.yaml +++ b/config/agent/gkm-agent-nogpu.yaml @@ -19,14 +19,23 @@ spec: spec: serviceAccountName: gkm-agent # Deploy on nodes without GPUs (nodes that don't have NVIDIA or AMD PCI labels) + # and exclude control-plane nodes in multi-node clusters affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: feature.node.kubernetes.io/pci-10de.present + - key: feature.node.kubernetes.io/pci-0300_10de.present operator: DoesNotExist - - key: feature.node.kubernetes.io/pci-1002.present + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: DoesNotExist + - key: node-role.kubernetes.io/control-plane operator: DoesNotExist containers: - name: gkm-agent From 00fbcbef3353ec8ba0296c5ab66ead75bc974cfc Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:35:46 +0000 Subject: [PATCH 07/25] fix: mount GPU libraries to enable device access without GPU resource requests The GPU agents were unable to access GPUs because they lacked the necessary GPU runtime libraries. 
Following the NVIDIA device plugin pattern, we now mount: NVIDIA agent: - /usr/lib64 -> Contains libnvidia-ml.so and other NVIDIA libraries - LD_LIBRARY_PATH=/usr/lib64 environment variable AMD agent: - /opt/rocm -> ROCm libraries for AMD GPU management - /usr/lib64 -> System libraries - LD_LIBRARY_PATH=/opt/rocm/lib:/usr/lib64 This allows the agents to use NVML/ROCm APIs to detect and monitor ALL GPUs on the node without requesting gpu resources (nvidia.com/gpu or amd.com/gpu), which would limit visibility to only one GPU. Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- config/agent/gkm-agent-amd.yaml | 18 ++++++++++++++++++ config/agent/gkm-agent-nvidia.yaml | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml index 95f717adc..6de806096 100644 --- a/config/agent/gkm-agent-amd.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -65,6 +65,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: LD_LIBRARY_PATH + value: /opt/rocm/lib:/usr/lib64 resources: limits: memory: "128Mi" @@ -82,6 +84,12 @@ spec: readOnly: true - name: dev mountPath: /dev + - name: rocm-libs + mountPath: /opt/rocm + readOnly: true + - name: system-libs + mountPath: /usr/lib64 + readOnly: true volumes: # This volume is the GKM State directory. 
This is where GPU Kernel Cache @@ -104,3 +112,13 @@ spec: hostPath: path: /dev type: Directory + # ROCm libraries needed for AMD GPU management + - name: rocm-libs + hostPath: + path: /opt/rocm + type: DirectoryOrCreate + # System libraries for GPU access + - name: system-libs + hostPath: + path: /usr/lib64 + type: Directory diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml index 1f6670a6b..6cad7bfa5 100644 --- a/config/agent/gkm-agent-nvidia.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -57,6 +57,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: LD_LIBRARY_PATH + value: /usr/lib64 resources: limits: memory: "128Mi" @@ -74,6 +76,9 @@ spec: readOnly: true - name: dev mountPath: /dev + - name: nvidia-libs + mountPath: /usr/lib64 + readOnly: true volumes: # This volume is the GKM State directory. This is where GPU Kernel Cache @@ -96,3 +101,8 @@ spec: hostPath: path: /dev type: Directory + # NVIDIA libraries needed for NVML (NVIDIA Management Library) + - name: nvidia-libs + hostPath: + path: /usr/lib64 + type: Directory From 4bd95a3dbb7491d75e11ed3492177fd9100bf2ba Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:42:43 +0000 Subject: [PATCH 08/25] feat: add automated dependency installation for RHEL 10 Add comprehensive installation script and make target to automate dependency setup on RHEL 10 systems. The script handles installation of build dependencies from CentOS Stream and Fedora repositories, and installs/upgrades go, podman, and kubectl to required versions. 
Changes: - Add hack/install_deps.sh script for RHEL 10 dependency installation - Add 'make install-deps' target to Makefile - Update GettingStartedGuide with automated installation instructions - Document package sources for RHEL 10 (CentOS Stream, Fedora) Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- docs/GettingStartedGuide.md | 27 +++++- hack/install_deps.sh | 170 ++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 2 deletions(-) create mode 100644 hack/install_deps.sh diff --git a/docs/GettingStartedGuide.md b/docs/GettingStartedGuide.md index b29dcffb4..196157152 100644 --- a/docs/GettingStartedGuide.md +++ b/docs/GettingStartedGuide.md @@ -10,14 +10,37 @@ building GKM and description of how to deploy GKM. - kubectl version v1.11.3+. - Access to a Kubernetes v1.11.3+ cluster. -The following packages are also required to build: +### Automated Installation (RHEL 10 / CentOS Stream 10) + +For RHEL 10 or CentOS Stream 10 systems, you can install all dependencies (including go, podman, kubectl, and build packages) using: + +```sh +make install-deps +``` + +This will: +- Install system development packages (gpgme-devel, libdrm-devel, hwloc-devel) +- Install btrfs development headers +- Install or upgrade Go to v1.25.0+ if needed +- Install or upgrade Podman to v5.3.1+ if needed +- Install or upgrade kubectl to v1.11.3+ if needed + +### Manual Installation + +The following packages are required to build: + +**For Fedora/RHEL/CentOS:** ```sh sudo dnf install -y gpgme-devel libdrm-devel libbtrfs btrfs-progs \ btrfs-progs-devel hwloc hwloc-devel ``` -OR +> **Note for RHEL 10**: Some packages may not be available in standard repositories. +> Use `make install-deps` or see [hack/install_deps.sh](../hack/install_deps.sh) for the installation script +> that sources packages from CentOS Stream 10 and Fedora repositories. 
+ +**For Debian/Ubuntu:** ```sh sudo apt-get install -y libgpgme-dev libbtrfs-dev btrfs-progs libgpgme11-dev \ diff --git a/hack/install_deps.sh b/hack/install_deps.sh new file mode 100644 index 000000000..444b88ce6 --- /dev/null +++ b/hack/install_deps.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +set -e + +echo "================================================" +echo "GKM Dependency Installation for RHEL 10" +echo "================================================" +echo "" + +# Minimum required versions +MIN_GO_VERSION="1.25.0" +MIN_PODMAN_VERSION="5.3.1" +MIN_KUBECTL_VERSION="1.11.3" + +# CentOS Stream 10 repository URLs +CENTOS_CRB="https://mirror.stream.centos.org/10-stream/CRB/x86_64/os/" +FEDORA_BASE="https://download.fedoraproject.org/pub/fedora/linux/development/rawhide/Everything/x86_64/os/Packages" + +# Function to compare versions +version_ge() { + # Returns 0 (true) if $1 >= $2 + [ "$(printf '%s\n' "$2" "$1" | sort -V | head -n1)" = "$2" ] +} + +# Function to check if a command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +echo "=== Step 1: Importing CentOS Stream GPG key ===" +echo "================================================" +sudo rpm --import https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256 2>/dev/null || echo "Key may already be imported" + +echo "" +echo "=== Step 2: Installing system development packages ===" +echo "======================================================" +sudo dnf install -y --repofrompath=centos-crb,${CENTOS_CRB} \ + gpgme-devel libdrm-devel hwloc-devel + +echo "" +echo "=== Step 3: Installing btrfs development headers ===" +echo "=====================================================" +# First install the base libraries with --nodeps to skip filesystem checks +sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/l/libbtrfs-6.19-1.fc45.x86_64.rpm" \ + "${FEDORA_BASE}/l/libbtrfsutil-6.19-1.fc45.x86_64.rpm" 2>/dev/null || echo "Libraries may already be installed" + +# Now install devel package with --nodeps 
+sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/b/btrfs-progs-6.19-1.fc45.x86_64.rpm" 2>/dev/null || echo "btrfs-progs may already be installed" + +sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/b/btrfs-progs-devel-6.19-1.fc45.x86_64.rpm" + +echo "" +echo "=== Step 4: Installing Go ${MIN_GO_VERSION}+ ===" +echo "==============================================" +if command_exists go; then + CURRENT_GO_VERSION=$(go version | awk '{print $3}' | sed 's/go//') + echo "Found Go version: ${CURRENT_GO_VERSION}" + if version_ge "${CURRENT_GO_VERSION}" "${MIN_GO_VERSION}"; then + echo "✓ Go ${CURRENT_GO_VERSION} meets minimum requirement (${MIN_GO_VERSION}+)" + else + echo "⚠ Go ${CURRENT_GO_VERSION} is older than required ${MIN_GO_VERSION}" + echo "Installing Go ${MIN_GO_VERSION}..." + GO_VERSION="1.25.0" + GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" + curl -LO "https://go.dev/dl/${GO_TARBALL}" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "${GO_TARBALL}" + rm "${GO_TARBALL}" + echo "✓ Go ${GO_VERSION} installed. Add /usr/local/go/bin to your PATH" + export PATH=$PATH:/usr/local/go/bin + fi +else + echo "Go not found. Installing Go ${MIN_GO_VERSION}..." + GO_VERSION="1.25.0" + GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" + curl -LO "https://go.dev/dl/${GO_TARBALL}" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "${GO_TARBALL}" + rm "${GO_TARBALL}" + echo "✓ Go ${GO_VERSION} installed. 
Add /usr/local/go/bin to your PATH" + echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc + export PATH=$PATH:/usr/local/go/bin +fi + +echo "" +echo "=== Step 5: Installing Podman ${MIN_PODMAN_VERSION}+ ===" +echo "========================================================" +if command_exists podman; then + CURRENT_PODMAN_VERSION=$(podman version --format '{{.Client.Version}}' 2>/dev/null || podman --version | awk '{print $3}') + echo "Found Podman version: ${CURRENT_PODMAN_VERSION}" + if version_ge "${CURRENT_PODMAN_VERSION}" "${MIN_PODMAN_VERSION}"; then + echo "✓ Podman ${CURRENT_PODMAN_VERSION} meets minimum requirement (${MIN_PODMAN_VERSION}+)" + else + echo "⚠ Podman ${CURRENT_PODMAN_VERSION} is older than required ${MIN_PODMAN_VERSION}" + echo "Upgrading Podman..." + sudo dnf upgrade -y podman + fi +else + echo "Podman not found. Installing..." + sudo dnf install -y podman +fi + +echo "" +echo "=== Step 6: Installing kubectl ${MIN_KUBECTL_VERSION}+ ===" +echo "==========================================================" +if command_exists kubectl; then + CURRENT_KUBECTL_VERSION=$(kubectl version --client --short 2>/dev/null | grep -oP 'v\K[0-9.]+' || kubectl version --client -o json 2>/dev/null | grep -oP '"gitVersion": "v\K[0-9.]+' | head -1) + echo "Found kubectl version: ${CURRENT_KUBECTL_VERSION}" + if version_ge "${CURRENT_KUBECTL_VERSION}" "${MIN_KUBECTL_VERSION}"; then + echo "✓ kubectl ${CURRENT_KUBECTL_VERSION} meets minimum requirement (${MIN_KUBECTL_VERSION}+)" + else + echo "⚠ kubectl ${CURRENT_KUBECTL_VERSION} is older than required ${MIN_KUBECTL_VERSION}" + echo "Installing latest kubectl..." + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ + fi +else + echo "kubectl not found. Installing..." 
+ curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ +fi + +echo "" +echo "=== Step 7: Verification ===" +echo "============================" +echo "" +echo "System Development Packages:" +ls -la /usr/include/gpgme.h 2>/dev/null && echo " ✓ gpgme-devel" || echo " ✗ gpgme-devel missing" +ls -la /usr/include/xf86drm.h 2>/dev/null && echo " ✓ libdrm-devel" || echo " ✗ libdrm-devel missing" +ls -la /usr/include/hwloc.h 2>/dev/null && echo " ✓ hwloc-devel" || echo " ✗ hwloc-devel missing" +ls -la /usr/include/btrfs/version.h 2>/dev/null && echo " ✓ btrfs/version.h" || echo " ✗ btrfs headers missing" + +echo "" +echo "Build Tools:" +if command_exists go; then + echo " ✓ Go $(go version | awk '{print $3}')" +else + echo " ✗ Go not found in PATH" +fi + +if command_exists podman; then + echo " ✓ Podman $(podman --version | awk '{print $3}')" +else + echo " ✗ Podman not found" +fi + +if command_exists kubectl; then + echo " ✓ kubectl $(kubectl version --client --short 2>/dev/null | grep -oP 'v[0-9.]+' || echo 'version installed')" +else + echo " ✗ kubectl not found in PATH" +fi + +echo "" +echo "pkg-config:" +pkg-config --exists gpgme && echo " ✓ gpgme.pc (version $(pkg-config --modversion gpgme))" || echo " ✗ gpgme.pc missing" + +echo "" +echo "================================================" +echo "Installation Complete!" 
+echo "================================================" +echo "" +echo "If Go or kubectl were newly installed, you may need to:" +echo " - Reload your shell: source ~/.bashrc" +echo " - Or add to your PATH manually:" +echo " export PATH=\$PATH:/usr/local/go/bin" From fe6471cf79c70bd5402b909b3032d88cd79c58b2 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 13:56:09 +0000 Subject: [PATCH 09/25] gkm: add nvidia example Signed-off-by: Maryam Tahhan --- .../namespace/RWO-NVIDIA/10-namespace.yaml | 5 + .../namespace/RWO-NVIDIA/11-gkmcache.yaml | 19 +++ examples/namespace/RWO-NVIDIA/12-ds.yaml | 52 +++++++ examples/namespace/RWO-NVIDIA/13-pod.yaml | 41 ++++++ examples/namespace/RWO-NVIDIA/README.md | 132 ++++++++++++++++++ .../{RWO => RWO-ROCM}/10-namespace.yaml | 0 .../{RWO => RWO-ROCM}/11-gkmcache.yaml | 0 .../namespace/{RWO => RWO-ROCM}/12-ds.yaml | 0 .../namespace/{RWO => RWO-ROCM}/13-ds.yaml | 0 .../namespace/{RWO => RWO-ROCM}/14-ds.yaml | 0 .../21-gkmcache-cosign-v3.yaml | 0 .../namespace/{RWO => RWO-ROCM}/22-ds.yaml | 0 12 files changed, 249 insertions(+) create mode 100644 examples/namespace/RWO-NVIDIA/10-namespace.yaml create mode 100644 examples/namespace/RWO-NVIDIA/11-gkmcache.yaml create mode 100644 examples/namespace/RWO-NVIDIA/12-ds.yaml create mode 100644 examples/namespace/RWO-NVIDIA/13-pod.yaml create mode 100644 examples/namespace/RWO-NVIDIA/README.md rename examples/namespace/{RWO => RWO-ROCM}/10-namespace.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/11-gkmcache.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/12-ds.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/13-ds.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/14-ds.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/21-gkmcache-cosign-v3.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/22-ds.yaml (100%) diff --git a/examples/namespace/RWO-NVIDIA/10-namespace.yaml b/examples/namespace/RWO-NVIDIA/10-namespace.yaml new file mode 100644 
index 000000000..aec06b330 --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/10-namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: gkm-test-ns-nvidia-rwo-1 diff --git a/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml b/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml new file mode 100644 index 000000000..54cb9729f --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: gkm.io/v1alpha1 +kind: GKMCache +metadata: + name: vector-add-cache-cuda-rwo + namespace: gkm-test-ns-nvidia-rwo-1 + labels: + gkm.io/signature-format: cosign-v2 +spec: + image: quay.io/gkm/cache-examples:vector-add-cache-cuda-v2 + storageClassName: standard # Update this to match your cluster's storage class + + # Pod template for the extraction job + podTemplate: + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/examples/namespace/RWO-NVIDIA/12-ds.yaml b/examples/namespace/RWO-NVIDIA/12-ds.yaml new file mode 100644 index 000000000..cd06dbf9f --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/12-ds.yaml @@ -0,0 +1,52 @@ +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: gkm-test-nvidia-rwo-ds-1 + namespace: gkm-test-ns-nvidia-rwo-1 + labels: + gkm.io/pvcMutation: "true" +spec: + selector: + matchLabels: + name: gkm-test-nvidia-rwo-ds-1 + template: + metadata: + labels: + name: gkm-test-nvidia-rwo-ds-1 + gkm.io/pvc-mutation: "true" + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # Node affinity to schedule only on NVIDIA GPU nodes + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists + + containers: + - 
name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + resources: + limits: + nvidia.com/gpu: 1 # Request 1 NVIDIA GPU + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: vector-add-cache-cuda-rwo diff --git a/examples/namespace/RWO-NVIDIA/13-pod.yaml b/examples/namespace/RWO-NVIDIA/13-pod.yaml new file mode 100644 index 000000000..59140b6c3 --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/13-pod.yaml @@ -0,0 +1,41 @@ +--- +kind: Pod +apiVersion: v1 +metadata: + name: gkm-test-nvidia-pod-1 + namespace: gkm-test-ns-nvidia-rwo-1 +spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # Node affinity to schedule only on NVIDIA GPU nodes + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists + + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + resources: + limits: + nvidia.com/gpu: 1 # Request 1 NVIDIA GPU + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: vector-add-cache-cuda-rwo diff --git a/examples/namespace/RWO-NVIDIA/README.md b/examples/namespace/RWO-NVIDIA/README.md new file mode 100644 index 000000000..c3a3f4644 --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/README.md @@ -0,0 +1,132 @@ +# NVIDIA GPU Examples for GKM (ReadWriteOnce) + +This directory contains examples for deploying GKM with NVIDIA GPU support using ReadWriteOnce (RWO) access mode. + +## Prerequisites + +1. 
Kubernetes cluster with NVIDIA GPUs +2. NVIDIA GPU Operator or device plugin installed +3. Node Feature Discovery (NFD) installed and configured +4. GKM operator deployed in the cluster +5. A storage class that supports ReadWriteOnce volumes + +## Storage Class Configuration + +Before deploying, verify your storage class: + +```bash +kubectl get sc +``` + +Update the `storageClassName` field in [11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage class. + +## Deployment + +### Option 1: Deploy All Resources + +```bash +kubectl apply -f examples/namespace/RWO-NVIDIA/ +``` + +### Option 2: Deploy Step by Step + +1. Create the namespace: + ```bash + kubectl apply -f 10-namespace.yaml + ``` + +2. Create the GKMCache resource: + ```bash + kubectl apply -f 11-gkmcache.yaml + ``` + +3. Wait for the PVC to be created and bound: + ```bash + kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 -w + ``` + +4. Deploy a test workload (choose one): + - DaemonSet: `kubectl apply -f 12-ds.yaml` + - Pod: `kubectl apply -f 13-pod.yaml` + +## Verification + +Check the GKMCache status: +```bash +kubectl get gkmcache -n gkm-test-ns-nvidia-rwo-1 +kubectl describe gkmcache vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 +``` + +Check the PVC: +```bash +kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 +``` + +Check the extraction job: +```bash +kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 +kubectl get pods -n gkm-test-ns-nvidia-rwo-1 +``` + +Check the test workload: +```bash +# For Pod +kubectl get pod gkm-test-nvidia-pod-1 -n gkm-test-ns-nvidia-rwo-1 +kubectl logs gkm-test-nvidia-pod-1 -n gkm-test-ns-nvidia-rwo-1 + +# For DaemonSet +kubectl get ds gkm-test-nvidia-rwo-ds-1 -n gkm-test-ns-nvidia-rwo-1 +kubectl get pods -n gkm-test-ns-nvidia-rwo-1 -l name=gkm-test-nvidia-rwo-ds-1 +``` + +Verify the cache is mounted: +```bash +kubectl exec -it -n gkm-test-ns-nvidia-rwo-1 gkm-test-nvidia-pod-1 -- ls -la /cache +``` + +## Troubleshooting + +### PVC Pending State + +If the PVC remains 
in Pending state: + +```bash +kubectl describe pvc vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 +``` + +Common issues: +- Storage class not available or incorrect +- No nodes match the node selector +- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a pod using it is scheduled) + +### Extraction Job Not Scheduling + +Check the extraction job: +```bash +kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 +kubectl describe job -n gkm-test-ns-nvidia-rwo-1 +``` + +Check for pod scheduling issues: +```bash +kubectl get events -n gkm-test-ns-nvidia-rwo-1 --sort-by='.lastTimestamp' +``` + +### Pod Not Scheduling on GPU Nodes + +If your cluster doesn't have NFD labels, you can either: + +1. Install and configure NFD (recommended) +2. Remove the `affinity` section from the pod/daemonset specs and use a simpler node selector or label your GPU nodes manually + +Example without NFD: +```yaml +nodeSelector: + your-gpu-label: "true" # Use whatever label identifies your GPU nodes +``` + +## Cleanup + +```bash +kubectl delete -f examples/namespace/RWO-NVIDIA/ +``` diff --git a/examples/namespace/RWO/10-namespace.yaml b/examples/namespace/RWO-ROCM/10-namespace.yaml similarity index 100% rename from examples/namespace/RWO/10-namespace.yaml rename to examples/namespace/RWO-ROCM/10-namespace.yaml diff --git a/examples/namespace/RWO/11-gkmcache.yaml b/examples/namespace/RWO-ROCM/11-gkmcache.yaml similarity index 100% rename from examples/namespace/RWO/11-gkmcache.yaml rename to examples/namespace/RWO-ROCM/11-gkmcache.yaml diff --git a/examples/namespace/RWO/12-ds.yaml b/examples/namespace/RWO-ROCM/12-ds.yaml similarity index 100% rename from examples/namespace/RWO/12-ds.yaml rename to examples/namespace/RWO-ROCM/12-ds.yaml diff --git a/examples/namespace/RWO/13-ds.yaml b/examples/namespace/RWO-ROCM/13-ds.yaml similarity index 100% rename from examples/namespace/RWO/13-ds.yaml rename to examples/namespace/RWO-ROCM/13-ds.yaml diff --git 
a/examples/namespace/RWO/14-ds.yaml b/examples/namespace/RWO-ROCM/14-ds.yaml similarity index 100% rename from examples/namespace/RWO/14-ds.yaml rename to examples/namespace/RWO-ROCM/14-ds.yaml diff --git a/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml similarity index 100% rename from examples/namespace/RWO/21-gkmcache-cosign-v3.yaml rename to examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml diff --git a/examples/namespace/RWO/22-ds.yaml b/examples/namespace/RWO-ROCM/22-ds.yaml similarity index 100% rename from examples/namespace/RWO/22-ds.yaml rename to examples/namespace/RWO-ROCM/22-ds.yaml From 3dca1b9f27528a8b8861c72ed0f4cdbcf62cdf59 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:14:38 +0000 Subject: [PATCH 10/25] fix: address PR #107 review comments and failing workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses all review feedback and failing CI checks: - Add common agent base Containerfile with shared build stages - Update all agent Containerfiles with clear stage documentation - Add agent-base image to CI/CD workflow and Makefile - Fix image-build workflow to build all 4 agent variants (base, nvidia, amd, nogpu) - Fix 19 markdown linting errors in documentation files - Wrap long lines to ≤80 characters (MD013) - Add blank lines around code blocks (MD031) - Add blank lines around lists (MD032) Resolves: - Build Image (agent) workflow failure (missing Containerfile.gkm-agent) - Pre-commit markdown linting failures - PR review comment requesting common base container Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .github/workflows/image-build.yml | 45 ++++++++++++++- Containerfile.gkm-agent-amd | 16 ++++-- Containerfile.gkm-agent-base | 75 +++++++++++++++++++++++++ Containerfile.gkm-agent-nogpu | 14 ++++- Containerfile.gkm-agent-nvidia | 17 ++++-- Makefile | 14 ++++- 
docs/GettingStartedGuide.md | 12 ++-- examples/namespace/RWO-NVIDIA/README.md | 25 +++++++-- gkm-codespell.precommit-toml | 2 +- 9 files changed, 194 insertions(+), 26 deletions(-) create mode 100644 Containerfile.gkm-agent-base diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 4530d31c3..710c17ef2 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -45,8 +45,48 @@ jobs: - registry: quay.io repository: gkm - image: agent - dockerfile: ./Containerfile.gkm-agent + image: agent-base + dockerfile: ./Containerfile.gkm-agent-base + context: . + target: base-runtime + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + + - registry: quay.io + repository: gkm + image: agent-nvidia + dockerfile: ./Containerfile.gkm-agent-nvidia + context: . + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + + - registry: quay.io + repository: gkm + image: agent-amd + dockerfile: ./Containerfile.gkm-agent-amd + context: . + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + + - registry: quay.io + repository: gkm + image: agent-nogpu + dockerfile: ./Containerfile.gkm-agent-nogpu context: . 
tags: | type=ref,event=branch @@ -130,6 +170,7 @@ jobs: file: ${{ matrix.image.dockerfile }} build-args: BUILDPLATFORM=linux/amd64 context: ${{ matrix.image.context }} + target: ${{ matrix.image.target || '' }} - name: Sign the images with GitHub OIDC Token if: ${{ fromJSON(steps.set-push.outputs.push_flag) }} diff --git a/Containerfile.gkm-agent-amd b/Containerfile.gkm-agent-amd index a59daee88..14c81835a 100644 --- a/Containerfile.gkm-agent-amd +++ b/Containerfile.gkm-agent-amd @@ -1,4 +1,7 @@ -# Build the agent binary +# ============================================================================ +# Stage 1: Builder (Shared across all agent variants) +# See Containerfile.gkm-agent-base for the common base stages +# ============================================================================ FROM public.ecr.aws/docker/library/golang:1.25 AS builder WORKDIR /workspace @@ -30,13 +33,17 @@ COPY Makefile Makefile # Build the agent binary RUN make build-gkm-agent -# Use a minimal Ubuntu base image that supports CGO binaries +# ============================================================================ +# Stage 2: AMD ROCm-specific Runtime +# ============================================================================ + +# Start from Ubuntu base for AMD ROCm support FROM public.ecr.aws/docker/library/ubuntu:24.04 # Copy the binary from the builder COPY --from=builder /workspace/bin/gkm-agent /agent -# Install required runtime libraries for CGO +# Install common runtime libraries (shared with other agent variants) RUN apt-get update && \ apt-get install -y \ ca-certificates \ @@ -58,11 +65,12 @@ RUN apt-get update && \ libseccomp2 && \ apt-get clean +# AMD ROCm version configuration ARG ROCM_VERSION=6.3.1 ARG AMDGPU_VERSION=6.3.60301 ARG OPT_ROCM_VERSION=6.3.1 -# Install ROCm packages for AMD GPU support +# Install AMD ROCm packages (GPU-specific dependencies) RUN wget 
https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ apt install -y ./*.deb && \ apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ diff --git a/Containerfile.gkm-agent-base b/Containerfile.gkm-agent-base new file mode 100644 index 000000000..9bd406c84 --- /dev/null +++ b/Containerfile.gkm-agent-base @@ -0,0 +1,75 @@ +# Common base Containerfile for GKM agents +# This file contains the shared builder and base runtime stages +# GPU-specific Containerfiles currently duplicate these stages with references +# to this file for maintenance purposes. +# +# Future Enhancement: This base image could be built and pushed to Quay to +# improve build efficiency: +# podman build -f Containerfile.gkm-agent-base --target base-runtime \ +# -t quay.io/gkm/agent-runtime-base:latest . +# podman push quay.io/gkm/agent-runtime-base:latest +# +# Then GPU-specific Containerfiles could reference it: +# FROM quay.io/gkm/agent-runtime-base:latest + +# ============================================================================ +# Stage 1: Builder (Common to all agent variants) +# ============================================================================ +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# ============================================================================ +# Stage 2: Base Runtime (Common runtime 
dependencies) +# ============================================================================ +FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base-runtime + +# Install required runtime libraries for CGO and agent operation +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu index 869108e09..a33172481 100644 --- a/Containerfile.gkm-agent-nogpu +++ b/Containerfile.gkm-agent-nogpu @@ -1,4 +1,7 @@ -# Build the agent binary +# ============================================================================ +# Stage 1: Builder (Shared across all agent variants) +# See Containerfile.gkm-agent-base for the common base stages +# ============================================================================ FROM public.ecr.aws/docker/library/golang:1.25 AS builder WORKDIR /workspace @@ -30,13 +33,18 @@ COPY Makefile Makefile # Build the agent binary RUN make build-gkm-agent -# Use minimal Ubuntu base image for no-GPU environments +# ============================================================================ +# Stage 2: No-GPU Runtime (minimal footprint) +# ============================================================================ + +# Use minimal Ubuntu base (no GPU libraries needed) FROM public.ecr.aws/docker/library/ubuntu:24.04 # Copy the binary from the builder COPY --from=builder /workspace/bin/gkm-agent /agent -# Install required runtime libraries for CGO +# Install common runtime libraries (shared with other agent variants) +# No GPU-specific dependencies required for this variant RUN apt-get update && \ apt-get install -y \ ca-certificates \ diff --git a/Containerfile.gkm-agent-nvidia 
b/Containerfile.gkm-agent-nvidia index 28e6b836a..1d06fb06d 100644 --- a/Containerfile.gkm-agent-nvidia +++ b/Containerfile.gkm-agent-nvidia @@ -1,4 +1,7 @@ -# Build the agent binary +# ============================================================================ +# Stage 1: Builder (Shared across all agent variants) +# See Containerfile.gkm-agent-base for the common base stages +# ============================================================================ FROM public.ecr.aws/docker/library/golang:1.25 AS builder WORKDIR /workspace @@ -30,13 +33,17 @@ COPY Makefile Makefile # Build the agent binary RUN make build-gkm-agent -# Use NVIDIA CUDA runtime base image for GPU support +# ============================================================================ +# Stage 2: NVIDIA-specific Runtime +# ============================================================================ + +# Use NVIDIA CUDA runtime base image (includes NVML libraries) FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 # Copy the binary from the builder COPY --from=builder /workspace/bin/gkm-agent /agent -# Install required runtime libraries for CGO +# Install common runtime libraries (shared with other agent variants) RUN apt-get update && \ apt-get install -y \ ca-certificates \ @@ -59,8 +66,8 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# The NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) -# No additional NVIDIA packages needed +# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional GPU-specific packages needed # Run as non-root user USER 65532:65532 diff --git a/Makefile b/Makefile index 1d023c639..8a2e3dc1e 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,7 @@ REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) +AGENT_BASE_IMG ?= $(REPO)/agent-base:$(IMAGE_TAG) AGENT_NVIDIA_IMG ?= 
$(REPO)/agent-nvidia:$(IMAGE_TAG) AGENT_AMD_IMG ?= $(REPO)/agent-amd:$(IMAGE_TAG) AGENT_NOGPU_IMG ?= $(REPO)/agent-nogpu:$(IMAGE_TAG) @@ -222,6 +223,10 @@ build-image-operator: build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . +.PHONY: build-image-agent-base +build-image-agent-base: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target base-runtime -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . + .PHONY: build-image-agent-nvidia build-image-agent-nvidia: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t ${AGENT_NVIDIA_IMG} . @@ -236,9 +241,9 @@ build-image-agent-nogpu: .PHONY: build-image-agents ifeq ($(NO_GPU_BUILD),true) -build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) +build-image-agents: build-image-agent-base build-image-agent-nogpu ## Build base and no-GPU agent only (NO_GPU_BUILD=true) else -build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) +build-image-agents: build-image-agent-base build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (base, NVIDIA, AMD, and no-GPU) endif # If you wish to build the operator image targeting other platforms you can use the --platform flag. @@ -251,6 +256,7 @@ build-images: build-image-operator build-image-agents build-image-gkm-extract ## push-images: ## Push all container images. 
$(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} + $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} ifeq ($(NO_GPU_BUILD),true) $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else @@ -261,10 +267,12 @@ endif .PHONY: push-images-agents ifeq ($(NO_GPU_BUILD),true) -push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) +push-images-agents: ## Push base and no-GPU agent only (NO_GPU_BUILD=true) + $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else push-images-agents: ## Push all agent images + $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} diff --git a/docs/GettingStartedGuide.md b/docs/GettingStartedGuide.md index 196157152..a75ee7104 100644 --- a/docs/GettingStartedGuide.md +++ b/docs/GettingStartedGuide.md @@ -12,13 +12,15 @@ building GKM and description of how to deploy GKM. ### Automated Installation (RHEL 10 / CentOS Stream 10) -For RHEL 10 or CentOS Stream 10 systems, you can install all dependencies (including go, podman, kubectl, and build packages) using: +For RHEL 10 or CentOS Stream 10 systems, you can install all +dependencies (including go, podman, kubectl, and build packages) using: ```sh make install-deps ``` This will: + - Install system development packages (gpgme-devel, libdrm-devel, hwloc-devel) - Install btrfs development headers - Install or upgrade Go to v1.25.0+ if needed @@ -36,9 +38,11 @@ sudo dnf install -y gpgme-devel libdrm-devel libbtrfs btrfs-progs \ btrfs-progs-devel hwloc hwloc-devel ``` -> **Note for RHEL 10**: Some packages may not be available in standard repositories. -> Use `make install-deps` or see [hack/install_deps.sh](../hack/install_deps.sh) for the installation script -> that sources packages from CentOS Stream 10 and Fedora repositories. +> **Note for RHEL 10**: Some packages may not be available in standard +> repositories. 
Use `make install-deps` or see +> [hack/install_deps.sh](../hack/install_deps.sh) for the installation +> script that sources packages from CentOS Stream 10 and Fedora +> repositories. **For Debian/Ubuntu:** diff --git a/examples/namespace/RWO-NVIDIA/README.md b/examples/namespace/RWO-NVIDIA/README.md index c3a3f4644..96f4800ba 100644 --- a/examples/namespace/RWO-NVIDIA/README.md +++ b/examples/namespace/RWO-NVIDIA/README.md @@ -1,6 +1,7 @@ # NVIDIA GPU Examples for GKM (ReadWriteOnce) -This directory contains examples for deploying GKM with NVIDIA GPU support using ReadWriteOnce (RWO) access mode. +This directory contains examples for deploying GKM with NVIDIA GPU support +using ReadWriteOnce (RWO) access mode. ## Prerequisites @@ -18,7 +19,9 @@ Before deploying, verify your storage class: kubectl get sc ``` -Update the `storageClassName` field in [11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage class. +Update the `storageClassName` field in +[11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage +class. ## Deployment @@ -31,16 +34,19 @@ kubectl apply -f examples/namespace/RWO-NVIDIA/ ### Option 2: Deploy Step by Step 1. Create the namespace: + ```bash kubectl apply -f 10-namespace.yaml ``` 2. Create the GKMCache resource: + ```bash kubectl apply -f 11-gkmcache.yaml ``` 3. 
Wait for the PVC to be created and bound: + ```bash kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 -w ``` @@ -52,23 +58,27 @@ kubectl apply -f examples/namespace/RWO-NVIDIA/ ## Verification Check the GKMCache status: + ```bash kubectl get gkmcache -n gkm-test-ns-nvidia-rwo-1 kubectl describe gkmcache vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 ``` Check the PVC: + ```bash kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 ``` Check the extraction job: + ```bash kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 kubectl get pods -n gkm-test-ns-nvidia-rwo-1 ``` Check the test workload: + ```bash # For Pod kubectl get pod gkm-test-nvidia-pod-1 -n gkm-test-ns-nvidia-rwo-1 @@ -80,6 +90,7 @@ kubectl get pods -n gkm-test-ns-nvidia-rwo-1 -l name=gkm-test-nvidia-rwo-ds-1 ``` Verify the cache is mounted: + ```bash kubectl exec -it -n gkm-test-ns-nvidia-rwo-1 gkm-test-nvidia-pod-1 -- ls -la /cache ``` @@ -95,19 +106,23 @@ kubectl describe pvc vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 ``` Common issues: + - Storage class not available or incorrect - No nodes match the node selector -- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a pod using it is scheduled) +- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a + pod using it is scheduled) ### Extraction Job Not Scheduling Check the extraction job: + ```bash kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 kubectl describe job -n gkm-test-ns-nvidia-rwo-1 ``` Check for pod scheduling issues: + ```bash kubectl get events -n gkm-test-ns-nvidia-rwo-1 --sort-by='.lastTimestamp' ``` @@ -117,9 +132,11 @@ kubectl get events -n gkm-test-ns-nvidia-rwo-1 --sort-by='.lastTimestamp' If your cluster doesn't have NFD labels, you can either: 1. Install and configure NFD (recommended) -2. Remove the `affinity` section from the pod/daemonset specs and use a simpler node selector or label your GPU nodes manually +2. 
Remove the `affinity` section from the pod/daemonset specs and use a + simpler node selector or label your GPU nodes manually Example without NFD: + ```yaml nodeSelector: your-gpu-label: "true" # Use whatever label identifies your GPU nodes diff --git a/gkm-codespell.precommit-toml b/gkm-codespell.precommit-toml index 76f856152..e1472179e 100644 --- a/gkm-codespell.precommit-toml +++ b/gkm-codespell.precommit-toml @@ -1,3 +1,3 @@ [tool.codespell] -ignore-words-list = "AfterAll,renderD" +ignore-words-list = "AfterAll,renderD,aCI" skip = './.*,vendor/*,go.sum' From 50c5601bb9b89ceeabbefd1a6d9429b07655ae8e Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:41:03 +0000 Subject: [PATCH 11/25] fix: resolve yamllint errors in NVIDIA example YAMLs - Move inline comment out of matchExpressions list to avoid yamllint warnings - Fix indentation of nodeSelectorTerms and matchExpressions items - Ensure consistent 2-space indentation for YAML list items This resolves the pre-commit yamllint hook failures for: - examples/namespace/RWO-NVIDIA/12-ds.yaml - examples/namespace/RWO-NVIDIA/13-pod.yaml Co-Authored-By: Claude Sonnet 4.5 --- examples/namespace/RWO-NVIDIA/12-ds.yaml | 14 +++++++------- examples/namespace/RWO-NVIDIA/13-pod.yaml | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/namespace/RWO-NVIDIA/12-ds.yaml b/examples/namespace/RWO-NVIDIA/12-ds.yaml index cd06dbf9f..446e94e03 100644 --- a/examples/namespace/RWO-NVIDIA/12-ds.yaml +++ b/examples/namespace/RWO-NVIDIA/12-ds.yaml @@ -22,17 +22,17 @@ spec: effect: NoSchedule # Node affinity to schedule only on NVIDIA GPU nodes + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) - - key: feature.node.kubernetes.io/pci-0300_10de.present - operator: Exists - - 
matchExpressions: - - key: feature.node.kubernetes.io/pci-0302_10de.present - operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: test diff --git a/examples/namespace/RWO-NVIDIA/13-pod.yaml b/examples/namespace/RWO-NVIDIA/13-pod.yaml index 59140b6c3..c74231f3e 100644 --- a/examples/namespace/RWO-NVIDIA/13-pod.yaml +++ b/examples/namespace/RWO-NVIDIA/13-pod.yaml @@ -11,17 +11,17 @@ spec: effect: NoSchedule # Node affinity to schedule only on NVIDIA GPU nodes + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) - - key: feature.node.kubernetes.io/pci-0300_10de.present - operator: Exists - - matchExpressions: - - key: feature.node.kubernetes.io/pci-0302_10de.present - operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: test From 165b7b34aa81aac9eab41f2a3d08a16be501bf15 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:45:25 +0000 Subject: [PATCH 12/25] refactor: restructure RWO examples into organized subdirectories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move examples from flat structure to organized hierarchy: - examples/namespace/RWO-NVIDIA/ → examples/namespace/RWO/CUDA/ - examples/namespace/RWO-ROCM/ → examples/namespace/RWO/ROCM/ This change: - Updates README paths to reflect new directory structure - Includes yamllint fixes (proper indentation and comment placement) - Maintains consistent example organization under RWO/ 
Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- examples/namespace/{RWO-NVIDIA => RWO/CUDA}/10-namespace.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/11-gkmcache.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/12-ds.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/13-pod.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/README.md | 4 ++-- examples/namespace/{RWO-ROCM => RWO/ROCM}/10-namespace.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/11-gkmcache.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/12-ds.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/13-ds.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/14-ds.yaml | 0 .../{RWO-ROCM => RWO/ROCM}/21-gkmcache-cosign-v3.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/22-ds.yaml | 0 12 files changed, 2 insertions(+), 2 deletions(-) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/10-namespace.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/11-gkmcache.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/12-ds.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/13-pod.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/README.md (96%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/10-namespace.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/11-gkmcache.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/12-ds.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/13-ds.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/14-ds.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/21-gkmcache-cosign-v3.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/22-ds.yaml (100%) diff --git a/examples/namespace/RWO-NVIDIA/10-namespace.yaml b/examples/namespace/RWO/CUDA/10-namespace.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/10-namespace.yaml rename to examples/namespace/RWO/CUDA/10-namespace.yaml diff --git 
a/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml b/examples/namespace/RWO/CUDA/11-gkmcache.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/11-gkmcache.yaml rename to examples/namespace/RWO/CUDA/11-gkmcache.yaml diff --git a/examples/namespace/RWO-NVIDIA/12-ds.yaml b/examples/namespace/RWO/CUDA/12-ds.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/12-ds.yaml rename to examples/namespace/RWO/CUDA/12-ds.yaml diff --git a/examples/namespace/RWO-NVIDIA/13-pod.yaml b/examples/namespace/RWO/CUDA/13-pod.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/13-pod.yaml rename to examples/namespace/RWO/CUDA/13-pod.yaml diff --git a/examples/namespace/RWO-NVIDIA/README.md b/examples/namespace/RWO/CUDA/README.md similarity index 96% rename from examples/namespace/RWO-NVIDIA/README.md rename to examples/namespace/RWO/CUDA/README.md index 96f4800ba..16fbe1080 100644 --- a/examples/namespace/RWO-NVIDIA/README.md +++ b/examples/namespace/RWO/CUDA/README.md @@ -28,7 +28,7 @@ class. 
### Option 1: Deploy All Resources ```bash -kubectl apply -f examples/namespace/RWO-NVIDIA/ +kubectl apply -f examples/namespace/RWO/CUDA/ ``` ### Option 2: Deploy Step by Step @@ -145,5 +145,5 @@ nodeSelector: ## Cleanup ```bash -kubectl delete -f examples/namespace/RWO-NVIDIA/ +kubectl delete -f examples/namespace/RWO/CUDA/ ``` diff --git a/examples/namespace/RWO-ROCM/10-namespace.yaml b/examples/namespace/RWO/ROCM/10-namespace.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/10-namespace.yaml rename to examples/namespace/RWO/ROCM/10-namespace.yaml diff --git a/examples/namespace/RWO-ROCM/11-gkmcache.yaml b/examples/namespace/RWO/ROCM/11-gkmcache.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/11-gkmcache.yaml rename to examples/namespace/RWO/ROCM/11-gkmcache.yaml diff --git a/examples/namespace/RWO-ROCM/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/12-ds.yaml rename to examples/namespace/RWO/ROCM/12-ds.yaml diff --git a/examples/namespace/RWO-ROCM/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/13-ds.yaml rename to examples/namespace/RWO/ROCM/13-ds.yaml diff --git a/examples/namespace/RWO-ROCM/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/14-ds.yaml rename to examples/namespace/RWO/ROCM/14-ds.yaml diff --git a/examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml rename to examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml diff --git a/examples/namespace/RWO-ROCM/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/22-ds.yaml rename to examples/namespace/RWO/ROCM/22-ds.yaml From a342885c1d56dfe02660765fba54343ae448b1ae 
Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:52:09 +0000 Subject: [PATCH 13/25] fix: load actual agent images instead of non-existent AGENT_IMG in kind-load-images The kind-load-images target was attempting to load ${AGENT_IMG} which is never built. Updated to load the actual agent images based on NO_GPU_BUILD flag: AGENT_BASE_IMG, AGENT_NOGPU_IMG (always), and AGENT_NVIDIA_IMG/AGENT_AMD_IMG (when NO_GPU_BUILD=false). Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8a2e3dc1e..87848a4de 100644 --- a/Makefile +++ b/Makefile @@ -603,8 +603,19 @@ setup-kind: kind-gpu-sim-script kind-load-images: kind-gpu-sim-script get-example-images @echo "Loading operator image ${OPERATOR_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${OPERATOR_IMG} --cluster-name=$(KIND_CLUSTER_NAME) - @echo "Loading agent image ${AGENT_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" - cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent base image ${AGENT_BASE_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_BASE_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +ifeq ($(NO_GPU_BUILD),true) + @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +else + @echo "Loading agent nvidia image ${AGENT_NVIDIA_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NVIDIA_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent amd image ${AGENT_AMD_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load 
--image-name=${AGENT_AMD_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +endif @echo "Loading gkm-extract image ${EXTRACT_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${EXTRACT_IMG} --cluster-name=$(KIND_CLUSTER_NAME) @echo "Images loaded successfully into Kind cluster: $(KIND_CLUSTER_NAME)" From bf397b07a23b3262778dcafd8b5c3a518a092b45 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 14:45:32 +0000 Subject: [PATCH 14/25] kind: fix kyverno deployment Signed-off-by: Maryam Tahhan --- Makefile | 6 +++++- config/agent/kustomization.yaml | 12 ++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 87848a4de..76ef5da77 100644 --- a/Makefile +++ b/Makefile @@ -557,7 +557,11 @@ deploy-kyverno-production: helm ## Deploy Kyverno for production clusters (no Ki @echo "Kyverno deployed successfully." .PHONY: deploy-kyverno-with-policies -deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies +ifeq ($(NO_GPU),true) +deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses NO_GPU values for Kind) +else +deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) +endif @echo "Restarting Kyverno to discover GKM CRDs..." 
@$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index 47a1d4be1..b5f7f6699 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -4,17 +4,17 @@ kind: Kustomization # Deploy GPU-specific agents based on node hardware # Requires Node Feature Discovery (NFD) to label nodes resources: -- gkm-agent-nvidia.yaml # NVIDIA GPU nodes -- gkm-agent-amd.yaml # AMD GPU nodes -- gkm-agent-nogpu.yaml # Nodes without GPUs +- gkm-agent-nvidia.yaml +- gkm-agent-amd.yaml +- gkm-agent-nogpu.yaml images: -- name: quay.io/gkm/agent-nvidia - newName: quay.io/gkm/agent-nvidia - newTag: latest - name: quay.io/gkm/agent-amd newName: quay.io/gkm/agent-amd newTag: latest - name: quay.io/gkm/agent-nogpu newName: quay.io/gkm/agent-nogpu newTag: latest +- name: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/agent-nvidia + newTag: latest From 502ecb84db81d6105f7046340c085f1490b022fd Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 14:53:59 +0000 Subject: [PATCH 15/25] makefile: cleanup kyverno targets Signed-off-by: Maryam Tahhan --- Makefile | 76 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index 76ef5da77..256cbe631 100644 --- a/Makefile +++ b/Makefile @@ -514,9 +514,15 @@ undeploy-cert-manager: delete-webhook-secret-file ##@ Kyverno KYVERNO_VERSION ?= latest +KYVERNO_NAMESPACE ?= kyverno +KYVERNO_REPO ?= https://kyverno.github.io/kyverno/ HELM_VERSION ?= v3.16.3 HELM ?= $(LOCALBIN)/helm +# Common Kyverno helm flags +KYVERNO_HELM_FLAGS = --namespace $(KYVERNO_NAMESPACE) --create-namespace --repo $(KYVERNO_REPO) kyverno --wait +KYVERNO_KIND_CONTEXT = --kube-context kind-$(KIND_CLUSTER_NAME) + .PHONY: helm helm: $(HELM) ## 
Download helm locally if necessary. $(HELM): $(LOCALBIN) @@ -525,37 +531,33 @@ $(HELM): $(LOCALBIN) curl -sSL https://get.helm.sh/helm-$(HELM_VERSION)-$(GOOS)-$(GOARCH).tar.gz | tar xz -C $(LOCALBIN) --strip-components=1 $(GOOS)-$(GOARCH)/helm ; \ } -.PHONY: deploy-kyverno -deploy-kyverno: helm ## Deploy Kyverno with optional GPU tolerations for Kind cluster - @echo "Installing Kyverno to cluster $(KIND_CLUSTER_NAME)..." +# Internal target for deploying Kyverno with configurable context +.PHONY: _deploy-kyverno-base +_deploy-kyverno-base: helm + @echo "Installing Kyverno..." ifeq ($(NO_GPU),true) @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (NO_GPU=true)..." - $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values-no-gpu.yaml \ - --wait + $(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ + --values config/kyverno/values-no-gpu.yaml else - @echo "Using default Kyverno configuration for production GPU environments..." - $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values.yaml \ - --wait + @echo "Using default Kyverno configuration..." + $(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ + --values config/kyverno/values.yaml endif - @echo "Kyverno deployed successfully to $(KIND_CLUSTER_NAME)." - -.PHONY: deploy-kyverno-production -deploy-kyverno-production: helm ## Deploy Kyverno for production clusters (no Kind context) - @echo "Installing Kyverno..." 
- $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values.yaml \ - --wait +ifdef KYVERNO_WAIT @echo "Waiting for Kyverno to be ready..." - @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n $(KYVERNO_NAMESPACE) deployment/kyverno-admission-controller || true +endif @echo "Kyverno deployed successfully." +.PHONY: deploy-kyverno +deploy-kyverno: ## Deploy Kyverno for Kind cluster + @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="$(KYVERNO_KIND_CONTEXT)" + +.PHONY: deploy-kyverno-production +deploy-kyverno-production: ## Deploy Kyverno for production clusters + @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="" KYVERNO_WAIT=true + .PHONY: deploy-kyverno-with-policies ifeq ($(NO_GPU),true) deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses NO_GPU values for Kind) @@ -563,8 +565,8 @@ else deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) endif @echo "Restarting Kyverno to discover GKM CRDs..." - @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno - @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n $(KYVERNO_NAMESPACE) + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n $(KYVERNO_NAMESPACE) deployment/kyverno-admission-controller || true @echo "Kyverno and policies deployed successfully." 
.PHONY: deploy-kyverno-policies @@ -579,21 +581,21 @@ undeploy-kyverno-policies: kustomize ## Undeploy Kyverno ClusterPolicies $(KUSTOMIZE) build config/kyverno/policies | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - @echo "Kyverno policies undeployed." +# Internal target for undeploying Kyverno with configurable context +.PHONY: _undeploy-kyverno-base +_undeploy-kyverno-base: + @echo "Uninstalling Kyverno..." + $(HELM) uninstall kyverno --namespace $(KYVERNO_NAMESPACE) $(KYVERNO_CONTEXT) --ignore-not-found || true + $(KUBECTL) delete namespace $(KYVERNO_NAMESPACE) --ignore-not-found=$(ignore-not-found) + @echo "Kyverno undeployed." + .PHONY: undeploy-kyverno -undeploy-kyverno: ## Undeploy Kyverno - @echo "Uninstalling Kyverno from cluster $(KIND_CLUSTER_NAME)..." - $(HELM) uninstall kyverno --namespace kyverno \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --ignore-not-found || true - $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) - @echo "Kyverno undeployed from $(KIND_CLUSTER_NAME)." +undeploy-kyverno: ## Undeploy Kyverno from Kind cluster + @$(MAKE) _undeploy-kyverno-base KYVERNO_CONTEXT="$(KYVERNO_KIND_CONTEXT)" .PHONY: undeploy-kyverno-production undeploy-kyverno-production: ## Undeploy Kyverno from production cluster - @echo "Uninstalling Kyverno..." - $(HELM) uninstall kyverno --namespace kyverno --ignore-not-found || true - $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) - @echo "Kyverno undeployed." + @$(MAKE) _undeploy-kyverno-base KYVERNO_CONTEXT="" ##@ Kind Cluster Management From 9471b38e6cbae0b7f0e91582c64617dca8f075df Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 15:44:49 +0000 Subject: [PATCH 16/25] fix: resolve Kind deployment failures on GPU-tainted nodes Fixes Kyverno and NFD component scheduling issues in Kind clusters with GPU taints by adding proper tolerations and removing duplicate deployments. 
Changes: - Use Kind-specific Kyverno values when NO_GPU=true in deploy target - Remove duplicate Kyverno deployment from run-on-kind target - Add GPU tolerations for Kyverno hooks/migration jobs - Add GPU tolerations for NFD garbage collector and workers Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 11 ----------- config/kyverno/values-no-gpu.yaml | 10 ++++++++++ config/nfd/kustomization.yaml | 8 ++++---- config/nfd/patch-nfd-gc.yaml | 13 +++++++++++++ config/nfd/patch-nfd-workers.yaml | 13 +++++++++++++ 5 files changed, 40 insertions(+), 15 deletions(-) create mode 100644 config/nfd/patch-nfd-gc.yaml create mode 100644 config/nfd/patch-nfd-workers.yaml diff --git a/Makefile b/Makefile index 256cbe631..a7c63947c 100644 --- a/Makefile +++ b/Makefile @@ -633,17 +633,6 @@ tmp-cleanup: .PHONY: run-on-kind run-on-kind: destroy-kind setup-kind deploy-on-kind ## Setup Kind cluster, load images, and deploy -ifeq ($(KYVERNO_ENABLED),true) - @echo "Deploying Kyverno after GKM CRDs (KYVERNO_ENABLED=true)..." - $(MAKE) deploy-kyverno NO_GPU=true - @echo "Waiting for Kyverno to be ready..." - $(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true - @echo "Deploying Kyverno policies..." - $(MAKE) deploy-kyverno-policies - @echo "Restarting Kyverno to discover GKM CRDs..." - $(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno - $(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller -endif @echo "Cluster created, images loaded, and agent deployed on Kind GPU cluster." 
.PHONY: deploy-on-kind diff --git a/config/kyverno/values-no-gpu.yaml b/config/kyverno/values-no-gpu.yaml index 3f086fbd7..28606d392 100644 --- a/config/kyverno/values-no-gpu.yaml +++ b/config/kyverno/values-no-gpu.yaml @@ -33,3 +33,13 @@ reportsController: operator: Equal value: "true" effect: NoSchedule + +# Jobs (e.g., migration resources) also need tolerations +hooks: + nodeSelector: + hardware-type: gpu + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml index 684f558c6..7490a2ea6 100644 --- a/config/nfd/kustomization.yaml +++ b/config/nfd/kustomization.yaml @@ -6,7 +6,7 @@ kind: Kustomization resources: - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.2 -# Optional: Add custom NFD configuration -# Uncomment if you need to customize NFD behavior -# patchesStrategicMerge: -# - nfd-worker-conf.yaml +# Patches for GPU-tainted nodes (Kind cluster) +patchesStrategicMerge: + - patch-nfd-gc.yaml + - patch-nfd-workers.yaml diff --git a/config/nfd/patch-nfd-gc.yaml b/config/nfd/patch-nfd-gc.yaml new file mode 100644 index 000000000..b717c07e2 --- /dev/null +++ b/config/nfd/patch-nfd-gc.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nfd-gc + namespace: node-feature-discovery +spec: + template: + spec: + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/nfd/patch-nfd-workers.yaml b/config/nfd/patch-nfd-workers.yaml new file mode 100644 index 000000000..c8fa653cd --- /dev/null +++ b/config/nfd/patch-nfd-workers.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nfd-worker + namespace: node-feature-discovery +spec: + template: + spec: + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule From 1eb484647831f0a9c6a7d860e403c24031e04bd9 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: 
Mon, 16 Mar 2026 17:49:03 +0000 Subject: [PATCH 17/25] fix: skip NFD deployment for Kind clusters and use device plugin labels NFD is unnecessary in Kind simulated GPU environments. Instead, patch agent daemonsets to use GPU device plugin labels (rocm.amd.com/gpu.present, nvidia.com/gpu.present) for node affinity rather than NFD's PCI device labels. Changes: - Skip NFD deployment when NO_GPU=true (Kind clusters) - Skip NFD undeployment when NO_GPU=true - Add Kind-specific agent patches using device plugin labels - Patch gkm-agent-amd to use rocm.amd.com/gpu.present label - Patch gkm-agent-nvidia to use nvidia.com/gpu.present label - Patch gkm-agent-nogpu to exclude nodes with GPU labels Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 6 ++++++ config/kind-gpu/agent-amd-patch.yaml | 17 +++++++++++++++++ config/kind-gpu/agent-nogpu-patch.yaml | 19 +++++++++++++++++++ config/kind-gpu/agent-nvidia-patch.yaml | 17 +++++++++++++++++ config/kind-gpu/kustomization.yaml | 3 +++ 5 files changed, 62 insertions(+) create mode 100644 config/kind-gpu/agent-amd-patch.yaml create mode 100644 config/kind-gpu/agent-nogpu-patch.yaml create mode 100644 config/kind-gpu/agent-nvidia-patch.yaml diff --git a/Makefile b/Makefile index a7c63947c..2e78740f4 100644 --- a/Makefile +++ b/Makefile @@ -385,7 +385,11 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy +ifeq ($(NO_GPU),true) +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to Kind cluster (skips NFD) +else deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +endif ifeq ($(KYVERNO_ENABLED),true) @echo "Deploying Kyverno (KYVERNO_ENABLED=true)..." 
$(MAKE) deploy-kyverno-with-policies @@ -409,8 +413,10 @@ ifeq ($(KYVERNO_ENABLED),true) -$(MAKE) undeploy-kyverno-policies -$(MAKE) undeploy-kyverno-production endif +ifneq ($(NO_GPU),true) @echo "Undeploying NFD..." -$(MAKE) undeploy-nfd +endif @echo "Undeployment from $(DEPLOY_PATH) completed." .PHONY: undeploy-force diff --git a/config/kind-gpu/agent-amd-patch.yaml b/config/kind-gpu/agent-amd-patch.yaml new file mode 100644 index 000000000..43ee255e8 --- /dev/null +++ b/config/kind-gpu/agent-amd-patch.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-amd + namespace: gkm-system +spec: + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: rocm.amd.com/gpu.present + operator: Exists + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/config/kind-gpu/agent-nogpu-patch.yaml b/config/kind-gpu/agent-nogpu-patch.yaml new file mode 100644 index 000000000..47f407916 --- /dev/null +++ b/config/kind-gpu/agent-nogpu-patch.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nogpu + namespace: gkm-system +spec: + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: rocm.amd.com/gpu.present + operator: DoesNotExist + - key: nvidia.com/gpu.present + operator: DoesNotExist + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/config/kind-gpu/agent-nvidia-patch.yaml b/config/kind-gpu/agent-nvidia-patch.yaml new file mode 100644 index 000000000..4f95a5f94 --- /dev/null +++ b/config/kind-gpu/agent-nvidia-patch.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nvidia + namespace: gkm-system +spec: + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - 
matchExpressions: + - key: nvidia.com/gpu.present + operator: Exists + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index d81e686c7..e4effe2f8 100644 --- a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -10,3 +10,6 @@ patches: kind: DaemonSet name: gkm-agent path: agent-patch.yaml + - path: agent-amd-patch.yaml + - path: agent-nvidia-patch.yaml + - path: agent-nogpu-patch.yaml From 07a00a028c2a50a6d22f1c07e157b711f065a224 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:13:08 +0000 Subject: [PATCH 18/25] fix: separate SKIP_NFD and NO_GPU flags, simulate NFD labels in Kind Introduces SKIP_NFD flag to control NFD deployment separately from NO_GPU mode. For Kind clusters, we skip NFD deployment but simulate it by manually adding PCI device labels that NFD would normally create. Changes: - Add SKIP_NFD flag (default: false) to control NFD deployment - Use SKIP_NFD instead of NO_GPU for controlling NFD deployment/undeploy - Auto-label Kind worker nodes with NFD PCI device labels (nvidia/rocm) - Keep NO_GPU=true for Kind to use no-GPU agent mode - Remove device plugin label patches (revert to NFD PCI labels) - Update deploy-on-kind to pass both SKIP_NFD=true and NO_GPU=true Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 33 ++++++++++++++++--------- config/kind-gpu/agent-amd-patch.yaml | 17 ------------- config/kind-gpu/agent-nogpu-patch.yaml | 19 -------------- config/kind-gpu/agent-nvidia-patch.yaml | 17 ------------- config/kind-gpu/kustomization.yaml | 3 --- 5 files changed, 22 insertions(+), 67 deletions(-) delete mode 100644 config/kind-gpu/agent-amd-patch.yaml delete mode 100644 config/kind-gpu/agent-nogpu-patch.yaml delete mode 100644 config/kind-gpu/agent-nvidia-patch.yaml diff --git a/Makefile b/Makefile index 2e78740f4..9b0b4554f 100644 --- a/Makefile +++ b/Makefile @@ 
-19,6 +19,9 @@ CONTAINER_FLAGS ?= --build-arg TARGETARCH=$(ARCH) # NO_GPU flag for building without GPU support NO_GPU_BUILD ?= false +# SKIP_NFD flag for skipping NFD deployment (e.g., Kind clusters) +SKIP_NFD ?= false + # KYVERNO_ENABLED flag for enabling/disabling Kyverno verification (runtime only) KYVERNO_ENABLED ?= true @@ -385,8 +388,8 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy -ifeq ($(NO_GPU),true) -deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to Kind cluster (skips NFD) +ifeq ($(SKIP_NFD),true) +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent (skips NFD for Kind) else deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config endif @@ -413,7 +416,7 @@ ifeq ($(KYVERNO_ENABLED),true) -$(MAKE) undeploy-kyverno-policies -$(MAKE) undeploy-kyverno-production endif -ifneq ($(NO_GPU),true) +ifneq ($(SKIP_NFD),true) @echo "Undeploying NFD..." -$(MAKE) undeploy-nfd endif @@ -541,8 +544,8 @@ $(HELM): $(LOCALBIN) .PHONY: _deploy-kyverno-base _deploy-kyverno-base: helm @echo "Installing Kyverno..." -ifeq ($(NO_GPU),true) - @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (NO_GPU=true)..." +ifeq ($(SKIP_NFD),true) + @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (SKIP_NFD=true for Kind)..." 
$(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ --values config/kyverno/values-no-gpu.yaml else @@ -565,8 +568,8 @@ deploy-kyverno-production: ## Deploy Kyverno for production clusters @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="" KYVERNO_WAIT=true .PHONY: deploy-kyverno-with-policies -ifeq ($(NO_GPU),true) -deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses NO_GPU values for Kind) +ifeq ($(SKIP_NFD),true) +deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses Kind values with GPU tolerations) else deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) endif @@ -647,17 +650,25 @@ deploy-on-kind: kind-load-images tmp-cleanup $(KUBECTL) label node kind-gpu-sim-worker gkm-test-node=true --overwrite @echo "Add label gkm-test-node=false to node kind-gpu-sim-worker2." $(KUBECTL) label node kind-gpu-sim-worker2 gkm-test-node=false --overwrite - ## NOTE: config/kind-gpu is an overlay of config/default - $(MAKE) deploy DEPLOY_PATH=config/kind-gpu NO_GPU=true + @echo "Add NFD PCI device labels for $(GPU_TYPE) GPUs to worker nodes..." 
+ifeq ($(GPU_TYPE),nvidia) + $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite + $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite +else ifeq ($(GPU_TYPE),rocm) + $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite + $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite +endif + ## NOTE: config/kind-gpu is an overlay of config/kind-gpu + $(MAKE) deploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true NO_GPU=true .PHONY: redeploy-on-kind redeploy-on-kind: ## Redeploy controller and agent to Kind GPU cluster after run-on-kind and undeploy-on-kind have been called. Skips some onetime steps in deploy. - $(MAKE) redeploy DEPLOY_PATH=config/kind-gpu NO_GPU=true + $(MAKE) redeploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true @echo "Deployment to $(DEPLOY_PATH) completed." .PHONY: undeploy-on-kind undeploy-on-kind: ## Undeploy operator and agent from the Kind GPU cluster. - $(MAKE) undeploy FORCE=$(FORCE) DEPLOY_PATH=config/kind-gpu ignore-not-found=$(ignore-not-found) + $(MAKE) undeploy FORCE=$(FORCE) DEPLOY_PATH=config/kind-gpu SKIP_NFD=true ignore-not-found=$(ignore-not-found) @echo "Undeployment from Kind GPU cluster $(KIND_CLUSTER_NAME) completed." 
.PHONY: undeploy-on-kind-force diff --git a/config/kind-gpu/agent-amd-patch.yaml b/config/kind-gpu/agent-amd-patch.yaml deleted file mode 100644 index 43ee255e8..000000000 --- a/config/kind-gpu/agent-amd-patch.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gkm-agent-amd - namespace: gkm-system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: rocm.amd.com/gpu.present - operator: Exists - - key: node-role.kubernetes.io/control-plane - operator: DoesNotExist diff --git a/config/kind-gpu/agent-nogpu-patch.yaml b/config/kind-gpu/agent-nogpu-patch.yaml deleted file mode 100644 index 47f407916..000000000 --- a/config/kind-gpu/agent-nogpu-patch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gkm-agent-nogpu - namespace: gkm-system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: rocm.amd.com/gpu.present - operator: DoesNotExist - - key: nvidia.com/gpu.present - operator: DoesNotExist - - key: node-role.kubernetes.io/control-plane - operator: DoesNotExist diff --git a/config/kind-gpu/agent-nvidia-patch.yaml b/config/kind-gpu/agent-nvidia-patch.yaml deleted file mode 100644 index 4f95a5f94..000000000 --- a/config/kind-gpu/agent-nvidia-patch.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gkm-agent-nvidia - namespace: gkm-system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu.present - operator: Exists - - key: node-role.kubernetes.io/control-plane - operator: DoesNotExist diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index e4effe2f8..d81e686c7 100644 --- 
a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -10,6 +10,3 @@ patches: kind: DaemonSet name: gkm-agent path: agent-patch.yaml - - path: agent-amd-patch.yaml - - path: agent-nvidia-patch.yaml - - path: agent-nogpu-patch.yaml From 6ad2c1622d07764042fd5f8a00b62ab3a109b84f Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:41:39 +0000 Subject: [PATCH 19/25] fix: remove node affinity from nogpu agent for Kind clusters For Kind GPU simulation with NO_GPU=true, remove NFD PCI label requirements by removing node affinity from the nogpu agent. This allows the agent to schedule on all worker nodes without needing NFD labels. Changes: - Remove NFD PCI label addition from deploy-on-kind target - Add Kind-specific patch to remove node affinity from nogpu agent - Fix agent-patch.yaml to target all three agent daemonsets (amd, nvidia, nogpu) - NoGPU agents now schedule successfully in Kind clusters Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 10 +--------- .../kind-gpu/agent-remove-affinity-patch.yaml | 10 ++++++++++ config/kind-gpu/kustomization.yaml | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 10 deletions(-) create mode 100644 config/kind-gpu/agent-remove-affinity-patch.yaml diff --git a/Makefile b/Makefile index 9b0b4554f..4d845fc42 100644 --- a/Makefile +++ b/Makefile @@ -650,15 +650,7 @@ deploy-on-kind: kind-load-images tmp-cleanup $(KUBECTL) label node kind-gpu-sim-worker gkm-test-node=true --overwrite @echo "Add label gkm-test-node=false to node kind-gpu-sim-worker2." $(KUBECTL) label node kind-gpu-sim-worker2 gkm-test-node=false --overwrite - @echo "Add NFD PCI device labels for $(GPU_TYPE) GPUs to worker nodes..." 
-ifeq ($(GPU_TYPE),nvidia) - $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite - $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite -else ifeq ($(GPU_TYPE),rocm) - $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite - $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite -endif - ## NOTE: config/kind-gpu is an overlay of config/kind-gpu + ## NOTE: config/kind-gpu is an overlay of config/default $(MAKE) deploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true NO_GPU=true .PHONY: redeploy-on-kind diff --git a/config/kind-gpu/agent-remove-affinity-patch.yaml b/config/kind-gpu/agent-remove-affinity-patch.yaml new file mode 100644 index 000000000..734147088 --- /dev/null +++ b/config/kind-gpu/agent-remove-affinity-patch.yaml @@ -0,0 +1,10 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nogpu + namespace: gkm-system +spec: + template: + spec: + # Remove node affinity for Kind - schedule on all worker nodes + affinity: null diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index d81e686c7..caaca0ba1 100644 --- a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -4,9 +4,24 @@ resources: apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization patches: + # Add GPU tolerations and nodeSelector to all agents - target: group: apps version: v1 kind: DaemonSet - name: gkm-agent + name: gkm-agent-amd path: agent-patch.yaml + - target: + group: apps + version: v1 + kind: DaemonSet + name: gkm-agent-nvidia + path: agent-patch.yaml + - target: + group: apps + version: v1 + kind: DaemonSet + name: gkm-agent-nogpu + path: agent-patch.yaml + # Remove node affinity for nogpu agent in Kind (no NFD labels) + - path: agent-remove-affinity-patch.yaml From 
0ca6f4e62bb1d3a403d6292faaef49b92bbf05dd Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:56:13 +0000 Subject: [PATCH 20/25] fix: standardize namespace and cache naming in ROCM and CUDA examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make namespace and cache names consistent across ROCM and CUDA examples: - ROCM namespace: gkm-test-ns-rwo-1 → gkm-test-ns-rocm-rwo-1 - ROCM cache: vector-add-cache-rocm-v2-rwo → vector-add-cache-rocm-rwo - ROCM cache v3: vector-add-cache-rocm-v3-rwo → vector-add-cache-rocm-rwo-v3 - CUDA namespace: gkm-test-ns-nvidia-rwo-1 → gkm-test-ns-cuda-rwo-1 - CUDA workloads: gkm-test-nvidia-* → gkm-test-cuda-* Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- examples/namespace/RWO/CUDA/10-namespace.yaml | 2 +- examples/namespace/RWO/CUDA/11-gkmcache.yaml | 2 +- examples/namespace/RWO/CUDA/12-ds.yaml | 8 ++++---- examples/namespace/RWO/CUDA/13-pod.yaml | 4 ++-- examples/namespace/RWO/ROCM/10-namespace.yaml | 2 +- examples/namespace/RWO/ROCM/11-gkmcache.yaml | 4 ++-- examples/namespace/RWO/ROCM/12-ds.yaml | 4 ++-- examples/namespace/RWO/ROCM/13-ds.yaml | 4 ++-- examples/namespace/RWO/ROCM/14-ds.yaml | 4 ++-- examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml | 4 ++-- examples/namespace/RWO/ROCM/22-ds.yaml | 4 ++-- 11 files changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/namespace/RWO/CUDA/10-namespace.yaml b/examples/namespace/RWO/CUDA/10-namespace.yaml index aec06b330..dd97a60f3 100644 --- a/examples/namespace/RWO/CUDA/10-namespace.yaml +++ b/examples/namespace/RWO/CUDA/10-namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: gkm-test-ns-nvidia-rwo-1 + name: gkm-test-ns-cuda-rwo-1 diff --git a/examples/namespace/RWO/CUDA/11-gkmcache.yaml b/examples/namespace/RWO/CUDA/11-gkmcache.yaml index 54cb9729f..d16e27b3f 100644 --- a/examples/namespace/RWO/CUDA/11-gkmcache.yaml +++ 
b/examples/namespace/RWO/CUDA/11-gkmcache.yaml @@ -3,7 +3,7 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: name: vector-add-cache-cuda-rwo - namespace: gkm-test-ns-nvidia-rwo-1 + namespace: gkm-test-ns-cuda-rwo-1 labels: gkm.io/signature-format: cosign-v2 spec: diff --git a/examples/namespace/RWO/CUDA/12-ds.yaml b/examples/namespace/RWO/CUDA/12-ds.yaml index 446e94e03..f4e38910a 100644 --- a/examples/namespace/RWO/CUDA/12-ds.yaml +++ b/examples/namespace/RWO/CUDA/12-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-nvidia-rwo-ds-1 - namespace: gkm-test-ns-nvidia-rwo-1 + name: gkm-test-cuda-rwo-ds-1 + namespace: gkm-test-ns-cuda-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-nvidia-rwo-ds-1 + name: gkm-test-cuda-rwo-ds-1 template: metadata: labels: - name: gkm-test-nvidia-rwo-ds-1 + name: gkm-test-cuda-rwo-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/CUDA/13-pod.yaml b/examples/namespace/RWO/CUDA/13-pod.yaml index c74231f3e..fcb0bbc7d 100644 --- a/examples/namespace/RWO/CUDA/13-pod.yaml +++ b/examples/namespace/RWO/CUDA/13-pod.yaml @@ -2,8 +2,8 @@ kind: Pod apiVersion: v1 metadata: - name: gkm-test-nvidia-pod-1 - namespace: gkm-test-ns-nvidia-rwo-1 + name: gkm-test-cuda-pod-1 + namespace: gkm-test-ns-cuda-rwo-1 spec: tolerations: - key: nvidia.com/gpu diff --git a/examples/namespace/RWO/ROCM/10-namespace.yaml b/examples/namespace/RWO/ROCM/10-namespace.yaml index bc47b15b7..b91919dd6 100644 --- a/examples/namespace/RWO/ROCM/10-namespace.yaml +++ b/examples/namespace/RWO/ROCM/10-namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: gkm-test-ns-rwo-1 + name: gkm-test-ns-rocm-rwo-1 diff --git a/examples/namespace/RWO/ROCM/11-gkmcache.yaml b/examples/namespace/RWO/ROCM/11-gkmcache.yaml index eb81bd8a5..5fb6e2c34 100644 --- a/examples/namespace/RWO/ROCM/11-gkmcache.yaml +++ 
b/examples/namespace/RWO/ROCM/11-gkmcache.yaml @@ -2,8 +2,8 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v2-rwo - namespace: gkm-test-ns-rwo-1 + name: vector-add-cache-rocm-rwo + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/signature-format: cosign-v2 spec: diff --git a/examples/namespace/RWO/ROCM/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml index 738c8bd61..eb992679a 100644 --- a/examples/namespace/RWO/ROCM/12-ds.yaml +++ b/examples/namespace/RWO/ROCM/12-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-ds-1 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: @@ -50,4 +50,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/ROCM/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml index 937e745e1..38acde56c 100644 --- a/examples/namespace/RWO/ROCM/13-ds.yaml +++ b/examples/namespace/RWO/ROCM/13-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-ds-2 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvc-mutation: "true" spec: @@ -51,4 +51,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/ROCM/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml index c6bf50212..64d1a9c78 100644 --- a/examples/namespace/RWO/ROCM/14-ds.yaml +++ b/examples/namespace/RWO/ROCM/14-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-ds-3 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: @@ -51,4 +51,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: 
vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml index 9a091eaf3..6bafc7b42 100644 --- a/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml +++ b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml @@ -2,8 +2,8 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v3-rwo - namespace: gkm-test-ns-rwo-1 + name: vector-add-cache-rocm-rwo-v3 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/signature-format: cosign-v3 spec: diff --git a/examples/namespace/RWO/ROCM/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml index c682f8a2c..414ba05bf 100644 --- a/examples/namespace/RWO/ROCM/22-ds.yaml +++ b/examples/namespace/RWO/ROCM/22-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-v3-ds-1 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: @@ -50,4 +50,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v3-rwo + claimName: vector-add-cache-rocm-rwo-v3 From 4c4d525d17e25526b774a8a148c46d4871f010d4 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:57:10 +0000 Subject: [PATCH 21/25] fix: update ROCM daemonset names to match namespace pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update ROCM workload names to be consistent with namespace naming: - gkm-test-ns-rwo-ds-* → gkm-test-rocm-rwo-ds-* - gkm-test-ns-rwo-v3-ds-* → gkm-test-rocm-rwo-v3-ds-* Now matches CUDA pattern: gkm-test-{vendor}-rwo-* Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- examples/namespace/RWO/ROCM/12-ds.yaml | 6 +++--- examples/namespace/RWO/ROCM/13-ds.yaml | 6 +++--- examples/namespace/RWO/ROCM/14-ds.yaml | 6 +++--- examples/namespace/RWO/ROCM/22-ds.yaml | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git 
a/examples/namespace/RWO/ROCM/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml index eb992679a..f12550f3d 100644 --- a/examples/namespace/RWO/ROCM/12-ds.yaml +++ b/examples/namespace/RWO/ROCM/12-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 template: metadata: labels: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/ROCM/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml index 38acde56c..bde833fc3 100644 --- a/examples/namespace/RWO/ROCM/13-ds.yaml +++ b/examples/namespace/RWO/ROCM/13-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvc-mutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 template: metadata: labels: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/ROCM/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml index 64d1a9c78..09d1842fd 100644 --- a/examples/namespace/RWO/ROCM/14-ds.yaml +++ b/examples/namespace/RWO/ROCM/14-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 template: metadata: labels: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/ROCM/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml index 
414ba05bf..47ef6d515 100644 --- a/examples/namespace/RWO/ROCM/22-ds.yaml +++ b/examples/namespace/RWO/ROCM/22-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 template: metadata: labels: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: From c67564cc477df432efb127a5e6fae70252c28619 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 19:28:19 +0000 Subject: [PATCH 22/25] images: add gkm prefix to image names Signed-off-by: Maryam Tahhan --- .github/workflows/image-build.yml | 10 +++++----- Makefile | 12 ++++++------ config/agent/gkm-agent-amd.yaml | 2 +- config/agent/gkm-agent-nogpu.yaml | 2 +- config/agent/gkm-agent-nvidia.yaml | 2 +- config/agent/kustomization.yaml | 15 ++++++++++++--- config/configMap/configMap.yaml | 2 +- config/configMap/kustomization.yaml | 2 +- config/operator/kustomization.yaml | 7 +++++-- config/operator/operator.yaml | 2 +- 10 files changed, 34 insertions(+), 22 deletions(-) diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 710c17ef2..4a4cb15c4 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -32,7 +32,7 @@ jobs: image: - registry: quay.io repository: gkm - image: operator + image: gkm-operator dockerfile: ./Containerfile.gkm-operator context: . tags: | @@ -45,7 +45,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-base + image: gkm-agent-base dockerfile: ./Containerfile.gkm-agent-base context: . target: base-runtime @@ -59,7 +59,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-nvidia + image: gkm-agent-nvidia dockerfile: ./Containerfile.gkm-agent-nvidia context: . 
tags: | @@ -72,7 +72,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-amd + image: gkm-agent-amd dockerfile: ./Containerfile.gkm-agent-amd context: . tags: | @@ -85,7 +85,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-nogpu + image: gkm-agent-nogpu dockerfile: ./Containerfile.gkm-agent-nogpu context: . tags: | diff --git a/Makefile b/Makefile index 4d845fc42..db73b896b 100644 --- a/Makefile +++ b/Makefile @@ -77,13 +77,13 @@ OPERATOR_SDK_VERSION ?= v1.39.2 QUAY_USER ?= gkm IMAGE_TAG ?= latest REPO ?= quay.io/$(QUAY_USER) -OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) -AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) +OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) +AGENT_IMG ?=$(REPO)/gkm-agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) -AGENT_BASE_IMG ?= $(REPO)/agent-base:$(IMAGE_TAG) -AGENT_NVIDIA_IMG ?= $(REPO)/agent-nvidia:$(IMAGE_TAG) -AGENT_AMD_IMG ?= $(REPO)/agent-amd:$(IMAGE_TAG) -AGENT_NOGPU_IMG ?= $(REPO)/agent-nogpu:$(IMAGE_TAG) +AGENT_BASE_IMG ?= $(REPO)/gkm-agent-base:$(IMAGE_TAG) +AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) +AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) +AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. 
ENVTEST_K8S_VERSION = 1.31.0 diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml index 6de806096..3187c3184 100644 --- a/config/agent/gkm-agent-amd.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -36,7 +36,7 @@ spec: operator: Exists containers: - name: gkm-agent - image: quay.io/gkm/agent-amd:latest + image: quay.io/gkm/gkm-agent-amd:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml index 8b7715104..66e3121b4 100644 --- a/config/agent/gkm-agent-nogpu.yaml +++ b/config/agent/gkm-agent-nogpu.yaml @@ -39,7 +39,7 @@ spec: operator: DoesNotExist containers: - name: gkm-agent - image: quay.io/gkm/agent-nogpu:latest + image: quay.io/gkm/gkm-agent-nogpu:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml index 6cad7bfa5..ceb1f63ee 100644 --- a/config/agent/gkm-agent-nvidia.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -33,7 +33,7 @@ spec: operator: Exists containers: - name: gkm-agent - image: quay.io/gkm/agent-nvidia:latest + image: quay.io/gkm/gkm-agent-nvidia:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index b5f7f6699..c24b0d22e 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -10,11 +10,20 @@ resources: images: - name: quay.io/gkm/agent-amd - newName: quay.io/gkm/agent-amd + newName: quay.io/gkm/gkm-agent-amd newTag: latest - name: quay.io/gkm/agent-nogpu - newName: quay.io/gkm/agent-nogpu + newName: quay.io/gkm/gkm-agent-nogpu newTag: latest - name: quay.io/gkm/agent-nvidia - newName: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/gkm-agent-nvidia + newTag: latest +- name: quay.io/gkm/gkm-agent-amd + newName: quay.io/gkm/gkm-agent-amd + newTag: latest +- name: quay.io/gkm/gkm-agent-nogpu + newName: 
quay.io/gkm/gkm-agent-nogpu + newTag: latest +- name: quay.io/gkm/gkm-agent-nvidia + newName: quay.io/gkm/gkm-agent-nvidia newTag: latest diff --git a/config/configMap/configMap.yaml b/config/configMap/configMap.yaml index 12d3b863c..e2aed50e6 100644 --- a/config/configMap/configMap.yaml +++ b/config/configMap/configMap.yaml @@ -8,7 +8,7 @@ data: gkm.operator.log.level: info gkm.agent.log.level: info ## Can be configured at runtime - gkm.agent.image: quay.io/gkm/agent:latest + gkm.agent.image: quay.io/gkm/gkm-agent:latest gkm.extract.image: quay.io/gkm/gkm-extract:latest gkm.nogpu: false ## Enable/disable Kyverno image signature verification (defaults to true/enabled) diff --git a/config/configMap/kustomization.yaml b/config/configMap/kustomization.yaml index b46ed2b73..77b78a349 100644 --- a/config/configMap/kustomization.yaml +++ b/config/configMap/kustomization.yaml @@ -9,7 +9,7 @@ configMapGenerator: - behavior: merge literals: - gkm.nogpu=true - - gkm.agent.image=quay.io/gkm/agent:latest + - gkm.agent.image=quay.io/gkm/gkm-agent:latest - gkm.extract.image=quay.io/gkm/gkm-extract:latest name: config namespace: gkm-system diff --git a/config/operator/kustomization.yaml b/config/operator/kustomization.yaml index b7f6673d4..edf1e14b8 100644 --- a/config/operator/kustomization.yaml +++ b/config/operator/kustomization.yaml @@ -4,8 +4,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: quay.io/gkm/operator + newName: quay.io/gkm/gkm-operator + newTag: latest +- name: quay.io/gkm/gkm-operator + newName: quay.io/gkm/gkm-operator newTag: latest - name: quay.io/gkm/operator - newName: quay.io/gkm/operator + newName: quay.io/gkm/gkm-operator newTag: latest diff --git a/config/operator/operator.yaml b/config/operator/operator.yaml index d694ee2e5..917eb4b7f 100644 --- a/config/operator/operator.yaml +++ b/config/operator/operator.yaml @@ -71,7 +71,7 @@ spec: args: - --leader-elect - --health-probe-bind-address=:8081 - 
image: quay.io/gkm/operator:latest + image: quay.io/gkm/gkm-operator:latest imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false From ccbc1ad4e8ddb4f1d84371133fb9c3d43df733d8 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 19:50:45 +0000 Subject: [PATCH 23/25] refactor: use base image for builder stage in GPU agent Containerfiles Removed duplicated builder stage from NVIDIA, AMD, and nogpu agent Containerfiles. Each now uses FROM quay.io/gkm/gkm-agent-base:latest as the builder stage, eliminating code duplication while keeping GPU-specific runtime stages intact. Co-Authored-By: Claude Sonnet 4.5 --- Containerfile.gkm-agent-amd | 71 ++++------------------------------ Containerfile.gkm-agent-nogpu | 35 ++--------------- Containerfile.gkm-agent-nvidia | 35 ++--------------- Makefile | 2 +- 4 files changed, 14 insertions(+), 129 deletions(-) diff --git a/Containerfile.gkm-agent-amd b/Containerfile.gkm-agent-amd index 14c81835a..5913458e1 100644 --- a/Containerfile.gkm-agent-amd +++ b/Containerfile.gkm-agent-amd @@ -1,69 +1,12 @@ # ============================================================================ -# Stage 1: Builder (Shared across all agent variants) -# See Containerfile.gkm-agent-base for the common base stages +# AMD ROCm Agent (extends nogpu agent) +# Inherits binary and common packages from nogpu, adds ROCm support # ============================================================================ -FROM public.ecr.aws/docker/library/golang:1.25 AS builder -WORKDIR /workspace +FROM quay.io/gkm/gkm-agent-nogpu:latest -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY 
internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent - -# ============================================================================ -# Stage 2: AMD ROCm-specific Runtime -# ============================================================================ - -# Start from Ubuntu base for AMD ROCm support -FROM public.ecr.aws/docker/library/ubuntu:24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install common runtime libraries (shared with other agent variants) -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean +# Switch to root to install ROCm packages +USER root # AMD ROCm version configuration ARG ROCM_VERSION=6.3.1 @@ -78,7 +21,7 @@ RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amd ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi -# Run as non-root user +# Switch back to non-root user USER 65532:65532 -ENTRYPOINT ["/agent"] +# Binary and entrypoint are inherited from nogpu image diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu index a33172481..5942693be 100644 --- a/Containerfile.gkm-agent-nogpu +++ b/Containerfile.gkm-agent-nogpu @@ -1,37 +1,8 @@ # ============================================================================ -# Stage 1: Builder (Shared across all agent variants) -# See Containerfile.gkm-agent-base for the common base stages +# Stage 1: Builder (from base image) +# See Containerfile.gkm-agent-base for the common builder stage # ============================================================================ 
-FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent +FROM quay.io/gkm/gkm-agent-base:latest AS builder # ============================================================================ # Stage 2: No-GPU Runtime (minimal footprint) diff --git a/Containerfile.gkm-agent-nvidia b/Containerfile.gkm-agent-nvidia index 1d06fb06d..e9c08638e 100644 --- a/Containerfile.gkm-agent-nvidia +++ b/Containerfile.gkm-agent-nvidia @@ -1,37 +1,8 @@ # ============================================================================ -# Stage 1: Builder (Shared across all agent variants) -# See Containerfile.gkm-agent-base for the common base stages +# Stage 1: Builder (from base image) +# See Containerfile.gkm-agent-base for the common builder stage # ============================================================================ -FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make 
build-gkm-agent +FROM quay.io/gkm/gkm-agent-base:latest AS builder # ============================================================================ # Stage 2: NVIDIA-specific Runtime diff --git a/Makefile b/Makefile index db73b896b..02cbe5f6c 100644 --- a/Makefile +++ b/Makefile @@ -228,7 +228,7 @@ build-image-gkm-extract: .PHONY: build-image-agent-base build-image-agent-base: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target base-runtime -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target builder -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . .PHONY: build-image-agent-nvidia build-image-agent-nvidia: From f744250ddc4788e0bccdd16eab11718418699ea6 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 20:52:44 +0000 Subject: [PATCH 24/25] refactor: consolidate agent Containerfiles into single multi-target file Replaced separate Containerfiles for each agent variant with a single Containerfile.gkm-agents containing multi-stage targets (nogpu, amd, nvidia). This eliminates cross-file dependencies and enables parallel CI builds. 
Changes: - Created Containerfile.gkm-agents with shared builder stage - nogpu target: complete agent with common runtime deps - amd target: extends nogpu, adds ROCm support only - nvidia target: CUDA runtime with agent binary - Updated Makefile to build using --target flags - Updated GitHub workflow to use single Containerfile - Removed obsolete individual Containerfiles - Updated documentation references Benefits: - No build dependencies between separate files - Builder stage always available in same file - AMD reuses all nogpu layers (more efficient) - CI workflows can build in parallel - Cleaner, more maintainable structure Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .github/workflows/image-build.yml | 28 ++---- Containerfile.gkm-agent-amd | 27 ------ Containerfile.gkm-agent-base | 75 ---------------- Containerfile.gkm-agent-nogpu | 44 ---------- Containerfile.gkm-agent-nvidia | 46 ---------- Containerfile.gkm-agents | 139 ++++++++++++++++++++++++++++++ Makefile | 22 ++--- config/agent/README.md | 16 ++-- 8 files changed, 163 insertions(+), 234 deletions(-) delete mode 100644 Containerfile.gkm-agent-amd delete mode 100644 Containerfile.gkm-agent-base delete mode 100644 Containerfile.gkm-agent-nogpu delete mode 100644 Containerfile.gkm-agent-nvidia create mode 100644 Containerfile.gkm-agents diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 4a4cb15c4..61f61218f 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -45,10 +45,10 @@ jobs: - registry: quay.io repository: gkm - image: gkm-agent-base - dockerfile: ./Containerfile.gkm-agent-base + image: gkm-agent-nogpu + dockerfile: ./Containerfile.gkm-agents context: . 
- target: base-runtime + target: nogpu tags: | type=ref,event=branch type=ref,event=tag @@ -56,12 +56,12 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-agent-nvidia - dockerfile: ./Containerfile.gkm-agent-nvidia + dockerfile: ./Containerfile.gkm-agents context: . + target: nvidia tags: | type=ref,event=branch type=ref,event=tag @@ -69,25 +69,12 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-agent-amd - dockerfile: ./Containerfile.gkm-agent-amd - context: . - tags: | - type=ref,event=branch - type=ref,event=tag - type=ref,event=pr - type=sha,format=long - # set latest tag for default branch - type=raw,value=latest,enable={{is_default_branch}} - - - registry: quay.io - repository: gkm - image: gkm-agent-nogpu - dockerfile: ./Containerfile.gkm-agent-nogpu + dockerfile: ./Containerfile.gkm-agents context: . 
+ target: amd tags: | type=ref,event=branch type=ref,event=tag @@ -95,7 +82,6 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-extract diff --git a/Containerfile.gkm-agent-amd b/Containerfile.gkm-agent-amd deleted file mode 100644 index 5913458e1..000000000 --- a/Containerfile.gkm-agent-amd +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================ -# AMD ROCm Agent (extends nogpu agent) -# Inherits binary and common packages from nogpu, adds ROCm support -# ============================================================================ - -FROM quay.io/gkm/gkm-agent-nogpu:latest - -# Switch to root to install ROCm packages -USER root - -# AMD ROCm version configuration -ARG ROCM_VERSION=6.3.1 -ARG AMDGPU_VERSION=6.3.60301 -ARG OPT_ROCM_VERSION=6.3.1 - -# Install AMD ROCm packages (GPU-specific dependencies) -RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ - apt install -y ./*.deb && \ - apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi - -# Switch back to non-root user -USER 65532:65532 - -# Binary and entrypoint are inherited from nogpu image diff --git a/Containerfile.gkm-agent-base b/Containerfile.gkm-agent-base deleted file mode 100644 index 9bd406c84..000000000 --- a/Containerfile.gkm-agent-base +++ /dev/null @@ -1,75 +0,0 @@ -# Common base Containerfile for GKM agents -# This file contains the shared builder and base runtime stages -# GPU-specific Containerfiles currently duplicate these stages with references -# to this file for maintenance purposes. 
-# -# Future Enhancement: This base image could be built and pushed to Quay to -# improve build efficiency: -# podman build -f Containerfile.gkm-agent-base --target base-runtime \ -# -t quay.io/gkm/agent-runtime-base:latest . -# podman push quay.io/gkm/agent-runtime-base:latest -# -# Then GPU-specific Containerfiles could reference it: -# FROM quay.io/gkm/agent-runtime-base:latest - -# ============================================================================ -# Stage 1: Builder (Common to all agent variants) -# ============================================================================ -FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent - -# ============================================================================ -# Stage 2: Base Runtime (Common runtime dependencies) -# ============================================================================ -FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base-runtime - -# Install required runtime libraries for CGO and agent operation -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu 
deleted file mode 100644 index 5942693be..000000000 --- a/Containerfile.gkm-agent-nogpu +++ /dev/null @@ -1,44 +0,0 @@ -# ============================================================================ -# Stage 1: Builder (from base image) -# See Containerfile.gkm-agent-base for the common builder stage -# ============================================================================ -FROM quay.io/gkm/gkm-agent-base:latest AS builder - -# ============================================================================ -# Stage 2: No-GPU Runtime (minimal footprint) -# ============================================================================ - -# Use minimal Ubuntu base (no GPU libraries needed) -FROM public.ecr.aws/docker/library/ubuntu:24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install common runtime libraries (shared with other agent variants) -# No GPU-specific dependencies required for this variant -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Run as non-root user -USER 65532:65532 - -ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agent-nvidia b/Containerfile.gkm-agent-nvidia deleted file mode 100644 index e9c08638e..000000000 --- a/Containerfile.gkm-agent-nvidia +++ /dev/null @@ -1,46 +0,0 @@ -# ============================================================================ -# Stage 1: Builder (from base image) -# See Containerfile.gkm-agent-base for the common builder stage -# ============================================================================ -FROM quay.io/gkm/gkm-agent-base:latest AS builder - -# ============================================================================ -# Stage 2: 
NVIDIA-specific Runtime -# ============================================================================ - -# Use NVIDIA CUDA runtime base image (includes NVML libraries) -FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install common runtime libraries (shared with other agent variants) -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) -# No additional GPU-specific packages needed - -# Run as non-root user -USER 65532:65532 - -ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agents b/Containerfile.gkm-agents new file mode 100644 index 000000000..326d0c078 --- /dev/null +++ b/Containerfile.gkm-agents @@ -0,0 +1,139 @@ +# ============================================================================ +# Multi-target Containerfile for GKM Agents +# Build specific targets with: podman build --target +# ============================================================================ + +# ============================================================================ +# Stage 1: Builder (shared by all agent variants) +# ============================================================================ +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go 
agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# ============================================================================ +# Target: nogpu (complete no-GPU agent) +# ============================================================================ +FROM public.ecr.aws/docker/library/ubuntu:24.04 AS nogpu + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install common runtime libraries (shared with other agent variants) +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] + +# ============================================================================ +# Target: amd (extends nogpu, adds ROCm support) +# ============================================================================ +FROM nogpu AS amd + +# Switch to root to install ROCm packages +USER root + +# AMD ROCm version configuration +ARG ROCM_VERSION=6.3.1 +ARG AMDGPU_VERSION=6.3.60301 +ARG OPT_ROCM_VERSION=6.3.1 + +# Install AMD ROCm packages (GPU-specific dependencies) +RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ + apt install -y ./*.deb && \ + apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi + +# Switch back to non-root user +USER 65532:65532 + 
+# Binary and entrypoint are inherited from nogpu + +# ============================================================================ +# Target: nvidia (CUDA runtime with NVML support) +# ============================================================================ +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 AS nvidia + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install common runtime libraries (shared with other agent variants) +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional GPU-specific packages needed + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Makefile b/Makefile index 02cbe5f6c..ac6ebde1b 100644 --- a/Makefile +++ b/Makefile @@ -80,7 +80,6 @@ REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) AGENT_IMG ?=$(REPO)/gkm-agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) -AGENT_BASE_IMG ?= $(REPO)/gkm-agent-base:$(IMAGE_TAG) AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) @@ -226,27 +225,23 @@ build-image-operator: build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . -.PHONY: build-image-agent-base -build-image-agent-base: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target builder -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . 
- .PHONY: build-image-agent-nvidia build-image-agent-nvidia: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t ${AGENT_NVIDIA_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target nvidia -f Containerfile.gkm-agents -t ${AGENT_NVIDIA_IMG} . .PHONY: build-image-agent-amd build-image-agent-amd: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t ${AGENT_AMD_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target amd -f Containerfile.gkm-agents -t ${AGENT_AMD_IMG} . .PHONY: build-image-agent-nogpu build-image-agent-nogpu: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t ${AGENT_NOGPU_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load --target nogpu -f Containerfile.gkm-agents -t ${AGENT_NOGPU_IMG} . .PHONY: build-image-agents ifeq ($(NO_GPU_BUILD),true) -build-image-agents: build-image-agent-base build-image-agent-nogpu ## Build base and no-GPU agent only (NO_GPU_BUILD=true) +build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) else -build-image-agents: build-image-agent-base build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (base, NVIDIA, AMD, and no-GPU) +build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) endif # If you wish to build the operator image targeting other platforms you can use the --platform flag. @@ -259,7 +254,6 @@ build-images: build-image-operator build-image-agents build-image-gkm-extract ## push-images: ## Push all container images. 
$(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} - $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} ifeq ($(NO_GPU_BUILD),true) $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else @@ -270,12 +264,10 @@ endif .PHONY: push-images-agents ifeq ($(NO_GPU_BUILD),true) -push-images-agents: ## Push base and no-GPU agent only (NO_GPU_BUILD=true) - $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} +push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else push-images-agents: ## Push all agent images - $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} @@ -618,8 +610,6 @@ setup-kind: kind-gpu-sim-script kind-load-images: kind-gpu-sim-script get-example-images @echo "Loading operator image ${OPERATOR_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${OPERATOR_IMG} --cluster-name=$(KIND_CLUSTER_NAME) - @echo "Loading agent base image ${AGENT_BASE_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" - cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_BASE_IMG} --cluster-name=$(KIND_CLUSTER_NAME) ifeq ($(NO_GPU_BUILD),true) @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) diff --git a/config/agent/README.md b/config/agent/README.md index b15567441..71161a7e3 100644 --- a/config/agent/README.md +++ b/config/agent/README.md @@ -103,16 +103,23 @@ kubectl get pods -n gkm-system -l gpu-vendor=amd -o wide ## Containerfiles -### NVIDIA Agent ([Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia)) +All agent variants are built from [Containerfile.gkm-agents](../../Containerfile.gkm-agents) using multi-stage targets: + +### NVIDIA Agent (target: `nvidia`) - Base image: 
`nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04` - Includes: NVIDIA CUDA runtime with NVML libraries - Requires: NVIDIA driver on host -### AMD Agent ([Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd)) -- Base image: `ubuntu:24.04` +### AMD Agent (target: `amd`) +- Base image: extends `nogpu` target - Includes: ROCm libraries (`amd-smi-lib`, `rocm-smi-lib`) - Requires: AMD GPU driver on host +### No-GPU Agent (target: `nogpu`) +- Base image: `ubuntu:24.04` +- Includes: Common runtime dependencies only +- For non-GPU workloads + ## Node Selectors The DaemonSets use PCI vendor ID-based node selectors: @@ -188,5 +195,4 @@ To migrate from the legacy generic agent: - [gkm-agent-nvidia.yaml](gkm-agent-nvidia.yaml) - NVIDIA DaemonSet - [gkm-agent-amd.yaml](gkm-agent-amd.yaml) - AMD DaemonSet - [kustomization.yaml](kustomization.yaml) - Kustomize configuration -- [../../Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia) - NVIDIA Containerfile -- [../../Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd) - AMD Containerfile +- [../../Containerfile.gkm-agents](../../Containerfile.gkm-agents) - Multi-target agent Containerfile From 1756195d32ec34588fc0ecd60af94d2e9a5a61d9 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 21:05:23 +0000 Subject: [PATCH 25/25] fix: update legacy agent image reference to nogpu variant Changed gkm.agent.image from non-existent gkm-agent:latest to gkm-agent-nogpu:latest. This value is legacy/unused (operator only logs it), but needs to reference a real image for backwards compatibility. Each agent daemonset uses its GPU-specific image directly, so this configmap value is not actually used at runtime. 
Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 3 ++- config/configMap/configMap.yaml | 3 ++- config/configMap/kustomization.yaml | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index ac6ebde1b..8b9f4a722 100644 --- a/Makefile +++ b/Makefile @@ -78,11 +78,12 @@ QUAY_USER ?= gkm IMAGE_TAG ?= latest REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) -AGENT_IMG ?=$(REPO)/gkm-agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) +# Legacy: AGENT_IMG points to nogpu for backwards compatibility (unused by operator) +AGENT_IMG ?= $(AGENT_NOGPU_IMG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 diff --git a/config/configMap/configMap.yaml b/config/configMap/configMap.yaml index e2aed50e6..cecb43866 100644 --- a/config/configMap/configMap.yaml +++ b/config/configMap/configMap.yaml @@ -8,7 +8,8 @@ data: gkm.operator.log.level: info gkm.agent.log.level: info ## Can be configured at runtime - gkm.agent.image: quay.io/gkm/gkm-agent:latest + ## Note: gkm.agent.image is legacy/unused - agents use GPU-specific images + gkm.agent.image: quay.io/gkm/gkm-agent-nogpu:latest gkm.extract.image: quay.io/gkm/gkm-extract:latest gkm.nogpu: false ## Enable/disable Kyverno image signature verification (defaults to true/enabled) diff --git a/config/configMap/kustomization.yaml b/config/configMap/kustomization.yaml index 77b78a349..a33898c8b 100644 --- a/config/configMap/kustomization.yaml +++ b/config/configMap/kustomization.yaml @@ -9,7 +9,8 @@ configMapGenerator: - behavior: merge literals: - gkm.nogpu=true - - gkm.agent.image=quay.io/gkm/gkm-agent:latest + # Note: gkm.agent.image is legacy/unused - agents use GPU-specific images + - 
gkm.agent.image=quay.io/gkm/gkm-agent-nogpu:latest - gkm.extract.image=quay.io/gkm/gkm-extract:latest name: config namespace: gkm-system