From b81e5192fa62032c2895e8f107df359a51b81c47 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Wed, 11 Mar 2026 16:02:31 +0000 Subject: [PATCH 01/25] feat: add GPU-specific agents for NVIDIA and AMD with NFD-based deployment Replace generic agent with GPU-vendor-specific agents that deploy based on hardware detection. This enables hybrid clusters with both NVIDIA and AMD GPUs to run optimized agents with appropriate runtime libraries. Changes: - Add Containerfile.gkm-agent-nvidia (CUDA 12.6.3 base with NVML) - Add Containerfile.gkm-agent-amd (ROCm 6.3.1 with AMD SMI libraries) - Remove generic Containerfile.gkm-agent - Add DaemonSet manifests with PCI vendor ID-based node selectors: * gkm-agent-nvidia.yaml (nodeSelector: pci-10de.present) * gkm-agent-amd.yaml (nodeSelector: pci-1002.present) - Remove generic gkm-agent.yaml - Add Node Feature Discovery (NFD) deployment configuration - Update Makefile with GPU-specific build/push targets: * build-image-agent-nvidia, build-image-agent-amd * build-image-agents (builds all agent variants) * push-images-agents - Add mcv dependencies: go-nvlib v0.9.0, amdsmi (amd-staging) - Add comprehensive documentation for multi-GPU deployment The operator and CSI plugin remain unchanged and work with both agent types. NFD automatically labels nodes with GPU vendor information, enabling declarative GPU-specific agent deployment without manual intervention. 
Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- ...e.gkm-agent => Containerfile.gkm-agent-amd | 28 +-- Containerfile.gkm-agent-nogpu | 65 ++++++ Containerfile.gkm-agent-nvidia | 68 +++++++ Makefile | 33 ++- config/agent/README.md | 192 ++++++++++++++++++ .../{gkm-agent.yaml => gkm-agent-amd.yaml} | 11 +- config/agent/gkm-agent-nogpu.yaml | 91 +++++++++ config/agent/gkm-agent-nvidia.yaml | 88 ++++++++ config/agent/kustomization.yaml | 19 +- config/nfd/README.md | 167 +++++++++++++++ config/nfd/kustomization.yaml | 12 ++ config/nfd/nfd-worker-conf.yaml | 33 +++ mcv/go.mod | 2 + mcv/go.sum | 4 + 14 files changed, 781 insertions(+), 32 deletions(-) rename Containerfile.gkm-agent => Containerfile.gkm-agent-amd (63%) create mode 100644 Containerfile.gkm-agent-nogpu create mode 100644 Containerfile.gkm-agent-nvidia create mode 100644 config/agent/README.md rename config/agent/{gkm-agent.yaml => gkm-agent-amd.yaml} (86%) create mode 100644 config/agent/gkm-agent-nogpu.yaml create mode 100644 config/agent/gkm-agent-nvidia.yaml create mode 100644 config/nfd/README.md create mode 100644 config/nfd/kustomization.yaml create mode 100644 config/nfd/nfd-worker-conf.yaml diff --git a/Containerfile.gkm-agent b/Containerfile.gkm-agent-amd similarity index 63% rename from Containerfile.gkm-agent rename to Containerfile.gkm-agent-amd index 2214838e6..a59daee88 100644 --- a/Containerfile.gkm-agent +++ b/Containerfile.gkm-agent-amd @@ -58,25 +58,17 @@ RUN apt-get update && \ libseccomp2 && \ apt-get clean -ARG NO_GPU=false -ARG ROCM_VERSION=7.0.1 -ARG AMDGPU_VERSION=7.0.1.70001 -ARG OPT_ROCM_VERSION=7.0.1 +ARG ROCM_VERSION=6.3.1 +ARG AMDGPU_VERSION=6.3.60301 +ARG OPT_ROCM_VERSION=6.3.1 -# Conditionally install ROCm packages based on NO_GPU flag -RUN if [ "$NO_GPU" = "false" ]; then \ - wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ - apt install -y ./*.deb && \ - apt update && 
DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi; \ - else \ - echo "NO_GPU=true, skipping ROCm installation"; \ - fi - -# Set NO_GPU environment variable -ENV NO_GPU=${NO_GPU} +# Install ROCm packages for AMD GPU support +RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ + apt install -y ./*.deb && \ + apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi # Run as non-root user USER 65532:65532 diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu new file mode 100644 index 000000000..869108e09 --- /dev/null +++ b/Containerfile.gkm-agent-nogpu @@ -0,0 +1,65 @@ +# Build the agent binary +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# Use minimal Ubuntu base image for no-GPU environments +FROM public.ecr.aws/docker/library/ubuntu:24.04 + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install required runtime libraries 
for CGO +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agent-nvidia b/Containerfile.gkm-agent-nvidia new file mode 100644 index 000000000..28e6b836a --- /dev/null +++ b/Containerfile.gkm-agent-nvidia @@ -0,0 +1,68 @@ +# Build the agent binary +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# Use NVIDIA CUDA runtime base image for GPU support +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install required runtime libraries for CGO +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# The NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional NVIDIA packages needed + +# Run as non-root 
user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Makefile b/Makefile index 5a4e0bf21..3283494db 100644 --- a/Makefile +++ b/Makefile @@ -209,25 +209,44 @@ run: manifests generate fmt vet ## Run a controller from your host. build-image-operator: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-operator -t ${OPERATOR_IMG} . -.PHONY: build-image-agent -build-image-agent: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --build-arg NO_GPU=$(NO_GPU_BUILD) --progress=plain --load -f Containerfile.gkm-agent -t ${AGENT_IMG} . - .PHONY: build-image-gkm-extract build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . +.PHONY: build-image-agent-nvidia +build-image-agent-nvidia: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t $(REPO)/agent-nvidia:$(IMAGE_TAG) . + +.PHONY: build-image-agent-amd +build-image-agent-amd: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t $(REPO)/agent-amd:$(IMAGE_TAG) . + +.PHONY: build-image-agent-nogpu +build-image-agent-nogpu: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t $(REPO)/agent-nogpu:$(IMAGE_TAG) . + +.PHONY: build-image-agents +build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) + # If you wish to build the operator image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: build-images -build-images: build-image-operator build-image-agent build-image-gkm-extract ## Build all container images. 
+build-images: build-image-operator build-image-agents build-image-gkm-extract ## Build all container images. .PHONY: push-images -push-images: ## Push all container image. +push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} - $(CONTAINER_TOOL) push ${AGENT_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} + $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + +.PHONY: push-images-agents +push-images-agents: ## Push all agent images + $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) # Mapping old commands after rename .PHONY: docker-build diff --git a/config/agent/README.md b/config/agent/README.md new file mode 100644 index 000000000..b15567441 --- /dev/null +++ b/config/agent/README.md @@ -0,0 +1,192 @@ +# Multi-GPU Agent Deployment + +This directory contains configuration for deploying GPU-specific GKM agents that support both NVIDIA and AMD GPUs in heterogeneous clusters. + +## Overview + +GKM now supports deploying different agent containers based on the GPU hardware present on each node: + +- **`gkm-agent-nvidia`**: For nodes with NVIDIA GPUs +- **`gkm-agent-amd`**: For nodes with AMD ROCm GPUs +- **`gkm-agent`**: Legacy generic agent (deprecated) + +## Architecture + +Each GPU-specific agent uses: +- **Different base images** with appropriate GPU runtime libraries +- **Node selectors** to deploy only on compatible hardware +- **Automatic node labeling** via Node Feature Discovery (NFD) + +## Prerequisites + +### 1. 
Node Feature Discovery (NFD) + +NFD must be deployed to automatically label nodes with their PCI device information: + +```bash +# Deploy NFD +kubectl apply -k config/nfd + +# Verify NFD is running +kubectl get pods -n node-feature-discovery + +# Check node labels (should see pci-* labels) +kubectl get nodes -o json | jq '.items[].metadata.labels' | grep pci +``` + +NFD will automatically add labels like: +- `feature.node.kubernetes.io/pci-10de.present=true` (NVIDIA, vendor ID: 0x10de) +- `feature.node.kubernetes.io/pci-1002.present=true` (AMD, vendor ID: 0x1002) + +### 2. GPU Device Plugins + +Ensure appropriate GPU device plugins are installed: + +**For NVIDIA:** +```bash +kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml +``` + +**For AMD:** +```bash +kubectl apply -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml +``` + +## Building GPU-Specific Agent Images + +### Build All GPU Agents +```bash +make build-image-agents +``` + +### Build Individual Agents +```bash +# NVIDIA agent +make build-image-agent-nvidia + +# AMD agent +make build-image-agent-amd +``` + +### Push Images to Registry +```bash +# Set your registry +export QUAY_USER=your-org + +# Push GPU-specific agents +make push-images-agents +``` + +## Deployment + +### Deploy with Kustomize +```bash +kubectl apply -k config/agent +``` + +This will deploy: +- `agent-nvidia` DaemonSet → Only on nodes with `feature.node.kubernetes.io/pci-10de.present=true` +- `agent-amd` DaemonSet → Only on nodes with `feature.node.kubernetes.io/pci-1002.present=true` + +### Verify Deployment +```bash +# Check which agents are running +kubectl get ds -n gkm-system + +# Check agent pods and their node placement +kubectl get pods -n gkm-system -o wide + +# Verify agents are on correct nodes +kubectl get pods -n gkm-system -l gpu-vendor=nvidia -o wide +kubectl get pods -n gkm-system -l gpu-vendor=amd -o 
wide +``` + +## Containerfiles + +### NVIDIA Agent ([Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia)) +- Base image: `nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04` +- Includes: NVIDIA CUDA runtime with NVML libraries +- Requires: NVIDIA driver on host + +### AMD Agent ([Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd)) +- Base image: `ubuntu:24.04` +- Includes: ROCm libraries (`amd-smi-lib`, `rocm-smi-lib`) +- Requires: AMD GPU driver on host + +## Node Selectors + +The DaemonSets use PCI vendor ID-based node selectors: + +```yaml +# NVIDIA nodes +nodeSelector: + feature.node.kubernetes.io/pci-10de.present: "true" + +# AMD nodes +nodeSelector: + feature.node.kubernetes.io/pci-1002.present: "true" +``` + +## Hybrid GPU Clusters + +In clusters with both NVIDIA and AMD nodes: + +1. **NFD labels all nodes** with their PCI device information +2. **NVIDIA agent** deploys only to NVIDIA nodes +3. **AMD agent** deploys only to AMD nodes +4. **Operator** works with whichever agent is present on each node + +## Troubleshooting + +### NFD Not Labeling Nodes + +```bash +# Check NFD worker logs +kubectl logs -n node-feature-discovery -l app=nfd-worker + +# Manually verify PCI devices +lspci | grep -i vga +lspci -n | grep -E "0300|0302" +``` + +### Agent Not Scheduling + +```bash +# Check node labels +kubectl describe node | grep feature.node.kubernetes.io/pci + +# Check DaemonSet events +kubectl describe ds agent-nvidia -n gkm-system +kubectl describe ds agent-amd -n gkm-system +``` + +### GPU Libraries Not Found + +```bash +# Check NVIDIA driver +nvidia-smi + +# Check AMD driver +rocm-smi + +# Verify libraries in container +kubectl exec -it <agent-pod> -n gkm-system -- ls -la /usr/lib/x86_64-linux-gnu/ | grep -E "nvidia|amd" +``` + +## Migration from Generic Agent + +To migrate from the legacy generic agent: + +1. Deploy NFD: `kubectl apply -k config/nfd` +2. Build GPU-specific agents: `make build-image-agents` +3. 
Update manifests to use new agent DaemonSets +4. Deploy: `kubectl apply -k config/agent` +5. Remove old generic agent: `kubectl delete ds agent -n gkm-system` + +## Related Files + +- [gkm-agent-nvidia.yaml](gkm-agent-nvidia.yaml) - NVIDIA DaemonSet +- [gkm-agent-amd.yaml](gkm-agent-amd.yaml) - AMD DaemonSet +- [kustomization.yaml](kustomization.yaml) - Kustomize configuration +- [../../Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia) - NVIDIA Containerfile +- [../../Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd) - AMD Containerfile diff --git a/config/agent/gkm-agent.yaml b/config/agent/gkm-agent-amd.yaml similarity index 86% rename from config/agent/gkm-agent.yaml rename to config/agent/gkm-agent-amd.yaml index b108bbcd6..ef1a623eb 100644 --- a/config/agent/gkm-agent.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -1,23 +1,29 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: agent + name: agent-amd namespace: gkm-system labels: app: gkm-agent + gpu-vendor: amd spec: selector: matchLabels: app: gkm-agent + gpu-vendor: amd template: metadata: labels: app: gkm-agent + gpu-vendor: amd spec: serviceAccountName: gkm-agent + # Deploy only on nodes with AMD GPUs + nodeSelector: + feature.node.kubernetes.io/pci-1002.present: "true" # AMD vendor ID containers: - name: gkm-agent - image: quay.io/gkm/agent:latest + image: quay.io/gkm/agent-amd:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 @@ -50,6 +56,7 @@ spec: limits: memory: "128Mi" cpu: "100m" + amd.com/gpu: "0" # Request 0 GPUs (agent only monitors, doesn't use GPU) volumeMounts: - name: gkm-state mountPath: /var/lib/gkm diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml new file mode 100644 index 000000000..7500293ba --- /dev/null +++ b/config/agent/gkm-agent-nogpu.yaml @@ -0,0 +1,91 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: agent-nogpu + namespace: gkm-system + labels: + app: gkm-agent + gpu-vendor: none 
+spec: + selector: + matchLabels: + app: gkm-agent + gpu-vendor: none + template: + metadata: + labels: + app: gkm-agent + gpu-vendor: none + spec: + serviceAccountName: gkm-agent + # Deploy on nodes without GPUs (nodes that don't have NVIDIA or AMD PCI labels) + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-10de.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-1002.present + operator: DoesNotExist + containers: + - name: gkm-agent + image: quay.io/gkm/agent-nogpu:latest + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 0 + privileged: true + capabilities: + add: ["CAP_DAC_OVERRIDE", "CAP_FOWNER"] + seccompProfile: + type: Unconfined + env: + - name: NO_GPU + value: "true" + - name: GO_LOG + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.agent.log.level + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: "128Mi" + cpu: "100m" + volumeMounts: + - name: gkm-state + mountPath: /var/lib/gkm + mountPropagation: Bidirectional + - name: gkm-runtime + mountPath: /run/gkm + mountPropagation: Bidirectional + - name: sys + mountPath: /sys + readOnly: true + - name: dev + mountPath: /dev + + volumes: + # This volume is the GKM State directory. This is where GPU Kernel Cache + # will be extracted. + - name: gkm-state + hostPath: + path: /var/lib/gkm + type: DirectoryOrCreate + # This volume is the GKM Runtime directory. This is where the Usage data + # will tracked which pods are using which cache. 
+ - name: gkm-runtime + hostPath: + path: /run/gkm + type: DirectoryOrCreate + - name: sys + hostPath: + path: /sys + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml new file mode 100644 index 000000000..1cfc92af4 --- /dev/null +++ b/config/agent/gkm-agent-nvidia.yaml @@ -0,0 +1,88 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: agent-nvidia + namespace: gkm-system + labels: + app: gkm-agent + gpu-vendor: nvidia +spec: + selector: + matchLabels: + app: gkm-agent + gpu-vendor: nvidia + template: + metadata: + labels: + app: gkm-agent + gpu-vendor: nvidia + spec: + serviceAccountName: gkm-agent + # Deploy only on nodes with NVIDIA GPUs + nodeSelector: + feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID + containers: + - name: gkm-agent + image: quay.io/gkm/agent-nvidia:latest + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 0 + privileged: true + capabilities: + add: ["CAP_DAC_OVERRIDE", "CAP_FOWNER"] + seccompProfile: + type: Unconfined + env: + - name: NO_GPU + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.nogpu + - name: GO_LOG + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.agent.log.level + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: "128Mi" + cpu: "100m" + nvidia.com/gpu: "0" # Request 0 GPUs (agent only monitors, doesn't use GPU) + volumeMounts: + - name: gkm-state + mountPath: /var/lib/gkm + mountPropagation: Bidirectional + - name: gkm-runtime + mountPath: /run/gkm + mountPropagation: Bidirectional + - name: sys + mountPath: /sys + readOnly: true + - name: dev + mountPath: /dev + + volumes: + # This volume is the GKM State directory. This is where GPU Kernel Cache + # will be extracted. + - name: gkm-state + hostPath: + path: /var/lib/gkm + type: DirectoryOrCreate + # This volume is the GKM Runtime directory. 
This is where the Usage data + # will tracked which pods are using which cache. + - name: gkm-runtime + hostPath: + path: /run/gkm + type: DirectoryOrCreate + - name: sys + hostPath: + path: /sys + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index 07e67158f..47a1d4be1 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -1,11 +1,20 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + +# Deploy GPU-specific agents based on node hardware +# Requires Node Feature Discovery (NFD) to label nodes resources: -- gkm-agent.yaml +- gkm-agent-nvidia.yaml # NVIDIA GPU nodes +- gkm-agent-amd.yaml # AMD GPU nodes +- gkm-agent-nogpu.yaml # Nodes without GPUs + images: -- name: agent - newName: quay.io/gkm/agent +- name: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/agent-nvidia newTag: latest -- name: quay.io/gkm/agent - newName: quay.io/gkm/agent +- name: quay.io/gkm/agent-amd + newName: quay.io/gkm/agent-amd + newTag: latest +- name: quay.io/gkm/agent-nogpu + newName: quay.io/gkm/agent-nogpu newTag: latest diff --git a/config/nfd/README.md b/config/nfd/README.md new file mode 100644 index 000000000..bc6ce2db5 --- /dev/null +++ b/config/nfd/README.md @@ -0,0 +1,167 @@ +# Node Feature Discovery (NFD) Configuration + +This directory contains the configuration for deploying [Node Feature Discovery](https://kubernetes-sigs.github.io/node-feature-discovery/) to automatically label nodes with hardware features, particularly GPU vendor information. + +## What is NFD? + +Node Feature Discovery is a Kubernetes add-on that detects hardware features available on each node and advertises those features using node labels. 
+ +For GKM, NFD automatically labels nodes with PCI device vendor IDs, enabling GPU-specific agent deployment: + +- **NVIDIA GPUs**: `feature.node.kubernetes.io/pci-10de.present=true` (vendor ID: 0x10de) +- **AMD GPUs**: `feature.node.kubernetes.io/pci-1002.present=true` (vendor ID: 0x1002) + +## Deployment + +### Deploy NFD +```bash +kubectl apply -k config/nfd +``` + +### Verify NFD is Running +```bash +# Check NFD pods +kubectl get pods -n node-feature-discovery + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# nfd-master-xxxxx 1/1 Running 0 1m +# nfd-worker-xxxxx 1/1 Running 0 1m +# nfd-worker-yyyyy 1/1 Running 0 1m +``` + +### Check Node Labels +```bash +# View all NFD labels on a node +kubectl get node -o json | jq '.metadata.labels | with_entries(select(.key | startswith("feature.node.kubernetes.io")))' + +# Check for GPU vendor labels specifically +kubectl get nodes -L feature.node.kubernetes.io/pci-10de.present,feature.node.kubernetes.io/pci-1002.present +``` + +## How It Works + +1. **NFD Master**: Runs as a deployment, manages feature labeling +2. **NFD Worker**: Runs as a DaemonSet on each node, detects features +3. **Worker scans PCI devices** and creates labels for vendor IDs +4. **Labels are applied** to nodes automatically + +## Configuration + +### Default Configuration + +The default NFD configuration (via [kustomization.yaml](kustomization.yaml)) deploys NFD from the official upstream repository. 
+ +### Custom Configuration (Optional) + +To customize NFD behavior, uncomment the patch in `kustomization.yaml` and modify [nfd-worker-conf.yaml](nfd-worker-conf.yaml): + +```yaml +# In kustomization.yaml +patchesStrategicMerge: + - nfd-worker-conf.yaml +``` + +The custom configuration enables: +- **PCI device detection** with focus on display controllers (GPUs) +- **Vendor ID labeling** for automatic GPU vendor identification +- **Configurable scan interval** (default: 60s) + +## Verification + +### Manual PCI Device Check + +On each node, you can manually verify GPU devices: + +```bash +# List all VGA/Display controllers +lspci | grep -i vga + +# Show vendor IDs numerically +lspci -n | grep -E "0300|0302" + +# Example outputs: +# NVIDIA: 01:00.0 0300: 10de:1b80 (rev a1) +# AMD: 01:00.0 0300: 1002:67df (rev c7) +``` + +### Verify Label Creation + +```bash +# List nodes with NVIDIA GPUs +kubectl get nodes -l feature.node.kubernetes.io/pci-10de.present=true + +# List nodes with AMD GPUs +kubectl get nodes -l feature.node.kubernetes.io/pci-1002.present=true +``` + +## Integration with GKM Agents + +NFD labels are used by GKM agent DaemonSets to deploy GPU-specific agents: + +```yaml +# From config/agent/gkm-agent-nvidia.yaml +nodeSelector: + feature.node.kubernetes.io/pci-10de.present: "true" + +# From config/agent/gkm-agent-amd.yaml +nodeSelector: + feature.node.kubernetes.io/pci-1002.present: "true" +``` + +This ensures: +- NVIDIA agents only run on NVIDIA GPU nodes +- AMD agents only run on AMD GPU nodes +- No manual node labeling required + +## Troubleshooting + +### NFD Not Detecting GPUs + +1. **Check NFD worker logs:** + ```bash + kubectl logs -n node-feature-discovery -l app=nfd-worker + ``` + +2. **Verify PCI devices are present:** + ```bash + # SSH to node + lspci | grep -i vga + ``` + +3. **Check NFD configuration:** + ```bash + kubectl get cm -n node-feature-discovery nfd-worker-conf -o yaml + ``` + +### Labels Not Appearing + +1. 
**Restart NFD worker:** + ```bash + kubectl rollout restart daemonset/nfd-worker -n node-feature-discovery + ``` + +2. **Force re-labeling:** + ```bash + kubectl delete pod -n node-feature-discovery -l app=nfd-worker + ``` + +### Wrong Vendor ID + +Common PCI vendor IDs: +- **NVIDIA**: `10de` +- **AMD**: `1002` +- **Intel**: `8086` + +If using a different GPU vendor, update the node selectors in the agent DaemonSets. + +## Resources + +- [NFD GitHub](https://github.com/kubernetes-sigs/node-feature-discovery) +- [NFD Documentation](https://kubernetes-sigs.github.io/node-feature-discovery/) +- [PCI Vendor IDs Database](https://pci-ids.ucw.cz/) + +## Files + +- [kustomization.yaml](kustomization.yaml) - Main NFD deployment configuration +- [nfd-worker-conf.yaml](nfd-worker-conf.yaml) - Optional custom NFD worker configuration diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml new file mode 100644 index 000000000..6c050e49c --- /dev/null +++ b/config/nfd/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Deploy Node Feature Discovery from official Helm chart +# This will automatically label nodes with GPU vendor information +resources: + - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.7 + +# Optional: Add custom NFD configuration +# Uncomment if you need to customize NFD behavior +# patchesStrategicMerge: +# - nfd-worker-conf.yaml diff --git a/config/nfd/nfd-worker-conf.yaml b/config/nfd/nfd-worker-conf.yaml new file mode 100644 index 000000000..c183e0cb0 --- /dev/null +++ b/config/nfd/nfd-worker-conf.yaml @@ -0,0 +1,33 @@ +# Optional NFD Worker Configuration +# This file customizes NFD to ensure PCI device detection is enabled +# Uncomment in kustomization.yaml to use + +apiVersion: v1 +kind: ConfigMap +metadata: + name: nfd-worker-conf + namespace: node-feature-discovery +data: + nfd-worker.conf: | + core: + labelWhiteList: [".*"] # Enable 
 all labels + noPublish: false + sleepInterval: 60s + # NOTE: the source-name list below is commented out because a second top-level + # `sources:` key would duplicate the mapping that follows (invalid YAML; the + # list form is also not the NFD schema). NFD enables all feature sources by default. + # - pci # Ensure PCI device detection is enabled + # - cpu + # - kernel + # - memory + # - network + # - storage + # - system + # - usb + sources: + pci: + deviceClassWhitelist: + - "03" # Display controllers (GPUs are in this class) + - "0300" # VGA compatible controller + - "0301" # XGA compatible controller + - "0302" # 3D controller + deviceLabelFields: + - vendor # Will create labels like feature.node.kubernetes.io/pci-10de.present diff --git a/mcv/go.mod b/mcv/go.mod index 5bbba7590..be7bc23e7 100644 --- a/mcv/go.mod +++ b/mcv/go.mod @@ -26,6 +26,8 @@ require ( github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/BurntSushi/toml v1.5.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/NVIDIA/go-nvlib v0.9.0 // indirect + github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/VividCortex/ewma v1.2.0 // indirect github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect diff --git a/mcv/go.sum b/mcv/go.sum index 2a1f1c71d..308ce4245 100644 --- a/mcv/go.sum +++ b/mcv/go.sum @@ -12,8 +12,12 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/NVIDIA/go-nvlib v0.9.0 h1:GKLIvLJ0uhCtTLLZp2Q8QIDRxOYH45MM4Y5OO3U5Rho= +github.com/NVIDIA/go-nvlib v0.9.0/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= +github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69 
h1:0Sl/RcyHZvSstVPIbdF0D/sdj8ZJd+xBxkCy5M8/aCI= +github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69/go.mod h1:c2lzyLAghhTO+y/c3JjKl59JHJliIHwNZOroUfmBQxc= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow= From 88ae647b7b8667208740752f5f31cef83df0ff01 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 10:59:31 +0000 Subject: [PATCH 02/25] feat: enhance deployment automation with NFD and Kyverno integration - Add deploy-nfd and undeploy-nfd targets for automated Node Feature Discovery deployment - Integrate NFD deployment into main deploy target for GPU detection - Add deploy-kyverno-production for non-Kind cluster Kyverno deployment - Add deploy-kyverno-with-policies combined target - Update deploy target to conditionally deploy Kyverno based on KYVERNO_ENABLED flag - Update undeploy target to clean up NFD and Kyverno when KYVERNO_ENABLED=true - Update prepare-deploy to configure all three agent image variants (NVIDIA, AMD, no-GPU) This enables 'make deploy' to automatically deploy a complete GKM stack including GPU detection (NFD) and optional image verification (Kyverno) on production clusters. Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 3283494db..972a654f5 100644 --- a/Makefile +++ b/Makefile @@ -311,10 +311,28 @@ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - ##@ Deployment + +.PHONY: deploy-nfd +deploy-nfd: kustomize ## Deploy Node Feature Discovery for GPU detection + @echo "Deploying Node Feature Discovery (NFD)..." 
+ $(KUSTOMIZE) build config/nfd | $(KUBECTL) apply -f - + @echo "Waiting for NFD to be ready..." + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n node-feature-discovery deployment/nfd-master || true + @echo "NFD deployed successfully." + +.PHONY: undeploy-nfd +undeploy-nfd: kustomize ## Undeploy Node Feature Discovery + @echo "Undeploying Node Feature Discovery..." + $(KUSTOMIZE) build config/nfd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + @echo "NFD undeployed." + .PHONY: prepare-deploy prepare-deploy: cd config/operator && $(KUSTOMIZE) edit set image quay.io/gkm/operator=${OPERATOR_IMG} - cd config/agent && $(KUSTOMIZE) edit set image quay.io/gkm/agent=${AGENT_IMG} + cd config/agent && $(KUSTOMIZE) edit set image \ + quay.io/gkm/agent-nvidia=$(REPO)/agent-nvidia:$(IMAGE_TAG) \ + quay.io/gkm/agent-amd=$(REPO)/agent-amd:$(IMAGE_TAG) \ + quay.io/gkm/agent-nogpu=$(REPO)/agent-nogpu:$(IMAGE_TAG) ifdef NO_GPU cd config/configMap && \ $(SED) \ @@ -337,7 +355,11 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy -deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +ifeq ($(KYVERNO_ENABLED),true) + @echo "Deploying Kyverno (KYVERNO_ENABLED=true)..." + $(MAKE) deploy-kyverno-with-policies +endif .PHONY: redeploy redeploy: ## Redeploy controller and agent to the K8s cluster after deploy and undeploy have been called. Skips some onetime steps in deploy. 
@@ -352,6 +374,13 @@ undeploy: kustomize delete-webhook-secret-file ## Undeploy operator and agent fr exit 1; \ fi $(KUSTOMIZE) build $(DEPLOY_PATH) | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - +ifeq ($(KYVERNO_ENABLED),true) + @echo "Undeploying Kyverno (KYVERNO_ENABLED=true)..." + -$(MAKE) undeploy-kyverno-policies + -$(MAKE) undeploy-kyverno-production +endif + @echo "Undeploying NFD..." + -$(MAKE) undeploy-nfd @echo "Undeployment from $(DEPLOY_PATH) completed." .PHONY: undeploy-force @@ -486,6 +515,24 @@ else endif @echo "Kyverno deployed successfully to $(KIND_CLUSTER_NAME)." +.PHONY: deploy-kyverno-production +deploy-kyverno-production: helm ## Deploy Kyverno for production clusters (no Kind context) + @echo "Installing Kyverno..." + $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ + --repo https://kyverno.github.io/kyverno/ kyverno \ + --values config/kyverno/values.yaml \ + --wait + @echo "Waiting for Kyverno to be ready..." + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @echo "Kyverno deployed successfully." + +.PHONY: deploy-kyverno-with-policies +deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies + @echo "Restarting Kyverno to discover GKM CRDs..." + @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @echo "Kyverno and policies deployed successfully." + .PHONY: deploy-kyverno-policies deploy-kyverno-policies: kustomize ## Deploy Kyverno ClusterPolicies for GKMCache image verification @echo "Deploying Kyverno policies for GKMCache image verification..." 
@@ -507,6 +554,13 @@ undeploy-kyverno: ## Undeploy Kyverno $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) @echo "Kyverno undeployed from $(KIND_CLUSTER_NAME)." +.PHONY: undeploy-kyverno-production +undeploy-kyverno-production: ## Undeploy Kyverno from production cluster + @echo "Uninstalling Kyverno..." + $(HELM) uninstall kyverno --namespace kyverno --ignore-not-found || true + $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) + @echo "Kyverno undeployed." + ##@ Kind Cluster Management .PHONY: setup-kind From d667e8c0610672745002745dc5066842a081169d Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 11:18:24 +0000 Subject: [PATCH 03/25] fix: conditionally build agents based on NO_GPU_BUILD flag - When NO_GPU_BUILD=true, only build and push no-GPU agent - When NO_GPU_BUILD=false (default), build and push all three agents (NVIDIA, AMD, no-GPU) - This avoids unnecessary builds of GPU-specific agents for Kind/test clusters Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/Makefile b/Makefile index 972a654f5..fee02fd37 100644 --- a/Makefile +++ b/Makefile @@ -226,7 +226,11 @@ build-image-agent-nogpu: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t $(REPO)/agent-nogpu:$(IMAGE_TAG) . .PHONY: build-image-agents +ifeq ($(NO_GPU_BUILD),true) +build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) +else build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) +endif # If you wish to build the operator image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. 
@@ -238,15 +242,24 @@ build-images: build-image-operator build-image-agents build-image-gkm-extract ## push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} +ifeq ($(NO_GPU_BUILD),true) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +else $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +endif .PHONY: push-images-agents +ifeq ($(NO_GPU_BUILD),true) +push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) + $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +else push-images-agents: ## Push all agent images $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) +endif # Mapping old commands after rename .PHONY: docker-build From 7dfd7e2a0772438093da1b46ab0f52ecc46f077b Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 11:44:13 +0000 Subject: [PATCH 04/25] feat: add individual agent image variables for flexible deployment Add AGENT_NVIDIA_IMG, AGENT_AMD_IMG, and AGENT_NOGPU_IMG variables to allow individual override of agent images. This enables deploying with custom image names/tags without requiring the default naming scheme. 
Example usage: make deploy \ OPERATOR_IMG=quay.io/user/gkm:operator \ EXTRACT_IMG=quay.io/user/gkm:extract \ AGENT_NVIDIA_IMG=quay.io/user/gkm:agent-nvidia \ AGENT_AMD_IMG=quay.io/user/gkm:agent-amd \ AGENT_NOGPU_IMG=quay.io/user/gkm:agent-no-gpu Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index fee02fd37..c4c5fc4ec 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,9 @@ REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) +AGENT_NVIDIA_IMG ?= $(REPO)/agent-nvidia:$(IMAGE_TAG) +AGENT_AMD_IMG ?= $(REPO)/agent-amd:$(IMAGE_TAG) +AGENT_NOGPU_IMG ?= $(REPO)/agent-nogpu:$(IMAGE_TAG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 @@ -215,15 +218,15 @@ build-image-gkm-extract: .PHONY: build-image-agent-nvidia build-image-agent-nvidia: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t $(REPO)/agent-nvidia:$(IMAGE_TAG) . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t ${AGENT_NVIDIA_IMG} . .PHONY: build-image-agent-amd build-image-agent-amd: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t $(REPO)/agent-amd:$(IMAGE_TAG) . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t ${AGENT_AMD_IMG} . .PHONY: build-image-agent-nogpu build-image-agent-nogpu: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t $(REPO)/agent-nogpu:$(IMAGE_TAG) . 
+ $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t ${AGENT_NOGPU_IMG} . .PHONY: build-image-agents ifeq ($(NO_GPU_BUILD),true) @@ -243,22 +246,22 @@ push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} ifeq ($(NO_GPU_BUILD),true) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else - $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} + $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} endif .PHONY: push-images-agents ifeq ($(NO_GPU_BUILD),true) push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else push-images-agents: ## Push all agent images - $(CONTAINER_TOOL) push $(REPO)/agent-nvidia:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-amd:$(IMAGE_TAG) - $(CONTAINER_TOOL) push $(REPO)/agent-nogpu:$(IMAGE_TAG) + $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} + $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} endif # Mapping old commands after rename @@ -343,9 +346,9 @@ undeploy-nfd: kustomize ## Undeploy Node Feature Discovery prepare-deploy: cd config/operator && $(KUSTOMIZE) edit set image quay.io/gkm/operator=${OPERATOR_IMG} cd config/agent && $(KUSTOMIZE) edit set image \ - quay.io/gkm/agent-nvidia=$(REPO)/agent-nvidia:$(IMAGE_TAG) \ - quay.io/gkm/agent-amd=$(REPO)/agent-amd:$(IMAGE_TAG) \ - quay.io/gkm/agent-nogpu=$(REPO)/agent-nogpu:$(IMAGE_TAG) + quay.io/gkm/agent-nvidia=${AGENT_NVIDIA_IMG} \ + quay.io/gkm/agent-amd=${AGENT_AMD_IMG} \ + quay.io/gkm/agent-nogpu=${AGENT_NOGPU_IMG} ifdef NO_GPU cd config/configMap && \ $(SED) \ From 
019d4fb06c3eadf01de9f32d047a457960fa2f32 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:24:27 +0000 Subject: [PATCH 05/25] fix: GPU agent scheduling with NFD PCI class code labels The GPU agents were not being scheduled on nodes with GPUs because NFD creates labels with PCI class codes (e.g., pci-0302_10de for NVIDIA 3D controllers), but agents were using simple nodeSelectors looking for vendor ID only (pci-10de). Changes: - Update NVIDIA agent to use nodeAffinity matching class codes 0300 and 0302 - Update AMD agent to use nodeAffinity matching class codes 0300, 0302, and 0380 - Upgrade NFD to v0.17.2 to fix deprecated node-role.kubernetes.io/master label - Replace wget with curl in Makefile for macOS compatibility Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 8 +++++++- config/agent/gkm-agent-amd.yaml | 17 +++++++++++++++-- config/agent/gkm-agent-nvidia.yaml | 14 ++++++++++++-- config/nfd/kustomization.yaml | 2 +- 4 files changed, 35 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index c4c5fc4ec..1d023c639 100644 --- a/Makefile +++ b/Makefile @@ -148,6 +148,12 @@ vendors: ## Refresh vendors directory. @echo "### Checking vendors" go mod tidy && go mod vendor +.PHONY: install-deps +install-deps: ## Install all dependencies (go, podman, kubectl, and build dependencies). + @echo "### Installing GKM dependencies" + @chmod +x hack/install_deps.sh + @./hack/install_deps.sh + .PHONY: explain explain: ## Run "kubectl explain" on all CRDs. CRD_1="ClusterGKMCache" CRD_2="GKMCache" CRD_3="ClusterGKMCacheNode" CRD_4="GKMCacheNode" OUTPUT_DIR="../docs/crds" ./hack/crd_explain_txt.sh @@ -693,7 +699,7 @@ kind-gpu-sim-script: $(KIND_GPU_SIM_SCRIPT) ## Download kind-gpu-sim-script loc $(KIND_GPU_SIM_SCRIPT): $(LOCALBIN) if [ ! 
-f $(KIND_GPU_SIM_SCRIPT) ]; then \ echo "Downloading $(KIND_GPU_SIM_SCRIPT)"; \ - wget -P $(LOCALBIN) $(KIND_GPU_SIM_SCRIPT_URL); \ + curl -L -o $(KIND_GPU_SIM_SCRIPT) $(KIND_GPU_SIM_SCRIPT_URL); \ chmod +x $(KIND_GPU_SIM_SCRIPT); \ fi diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml index ef1a623eb..95f717adc 100644 --- a/config/agent/gkm-agent-amd.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -19,8 +19,21 @@ spec: spec: serviceAccountName: gkm-agent # Deploy only on nodes with AMD GPUs - nodeSelector: - feature.node.kubernetes.io/pci-1002.present: "true" # AMD vendor ID + # AMD vendor ID is 1002, with class codes: + # 0300: VGA controller, 0302: 3D controller, 0380: Display controller + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: Exists containers: - name: gkm-agent image: quay.io/gkm/agent-amd:latest diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml index 1cfc92af4..1f6670a6b 100644 --- a/config/agent/gkm-agent-nvidia.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -19,8 +19,18 @@ spec: spec: serviceAccountName: gkm-agent # Deploy only on nodes with NVIDIA GPUs - nodeSelector: - feature.node.kubernetes.io/pci-10de.present: "true" # NVIDIA vendor ID + # NVIDIA vendor ID is 10de, with class codes: + # 0300: VGA controller, 0302: 3D controller + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: 
gkm-agent image: quay.io/gkm/agent-nvidia:latest diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml index 6c050e49c..684f558c6 100644 --- a/config/nfd/kustomization.yaml +++ b/config/nfd/kustomization.yaml @@ -4,7 +4,7 @@ kind: Kustomization # Deploy Node Feature Discovery from official Helm chart # This will automatically label nodes with GPU vendor information resources: - - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.7 + - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.2 # Optional: Add custom NFD configuration # Uncomment if you need to customize NFD behavior From cb0df30296643aee2d8bf1d4b88d046e91aa506b Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:26:46 +0000 Subject: [PATCH 06/25] fix: exclude control-plane nodes from nogpu agent deployment In multi-node clusters, the nogpu agent should not run on control-plane nodes. The GPU detection labels were also updated to use the PCI class code format, consistent with the GPU agents.
Changes: - Add nodeAffinity to exclude nodes with node-role.kubernetes.io/control-plane label - Update GPU detection to use PCI class codes (0300, 0302, 0380) instead of vendor ID only - Ensures nogpu agent only runs on non-GPU worker nodes Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- config/agent/gkm-agent-nogpu.yaml | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml index 7500293ba..8b7715104 100644 --- a/config/agent/gkm-agent-nogpu.yaml +++ b/config/agent/gkm-agent-nogpu.yaml @@ -19,14 +19,23 @@ spec: spec: serviceAccountName: gkm-agent # Deploy on nodes without GPUs (nodes that don't have NVIDIA or AMD PCI labels) + # and exclude control-plane nodes in multi-node clusters affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - matchExpressions: - - key: feature.node.kubernetes.io/pci-10de.present + - key: feature.node.kubernetes.io/pci-0300_10de.present operator: DoesNotExist - - key: feature.node.kubernetes.io/pci-1002.present + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: DoesNotExist + - key: node-role.kubernetes.io/control-plane operator: DoesNotExist containers: - name: gkm-agent From 00fbcbef3353ec8ba0296c5ab66ead75bc974cfc Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:35:46 +0000 Subject: [PATCH 07/25] fix: mount GPU libraries to enable device access without GPU resource requests The GPU agents were unable to access GPUs because they lacked the necessary GPU runtime libraries. 
Following the NVIDIA device plugin pattern, we now mount: NVIDIA agent: - /usr/lib64 -> Contains libnvidia-ml.so and other NVIDIA libraries - LD_LIBRARY_PATH=/usr/lib64 environment variable AMD agent: - /opt/rocm -> ROCm libraries for AMD GPU management - /usr/lib64 -> System libraries - LD_LIBRARY_PATH=/opt/rocm/lib:/usr/lib64 This allows the agents to use NVML/ROCm APIs to detect and monitor ALL GPUs on the node without requesting gpu resources (nvidia.com/gpu or amd.com/gpu), which would limit visibility to only one GPU. Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- config/agent/gkm-agent-amd.yaml | 18 ++++++++++++++++++ config/agent/gkm-agent-nvidia.yaml | 10 ++++++++++ 2 files changed, 28 insertions(+) diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml index 95f717adc..6de806096 100644 --- a/config/agent/gkm-agent-amd.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -65,6 +65,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: LD_LIBRARY_PATH + value: /opt/rocm/lib:/usr/lib64 resources: limits: memory: "128Mi" @@ -82,6 +84,12 @@ spec: readOnly: true - name: dev mountPath: /dev + - name: rocm-libs + mountPath: /opt/rocm + readOnly: true + - name: system-libs + mountPath: /usr/lib64 + readOnly: true volumes: # This volume is the GKM State directory. 
This is where GPU Kernel Cache @@ -104,3 +112,13 @@ spec: hostPath: path: /dev type: Directory + # ROCm libraries needed for AMD GPU management + - name: rocm-libs + hostPath: + path: /opt/rocm + type: DirectoryOrCreate + # System libraries for GPU access + - name: system-libs + hostPath: + path: /usr/lib64 + type: Directory diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml index 1f6670a6b..6cad7bfa5 100644 --- a/config/agent/gkm-agent-nvidia.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -57,6 +57,8 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName + - name: LD_LIBRARY_PATH + value: /usr/lib64 resources: limits: memory: "128Mi" @@ -74,6 +76,9 @@ spec: readOnly: true - name: dev mountPath: /dev + - name: nvidia-libs + mountPath: /usr/lib64 + readOnly: true volumes: # This volume is the GKM State directory. This is where GPU Kernel Cache @@ -96,3 +101,8 @@ spec: hostPath: path: /dev type: Directory + # NVIDIA libraries needed for NVML (NVIDIA Management Library) + - name: nvidia-libs + hostPath: + path: /usr/lib64 + type: Directory From 4bd95a3dbb7491d75e11ed3492177fd9100bf2ba Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 12:42:43 +0000 Subject: [PATCH 08/25] feat: add automated dependency installation for RHEL 10 Add comprehensive installation script and make target to automate dependency setup on RHEL 10 systems. The script handles installation of build dependencies from CentOS Stream and Fedora repositories, and installs/upgrades go, podman, and kubectl to required versions. 
Changes: - Add hack/install_deps.sh script for RHEL 10 dependency installation - Add 'make install-deps' target to Makefile - Update GettingStartedGuide with automated installation instructions - Document package sources for RHEL 10 (CentOS Stream, Fedora) Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- docs/GettingStartedGuide.md | 27 +++++- hack/install_deps.sh | 170 ++++++++++++++++++++++++++++++++++++ 2 files changed, 195 insertions(+), 2 deletions(-) create mode 100644 hack/install_deps.sh diff --git a/docs/GettingStartedGuide.md b/docs/GettingStartedGuide.md index b29dcffb4..196157152 100644 --- a/docs/GettingStartedGuide.md +++ b/docs/GettingStartedGuide.md @@ -10,14 +10,37 @@ building GKM and description of how to deploy GKM. - kubectl version v1.11.3+. - Access to a Kubernetes v1.11.3+ cluster. -The following packages are also required to build: +### Automated Installation (RHEL 10 / CentOS Stream 10) + +For RHEL 10 or CentOS Stream 10 systems, you can install all dependencies (including go, podman, kubectl, and build packages) using: + +```sh +make install-deps +``` + +This will: +- Install system development packages (gpgme-devel, libdrm-devel, hwloc-devel) +- Install btrfs development headers +- Install or upgrade Go to v1.25.0+ if needed +- Install or upgrade Podman to v5.3.1+ if needed +- Install or upgrade kubectl to v1.11.3+ if needed + +### Manual Installation + +The following packages are required to build: + +**For Fedora/RHEL/CentOS:** ```sh sudo dnf install -y gpgme-devel libdrm-devel libbtrfs btrfs-progs \ btrfs-progs-devel hwloc hwloc-devel ``` -OR +> **Note for RHEL 10**: Some packages may not be available in standard repositories. +> Use `make install-deps` or see [hack/install_deps.sh](../hack/install_deps.sh) for the installation script +> that sources packages from CentOS Stream 10 and Fedora repositories. 
+ +**For Debian/Ubuntu:** ```sh sudo apt-get install -y libgpgme-dev libbtrfs-dev btrfs-progs libgpgme11-dev \ diff --git a/hack/install_deps.sh b/hack/install_deps.sh new file mode 100644 index 000000000..444b88ce6 --- /dev/null +++ b/hack/install_deps.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +set -e + +echo "================================================" +echo "GKM Dependency Installation for RHEL 10" +echo "================================================" +echo "" + +# Minimum required versions +MIN_GO_VERSION="1.25.0" +MIN_PODMAN_VERSION="5.3.1" +MIN_KUBECTL_VERSION="1.11.3" + +# CentOS Stream 10 repository URLs +CENTOS_CRB="https://mirror.stream.centos.org/10-stream/CRB/x86_64/os/" +FEDORA_BASE="https://download.fedoraproject.org/pub/fedora/linux/development/rawhide/Everything/x86_64/os/Packages" + +# Function to compare versions +version_ge() { + # Returns 0 (true) if $1 >= $2 + [ "$(printf '%s\n' "$2" "$1" | sort -V | head -n1)" = "$2" ] +} + +# Function to check if a command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +echo "=== Step 1: Importing CentOS Stream GPG key ===" +echo "================================================" +sudo rpm --import https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256 2>/dev/null || echo "Key may already be imported" + +echo "" +echo "=== Step 2: Installing system development packages ===" +echo "======================================================" +sudo dnf install -y --repofrompath=centos-crb,${CENTOS_CRB} \ + gpgme-devel libdrm-devel hwloc-devel + +echo "" +echo "=== Step 3: Installing btrfs development headers ===" +echo "=====================================================" +# First install the base libraries with --nodeps to skip filesystem checks +sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/l/libbtrfs-6.19-1.fc45.x86_64.rpm" \ + "${FEDORA_BASE}/l/libbtrfsutil-6.19-1.fc45.x86_64.rpm" 2>/dev/null || echo "Libraries may already be installed" + +# Now install devel package with --nodeps 
+sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/b/btrfs-progs-6.19-1.fc45.x86_64.rpm" 2>/dev/null || echo "btrfs-progs may already be installed" + +sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/b/btrfs-progs-devel-6.19-1.fc45.x86_64.rpm" + +echo "" +echo "=== Step 4: Installing Go ${MIN_GO_VERSION}+ ===" +echo "==============================================" +if command_exists go; then + CURRENT_GO_VERSION=$(go version | awk '{print $3}' | sed 's/go//') + echo "Found Go version: ${CURRENT_GO_VERSION}" + if version_ge "${CURRENT_GO_VERSION}" "${MIN_GO_VERSION}"; then + echo "✓ Go ${CURRENT_GO_VERSION} meets minimum requirement (${MIN_GO_VERSION}+)" + else + echo "⚠ Go ${CURRENT_GO_VERSION} is older than required ${MIN_GO_VERSION}" + echo "Installing Go ${MIN_GO_VERSION}..." + GO_VERSION="1.25.0" + GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" + curl -LO "https://go.dev/dl/${GO_TARBALL}" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "${GO_TARBALL}" + rm "${GO_TARBALL}" + echo "✓ Go ${GO_VERSION} installed. Add /usr/local/go/bin to your PATH" + export PATH=$PATH:/usr/local/go/bin + fi +else + echo "Go not found. Installing Go ${MIN_GO_VERSION}..." + GO_VERSION="1.25.0" + GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" + curl -LO "https://go.dev/dl/${GO_TARBALL}" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "${GO_TARBALL}" + rm "${GO_TARBALL}" + echo "✓ Go ${GO_VERSION} installed. 
Add /usr/local/go/bin to your PATH" + echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc + export PATH=$PATH:/usr/local/go/bin +fi + +echo "" +echo "=== Step 5: Installing Podman ${MIN_PODMAN_VERSION}+ ===" +echo "========================================================" +if command_exists podman; then + CURRENT_PODMAN_VERSION=$(podman version --format '{{.Client.Version}}' 2>/dev/null || podman --version | awk '{print $3}') + echo "Found Podman version: ${CURRENT_PODMAN_VERSION}" + if version_ge "${CURRENT_PODMAN_VERSION}" "${MIN_PODMAN_VERSION}"; then + echo "✓ Podman ${CURRENT_PODMAN_VERSION} meets minimum requirement (${MIN_PODMAN_VERSION}+)" + else + echo "⚠ Podman ${CURRENT_PODMAN_VERSION} is older than required ${MIN_PODMAN_VERSION}" + echo "Upgrading Podman..." + sudo dnf upgrade -y podman + fi +else + echo "Podman not found. Installing..." + sudo dnf install -y podman +fi + +echo "" +echo "=== Step 6: Installing kubectl ${MIN_KUBECTL_VERSION}+ ===" +echo "==========================================================" +if command_exists kubectl; then + CURRENT_KUBECTL_VERSION=$(kubectl version --client --short 2>/dev/null | grep -oP 'v\K[0-9.]+' || kubectl version --client -o json 2>/dev/null | grep -oP '"gitVersion": "v\K[0-9.]+' | head -1) + echo "Found kubectl version: ${CURRENT_KUBECTL_VERSION}" + if version_ge "${CURRENT_KUBECTL_VERSION}" "${MIN_KUBECTL_VERSION}"; then + echo "✓ kubectl ${CURRENT_KUBECTL_VERSION} meets minimum requirement (${MIN_KUBECTL_VERSION}+)" + else + echo "⚠ kubectl ${CURRENT_KUBECTL_VERSION} is older than required ${MIN_KUBECTL_VERSION}" + echo "Installing latest kubectl..." + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ + fi +else + echo "kubectl not found. Installing..." 
+ curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ +fi + +echo "" +echo "=== Step 7: Verification ===" +echo "============================" +echo "" +echo "System Development Packages:" +ls -la /usr/include/gpgme.h 2>/dev/null && echo " ✓ gpgme-devel" || echo " ✗ gpgme-devel missing" +ls -la /usr/include/xf86drm.h 2>/dev/null && echo " ✓ libdrm-devel" || echo " ✗ libdrm-devel missing" +ls -la /usr/include/hwloc.h 2>/dev/null && echo " ✓ hwloc-devel" || echo " ✗ hwloc-devel missing" +ls -la /usr/include/btrfs/version.h 2>/dev/null && echo " ✓ btrfs/version.h" || echo " ✗ btrfs headers missing" + +echo "" +echo "Build Tools:" +if command_exists go; then + echo " ✓ Go $(go version | awk '{print $3}')" +else + echo " ✗ Go not found in PATH" +fi + +if command_exists podman; then + echo " ✓ Podman $(podman --version | awk '{print $3}')" +else + echo " ✗ Podman not found" +fi + +if command_exists kubectl; then + echo " ✓ kubectl $(kubectl version --client --short 2>/dev/null | grep -oP 'v[0-9.]+' || echo 'version installed')" +else + echo " ✗ kubectl not found in PATH" +fi + +echo "" +echo "pkg-config:" +pkg-config --exists gpgme && echo " ✓ gpgme.pc (version $(pkg-config --modversion gpgme))" || echo " ✗ gpgme.pc missing" + +echo "" +echo "================================================" +echo "Installation Complete!" 
+echo "================================================" +echo "" +echo "If Go or kubectl were newly installed, you may need to:" +echo " - Reload your shell: source ~/.bashrc" +echo " - Or add to your PATH manually:" +echo " export PATH=\$PATH:/usr/local/go/bin" From fe6471cf79c70bd5402b909b3032d88cd79c58b2 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Thu, 12 Mar 2026 13:56:09 +0000 Subject: [PATCH 09/25] gkm: add nvidia example Signed-off-by: Maryam Tahhan --- .../namespace/RWO-NVIDIA/10-namespace.yaml | 5 + .../namespace/RWO-NVIDIA/11-gkmcache.yaml | 19 +++ examples/namespace/RWO-NVIDIA/12-ds.yaml | 52 +++++++ examples/namespace/RWO-NVIDIA/13-pod.yaml | 41 ++++++ examples/namespace/RWO-NVIDIA/README.md | 132 ++++++++++++++++++ .../{RWO => RWO-ROCM}/10-namespace.yaml | 0 .../{RWO => RWO-ROCM}/11-gkmcache.yaml | 0 .../namespace/{RWO => RWO-ROCM}/12-ds.yaml | 0 .../namespace/{RWO => RWO-ROCM}/13-ds.yaml | 0 .../namespace/{RWO => RWO-ROCM}/14-ds.yaml | 0 .../21-gkmcache-cosign-v3.yaml | 0 .../namespace/{RWO => RWO-ROCM}/22-ds.yaml | 0 12 files changed, 249 insertions(+) create mode 100644 examples/namespace/RWO-NVIDIA/10-namespace.yaml create mode 100644 examples/namespace/RWO-NVIDIA/11-gkmcache.yaml create mode 100644 examples/namespace/RWO-NVIDIA/12-ds.yaml create mode 100644 examples/namespace/RWO-NVIDIA/13-pod.yaml create mode 100644 examples/namespace/RWO-NVIDIA/README.md rename examples/namespace/{RWO => RWO-ROCM}/10-namespace.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/11-gkmcache.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/12-ds.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/13-ds.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/14-ds.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/21-gkmcache-cosign-v3.yaml (100%) rename examples/namespace/{RWO => RWO-ROCM}/22-ds.yaml (100%) diff --git a/examples/namespace/RWO-NVIDIA/10-namespace.yaml b/examples/namespace/RWO-NVIDIA/10-namespace.yaml new file mode 100644 
index 000000000..aec06b330 --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/10-namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: gkm-test-ns-nvidia-rwo-1 diff --git a/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml b/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml new file mode 100644 index 000000000..54cb9729f --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: gkm.io/v1alpha1 +kind: GKMCache +metadata: + name: vector-add-cache-cuda-rwo + namespace: gkm-test-ns-nvidia-rwo-1 + labels: + gkm.io/signature-format: cosign-v2 +spec: + image: quay.io/gkm/cache-examples:vector-add-cache-cuda-v2 + storageClassName: standard # Update this to match your cluster's storage class + + # Pod template for the extraction job + podTemplate: + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/examples/namespace/RWO-NVIDIA/12-ds.yaml b/examples/namespace/RWO-NVIDIA/12-ds.yaml new file mode 100644 index 000000000..cd06dbf9f --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/12-ds.yaml @@ -0,0 +1,52 @@ +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: gkm-test-nvidia-rwo-ds-1 + namespace: gkm-test-ns-nvidia-rwo-1 + labels: + gkm.io/pvcMutation: "true" +spec: + selector: + matchLabels: + name: gkm-test-nvidia-rwo-ds-1 + template: + metadata: + labels: + name: gkm-test-nvidia-rwo-ds-1 + gkm.io/pvc-mutation: "true" + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # Node affinity to schedule only on NVIDIA GPU nodes + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists + + containers: + - 
name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + resources: + limits: + nvidia.com/gpu: 1 # Request 1 NVIDIA GPU + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: vector-add-cache-cuda-rwo diff --git a/examples/namespace/RWO-NVIDIA/13-pod.yaml b/examples/namespace/RWO-NVIDIA/13-pod.yaml new file mode 100644 index 000000000..59140b6c3 --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/13-pod.yaml @@ -0,0 +1,41 @@ +--- +kind: Pod +apiVersion: v1 +metadata: + name: gkm-test-nvidia-pod-1 + namespace: gkm-test-ns-nvidia-rwo-1 +spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # Node affinity to schedule only on NVIDIA GPU nodes + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists + + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + resources: + limits: + nvidia.com/gpu: 1 # Request 1 NVIDIA GPU + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: vector-add-cache-cuda-rwo diff --git a/examples/namespace/RWO-NVIDIA/README.md b/examples/namespace/RWO-NVIDIA/README.md new file mode 100644 index 000000000..c3a3f4644 --- /dev/null +++ b/examples/namespace/RWO-NVIDIA/README.md @@ -0,0 +1,132 @@ +# NVIDIA GPU Examples for GKM (ReadWriteOnce) + +This directory contains examples for deploying GKM with NVIDIA GPU support using ReadWriteOnce (RWO) access mode. + +## Prerequisites + +1. 
Kubernetes cluster with NVIDIA GPUs +2. NVIDIA GPU Operator or device plugin installed +3. Node Feature Discovery (NFD) installed and configured +4. GKM operator deployed in the cluster +5. A storage class that supports ReadWriteOnce volumes + +## Storage Class Configuration + +Before deploying, verify your storage class: + +```bash +kubectl get sc +``` + +Update the `storageClassName` field in [11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage class. + +## Deployment + +### Option 1: Deploy All Resources + +```bash +kubectl apply -f examples/namespace/RWO-NVIDIA/ +``` + +### Option 2: Deploy Step by Step + +1. Create the namespace: + ```bash + kubectl apply -f 10-namespace.yaml + ``` + +2. Create the GKMCache resource: + ```bash + kubectl apply -f 11-gkmcache.yaml + ``` + +3. Wait for the PVC to be created and bound: + ```bash + kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 -w + ``` + +4. Deploy a test workload (choose one): + - DaemonSet: `kubectl apply -f 12-ds.yaml` + - Pod: `kubectl apply -f 13-pod.yaml` + +## Verification + +Check the GKMCache status: +```bash +kubectl get gkmcache -n gkm-test-ns-nvidia-rwo-1 +kubectl describe gkmcache vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 +``` + +Check the PVC: +```bash +kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 +``` + +Check the extraction job: +```bash +kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 +kubectl get pods -n gkm-test-ns-nvidia-rwo-1 +``` + +Check the test workload: +```bash +# For Pod +kubectl get pod gkm-test-nvidia-pod-1 -n gkm-test-ns-nvidia-rwo-1 +kubectl logs gkm-test-nvidia-pod-1 -n gkm-test-ns-nvidia-rwo-1 + +# For DaemonSet +kubectl get ds gkm-test-nvidia-rwo-ds-1 -n gkm-test-ns-nvidia-rwo-1 +kubectl get pods -n gkm-test-ns-nvidia-rwo-1 -l name=gkm-test-nvidia-rwo-ds-1 +``` + +Verify the cache is mounted: +```bash +kubectl exec -it -n gkm-test-ns-nvidia-rwo-1 gkm-test-nvidia-pod-1 -- ls -la /cache +``` + +## Troubleshooting + +### PVC Pending State + +If the PVC remains 
in Pending state: + +```bash +kubectl describe pvc vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 +``` + +Common issues: +- Storage class not available or incorrect +- No nodes match the node selector +- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a pod using it is scheduled) + +### Extraction Job Not Scheduling + +Check the extraction job: +```bash +kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 +kubectl describe job -n gkm-test-ns-nvidia-rwo-1 +``` + +Check for pod scheduling issues: +```bash +kubectl get events -n gkm-test-ns-nvidia-rwo-1 --sort-by='.lastTimestamp' +``` + +### Pod Not Scheduling on GPU Nodes + +If your cluster doesn't have NFD labels, you can either: + +1. Install and configure NFD (recommended) +2. Remove the `affinity` section from the pod/daemonset specs and use a simpler node selector or label your GPU nodes manually + +Example without NFD: +```yaml +nodeSelector: + your-gpu-label: "true" # Use whatever label identifies your GPU nodes +``` + +## Cleanup + +```bash +kubectl delete -f examples/namespace/RWO-NVIDIA/ +``` diff --git a/examples/namespace/RWO/10-namespace.yaml b/examples/namespace/RWO-ROCM/10-namespace.yaml similarity index 100% rename from examples/namespace/RWO/10-namespace.yaml rename to examples/namespace/RWO-ROCM/10-namespace.yaml diff --git a/examples/namespace/RWO/11-gkmcache.yaml b/examples/namespace/RWO-ROCM/11-gkmcache.yaml similarity index 100% rename from examples/namespace/RWO/11-gkmcache.yaml rename to examples/namespace/RWO-ROCM/11-gkmcache.yaml diff --git a/examples/namespace/RWO/12-ds.yaml b/examples/namespace/RWO-ROCM/12-ds.yaml similarity index 100% rename from examples/namespace/RWO/12-ds.yaml rename to examples/namespace/RWO-ROCM/12-ds.yaml diff --git a/examples/namespace/RWO/13-ds.yaml b/examples/namespace/RWO-ROCM/13-ds.yaml similarity index 100% rename from examples/namespace/RWO/13-ds.yaml rename to examples/namespace/RWO-ROCM/13-ds.yaml diff --git 
a/examples/namespace/RWO/14-ds.yaml b/examples/namespace/RWO-ROCM/14-ds.yaml similarity index 100% rename from examples/namespace/RWO/14-ds.yaml rename to examples/namespace/RWO-ROCM/14-ds.yaml diff --git a/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml similarity index 100% rename from examples/namespace/RWO/21-gkmcache-cosign-v3.yaml rename to examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml diff --git a/examples/namespace/RWO/22-ds.yaml b/examples/namespace/RWO-ROCM/22-ds.yaml similarity index 100% rename from examples/namespace/RWO/22-ds.yaml rename to examples/namespace/RWO-ROCM/22-ds.yaml From 3dca1b9f27528a8b8861c72ed0f4cdbcf62cdf59 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:14:38 +0000 Subject: [PATCH 10/25] fix: address PR #107 review comments and failing workflows MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit addresses all review feedback and failing CI checks: - Add common agent base Containerfile with shared build stages - Update all agent Containerfiles with clear stage documentation - Add agent-base image to CI/CD workflow and Makefile - Fix image-build workflow to build all 4 agent variants (base, nvidia, amd, nogpu) - Fix 19 markdown linting errors in documentation files - Wrap long lines to ≤80 characters (MD013) - Add blank lines around code blocks (MD031) - Add blank lines around lists (MD032) Resolves: - Build Image (agent) workflow failure (missing Containerfile.gkm-agent) - Pre-commit markdown linting failures - PR review comment requesting common base container Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .github/workflows/image-build.yml | 45 ++++++++++++++- Containerfile.gkm-agent-amd | 16 ++++-- Containerfile.gkm-agent-base | 75 +++++++++++++++++++++++++ Containerfile.gkm-agent-nogpu | 14 ++++- Containerfile.gkm-agent-nvidia | 17 ++++-- Makefile | 14 ++++- 
docs/GettingStartedGuide.md | 12 ++-- examples/namespace/RWO-NVIDIA/README.md | 25 +++++++-- gkm-codespell.precommit-toml | 2 +- 9 files changed, 194 insertions(+), 26 deletions(-) create mode 100644 Containerfile.gkm-agent-base diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 4530d31c3..710c17ef2 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -45,8 +45,48 @@ jobs: - registry: quay.io repository: gkm - image: agent - dockerfile: ./Containerfile.gkm-agent + image: agent-base + dockerfile: ./Containerfile.gkm-agent-base + context: . + target: base-runtime + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + + - registry: quay.io + repository: gkm + image: agent-nvidia + dockerfile: ./Containerfile.gkm-agent-nvidia + context: . + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + + - registry: quay.io + repository: gkm + image: agent-amd + dockerfile: ./Containerfile.gkm-agent-amd + context: . + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + + - registry: quay.io + repository: gkm + image: agent-nogpu + dockerfile: ./Containerfile.gkm-agent-nogpu context: . 
tags: | type=ref,event=branch @@ -130,6 +170,7 @@ jobs: file: ${{ matrix.image.dockerfile }} build-args: BUILDPLATFORM=linux/amd64 context: ${{ matrix.image.context }} + target: ${{ matrix.image.target || '' }} - name: Sign the images with GitHub OIDC Token if: ${{ fromJSON(steps.set-push.outputs.push_flag) }} diff --git a/Containerfile.gkm-agent-amd b/Containerfile.gkm-agent-amd index a59daee88..14c81835a 100644 --- a/Containerfile.gkm-agent-amd +++ b/Containerfile.gkm-agent-amd @@ -1,4 +1,7 @@ -# Build the agent binary +# ============================================================================ +# Stage 1: Builder (Shared across all agent variants) +# See Containerfile.gkm-agent-base for the common base stages +# ============================================================================ FROM public.ecr.aws/docker/library/golang:1.25 AS builder WORKDIR /workspace @@ -30,13 +33,17 @@ COPY Makefile Makefile # Build the agent binary RUN make build-gkm-agent -# Use a minimal Ubuntu base image that supports CGO binaries +# ============================================================================ +# Stage 2: AMD ROCm-specific Runtime +# ============================================================================ + +# Start from Ubuntu base for AMD ROCm support FROM public.ecr.aws/docker/library/ubuntu:24.04 # Copy the binary from the builder COPY --from=builder /workspace/bin/gkm-agent /agent -# Install required runtime libraries for CGO +# Install common runtime libraries (shared with other agent variants) RUN apt-get update && \ apt-get install -y \ ca-certificates \ @@ -58,11 +65,12 @@ RUN apt-get update && \ libseccomp2 && \ apt-get clean +# AMD ROCm version configuration ARG ROCM_VERSION=6.3.1 ARG AMDGPU_VERSION=6.3.60301 ARG OPT_ROCM_VERSION=6.3.1 -# Install ROCm packages for AMD GPU support +# Install AMD ROCm packages (GPU-specific dependencies) RUN wget 
https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ apt install -y ./*.deb && \ apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ diff --git a/Containerfile.gkm-agent-base b/Containerfile.gkm-agent-base new file mode 100644 index 000000000..9bd406c84 --- /dev/null +++ b/Containerfile.gkm-agent-base @@ -0,0 +1,75 @@ +# Common base Containerfile for GKM agents +# This file contains the shared builder and base runtime stages +# GPU-specific Containerfiles currently duplicate these stages with references +# to this file for maintenance purposes. +# +# Future Enhancement: This base image could be built and pushed to Quay to +# improve build efficiency: +# podman build -f Containerfile.gkm-agent-base --target base-runtime \ +# -t quay.io/gkm/agent-runtime-base:latest . +# podman push quay.io/gkm/agent-runtime-base:latest +# +# Then GPU-specific Containerfiles could reference it: +# FROM quay.io/gkm/agent-runtime-base:latest + +# ============================================================================ +# Stage 1: Builder (Common to all agent variants) +# ============================================================================ +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# ============================================================================ +# Stage 2: Base Runtime (Common runtime 
dependencies) +# ============================================================================ +FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base-runtime + +# Install required runtime libraries for CGO and agent operation +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu index 869108e09..a33172481 100644 --- a/Containerfile.gkm-agent-nogpu +++ b/Containerfile.gkm-agent-nogpu @@ -1,4 +1,7 @@ -# Build the agent binary +# ============================================================================ +# Stage 1: Builder (Shared across all agent variants) +# See Containerfile.gkm-agent-base for the common base stages +# ============================================================================ FROM public.ecr.aws/docker/library/golang:1.25 AS builder WORKDIR /workspace @@ -30,13 +33,18 @@ COPY Makefile Makefile # Build the agent binary RUN make build-gkm-agent -# Use minimal Ubuntu base image for no-GPU environments +# ============================================================================ +# Stage 2: No-GPU Runtime (minimal footprint) +# ============================================================================ + +# Use minimal Ubuntu base (no GPU libraries needed) FROM public.ecr.aws/docker/library/ubuntu:24.04 # Copy the binary from the builder COPY --from=builder /workspace/bin/gkm-agent /agent -# Install required runtime libraries for CGO +# Install common runtime libraries (shared with other agent variants) +# No GPU-specific dependencies required for this variant RUN apt-get update && \ apt-get install -y \ ca-certificates \ diff --git a/Containerfile.gkm-agent-nvidia 
b/Containerfile.gkm-agent-nvidia index 28e6b836a..1d06fb06d 100644 --- a/Containerfile.gkm-agent-nvidia +++ b/Containerfile.gkm-agent-nvidia @@ -1,4 +1,7 @@ -# Build the agent binary +# ============================================================================ +# Stage 1: Builder (Shared across all agent variants) +# See Containerfile.gkm-agent-base for the common base stages +# ============================================================================ FROM public.ecr.aws/docker/library/golang:1.25 AS builder WORKDIR /workspace @@ -30,13 +33,17 @@ COPY Makefile Makefile # Build the agent binary RUN make build-gkm-agent -# Use NVIDIA CUDA runtime base image for GPU support +# ============================================================================ +# Stage 2: NVIDIA-specific Runtime +# ============================================================================ + +# Use NVIDIA CUDA runtime base image (includes NVML libraries) FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 # Copy the binary from the builder COPY --from=builder /workspace/bin/gkm-agent /agent -# Install required runtime libraries for CGO +# Install common runtime libraries (shared with other agent variants) RUN apt-get update && \ apt-get install -y \ ca-certificates \ @@ -59,8 +66,8 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* -# The NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) -# No additional NVIDIA packages needed +# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional GPU-specific packages needed # Run as non-root user USER 65532:65532 diff --git a/Makefile b/Makefile index 1d023c639..8a2e3dc1e 100644 --- a/Makefile +++ b/Makefile @@ -77,6 +77,7 @@ REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) +AGENT_BASE_IMG ?= $(REPO)/agent-base:$(IMAGE_TAG) AGENT_NVIDIA_IMG ?= 
$(REPO)/agent-nvidia:$(IMAGE_TAG) AGENT_AMD_IMG ?= $(REPO)/agent-amd:$(IMAGE_TAG) AGENT_NOGPU_IMG ?= $(REPO)/agent-nogpu:$(IMAGE_TAG) @@ -222,6 +223,10 @@ build-image-operator: build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . +.PHONY: build-image-agent-base +build-image-agent-base: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target base-runtime -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . + .PHONY: build-image-agent-nvidia build-image-agent-nvidia: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t ${AGENT_NVIDIA_IMG} . @@ -236,9 +241,9 @@ build-image-agent-nogpu: .PHONY: build-image-agents ifeq ($(NO_GPU_BUILD),true) -build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) +build-image-agents: build-image-agent-base build-image-agent-nogpu ## Build base and no-GPU agent only (NO_GPU_BUILD=true) else -build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) +build-image-agents: build-image-agent-base build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (base, NVIDIA, AMD, and no-GPU) endif # If you wish to build the operator image targeting other platforms you can use the --platform flag. @@ -251,6 +256,7 @@ build-images: build-image-operator build-image-agents build-image-gkm-extract ## push-images: ## Push all container images. 
$(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} + $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} ifeq ($(NO_GPU_BUILD),true) $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else @@ -261,10 +267,12 @@ endif .PHONY: push-images-agents ifeq ($(NO_GPU_BUILD),true) -push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) +push-images-agents: ## Push base and no-GPU agent only (NO_GPU_BUILD=true) + $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else push-images-agents: ## Push all agent images + $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} diff --git a/docs/GettingStartedGuide.md b/docs/GettingStartedGuide.md index 196157152..a75ee7104 100644 --- a/docs/GettingStartedGuide.md +++ b/docs/GettingStartedGuide.md @@ -12,13 +12,15 @@ building GKM and description of how to deploy GKM. ### Automated Installation (RHEL 10 / CentOS Stream 10) -For RHEL 10 or CentOS Stream 10 systems, you can install all dependencies (including go, podman, kubectl, and build packages) using: +For RHEL 10 or CentOS Stream 10 systems, you can install all +dependencies (including go, podman, kubectl, and build packages) using: ```sh make install-deps ``` This will: + - Install system development packages (gpgme-devel, libdrm-devel, hwloc-devel) - Install btrfs development headers - Install or upgrade Go to v1.25.0+ if needed @@ -36,9 +38,11 @@ sudo dnf install -y gpgme-devel libdrm-devel libbtrfs btrfs-progs \ btrfs-progs-devel hwloc hwloc-devel ``` -> **Note for RHEL 10**: Some packages may not be available in standard repositories. -> Use `make install-deps` or see [hack/install_deps.sh](../hack/install_deps.sh) for the installation script -> that sources packages from CentOS Stream 10 and Fedora repositories. +> **Note for RHEL 10**: Some packages may not be available in standard +> repositories. 
Use `make install-deps` or see +> [hack/install_deps.sh](../hack/install_deps.sh) for the installation +> script that sources packages from CentOS Stream 10 and Fedora +> repositories. **For Debian/Ubuntu:** diff --git a/examples/namespace/RWO-NVIDIA/README.md b/examples/namespace/RWO-NVIDIA/README.md index c3a3f4644..96f4800ba 100644 --- a/examples/namespace/RWO-NVIDIA/README.md +++ b/examples/namespace/RWO-NVIDIA/README.md @@ -1,6 +1,7 @@ # NVIDIA GPU Examples for GKM (ReadWriteOnce) -This directory contains examples for deploying GKM with NVIDIA GPU support using ReadWriteOnce (RWO) access mode. +This directory contains examples for deploying GKM with NVIDIA GPU support +using ReadWriteOnce (RWO) access mode. ## Prerequisites @@ -18,7 +19,9 @@ Before deploying, verify your storage class: kubectl get sc ``` -Update the `storageClassName` field in [11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage class. +Update the `storageClassName` field in +[11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage +class. ## Deployment @@ -31,16 +34,19 @@ kubectl apply -f examples/namespace/RWO-NVIDIA/ ### Option 2: Deploy Step by Step 1. Create the namespace: + ```bash kubectl apply -f 10-namespace.yaml ``` 2. Create the GKMCache resource: + ```bash kubectl apply -f 11-gkmcache.yaml ``` 3. 
Wait for the PVC to be created and bound: + ```bash kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 -w ``` @@ -52,23 +58,27 @@ kubectl apply -f examples/namespace/RWO-NVIDIA/ ## Verification Check the GKMCache status: + ```bash kubectl get gkmcache -n gkm-test-ns-nvidia-rwo-1 kubectl describe gkmcache vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 ``` Check the PVC: + ```bash kubectl get pvc -n gkm-test-ns-nvidia-rwo-1 ``` Check the extraction job: + ```bash kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 kubectl get pods -n gkm-test-ns-nvidia-rwo-1 ``` Check the test workload: + ```bash # For Pod kubectl get pod gkm-test-nvidia-pod-1 -n gkm-test-ns-nvidia-rwo-1 @@ -80,6 +90,7 @@ kubectl get pods -n gkm-test-ns-nvidia-rwo-1 -l name=gkm-test-nvidia-rwo-ds-1 ``` Verify the cache is mounted: + ```bash kubectl exec -it -n gkm-test-ns-nvidia-rwo-1 gkm-test-nvidia-pod-1 -- ls -la /cache ``` @@ -95,19 +106,23 @@ kubectl describe pvc vector-add-cache-cuda-rwo -n gkm-test-ns-nvidia-rwo-1 ``` Common issues: + - Storage class not available or incorrect - No nodes match the node selector -- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a pod using it is scheduled) +- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a + pod using it is scheduled) ### Extraction Job Not Scheduling Check the extraction job: + ```bash kubectl get jobs -n gkm-test-ns-nvidia-rwo-1 kubectl describe job -n gkm-test-ns-nvidia-rwo-1 ``` Check for pod scheduling issues: + ```bash kubectl get events -n gkm-test-ns-nvidia-rwo-1 --sort-by='.lastTimestamp' ``` @@ -117,9 +132,11 @@ kubectl get events -n gkm-test-ns-nvidia-rwo-1 --sort-by='.lastTimestamp' If your cluster doesn't have NFD labels, you can either: 1. Install and configure NFD (recommended) -2. Remove the `affinity` section from the pod/daemonset specs and use a simpler node selector or label your GPU nodes manually +2. 
Remove the `affinity` section from the pod/daemonset specs and use a + simpler node selector or label your GPU nodes manually Example without NFD: + ```yaml nodeSelector: your-gpu-label: "true" # Use whatever label identifies your GPU nodes diff --git a/gkm-codespell.precommit-toml b/gkm-codespell.precommit-toml index 76f856152..e1472179e 100644 --- a/gkm-codespell.precommit-toml +++ b/gkm-codespell.precommit-toml @@ -1,3 +1,3 @@ [tool.codespell] -ignore-words-list = "AfterAll,renderD" +ignore-words-list = "AfterAll,renderD,aCI" skip = './.*,vendor/*,go.sum' From 50c5601bb9b89ceeabbefd1a6d9429b07655ae8e Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:41:03 +0000 Subject: [PATCH 11/25] fix: resolve yamllint errors in NVIDIA example YAMLs - Move inline comment out of matchExpressions list to avoid yamllint warnings - Fix indentation of nodeSelectorTerms and matchExpressions items - Ensure consistent 2-space indentation for YAML list items This resolves the pre-commit yamllint hook failures for: - examples/namespace/RWO-NVIDIA/12-ds.yaml - examples/namespace/RWO-NVIDIA/13-pod.yaml Co-Authored-By: Claude Sonnet 4.5 --- examples/namespace/RWO-NVIDIA/12-ds.yaml | 14 +++++++------- examples/namespace/RWO-NVIDIA/13-pod.yaml | 14 +++++++------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/namespace/RWO-NVIDIA/12-ds.yaml b/examples/namespace/RWO-NVIDIA/12-ds.yaml index cd06dbf9f..446e94e03 100644 --- a/examples/namespace/RWO-NVIDIA/12-ds.yaml +++ b/examples/namespace/RWO-NVIDIA/12-ds.yaml @@ -22,17 +22,17 @@ spec: effect: NoSchedule # Node affinity to schedule only on NVIDIA GPU nodes + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) - - key: feature.node.kubernetes.io/pci-0300_10de.present - operator: Exists - - 
matchExpressions: - - key: feature.node.kubernetes.io/pci-0302_10de.present - operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: test diff --git a/examples/namespace/RWO-NVIDIA/13-pod.yaml b/examples/namespace/RWO-NVIDIA/13-pod.yaml index 59140b6c3..c74231f3e 100644 --- a/examples/namespace/RWO-NVIDIA/13-pod.yaml +++ b/examples/namespace/RWO-NVIDIA/13-pod.yaml @@ -11,17 +11,17 @@ spec: effect: NoSchedule # Node affinity to schedule only on NVIDIA GPU nodes + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) affinity: nodeAffinity: requiredDuringSchedulingIgnoredDuringExecution: nodeSelectorTerms: - - matchExpressions: - # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) - - key: feature.node.kubernetes.io/pci-0300_10de.present - operator: Exists - - matchExpressions: - - key: feature.node.kubernetes.io/pci-0302_10de.present - operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: test From 165b7b34aa81aac9eab41f2a3d08a16be501bf15 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:45:25 +0000 Subject: [PATCH 12/25] refactor: restructure RWO examples into organized subdirectories MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move examples from flat structure to organized hierarchy: - examples/namespace/RWO-NVIDIA/ → examples/namespace/RWO/CUDA/ - examples/namespace/RWO-ROCM/ → examples/namespace/RWO/ROCM/ This change: - Updates README paths to reflect new directory structure - Includes yamllint fixes (proper indentation and comment placement) - Maintains consistent example organization under RWO/ 
Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- examples/namespace/{RWO-NVIDIA => RWO/CUDA}/10-namespace.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/11-gkmcache.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/12-ds.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/13-pod.yaml | 0 examples/namespace/{RWO-NVIDIA => RWO/CUDA}/README.md | 4 ++-- examples/namespace/{RWO-ROCM => RWO/ROCM}/10-namespace.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/11-gkmcache.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/12-ds.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/13-ds.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/14-ds.yaml | 0 .../{RWO-ROCM => RWO/ROCM}/21-gkmcache-cosign-v3.yaml | 0 examples/namespace/{RWO-ROCM => RWO/ROCM}/22-ds.yaml | 0 12 files changed, 2 insertions(+), 2 deletions(-) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/10-namespace.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/11-gkmcache.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/12-ds.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/13-pod.yaml (100%) rename examples/namespace/{RWO-NVIDIA => RWO/CUDA}/README.md (96%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/10-namespace.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/11-gkmcache.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/12-ds.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/13-ds.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/14-ds.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/21-gkmcache-cosign-v3.yaml (100%) rename examples/namespace/{RWO-ROCM => RWO/ROCM}/22-ds.yaml (100%) diff --git a/examples/namespace/RWO-NVIDIA/10-namespace.yaml b/examples/namespace/RWO/CUDA/10-namespace.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/10-namespace.yaml rename to examples/namespace/RWO/CUDA/10-namespace.yaml diff --git 
a/examples/namespace/RWO-NVIDIA/11-gkmcache.yaml b/examples/namespace/RWO/CUDA/11-gkmcache.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/11-gkmcache.yaml rename to examples/namespace/RWO/CUDA/11-gkmcache.yaml diff --git a/examples/namespace/RWO-NVIDIA/12-ds.yaml b/examples/namespace/RWO/CUDA/12-ds.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/12-ds.yaml rename to examples/namespace/RWO/CUDA/12-ds.yaml diff --git a/examples/namespace/RWO-NVIDIA/13-pod.yaml b/examples/namespace/RWO/CUDA/13-pod.yaml similarity index 100% rename from examples/namespace/RWO-NVIDIA/13-pod.yaml rename to examples/namespace/RWO/CUDA/13-pod.yaml diff --git a/examples/namespace/RWO-NVIDIA/README.md b/examples/namespace/RWO/CUDA/README.md similarity index 96% rename from examples/namespace/RWO-NVIDIA/README.md rename to examples/namespace/RWO/CUDA/README.md index 96f4800ba..16fbe1080 100644 --- a/examples/namespace/RWO-NVIDIA/README.md +++ b/examples/namespace/RWO/CUDA/README.md @@ -28,7 +28,7 @@ class. 
### Option 1: Deploy All Resources ```bash -kubectl apply -f examples/namespace/RWO-NVIDIA/ +kubectl apply -f examples/namespace/RWO/CUDA/ ``` ### Option 2: Deploy Step by Step @@ -145,5 +145,5 @@ nodeSelector: ## Cleanup ```bash -kubectl delete -f examples/namespace/RWO-NVIDIA/ +kubectl delete -f examples/namespace/RWO/CUDA/ ``` diff --git a/examples/namespace/RWO-ROCM/10-namespace.yaml b/examples/namespace/RWO/ROCM/10-namespace.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/10-namespace.yaml rename to examples/namespace/RWO/ROCM/10-namespace.yaml diff --git a/examples/namespace/RWO-ROCM/11-gkmcache.yaml b/examples/namespace/RWO/ROCM/11-gkmcache.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/11-gkmcache.yaml rename to examples/namespace/RWO/ROCM/11-gkmcache.yaml diff --git a/examples/namespace/RWO-ROCM/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/12-ds.yaml rename to examples/namespace/RWO/ROCM/12-ds.yaml diff --git a/examples/namespace/RWO-ROCM/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/13-ds.yaml rename to examples/namespace/RWO/ROCM/13-ds.yaml diff --git a/examples/namespace/RWO-ROCM/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/14-ds.yaml rename to examples/namespace/RWO/ROCM/14-ds.yaml diff --git a/examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/21-gkmcache-cosign-v3.yaml rename to examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml diff --git a/examples/namespace/RWO-ROCM/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml similarity index 100% rename from examples/namespace/RWO-ROCM/22-ds.yaml rename to examples/namespace/RWO/ROCM/22-ds.yaml From a342885c1d56dfe02660765fba54343ae448b1ae 
Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 13:52:09 +0000 Subject: [PATCH 13/25] fix: load actual agent images instead of non-existent AGENT_IMG in kind-load-images The kind-load-images target was attempting to load ${AGENT_IMG} which is never built. Updated to load the actual agent images based on NO_GPU_BUILD flag: AGENT_BASE_IMG, AGENT_NOGPU_IMG (always), and AGENT_NVIDIA_IMG/AGENT_AMD_IMG (when NO_GPU_BUILD=false). Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 8a2e3dc1e..87848a4de 100644 --- a/Makefile +++ b/Makefile @@ -603,8 +603,19 @@ setup-kind: kind-gpu-sim-script kind-load-images: kind-gpu-sim-script get-example-images @echo "Loading operator image ${OPERATOR_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${OPERATOR_IMG} --cluster-name=$(KIND_CLUSTER_NAME) - @echo "Loading agent image ${AGENT_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" - cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent base image ${AGENT_BASE_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_BASE_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +ifeq ($(NO_GPU_BUILD),true) + @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +else + @echo "Loading agent nvidia image ${AGENT_NVIDIA_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NVIDIA_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent amd image ${AGENT_AMD_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load 
--image-name=${AGENT_AMD_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +endif @echo "Loading gkm-extract image ${EXTRACT_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${EXTRACT_IMG} --cluster-name=$(KIND_CLUSTER_NAME) @echo "Images loaded successfully into Kind cluster: $(KIND_CLUSTER_NAME)" From bf397b07a23b3262778dcafd8b5c3a518a092b45 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 14:45:32 +0000 Subject: [PATCH 14/25] kind: fix kyverno deployment Signed-off-by: Maryam Tahhan --- Makefile | 6 +++++- config/agent/kustomization.yaml | 12 ++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index 87848a4de..76ef5da77 100644 --- a/Makefile +++ b/Makefile @@ -557,7 +557,11 @@ deploy-kyverno-production: helm ## Deploy Kyverno for production clusters (no Ki @echo "Kyverno deployed successfully." .PHONY: deploy-kyverno-with-policies -deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies +ifeq ($(NO_GPU),true) +deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses NO_GPU values for Kind) +else +deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) +endif @echo "Restarting Kyverno to discover GKM CRDs..." 
@$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index 47a1d4be1..b5f7f6699 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -4,17 +4,17 @@ kind: Kustomization # Deploy GPU-specific agents based on node hardware # Requires Node Feature Discovery (NFD) to label nodes resources: -- gkm-agent-nvidia.yaml # NVIDIA GPU nodes -- gkm-agent-amd.yaml # AMD GPU nodes -- gkm-agent-nogpu.yaml # Nodes without GPUs +- gkm-agent-nvidia.yaml +- gkm-agent-amd.yaml +- gkm-agent-nogpu.yaml images: -- name: quay.io/gkm/agent-nvidia - newName: quay.io/gkm/agent-nvidia - newTag: latest - name: quay.io/gkm/agent-amd newName: quay.io/gkm/agent-amd newTag: latest - name: quay.io/gkm/agent-nogpu newName: quay.io/gkm/agent-nogpu newTag: latest +- name: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/agent-nvidia + newTag: latest From 502ecb84db81d6105f7046340c085f1490b022fd Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 14:53:59 +0000 Subject: [PATCH 15/25] makefile: cleanup kyverno targets Signed-off-by: Maryam Tahhan --- Makefile | 76 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/Makefile b/Makefile index 76ef5da77..256cbe631 100644 --- a/Makefile +++ b/Makefile @@ -514,9 +514,15 @@ undeploy-cert-manager: delete-webhook-secret-file ##@ Kyverno KYVERNO_VERSION ?= latest +KYVERNO_NAMESPACE ?= kyverno +KYVERNO_REPO ?= https://kyverno.github.io/kyverno/ HELM_VERSION ?= v3.16.3 HELM ?= $(LOCALBIN)/helm +# Common Kyverno helm flags +KYVERNO_HELM_FLAGS = --namespace $(KYVERNO_NAMESPACE) --create-namespace --repo $(KYVERNO_REPO) kyverno --wait +KYVERNO_KIND_CONTEXT = --kube-context kind-$(KIND_CLUSTER_NAME) + .PHONY: helm helm: $(HELM) ## 
Download helm locally if necessary. $(HELM): $(LOCALBIN) @@ -525,37 +531,33 @@ $(HELM): $(LOCALBIN) curl -sSL https://get.helm.sh/helm-$(HELM_VERSION)-$(GOOS)-$(GOARCH).tar.gz | tar xz -C $(LOCALBIN) --strip-components=1 $(GOOS)-$(GOARCH)/helm ; \ } -.PHONY: deploy-kyverno -deploy-kyverno: helm ## Deploy Kyverno with optional GPU tolerations for Kind cluster - @echo "Installing Kyverno to cluster $(KIND_CLUSTER_NAME)..." +# Internal target for deploying Kyverno with configurable context +.PHONY: _deploy-kyverno-base +_deploy-kyverno-base: helm + @echo "Installing Kyverno..." ifeq ($(NO_GPU),true) @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (NO_GPU=true)..." - $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values-no-gpu.yaml \ - --wait + $(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ + --values config/kyverno/values-no-gpu.yaml else - @echo "Using default Kyverno configuration for production GPU environments..." - $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values.yaml \ - --wait + @echo "Using default Kyverno configuration..." + $(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ + --values config/kyverno/values.yaml endif - @echo "Kyverno deployed successfully to $(KIND_CLUSTER_NAME)." - -.PHONY: deploy-kyverno-production -deploy-kyverno-production: helm ## Deploy Kyverno for production clusters (no Kind context) - @echo "Installing Kyverno..." 
- $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values.yaml \ - --wait +ifdef KYVERNO_WAIT @echo "Waiting for Kyverno to be ready..." - @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n $(KYVERNO_NAMESPACE) deployment/kyverno-admission-controller || true +endif @echo "Kyverno deployed successfully." +.PHONY: deploy-kyverno +deploy-kyverno: ## Deploy Kyverno for Kind cluster + @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="$(KYVERNO_KIND_CONTEXT)" + +.PHONY: deploy-kyverno-production +deploy-kyverno-production: ## Deploy Kyverno for production clusters + @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="" KYVERNO_WAIT=true + .PHONY: deploy-kyverno-with-policies ifeq ($(NO_GPU),true) deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses NO_GPU values for Kind) @@ -563,8 +565,8 @@ else deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) endif @echo "Restarting Kyverno to discover GKM CRDs..." - @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno - @$(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true + @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n $(KYVERNO_NAMESPACE) + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n $(KYVERNO_NAMESPACE) deployment/kyverno-admission-controller || true @echo "Kyverno and policies deployed successfully." 
.PHONY: deploy-kyverno-policies @@ -579,21 +581,21 @@ undeploy-kyverno-policies: kustomize ## Undeploy Kyverno ClusterPolicies $(KUSTOMIZE) build config/kyverno/policies | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - @echo "Kyverno policies undeployed." +# Internal target for undeploying Kyverno with configurable context +.PHONY: _undeploy-kyverno-base +_undeploy-kyverno-base: + @echo "Uninstalling Kyverno..." + $(HELM) uninstall kyverno --namespace $(KYVERNO_NAMESPACE) $(KYVERNO_CONTEXT) --ignore-not-found || true + $(KUBECTL) delete namespace $(KYVERNO_NAMESPACE) --ignore-not-found=$(ignore-not-found) + @echo "Kyverno undeployed." + .PHONY: undeploy-kyverno -undeploy-kyverno: ## Undeploy Kyverno - @echo "Uninstalling Kyverno from cluster $(KIND_CLUSTER_NAME)..." - $(HELM) uninstall kyverno --namespace kyverno \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --ignore-not-found || true - $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) - @echo "Kyverno undeployed from $(KIND_CLUSTER_NAME)." +undeploy-kyverno: ## Undeploy Kyverno from Kind cluster + @$(MAKE) _undeploy-kyverno-base KYVERNO_CONTEXT="$(KYVERNO_KIND_CONTEXT)" .PHONY: undeploy-kyverno-production undeploy-kyverno-production: ## Undeploy Kyverno from production cluster - @echo "Uninstalling Kyverno..." - $(HELM) uninstall kyverno --namespace kyverno --ignore-not-found || true - $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) - @echo "Kyverno undeployed." + @$(MAKE) _undeploy-kyverno-base KYVERNO_CONTEXT="" ##@ Kind Cluster Management From 9471b38e6cbae0b7f0e91582c64617dca8f075df Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 15:44:49 +0000 Subject: [PATCH 16/25] fix: resolve Kind deployment failures on GPU-tainted nodes Fixes Kyverno and NFD component scheduling issues in Kind clusters with GPU taints by adding proper tolerations and removing duplicate deployments. 
Changes: - Use Kind-specific Kyverno values when NO_GPU=true in deploy target - Remove duplicate Kyverno deployment from run-on-kind target - Add GPU tolerations for Kyverno hooks/migration jobs - Add GPU tolerations for NFD garbage collector and workers Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 11 ----------- config/kyverno/values-no-gpu.yaml | 10 ++++++++++ config/nfd/kustomization.yaml | 8 ++++---- config/nfd/patch-nfd-gc.yaml | 13 +++++++++++++ config/nfd/patch-nfd-workers.yaml | 13 +++++++++++++ 5 files changed, 40 insertions(+), 15 deletions(-) create mode 100644 config/nfd/patch-nfd-gc.yaml create mode 100644 config/nfd/patch-nfd-workers.yaml diff --git a/Makefile b/Makefile index 256cbe631..a7c63947c 100644 --- a/Makefile +++ b/Makefile @@ -633,17 +633,6 @@ tmp-cleanup: .PHONY: run-on-kind run-on-kind: destroy-kind setup-kind deploy-on-kind ## Setup Kind cluster, load images, and deploy -ifeq ($(KYVERNO_ENABLED),true) - @echo "Deploying Kyverno after GKM CRDs (KYVERNO_ENABLED=true)..." - $(MAKE) deploy-kyverno NO_GPU=true - @echo "Waiting for Kyverno to be ready..." - $(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true - @echo "Deploying Kyverno policies..." - $(MAKE) deploy-kyverno-policies - @echo "Restarting Kyverno to discover GKM CRDs..." - $(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno - $(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller -endif @echo "Cluster created, images loaded, and agent deployed on Kind GPU cluster." 
.PHONY: deploy-on-kind diff --git a/config/kyverno/values-no-gpu.yaml b/config/kyverno/values-no-gpu.yaml index 3f086fbd7..28606d392 100644 --- a/config/kyverno/values-no-gpu.yaml +++ b/config/kyverno/values-no-gpu.yaml @@ -33,3 +33,13 @@ reportsController: operator: Equal value: "true" effect: NoSchedule + +# Jobs (e.g., migration resources) also need tolerations +hooks: + nodeSelector: + hardware-type: gpu + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml index 684f558c6..7490a2ea6 100644 --- a/config/nfd/kustomization.yaml +++ b/config/nfd/kustomization.yaml @@ -6,7 +6,7 @@ kind: Kustomization resources: - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.2 -# Optional: Add custom NFD configuration -# Uncomment if you need to customize NFD behavior -# patchesStrategicMerge: -# - nfd-worker-conf.yaml +# Patches for GPU-tainted nodes (Kind cluster) +patchesStrategicMerge: + - patch-nfd-gc.yaml + - patch-nfd-workers.yaml diff --git a/config/nfd/patch-nfd-gc.yaml b/config/nfd/patch-nfd-gc.yaml new file mode 100644 index 000000000..b717c07e2 --- /dev/null +++ b/config/nfd/patch-nfd-gc.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nfd-gc + namespace: node-feature-discovery +spec: + template: + spec: + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/nfd/patch-nfd-workers.yaml b/config/nfd/patch-nfd-workers.yaml new file mode 100644 index 000000000..c8fa653cd --- /dev/null +++ b/config/nfd/patch-nfd-workers.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nfd-worker + namespace: node-feature-discovery +spec: + template: + spec: + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule From 1eb484647831f0a9c6a7d860e403c24031e04bd9 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: 
Mon, 16 Mar 2026 17:49:03 +0000 Subject: [PATCH 17/25] fix: skip NFD deployment for Kind clusters and use device plugin labels NFD is unnecessary in Kind simulated GPU environments. Instead, patch agent daemonsets to use GPU device plugin labels (rocm.amd.com/gpu.present, nvidia.com/gpu.present) for node affinity rather than NFD's PCI device labels. Changes: - Skip NFD deployment when NO_GPU=true (Kind clusters) - Skip NFD undeployment when NO_GPU=true - Add Kind-specific agent patches using device plugin labels - Patch gkm-agent-amd to use rocm.amd.com/gpu.present label - Patch gkm-agent-nvidia to use nvidia.com/gpu.present label - Patch gkm-agent-nogpu to exclude nodes with GPU labels Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 6 ++++++ config/kind-gpu/agent-amd-patch.yaml | 17 +++++++++++++++++ config/kind-gpu/agent-nogpu-patch.yaml | 19 +++++++++++++++++++ config/kind-gpu/agent-nvidia-patch.yaml | 17 +++++++++++++++++ config/kind-gpu/kustomization.yaml | 3 +++ 5 files changed, 62 insertions(+) create mode 100644 config/kind-gpu/agent-amd-patch.yaml create mode 100644 config/kind-gpu/agent-nogpu-patch.yaml create mode 100644 config/kind-gpu/agent-nvidia-patch.yaml diff --git a/Makefile b/Makefile index a7c63947c..2e78740f4 100644 --- a/Makefile +++ b/Makefile @@ -385,7 +385,11 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy +ifeq ($(NO_GPU),true) +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to Kind cluster (skips NFD) +else deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +endif ifeq ($(KYVERNO_ENABLED),true) @echo "Deploying Kyverno (KYVERNO_ENABLED=true)..." 
$(MAKE) deploy-kyverno-with-policies @@ -409,8 +413,10 @@ ifeq ($(KYVERNO_ENABLED),true) -$(MAKE) undeploy-kyverno-policies -$(MAKE) undeploy-kyverno-production endif +ifneq ($(NO_GPU),true) @echo "Undeploying NFD..." -$(MAKE) undeploy-nfd +endif @echo "Undeployment from $(DEPLOY_PATH) completed." .PHONY: undeploy-force diff --git a/config/kind-gpu/agent-amd-patch.yaml b/config/kind-gpu/agent-amd-patch.yaml new file mode 100644 index 000000000..43ee255e8 --- /dev/null +++ b/config/kind-gpu/agent-amd-patch.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-amd + namespace: gkm-system +spec: + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: rocm.amd.com/gpu.present + operator: Exists + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/config/kind-gpu/agent-nogpu-patch.yaml b/config/kind-gpu/agent-nogpu-patch.yaml new file mode 100644 index 000000000..47f407916 --- /dev/null +++ b/config/kind-gpu/agent-nogpu-patch.yaml @@ -0,0 +1,19 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nogpu + namespace: gkm-system +spec: + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: rocm.amd.com/gpu.present + operator: DoesNotExist + - key: nvidia.com/gpu.present + operator: DoesNotExist + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/config/kind-gpu/agent-nvidia-patch.yaml b/config/kind-gpu/agent-nvidia-patch.yaml new file mode 100644 index 000000000..4f95a5f94 --- /dev/null +++ b/config/kind-gpu/agent-nvidia-patch.yaml @@ -0,0 +1,17 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nvidia + namespace: gkm-system +spec: + template: + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - 
matchExpressions: + - key: nvidia.com/gpu.present + operator: Exists + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index d81e686c7..e4effe2f8 100644 --- a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -10,3 +10,6 @@ patches: kind: DaemonSet name: gkm-agent path: agent-patch.yaml + - path: agent-amd-patch.yaml + - path: agent-nvidia-patch.yaml + - path: agent-nogpu-patch.yaml From 07a00a028c2a50a6d22f1c07e157b711f065a224 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:13:08 +0000 Subject: [PATCH 18/25] fix: separate SKIP_NFD and NO_GPU flags, simulate NFD labels in Kind Introduces SKIP_NFD flag to control NFD deployment separately from NO_GPU mode. For Kind clusters, we skip NFD deployment but simulate it by manually adding PCI device labels that NFD would normally create. Changes: - Add SKIP_NFD flag (default: false) to control NFD deployment - Use SKIP_NFD instead of NO_GPU for controlling NFD deployment/undeploy - Auto-label Kind worker nodes with NFD PCI device labels (nvidia/rocm) - Keep NO_GPU=true for Kind to use no-GPU agent mode - Remove device plugin label patches (revert to NFD PCI labels) - Update deploy-on-kind to pass both SKIP_NFD=true and NO_GPU=true Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 33 ++++++++++++++++--------- config/kind-gpu/agent-amd-patch.yaml | 17 ------------- config/kind-gpu/agent-nogpu-patch.yaml | 19 -------------- config/kind-gpu/agent-nvidia-patch.yaml | 17 ------------- config/kind-gpu/kustomization.yaml | 3 --- 5 files changed, 22 insertions(+), 67 deletions(-) delete mode 100644 config/kind-gpu/agent-amd-patch.yaml delete mode 100644 config/kind-gpu/agent-nogpu-patch.yaml delete mode 100644 config/kind-gpu/agent-nvidia-patch.yaml diff --git a/Makefile b/Makefile index 2e78740f4..9b0b4554f 100644 --- a/Makefile +++ b/Makefile @@ 
-19,6 +19,9 @@ CONTAINER_FLAGS ?= --build-arg TARGETARCH=$(ARCH) # NO_GPU flag for building without GPU support NO_GPU_BUILD ?= false +# SKIP_NFD flag for skipping NFD deployment (e.g., Kind clusters) +SKIP_NFD ?= false + # KYVERNO_ENABLED flag for enabling/disabling Kyverno verification (runtime only) KYVERNO_ENABLED ?= true @@ -385,8 +388,8 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy -ifeq ($(NO_GPU),true) -deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to Kind cluster (skips NFD) +ifeq ($(SKIP_NFD),true) +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent (skips NFD for Kind) else deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config endif @@ -413,7 +416,7 @@ ifeq ($(KYVERNO_ENABLED),true) -$(MAKE) undeploy-kyverno-policies -$(MAKE) undeploy-kyverno-production endif -ifneq ($(NO_GPU),true) +ifneq ($(SKIP_NFD),true) @echo "Undeploying NFD..." -$(MAKE) undeploy-nfd endif @@ -541,8 +544,8 @@ $(HELM): $(LOCALBIN) .PHONY: _deploy-kyverno-base _deploy-kyverno-base: helm @echo "Installing Kyverno..." -ifeq ($(NO_GPU),true) - @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (NO_GPU=true)..." +ifeq ($(SKIP_NFD),true) + @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (SKIP_NFD=true for Kind)..." 
$(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ --values config/kyverno/values-no-gpu.yaml else @@ -565,8 +568,8 @@ deploy-kyverno-production: ## Deploy Kyverno for production clusters @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="" KYVERNO_WAIT=true .PHONY: deploy-kyverno-with-policies -ifeq ($(NO_GPU),true) -deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses NO_GPU values for Kind) +ifeq ($(SKIP_NFD),true) +deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses Kind values with GPU tolerations) else deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) endif @@ -647,17 +650,25 @@ deploy-on-kind: kind-load-images tmp-cleanup $(KUBECTL) label node kind-gpu-sim-worker gkm-test-node=true --overwrite @echo "Add label gkm-test-node=false to node kind-gpu-sim-worker2." $(KUBECTL) label node kind-gpu-sim-worker2 gkm-test-node=false --overwrite - ## NOTE: config/kind-gpu is an overlay of config/default - $(MAKE) deploy DEPLOY_PATH=config/kind-gpu NO_GPU=true + @echo "Add NFD PCI device labels for $(GPU_TYPE) GPUs to worker nodes..." 
+ifeq ($(GPU_TYPE),nvidia) + $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite + $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite +else ifeq ($(GPU_TYPE),rocm) + $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite + $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite +endif + ## NOTE: config/kind-gpu is an overlay of config/kind-gpu + $(MAKE) deploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true NO_GPU=true .PHONY: redeploy-on-kind redeploy-on-kind: ## Redeploy controller and agent to Kind GPU cluster after run-on-kind and undeploy-on-kind have been called. Skips some onetime steps in deploy. - $(MAKE) redeploy DEPLOY_PATH=config/kind-gpu NO_GPU=true + $(MAKE) redeploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true @echo "Deployment to $(DEPLOY_PATH) completed." .PHONY: undeploy-on-kind undeploy-on-kind: ## Undeploy operator and agent from the Kind GPU cluster. - $(MAKE) undeploy FORCE=$(FORCE) DEPLOY_PATH=config/kind-gpu ignore-not-found=$(ignore-not-found) + $(MAKE) undeploy FORCE=$(FORCE) DEPLOY_PATH=config/kind-gpu SKIP_NFD=true ignore-not-found=$(ignore-not-found) @echo "Undeployment from Kind GPU cluster $(KIND_CLUSTER_NAME) completed." 
.PHONY: undeploy-on-kind-force diff --git a/config/kind-gpu/agent-amd-patch.yaml b/config/kind-gpu/agent-amd-patch.yaml deleted file mode 100644 index 43ee255e8..000000000 --- a/config/kind-gpu/agent-amd-patch.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gkm-agent-amd - namespace: gkm-system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: rocm.amd.com/gpu.present - operator: Exists - - key: node-role.kubernetes.io/control-plane - operator: DoesNotExist diff --git a/config/kind-gpu/agent-nogpu-patch.yaml b/config/kind-gpu/agent-nogpu-patch.yaml deleted file mode 100644 index 47f407916..000000000 --- a/config/kind-gpu/agent-nogpu-patch.yaml +++ /dev/null @@ -1,19 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gkm-agent-nogpu - namespace: gkm-system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: rocm.amd.com/gpu.present - operator: DoesNotExist - - key: nvidia.com/gpu.present - operator: DoesNotExist - - key: node-role.kubernetes.io/control-plane - operator: DoesNotExist diff --git a/config/kind-gpu/agent-nvidia-patch.yaml b/config/kind-gpu/agent-nvidia-patch.yaml deleted file mode 100644 index 4f95a5f94..000000000 --- a/config/kind-gpu/agent-nvidia-patch.yaml +++ /dev/null @@ -1,17 +0,0 @@ -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: gkm-agent-nvidia - namespace: gkm-system -spec: - template: - spec: - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: nvidia.com/gpu.present - operator: Exists - - key: node-role.kubernetes.io/control-plane - operator: DoesNotExist diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index e4effe2f8..d81e686c7 100644 --- 
a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -10,6 +10,3 @@ patches: kind: DaemonSet name: gkm-agent path: agent-patch.yaml - - path: agent-amd-patch.yaml - - path: agent-nvidia-patch.yaml - - path: agent-nogpu-patch.yaml From 6ad2c1622d07764042fd5f8a00b62ab3a109b84f Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:41:39 +0000 Subject: [PATCH 19/25] fix: remove node affinity from nogpu agent for Kind clusters For Kind GPU simulation with NO_GPU=true, remove NFD PCI label requirements by removing node affinity from the nogpu agent. This allows the agent to schedule on all worker nodes without needing NFD labels. Changes: - Remove NFD PCI label addition from deploy-on-kind target - Add Kind-specific patch to remove node affinity from nogpu agent - Fix agent-patch.yaml to target all three agent daemonsets (amd, nvidia, nogpu) - NoGPU agents now schedule successfully in Kind clusters Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 10 +--------- .../kind-gpu/agent-remove-affinity-patch.yaml | 10 ++++++++++ config/kind-gpu/kustomization.yaml | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 10 deletions(-) create mode 100644 config/kind-gpu/agent-remove-affinity-patch.yaml diff --git a/Makefile b/Makefile index 9b0b4554f..4d845fc42 100644 --- a/Makefile +++ b/Makefile @@ -650,15 +650,7 @@ deploy-on-kind: kind-load-images tmp-cleanup $(KUBECTL) label node kind-gpu-sim-worker gkm-test-node=true --overwrite @echo "Add label gkm-test-node=false to node kind-gpu-sim-worker2." $(KUBECTL) label node kind-gpu-sim-worker2 gkm-test-node=false --overwrite - @echo "Add NFD PCI device labels for $(GPU_TYPE) GPUs to worker nodes..." 
-ifeq ($(GPU_TYPE),nvidia) - $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite - $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_10de.present=true --overwrite -else ifeq ($(GPU_TYPE),rocm) - $(KUBECTL) label node kind-gpu-sim-worker feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite - $(KUBECTL) label node kind-gpu-sim-worker2 feature.node.kubernetes.io/pci-0300_1002.present=true --overwrite -endif - ## NOTE: config/kind-gpu is an overlay of config/kind-gpu + ## NOTE: config/kind-gpu is an overlay of config/default $(MAKE) deploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true NO_GPU=true .PHONY: redeploy-on-kind diff --git a/config/kind-gpu/agent-remove-affinity-patch.yaml b/config/kind-gpu/agent-remove-affinity-patch.yaml new file mode 100644 index 000000000..734147088 --- /dev/null +++ b/config/kind-gpu/agent-remove-affinity-patch.yaml @@ -0,0 +1,10 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nogpu + namespace: gkm-system +spec: + template: + spec: + # Remove node affinity for Kind - schedule on all worker nodes + affinity: null diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index d81e686c7..caaca0ba1 100644 --- a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -4,9 +4,24 @@ resources: apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization patches: + # Add GPU tolerations and nodeSelector to all agents - target: group: apps version: v1 kind: DaemonSet - name: gkm-agent + name: gkm-agent-amd path: agent-patch.yaml + - target: + group: apps + version: v1 + kind: DaemonSet + name: gkm-agent-nvidia + path: agent-patch.yaml + - target: + group: apps + version: v1 + kind: DaemonSet + name: gkm-agent-nogpu + path: agent-patch.yaml + # Remove node affinity for nogpu agent in Kind (no NFD labels) + - path: agent-remove-affinity-patch.yaml From 
0ca6f4e62bb1d3a403d6292faaef49b92bbf05dd Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:56:13 +0000 Subject: [PATCH 20/25] fix: standardize namespace and cache naming in ROCM and CUDA examples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make namespace and cache names consistent across ROCM and CUDA examples: - ROCM namespace: gkm-test-ns-rwo-1 → gkm-test-ns-rocm-rwo-1 - ROCM cache: vector-add-cache-rocm-v2-rwo → vector-add-cache-rocm-rwo - ROCM cache v3: vector-add-cache-rocm-v3-rwo → vector-add-cache-rocm-rwo-v3 - CUDA namespace: gkm-test-ns-nvidia-rwo-1 → gkm-test-ns-cuda-rwo-1 - CUDA workloads: gkm-test-nvidia-* → gkm-test-cuda-* Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- examples/namespace/RWO/CUDA/10-namespace.yaml | 2 +- examples/namespace/RWO/CUDA/11-gkmcache.yaml | 2 +- examples/namespace/RWO/CUDA/12-ds.yaml | 8 ++++---- examples/namespace/RWO/CUDA/13-pod.yaml | 4 ++-- examples/namespace/RWO/ROCM/10-namespace.yaml | 2 +- examples/namespace/RWO/ROCM/11-gkmcache.yaml | 4 ++-- examples/namespace/RWO/ROCM/12-ds.yaml | 4 ++-- examples/namespace/RWO/ROCM/13-ds.yaml | 4 ++-- examples/namespace/RWO/ROCM/14-ds.yaml | 4 ++-- examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml | 4 ++-- examples/namespace/RWO/ROCM/22-ds.yaml | 4 ++-- 11 files changed, 21 insertions(+), 21 deletions(-) diff --git a/examples/namespace/RWO/CUDA/10-namespace.yaml b/examples/namespace/RWO/CUDA/10-namespace.yaml index aec06b330..dd97a60f3 100644 --- a/examples/namespace/RWO/CUDA/10-namespace.yaml +++ b/examples/namespace/RWO/CUDA/10-namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: gkm-test-ns-nvidia-rwo-1 + name: gkm-test-ns-cuda-rwo-1 diff --git a/examples/namespace/RWO/CUDA/11-gkmcache.yaml b/examples/namespace/RWO/CUDA/11-gkmcache.yaml index 54cb9729f..d16e27b3f 100644 --- a/examples/namespace/RWO/CUDA/11-gkmcache.yaml +++ 
b/examples/namespace/RWO/CUDA/11-gkmcache.yaml @@ -3,7 +3,7 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: name: vector-add-cache-cuda-rwo - namespace: gkm-test-ns-nvidia-rwo-1 + namespace: gkm-test-ns-cuda-rwo-1 labels: gkm.io/signature-format: cosign-v2 spec: diff --git a/examples/namespace/RWO/CUDA/12-ds.yaml b/examples/namespace/RWO/CUDA/12-ds.yaml index 446e94e03..f4e38910a 100644 --- a/examples/namespace/RWO/CUDA/12-ds.yaml +++ b/examples/namespace/RWO/CUDA/12-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-nvidia-rwo-ds-1 - namespace: gkm-test-ns-nvidia-rwo-1 + name: gkm-test-cuda-rwo-ds-1 + namespace: gkm-test-ns-cuda-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-nvidia-rwo-ds-1 + name: gkm-test-cuda-rwo-ds-1 template: metadata: labels: - name: gkm-test-nvidia-rwo-ds-1 + name: gkm-test-cuda-rwo-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/CUDA/13-pod.yaml b/examples/namespace/RWO/CUDA/13-pod.yaml index c74231f3e..fcb0bbc7d 100644 --- a/examples/namespace/RWO/CUDA/13-pod.yaml +++ b/examples/namespace/RWO/CUDA/13-pod.yaml @@ -2,8 +2,8 @@ kind: Pod apiVersion: v1 metadata: - name: gkm-test-nvidia-pod-1 - namespace: gkm-test-ns-nvidia-rwo-1 + name: gkm-test-cuda-pod-1 + namespace: gkm-test-ns-cuda-rwo-1 spec: tolerations: - key: nvidia.com/gpu diff --git a/examples/namespace/RWO/ROCM/10-namespace.yaml b/examples/namespace/RWO/ROCM/10-namespace.yaml index bc47b15b7..b91919dd6 100644 --- a/examples/namespace/RWO/ROCM/10-namespace.yaml +++ b/examples/namespace/RWO/ROCM/10-namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: gkm-test-ns-rwo-1 + name: gkm-test-ns-rocm-rwo-1 diff --git a/examples/namespace/RWO/ROCM/11-gkmcache.yaml b/examples/namespace/RWO/ROCM/11-gkmcache.yaml index eb81bd8a5..5fb6e2c34 100644 --- a/examples/namespace/RWO/ROCM/11-gkmcache.yaml +++ 
b/examples/namespace/RWO/ROCM/11-gkmcache.yaml @@ -2,8 +2,8 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v2-rwo - namespace: gkm-test-ns-rwo-1 + name: vector-add-cache-rocm-rwo + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/signature-format: cosign-v2 spec: diff --git a/examples/namespace/RWO/ROCM/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml index 738c8bd61..eb992679a 100644 --- a/examples/namespace/RWO/ROCM/12-ds.yaml +++ b/examples/namespace/RWO/ROCM/12-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-ds-1 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: @@ -50,4 +50,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/ROCM/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml index 937e745e1..38acde56c 100644 --- a/examples/namespace/RWO/ROCM/13-ds.yaml +++ b/examples/namespace/RWO/ROCM/13-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-ds-2 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvc-mutation: "true" spec: @@ -51,4 +51,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/ROCM/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml index c6bf50212..64d1a9c78 100644 --- a/examples/namespace/RWO/ROCM/14-ds.yaml +++ b/examples/namespace/RWO/ROCM/14-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-ds-3 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: @@ -51,4 +51,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: 
vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml index 9a091eaf3..6bafc7b42 100644 --- a/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml +++ b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml @@ -2,8 +2,8 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v3-rwo - namespace: gkm-test-ns-rwo-1 + name: vector-add-cache-rocm-rwo-v3 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/signature-format: cosign-v3 spec: diff --git a/examples/namespace/RWO/ROCM/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml index c682f8a2c..414ba05bf 100644 --- a/examples/namespace/RWO/ROCM/22-ds.yaml +++ b/examples/namespace/RWO/ROCM/22-ds.yaml @@ -3,7 +3,7 @@ kind: DaemonSet apiVersion: apps/v1 metadata: name: gkm-test-ns-rwo-v3-ds-1 - namespace: gkm-test-ns-rwo-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: @@ -50,4 +50,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v3-rwo + claimName: vector-add-cache-rocm-rwo-v3 From 4c4d525d17e25526b774a8a148c46d4871f010d4 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 18:57:10 +0000 Subject: [PATCH 21/25] fix: update ROCM daemonset names to match namespace pattern MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Update ROCM workload names to be consistent with namespace naming: - gkm-test-ns-rwo-ds-* → gkm-test-rocm-rwo-ds-* - gkm-test-ns-rwo-v3-ds-* → gkm-test-rocm-rwo-v3-ds-* Now matches CUDA pattern: gkm-test-{vendor}-rwo-* Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- examples/namespace/RWO/ROCM/12-ds.yaml | 6 +++--- examples/namespace/RWO/ROCM/13-ds.yaml | 6 +++--- examples/namespace/RWO/ROCM/14-ds.yaml | 6 +++--- examples/namespace/RWO/ROCM/22-ds.yaml | 6 +++--- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git 
a/examples/namespace/RWO/ROCM/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml index eb992679a..f12550f3d 100644 --- a/examples/namespace/RWO/ROCM/12-ds.yaml +++ b/examples/namespace/RWO/ROCM/12-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 template: metadata: labels: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/ROCM/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml index 38acde56c..bde833fc3 100644 --- a/examples/namespace/RWO/ROCM/13-ds.yaml +++ b/examples/namespace/RWO/ROCM/13-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvc-mutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 template: metadata: labels: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/ROCM/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml index 64d1a9c78..09d1842fd 100644 --- a/examples/namespace/RWO/ROCM/14-ds.yaml +++ b/examples/namespace/RWO/ROCM/14-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 template: metadata: labels: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 gkm.io/pvc-mutation: "true" spec: tolerations: diff --git a/examples/namespace/RWO/ROCM/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml index 
414ba05bf..47ef6d515 100644 --- a/examples/namespace/RWO/ROCM/22-ds.yaml +++ b/examples/namespace/RWO/ROCM/22-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 template: metadata: labels: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: From c67564cc477df432efb127a5e6fae70252c28619 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 19:28:19 +0000 Subject: [PATCH 22/25] images: add gkm prefix to image names Signed-off-by: Maryam Tahhan --- .github/workflows/image-build.yml | 10 +++++----- Makefile | 12 ++++++------ config/agent/gkm-agent-amd.yaml | 2 +- config/agent/gkm-agent-nogpu.yaml | 2 +- config/agent/gkm-agent-nvidia.yaml | 2 +- config/agent/kustomization.yaml | 15 ++++++++++++--- config/configMap/configMap.yaml | 2 +- config/configMap/kustomization.yaml | 2 +- config/operator/kustomization.yaml | 7 +++++-- config/operator/operator.yaml | 2 +- 10 files changed, 34 insertions(+), 22 deletions(-) diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 710c17ef2..4a4cb15c4 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -32,7 +32,7 @@ jobs: image: - registry: quay.io repository: gkm - image: operator + image: gkm-operator dockerfile: ./Containerfile.gkm-operator context: . tags: | @@ -45,7 +45,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-base + image: gkm-agent-base dockerfile: ./Containerfile.gkm-agent-base context: . target: base-runtime @@ -59,7 +59,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-nvidia + image: gkm-agent-nvidia dockerfile: ./Containerfile.gkm-agent-nvidia context: . 
tags: | @@ -72,7 +72,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-amd + image: gkm-agent-amd dockerfile: ./Containerfile.gkm-agent-amd context: . tags: | @@ -85,7 +85,7 @@ jobs: - registry: quay.io repository: gkm - image: agent-nogpu + image: gkm-agent-nogpu dockerfile: ./Containerfile.gkm-agent-nogpu context: . tags: | diff --git a/Makefile b/Makefile index 4d845fc42..db73b896b 100644 --- a/Makefile +++ b/Makefile @@ -77,13 +77,13 @@ OPERATOR_SDK_VERSION ?= v1.39.2 QUAY_USER ?= gkm IMAGE_TAG ?= latest REPO ?= quay.io/$(QUAY_USER) -OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) -AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) +OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) +AGENT_IMG ?=$(REPO)/gkm-agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) -AGENT_BASE_IMG ?= $(REPO)/agent-base:$(IMAGE_TAG) -AGENT_NVIDIA_IMG ?= $(REPO)/agent-nvidia:$(IMAGE_TAG) -AGENT_AMD_IMG ?= $(REPO)/agent-amd:$(IMAGE_TAG) -AGENT_NOGPU_IMG ?= $(REPO)/agent-nogpu:$(IMAGE_TAG) +AGENT_BASE_IMG ?= $(REPO)/gkm-agent-base:$(IMAGE_TAG) +AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) +AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) +AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. 
ENVTEST_K8S_VERSION = 1.31.0 diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml index 6de806096..3187c3184 100644 --- a/config/agent/gkm-agent-amd.yaml +++ b/config/agent/gkm-agent-amd.yaml @@ -36,7 +36,7 @@ spec: operator: Exists containers: - name: gkm-agent - image: quay.io/gkm/agent-amd:latest + image: quay.io/gkm/gkm-agent-amd:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml index 8b7715104..66e3121b4 100644 --- a/config/agent/gkm-agent-nogpu.yaml +++ b/config/agent/gkm-agent-nogpu.yaml @@ -39,7 +39,7 @@ spec: operator: DoesNotExist containers: - name: gkm-agent - image: quay.io/gkm/agent-nogpu:latest + image: quay.io/gkm/gkm-agent-nogpu:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 diff --git a/config/agent/gkm-agent-nvidia.yaml b/config/agent/gkm-agent-nvidia.yaml index 6cad7bfa5..ceb1f63ee 100644 --- a/config/agent/gkm-agent-nvidia.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -33,7 +33,7 @@ spec: operator: Exists containers: - name: gkm-agent - image: quay.io/gkm/agent-nvidia:latest + image: quay.io/gkm/gkm-agent-nvidia:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index b5f7f6699..c24b0d22e 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -10,11 +10,20 @@ resources: images: - name: quay.io/gkm/agent-amd - newName: quay.io/gkm/agent-amd + newName: quay.io/gkm/gkm-agent-amd newTag: latest - name: quay.io/gkm/agent-nogpu - newName: quay.io/gkm/agent-nogpu + newName: quay.io/gkm/gkm-agent-nogpu newTag: latest - name: quay.io/gkm/agent-nvidia - newName: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/gkm-agent-nvidia + newTag: latest +- name: quay.io/gkm/gkm-agent-amd + newName: quay.io/gkm/gkm-agent-amd + newTag: latest +- name: quay.io/gkm/gkm-agent-nogpu + newName: 
quay.io/gkm/gkm-agent-nogpu + newTag: latest +- name: quay.io/gkm/gkm-agent-nvidia + newName: quay.io/gkm/gkm-agent-nvidia newTag: latest diff --git a/config/configMap/configMap.yaml b/config/configMap/configMap.yaml index 12d3b863c..e2aed50e6 100644 --- a/config/configMap/configMap.yaml +++ b/config/configMap/configMap.yaml @@ -8,7 +8,7 @@ data: gkm.operator.log.level: info gkm.agent.log.level: info ## Can be configured at runtime - gkm.agent.image: quay.io/gkm/agent:latest + gkm.agent.image: quay.io/gkm/gkm-agent:latest gkm.extract.image: quay.io/gkm/gkm-extract:latest gkm.nogpu: false ## Enable/disable Kyverno image signature verification (defaults to true/enabled) diff --git a/config/configMap/kustomization.yaml b/config/configMap/kustomization.yaml index b46ed2b73..77b78a349 100644 --- a/config/configMap/kustomization.yaml +++ b/config/configMap/kustomization.yaml @@ -9,7 +9,7 @@ configMapGenerator: - behavior: merge literals: - gkm.nogpu=true - - gkm.agent.image=quay.io/gkm/agent:latest + - gkm.agent.image=quay.io/gkm/gkm-agent:latest - gkm.extract.image=quay.io/gkm/gkm-extract:latest name: config namespace: gkm-system diff --git a/config/operator/kustomization.yaml b/config/operator/kustomization.yaml index b7f6673d4..edf1e14b8 100644 --- a/config/operator/kustomization.yaml +++ b/config/operator/kustomization.yaml @@ -4,8 +4,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: controller - newName: quay.io/gkm/operator + newName: quay.io/gkm/gkm-operator + newTag: latest +- name: quay.io/gkm/gkm-operator + newName: quay.io/gkm/gkm-operator newTag: latest - name: quay.io/gkm/operator - newName: quay.io/gkm/operator + newName: quay.io/gkm/gkm-operator newTag: latest diff --git a/config/operator/operator.yaml b/config/operator/operator.yaml index d694ee2e5..917eb4b7f 100644 --- a/config/operator/operator.yaml +++ b/config/operator/operator.yaml @@ -71,7 +71,7 @@ spec: args: - --leader-elect - --health-probe-bind-address=:8081 - 
image: quay.io/gkm/operator:latest + image: quay.io/gkm/gkm-operator:latest imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false From ccbc1ad4e8ddb4f1d84371133fb9c3d43df733d8 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 19:50:45 +0000 Subject: [PATCH 23/25] refactor: use base image for builder stage in GPU agent Containerfiles Removed duplicated builder stage from NVIDIA, AMD, and nogpu agent Containerfiles. Each now uses FROM quay.io/gkm/gkm-agent-base:latest as the builder stage, eliminating code duplication while keeping GPU-specific runtime stages intact. Co-Authored-By: Claude Sonnet 4.5 --- Containerfile.gkm-agent-amd | 71 ++++------------------------------ Containerfile.gkm-agent-nogpu | 35 ++--------------- Containerfile.gkm-agent-nvidia | 35 ++--------------- Makefile | 2 +- 4 files changed, 14 insertions(+), 129 deletions(-) diff --git a/Containerfile.gkm-agent-amd b/Containerfile.gkm-agent-amd index 14c81835a..5913458e1 100644 --- a/Containerfile.gkm-agent-amd +++ b/Containerfile.gkm-agent-amd @@ -1,69 +1,12 @@ # ============================================================================ -# Stage 1: Builder (Shared across all agent variants) -# See Containerfile.gkm-agent-base for the common base stages +# AMD ROCm Agent (extends nogpu agent) +# Inherits binary and common packages from nogpu, adds ROCm support # ============================================================================ -FROM public.ecr.aws/docker/library/golang:1.25 AS builder -WORKDIR /workspace +FROM quay.io/gkm/gkm-agent-nogpu:latest -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY 
internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent - -# ============================================================================ -# Stage 2: AMD ROCm-specific Runtime -# ============================================================================ - -# Start from Ubuntu base for AMD ROCm support -FROM public.ecr.aws/docker/library/ubuntu:24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install common runtime libraries (shared with other agent variants) -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean +# Switch to root to install ROCm packages +USER root # AMD ROCm version configuration ARG ROCM_VERSION=6.3.1 @@ -78,7 +21,7 @@ RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amd ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi -# Run as non-root user +# Switch back to non-root user USER 65532:65532 -ENTRYPOINT ["/agent"] +# Binary and entrypoint are inherited from nogpu image diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu index a33172481..5942693be 100644 --- a/Containerfile.gkm-agent-nogpu +++ b/Containerfile.gkm-agent-nogpu @@ -1,37 +1,8 @@ # ============================================================================ -# Stage 1: Builder (Shared across all agent variants) -# See Containerfile.gkm-agent-base for the common base stages +# Stage 1: Builder (from base image) +# See Containerfile.gkm-agent-base for the common builder stage # ============================================================================ 
-FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent +FROM quay.io/gkm/gkm-agent-base:latest AS builder # ============================================================================ # Stage 2: No-GPU Runtime (minimal footprint) diff --git a/Containerfile.gkm-agent-nvidia b/Containerfile.gkm-agent-nvidia index 1d06fb06d..e9c08638e 100644 --- a/Containerfile.gkm-agent-nvidia +++ b/Containerfile.gkm-agent-nvidia @@ -1,37 +1,8 @@ # ============================================================================ -# Stage 1: Builder (Shared across all agent variants) -# See Containerfile.gkm-agent-base for the common base stages +# Stage 1: Builder (from base image) +# See Containerfile.gkm-agent-base for the common builder stage # ============================================================================ -FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make 
build-gkm-agent +FROM quay.io/gkm/gkm-agent-base:latest AS builder # ============================================================================ # Stage 2: NVIDIA-specific Runtime diff --git a/Makefile b/Makefile index db73b896b..02cbe5f6c 100644 --- a/Makefile +++ b/Makefile @@ -228,7 +228,7 @@ build-image-gkm-extract: .PHONY: build-image-agent-base build-image-agent-base: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target base-runtime -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target builder -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . .PHONY: build-image-agent-nvidia build-image-agent-nvidia: From f744250ddc4788e0bccdd16eab11718418699ea6 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 20:52:44 +0000 Subject: [PATCH 24/25] refactor: consolidate agent Containerfiles into single multi-target file Replaced separate Containerfiles for each agent variant with a single Containerfile.gkm-agents containing multi-stage targets (nogpu, amd, nvidia). This eliminates cross-file dependencies and enables parallel CI builds. 
Changes: - Created Containerfile.gkm-agents with shared builder stage - nogpu target: complete agent with common runtime deps - amd target: extends nogpu, adds ROCm support only - nvidia target: CUDA runtime with agent binary - Updated Makefile to build using --target flags - Updated GitHub workflow to use single Containerfile - Removed obsolete individual Containerfiles - Updated documentation references Benefits: - No build dependencies between separate files - Builder stage always available in same file - AMD reuses all nogpu layers (more efficient) - CI workflows can build in parallel - Cleaner, more maintainable structure Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- .github/workflows/image-build.yml | 28 ++---- Containerfile.gkm-agent-amd | 27 ------ Containerfile.gkm-agent-base | 75 ---------------- Containerfile.gkm-agent-nogpu | 44 ---------- Containerfile.gkm-agent-nvidia | 46 ---------- Containerfile.gkm-agents | 139 ++++++++++++++++++++++++++++++ Makefile | 22 ++--- config/agent/README.md | 16 ++-- 8 files changed, 163 insertions(+), 234 deletions(-) delete mode 100644 Containerfile.gkm-agent-amd delete mode 100644 Containerfile.gkm-agent-base delete mode 100644 Containerfile.gkm-agent-nogpu delete mode 100644 Containerfile.gkm-agent-nvidia create mode 100644 Containerfile.gkm-agents diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 4a4cb15c4..61f61218f 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -45,10 +45,10 @@ jobs: - registry: quay.io repository: gkm - image: gkm-agent-base - dockerfile: ./Containerfile.gkm-agent-base + image: gkm-agent-nogpu + dockerfile: ./Containerfile.gkm-agents context: . 
- target: base-runtime + target: nogpu tags: | type=ref,event=branch type=ref,event=tag @@ -56,12 +56,12 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-agent-nvidia - dockerfile: ./Containerfile.gkm-agent-nvidia + dockerfile: ./Containerfile.gkm-agents context: . + target: nvidia tags: | type=ref,event=branch type=ref,event=tag @@ -69,25 +69,12 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-agent-amd - dockerfile: ./Containerfile.gkm-agent-amd - context: . - tags: | - type=ref,event=branch - type=ref,event=tag - type=ref,event=pr - type=sha,format=long - # set latest tag for default branch - type=raw,value=latest,enable={{is_default_branch}} - - - registry: quay.io - repository: gkm - image: gkm-agent-nogpu - dockerfile: ./Containerfile.gkm-agent-nogpu + dockerfile: ./Containerfile.gkm-agents context: . 
+ target: amd tags: | type=ref,event=branch type=ref,event=tag @@ -95,7 +82,6 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-extract diff --git a/Containerfile.gkm-agent-amd b/Containerfile.gkm-agent-amd deleted file mode 100644 index 5913458e1..000000000 --- a/Containerfile.gkm-agent-amd +++ /dev/null @@ -1,27 +0,0 @@ -# ============================================================================ -# AMD ROCm Agent (extends nogpu agent) -# Inherits binary and common packages from nogpu, adds ROCm support -# ============================================================================ - -FROM quay.io/gkm/gkm-agent-nogpu:latest - -# Switch to root to install ROCm packages -USER root - -# AMD ROCm version configuration -ARG ROCM_VERSION=6.3.1 -ARG AMDGPU_VERSION=6.3.60301 -ARG OPT_ROCM_VERSION=6.3.1 - -# Install AMD ROCm packages (GPU-specific dependencies) -RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ - apt install -y ./*.deb && \ - apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi - -# Switch back to non-root user -USER 65532:65532 - -# Binary and entrypoint are inherited from nogpu image diff --git a/Containerfile.gkm-agent-base b/Containerfile.gkm-agent-base deleted file mode 100644 index 9bd406c84..000000000 --- a/Containerfile.gkm-agent-base +++ /dev/null @@ -1,75 +0,0 @@ -# Common base Containerfile for GKM agents -# This file contains the shared builder and base runtime stages -# GPU-specific Containerfiles currently duplicate these stages with references -# to this file for maintenance purposes. 
-# -# Future Enhancement: This base image could be built and pushed to Quay to -# improve build efficiency: -# podman build -f Containerfile.gkm-agent-base --target base-runtime \ -# -t quay.io/gkm/agent-runtime-base:latest . -# podman push quay.io/gkm/agent-runtime-base:latest -# -# Then GPU-specific Containerfiles could reference it: -# FROM quay.io/gkm/agent-runtime-base:latest - -# ============================================================================ -# Stage 1: Builder (Common to all agent variants) -# ============================================================================ -FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent - -# ============================================================================ -# Stage 2: Base Runtime (Common runtime dependencies) -# ============================================================================ -FROM public.ecr.aws/docker/library/ubuntu:24.04 AS base-runtime - -# Install required runtime libraries for CGO and agent operation -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* diff --git a/Containerfile.gkm-agent-nogpu b/Containerfile.gkm-agent-nogpu 
deleted file mode 100644 index 5942693be..000000000 --- a/Containerfile.gkm-agent-nogpu +++ /dev/null @@ -1,44 +0,0 @@ -# ============================================================================ -# Stage 1: Builder (from base image) -# See Containerfile.gkm-agent-base for the common builder stage -# ============================================================================ -FROM quay.io/gkm/gkm-agent-base:latest AS builder - -# ============================================================================ -# Stage 2: No-GPU Runtime (minimal footprint) -# ============================================================================ - -# Use minimal Ubuntu base (no GPU libraries needed) -FROM public.ecr.aws/docker/library/ubuntu:24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install common runtime libraries (shared with other agent variants) -# No GPU-specific dependencies required for this variant -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Run as non-root user -USER 65532:65532 - -ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agent-nvidia b/Containerfile.gkm-agent-nvidia deleted file mode 100644 index e9c08638e..000000000 --- a/Containerfile.gkm-agent-nvidia +++ /dev/null @@ -1,46 +0,0 @@ -# ============================================================================ -# Stage 1: Builder (from base image) -# See Containerfile.gkm-agent-base for the common builder stage -# ============================================================================ -FROM quay.io/gkm/gkm-agent-base:latest AS builder - -# ============================================================================ -# Stage 2: 
NVIDIA-specific Runtime -# ============================================================================ - -# Use NVIDIA CUDA runtime base image (includes NVML libraries) -FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install common runtime libraries (shared with other agent variants) -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* - -# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) -# No additional GPU-specific packages needed - -# Run as non-root user -USER 65532:65532 - -ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agents b/Containerfile.gkm-agents new file mode 100644 index 000000000..326d0c078 --- /dev/null +++ b/Containerfile.gkm-agents @@ -0,0 +1,139 @@ +# ============================================================================ +# Multi-target Containerfile for GKM Agents +# Build specific targets with: podman build --target +# ============================================================================ + +# ============================================================================ +# Stage 1: Builder (shared by all agent variants) +# ============================================================================ +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go 
agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# ============================================================================ +# Target: nogpu (complete no-GPU agent) +# ============================================================================ +FROM public.ecr.aws/docker/library/ubuntu:24.04 AS nogpu + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install common runtime libraries (shared with other agent variants) +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] + +# ============================================================================ +# Target: amd (extends nogpu, adds ROCm support) +# ============================================================================ +FROM nogpu AS amd + +# Switch to root to install ROCm packages +USER root + +# AMD ROCm version configuration +ARG ROCM_VERSION=6.3.1 +ARG AMDGPU_VERSION=6.3.60301 +ARG OPT_ROCM_VERSION=6.3.1 + +# Install AMD ROCm packages (GPU-specific dependencies) +RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ + apt install -y ./*.deb && \ + apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ + apt-get clean && rm -rf /var/lib/apt/lists/* && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi + +# Switch back to non-root user +USER 65532:65532 + 
+# Binary and entrypoint are inherited from nogpu + +# ============================================================================ +# Target: nvidia (CUDA runtime with NVML support) +# ============================================================================ +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 AS nvidia + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install common runtime libraries (shared with other agent variants) +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional GPU-specific packages needed + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Makefile b/Makefile index 02cbe5f6c..ac6ebde1b 100644 --- a/Makefile +++ b/Makefile @@ -80,7 +80,6 @@ REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) AGENT_IMG ?=$(REPO)/gkm-agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) -AGENT_BASE_IMG ?= $(REPO)/gkm-agent-base:$(IMAGE_TAG) AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) @@ -226,27 +225,23 @@ build-image-operator: build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . -.PHONY: build-image-agent-base -build-image-agent-base: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target builder -f Containerfile.gkm-agent-base -t ${AGENT_BASE_IMG} . 
- .PHONY: build-image-agent-nvidia build-image-agent-nvidia: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-nvidia -t ${AGENT_NVIDIA_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target nvidia -f Containerfile.gkm-agents -t ${AGENT_NVIDIA_IMG} . .PHONY: build-image-agent-amd build-image-agent-amd: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load -f Containerfile.gkm-agent-amd -t ${AGENT_AMD_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target amd -f Containerfile.gkm-agents -t ${AGENT_AMD_IMG} . .PHONY: build-image-agent-nogpu build-image-agent-nogpu: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-agent-nogpu -t ${AGENT_NOGPU_IMG} . + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load --target nogpu -f Containerfile.gkm-agents -t ${AGENT_NOGPU_IMG} . .PHONY: build-image-agents ifeq ($(NO_GPU_BUILD),true) -build-image-agents: build-image-agent-base build-image-agent-nogpu ## Build base and no-GPU agent only (NO_GPU_BUILD=true) +build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) else -build-image-agents: build-image-agent-base build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (base, NVIDIA, AMD, and no-GPU) +build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) endif # If you wish to build the operator image targeting other platforms you can use the --platform flag. @@ -259,7 +254,6 @@ build-images: build-image-operator build-image-agents build-image-gkm-extract ## push-images: ## Push all container images. 
$(CONTAINER_TOOL) push ${OPERATOR_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} - $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} ifeq ($(NO_GPU_BUILD),true) $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else @@ -270,12 +264,10 @@ endif .PHONY: push-images-agents ifeq ($(NO_GPU_BUILD),true) -push-images-agents: ## Push base and no-GPU agent only (NO_GPU_BUILD=true) - $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} +push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} else push-images-agents: ## Push all agent images - $(CONTAINER_TOOL) push ${AGENT_BASE_IMG} $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} @@ -618,8 +610,6 @@ setup-kind: kind-gpu-sim-script kind-load-images: kind-gpu-sim-script get-example-images @echo "Loading operator image ${OPERATOR_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${OPERATOR_IMG} --cluster-name=$(KIND_CLUSTER_NAME) - @echo "Loading agent base image ${AGENT_BASE_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" - cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_BASE_IMG} --cluster-name=$(KIND_CLUSTER_NAME) ifeq ($(NO_GPU_BUILD),true) @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) diff --git a/config/agent/README.md b/config/agent/README.md index b15567441..71161a7e3 100644 --- a/config/agent/README.md +++ b/config/agent/README.md @@ -103,16 +103,23 @@ kubectl get pods -n gkm-system -l gpu-vendor=amd -o wide ## Containerfiles -### NVIDIA Agent ([Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia)) +All agent variants are built from [Containerfile.gkm-agents](../../Containerfile.gkm-agents) using multi-stage targets: + +### NVIDIA Agent (target: `nvidia`) - Base image: 
`nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04` - Includes: NVIDIA CUDA runtime with NVML libraries - Requires: NVIDIA driver on host -### AMD Agent ([Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd)) -- Base image: `ubuntu:24.04` +### AMD Agent (target: `amd`) +- Base image: extends `nogpu` target - Includes: ROCm libraries (`amd-smi-lib`, `rocm-smi-lib`) - Requires: AMD GPU driver on host +### No-GPU Agent (target: `nogpu`) +- Base image: `ubuntu:24.04` +- Includes: Common runtime dependencies only +- For non-GPU workloads + ## Node Selectors The DaemonSets use PCI vendor ID-based node selectors: @@ -188,5 +195,4 @@ To migrate from the legacy generic agent: - [gkm-agent-nvidia.yaml](gkm-agent-nvidia.yaml) - NVIDIA DaemonSet - [gkm-agent-amd.yaml](gkm-agent-amd.yaml) - AMD DaemonSet - [kustomization.yaml](kustomization.yaml) - Kustomize configuration -- [../../Containerfile.gkm-agent-nvidia](../../Containerfile.gkm-agent-nvidia) - NVIDIA Containerfile -- [../../Containerfile.gkm-agent-amd](../../Containerfile.gkm-agent-amd) - AMD Containerfile +- [../../Containerfile.gkm-agents](../../Containerfile.gkm-agents) - Multi-target agent Containerfile From 1756195d32ec34588fc0ecd60af94d2e9a5a61d9 Mon Sep 17 00:00:00 2001 From: Maryam Tahhan Date: Mon, 16 Mar 2026 21:05:23 +0000 Subject: [PATCH 25/25] fix: update legacy agent image reference to nogpu variant Changed gkm.agent.image from non-existent gkm-agent:latest to gkm-agent-nogpu:latest. This value is legacy/unused (operator only logs it), but needs to reference a real image for backwards compatibility. Each agent daemonset uses its GPU-specific image directly, so this configmap value is not actually used at runtime. 
Co-Authored-By: Claude Sonnet 4.5 Signed-off-by: Maryam Tahhan --- Makefile | 3 ++- config/configMap/configMap.yaml | 3 ++- config/configMap/kustomization.yaml | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index ac6ebde1b..8b9f4a722 100644 --- a/Makefile +++ b/Makefile @@ -78,11 +78,12 @@ QUAY_USER ?= gkm IMAGE_TAG ?= latest REPO ?= quay.io/$(QUAY_USER) OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) -AGENT_IMG ?=$(REPO)/gkm-agent:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) +# Legacy: AGENT_IMG points to nogpu for backwards compatibility (unused by operator) +AGENT_IMG ?= $(AGENT_NOGPU_IMG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by envtest binary. ENVTEST_K8S_VERSION = 1.31.0 diff --git a/config/configMap/configMap.yaml b/config/configMap/configMap.yaml index e2aed50e6..cecb43866 100644 --- a/config/configMap/configMap.yaml +++ b/config/configMap/configMap.yaml @@ -8,7 +8,8 @@ data: gkm.operator.log.level: info gkm.agent.log.level: info ## Can be configured at runtime - gkm.agent.image: quay.io/gkm/gkm-agent:latest + ## Note: gkm.agent.image is legacy/unused - agents use GPU-specific images + gkm.agent.image: quay.io/gkm/gkm-agent-nogpu:latest gkm.extract.image: quay.io/gkm/gkm-extract:latest gkm.nogpu: false ## Enable/disable Kyverno image signature verification (defaults to true/enabled) diff --git a/config/configMap/kustomization.yaml b/config/configMap/kustomization.yaml index 77b78a349..a33898c8b 100644 --- a/config/configMap/kustomization.yaml +++ b/config/configMap/kustomization.yaml @@ -9,7 +9,8 @@ configMapGenerator: - behavior: merge literals: - gkm.nogpu=true - - gkm.agent.image=quay.io/gkm/gkm-agent:latest + # Note: gkm.agent.image is legacy/unused - agents use GPU-specific images + - 
gkm.agent.image=quay.io/gkm/gkm-agent-nogpu:latest - gkm.extract.image=quay.io/gkm/gkm-extract:latest name: config namespace: gkm-system