Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
b81e519
feat: add GPU-specific agents for NVIDIA and AMD with NFD-based deplo…
maryamtahhan Mar 11, 2026
88ae647
feat: enhance deployment automation with NFD and Kyverno integration
maryamtahhan Mar 12, 2026
d667e8c
fix: conditionally build agents based on NO_GPU_BUILD flag
maryamtahhan Mar 12, 2026
7dfd7e2
feat: add individual agent image variables for flexible deployment
maryamtahhan Mar 12, 2026
019d4fb
fix: GPU agent scheduling with NFD PCI class code labels
maryamtahhan Mar 12, 2026
cb0df30
fix: exclude control-plane nodes from nogpu agent deployment
maryamtahhan Mar 12, 2026
00fbcbe
fix: mount GPU libraries to enable device access without GPU resource…
maryamtahhan Mar 12, 2026
4bd95a3
feat: add automated dependency installation for RHEL 10
maryamtahhan Mar 12, 2026
fe6471c
gkm: add nvidia example
maryamtahhan Mar 12, 2026
3dca1b9
fix: address PR #107 review comments and failing workflows
maryamtahhan Mar 16, 2026
50c5601
fix: resolve yamllint errors in NVIDIA example YAMLs
maryamtahhan Mar 16, 2026
165b7b3
refactor: restructure RWO examples into organized subdirectories
maryamtahhan Mar 16, 2026
a342885
fix: load actual agent images instead of non-existent AGENT_IMG in ki…
maryamtahhan Mar 16, 2026
bf397b0
kind: fix kyverno deployment
maryamtahhan Mar 16, 2026
502ecb8
makefile: cleanup kyverno targets
maryamtahhan Mar 16, 2026
9471b38
fix: resolve Kind deployment failures on GPU-tainted nodes
maryamtahhan Mar 16, 2026
1eb4846
fix: skip NFD deployment for Kind clusters and use device plugin labels
maryamtahhan Mar 16, 2026
07a00a0
fix: separate SKIP_NFD and NO_GPU flags, simulate NFD labels in Kind
maryamtahhan Mar 16, 2026
6ad2c16
fix: remove node affinity from nogpu agent for Kind clusters
maryamtahhan Mar 16, 2026
0ca6f4e
fix: standardize namespace and cache naming in ROCM and CUDA examples
maryamtahhan Mar 16, 2026
4c4d525
fix: update ROCM daemonset names to match namespace pattern
maryamtahhan Mar 16, 2026
c67564c
images: add gkm prefix to image names
maryamtahhan Mar 16, 2026
ccbc1ad
refactor: use base image for builder stage in GPU agent Containerfiles
maryamtahhan Mar 16, 2026
f744250
refactor: consolidate agent Containerfiles into single multi-target file
maryamtahhan Mar 16, 2026
1756195
fix: update legacy agent image reference to nogpu variant
maryamtahhan Mar 16, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 31 additions & 4 deletions .github/workflows/image-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
image:
- registry: quay.io
repository: gkm
image: operator
image: gkm-operator
dockerfile: ./Containerfile.gkm-operator
context: .
tags: |
Expand All @@ -45,17 +45,43 @@ jobs:

- registry: quay.io
repository: gkm
image: agent
dockerfile: ./Containerfile.gkm-agent
image: gkm-agent-nogpu
dockerfile: ./Containerfile.gkm-agents
context: .
target: nogpu
tags: |
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
type=sha,format=long
# set latest tag for default branch
type=raw,value=latest,enable={{is_default_branch}}
- registry: quay.io
repository: gkm
image: gkm-agent-nvidia
dockerfile: ./Containerfile.gkm-agents
context: .
target: nvidia
tags: |
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
type=sha,format=long
# set latest tag for default branch
type=raw,value=latest,enable={{is_default_branch}}
- registry: quay.io
repository: gkm
image: gkm-agent-amd
dockerfile: ./Containerfile.gkm-agents
context: .
target: amd
tags: |
type=ref,event=branch
type=ref,event=tag
type=ref,event=pr
type=sha,format=long
# set latest tag for default branch
type=raw,value=latest,enable={{is_default_branch}}

- registry: quay.io
repository: gkm
image: gkm-extract
Expand Down Expand Up @@ -130,6 +156,7 @@ jobs:
file: ${{ matrix.image.dockerfile }}
build-args: BUILDPLATFORM=linux/amd64
context: ${{ matrix.image.context }}
target: ${{ matrix.image.target || '' }}

- name: Sign the images with GitHub OIDC Token
if: ${{ fromJSON(steps.set-push.outputs.push_flag) }}
Expand Down
84 changes: 0 additions & 84 deletions Containerfile.gkm-agent

This file was deleted.

139 changes: 139 additions & 0 deletions Containerfile.gkm-agents
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# ============================================================================
# Multi-target Containerfile for GKM Agents
# Build specific targets with: podman build --target <nogpu|amd|nvidia>
# ============================================================================

# ============================================================================
# Stage 1: Builder (shared by all agent variants)
# ============================================================================
FROM public.ecr.aws/docker/library/golang:1.25 AS builder

# All build inputs are staged under /workspace.
WORKDIR /workspace

# Development headers and toolchain packages installed for the agent build
# (gpgme, btrfs and seccomp dev packages plus pkg-config and a compiler).
RUN apt-get update \
    && apt-get install -y \
        btrfs-progs \
        build-essential \
        libbtrfs-dev \
        libgpgme-dev \
        libgpgme11-dev \
        libseccomp-dev \
        pkg-config \
    && apt-get clean

# Go module manifests go in first, ahead of the source tree.
COPY go.mod go.sum ./

# Agent entry point, shared packages, vendored dependencies and the Makefile
# that drives the build.
COPY agent/main.go agent/main.go
COPY api/ api/
COPY pkg/ pkg/
COPY internal/controller/ internal/controller/
COPY vendor/ vendor/
COPY Makefile Makefile

# Compile the agent through the Makefile target; the runtime stages copy the
# result from /workspace/bin/gkm-agent.
RUN make build-gkm-agent

# ============================================================================
# Target: nogpu (complete no-GPU agent)
# ============================================================================
FROM public.ecr.aws/docker/library/ubuntu:24.04 AS nogpu

# Install common runtime libraries (shared with other agent variants).
# This layer is placed before the binary COPY so that rebuilding the agent
# does not invalidate the cached package layer.
# DEBIAN_FRONTEND is set inline so package configuration never prompts during
# the build and the setting does not persist into the runtime environment.
# --no-install-recommends keeps the image to the explicitly listed packages
# and their hard dependencies.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        libgpgme11 \
        libbtrfs0 \
        libffi8 \
        libc6 \
        wget \
        pciutils \
        hwdata \
        gnupg2 \
        python3-setuptools \
        python3-wheel \
        curl \
        dialog \
        rsync \
        lsb-release \
        software-properties-common \
        libseccomp2 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# Copy the binary from the builder (last, because it changes most often)
COPY --from=builder /workspace/bin/gkm-agent /agent

# Run as non-root user
USER 65532:65532

ENTRYPOINT ["/agent"]

# ============================================================================
# Target: amd (extends nogpu, adds ROCm support)
# ============================================================================
FROM nogpu AS amd

# Switch to root to install ROCm packages
USER root

# AMD ROCm version configuration
#   ROCM_VERSION      - release directory on repo.radeon.com
#   AMDGPU_VERSION    - version string embedded in the amdgpu-install .deb name
#   OPT_ROCM_VERSION  - suffix of the /opt/rocm-<version> install directory
ARG ROCM_VERSION=6.3.1
ARG AMDGPU_VERSION=6.3.60301
ARG OPT_ROCM_VERSION=6.3.1

# Install AMD ROCm packages (GPU-specific dependencies).
# - The installer .deb is downloaded to an explicit path and removed in the
#   same layer, so it is not baked into the image and no glob is needed.
# - The parent stage deletes /var/lib/apt/lists, so the package index is
#   refreshed before installing the local .deb (so any dependencies it has can
#   be resolved), and again afterwards to pick up the repositories that the
#   installer package is expected to register.
# - apt-get is used instead of apt, whose CLI is not meant for scripts.
# - DEBIAN_FRONTEND is set inline on every install so nothing prompts.
RUN wget -O /tmp/amdgpu-install.deb \
        "https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb" && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y /tmp/amdgpu-install.deb && \
    rm -f /tmp/amdgpu-install.deb && \
    apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y amd-smi-lib rocm-smi-lib && \
    apt-get clean && rm -rf /var/lib/apt/lists/* && \
    ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \
    ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi

# Switch back to non-root user
USER 65532:65532

# Binary and entrypoint are inherited from nogpu

# ============================================================================
# Target: nvidia (CUDA runtime with NVML support)
# ============================================================================
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 AS nvidia

# Install common runtime libraries (shared with other agent variants).
# This layer is placed before the binary COPY so that rebuilding the agent
# does not invalidate the cached package layer.
# DEBIAN_FRONTEND is set inline so package configuration never prompts during
# the build and the setting does not persist into the runtime environment.
# --no-install-recommends keeps the image to the explicitly listed packages
# and their hard dependencies.
RUN apt-get update && \
    DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
        ca-certificates \
        libgpgme11 \
        libbtrfs0 \
        libffi8 \
        libc6 \
        wget \
        pciutils \
        hwdata \
        gnupg2 \
        python3-setuptools \
        python3-wheel \
        curl \
        dialog \
        rsync \
        lsb-release \
        software-properties-common \
        libseccomp2 && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# No additional GPU-specific packages are installed in this stage.
# NOTE(review): NVML (libnvidia-ml.so) is assumed to be available at runtime,
# either from this base image or injected from the host driver by the NVIDIA
# container runtime - confirm which before relying on it.

# Copy the binary from the builder (last, because it changes most often)
COPY --from=builder /workspace/bin/gkm-agent /agent

# Run as non-root user
USER 65532:65532

ENTRYPOINT ["/agent"]
Loading
Loading