diff --git a/.github/workflows/image-build.yml b/.github/workflows/image-build.yml index 4530d31c3..61f61218f 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows/image-build.yml @@ -32,7 +32,7 @@ jobs: image: - registry: quay.io repository: gkm - image: operator + image: gkm-operator dockerfile: ./Containerfile.gkm-operator context: . tags: | @@ -45,9 +45,36 @@ jobs: - registry: quay.io repository: gkm - image: agent - dockerfile: ./Containerfile.gkm-agent + image: gkm-agent-nogpu + dockerfile: ./Containerfile.gkm-agents context: . + target: nogpu + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + - registry: quay.io + repository: gkm + image: gkm-agent-nvidia + dockerfile: ./Containerfile.gkm-agents + context: . + target: nvidia + tags: | + type=ref,event=branch + type=ref,event=tag + type=ref,event=pr + type=sha,format=long + # set latest tag for default branch + type=raw,value=latest,enable={{is_default_branch}} + - registry: quay.io + repository: gkm + image: gkm-agent-amd + dockerfile: ./Containerfile.gkm-agents + context: . 
+ target: amd tags: | type=ref,event=branch type=ref,event=tag @@ -55,7 +82,6 @@ jobs: type=sha,format=long # set latest tag for default branch type=raw,value=latest,enable={{is_default_branch}} - - registry: quay.io repository: gkm image: gkm-extract @@ -130,6 +156,7 @@ jobs: file: ${{ matrix.image.dockerfile }} build-args: BUILDPLATFORM=linux/amd64 context: ${{ matrix.image.context }} + target: ${{ matrix.image.target || '' }} - name: Sign the images with GitHub OIDC Token if: ${{ fromJSON(steps.set-push.outputs.push_flag) }} diff --git a/Containerfile.gkm-agent b/Containerfile.gkm-agent deleted file mode 100644 index 2214838e6..000000000 --- a/Containerfile.gkm-agent +++ /dev/null @@ -1,84 +0,0 @@ -# Build the agent binary -FROM public.ecr.aws/docker/library/golang:1.25 AS builder - -WORKDIR /workspace - -# Install required system packages -RUN apt-get update && \ - apt-get install -y \ - libgpgme-dev \ - btrfs-progs \ - libbtrfs-dev \ - libgpgme11-dev \ - libseccomp-dev \ - pkg-config \ - build-essential && \ - apt-get clean - -# Copy the Go Modules manifests -COPY go.mod go.mod -COPY go.sum go.sum - -# Copy the go source -COPY agent/main.go agent/main.go -COPY api/ api/ -COPY pkg/ pkg/ -COPY internal/controller/ internal/controller/ -COPY vendor/ vendor/ -COPY Makefile Makefile - -# Build the agent binary -RUN make build-gkm-agent - -# Use a minimal Ubuntu base image that supports CGO binaries -FROM public.ecr.aws/docker/library/ubuntu:24.04 - -# Copy the binary from the builder -COPY --from=builder /workspace/bin/gkm-agent /agent - -# Install required runtime libraries for CGO -RUN apt-get update && \ - apt-get install -y \ - ca-certificates \ - libgpgme11 \ - libbtrfs0 \ - libffi8 \ - libc6 \ - wget \ - pciutils \ - hwdata \ - gnupg2 \ - python3-setuptools \ - python3-wheel \ - curl \ - dialog \ - rsync \ - lsb-release \ - software-properties-common \ - libseccomp2 && \ - apt-get clean - -ARG NO_GPU=false -ARG ROCM_VERSION=7.0.1 -ARG 
AMDGPU_VERSION=7.0.1.70001 -ARG OPT_ROCM_VERSION=7.0.1 - -# Conditionally install ROCm packages based on NO_GPU flag -RUN if [ "$NO_GPU" = "false" ]; then \ - wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ - apt install -y ./*.deb && \ - apt update && DEBIAN_FRONTEND=noninteractive apt install -y amd-smi-lib rocm-smi-lib && \ - apt-get clean && rm -rf /var/lib/apt/lists/* && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ - ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi; \ - else \ - echo "NO_GPU=true, skipping ROCm installation"; \ - fi - -# Set NO_GPU environment variable -ENV NO_GPU=${NO_GPU} - -# Run as non-root user -USER 65532:65532 - -ENTRYPOINT ["/agent"] diff --git a/Containerfile.gkm-agents b/Containerfile.gkm-agents new file mode 100644 index 000000000..326d0c078 --- /dev/null +++ b/Containerfile.gkm-agents @@ -0,0 +1,139 @@ +# ============================================================================ +# Multi-target Containerfile for GKM Agents +# Build specific targets with: podman build --target +# ============================================================================ + +# ============================================================================ +# Stage 1: Builder (shared by all agent variants) +# ============================================================================ +FROM public.ecr.aws/docker/library/golang:1.25 AS builder + +WORKDIR /workspace + +# Install required system packages +RUN apt-get update && \ + apt-get install -y \ + libgpgme-dev \ + btrfs-progs \ + libbtrfs-dev \ + libgpgme11-dev \ + libseccomp-dev \ + pkg-config \ + build-essential && \ + apt-get clean + +# Copy the Go Modules manifests +COPY go.mod go.mod +COPY go.sum go.sum + +# Copy the go source +COPY agent/main.go agent/main.go +COPY api/ api/ +COPY pkg/ pkg/ +COPY internal/controller/ internal/controller/ +COPY vendor/ vendor/ +COPY Makefile 
Makefile + +# Build the agent binary +RUN make build-gkm-agent + +# ============================================================================ +# Target: nogpu (complete no-GPU agent) +# ============================================================================ +FROM public.ecr.aws/docker/library/ubuntu:24.04 AS nogpu + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install common runtime libraries (shared with other agent variants) +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] + +# ============================================================================ +# Target: amd (extends nogpu, adds ROCm support) +# ============================================================================ +FROM nogpu AS amd + +# Switch to root to install ROCm packages +USER root + +# AMD ROCm version configuration +ARG ROCM_VERSION=6.3.1 +ARG AMDGPU_VERSION=6.3.60301 +ARG OPT_ROCM_VERSION=6.3.1 + +# Install AMD ROCm packages (GPU-specific dependencies) +RUN wget https://repo.radeon.com/amdgpu-install/${ROCM_VERSION}/ubuntu/noble/amdgpu-install_${AMDGPU_VERSION}-1_all.deb && \ + apt-get install -y ./*.deb && \ + apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y amd-smi-lib rocm-smi-lib && \ + apt-get clean && rm -rf /var/lib/apt/lists/* ./*.deb && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/amd-smi /usr/bin/amd-smi && \ + ln -s /opt/rocm-${OPT_ROCM_VERSION}/bin/rocm-smi /usr/bin/rocm-smi + +# Switch back to non-root user +USER 65532:65532 + +# Binary and entrypoint are inherited from nogpu + +# ============================================================================ 
+# Target: nvidia (CUDA runtime with NVML support) +# ============================================================================ +FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04 AS nvidia + +# Copy the binary from the builder +COPY --from=builder /workspace/bin/gkm-agent /agent + +# Install common runtime libraries (shared with other agent variants) +RUN apt-get update && \ + apt-get install -y \ + ca-certificates \ + libgpgme11 \ + libbtrfs0 \ + libffi8 \ + libc6 \ + wget \ + pciutils \ + hwdata \ + gnupg2 \ + python3-setuptools \ + python3-wheel \ + curl \ + dialog \ + rsync \ + lsb-release \ + software-properties-common \ + libseccomp2 && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* + +# Note: NVIDIA CUDA base image already includes libnvidia-ml.so (NVML) +# No additional GPU-specific packages needed + +# Run as non-root user +USER 65532:65532 + +ENTRYPOINT ["/agent"] diff --git a/Makefile b/Makefile index 5a4e0bf21..8b9f4a722 100644 --- a/Makefile +++ b/Makefile @@ -19,6 +19,9 @@ CONTAINER_FLAGS ?= --build-arg TARGETARCH=$(ARCH) # NO_GPU flag for building without GPU support NO_GPU_BUILD ?= false +# SKIP_NFD flag for skipping NFD deployment (e.g., Kind clusters) +SKIP_NFD ?= false + # KYVERNO_ENABLED flag for enabling/disabling Kyverno verification (runtime only) KYVERNO_ENABLED ?= true @@ -74,9 +77,13 @@ OPERATOR_SDK_VERSION ?= v1.39.2 QUAY_USER ?= gkm IMAGE_TAG ?= latest REPO ?= quay.io/$(QUAY_USER) -OPERATOR_IMG ?= $(REPO)/operator:$(IMAGE_TAG) -AGENT_IMG ?=$(REPO)/agent:$(IMAGE_TAG) +OPERATOR_IMG ?= $(REPO)/gkm-operator:$(IMAGE_TAG) EXTRACT_IMG ?=$(REPO)/gkm-extract:$(IMAGE_TAG) +AGENT_NVIDIA_IMG ?= $(REPO)/gkm-agent-nvidia:$(IMAGE_TAG) +AGENT_AMD_IMG ?= $(REPO)/gkm-agent-amd:$(IMAGE_TAG) +AGENT_NOGPU_IMG ?= $(REPO)/gkm-agent-nogpu:$(IMAGE_TAG) +# Legacy: AGENT_IMG points to nogpu for backwards compatibility (unused by operator) +AGENT_IMG ?= $(AGENT_NOGPU_IMG) # ENVTEST_K8S_VERSION refers to the version of kubebuilder assets to be downloaded by 
envtest binary. ENVTEST_K8S_VERSION = 1.31.0 @@ -145,6 +152,12 @@ vendors: ## Refresh vendors directory. @echo "### Checking vendors" go mod tidy && go mod vendor +.PHONY: install-deps +install-deps: ## Install all dependencies (go, podman, kubectl, and build dependencies). + @echo "### Installing GKM dependencies" + @chmod +x hack/install_deps.sh + @./hack/install_deps.sh + .PHONY: explain explain: ## Run "kubectl explain" on all CRDs. CRD_1="ClusterGKMCache" CRD_2="GKMCache" CRD_3="ClusterGKMCacheNode" CRD_4="GKMCacheNode" OUTPUT_DIR="../docs/crds" ./hack/crd_explain_txt.sh @@ -209,25 +222,57 @@ run: manifests generate fmt vet ## Run a controller from your host. build-image-operator: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-operator -t ${OPERATOR_IMG} . -.PHONY: build-image-agent -build-image-agent: - $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --build-arg NO_GPU=$(NO_GPU_BUILD) --progress=plain --load -f Containerfile.gkm-agent -t ${AGENT_IMG} . - .PHONY: build-image-gkm-extract build-image-gkm-extract: $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load -f Containerfile.gkm-extract -t ${EXTRACT_IMG} . +.PHONY: build-image-agent-nvidia +build-image-agent-nvidia: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target nvidia -f Containerfile.gkm-agents -t ${AGENT_NVIDIA_IMG} . + +.PHONY: build-image-agent-amd +build-image-agent-amd: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --platform linux/amd64 --progress=plain --load --target amd -f Containerfile.gkm-agents -t ${AGENT_AMD_IMG} . + +.PHONY: build-image-agent-nogpu +build-image-agent-nogpu: + $(CONTAINER_TOOL) build $(CONTAINER_FLAGS) --progress=plain --load --target nogpu -f Containerfile.gkm-agents -t ${AGENT_NOGPU_IMG} . 
+ +.PHONY: build-image-agents +ifeq ($(NO_GPU_BUILD),true) +build-image-agents: build-image-agent-nogpu ## Build no-GPU agent only (NO_GPU_BUILD=true) +else +build-image-agents: build-image-agent-nvidia build-image-agent-amd build-image-agent-nogpu ## Build all agent images (NVIDIA, AMD, and no-GPU) +endif + # If you wish to build the operator image targeting other platforms you can use the --platform flag. # (i.e. docker build --platform linux/arm64). However, you must enable docker buildKit for it. # More info: https://docs.docker.com/develop/develop-images/build_enhancements/ .PHONY: build-images -build-images: build-image-operator build-image-agent build-image-gkm-extract ## Build all container images. +build-images: build-image-operator build-image-agents build-image-gkm-extract ## Build all container images. .PHONY: push-images -push-images: ## Push all container image. +push-images: ## Push all container images. $(CONTAINER_TOOL) push ${OPERATOR_IMG} - $(CONTAINER_TOOL) push ${AGENT_IMG} $(CONTAINER_TOOL) push ${EXTRACT_IMG} +ifeq ($(NO_GPU_BUILD),true) + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} +else + $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} + $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} +endif + +.PHONY: push-images-agents +ifeq ($(NO_GPU_BUILD),true) +push-images-agents: ## Push no-GPU agent only (NO_GPU_BUILD=true) + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} +else +push-images-agents: ## Push all agent images + $(CONTAINER_TOOL) push ${AGENT_NVIDIA_IMG} + $(CONTAINER_TOOL) push ${AGENT_AMD_IMG} + $(CONTAINER_TOOL) push ${AGENT_NOGPU_IMG} +endif # Mapping old commands after rename .PHONY: docker-build @@ -292,10 +337,28 @@ uninstall: manifests kustomize ## Uninstall CRDs from the K8s cluster specified $(KUSTOMIZE) build config/crd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - ##@ Deployment + +.PHONY: deploy-nfd +deploy-nfd: kustomize ## Deploy Node Feature Discovery for GPU detection + @echo 
"Deploying Node Feature Discovery (NFD)..." + $(KUSTOMIZE) build config/nfd | $(KUBECTL) apply -f - + @echo "Waiting for NFD to be ready..." + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n node-feature-discovery deployment/nfd-master || true + @echo "NFD deployed successfully." + +.PHONY: undeploy-nfd +undeploy-nfd: kustomize ## Undeploy Node Feature Discovery + @echo "Undeploying Node Feature Discovery..." + $(KUSTOMIZE) build config/nfd | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - + @echo "NFD undeployed." + .PHONY: prepare-deploy prepare-deploy: cd config/operator && $(KUSTOMIZE) edit set image quay.io/gkm/operator=${OPERATOR_IMG} - cd config/agent && $(KUSTOMIZE) edit set image quay.io/gkm/agent=${AGENT_IMG} + cd config/agent && $(KUSTOMIZE) edit set image \ + quay.io/gkm/agent-nvidia=${AGENT_NVIDIA_IMG} \ + quay.io/gkm/agent-amd=${AGENT_AMD_IMG} \ + quay.io/gkm/agent-nogpu=${AGENT_NOGPU_IMG} ifdef NO_GPU cd config/configMap && \ $(SED) \ @@ -318,7 +381,15 @@ ifneq ($(KYVERNO_ENABLED),true) endif .PHONY: deploy -deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +ifeq ($(SKIP_NFD),true) +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager redeploy ## Deploy controller and agent (skips NFD for Kind) +else +deploy: manifests kustomize prepare-deploy webhook-secret-file deploy-cert-manager deploy-nfd redeploy ## Deploy controller and agent to the K8s cluster specified in ~/.kube/config +endif +ifeq ($(KYVERNO_ENABLED),true) + @echo "Deploying Kyverno (KYVERNO_ENABLED=true)..." + $(MAKE) deploy-kyverno-with-policies +endif .PHONY: redeploy redeploy: ## Redeploy controller and agent to the K8s cluster after deploy and undeploy have been called. Skips some onetime steps in deploy. 
@@ -333,6 +404,15 @@ undeploy: kustomize delete-webhook-secret-file ## Undeploy operator and agent fr exit 1; \ fi $(KUSTOMIZE) build $(DEPLOY_PATH) | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - +ifeq ($(KYVERNO_ENABLED),true) + @echo "Undeploying Kyverno (KYVERNO_ENABLED=true)..." + -$(MAKE) undeploy-kyverno-policies + -$(MAKE) undeploy-kyverno-production +endif +ifneq ($(SKIP_NFD),true) + @echo "Undeploying NFD..." + -$(MAKE) undeploy-nfd +endif @echo "Undeployment from $(DEPLOY_PATH) completed." .PHONY: undeploy-force @@ -436,9 +516,15 @@ undeploy-cert-manager: delete-webhook-secret-file ##@ Kyverno KYVERNO_VERSION ?= latest +KYVERNO_NAMESPACE ?= kyverno +KYVERNO_REPO ?= https://kyverno.github.io/kyverno/ HELM_VERSION ?= v3.16.3 HELM ?= $(LOCALBIN)/helm +# Common Kyverno helm flags +KYVERNO_HELM_FLAGS = --namespace $(KYVERNO_NAMESPACE) --create-namespace --repo $(KYVERNO_REPO) kyverno --wait +KYVERNO_KIND_CONTEXT = --kube-context kind-$(KIND_CLUSTER_NAME) + .PHONY: helm helm: $(HELM) ## Download helm locally if necessary. $(HELM): $(LOCALBIN) @@ -447,25 +533,43 @@ $(HELM): $(LOCALBIN) curl -sSL https://get.helm.sh/helm-$(HELM_VERSION)-$(GOOS)-$(GOARCH).tar.gz | tar xz -C $(LOCALBIN) --strip-components=1 $(GOOS)-$(GOARCH)/helm ; \ } +# Internal target for deploying Kyverno with configurable context +.PHONY: _deploy-kyverno-base +_deploy-kyverno-base: helm + @echo "Installing Kyverno..." +ifeq ($(SKIP_NFD),true) + @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (SKIP_NFD=true for Kind)..." + $(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ + --values config/kyverno/values-no-gpu.yaml +else + @echo "Using default Kyverno configuration..." + $(HELM) upgrade --install kyverno $(KYVERNO_HELM_FLAGS) $(KYVERNO_CONTEXT) \ + --values config/kyverno/values.yaml +endif +ifdef KYVERNO_WAIT + @echo "Waiting for Kyverno to be ready..." 
+ @$(KUBECTL) wait --for=condition=Available --timeout=120s -n $(KYVERNO_NAMESPACE) deployment/kyverno-admission-controller || true +endif + @echo "Kyverno deployed successfully." + .PHONY: deploy-kyverno -deploy-kyverno: helm ## Deploy Kyverno with optional GPU tolerations for Kind cluster - @echo "Installing Kyverno to cluster $(KIND_CLUSTER_NAME)..." -ifeq ($(NO_GPU),true) - @echo "Using Kyverno configuration with GPU nodeSelector and tolerations (NO_GPU=true)..." - $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values-no-gpu.yaml \ - --wait +deploy-kyverno: ## Deploy Kyverno for Kind cluster + @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="$(KYVERNO_KIND_CONTEXT)" + +.PHONY: deploy-kyverno-production +deploy-kyverno-production: ## Deploy Kyverno for production clusters + @$(MAKE) _deploy-kyverno-base KYVERNO_CONTEXT="" KYVERNO_WAIT=true + +.PHONY: deploy-kyverno-with-policies +ifeq ($(SKIP_NFD),true) +deploy-kyverno-with-policies: deploy-kyverno deploy-kyverno-policies ## Deploy Kyverno and its policies (uses Kind values with GPU tolerations) else - @echo "Using default Kyverno configuration for production GPU environments..." - $(HELM) upgrade --install kyverno --namespace kyverno --create-namespace \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --repo https://kyverno.github.io/kyverno/ kyverno \ - --values config/kyverno/values.yaml \ - --wait +deploy-kyverno-with-policies: deploy-kyverno-production deploy-kyverno-policies ## Deploy Kyverno and its policies (uses production values) endif - @echo "Kyverno deployed successfully to $(KIND_CLUSTER_NAME)." + @echo "Restarting Kyverno to discover GKM CRDs..." 
+ @$(KUBECTL) rollout restart deployment/kyverno-admission-controller -n $(KYVERNO_NAMESPACE) + @$(KUBECTL) wait --for=condition=Available --timeout=120s -n $(KYVERNO_NAMESPACE) deployment/kyverno-admission-controller || true + @echo "Kyverno and policies deployed successfully." .PHONY: deploy-kyverno-policies deploy-kyverno-policies: kustomize ## Deploy Kyverno ClusterPolicies for GKMCache image verification @@ -479,14 +583,21 @@ undeploy-kyverno-policies: kustomize ## Undeploy Kyverno ClusterPolicies $(KUSTOMIZE) build config/kyverno/policies | $(KUBECTL) delete --ignore-not-found=$(ignore-not-found) -f - @echo "Kyverno policies undeployed." +# Internal target for undeploying Kyverno with configurable context +.PHONY: _undeploy-kyverno-base +_undeploy-kyverno-base: + @echo "Uninstalling Kyverno..." + $(HELM) uninstall kyverno --namespace $(KYVERNO_NAMESPACE) $(KYVERNO_CONTEXT) --ignore-not-found || true + $(KUBECTL) delete namespace $(KYVERNO_NAMESPACE) --ignore-not-found=$(ignore-not-found) + @echo "Kyverno undeployed." + .PHONY: undeploy-kyverno -undeploy-kyverno: ## Undeploy Kyverno - @echo "Uninstalling Kyverno from cluster $(KIND_CLUSTER_NAME)..." - $(HELM) uninstall kyverno --namespace kyverno \ - --kube-context kind-$(KIND_CLUSTER_NAME) \ - --ignore-not-found || true - $(KUBECTL) delete namespace kyverno --ignore-not-found=$(ignore-not-found) - @echo "Kyverno undeployed from $(KIND_CLUSTER_NAME)." 
+undeploy-kyverno: ## Undeploy Kyverno from Kind cluster + @$(MAKE) _undeploy-kyverno-base KYVERNO_CONTEXT="$(KYVERNO_KIND_CONTEXT)" + +.PHONY: undeploy-kyverno-production +undeploy-kyverno-production: ## Undeploy Kyverno from production cluster + @$(MAKE) _undeploy-kyverno-base KYVERNO_CONTEXT="" ##@ Kind Cluster Management @@ -500,8 +611,17 @@ setup-kind: kind-gpu-sim-script kind-load-images: kind-gpu-sim-script get-example-images @echo "Loading operator image ${OPERATOR_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${OPERATOR_IMG} --cluster-name=$(KIND_CLUSTER_NAME) - @echo "Loading agent image ${AGENT_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" - cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +ifeq ($(NO_GPU_BUILD),true) + @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +else + @echo "Loading agent nvidia image ${AGENT_NVIDIA_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NVIDIA_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent amd image ${AGENT_AMD_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_AMD_IMG} --cluster-name=$(KIND_CLUSTER_NAME) + @echo "Loading agent nogpu image ${AGENT_NOGPU_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" + cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${AGENT_NOGPU_IMG} --cluster-name=$(KIND_CLUSTER_NAME) +endif @echo "Loading gkm-extract image ${EXTRACT_IMG} into Kind cluster: $(KIND_CLUSTER_NAME)" cat $(KIND_GPU_SIM_SCRIPT) | bash -s load --image-name=${EXTRACT_IMG} --cluster-name=$(KIND_CLUSTER_NAME) @echo "Images loaded successfully into Kind cluster: $(KIND_CLUSTER_NAME)" @@ -513,17 +633,6 @@ tmp-cleanup: .PHONY: 
run-on-kind run-on-kind: destroy-kind setup-kind deploy-on-kind ## Setup Kind cluster, load images, and deploy -ifeq ($(KYVERNO_ENABLED),true) - @echo "Deploying Kyverno after GKM CRDs (KYVERNO_ENABLED=true)..." - $(MAKE) deploy-kyverno NO_GPU=true - @echo "Waiting for Kyverno to be ready..." - $(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller || true - @echo "Deploying Kyverno policies..." - $(MAKE) deploy-kyverno-policies - @echo "Restarting Kyverno to discover GKM CRDs..." - $(KUBECTL) rollout restart deployment/kyverno-admission-controller -n kyverno - $(KUBECTL) wait --for=condition=Available --timeout=120s -n kyverno deployment/kyverno-admission-controller -endif @echo "Cluster created, images loaded, and agent deployed on Kind GPU cluster." .PHONY: deploy-on-kind @@ -533,16 +642,16 @@ deploy-on-kind: kind-load-images tmp-cleanup @echo "Add label gkm-test-node=false to node kind-gpu-sim-worker2." $(KUBECTL) label node kind-gpu-sim-worker2 gkm-test-node=false --overwrite ## NOTE: config/kind-gpu is an overlay of config/default - $(MAKE) deploy DEPLOY_PATH=config/kind-gpu NO_GPU=true + $(MAKE) deploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true NO_GPU=true .PHONY: redeploy-on-kind redeploy-on-kind: ## Redeploy controller and agent to Kind GPU cluster after run-on-kind and undeploy-on-kind have been called. Skips some onetime steps in deploy. - $(MAKE) redeploy DEPLOY_PATH=config/kind-gpu NO_GPU=true + $(MAKE) redeploy DEPLOY_PATH=config/kind-gpu SKIP_NFD=true @echo "Deployment to $(DEPLOY_PATH) completed." .PHONY: undeploy-on-kind undeploy-on-kind: ## Undeploy operator and agent from the Kind GPU cluster. - $(MAKE) undeploy FORCE=$(FORCE) DEPLOY_PATH=config/kind-gpu ignore-not-found=$(ignore-not-found) + $(MAKE) undeploy FORCE=$(FORCE) DEPLOY_PATH=config/kind-gpu SKIP_NFD=true ignore-not-found=$(ignore-not-found) @echo "Undeployment from Kind GPU cluster $(KIND_CLUSTER_NAME) completed." 
.PHONY: undeploy-on-kind-force @@ -604,7 +713,7 @@ kind-gpu-sim-script: $(KIND_GPU_SIM_SCRIPT) ## Download kind-gpu-sim-script loc $(KIND_GPU_SIM_SCRIPT): $(LOCALBIN) if [ ! -f $(KIND_GPU_SIM_SCRIPT) ]; then \ echo "Downloading $(KIND_GPU_SIM_SCRIPT)"; \ - wget -P $(LOCALBIN) $(KIND_GPU_SIM_SCRIPT_URL); \ + curl -L -o $(KIND_GPU_SIM_SCRIPT) $(KIND_GPU_SIM_SCRIPT_URL); \ chmod +x $(KIND_GPU_SIM_SCRIPT); \ fi diff --git a/config/agent/README.md b/config/agent/README.md new file mode 100644 index 000000000..71161a7e3 --- /dev/null +++ b/config/agent/README.md @@ -0,0 +1,198 @@ +# Multi-GPU Agent Deployment + +This directory contains configuration for deploying GPU-specific GKM agents that support both NVIDIA and AMD GPUs in heterogeneous clusters. + +## Overview + +GKM now supports deploying different agent containers based on the GPU hardware present on each node: + +- **`gkm-agent-nvidia`**: For nodes with NVIDIA GPUs +- **`gkm-agent-amd`**: For nodes with AMD ROCm GPUs +- **`gkm-agent-nogpu`**: For nodes without GPUs (replaces the removed legacy `gkm-agent`) + +## Architecture + +Each GPU-specific agent uses: +- **Different base images** with appropriate GPU runtime libraries +- **Node selectors** to deploy only on compatible hardware +- **Automatic node labeling** via Node Feature Discovery (NFD) + +## Prerequisites + +### 1. Node Feature Discovery (NFD) + +NFD must be deployed to automatically label nodes with their PCI device information: + +```bash +# Deploy NFD +kubectl apply -k config/nfd + +# Verify NFD is running +kubectl get pods -n node-feature-discovery + +# Check node labels (should see pci-* labels) +kubectl get nodes -o json | jq '.items[].metadata.labels' | grep pci +``` + +NFD will automatically add labels like: +- `feature.node.kubernetes.io/pci-0300_10de.present=true` (NVIDIA, class 0x0300, vendor ID: 0x10de) +- `feature.node.kubernetes.io/pci-0300_1002.present=true` (AMD, class 0x0300, vendor ID: 0x1002) + +### 2. 
GPU Device Plugins + +Ensure appropriate GPU device plugins are installed: + +**For NVIDIA:** +```bash +kubectl apply -f https://raw.githubusercontent.com/NVIDIA/k8s-device-plugin/v0.17.0/deployments/static/nvidia-device-plugin.yml +``` + +**For AMD:** +```bash +kubectl apply -f https://raw.githubusercontent.com/ROCm/k8s-device-plugin/master/k8s-ds-amdgpu-dp.yaml +``` + +## Building GPU-Specific Agent Images + +### Build All GPU Agents +```bash +make build-image-agents +``` + +### Build Individual Agents +```bash +# NVIDIA agent +make build-image-agent-nvidia + +# AMD agent +make build-image-agent-amd +``` + +### Push Images to Registry +```bash +# Set your registry +export QUAY_USER=your-org + +# Push GPU-specific agents +make push-images-agents +``` + +## Deployment + +### Deploy with Kustomize +```bash +kubectl apply -k config/agent +``` + +This will deploy: +- `agent-nvidia` DaemonSet → Only on nodes with NVIDIA PCI device labels (e.g. `feature.node.kubernetes.io/pci-0300_10de.present=true`) +- `agent-amd` DaemonSet → Only on nodes with AMD PCI device labels (e.g. `feature.node.kubernetes.io/pci-0300_1002.present=true`) + +### Verify Deployment +```bash +# Check which agents are running +kubectl get ds -n gkm-system + +# Check agent pods and their node placement +kubectl get pods -n gkm-system -o wide + +# Verify agents are on correct nodes +kubectl get pods -n gkm-system -l gpu-vendor=nvidia -o wide +kubectl get pods -n gkm-system -l gpu-vendor=amd -o wide +``` + +## Containerfiles + +All agent variants are built from [Containerfile.gkm-agents](../../Containerfile.gkm-agents) using multi-stage targets: + +### NVIDIA Agent (target: `nvidia`) +- Base image: `nvcr.io/nvidia/cuda:12.6.3-base-ubuntu24.04` +- Includes: NVIDIA CUDA runtime with NVML libraries +- Requires: NVIDIA driver on host + +### AMD Agent (target: `amd`) +- Base image: extends `nogpu` target +- Includes: ROCm libraries (`amd-smi-lib`, `rocm-smi-lib`) +- Requires: AMD GPU driver on host + +### No-GPU Agent (target: `nogpu`) +- Base image: `ubuntu:24.04` +- 
Includes: Common runtime dependencies only +- For non-GPU workloads + +## Node Selectors + +The DaemonSets schedule on NFD's PCI class/vendor labels, for example: + +```yaml +# NVIDIA nodes (PCI class 0300/0302, vendor 10de) +nodeSelector: + feature.node.kubernetes.io/pci-0300_10de.present: "true" + +# AMD nodes (PCI class 0300/0302/0380, vendor 1002) +nodeSelector: + feature.node.kubernetes.io/pci-0300_1002.present: "true" +``` + +## Hybrid GPU Clusters + +In clusters with both NVIDIA and AMD nodes: + +1. **NFD labels all nodes** with their PCI device information +2. **NVIDIA agent** deploys only to NVIDIA nodes +3. **AMD agent** deploys only to AMD nodes +4. **Operator** works with whichever agent is present on each node + +## Troubleshooting + +### NFD Not Labeling Nodes + +```bash +# Check NFD worker logs +kubectl logs -n node-feature-discovery -l app=nfd-worker + +# Manually verify PCI devices +lspci | grep -i vga +lspci -n | grep -E "0300|0302" +``` + +### Agent Not Scheduling + +```bash +# Check node labels +kubectl describe node | grep feature.node.kubernetes.io/pci + +# Check DaemonSet events +kubectl describe ds agent-nvidia -n gkm-system +kubectl describe ds agent-amd -n gkm-system +``` + +### GPU Libraries Not Found + +```bash +# Check NVIDIA driver +nvidia-smi + +# Check AMD driver +rocm-smi + +# Verify libraries in container +kubectl exec -it <agent-pod> -n gkm-system -- ls -la /usr/lib/x86_64-linux-gnu/ | grep -E "nvidia|amd" +``` + +## Migration from Generic Agent + +To migrate from the legacy generic agent: + +1. Deploy NFD: `kubectl apply -k config/nfd` +2. Build GPU-specific agents: `make build-image-agents` +3. Update manifests to use new agent DaemonSets +4. Deploy: `kubectl apply -k config/agent` +5. 
Remove old generic agent: `kubectl delete ds agent -n gkm-system` + +## Related Files + +- [gkm-agent-nvidia.yaml](gkm-agent-nvidia.yaml) - NVIDIA DaemonSet +- [gkm-agent-amd.yaml](gkm-agent-amd.yaml) - AMD DaemonSet +- [kustomization.yaml](kustomization.yaml) - Kustomize configuration +- [../../Containerfile.gkm-agents](../../Containerfile.gkm-agents) - Multi-target agent Containerfile diff --git a/config/agent/gkm-agent-amd.yaml b/config/agent/gkm-agent-amd.yaml new file mode 100644 index 000000000..3187c3184 --- /dev/null +++ b/config/agent/gkm-agent-amd.yaml @@ -0,0 +1,124 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: agent-amd + namespace: gkm-system + labels: + app: gkm-agent + gpu-vendor: amd +spec: + selector: + matchLabels: + app: gkm-agent + gpu-vendor: amd + template: + metadata: + labels: + app: gkm-agent + gpu-vendor: amd + spec: + serviceAccountName: gkm-agent + # Deploy only on nodes with AMD GPUs + # AMD vendor ID is 1002, with class codes: + # 0300: VGA controller, 0302: 3D controller, 0380: Display controller + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: Exists + containers: + - name: gkm-agent + image: quay.io/gkm/gkm-agent-amd:latest + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 0 + privileged: true + capabilities: + add: ["CAP_DAC_OVERRIDE", "CAP_FOWNER"] + seccompProfile: + type: Unconfined + env: + - name: NO_GPU + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.nogpu + - name: GO_LOG + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.agent.log.level + - name: EXTRACT_IMAGE + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.extract.image + - 
name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: LD_LIBRARY_PATH + value: /opt/rocm/lib:/usr/lib64 + resources: + limits: + memory: "128Mi" + cpu: "100m" + amd.com/gpu: "0" # Request 0 GPUs (agent only monitors, doesn't use GPU) + volumeMounts: + - name: gkm-state + mountPath: /var/lib/gkm + mountPropagation: Bidirectional + - name: gkm-runtime + mountPath: /run/gkm + mountPropagation: Bidirectional + - name: sys + mountPath: /sys + readOnly: true + - name: dev + mountPath: /dev + - name: rocm-libs + mountPath: /opt/rocm + readOnly: true + - name: system-libs + mountPath: /usr/lib64 + readOnly: true + + volumes: + # This volume is the GKM State directory. This is where GPU Kernel Cache + # will be extracted. + - name: gkm-state + hostPath: + path: /var/lib/gkm + type: DirectoryOrCreate + # This volume is the GKM Runtime directory. This is where the Usage data + # will tracked which pods are using which cache. + - name: gkm-runtime + hostPath: + path: /run/gkm + type: DirectoryOrCreate + - name: sys + hostPath: + path: /sys + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory + # ROCm libraries needed for AMD GPU management + - name: rocm-libs + hostPath: + path: /opt/rocm + type: DirectoryOrCreate + # System libraries for GPU access + - name: system-libs + hostPath: + path: /usr/lib64 + type: Directory diff --git a/config/agent/gkm-agent-nogpu.yaml b/config/agent/gkm-agent-nogpu.yaml new file mode 100644 index 000000000..66e3121b4 --- /dev/null +++ b/config/agent/gkm-agent-nogpu.yaml @@ -0,0 +1,100 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: agent-nogpu + namespace: gkm-system + labels: + app: gkm-agent + gpu-vendor: none +spec: + selector: + matchLabels: + app: gkm-agent + gpu-vendor: none + template: + metadata: + labels: + app: gkm-agent + gpu-vendor: none + spec: + serviceAccountName: gkm-agent + # Deploy on nodes without GPUs (nodes that don't have NVIDIA or AMD PCI labels) + # and exclude 
control-plane nodes in multi-node clusters + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0300_1002.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0302_1002.present + operator: DoesNotExist + - key: feature.node.kubernetes.io/pci-0380_1002.present + operator: DoesNotExist + - key: node-role.kubernetes.io/control-plane + operator: DoesNotExist + containers: + - name: gkm-agent + image: quay.io/gkm/gkm-agent-nogpu:latest + imagePullPolicy: IfNotPresent + securityContext: + runAsUser: 0 + privileged: true + capabilities: + add: ["CAP_DAC_OVERRIDE", "CAP_FOWNER"] + seccompProfile: + type: Unconfined + env: + - name: NO_GPU + value: "true" + - name: GO_LOG + valueFrom: + configMapKeyRef: + name: gkm-config + key: gkm.agent.log.level + - name: KUBE_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + resources: + limits: + memory: "128Mi" + cpu: "100m" + volumeMounts: + - name: gkm-state + mountPath: /var/lib/gkm + mountPropagation: Bidirectional + - name: gkm-runtime + mountPath: /run/gkm + mountPropagation: Bidirectional + - name: sys + mountPath: /sys + readOnly: true + - name: dev + mountPath: /dev + + volumes: + # This volume is the GKM State directory. This is where GPU Kernel Cache + # will be extracted. + - name: gkm-state + hostPath: + path: /var/lib/gkm + type: DirectoryOrCreate + # This volume is the GKM Runtime directory. This is where the Usage data + # will tracked which pods are using which cache. 
+ - name: gkm-runtime + hostPath: + path: /run/gkm + type: DirectoryOrCreate + - name: sys + hostPath: + path: /sys + type: Directory + - name: dev + hostPath: + path: /dev + type: Directory diff --git a/config/agent/gkm-agent.yaml b/config/agent/gkm-agent-nvidia.yaml similarity index 65% rename from config/agent/gkm-agent.yaml rename to config/agent/gkm-agent-nvidia.yaml index b108bbcd6..ceb1f63ee 100644 --- a/config/agent/gkm-agent.yaml +++ b/config/agent/gkm-agent-nvidia.yaml @@ -1,23 +1,39 @@ apiVersion: apps/v1 kind: DaemonSet metadata: - name: agent + name: agent-nvidia namespace: gkm-system labels: app: gkm-agent + gpu-vendor: nvidia spec: selector: matchLabels: app: gkm-agent + gpu-vendor: nvidia template: metadata: labels: app: gkm-agent + gpu-vendor: nvidia spec: serviceAccountName: gkm-agent + # Deploy only on nodes with NVIDIA GPUs + # NVIDIA vendor ID is 10de, with class codes: + # 0300: VGA controller, 0302: 3D controller + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists containers: - name: gkm-agent - image: quay.io/gkm/agent:latest + image: quay.io/gkm/gkm-agent-nvidia:latest imagePullPolicy: IfNotPresent securityContext: runAsUser: 0 @@ -37,19 +53,17 @@ spec: configMapKeyRef: name: gkm-config key: gkm.agent.log.level - - name: EXTRACT_IMAGE - valueFrom: - configMapKeyRef: - name: gkm-config - key: gkm.extract.image - name: KUBE_NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName + - name: LD_LIBRARY_PATH + value: /usr/lib64 resources: limits: memory: "128Mi" cpu: "100m" + nvidia.com/gpu: "0" # Request 0 GPUs (agent only monitors, doesn't use GPU) volumeMounts: - name: gkm-state mountPath: /var/lib/gkm @@ -62,6 +76,9 @@ spec: readOnly: true - name: dev mountPath: /dev + - name: nvidia-libs + mountPath: 
/usr/lib64 + readOnly: true volumes: # This volume is the GKM State directory. This is where GPU Kernel Cache @@ -84,3 +101,8 @@ spec: hostPath: path: /dev type: Directory + # NVIDIA libraries needed for NVML (NVIDIA Management Library) + - name: nvidia-libs + hostPath: + path: /usr/lib64 + type: Directory diff --git a/config/agent/kustomization.yaml b/config/agent/kustomization.yaml index 07e67158f..c24b0d22e 100644 --- a/config/agent/kustomization.yaml +++ b/config/agent/kustomization.yaml @@ -1,11 +1,29 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization + +# Deploy GPU-specific agents based on node hardware +# Requires Node Feature Discovery (NFD) to label nodes resources: -- gkm-agent.yaml +- gkm-agent-nvidia.yaml +- gkm-agent-amd.yaml +- gkm-agent-nogpu.yaml + images: -- name: agent - newName: quay.io/gkm/agent +- name: quay.io/gkm/agent-amd + newName: quay.io/gkm/gkm-agent-amd newTag: latest -- name: quay.io/gkm/agent - newName: quay.io/gkm/agent +- name: quay.io/gkm/agent-nogpu + newName: quay.io/gkm/gkm-agent-nogpu + newTag: latest +- name: quay.io/gkm/agent-nvidia + newName: quay.io/gkm/gkm-agent-nvidia + newTag: latest +- name: quay.io/gkm/gkm-agent-amd + newName: quay.io/gkm/gkm-agent-amd + newTag: latest +- name: quay.io/gkm/gkm-agent-nogpu + newName: quay.io/gkm/gkm-agent-nogpu + newTag: latest +- name: quay.io/gkm/gkm-agent-nvidia + newName: quay.io/gkm/gkm-agent-nvidia newTag: latest diff --git a/config/configMap/configMap.yaml b/config/configMap/configMap.yaml index 12d3b863c..cecb43866 100644 --- a/config/configMap/configMap.yaml +++ b/config/configMap/configMap.yaml @@ -8,7 +8,8 @@ data: gkm.operator.log.level: info gkm.agent.log.level: info ## Can be configured at runtime - gkm.agent.image: quay.io/gkm/agent:latest + ## Note: gkm.agent.image is legacy/unused - agents use GPU-specific images + gkm.agent.image: quay.io/gkm/gkm-agent-nogpu:latest gkm.extract.image: quay.io/gkm/gkm-extract:latest gkm.nogpu: false ## Enable/disable 
Kyverno image signature verification (defaults to true/enabled) diff --git a/config/configMap/kustomization.yaml b/config/configMap/kustomization.yaml index b46ed2b73..a33898c8b 100644 --- a/config/configMap/kustomization.yaml +++ b/config/configMap/kustomization.yaml @@ -9,7 +9,8 @@ configMapGenerator: - behavior: merge literals: - gkm.nogpu=true - - gkm.agent.image=quay.io/gkm/agent:latest + # Note: gkm.agent.image is legacy/unused - agents use GPU-specific images + - gkm.agent.image=quay.io/gkm/gkm-agent-nogpu:latest - gkm.extract.image=quay.io/gkm/gkm-extract:latest name: config namespace: gkm-system diff --git a/config/kind-gpu/agent-remove-affinity-patch.yaml b/config/kind-gpu/agent-remove-affinity-patch.yaml new file mode 100644 index 000000000..734147088 --- /dev/null +++ b/config/kind-gpu/agent-remove-affinity-patch.yaml @@ -0,0 +1,10 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: gkm-agent-nogpu + namespace: gkm-system +spec: + template: + spec: + # Remove node affinity for Kind - schedule on all worker nodes + affinity: null diff --git a/config/kind-gpu/kustomization.yaml b/config/kind-gpu/kustomization.yaml index d81e686c7..caaca0ba1 100644 --- a/config/kind-gpu/kustomization.yaml +++ b/config/kind-gpu/kustomization.yaml @@ -4,9 +4,24 @@ resources: apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization patches: + # Add GPU tolerations and nodeSelector to all agents - target: group: apps version: v1 kind: DaemonSet - name: gkm-agent + name: gkm-agent-amd path: agent-patch.yaml + - target: + group: apps + version: v1 + kind: DaemonSet + name: gkm-agent-nvidia + path: agent-patch.yaml + - target: + group: apps + version: v1 + kind: DaemonSet + name: gkm-agent-nogpu + path: agent-patch.yaml + # Remove node affinity for nogpu agent in Kind (no NFD labels) + - path: agent-remove-affinity-patch.yaml diff --git a/config/kyverno/values-no-gpu.yaml b/config/kyverno/values-no-gpu.yaml index 3f086fbd7..28606d392 100644 --- 
a/config/kyverno/values-no-gpu.yaml +++ b/config/kyverno/values-no-gpu.yaml @@ -33,3 +33,13 @@ reportsController: operator: Equal value: "true" effect: NoSchedule + +# Jobs (e.g., migration resources) also need tolerations +hooks: + nodeSelector: + hardware-type: gpu + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/nfd/README.md b/config/nfd/README.md new file mode 100644 index 000000000..bc6ce2db5 --- /dev/null +++ b/config/nfd/README.md @@ -0,0 +1,167 @@ +# Node Feature Discovery (NFD) Configuration + +This directory contains the configuration for deploying [Node Feature Discovery](https://kubernetes-sigs.github.io/node-feature-discovery/) to automatically label nodes with hardware features, particularly GPU vendor information. + +## What is NFD? + +Node Feature Discovery is a Kubernetes add-on that detects hardware features available on each node and advertises those features using node labels. + +For GKM, NFD automatically labels nodes with PCI class/vendor IDs, enabling GPU-specific agent deployment: + +- **NVIDIA GPUs**: `feature.node.kubernetes.io/pci-0300_10de.present=true` (class 0300/0302, vendor ID: 0x10de) + +- **AMD GPUs**: `feature.node.kubernetes.io/pci-0300_1002.present=true` (class 0300/0302/0380, vendor ID: 0x1002) + +## Deployment + +### Deploy NFD +```bash +kubectl apply -k config/nfd +``` + +### Verify NFD is Running +```bash +# Check NFD pods +kubectl get pods -n node-feature-discovery + +# Expected output: +# NAME READY STATUS RESTARTS AGE +# nfd-master-xxxxx 1/1 Running 0 1m +# nfd-worker-xxxxx 1/1 Running 0 1m +# nfd-worker-yyyyy 1/1 Running 0 1m +``` + +### Check Node Labels +```bash +# View all NFD labels on a node +kubectl get node -o json | jq '.metadata.labels | with_entries(select(.key | startswith("feature.node.kubernetes.io")))' + +# Check for GPU vendor labels specifically +kubectl get nodes -L feature.node.kubernetes.io/pci-0300_10de.present,feature.node.kubernetes.io/pci-0300_1002.present +``` + +## How It Works + +1. 
**NFD Master**: Runs as a deployment, manages feature labeling +2. **NFD Worker**: Runs as a DaemonSet on each node, detects features +3. **Worker scans PCI devices** and creates labels for class/vendor IDs +4. **Labels are applied** to nodes automatically + +## Configuration + +### Default Configuration + +The default NFD configuration (via [kustomization.yaml](kustomization.yaml)) deploys NFD from the official upstream repository. + +### Custom Configuration (Optional) + +To customize NFD behavior, uncomment the patch in `kustomization.yaml` and modify [nfd-worker-conf.yaml](nfd-worker-conf.yaml): + +```yaml +# In kustomization.yaml +patchesStrategicMerge: + - nfd-worker-conf.yaml +``` + +The custom configuration enables: +- **PCI device detection** with focus on display controllers (GPUs) +- **Vendor ID labeling** for automatic GPU vendor identification +- **Configurable scan interval** (default: 60s) + +## Verification + +### Manual PCI Device Check + +On each node, you can manually verify GPU devices: + +```bash +# List all VGA/Display controllers +lspci | grep -i vga + +# Show vendor IDs numerically +lspci -n | grep -E "0300|0302" + +# Example outputs: +# NVIDIA: 01:00.0 0300: 10de:1b80 (rev a1) +# AMD: 01:00.0 0300: 1002:67df (rev c7) +``` + +### Verify Label Creation + +```bash +# List nodes with NVIDIA GPUs +kubectl get nodes -l feature.node.kubernetes.io/pci-0300_10de.present=true + +# List nodes with AMD GPUs +kubectl get nodes -l feature.node.kubernetes.io/pci-0300_1002.present=true +``` + +## Integration with GKM Agents + +NFD labels are used by GKM agent DaemonSets to deploy GPU-specific agents: + +```yaml +# From config/agent/gkm-agent-nvidia.yaml (nodeAffinity matchExpressions) +- key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + +# From config/agent/gkm-agent-amd.yaml (nodeAffinity matchExpressions) +- key: feature.node.kubernetes.io/pci-0300_1002.present + operator: Exists +``` + +This ensures: +- NVIDIA agents only run on NVIDIA GPU nodes +- AMD agents only run on AMD GPU nodes +- No manual node labeling required 
+ +## Troubleshooting + +### NFD Not Detecting GPUs + +1. **Check NFD worker logs:** + ```bash + kubectl logs -n node-feature-discovery -l app=nfd-worker + ``` + +2. **Verify PCI devices are present:** + ```bash + # SSH to node + lspci | grep -i vga + ``` + +3. **Check NFD configuration:** + ```bash + kubectl get cm -n node-feature-discovery nfd-worker-conf -o yaml + ``` + +### Labels Not Appearing + +1. **Restart NFD worker:** + ```bash + kubectl rollout restart daemonset/nfd-worker -n node-feature-discovery + ``` + +2. **Force re-labeling:** + ```bash + kubectl delete pod -n node-feature-discovery -l app=nfd-worker + ``` + +### Wrong Vendor ID + +Common PCI vendor IDs: +- **NVIDIA**: `10de` +- **AMD**: `1002` +- **Intel**: `8086` + +If using a different GPU vendor, update the node selectors in the agent DaemonSets. + +## Resources + +- [NFD GitHub](https://github.com/kubernetes-sigs/node-feature-discovery) +- [NFD Documentation](https://kubernetes-sigs.github.io/node-feature-discovery/) +- [PCI Vendor IDs Database](https://pci-ids.ucw.cz/) + +## Files + +- [kustomization.yaml](kustomization.yaml) - Main NFD deployment configuration +- [nfd-worker-conf.yaml](nfd-worker-conf.yaml) - Optional custom NFD worker configuration diff --git a/config/nfd/kustomization.yaml b/config/nfd/kustomization.yaml new file mode 100644 index 000000000..7490a2ea6 --- /dev/null +++ b/config/nfd/kustomization.yaml @@ -0,0 +1,12 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization + +# Deploy Node Feature Discovery from official Helm chart +# This will automatically label nodes with GPU vendor information +resources: + - https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.17.2 + +# Patches for GPU-tainted nodes (Kind cluster) +patchesStrategicMerge: + - patch-nfd-gc.yaml + - patch-nfd-workers.yaml diff --git a/config/nfd/nfd-worker-conf.yaml b/config/nfd/nfd-worker-conf.yaml new file mode 100644 index 000000000..c183e0cb0 --- 
/dev/null +++ b/config/nfd/nfd-worker-conf.yaml @@ -0,0 +1,33 @@ +# Optional NFD Worker Configuration +# This file customizes NFD to ensure PCI device detection is enabled +# Uncomment in kustomization.yaml to use + +apiVersion: v1 +kind: ConfigMap +metadata: + name: nfd-worker-conf + namespace: node-feature-discovery +data: + nfd-worker.conf: | + core: + labelWhiteList: [".*"] # Enable all labels + noPublish: false + sleepInterval: 60s + sources: + - pci # Ensure PCI device detection is enabled + - cpu + - kernel + - memory + - network + - storage + - system + - usb + sources: + pci: + deviceClassWhitelist: + - "03" # Display controllers (GPUs are in this class) + - "0300" # VGA compatible controller + - "0301" # XGA compatible controller + - "0302" # 3D controller + deviceLabelFields: + - vendor # Will create labels like feature.node.kubernetes.io/pci-10de.present diff --git a/config/nfd/patch-nfd-gc.yaml b/config/nfd/patch-nfd-gc.yaml new file mode 100644 index 000000000..b717c07e2 --- /dev/null +++ b/config/nfd/patch-nfd-gc.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: nfd-gc + namespace: node-feature-discovery +spec: + template: + spec: + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/nfd/patch-nfd-workers.yaml b/config/nfd/patch-nfd-workers.yaml new file mode 100644 index 000000000..c8fa653cd --- /dev/null +++ b/config/nfd/patch-nfd-workers.yaml @@ -0,0 +1,13 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: nfd-worker + namespace: node-feature-discovery +spec: + template: + spec: + tolerations: + - key: gpu + operator: Equal + value: "true" + effect: NoSchedule diff --git a/config/operator/kustomization.yaml b/config/operator/kustomization.yaml index b7f6673d4..edf1e14b8 100644 --- a/config/operator/kustomization.yaml +++ b/config/operator/kustomization.yaml @@ -4,8 +4,11 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization images: - name: 
controller - newName: quay.io/gkm/operator + newName: quay.io/gkm/gkm-operator + newTag: latest +- name: quay.io/gkm/gkm-operator + newName: quay.io/gkm/gkm-operator newTag: latest - name: quay.io/gkm/operator - newName: quay.io/gkm/operator + newName: quay.io/gkm/gkm-operator newTag: latest diff --git a/config/operator/operator.yaml b/config/operator/operator.yaml index d694ee2e5..917eb4b7f 100644 --- a/config/operator/operator.yaml +++ b/config/operator/operator.yaml @@ -71,7 +71,7 @@ spec: args: - --leader-elect - --health-probe-bind-address=:8081 - image: quay.io/gkm/operator:latest + image: quay.io/gkm/gkm-operator:latest imagePullPolicy: IfNotPresent securityContext: allowPrivilegeEscalation: false diff --git a/docs/GettingStartedGuide.md b/docs/GettingStartedGuide.md index b29dcffb4..a75ee7104 100644 --- a/docs/GettingStartedGuide.md +++ b/docs/GettingStartedGuide.md @@ -10,14 +10,41 @@ building GKM and description of how to deploy GKM. - kubectl version v1.11.3+. - Access to a Kubernetes v1.11.3+ cluster. -The following packages are also required to build: +### Automated Installation (RHEL 10 / CentOS Stream 10) + +For RHEL 10 or CentOS Stream 10 systems, you can install all +dependencies (including go, podman, kubectl, and build packages) using: + +```sh +make install-deps +``` + +This will: + +- Install system development packages (gpgme-devel, libdrm-devel, hwloc-devel) +- Install btrfs development headers +- Install or upgrade Go to v1.25.0+ if needed +- Install or upgrade Podman to v5.3.1+ if needed +- Install or upgrade kubectl to v1.11.3+ if needed + +### Manual Installation + +The following packages are required to build: + +**For Fedora/RHEL/CentOS:** ```sh sudo dnf install -y gpgme-devel libdrm-devel libbtrfs btrfs-progs \ btrfs-progs-devel hwloc hwloc-devel ``` -OR +> **Note for RHEL 10**: Some packages may not be available in standard +> repositories. 
Use `make install-deps` or see +> [hack/install_deps.sh](../hack/install_deps.sh) for the installation +> script that sources packages from CentOS Stream 10 and Fedora +> repositories. + +**For Debian/Ubuntu:** ```sh sudo apt-get install -y libgpgme-dev libbtrfs-dev btrfs-progs libgpgme11-dev \ diff --git a/examples/namespace/RWO/10-namespace.yaml b/examples/namespace/RWO/CUDA/10-namespace.yaml similarity index 59% rename from examples/namespace/RWO/10-namespace.yaml rename to examples/namespace/RWO/CUDA/10-namespace.yaml index bc47b15b7..dd97a60f3 100644 --- a/examples/namespace/RWO/10-namespace.yaml +++ b/examples/namespace/RWO/CUDA/10-namespace.yaml @@ -2,4 +2,4 @@ apiVersion: v1 kind: Namespace metadata: - name: gkm-test-ns-rwo-1 + name: gkm-test-ns-cuda-rwo-1 diff --git a/examples/namespace/RWO/CUDA/11-gkmcache.yaml b/examples/namespace/RWO/CUDA/11-gkmcache.yaml new file mode 100644 index 000000000..d16e27b3f --- /dev/null +++ b/examples/namespace/RWO/CUDA/11-gkmcache.yaml @@ -0,0 +1,19 @@ +--- +apiVersion: gkm.io/v1alpha1 +kind: GKMCache +metadata: + name: vector-add-cache-cuda-rwo + namespace: gkm-test-ns-cuda-rwo-1 + labels: + gkm.io/signature-format: cosign-v2 +spec: + image: quay.io/gkm/cache-examples:vector-add-cache-cuda-v2 + storageClassName: standard # Update this to match your cluster's storage class + + # Pod template for the extraction job + podTemplate: + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule diff --git a/examples/namespace/RWO/CUDA/12-ds.yaml b/examples/namespace/RWO/CUDA/12-ds.yaml new file mode 100644 index 000000000..f4e38910a --- /dev/null +++ b/examples/namespace/RWO/CUDA/12-ds.yaml @@ -0,0 +1,52 @@ +--- +kind: DaemonSet +apiVersion: apps/v1 +metadata: + name: gkm-test-cuda-rwo-ds-1 + namespace: gkm-test-ns-cuda-rwo-1 + labels: + gkm.io/pvcMutation: "true" +spec: + selector: + matchLabels: + name: gkm-test-cuda-rwo-ds-1 + template: + metadata: + labels: + name: gkm-test-cuda-rwo-ds-1 + 
gkm.io/pvc-mutation: "true" + spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # Node affinity to schedule only on NVIDIA GPU nodes + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists + + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + resources: + limits: + nvidia.com/gpu: 1 # Request 1 NVIDIA GPU + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: vector-add-cache-cuda-rwo diff --git a/examples/namespace/RWO/CUDA/13-pod.yaml b/examples/namespace/RWO/CUDA/13-pod.yaml new file mode 100644 index 000000000..fcb0bbc7d --- /dev/null +++ b/examples/namespace/RWO/CUDA/13-pod.yaml @@ -0,0 +1,41 @@ +--- +kind: Pod +apiVersion: v1 +metadata: + name: gkm-test-cuda-pod-1 + namespace: gkm-test-ns-cuda-rwo-1 +spec: + tolerations: + - key: nvidia.com/gpu + operator: Exists + effect: NoSchedule + + # Node affinity to schedule only on NVIDIA GPU nodes + # NVIDIA vendor ID is 10de, class code 0300 (VGA) or 0302 (3D controller) + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0300_10de.present + operator: Exists + - matchExpressions: + - key: feature.node.kubernetes.io/pci-0302_10de.present + operator: Exists + + containers: + - name: test + image: quay.io/fedora/fedora-minimal + imagePullPolicy: IfNotPresent + command: [sleep, 365d] + volumeMounts: + - name: kernel-volume + mountPath: /cache + readOnly: true + resources: + limits: + 
nvidia.com/gpu: 1 # Request 1 NVIDIA GPU + volumes: + - name: kernel-volume + persistentVolumeClaim: + claimName: vector-add-cache-cuda-rwo diff --git a/examples/namespace/RWO/CUDA/README.md b/examples/namespace/RWO/CUDA/README.md new file mode 100644 index 000000000..16fbe1080 --- /dev/null +++ b/examples/namespace/RWO/CUDA/README.md @@ -0,0 +1,149 @@ +# NVIDIA GPU Examples for GKM (ReadWriteOnce) + +This directory contains examples for deploying GKM with NVIDIA GPU support +using ReadWriteOnce (RWO) access mode. + +## Prerequisites + +1. Kubernetes cluster with NVIDIA GPUs +2. NVIDIA GPU Operator or device plugin installed +3. Node Feature Discovery (NFD) installed and configured +4. GKM operator deployed in the cluster +5. A storage class that supports ReadWriteOnce volumes + +## Storage Class Configuration + +Before deploying, verify your storage class: + +```bash +kubectl get sc +``` + +Update the `storageClassName` field in +[11-gkmcache.yaml](11-gkmcache.yaml) to match your cluster's storage +class. + +## Deployment + +### Option 1: Deploy All Resources + +```bash +kubectl apply -f examples/namespace/RWO/CUDA/ +``` + +### Option 2: Deploy Step by Step + +1. Create the namespace: + + ```bash + kubectl apply -f 10-namespace.yaml + ``` + +2. Create the GKMCache resource: + + ```bash + kubectl apply -f 11-gkmcache.yaml + ``` + +3. Wait for the PVC to be created and bound: + + ```bash + kubectl get pvc -n gkm-test-ns-cuda-rwo-1 -w + ``` + +4. 
Deploy a test workload (choose one): + - DaemonSet: `kubectl apply -f 12-ds.yaml` + - Pod: `kubectl apply -f 13-pod.yaml` + +## Verification + +Check the GKMCache status: + +```bash +kubectl get gkmcache -n gkm-test-ns-cuda-rwo-1 +kubectl describe gkmcache vector-add-cache-cuda-rwo -n gkm-test-ns-cuda-rwo-1 +``` + +Check the PVC: + +```bash +kubectl get pvc -n gkm-test-ns-cuda-rwo-1 +``` + +Check the extraction job: + +```bash +kubectl get jobs -n gkm-test-ns-cuda-rwo-1 +kubectl get pods -n gkm-test-ns-cuda-rwo-1 +``` + +Check the test workload: + +```bash +# For Pod +kubectl get pod gkm-test-cuda-pod-1 -n gkm-test-ns-cuda-rwo-1 +kubectl logs gkm-test-cuda-pod-1 -n gkm-test-ns-cuda-rwo-1 + +# For DaemonSet +kubectl get ds gkm-test-cuda-rwo-ds-1 -n gkm-test-ns-cuda-rwo-1 +kubectl get pods -n gkm-test-ns-cuda-rwo-1 -l name=gkm-test-cuda-rwo-ds-1 +``` + +Verify the cache is mounted: + +```bash +kubectl exec -it -n gkm-test-ns-cuda-rwo-1 gkm-test-cuda-pod-1 -- ls -la /cache +``` + +## Troubleshooting + +### PVC Pending State + +If the PVC remains in Pending state: + +```bash +kubectl describe pvc vector-add-cache-cuda-rwo -n gkm-test-ns-cuda-rwo-1 +``` + +Common issues: + +- Storage class not available or incorrect +- No nodes match the node selector +- Volume binding mode is `WaitForFirstConsumer` (PVC will bind when a + pod using it is scheduled) + +### Extraction Job Not Scheduling + +Check the extraction job: + +```bash +kubectl get jobs -n gkm-test-ns-cuda-rwo-1 +kubectl describe job -n gkm-test-ns-cuda-rwo-1 +``` + +Check for pod scheduling issues: + +```bash +kubectl get events -n gkm-test-ns-cuda-rwo-1 --sort-by='.lastTimestamp' +``` + +### Pod Not Scheduling on GPU Nodes + +If your cluster doesn't have NFD labels, you can either: + +1. Install and configure NFD (recommended) +2. 
Remove the `affinity` section from the pod/daemonset specs and use a + simpler node selector or label your GPU nodes manually + +Example without NFD: + +```yaml +nodeSelector: + your-gpu-label: "true" # Use whatever label identifies your GPU nodes +``` + +## Cleanup + +```bash +kubectl delete -f examples/namespace/RWO/CUDA/ +``` diff --git a/examples/namespace/RWO/ROCM/10-namespace.yaml b/examples/namespace/RWO/ROCM/10-namespace.yaml new file mode 100644 index 000000000..b91919dd6 --- /dev/null +++ b/examples/namespace/RWO/ROCM/10-namespace.yaml @@ -0,0 +1,5 @@ +--- +apiVersion: v1 +kind: Namespace +metadata: + name: gkm-test-ns-rocm-rwo-1 diff --git a/examples/namespace/RWO/11-gkmcache.yaml b/examples/namespace/RWO/ROCM/11-gkmcache.yaml similarity index 83% rename from examples/namespace/RWO/11-gkmcache.yaml rename to examples/namespace/RWO/ROCM/11-gkmcache.yaml index eb81bd8a5..5fb6e2c34 100644 --- a/examples/namespace/RWO/11-gkmcache.yaml +++ b/examples/namespace/RWO/ROCM/11-gkmcache.yaml @@ -2,8 +2,8 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v2-rwo - namespace: gkm-test-ns-rwo-1 + name: vector-add-cache-rocm-rwo + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/signature-format: cosign-v2 spec: diff --git a/examples/namespace/RWO/12-ds.yaml b/examples/namespace/RWO/ROCM/12-ds.yaml similarity index 85% rename from examples/namespace/RWO/12-ds.yaml rename to examples/namespace/RWO/ROCM/12-ds.yaml index 738c8bd61..f12550f3d 100644 --- a/examples/namespace/RWO/12-ds.yaml +++ b/examples/namespace/RWO/ROCM/12-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-1 - namespace: gkm-test-ns-rwo-1 + name: gkm-test-rocm-rwo-ds-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 template: metadata: labels: - name: gkm-test-ns-rwo-ds-1 + name: gkm-test-rocm-rwo-ds-1 
gkm.io/pvc-mutation: "true" spec: tolerations: @@ -50,4 +50,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/13-ds.yaml b/examples/namespace/RWO/ROCM/13-ds.yaml similarity index 85% rename from examples/namespace/RWO/13-ds.yaml rename to examples/namespace/RWO/ROCM/13-ds.yaml index 937e745e1..bde833fc3 100644 --- a/examples/namespace/RWO/13-ds.yaml +++ b/examples/namespace/RWO/ROCM/13-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-2 - namespace: gkm-test-ns-rwo-1 + name: gkm-test-rocm-rwo-ds-2 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvc-mutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 template: metadata: labels: - name: gkm-test-ns-rwo-ds-2 + name: gkm-test-rocm-rwo-ds-2 gkm.io/pvc-mutation: "true" spec: tolerations: @@ -51,4 +51,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/14-ds.yaml b/examples/namespace/RWO/ROCM/14-ds.yaml similarity index 85% rename from examples/namespace/RWO/14-ds.yaml rename to examples/namespace/RWO/ROCM/14-ds.yaml index c6bf50212..09d1842fd 100644 --- a/examples/namespace/RWO/14-ds.yaml +++ b/examples/namespace/RWO/ROCM/14-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-ds-3 - namespace: gkm-test-ns-rwo-1 + name: gkm-test-rocm-rwo-ds-3 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 template: metadata: labels: - name: gkm-test-ns-rwo-ds-3 + name: gkm-test-rocm-rwo-ds-3 gkm.io/pvc-mutation: "true" spec: tolerations: @@ -51,4 +51,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - 
claimName: vector-add-cache-rocm-v2-rwo + claimName: vector-add-cache-rocm-rwo diff --git a/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml similarity index 82% rename from examples/namespace/RWO/21-gkmcache-cosign-v3.yaml rename to examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml index 9a091eaf3..6bafc7b42 100644 --- a/examples/namespace/RWO/21-gkmcache-cosign-v3.yaml +++ b/examples/namespace/RWO/ROCM/21-gkmcache-cosign-v3.yaml @@ -2,8 +2,8 @@ apiVersion: gkm.io/v1alpha1 kind: GKMCache metadata: - name: vector-add-cache-rocm-v3-rwo - namespace: gkm-test-ns-rwo-1 + name: vector-add-cache-rocm-rwo-v3 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/signature-format: cosign-v3 spec: diff --git a/examples/namespace/RWO/22-ds.yaml b/examples/namespace/RWO/ROCM/22-ds.yaml similarity index 84% rename from examples/namespace/RWO/22-ds.yaml rename to examples/namespace/RWO/ROCM/22-ds.yaml index c682f8a2c..47ef6d515 100644 --- a/examples/namespace/RWO/22-ds.yaml +++ b/examples/namespace/RWO/ROCM/22-ds.yaml @@ -2,18 +2,18 @@ kind: DaemonSet apiVersion: apps/v1 metadata: - name: gkm-test-ns-rwo-v3-ds-1 - namespace: gkm-test-ns-rwo-1 + name: gkm-test-rocm-rwo-v3-ds-1 + namespace: gkm-test-ns-rocm-rwo-1 labels: gkm.io/pvcMutation: "true" spec: selector: matchLabels: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 template: metadata: labels: - name: gkm-test-ns-rwo-v3-ds-1 + name: gkm-test-rocm-rwo-v3-ds-1 gkm.io/pvc-mutation: "true" spec: tolerations: @@ -50,4 +50,4 @@ spec: volumes: - name: kernel-volume persistentVolumeClaim: - claimName: vector-add-cache-rocm-v3-rwo + claimName: vector-add-cache-rocm-rwo-v3 diff --git a/gkm-codespell.precommit-toml b/gkm-codespell.precommit-toml index 76f856152..e1472179e 100644 --- a/gkm-codespell.precommit-toml +++ b/gkm-codespell.precommit-toml @@ -1,3 +1,3 @@ [tool.codespell] -ignore-words-list = "AfterAll,renderD" +ignore-words-list = 
"AfterAll,renderD,aCI" skip = './.*,vendor/*,go.sum' diff --git a/hack/install_deps.sh b/hack/install_deps.sh new file mode 100644 index 000000000..444b88ce6 --- /dev/null +++ b/hack/install_deps.sh @@ -0,0 +1,170 @@ +#!/bin/bash + +set -e + +echo "================================================" +echo "GKM Dependency Installation for RHEL 10" +echo "================================================" +echo "" + +# Minimum required versions +MIN_GO_VERSION="1.25.0" +MIN_PODMAN_VERSION="5.3.1" +MIN_KUBECTL_VERSION="1.11.3" + +# CentOS Stream 10 repository URLs +CENTOS_CRB="https://mirror.stream.centos.org/10-stream/CRB/x86_64/os/" +FEDORA_BASE="https://download.fedoraproject.org/pub/fedora/linux/development/rawhide/Everything/x86_64/os/Packages" + +# Function to compare versions +version_ge() { + # Returns 0 (true) if $1 >= $2 + [ "$(printf '%s\n' "$2" "$1" | sort -V | head -n1)" = "$2" ] +} + +# Function to check if a command exists +command_exists() { + command -v "$1" >/dev/null 2>&1 +} + +echo "=== Step 1: Importing CentOS Stream GPG key ===" +echo "================================================" +sudo rpm --import https://www.centos.org/keys/RPM-GPG-KEY-CentOS-Official-SHA256 2>/dev/null || echo "Key may already be imported" + +echo "" +echo "=== Step 2: Installing system development packages ===" +echo "======================================================" +sudo dnf install -y --repofrompath=centos-crb,${CENTOS_CRB} \ + gpgme-devel libdrm-devel hwloc-devel + +echo "" +echo "=== Step 3: Installing btrfs development headers ===" +echo "=====================================================" +# First install the base libraries with --nodeps to skip filesystem checks +sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/l/libbtrfs-6.19-1.fc45.x86_64.rpm" \ + "${FEDORA_BASE}/l/libbtrfsutil-6.19-1.fc45.x86_64.rpm" 2>/dev/null || echo "Libraries may already be installed" + +# Now install devel package with --nodeps +sudo rpm -ivh --nodeps \ + 
"${FEDORA_BASE}/b/btrfs-progs-6.19-1.fc45.x86_64.rpm" 2>/dev/null || echo "btrfs-progs may already be installed" + +sudo rpm -ivh --nodeps \ + "${FEDORA_BASE}/b/btrfs-progs-devel-6.19-1.fc45.x86_64.rpm" + +echo "" +echo "=== Step 4: Installing Go ${MIN_GO_VERSION}+ ===" +echo "==============================================" +if command_exists go; then + CURRENT_GO_VERSION=$(go version | awk '{print $3}' | sed 's/go//') + echo "Found Go version: ${CURRENT_GO_VERSION}" + if version_ge "${CURRENT_GO_VERSION}" "${MIN_GO_VERSION}"; then + echo "✓ Go ${CURRENT_GO_VERSION} meets minimum requirement (${MIN_GO_VERSION}+)" + else + echo "⚠ Go ${CURRENT_GO_VERSION} is older than required ${MIN_GO_VERSION}" + echo "Installing Go ${MIN_GO_VERSION}..." + GO_VERSION="1.25.0" + GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" + curl -LO "https://go.dev/dl/${GO_TARBALL}" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "${GO_TARBALL}" + rm "${GO_TARBALL}" + echo "✓ Go ${GO_VERSION} installed. Add /usr/local/go/bin to your PATH" + export PATH=$PATH:/usr/local/go/bin + fi +else + echo "Go not found. Installing Go ${MIN_GO_VERSION}..." + GO_VERSION="1.25.0" + GO_TARBALL="go${GO_VERSION}.linux-amd64.tar.gz" + curl -LO "https://go.dev/dl/${GO_TARBALL}" + sudo rm -rf /usr/local/go + sudo tar -C /usr/local -xzf "${GO_TARBALL}" + rm "${GO_TARBALL}" + echo "✓ Go ${GO_VERSION} installed. 
Add /usr/local/go/bin to your PATH" + echo 'export PATH=$PATH:/usr/local/go/bin' >> ~/.bashrc + export PATH=$PATH:/usr/local/go/bin +fi + +echo "" +echo "=== Step 5: Installing Podman ${MIN_PODMAN_VERSION}+ ===" +echo "========================================================" +if command_exists podman; then + CURRENT_PODMAN_VERSION=$(podman version --format '{{.Client.Version}}' 2>/dev/null || podman --version | awk '{print $3}') + echo "Found Podman version: ${CURRENT_PODMAN_VERSION}" + if version_ge "${CURRENT_PODMAN_VERSION}" "${MIN_PODMAN_VERSION}"; then + echo "✓ Podman ${CURRENT_PODMAN_VERSION} meets minimum requirement (${MIN_PODMAN_VERSION}+)" + else + echo "⚠ Podman ${CURRENT_PODMAN_VERSION} is older than required ${MIN_PODMAN_VERSION}" + echo "Upgrading Podman..." + sudo dnf upgrade -y podman + fi +else + echo "Podman not found. Installing..." + sudo dnf install -y podman +fi + +echo "" +echo "=== Step 6: Installing kubectl ${MIN_KUBECTL_VERSION}+ ===" +echo "==========================================================" +if command_exists kubectl; then + CURRENT_KUBECTL_VERSION=$(kubectl version --client --short 2>/dev/null | grep -oP 'v\K[0-9.]+' || kubectl version --client -o json 2>/dev/null | grep -oP '"gitVersion": "v\K[0-9.]+' | head -1) + echo "Found kubectl version: ${CURRENT_KUBECTL_VERSION}" + if version_ge "${CURRENT_KUBECTL_VERSION}" "${MIN_KUBECTL_VERSION}"; then + echo "✓ kubectl ${CURRENT_KUBECTL_VERSION} meets minimum requirement (${MIN_KUBECTL_VERSION}+)" + else + echo "⚠ kubectl ${CURRENT_KUBECTL_VERSION} is older than required ${MIN_KUBECTL_VERSION}" + echo "Installing latest kubectl..." + curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ + fi +else + echo "kubectl not found. Installing..." 
+ curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" + chmod +x kubectl + sudo mv kubectl /usr/local/bin/ +fi + +echo "" +echo "=== Step 7: Verification ===" +echo "============================" +echo "" +echo "System Development Packages:" +ls -la /usr/include/gpgme.h 2>/dev/null && echo " ✓ gpgme-devel" || echo " ✗ gpgme-devel missing" +ls -la /usr/include/xf86drm.h 2>/dev/null && echo " ✓ libdrm-devel" || echo " ✗ libdrm-devel missing" +ls -la /usr/include/hwloc.h 2>/dev/null && echo " ✓ hwloc-devel" || echo " ✗ hwloc-devel missing" +ls -la /usr/include/btrfs/version.h 2>/dev/null && echo " ✓ btrfs/version.h" || echo " ✗ btrfs headers missing" + +echo "" +echo "Build Tools:" +if command_exists go; then + echo " ✓ Go $(go version | awk '{print $3}')" +else + echo " ✗ Go not found in PATH" +fi + +if command_exists podman; then + echo " ✓ Podman $(podman --version | awk '{print $3}')" +else + echo " ✗ Podman not found" +fi + +if command_exists kubectl; then + echo " ✓ kubectl $(kubectl version --client --short 2>/dev/null | grep -oP 'v[0-9.]+' || echo 'version installed')" +else + echo " ✗ kubectl not found in PATH" +fi + +echo "" +echo "pkg-config:" +pkg-config --exists gpgme && echo " ✓ gpgme.pc (version $(pkg-config --modversion gpgme))" || echo " ✗ gpgme.pc missing" + +echo "" +echo "================================================" +echo "Installation Complete!" 
+echo "================================================" +echo "" +echo "If Go or kubectl were newly installed, you may need to:" +echo " - Reload your shell: source ~/.bashrc" +echo " - Or add to your PATH manually:" +echo " export PATH=\$PATH:/usr/local/go/bin" diff --git a/mcv/go.mod b/mcv/go.mod index 5bbba7590..be7bc23e7 100644 --- a/mcv/go.mod +++ b/mcv/go.mod @@ -26,6 +26,8 @@ require ( github.com/Azure/go-ansiterm v0.0.0-20250102033503-faa5f7b0171c // indirect github.com/BurntSushi/toml v1.5.0 // indirect github.com/Microsoft/go-winio v0.6.2 // indirect + github.com/NVIDIA/go-nvlib v0.9.0 // indirect + github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69 // indirect github.com/StackExchange/wmi v1.2.1 // indirect github.com/VividCortex/ewma v1.2.0 // indirect github.com/acarl005/stripansi v0.0.0-20180116102854-5a71ef0e047d // indirect diff --git a/mcv/go.sum b/mcv/go.sum index 2a1f1c71d..308ce4245 100644 --- a/mcv/go.sum +++ b/mcv/go.sum @@ -12,8 +12,12 @@ github.com/Masterminds/semver/v3 v3.4.0 h1:Zog+i5UMtVoCU8oKka5P7i9q9HgrJeGzI9SA1 github.com/Masterminds/semver/v3 v3.4.0/go.mod h1:4V+yj/TJE1HU9XfppCwVMZq3I84lprf4nC11bSS5beM= github.com/Microsoft/go-winio v0.6.2 h1:F2VQgta7ecxGYO8k3ZZz3RS8fVIXVxONVUPlNERoyfY= github.com/Microsoft/go-winio v0.6.2/go.mod h1:yd8OoFMLzJbo9gZq8j5qaps8bJ9aShtEA8Ipt1oGCvU= +github.com/NVIDIA/go-nvlib v0.9.0 h1:GKLIvLJ0uhCtTLLZp2Q8QIDRxOYH45MM4Y5OO3U5Rho= +github.com/NVIDIA/go-nvlib v0.9.0/go.mod h1:7mzx9FSdO9fXWP9NKuZmWkCwhkEcSWQFe2tmFwtLb9c= github.com/NVIDIA/go-nvml v0.13.0-1 h1:OLX8Jq3dONuPOQPC7rndB6+iDmDakw0XTYgzMxObkEw= github.com/NVIDIA/go-nvml v0.13.0-1/go.mod h1:+KNA7c7gIBH7SKSJ1ntlwkfN80zdx8ovl4hrK3LmPt4= +github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69 h1:0Sl/RcyHZvSstVPIbdF0D/sdj8ZJd+xBxkCy5M8/aCI= +github.com/ROCm/amdsmi v0.0.0-20251117222445-a044536b8d69/go.mod h1:c2lzyLAghhTO+y/c3JjKl59JHJliIHwNZOroUfmBQxc= github.com/StackExchange/wmi v1.2.1 h1:VIkavFPXSjcnS+O8yTq7NI32k0R5Aj+v39y29VYDOSA= 
github.com/StackExchange/wmi v1.2.1/go.mod h1:rcmrprowKIVzvc+NUiLncP2uuArMWLCbu9SBzvHz7e8= github.com/VividCortex/ewma v1.2.0 h1:f58SaIzcDXrSy3kWaHNvuJgJ3Nmz59Zji6XoJR/q1ow=