Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions docs/contributing/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,74 @@
make -C tests/e2e # run e2e tests only
```

## GPU Operator E2E Tests

The `tests/k8s-e2e/` directory contains an e2e test suite that installs the GPU Operator via Helm and verifies metrics and health. Tests run against a live Kubernetes cluster.

### Prerequisites

- A running Kubernetes cluster with at least one AMD GPU node
- `kubectl` configured (`~/.kube/config` or a custom kubeconfig)
- Docker (to build the test runner image)

### Test runner image

```bash
docker build -t gpu-op-k8s-e2e:latest -f tests/k8s-e2e/Dockerfile.e2e tests/k8s-e2e/
```

### Running tests

#### Full install + verify + teardown

Pass the helm chart as a local directory path (the `helm-charts-k8s/` directory in the repository root) or as an OCI/repo reference if the chart has been published to a registry:

```bash
docker run --rm \
-v /path/to/kubeconfig:/kubeconfig:ro \
-v /path/to/gpu-operator/helm-charts-k8s:/helm-charts:ro \
gpu-op-k8s-e2e:latest \
-kubeconfig /kubeconfig \
-operatorchart /helm-charts \
-operatortag v1.5.0 \
-test.timeout 60m
```

#### Verify only (pre-deployed cluster)

```bash
docker run --rm -v /path/to/kubeconfig:/kubeconfig:ro \
gpu-op-k8s-e2e:latest \
-kubeconfig /kubeconfig -existing \
-check.f 'TestOp010|TestOp020|TestOp030|TestOp040|TestOp050|TestOp060|TestOp065|TestOp070' \
-test.timeout 30m
```
Check failure on line 145 in docs/contributing/developer-guide.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Multiple consecutive blank lines

docs/contributing/developer-guide.md:145 MD012/no-multiple-blanks Multiple consecutive blank lines [Expected: 1; Actual: 2] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md012.md
#### Using make

```bash
# Full install+verify+teardown
make -C tests/k8s-e2e all KUBECONFIG=/path/to/kubeconfig OPERATOR_TAG=v1.5.0

# Verify only (pre-deployed)
make -C tests/k8s-e2e verify KUBECONFIG=/path/to/kubeconfig
```

### Common flags

| Flag | Default | Description |
| --- | --- | --- |
| `-kubeconfig` | `~/.kube/config` | Path to kubeconfig |
| `-operatorchart` | OCI registry chart | GPU Operator helm chart (OCI ref or local path) |
| `-operatortag` | `v1.4.1` | GPU Operator chart version |
| `-namespace` | `kube-amd-gpu` | Kubernetes namespace |
| `-existing` | `false` | Skip install/teardown — verify only against pre-deployed cluster |
| `-noteardown` | `false` | Skip teardown after tests (leave operator installed) |
| `-helmset` | _(none)_ | Extra helm `--set` override (repeatable) |
| `-check.f` | _(all)_ | Regex filter for test names (gocheck syntax) |
| `-test.timeout` | `30m` | Overall test timeout |

## Creating a Pull Request

1. Fork the repository on GitHub.
Expand Down
1 change: 1 addition & 0 deletions tests/k8s-e2e/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
vendor/
41 changes: 41 additions & 0 deletions tests/k8s-e2e/Dockerfile.e2e
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Dockerfile.e2e — containerized runner for GPU Operator e2e tests
#
# Build (from gpu-operator repo root):
#   docker build -t gpu-op-k8s-e2e:latest -f tests/k8s-e2e/Dockerfile.e2e tests/k8s-e2e/
#
# Run full install+verify+teardown:
#   docker run --rm \
#     -v /path/to/kubeconfig:/kubeconfig:ro \
#     -v /path/to/gpu-operator/helm-charts-k8s:/helm-charts:ro \
#     gpu-op-k8s-e2e:latest \
#     -kubeconfig /kubeconfig \
#     -operatorchart /helm-charts \
#     -operatortag v1.4.1 -test.timeout 60m
#
# Run verify only (pre-deployed cluster):
#   docker run --rm -v /path/to/kubeconfig:/kubeconfig:ro \
#     gpu-op-k8s-e2e:latest \
#     -kubeconfig /kubeconfig -existing \
#     -check.f 'TestOp010|TestOp020|TestOp030|TestOp040|TestOp050|TestOp060|TestOp065|TestOp070' \
#     -test.timeout 30m

# Tests are compiled and executed with the Go toolchain inside the container,
# so a full golang base image (not a minimal runtime image) is required.
FROM golang:1.25-bookworm

# Install kubectl (latest stable release for linux/amd64)
RUN curl -fsSL "https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
    -o /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl

WORKDIR /src

# Copy module files first for caching, then download deps
COPY go.mod go.sum ./
RUN go mod download

# Copy test sources
COPY clients/ clients/
COPY doc.go suite_test.go operator_test.go ./

# Pre-compile tests to catch errors in _test.go files at image build time
# (-run=^$ matches no test names, so nothing is executed)
RUN go test -run=^$ ./...

# Arguments given to `docker run` are appended after these defaults; a
# repeated -test.timeout should take precedence (Go flags: last value wins)
# — NOTE(review): confirm against the test binary's flag handling.
ENTRYPOINT ["go", "test", "-v", "-test.timeout=30m"]
33 changes: 33 additions & 0 deletions tests/k8s-e2e/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Makefile for the GPU Operator k8s e2e test suite.
#
# Targets:
#   all    - full install + verify + teardown
#   verify - verify a PRE-DEPLOYED GPU Operator cluster (no install/teardown)
#   lint   - go fmt + go vet

# NOTE: `.DEFAULT: all` does not set the default goal (the .DEFAULT special
# target defines a catch-all recipe for unknown targets); .DEFAULT_GOAL does.
.DEFAULT_GOAL := all
.PHONY: all verify lint

TEST_ARGS :=
KUBECONFIG :=
OPERATOR_CHART ?= oci://registry-1.docker.io/rocm/gpu-operator-charts
OPERATOR_TAG ?= v1.4.1
OPERATOR_NS ?= kube-amd-gpu

# Only pass -kubeconfig when the caller supplied one on the make command line.
ifdef KUBECONFIG
TEST_ARGS += -kubeconfig=$(KUBECONFIG)
endif

# all: full install + verify + teardown (TestOp000–TestOp900)
all:
	go test -failfast \
		-operatorchart $(OPERATOR_CHART) \
		-operatortag $(OPERATOR_TAG) \
		-namespace $(OPERATOR_NS) \
		-test.timeout=60m \
		-v $(TEST_ARGS)

# verify: verify DME on a PRE-DEPLOYED GPU Operator cluster (no install/teardown)
verify:
	go test -failfast -existing \
		-namespace $(OPERATOR_NS) \
		-check.f 'TestOp010|TestOp020|TestOp030|TestOp040|TestOp050|TestOp060|TestOp065|TestOp070' \
		-test.timeout=30m \
		-v $(TEST_ARGS)

lint:
	@go fmt ./...
	@go vet ./...
197 changes: 197 additions & 0 deletions tests/k8s-e2e/clients/helm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
/**
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package clients

import (
	"context"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"time"

	helm "github.com/mittwald/go-helm-client"
	helmValues "github.com/mittwald/go-helm-client/values"
	"helm.sh/helm/v3/pkg/repo"
	restclient "k8s.io/client-go/rest"
)

// HelmClientOpt mutates a HelmClient during construction; see NewHelmClient.
type HelmClientOpt func(client *HelmClient)

// HelmClient wraps a go-helm-client instance together with the temporary
// on-disk helm state it needs and bookkeeping for the installed release.
type HelmClient struct {
	client     helm.Client        // underlying helm client
	cache      string             // temp dir used as the helm repository cache
	config     string             // temp dir holding repositories.yaml
	ns         string             // target Kubernetes namespace
	restConfig *restclient.Config // cluster connection used to build the client
	relName    string             // release installed via this client ("" if none)
}

// WithNameSpaceOption returns an option that sets the Kubernetes namespace
// the client operates in.
func WithNameSpaceOption(namespace string) HelmClientOpt {
	return func(hc *HelmClient) { hc.ns = namespace }
}

// WithKubeConfigOption returns an option that sets the REST config used to
// reach the cluster.
func WithKubeConfigOption(kubeconf *restclient.Config) HelmClientOpt {
	return func(hc *HelmClient) { hc.restConfig = kubeconf }
}

// NewHelmClient builds a HelmClient configured by the given functional
// options (namespace, REST config). It creates temporary helm cache and
// repository-config directories which the caller must release with Cleanup.
// On any error the directories created so far are removed, so nothing leaks.
func NewHelmClient(opts ...HelmClientOpt) (*HelmClient, error) {
	client := &HelmClient{}
	for _, opt := range opts {
		opt(client)
	}

	var err error
	client.cache, err = os.MkdirTemp("", ".hcache")
	if err != nil {
		return nil, err
	}

	configDir, err := os.MkdirTemp("", ".hconfig")
	if err != nil {
		// Don't leak the cache dir created above.
		os.RemoveAll(client.cache)
		return nil, err
	}
	// RepositoryConfig must be a file path (repositories.yaml), not a directory.
	client.config = configDir
	repoFile := filepath.Join(configDir, "repositories.yaml")
	restConfOptions := &helm.RestConfClientOptions{
		Options: &helm.Options{
			Namespace:        client.ns,
			RepositoryConfig: repoFile,
			Debug:            true,
			RepositoryCache:  client.cache,
			DebugLog: func(format string, v ...interface{}) {
				log.Printf(format, v...)
			},
		},
		RestConfig: client.restConfig,
	}

	helmClient, err := helm.NewClientFromRestConf(restConfOptions)
	if err != nil {
		// Remove both temp dirs; the client was never usable.
		client.Cleanup()
		return nil, err
	}
	client.client = helmClient
	return client, nil
}

// InstallChart installs the given chart under the fixed release name
// "e2e-test-k8s" in the client's namespace, waiting up to five minutes for
// the release to become ready. params are helm --set style value overrides.
// Returns the helm release name.
func (h *HelmClient) InstallChart(ctx context.Context, chart string, params []string) (string, error) {
	values := helmValues.Options{
		Values: params,
	}

	chartSpec := &helm.ChartSpec{
		ReleaseName:   "e2e-test-k8s",
		ChartName:     chart,
		Namespace:     h.ns,
		GenerateName:  false,
		Wait:          true,
		Timeout:       5 * time.Minute,
		CleanupOnFail: false,
		DryRun:        false,
		ValuesOptions: values,
	}

	resp, err := h.client.InstallChart(ctx, chartSpec, nil)
	if err != nil {
		return "", err
	}
	log.Printf("helm chart install resp: %+v", resp)
	h.relName = resp.Name
	// err is necessarily nil here; return an explicit nil for clarity and
	// consistency with InstallChartWithTimeout.
	return resp.Name, nil
}

// UninstallChart removes the release previously installed through this
// client; it fails if no install has been performed yet.
func (h *HelmClient) UninstallChart() error {
	if h.relName != "" {
		return h.client.UninstallReleaseByName(h.relName)
	}
	return fmt.Errorf("helm chart is not installed by client")
}

// AddRepository registers (or updates) a helm chart repository: url is the
// chart repo URL and name is the local alias it is stored under.
func (h *HelmClient) AddRepository(name, url string) error {
	entry := repo.Entry{
		Name: name,
		URL:  url,
	}
	return h.client.AddOrUpdateChartRepo(entry)
}

// InstallChartWithTimeout installs chart as release releaseName with a
// caller-chosen timeout and optional chart version (empty version selects the
// latest available). params are helm --set style overrides. The installed
// release name is recorded on the client and returned.
func (h *HelmClient) InstallChartWithTimeout(ctx context.Context, releaseName, chart, version string, params []string, timeout time.Duration) (string, error) {
	spec := helm.ChartSpec{
		ReleaseName:   releaseName,
		ChartName:     chart,
		Version:       version,
		Namespace:     h.ns,
		GenerateName:  false,
		Wait:          false, // individual Op010-Op070 tests verify each component's readiness
		Timeout:       timeout,
		CleanupOnFail: false,
		DryRun:        false,
		SkipCRDs:      false,
		ValuesOptions: helmValues.Options{Values: params},
	}

	release, err := h.client.InstallChart(ctx, &spec, nil)
	if err != nil {
		return "", err
	}
	log.Printf("helm chart install resp: %+v", release)
	h.relName = release.Name
	return release.Name, nil
}

// UninstallChartByName uninstalls the named helm release; unlike
// UninstallChart, the release need not have been installed by this client.
func (h *HelmClient) UninstallChartByName(releaseName string) error {
	err := h.client.UninstallReleaseByName(releaseName)
	return err
}

// UninstallAllReleases uninstalls every deployed helm release in the client's
// namespace. Individual failures are logged and skipped so that cleanup
// always proceeds through the full list.
func (h *HelmClient) UninstallAllReleases() {
	deployed, listErr := h.client.ListDeployedReleases()
	if listErr != nil {
		log.Printf("UninstallAllReleases: list: %v", listErr)
		return
	}
	for _, release := range deployed {
		log.Printf("UninstallAllReleases: uninstalling %s", release.Name)
		uninstallErr := h.client.UninstallReleaseByName(release.Name)
		if uninstallErr != nil {
			log.Printf("UninstallAllReleases: %s: %v", release.Name, uninstallErr)
		}
	}
}

// Cleanup deletes the temporary helm cache and config directories created by
// NewHelmClient. Removal failures are only logged.
func (h *HelmClient) Cleanup() {
	for _, dir := range []string{h.cache, h.config} {
		if rmErr := os.RemoveAll(dir); rmErr != nil {
			log.Printf("failed to delete directory %s; err: %v", dir, rmErr)
		}
	}
}
Loading
Loading