Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions docs/contributing/developer-guide.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,74 @@
make -C tests/e2e # run e2e tests only
```

## GPU Operator E2E Tests

The `tests/k8s-e2e/` directory contains an e2e test suite that installs the GPU Operator via Helm and verifies metrics and health. Tests run against a live Kubernetes cluster.

### Prerequisites

- A running Kubernetes cluster with at least one AMD GPU node
- `kubectl` configured (`~/.kube/config` or a custom kubeconfig)
- Docker (to build the test runner image)

### Test runner image

```bash
docker build -t gpu-op-k8s-e2e:latest -f tests/k8s-e2e/Dockerfile.e2e tests/k8s-e2e/
```

### Running tests

#### Full install + verify + teardown

Pass the helm chart as a local directory path (the `helm-charts-k8s/` directory in the repository root) or as an OCI/repo reference if the chart has been published to a registry:

```bash
docker run --rm \
-v /path/to/kubeconfig:/kubeconfig:ro \
-v /path/to/gpu-operator/helm-charts-k8s:/helm-charts:ro \
gpu-op-k8s-e2e:latest \
-kubeconfig /kubeconfig \
-operatorchart /helm-charts \
-operatortag v1.5.0 \
-test.timeout 60m
```

#### Verify only (pre-deployed cluster)

```bash
docker run --rm -v /path/to/kubeconfig:/kubeconfig:ro \
gpu-op-k8s-e2e:latest \
-kubeconfig /kubeconfig -existing \
-check.f 'TestOp010|TestOp020|TestOp030|TestOp040|TestOp050|TestOp060|TestOp065|TestOp070' \
-test.timeout 30m
```
Check failure on line 145 in docs/contributing/developer-guide.md

View workflow job for this annotation

GitHub Actions / Documentation / Markdown

Multiple consecutive blank lines

docs/contributing/developer-guide.md:145 MD012/no-multiple-blanks Multiple consecutive blank lines [Expected: 1; Actual: 2] https://github.com/DavidAnson/markdownlint/blob/v0.32.1/doc/md012.md
#### Using make

```bash
# Full install+verify+teardown
make -C tests/k8s-e2e all KUBECONFIG=/path/to/kubeconfig OPERATOR_TAG=v1.5.0

# Verify only (pre-deployed)
make -C tests/k8s-e2e verify KUBECONFIG=/path/to/kubeconfig
```

### Common flags

| Flag | Default | Description |
| --- | --- | --- |
| `-kubeconfig` | `~/.kube/config` | Path to kubeconfig |
| `-operatorchart` | OCI registry chart | GPU Operator helm chart (OCI ref or local path) |
| `-operatortag` | `v1.4.1` | GPU Operator chart version |
| `-namespace` | `kube-amd-gpu` | Kubernetes namespace |
| `-existing` | `false` | Skip install/teardown — verify only against pre-deployed cluster |
| `-noteardown` | `false` | Skip teardown after tests (leave operator installed) |
| `-helmset` | _(none)_ | Extra helm `--set` override (repeatable) |
| `-check.f` | _(all)_ | Regex filter for test names (gocheck syntax) |
| `-test.timeout` | `30m` | Overall test timeout |

## Creating a Pull Request

1. Fork the repository on GitHub.
Expand Down
1 change: 1 addition & 0 deletions tests/k8s-e2e/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
vendor/
41 changes: 41 additions & 0 deletions tests/k8s-e2e/Dockerfile.e2e
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
# Dockerfile.e2e — containerized runner for GPU Operator e2e tests
#
# Build (from gpu-operator repo root):
#   docker build -t gpu-op-k8s-e2e:latest -f tests/k8s-e2e/Dockerfile.e2e tests/k8s-e2e/
#
# Run full install+verify+teardown:
#   docker run --rm \
#     -v /path/to/kubeconfig:/kubeconfig:ro \
#     -v /path/to/gpu-operator/helm-charts-k8s:/helm-charts:ro \
#     gpu-op-k8s-e2e:latest \
#     -kubeconfig /kubeconfig \
#     -operatorchart /helm-charts \
#     -operatortag v1.4.1 -test.timeout 60m
#
# Run verify only (pre-deployed cluster):
#   docker run --rm -v /path/to/kubeconfig:/kubeconfig:ro \
#     gpu-op-k8s-e2e:latest \
#     -kubeconfig /kubeconfig -existing \
#     -check.f 'TestOp010|TestOp020|TestOp030|TestOp040|TestOp050|TestOp060|TestOp065|TestOp070' \
#     -test.timeout 30m

# Tests are compiled and executed with the Go toolchain inside the container,
# so a full golang base image (not a minimal runtime image) is required.
FROM golang:1.25-bookworm

# Install kubectl (latest stable release for linux/amd64)
RUN curl -fsSL "https://dl.k8s.io/release/$(curl -Ls https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" \
    -o /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl

WORKDIR /src

# Copy module files first for caching, then download deps
COPY go.mod go.sum ./
RUN go mod download

# Copy test sources
COPY clients/ clients/
COPY doc.go suite_test.go operator_test.go ./

# Pre-compile tests to catch errors in _test.go files at image build time
# (-run=^$ matches no test names, so nothing is executed)
RUN go test -run=^$ ./...

# Arguments given to `docker run` are appended after these defaults; a
# repeated -test.timeout should take precedence (Go flags: last value wins)
# — NOTE(review): confirm against the test binary's flag handling.
ENTRYPOINT ["go", "test", "-v", "-test.timeout=30m"]
33 changes: 33 additions & 0 deletions tests/k8s-e2e/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# Makefile for the GPU Operator k8s e2e test suite.
#
# Targets:
#   all    - full install + verify + teardown
#   verify - verify a PRE-DEPLOYED GPU Operator cluster (no install/teardown)
#   lint   - go fmt + go vet

# NOTE: `.DEFAULT: all` does not set the default goal (the .DEFAULT special
# target defines a catch-all recipe for unknown targets); .DEFAULT_GOAL does.
.DEFAULT_GOAL := all
.PHONY: all verify lint

TEST_ARGS :=
KUBECONFIG :=
OPERATOR_CHART ?= oci://registry-1.docker.io/rocm/gpu-operator-charts
OPERATOR_TAG ?= v1.4.1
OPERATOR_NS ?= kube-amd-gpu

# Only pass -kubeconfig when the caller supplied one on the make command line.
ifdef KUBECONFIG
TEST_ARGS += -kubeconfig=$(KUBECONFIG)
endif

# all: full install + verify + teardown (TestOp000–TestOp900)
all:
	go test -failfast \
		-operatorchart $(OPERATOR_CHART) \
		-operatortag $(OPERATOR_TAG) \
		-namespace $(OPERATOR_NS) \
		-test.timeout=60m \
		-v $(TEST_ARGS)

# verify: verify DME on a PRE-DEPLOYED GPU Operator cluster (no install/teardown)
verify:
	go test -failfast -existing \
		-namespace $(OPERATOR_NS) \
		-check.f 'TestOp010|TestOp020|TestOp030|TestOp040|TestOp050|TestOp060|TestOp065|TestOp070' \
		-test.timeout=30m \
		-v $(TEST_ARGS)

lint:
	@go fmt ./...
	@go vet ./...
197 changes: 197 additions & 0 deletions tests/k8s-e2e/clients/helm.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,197 @@
/**
# Copyright (c) Advanced Micro Devices, Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
**/

package clients

import (
	"context"
	"fmt"
	"log"
	"os"
	"path/filepath"
	"time"

	helm "github.com/mittwald/go-helm-client"
	helmValues "github.com/mittwald/go-helm-client/values"
	"helm.sh/helm/v3/pkg/repo"
	restclient "k8s.io/client-go/rest"
)

// HelmClientOpt mutates a HelmClient during construction; see NewHelmClient.
type HelmClientOpt func(client *HelmClient)

// HelmClient wraps a go-helm-client instance together with the temporary
// on-disk helm state it needs and bookkeeping for the installed release.
type HelmClient struct {
	client     helm.Client        // underlying helm client
	cache      string             // temp dir used as the helm repository cache
	config     string             // temp dir holding repositories.yaml
	ns         string             // target Kubernetes namespace
	restConfig *restclient.Config // cluster connection used to build the client
	relName    string             // release installed via this client ("" if none)
}

// WithNameSpaceOption returns an option that sets the Kubernetes namespace
// the client operates in.
func WithNameSpaceOption(namespace string) HelmClientOpt {
	return func(hc *HelmClient) { hc.ns = namespace }
}

// WithKubeConfigOption returns an option that sets the REST config used to
// reach the cluster.
func WithKubeConfigOption(kubeconf *restclient.Config) HelmClientOpt {
	return func(hc *HelmClient) { hc.restConfig = kubeconf }
}

// NewHelmClient builds a HelmClient configured by the given functional
// options (namespace, REST config). It creates temporary helm cache and
// repository-config directories which the caller must release with Cleanup.
// On any error the directories created so far are removed, so nothing leaks.
func NewHelmClient(opts ...HelmClientOpt) (*HelmClient, error) {
	client := &HelmClient{}
	for _, opt := range opts {
		opt(client)
	}

	var err error
	client.cache, err = os.MkdirTemp("", ".hcache")
	if err != nil {
		return nil, err
	}

	configDir, err := os.MkdirTemp("", ".hconfig")
	if err != nil {
		// Don't leak the cache dir created above.
		os.RemoveAll(client.cache)
		return nil, err
	}
	// RepositoryConfig must be a file path (repositories.yaml), not a directory.
	client.config = configDir
	repoFile := filepath.Join(configDir, "repositories.yaml")
	restConfOptions := &helm.RestConfClientOptions{
		Options: &helm.Options{
			Namespace:        client.ns,
			RepositoryConfig: repoFile,
			Debug:            true,
			RepositoryCache:  client.cache,
			DebugLog: func(format string, v ...interface{}) {
				log.Printf(format, v...)
			},
		},
		RestConfig: client.restConfig,
	}

	helmClient, err := helm.NewClientFromRestConf(restConfOptions)
	if err != nil {
		// Remove both temp dirs; the client was never usable.
		client.Cleanup()
		return nil, err
	}
	client.client = helmClient
	return client, nil
}

// InstallChart installs the given chart under the fixed release name
// "e2e-test-k8s" in the client's namespace, waiting up to five minutes for
// the release to become ready. params are helm --set style value overrides.
// Returns the helm release name.
func (h *HelmClient) InstallChart(ctx context.Context, chart string, params []string) (string, error) {
	values := helmValues.Options{
		Values: params,
	}

	chartSpec := &helm.ChartSpec{
		ReleaseName:   "e2e-test-k8s",
		ChartName:     chart,
		Namespace:     h.ns,
		GenerateName:  false,
		Wait:          true,
		Timeout:       5 * time.Minute,
		CleanupOnFail: false,
		DryRun:        false,
		ValuesOptions: values,
	}

	resp, err := h.client.InstallChart(ctx, chartSpec, nil)
	if err != nil {
		return "", err
	}
	log.Printf("helm chart install resp: %+v", resp)
	h.relName = resp.Name
	// err is necessarily nil here; return an explicit nil for clarity and
	// consistency with InstallChartWithTimeout.
	return resp.Name, nil
}

// UninstallChart removes the release previously installed through this
// client; it fails if no install has been performed yet.
func (h *HelmClient) UninstallChart() error {
	if h.relName != "" {
		return h.client.UninstallReleaseByName(h.relName)
	}
	return fmt.Errorf("helm chart is not installed by client")
}

// AddRepository registers (or updates) a helm chart repository: url is the
// chart repo URL and name is the local alias it is stored under.
func (h *HelmClient) AddRepository(name, url string) error {
	entry := repo.Entry{
		Name: name,
		URL:  url,
	}
	return h.client.AddOrUpdateChartRepo(entry)
}

// InstallChartWithTimeout installs chart as release releaseName with a
// caller-chosen timeout and optional chart version (empty version selects the
// latest available). params are helm --set style overrides. The installed
// release name is recorded on the client and returned.
func (h *HelmClient) InstallChartWithTimeout(ctx context.Context, releaseName, chart, version string, params []string, timeout time.Duration) (string, error) {
	spec := helm.ChartSpec{
		ReleaseName:   releaseName,
		ChartName:     chart,
		Version:       version,
		Namespace:     h.ns,
		GenerateName:  false,
		Wait:          false, // individual Op010-Op070 tests verify each component's readiness
		Timeout:       timeout,
		CleanupOnFail: false,
		DryRun:        false,
		SkipCRDs:      false,
		ValuesOptions: helmValues.Options{Values: params},
	}

	release, err := h.client.InstallChart(ctx, &spec, nil)
	if err != nil {
		return "", err
	}
	log.Printf("helm chart install resp: %+v", release)
	h.relName = release.Name
	return release.Name, nil
}

// UninstallChartByName uninstalls the named helm release; unlike
// UninstallChart, the release need not have been installed by this client.
func (h *HelmClient) UninstallChartByName(releaseName string) error {
	err := h.client.UninstallReleaseByName(releaseName)
	return err
}

// UninstallAllReleases uninstalls every deployed helm release in the client's
// namespace. Individual failures are logged and skipped so that cleanup
// always proceeds through the full list.
func (h *HelmClient) UninstallAllReleases() {
	deployed, listErr := h.client.ListDeployedReleases()
	if listErr != nil {
		log.Printf("UninstallAllReleases: list: %v", listErr)
		return
	}
	for _, release := range deployed {
		log.Printf("UninstallAllReleases: uninstalling %s", release.Name)
		uninstallErr := h.client.UninstallReleaseByName(release.Name)
		if uninstallErr != nil {
			log.Printf("UninstallAllReleases: %s: %v", release.Name, uninstallErr)
		}
	}
}

// Cleanup deletes the temporary helm cache and config directories created by
// NewHelmClient. Removal failures are only logged.
func (h *HelmClient) Cleanup() {
	for _, dir := range []string{h.cache, h.config} {
		if rmErr := os.RemoveAll(dir); rmErr != nil {
			log.Printf("failed to delete directory %s; err: %v", dir, rmErr)
		}
	}
}
Loading
Loading