-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathk8s.yaml
More file actions
126 lines (117 loc) · 4.96 KB
/
k8s.yaml
File metadata and controls
126 lines (117 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
# Kubernetes Cluster Validation Configuration
# This unified config contains all settings for validating a K8s cluster
# K8S_PROVIDER is auto-detected (microk8s vs kubectl)
#
# Usage:
# isvctl test run -f isvctl/configs/tests/k8s.yaml
# isvctl test run -f isvctl/configs/tests/k8s.yaml --phase test # Skip setup/teardown
# isvctl test run -f isvctl/configs/tests/k8s.yaml -- -v -s -k "K8sNodeCountCheck"
version: "1.0"

# =============================================================================
# Lifecycle Commands
# =============================================================================
# Stub scripts that provision / tear down the cluster; the setup stub also
# emits the inventory JSON consumed by the templated values in `tests:` below.
commands:
  kubernetes:
    # Declared phases for this command group. No explicit "test" step exists
    # here — presumably the test phase is driven entirely by the `tests:`
    # section (confirm against the isvctl runner).
    phases:
      - setup
      - test
      - teardown
    steps:
      - name: setup
        phase: setup
        command: "../stubs/k8s/setup.sh"
        timeout: 120  # seconds
      - name: teardown
        phase: teardown
        command: "../stubs/k8s/teardown.sh"
        timeout: 30  # seconds
# =============================================================================
# Test Configuration
# =============================================================================
# All "{{ steps.setup.kubernetes.* }}" values are resolved from the inventory
# JSON emitted by the setup step. Jinja's default(x, true) falls back to the
# default when the value is missing OR falsy (empty string, 0), not only when
# it is undefined.
tests:
  # Cluster identity produced by the setup stub (see commands section).
  cluster_name: "{{steps.setup.cluster_name}}"
  description: "Kubernetes cluster validation"
  platform: kubernetes
  settings:
    show_skipped_tests: false
  validations:
    kubernetes:
      checks:
        # Expected node count; inventory-driven, falls back to 4.
        K8sNodeCountCheck:
          count: "{{ steps.setup.kubernetes.node_count | default(4, true) }}"
        # Empty list = do not pin specific node names.
        K8sExpectedNodesCheck:
          names: []
        K8sNodeReadyCheck:
          require_all_ready: true
        K8sNvidiaSmiCheck:
          runtime_class: "{{ steps.setup.kubernetes.runtime_class | default('nvidia') }}"
        K8sDriverVersionCheck:
          driver_version: "{{ steps.setup.kubernetes.driver_version | default('580.82.07') }}"
          runtime_class: "{{ steps.setup.kubernetes.runtime_class | default('nvidia') }}"
        K8sGpuPodAccessCheck:
          # Note: This check runs nvidia-smi in a pod that requests 1 GPU,
          # so it can only verify 1 GPU is visible per node (K8s isolation).
          # Use gpu_node_count here (not total_gpus) since each pod only sees its allocated GPU.
          gpu_count: 1
          total_gpu_count: "{{ steps.setup.kubernetes.gpu_node_count | default(4, true) }}"
          runtime_class: "{{ steps.setup.kubernetes.runtime_class | default('nvidia') }}"
        K8sGpuCapacityCheck:
          # This queries node capacity directly - shows real GPU count
          resource_name: "{{ steps.setup.kubernetes.gpu_resource_name | default('nvidia.com/gpu') }}"
          expected_per_node: "{{ steps.setup.kubernetes.gpu_per_node | default(4, true) }}"
          expected_total: "{{ steps.setup.kubernetes.total_gpus | default(16, true) }}"
        K8sGpuOperatorNamespaceCheck:
          namespace: "{{ steps.setup.kubernetes.gpu_operator_namespace | default('nvidia-gpu-operator') }}"
        K8sGpuOperatorPodsCheck:
          namespace: "{{ steps.setup.kubernetes.gpu_operator_namespace | default('nvidia-gpu-operator') }}"
        K8sGpuLabelsCheck:
          label_selector: "nvidia.com/gpu.present=true"
        K8sPodHealthCheck:
          # Pending pods are tolerated by this health check;
          # K8sNoPendingPodsCheck below evaluates them separately.
          ignore_phases: ["Pending"]
        # {} = run the check with its built-in defaults.
        K8sNoPendingPodsCheck: {}
        K8sNoErrorPodsCheck:
          error_states: ["Error", "CrashLoopBackOff"]
        K8sMigConfigCheck:
          # MIG not required; when MIG nodes exist these labels must match.
          require_mig: false
          expected_labels:
            "nvidia.com/mig.capable": "true"
            "nvidia.com/mig.strategy": "single"
    k8s_workloads:
      checks:
        K8sNcclWorkload:
          min_bus_bw_gbps: 100  # minimum NCCL bus bandwidth, GB/s
          timeout: 600          # seconds
        K8sNcclMultiNodeWorkload:
          # NOTE(review): gpu_node_count defaults to 2 here but to 4 in
          # K8sGpuPodAccessCheck above — confirm the difference is intentional
          # (multi-node NCCL presumably only needs a minimum of 2 nodes).
          nodes: "{{ steps.setup.kubernetes.gpu_node_count | default(2, true) }}"
          min_bus_bw_gbps: 100
          quick_mode: false
          timeout: 600
          use_compute_domain: auto
        K8sGpuStressWorkload:
          memory_gb: 1
          runtime: 30   # stress duration, seconds
          timeout: 300  # overall timeout, seconds
        K8sNimInferenceWorkload:
          timeout: 1500
        # Small (1B) NIM model: single GPU, lighter genai-perf load.
        K8sNimHelmWorkload-1b:
          model: "meta/llama-3.2-1b-instruct"
          model_tag: "latest"
          gpu_count: 1
          timeout: 900
          genai_perf_requests: 100
          genai_perf_concurrency: 4
        # Larger (3B) NIM model: per-node GPU count, heavier genai-perf load.
        K8sNimHelmWorkload-3b:
          model: "meta/llama-3.2-3b-instruct"
          model_tag: "latest"
          gpu_count: "{{ steps.setup.kubernetes.gpu_per_node | default(4, true) }}"
          timeout: 1800
          genai_perf_requests: 200
          genai_perf_concurrency: 8
  exclude:
    # Empty = no test markers excluded from the run.
    markers: []