-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathk8s.yaml
More file actions
126 lines (117 loc) · 4.96 KB
/
k8s.yaml
File metadata and controls
126 lines (117 loc) · 4.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: LicenseRef-NvidiaProprietary
# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
# property and proprietary rights in and to this material, related
# documentation and any modifications thereto. Any use, reproduction,
# disclosure or distribution of this material and related documentation
# without an express license agreement from NVIDIA CORPORATION or
# its affiliates is strictly prohibited.
# Kubernetes Cluster Validation Configuration
# This unified config contains all settings for validating a K8s cluster
# K8S_PROVIDER is auto-detected (microk8s vs kubectl)
#
# Usage:
# isvctl test run -f isvctl/configs/tests/k8s.yaml
# isvctl test run -f isvctl/configs/tests/k8s.yaml --phase test # Skip setup/teardown
# isvctl test run -f isvctl/configs/tests/k8s.yaml -- -v -s -k "K8sNodeCountCheck"
version: "1.0"

# =============================================================================
# Lifecycle Commands
# =============================================================================
# Stub scripts that provision / tear down the cluster; the setup stub also
# emits the inventory JSON consumed by the templated values in `tests:` below.
commands:
  kubernetes:
    # Declared phases for this command group. No explicit "test" step exists
    # here — presumably the test phase is driven entirely by the `tests:`
    # section (confirm against the isvctl runner).
    phases:
      - setup
      - test
      - teardown
    steps:
      - name: setup
        phase: setup
        command: "../stubs/k8s/setup.sh"
        timeout: 120  # seconds
      - name: teardown
        phase: teardown
        command: "../stubs/k8s/teardown.sh"
        timeout: 30  # seconds
# =============================================================================
# Test Configuration
# =============================================================================
# All "{{ steps.setup.kubernetes.* }}" values are resolved from the inventory
# JSON emitted by the setup step. Jinja's default(x, true) falls back to the
# default when the value is missing OR falsy (empty string, 0), not only when
# it is undefined.
tests:
  # Cluster identity produced by the setup stub (see commands section).
  cluster_name: "{{steps.setup.cluster_name}}"
  description: "Kubernetes cluster validation"
  platform: kubernetes
  settings:
    show_skipped_tests: false
  validations:
    kubernetes:
      checks:
        # Expected node count; inventory-driven, falls back to 4.
        K8sNodeCountCheck:
          count: "{{ steps.setup.kubernetes.node_count | default(4, true) }}"
        # Empty list = do not pin specific node names.
        K8sExpectedNodesCheck:
          names: []
        K8sNodeReadyCheck:
          require_all_ready: true
        K8sNvidiaSmiCheck:
          runtime_class: "{{ steps.setup.kubernetes.runtime_class | default('nvidia') }}"
        K8sDriverVersionCheck:
          driver_version: "{{ steps.setup.kubernetes.driver_version | default('580.82.07') }}"
          runtime_class: "{{ steps.setup.kubernetes.runtime_class | default('nvidia') }}"
        K8sGpuPodAccessCheck:
          # Note: This check runs nvidia-smi in a pod that requests 1 GPU,
          # so it can only verify 1 GPU is visible per node (K8s isolation).
          # Use gpu_node_count here (not total_gpus) since each pod only sees its allocated GPU.
          gpu_count: 1
          total_gpu_count: "{{ steps.setup.kubernetes.gpu_node_count | default(4, true) }}"
          runtime_class: "{{ steps.setup.kubernetes.runtime_class | default('nvidia') }}"
        K8sGpuCapacityCheck:
          # This queries node capacity directly - shows real GPU count
          resource_name: "{{ steps.setup.kubernetes.gpu_resource_name | default('nvidia.com/gpu') }}"
          expected_per_node: "{{ steps.setup.kubernetes.gpu_per_node | default(4, true) }}"
          expected_total: "{{ steps.setup.kubernetes.total_gpus | default(16, true) }}"
        K8sGpuOperatorNamespaceCheck:
          namespace: "{{ steps.setup.kubernetes.gpu_operator_namespace | default('nvidia-gpu-operator') }}"
        K8sGpuOperatorPodsCheck:
          namespace: "{{ steps.setup.kubernetes.gpu_operator_namespace | default('nvidia-gpu-operator') }}"
        K8sGpuLabelsCheck:
          label_selector: "nvidia.com/gpu.present=true"
        K8sPodHealthCheck:
          # Pending pods are tolerated by this health check;
          # K8sNoPendingPodsCheck below evaluates them separately.
          ignore_phases: ["Pending"]
        # {} = run the check with its built-in defaults.
        K8sNoPendingPodsCheck: {}
        K8sNoErrorPodsCheck:
          error_states: ["Error", "CrashLoopBackOff"]
        K8sMigConfigCheck:
          # MIG not required; when MIG nodes exist these labels must match.
          require_mig: false
          expected_labels:
            "nvidia.com/mig.capable": "true"
            "nvidia.com/mig.strategy": "single"
    k8s_workloads:
      checks:
        K8sNcclWorkload:
          min_bus_bw_gbps: 100  # minimum NCCL bus bandwidth, GB/s
          timeout: 600          # seconds
        K8sNcclMultiNodeWorkload:
          # NOTE(review): gpu_node_count defaults to 2 here but to 4 in
          # K8sGpuPodAccessCheck above — confirm the difference is intentional
          # (multi-node NCCL presumably only needs a minimum of 2 nodes).
          nodes: "{{ steps.setup.kubernetes.gpu_node_count | default(2, true) }}"
          min_bus_bw_gbps: 100
          quick_mode: false
          timeout: 600
          use_compute_domain: auto
        K8sGpuStressWorkload:
          memory_gb: 1
          runtime: 30   # stress duration, seconds
          timeout: 300  # overall timeout, seconds
        K8sNimInferenceWorkload:
          timeout: 1500
        # Small (1B) NIM model: single GPU, lighter genai-perf load.
        K8sNimHelmWorkload-1b:
          model: "meta/llama-3.2-1b-instruct"
          model_tag: "latest"
          gpu_count: 1
          timeout: 900
          genai_perf_requests: 100
          genai_perf_concurrency: 4
        # Larger (3B) NIM model: per-node GPU count, heavier genai-perf load.
        K8sNimHelmWorkload-3b:
          model: "meta/llama-3.2-3b-instruct"
          model_tag: "latest"
          gpu_count: "{{ steps.setup.kubernetes.gpu_per_node | default(4, true) }}"
          timeout: 1800
          genai_perf_requests: 200
          genai_perf_concurrency: 8
  exclude:
    # Empty = no test markers excluded from the run.
    markers: []