
Commit 7cddcb4

cuda.core.system: Better checks for when we expect APIs to be unsupported

1 parent ce333b6

File tree: 8 files changed, +331 -227 lines changed

cuda_bindings/tests/nvml/conftest.py
Lines changed: 37 additions & 0 deletions

@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE

 from collections import namedtuple
+from contextlib import contextmanager

 import pytest
 from cuda.bindings import _nvml as nvml
@@ -128,3 +129,39 @@ def pci_info(ngpus, handles):
     pci_info = [nvml.device_get_pci_info_v3(handles[i]) for i in range(ngpus)]
     assert len(pci_info) == ngpus
     return pci_info
+
+
+@contextmanager
+def unsupported_before(device: int, expected_device_arch: nvml.DeviceArch | str | None):
+    device_arch = nvml.device_get_architecture(device)
+
+    if isinstance(expected_device_arch, nvml.DeviceArch):
+        expected_device_arch_int = int(expected_device_arch)
+    elif expected_device_arch == "FERMI":
+        expected_device_arch_int = 1
+    else:
+        expected_device_arch_int = 0
+
+    if expected_device_arch is None or expected_device_arch == "HAS_INFOROM" or device_arch == nvml.DeviceArch.UNKNOWN:
+        # In this case, we don't /know/ if it will fail, but we are ok if it
+        # does or does not.
+
+        # TODO: There are APIs that are documented as supported only if the
+        # device has an InfoROM, but I couldn't find a way to detect that. For
+        # now, they are just handled as "possibly failing".
+
+        try:
+            yield
+        except nvml.NotSupportedError:
+            pytest.skip(
+                f"Unsupported call for device architecture {nvml.DeviceArch(device_arch).name} "
+                f"on device '{nvml.device_get_name(device)}'"
+            )
+    elif int(device_arch) < expected_device_arch_int:
+        # In this case, we /know/ it will fail, and we want to assert that it does.
+        with pytest.raises(nvml.NotSupportedError):
+            yield
+        pytest.skip(f"Unsupported before {expected_device_arch.name}, got {nvml.device_get_name(device)}")
+    else:
+        # In this case, we /know/ it should work, and if it fails, the test should fail.
+        yield
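
For orientation, here is a minimal sketch of how a test might call the new context manager. The test name and the choice of device_get_power_usage as the wrapped call are illustrative assumptions; the all_devices fixture, nvml.DeviceArch, and the NVML bindings are the ones already used in these tests.

# Illustrative sketch only; the test name and wrapped API call are assumptions.
from cuda.bindings import _nvml as nvml

from .conftest import unsupported_before


def test_example_usage(all_devices):
    for device in all_devices:
        # With an expected minimum architecture (assumed VOLTA here):
        #  - on an older device, NotSupportedError is required and the test is skipped;
        #  - on a device of unknown architecture, NotSupportedError causes a skip;
        #  - otherwise NotSupportedError propagates and fails the test.
        with unsupported_before(device, nvml.DeviceArch.VOLTA):
            power_mwatts = nvml.device_get_power_usage(device)
        assert power_mwatts >= 0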

cuda_bindings/tests/nvml/test_compute_mode.py
Lines changed: 3 additions & 8 deletions

@@ -7,6 +7,8 @@
 import pytest
 from cuda.bindings import _nvml as nvml

+from .conftest import unsupported_before
+
 COMPUTE_MODES = [
     nvml.ComputeMode.COMPUTEMODE_DEFAULT,
     nvml.ComputeMode.COMPUTEMODE_PROHIBITED,
@@ -16,18 +18,11 @@

 @pytest.mark.skipif(sys.platform == "win32", reason="Test not supported on Windows")
 def test_compute_mode_supported_nonroot(all_devices):
-    skip_reasons = set()
     for device in all_devices:
-        try:
+        with unsupported_before(device, None):
             original_compute_mode = nvml.device_get_compute_mode(device)
-        except nvml.NotSupportedError:
-            skip_reasons.add(f"nvmlDeviceGetComputeMode not supported for device {device}")
-            continue

         for cm in COMPUTE_MODES:
             with pytest.raises(nvml.NoPermissionError):
                 nvml.device_set_compute_mode(device, cm)
             assert original_compute_mode == nvml.device_get_compute_mode(device), "Compute mode shouldn't have changed"
-
-    if skip_reasons:
-        pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_gpu.py
Lines changed: 5 additions & 13 deletions

@@ -5,6 +5,7 @@
 from cuda.bindings import _nvml as nvml

 from . import util
+from .conftest import unsupported_before


 def test_gpu_get_module_id(nvml_init):
@@ -23,23 +24,14 @@ def test_gpu_get_module_id(nvml_init):


 def test_gpu_get_platform_info(all_devices):
-    skip_reasons = set()
     for device in all_devices:
         if util.is_vgpu(device):
-            skip_reasons.add(f"Not supported on vGPU device {device}")
-            continue
+            pytest.skip(f"Not supported on vGPU device {device}")

-        # TODO
-        # if device.feature_dict.board.chip < board_class.Architecture.Blackwell:
-        #     test_utils.skip_test("Not supported on chip before Blackwell")
+        # Documentation says Blackwell or newer only, but this does seem to pass
+        # on some newer GPUs.

-        try:
+        with unsupported_before(device, None):
             platform_info = nvml.device_get_platform_info(device)
-        except nvml.NotSupportedError:
-            skip_reasons.add(f"Not supported returned, linkely NVLink is disable for {device}")
-            continue

         assert isinstance(platform_info, nvml.PlatformInfo_v2)
-
-    if skip_reasons:
-        pytest.skip(" ; ".join(skip_reasons))

cuda_bindings/tests/nvml/test_pynvml.py
Lines changed: 19 additions & 14 deletions

@@ -10,6 +10,7 @@
 from cuda.bindings import _nvml as nvml

 from . import util
+from .conftest import unsupported_before

 XFAIL_LEGACY_NVLINK_MSG = "Legacy NVLink test expected to fail."

@@ -66,7 +67,8 @@ def test_device_get_handle_by_pci_bus_id(ngpus, pci_info):
 def test_device_get_memory_affinity(handles, scope):
     size = 1024
     for handle in handles:
-        node_set = nvml.device_get_memory_affinity(handle, size, scope)
+        with unsupported_before(handle, nvml.DeviceArch.KEPLER):
+            node_set = nvml.device_get_memory_affinity(handle, size, scope)
         assert node_set is not None
         assert len(node_set) == size

@@ -76,7 +78,8 @@ def test_device_get_memory_affinity(handles, scope):
 def test_device_get_cpu_affinity_within_scope(handles, scope):
     size = 1024
     for handle in handles:
-        cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
+        with unsupported_before(handle, nvml.DeviceArch.KEPLER):
+            cpu_set = nvml.device_get_cpu_affinity_within_scope(handle, size, scope)
         assert cpu_set is not None
         assert len(cpu_set) == size

@@ -136,22 +139,22 @@ def test_device_get_p2p_status(handles, index):

 def test_device_get_power_usage(ngpus, handles):
     for i in range(ngpus):
-        try:
+        # Note: documentation says this is supported on Fermi or newer,
+        # but in practice it fails on some later architectures.
+        with unsupported_before(handles[i], None):
             power_mwatts = nvml.device_get_power_usage(handles[i])
-        except nvml.NotSupportedError:
-            pytest.skip("device_get_power_usage not supported")
         assert power_mwatts >= 0.0


 def test_device_get_total_energy_consumption(ngpus, handles):
     for i in range(ngpus):
-        try:
+        with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
             energy_mjoules1 = nvml.device_get_total_energy_consumption(handles[i])
-        except nvml.NotSupportedError:
-            pytest.skip("device_get_total_energy_consumption not supported")
+
         for j in range(10):  # idle for 150 ms
             time.sleep(0.015)  # and check for increase every 15 ms
-            energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
+            with unsupported_before(handles[i], nvml.DeviceArch.VOLTA):
+                energy_mjoules2 = nvml.device_get_total_energy_consumption(handles[i])
             assert energy_mjoules2 >= energy_mjoules1
             if energy_mjoules2 > energy_mjoules1:
                 break
@@ -182,7 +185,8 @@ def test_device_get_memory_info(ngpus, handles):

 def test_device_get_utilization_rates(ngpus, handles):
     for i in range(ngpus):
-        urate = nvml.device_get_utilization_rates(handles[i])
+        with unsupported_before(handles[i], "FERMI"):
+            urate = nvml.device_get_utilization_rates(handles[i])
         assert urate.gpu >= 0
         assert urate.memory >= 0

@@ -239,7 +243,8 @@ def test_device_get_utilization_rates(ngpus, handles):

 def test_device_get_pcie_throughput(ngpus, handles):
     for i in range(ngpus):
-        tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
+        with unsupported_before(handles[i], nvml.DeviceArch.MAXWELL):
+            tx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_TX_BYTES)
         assert tx_bytes_tp >= 0
         rx_bytes_tp = nvml.device_get_pcie_throughput(handles[i], nvml.PcieUtilCounter.PCIE_UTIL_RX_BYTES)
         assert rx_bytes_tp >= 0
@@ -271,10 +276,10 @@ def test_device_get_pcie_throughput(ngpus, handles):
 def test_device_get_nvlink_capability(ngpus, handles, cap_type):
     for i in range(ngpus):
         for j in range(nvml.NVLINK_MAX_LINKS):
-            try:
+            # By the documentation, this should be supported on PASCAL or newer,
+            # but this also seems to fail on newer.
+            with unsupported_before(handles[i], None):
                 cap = nvml.device_get_nvlink_capability(handles[i], j, cap_type)
-            except nvml.NotSupportedError:
-                pytest.skip("NVLink capability not supported")
             assert cap >= 0

