Skip to content

Commit c89452f

Browse files
tomchengchitangtomchengchitangCopilot
authored
[hipBLASLt] Deprecate rocm-smi and use amdsmi (#8527)
## Motivation rocm-smi will be removed from TheRock, and users/customers will move to the AMD SMI library and APIs. <!-- Explain the purpose of this PR and the goals it aims to achieve. --> ## Technical Details - Change all rocm-smi usage to amd-smi <!-- Explain the changes along with any relevant GitHub links. --> ## Test Plan - I also added unit tests: `python3 -m pytest -v \ Tensile/Tests/unit/test_detectAvailableGpus.py \ Tensile/Tests/unit/test_get_gpu_max_frequency_smi.py \ Tensile/Tests/unit/test_specs_amdsmi.py` <!-- Explain any relevant testing done to verify this PR. --> ## Test Result <img width="2131" height="732" alt="image" src="https://github.com/user-attachments/assets/26b1e0a2-4c5a-4c66-a5af-d256ba94b4b1" /> <!-- Briefly summarize test outcomes. --> ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. --------- Co-authored-by: tomchengchitang <tom.tang@amd.com> Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
1 parent 6a01952 commit c89452f

16 files changed

Lines changed: 603 additions & 109 deletions

File tree

projects/hipblaslt/clients/scripts/performance/specs.py

Lines changed: 85 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# SPDX-License-Identifier: MIT
33

44
from pathlib import Path
5+
import json
56
import re
67
import socket
78
import subprocess
@@ -20,6 +21,83 @@ def search(pattern, string):
2021
return None
2122

2223

24+
def _run_amdsmi_json(cmd: list):
    """
    Runs an ``amd-smi`` command that emits JSON and returns the parsed object.

    Args:
        cmd: Full argument vector, e.g. ``["amd-smi", "static", "--json"]``.
            It is passed to ``subprocess.run`` as a list (no shell).

    Returns:
        Whatever ``json.loads`` produces for the command's stdout, or None
        when the command cannot be started, exits with a non-zero status, or
        prints something that is not valid JSON.
    """
    try:
        completed = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except OSError:
        # OSError covers a missing binary (FileNotFoundError) as well as one
        # that exists but cannot be executed (e.g. PermissionError). Both mean
        # "amd-smi is unavailable" for the caller.
        return None

    # getattr() with a default of 0 tolerates result objects that carry no
    # ``returncode`` attribute; such results are treated as successful.
    if getattr(completed, "returncode", 0) != 0:
        return None

    raw = completed.stdout
    if isinstance(raw, (bytes, bytearray)):
        # Never let a stray non-UTF-8 byte abort spec collection.
        raw = raw.decode("utf-8", errors="replace")

    try:
        return json.loads(raw)
    except (TypeError, ValueError):
        # ValueError includes json.JSONDecodeError; TypeError covers a result
        # object whose stdout is None.
        return None
37+
38+
39+
def _amdsmi_lookup(node, *keys):
    """
    Walks nested dicts along ``keys`` and returns the value found there.

    Returns None when any level is missing or is not a dict, or when the final
    value is the placeholder string ``"N/A"``.
    """
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    if isinstance(node, str) and node.strip().upper() == "N/A":
        return None
    return node


def _amdsmi_first_gpu(payload):
    """
    Returns the first per-GPU record of a parsed amd-smi payload, or None.

    Accepts both ``{"gpu_data": [ {...}, ... ]}`` and a bare ``[ {...}, ... ]``
    list, so a top-level JSON array does not raise AttributeError.
    """
    if isinstance(payload, dict):
        payload = payload.get("gpu_data")
    if isinstance(payload, list) and payload and isinstance(payload[0], dict):
        return payload[0]
    return None


def _parse_amdsmi_specs(static, metric) -> dict:
    """
    Builds the spec dict from parsed ``amd-smi static`` / ``amd-smi metric`` JSON.

    Either argument may be None (command failed) or malformed; every field that
    cannot be resolved stays None.
    """
    result = {
        "vbios_version": None,
        "gpuid": None,
        "vram": None,
        "performance_level": None,
        "memory_clk": None,
        "system_clk": None,
    }

    gpu = _amdsmi_first_gpu(static)
    if gpu is not None:
        result["gpuid"] = _amdsmi_lookup(gpu, "asic", "device_id")
        # rocm-smi --showvbios reported the board part number as VBIOS version.
        part_number = _amdsmi_lookup(gpu, "ifwi", "part_number")
        if part_number is None:
            # NOTE(review): some amd-smi releases appear to publish this
            # section under "vbios" rather than "ifwi" - confirm against the
            # supported amd-smi versions.
            part_number = _amdsmi_lookup(gpu, "vbios", "part_number")
        result["vbios_version"] = part_number

    gpu = _amdsmi_first_gpu(metric)
    if gpu is not None:
        total = _amdsmi_lookup(gpu, "mem_usage", "total_vram", "value")
        if total is not None:
            try:
                # amd-smi reports total VRAM in MB; convert to bytes to match the
                # downstream "/1024**3" GiB formatting used for rocm-smi output.
                result["vram"] = int(total) * 1024 * 1024
            except (TypeError, ValueError):
                # Non-numeric value: leave the field unresolved.
                pass

        perf = _amdsmi_lookup(gpu, "perf_level")
        if isinstance(perf, str):
            # Normalize e.g. "AMDSMI_DEV_PERF_LEVEL_AUTO" -> "auto" to match the
            # lowercase value rocm-smi used to print. Stripping the enum prefix
            # (instead of keeping only the last "_" token) preserves multi-word
            # levels such as "stable_std".
            prefix = "AMDSMI_DEV_PERF_LEVEL_"
            if perf.upper().startswith(prefix):
                level = perf[len(prefix):]
            else:
                level = perf.split("_")[-1]
            result["performance_level"] = level.lower()

        for field, engine in (("system_clk", "gfx_0"), ("memory_clk", "mem_0")):
            mhz = _amdsmi_lookup(gpu, "clock", engine, "clk", "value")
            if mhz is not None:
                result[field] = f"{mhz}Mhz"

    return result


def get_amdsmi_specs(devicenum: int = 0) -> dict:
    """
    Collects per-device GPU specs using ``amd-smi`` structured JSON output.

    Args:
        devicenum: GPU index passed to ``amd-smi`` via ``-g``.

    Returns a dict with the following fields (any field that cannot be
    resolved is returned as None):

    vbios_version, gpuid, vram (bytes), performance_level,
    memory_clk, system_clk

    The clock fields are strings of the form ``"<value>Mhz"``. A missing
    ``amd-smi``, a failing command, or an unexpected JSON layout never raises;
    the affected fields are simply left as None.
    """
    g = str(devicenum)
    static = _run_amdsmi_json(
        ["amd-smi", "static", "-g", g, "--asic", "--vbios", "--json"]
    )
    metric = _run_amdsmi_json(
        [
            "amd-smi", "metric", "-g", g,
            "--mem-usage", "--clock", "--perf-level", "--json",
        ]
    )
    return _parse_amdsmi_specs(static, metric)
99+
100+
23101
def _subprocess_helper(cmd: list) -> tuple:
24102
"""
25103
This is a helper method which runs a command line argument
@@ -239,51 +317,26 @@ def get_machine_specs(filename: str, devicenum: int = 0):
239317
rocm_info = rocm_path.read_text()
240318
else:
241319
rocm_info = None
242-
try:
243-
rocm_smi = run(
244-
[
245-
"rocm-smi",
246-
"--showvbios",
247-
"--showid",
248-
"--showproductname",
249-
"--showperflevel",
250-
"--showclocks",
251-
"--showmeminfo",
252-
"vram",
253-
"lshw",
254-
]
255-
)
256-
except FileNotFoundError as e:
257-
rocm_smi = None
320+
amdsmi_specs = get_amdsmi_specs(devicenum)
258321

259322
try:
260323
rocminfo = run(["rocminfo"])
261324
except FileNotFoundError as e:
262325
rocminfo = None
263326

264-
device = rf"^GPU\[{devicenum}\]\s*: "
265327
hostname = socket.gethostname()
266328

267329
if rocm_info is None:
268330
rocm_version = None
269331
else:
270332
rocm_version = rocm_info.strip()
271333

272-
if rocm_smi is None:
273-
vbios_version = (
274-
gpuid
275-
) = deviceinfo = vram = performance_level = memory_clk = system_clk = None
276-
else:
277-
if device is None:
278-
device = ""
279-
vbios_version = search(device + r"VBIOS version: (.*?)$", rocm_smi)
280-
gpuid = search(device + r"Device ID: (.*?)$", rocm_smi)
281-
if not gpuid:
282-
gpuid = search(device + r"GPU ID: (.*?)$", rocm_smi)
283-
vram = search(device + r".... Total Memory .B.: (\d+)$", rocm_smi)
284-
performance_level = search(device + r"Performance Level: (.*?)$", rocm_smi)
285-
memory_clk = search(device + r"mclk.*\((.*?)\)$", rocm_smi)
286-
system_clk = search(device + r"sclk.*\((.*?)\)$", rocm_smi)
334+
vbios_version = amdsmi_specs["vbios_version"]
335+
gpuid = amdsmi_specs["gpuid"]
336+
vram = amdsmi_specs["vram"]
337+
performance_level = amdsmi_specs["performance_level"]
338+
memory_clk = amdsmi_specs["memory_clk"]
339+
system_clk = amdsmi_specs["system_clk"]
287340

288341
architecture_name, internal_product_name = get_device_info()
289342

projects/hipblaslt/tensilelite/Tensile/ClientWriter.py

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -331,10 +331,13 @@ def writeRunScript(path, forBenchmark, enableTileSelection, cxxCompiler: str, cC
331331
runScriptFile.write(os.path.join(globalParameters["CMakeBuildType"], \
332332
"client.exe") )
333333
else:
334-
if globalParameters["PinClocks"] and globalParameters["ROCmSMIPath"]:
335-
runScriptFile.write("%s -d 0 --setfan 255 --setsclk 7\n" % globalParameters["ROCmSMIPath"])
334+
if globalParameters["PinClocks"] and globalParameters["AMDSMIPath"]:
335+
# amd-smi set/reset require elevated privileges. Pin to max
336+
# performance and run the fan at full speed for the benchmark.
337+
runScriptFile.write("sudo %s set -g 0 --fan 255\n" % globalParameters["AMDSMIPath"])
338+
runScriptFile.write("sudo %s set -g 0 --perf-level HIGH\n" % globalParameters["AMDSMIPath"])
336339
runScriptFile.write("sleep 1\n")
337-
runScriptFile.write("%s -d 0 -a\n" % globalParameters["ROCmSMIPath"])
340+
runScriptFile.write("%s metric -g 0 --clock\n" % globalParameters["AMDSMIPath"])
338341

339342
runScriptFile.write("set +e\n")
340343

@@ -368,9 +371,10 @@ def writeRunScript(path, forBenchmark, enableTileSelection, cxxCompiler: str, cC
368371
""")
369372

370373
if os.name != "nt":
371-
if globalParameters["PinClocks"] and globalParameters["ROCmSMIPath"]:
372-
runScriptFile.write("%s -d 0 --resetclocks\n" % globalParameters["ROCmSMIPath"])
373-
runScriptFile.write("%s -d 0 --setfan 50\n" % globalParameters["ROCmSMIPath"])
374+
if globalParameters["PinClocks"] and globalParameters["AMDSMIPath"]:
375+
# Reset clocks/overdrive to default and return fans to automatic
376+
# (driver) control once the benchmark is done.
377+
runScriptFile.write("sudo %s reset -g 0 --clocks --fans\n" % globalParameters["AMDSMIPath"])
374378
else:
375379
mxScaleFormatFlag = " --mx-scale-format {}".format(globalParameters["MXScaleFormat"]) if globalParameters["MXScaleFormat"] else ""
376380
for configFile in configPaths:

projects/hipblaslt/tensilelite/Tensile/Common/GlobalParameters.py

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
# timing between GSU / non-GSU kernels
5959
globalParameters["PinClocks"] = False # T=pin gpu clocks and fan, F=don't
6060
globalParameters["HardwareMonitor"] = (
61-
True # False: disable benchmarking client monitoring clocks using rocm-smi.
61+
True # False: disable benchmarking client monitoring clocks using amd-smi.
6262
)
6363
globalParameters["MinFlopsPerSync"] = (
6464
1 # Minimum number of flops per sync to increase stability for small problems
@@ -272,7 +272,7 @@
272272
)
273273

274274
# internal, i.e., gets set during startup
275-
globalParameters["ROCmSMIPath"] = None # /opt/rocm/bin/rocm-smi
275+
globalParameters["AMDSMIPath"] = None # /usr/bin/amd-smi
276276
globalParameters["HipClangVersion"] = "0.0.0"
277277

278278
# default runtime is selected based on operating system, user can override
@@ -718,7 +718,7 @@ def capRow(isaInfoMap, cap, capType):
718718
# e.g. RocProfCounter: 42 to pass silently.
719719
globalParameterTypeOverrides = {
720720
"ClientExecutionLockPath": {type(None), str}, # path or unset
721-
"ROCmSMIPath": {type(None), str}, # path, populated at startup
721+
"AMDSMIPath": {type(None), str}, # path, populated at startup
722722
"CmakeCxxCompiler": {type(None), str}, # path, populated at startup
723723
"RocProfCounter": {type(None), str}, # counter spec or None
724724
}
@@ -888,13 +888,19 @@ def assignGlobalParameters(config, isaInfoMap: Dict[IsaVersion, IsaInfo]):
888888

889889
globalParameters["ROCmBinPath"] = os.path.join(globalParameters["ROCmPath"], "bin")
890890
try:
891-
globalParameters["ROCmSMIPath"] = locateExe(globalParameters["ROCmBinPath"], "rocm-smi")
891+
globalParameters["AMDSMIPath"] = locateExe(globalParameters["ROCmBinPath"], "amd-smi")
892892
except OSError:
893-
if os.name == "nt":
894-
# rocm-smi is not presently supported on Windows so do not require it.
895-
pass
896-
else:
897-
raise
893+
# amd-smi is only needed at runtime to pin clocks/fans during benchmarking
894+
# and tuning; it is not required to build libraries or validate logic.
895+
# It is also not presently supported on Windows. Treat a missing amd-smi as
896+
# non-fatal: leave AMDSMIPath unset (None) so that clock pinning is skipped,
897+
# rather than aborting the build in environments that do not ship amd-smi.
898+
globalParameters["AMDSMIPath"] = None
899+
if os.name != "nt":
900+
printWarning(
901+
"Could not locate amd-smi; GPU clock/fan pinning will be disabled. "
902+
"Install the amdsmi package to enable it."
903+
)
898904

899905
if "AsanBuild" in config:
900906
globalParameters["AsanBuild"] = config["AsanBuild"]
@@ -951,13 +957,18 @@ def setupRestoreClocks():
951957
import atexit
952958

953959
def restoreClocks():
    """
    Best-effort exit hook that undoes GPU clock/fan pinning.

    Does nothing unless ``PinClocks`` is enabled and an ``amd-smi`` path was
    located at startup. Never raises: a failure to launch the reset command is
    reported as a warning so that interpreter shutdown stays clean.
    """
    # Clocks will only be pinned if amd-smi is available, therefore
    # we only need to restore if found.
    if not globalParameters["PinClocks"]:
        return
    asmi = globalParameters["AMDSMIPath"]
    if asmi is None:
        return

    # amd-smi set/reset require elevated privileges.
    # Reset clocks/overdrive to default and return fans to
    # automatic (driver) control.
    cmd = [asmi, "reset", "-g", "0", "--clocks", "--fans"]
    if hasattr(os, "geteuid") and os.geteuid() != 0:
        # "-n" keeps sudo non-interactive so exit never blocks on a password prompt.
        cmd = ["sudo", "-n"] + cmd

    try:
        subprocess.call(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError as err:
        # e.g. sudo is not installed, or amd-smi disappeared since startup.
        printWarning("Could not reset GPU clocks/fans via amd-smi: %s" % err)
961972

962973
atexit.register(restoreClocks)
963974

projects/hipblaslt/tensilelite/Tensile/Common/Utilities.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -341,7 +341,7 @@ def isRhel8() -> bool:
341341
content = f.read()
342342
match = re.search(pattern, content, re.DOTALL)
343343
if match:
344-
printWarning("Rhel8 environments may not support all tools for system queries such as rocm-smi.")
344+
printWarning("Rhel8 environments may not support all tools for system queries such as amd-smi.")
345345
return True
346346
return False
347347

projects/hipblaslt/tensilelite/Tensile/ParallelExecution.py

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@
8686
results.csv (all results, same format as single-GPU)
8787
8888
Process Flow:
89-
1. Detect available GPUs via rocm-smi or hipInfo
89+
1. Detect available GPUs via amd-smi or hipInfo
9090
2. Count problems in the config file
9191
3. Create per-GPU config files with:
9292
- Assigned device-idx (GPU index)
@@ -107,6 +107,7 @@
107107
- runClientParallel(): Main orchestration function
108108
"""
109109

110+
import json
110111
import os
111112
import re
112113
import shutil
@@ -121,19 +122,19 @@
121122

122123

123124
def detectAvailableGpus():
124-
"""Detect the number of available GPUs using rocm-smi."""
125+
"""Detect the number of available GPUs using amd-smi."""
125126
try:
126127
result = subprocess.run(
127-
["rocm-smi", "--showid"],
128+
["amd-smi", "list", "--json"],
128129
capture_output=True,
129130
text=True,
130131
timeout=10
131132
)
132133
if result.returncode == 0:
133-
# Count unique GPU indices (GPU[N] appears multiple times per device)
134-
gpu_indices = set(re.findall(r'GPU\[(\d+)\]', result.stdout))
135-
if gpu_indices:
136-
return len(gpu_indices)
134+
# amd-smi list --json returns one object per GPU
135+
gpus = json.loads(result.stdout)
136+
if gpus:
137+
return len(gpus)
137138
except Exception:
138139
pass
139140

projects/hipblaslt/tensilelite/Tensile/Tensile.py

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import sys
3232
import argparse
3333
import glob
34+
import json
3435

3536
from datetime import datetime
3637
from pathlib import Path
@@ -274,37 +275,25 @@ def get_gpu_max_frequency_smi(device_id):
274275
Get the maximum frequency of the specified GPU device
275276
'''
276277
try:
277-
# Run rocm-smi command and capture output
278-
result = subprocess.run(['rocm-smi', '-s'], capture_output=True, text=True)
278+
# Run amd-smi command and capture the GFX clock info as JSON
279+
result = subprocess.run(
280+
['amd-smi', 'metric', '-g', str(device_id), '--clock', '--json'],
281+
capture_output=True, text=True)
279282

280283
if result.returncode != 0:
281-
print(f"Error running rocm-smi: {result.stderr}")
284+
print(f"Error running amd-smi: {result.stderr}")
282285
return None
283286

284-
# Parse the output
285-
lines = result.stdout.split('\n')
286-
sclk_section = False
287-
frequencies = []
288-
289-
# Look for the sclk section of the specified device
290-
for line in lines:
291-
line = line.split(" ")
292-
if 'sclk' in line and f"GPU{device_id}" in line:
293-
sclk_section = True
294-
continue
287+
data = json.loads(result.stdout)
288+
clocks = data['gpu_data'][0]['clock']
295289

296-
# Parse frequencies in the sclk section
297-
if sclk_section:
298-
for part in line:
299-
if part.endswith("Mhz"):
300-
try:
301-
frequency = part.replace("Mhz", "")
302-
frequencies.append(int(frequency))
303-
except ValueError:
304-
print(f"Error parsing frequency: {part}")
305-
break
306-
if "socclk" in line:
307-
break
290+
# Collect the max GFX (sclk) clock across all gfx engines/partitions
291+
frequencies = []
292+
for name, info in clocks.items():
293+
if name.startswith('gfx'):
294+
max_clk = info.get('max_clk', {}).get('value')
295+
if isinstance(max_clk, int):
296+
frequencies.append(max_clk)
308297

309298
# Return the maximum frequency found
310299
return max(frequencies) if frequencies else None
@@ -625,7 +614,7 @@ def Tensile(userArgs):
625614
max_frequency = get_gpu_max_frequency(device_id)
626615

627616
if not max_frequency or max_frequency <= 0:
628-
max_frequency = get_gpu_max_frequency_smi(device_id) # Using rocm-smi just in case
617+
max_frequency = get_gpu_max_frequency_smi(device_id) # Using amd-smi just in case
629618

630619
if not max_frequency or max_frequency <= 0:
631620
print(f"Could not detect valid GPU frequency for device {device_id}")

projects/hipblaslt/tensilelite/Tensile/Tests/unit/characterization/CommonUtilities/test_mut_Utilities_isRhel8_char.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,5 +96,5 @@ def test_isRhel8_emits_exact_warning_text_on_match(monkeypatch):
9696

9797
assert result is True
9898
assert calls == [
99-
("Rhel8 environments may not support all tools for system queries such as rocm-smi.",)
99+
("Rhel8 environments may not support all tools for system queries such as amd-smi.",)
100100
]

0 commit comments

Comments
 (0)