Commit 7003718

hipblaslt gemm tuning example (#16)
1 parent afe51b7 commit 7003718

2 files changed: 139 additions & 0 deletions


examples/offline_tune/README.md

Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
# Offline Tune

## 1. GEMM Tune

Use the `hipblaslt-bench` tool to perform GEMM tuning.

`hipblaslt-bench` is usually located under `/opt/rocm/bin`. If it is not available in your environment or Docker image, you will need to reinstall hipBLASLt.
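You can quickly check whether it is available, for example:

```
ls /opt/rocm/bin/hipblaslt-bench
```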

### Install hipBLASLt (Optional)

For build and install instructions, see: https://github.com/ROCm/hipBLASLt?tab=readme-ov-file#build-and-install

If you only target MI300X (gfx942), you can use the following command for a quicker build, reducing the compilation time to under 2 hours.

```
./install.sh -idc --logic-yaml-filter gfx942/*/* -a gfx942 -j 256 --build_dir build
```

### Step 1: Dump Shapes
* Set the hipBLASLt logging environment variables.
* Run your training code.
* Unset the environment variables.
* The GEMM shapes will be dumped into `dump_gemm_shapes.txt`.
* Note: If you only need to dump shapes, there is usually no need to train for many iterations; a few steps are enough, since every step uses the same shapes.

```
export HIPBLASLT_LOG_MASK=32
export HIPBLASLT_LOG_FILE=dump_gemm_shapes.txt

./run_your_code

unset HIPBLASLT_LOG_MASK
unset HIPBLASLT_LOG_FILE
```
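
Each dumped line is a complete `hipblaslt-bench` command describing one GEMM. A hypothetical example (shape, data types, and solution index purely illustrative) might look like:

```
hipblaslt-bench -m 4096 -n 8192 -k 4096 --transA T --transB N --a_type f16_r --b_type f16_r --c_type f16_r --d_type f16_r --compute_type f32_r --algo_method index --solution_index 894
```

The tuning script in Step 2 strips `--algo_method`/`--solution_index` from each line and re-benchmarks the shape across the available solutions.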

### Step 2: Tuning
Run `offline_tune_gemm.py` and save the tuned results in `tune_gemm_results.txt`.

```
python3 offline_tune_gemm.py \
    --dump-shape-path /PATH/TO/dump_gemm_shapes.txt \
    --tune-result-path /PATH/TO/tune_gemm_results.txt
```
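
Equivalently, the `OfflineTuneGemm` class defined in `offline_tune_gemm.py` can be used from Python directly, for example:

```
from offline_tune_gemm import OfflineTuneGemm

tuner = OfflineTuneGemm("/PATH/TO/dump_gemm_shapes.txt")
tuner.tune("/PATH/TO/tune_gemm_results.txt", device_id="0")
```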

### Step 3: Train with the Tuned Results
* Set the tuning-results environment variable.
* Start your tasks.

```
export HIPBLASLT_TUNING_OVERRIDE_FILE=tune_gemm_results.txt
./run_your_code
```

# Reference

https://rocm.blogs.amd.com/artificial-intelligence/gemm_blog/README.html
examples/offline_tune/offline_tune_gemm.py

Lines changed: 86 additions & 0 deletions
@@ -0,0 +1,86 @@
import argparse
import copy
import os
import shlex
import subprocess


def is_hip():
    # Returns True when PyTorch was built with ROCm/HIP support.
    import torch

    if torch.version.hip is not None:
        return True
    return False


class OfflineTuneGemm:

    def __init__(self, dump_gemm_shape_file_path):
        self.HIPBLASLT_BENCH = "/opt/rocm/bin/hipblaslt-bench "
        self.ROTATING_BUFFER = 512
        self.RUN_NUMS = 20
        self.REQUESTED_SOLUTION = -1
        self.SKIP_SLOW_SOLUTION_RATIO = 0.7

        self.src_script_dict_list = []
        self.src_script_list = []
        self.tune_script_dict_list = []
        self.tune_script_list = []
        self.process_raw_dump(dump_gemm_shape_file_path)

    def process_raw_dump(self, dump_gemm_shape_file_path):
        with open(dump_gemm_shape_file_path, "r", encoding="utf-8") as file:
            lines = file.readlines()
        # Deduplicate identical GEMM shapes and keep a stable order.
        lines = list(set(lines))
        lines.sort()

        for line in lines:
            line = line.strip().split(" ")
            line = [item for item in line if item.strip()]
            if line and line[0] == "hipblaslt-bench":
                # Parse "--flag value" pairs into a dict.
                src_script_dict = {}
                for item in line[1:]:
                    if item.startswith("--") or item.startswith("-"):
                        key = item
                    else:
                        src_script_dict[key] = item
                # src script: replay the original GEMM with benchmark settings.
                src_script_dict["--rotating"] = self.ROTATING_BUFFER
                src_script_dict["--cold_iters"] = self.RUN_NUMS
                src_script_dict["--iters"] = self.RUN_NUMS
                src_script = self.HIPBLASLT_BENCH + " ".join(f"{k} {v}" for k, v in src_script_dict.items())
                self.src_script_dict_list.append(src_script_dict)
                self.src_script_list.append(src_script)
                # tune script: drop the fixed solution and search over all solutions.
                tune_script_dict = copy.deepcopy(src_script_dict)
                del tune_script_dict["--algo_method"]
                del tune_script_dict["--solution_index"]
                tune_script_dict["--requested_solution"] = self.REQUESTED_SOLUTION
                tune_script_dict["--skip_slow_solution_ratio"] = self.SKIP_SLOW_SOLUTION_RATIO
                tune_script = self.HIPBLASLT_BENCH + " ".join(f"{k} {v}" for k, v in tune_script_dict.items())
                self.tune_script_dict_list.append(tune_script_dict)
                self.tune_script_list.append(tune_script)

    # TODO: use more devices to tune in parallel
    def tune(self, tune_gemm_results_file_path, device_id="0"):
        env = os.environ.copy()
        # Pin the benchmark to a single GPU.
        if is_hip():
            env.update({"HIP_VISIBLE_DEVICES": device_id})
        else:
            env.update({"CUDA_VISIBLE_DEVICES": device_id})
        # Tuned results are written into this file.
        env.update({"HIPBLASLT_TUNING_FILE": tune_gemm_results_file_path})

        for idx, script in enumerate(self.tune_script_list):
            print(f"Tune[{idx}/{len(self.tune_script_list)}]: {script}")
            subprocess.run(shlex.split(script), env=env)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--dump-shape-path", type=str)
    parser.add_argument("--tune-result-path", type=str)
    # parser.add_argument("--device-id", type=str, default="0")
    args = parser.parse_args()

    tuner = OfflineTuneGemm(args.dump_shape_path)
    tuner.tune(args.tune_result_path)
