NVIDIA · ruodil · Mar 11, 2026 · Mar 12, 2026 · yufeiwu-nv · Mar 11, 2026
@@ -5,7 +5,8 @@ llm_perf_core:
 # ===============================================================================
 # 1: All GPUs common tests(L20, L40S, H100, H20, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases)
 # 2: L20, L40S, H100, H20, H200
-# 3: L40S, H100, H20, H200
+# 3: L40S, H100, H20, H200 (4 GPUs)
+# 3b: H100, H20, H200 (8 GPUs)
 # 4: H100, H20, H200 test cases
 # 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
 # 6: GB200, B200, B300, GB300, RTX6000-Server test cases
@@ -136,6 +137,17 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4]    #llama_v3.3_70b_instruct_fp8
+
+
+# 3b: H100, H20, H200 (8 GPUs)
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 8
+      compute_capability:
+        gte: 9.0
+        lte: 9.0
+  tests:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,200-gpus:8]
@@ -185,7 +197,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250]
   - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250]
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
-  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8]
 
 
 # 6: GB200, B200, B300, GB300, RTX6000-Server test cases
@@ -381,6 +392,8 @@ llm_perf_core:
         lte: 12.0
   tests:
   - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  # qwen3_235b_a22b_fp8
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8]
 
 
 # 12: RTX-6000D, RTX-6000 Server test cases