From c4905ec7b6ead6775df0a3baa1db89becb448f7f Mon Sep 17 00:00:00 2001
From: Ruodi Lu
Date: Wed, 11 Mar 2026 04:43:01 +0000
Subject: [PATCH 1/2] fix perf test cases issue

Signed-off-by: Ruodi Lu
---
 .../integration/test_lists/qa/llm_perf_core.yml | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml
index 2095ccb919e..6230d235bfa 100644
--- a/tests/integration/test_lists/qa/llm_perf_core.yml
+++ b/tests/integration/test_lists/qa/llm_perf_core.yml
@@ -5,7 +5,8 @@ llm_perf_core:
 # ===============================================================================
 # 1: All GPUs common tests(L20, L40S, H100, H20, H200, GB200, B200, B300, GB300, RTX-6000D, RTX-6000-Server test cases)
 # 2: L20, L40S, H100, H20, H200
-# 3: L40S, H100, H20, H200
+# 3: L40S, H100, H20, H200 (4 GPUs)
+# 3b: H100, H20, H200 (8 GPUs)
 # 4: H100, H20, H200 test cases
 # 5: H100, H20, H200, GB200, B200, B300, GB300, RTX6000-D, RTX6000-Server test cases
 # 6: GB200, B200, B300, GB300, RTX6000-Server test cases
@@ -136,6 +137,17 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,500-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-gpus:4]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4] #llama_v3.3_70b_instruct_fp8
+
+
+# 3b: L40S, H100, H20, H200 (8 GPUs)
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 8
+      compute_capability:
+        gt: 8.0
+        lte: 9.0
+  tests:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:8]
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:2000,200-gpus:8]
@@ -185,7 +197,6 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_audio-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250]
   - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_image-bench-pytorch-bfloat16-input_output_len:1000,1000-loras:1-con:250]
   - perf/test_perf.py::test_perf[deepseek_v3_lite_fp8-bench-pytorch-float8-input_output_len:128,128]
-  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8]
 
 
 # 6: GB200, B200, B300, GB300, RTX6000-Server test cases
@@ -381,6 +392,8 @@
         lte: 12.0
   tests:
   - perf/test_perf.py::test_perf[llama_v4_maverick_17b_128e_instruct_fp8-bench-pytorch-float8-input_output_len:128,128-ep:8-tp:8-gpus:8]
+  # qwen3_235b_a22b_fp8
+  - perf/test_perf.py::test_perf[qwen3_235b_a22b_fp8-bench-pytorch-float8-input_output_len:1000,2000-con:256-ep:8-gpus:8]
 
 
 # 12: RTX-6000D, RTX-6000 Server test cases

From d3be4bbd0478116cc11bebf01dc58a0ee08d0775 Mon Sep 17 00:00:00 2001
From: ruodil <200874449+ruodil@users.noreply.github.com>
Date: Thu, 12 Mar 2026 10:35:06 +0800
Subject: [PATCH 2/2] Update tests/integration/test_lists/qa/llm_perf_core.yml

Co-authored-by: yufeiwu-nv <230315618+yufeiwu-nv@users.noreply.github.com>
Signed-off-by: ruodil <200874449+ruodil@users.noreply.github.com>
---
 tests/integration/test_lists/qa/llm_perf_core.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_lists/qa/llm_perf_core.yml b/tests/integration/test_lists/qa/llm_perf_core.yml
index 6230d235bfa..c1387eeb864 100644
--- a/tests/integration/test_lists/qa/llm_perf_core.yml
+++ b/tests/integration/test_lists/qa/llm_perf_core.yml
@@ -139,7 +139,7 @@ llm_perf_core:
   - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,32-gpus:4] #llama_v3.3_70b_instruct_fp8
 
 
-# 3b: L40S, H100, H20, H200 (8 GPUs)
+# 3b: H100, H20, H200 (8 GPUs)
 - condition:
     ranges:
       system_gpu_count: