Commit 5307dc5

Update GQA benchmark to support bfloat16 (#26898)
Update GQA benchmark to support bfloat16 and default to testing the first configuration (fast mode).

Note that test_sparse_attention.py was removed in #23547. It is still referenced by the benchmark script, so I added it back and disabled the test in pipeline mode.

Example output from an H200 GPU:

```
prompt-sm90-Llama3-8B-b1-h32_8x128-float16:
   sequence_length  ORT-GQA-Dense  ORT-GQA-Dense-PackedQKV
0             16.0       0.781751                 0.571226
1             32.0       0.893813                 0.684198
2             64.0       1.434056                 1.589263
3            128.0       1.142192                 1.681969
4            256.0       1.503483                 2.225498
5            512.0       1.045732                 1.878660
6           1024.0       2.334924                 0.916745
7           2048.0       2.229924                 3.001290
8           4096.0       4.309678                 3.198855
9           8192.0       7.932211                 7.910411

token-sm90-Llama3-8B-b1-h32_8_d128-float16:
   past_sequence_length  ORT-GQA-Dense  ORT-GQA-Dense-PackedQKV
0                  16.0       1.751966                 0.780081
1                  32.0       1.302806                 0.043939
2                  64.0       2.301024                 2.207282
3                 128.0       2.294556                 3.010107
4                 256.0       2.931330                 1.781768
5                 512.0       1.210220                 2.799579
6                1024.0       2.767142                 2.660434
7                2048.0       1.420229                 0.091433
8                4096.0       0.860655                 0.801022
9                8191.0       0.749525                 0.820858

prompt-sm90-Llama3-8B-b1-h32_8x128-bfloat16:
   sequence_length  ORT-GQA-Dense  ORT-GQA-Dense-PackedQKV
0             16.0       1.085427                 0.666664
1             32.0       1.714795                 0.931262
2             64.0       1.729093                 1.438733
3            128.0       1.071263                 2.486135
4            256.0       1.957349                 1.342417
5            512.0       1.159680                 1.591321
6           1024.0       0.743702                 2.035150
7           2048.0       1.452736                 1.788801
8           4096.0       4.029917                 4.041565
9           8192.0       7.934485                 7.931600

token-sm90-Llama3-8B-b1-h32_8_d128-bfloat16:
   past_sequence_length  ORT-GQA-Dense  ORT-GQA-Dense-PackedQKV
0                  16.0       0.044354                 0.043983
1                  32.0       0.040715                 0.044061
2                  64.0       0.045586                 0.044071
3                 128.0       0.062204                 0.061418
4                 256.0       0.074764                 4.874854
5                 512.0       2.472094                 2.102259
6                1024.0       4.911269                 1.396149
7                2048.0       4.898032                 1.684034
8                4096.0       2.523432                 2.192279
9                8191.0       1.651366                 3.427370
```
1 parent db3eb22 commit 5307dc5
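
For context, here is a minimal sketch of what fast mode and the new dtype handling amount to, paraphrased from the diff below. The helper names `select_torch_dtype` and `iterate_benchmark_cases` are illustrative only and do not exist in the script; the dtype-to-torch mapping and the fast-mode pruning mirror the changed lines in benchmark_gqa.py.

```python
import torch


def select_torch_dtype(dtype: str) -> torch.dtype:
    # The benchmark passes dtype around as a string ("float16" or "bfloat16")
    # and maps it to a torch dtype when building the attention config.
    return torch.float16 if dtype == "float16" else torch.bfloat16


def iterate_benchmark_cases(configures: list, fast: bool = False):
    # Fast mode keeps only the first model configuration and shrinks the sweep;
    # both float16 and bfloat16 are always exercised.
    if fast:
        configures = configures[:1]
    batch_sizes = [1] if fast else [1, 4]
    smooth_softmax_options = [False] if fast else [False, True]
    dtypes = ["float16", "bfloat16"]
    for configure in configures:
        for batch_size in batch_sizes:
            for use_smooth_softmax in smooth_softmax_options:
                for dtype in dtypes:
                    yield configure, batch_size, use_smooth_softmax, select_torch_dtype(dtype)
```

With `fast=True` (the new default in `__main__`), only the first configuration (Llama3-8B) is benchmarked at batch size 1 without smooth softmax, once per dtype.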

File tree

2 files changed: +1229 −39 lines


onnxruntime/test/python/transformers/benchmark_gqa.py

Lines changed: 58 additions & 39 deletions
```diff
@@ -14,17 +14,19 @@
 def get_plot_algos(sm: int, local_window_size: int | None):
     # GQA with local windows only works in sm=8x
     if sm >= 80 and local_window_size:
-        return {
-            "line_vals": ["ort_gqa", "ort_gqa_local", "ort_gqa_packed", "ort_gqa_local_packed"],
-            "line_names": ["ORT-GQA-Dense", "ORT-GQA-Local", "ORT-GQA-Dense-PackedQKV", "ORT-GQA-Local-PackedQKV"],
-            "styles": [("red", "solid"), ("yellow", "dashdot"), ("blue", "dashed"), ("green", "dotted")],
-        }
+        line_vals = ["ort_gqa", "ort_gqa_local", "ort_gqa_packed", "ort_gqa_local_packed"]
+        line_names = ["ORT-GQA-Dense", "ORT-GQA-Local", "ORT-GQA-Dense-PackedQKV", "ORT-GQA-Local-PackedQKV"]
+        styles = [("red", "solid"), ("yellow", "dashdot"), ("blue", "dashed"), ("green", "dotted")]
     else:
-        return {
-            "line_vals": ["ort_gqa", "ort_gqa_packed"],
-            "line_names": ["ORT-GQA-Dense", "ORT-GQA-Dense-PackedQKV"],
-            "styles": [("red", "solid"), ("blue", "dashed")],
-        }
+        line_vals = ["ort_gqa", "ort_gqa_packed"]
+        line_names = ["ORT-GQA-Dense", "ORT-GQA-Dense-PackedQKV"]
+        styles = [("red", "solid"), ("blue", "dashed")]
+
+    return {
+        "line_vals": line_vals,
+        "line_names": line_names,
+        "styles": styles,
+    }
 
 
 def plot_prompt_performance(
@@ -37,6 +39,7 @@ def plot_prompt_performance(
     max_seq_len: int,
     local_window_size: int | None = None,
     use_smooth_softmax: bool = False,
+    dtype: str = "float16",
 ):
     import triton  # noqa: PLC0415
 
@@ -48,14 +51,15 @@
             line_arg="provider",
             ylabel="ms",
             **algos,
-            plot_name=f"prompt-sm{sm}-{model_name}-b{batch_size}-h{num_heads}_{kv_num_heads}x{head_size}-fp16",
+            plot_name=f"prompt-sm{sm}-{model_name}-b{batch_size}-h{num_heads}_{kv_num_heads}x{head_size}-{dtype}",
             args={
                 "batch_size": batch_size,
                 "num_heads": num_heads,
                 "kv_num_heads": kv_num_heads,
                 "head_size": head_size,
                 "local_window_size": local_window_size,
                 "use_smooth_softmax": use_smooth_softmax,
+                "dtype": dtype,
             },
         )
     ]
@@ -70,6 +74,7 @@ def benchmark(
         head_size: int,
         local_window_size: int | None = None,
         use_smooth_softmax: bool = False,
+        dtype: str = "float16",
         device="cuda",
     ):
         warmup = 15
@@ -86,6 +91,7 @@
             local_window_size=local_window_size if provider in ["ort_gqa_local", "ort_gqa_local_packed"] else -1,
             use_smooth_softmax=use_smooth_softmax,
             device=device,
+            dtype=torch.float16 if dtype == "float16" else torch.bfloat16,
             is_packed_qkv=provider in ["ort_gqa_packed", "ort_gqa_local_packed"],
         )
 
@@ -107,6 +113,7 @@ def plot_token_performance(
     max_seq_len: int,
     local_window_size: int | None = None,
     use_smooth_softmax: bool = False,
+    dtype: str = "float16",
 ):
     import triton  # noqa: PLC0415
 
@@ -118,14 +125,15 @@
             line_arg="provider",
             ylabel="ms",
             **algos,
-            plot_name=f"token-sm{sm}-{model_name}-b{batch_size}-h{num_heads}_{kv_num_heads}_d{head_size}-fp16",
+            plot_name=f"token-sm{sm}-{model_name}-b{batch_size}-h{num_heads}_{kv_num_heads}_d{head_size}-{dtype}",
             args={
                 "batch_size": batch_size,
                 "num_heads": num_heads,
                 "kv_num_heads": kv_num_heads,
                 "head_size": head_size,
                 "local_window_size": local_window_size,
                 "use_smooth_softmax": use_smooth_softmax,
+                "dtype": dtype,
             },
         )
     ]
@@ -140,6 +148,7 @@ def benchmark(
         head_size: int,
        local_window_size: int | None = None,
         use_smooth_softmax: bool = False,
+        dtype: str = "float16",
         device="cuda",
     ):
         warmup = 15
@@ -158,6 +167,7 @@ def benchmark(
             is_packed_qkv=provider in ["ort_gqa_packed", "ort_gqa_local_packed"],
             use_smooth_softmax=use_smooth_softmax,
             device=device,
+            dtype=torch.float16 if dtype == "float16" else torch.bfloat16,
         )
 
         obj = OrtGroupQueryAttention(config)
@@ -168,7 +178,7 @@
     benchmark.run(save_path=".", print_data=True)
 
 
-def run_performance_test(sm: int):
+def run_performance_test(sm: int, fast: bool = False):
     """
     Run performance tests for prompt and token generation.
 
@@ -177,7 +187,7 @@ def run_performance_test(sm: int):
     memory_in_gb = torch.cuda.get_device_properties(device_id).total_memory / (1024 * 1024 * 1024)
 
     # Note: some models use bf16.
-    # We use fp16 for all models in this test since bf16 is not supported in ORT python API.
+    # We use fp16/bf16 for all models in this test.
     configures = [
         (32, 128, 8, 8192, None, "Llama3-8B"),
         (64, 128, 8, 8192, None, "Llama3-70B"),
@@ -188,34 +198,43 @@
         (40, 128, 10, 131072, None, "Phi-3-medium-128K"),
     ]
 
+    if fast:
+        configures = configures[:1]
+    batch_sizes = [1] if fast else [1, 4]
+    smooth_softmax_options = [False] if fast else [False, True]
+    dtypes = ["float16", "bfloat16"]
+
     # Reduce max sequence length when GPU memory is not enough.
     threshold = 131072 if memory_in_gb > 24 else 65536 if memory_in_gb > 12 else 32768
 
     for num_heads, head_size, kv_num_heads, max_seq_len, local_window_size, model_name in configures:
-        for batch_size in [1, 4]:
-            for use_smooth_softmax in [False, True]:
-                plot_prompt_performance(
-                    sm=sm,
-                    batch_size=batch_size,
-                    num_heads=num_heads,
-                    kv_num_heads=kv_num_heads,
-                    head_size=head_size,
-                    max_seq_len=min(threshold, max_seq_len),
-                    local_window_size=local_window_size,
-                    use_smooth_softmax=use_smooth_softmax,
-                    model_name=model_name,
-                )
-                plot_token_performance(
-                    sm=sm,
-                    batch_size=batch_size,
-                    num_heads=num_heads,
-                    kv_num_heads=kv_num_heads,
-                    head_size=head_size,
-                    max_seq_len=min(threshold, max_seq_len),
-                    local_window_size=local_window_size,
-                    use_smooth_softmax=use_smooth_softmax,
-                    model_name=model_name,
-                )
+        for batch_size in batch_sizes:
+            for use_smooth_softmax in smooth_softmax_options:
+                for dtype in dtypes:
+                    plot_prompt_performance(
+                        sm=sm,
+                        batch_size=batch_size,
+                        num_heads=num_heads,
+                        kv_num_heads=kv_num_heads,
+                        head_size=head_size,
+                        max_seq_len=min(threshold, max_seq_len),
+                        local_window_size=local_window_size,
+                        use_smooth_softmax=use_smooth_softmax,
+                        model_name=model_name,
+                        dtype=dtype,
+                    )
+                    plot_token_performance(
+                        sm=sm,
+                        batch_size=batch_size,
+                        num_heads=num_heads,
+                        kv_num_heads=kv_num_heads,
+                        head_size=head_size,
+                        max_seq_len=min(threshold, max_seq_len),
+                        local_window_size=local_window_size,
+                        use_smooth_softmax=use_smooth_softmax,
+                        model_name=model_name,
+                        dtype=dtype,
+                    )
 
 
 if __name__ == "__main__":
@@ -224,4 +243,4 @@ def run_performance_test(sm: int):
 
     s = torch.cuda.Stream()
     with torch.cuda.stream(s), torch.no_grad():
-        run_performance_test(sm)
+        run_performance_test(sm, fast=True)
```
