
Commit ecf164a

Reduce GQA cpu test combinations (#26897)
The current testing strategy for GQA on CPU runs a Cartesian product of all configuration parameters (batch size, sequence length, rotary embeddings, packed KV, softcap, etc.), producing over 2000 test combinations. This causes significant runtime overhead and potential timeouts.

This PR optimizes `test_gqa_cpu.py` by:

- Replacing the nested loops over all parameters with a round-robin selection strategy (`combo_index`).
- Significantly reducing the number of test cases (from ~2304 to ~32 in pipeline mode) while maintaining coverage of each individual feature (rotary, packed, softcap, etc.).

This keeps the test suite robust while making it much faster: test time drops from minutes to seconds, saving a significant amount of compute in the CI pipeline.
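To see the round-robin idea in isolation, here is a minimal standalone Python sketch with hypothetical, much smaller option lists than the real test uses (this is not code from `test_gqa_cpu.py` itself):

```python
from itertools import product

# Hypothetical option lists for illustration; the real test sweeps many more.
batches = [1, 3]
seqs = [(127, 127), (240, 240)]
local_opts = [False, True]
softcap_opts = [0.0, 50.0]

# Old approach: full Cartesian product -> 2 * 2 * 2 * 2 = 16 cases.
cartesian = list(product(batches, seqs, local_opts, softcap_opts))

# New approach: one shared counter cycles each secondary option list,
# so every primary (batch, seq) combination runs exactly once -> 4 cases.
combo_index = 0
round_robin = []
for b in batches:
    for s, s2 in seqs:
        local = local_opts[combo_index % len(local_opts)]
        softcap = softcap_opts[combo_index % len(softcap_opts)]
        combo_index += 1
        round_robin.append((b, (s, s2), local, softcap))

print(len(cartesian), len(round_robin))  # 16 4
```

One trade-off worth noting: option lists of equal length cycle in lockstep (above, `local` and `softcap` always change together), whereas lists of coprime lengths, such as the three-entry rotary list against the two-entry lists in this change, drift apart and exercise more pairings over the loop.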
1 parent 0d59f8d commit ecf164a

File tree

1 file changed: +93 −90 lines


onnxruntime/test/python/transformers/test_gqa_cpu.py

Lines changed: 93 additions & 90 deletions
@@ -2491,90 +2491,105 @@ def run_test_config(
             print(
                 f"\nRunning tests with precision: {'FLOAT16' if precision['ort_type'] == TensorProto.FLOAT16 else 'FLOAT32'}"
             )
+            local_opts = [additional_params["local"]] if "local" in additional_params else [False, True]
+            rotary_opts = (
+                [(additional_params["rotary"], additional_params["rotary_interleaved"])]
+                if "rotary" in additional_params
+                else [(False, False), (True, False), (True, True)]
+            )
+            packed_opts = [additional_params["packed"]] if "packed" in additional_params else [False, True]
+            softcap_opts = [additional_params["softcap"]] if "softcap" in additional_params else [0.0, 50.0]
+            smooth_opts = (
+                [additional_params["use_smooth_softmax"]]
+                if "use_smooth_softmax" in additional_params
+                else [False, True]
+            )
+            head_sink_opts = [additional_params["head_sink"]] if "head_sink" in additional_params else [False, True]
+
+            combo_index = 0
             for b in batches:
                 for s, s2 in seqs:
                     for n, n2 in num_h:
                         for h in h_sizes:
-                            for local in [False, True]:
-                                for rotary, rotary_interleaved in [(False, False), (True, False), (True, True)]:
-                                    for packed in [False, True]:
-                                        for softcap in [0.0, 50.0]:
-                                            for use_smooth_softmax in [False, True]:
-                                                for has_pos, has_attn in pos_ids_attn_bias:
-                                                    for head_sink in [False, True]:
-                                                        if use_smooth_softmax and head_sink:
-                                                            continue
-                                                        for output_qk in qk_output:
-                                                            if config_class == PromptConfig:
-                                                                config = config_class(
-                                                                    b,
-                                                                    s,
-                                                                    s2,
-                                                                    s + s2 + 8,
-                                                                    n,
-                                                                    n2,
-                                                                    h,
-                                                                    has_pos,
-                                                                    has_attn,
-                                                                    head_sink,
-                                                                    output_qk,
-                                                                )
-                                                            else:  # Config
-                                                                sp = random.randint(1, s2 - s) if s2 - s > 0 else 0
-                                                                config = config_class(
-                                                                    b,
-                                                                    s,
-                                                                    s2,
-                                                                    sp,
-                                                                    n,
-                                                                    n2,
-                                                                    h,
-                                                                    has_pos,
-                                                                    has_attn,
-                                                                    head_sink,
-                                                                    output_qk,
-                                                                )
-
-                                                            params = {
-                                                                "config": config,
-                                                                "torch_type": precision["torch_type"],
-                                                                "numpy_type": precision["numpy_type"],
-                                                                "ort_type": precision["ort_type"],
-                                                                "rtol": precision["rtol"],
-                                                                "atol": precision["atol"],
-                                                                "local": local,
-                                                                "past_format": Formats.BNSH,
-                                                                "rotary": rotary,
-                                                                "rotary_interleaved": rotary_interleaved,
-                                                                "packed": packed,
-                                                                "softcap": softcap,
-                                                                "use_smooth_softmax": use_smooth_softmax,
-                                                            }
-                                                            params.update(additional_params)
-
-                                                            all_close = test_func(**params)
-                                                            self.assertTrue(all_close)
+                            local = local_opts[combo_index % len(local_opts)]
+                            rotary, rotary_interleaved = rotary_opts[combo_index % len(rotary_opts)]
+                            packed = packed_opts[combo_index % len(packed_opts)]
+                            softcap = softcap_opts[combo_index % len(softcap_opts)]
+                            use_smooth_softmax = smooth_opts[combo_index % len(smooth_opts)]
+
+                            has_pos, has_attn = pos_ids_attn_bias[combo_index % len(pos_ids_attn_bias)]
+                            head_sink = head_sink_opts[combo_index % len(head_sink_opts)]
+                            output_qk = qk_output[combo_index % len(qk_output)]
+
+                            combo_index += 1
+
+                            if rotary and h % 16 != 0:  # rotary requires head_size to be a multiple of 16
+                                continue
+
+                            if use_smooth_softmax and head_sink:
+                                continue
+                            if config_class == PromptConfig:
+                                config = config_class(
+                                    b,
+                                    s,
+                                    s2,
+                                    s + s2 + 8,
+                                    n,
+                                    n2,
+                                    h,
+                                    has_pos,
+                                    has_attn,
+                                    head_sink,
+                                    output_qk,
+                                )
+                            else:  # Config
+                                sp = random.randint(1, s2 - s) if s2 - s > 0 else 0
+                                config = config_class(
+                                    b,
+                                    s,
+                                    s2,
+                                    sp,
+                                    n,
+                                    n2,
+                                    h,
+                                    has_pos,
+                                    has_attn,
+                                    head_sink,
+                                    output_qk,
+                                )
+
+                            params = {
+                                "config": config,
+                                "torch_type": precision["torch_type"],
+                                "numpy_type": precision["numpy_type"],
+                                "ort_type": precision["ort_type"],
+                                "rtol": precision["rtol"],
+                                "atol": precision["atol"],
+                                "local": local,
+                                "past_format": Formats.BNSH,
+                                "rotary": rotary,
+                                "rotary_interleaved": rotary_interleaved,
+                                "packed": packed,
+                                "softcap": softcap,
+                                "use_smooth_softmax": use_smooth_softmax,
+                            }
+                            params.update(additional_params)
+
+                            all_close = test_func(**params)
+                            self.assertTrue(all_close)
 
     def test_gqa_no_past(self):
         print("-------- TEST GQA NO PAST (PROMPT CASE) ---------")
-        batches = [3] if pipeline_mode else [1, 3, 5]
+        batches = [1, 3] if pipeline_mode else [1, 3, 5]
         seqs = (
             [(127, 127), (240, 240)]
             if pipeline_mode
             else [(127, 127), (35, 35), (2000, 2000), (200, 200), (240, 240), (8000, 8000)]
         )
-        pos_ids_attn_bias = (
-            [(False, False), (True, True)]
-            if pipeline_mode
-            else [(False, False), (True, True), (False, True), (True, False)]
-        )
+        pos_ids_attn_bias = [(False, False), (True, True), (False, True), (True, False)]
         num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)]
-        h_sizes = [128] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256]
-        qk_output = (
-            [QKOutputType.NO_OUTPUT]
-            if pipeline_mode
-            else [QKOutputType.NO_OUTPUT, QKOutputType.BEFORE_SOFTMAX, QKOutputType.AFTER_SOFTMAX]
-        )
+        h_sizes = [40, 128] if pipeline_mode else [32, 48, 64, 80, 96, 128, 160, 192, 224, 256]
+        qk_output = [QKOutputType.NO_OUTPUT, QKOutputType.BEFORE_SOFTMAX, QKOutputType.AFTER_SOFTMAX]
 
         # Test with buffer
         self.run_test_config(
@@ -2601,24 +2616,16 @@ def test_gqa_no_past(self):
 
     def test_gqa_past(self):
         print("-------- TEST GQA PAST (TOKEN GEN) ---------")
-        batches = [1] if pipeline_mode else [1, 3, 5]
+        batches = [1, 3] if pipeline_mode else [1, 3, 5]
         seqs = (
             [(1, 128)]
             if pipeline_mode
             else [(1, 128), (1, 339), (1, 1024), (1, 5000), (1, 800), (1, 256), (1, 799), (1, 2048)]
         )
-        pos_ids_attn_bias = (
-            [(False, False), (True, True)]
-            if pipeline_mode
-            else [(False, False), (True, True), (False, True), (True, False)]
-        )
+        pos_ids_attn_bias = [(False, False), (True, True), (False, True), (True, False)]
         num_h = [(9, 3)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)]
-        h_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256]
-        qk_output = (
-            [QKOutputType.NO_OUTPUT]
-            if pipeline_mode
-            else [QKOutputType.NO_OUTPUT, QKOutputType.BEFORE_SOFTMAX, QKOutputType.AFTER_SOFTMAX]
-        )
+        h_sizes = [64, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256]
+        qk_output = [QKOutputType.NO_OUTPUT, QKOutputType.BEFORE_SOFTMAX, QKOutputType.AFTER_SOFTMAX]
 
         # Test with buffer
         self.run_test_config(parity_check_gqa_past, Config, batches, seqs, num_h, h_sizes, pos_ids_attn_bias, qk_output)
@@ -2638,18 +2645,14 @@ def test_gqa_interactive_one_batch(self):
         print("-------- TEST GQA INTERACTIVE ---------")
         batches = [1]
         seqs = (
-            [(256, 2048)]
+            [(256, 2048), (1, 128)]
             if pipeline_mode
             else [(1, 128), (1, 339), (1, 1024), (1, 5000), (1, 800), (1, 256), (1, 799), (1, 2048)]
         )
-        pos_ids_attn_bias = (
-            [(False, False), (True, True)]
-            if pipeline_mode
-            else [(False, False), (True, True), (False, True), (True, False)]
-        )
+        pos_ids_attn_bias = [(False, False), (True, True), (False, True), (True, False)]
         qk_output = [QKOutputType.NO_OUTPUT, QKOutputType.BEFORE_SOFTMAX, QKOutputType.AFTER_SOFTMAX]
         num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)]
-        h_sizes = [32] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256]
+        h_sizes = [32, 80] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256]
 
         # Only test softcap=0.0 for interactive case as per original
         self.run_test_config(
