@@ -26,17 +26,21 @@ def _assert_cosine_similarity(
2626 if use_float :
2727 reference = reference .float ()
2828 result = result .float ()
29+
30+ # Check cosine similarity between reference and result
2931 cos_sim = F .cosine_similarity (
3032 reference .reshape (- 1 ), result .reshape (- 1 ), dim = 0
3133 ).item ()
34+
3235 if context :
3336 message = (
3437 f"{ context } Cosine similarity { cos_sim :.4f} is too low "
35- f"(expected > { min_cos_sim } )."
38+ f"(expected > { min_cos_sim } , { is_sf_swizzled_layout = } )."
3639 )
3740 else :
3841 message = (
39- f"Cosine similarity { cos_sim :.4f} is too low (expected > { min_cos_sim } )"
42+ f"Cosine similarity { cos_sim :.4f} is too low "
43+ f"(expected > { min_cos_sim } , { is_sf_swizzled_layout = } )."
4044 )
4145 assert cos_sim > min_cos_sim , message
4246 return cos_sim
@@ -113,7 +117,7 @@ def _prepare_mxfp8_tensors(input_bf16, weight_bf16, is_sf_swizzled_layout):
113117
114118@pytest .mark .parametrize ("m" , [128 , 256 , 512 , 1024 ])
115119@pytest .mark .parametrize ("n" , [128 , 256 , 512 , 1024 ])
116- @pytest .mark .parametrize ("k" , [128 , 256 , 512 , 1024 , 2048 , 2560 ])
120+ @pytest .mark .parametrize ("k" , [128 , 256 , 512 , 1024 , 2048 , 2560 , 3200 ])
117121@pytest .mark .parametrize ("is_sf_swizzled_layout" , [True , False ])
118122@pytest .mark .parametrize ("input_dtype" , [torch .bfloat16 ])
119123@pytest .mark .parametrize ("out_dtype" , [torch .bfloat16 , torch .float16 ])
@@ -136,7 +140,7 @@ def test_mm_mxfp8(
136140
137141
138142@pytest .mark .parametrize ("m" , [128 , 256 , 512 , 1024 , 2048 , 4096 ])
139- @pytest .mark .parametrize ("n" , [4096 , 8192 , 12288 , 16384 ])
143+ @pytest .mark .parametrize ("n" , [2688 , 4096 , 5376 , 8192 , 12288 , 16384 ])
140144@pytest .mark .parametrize ("k" , [4096 , 8192 ])
141145@pytest .mark .parametrize ("is_sf_swizzled_layout" , [True , False ])
142146@pytest .mark .parametrize ("input_dtype" , [torch .bfloat16 ])
@@ -158,6 +162,30 @@ def test_mm_mxfp8_large_dimensions(
158162 )
159163
160164
@pytest.mark.parametrize(
    "m,n,k",
    [
        (32, 4096, 4096),
        (32, 2688, 1856),
        (32, 1856, 2688),
        (32, 2688, 4096),
        (32, 5376, 4096),
    ],
)
def test_mm_mxfp8_small_m(m, n, k):
    """Exercise the MXFP8 matmul path on small-M problem shapes.

    Runs the shared ``_run_mm_mxfp8`` harness with bf16 in/out, the
    cutlass backend, a caller-provided output buffer, and auto-tuning
    disabled, using the swizzled scale-factor layout.
    """
    # Positional harness arguments: (m, n, k, input_dtype,
    # is_sf_swizzled_layout, out_dtype, backend).
    # is_sf_swizzled_layout=True — swizzled scales are the intended fast path.
    gemm_args = (m, n, k, torch.bfloat16, True, torch.bfloat16, "cutlass")
    _run_mm_mxfp8(*gemm_args, auto_tuning=False, provide_out=True)
187+
188+
161189def _skip_if_unsupported ():
162190 compute_capability = get_compute_capability (torch .device ("cuda" ))
163191 if compute_capability [0 ] in [11 , 12 ]:
0 commit comments