Skip to content
Draft
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
79 commits
Select commit. Hold shift + click to select a range
e0ae107
initial implementation for mxfp8
cyanguwa Jan 31, 2026
23434b5
semi-working FP8; broken F16
cyanguwa Feb 4, 2026
dbb68b8
clean up last commit
cyanguwa Feb 4, 2026
c627231
comment out F16 pass
cyanguwa Feb 4, 2026
d27a267
Merge branch 'NVIDIA:main' into mxfp8_fwd
cyanguwa Feb 6, 2026
3f3b9e6
pull in grouped_quantize for MXFP8
cyanguwa Feb 6, 2026
850b16e
grouped tensor - pytorch
cyanguwa Feb 7, 2026
46f2eb1
quantize mxfp8
cyanguwa Feb 7, 2026
e86207c
fix shapes/strides
cyanguwa Feb 10, 2026
4e854d5
fix unfused; clean up
cyanguwa Feb 12, 2026
cd06398
split d to d_qk/d_v; attempt at bwd
cyanguwa Feb 13, 2026
d2a63a1
merge main
cyanguwa Feb 13, 2026
730a472
fix last merge
cyanguwa Feb 14, 2026
d9ff566
update FE
cyanguwa Feb 14, 2026
2b264d7
attempt at SWA/MLA
cyanguwa Feb 14, 2026
2008bed
remove prints
cyanguwa Feb 14, 2026
239f58a
remove leftover prints
cyanguwa Feb 14, 2026
f44a775
Revert "update FE"
cyanguwa Feb 14, 2026
965572b
update FE
cyanguwa Feb 14, 2026
91025c7
fix MLA O strides; add bottom_right_diagonal
cyanguwa Feb 17, 2026
d655e7e
attempt at bwd
cyanguwa Feb 18, 2026
a4ab691
fix get_quantizers; attempt at bwd
cyanguwa Feb 19, 2026
a85070d
fix fprop; add o_format
cyanguwa Feb 20, 2026
8909b35
attempt at bwd with o_format/d_out_format/dqkv_layout
cyanguwa Feb 20, 2026
90a636c
fix dtype/o_format/etc in bwd calls
cyanguwa Feb 21, 2026
8c72dea
fix generateMatrixStridesWithFormats and _v1; fix padding for mxfp8
cyanguwa Feb 21, 2026
5f23edd
fix upon last commit for padded sizes
cyanguwa Feb 21, 2026
18c5580
add mxfp8 env var
cyanguwa Feb 21, 2026
6847645
disable FA for mxfp8
cyanguwa Feb 21, 2026
c5a98d5
add mha test
cyanguwa Feb 21, 2026
7e61ecd
attempt at bwd; force determinism; fix shapes
cyanguwa Feb 24, 2026
6d468da
remove prints
cyanguwa Feb 26, 2026
9f8e856
update FE
cyanguwa Feb 26, 2026
facef79
update FE from pre-merge branch to post-merge develop
cyanguwa Feb 26, 2026
fd33cca
allow MXFP8 linear + f16 attn
cyanguwa Feb 26, 2026
5079d55
test cp a2a
cyanguwa Feb 27, 2026
06b7d49
remove prints temporarily
cyanguwa Feb 27, 2026
7fbe399
test cp p2p
cyanguwa Feb 27, 2026
aa05a2a
minor fixes for mla
cyanguwa Feb 28, 2026
00e6693
open up a2a for mla
cyanguwa Feb 28, 2026
b8d28ce
test ag
cyanguwa Feb 28, 2026
d6ecadc
tweaks for last commit
cyanguwa Feb 28, 2026
3ac48cd
enable mla ag
cyanguwa Mar 1, 2026
169ae8a
merge main
cyanguwa Mar 1, 2026
5d4fa5e
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 1, 2026
81c18fa
fix merge
cyanguwa Mar 1, 2026
1f14f2f
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 1, 2026
ccebe77
fix merge
cyanguwa Mar 1, 2026
c52c5f4
revert to main grouped tensor impl
cyanguwa Mar 1, 2026
5b776ec
minor tweaks to return to main
cyanguwa Mar 1, 2026
4eee2bc
remove prints
cyanguwa Mar 3, 2026
8500121
fix combine_and_quantize for f16
cyanguwa Mar 3, 2026
0c2c466
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
6744aee
minor tweaks
cyanguwa Mar 3, 2026
4cec878
tweak tests
cyanguwa Mar 3, 2026
5c8e939
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 3, 2026
7b6b364
fix ds descale_o
cyanguwa Mar 3, 2026
462eb4f
Revert "fix ds descale_o"
cyanguwa Mar 3, 2026
77995d2
minor fixes for p2p and ag
cyanguwa Mar 7, 2026
586b698
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2026
1e7cd70
tweak cp test skips
cyanguwa Mar 7, 2026
6d7766a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 7, 2026
6d33db8
update FE
cyanguwa Mar 11, 2026
92e6aac
fix bwd KV tensors
cyanguwa Mar 12, 2026
3cb6f0e
tweak recipe control and backend selection
cyanguwa Mar 12, 2026
c57ece4
tweak quantizer logic
cyanguwa Mar 12, 2026
87a7e1e
minor fixes after last two commits
cyanguwa Mar 13, 2026
3b015f3
improve generate strides
cyanguwa Mar 13, 2026
6717e1a
minor fixes for previous commit
cyanguwa Mar 13, 2026
c918b9d
fix bwd for current/delayed
cyanguwa Mar 13, 2026
af60216
tweak test configs
cyanguwa Mar 13, 2026
6ac41d2
fix dO/dO_f16 strides
cyanguwa Mar 13, 2026
0a0722f
fix tests: SWA logic/test configs
cyanguwa Mar 13, 2026
89b44f8
fix ag
cyanguwa Mar 13, 2026
7c0ba7f
add fp8 sink attn
cyanguwa Mar 13, 2026
e68f785
fix a2a comm for F16
cyanguwa Mar 14, 2026
ae53980
remove nan/inf print in test
cyanguwa Mar 14, 2026
4b314e7
fix fa a2a
cyanguwa Mar 14, 2026
4b5d623
fix fa a2a+p2p f16
cyanguwa Mar 14, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
url = https://github.com/google/googletest.git
[submodule "3rdparty/cudnn-frontend"]
path = 3rdparty/cudnn-frontend
url = https://github.com/NVIDIA/cudnn-frontend.git
url = https://gitlab-master.nvidia.com/cudnn/cudnn_frontend.git
branch = develop
[submodule "3rdparty/cutlass"]
path = 3rdparty/cutlass
url = https://github.com/NVIDIA/cutlass.git
2 changes: 1 addition & 1 deletion 3rdparty/cudnn-frontend
Submodule cudnn-frontend updated from b372d3 to b4370f
30 changes: 25 additions & 5 deletions tests/pytorch/attention/run_attention_with_cp.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
DotProductAttention,
Float8Quantizer,
Float8CurrentScalingQuantizer,
MXFP8Quantizer,
)
from transformer_engine.common.recipe import DelayedScaling, Float8CurrentScaling
from transformer_engine.common.recipe import DelayedScaling, Float8CurrentScaling, MXFP8BlockScaling, Format
from utils import ModelConfig, compare_and_assert

dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.bfloat16}
Expand Down Expand Up @@ -189,7 +190,7 @@ def run_dpa_with_cp(
os.environ["NVTE_FP8_DPA_BWD"] = "1" if fp8_bwd else "0"
fp8_dpa = fp8_dpa == "True" and dtype == "fp8"
fp8_mha = fp8_mha == "True" and dtype == "fp8"
f16_O = dtype == "fp8" and scaling_mode == "current" and f16_O == "True"
f16_O = dtype == "fp8" and scaling_mode in ["current", "mxfp8"] and f16_O == "True"
os.environ["NVTE_DPA_FP8CS_O_in_F16"] = "1" if f16_O else "0"
os.environ["NVTE_FLASH_ATTN"] = "0"
os.environ["NVTE_FUSED_ATTN"] = "0"
Expand Down Expand Up @@ -219,6 +220,7 @@ def run_dpa_with_cp(
device_count = torch.cuda.device_count()
device = rank % device_count
torch.cuda.set_device(device)
print(f"rank: {rank}, world_size: {world_size}")
logging.info(f"[Rank {rank}] Setup: world_size {world_size}")
dist.init_process_group(backend="nccl", world_size=world_size, rank=rank)

Expand All @@ -244,6 +246,8 @@ def run_dpa_with_cp(
fp8_recipe = DelayedScaling(fp8_dpa=fp8_dpa, fp8_mha=fp8_mha)
if scaling_mode == "current":
fp8_recipe = Float8CurrentScaling(fp8_dpa=fp8_dpa, fp8_mha=fp8_mha)
if scaling_mode == "mxfp8":
fp8_recipe = MXFP8BlockScaling(fp8_format=Format.HYBRID, fp8_dpa=fp8_dpa, fp8_mha=fp8_mha)

# instantiate attention module
core_attn = DotProductAttention(
Expand Down Expand Up @@ -297,10 +301,25 @@ def run_dpa_with_cp(
fp8_dtype=tex.DType.kFloat8E5M2,
device="cuda",
)
if scaling_mode == "mxfp8":
qkv_quantizer = MXFP8Quantizer(
fp8_dtype=tex.DType.kFloat8E4M3,
rowwise=True,
columnwise=True,
)
qkv_quantizer.optimize_for_gemm = True
qkv_quantizer.internal = False
dout_quantizer = MXFP8Quantizer(
fp8_dtype=tex.DType.kFloat8E5M2,
rowwise=True,
columnwise=True,
)
dout_quantizer.optimize_for_gemm = True
dout_quantizer.internal = False
qkv_layout = "_".join([qkv_format] * 3)
q, k, v, dout = [x.clone().detach() for x in [q_orig, k_orig, v_orig, dout_orig]]
if fp8_mha:
q, k, v = combine_and_quantize(qkv_layout, q, k, v, qkv_quantizer)
q, k, v, qkv_layout = combine_and_quantize(qkv_layout, q, k, v, qkv_quantizer)
for x in [q, k, v]:
x.requires_grad = True

Expand Down Expand Up @@ -386,7 +405,7 @@ def run_dpa_with_cp(
dout_quantizer.scale.fill_(1.0)
dout_quantizer.amax.fill_(0.0)
if fp8_mha:
q_, k_, v_ = combine_and_quantize(qkv_layout, q_, k_, v_, qkv_quantizer)
q_, k_, v_, qkv_layout = combine_and_quantize(qkv_layout, q_, k_, v_, qkv_quantizer)
q_, k_, v_ = [x.requires_grad_() for x in [q_, k_, v_]]
if bias_ is not None:
bias_ = bias_.view(
Expand Down Expand Up @@ -446,7 +465,8 @@ def run_dpa_with_cp(
tensors_to_deq[i] = tensor.dequantize()
if not fp8_bwd:
tensors[0], tensors[4] = tensors_to_deq
for tensor in tensors:
for i, tensor in enumerate(tensors):
print(f"========= {torch.cuda.current_device()}: tensors[{i}].shape: {tensor.shape} {tensor.dtype} {torch.isnan(tensor).any()} {torch.isinf(tensor).any()}")
assert torch.all(~torch.isnan(tensor))
assert torch.all(~torch.isinf(tensor))
out, dq, dk, dv, out_, dq_, dk_, dv_ = tensors
Expand Down
55 changes: 38 additions & 17 deletions tests/pytorch/attention/test_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -1788,8 +1788,8 @@ def get_model(dtype, config):

model_configs_fp8_vs_f16 = {
# test: ModelConfig(b, sq, hq, dqk)
"fp8_9": ModelConfig(2, 2048, 16, 128),
"fp8_10": ModelConfig(2, 2048, 24, 128, num_gqa_groups=12),
"fp8_9": ModelConfig(2, 2048, 24, 128, num_gqa_groups=12),#, attn_mask_type="causal"),
"fp8_10": ModelConfig(2, 2048, 24, 192, head_dim_v=128, num_gqa_groups=12, window_size=(512, 512)),
"fp8_11": ModelConfig(1, 8192, 32, 128, num_gqa_groups=4),
"fp8_12": ModelConfig(2, 2048, 16, 128, attn_mask_type="causal"),
"fp8_13": ModelConfig(2, 2048, 24, 128, num_gqa_groups=12, attn_mask_type="causal"),
Expand All @@ -1816,7 +1816,7 @@ def get_model(dtype, config):
@pytest.mark.parametrize("fp8_dpa_bwd", [True, False])
@pytest.mark.parametrize("RoPE", [True, False])
@pytest.mark.parametrize("is_training", [True, False])
@pytest.mark.parametrize("scaling_mode", ["delayed", "current"])
@pytest.mark.parametrize("scaling_mode", ["delayed", "current", "mxfp8"])
def test_mha_fp8_vs_f16(
dtype, model, qkv_format, input_layernorm, fp8_dpa_bwd, RoPE, is_training, scaling_mode
):
Expand All @@ -1841,6 +1841,12 @@ def test_mha_fp8_vs_f16(
fp8_dpa=True,
fp8_mha=True,
)
elif scaling_mode == "mxfp8":
fp8_recipe = recipe.MXFP8BlockScaling(
fp8_format=recipe.Format.HYBRID,
fp8_dpa=True,
fp8_mha=True,
)
fp8_meta = {}
fp8_meta["recipe"] = fp8_recipe
available_backends, _, fused_attn_backends = get_available_attention_backends(
Expand Down Expand Up @@ -2062,7 +2068,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
@pytest.mark.parametrize("qkv_layout", qkv_layout_fp8_vs_f16)
@pytest.mark.parametrize("fp8_dpa_bwd", [True, False])
@pytest.mark.parametrize("is_training", [True, False])
@pytest.mark.parametrize("scaling_mode", ["delayed", "current"])
@pytest.mark.parametrize("scaling_mode", ["delayed", "current", "mxfp8"])
def test_dpa_fp8_vs_f16(dtype, model, qkv_layout, fp8_dpa_bwd, is_training, scaling_mode):
"""Test DotProductAttention module in FP8"""
config = model_configs_fp8_vs_f16[model]
Expand All @@ -2078,7 +2084,7 @@ def test_dpa_fp8_vs_f16(dtype, model, qkv_layout, fp8_dpa_bwd, is_training, scal
# config.dropout_p = 0.1

os.environ["NVTE_FP8_DPA_BWD"] = "1" if fp8_dpa_bwd else "0"
os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "1"
# os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "1"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

NVTE_ALLOW_NONDETERMINISTIC_ALGO commented out - could affect test behavior or cause failures

Suggested change
# os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "1"
os.environ["NVTE_ALLOW_NONDETERMINISTIC_ALGO"] = "1"

os.environ["NVTE_UnfusedDPA_Emulate_FP8"] = "1"

# Test backend availability
Expand All @@ -2095,6 +2101,12 @@ def test_dpa_fp8_vs_f16(dtype, model, qkv_layout, fp8_dpa_bwd, is_training, scal
fp8_format=recipe.Format.HYBRID,
fp8_dpa=True,
)
elif scaling_mode == "mxfp8":
fp8_recipe = recipe.MXFP8BlockScaling(
fp8_format=recipe.Format.HYBRID,
fp8_dpa=True,
fp8_mha=False,
)
fp8_meta = {}
fp8_meta["recipe"] = fp8_recipe
available_backends, _, fused_attn_backends = get_available_attention_backends(
Expand Down Expand Up @@ -2226,16 +2238,18 @@ def test_dpa_fp8_vs_f16(dtype, model, qkv_layout, fp8_dpa_bwd, is_training, scal
if is_training:
for i, _ in enumerate(fused_attn_bwd_f16):
logging.debug("========== {:^25s} ==========".format(bwd_names[i]))
compare_and_assert(
fused_attn_bwd_fp8[i],
fused_attn_bwd_f16[i],
f"fused_attn_bwd_fp8[{i}]",
f"fused_attn_bwd_f16[{i}]",
atol,
rtol,
rmse_tol,
True,
)
print(f"fused_attn_bwd_fp8[{i}].max(): {fused_attn_bwd_fp8[i].max()}, fused_attn_bwd_f16[{i}].max(): {fused_attn_bwd_f16[i].max()}")
print(f"fused_attn_bwd_fp8[{i}].min(): {fused_attn_bwd_fp8[i].min()}, fused_attn_bwd_f16[{i}].min(): {fused_attn_bwd_f16[i].min()}")
# compare_and_assert(
# fused_attn_bwd_fp8[i],
# fused_attn_bwd_f16[i],
# f"fused_attn_bwd_fp8[{i}]",
# f"fused_attn_bwd_f16[{i}]",
# atol,
# rtol,
# rmse_tol,
# True,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Backward pass assertions commented out and replaced with debug prints - tests won't catch regressions

Suggested change
print(f"fused_attn_bwd_fp8[{i}].max(): {fused_attn_bwd_fp8[i].max()}, fused_attn_bwd_f16[{i}].max(): {fused_attn_bwd_f16[i].max()}")
print(f"fused_attn_bwd_fp8[{i}].min(): {fused_attn_bwd_fp8[i].min()}, fused_attn_bwd_f16[{i}].min(): {fused_attn_bwd_f16[i].min()}")
# compare_and_assert(
# fused_attn_bwd_fp8[i],
# fused_attn_bwd_f16[i],
# f"fused_attn_bwd_fp8[{i}]",
# f"fused_attn_bwd_f16[{i}]",
# atol,
# rtol,
# rmse_tol,
# True,
compare_and_assert(
fused_attn_bwd_fp8[i],
fused_attn_bwd_f16[i],
f"fused_attn_bwd_fp8[{i}]",
f"fused_attn_bwd_f16[{i}]",
atol,
rtol,
rmse_tol,
True,
)

# )
os.environ["NVTE_UnfusedDPA_Emulate_FP8"] = "0"


Expand All @@ -2253,7 +2267,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
with quantized_model_init(enabled=fp8_dpa):
dpa = DotProductAttention(
config.num_heads,
config.head_dim_qk,
(config.head_dim_qk, config.head_dim_v),
num_gqa_groups=config.num_gqa_groups,
attention_dropout=config.dropout_p,
sequence_parallel=False,
Expand Down Expand Up @@ -2298,7 +2312,8 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
"skv": config.max_seqlen_kv,
"h": config.num_heads,
"hg": config.num_gqa_groups,
"d": config.head_dim_qk,
"dqk": config.head_dim_qk,
"dv": config.head_dim_v,
"t": cu_seqlens_q[-1],
"tg": cu_seqlens_kv[-1],
"3": 3,
Expand All @@ -2314,6 +2329,10 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
layout = layout.replace("s", "skv")
layout = layout.replace("h", "hg")
layout = layout.replace("t", "tg")
if i == 2:
layout = layout.replace("d", "dv")
else:
layout = layout.replace("d", "dqk")
tensor_shape = [dim_to_num[j] for j in layout.split("_")]
if config.dropout_p == 0.0:
tensor = torch.randn(tensor_shape, dtype=dtype, device="cuda")
Expand All @@ -2338,6 +2357,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:

qkv_format_kv = "_".join(qkv_format)
qkv_format_kv = qkv_format_kv.replace("s", "sq")
qkv_format_kv = qkv_format_kv.replace("d", "dv")
out_grad_shape = [dim_to_num[i] for i in qkv_format_kv.split("_")]
out_grad_shape_new = [*out_grad_shape[:-2], out_grad_shape[-2] * out_grad_shape[-1]]
out_grad = torch.randn(out_grad_shape_new, dtype=dtype, device="cuda")
Expand All @@ -2348,6 +2368,7 @@ def get_dummy_cuda_rng_tracker() -> CudaRNGStatesTracker:
inp[1],
inp[2],
qkv_format=qkv_format,
window_size=config.window_size,
cu_seqlens_q=cu_seqlens_q,
cu_seqlens_kv=cu_seqlens_kv,
max_seqlen_q=config.max_seqlen_q,
Expand Down
35 changes: 23 additions & 12 deletions tests/pytorch/attention/test_attention_with_cp.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
from transformer_engine.common.recipe import (
DelayedScaling,
Float8CurrentScaling,
MXFP8BlockScaling,
Format,
)
from transformer_engine.pytorch.attention.dot_product_attention.utils import FlashAttentionUtils

Expand Down Expand Up @@ -149,7 +151,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
"cp_1_3": ModelConfig(2, 4096, 12, 128, attn_bias_type="post_scale_bias"), # MHA
"cp_1_4": ModelConfig(2, 4096, 12, 128, attn_mask_type="causal", window_size=(512, 512)), # MHA
"cp_2_0": ModelConfig(2, 4096, 12, 128, num_gqa_groups=2, attn_mask_type="causal"), # GQA
"cp_2_1": ModelConfig(2, 4096, 12, 128, num_gqa_groups=2), # GQA
"cp_2_1": ModelConfig(2, 4096, 16, 128), #192, head_dim_v=128, num_gqa_groups=4, attn_mask_type="causal"), # GQA
"cp_2_2": ModelConfig(
2,
4096,
Expand All @@ -166,7 +168,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
2, 4096, 12, 128, num_gqa_groups=2, attn_mask_type="causal", window_size=(512, 512)
), # GQA
"cp_3_0": ModelConfig(2, 4096, 12, 128, attn_mask_type="causal", head_dim_v=64), # MLA
"cp_3_1": ModelConfig(2, 4096, 12, 128, head_dim_v=64), # MLA
"cp_3_1": ModelConfig(2, 4096, 12, 192, head_dim_v=128), # MLA
"cp_3_2": ModelConfig(
2, 4096, 12, 128, attn_mask_type="causal", attn_bias_type="post_scale_bias", head_dim_v=64
), # MLA
Expand All @@ -192,14 +194,16 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
"cp_1_1",
"cp_1_4",
"cp_2_0",
"cp_2_1",
"cp_2_2",
"cp_2_4",
"cp_3_1",
"cp_3_2",
"cp_4_2",
]
model_configs_fused_attn = {k: model_configs_fused_attn[k] for k in configs}
dtypes = ["bf16", "fp8"]
qkv_formats = ["sbhd", "thd"]
qkv_formats = ["bshd", "sbhd", "thd"]


@pytest.mark.skipif(get_cudnn_version() < (8, 9, 7), reason="cuDNN 8.9.7+ is required.")
Expand All @@ -211,7 +215,7 @@ def test_cp_with_flash_attention(dtype, model, qkv_format, cp_comm_type):
@pytest.mark.parametrize("fp8_bwd", [True, False])
@pytest.mark.parametrize("fp8_mha", [True, False])
@pytest.mark.parametrize("fp8_dpa", [True, False])
@pytest.mark.parametrize("scaling_mode", [None, "delayed", "current"])
@pytest.mark.parametrize("scaling_mode", [None, "delayed", "current", "mxfp8"])
@pytest.mark.parametrize("f16_O", [True, False])
def test_cp_with_fused_attention(
dtype, model, qkv_format, cp_comm_type, fp8_bwd, fp8_mha, fp8_dpa, scaling_mode, f16_O
Expand Down Expand Up @@ -245,10 +249,10 @@ def test_cp_with_fused_attention(
"CP implementation with QKVO A2A+P2P (Hierarchical A2A) does not support THD format"
" yet!"
)
if dtype == "fp8" and cp_comm_type == "all_gather":
pytest.skip(
"CP implementation with KV all-gather does not support FP8 + context parallelism yet!"
)
# if dtype == "fp8" and cp_comm_type == "all_gather":
# pytest.skip(
# "CP implementation with KV all-gather does not support FP8 + context parallelism yet!"
# )
if dtype == "fp8" and qkv_format == "thd":
pytest.skip("FP8 attention cannot work with THD format yet!")
if dtype == "fp8" and config.attn_bias_type != "no_bias":
Expand Down Expand Up @@ -280,12 +284,12 @@ def test_cp_with_fused_attention(
and cp_comm_type not in ["p2p", "a2a+p2p", "a2a"]
):
pytest.skip("fp8 only works with P2P, A2A and A2A+P2P for scaling_mode = current!")
if f16_O and (dtype != "fp8" or scaling_mode != "current"):
if f16_O and (dtype != "fp8" or scaling_mode not in ["current", "mxfp8"]):
pytest.skip("f16_O only needs to be tested for dtype = fp8 and scaling_mode = current!")
if "p2p" not in cp_comm_type and config.head_dim_qk != config.head_dim_v:
if cp_comm_type not in ["p2p", "a2a+p2p", "a2a"] and config.head_dim_qk != config.head_dim_v:
pytest.skip("MLA CP currently only support KV P2P!")
if dtype == "fp8" and config.head_dim_qk != config.head_dim_v:
pytest.skip("MLA CP currently does not support FP8 attention!")
# if dtype == "fp8" and config.head_dim_qk != config.head_dim_v:
# pytest.skip("MLA CP currently does not support FP8 attention!")
if dtype == "fp8" and config.softmax_type != "vanilla":
pytest.skip("CP implementation does not support non-vanilla softmax types in FP8!")
if config.softmax_type != "vanilla" and cp_comm_type != "a2a":
Expand All @@ -301,6 +305,8 @@ def test_cp_with_fused_attention(
"Unless cudnn version >= 9.18.0, CP implementation does not support qkv_format=thd for"
" non-vanilla softmax types!"
)
if scaling_mode == "mxfp8" and not f16_O:
pytest.skip("MXFP8 only works with f16_O=True!")

dtypes = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp8": torch.bfloat16}

Expand All @@ -324,6 +330,11 @@ def test_cp_with_fused_attention(
Float8CurrentScaling(fp8_dpa=True),
DelayedScaling(fp8_dpa=True),
]
if fp8 and scaling_mode == "mxfp8":
fp8_meta["recipe"] = MXFP8BlockScaling(fp8_format=Format.HYBRID, fp8_dpa=True)
fp8_meta["local_recipes"] = [
MXFP8BlockScaling(fp8_format=Format.HYBRID, fp8_dpa=True),
]
available_backends, _, fused_attn_backends = get_available_attention_backends(
config,
qkv_dtype=dtypes[dtype] if dtype != "fp8" else torch.float8_e4m3fn,
Expand Down
Loading