Fixes MX formats build for blackwell (#2278)

drisspg · syed-ahmed · web-flow · commit c4250a4bfa49 · 2025-05-30T11:49:30.000-07:00
* Fixes MX formats build for blackwell

* Adds missing line

* Adds missing line

---------

Co-authored-by: Syed Tousif Ahmed <syeahmed@nvidia.com>
diff --git a/setup.py b/setup.py
@@ -424,6 +424,9 @@ def get_extensions():
 
     use_cutlass = False
     cutlass_90a_sources = None
+    cutlass_100a_sources = None
+    build_for_sm90a = False
+    build_for_sm100a = False
     if use_cuda and not IS_WINDOWS:
         use_cutlass = True
         cutlass_dir = os.path.join(third_party_path, "cutlass")
@@ -453,32 +456,47 @@ def get_extensions():
         )
 
         cuda_arch_flags = _get_cuda_arch_flags()
-        build_for_sm90 = "-gencode=arch=compute_90,code=sm_90" in cuda_arch_flags
         build_for_sm90a = "-gencode=arch=compute_90a,code=sm_90a" in cuda_arch_flags
-        if build_for_sm90 and not build_for_sm90a:
-            cutlass_90a_sources = [
+        build_for_sm100a = "-gencode=arch=compute_100a,code=sm_100a" in cuda_arch_flags
+        # Define sm90a sources
+        cutlass_90a_sources = [
+            os.path.join(
+                extensions_cuda_dir,
+                "rowwise_scaled_linear_sparse_cutlass",
+                "rowwise_scaled_linear_sparse_cutlass_f8f8.cu",
+            ),
+            os.path.join(
+                extensions_cuda_dir,
+                "to_sparse_semi_structured_cutlass_sm9x",
+                "to_sparse_semi_structured_cutlass_sm9x_f8.cu",
+            ),
+            os.path.join(extensions_cuda_dir, "activation24", "sparsify24.cu"),
+            os.path.join(extensions_cuda_dir, "activation24", "sparse_gemm.cu"),
+        ]
+        for dtypes in ["e4m3e4m3", "e4m3e5m2", "e5m2e4m3", "e5m2e5m2"]:
+            cutlass_90a_sources.append(
                 os.path.join(
                     extensions_cuda_dir,
                     "rowwise_scaled_linear_sparse_cutlass",
-                    "rowwise_scaled_linear_sparse_cutlass_f8f8.cu",
-                ),
-                os.path.join(
-                    extensions_cuda_dir,
-                    "to_sparse_semi_structured_cutlass_sm9x",
-                    "to_sparse_semi_structured_cutlass_sm9x_f8.cu",
-                ),
-                os.path.join(extensions_cuda_dir, "activation24", "sparsify24.cu"),
-                os.path.join(extensions_cuda_dir, "activation24", "sparse_gemm.cu"),
-            ]
-            for dtypes in ["e4m3e4m3", "e4m3e5m2", "e5m2e4m3", "e5m2e5m2"]:
-                cutlass_90a_sources.append(
-                    os.path.join(
-                        extensions_cuda_dir,
-                        "rowwise_scaled_linear_sparse_cutlass",
-                        "rowwise_scaled_linear_sparse_cutlass_" + dtypes + ".cu",
-                    )
+                    "rowwise_scaled_linear_sparse_cutlass_" + dtypes + ".cu",
                 )
-            sources = [s for s in sources if s not in cutlass_90a_sources]
+            )
+        # Always remove sm90a sources from main sources
+        sources = [s for s in sources if s not in cutlass_90a_sources]
+
+        # Always compile mx_fp_cutlass_kernels.cu ONLY with sm100a architecture
+        cutlass_100a_sources = [
+            os.path.join(
+                extensions_cuda_dir,
+                "mx_kernels",
+                "mx_fp_cutlass_kernels.cu",
+            ),
+        ]
+        # Remove from main sources to prevent compilation with other architectures
+        sources = [
+            s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
+        ]
+
     else:
         # Remove CUTLASS-based kernels from the sources list.  An
         # assumption is that these files will have "cutlass" in its
@@ -492,6 +510,11 @@ def get_extensions():
 
     ext_modules = []
     if len(sources) > 0:
+        # Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
+        sources = [
+            s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
+        ]
+
         ext_modules.append(
             extension(
                 "torchao._C",
@@ -502,21 +525,48 @@ def get_extensions():
             )
         )
 
-    if cutlass_90a_sources is not None and len(cutlass_90a_sources) > 0:
+    # Only build the cutlass_90a extension if sm90a is in the architecture flags
+    if (
+        cutlass_90a_sources is not None
+        and len(cutlass_90a_sources) > 0
+        and build_for_sm90a
+    ):
         cutlass_90a_extra_compile_args = copy.deepcopy(extra_compile_args)
-        cutlass_90a_extra_compile_args["nvcc"].extend(
-            cuda_arch_flags + ["-gencode=arch=compute_90a,code=sm_90a"]
+        # Only use sm90a architecture for these sources, ignoring other flags
+        cutlass_90a_extra_compile_args["nvcc"].append(
+            "-gencode=arch=compute_90a,code=sm_90a"
         )
         ext_modules.append(
             extension(
-                "torchao._C",
+                "torchao._C_cutlass_90a",
                 cutlass_90a_sources,
                 py_limited_api=True,
                 extra_compile_args=cutlass_90a_extra_compile_args,
                 extra_link_args=extra_link_args,
             )
         )
 
+    # Only build the cutlass_100a extension if sm100a is in the architecture flags
+    if (
+        cutlass_100a_sources is not None
+        and len(cutlass_100a_sources) > 0
+        and build_for_sm100a
+    ):
+        cutlass_100a_extra_compile_args = copy.deepcopy(extra_compile_args)
+        # Only use sm100a architecture for these sources, ignoring cuda_arch_flags
+        cutlass_100a_extra_compile_args["nvcc"].append(
+            "-gencode=arch=compute_100a,code=sm_100a"
+        )
+        ext_modules.append(
+            extension(
+                "torchao._C_cutlass_100a",
+                cutlass_100a_sources,
+                py_limited_api=True,
+                extra_compile_args=cutlass_100a_extra_compile_args,
+                extra_link_args=extra_link_args,
+            )
+        )
+
     # Build CMakeLists from /torchao/experimental - additional options become available : TORCHAO_BUILD_CPU_AARCH64, TORCHAO_BUILD_KLEIDIAI, TORCHAO_BUILD_MPS_OPS, TORCHAO_PARALLEL_BACKEND
     if build_macos_arm_auto or os.getenv("BUILD_TORCHAO_EXPERIMENTAL") == "1":
         build_options = BuildOptions()
diff --git a/third_party/cutlass b/third_party/cutlass
@@ -1 +1 @@
-Subproject commit afa1772203677c5118fcd82537a9c8fefbcc7008
+Subproject commit ad7b2f5e84fcfa124cb02b91d5bd26d238c0459e
diff --git a/torchao/__init__.py b/torchao/__init__.py
@@ -25,8 +25,8 @@
 
     so_files = list(Path(__file__).parent.glob("_C*.so"))
     if len(so_files) > 0:
-        assert len(so_files) == 1, f"Expected one _C*.so file, found {len(so_files)}"
-        torch.ops.load_library(str(so_files[0]))
+        for file in so_files:
+            torch.ops.load_library(str(file))
         from . import ops
 
     # The following library contains CPU kernels from torchao/experimental
diff --git a/torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels.cu b/torchao/csrc/cuda/mx_kernels/mx_fp_cutlass_kernels.cu
@@ -3,6 +3,11 @@
 //
 // This source code is licensed under the BSD 3-Clause license found in the
 // LICENSE file in the root directory of this source tree.
+
+// Ensure this file is only compiled with sm100a architecture
+#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ < 1000)
+#error "This file must be compiled with compute capability 10.0a or higher (Blackwell architecture)"
+#endif
 #include <torch/library.h>
 
 #include <ATen/ATen.h>