6 changes: 3 additions & 3 deletions csrc/trtllm_fused_moe_kernel_launcher.cu
@@ -414,7 +414,7 @@ void FusedMoeLauncher::init_common(
int major = 0, minor = 0;
cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device);
cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device);
TVM_FFI_ICHECK_EQ(major, 10) << "MoE kernel requires 10.x architecture. Current device has SM "
TVM_FFI_ICHECK(major == 10 || major == 12) << "MoE kernel requires SM 10.x or SM 12.x architecture. Current device has SM "
Contributor comment: Maybe 11 (Thor) also works? It should be very similar to 10.x.

<< major << minor;
this->device_version = std::make_tuple(major, minor);

@@ -1342,8 +1342,8 @@ class FP4BlockScaleLauncher : public FusedMoeLauncher {
return std::make_tuple(major, minor);
}();

TVM_FFI_ICHECK_EQ(std::get<0>(device_props), 10)
<< "This kernel requires 10.x architecture. Current device has SM "
TVM_FFI_ICHECK(std::get<0>(device_props) == 10 || std::get<0>(device_props) == 12)
<< "This kernel requires SM 10.x or SM 12.x architecture. Current device has SM "
<< std::get<0>(device_props) << std::get<1>(device_props);

// Set data types
43 changes: 39 additions & 4 deletions flashinfer/compilation_context.py
@@ -30,20 +30,55 @@ class CompilationContext:
"-DFLASHINFER_ENABLE_FP4_E2M1",
]

@staticmethod
def _normalize_cuda_arch(major: int, minor: int) -> tuple[int, str]:
"""Normalize a (major, minor) capability pair into a (major, minor_str)
tuple with the correct architecture suffix for nvcc.

SM 9.x -> 'a' suffix (e.g. compute_90a)
SM 12.x -> 'f' suffix (e.g. compute_120f) when the installed CUDA
toolchain supports it (CUDA >= 13.0), otherwise 'a'.
SM 10+ -> 'a' suffix (e.g. compute_100a)
SM < 9 -> no suffix
"""
if major == 9:
return (major, str(minor) + "a")
elif major == 12:
Member comment: One last nit after discussing with @nv-yunzheq; I think we also want to compile DGX Spark (sm121) to sm120f, instead of 121f. Can we change the logic to always return major + "0f" (or just 120f) on line 50?

try:
from flashinfer.jit.cpp_ext import is_cuda_version_at_least
if is_cuda_version_at_least("13.0"):
return (major, str(minor) + "f")
except ImportError:
logger.debug(
"Could not import is_cuda_version_at_least; "
"falling back to 'a' suffix for SM %d.%d", major, minor
)
return (major, str(minor) + "a")
elif major >= 10:
return (major, str(minor) + "a")
return (major, str(minor))
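For illustration only (not part of the diff): the expected results of the normalization above, inferred from the docstring and code, assuming a CUDA >= 13.0 toolchain for the SM 12.x case.

    # Illustrative sketch; assumes CUDA >= 13.0 so SM 12.x gets the 'f' suffix.
    from flashinfer.compilation_context import CompilationContext

    assert CompilationContext._normalize_cuda_arch(8, 9) == (8, "9")     # no suffix below SM 9
    assert CompilationContext._normalize_cuda_arch(9, 0) == (9, "0a")    # compute_90a
    assert CompilationContext._normalize_cuda_arch(10, 0) == (10, "0a")  # compute_100a
    assert CompilationContext._normalize_cuda_arch(12, 0) == (12, "0f")  # compute_120f ("0a" on older CUDA)
    # Per the review comment above, SM 12.1 (DGX Spark) currently maps to "1f";
    # the suggestion is to normalize all SM 12.x devices to "0f" (compute_120f).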

def __init__(self):
self.TARGET_CUDA_ARCHS = set()
if "FLASHINFER_CUDA_ARCH_LIST" in os.environ:
for arch in os.environ["FLASHINFER_CUDA_ARCH_LIST"].split(" "):
major, minor = arch.split(".")
major = int(major)
self.TARGET_CUDA_ARCHS.add((int(major), str(minor)))
# If the user already provided a suffix (e.g. "12.0f"),
# respect it as-is; otherwise normalise.
if minor[-1].isalpha():
self.TARGET_CUDA_ARCHS.add((major, minor))
else:
self.TARGET_CUDA_ARCHS.add(
self._normalize_cuda_arch(major, int(minor))
)
else:
try:
for device in range(torch.cuda.device_count()):
major, minor = torch.cuda.get_device_capability(device)
if major >= 9:
minor = str(minor) + "a"
self.TARGET_CUDA_ARCHS.add((int(major), str(minor)))
self.TARGET_CUDA_ARCHS.add(
self._normalize_cuda_arch(major, minor)
)
except Exception as e:
logger.warning(f"Failed to get device capability: {e}.")
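A minimal usage sketch (illustrative, not part of the diff) of how FLASHINFER_CUDA_ARCH_LIST entries are interpreted by the logic above: entries with an explicit suffix are kept as-is, bare entries are normalized via _normalize_cuda_arch.

    import os
    from flashinfer.compilation_context import CompilationContext

    # Hypothetical example: one suffixed entry and two bare entries.
    os.environ["FLASHINFER_CUDA_ARCH_LIST"] = "9.0 10.0 12.0f"
    ctx = CompilationContext()
    # Expected (assuming CUDA >= 13.0): {(9, "0a"), (10, "0a"), (12, "0f")}
    print(ctx.TARGET_CUDA_ARCHS)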

6 changes: 3 additions & 3 deletions flashinfer/jit/fused_moe.py
@@ -59,7 +59,7 @@ def gen_cutlass_fused_moe_sm103_module(use_fast_build: bool = False) -> JitSpec:
]

nvcc_flags += current_compilation_context.get_nvcc_flags_list(
supported_major_versions=[10]
supported_major_versions=[10, 12]
)

return gen_cutlass_fused_moe_module(nvcc_flags, "103", use_fast_build)
@@ -76,7 +76,7 @@ def gen_cutlass_fused_moe_sm100_module(use_fast_build: bool = False) -> JitSpec:
]

nvcc_flags += current_compilation_context.get_nvcc_flags_list(
supported_major_versions=[10, 11]
supported_major_versions=[10, 11, 12]
)

return gen_cutlass_fused_moe_module(nvcc_flags, "100", use_fast_build)
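get_nvcc_flags_list is not shown in this diff; the following is a rough sketch of how the supported_major_versions filter might combine with the normalized arch set to produce -gencode flags. The function name, signature, and flag format here are assumptions for illustration, not the actual implementation.

    # Hypothetical sketch only; the real get_nvcc_flags_list lives in
    # CompilationContext and may differ. It presumably keeps only detected
    # arches whose major version is in supported_major_versions and emits
    # one -gencode flag per remaining arch.
    def sketch_nvcc_flags(target_cuda_archs, supported_major_versions):
        flags = []
        for major, minor in sorted(target_cuda_archs):
            if major in supported_major_versions:
                arch = f"{major}{minor}"  # e.g. (12, "0f") -> "120f"
                flags.append(f"-gencode=arch=compute_{arch},code=sm_{arch}")
        return flags

    # With SM 12.x now accepted, an SM 12.0 device normalized to (12, "0f")
    # would contribute a compute_120f target to these MoE builds.
    print(sketch_nvcc_flags({(10, "0a"), (12, "0f")}, [10, 12]))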
@@ -248,7 +248,7 @@ def gen_trtllm_gen_fused_moe_sm100_module() -> JitSpec:

# currently only support Blackwell
nvcc_flags = current_compilation_context.get_nvcc_flags_list(
supported_major_versions=[10]
supported_major_versions=[10, 12]
)

return gen_jit_spec(