[CUDA] Add build flag onnxruntime_USE_FPA_INTB_GEMM (#25802)

tianleiwu · web-flow · commit d34633377209 · 2025-08-21T12:50:13.000-07:00
### Description

Add a build flag to enable or disable the mixed GEMM (FpA IntB) CUTLASS kernels.

To disable the kernels, append the following to the end of the build
command line:
`--cmake_extra_defines onnxruntime_USE_FPA_INTB_GEMM=OFF`

### Motivation and Context

The FpA IntB GEMM kernels take a long time to compile. With this option, developers
can speed up the build, especially on build machines with limited memory.
diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -98,7 +98,8 @@ option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF)
 
 cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
-option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)
+cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
+cmake_dependent_option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" ON "onnxruntime_USE_CUDA" OFF)
 
 option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
 option(onnxruntime_USE_AVX "Use AVX instructions" OFF)
@@ -696,6 +697,7 @@ if (onnxruntime_USE_CUDA)
     set(onnxruntime_USE_FLASH_ATTENTION OFF)
     set(onnxruntime_USE_LEAN_ATTENTION OFF)
     set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
+    set(onnxruntime_USE_FPA_INTB_GEMM OFF)
   endif()
 
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.6)
@@ -708,6 +710,11 @@ if (onnxruntime_USE_CUDA)
     set(onnxruntime_USE_FLASH_ATTENTION OFF)
   endif()
 
+  if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
+    message( STATUS "FpA IntB Gemm unsupported for CUDA compiler version < 12.0")
+    set(onnxruntime_USE_FPA_INTB_GEMM OFF)
+  endif()
+
   if (WIN32)
     message( STATUS "Lean Attention unsupported in Windows")
     set(onnxruntime_USE_LEAN_ATTENTION OFF)
@@ -736,6 +743,11 @@ if (onnxruntime_USE_CUDA)
       message( STATUS "Enable memory efficient attention for CUDA EP")
       list(APPEND ORT_PROVIDER_FLAGS -DUSE_MEMORY_EFFICIENT_ATTENTION=1)
     endif()
+
+    if (onnxruntime_USE_FPA_INTB_GEMM)
+      message( STATUS "Enable FpA IntB Gemm for CUDA EP")
+      list(APPEND ORT_PROVIDER_FLAGS -DUSE_FPA_INTB_GEMM=1)
+    endif()
 endif()
 
 if (onnxruntime_USE_CUDA_INTERFACE AND (NOT onnxruntime_USE_CUDA))
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scale_zeros.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scale_zeros.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -24,3 +24,4 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t,
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int4_gemm_scaleonly.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -24,3 +24,4 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, cutlass::uint4b_t,
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scale_zeros.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scale_zeros.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -24,3 +24,4 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t,
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/bf16_int8_gemm_scaleonly.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -23,3 +23,4 @@ template class CutlassFpAIntBGemmRunner<__nv_bfloat16, uint8_t, cutlass::WeightO
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scale_zeros.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scale_zeros.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -24,3 +24,4 @@ template class CutlassFpAIntBGemmRunner<half, cutlass::uint4b_t,
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int4_gemm_scaleonly.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -23,3 +23,4 @@ template class CutlassFpAIntBGemmRunner<half, cutlass::uint4b_t, cutlass::Weight
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scale_zeros.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scale_zeros.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -23,3 +23,4 @@ template class CutlassFpAIntBGemmRunner<half, uint8_t, cutlass::WeightOnlyQuantO
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scaleonly.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/fp16_int8_gemm_scaleonly.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm_template.h"
 
 namespace onnxruntime::llm {
@@ -23,3 +23,4 @@ template class CutlassFpAIntBGemmRunner<half, uint8_t, cutlass::WeightOnlyQuantO
 }  // namespace cutlass_kernels
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_1.generated.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_1.generated.cu
@@ -1,4 +1,5 @@
 
+#if USE_FPA_INTB_GEMM
 #ifndef EXCLUDE_SM_90
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl"
 
@@ -515,3 +516,4 @@ __nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::
 }  // namespace kernels
 }  // namespace onnxruntime::llm
 #endif  // EXCLUDE_SM_90
+#endif  // USE_FPA_INTB_GEMM
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_2.generated.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_gemm_launcher_2.generated.cu
@@ -1,4 +1,5 @@
 
+#if USE_FPA_INTB_GEMM
 #ifndef EXCLUDE_SM_90
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl"
 
@@ -1019,3 +1020,4 @@ __nv_bfloat16*, int, int, int, const int, onnxruntime::llm::cutlass_extensions::
 }  // namespace kernels
 }  // namespace onnxruntime::llm
 #endif  // EXCLUDE_SM_90
+#endif  // USE_FPA_INTB_GEMM
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_adaptor.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_adaptor.cu
@@ -1,6 +1,6 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm_adaptor.h"
 #include <cuda_fp16.h>
 #include "core/providers/cuda/cuda_common.h"
@@ -283,3 +283,4 @@ void transpose_uint8_matrix_and_convert_to_int8(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.cu
@@ -1,7 +1,7 @@
 
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemm_preprocessors_impl.h"
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/common/safeint.h"
@@ -581,3 +581,4 @@ void preprocess_weights_for_mixed_gemm_cuda(cudaStream_t stream,
 }  // namespace weight_only
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int4.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int4.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -30,3 +30,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int4_hopper.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int4_hopper.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -29,3 +29,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int8.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int8.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -26,3 +26,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int8_hopper.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_bf16_int8_hopper.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -26,3 +26,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int4.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int4.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -30,3 +30,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int4_hopper.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int4_hopper.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -29,3 +29,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int8.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int8.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -26,3 +26,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int8_hopper.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher_fp16_int8_hopper.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/fpA_intB_gemv/dispatcher.h"
 
 namespace onnxruntime::llm {
@@ -26,3 +26,4 @@ INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS(
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/fpA_intB_gemv.cu b/onnxruntime/contrib_ops/cuda/llm/fpA_intB_gemv/fpA_intB_gemv.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include <cassert>
 #include <cmath>
 #include <cstdint>
@@ -102,3 +102,4 @@ bool is_supported(int arch, KernelType kernel_type) {
 }  // namespace fpA_intB_gemv
 }  // namespace kernels
 }  // namespace onnxruntime::llm
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/gemm_profiler.cc b/onnxruntime/contrib_ops/cuda/llm/gemm_profiler.cc
@@ -14,7 +14,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
+#if USE_FPA_INTB_GEMM
 #include "contrib_ops/cuda/llm/gemm_profiler.h"
 #include "contrib_ops/cuda/llm/common/logger.h"
 #include "contrib_ops/cuda/llm/fpA_intB_gemm/fpA_intB_gemm.h"
@@ -311,3 +311,4 @@ template class GemmPluginProfiler<onnxruntime::llm::cutlass_extensions::CutlassG
                                   GemmIdCoreHash>;
 
 }  // namespace onnxruntime::llm::kernels::weight_only
+#endif
diff --git a/onnxruntime/contrib_ops/cuda/llm/generate_kernels.py b/onnxruntime/contrib_ops/cuda/llm/generate_kernels.py
@@ -224,6 +224,7 @@ def get_file_content(launcher_inl_files, operations):
     instantiations = "\n".join(insts_list)
 
     file_content = f"""
+#if USE_FPA_INTB_GEMM
 #ifndef EXCLUDE_SM_90
 {includes}
 
@@ -237,6 +238,7 @@ def get_file_content(launcher_inl_files, operations):
 }}  // namespace kernels
 }}  // namespace onnxruntime::llm
 #endif  // EXCLUDE_SM_90
+#endif  // USE_FPA_INTB_GEMM
 """
     return file_content
 
diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc
diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.h