
Commit a3749f1

[BUILD] Fix Build Errors and Warnings in CUDA Providers (#27276)
## Description

A user reported a build error in #27269. This PR addresses several build issues and compilation warnings in the CUDA provider and associated contrib ops. The fixes ensure a clean build and improve compatibility across CUDA versions (specifically CUDA 13.1) and compilers.

## Changes

### 1. Fix ShardedMoE Compilation Error

- Resolved a "no matching function for call to CheckInputs" error in sharded_moe.cc.
- Updated the `moe_helper::CheckInputs` call to pass the now-required `zero_points` arguments (as `nullptr`), aligning the call with the updated function signature.

### 2. Suppress CUDA 13.1 System Header Warnings

- Added GCC/Clang diagnostic pragmas to suppress `-Wunused-parameter` warnings emitted from `cuda_fp4.h`.
- These warnings caused build failures in environments where warnings are treated as errors.
- Affected files:
  - onnxruntime/core/providers/cuda/cuda_common.h
  - onnxruntime/core/providers/cuda/cuda_type_conversion.h
  - onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h

### 3. Resolve Sign-Comparison Warnings

- Fixed several `-Wsign-compare` warnings that were being treated as errors:
  - **Pad Op:** Changed the loop variable type to `size_t` in onnxruntime/core/providers/cuda/tensor/pad.cc.
  - **Distributed Reshape:** Added explicit `static_cast<size_t>` casts for comparisons against `int64_t` values in onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc.

## Verification

- The build completes without errors or warnings using `--cmake_extra_defines onnxruntime_USE_NCCL=ON`.
- Builds tested with CUDA 12.8, 13.0, and 13.1.1.
1 parent 7121f9a commit a3749f1

File tree: 6 files changed (+27, -10 lines)


onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc

Lines changed: 6 additions & 6 deletions
```diff
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }
@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");
@@ -556,7 +556,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.
@@ -606,7 +606,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   DeviceMesh dst_device_mesh;
   std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else if (dst_shape[decomposition_axis_in_dst] == 1) {
       // S[0] -> RS[0]
@@ -660,7 +660,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // Source tensor is sharded on non-decomposed axis.
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
      dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
    } else {
      // R -> RR
```
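These casts look like no-ops at a glance, so a standalone reduction may help; the function below is a hypothetical sketch, not the ORT code. (Strictly, the cast on `device_elements.size()` is redundant, since `std::vector::size()` already returns `size_t`; it was presumably applied for uniformity.)

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical reduction of the loops patched above.
// Compile with: g++ -Wall -Wextra -Werror -c sign_compare_demo.cc
void Demo(int64_t first_device_id_count) {
  // Before the fix, 'repeat < first_device_id_count' mixed an unsigned
  // size_t with a signed int64_t, which -Wsign-compare rejects under -Werror:
  //   for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {}
  //
  // After the fix, both operands are size_t. The cast is safe only because
  // the count is known to be non-negative at this point.
  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
    // loop body elided
  }
}
```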

onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc

Lines changed: 3 additions & 3 deletions
```diff
@@ -73,9 +73,9 @@ Status ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE
```
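The error here was an argument-count mismatch, not anything CUDA-specific. A hedged sketch of the failure mode follows; the signature is illustrative only, as the real `moe_helper::CheckInputs` takes many more parameters:

```cpp
// Hypothetical sketch of a helper that grew a required zero_points
// parameter; every caller still passing the old argument list breaks.
struct Tensor {};  // stand-in type for this sketch

template <typename T>
bool CheckInputs(const T* weights, const T* bias,
                 const T* scales, const T* zero_points) {  // newly required
  (void)bias; (void)scales; (void)zero_points;  // real validation elided
  return weights != nullptr;
}

void CallSite(const Tensor* w, const Tensor* b) {
  // The old argument list no longer compiles:
  //   CheckInputs<Tensor>(w, b, nullptr);  // error: no matching function
  CheckInputs<Tensor>(w, b, /*scales=*/nullptr, /*zero_points=*/nullptr);
}
```

Since the sharded MoE path is unquantized, `nullptr` is the natural value for the new `zero_points` slots.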

onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h

Lines changed: 7 additions & 0 deletions
```diff
@@ -29,7 +29,14 @@
 
 #if defined(ENABLE_FP4)
 #include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
 #include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 #endif
 
 namespace onnxruntime::llm {
```
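The pattern generalizes to any third-party header that is noisy under `-Wextra`. A self-contained illustration, where the function stands in for what `<cuda_fp4.h>` brings in (compile with `g++ -Wall -Wextra -Werror -c`):

```cpp
// GCC and Clang both honor "GCC diagnostic" pragmas, so one __GNUC__ guard
// covers both compilers.
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

// Stand-in for a definition pulled in from <cuda_fp4.h>: the second
// parameter is never used, which -Wunused-parameter rejects under -Werror.
inline float DecodeFp4(unsigned char bits, int fp4_interpretation) {
  return static_cast<float>(bits & 0x0F);
}

#if defined(__GNUC__)
#pragma GCC diagnostic pop  // restore the including file's diagnostic state
#endif
```

The push/pop pair keeps the suppression scoped to the wrapped lines, so `-Wunused-parameter` still applies to first-party code below the include.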

onnxruntime/core/providers/cuda/cuda_common.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -15,12 +15,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #include <cuda_fp4.h>
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif
 
 #endif
```
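cuda_common.h and cuda_type_conversion.h (next file) now carry the same MSVC-or-GCC dance around the include. One way to avoid repeating it would be a pair of wrapper macros; this is a hypothetical refactor sketch, and `ORT_SUPPRESS_UNUSED_PARAM_*` is not an existing ORT macro:

```cpp
// Hypothetical helper macros capturing the pattern used in both headers.
// __pragma is MSVC's pragma operator; _Pragma is standard since C++11.
#if defined(_MSC_VER)
#define ORT_SUPPRESS_UNUSED_PARAM_BEGIN \
  __pragma(warning(push)) __pragma(warning(disable : 4100))
#define ORT_SUPPRESS_UNUSED_PARAM_END __pragma(warning(pop))
#elif defined(__GNUC__)
#define ORT_SUPPRESS_UNUSED_PARAM_BEGIN \
  _Pragma("GCC diagnostic push")        \
  _Pragma("GCC diagnostic ignored \"-Wunused-parameter\"")
#define ORT_SUPPRESS_UNUSED_PARAM_END _Pragma("GCC diagnostic pop")
#else
#define ORT_SUPPRESS_UNUSED_PARAM_BEGIN
#define ORT_SUPPRESS_UNUSED_PARAM_END
#endif

// Usage at each include site would then collapse to:
//   ORT_SUPPRESS_UNUSED_PARAM_BEGIN
//   #include <cuda_fp4.h>
//   ORT_SUPPRESS_UNUSED_PARAM_END
```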

onnxruntime/core/providers/cuda/cuda_type_conversion.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -14,12 +14,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #include <cuda_fp4.h>
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif
 
 #endif
```

onnxruntime/core/providers/cuda/tensor/pad.cc

Lines changed: 1 addition & 1 deletion
```diff
@@ -259,7 +259,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   TArray<fast_divmod> fdm_output_strides(dimension_count);
   TensorPitches output_strides(output_dims);
-  for (auto i = 0; i < dimension_count; i++) {
+  for (size_t i = 0; i < dimension_count; i++) {
     fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
   }
 
```
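The `auto` was the whole bug: `auto i = 0` deduces `int`, so the bound check compared signed against unsigned. A minimal hypothetical demo, assuming (as the fix implies) that `dimension_count` is an unsigned `size_t`:

```cpp
#include <cstddef>

// Compile with: g++ -Wall -Wextra -Werror -c pad_loop_demo.cc
void Demo(size_t dimension_count) {
  // Before: 'auto i = 0' made i an int, so 'i < dimension_count' mixed
  // signedness and failed the build under -Werror:
  //   for (auto i = 0; i < dimension_count; i++) {}
  for (size_t i = 0; i < dimension_count; i++) {  // loop variable matches bound
    // per-dimension work elided
  }
}
```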
