
Commit a3749f1

[BUILD] Fix Build Errors and Warnings in CUDA Providers (#27276)
## Description

A user reported a build error in #27269. This PR addresses several build issues and compilation warnings in the CUDA provider and associated contrib ops. The fixes ensure a clean build and improve compatibility across CUDA versions (specifically CUDA 13.1) and compilers.

## Changes

### 1. Fix ShardedMoE Compilation Error

- Resolved a "no matching function for call to CheckInputs" error in sharded_moe.cc.
- Updated the `moe_helper::CheckInputs` call to pass the now-required `zero_points` arguments (as `nullptr`), aligning the call with the updated function signature.

### 2. Suppress CUDA 13.1 System Header Warnings

- Added GCC/Clang diagnostic pragmas to suppress `-Wunused-parameter` warnings emitted from `cuda_fp4.h`.
- These warnings caused build failures in environments where warnings are treated as errors.
- Affected files:
  - onnxruntime/core/providers/cuda/cuda_common.h
  - onnxruntime/core/providers/cuda/cuda_type_conversion.h
  - onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h

### 3. Resolve Sign-Comparison Warnings

- Fixed several `-Wsign-compare` warnings that were being treated as errors:
  - **Pad Op:** Changed the loop variable type to `size_t` in onnxruntime/core/providers/cuda/tensor/pad.cc.
  - **Distributed Reshape:** Added explicit `static_cast<size_t>` casts for comparisons against `int64_t` values in onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc.

## Verification

- The build completes without errors or warnings using `--cmake_extra_defines onnxruntime_USE_NCCL=ON`.
- Builds tested with CUDA 12.8, 13.0, and 13.1.1.
1 parent 7121f9a commit a3749f1

File tree: 6 files changed (+27, -10 lines)


onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc

Lines changed: 6 additions & 6 deletions
```diff
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }
@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");
@@ -556,7 +556,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.
@@ -606,7 +606,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   DeviceMesh dst_device_mesh;
   std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else if (dst_shape[decomposition_axis_in_dst] == 1) {
       // S[0] -> RS[0]
@@ -660,7 +660,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // Source tensor is sharded on non-decomposed axis.
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
      dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
    } else {
      // R -> RR
```
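These casts look like no-ops at a glance, so a standalone reduction may help; the function below is a hypothetical sketch, not the ORT code. (Strictly, the cast on `device_elements.size()` is redundant, since `std::vector::size()` already returns `size_t`; it was presumably applied for uniformity.)

```cpp
#include <cstddef>
#include <cstdint>

// Hypothetical reduction of the loops patched above.
// Compile with: g++ -Wall -Wextra -Werror -c sign_compare_demo.cc
void Demo(int64_t first_device_id_count) {
  // Before the fix, 'repeat < first_device_id_count' mixed an unsigned
  // size_t with a signed int64_t, which -Wsign-compare rejects under -Werror:
  //   for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {}
  //
  // After the fix, both operands are size_t. The cast is safe only because
  // the count is known to be non-negative at this point.
  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
    // loop body elided
  }
}
```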

onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc

Lines changed: 3 additions & 3 deletions
```diff
@@ -73,9 +73,9 @@ Status ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE
```
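The error here was an argument-count mismatch, not anything CUDA-specific. A hedged sketch of the failure mode follows; the signature is illustrative only, as the real `moe_helper::CheckInputs` takes many more parameters:

```cpp
// Hypothetical sketch of a helper that grew a required zero_points
// parameter; every caller still passing the old argument list breaks.
struct Tensor {};  // stand-in type for this sketch

template <typename T>
bool CheckInputs(const T* weights, const T* bias,
                 const T* scales, const T* zero_points) {  // newly required
  (void)bias; (void)scales; (void)zero_points;  // real validation elided
  return weights != nullptr;
}

void CallSite(const Tensor* w, const Tensor* b) {
  // The old argument list no longer compiles:
  //   CheckInputs<Tensor>(w, b, nullptr);  // error: no matching function
  CheckInputs<Tensor>(w, b, /*scales=*/nullptr, /*zero_points=*/nullptr);
}
```

Since the sharded MoE path is unquantized, `nullptr` is the natural value for the new `zero_points` slots.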

onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h

Lines changed: 7 additions & 0 deletions
```diff
@@ -29,7 +29,14 @@
 
 #if defined(ENABLE_FP4)
 #include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
 #include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 #endif
 
 namespace onnxruntime::llm {
```
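The pattern generalizes to any third-party header that is noisy under `-Wextra`. A self-contained illustration, where the function stands in for what `<cuda_fp4.h>` brings in (compile with `g++ -Wall -Wextra -Werror -c`):

```cpp
// GCC and Clang both honor "GCC diagnostic" pragmas, so one __GNUC__ guard
// covers both compilers.
#if defined(__GNUC__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#endif

// Stand-in for a definition pulled in from <cuda_fp4.h>: the second
// parameter is never used, which -Wunused-parameter rejects under -Werror.
inline float DecodeFp4(unsigned char bits, int fp4_interpretation) {
  return static_cast<float>(bits & 0x0F);
}

#if defined(__GNUC__)
#pragma GCC diagnostic pop  // restore the including file's diagnostic state
#endif
```

The push/pop pair keeps the suppression scoped to the wrapped lines, so `-Wunused-parameter` still applies to first-party code below the include.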

onnxruntime/core/providers/cuda/cuda_common.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -15,12 +15,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #include <cuda_fp4.h>
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif
 
 #endif
```
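cuda_common.h and cuda_type_conversion.h (next file) now carry the same MSVC-or-GCC dance around the include. One way to avoid repeating it would be a pair of wrapper macros; this is a hypothetical refactor sketch, and `ORT_SUPPRESS_UNUSED_PARAM_*` is not an existing ORT macro:

```cpp
// Hypothetical helper macros capturing the pattern used in both headers.
// __pragma is MSVC's pragma operator; _Pragma is standard since C++11.
#if defined(_MSC_VER)
#define ORT_SUPPRESS_UNUSED_PARAM_BEGIN \
  __pragma(warning(push)) __pragma(warning(disable : 4100))
#define ORT_SUPPRESS_UNUSED_PARAM_END __pragma(warning(pop))
#elif defined(__GNUC__)
#define ORT_SUPPRESS_UNUSED_PARAM_BEGIN \
  _Pragma("GCC diagnostic push")        \
  _Pragma("GCC diagnostic ignored \"-Wunused-parameter\"")
#define ORT_SUPPRESS_UNUSED_PARAM_END _Pragma("GCC diagnostic pop")
#else
#define ORT_SUPPRESS_UNUSED_PARAM_BEGIN
#define ORT_SUPPRESS_UNUSED_PARAM_END
#endif

// Usage at each include site would then collapse to:
//   ORT_SUPPRESS_UNUSED_PARAM_BEGIN
//   #include <cuda_fp4.h>
//   ORT_SUPPRESS_UNUSED_PARAM_END
```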

onnxruntime/core/providers/cuda/cuda_type_conversion.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -14,12 +14,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #include <cuda_fp4.h>
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif
 
 #endif
```

onnxruntime/core/providers/cuda/tensor/pad.cc

Lines changed: 1 addition & 1 deletion
```diff
@@ -259,7 +259,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   TArray<fast_divmod> fdm_output_strides(dimension_count);
   TensorPitches output_strides(output_dims);
-  for (auto i = 0; i < dimension_count; i++) {
+  for (size_t i = 0; i < dimension_count; i++) {
     fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
   }
 
```
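The `auto` was the whole bug: `auto i = 0` deduces `int`, so the bound check compared signed against unsigned. A minimal hypothetical demo, assuming (as the fix implies) that `dimension_count` is an unsigned `size_t`:

```cpp
#include <cstddef>

// Compile with: g++ -Wall -Wextra -Werror -c pad_loop_demo.cc
void Demo(size_t dimension_count) {
  // Before: 'auto i = 0' made i an int, so 'i < dimension_count' mixed
  // signedness and failed the build under -Werror:
  //   for (auto i = 0; i < dimension_count; i++) {}
  for (size_t i = 0; i < dimension_count; i++) {  // loop variable matches bound
    // per-dimension work elided
  }
}
```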
