diff --git a/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc b/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
index e413ccf580870..f4c3eb9914118 100644
--- a/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }
   }
@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");
@@ -556,7 +556,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.
@@ -606,7 +606,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
     DeviceMesh dst_device_mesh;
     std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
     for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-      if (src_axis != decomposed_axis_in_src) {
+      if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
         dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
       } else if (dst_shape[decomposition_axis_in_dst] == 1) {
         // S[0] -> RS[0]
@@ -660,7 +660,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
     // Source tensor is sharded on non-decomposed axis.
     std::vector<AxisPartitionSpec> dst_axis_specs;
     for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-      if (src_axis != decomposed_axis_in_src) {
+      if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
         dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
       } else {
         // R -> RR
diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
index 167b2af946183..5170c982f248d 100644
--- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
+++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc
@@ -73,9 +73,9 @@ Status ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE
diff --git a/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h b/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
index 1fe8035cbcdae..7722cd5a84f07 100644
--- a/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
+++ b/onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h
@@ -29,7 +29,14 @@
 
 #if defined(ENABLE_FP4)
 #include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
 #include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 #endif
 
 namespace onnxruntime::llm {
diff --git a/onnxruntime/core/providers/cuda/cuda_common.h b/onnxruntime/core/providers/cuda/cuda_common.h
index 32f5c98da1585..d50a4deca3298 100644
--- a/onnxruntime/core/providers/cuda/cuda_common.h
+++ b/onnxruntime/core/providers/cuda/cuda_common.h
@@ -15,12 +15,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #include <cuda_fp4.h>
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif
 
 #endif
diff --git a/onnxruntime/core/providers/cuda/cuda_type_conversion.h b/onnxruntime/core/providers/cuda/cuda_type_conversion.h
index 38cdce1380fad..04e47a9930710 100644
--- a/onnxruntime/core/providers/cuda/cuda_type_conversion.h
+++ b/onnxruntime/core/providers/cuda/cuda_type_conversion.h
@@ -14,12 +14,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif
 
 #include <cuda_fp4.h>
 
 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif
 
 #endif
diff --git a/onnxruntime/core/providers/cuda/tensor/pad.cc b/onnxruntime/core/providers/cuda/tensor/pad.cc
index 656890e796a1c..d75c6e947e09c 100644
--- a/onnxruntime/core/providers/cuda/tensor/pad.cc
+++ b/onnxruntime/core/providers/cuda/tensor/pad.cc
@@ -259,7 +259,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {
 
   TArray<fast_divmod> fdm_output_strides(dimension_count);
   TensorPitches output_strides(output_dims);
-  for (auto i = 0; i < dimension_count; i++) {
+  for (size_t i = 0; i < dimension_count; i++) {
     fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
   }
 
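For reference, below is a minimal standalone sketch (not ONNX Runtime code) of the device-mesh pattern check that the distributed_reshape.cc hunks touch: the mesh must be a whole-number repetition of its leading block, so [0, 1, 2] and [0, 1, 0, 1, 0, 1] are accepted while [0, 1, 2, 0] is rejected. The name ComputeRepeatAndRepeatStrideSketch and the main() driver are illustrative only; the real helper in distributed_reshape.cc uses ORT_ENFORCE, returns the same (repeats, repeat_stride) pair, and is the one whose loop bounds the diff casts to size_t.

    // Illustrative sketch only; mirrors the logic visible in the hunks above.
    #include <cstdint>
    #include <iostream>
    #include <stdexcept>
    #include <tuple>
    #include <vector>

    std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStrideSketch(
        const std::vector<int64_t>& device_elements) {
      // Count how often the leading device id occurs; that is the repeat count.
      const int64_t first_device_id = device_elements.at(0);
      int64_t repeats = 0;
      for (size_t i = 0; i < device_elements.size(); ++i) {
        if (device_elements.at(i) == first_device_id) {
          ++repeats;
        }
      }
      const size_t repeat_stride = device_elements.size() / static_cast<size_t>(repeats);
      // Every block of length repeat_stride must match the leading block.
      // E.g. [0, 1, 0, 1, 0, 1] -> repeats = 3, repeat_stride = 2; [0, 1, 2, 0] is rejected.
      for (size_t repeat = 0; repeat < static_cast<size_t>(repeats); ++repeat) {
        for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
          if (device_elements.at(repeat * repeat_stride + device_id) != device_elements.at(device_id)) {
            throw std::runtime_error("Unsupported device mesh pattern.");
          }
        }
      }
      return {repeats, static_cast<int64_t>(repeat_stride)};
    }

    int main() {
      auto [repeats, stride] = ComputeRepeatAndRepeatStrideSketch({0, 1, 0, 1, 0, 1});
      std::cout << "repeats=" << repeats << " repeat_stride=" << stride << "\n";  // repeats=3 repeat_stride=2
      return 0;
    }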