Merged
Changes from all commits
onnxruntime/contrib_ops/cuda/collective/distributed_reshape.cc (12 changes: 6 additions & 6 deletions)
@@ -495,7 +495,7 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
     const std::vector<int64_t>& device_elements) {
   int64_t first_device_id = device_elements.at(0);
   int64_t first_device_id_count = 0;
-  for (size_t i = 0; i < device_elements.size(); ++i) {
+  for (size_t i = 0; i < static_cast<size_t>(device_elements.size()); ++i) {
     if (device_elements.at(i) == first_device_id) {
       ++first_device_id_count;
     }
@@ -505,8 +505,8 @@ std::tuple<int64_t, int64_t> ComputeRepeatAndRepeatStride(
   // Check if the device mesh pattern is supported.
   // Supported examples: [0, 1, 2] and [0, 1, 0, 1, 0, 1].
   // Unsupported examples: [0, 1, 2, 1, 2, 0] and [0, 1, 2, 0].
-  for (size_t repeat = 0; repeat < first_device_id_count; ++repeat) {
-    for (size_t device_id = 0; device_id < repeat_stride; ++device_id) {
+  for (size_t repeat = 0; repeat < static_cast<size_t>(first_device_id_count); ++repeat) {
+    for (size_t device_id = 0; device_id < static_cast<size_t>(repeat_stride); ++device_id) {
       ORT_ENFORCE(
           device_elements.at(repeat * repeat_stride + device_id) == device_elements.at(device_id),
           "Unsupported device mesh pattern.");
@@ -556,7 +556,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // S[0], shape=[16], device=[0, 1] -> S[0]R, shape=[4, 4], device=[0, 1]
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       // Sharding spec is copied if the axis is not decomposed.
       // E.g, shape [5, 6] -> Reshape -> shape [5, 3, 2]
       // The spec for "5" is copied.
@@ -606,7 +606,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   DeviceMesh dst_device_mesh;
   std::tie(repeats, repeat_stride) = ComputeRepeatAndRepeatStride(src_spec.device_mesh.device_mesh_elements);
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
       dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else if (dst_shape[decomposition_axis_in_dst] == 1) {
       // S[0] -> RS[0]
@@ -660,7 +660,7 @@ std::tuple<bool, TensorPartitionSpec> ComputeNativeSpecForTwoAxisDecomposition(
   // Source tensor is sharded on non-decomposed axis.
   std::vector<AxisPartitionSpec> dst_axis_specs;
   for (size_t src_axis = 0; src_axis < src_shape.size(); ++src_axis) {
-    if (src_axis != decomposed_axis_in_src) {
+    if (src_axis != static_cast<size_t>(decomposed_axis_in_src)) {
      dst_axis_specs.push_back(AxisPartitionSpec::CreateCopy(src_spec.GetAxisSpec(src_axis)));
     } else {
       // R -> RR
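All six changed lines in this file add the same kind of cast: a `size_t` value is compared against an `int64_t` (`first_device_id_count`, `repeat_stride`, `decomposed_axis_in_src`), which trips `-Wsign-compare` on GCC/Clang under `-Werror`, so the signed operand is cast to `size_t`. A minimal standalone sketch of the warning and the fix, using made-up names rather than the actual ORT code; the cast is only safe because these values are known to be non-negative:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int64_t decomposed_axis = 1;  // signed index, as in the ORT code
      for (size_t axis = 0; axis < 3; ++axis) {
        // if (axis != decomposed_axis)  // warns: comparison of integer
        //                               // expressions of different signedness
        if (axis != static_cast<size_t>(decomposed_axis)) {
          std::printf("axis %zu keeps its sharding spec\n", axis);
        }
      }
      return 0;
    }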
onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc (6 changes: 3 additions & 3 deletions)
@@ -73,9 +73,9 @@ Status ShardedMoE<T>::ComputeInternal(OpKernelContext* context) const {
   MoEParameters moe_params(tensor_shards_);
   ORT_RETURN_IF_ERROR(::onnxruntime::contrib::moe_helper::CheckInputs<Tensor>(
       moe_params, input, router_probs,
-      fc1_experts_weights, fc1_experts_bias_optional, nullptr,
-      fc2_experts_weights, fc2_experts_bias_optional, nullptr,
-      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr,
+      fc1_experts_weights, fc1_experts_bias_optional, nullptr, nullptr,
+      fc2_experts_weights, fc2_experts_bias_optional, nullptr, nullptr,
+      fc3_experts_weights_optional, fc3_experts_bias_optional, nullptr, nullptr,
       1,  // no quantization so pack size is 1
       activation_type_ == ort_fastertransformer::ActivationType::SwiGLU,
       0));  // no block-wise quantization for sharded MoE
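The extra `nullptr` appended to each FC group suggests `moe_helper::CheckInputs` now takes four tensor pointers per expert layer instead of three, presumably a new optional quantization-related input that the unquantized sharded MoE path leaves empty; the actual signature is not shown in this diff. A generic, runnable sketch of the call-site pattern, with hypothetical types and names:

    #include <cstdio>

    struct Tensor {};  // stand-in, not the ORT Tensor

    // Hypothetical helper: a fourth per-layer pointer was appended, so callers
    // that don't use the new optional input must pass an explicit nullptr.
    bool CheckLayer(const Tensor* weights, const Tensor* bias,
                    const Tensor* scales, const Tensor* zero_points) {
      return weights != nullptr;  // real validation elided
    }

    int main() {
      Tensor fc1_weights;
      // Old call shape: CheckLayer(&fc1_weights, nullptr, nullptr);
      const bool ok = CheckLayer(&fc1_weights, nullptr, nullptr, nullptr);
      std::printf("inputs ok: %d\n", static_cast<int>(ok));
      return 0;
    }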
onnxruntime/contrib_ops/cuda/llm/cutlass_type_conversion.h (7 changes: 7 additions & 0 deletions)
@@ -29,7 +29,14 @@

 #if defined(ENABLE_FP4)
 #include "cutlass/float_subbyte.h"
+#if defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#endif
 #include <cuda_fp4.h>
+#if defined(__GNUC__)
+#pragma GCC diagnostic pop
+#endif
 #endif

 namespace onnxruntime::llm {
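This is the standard recipe for silencing a warning that originates inside a third-party header you cannot edit: save the diagnostic state, disable the specific warning, include the header, restore the state. The same pattern appears in the two CUDA provider headers below, paired with an MSVC branch. A self-contained sketch of the portable form, where `<vendor/header.h>` is a placeholder for the noisy header:

    // Clang also defines __GNUC__, so the GCC branch covers both compilers.
    #if defined(_MSC_VER)
    #pragma warning(push)
    #pragma warning(disable : 4100)  // unreferenced formal parameter
    #elif defined(__GNUC__)
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wunused-parameter"
    #endif

    #include <vendor/header.h>  // placeholder for the warning-heavy include

    #if defined(_MSC_VER)
    #pragma warning(pop)
    #elif defined(__GNUC__)
    #pragma GCC diagnostic pop
    #endif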
onnxruntime/core/providers/cuda/cuda_common.h (5 changes: 5 additions & 0 deletions)
@@ -15,12 +15,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif

 #include <cuda_fp4.h>

 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif

 #endif
onnxruntime/core/providers/cuda/cuda_type_conversion.h (5 changes: 5 additions & 0 deletions)
@@ -14,12 +14,17 @@
 #pragma warning(push)
 // 'fp4_interpretation' : unreferenced parameter
 #pragma warning(disable : 4100)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
 #endif

 #include <cuda_fp4.h>

 #if defined(_MSC_VER)
 #pragma warning(pop)
+#elif defined(__GNUC__)
+#pragma GCC diagnostic pop
 #endif

 #endif
onnxruntime/core/providers/cuda/tensor/pad.cc (2 changes: 1 addition & 1 deletion)
@@ -259,7 +259,7 @@ Status Pad<T>::ComputeInternal(OpKernelContext* ctx) const {

   TArray<fast_divmod> fdm_output_strides(dimension_count);
   TensorPitches output_strides(output_dims);
-  for (auto i = 0; i < dimension_count; i++) {
+  for (size_t i = 0; i < dimension_count; i++) {
     fdm_output_strides[i] = fast_divmod(static_cast<int>(output_strides[i]));
   }

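This fix differs slightly from the casts in distributed_reshape.cc: `auto i = 0` deduces `int`, so comparing `i` against `dimension_count` is again a signed/unsigned mismatch, and declaring the index as `size_t` outright matches the bound's type with no cast needed. A minimal reproduction, assuming `dimension_count` is an unsigned size as the fix implies:

    #include <cstddef>
    #include <cstdio>

    int main() {
      const size_t dimension_count = 4;
      // for (auto i = 0; ...)  // auto deduces int -> -Wsign-compare below
      for (size_t i = 0; i < dimension_count; i++) {
        std::printf("output stride slot %zu\n", i);
      }
      return 0;
    }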