@@ -77,9 +77,9 @@ Status Gelu<T>::Compute(OpKernelContext* context) const {
   T* output_data = output->MutableData<T>();

   concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
-  int64_t elem_count = input->Shape().Size();
-  constexpr int64_t length_per_task = 4096;  // this number comes from FastGelu.
-  int64_t task_count = (elem_count + length_per_task - 1) / length_per_task;
+  size_t elem_count = input->Shape().Size();
+  constexpr size_t length_per_task = 4096;  // this number comes from FastGelu.
+  size_t task_count = (elem_count + length_per_task - 1) / length_per_task;

   if (approximation_algorithm_ == "tanh") {
     // FastGelu allows optional bias. Here we split input data into chunks. Each chunk
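
For context on the chunking: (elem_count + length_per_task - 1) / length_per_task is integer ceiling division, so every task covers length_per_task elements except possibly the last. As a worked example, elem_count = 10000 yields task_count = 3: tasks 0 and 1 each process 4096 elements and task 2 processes the remaining 10000 - 2 * 4096 = 1808, which is exactly what std::min(length_per_task, elem_count - start) evaluates to inside each task below.
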
@@ -95,16 +95,16 @@ Status Gelu<T>::Compute(OpKernelContext* context) const {
           const auto start = task_idx * length_per_task;
           const T* p_input = input_data + start;
           T* p_output = output_data + start;
-          int64_t count = std::min(length_per_task, elem_count - start);
+          size_t count = std::min(length_per_task, elem_count - start);

-          for (int64_t i = 0; i < count; i++) {
+          for (size_t i = 0; i < count; i++) {
             T value = p_input[i];
             p_output[i] = value * (static_cast<T>(C) * value * value + static_cast<T>(B));
           }

           MlasComputeTanh(p_output, p_output, narrow<size_t>(count));

-          for (int64_t i = 0; i < count; i++) {
+          for (size_t i = 0; i < count; i++) {
             p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f);
           }
         },
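
This branch implements the tanh GELU approximation, GELU(x) ~= 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3))). A minimal scalar sketch of what the two loops plus MlasComputeTanh compute per chunk, assuming B and C hold the usual FastGelu constants (B = sqrt(2/pi), C = 0.044715 * sqrt(2/pi); they are defined outside the hunk shown here):

    #include <cmath>
    #include <cstddef>

    // Reference-only scalar version; the kernel above vectorizes the tanh via MLAS.
    void gelu_tanh_ref(const float* x, float* y, size_t n) {
      constexpr float B = 0.7978845608028654f;    // sqrt(2 / pi)
      constexpr float C = 0.035677408136300125f;  // 0.044715 * sqrt(2 / pi)
      for (size_t i = 0; i < n; ++i) {
        float t = std::tanh(x[i] * (C * x[i] * x[i] + B));  // inner polynomial, then tanh
        y[i] = 0.5f * x[i] * (t + 1.0f);                    // 0.5 * x * (1 + tanh(...))
      }
    }
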
@@ -117,16 +117,16 @@ Status Gelu<T>::Compute(OpKernelContext* context) const {
           const auto start = task_idx * length_per_task;
           const T* p_input = input_data + start;
           T* p_output = output_data + start;
-          int64_t count = std::min(length_per_task, elem_count - start);
+          size_t count = std::min(length_per_task, elem_count - start);

-          for (int64_t i = 0; i < count; i++) {
+          for (size_t i = 0; i < count; i++) {
             T value = p_input[i];
             p_output[i] = value * static_cast<T>(M_SQRT1_2);
           }

           MlasComputeErf(p_output, p_output, narrow<size_t>(count));

-          for (int64_t i = 0; i < count; i++) {
+          for (size_t i = 0; i < count; i++) {
             p_output[i] = 0.5f * p_input[i] * (p_output[i] + 1.0f);
           }
         },
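
The non-approximated branch computes exact GELU, GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2))): the first loop scales by M_SQRT1_2 = 1/sqrt(2), MlasComputeErf applies erf elementwise, and the second loop finishes the formula. A scalar reference sketch (the constant is spelled out so the snippet does not rely on M_SQRT1_2 being defined):

    #include <cmath>
    #include <cstddef>

    void gelu_erf_ref(const float* x, float* y, size_t n) {
      constexpr float kSqrt1_2 = 0.70710678118654752f;  // 1 / sqrt(2), i.e. M_SQRT1_2
      for (size_t i = 0; i < n; ++i) {
        float e = std::erf(x[i] * kSqrt1_2);  // erf(x / sqrt(2))
        y[i] = 0.5f * x[i] * (e + 1.0f);      // 0.5 * x * (1 + erf(x / sqrt(2)))
      }
    }
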
@@ -143,9 +143,9 @@ Status Gelu<MLFloat16>::Compute(OpKernelContext* context) const {
   Tensor* output = context->Output(0, input->Shape());
   MLFloat16* output_data = output->MutableData<MLFloat16>();
   concurrency::ThreadPool* tp = context->GetOperatorThreadPool();
-  int64_t elem_count = input->Shape().Size();
-  constexpr int64_t length_per_task = 4096;
-  int64_t task_count = (elem_count + length_per_task - 1) / length_per_task;
+  size_t elem_count = input->Shape().Size();
+  constexpr size_t length_per_task = 4096;
+  size_t task_count = (elem_count + length_per_task - 1) / length_per_task;

   if (approximation_algorithm_ != "tanh" && approximation_algorithm_ != "none") {
     return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported approximation_algorithm: ", approximation_algorithm_);
@@ -178,7 +178,7 @@ Status Gelu<MLFloat16>::Compute(OpKernelContext* context) const {
           const auto start = task_idx * length_per_task;
           const MLFloat16* p_input = input_data + start;
           MLFloat16* p_output = output_data + start;
-          int64_t count = std::min(length_per_task, elem_count - start);
+          size_t count = std::min(length_per_task, elem_count - start);
           MLFloat16* p_temp = temp_fp16_aligned.get() + start;
           MlasComputeFP16Gelu(p_input, p_output, p_temp, count, approximation_algorithm_);
         },
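
One reason elem_count, length_per_task, and count have to change type together: std::min deduces a single template parameter, so both arguments must share a type, and mixing size_t with a leftover int64_t would fail to compile. The unsigned subtraction is also safe here, since start <= (task_count - 1) * length_per_task < elem_count for every task, so elem_count - start never wraps. A small sketch of the deduction constraint (the function name is illustrative only):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    size_t chunk_size(size_t elem_count, size_t start) {
      constexpr size_t length_per_task = 4096;
      // Compiles: both operands are size_t, so std::min<size_t> is deduced.
      return std::min(length_per_task, elem_count - start);
      // Would not compile: std::min(int64_t{4096}, elem_count - start)
      // leaves template deduction torn between int64_t and size_t.
    }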