Skip to content

Commit cf6d83f

Browse files
author
Sanket Kale
committed
Resolved Copilot comments
1 parent 937cd82 commit cf6d83f

File tree

9 files changed

+74
-57
lines changed

9 files changed

+74
-57
lines changed

cmake/onnxruntime_mlas.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -549,8 +549,8 @@ else()
549549
set_source_files_properties(${MLAS_SRC_DIR}/halfgemm_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
550550
set_source_files_properties(${MLAS_SRC_DIR}/softmax_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
551551
set_source_files_properties(${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
552-
set_source_files_properties(${MLAS_SRC_DIR}/erf_neon_fp16.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16 ")
553-
set_source_files_properties(${MLAS_SRC_DIR}/gelu_neon_fp16.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16 ")
552+
set_source_files_properties(${MLAS_SRC_DIR}/erf_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
553+
set_source_files_properties(${MLAS_SRC_DIR}/gelu_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
554554
endif()
555555

556556
if(ONNXRUNTIME_MLAS_MULTI_ARCH)

onnxruntime/core/mlas/lib/erf_neon_fp16.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ exp_neg_rational_approx_f16(MLAS_FLOAT16X8 x)
6060
MLAS_FLOAT16X8 den = MlasMultiplyAddFloat16(d1v, x, d0v);
6161
den = MlasMultiplyAddFloat16(d2v, x2, den);
6262
MLAS_FLOAT16X8 recip = MlasApproximateReciprocalFloat16(den);
63-
recip = MlasMultiplyFloat16(recip, MlasReciprocalSqrtFloat16(den, recip));
64-
recip = MlasMultiplyFloat16(recip, MlasReciprocalSqrtFloat16(den, recip));
63+
recip = MlasMultiplyFloat16(recip, MlasReciprocalStepFloat16(den, recip));
64+
recip = MlasMultiplyFloat16(recip, MlasReciprocalStepFloat16(den, recip));
6565
MLAS_FLOAT16X8 result = MlasMultiplyFloat16(num, recip);
6666
return result;
6767
}
@@ -103,8 +103,8 @@ MlasNeonErfF16Kernel(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)
103103
MLAS_FLOAT16X8 absx_clamped = MlasMinimumFloat16(absx, vth);
104104
MLAS_FLOAT16X8 denom = MlasMultiplyAddFloat16(vp, absx_clamped, vone);
105105
MLAS_FLOAT16X8 t = MlasApproximateReciprocalFloat16(denom);
106-
t = MlasMultiplyFloat16(t, MlasReciprocalSqrtFloat16(denom, t));
107-
t = MlasMultiplyFloat16(t, MlasReciprocalSqrtFloat16(denom, t));
106+
t = MlasMultiplyFloat16(t, MlasReciprocalStepFloat16(denom, t));
107+
t = MlasMultiplyFloat16(t, MlasReciprocalStepFloat16(denom, t));
108108
MLAS_FLOAT16X8 t2 = MlasMultiplyFloat16(t, t);
109109
MLAS_FLOAT16X8 t3 = MlasMultiplyFloat16(t2, t);
110110
MLAS_FLOAT16X8 t4 = MlasMultiplyFloat16(t3, t);

onnxruntime/core/mlas/lib/fp16_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -596,7 +596,7 @@ MlasShiftLeftInt16(MLAS_INT16X4 Vector)
596596

597597
MLAS_FORCEINLINE
598598
MLAS_FLOAT16X8
599-
MlasReciprocalSqrtFloat16(MLAS_FLOAT16X8 Vector1, MLAS_FLOAT16X8 Vector2)
599+
MlasReciprocalStepFloat16(MLAS_FLOAT16X8 Vector1, MLAS_FLOAT16X8 Vector2)
600600
{
601601
return vrecpsq_f16(Vector1, Vector2);
602602
}

onnxruntime/core/mlas/lib/gelu.cpp

Lines changed: 2 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,12 @@ Module Name:
88
99
Abstract:
1010
11-
This module contains Gelu helper functions .
11+
This module contains Gelu helper functions.
1212
1313
--*/
1414

1515
#include "gelu.h"
1616

17-
1817
void
1918
MLASCALL
2019
MlasComputeFP16Gelu(const MLAS_FP16* input,
@@ -23,26 +22,11 @@ MlasComputeFP16Gelu(const MLAS_FP16* input,
2322
int64_t count,
2423
const std::string& algo)
2524
{
26-
#if defined(MLAS_USE_SVE) || defined(MLAS_NEON_INTRINSICS)
27-
28-
bool done = false;
29-
3025
#if defined(MLAS_USE_SVE)
31-
if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmSve()) {
3226
MlasSveGeluF16Kernel(input, output, temp, count, algo);
33-
done = true;
34-
}
35-
#endif
36-
37-
#if defined(MLAS_NEON_INTRINSICS)
38-
if (!done) {
27+
#elif defined(MLAS_NEON_INTRINSICS)
3928
MlasNeonGeluF16Kernel(input, output, temp, count, algo);
40-
done = true;
41-
}
42-
#endif
43-
4429
#else
45-
4630
(void)temp;
4731
for (int64_t i = 0; i < count; ++i) {
4832
float x = static_cast<float>(input[i]);
@@ -63,6 +47,5 @@ MlasComputeFP16Gelu(const MLAS_FP16* input,
6347

6448
output[i] = MLAS_FP16(gelu_val);
6549
}
66-
6750
#endif
6851
}

onnxruntime/core/mlas/lib/gelu.h

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,22 +4,17 @@ Copyright 2025 FUJITSU LIMITED
44
55
Module Name:
66
7-
Gelu.cpp
7+
gelu.h
88
99
Abstract:
1010
11-
This module contains Gelu helper functions .
11+
This module contains Gelu helper functions.
1212
1313
--*/
1414

1515
#include "fp16_common.h"
1616
#if defined(MLAS_NEON_INTRINSICS)
1717
#include "erf_neon_fp16.h"
18-
#endif
19-
20-
#ifdef MLAS_USE_SVE
21-
#include "sve/mlasi_sve.h"
22-
#endif
2318

2419
void
2520
MLASCALL
@@ -29,4 +24,10 @@ MlasNeonGeluF16Kernel(
2924
MLAS_FP16* temp,
3025
int64_t count,
3126
const std::string& algo
32-
);
27+
);
28+
29+
#endif
30+
31+
#ifdef MLAS_USE_SVE
32+
#include "sve/mlasi_sve.h"
33+
#endif

onnxruntime/core/mlas/lib/gelu_neon_fp16.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,15 +4,15 @@ Copyright 2025 FUJITSU LIMITED
44
55
Module Name:
66
7-
Gelu.cpp
7+
gelu_neon_fp16.cpp
88
99
Abstract:
1010
1111
This module contains Gelu helper functions .
1212
1313
--*/
1414
#include "gelu.h"
15-
15+
#include <cmath>
1616
#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
1717

1818
void
@@ -48,15 +48,15 @@ MlasNeonGeluF16Kernel(const MLAS_FP16* input, MLAS_FP16* output, MLAS_FP16* temp
4848
// Tail
4949
for (; i < count; ++i) {
5050
float x = static_cast<float>(input[i]);
51-
float inner = x * (0.7979f + 0.03568f * x * x);
51+
float inner = x * (0.7978845608028654f + 0.035677408136300125f * x * x);
5252
inner = std::max(-5.0f, std::min(5.0f, inner));
5353
temp[i] = static_cast<MLAS_FP16>(inner);
5454
}
5555

5656
// Tanh processing
5757
MlasComputeTanh<MLAS_FP16>(temp, temp, count);
5858

59-
} else if (algo == "none") {
59+
} else {
6060
// Preprocess input into temp[] for erf
6161
for (i = 0; i + 7 < count; i += 8) {
6262
MLAS_FLOAT16X8 x = MlasLoadf16Float16x8(reinterpret_cast<const float16_t*>(input + i));
@@ -67,7 +67,7 @@ MlasNeonGeluF16Kernel(const MLAS_FP16* input, MLAS_FP16* output, MLAS_FP16* temp
6767
// Tail
6868
for (; i < count; ++i) {
6969
float x = static_cast<float>(input[i]);
70-
temp[i] = static_cast<MLAS_FP16>(x * 0.70710678f);
70+
temp[i] = static_cast<MLAS_FP16>(x * static_cast<float>(M_SQRT1_2));
7171
}
7272

7373
// Erf processing

onnxruntime/core/mlas/lib/sve/elementwise_sve_fp16.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -190,8 +190,8 @@ MlasSveGeluF16Kernel(const MLAS_FP16* input, MLAS_FP16* output, MLAS_FP16* temp,
190190
const __fp16 r1 = 0.5f;
191191
const __fp16 r2 = 1.0f;
192192
const __fp16 r3 = static_cast<float>(M_SQRT1_2);
193-
const __fp16 r4 = 0.7979f;
194-
const __fp16 r5 = 0.03568f;
193+
const __fp16 r4 = 0.7978845608028654f;
194+
const __fp16 r5 = 0.035677408136300125f;
195195

196196
const MLAS_SVFLOAT16 v_half = MlasSveBroadcastfloat16(r1);
197197
const MLAS_SVFLOAT16 v_one = MlasSveBroadcastfloat16(r2);
@@ -203,7 +203,7 @@ MlasSveGeluF16Kernel(const MLAS_FP16* input, MLAS_FP16* output, MLAS_FP16* temp,
203203
const __fp16 c2 = 5.0f;
204204
if (algo == "tanh") {
205205
int64_t i = 0;
206-
while (i < (count)) {
206+
while (i < count) {
207207
svbool_t pg = MlasSveSelPredictefloat16(i, count);
208208
MLAS_SVFLOAT16 v_x = MlasSveLoadFloat16(pg, &input[i]);
209209
MLAS_SVFLOAT16 v_x2 = MlasSveMulfloat16(pg, v_x, v_x);
@@ -225,7 +225,7 @@ MlasSveGeluF16Kernel(const MLAS_FP16* input, MLAS_FP16* output, MLAS_FP16* temp,
225225
MlasSveStoreF16(pg, &output[j], v_result);
226226
j += svcnth();
227227
}
228-
} else if (algo == "none") {
228+
} else {
229229
int64_t i = 0;
230230
while (i < (count)) {
231231
svbool_t pg = MlasSveSelPredictefloat16(i, count);

onnxruntime/core/mlas/lib/sve/mlasi_sve.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,13 @@ MlasSveTanhF16Kernel(
5656
void
5757
MLASCALL
5858
MlasSveGeluF16Kernel(
59-
const MLAS_FP16* input,
60-
MLAS_FP16* output,
61-
MLAS_FP16* temp,
62-
int64_t count,
63-
const std::string& algo
59+
const MLAS_FP16* Input,
60+
MLAS_FP16* Output,
61+
MLAS_FP16* Temp,
62+
int64_t N,
63+
const std::string& Algo
6464
);
65-
// function decarations
65+
// function declarations
6666
MLAS_FORCEINLINE
6767
MLAS_SVFLOAT32
6868
MlasSveComputeExpVector(

onnxruntime/core/providers/cpu/tensor/gelu.cc

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,31 @@
1212
#include "core/providers/cpu/element_wise_ranged_transform.h"
1313
#include "core/providers/cpu/tensor/gelu.h"
1414

15+
#include <cstddef>
16+
#include <cstdlib>
17+
#include <memory>
18+
19+
#if defined(_WIN32)
20+
#include <malloc.h>
21+
#endif
22+
23+
inline void* AlignedAlloc(size_t alignment, size_t size) {
24+
#if defined(_WIN32)
25+
return _aligned_malloc(size, alignment);
26+
#else
27+
// std::aligned_alloc requires size to be a multiple of alignment
28+
return std::aligned_alloc(alignment, size);
29+
#endif
30+
}
31+
32+
inline void AlignedFree(void* p) {
33+
#if defined(_WIN32)
34+
_aligned_free(p);
35+
#else
36+
std::free(p);
37+
#endif
38+
}
39+
1540
using onnxruntime::narrow;
1641
using namespace onnxruntime::common;
1742

@@ -128,16 +153,24 @@ Status Gelu<MLFloat16>::Compute(OpKernelContext* context) const {
128153

129154
// Alignment and buffer size for aligned_alloc
130155
constexpr size_t alignment = 64;
156+
131157
size_t buffer_size = elem_count * sizeof(MLFloat16);
132-
size_t aligned_size = ((buffer_size + alignment - 1) / alignment) * alignment;
133-
auto deleter = [](MLFloat16* p) { std::free(p); };
134-
std::unique_ptr<MLFloat16, decltype(deleter)> temp_fp16_aligned(
135-
reinterpret_cast<MLFloat16*>(std::aligned_alloc(alignment, aligned_size)),
136-
deleter);
137-
if (temp_fp16_aligned == nullptr) {
138-
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to allocate aligned temporary buffer.");
158+
size_t aligned_size =
159+
((buffer_size + alignment - 1) / alignment) * alignment;
160+
161+
void* raw = AlignedAlloc(alignment, aligned_size);
162+
if (!raw) {
163+
return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
164+
"Failed to allocate aligned temporary buffer.");
139165
}
140166

167+
auto deleter = [](MLFloat16* p) {
168+
AlignedFree(p);
169+
};
170+
171+
std::unique_ptr<MLFloat16, decltype(deleter)> temp_fp16_aligned(
172+
static_cast<MLFloat16*>(raw), deleter);
173+
141174
concurrency::ThreadPool::TryBatchParallelFor(
142175
tp,
143176
static_cast<int32_t>(task_count),
@@ -147,7 +180,7 @@ Status Gelu<MLFloat16>::Compute(OpKernelContext* context) const {
147180
MLFloat16* p_output = output_data + start;
148181
int64_t count = std::min(length_per_task, elem_count - start);
149182
MLFloat16* p_temp = temp_fp16_aligned.get() + start;
150-
MlasComputeFP16Gelu(p_input, p_output, p_temp, count, approximation_algorithm_);
183+
MlasComputeFP16Gelu(p_input, p_output, p_temp, count, approximation_algorithm_);
151184

152185
},
153186
0);

0 commit comments

Comments (0)