
Commit 937cd82

akote123 authored and Sanket Kale committed
Resolve Review Comments
1 parent 98d5401 commit 937cd82

14 files changed: +318 -185 lines changed


cmake/onnxruntime_mlas.cmake

Lines changed: 7 additions & 2 deletions
@@ -54,6 +54,7 @@ onnxruntime_add_static_library(onnxruntime_mlas
   ${MLAS_SRC_DIR}/rotary_embedding.cpp
   ${MLAS_SRC_DIR}/softmax.h
   ${MLAS_SRC_DIR}/saturation_check.cpp
+  ${MLAS_SRC_DIR}/gelu.cpp
 )
 
 target_sources(onnxruntime_mlas PRIVATE
@@ -118,6 +119,7 @@ function(setup_mlas_source_for_windows)
   ${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
   ${MLAS_SRC_DIR}/erf_neon_fp16.h
   ${MLAS_SRC_DIR}/erf_neon_fp16.cpp
+  ${MLAS_SRC_DIR}/gelu_neon_fp16.cpp
 )
 
 set(mlas_platform_preprocess_srcs
@@ -483,15 +485,16 @@ else()
   ${MLAS_SRC_DIR}/sconv_nchw_kernel_neon.cpp
   ${MLAS_SRC_DIR}/erf_neon_fp16.h
   ${MLAS_SRC_DIR}/erf_neon_fp16.cpp
+  ${MLAS_SRC_DIR}/gelu_neon_fp16.cpp
 )
 
 # Conditionally add the SVE implementation if compiler supports it
 if (onnxruntime_USE_SVE)
   list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/mlasi_sve.h)
   list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/elementwise_sve.cpp)
-  list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/Elementwise_sve_fp16.cpp)
+  list(APPEND mlas_platform_srcs ${MLAS_SRC_DIR}/sve/elementwise_sve_fp16.cpp)
   set_source_files_properties(${MLAS_SRC_DIR}/sve/elementwise_sve.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+sve+fp16 ")
-  set_source_files_properties(${MLAS_SRC_DIR}/sve/Elementwise_sve_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+sve+fp16 ")
+  set_source_files_properties(${MLAS_SRC_DIR}/sve/elementwise_sve_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+sve+fp16 ")
   list(APPEND mlas_private_compile_definitions MLAS_USE_SVE)
 endif()
 
@@ -529,6 +532,7 @@ else()
   ${MLAS_SRC_DIR}/softmax_kernel_neon_fp16.cpp
   ${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp
   ${MLAS_SRC_DIR}/erf_neon_fp16.cpp
+  ${MLAS_SRC_DIR}/gelu_neon_fp16.cpp
 )
 set_source_files_properties(${MLAS_SRC_DIR}/aarch64/HalfGemmKernelNeon.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
 set_source_files_properties(${MLAS_SRC_DIR}/aarch64/QgemmS8S8KernelSmmla.S PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+i8mm ")
@@ -546,6 +550,7 @@ else()
   set_source_files_properties(${MLAS_SRC_DIR}/softmax_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
   set_source_files_properties(${MLAS_SRC_DIR}/eltwise_kernel_neon_fp16.cpp PROPERTIES COMPILE_FLAGS " -march=armv8.2-a+fp16 ")
   set_source_files_properties(${MLAS_SRC_DIR}/erf_neon_fp16.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16 ")
+  set_source_files_properties(${MLAS_SRC_DIR}/gelu_neon_fp16.cpp PROPERTIES COMPILE_FLAGS "-march=armv8.2-a+fp16 ")
 endif()
 
 if(ONNXRUNTIME_MLAS_MULTI_ARCH)

cmake/onnxruntime_providers_cpu.cmake

Lines changed: 1 addition & 8 deletions
@@ -182,14 +182,7 @@ if (onnxruntime_ENABLE_CPU_FP16_OPS)
   set_source_files_properties(${ORTTRAINING_SOURCE_DIR}/training_ops/cuda/collective/adasum_kernels.cc PROPERTIES COMPILE_FLAGS " -fassociative-math -ffast-math -ftree-vectorize -funsafe-math-optimizations -mf16c -mavx -mfma ")
 endif()
 
-if(onnxruntime_target_platform STREQUAL "aarch64" OR onnxruntime_target_platform STREQUAL "ARM64" OR onnxruntime_target_platform STREQUAL "arm64")
-  set_source_files_properties("${ONNXRUNTIME_ROOT}/core/providers/cpu/tensor/gelu.cc" PROPERTIES COMPILE_FLAGS -march=armv8.2-a+fp16)
-endif()
-target_include_directories(onnxruntime_providers PRIVATE
-  ${ONNXRUNTIME_ROOT}
-  ${ONNXRUNTIME_ROOT}/core/mlas/inc
-)
-
+target_include_directories(onnxruntime_providers PRIVATE ${ONNXRUNTIME_ROOT})
 onnxruntime_add_include_to_target(onnxruntime_providers re2::re2 Eigen3::Eigen)
 add_dependencies(onnxruntime_providers onnx ${onnxruntime_EXTERNAL_DEPENDENCIES})
 

onnxruntime/core/mlas/inc/mlas.h

Lines changed: 29 additions & 0 deletions
@@ -2127,3 +2127,32 @@ MlasFlashAttention(
     MlasFlashAttentionThreadedArgs* args,
     MLAS_THREADPOOL* ThreadPool
 );
+
+#if defined(USE_KLEIDIAI) && !defined(_MSC_VER)
+/**
+ * @brief Function to override the packing mechanism decision if kleidi ai is included
+ * @param enable enable kleidiai packing (allow or disallow depending on true/false)
+ * @return
+ */
+void
+MLASCALL
+MlasGemmBatchPackUseKleidi(bool enable);
+#endif
+
+void
+MLASCALL
+MlasComputeFP16Erf(
+    const MLAS_FP16* Input,
+    MLAS_FP16* Output,
+    size_t N
+);
+
+void
+MLASCALL
+MlasComputeFP16Gelu(
+    const MLAS_FP16* input,
+    MLAS_FP16* output,
+    MLAS_FP16* temp,
+    int64_t count,
+    const std::string& algo
+);
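For context, here is a minimal caller sketch of the two new FP16 entry points declared above. The helper name, the sample data, and the link against the MLAS static library are assumptions made for illustration only; the extra temp scratch buffer and the "tanh"/"none" algorithm strings follow the GELU kernels added later in this commit.

    // Hypothetical caller, not part of the commit. Assumes the translation unit
    // has onnxruntime/core/mlas/inc on the include path and links onnxruntime_mlas.
    #include <string>
    #include <vector>
    #include "mlas.h"

    void RunFp16Activations(const std::vector<float>& src) {
        const size_t n = src.size();
        std::vector<MLAS_FP16> input(n), erf_out(n), gelu_out(n), temp(n);
        for (size_t i = 0; i < n; ++i) {
            input[i] = MLAS_FP16(src[i]);  // narrow each value to half precision
        }

        // Element-wise erf over the half-precision buffer.
        MlasComputeFP16Erf(input.data(), erf_out.data(), n);

        // GELU with the tanh approximation; pass "none" for the exact erf form.
        // temp is caller-provided scratch of the same length as the input.
        MlasComputeFP16Gelu(input.data(), gelu_out.data(), temp.data(),
                            static_cast<int64_t>(n), "tanh");
    }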

onnxruntime/core/mlas/lib/erf.cpp

Lines changed: 49 additions & 0 deletions
@@ -22,6 +22,15 @@ Module Name:
 --*/
 
 #include "mlasi.h"
+
+#ifdef MLAS_USE_SVE
+#include "sve/mlasi_sve.h"
+#endif
+
+#if defined(MLAS_NEON_INTRINSICS)
+#include "erf_neon_fp16.h"
+#endif
+
 //
 // Bundles the constants for use by kernels written in assembly.
 //
@@ -266,3 +275,43 @@ Return Value:
     MlasErfKernel(Input, Output, N);
 #endif
 }
+
+void
+MLASCALL
+MlasComputeFP16Erf(
+    const MLAS_FP16* Input,
+    MLAS_FP16* Output,
+    size_t N
+    )
+{
+#if defined(MLAS_USE_SVE) || defined(MLAS_NEON_INTRINSICS)
+
+#if defined(MLAS_USE_SVE)
+    if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmSve()) {
+        MlasSveErfF16Kernel(
+            reinterpret_cast<const _mlas_fp16_*>(Input),
+            reinterpret_cast<_mlas_fp16_*>(Output),
+            N
+        );
+        return;
+    }
+#endif
+
+#if defined(MLAS_NEON_INTRINSICS)
+    MlasNeonErfF16Kernel(
+        reinterpret_cast<const _mlas_fp16_*>(Input),
+        reinterpret_cast<_mlas_fp16_*>(Output),
+        N
+    );
+    return;
+#endif
+
+#else
+    std::vector<float> input_fp32(N);
+    std::vector<float> output_fp32(N);
+
+    MlasConvertHalfToFloatBuffer(Input, input_fp32.data(), N);
+    MlasComputeErf(input_fp32.data(), output_fp32.data(), N);
+    MlasConvertFloatToHalfBuffer(output_fp32.data(), Output, N);
+#endif
+}
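The new MlasComputeFP16Erf dispatches to the SVE kernel when HasArmSve() reports support, otherwise to the NEON kernel, and on other builds falls back to a float32 round trip through MlasComputeErf. A rough check harness under those assumptions (hypothetical, not part of the commit) just reports how far the FP16 path drifts from float std::erf:

    // Hypothetical harness: max absolute difference between the FP16 erf path
    // and float std::erf over [-4, 4]. Assumes linkage against onnxruntime_mlas.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>
    #include "mlas.h"

    int main() {
        const size_t n = 1024;
        std::vector<MLAS_FP16> in(n), out(n);
        for (size_t i = 0; i < n; ++i) {
            float x = -4.0f + 8.0f * static_cast<float>(i) / static_cast<float>(n - 1);
            in[i] = MLAS_FP16(x);
        }

        MlasComputeFP16Erf(in.data(), out.data(), n);

        float max_err = 0.0f;
        for (size_t i = 0; i < n; ++i) {
            float x = static_cast<float>(in[i]);   // value after fp16 rounding
            float err = std::fabs(static_cast<float>(out[i]) - std::erf(x));
            max_err = std::max(max_err, err);
        }
        std::printf("max |fp16 erf - erf| = %g\n", max_err);
        return 0;
    }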

onnxruntime/core/mlas/lib/erf_neon_fp16.cpp

Lines changed: 2 additions & 2 deletions
@@ -67,7 +67,7 @@ exp_neg_rational_approx_f16(MLAS_FLOAT16X8 x)
 }
 
 void
-MlasNeonErfKernelFp16(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)
+MlasNeonErfF16Kernel(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)
 {
     const float16_t p = 0.328f;
     const float16_t a1 = 0.2505f;
@@ -144,4 +144,4 @@ MlasNeonErfKernelFp16(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)
 
         Output[i] = float_to_fp16(erf_approx);
     }
-}
+}

onnxruntime/core/mlas/lib/erf_neon_fp16.h

Lines changed: 1 addition & 1 deletion
@@ -21,4 +21,4 @@ Module Name:
 #include "softmax_kernel_neon.h"
 
 using _mlas_fp16_ = uint16_t;
-void MlasNeonErfKernelFp16(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N);
+void MlasNeonErfF16Kernel(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N);

onnxruntime/core/mlas/lib/gelu.cpp

Lines changed: 68 additions & 0 deletions
@@ -0,0 +1,68 @@
+/*++
+
+Copyright 2025 FUJITSU LIMITED
+
+Module Name:
+
+    Gelu.cpp
+
+Abstract:
+
+    This module contains Gelu helper functions .
+
+--*/
+
+#include "gelu.h"
+
+
+void
+MLASCALL
+MlasComputeFP16Gelu(const MLAS_FP16* input,
+                    MLAS_FP16* output,
+                    MLAS_FP16* temp,
+                    int64_t count,
+                    const std::string& algo)
+{
+#if defined(MLAS_USE_SVE) || defined(MLAS_NEON_INTRINSICS)
+
+    bool done = false;
+
+#if defined(MLAS_USE_SVE)
+    if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmSve()) {
+        MlasSveGeluF16Kernel(input, output, temp, count, algo);
+        done = true;
+    }
+#endif
+
+#if defined(MLAS_NEON_INTRINSICS)
+    if (!done) {
+        MlasNeonGeluF16Kernel(input, output, temp, count, algo);
+        done = true;
+    }
+#endif
+
+#else
+
+    (void)temp;
+    for (int64_t i = 0; i < count; ++i) {
+        float x = static_cast<float>(input[i]);
+        float gelu_val;
+
+        if (algo == "tanh") {
+            // GELU approximation (tanh)
+            const float B = 0.7978845608f;
+            const float C = 0.044715f * B;
+            float tanh_arg = x * (B + C * x * x);
+            float tanh_res = std::tanh(tanh_arg);
+            gelu_val = 0.5f * x * (1.0f + tanh_res);
+        } else {
+            // GELU exact (erf)
+            gelu_val = 0.5f * x *
+                       (1.0f + std::erf(x * static_cast<float>(M_SQRT1_2)));
+        }
+
+        output[i] = MLAS_FP16(gelu_val);
+    }
+
+#endif
+}
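As a reference for the constants above: the else branch computes the exact GELU, and the "tanh" branch its usual approximation. B is sqrt(2/pi), and the code folds the 0.044715 cubic coefficient into C = 0.044715 * B, the same pre-multiplied value (about 0.0356774) that appears as v_C1 in the NEON kernel below.

    \mathrm{GELU}(x) = \tfrac{1}{2}\,x\left(1 + \operatorname{erf}\!\left(\tfrac{x}{\sqrt{2}}\right)\right)

    \mathrm{GELU}_{\tanh}(x) \approx \tfrac{1}{2}\,x\left(1 + \tanh\!\left(\sqrt{\tfrac{2}{\pi}}\,\bigl(x + 0.044715\,x^{3}\bigr)\right)\right)
        = \tfrac{1}{2}\,x\left(1 + \tanh\!\bigl(x\,(B + C\,x^{2})\bigr)\right),
    \qquad B = \sqrt{2/\pi} \approx 0.79788456,\quad C = 0.044715\,B \approx 0.03567741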

onnxruntime/core/mlas/lib/gelu.h

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+/*++
+
+Copyright 2025 FUJITSU LIMITED
+
+Module Name:
+
+    Gelu.cpp
+
+Abstract:
+
+    This module contains Gelu helper functions .
+
+--*/
+
+#include "fp16_common.h"
+#if defined(MLAS_NEON_INTRINSICS)
+#include "erf_neon_fp16.h"
+#endif
+
+#ifdef MLAS_USE_SVE
+#include "sve/mlasi_sve.h"
+#endif
+
+void
+MLASCALL
+MlasNeonGeluF16Kernel(
+    const MLAS_FP16* input,
+    MLAS_FP16* output,
+    MLAS_FP16* temp,
+    int64_t count,
+    const std::string& algo
+);
onnxruntime/core/mlas/lib/gelu_neon_fp16.cpp

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
+/*++
+
+Copyright 2025 FUJITSU LIMITED
+
+Module Name:
+
+    Gelu.cpp
+
+Abstract:
+
+    This module contains Gelu helper functions .
+
+--*/
+#include "gelu.h"
+
+#if defined(__ARM_NEON) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+
+void
+MLASCALL
+MlasNeonGeluF16Kernel(const MLAS_FP16* input, MLAS_FP16* output, MLAS_FP16* temp, int64_t count, const std::string& algo)
+{
+    const float16_t v_half1 = 0.5f;
+    const float16_t v_one1 = 1.0f;
+    const float16_t v_sqrt1_21 = static_cast<float>(M_SQRT1_2);
+    const float16_t v_B1 = 0.7978845608028654f;
+    const float16_t v_C1 = 0.035677408136300125f;
+    const float16_t c1 = 5.0f;
+    const float16_t c2 = -5.0f;
+    const MLAS_FLOAT16X8 v_half = MlasBroadcastF16Float16x8(v_half1);
+    const MLAS_FLOAT16X8 v_one = MlasBroadcastF16Float16x8(v_one1);
+    const MLAS_FLOAT16X8 v_sqrt1_2 = MlasBroadcastF16Float16x8(v_sqrt1_21);
+    const MLAS_FLOAT16X8 v_B = MlasBroadcastF16Float16x8(v_B1);
+    const MLAS_FLOAT16X8 v_C = MlasBroadcastF16Float16x8(v_C1);
+
+    int64_t i = 0;
+
+    if (algo == "tanh") {
+        // Preprocess input into temp[] for tanh
+        for (; i + 7 < count; i += 8) {
+            MLAS_FLOAT16X8 x = MlasLoadf16Float16x8(reinterpret_cast<const float16_t*>(input + i));
+            MLAS_FLOAT16X8 x2 = MlasMultiplyFloat16(x, x);
+            MLAS_FLOAT16X8 inner = MlasMultiplyAddFloat16(v_C, x2, v_B); // B + C * x^2
+            MLAS_FLOAT16X8 tanh_arg = MlasMultiplyFloat16(x, inner); // x * (B + C * x^2)
+            tanh_arg = MlasMaximumFloat16(MlasBroadcastF16Float16x8(c2), MlasMinimumFloat16(tanh_arg, MlasBroadcastF16Float16x8(c1)));
+            MlasStoref16Float16x8(reinterpret_cast<float16_t*>(temp + i), tanh_arg);
+        }
+
+        // Tail
+        for (; i < count; ++i) {
+            float x = static_cast<float>(input[i]);
+            float inner = x * (0.7979f + 0.03568f * x * x);
+            inner = std::max(-5.0f, std::min(5.0f, inner));
+            temp[i] = static_cast<MLAS_FP16>(inner);
+        }
+
+        // Tanh processing
+        MlasComputeTanh<MLAS_FP16>(temp, temp, count);
+
+    } else if (algo == "none") {
+        // Preprocess input into temp[] for erf
+        for (i = 0; i + 7 < count; i += 8) {
+            MLAS_FLOAT16X8 x = MlasLoadf16Float16x8(reinterpret_cast<const float16_t*>(input + i));
+            MLAS_FLOAT16X8 scaled = MlasMultiplyFloat16(x, v_sqrt1_2);
+            MlasStoref16Float16x8(reinterpret_cast<float16_t*>(temp + i), scaled);
+        }
+
+        // Tail
+        for (; i < count; ++i) {
+            float x = static_cast<float>(input[i]);
+            temp[i] = static_cast<MLAS_FP16>(x * 0.70710678f);
+        }
+
+        // Erf processing
+        MlasNeonErfF16Kernel(reinterpret_cast<const _mlas_fp16_*>(temp), reinterpret_cast<_mlas_fp16_*>(temp), count);
+    }
+
+    // Final GELU output = 0.5 * x * (1 + tanh|erf)
+    i = 0;
+    for (; i + 7 < count; i += 8) {
+        MLAS_FLOAT16X8 x = MlasLoadf16Float16x8(reinterpret_cast<const float16_t*>(input + i));
+        MLAS_FLOAT16X8 t = MlasLoadf16Float16x8(reinterpret_cast<const float16_t*>(temp + i));
+        MLAS_FLOAT16X8 result = MlasMultiplyFloat16(v_half, MlasMultiplyFloat16(x, MlasAddFloat16(v_one, t)));
+        MlasStoref16Float16x8(reinterpret_cast<float16_t*>(output + i), result);
+    }
+
+    for (; i < count; ++i) {
+        float x = static_cast<float>(input[i]);
+        float t = static_cast<float>(temp[i]);
+        float gelu = 0.5f * x * (1.0f + t);
+        output[i] = static_cast<MLAS_FP16>(gelu);
+    }
+}
+#endif
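To make the structure of the kernel above easier to follow, here is a scalar float32 sketch of the same two-pass scheme (hypothetical, not part of the commit): pass one writes the tanh/erf argument into the temp buffer, the transcendental is then applied in place over temp (the real kernel calls the vectorized MlasComputeTanh or MlasNeonErfF16Kernel), and pass two forms 0.5 * x * (1 + temp[i]).

    // Scalar reference of the two-pass GELU used by the NEON kernel.
    // GeluTwoPassReference is a hypothetical name; std::tanh / std::erf stand in
    // for the vectorized transcendental kernels.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <string>

    void GeluTwoPassReference(const float* input, float* output, float* temp,
                              int64_t count, const std::string& algo) {
        const float B = 0.7978845608f;   // sqrt(2 / pi)
        const float C = 0.044715f * B;   // pre-multiplied cubic coefficient

        // Pass 1: write the argument of the transcendental into temp[].
        for (int64_t i = 0; i < count; ++i) {
            float x = input[i];
            temp[i] = (algo == "tanh")
                          ? std::clamp(x * (B + C * x * x), -5.0f, 5.0f)
                          : x * 0.70710678f;   // x / sqrt(2)
        }

        // Transcendental applied in place over temp[].
        for (int64_t i = 0; i < count; ++i) {
            temp[i] = (algo == "tanh") ? std::tanh(temp[i]) : std::erf(temp[i]);
        }

        // Pass 2: GELU(x) = 0.5 * x * (1 + t).
        for (int64_t i = 0; i < count; ++i) {
            output[i] = 0.5f * input[i] * (1.0f + temp[i]);
        }
    }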
