Added runtime guards and resolved CI failures

Sanket Kale · Sanket Kale · commit 4f10c21cc42a · 2026-02-05T19:40:10.000+05:30
diff --git a/onnxruntime/core/mlas/lib/erf.cpp b/onnxruntime/core/mlas/lib/erf.cpp
@@ -285,27 +285,9 @@ MlasComputeFP16Erf(
     )
 {
 #if defined(MLAS_USE_SVE) || defined(MLAS_NEON_INTRINSICS)
-
-#if defined(MLAS_USE_SVE) && defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
-    if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmSve()) {
-        MlasSveErfF16Kernel(
-            reinterpret_cast<const _mlas_fp16_*>(Input),
-            reinterpret_cast<_mlas_fp16_*>(Output),
-            N
-        );
-        return;
-    }
-#endif
-
-#if defined(MLAS_NEON_INTRINSICS) && defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
-    MlasNeonErfF16Kernel(
-        reinterpret_cast<const _mlas_fp16_*>(Input),
-        reinterpret_cast<_mlas_fp16_*>(Output),
-        N
-    );
-    return;
-#endif
-
+    #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+        GetMlasPlatform().ErfF16KernelRoutine(reinterpret_cast<const _mlas_fp16_*>(Input), reinterpret_cast<_mlas_fp16_*>(Output), N);
+    #endif
 #else
     std::vector<float> input_fp32(N);
     std::vector<float> output_fp32(N);
diff --git a/onnxruntime/core/mlas/lib/erf_neon_fp16.cpp b/onnxruntime/core/mlas/lib/erf_neon_fp16.cpp
@@ -14,6 +14,8 @@ Module Name:
 
 #include "erf_neon_fp16.h"
 
+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+
 // Helpers to safely convert between float and FP16-bit representation
 static float
 fp16_to_float(uint16_t h)
@@ -145,3 +147,4 @@ MlasNeonErfF16Kernel(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)
         Output[i] = float_to_fp16(erf_approx);
     }
 }
+#endif
diff --git a/onnxruntime/core/mlas/lib/gelu.cpp b/onnxruntime/core/mlas/lib/gelu.cpp
@@ -22,10 +22,10 @@ MlasComputeFP16Gelu(const MLAS_FP16* input,
                     int64_t count,
                     const std::string& algo)
 {
-#if defined(MLAS_USE_SVE) && defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
-        MlasSveGeluF16Kernel(input, output, temp, count, algo);
-#elif defined(MLAS_NEON_INTRINSICS) && defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
-        MlasNeonGeluF16Kernel(input, output, temp, count, algo);
+#if defined(MLAS_USE_SVE) || defined(MLAS_NEON_INTRINSICS)
+    #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+        GetMlasPlatform().GeluF16KernelRoutine(input, output, temp, count, algo);
+    #endif
 #else 
     (void)temp; 
     for (int64_t i = 0; i < count; ++i) {
diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h
@@ -610,6 +610,25 @@ void
     size_t N
     );
 
+using _mlas_fp16_ = uint16_t;
+typedef
+void
+(MLASCALL MLAS_COMPUTE_ERF_FP16_KERNEL)(
+    const _mlas_fp16_* Input,
+    _mlas_fp16_* Output,
+    size_t N
+);
+
+typedef
+void
+(MLASCALL MLAS_COMPUTE_GELU_FP16_KERNEL)(
+    const MLAS_FP16* Input,
+    MLAS_FP16* Output,
+    MLAS_FP16* Temp,
+    int64_t N,
+    const std::string& Algo
+);
+
 typedef
 float
 (MLASCALL MLAS_COMPUTE_SUMEXP_FLOAT_KERNEL)(
@@ -1057,6 +1076,8 @@ extern "C" {
     MLAS_QUANTIZE_LINEAR_U16_KERNEL MlasQuantizeLinearU16Kernel;
     MLAS_QUANTIZE_LINEAR_S4_KERNEL MlasQuantizeLinearS4Kernel;
     MLAS_QUANTIZE_LINEAR_U4_KERNEL MlasQuantizeLinearU4Kernel;
+    MLAS_COMPUTE_ERF_FP16_KERNEL MlasNeonErfF16Kernel;
+    MLAS_COMPUTE_GELU_FP16_KERNEL MlasNeonGeluF16Kernel;
 #if defined(MLAS_TARGET_AMD64)
     MLAS_DEQUANTIZE_LINEAR_S8_KERNEL MlasDequantizeLinearS8Kernel;
     MLAS_DEQUANTIZE_LINEAR_U8_KERNEL MlasDequantizeLinearU8Kernel;
@@ -1410,6 +1431,10 @@ struct MLAS_PLATFORM {
     MLAS_COMPUTE_SUMEXP_FLOAT_KERNEL* ComputeSumExpF32Kernel;
     MLAS_COMPUTE_LOGSOFTMAX_OUTPUT_FLOAT_KERNEL* ComputeLogSoftmaxOutputF32Kernel;
     MLAS_COMPUTE_SOFTMAX_OUTPUT_FLOAT_KERNEL* ComputeSoftmaxOutputF32Kernel;
+    #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+        MLAS_COMPUTE_ERF_FP16_KERNEL* ErfF16KernelRoutine;
+        MLAS_COMPUTE_GELU_FP16_KERNEL* GeluF16KernelRoutine;
+    #endif
 #endif
 #if defined(MLAS_TARGET_AMD64)
     MLAS_SGEMM_KERNEL_M1_ROUTINE* KernelM1Routine;
diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp
@@ -19,6 +19,10 @@ Module Name:
 #ifdef MLAS_USE_SVE
 #include "sve/mlasi_sve.h"
 #endif
+#if defined(MLAS_NEON_INTRINSICS) && defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+#include "erf_neon_fp16.h"
+#include "gelu.h"
+#endif
 #if defined(USE_KLEIDIAI)
 #include "kleidiai/mlasi_kleidiai.h"
 #endif
@@ -635,6 +639,17 @@ Return Value:
         this->ComputeLogSoftmaxOutputF32Kernel = MlasComputeLogSoftmaxOutputF32Kernel;
         this->ComputeSoftmaxOutputF32Kernel = MlasComputeSoftmaxOutputF32Kernel;
     }
+
+    #if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)
+        if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmSve()) {
+            this->ErfF16KernelRoutine = MlasSveErfF16Kernel;
+            this->GeluF16KernelRoutine = MlasSveGeluF16Kernel;
+        }
+        else{
+            this->ErfF16KernelRoutine = MlasNeonErfF16Kernel;
+            this->GeluF16KernelRoutine = MlasNeonGeluF16Kernel; 
+        }
+    #endif
 #endif
 
     //
diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc
@@ -2040,7 +2040,7 @@ Status Erf<MLFloat16>::Compute(OpKernelContext* context) const {
         const int64_t count = std::min(length_per_task, elem_count - start);
         const MLFloat16* p_input = input_data + start;
         MLFloat16* p_output = output_data + start;
-        MlasComputeFP16Erf(p_input, p_output, count);
+        MlasComputeFP16Erf(p_input, p_output, static_cast<size_t>(count));
       },
       0);
 

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,8 @@ Module Name:`
`14`	`14`
`15`	`15`	`#include "erf_neon_fp16.h"`
`16`	`16`
	`17`	`+#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED)`
	`18`	`+`
`17`	`19`	`// Helpers to safely convert between float and FP16-bit representation`
`18`	`20`	`static float`
`19`	`21`	`fp16_to_float(uint16_t h)`
`@@ -145,3 +147,4 @@ MlasNeonErfF16Kernel(const _mlas_fp16_* Input, _mlas_fp16_* Output, size_t N)`
`145`	`147`	`Output[i] = float_to_fp16(erf_approx);`
`146`	`148`	`}`
`147`	`149`	`}`
	`150`	`+#endif`