snnn
diff --git a/‎cgmanifests/generated/cgmanifest.json
+4-4 b/‎cgmanifests/generated/cgmanifest.json
+4-4
diff --git a/‎cmake/CMakeLists.txt
+4 b/‎cmake/CMakeLists.txt
+4
diff --git a/‎cmake/deps.txt
+4-4 b/‎cmake/deps.txt
+4-4
diff --git a/‎cmake/external/xnnpack.cmake
+1-40 b/‎cmake/external/xnnpack.cmake
+1-40
diff --git a/‎cmake/onnxruntime_config.h.in
+1 b/‎cmake/onnxruntime_config.h.in
+1
diff --git a/‎cmake/onnxruntime_providers_coreml.cmake
+1-1 b/‎cmake/onnxruntime_providers_coreml.cmake
+1-1
diff --git a/‎cmake/onnxruntime_unittests.cmake
-8 b/‎cmake/onnxruntime_unittests.cmake
-8
diff --git a/‎cmake/onnxruntime_webassembly.cmake
+7-2 b/‎cmake/onnxruntime_webassembly.cmake
+7-2
diff --git a/‎cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch
+12-12 b/‎cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch
+12-12
diff --git a/‎dockerfiles/Dockerfile.cuda
+3-1 b/‎dockerfiles/Dockerfile.cuda
+3-1
diff --git a/‎dockerfiles/Dockerfile.tensorrt
+3-1 b/‎dockerfiles/Dockerfile.tensorrt
+3-1
diff --git a/‎docs/python/examples/plot_train_convert_predict.py
+2-2 b/‎docs/python/examples/plot_train_convert_predict.py
+2-2
diff --git a/‎include/onnxruntime/core/framework/allocator.h
+5 b/‎include/onnxruntime/core/framework/allocator.h
+5
diff --git a/‎include/onnxruntime/core/framework/float16.h
+4-4 b/‎include/onnxruntime/core/framework/float16.h
+4-4
diff --git a/‎include/onnxruntime/core/framework/float8.h
+8-8 b/‎include/onnxruntime/core/framework/float8.h
+8-8
diff --git a/‎include/onnxruntime/core/framework/ortdevice.h
+1 b/‎include/onnxruntime/core/framework/ortdevice.h
+1
diff --git a/‎include/onnxruntime/core/framework/ortmemoryinfo.h
+2 b/‎include/onnxruntime/core/framework/ortmemoryinfo.h
+2
diff --git a/‎include/onnxruntime/core/session/onnxruntime_c_api.h
+4 b/‎include/onnxruntime/core/session/onnxruntime_c_api.h
+4
@@ -136,7 +136,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "309b75c9e56e0a674bf78d59872ce131f814dfb6",
+          "commitHash": "fe98e0b93565382648129271381c14d6205255e3",
           "repositoryUrl": "https://github.com/google/XNNPACK.git"
         },
         "comments": "googlexnnpack"
@@ -226,8 +226,8 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "4fe0e1e183925bf8cfa6aae24237e724a96479b8",
-          "repositoryUrl": "https://github.com/Maratyszcza/pthreadpool.git"
+          "commitHash": "4e80ca24521aa0fb3a746f9ea9c3eaa20e9afbb0",
+          "repositoryUrl": "https://github.com/google/pthreadpool.git"
         },
         "comments": "pthreadpool"
       }
@@ -246,7 +246,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "ca678952a9a8eaa6de112d154e8e104b22f9ab3f",
+          "commitHash": "8a1772a0c5c447df2d18edf33ec4603a8c9c04a6",
           "repositoryUrl": "https://github.com/pytorch/cpuinfo.git"
         },
         "comments": "pytorch_cpuinfo"
 
@@ -680,6 +680,7 @@ else()
   check_cxx_compiler_flag(-Wdeprecated-builtins HAS_DEPRECATED_BUILTINS)
   check_cxx_compiler_flag(-Wdeprecated-copy HAS_DEPRECATED_COPY)
   check_cxx_compiler_flag(-Wdeprecated-declarations HAS_DEPRECATED_DECLARATIONS)
+  check_cxx_compiler_flag(-Wdeprecated-literal-operator HAS_DEPRECATED_LITERAL_OPERATOR)
   check_cxx_compiler_flag(-Wdeprecated-this-capture HAS_DEPRECATED_THIS_CAPTURE)
   check_cxx_compiler_flag(-Wenum-constexpr-conversion HAS_ENUM_CONSTEXPR_CONVERSION)
   check_cxx_compiler_flag(-Wformat-truncation HAS_FORMAT_TRUNCATION)
@@ -736,6 +737,9 @@ else()
   if (HAS_DEPRECATED_BUILTINS)
     list(APPEND ORT_WARNING_FLAGS -Wno-deprecated-builtins)
   endif()
+  if (HAS_DEPRECATED_LITERAL_OPERATOR)
+    list(APPEND ORT_WARNING_FLAGS -Wno-deprecated-literal-operator)
+  endif()
   #see:https://reviews.llvm.org/D131307
   #It was intended that the 'enum-constexpr-conversion' type warnings can not be silenced by -w
   if(HAS_ENUM_CONSTEXPR_CONVERSION AND NOT Protobuf_FOUND)
 
@@ -29,7 +29,7 @@ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34
 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.5.zip;cd47d3d272faf353600c8cc2fdec2b52d6f69177
 googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349
 #xnnpack 2024.09.04
-googlexnnpack;https://github.com/google/XNNPACK/archive/309b75c9e56e0a674bf78d59872ce131f814dfb6.zip;39FA5259EAEACE0547284B63D5CEDC4F05553F5A
+googlexnnpack;https://github.com/google/XNNPACK/archive/fe98e0b93565382648129271381c14d6205255e3.zip;14f61dcf17cec2cde34ba2dcf61d6f24bf6059f3
 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c
 microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
@@ -46,9 +46,9 @@ protoc_linux_x86;https://github.com/protocolbuffers/protobuf/releases/download/v
 protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-linux-aarch_64.zip;df9d45470b0b8cf939dd2f0ec6b88e9cafc4d617
 protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef
 psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013
-pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip;07a0aa91dd9bf86f31b95497e00f31d8a261a4bd
+pthreadpool;https://github.com/google/pthreadpool/archive/4e80ca24521aa0fb3a746f9ea9c3eaa20e9afbb0.zip;bd4ea65c8292801e9555b527a0ecbb2e0092c917
 pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.13.1.zip;9255d5c8568debcc329dd42ed8f410ee139ac7b1
-pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/ca678952a9a8eaa6de112d154e8e104b22f9ab3f.zip;138bf57d2a110935330d1048dce6d7b82d17d377
+pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/8a1772a0c5c447df2d18edf33ec4603a8c9c04a6.zip;85bf8a60dae026b99b6ccd78606c85ed83bfb2cd
 re2;https://github.com/google/re2/archive/refs/tags/2024-07-02.zip;646e1728269cde7fcef990bf4a8e87b047882e88
 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac
 tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381
@@ -59,4 +59,4 @@ composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/arch
 directx_headers;https://github.com/microsoft/DirectX-Headers/archive/refs/tags/v1.613.1.zip;47653509a3371eabb156360f42faf582f314bf2e
 cudnn_frontend;https://github.com/NVIDIA/cudnn-frontend/archive/refs/tags/v1.7.0.zip;d0753d8d5b39947ca0729d7773cb84653a129eb1
 dawn;https://github.com/google/dawn/archive/12a3b24c456cebd9fd11f23ac0164f78129b00c6.zip;ad428f6dc16f1336d584f7bad5714e1097dafc43
-kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/v0.2.0/kleidiai-v0.2.0.zip;B1E3173992FD91F20DB904AB77D6E901778C2681
+kleidiai;https://gitlab.arm.com/kleidi/kleidiai/-/archive/d15722976120710080ca098fe8ddabf4556cb40f/kleidiai-d15722976120710080ca098fe8ddabf4556cb40f.zip;d6c840d00c3b05aedf06e957ddaece1013d1f40b
@@ -1,8 +1,7 @@
 set(XNNPACK_USE_SYSTEM_LIBS ON CACHE INTERNAL "")
 set(XNNPACK_BUILD_TESTS OFF CACHE INTERNAL "")
 set(XNNPACK_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
-set(FP16_BUILD_TESTS OFF CACHE INTERNAL "")
-set(FP16_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
+
 set(PTHREADPOOL_BUILD_TESTS OFF CACHE INTERNAL "")
 set(PTHREADPOOL_BUILD_BENCHMARKS OFF CACHE INTERNAL "")
 set(KLEIDIAI_BUILD_TESTS OFF CACHE INTERNAL "")
@@ -17,44 +16,6 @@ if(CMAKE_ANDROID_ARCH_ABI STREQUAL armeabi-v7a)
   set(XNNPACK_ENABLE_ARM_BF16 OFF)
 endif()
 
-# fp16 depends on psimd
-FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
-onnxruntime_fetchcontent_makeavailable(psimd)
-set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR})
-
-block(PROPAGATE fp16_PATCH_COMMAND)
-  # only apply fp16 patch for Apple x86_64 targets
-
-  if(APPLE)
-    if(NOT "${CMAKE_OSX_ARCHITECTURES}" STREQUAL "")
-      if ("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES)
-        set(fp16_PATCH_REQUIRED 1)
-      endif()
-    else()
-      # CMAKE_OSX_ARCHITECTURES unspecified, check host
-      if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
-        set(fp16_PATCH_REQUIRED 1)
-      endif()
-    endif()
-  endif()
-
-  if(fp16_PATCH_REQUIRED)
-    message(STATUS "Applying fp16 patch.")
-    set(fp16_PATCH_FILE ${PROJECT_SOURCE_DIR}/patches/fp16/remove_math_h_dependency_from_fp16_h.patch)
-    set(fp16_PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${fp16_PATCH_FILE})
-  else()
-    set(fp16_PATCH_COMMAND "")
-  endif()
-endblock()
-
-FetchContent_Declare(
-    fp16
-    URL ${DEP_URL_fp16}
-    URL_HASH SHA1=${DEP_SHA1_fp16}
-    PATCH_COMMAND ${fp16_PATCH_COMMAND}
-    )
-onnxruntime_fetchcontent_makeavailable(fp16)
-
 # pthreadpool depends on fxdiv
 FetchContent_Declare(fxdiv URL ${DEP_URL_fxdiv} URL_HASH SHA1=${DEP_SHA1_fxdiv})
 onnxruntime_fetchcontent_makeavailable(fxdiv)
 
@@ -9,6 +9,7 @@
 #cmakedefine HAS_CLASS_MEMACCESS
 #cmakedefine HAS_DEPRECATED_COPY
 #cmakedefine HAS_DEPRECATED_DECLARATIONS
+#cmakedefine HAS_DEPRECATED_LITERAL_OPERATOR
 #cmakedefine HAS_DEPRECATED_THIS_CAPTURE
 #cmakedefine HAS_FORMAT_TRUNCATION
 #cmakedefine HAS_IGNORED_ATTRIBUTES
 
@@ -177,7 +177,7 @@ endif()
 if (_enable_ML_PROGRAM)
   # Setup coremltools fp16 and json dependencies for creating an mlpackage.
   #
-  # These are also used by external/xnnpack.cmake. fp16 depends on psimd
+  # fp16 depends on psimd
   FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})
   onnxruntime_fetchcontent_makeavailable(psimd)
   set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR})
 
@@ -221,19 +221,11 @@ function(AddTest)
         )
       else()
         set(TEST_NODE_FLAGS)
-        if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
-          list(APPEND TEST_NODE_FLAGS "--experimental-wasm-threads")
-        endif()
-        if (onnxruntime_ENABLE_WEBASSEMBLY_SIMD)
-          list(APPEND TEST_NODE_FLAGS "--experimental-wasm-simd")
-        endif()
 
         # prefer Node from emsdk so the version is more deterministic
         if (DEFINED ENV{EMSDK_NODE})
           set(NODE_EXECUTABLE $ENV{EMSDK_NODE})
         else()
-          # warning as we don't know what node version is being used and whether things like the TEST_NODE_FLAGS
-          # will be valid. e.g. "--experimental-wasm-simd" is not valid with node v20 or later.
           message(WARNING "EMSDK_NODE environment variable was not set. Falling back to system `node`.")
           set(NODE_EXECUTABLE node)
         endif()
 
@@ -380,10 +380,15 @@ jsepDownload:_pp_")
       "SHELL:--pre-js \"${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js\""
       "SHELL:-s ASYNCIFY=1"
       "SHELL:-s ASYNCIFY_STACK_SIZE=65536"
-      "SHELL:-s ASYNCIFY_EXPORTS=['OrtRun']"
-      "SHELL:-s ASYNCIFY_IMPORTS=['Module.jsepCopy','Module.jsepCopyAsync','jsepDownload']"
     )
     set_target_properties(onnxruntime_webassembly PROPERTIES LINK_DEPENDS ${ONNXRUNTIME_ROOT}/wasm/pre-jsep.js)
+
+    if (onnxruntime_ENABLE_WEBASSEMBLY_MEMORY64)
+      target_link_options(onnxruntime_webassembly PRIVATE
+        "SHELL:-s ASYNCIFY_EXPORTS=['OrtRun']"
+        "SHELL:-s ASYNCIFY_IMPORTS=['Module.jsepCopy','Module.jsepCopyAsync','jsepDownload']"
+      )
+    endif()
   endif()
 
   if (onnxruntime_EMSCRIPTEN_SETTINGS)
 
@@ -1,8 +1,8 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 1ff85b538..c3ef2183f 100644
+index f0b3410ae..1e3cb8178 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
-@@ -253,7 +253,7 @@ ENDIF()
+@@ -337,7 +337,7 @@ ENDIF()
  # ---[ Build flags
  IF(NOT CMAKE_SYSTEM_NAME)
    MESSAGE(FATAL_ERROR "CMAKE_SYSTEM_NAME not defined")
@@ -11,21 +11,21 @@ index 1ff85b538..c3ef2183f 100644
    MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME value \"${CMAKE_SYSTEM_NAME}\"")
  ENDIF()
  IF(CMAKE_SYSTEM_NAME MATCHES "Windows")
-@@ -763,7 +763,12 @@ IF(XNNPACK_BUILD_LIBRARY)
-   TARGET_LINK_LIBRARIES(operator-run PRIVATE xnnpack-base logging)
+@@ -848,7 +848,12 @@ IF(XNNPACK_BUILD_LIBRARY)
    TARGET_LINK_LIBRARIES(operator-utils PRIVATE xnnpack-base logging)
-   TARGET_LINK_LIBRARIES(subgraph PRIVATE xnnpack-base allocator logging memory mutex operators operator-run)
--  TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph)
+   TARGET_LINK_LIBRARIES(reference-ukernels PRIVATE xnnpack-base)
+   TARGET_LINK_LIBRARIES(subgraph PRIVATE xnnpack-base allocator logging memory mutex operators operator-run datatype)
+-  TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph datatype reference-ukernels)
 +  IF(CMAKE_SYSTEM_NAME STREQUAL "Emscripten")
-+     # omit microkernels-prod as the list is manually created by ORT in cmake/external/xnnpack.cmake
-+     TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing subgraph)
++    # omit microkernels-prod as the list is manually created by ORT in cmake/external/xnnpack.cmake
++    TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing subgraph datatype reference-ukernels)
 +  ELSE()
-+     TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph)
-+  ENDIF()  
-   TARGET_LINK_LIBRARIES(XNNPACK PUBLIC xnnpack-base)
++    TARGET_LINK_LIBRARIES(XNNPACK PRIVATE xnnpack-base allocator cache hardware-config indirection memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing microkernels-prod subgraph datatype reference-ukernels)
++  ENDIF()
+   TARGET_LINK_LIBRARIES(XNNPACK PUBLIC pthreadpool logging)
    SET_TARGET_PROPERTIES(XNNPACK PROPERTIES C_EXTENSIONS YES)
  ENDIF()
-@@ -772,7 +777,8 @@ IF(NOT MSVC)
+@@ -857,7 +862,8 @@ IF(NOT MSVC)
  ENDIF()
  IF(XNNPACK_TARGET_PROCESSOR STREQUAL "arm")
    SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS " -marm ")
 
@@ -12,7 +12,9 @@ ARG OS=ubuntu24.04
 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-devel-${OS}
 ARG CUDA_VERSION
 ARG CUDNN_VERSION
-ARG CMAKE_CUDA_ARCHITECTURES="61;70;75;80;86;90"
+# Adjust this list to match the GPUs you intend to run on.
+# Look up your GPU's compute capability (CUDA arch) here: https://developer.nvidia.com/cuda-gpus
+ARG CMAKE_CUDA_ARCHITECTURES="75;80;90"
 
 ENV DEBIAN_FRONTEND=noninteractive
 
 
@@ -10,7 +10,9 @@ FROM nvcr.io/nvidia/tensorrt:${TRT_CONTAINER_VERSION}-py3
 
 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=main
-ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80
+# Adjust this list to match the GPUs you intend to run on.
+# Look up your GPU's compute capability (CUDA arch) here: https://developer.nvidia.com/cuda-gpus
+ARG CMAKE_CUDA_ARCHITECTURES=75;80;90
 
 RUN apt-get update &&\
     apt-get install -y sudo git bash unattended-upgrades
 
@@ -212,9 +212,9 @@ def sess_predict_proba_rf(x):
     rf.fit(X_train, y_train)
     initial_type = [("float_input", FloatTensorType([1, 4]))]
     onx = convert_sklearn(rf, initial_types=initial_type)
-    with open("rf_iris_%d.onnx" % n_trees, "wb") as f:
+    with open(f"rf_iris_{n_trees}.onnx", "wb") as f:
         f.write(onx.SerializeToString())
-    sess = rt.InferenceSession("rf_iris_%d.onnx" % n_trees, providers=rt.get_available_providers())
+    sess = rt.InferenceSession(f"rf_iris_{n_trees}.onnx", providers=rt.get_available_providers())
 
     def sess_predict_proba_loop(x):
         return sess.run([prob_name], {input_name: x.astype(numpy.float32)})[0]  # noqa: B023
 
@@ -52,6 +52,7 @@ constexpr const char* OpenVINO_CPU = "OpenVINO_CPU";
 constexpr const char* OpenVINO_GPU = "OpenVINO_GPU";
 constexpr const char* OpenVINO_RT = "OpenVINO_RT";
 constexpr const char* OpenVINO_RT_NPU = "OpenVINO_RT_NPU";
+constexpr const char* QNN_HTP_SHARED = "QnnHtpShared";
 constexpr const char* WEBGPU_BUFFER = "WebGPU_Buffer";
 constexpr const char* WEBNN_TENSOR = "WebNN_Tensor";
 
@@ -81,6 +82,10 @@ class IAllocator {
    */
   virtual void* Alloc(size_t size) = 0;
 
+  /**
+   * Free memory at p.
+   * If p is nullptr, do nothing.
+   */
   virtual void Free(void* p) = 0;
 
   // Reserve() is an interface exposed for an implementation of IAllocator
 
@@ -261,19 +261,19 @@ struct BFloat16 : onnxruntime_float16::BFloat16Impl<BFloat16> {
 // initializers with MLFloat16 and BFloat16 from unsigned short
 // E.g 10_f16 or 10_b16
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
-inline MLFloat16 operator"" _f16(unsigned long long int v) noexcept {
+inline MLFloat16 operator""_f16(unsigned long long int v) noexcept {
   return MLFloat16::FromBits(narrow<uint16_t>(v));
 }
 
-inline MLFloat16 operator"" _fp16(long double v) noexcept {
+inline MLFloat16 operator""_fp16(long double v) noexcept {
   return MLFloat16(static_cast<float>(v));
 }
 
-inline BFloat16 operator"" _b16(unsigned long long int v) noexcept {
+inline BFloat16 operator""_b16(unsigned long long int v) noexcept {
   return BFloat16::FromBits((narrow<uint16_t>(v)));
 }
 
-inline BFloat16 operator"" _bfp16(long double v) noexcept {
+inline BFloat16 operator""_bfp16(long double v) noexcept {
   return BFloat16(static_cast<float>(v));
 }
 #endif
 
@@ -165,11 +165,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E4M3FN& left, const Float8E4M3
 // initializers with MLFloat8E4M3FN and Float8E4M3FN from unsigned char
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 
-inline Float8E4M3FN operator"" _f8e4m3fn(unsigned long long int v) {
+inline Float8E4M3FN operator""_f8e4m3fn(unsigned long long int v) {
   return Float8E4M3FN(narrow<uint8_t>(v), Float8E4M3FN::FromBits());
 }
 
-inline Float8E4M3FN operator"" _f8e4m3fnp8(long double v) {
+inline Float8E4M3FN operator""_f8e4m3fnp8(long double v) {
   return Float8E4M3FN(static_cast<float>(v), true);
 }
 
@@ -323,11 +323,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E4M3FNUZ& left, const Float8E4
 // initializers with MLFloat8E4M3FN and Float8E4M3FN from unsigned char
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 
-inline Float8E4M3FNUZ operator"" _f8e4m3p8fnuz(unsigned long long int v) {
+inline Float8E4M3FNUZ operator""_f8e4m3p8fnuz(unsigned long long int v) {
   return Float8E4M3FNUZ(narrow<uint8_t>(v), Float8E4M3FNUZ::FromBits());
 }
 
-inline Float8E4M3FNUZ operator"" _f8e4m3fnuzp8(long double v) {
+inline Float8E4M3FNUZ operator""_f8e4m3fnuzp8(long double v) {
   return Float8E4M3FNUZ(static_cast<float>(v), true);
 }
 
@@ -493,11 +493,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E5M2& left, const Float8E5M2&
 // initializers with MLFloat8E5M2 and Float8E5M2 from unsigned char
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 
-inline Float8E5M2 operator"" _f8e5m2fn(unsigned long long int v) {
+inline Float8E5M2 operator""_f8e5m2fn(unsigned long long int v) {
   return Float8E5M2(narrow<uint8_t>(v), Float8E5M2::FromBits());
 }
 
-inline Float8E5M2 operator"" _f8e5m2fnp8(long double v) {
+inline Float8E5M2 operator""_f8e5m2fnp8(long double v) {
   return Float8E5M2(static_cast<float>(v), true);
 }
 
@@ -642,11 +642,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E5M2FNUZ& left, const Float8E5
 // initializers with MLFloat8E5M2 and Float8E5M2 from unsigned char
 #if !defined(__CUDACC__) && !defined(__HIPCC__)
 
-inline Float8E5M2FNUZ operator"" _f8e5m2fnuz(unsigned long long int v) {
+inline Float8E5M2FNUZ operator""_f8e5m2fnuz(unsigned long long int v) {
   return Float8E5M2FNUZ(narrow<uint8_t>(v), Float8E5M2FNUZ::FromBits());
 }
 
-inline Float8E5M2FNUZ operator"" _f8e5m2fnuzp8(long double v) {
+inline Float8E5M2FNUZ operator""_f8e5m2fnuzp8(long double v) {
   return Float8E5M2FNUZ(static_cast<float>(v), true);
 }
 
 
@@ -25,6 +25,7 @@ struct OrtDevice {
     static const MemoryType CUDA_PINNED = 1;
     static const MemoryType HIP_PINNED = 2;
     static const MemoryType CANN_PINNED = 3;
+    static const MemoryType QNN_HTP_SHARED = 4;
   };
 
   constexpr OrtDevice(DeviceType device_type_, MemoryType memory_type_, DeviceId device_id_)
 
@@ -6,6 +6,8 @@
 #include <string_view>
 
 #include "core/common/hash_combine.h"
+#include "core/framework/ortdevice.h"
+#include "core/session/onnxruntime_c_api.h"  // for OrtMemType, OrtAllocatorType
 
 struct OrtMemoryInfo {
   OrtMemoryInfo() = default;  // to allow default construction of Tensor
 
@@ -3670,6 +3670,10 @@ struct OrtApi {
    *   "enable_htp_spill_fill_buffer": Enable HTP spill fill buffer setting. The flag is used while generating context binary.
    *     - "0": Default. Disabled.
    *     - "1": Enabled.
+   *   "enable_htp_shared_memory_allocator": Enable the QNN HTP shared memory allocator. Requires libcdsprpc.so/dll to
+   *   be available.
+   *     - "0": Default. Disabled.
+   *     - "1": Enabled.
    *
    * SNPE supported keys:
    *   "runtime": SNPE runtime engine, options: "CPU", "CPU_FLOAT32", "GPU", "GPU_FLOAT32_16_HYBRID", "GPU_FLOAT16",
Original file line number	Diff line number	Diff line change
`@@ -177,7 +177,7 @@ endif()`
`177`	`177`	`if (_enable_ML_PROGRAM)`
`178`	`178`	`# Setup coremltools fp16 and json dependencies for creating an mlpackage.`
`179`	`179`	`#`
`180`		`- # These are also used by external/xnnpack.cmake. fp16 depends on psimd`
	`180`	`+ # fp16 depends on psimd`
`181`	`181`	`FetchContent_Declare(psimd URL ${DEP_URL_psimd} URL_HASH SHA1=${DEP_SHA1_psimd})`
`182`	`182`	`onnxruntime_fetchcontent_makeavailable(psimd)`
`183`	`183`	`set(PSIMD_SOURCE_DIR ${psimd_SOURCE_DIR})`
Original file line number	Diff line number	Diff line change
`@@ -261,19 +261,19 @@ struct BFloat16 : onnxruntime_float16::BFloat16Impl<BFloat16> {`
`261`	`261`	`// initializers with MLFloat16 and BFloat16 from unsigned short`
`262`	`262`	`// E.g 10_f16 or 10_b16`
`263`	`263`	`#if !defined(__CUDACC__) && !defined(__HIPCC__)`
`264`		`-inline MLFloat16 operator"" _f16(unsigned long long int v) noexcept {`
	`264`	`+inline MLFloat16 operator""_f16(unsigned long long int v) noexcept {`
`265`	`265`	`return MLFloat16::FromBits(narrow<uint16_t>(v));`
`266`	`266`	`}`
`267`	`267`
`268`		`-inline MLFloat16 operator"" _fp16(long double v) noexcept {`
	`268`	`+inline MLFloat16 operator""_fp16(long double v) noexcept {`
`269`	`269`	`return MLFloat16(static_cast<float>(v));`
`270`	`270`	`}`
`271`	`271`
`272`		`-inline BFloat16 operator"" _b16(unsigned long long int v) noexcept {`
	`272`	`+inline BFloat16 operator""_b16(unsigned long long int v) noexcept {`
`273`	`273`	`return BFloat16::FromBits((narrow<uint16_t>(v)));`
`274`	`274`	`}`
`275`	`275`
`276`		`-inline BFloat16 operator"" _bfp16(long double v) noexcept {`
	`276`	`+inline BFloat16 operator""_bfp16(long double v) noexcept {`
`277`	`277`	`return BFloat16(static_cast<float>(v));`
`278`	`278`	`}`
`279`	`279`	`#endif`
Original file line number	Diff line number	Diff line change
`@@ -165,11 +165,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E4M3FN& left, const Float8E4M3`
`165`	`165`	`// initializers with MLFloat8E4M3FN and Float8E4M3FN from unsigned char`
`166`	`166`	`#if !defined(__CUDACC__) && !defined(__HIPCC__)`
`167`	`167`
`168`		`-inline Float8E4M3FN operator"" _f8e4m3fn(unsigned long long int v) {`
	`168`	`+inline Float8E4M3FN operator""_f8e4m3fn(unsigned long long int v) {`
`169`	`169`	`return Float8E4M3FN(narrow<uint8_t>(v), Float8E4M3FN::FromBits());`
`170`	`170`	`}`
`171`	`171`
`172`		`-inline Float8E4M3FN operator"" _f8e4m3fnp8(long double v) {`
	`172`	`+inline Float8E4M3FN operator""_f8e4m3fnp8(long double v) {`
`173`	`173`	`return Float8E4M3FN(static_cast<float>(v), true);`
`174`	`174`	`}`
`175`	`175`
`@@ -323,11 +323,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E4M3FNUZ& left, const Float8E4`
`323`	`323`	`// initializers with MLFloat8E4M3FN and Float8E4M3FN from unsigned char`
`324`	`324`	`#if !defined(__CUDACC__) && !defined(__HIPCC__)`
`325`	`325`
`326`		`-inline Float8E4M3FNUZ operator"" _f8e4m3p8fnuz(unsigned long long int v) {`
	`326`	`+inline Float8E4M3FNUZ operator""_f8e4m3p8fnuz(unsigned long long int v) {`
`327`	`327`	`return Float8E4M3FNUZ(narrow<uint8_t>(v), Float8E4M3FNUZ::FromBits());`
`328`	`328`	`}`
`329`	`329`
`330`		`-inline Float8E4M3FNUZ operator"" _f8e4m3fnuzp8(long double v) {`
	`330`	`+inline Float8E4M3FNUZ operator""_f8e4m3fnuzp8(long double v) {`
`331`	`331`	`return Float8E4M3FNUZ(static_cast<float>(v), true);`
`332`	`332`	`}`
`333`	`333`
`@@ -493,11 +493,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E5M2& left, const Float8E5M2&`
`493`	`493`	`// initializers with MLFloat8E5M2 and Float8E5M2 from unsigned char`
`494`	`494`	`#if !defined(__CUDACC__) && !defined(__HIPCC__)`
`495`	`495`
`496`		`-inline Float8E5M2 operator"" _f8e5m2fn(unsigned long long int v) {`
	`496`	`+inline Float8E5M2 operator""_f8e5m2fn(unsigned long long int v) {`
`497`	`497`	`return Float8E5M2(narrow<uint8_t>(v), Float8E5M2::FromBits());`
`498`	`498`	`}`
`499`	`499`
`500`		`-inline Float8E5M2 operator"" _f8e5m2fnp8(long double v) {`
	`500`	`+inline Float8E5M2 operator""_f8e5m2fnp8(long double v) {`
`501`	`501`	`return Float8E5M2(static_cast<float>(v), true);`
`502`	`502`	`}`
`503`	`503`
`@@ -642,11 +642,11 @@ inline ORT_HOST_DEVICE bool operator<(const Float8E5M2FNUZ& left, const Float8E5`
`642`	`642`	`// initializers with MLFloat8E5M2 and Float8E5M2 from unsigned char`
`643`	`643`	`#if !defined(__CUDACC__) && !defined(__HIPCC__)`
`644`	`644`
`645`		`-inline Float8E5M2FNUZ operator"" _f8e5m2fnuz(unsigned long long int v) {`
	`645`	`+inline Float8E5M2FNUZ operator""_f8e5m2fnuz(unsigned long long int v) {`
`646`	`646`	`return Float8E5M2FNUZ(narrow<uint8_t>(v), Float8E5M2FNUZ::FromBits());`
`647`	`647`	`}`
`648`	`648`
`649`		`-inline Float8E5M2FNUZ operator"" _f8e5m2fnuzp8(long double v) {`
	`649`	`+inline Float8E5M2FNUZ operator""_f8e5m2fnuzp8(long double v) {`
`650`	`650`	`return Float8E5M2FNUZ(static_cast<float>(v), true);`
`651`	`651`	`}`
`652`	`652`