Skip to content

Commit 242ee30

Browse files
committed
Merge branch 'main' into tlwu/gqa_xqa
2 parents 0acfe30 + 1817f4a commit 242ee30

40 files changed

+1365
-526
lines changed

.gitmodules

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,4 +7,4 @@
77
[submodule "cmake/external/emsdk"]
88
path = cmake/external/emsdk
99
url = https://github.com/emscripten-core/emsdk.git
10-
branch = 4.0.21
10+
branch = 4.0.23

cmake/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1208,7 +1208,7 @@ function(onnxruntime_configure_target target_name)
12081208

12091209
# Keep BinSkim happy
12101210
if(MSVC AND NOT onnxruntime_target_platform MATCHES "ARM")
1211-
target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>" "$<$<LINK_LANGUAGE:CUDA>:-Xlinker=/CETCOMPAT>")
1211+
target_link_options(${target_name} PRIVATE "$<$<LINK_LANGUAGE:CXX,C>:/CETCOMPAT>")
12121212
endif()
12131213

12141214
endfunction()

cmake/external/cuDNN.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ add_library(CUDNN::cudnn_all INTERFACE IMPORTED)
33
find_path(
44
CUDNN_INCLUDE_DIR cudnn.h
55
HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_INCLUDE_DIRS}
6-
PATH_SUFFIXES include
6+
PATH_SUFFIXES include include/${onnxruntime_CUDA_VERSION}
77
REQUIRED
88
)
99

@@ -15,7 +15,7 @@ function(find_cudnn_library NAME)
1515
find_library(
1616
${NAME}_LIBRARY ${NAME} "lib${NAME}.so.${CUDNN_MAJOR_VERSION}"
1717
HINTS $ENV{CUDNN_PATH} ${CUDNN_PATH} ${Python_SITEARCH}/nvidia/cudnn ${CUDAToolkit_LIBRARY_DIR}
18-
PATH_SUFFIXES lib64 lib/x64 lib
18+
PATH_SUFFIXES lib64 lib/x64 lib lib/${onnxruntime_CUDA_VERSION}/x64
1919
REQUIRED
2020
)
2121

cmake/external/cuda_configuration.cmake

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,9 @@ macro(setup_cuda_architectures)
126126
set(CMAKE_CUDA_ARCHITECTURES "37;50;52;60;70;75;80;86;89")
127127
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
128128
set(CMAKE_CUDA_ARCHITECTURES "52;60;70;75;80;86;89;90")
129+
elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 13.0)
130+
# CUDA 13.x drops support for compute capabilities 60 and 70
131+
set(CMAKE_CUDA_ARCHITECTURES "75;80;86;89;90;100;120")
129132
else()
130133
set(CMAKE_CUDA_ARCHITECTURES "60;70;75;80;86;89;90;100;120")
131134
endif()

cmake/onnxruntime_python.cmake

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -297,8 +297,22 @@ if (WIN32)
297297
if (onnxruntime_USE_CUDA)
298298
file(WRITE "${VERSION_INFO_FILE}" "use_cuda = True\n")
299299
if(onnxruntime_CUDNN_HOME)
300-
file(GLOB CUDNN_DLL_PATH "${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll")
301-
if (NOT CUDNN_DLL_PATH)
300+
# may have x64 in the path
301+
# may have a path with the CUDA toolkit version if multiple versions are installed on the machine
302+
set(CUDNN_SEARCH_PATHS
303+
"${onnxruntime_CUDNN_HOME}/bin/cudnn64_*.dll"
304+
"${onnxruntime_CUDNN_HOME}/bin/x64/cudnn64_*.dll"
305+
"${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/cudnn64_*.dll"
306+
"${onnxruntime_CUDNN_HOME}/bin/${onnxruntime_CUDA_VERSION}/x64/cudnn64_*.dll"
307+
)
308+
set(CUDNN_DLL_PATH "")
309+
foreach(search_path ${CUDNN_SEARCH_PATHS})
310+
file(GLOB CUDNN_DLL_PATH "${search_path}")
311+
if(CUDNN_DLL_PATH)
312+
break()
313+
endif()
314+
endforeach()
315+
if(NOT CUDNN_DLL_PATH)
302316
message(FATAL_ERROR "cuDNN not found in ${onnxruntime_CUDNN_HOME}")
303317
endif()
304318
else()

cmake/onnxruntime_unittests.cmake

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -949,9 +949,9 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
949949
onnxruntime_add_shared_library_module(onnxruntime_providers_cuda_ut ${onnxruntime_test_providers_cuda_ut_src} $<TARGET_OBJECTS:onnxruntime_providers_cuda_obj>)
950950
config_cuda_provider_shared_module(onnxruntime_providers_cuda_ut)
951951
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
952-
add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils onnxruntime_common)
952+
add_dependencies(onnxruntime_providers_cuda_ut onnxruntime_test_utils)
953953
target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
954-
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils onnxruntime_common)
954+
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_test_utils)
955955
if (MSVC)
956956
# Cutlass code has an issue with the following:
957957
# warning C4100: 'magic': unreferenced formal parameter
@@ -1233,15 +1233,15 @@ block()
12331233
DEPENDS ${onnxruntime_provider_test_deps}
12341234
)
12351235

1236-
# Expose QNN SDK headers to unit tests via an interface target
1236+
# Expose QNN SDK headers to unit tests via an interface target
12371237
if(onnxruntime_USE_QNN)
12381238
add_library(qnn_sdk_headers_include INTERFACE)
12391239
target_include_directories(qnn_sdk_headers_include INTERFACE
12401240
${onnxruntime_QNN_HOME}/include
12411241
${onnxruntime_QNN_HOME}/include/QNN)
12421242
target_link_libraries(onnxruntime_provider_test PRIVATE qnn_sdk_headers_include)
12431243
endif()
1244-
1244+
12451245
if (UNIX AND (onnxruntime_USE_TENSORRT OR onnxruntime_USE_NV))
12461246
# The test_main.cc includes NvInfer.h where it has many deprecated declarations
12471247
# simply ignore them for TensorRT EP build

docs/OperatorKernels.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -653,6 +653,7 @@ Do not modify directly.*
653653
|ArgMin|*in* data:**T**<br> *out* reduced:**tensor(int64)**|13+|**T** = tensor(double), tensor(float), tensor(float16)|
654654
|||12|**T** = tensor(double), tensor(float), tensor(float16)|
655655
|||[1, 11]|**T** = tensor(double), tensor(float), tensor(float16)|
656+
|Attention|*in* Q:**T1**<br> *in* K:**T1**<br> *in* V:**T2**<br> *in* attn_mask:**U**<br> *in* past_key:**T1**<br> *in* past_value:**T2**<br> *in* nonpad_kv_seqlen:**tensor(int64)**<br> *out* Y:**T1**<br> *out* present_key:**T1**<br> *out* present_value:**T2**<br> *out* qk_matmul_output:**T1**<br><br>or<br><br>*in* Q:**T1**<br> *in* K:**T1**<br> *in* V:**T2**<br> *in* attn_mask:**U**<br> *in* past_key:**T1**<br> *in* past_value:**T2**<br> *out* Y:**T1**<br> *out* present_key:**T1**<br> *out* present_value:**T2**<br> *out* qk_matmul_output:**T1**|23+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **T2** = tensor(bfloat16), tensor(float), tensor(float16)<br/> **U** = tensor(bfloat16), tensor(bool), tensor(float), tensor(float16)|
656657
|AveragePool|*in* X:**T**<br> *out* Y:**T**|22+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)|
657658
|||[19, 21]|**T** = tensor(double), tensor(float), tensor(float16)|
658659
|||[11, 18]|**T** = tensor(double), tensor(float), tensor(float16)|

include/onnxruntime/core/session/onnxruntime_ep_c_api.h

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2046,6 +2046,65 @@ struct OrtEpFactory {
20462046
ORT_API2_STATUS(CreateExternalResourceImporterForDevice, _In_ OrtEpFactory* this_ptr,
20472047
_In_ const OrtEpDevice* ep_device,
20482048
_Outptr_result_maybenull_ OrtExternalResourceImporterImpl** out_importer);
2049+
2050+
/** \brief Returns the number of OrtCustomOpDomains that this factory provides.
2051+
*
2052+
* \param[in] this_ptr The OrtEpFactory instance.
2053+
* \param[out] num_domains Output parameter set to the number of provided OrtCustomOpDomain instances.
2054+
*
2055+
* \snippet{doc} snippets.dox OrtStatus Return Value
2056+
*
2057+
* \since Version 1.24.
2058+
*/
2059+
ORT_API2_STATUS(GetNumCustomOpDomains, _In_ OrtEpFactory* this_ptr, _Out_ size_t* num_domains);
2060+
2061+
/** \brief Gets the EP-specific OrtCustomOpDomains.
2062+
*
2063+
* This function is used when running inference on a model that contains EP-specific custom operations.
2064+
*
2065+
* Workflow:
2066+
* 1. The EP factory implements this function to supply a list of OrtCustomOpDomain instances.
2067+
* 2. The application either 1) calls SessionOptionsAppendExecutionProvider_V2() with an OrtEpDevice containing
2068+
* the plugin EP's factory or 2) enables auto ep selection.
2069+
* 3. 1) SessionOptionsAppendExecutionProvider_V2() appends the provided OrtCustomOpDomains to the
2070+
* session options or 2) ORT registers the OrtCustomOpDomains provided by the EP devices
2071+
* that could be potentially selected.
2072+
*
2073+
* As a result, any session created from these session options will have these custom op domains registered
2074+
* in ORT, ensuring that the custom ops are properly recognized and validated when the model is loaded.
2075+
*
2076+
* Plugin EPs can provide two types of custom ops:
2077+
* 1. A full OrtCustomOp with a concrete kernel implementation
2078+
* - A Plugin EP can supply an OrtCustomOp and a corresponding CustomKernel::Compute() implementation.
2079+
* - In GetCapability(), it calls EpGraphSupportInfo_AddSingleNode() to inform ORT
2080+
* that the custom node should NOT be fused or compiled. Instead, ORT should invoke
2081+
* the custom node's Compute() function at runtime.
2082+
*
2083+
* 2. A "placeholder" OrtCustomOp with an empty kernel implementation
2084+
* - A compile-based Plugin EP can supply an OrtCustomOp whose CustomKernel::Compute()
2085+
* does nothing. The purpose is to satisfy model validation during model loading by
2086+
* registering the custom op as a valid operator in the session.
2087+
* - In GetCapability(), the EP should call EpGraphSupportInfo_AddNodesToFuse() to
2088+
* notify ORT that this custom node should be fused and compiled by the EP.
2089+
* - In Compile(), the EP executes its compiled bits to perform inference for
2090+
* the fused custom node.
2091+
*
2092+
* Note: The OrtCustomOpDomain instances must be valid while any session is using them.
2093+
* The EP factory is responsible for releasing the OrtCustomOpDomain instances it creates. This happens
2094+
* automatically if using the C++ Ort::CustomOpDomain class.
2095+
*
2096+
* \param[in] this_ptr The OrtEpFactory instance.
2097+
* \param[out] domains Array of `num_domains` elements pre-allocated by ORT that should be filled with
2098+
* OrtCustomOpDomain instances created by the EP. `num_domains` is the value returned by
2099+
* GetNumCustomOpDomains().
2100+
* \param[in] num_domains The size of the `domains` array pre-allocated by ORT.
2101+
*
2102+
* \snippet{doc} snippets.dox OrtStatus Return Value
2103+
*
2104+
* \since Version 1.24.
2105+
*/
2106+
ORT_API2_STATUS(GetCustomOpDomains, _In_ OrtEpFactory* this_ptr,
2107+
_Out_writes_all_(num_domains) OrtCustomOpDomain** domains, _In_ size_t num_domains);
20492108
};
20502109

20512110
#ifdef __cplusplus

onnxruntime/contrib_ops/cpu/bert/attention_parameters.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ struct AttentionParameters {
3434
float mask_filter_value;
3535
float scale;
3636
bool use_tf32 = false;
37+
bool is_output_bnsh = false; // whether the output format is BNSH
3738
AttentionMaskType mask_type;
3839
AttentionQkvFormat qkv_format;
3940
};

0 commit comments

Comments
 (0)